From b526cf3f6fec7f3b2b3bb541190f084b27731e64 Mon Sep 17 00:00:00 2001
From: Mikolaj Rybinski <mikolaj.rybinski@id.ethz.ch>
Date: Tue, 7 May 2019 08:50:11 +0200
Subject: [PATCH] 06 script: quick review fixes

---
 ...ines_and_hyperparameter_optimization.ipynb | 86 ++++++++++---------
 1 file changed, 45 insertions(+), 41 deletions(-)

diff --git a/06_preprocessing_pipelines_and_hyperparameter_optimization.ipynb b/06_preprocessing_pipelines_and_hyperparameter_optimization.ipynb
index 8ca9d73..09e66a8 100644
--- a/06_preprocessing_pipelines_and_hyperparameter_optimization.ipynb
+++ b/06_preprocessing_pipelines_and_hyperparameter_optimization.ipynb
@@ -172,7 +172,7 @@
 "\n",
 "Principal component analysis is a technique to reduce the dimensionality of a multi variate data set. One benefit of PCA is to remove redundancy in your data set, such as correlating columns or linear dependencies between columns.\n",
 "\n",
- "We discussed before that reducing redundancy and noise can help to avoid overfitting.\n",
+ "We've discussed before that reducing redundancy and noise can help to avoid overfitting.\n",
 "\n",
 "\n",
 "### Function transformers\n",
@@ -387,11 +387,11 @@
 "\n",
 "<h3><i class=\"fa fa-info-circle\"></i> Important</h3>\n",
 "\n",
- " When we include preprocessing in a classification approach, we must later apply **exactly the same preprocessing** on new incoming data!\n",
+ " When we include preprocessing in a classification approach, we must later **apply exactly the same preprocessing to new incoming data**!\n",
 "\n",
- "For preprocessors which depend on the full data set this implies that we never must preprocess data before cross-validation !\n",
+ "For preprocessors which depend on the full data set this implies that we must never preprocess data before cross-validation!\n",
 "\n",
- "Running such preprocessors on the full data set lets information of \"unseen\" data sneak into the classifier.\n",
+ "Running such preprocessors on the full dataset lets information from \"unseen\" data sneak into the classifier.\n",
 "\n",
 "</div>\n",
 "\n"
@@ -403,27 +403,27 @@
 "source": [
 "### This is how we must proceed instead:\n",
 "\n",
- "In case for the `MinMaxScaler`:\n",
+ "In the case of the `MinMaxScaler` preprocessor:\n",
 "\n",
- "1. Determine columnwise minimum und maximum values of training features.\n",
- "2. Use these to scale training features.\n",
- "3. Learn Classifier.\n",
+ "1. Determine column-wise minimum and maximum values of the training features.\n",
+ "2. Use these min/max values to scale the training data.\n",
+ "3. Learn classifier `C` on the scaled training data.\n",
 "\n",
 "\n",
- "4. Use values from 1. to scale evaluation features (thus we might create values outside `0..1`).\n",
- "5. Apply classifier to evaluation features.\n",
- "6. Assess Performance.\n",
+ "4. Use the values from 1. to scale the evaluation data (thus, we might create values outside `0..1`).\n",
+ "5. Apply classifier `C` to the scaled evaluation data.\n",
+ "6. Assess `C` performance.\n",
 "\n",
 "In general:\n",
 "\n",
- "1. Learn prprocessor `P` on training data set.\n",
- "2. Apply `P` on training data set.\n",
- "3. Learn classifier `C` on the training data set.\n",
+ "1. Learn preprocessor `P` on the training data.\n",
+ "2. Apply `P` to the training data.\n",
+ "3. Learn classifier `C` on the preprocessed training data.\n",
 "\n",
 "\n",
- "4. Apply `P` from before to the evaluation data set.\n",
- "5. Apply classifier `C` on the scaled evaluation data set.\n",
- "6. Assess performance.\n"
+ "4. Apply `P` from 1. to the evaluation data.\n",
+ "5. Apply classifier `C` to the preprocessed evaluation data.\n",
+ "6. Assess `C` performance.\n"
 ]
 },
 {
@@ -546,7 +546,7 @@
 "source": [
 "<div class=\"alert alert-block alert-warning\">\n",
 "\n",
- "<i class=\"fa fa-info-circle\"></i> One benefit of using a pipeline is that you will not mistakenly scale the full data set first, instead we follow the strategy we described above automatically.\n",
+ "<i class=\"fa fa-info-circle\"></i> One benefit of using a pipeline is that you will not mistakenly scale the full data set first; instead, we follow the strategy we've described above automatically.\n",
 "\n",
 "</div>"
 ]
@@ -564,13 +564,15 @@
 },
 {
 "cell_type": "code",
- "execution_count": 10,
+ "execution_count": 11,
 "metadata": {},
 "outputs": [
 {
 "name": "stdout",
 "output_type": "stream",
 "text": [
+ "0.844 ['kneighborsclassifier']\n",
+ "0.937 ['standardscaler', 'kneighborsclassifier']\n",
 "0.863 ['svc']\n",
 "0.947 ['standardscaler', 'svc']\n",
 "0.915 ['minmaxscaler', 'svc']\n",
@@ -586,10 +588,13 @@
 "from sklearn.preprocessing import StandardScaler, MinMaxScaler, PolynomialFeatures\n",
 "from sklearn.decomposition import PCA\n",
 "\n",
+ "from sklearn.neighbors import KNeighborsClassifier\n",
 "from sklearn.svm import SVC\n",
 "from sklearn.linear_model import LogisticRegression\n",
 "\n",
- "for p in [make_pipeline(SVC()),\n",
+ "for p in [make_pipeline(KNeighborsClassifier()),\n",
+ " make_pipeline(StandardScaler(), KNeighborsClassifier()),\n",
+ " make_pipeline(SVC()),\n",
 " make_pipeline(StandardScaler(), SVC()),\n",
 " make_pipeline(MinMaxScaler(), SVC()),\n",
 " make_pipeline(LogisticRegression()),\n",
@@ -617,7 +622,7 @@
 },
 {
 "cell_type": "code",
- "execution_count": 11,
+ "execution_count": 16,
 "metadata": {
 "tags": [
 "solution"
@@ -735,9 +740,7 @@
 "\n",
 "Classifiers and pipelines have parameters which must be adapted for improving performance (e.g. `gamma` or `C`). Finding good parameters is also called *hyperparameter optimization* to distinguish from the optimization done during learning of many classification algorithms.\n",
 "\n",
- "### Up to now we adapted such hyperparameters manually, but there are more systematic approaches !\n",
- "\n",
- "<img src=\"https://i.imgflip.com/3040hg.jpg\" title=\"made at imgflip.com\" width=50%/>"
+ "### Up to now we have adapted such hyperparameters manually, but there are more systematic approaches!"
 ]
 },
 {
@@ -801,10 +804,10 @@
 "cell_type": "markdown",
 "metadata": {},
 "source": [
- "The specification of the grid id now a bit more complicated: \n",
+ "The specification of the grid is now a bit more complicated, using the `PROCESSOR__ARGUMENT` naming scheme: \n",
 "\n",
- "- first the name of the processor / classifier in lower case letters\n",
- "- then two underscores `__` \n",
+ "- first the name of the processor / classifier in lower case letters,\n",
+ "- then two underscores `__`,\n",
 "- finally the name of the argument of the processor / classifier.\n",
 "\n",
 "`StandardScaler` e.g. has parameters `with_mean` and `with_std` which can be `True` or `False`:"
 ]
 },
@@ -870,10 +873,10 @@
 "source": [
 "from scipy.stats import uniform, randint\n",
 "\n",
- "param_dist = {'polynomialfeatures__degree': randint(1, 4),\n",
- " 'standardscaler__with_mean': [True, False],\n",
+ "param_dist = {'polynomialfeatures__degree': randint(1, 4), # random integer from 1 to 3 (upper bound is exclusive)\n",
+ " 'standardscaler__with_mean': [True, False], # random value from an explicit set of values\n",
 " 'standardscaler__with_std': [True, False],\n",
- " 'logisticregression__C': uniform(.1, 20)\n",
+ " 'logisticregression__C': uniform(.1, 20) # random number from 0.1 to 20.1 (loc=0.1, scale=20)\n",
 " }"
 ]
 },
@@ -894,7 +897,7 @@
 "output_type": "stream",
 "text": [
 "Best parameter (CV score=0.982):\n",
- "{'logisticregression__C': 17.31461166512687, 'polynomialfeatures__degree': 3, 'standardscaler__with_mean': False, 'standardscaler__with_std': False}\n"
+ "{'logisticregression__C': 4.675963309832449, 'polynomialfeatures__degree': 3, 'standardscaler__with_mean': True, 'standardscaler__with_std': True}\n"
 ]
 }
 ],
@@ -939,14 +942,6 @@
 "Best parameter (CV score=0.978):\n",
 "{'pca__n_components': 10, 'polynomialfeatures__degree': 2}\n"
 ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "/Users/uweschmitt/Projects/machinelearning-introduction-workshop/venv37/lib/python3.7/site-packages/sklearn/svm/base.py:931: ConvergenceWarning: Liblinear failed to converge, increase the number of iterations.\n",
- " \"the number of iterations.\", ConvergenceWarning)\n"
- ]
 }
 ],
 "source": [
@@ -970,10 +965,12 @@
 "print(search.best_params_)\n",
 "\n",
 "\n",
+ "from sklearn.svm import LinearSVC\n",
+ "\n",
 "p = make_pipeline(StandardScaler(), PolynomialFeatures(), PCA(), LinearSVC())\n",
 "param_grid = {\n",
 " 'polynomialfeatures__degree': [2, 3, 4],\n",
- " 'pca__n_components': [10, 12, 14]\n",
+ " 'pca__n_components': [4, 6, 8, 10, 12]\n",
 " }\n",
 "\n",
 "search = GridSearchCV(p, param_grid, cv=5, scoring=\"accuracy\", n_jobs=5)\n",
@@ -981,6 +978,13 @@
 "\n",
 "print(\"Best parameter (CV score=%0.3f):\" % search.best_score_)\n",
 "print(search.best_params_)"
 ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Copyright (C) 2019 ETH Zurich, SIS ID"
+ ]
 }
 ],
 "metadata": {
@@ -1000,7 +1004,7 @@
 "name": "python",
 "nbconvert_exporter": "python",
 "pygments_lexer": "ipython3",
- "version": "3.7.2"
+ "version": "3.7.3"
 },
 "latex_envs": {
 "LaTeX_envs_menu_present": true,
--
GitLab
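
The patched cells above rely on two ideas: preprocessing wrapped inside a pipeline, so cross-validation never fits the scaler on held-out data, and the `PROCESSOR__ARGUMENT` naming scheme for addressing pipeline steps in a parameter grid. Below is a minimal sketch tying the two together; the synthetic `make_classification` data and the particular `SVC` parameter values are assumptions for illustration only and are not taken from the notebook.

```python
# Hedged sketch (not part of the patch): a pipeline searched with GridSearchCV,
# using the "<step name>__<argument>" convention for parameter names.
from sklearn.datasets import make_classification
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

# Synthetic data, assumed here purely for illustration.
X, y = make_classification(n_samples=300, n_features=20, random_state=0)

# The scaler lives inside the pipeline, so every CV split fits it on the
# training folds only; no information leaks in from the held-out fold.
pipeline = make_pipeline(StandardScaler(), SVC())

# Keys follow "<step name in lower case>__<argument name>".
param_grid = {
    "standardscaler__with_mean": [True, False],
    "svc__C": [0.1, 1, 10],
    "svc__gamma": ["scale", 0.01, 0.1],
}

search = GridSearchCV(pipeline, param_grid, cv=5, scoring="accuracy")
search.fit(X, y)

print("Best parameter (CV score=%0.3f):" % search.best_score_)
print(search.best_params_)
```

Because `refit=True` is the `GridSearchCV` default, `search.best_estimator_` is itself a fitted pipeline, so exactly the same scaling learned during the search is applied automatically to any new data passed to it.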