diff --git a/05_preprocessing_pipelines_and_hyperparameter_optimization.ipynb b/05_preprocessing_pipelines_and_hyperparameter_optimization.ipynb index a169eb0b9aa69b0fb8cec0df3e2d006976ba4f0d..e3a613722853b2718d25a3712cb4105e638ef87d 100644 --- a/05_preprocessing_pipelines_and_hyperparameter_optimization.ipynb +++ b/05_preprocessing_pipelines_and_hyperparameter_optimization.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 163, + "execution_count": 1, "metadata": {}, "outputs": [ { @@ -113,7 +113,7 @@ "<IPython.core.display.HTML object>" ] }, - "execution_count": 163, + "execution_count": 1, "metadata": {}, "output_type": "execute_result" } @@ -204,7 +204,7 @@ }, { "cell_type": "code", - "execution_count": 164, + "execution_count": 2, "metadata": {}, "outputs": [ { @@ -264,7 +264,7 @@ }, { "cell_type": "code", - "execution_count": 165, + "execution_count": 3, "metadata": {}, "outputs": [ { @@ -292,7 +292,7 @@ }, { "cell_type": "code", - "execution_count": 166, + "execution_count": 4, "metadata": {}, "outputs": [ { @@ -314,7 +314,7 @@ }, { "cell_type": "code", - "execution_count": 167, + "execution_count": 5, "metadata": {}, "outputs": [ { @@ -341,7 +341,7 @@ }, { "cell_type": "code", - "execution_count": 168, + "execution_count": 6, "metadata": { "scrolled": true }, @@ -469,7 +469,7 @@ }, { "cell_type": "code", - "execution_count": 169, + "execution_count": 7, "metadata": {}, "outputs": [], "source": [ @@ -486,12 +486,15 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Such a pipeline now \"behaves\" like a single classifier, as it implements `.fit` and `.predict`:" + "<div class=\"alert alert-block alert-warning\">\n", + "<p><i class=\"fa fa-info-circle\"></i>\n", + "A pipeline \"behaves\" like a single classifier - it implements <code>.fit()</code> and <code>.predict()</code> methods.</p>\n", + "</div>\n" ] }, { "cell_type": "code", - "execution_count": 170, + "execution_count": 8, "metadata": {}, "outputs": [ { @@ -517,7 
+520,7 @@ }, { "cell_type": "code", - "execution_count": 171, + "execution_count": 9, "metadata": {}, "outputs": [ { @@ -566,7 +569,7 @@ }, { "cell_type": "code", - "execution_count": 172, + "execution_count": 10, "metadata": {}, "outputs": [ { @@ -624,7 +627,7 @@ }, { "cell_type": "code", - "execution_count": 173, + "execution_count": 11, "metadata": { "tags": [ "solution" @@ -665,7 +668,7 @@ }, { "cell_type": "code", - "execution_count": 174, + "execution_count": 12, "metadata": { "tags": [ "solution" @@ -725,7 +728,7 @@ }, { "cell_type": "code", - "execution_count": 175, + "execution_count": 13, "metadata": { "tags": [ "solution" @@ -767,7 +770,7 @@ }, { "data": { - "image/png": "\n", + "image/png": "\n", "text/plain": [ "<Figure size 720x504 with 4 Axes>" ] @@ -884,7 +887,7 @@ }, { "cell_type": "code", - "execution_count": 176, + "execution_count": 14, "metadata": {}, "outputs": [ { @@ -931,7 +934,7 @@ }, { "cell_type": "code", - "execution_count": 177, + "execution_count": 15, "metadata": {}, "outputs": [], "source": [ @@ -953,7 +956,7 @@ }, { "cell_type": "code", - "execution_count": 178, + "execution_count": 16, "metadata": {}, "outputs": [], "source": [ @@ -975,7 +978,7 @@ }, { "cell_type": "code", - "execution_count": 179, + "execution_count": 17, "metadata": {}, "outputs": [ { @@ -1003,7 +1006,7 @@ }, { "cell_type": "code", - "execution_count": 180, + "execution_count": 18, "metadata": {}, "outputs": [ { @@ -1045,7 +1048,7 @@ }, { "cell_type": "code", - "execution_count": 181, + "execution_count": 19, "metadata": {}, "outputs": [], "source": [ @@ -1062,12 +1065,12 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "We run now 30 iterations" + "We run now 30 iterations." 
] }, { "cell_type": "code", - "execution_count": 182, + "execution_count": 20, "metadata": { "scrolled": true }, @@ -1076,8 +1079,8 @@ "name": "stdout", "output_type": "stream", "text": [ - "Best parameter (CV score=0.982):\n", - "{'logisticregression__C': 4.672288322013504, 'polynomialfeatures__degree': 3, 'standardscaler__with_mean': True, 'standardscaler__with_std': True}\n" + "Best parameter (CV score=0.978):\n", + "{'logisticregression__C': 3.219890406724053, 'polynomialfeatures__degree': 3, 'standardscaler__with_mean': True, 'standardscaler__with_std': False}\n" ] } ], @@ -1086,13 +1089,70 @@ "\n", "\n", "\n", - "search = RandomizedSearchCV(p, param_dist, cv=4, n_jobs=2, n_iter=30)\n", + "search = RandomizedSearchCV(p, param_dist, cv=4, n_jobs=2, n_iter=30,\n", + " random_state=42) # fix randomization for reproducibility\n", "\n", "search.fit(features, labels)\n", "print(\"Best parameter (CV score=%0.3f):\" % search.best_score_)\n", "print(search.best_params_)" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "<div class=\"alert alert-block alert-warning\">\n", + "<p><i class=\"fa fa-info-circle\"></i>\n", + "Hyperparameter search methods also \"behave\" like a single classifier - they implement <code>.fit()</code> and <code>.predict()</code> methods (*).</p>\n", + "</div>\n", + "\n", + "(\\*) Prediction is done with the best parameters found. The underlying model or pipeline with the best parameters is available via the `.best_estimator_` property. Importantly, the **refit** with the best parameters is done at the end of the CV-based search, **using the whole training data set**.\n", + "\n", + "The automatic refitting can be disabled by passing the `refit=False` argument when specifying the search method. 
Then neither `.predict()` nor `.best_estimator_` will be available.\n", + "\n", + "\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Best estimator:\n", + "Pipeline(memory=None,\n", + " steps=[('polynomialfeatures',\n", + " PolynomialFeatures(degree=3, include_bias=True,\n", + " interaction_only=False, order='C')),\n", + " ('standardscaler',\n", + " StandardScaler(copy=True, with_mean=True, with_std=False)),\n", + " ('logisticregression',\n", + " LogisticRegression(C=3.219890406724053, class_weight=None,\n", + " dual=False, fit_intercept=True,\n", + " intercept_scaling=1, l1_ratio=None,\n", + " max_iter=100, multi_class='warn',\n", + " n_jobs=None, penalty='l2',\n", + " random_state=None, solver='warn',\n", + " tol=0.0001, verbose=0, warm_start=False))],\n", + " verbose=False)\n", + "\n", + "\n", + "Training set accuracy: 0.9911111111111112\n" + ] + } + ], + "source": [ + "print(\"Best estimator:\")\n", + "print(search.best_estimator_)\n", + "print()\n", + "print()\n", + "print(\"Training set accuracy:\", sum(search.predict(features) == labels)/len(labels))" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -1106,14 +1166,14 @@ }, { "cell_type": "code", - "execution_count": 192, + "execution_count": 22, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "{'logisticregression__C': 4.672288322013504, 'polynomialfeatures__degree': 3, 'standardscaler__with_mean': True, 'standardscaler__with_std': True}\n", + "{'logisticregression__C': 3.219890406724053, 'polynomialfeatures__degree': 3, 'standardscaler__with_mean': True, 'standardscaler__with_std': False}\n", "\n", "[0 0 1 1 0 0 1 1 0 1]\n" ] @@ -1150,7 +1210,7 @@ }, { "cell_type": "code", - "execution_count": 77, + "execution_count": 23, "metadata": { "tags": [ "solution" @@ -1163,8 +1223,8 @@ "text": [ "Best parameter (CV score=0.978):\n", 
"{'standardscaler__with_mean': True, 'standardscaler__with_std': True, 'svc__C': 15, 'svc__gamma': 0.1}\n", - "Best parameter (CV score=0.973):\n", - "{'pca__n_components': 9, 'polynomialfeatures__degree': 2}\n" + "Best parameter (CV score=0.978):\n", + "{'pca__n_components': 10, 'polynomialfeatures__degree': 2}\n" ] } ], @@ -1200,7 +1260,8 @@ " 'pca__n_components': randint(4, 15)\n", " }\n", "\n", - "search = RandomizedSearchCV(p, param_grid, cv=5, scoring=\"accuracy\", n_jobs=5)\n", + "search = RandomizedSearchCV(p, param_grid, cv=5, scoring=\"accuracy\", n_jobs=5,\n", + " random_state=42) # fix randomization for reproduciblity\n", "search.fit(features, labels)\n", "print(\"Best parameter (CV score=%0.3f):\" % search.best_score_)\n", "print(search.best_params_)\n" @@ -1220,7 +1281,7 @@ }, { "cell_type": "code", - "execution_count": 140, + "execution_count": 24, "metadata": { "tags": [ "solution" @@ -1231,8 +1292,8 @@ "name": "stdout", "output_type": "stream", "text": [ - "Best parameter (CV score=0.905):\n", - "{'C': 59.85495947411998, 'gamma': 1.4553652628933988}\n" + "Best parameter (CV score=0.920):\n", + "{'C': 12.827747704497042, 'gamma': 1.8440450985343382}\n" ] }, { @@ -1258,6 +1319,7 @@ "data = pd.read_csv(\"data/spiral.csv\")\n", "\n", "features = data.iloc[:, :-1]\n", + "labels = data.iloc[:, -1]\n", "\n", "import matplotlib.pyplot as plt\n", "\n", @@ -1272,7 +1334,8 @@ " 'gamma': uniform(0.01, 10),\n", " }\n", "\n", - "search = RandomizedSearchCV(clf, param_grid, cv=5, scoring=\"accuracy\", n_jobs=5)\n", + "search = RandomizedSearchCV(clf, param_grid, cv=5, scoring=\"accuracy\", n_jobs=5,\n", + " random_state=42) # fix randomization for reproduciblity\n", "search.fit(features, labels)\n", "print(\"Best parameter (CV score=%0.3f):\" % search.best_score_)\n", "print(search.best_params_)" @@ -1280,7 +1343,7 @@ }, { "cell_type": "code", - "execution_count": 157, + "execution_count": 25, "metadata": { "tags": [ "solution" @@ -1292,7 +1355,7 @@ 
"output_type": "stream", "text": [ "Best parameter (CV score=0.964):\n", - "{'pca__n_components': 15, 'svc__C': 18.851242451312373, 'svc__gamma': 0.13103264635586054}\n" + "{'pca__n_components': 22, 'svc__C': 22.62496259847715, 'svc__gamma': 0.016632480579933266}\n" ] } ], @@ -1326,7 +1389,8 @@ "p = make_pipeline(PCA(), StandardScaler(), SVC())\n", "\n", "\n", - "search = RandomizedSearchCV(p, param_grid, cv=5, scoring=\"accuracy\", n_jobs=5, n_iter=20)\n", + "search = RandomizedSearchCV(p, param_grid, cv=5, scoring=\"accuracy\", n_jobs=5, n_iter=20,\n", + " random_state=42) # fix randomization for reproduciblity\n", "search.fit(features, labels)\n", "\n", "print(\"Best parameter (CV score=%0.3f):\" % search.best_score_)\n", @@ -1391,7 +1455,7 @@ "height": "calc(100% - 180px)", "left": "10px", "top": "150px", - "width": "230.188px" + "width": "246.183px" }, "toc_section_display": true, "toc_window_display": true