From 6b70aaa7a3d64389a5dc2ef8b055ae7c2a6b423a Mon Sep 17 00:00:00 2001
From: Uwe Schmitt <uwe.schmitt@id.ethz.ch>
Date: Sun, 27 Jan 2019 23:10:00 +0100
Subject: [PATCH] rough draft for 05 about pipelines et al

---
 ...ines_and_hyperparameter_optimization.ipynb | 349 ++++++++++++++++++
 1 file changed, 349 insertions(+)
 create mode 100644 05_preprocessing_pipelines_and_hyperparameter_optimization.ipynb

diff --git a/05_preprocessing_pipelines_and_hyperparameter_optimization.ipynb b/05_preprocessing_pipelines_and_hyperparameter_optimization.ipynb
new file mode 100644
index 0000000..de86c6f
--- /dev/null
+++ b/05_preprocessing_pipelines_and_hyperparameter_optimization.ipynb
@@ -0,0 +1,349 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Draft\n",
+    "\n",
+    "- Scicit learn api:  recall what we have seen up to now.\n",
+    "- pipelines, preprocessing (scaler, PCA)\n",
+    "- cross validatioon on pipeline\n",
+    "- parameter tuning: grid search / random search."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Up to now all classifiers had methods `fit` and `predict`"
+   ]
+  },
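+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "As a reminder, here is a minimal sketch of this API (the use of the iris dataset is just an assumption for illustration):"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from sklearn.datasets import load_iris\n",
+    "from sklearn.svm import SVC\n",
+    "\n",
+    "X, y = load_iris(return_X_y=True)\n",
+    "\n",
+    "clf = SVC()\n",
+    "clf.fit(X, y)              # learn from the data\n",
+    "print(clf.predict(X[:3])) # predict labels, here for the first three samples"
+   ]
+  },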
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Preprocessing\n",
+    "\n",
+    "Scaler: SVC work better when all columns of feature matrix are in the same numerical range.\n",
+    "\n",
+    "PCA can reduce redundancy / correlations (see overfitting script) and thus avoid overfitting.\n",
+    "\n",
+    "Polynomial features: extend feature matrix by computing products between and within feature matrix columns\n",
+    "\n",
+    "FunctionTransformer (sklearn): Apply functions like log\n",
+    "\n",
+    "Danger: PCA and Scaler learn on data sets. Make sure that test/validation datasets do not sneak in !\n",
+    "\n",
+    "DONT DO: scale on full dataset first, then cross validation etc."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Preprocessors in `sklearn` all have `fit`, `transform` and `fit_transform` methods."
+   ]
+  },
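+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "A small sketch (with made-up numbers) of the correct usage: fit the scaler on the training data only, then transform training and test data with the learned statistics:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import numpy as np\n",
+    "from sklearn.preprocessing import StandardScaler\n",
+    "\n",
+    "X_train = np.array([[1.0, 100.0], [2.0, 200.0], [3.0, 300.0]])\n",
+    "X_test = np.array([[1.5, 150.0]])\n",
+    "\n",
+    "scaler = StandardScaler()\n",
+    "scaler.fit(X_train)               # learn mean / std from training data only\n",
+    "\n",
+    "print(scaler.transform(X_train))  # scaled with the training statistics\n",
+    "print(scaler.transform(X_test))   # test data uses the *same* statistics"
+   ]
+  },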
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Pipelines\n",
+    "\n",
+    "Combine different preprocessing steps into one \"classifier\".\n",
+    "\n",
+    "Pipeline is a list for preprocessors followed by a classifier.\n",
+    "\n",
+    "Thus for a pipeline of len $n$ there are $n - 1$ objects having `fit` and `transform` methods, the last element has `fit` and `predict` methods.\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 61,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from sklearn.preprocessing import PolynomialFeatures, StandardScaler\n",
+    "from sklearn.decomposition import PCA\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 116,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from sklearn.svm import SVC\n",
+    "from sklearn.linear_model import LogisticRegression"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.make_pipeline.html#sklearn.pipeline.make_pipeline"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 63,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from sklearn.pipeline import make_pipeline\n",
+    "from sklearn.cross_validation import cross_val_score"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 64,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "\n",
+    "beer_data = pd.read_csv(\"beers.csv\")\n",
+    "\n",
+    "features = beer_data.iloc[:, :-1]\n",
+    "labels = beer_data.iloc[:, -1];"
+   ]
+  },
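+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "A pipeline itself offers `fit` and `predict`, so it can be used like any other classifier. A quick sketch (predicting on the training data only to show the API):"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "p = make_pipeline(StandardScaler(), SVC())\n",
+    "\n",
+    "p.fit(features, labels)              # fits the scaler, then the SVC on scaled data\n",
+    "print(p.predict(features.iloc[:5]))  # scales, then predicts"
+   ]
+  },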
+  {
+   "cell_type": "code",
+   "execution_count": 149,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "0.8625911286780852\n",
+      "0.94655248133509\n",
+      "\n",
+      "pipeline\n",
+      "\n",
+      "standardscaler       StandardScaler(copy=True, with_mean=True, with_std=True)\n",
+      "svc                  SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,\n",
+      "  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',\n",
+      "  max_iter=-1, probability=False, random_state=None, shrinking=True,\n",
+      "  tol=0.001, verbose=False)\n",
+      "\n",
+      "0.9286736934563022\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(cross_val_score(SVC(), features, labels, \"accuracy\", cv=5).mean())\n",
+    "\n",
+    "# here we see benefit of scaling for SVC\n",
+    "p = make_pipeline(StandardScaler(), SVC())\n",
+    "\n",
+    "print(cross_val_score(p, features, labels, \"accuracy\", cv=5).mean())\n",
+    "\n",
+    "print()\n",
+    "print(\"pipeline\")\n",
+    "print()\n",
+    "\n",
+    "\n",
+    "for name, step in p.steps:\n",
+    "    print(\"{:20s} {}\".format(name, step))\n",
+    "    \n",
+    "print()\n",
+    "\n",
+    "# this is how we can set parameters of a single step in the pipeline:\n",
+    "p.set_params(svc__C = 3)\n",
+    "\n",
+    "print(cross_val_score(p, features, labels, \"accuracy\", cv=5).mean())\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 66,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "0.863 ['svc']\n",
+      "0.947 ['standardscaler', 'svc']\n",
+      "0.804 ['logisticregression']\n",
+      "0.920 ['standardscaler', 'pca', 'logisticregression']\n",
+      "0.840 ['polynomialfeatures', 'svc']\n",
+      "0.942 ['polynomialfeatures', 'standardscaler', 'svc']\n",
+      "0.925 ['polynomialfeatures', 'logisticregression']\n",
+      "0.964 ['polynomialfeatures', 'standardscaler', 'logisticregression']\n"
+     ]
+    }
+   ],
+   "source": [
+    "for p in [make_pipeline(SVC()),\n",
+    "          make_pipeline(StandardScaler(), SVC()),\n",
+    "          make_pipeline(LogisticRegression()),\n",
+    "          make_pipeline(StandardScaler(), PCA(), LogisticRegression()),\n",
+    "\n",
+    "          make_pipeline(PolynomialFeatures(), SVC()),\n",
+    "          make_pipeline(PolynomialFeatures(), StandardScaler(), SVC()),\n",
+    "          make_pipeline(PolynomialFeatures(), LogisticRegression()),\n",
+    "          make_pipeline(PolynomialFeatures(), StandardScaler(), LogisticRegression()),\n",
+    "          ]:\n",
+    "    \n",
+    "    print(\"{:.3f}\".format(cross_val_score(p, features, labels, \"accuracy\", cv=5).mean()), end=\" \")\n",
+    "    print([pi[0] for pi in p.steps])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Exercises:\n",
+    "- use the beer data frame with random / redundant features from cross val script and demo benefit of PCA"
+   ]
+  },
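+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "A possible starting point for this exercise (the construction of the redundant features below is an assumption, the cross-validation script may build them differently):"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import numpy as np\n",
+    "\n",
+    "np.random.seed(42)\n",
+    "\n",
+    "# append noisy copies of the original columns as redundant features:\n",
+    "noise = 0.1 * np.random.randn(*features.shape)\n",
+    "features_redundant = np.hstack([features.values, features.values + noise])\n",
+    "\n",
+    "# compare pipelines with and without PCA; n_components matches the\n",
+    "# number of original feature columns:\n",
+    "for p in [make_pipeline(StandardScaler(), LogisticRegression()),\n",
+    "          make_pipeline(StandardScaler(), PCA(n_components=features.shape[1]), LogisticRegression())]:\n",
+    "    print(\"{:.3f}\".format(cross_val_score(p, features_redundant, labels, scoring=\"accuracy\", cv=5).mean()), end=\" \")\n",
+    "    print([pi[0] for pi in p.steps])"
+   ]
+  },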
+  {
+   "cell_type": "code",
+   "execution_count": 154,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "0.9822222222222222 {'C': 5, 'kernel': 'poly'}\n"
+     ]
+    }
+   ],
+   "source": [
+    "from sklearn.model_selection import GridSearchCV, RandomizedSearchCV\n",
+    "\n",
+    "# optimize parameters of one single classifier\n",
+    "\n",
+    "parameters = {'kernel':('linear', 'rbf', 'poly'), \n",
+    "              'C':[1, 5, 10, 15]\n",
+    "              }\n",
+    "\n",
+    "svc = SVC()\n",
+    "search = GridSearchCV(svc, parameters, cv=5)\n",
+    "search.fit(features, labels)\n",
+    "print(search.best_score_, search.best_params_)"
+   ]
+  },
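+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The scores of all evaluated parameter combinations are available in `search.cv_results_`, a dict which is convenient to inspect as a data frame:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "results = pd.DataFrame(search.cv_results_)\n",
+    "results[[\"param_kernel\", \"param_C\", \"mean_test_score\"]].sort_values(\"mean_test_score\", ascending=False).head()"
+   ]
+  },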
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Now we optimize a pipeline"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 155,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "p = make_pipeline(PolynomialFeatures(), StandardScaler(), LogisticRegression())"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "TODO: explain param_grid"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 157,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "param_grid = {'polynomialfeatures__degree': [1, 2, 3],\n",
+    "              'standardscaler__with_mean': [True, False],\n",
+    "              'standardscaler__with_std': [True, False],\n",
+    "              'logisticregression__C': [1, 10, 15, 20, 25],\n",
+    "             }"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 164,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Best parameter (CV score=0.983):\n",
+      "{'logisticregression__C': 20, 'polynomialfeatures__degree': 2, 'standardscaler__with_mean': True, 'standardscaler__with_std': False}\n"
+     ]
+    }
+   ],
+   "source": [
+    "search = GridSearchCV(p, param_grid, cv=5, scoring=\"f1\", return_train_score=False, n_jobs=5)\n",
+    "search.fit(features, labels)\n",
+    "print(\"Best parameter (CV score=%0.3f):\" % search.best_score_)\n",
+    "print(search.best_params_)"
+   ]
+  },
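+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "By default the search refits the pipeline with the best parameters on the full dataset. This fitted pipeline is available as `search.best_estimator_` and can be used directly for prediction:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "best_pipeline = search.best_estimator_\n",
+    "print(best_pipeline.predict(features.iloc[:5]))"
+   ]
+  },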
+  {
+   "cell_type": "code",
+   "execution_count": 161,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from scipy.stats import uniform, randint\n",
+    "\n",
+    "param_dist = {'polynomialfeatures__degree': randint(1, 5),\n",
+    "              'standardscaler__with_mean': [True, False],\n",
+    "              'standardscaler__with_std': [True, False],\n",
+    "              'logisticregression__C': uniform(0.1, 20)\n",
+    "             }"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 162,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Best parameter (CV score=0.982):\n",
+      "{'logisticregression__C': 15.053760390091858, 'polynomialfeatures__degree': 3, 'standardscaler__with_mean': False, 'standardscaler__with_std': True}\n"
+     ]
+    }
+   ],
+   "source": [
+    "search = RandomizedSearchCV(p, param_dist, n_jobs=5, n_iter=100)\n",
+    "\n",
+    "search.fit(features, labels)\n",
+    "print(\"Best parameter (CV score=%0.3f):\" % search.best_score_)\n",
+    "print(search.best_params_)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.6.6"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
-- 
GitLab