From 6b70aaa7a3d64389a5dc2ef8b055ae7c2a6b423a Mon Sep 17 00:00:00 2001
From: Uwe Schmitt <uwe.schmitt@id.ethz.ch>
Date: Sun, 27 Jan 2019 23:10:00 +0100
Subject: [PATCH] rough draft for 05 about pipelines et al

---
 ...ines_and_hyperparameter_optimization.ipynb | 349 ++++++++++++++++++
 1 file changed, 349 insertions(+)
 create mode 100644 05_preprocessing_pipelines_and_hyperparameter_optimization.ipynb

diff --git a/05_preprocessing_pipelines_and_hyperparameter_optimization.ipynb b/05_preprocessing_pipelines_and_hyperparameter_optimization.ipynb
new file mode 100644
index 0000000..de86c6f
--- /dev/null
+++ b/05_preprocessing_pipelines_and_hyperparameter_optimization.ipynb
@@ -0,0 +1,349 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Draft\n",
+ "\n",
+ "- scikit-learn API: recall what we have seen up to now.\n",
+ "- pipelines, preprocessing (scaler, PCA)\n",
+ "- cross validation on pipelines\n",
+ "- parameter tuning: grid search / random search."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Up to now, all classifiers have had the methods `fit` and `predict`."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Preprocessing\n",
+ "\n",
+ "Scaler: `SVC` works better when all columns of the feature matrix are in the same numerical range.\n",
+ "\n",
+ "PCA can reduce redundancy / correlations (see the overfitting script) and thus help to avoid overfitting.\n",
+ "\n",
+ "Polynomial features: extend the feature matrix by computing products between and within its columns.\n",
+ "\n",
+ "`FunctionTransformer` (from `sklearn`): apply fixed functions such as `log` to the features.\n",
+ "\n",
+ "Danger: PCA and scalers learn from the data they are fitted on. Make sure that test/validation data sets do not sneak in!\n",
+ "\n",
+ "DON'T DO THIS: scale the full data set first and run cross validation etc. afterwards."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Preprocessors in `sklearn` all have `fit`, `transform` and `fit_transform` methods."
+ ]
+ },
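+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "A minimal sketch of this API, using `StandardScaler` on a small toy matrix (the values are chosen for illustration only):"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import numpy as np\n",
+ "from sklearn.preprocessing import StandardScaler\n",
+ "\n",
+ "# toy feature matrix: two columns on very different numerical scales\n",
+ "X = np.array([[1.0, 1000.0], [2.0, 2000.0], [3.0, 3000.0]])\n",
+ "\n",
+ "scaler = StandardScaler()\n",
+ "scaler.fit(X)               # learns per-column mean and standard deviation\n",
+ "print(scaler.transform(X))  # applies the learned scaling\n",
+ "\n",
+ "# fit_transform combines both steps:\n",
+ "print(StandardScaler().fit_transform(X))"
+ ]
+ },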
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Pipelines\n",
+ "\n",
+ "Pipelines combine different preprocessing steps plus a final classifier into one object that again behaves like a \"classifier\".\n",
+ "\n",
+ "A pipeline is a list of preprocessors followed by a classifier.\n",
+ "\n",
+ "Thus, in a pipeline of length $n$, the first $n - 1$ objects have `fit` and `transform` methods, and the last element has `fit` and `predict` methods.\n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 61,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from sklearn.preprocessing import PolynomialFeatures, StandardScaler\n",
+ "from sklearn.decomposition import PCA\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 116,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from sklearn.svm import SVC\n",
+ "from sklearn.linear_model import LogisticRegression"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.make_pipeline.html#sklearn.pipeline.make_pipeline"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 63,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from sklearn.pipeline import make_pipeline\n",
+ "from sklearn.model_selection import cross_val_score"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 64,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import pandas as pd\n",
+ "\n",
+ "beer_data = pd.read_csv(\"beers.csv\")\n",
+ "\n",
+ "features = beer_data.iloc[:, :-1]\n",
+ "labels = beer_data.iloc[:, -1]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 149,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "0.8625911286780852\n",
+ "0.94655248133509\n",
+ "\n",
+ "pipeline\n",
+ "\n",
+ "standardscaler StandardScaler(copy=True, with_mean=True, with_std=True)\n",
+ "svc SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,\n",
+ " decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',\n",
+ " max_iter=-1, probability=False, random_state=None, shrinking=True,\n",
+ " tol=0.001, verbose=False)\n",
+ "\n",
+ "0.9286736934563022\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(cross_val_score(SVC(), features, labels, scoring=\"accuracy\", cv=5).mean())\n",
+ "\n",
+ "# here we see the benefit of scaling for SVC:\n",
+ "p = make_pipeline(StandardScaler(), SVC())\n",
+ "\n",
+ "print(cross_val_score(p, features, labels, scoring=\"accuracy\", cv=5).mean())\n",
+ "\n",
+ "print()\n",
+ "print(\"pipeline\")\n",
+ "print()\n",
+ "\n",
+ "# make_pipeline derives the step names from the class names:\n",
+ "for name, step in p.steps:\n",
+ "    print(\"{:20s} {}\".format(name, step))\n",
+ "\n",
+ "print()\n",
+ "\n",
+ "# this is how we can set a parameter of a single step in the pipeline:\n",
+ "p.set_params(svc__C=3)\n",
+ "\n",
+ "print(cross_val_score(p, features, labels, scoring=\"accuracy\", cv=5).mean())\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 66,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "0.863 ['svc']\n",
+ "0.947 ['standardscaler', 'svc']\n",
+ "0.804 ['logisticregression']\n",
+ "0.920 ['standardscaler', 'pca', 'logisticregression']\n",
+ "0.840 ['polynomialfeatures', 'svc']\n",
+ "0.942 ['polynomialfeatures', 'standardscaler', 'svc']\n",
+ "0.925 ['polynomialfeatures', 'logisticregression']\n",
+ "0.964 ['polynomialfeatures', 'standardscaler', 'logisticregression']\n"
+ ]
+ }
+ ],
+ "source": [
+ "# compare different pipelines with cross validation:\n",
+ "for p in [make_pipeline(SVC()),\n",
+ "          make_pipeline(StandardScaler(), SVC()),\n",
+ "          make_pipeline(LogisticRegression()),\n",
+ "          make_pipeline(StandardScaler(), PCA(), LogisticRegression()),\n",
+ "\n",
+ "          make_pipeline(PolynomialFeatures(), SVC()),\n",
+ "          make_pipeline(PolynomialFeatures(), StandardScaler(), SVC()),\n",
+ "          make_pipeline(PolynomialFeatures(), LogisticRegression()),\n",
+ "          make_pipeline(PolynomialFeatures(), StandardScaler(), LogisticRegression()),\n",
+ "          ]:\n",
+ "\n",
+ "    print(\"{:.3f}\".format(cross_val_score(p, features, labels, scoring=\"accuracy\", cv=5).mean()), end=\" \")\n",
+ "    print([pi[0] for pi in p.steps])"
+ ]
+ },
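+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "A fitted pipeline behaves like a single classifier: `fit` runs `fit_transform` on all preprocessing steps and then fits the classifier, whereas `predict` only applies `transform` before predicting. This is why cross validation on a pipeline avoids the leakage problem from above: the scaler is re-fitted on every training fold. A minimal sketch with an explicit train/test split (the `random_state` value is arbitrary):"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from sklearn.model_selection import train_test_split\n",
+ "\n",
+ "X_train, X_test, y_train, y_test = train_test_split(features, labels, random_state=42)\n",
+ "\n",
+ "p = make_pipeline(StandardScaler(), SVC())\n",
+ "p.fit(X_train, y_train)         # the scaler is fitted on the training data only\n",
+ "print(p.score(X_test, y_test))  # transform + predict + accuracy on the test data"
+ ]
+ },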
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Exercises:\n",
+ "- use the beer data frame with random / redundant features from the cross validation script and demonstrate the benefit of PCA"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 154,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "0.9822222222222222 {'C': 5, 'kernel': 'poly'}\n"
+ ]
+ }
+ ],
+ "source": [
+ "from sklearn.model_selection import GridSearchCV, RandomizedSearchCV\n",
+ "\n",
+ "# optimize the parameters of a single classifier:\n",
+ "parameters = {'kernel': ('linear', 'rbf', 'poly'),\n",
+ "              'C': [1, 5, 10, 15]\n",
+ "              }\n",
+ "\n",
+ "svc = SVC()\n",
+ "search = GridSearchCV(svc, parameters, cv=5)\n",
+ "search.fit(features, labels)\n",
+ "print(search.best_score_, search.best_params_)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Now we optimize a full pipeline."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 155,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "p = make_pipeline(PolynomialFeatures(), StandardScaler(), LogisticRegression())"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The keys of `param_grid` follow the pattern `<step name>__<parameter name>`: the step name as listed in `p.steps`, two underscores, then the name of a parameter of that step."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 157,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "param_grid = {'polynomialfeatures__degree': [1, 2, 3],\n",
+ "              'standardscaler__with_mean': [True, False],\n",
+ "              'standardscaler__with_std': [True, False],\n",
+ "              'logisticregression__C': [1, 10, 15, 20, 25],\n",
+ "              }"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 164,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Best parameter (CV score=0.983):\n",
+ "{'logisticregression__C': 20, 'polynomialfeatures__degree': 2, 'standardscaler__with_mean': True, 'standardscaler__with_std': False}\n"
+ ]
+ }
+ ],
+ "source": [
+ "search = GridSearchCV(p, param_grid, cv=5, scoring=\"f1\", return_train_score=False, n_jobs=5)\n",
+ "search.fit(features, labels)\n",
+ "print(\"Best parameter (CV score=%0.3f):\" % search.best_score_)\n",
+ "print(search.best_params_)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 161,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from scipy.stats import uniform, randint\n",
+ "\n",
+ "# distributions to sample from: randint(1, 5) yields integers 1..4,\n",
+ "# uniform(0.1, 20) yields floats from the interval [0.1, 20.1]:\n",
+ "param_dist = {'polynomialfeatures__degree': randint(1, 5),\n",
+ "              'standardscaler__with_mean': [True, False],\n",
+ "              'standardscaler__with_std': [True, False],\n",
+ "              'logisticregression__C': uniform(0.1, 20)\n",
+ "              }"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 162,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Best parameter (CV score=0.982):\n",
+ "{'logisticregression__C': 15.053760390091858, 'polynomialfeatures__degree': 3, 'standardscaler__with_mean': False, 'standardscaler__with_std': True}\n"
+ ]
+ }
+ ],
+ "source": [
+ "search = RandomizedSearchCV(p, param_dist, n_jobs=5, n_iter=100)\n",
+ "\n",
+ "search.fit(features, labels)\n",
+ "print(\"Best parameter (CV score=%0.3f):\" % search.best_score_)\n",
+ "print(search.best_params_)"
+ ]
+ },
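+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "After the search, `search.best_estimator_` holds the pipeline refitted on the full data set with the best parameters, and `search.cv_results_` holds the scores of all candidates. A minimal sketch of how one might inspect them (predicting on the first training rows just for illustration):"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "best = search.best_estimator_      # the refitted pipeline\n",
+ "print(best.predict(features[:5]))  # use it like any fitted classifier\n",
+ "\n",
+ "# per-candidate cross validation scores as a data frame:\n",
+ "df = pd.DataFrame(search.cv_results_)\n",
+ "print(df[[\"mean_test_score\", \"params\"]].head())"
+ ]
+ }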
+ "search.fit(features, labels)\n", + "print(\"Best parameter (CV score=%0.3f):\" % search.best_score_)\n", + "print(search.best_params_)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.6" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} -- GitLab