From 9e0f38ae37b542c8921e34137d49a243b43ba371 Mon Sep 17 00:00:00 2001
From: Uwe Schmitt <uwe.schmitt@id.ethz.ch>
Date: Thu, 25 Feb 2021 18:44:04 +0100
Subject: [PATCH] added generalized linear models

---
 07_regression.ipynb | 196 ++++++++++++++++++++++++++++++------------------
 1 file changed, 115 insertions(+), 81 deletions(-)

diff --git a/07_regression.ipynb b/07_regression.ipynb
index afd65a3..a9e80d1 100644
--- a/07_regression.ipynb
+++ b/07_regression.ipynb
@@ -150,14 +150,18 @@
    ],
    "source": [
     "# IGNORE THIS CELL WHICH CUSTOMIZES LAYOUT AND STYLING OF THE NOTEBOOK !\n",
-    "import matplotlib.pyplot as plt\n",
     "%matplotlib inline\n",
     "%config InlineBackend.figure_format = 'retina'\n",
     "import warnings\n",
-    "warnings.filterwarnings('ignore', category=FutureWarning)\n",
-    "warnings.filterwarnings('ignore', category=DeprecationWarning)\n",
+    "\n",
+    "import matplotlib.pyplot as plt\n",
+    "\n",
+    "warnings.filterwarnings(\"ignore\", category=FutureWarning)\n",
+    "warnings.filterwarnings(\"ignore\", category=DeprecationWarning)\n",
     "warnings.filterwarnings = lambda *a, **kw: None\n",
-    "from IPython.core.display import HTML; HTML(open(\"custom.html\", \"r\").read())"
+    "from IPython.core.display import HTML\n",
+    "\n",
+    "HTML(open(\"custom.html\", \"r\").read())"
    ]
   },
   {
@@ -409,6 +413,7 @@
    ],
    "source": [
     "import seaborn as sns\n",
+    "\n",
     "sns.set(style=\"ticks\")\n",
     "\n",
     "sns.pairplot(df, hue=\"kind\", diag_kind=\"hist\");"
@@ -451,7 +456,7 @@
     "\n",
     "# needs 2d data structure, features.iloc[2] has dimension 1\n",
     "encoder = OneHotEncoder(sparse=False)\n",
-    "one_hot = encoder.fit_transform(features.iloc[:, 2: 3]) \n",
+    "one_hot = encoder.fit_transform(features.iloc[:, 2:3])\n",
     "\n",
     "one_hot[:5, :]"
    ]
@@ -587,9 +592,9 @@
    "source": [
     "from sklearn.model_selection import train_test_split\n",
     "\n",
-    "(features_train, features_test, \n",
-    " values_train, \n",
-    " values_test) = train_test_split(features, values, random_state=42)"
+    "(features_train, features_test, values_train, values_test) = train_test_split(\n",
+    "    features, values, random_state=42\n",
+    ")"
    ]
   },
   {
@@ -606,7 +611,8 @@
    "outputs": [],
    "source": [
     "from sklearn.kernel_ridge import KernelRidge\n",
-    "kr = KernelRidge(alpha=.001, kernel=\"rbf\", gamma=.05)"
+    "\n",
+    "kr = KernelRidge(alpha=0.001, kernel=\"rbf\", gamma=0.05)"
    ]
   },
   {
@@ -662,31 +668,33 @@
     "\n",
     "\n",
     "def plot_fit_quality(values_test, predicted):\n",
-    "    \n",
+    "\n",
     "    plt.figure(figsize=(12, 4))\n",
     "    plt.subplot(1, 2, 1)\n",
     "\n",
     "    x = np.arange(len(predicted))\n",
-    "    plt.scatter(x, predicted - values_test, color='steelblue', marker='o') \n",
+    "    plt.scatter(x, predicted - values_test, color=\"steelblue\", marker=\"o\")\n",
     "\n",
     "    plt.plot([0, len(predicted)], [0, 0], \"k:\")\n",
-    "    \n",
+    "\n",
     "    max_diff = np.max(np.abs(predicted - values_test))\n",
     "    plt.ylim([-max_diff, max_diff])\n",
-    "    \n",
+    "\n",
     "    plt.ylabel(\"error\")\n",
     "    plt.xlabel(\"sample id\")\n",
     "\n",
     "    plt.subplot(1, 2, 2)\n",
     "\n",
-    "    plt.scatter(x, (predicted - values_test) / values_test, color='steelblue', marker='o') \n",
+    "    plt.scatter(\n",
+    "        x, (predicted - values_test) / values_test, color=\"steelblue\", marker=\"o\"\n",
+    "    )\n",
     "    plt.plot([0, len(predicted)], [0, 0], \"k:\")\n",
-    "    plt.ylim([-.5, .5])\n",
-    "      \n",
+    "    plt.ylim([-0.5, 0.5])\n",
+    "\n",
     "    plt.ylabel(\"relative error\")\n",
     "    plt.xlabel(\"sample id\")\n",
     "\n",
-    "    \n",
+    "\n",
     "plot_fit_quality(values_test, predicted)"
    ]
   },
@@ -901,7 +909,19 @@
     "\n",
     "- `sklearn.tree.DecisionTreeRegressor` expands the concept of decision trees to regression [is documented here](https://scikit-learn.org/stable/modules/tree.html#regression).\n",
     "\n",
-    "\n"
+    "- `sklearn.linear_model.TweedieRegressor`, `sklearn.linear_model.PoissonRegressor`, `sklearn.linear_model.GammaRegressor` offer so-called *Generalized Linear Models* which might be of interest if your target values are bounded or discrete. This is for example the case when your target values are rates or probabilities. The [wikipedia article about generalized linear models](https://en.wikipedia.org/wiki/Generalized_linear_model#Intuition) gives a nice intuition and [this discussion](https://stats.stackexchange.com/questions/190763/how-to-decide-which-glm-family-to-use) provides a guide which GLM should be used when."
    ]
   },
   {
@@ -942,22 +950,24 @@
     }
    ],
    "source": [
-    "from sklearn.pipeline import make_pipeline\n",
-    "from sklearn.preprocessing import StandardScaler, PolynomialFeatures\n",
+    "from sklearn.decomposition import PCA\n",
     "from sklearn.kernel_ridge import KernelRidge\n",
     "from sklearn.linear_model import LinearRegression\n",
     "from sklearn.model_selection import cross_val_score\n",
-    "from sklearn.decomposition import PCA\n",
+    "from sklearn.pipeline import make_pipeline\n",
+    "from sklearn.preprocessing import PolynomialFeatures, StandardScaler\n",
     "\n",
     "\n",
     "def eval_regression(p, features, values):\n",
-    "    score = cross_val_score(p, features, values, scoring=\"neg_median_absolute_error\", cv=4).mean()\n",
+    "    score = cross_val_score(\n",
+    "        p, features, values, scoring=\"neg_median_absolute_error\", cv=4\n",
+    "    ).mean()\n",
     "    print(\"cross val score:\", score)\n",
-    "  \n",
+    "\n",
     "    predicted = p.fit(features_train, values_train).predict(features_test)\n",
     "    plot_fit_quality(values_test, predicted)\n",
     "\n",
-    "    \n",
+    "\n",
     "p = make_pipeline(PolynomialFeatures(2), PCA(2), LinearRegression())\n",
     "eval_regression(p, features, values)"
    ]
@@ -970,9 +980,10 @@
    "source": [
     "p = make_pipeline(PolynomialFeatures(), PCA(), LinearRegression())\n",
     "\n",
-    "param_grid = {'polynomialfeatures__degree': range(3, 6),\n",
-    "              'pca__n_components': range(3, 11),\n",
-    "             }"
+    "param_grid = {\n",
+    "    \"polynomialfeatures__degree\": range(3, 6),\n",
+    "    \"pca__n_components\": range(3, 11),\n",
+    "}"
    ]
   },
   {
@@ -1007,7 +1018,9 @@
    "source": [
     "from sklearn.model_selection import GridSearchCV\n",
     "\n",
-    "search = GridSearchCV(p, param_grid, scoring=\"neg_median_absolute_error\", cv=4, n_jobs=4)\n",
+    "search = GridSearchCV(\n",
+    "    p, param_grid, scoring=\"neg_median_absolute_error\", cv=4, n_jobs=4\n",
+    ")\n",
     "\n",
     "search.fit(features, values)\n",
     "\n",
@@ -1110,9 +1123,9 @@
     }
    ],
    "source": [
-    "import pandas as pd\n",
     "import matplotlib.pyplot as plt\n",
     "import numpy as np\n",
+    "import pandas as pd\n",
     "\n",
     "sales_data = pd.read_csv(\"data/sales.csv\")\n",
     "sales_data.head()"
@@ -1190,8 +1203,8 @@
     "    features = np.zeros((len(sales) - window_size, window_size))\n",
     "\n",
     "    for i in range(len(sales) - window_size):\n",
-    "        features[i] = sales[i: i + window_size]\n",
-    "        \n",
+    "        features[i] = sales[i : i + window_size]\n",
+    "\n",
     "    return features, sales[window_size:]"
    ]
   },
@@ -1210,11 +1223,12 @@
     "    assert np.all(X[0] == sales[:4])\n",
     "    assert np.all(X[1] == sales[1:5])\n",
     "    assert np.all(X[2] == sales[2:6])\n",
-    "    assert np.all(X[-1] == sales[-5: -1])\n",
-    "    \n",
+    "    assert np.all(X[-1] == sales[-5:-1])\n",
+    "\n",
     "    assert np.all(y[0] == sales[4])\n",
     "    assert np.all(y[-1] == sales[-1])\n",
-    "    \n",
+    "\n",
+    "\n",
     "test()"
    ]
   },
@@ -1234,11 +1248,10 @@
    "source": [
     "from sklearn.kernel_ridge import KernelRidge\n",
     "from sklearn.linear_model import Lasso\n",
+    "from sklearn.metrics import r2_score\n",
     "from sklearn.model_selection import GridSearchCV\n",
     "from sklearn.svm import SVR\n",
     "\n",
-    "from sklearn.metrics import r2_score\n",
-    "\n",
     "# ..."
    ]
   },
@@ -1298,54 +1311,62 @@
     }
    ],
    "source": [
-    "lasso_grid = {'alpha' : 10 ** np.linspace(-2, 3, 30)}\n",
-    "svr_grid = {'C': 10 ** np.linspace(-4, 2, 30)}\n",
-    "kernel_ridge_grid = {'alpha' : 10 ** np.linspace(-4, 3, 30)}\n",
+    "lasso_grid = {\"alpha\": 10 ** np.linspace(-2, 3, 30)}\n",
+    "svr_grid = {\"C\": 10 ** np.linspace(-4, 2, 30)}\n",
+    "kernel_ridge_grid = {\"alpha\": 10 ** np.linspace(-4, 3, 30)}\n",
     "\n",
     "WINDOW_SIZE = 36\n",
     "X, y = create_feature_matrix_and_target_values(sales, WINDOW_SIZE)\n",
     "\n",
+    "\n",
     "def main(X, y):\n",
-    "    \n",
+    "\n",
     "    regressors = []\n",
-    "    \n",
-    "    for regressor, param_grid in [(Lasso(), lasso_grid),\n",
-    "                           (SVR(), svr_grid),\n",
-    "                           (KernelRidge(kernel=\"rbf\"), kernel_ridge_grid)\n",
-    "                          ]:\n",
+    "\n",
+    "    for regressor, param_grid in [\n",
+    "        (Lasso(), lasso_grid),\n",
+    "        (SVR(), svr_grid),\n",
+    "        (KernelRidge(kernel=\"rbf\"), kernel_ridge_grid),\n",
+    "    ]:\n",
     "        search = GridSearchCV(regressor, param_grid, scoring=\"r2\", cv=5)\n",
     "        search.fit(X, y)\n",
-    "        \n",
+    "\n",
     "        # we predict on the learning data set to get a general\n",
     "        # \"feeling\" how well the regressors work\n",
     "        predicted = search.predict(X)\n",
-    "        \n",
+    "\n",
     "        plot_regression(regressor.__class__.__qualname__, predicted)\n",
-    "        \n",
+    "\n",
     "        regressors.append(search)\n",
     "\n",
     "    return regressors\n",
     "\n",
-    "        \n",
+    "\n",
     "def plot_regression(title, predicted):\n",
     "    plt.figure(figsize=(14, 4.5))\n",
     "    plt.suptitle(title)\n",
     "    plt.subplot(1, 2, 1)\n",
-    "    \n",
-    "    plt.plot(months, sales, label='sales', color=\"steelblue\")\n",
-    "    plt.plot(months[WINDOW_SIZE:], predicted, color=\"chocolate\", linestyle=\":\", label='predicted');\n",
+    "\n",
+    "    plt.plot(months, sales, label=\"sales\", color=\"steelblue\")\n",
+    "    plt.plot(\n",
+    "        months[WINDOW_SIZE:],\n",
+    "        predicted,\n",
+    "        color=\"chocolate\",\n",
+    "        linestyle=\":\",\n",
+    "        label=\"predicted\",\n",
+    "    )\n",
     "    plt.legend()\n",
-    "    \n",
+    "\n",
     "    plt.subplot(1, 2, 2)\n",
     "    plt.scatter(sales[WINDOW_SIZE:], predicted, color=\"steelblue\")\n",
     "    r2 = r2_score(sales[WINDOW_SIZE:], predicted)\n",
-    "    \n",
+    "\n",
     "    plt.title(\"r2 = {:.3f}\".format(r2))\n",
     "    plt.plot([0, 3], [0, 3], \"k:\")\n",
-    "    plt.xlabel('sales')\n",
-    "    plt.ylabel('predicted')\n",
+    "    plt.xlabel(\"sales\")\n",
+    "    plt.ylabel(\"predicted\")\n",
+    "\n",
     "\n",
-    "    \n",
     "regressors = main(X, y)"
    ]
   },
@@ -1440,13 +1461,14 @@
     "# start from line 96 in our feature matrix:\n",
     "NLAST = 96\n",
     "\n",
+    "\n",
     "def forecast(X, y, regressor, n_last):\n",
-    "    \n",
+    "\n",
     "    # we crate a copy because we change below content\n",
     "    # current_window in place. Without copy we would\n",
     "    # also change X!\n",
-    "    current_window = X[-n_last, :].copy() \n",
-    "   \n",
+    "    current_window = X[-n_last, :].copy()\n",
+    "\n",
     "    predicted = []\n",
     "\n",
     "    for k in range(n_last):\n",
@@ -1456,7 +1478,7 @@
     "        # modify the window data in place:\n",
     "        current_window[:-1] = current_window[1:]\n",
     "        current_window[-1] = new\n",
-    "        \n",
+    "\n",
     "    return np.array(predicted).flatten(), y[-n_last:]\n",
     "\n",
     "\n",
@@ -1465,29 +1487,29 @@
     "\n",
     "    x_axis = list(range(len(y)))[-n_last:]\n",
     "\n",
-    "    plt.figure(figsize=(14, 5))    \n",
-    "    plt.subplot(1, 2, 1)    \n",
-    "    plt.plot(x_axis, predicted, color=\"steelblue\", label='predicted')\n",
-    "    plt.plot(x_axis, correct, color=\"chocolate\", linestyle=\":\", label='true');\n",
-    "    plt.legend();\n",
+    "    plt.figure(figsize=(14, 5))\n",
+    "    plt.subplot(1, 2, 1)\n",
+    "    plt.plot(x_axis, predicted, color=\"steelblue\", label=\"predicted\")\n",
+    "    plt.plot(x_axis, correct, color=\"chocolate\", linestyle=\":\", label=\"true\")\n",
+    "    plt.legend()\n",
     "    plt.xlabel(\"month\")\n",
     "    plt.ylabel(\"sales\")\n",
     "\n",
     "    plt.subplot(1, 2, 2)\n",
-    "    plt.scatter(correct, predicted, color='steelblue')\n",
-    "    \n",
+    "    plt.scatter(correct, predicted, color=\"steelblue\")\n",
+    "\n",
     "    r2 = r2_score(correct, predicted)\n",
     "    plt.title(\"r2 = {:.3f}\".format(r2))\n",
     "    plt.xlabel(\"sales\")\n",
     "    plt.ylabel(\"predicted\")\n",
     "\n",
     "    mi, ma = np.min(correct), np.max(correct)\n",
-    "    plt.plot([mi, ma], [mi, ma], 'k:');\n",
+    "    plt.plot([mi, ma], [mi, ma], \"k:\")\n",
     "    plt.suptitle(regressor.estimator.__class__.__qualname__)\n",
-    "    \n",
-    "    \n",
+    "\n",
+    "\n",
     "for regressor in regressors:\n",
-    "    forecast_and_plot(regressor, NLAST)  "
+    "    forecast_and_plot(regressor, NLAST)"
    ]
   },
   {
@@ -1515,7 +1537,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.9.1"
+   "version": "3.7.7"
   },
   "latex_envs": {
    "LaTeX_envs_menu_present": true,
-- 
GitLab