From 9e0f38ae37b542c8921e34137d49a243b43ba371 Mon Sep 17 00:00:00 2001
From: Uwe Schmitt <uwe.schmitt@id.ethz.ch>
Date: Thu, 25 Feb 2021 18:44:04 +0100
Subject: [PATCH] added generalized linear models

---
 07_regression.ipynb | 168 +++++++++++++++++++++++++-------------------
 1 file changed, 95 insertions(+), 73 deletions(-)

diff --git a/07_regression.ipynb b/07_regression.ipynb
index afd65a3..a9e80d1 100644
--- a/07_regression.ipynb
+++ b/07_regression.ipynb
@@ -150,14 +150,18 @@
    ],
    "source": [
     "# IGNORE THIS CELL WHICH CUSTOMIZES LAYOUT AND STYLING OF THE NOTEBOOK !\n",
-    "import matplotlib.pyplot as plt\n",
     "%matplotlib inline\n",
     "%config InlineBackend.figure_format = 'retina'\n",
     "import warnings\n",
-    "warnings.filterwarnings('ignore', category=FutureWarning)\n",
-    "warnings.filterwarnings('ignore', category=DeprecationWarning)\n",
+    "\n",
+    "import matplotlib.pyplot as plt\n",
+    "\n",
+    "warnings.filterwarnings(\"ignore\", category=FutureWarning)\n",
+    "warnings.filterwarnings(\"ignore\", category=DeprecationWarning)\n",
     "warnings.filterwarnings = lambda *a, **kw: None\n",
-    "from IPython.core.display import HTML; HTML(open(\"custom.html\", \"r\").read())"
+    "from IPython.core.display import HTML\n",
+    "\n",
+    "HTML(open(\"custom.html\", \"r\").read())"
    ]
   },
   {
@@ -409,6 +413,7 @@
    ],
    "source": [
     "import seaborn as sns\n",
+    "\n",
     "sns.set(style=\"ticks\")\n",
     "\n",
     "sns.pairplot(df, hue=\"kind\", diag_kind=\"hist\");"
    ]
   },
   {
@@ -451,7 +456,7 @@
     "\n",
     "# needs 2d data structure, features.iloc[:, 2] has dimension 1\n",
     "encoder = OneHotEncoder(sparse=False)\n",
-    "one_hot = encoder.fit_transform(features.iloc[:, 2: 3]) \n",
+    "one_hot = encoder.fit_transform(features.iloc[:, 2:3])\n",
     "\n",
     "one_hot[:5, :]"
    ]
   },
   {
@@ -587,9 +592,9 @@
    "source": [
     "from sklearn.model_selection import train_test_split\n",
     "\n",
-    "(features_train, features_test, \n",
-    " values_train, \n",
-    " values_test) = train_test_split(features, values, random_state=42)"
+    "(features_train, features_test, values_train, values_test) = train_test_split(\n",
+    "    features, values, random_state=42\n",
+    ")"
    ]
   },
   {
@@ -606,7 +611,8 @@
    "outputs": [],
    "source": [
     "from sklearn.kernel_ridge import KernelRidge\n",
-    "kr = KernelRidge(alpha=.001, kernel=\"rbf\", gamma=.05)"
+    "\n",
+    "kr = KernelRidge(alpha=0.001, kernel=\"rbf\", gamma=0.05)"
    ]
   },
   {
@@ -662,31 +668,33 @@
     "\n",
     "\n",
     "def plot_fit_quality(values_test, predicted):\n",
-    "    \n",
+    "\n",
     "    plt.figure(figsize=(12, 4))\n",
     "    plt.subplot(1, 2, 1)\n",
     "\n",
     "    x = np.arange(len(predicted))\n",
-    "    plt.scatter(x, predicted - values_test, color='steelblue', marker='o') \n",
+    "    plt.scatter(x, predicted - values_test, color=\"steelblue\", marker=\"o\")\n",
     "\n",
     "    plt.plot([0, len(predicted)], [0, 0], \"k:\")\n",
-    "    \n",
+    "\n",
     "    max_diff = np.max(np.abs(predicted - values_test))\n",
     "    plt.ylim([-max_diff, max_diff])\n",
-    "    \n",
+    "\n",
     "    plt.ylabel(\"error\")\n",
     "    plt.xlabel(\"sample id\")\n",
     "\n",
     "    plt.subplot(1, 2, 2)\n",
     "\n",
-    "    plt.scatter(x, (predicted - values_test) / values_test, color='steelblue', marker='o') \n",
+    "    plt.scatter(\n",
+    "        x, (predicted - values_test) / values_test, color=\"steelblue\", marker=\"o\"\n",
+    "    )\n",
     "    plt.plot([0, len(predicted)], [0, 0], \"k:\")\n",
-    "    plt.ylim([-.5, .5])\n",
-    "    \n",
+    "    plt.ylim([-0.5, 0.5])\n",
+    "\n",
     "    plt.ylabel(\"relative error\")\n",
     "    plt.xlabel(\"sample id\")\n",
     "\n",
-    "    \n",
+    "\n",
     "plot_fit_quality(values_test, predicted)"
    ]
   },
   {
@@ -901,7 +909,7 @@
     "\n",
     "- `sklearn.tree.DecisionTreeRegressor` expands the concept of decision trees to regression and [is documented here](https://scikit-learn.org/stable/modules/tree.html#regression).\n",
     "\n",
-    "\n"
+    "- `sklearn.linear_model.TweedieRegressor`, `sklearn.linear_model.PoissonRegressor` and `sklearn.linear_model.GammaRegressor` offer so-called *Generalized Linear Models*, which might be of interest if your target values are bounded or discrete. This is, for example, the case when your target values are rates or probabilities. The [Wikipedia article about generalized linear models](https://en.wikipedia.org/wiki/Generalized_linear_model#Intuition) gives a nice intuition, and [this discussion](https://stats.stackexchange.com/questions/190763/how-to-decide-which-glm-family-to-use) provides guidance on which GLM to use when."
    ]
   },
   {
@@ -942,22 +950,24 @@
     }
    ],
    "source": [
-    "from sklearn.pipeline import make_pipeline\n",
-    "from sklearn.preprocessing import StandardScaler, PolynomialFeatures\n",
+    "from sklearn.decomposition import PCA\n",
     "from sklearn.kernel_ridge import KernelRidge\n",
     "from sklearn.linear_model import LinearRegression\n",
     "from sklearn.model_selection import cross_val_score\n",
-    "from sklearn.decomposition import PCA\n",
+    "from sklearn.pipeline import make_pipeline\n",
+    "from sklearn.preprocessing import PolynomialFeatures, StandardScaler\n",
     "\n",
     "\n",
     "def eval_regression(p, features, values):\n",
-    "    score = cross_val_score(p, features, values, scoring=\"neg_median_absolute_error\", cv=4).mean()\n",
+    "    score = cross_val_score(\n",
+    "        p, features, values, scoring=\"neg_median_absolute_error\", cv=4\n",
+    "    ).mean()\n",
     "    print(\"cross val score:\", score)\n",
-    "    \n",
+    "\n",
     "    predicted = p.fit(features_train, values_train).predict(features_test)\n",
     "    plot_fit_quality(values_test, predicted)\n",
     "\n",
-    "    \n",
+    "\n",
     "p = make_pipeline(PolynomialFeatures(2), PCA(2), LinearRegression())\n",
     "eval_regression(p, features, values)"
    ]
   },
   {
@@ -970,9 +980,10 @@
    "source": [
     "p = make_pipeline(PolynomialFeatures(), PCA(), LinearRegression())\n",
     "\n",
-    "param_grid = {'polynomialfeatures__degree': range(3, 6),\n",
-    "              'pca__n_components': range(3, 11),\n",
-    "             }"
+    "param_grid = {\n",
+    "    \"polynomialfeatures__degree\": range(3, 6),\n",
+    "    \"pca__n_components\": range(3, 11),\n",
+    "}"
    ]
   },
   {
@@ -1007,7 +1018,9 @@
    "source": [
     "from sklearn.model_selection import GridSearchCV\n",
     "\n",
-    "search = GridSearchCV(p, param_grid, scoring=\"neg_median_absolute_error\", cv=4, n_jobs=4)\n",
+    "search = GridSearchCV(\n",
+    "    p, param_grid, scoring=\"neg_median_absolute_error\", cv=4, n_jobs=4\n",
+    ")\n",
     "\n",
     "search.fit(features, values)\n",
     "\n",
@@ -1110,9 +1123,9 @@
     }
    ],
    "source": [
-    "import pandas as pd\n",
     "import matplotlib.pyplot as plt\n",
     "import numpy as np\n",
+    "import pandas as pd\n",
     "\n",
     "sales_data = pd.read_csv(\"data/sales.csv\")\n",
     "sales_data.head()"
    ]
   },
   {
@@ -1190,8 +1203,8 @@
     "    features = np.zeros((len(sales) - window_size, window_size))\n",
     "\n",
     "    for i in range(len(sales) - window_size):\n",
-    "        features[i] = sales[i: i + window_size]\n",
-    "    \n",
+    "        features[i] = sales[i : i + window_size]\n",
+    "\n",
     "    return features, sales[window_size:]"
    ]
   },
   {
@@ -1210,11 +1223,12 @@
     "    assert np.all(X[0] == sales[:4])\n",
     "    assert np.all(X[1] == sales[1:5])\n",
     "    assert np.all(X[2] == sales[2:6])\n",
-    "    assert np.all(X[-1] == sales[-5: -1])\n",
-    "    \n",
+    "    assert np.all(X[-1] == sales[-5:-1])\n",
+    "\n",
     "    assert np.all(y[0] == sales[4])\n",
     "    assert np.all(y[-1] == sales[-1])\n",
-    "    \n",
+    "\n",
+    "\n",
     "test()"
    ]
   },
   {
@@ -1234,11 +1248,10 @@
    "source": [
     "from sklearn.kernel_ridge import KernelRidge\n",
     "from sklearn.linear_model import Lasso\n",
+    "from sklearn.metrics import r2_score\n",
     "from sklearn.model_selection import GridSearchCV\n",
     "from sklearn.svm import SVR\n",
     "\n",
-    "from sklearn.metrics import r2_score\n",
-    "\n",
     "# ..."
    ]
   },
   {
@@ -1298,54 +1311,62 @@
-    "lasso_grid = {'alpha' : 10 ** np.linspace(-2, 3, 30)}\n",
-    "svr_grid = {'C': 10 ** np.linspace(-4, 2, 30)}\n",
-    "kernel_ridge_grid = {'alpha' : 10 ** np.linspace(-4, 3, 30)}\n",
+    "lasso_grid = {\"alpha\": 10 ** np.linspace(-2, 3, 30)}\n",
+    "svr_grid = {\"C\": 10 ** np.linspace(-4, 2, 30)}\n",
+    "kernel_ridge_grid = {\"alpha\": 10 ** np.linspace(-4, 3, 30)}\n",
     "\n",
     "WINDOW_SIZE = 36\n",
     "X, y = create_feature_matrix_and_target_values(sales, WINDOW_SIZE)\n",
     "\n",
+    "\n",
     "def main(X, y):\n",
-    "    \n",
+    "\n",
     "    regressors = []\n",
-    "    \n",
-    "    for regressor, param_grid in [(Lasso(), lasso_grid),\n",
-    "                                   (SVR(), svr_grid),\n",
-    "                                   (KernelRidge(kernel=\"rbf\"), kernel_ridge_grid)\n",
-    "                                  ]:\n",
+    "\n",
+    "    for regressor, param_grid in [\n",
+    "        (Lasso(), lasso_grid),\n",
+    "        (SVR(), svr_grid),\n",
+    "        (KernelRidge(kernel=\"rbf\"), kernel_ridge_grid),\n",
+    "    ]:\n",
     "        search = GridSearchCV(regressor, param_grid, scoring=\"r2\", cv=5)\n",
     "        search.fit(X, y)\n",
-    "        \n",
+    "\n",
     "        # we predict on the learning data set to get a general\n",
     "        # \"feeling\" for how well the regressors work\n",
     "        predicted = search.predict(X)\n",
-    "        \n",
+    "\n",
     "        plot_regression(regressor.__class__.__qualname__, predicted)\n",
-    "        \n",
+    "\n",
     "        regressors.append(search)\n",
     "\n",
     "    return regressors\n",
     "\n",
-    "    \n",
+    "\n",
     "def plot_regression(title, predicted):\n",
     "    plt.figure(figsize=(14, 4.5))\n",
     "    plt.suptitle(title)\n",
     "    plt.subplot(1, 2, 1)\n",
-    "    \n",
-    "    plt.plot(months, sales, label='sales', color=\"steelblue\")\n",
-    "    plt.plot(months[WINDOW_SIZE:], predicted, color=\"chocolate\", linestyle=\":\", label='predicted');\n",
+    "\n",
+    "    plt.plot(months, sales, label=\"sales\", color=\"steelblue\")\n",
+    "    plt.plot(\n",
+    "        months[WINDOW_SIZE:],\n",
+    "        predicted,\n",
+    "        color=\"chocolate\",\n",
+    "        linestyle=\":\",\n",
+    "        label=\"predicted\",\n",
+    "    )\n",
     "    plt.legend()\n",
-    "    \n",
+    "\n",
     "    plt.subplot(1, 2, 2)\n",
     "    plt.scatter(sales[WINDOW_SIZE:], predicted, color=\"steelblue\")\n",
     "    r2 = r2_score(sales[WINDOW_SIZE:], predicted)\n",
-    "    \n",
+    "\n",
     "    plt.title(\"r2 = {:.3f}\".format(r2))\n",
     "    plt.plot([0, 3], [0, 3], \"k:\")\n",
-    "    plt.xlabel('sales')\n",
-    "    plt.ylabel('predicted')\n",
+    "    plt.xlabel(\"sales\")\n",
+    "    plt.ylabel(\"predicted\")\n",
+    "\n",
     "\n",
-    "    \n",
     "regressors = main(X, y)"
    ]
   },
   {
@@ -1440,13 +1461,14 @@
     "# start from line 96 in our feature matrix:\n",
     "NLAST = 96\n",
     "\n",
+    "\n",
     "def forecast(X, y, regressor, n_last):\n",
-    "    \n",
+    "\n",
     "    # we create a copy because below we modify\n",
     "    # current_window in place. Without the copy we would\n",
     "    # also change X!\n",
-    "    current_window = X[-n_last, :].copy() \n",
-    "    \n",
+    "    current_window = X[-n_last, :].copy()\n",
+    "\n",
     "    predicted = []\n",
     "\n",
     "    for k in range(n_last):\n",
@@ -1456,7 +1478,7 @@
     "        # modify the window data in place:\n",
     "        current_window[:-1] = current_window[1:]\n",
     "        current_window[-1] = new\n",
-    "    \n",
+    "\n",
     "    return np.array(predicted).flatten(), y[-n_last:]\n",
     "\n",
     "\n",
@@ -1465,29 +1487,29 @@
     "\n",
     "    x_axis = list(range(len(y)))[-n_last:]\n",
     "\n",
-    "    plt.figure(figsize=(14, 5)) \n",
-    "    plt.subplot(1, 2, 1) \n",
-    "    plt.plot(x_axis, predicted, color=\"steelblue\", label='predicted')\n",
-    "    plt.plot(x_axis, correct, color=\"chocolate\", linestyle=\":\", label='true');\n",
-    "    plt.legend();\n",
+    "    plt.figure(figsize=(14, 5))\n",
+    "    plt.subplot(1, 2, 1)\n",
+    "    plt.plot(x_axis, predicted, color=\"steelblue\", label=\"predicted\")\n",
+    "    plt.plot(x_axis, correct, color=\"chocolate\", linestyle=\":\", label=\"true\")\n",
+    "    plt.legend()\n",
     "    plt.xlabel(\"month\")\n",
     "    plt.ylabel(\"sales\")\n",
     "\n",
     "    plt.subplot(1, 2, 2)\n",
-    "    plt.scatter(correct, predicted, color='steelblue')\n",
-    "    \n",
+    "    plt.scatter(correct, predicted, color=\"steelblue\")\n",
+    "\n",
     "    r2 = r2_score(correct, predicted)\n",
     "    plt.title(\"r2 = {:.3f}\".format(r2))\n",
     "    plt.xlabel(\"sales\")\n",
     "    plt.ylabel(\"predicted\")\n",
     "\n",
     "    mi, ma = np.min(correct), np.max(correct)\n",
-    "    plt.plot([mi, ma], [mi, ma], 'k:');\n",
+    "    plt.plot([mi, ma], [mi, ma], \"k:\")\n",
     "    plt.suptitle(regressor.estimator.__class__.__qualname__)\n",
-    "    \n",
-    "    \n",
+    "\n",
+    "\n",
     "for regressor in regressors:\n",
-    "    forecast_and_plot(regressor, NLAST) "
+    "    forecast_and_plot(regressor, NLAST)"
    ]
   },
   {
@@ -1515,7 +1537,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.9.1"
+   "version": "3.7.7"
   },
   "latex_envs": {
    "LaTeX_envs_menu_present": true,
--
GitLab
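
Note on the generalized linear models added by this patch: the `TweedieRegressor` / `PoissonRegressor` / `GammaRegressor` bullet can be tried out in a few lines. The sketch below is illustrative only and is not part of the notebook diff; it assumes scikit-learn >= 0.23 (the release that introduced these estimators) and uses synthetic count data in place of the notebook's data files.

# Hedged sketch, not part of the diff above: fit a Poisson GLM on synthetic
# count data and compare it with ordinary least squares.
# Assumes scikit-learn >= 0.23.
import numpy as np
from sklearn.linear_model import LinearRegression, PoissonRegressor
from sklearn.model_selection import train_test_split

rng = np.random.default_rng(42)

# counts whose expected value depends log-linearly on two features:
X = rng.uniform(0, 2, size=(500, 2))
y = rng.poisson(lam=np.exp(0.4 * X[:, 0] + 0.8 * X[:, 1]))

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# the Poisson GLM models log(E[y]) as a linear function of the features,
# so its predictions are non-negative by construction -- a natural fit for
# counts and rates, which ordinary least squares does not guarantee:
glm = PoissonRegressor(alpha=1e-3).fit(X_train, y_train)
ols = LinearRegression().fit(X_train, y_train)

print("Poisson GLM test score (D^2):", glm.score(X_test, y_test))
print("OLS test score (R^2):        ", ols.score(X_test, y_test))

`TweedieRegressor(power=...)` generalizes this family: `power=1` corresponds to the Poisson case and `power=2` to the Gamma case, which makes it a reasonable starting point when the appropriate GLM family is unclear.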