diff --git a/07_regression.ipynb b/07_regression.ipynb
index afd65a3e5fda810e2f93c2f5eefdbba0f2827a7f..7c0455ea14452505e5cd1b64f9a2d4e2f123477e 100644
--- a/07_regression.ipynb
+++ b/07_regression.ipynb
@@ -150,14 +150,18 @@
    ],
    "source": [
     "# IGNORE THIS CELL WHICH CUSTOMIZES LAYOUT AND STYLING OF THE NOTEBOOK !\n",
-    "import matplotlib.pyplot as plt\n",
     "%matplotlib inline\n",
     "%config InlineBackend.figure_format = 'retina'\n",
     "import warnings\n",
-    "warnings.filterwarnings('ignore', category=FutureWarning)\n",
-    "warnings.filterwarnings('ignore', category=DeprecationWarning)\n",
+    "\n",
+    "import matplotlib.pyplot as plt\n",
+    "\n",
+    "warnings.filterwarnings(\"ignore\", category=FutureWarning)\n",
+    "warnings.filterwarnings(\"ignore\", category=DeprecationWarning)\n",
     "warnings.filterwarnings = lambda *a, **kw: None\n",
-    "from IPython.core.display import HTML; HTML(open(\"custom.html\", \"r\").read())"
+    "from IPython.core.display import HTML\n",
+    "\n",
+    "HTML(open(\"custom.html\", \"r\").read())"
   ]
  },
  {
@@ -409,6 +413,7 @@
    ],
    "source": [
     "import seaborn as sns\n",
+    "\n",
     "sns.set(style=\"ticks\")\n",
     "\n",
     "sns.pairplot(df, hue=\"kind\", diag_kind=\"hist\");"
   ]
  },
  {
@@ -451,7 +456,7 @@
     "\n",
     "# needs 2d data structure, features.iloc[2] has dimension 1\n",
     "encoder = OneHotEncoder(sparse=False)\n",
-    "one_hot = encoder.fit_transform(features.iloc[:, 2: 3]) \n",
+    "one_hot = encoder.fit_transform(features.iloc[:, 2:3])\n",
     "\n",
     "one_hot[:5, :]"
   ]
  },
  {
@@ -587,9 +592,9 @@
   "source": [
     "from sklearn.model_selection import train_test_split\n",
     "\n",
-    "(features_train, features_test, \n",
-    " values_train, \n",
-    " values_test) = train_test_split(features, values, random_state=42)"
+    "(features_train, features_test, values_train, values_test) = train_test_split(\n",
+    "    features, values, random_state=42\n",
+    ")"
   ]
  },
  {
@@ -606,7 +611,8 @@
   "outputs": [],
   "source": [
     "from sklearn.kernel_ridge import KernelRidge\n",
-    "kr = KernelRidge(alpha=.001, kernel=\"rbf\", gamma=.05)"
+    "\n",
+    "kr = KernelRidge(alpha=0.001, kernel=\"rbf\", gamma=0.05)"
   ]
  },
  {
@@ -662,31 +668,33 @@
     "\n",
     "\n",
     "def plot_fit_quality(values_test, predicted):\n",
-    "    \n",
+    "\n",
     "    plt.figure(figsize=(12, 4))\n",
     "    plt.subplot(1, 2, 1)\n",
     "\n",
     "    x = np.arange(len(predicted))\n",
-    "    plt.scatter(x, predicted - values_test, color='steelblue', marker='o') \n",
+    "    plt.scatter(x, predicted - values_test, color=\"steelblue\", marker=\"o\")\n",
     "\n",
     "    plt.plot([0, len(predicted)], [0, 0], \"k:\")\n",
-    "    \n",
+    "\n",
     "    max_diff = np.max(np.abs(predicted - values_test))\n",
     "    plt.ylim([-max_diff, max_diff])\n",
-    "    \n",
+    "\n",
     "    plt.ylabel(\"error\")\n",
     "    plt.xlabel(\"sample id\")\n",
     "\n",
     "    plt.subplot(1, 2, 2)\n",
     "\n",
-    "    plt.scatter(x, (predicted - values_test) / values_test, color='steelblue', marker='o') \n",
+    "    plt.scatter(\n",
+    "        x, (predicted - values_test) / values_test, color=\"steelblue\", marker=\"o\"\n",
+    "    )\n",
     "    plt.plot([0, len(predicted)], [0, 0], \"k:\")\n",
-    "    plt.ylim([-.5, .5])\n",
-    "    \n",
+    "    plt.ylim([-0.5, 0.5])\n",
+    "\n",
     "    plt.ylabel(\"relative error\")\n",
     "    plt.xlabel(\"sample id\")\n",
     "\n",
-    "    \n",
+    "\n",
     "plot_fit_quality(values_test, predicted)"
   ]
  },
  {
@@ -885,15 +893,16 @@
   "cell_type": "markdown",
   "metadata": {},
   "source": [
-    "## Some algorithms from sklearn\n",
+    "## Some algorithms from `scikit-learn`\n",
     "\n",
-    "- `sklearn.linear_model.LinearRegression` is a linear regression method, which only works well for target values which can be described as a linear combination of feature values.\n",
+    "- `sklearn.linear_model.LinearRegression` is a linear regression method, which only works well for target values that can be described as a linear combination of the feature values. This is also known as the linear least squares method.\n",
     "\n",
+    "- `sklearn.linear_model.Ridge`, `sklearn.linear_model.Lasso`, and `sklearn.linear_model.ElasticNet` are linear regression methods with `L2`, `L1`, and `L2 + L1` regularization terms, respectively, to avoid overfitting and to improve generalization.\n",
     "\n",
-    "- `sklearn.kernel_ridge.KernelRidge` is [documented here](https://scikit-learn.org/stable/modules/kernel_ridge.html#kernel-ridge). It combines the kernel trick from SVMs with classical least squares regression.\n",
+    "- `sklearn.kernel_ridge.KernelRidge` is [documented here](https://scikit-learn.org/stable/modules/kernel_ridge.html#kernel-ridge). It combines the kernel trick from SVMs with ridge regression.\n",
     "\n",
     "\n",
-    "- `sklearn.svm.SVR` is an extension of support vector classification concept to regression, [you find examples here](https://scikit-learn.org/stable/modules/svm.html#svm-regression)\n",
+    "- `sklearn.svm.SVR` is an extension of the support vector classification concept to regression, including the kernel trick for non-linear regression; [you can find examples here](https://scikit-learn.org/stable/modules/svm.html#svm-regression).\n",
     "\n",
     "\n",
     "- `sklearn.neighbors.KNeighborsRegressor` extends the idea of nearest neighbour classification to regression: search for similar data points in the learning data set and compute the predicted value from the values in the neighbourhood, e.g. by averaging or by linear interpolation. [Documentation is available here](https://scikit-learn.org/stable/modules/neighbors.html#regression)\n",
     "\n",
     "\n",
     "- `sklearn.tree.DecisionTreeRegressor` expands the concept of decision trees to regression and [is documented here](https://scikit-learn.org/stable/modules/tree.html#regression).\n",
     "\n",
-    "\n"
+    "- `sklearn.linear_model.TweedieRegressor`, `sklearn.linear_model.PoissonRegressor`, and `sklearn.linear_model.GammaRegressor` offer so-called *Generalized Linear Models* (**GLM**).\n",
+    "  - These models are usually of interest when your target values are event-based discrete counts/frequencies, or continuous amounts, durations, costs/prices, or rates/probabilities. The [scikit-learn GLM tutorial](https://scikit-learn.org/stable/modules/linear_model.html#generalized-linear-regression) provides formal insight as well as tips for choosing a GLM, with some use case examples.\n",
+    "  - Beyond that, the [Wikipedia article about generalized linear models](https://en.wikipedia.org/wiki/Generalized_linear_model#Intuition) gives a nice intuition, and [this discussion](https://stats.stackexchange.com/questions/190763/how-to-decide-which-glm-family-to-use) provides a guide to which GLM should be used when.\n",
+    "  - For assessment and hyperparameter optimization of such Generalized Linear Models you should also use [suitable metrics from scikit-learn](https://scikit-learn.org/stable/modules/model_evaluation.html#mean-poisson-gamma-and-tweedie-deviances); a quick usage sketch follows below.\n",
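+    "\n",
+    "A minimal usage sketch (illustrative only: the data below is synthetic, and `PoissonRegressor` assumes `scikit-learn >= 0.23`) of fitting one of these GLMs to count-valued targets:\n",
+    "\n",
+    "```python\n",
+    "import numpy as np\n",
+    "from sklearn.linear_model import PoissonRegressor\n",
+    "\n",
+    "rng = np.random.RandomState(42)\n",
+    "X = rng.uniform(0, 2, size=(200, 3))\n",
+    "# synthetic counts whose rate depends log-linearly on the features\n",
+    "y = rng.poisson(np.exp(X @ np.array([0.5, -0.2, 0.3])))\n",
+    "\n",
+    "glm = PoissonRegressor().fit(X, y)\n",
+    "# note: .score of the GLM regressors is D^2 (fraction of deviance\n",
+    "# explained), one of the deviance-based metrics linked above\n",
+    "print(glm.score(X, y))\n",
+    "```"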
   ]
  },
  {
@@ -942,22 +954,24 @@
    }
   ],
   "source": [
-    "from sklearn.pipeline import make_pipeline\n",
-    "from sklearn.preprocessing import StandardScaler, PolynomialFeatures\n",
+    "from sklearn.decomposition import PCA\n",
     "from sklearn.kernel_ridge import KernelRidge\n",
     "from sklearn.linear_model import LinearRegression\n",
     "from sklearn.model_selection import cross_val_score\n",
-    "from sklearn.decomposition import PCA\n",
+    "from sklearn.pipeline import make_pipeline\n",
+    "from sklearn.preprocessing import PolynomialFeatures, StandardScaler\n",
     "\n",
     "\n",
     "def eval_regression(p, features, values):\n",
-    "    score = cross_val_score(p, features, values, scoring=\"neg_median_absolute_error\", cv=4).mean()\n",
+    "    score = cross_val_score(\n",
+    "        p, features, values, scoring=\"neg_median_absolute_error\", cv=4\n",
+    "    ).mean()\n",
     "    print(\"cross val score:\", score)\n",
-    "    \n",
+    "\n",
     "    predicted = p.fit(features_train, values_train).predict(features_test)\n",
     "    plot_fit_quality(values_test, predicted)\n",
     "\n",
-    "    \n",
+    "\n",
     "p = make_pipeline(PolynomialFeatures(2), PCA(2), LinearRegression())\n",
     "eval_regression(p, features, values)"
   ]
  },
  {
@@ -970,9 +984,10 @@
   "source": [
     "p = make_pipeline(PolynomialFeatures(), PCA(), LinearRegression())\n",
     "\n",
-    "param_grid = {'polynomialfeatures__degree': range(3, 6),\n",
-    "              'pca__n_components': range(3, 11),\n",
-    "              }"
+    "param_grid = {\n",
+    "    \"polynomialfeatures__degree\": range(3, 6),\n",
+    "    \"pca__n_components\": range(3, 11),\n",
+    "}"
   ]
  },
  {
@@ -1007,7 +1022,9 @@
   "source": [
     "from sklearn.model_selection import GridSearchCV\n",
     "\n",
-    "search = GridSearchCV(p, param_grid, scoring=\"neg_median_absolute_error\", cv=4, n_jobs=4)\n",
+    "search = GridSearchCV(\n",
+    "    p, param_grid, scoring=\"neg_median_absolute_error\", cv=4, n_jobs=4\n",
+    ")\n",
     "\n",
     "search.fit(features, values)\n",
     "\n",
@@ -1110,9 +1127,9 @@
    }
   ],
   "source": [
-    "import pandas as pd\n",
     "import matplotlib.pyplot as plt\n",
     "import numpy as np\n",
+    "import pandas as pd\n",
     "\n",
     "sales_data = pd.read_csv(\"data/sales.csv\")\n",
     "sales_data.head()"
   ]
  },
@@ -1190,8 +1207,8 @@
     "    features = np.zeros((len(sales) - window_size, window_size))\n",
     "\n",
     "    for i in range(len(sales) - window_size):\n",
-    "        features[i] = sales[i: i + window_size]\n",
-    "    \n",
+    "        features[i] = sales[i : i + window_size]\n",
+    "\n",
     "    return features, sales[window_size:]"
   ]
  },
@@ -1210,11 +1227,12 @@
     "    assert np.all(X[0] == sales[:4])\n",
     "    assert np.all(X[1] == sales[1:5])\n",
     "    assert np.all(X[2] == sales[2:6])\n",
-    "    assert np.all(X[-1] == sales[-5: -1])\n",
-    "    \n",
+    "    assert np.all(X[-1] == sales[-5:-1])\n",
+    "\n",
     "    assert np.all(y[0] == sales[4])\n",
     "    assert np.all(y[-1] == sales[-1])\n",
-    "    \n",
+    "\n",
+    "\n",
     "test()"
   ]
  },
  {
@@ -1234,11 +1252,10 @@
   "source": [
     "from sklearn.kernel_ridge import KernelRidge\n",
     "from sklearn.linear_model import Lasso\n",
+    "from sklearn.metrics import r2_score\n",
     "from sklearn.model_selection import GridSearchCV\n",
     "from sklearn.svm import SVR\n",
     "\n",
-    "from sklearn.metrics import r2_score\n",
-    "\n",
     "# ..."
   ]
  },
@@ -1298,54 +1315,62 @@
    }
   ],
   "source": [
-    "lasso_grid = {'alpha' : 10 ** np.linspace(-2, 3, 30)}\n",
-    "svr_grid = {'C': 10 ** np.linspace(-4, 2, 30)}\n",
-    "kernel_ridge_grid = {'alpha' : 10 ** np.linspace(-4, 3, 30)}\n",
+    "lasso_grid = {\"alpha\": 10 ** np.linspace(-2, 3, 30)}\n",
+    "svr_grid = {\"C\": 10 ** np.linspace(-4, 2, 30)}\n",
+    "kernel_ridge_grid = {\"alpha\": 10 ** np.linspace(-4, 3, 30)}\n",
     "\n",
     "WINDOW_SIZE = 36\n",
     "X, y = create_feature_matrix_and_target_values(sales, WINDOW_SIZE)\n",
     "\n",
+    "\n",
     "def main(X, y):\n",
-    "    \n",
+    "\n",
     "    regressors = []\n",
-    "    \n",
-    "    for regressor, param_grid in [(Lasso(), lasso_grid),\n",
-    "                                  (SVR(), svr_grid),\n",
-    "                                  (KernelRidge(kernel=\"rbf\"), kernel_ridge_grid)\n",
-    "                                  ]:\n",
+    "\n",
+    "    for regressor, param_grid in [\n",
+    "        (Lasso(), lasso_grid),\n",
+    "        (SVR(), svr_grid),\n",
+    "        (KernelRidge(kernel=\"rbf\"), kernel_ridge_grid),\n",
+    "    ]:\n",
     "        search = GridSearchCV(regressor, param_grid, scoring=\"r2\", cv=5)\n",
     "        search.fit(X, y)\n",
-    "        \n",
+    "\n",
     "        # we predict on the learning data set to get a general\n",
     "        # \"feeling\" how well the regressors work\n",
     "        predicted = search.predict(X)\n",
-    "        \n",
+    "\n",
     "        plot_regression(regressor.__class__.__qualname__, predicted)\n",
-    "        \n",
+    "\n",
     "        regressors.append(search)\n",
     "\n",
     "    return regressors\n",
     "\n",
-    "    \n",
+    "\n",
     "def plot_regression(title, predicted):\n",
     "    plt.figure(figsize=(14, 4.5))\n",
     "    plt.suptitle(title)\n",
     "    plt.subplot(1, 2, 1)\n",
-    "    \n",
-    "    plt.plot(months, sales, label='sales', color=\"steelblue\")\n",
-    "    plt.plot(months[WINDOW_SIZE:], predicted, color=\"chocolate\", linestyle=\":\", label='predicted');\n",
+    "\n",
+    "    plt.plot(months, sales, label=\"sales\", color=\"steelblue\")\n",
+    "    plt.plot(\n",
+    "        months[WINDOW_SIZE:],\n",
+    "        predicted,\n",
+    "        color=\"chocolate\",\n",
+    "        linestyle=\":\",\n",
+    "        label=\"predicted\",\n",
+    "    )\n",
     "    plt.legend()\n",
-    "    \n",
+    "\n",
     "    plt.subplot(1, 2, 2)\n",
     "    plt.scatter(sales[WINDOW_SIZE:], predicted, color=\"steelblue\")\n",
     "    r2 = r2_score(sales[WINDOW_SIZE:], predicted)\n",
-    "    \n",
+    "\n",
     "    plt.title(\"r2 = {:.3f}\".format(r2))\n",
     "    plt.plot([0, 3], [0, 3], \"k:\")\n",
-    "    plt.xlabel('sales')\n",
-    "    plt.ylabel('predicted')\n",
+    "    plt.xlabel(\"sales\")\n",
+    "    plt.ylabel(\"predicted\")\n",
+    "\n",
     "\n",
-    "    \n",
     "regressors = main(X, y)"
   ]
  },
  {
@@ -1440,13 +1465,14 @@
     "# start from line 96 in our feature matrix:\n",
     "NLAST = 96\n",
     "\n",
+    "\n",
     "def forecast(X, y, regressor, n_last):\n",
-    "    \n",
+    "\n",
     "    # we create a copy because below we modify the content of\n",
    "    # current_window in place. Without the copy we would\n",
     "    # also change X!\n",
-    "    current_window = X[-n_last, :].copy() \n",
-    "    \n",
+    "    current_window = X[-n_last, :].copy()\n",
+    "\n",
     "    predicted = []\n",
     "\n",
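+    "    # iterative one-step-ahead forecast: each prediction is pushed into\n",
+    "    # the window and then serves as an input feature for the next step\n",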
     "    for k in range(n_last):\n",
@@ -1456,7 +1482,7 @@
     "        # modify the window data in place:\n",
     "        current_window[:-1] = current_window[1:]\n",
     "        current_window[-1] = new\n",
-    "    \n",
+    "\n",
     "    return np.array(predicted).flatten(), y[-n_last:]\n",
     "\n",
     "\n",
@@ -1465,29 +1491,29 @@
     "\n",
     "    x_axis = list(range(len(y)))[-n_last:]\n",
     "\n",
-    "    plt.figure(figsize=(14, 5)) \n",
-    "    plt.subplot(1, 2, 1) \n",
-    "    plt.plot(x_axis, predicted, color=\"steelblue\", label='predicted')\n",
-    "    plt.plot(x_axis, correct, color=\"chocolate\", linestyle=\":\", label='true');\n",
-    "    plt.legend();\n",
+    "    plt.figure(figsize=(14, 5))\n",
+    "    plt.subplot(1, 2, 1)\n",
+    "    plt.plot(x_axis, predicted, color=\"steelblue\", label=\"predicted\")\n",
+    "    plt.plot(x_axis, correct, color=\"chocolate\", linestyle=\":\", label=\"true\")\n",
+    "    plt.legend()\n",
     "    plt.xlabel(\"month\")\n",
     "    plt.ylabel(\"sales\")\n",
     "\n",
     "    plt.subplot(1, 2, 2)\n",
-    "    plt.scatter(correct, predicted, color='steelblue')\n",
-    "    \n",
+    "    plt.scatter(correct, predicted, color=\"steelblue\")\n",
+    "\n",
     "    r2 = r2_score(correct, predicted)\n",
     "    plt.title(\"r2 = {:.3f}\".format(r2))\n",
     "    plt.xlabel(\"sales\")\n",
     "    plt.ylabel(\"predicted\")\n",
     "\n",
     "    mi, ma = np.min(correct), np.max(correct)\n",
-    "    plt.plot([mi, ma], [mi, ma], 'k:');\n",
+    "    plt.plot([mi, ma], [mi, ma], \"k:\")\n",
     "    plt.suptitle(regressor.estimator.__class__.__qualname__)\n",
-    "    \n",
-    "    \n",
+    "\n",
+    "\n",
     "for regressor in regressors:\n",
-    "    forecast_and_plot(regressor, NLAST) "
+    "    forecast_and_plot(regressor, NLAST)"
   ]
  },
  {
@@ -1515,7 +1541,7 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-   "version": "3.9.1"
+   "version": "3.7.7"
  },
  "latex_envs": {
   "LaTeX_envs_menu_present": true,