From 0e718bc2695409539ead8ecc9a460b66f1da4d55 Mon Sep 17 00:00:00 2001
From: Uwe Schmitt <uwe.schmitt@id.ethz.ch>
Date: Sun, 28 Apr 2019 00:24:45 +0200
Subject: [PATCH] polished layout

---
 07_regression.ipynb | 171 +++++++++++++++++++++++++++++++++++++-------
 1 file changed, 144 insertions(+), 27 deletions(-)

diff --git a/07_regression.ipynb b/07_regression.ipynb
index ff46724..63af2b3 100644
--- a/07_regression.ipynb
+++ b/07_regression.ipynb
@@ -2,7 +2,7 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 9,
    "metadata": {},
    "outputs": [
     {
@@ -108,7 +108,7 @@
        "<IPython.core.display.HTML object>"
       ]
      },
-     "execution_count": 3,
+     "execution_count": 9,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -131,7 +131,7 @@
     "\n",
     "Regression belongs like classification to the field of supervised learning. \n",
     "\n",
-    "<div class=\"alert alert-block alert-warning\">\n",
+    "<div class=\"alert alert-block alert-info\">\n",
     "<i class=\"fa fa-info-circle\"></i>&nbsp; \n",
     "<strong>Regression predicts numerical values</strong> \n",
     "in contrast to classification which predicts categories.\n",
@@ -160,7 +160,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 206,
+   "execution_count": 10,
    "metadata": {},
    "outputs": [
     {
@@ -239,7 +239,7 @@
        "4           24.5    74.5  atlantic    24.2"
       ]
      },
-     "execution_count": 206,
+     "execution_count": 10,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -253,7 +253,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 207,
+   "execution_count": 11,
    "metadata": {},
    "outputs": [
     {
@@ -332,7 +332,7 @@
        "99           27.5    86.5  sockeye    43.4"
       ]
      },
-     "execution_count": 207,
+     "execution_count": 11,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -350,7 +350,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 210,
+   "execution_count": 12,
    "metadata": {},
    "outputs": [
     {
@@ -380,18 +380,108 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "Before we show how to use a regression method, we have to convert the `kind` column to numerical values (`sklearn.preprocessing.LabelEncoder` was mentioned in chapter 6.) first:"
+    "In contrast to our previous examples, our data set contains a non-numerical text column `kind`.\n",
+    "\n",
+    "<div class=\"alert alert-block alert-warning\">\n",
+    "<i class=\"fa fa-info-circle\"></i>&nbsp; \n",
+    "    <code>sklearn.preprocessing.LabelEncoder</code> is a preprocessor which encodes text values as corresponding categorical numbers.\n",
+    "</div>\n",
+    "\n",
+    "\n"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 211,
+   "execution_count": 26,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>circumference</th>\n",
+       "      <th>length</th>\n",
+       "      <th>kind</th>\n",
+       "      <th>weight</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>95</th>\n",
+       "      <td>19.0</td>\n",
+       "      <td>69.5</td>\n",
+       "      <td>1</td>\n",
+       "      <td>18.8</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>96</th>\n",
+       "      <td>18.5</td>\n",
+       "      <td>67.0</td>\n",
+       "      <td>1</td>\n",
+       "      <td>18.9</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>97</th>\n",
+       "      <td>24.5</td>\n",
+       "      <td>67.5</td>\n",
+       "      <td>1</td>\n",
+       "      <td>24.7</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>98</th>\n",
+       "      <td>21.0</td>\n",
+       "      <td>66.5</td>\n",
+       "      <td>1</td>\n",
+       "      <td>26.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>99</th>\n",
+       "      <td>27.5</td>\n",
+       "      <td>86.5</td>\n",
+       "      <td>1</td>\n",
+       "      <td>43.4</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "    circumference  length  kind  weight\n",
+       "95           19.0    69.5     1    18.8\n",
+       "96           18.5    67.0     1    18.9\n",
+       "97           24.5    67.5     1    24.7\n",
+       "98           21.0    66.5     1    26.0\n",
+       "99           27.5    86.5     1    43.4"
+      ]
+     },
+     "execution_count": 26,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
    "source": [
     "from sklearn.preprocessing import LabelEncoder\n",
     "\n",
-    "df.iloc[:, 2] = LabelEncoder().fit_transform(df.iloc[:, 2]) * 1"
+    "df.iloc[:, 2] = LabelEncoder().fit_transform(df.iloc[:, 2]) \n",
+    "\n",
+    "df.tail()"
    ]
   },
   {
@@ -403,7 +493,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 212,
+   "execution_count": 14,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -426,7 +516,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 213,
+   "execution_count": 15,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -438,12 +528,15 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "Regression methods in `scikit-learn` also have `fit` and `predict` methods:"
+    "<div class=\"alert alert-block alert-info\">\n",
+    "    <i class=\"fa fa-info-circle\"></i>&nbsp; Regression methods in <code>scikit-learn</code> also have <code>fit</code> and <code>predict</code> methods. Thus cross-validation, pipelines and hyperparameter optimization are available for regression as well.\n",
+    "    \n",
+    "</div>"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 214,
+   "execution_count": 27,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -460,7 +553,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 219,
+   "execution_count": 28,
    "metadata": {},
    "outputs": [
     {
@@ -517,18 +610,16 @@
    "source": [
     "For assessing the quality of the predictions of a regression method, we can use multiple methods which we will discuss later in this script.\n",
     "\n",
-    "For our current example we compute the average absolute difference between given values $y_i$ and predicted values  $\\hat{y}_i$. Thus\n",
+    "For our current example we compute the average absolute difference between given values $y_i$ and predicted values  $\\hat{y}_i$:\n",
     "\n",
     "$$\n",
     "\\frac{1}{n} \\left(|y_1 - \\hat{y}_1| + |y_2 - \\hat{y}_2| + ... + |y_n - \\hat{y}_n| \\right)\n",
-    "$$\n",
-    "\n",
-    "This is called **mean absolute error**."
+    "$$\n"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 220,
+   "execution_count": 29,
    "metadata": {},
    "outputs": [
     {
@@ -597,12 +688,19 @@
     "\n",
     "### 1. Mean absolute error\n",
     "\n",
-    "This is the metric we used before. Formula is\n",
+    "This is the metric we used before.\n",
+    "<div class=\"alert alert-block alert-info\">\n",
+    "    <i class=\"fa fa-info-circle\"></i>&nbsp; The <strong>mean absolute error</strong> is defined as \n",
     "\n",
     "$$\n",
     "\\frac{1}{n} \\left(|y_1 - \\hat{y}_1| + |y_2 - \\hat{y}_2| + ... + |y_n - \\hat{y}_n| \\right)\n",
     "$$\n",
     "\n",
+    "\n",
+    "\n",
+    "</div>\n",
+    "\n",
+    "\n",
     "The name of the corresponding score in `scikit-learn` is `neg_mean_absolute_error`.\n",
     "\n",
     "\n",
@@ -610,10 +708,20 @@
     "\n",
     "Here we replace the absolute difference by its squared value.\n",
     "\n",
+    "\n",
+    "<div class=\"alert alert-block alert-info\">\n",
+    "    <i class=\"fa fa-info-circle\"></i>&nbsp; The <strong>mean squared error</strong> is defined as \n",
+    "\n",
+    "\n",
     "$$\n",
     "\\frac{1}{n} \\left((y_1 - \\hat{y}_1)^2 + (y_2 - \\hat{y}_2)^2 + ... + (y_n - \\hat{y}_n)^2 \\right)\n",
     "$$\n",
     "\n",
+    "\n",
+    "</div>\n",
+    "\n",
+    "\n",
+    "\n",
     "This measure is more sensitive to outliers: A few larger differences contribute more significantly to a larger mean squared error. The name of the corresponding score in `scikit-learn` is `neg_mean_squared_error`.\n",
     "\n",
     "\n",
@@ -621,10 +729,19 @@
     "\n",
     "Here we replace mean calculation by median:\n",
     "\n",
+    "<div class=\"alert alert-block alert-info\">\n",
+    "    <i class=\"fa fa-info-circle\"></i>&nbsp; The <strong>median absolute error</strong> is defined as \n",
+    "\n",
+    "\n",
+    "\n",
     "$$\n",
-    "median\\left(|y_1 - \\hat{y}_1|, |y_2 - \\hat{y}_2|, ..., |y_n - \\hat{y}_n| \\right)\n",
+    "\\text{median}\\left(|y_1 - \\hat{y}_1|, |y_2 - \\hat{y}_2|, ..., |y_n - \\hat{y}_n| \\right)\n",
     "$$\n",
     "\n",
+    "\n",
+    "</div>\n",
+    "\n",
+    "\n",
     "This measure is less sensitive to outliers than the metrics we discussed before: A few larger differences will not contribute significantly to a larger error value. The name of the corresponding score in `scikit-learn` is `neg_median_absolute_error`.\n",
     "\n",
     "### 4. Mean squared log error\n",
@@ -681,7 +798,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 221,
+   "execution_count": 19,
    "metadata": {},
    "outputs": [
     {
@@ -730,7 +847,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 222,
+   "execution_count": 20,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -743,7 +860,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 223,
+   "execution_count": 21,
    "metadata": {},
    "outputs": [
     {
-- 
GitLab