From 3d86c8681c5d43f5a0237eb322a1cde068b38b78 Mon Sep 17 00:00:00 2001
From: Uwe Schmitt <uwe.schmitt@id.ethz.ch>
Date: Mon, 7 Jan 2019 17:42:25 +0100
Subject: [PATCH] changes after review session with Mik

---
 01_introduction.ipynb | 211 +++++++++++++++++++-----------------------
 1 file changed, 96 insertions(+), 115 deletions(-)

diff --git a/01_introduction.ipynb b/01_introduction.ipynb
index 313f532..4d7452a 100644
--- a/01_introduction.ipynb
+++ b/01_introduction.ipynb
@@ -86,8 +86,8 @@
     "    1957-65: \"k-means\" clustering algorithm\n",
     "    1959: Term \"machine learning\" is coined by Arthur Samuel, an AI pioneer\n",
     "    1969: Book \"Perceptrons\": Limitations of Neural Networks\n",
-    "    1984: Book \"Classification And Regression Trees\"\n",
     "    1974-86: Neural networks learning breakthrough: backpropagation method\n",
+    "    1984: Book \"Classification And Regression Trees\"\n",
     "    1995: Randomized Forests and Support Vector Machines methods\n",
     "    1998: Public appearance: first ML implementations of spam filtering methods; naive Bayes Classifier method\n",
     "    2006-12: Neural networks learning breakthrough: deep learning\n",
@@ -327,6 +327,39 @@
     "print(dir(dd))"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "DESCR:\n",
+      " Optical Recognition of Handwritten Digits Data Set\n",
+      "===================================================\n",
+      "\n",
+      "Notes\n",
+      "-----\n",
+      "Data Set Characteristics:\n",
+      "    :Number of Instances: 5620\n",
+      "    :Number of Attributes: 64\n",
+      "    :Attribute Information: 8x8 image of integer pixels in the range 0..16.\n",
+      "    :Missing Attribute Values: None\n",
+      "    :Creator: E. Alpaydin (alpaydin '@' boun.edu.tr)\n",
+      "    :Date: July; 1998\n",
+      "\n",
+      "This is a copy of the test set of the UCI ML hand-written digits datasets\n",
+      "http://archive.ics.uci.edu/ml/datas \n",
+      "[...]\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(\"DESCR:\\n\", dd.DESCR[:500], \"\\n[...]\") # description of the dataset"
+   ]
+  },
   {
    "cell_type": "markdown",
    "metadata": {},
@@ -336,7 +369,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 5,
    "metadata": {},
    "outputs": [
     {
@@ -371,15 +404,15 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 6,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "images.ndim: 3\n",
       "images[0].shape: (8, 8)\n",
+      "\n",
       "images[0]:\n",
       " [[ 0.  0.  5. 13.  9.  1.  0.  0.]\n",
       " [ 0.  0. 13. 15. 10. 15.  5.  0.]\n",
@@ -388,44 +421,14 @@
       " [ 0.  5.  8.  0.  0.  9.  8.  0.]\n",
       " [ 0.  4. 11.  0.  1. 12.  7.  0.]\n",
       " [ 0.  2. 14.  5. 10. 12.  0.  0.]\n",
-      " [ 0.  0.  6. 13. 10.  0.  0.  0.]]\n",
-      "images.shape: (1797, 8, 8)\n",
-      "images.size: 115008\n",
-      "images.dtype: float64\n",
-      "images.itemsize: 8\n",
-      "target.size: 1797\n",
-      "target_names: [0 1 2 3 4 5 6 7 8 9]\n",
-      "DESCR:\n",
-      " Optical Recognition of Handwritten Digits Data Set\n",
-      "===================================================\n",
-      "\n",
-      "Notes\n",
-      "-----\n",
-      "Data Set Characteristics:\n",
-      "    :Number of Instances: 5620\n",
-      "    :Number of Attributes: 64\n",
-      "    :Attribute Information: 8x8 image of integer pixels in the range 0..16.\n",
-      "    :Missing Attribute Values: None\n",
-      "    :Creator: E. Alpaydin (alpaydin '@' boun.edu.tr)\n",
-      "    :Date: July; 1998\n",
-      "\n",
-      "This is a copy of the test set of the UCI ML hand-written digits datasets\n",
-      "http://archive.ics.uci.edu/ml/datas \n",
-      "[...]\n"
+      " [ 0.  0.  6. 13. 10.  0.  0.  0.]]\n"
      ]
     }
    ],
    "source": [
-    "print(\"images.ndim:\", dd.images.ndim) # number of dimensions of the array\n",
     "print(\"images[0].shape:\", dd.images[0].shape) # dimensions of a first sample array\n",
-    "print(\"images[0]:\\n\", dd.images[0]) # first sample array\n",
-    "print(\"images.shape:\", dd.images.shape) # dimensions of the array of all samples\n",
-    "print(\"images.size:\", dd.images.size) # total number of elements of the array\n",
-    "print(\"images.dtype:\", dd.images.dtype) # type of the elements in the array\n",
-    "print(\"images.itemsize:\", dd.images.itemsize) # size in bytes of each element of the array\n",
-    "print(\"target.size:\", dd.target.size) # size of the target feature vector (labels of samples)\n",
-    "print(\"target_names:\", dd.target_names) # classes vector\n",
-    "print(\"DESCR:\\n\", dd.DESCR[:500], \"\\n[...]\") # description of the dataset"
+    "print()\n",
+    "print(\"images[0]:\\n\", dd.images[0]) # first sample array"
    ]
   },
   {
@@ -437,7 +440,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 7,
    "metadata": {},
    "outputs": [
     {
@@ -458,33 +461,6 @@
     "print(\"image_vector:\", image_vector)"
    ]
   },
-  {
-   "cell_type": "code",
-   "execution_count": 7,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "(1797, 8, 8)\n",
-      "(1797, 64)\n",
-      "[ 0.  0.  5. 13.  9.  1.  0.  0.  0.  0. 13. 15. 10. 15.  5.  0.  0.  3.\n",
-      " 15.  2.  0. 11.  8.  0.  0.  4. 12.  0.  0.  8.  8.  0.  0.  5.  8.  0.\n",
-      "  0.  9.  8.  0.  0.  4. 11.  0.  1. 12.  7.  0.  0.  2. 14.  5. 10. 12.\n",
-      "  0.  0.  0.  0.  6. 13. 10.  0.  0.  0.]\n"
-     ]
-    }
-   ],
-   "source": [
-    "print(dd.images.shape)\n",
-    "\n",
-    "# reashape to 1797, 64:\n",
-    "images_flat = dd.images.reshape(-1, 64)\n",
-    "print(images_flat.shape)\n",
-    "print(images_flat[0])"
-   ]
-  },
   {
    "cell_type": "markdown",
    "metadata": {},
@@ -496,7 +472,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "If we start a machine learning project for texts, we first have to choose a dictionary - set of words for this project. The final representation of a text as a feature vector depends on this dictionary.\n",
+    "If we start a machine learning project for texts, we first have to choose a dictionary (a set of words) for this project. The words in the dictionary are enumerated. The final representation of a text as a feature vector depends on this dictionary.\n",
     "\n",
     "Such a dictionary can be very large, but for the sake of simplicity we use a very small enumerated dictionary to explain the overall procedure:\n",
     "\n",
@@ -582,6 +558,11 @@
    "source": [
     "## ML lingo: What are the different types of datasets?\n",
     "\n",
+    "<div class=\"alert alert-block alert-danger\">\n",
+    "<strong>TODO:</strong> move to later section about cross validation.</div>\n",
+    "\n",
+    "\n",
+    "\n",
     "<div class=\"alert alert-block alert-warning\">\n",
     "<i class=\"fa fa-warning\"></i>&nbsp;<strong>Definitions</strong>\n",
     "\n",
@@ -702,7 +683,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "# Exercise section 1"
+    "# Hands-on section"
    ]
   },
   {
@@ -710,8 +691,9 @@
    "metadata": {},
    "source": [
     "<div class=\"alert alert-block alert-danger\">\n",
-    "<strong>TODO:</strong> transform to az set of small exercises (instead of a tutorial/example as it is now).\n",
-    "</div>"
+    "<strong>TODO:</strong> transform to a set of small exercises (instead of a tutorial/example as it is now).\n",
+    "</div>\n",
+    "\n"
    ]
   },
   {
@@ -1102,13 +1084,6 @@
     "</div>"
    ]
   },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  },
   {
    "cell_type": "code",
    "execution_count": 14,
@@ -1161,42 +1136,43 @@
    ]
   },
   {
-   "cell_type": "markdown",
+   "cell_type": "code",
+   "execution_count": 15,
    "metadata": {},
+   "outputs": [
+    {
+     "ename": "NotFittedError",
+     "evalue": "This LogisticRegression instance is not fitted yet",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[0;31mNotFittedError\u001b[0m                            Traceback (most recent call last)",
+      "\u001b[0;32m<ipython-input-15-9e1ed3d39774>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m      1\u001b[0m \u001b[0;31m# Sanity check: can't predict if not fitted (trained)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0mclassifier\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpredict\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0minput_features\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
+      "\u001b[0;32m~/Projects/machinelearning-introduction-workshop/venv3.6/lib/python3.6/site-packages/sklearn/linear_model/base.py\u001b[0m in \u001b[0;36mpredict\u001b[0;34m(self, X)\u001b[0m\n\u001b[1;32m    322\u001b[0m             \u001b[0mPredicted\u001b[0m \u001b[0;32mclass\u001b[0m \u001b[0mlabel\u001b[0m \u001b[0mper\u001b[0m \u001b[0msample\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    323\u001b[0m         \"\"\"\n\u001b[0;32m--> 324\u001b[0;31m         \u001b[0mscores\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdecision_function\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    325\u001b[0m         \u001b[0;32mif\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mscores\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mshape\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    326\u001b[0m             \u001b[0mindices\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mscores\u001b[0m \u001b[0;34m>\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mastype\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mint\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+      "\u001b[0;32m~/Projects/machinelearning-introduction-workshop/venv3.6/lib/python3.6/site-packages/sklearn/linear_model/base.py\u001b[0m in \u001b[0;36mdecision_function\u001b[0;34m(self, X)\u001b[0m\n\u001b[1;32m    296\u001b[0m         \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mhasattr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'coef_'\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mor\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcoef_\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    297\u001b[0m             raise NotFittedError(\"This %(name)s instance is not fitted \"\n\u001b[0;32m--> 298\u001b[0;31m                                  \"yet\" % {'name': type(self).__name__})\n\u001b[0m\u001b[1;32m    299\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    300\u001b[0m         \u001b[0mX\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcheck_array\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maccept_sparse\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'csr'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+      "\u001b[0;31mNotFittedError\u001b[0m: This LogisticRegression instance is not fitted yet"
+     ]
+    }
+   ],
    "source": [
-    "<div class=\"alert alert-block alert-warning\">\n",
-    "<i class=\"fa fa-warning\"></i>&nbsp;<strong>`scikit-learn` API</strong>\n",
-    "\n",
-    "In <code>scikit-learn</code> all classifiers have:\n",
-    "<ul>\n",
-    "    <li>a <strong><code>fit()</code></strong> method to learn from data, and</li>\n",
-    "    <li>and a subsequent <strong><code>predict()</code></strong> method for predicting classes from input features.</li>\n",
-    "</ul>\n",
-    "</div>"
+    "# Sanity check: can't predict if not fitted (trained)\n",
+    "classifier.predict(input_features)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 15,
+   "execution_count": 16,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "train first..\n",
       "(225,)\n"
      ]
     }
    ],
    "source": [
-    "# Sanity check: can't predict if not fitted (trained)\n",
-    "from sklearn.exceptions import NotFittedError\n",
-    "try:\n",
-    "    classifier.predict(input_features)\n",
-    "except NotFittedError:\n",
-    "    print(\"train first..\")\n",
-    "\n",
     "# Fit\n",
     "classifier.fit(input_features, labels)\n",
     "\n",
@@ -1214,7 +1190,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 16,
+   "execution_count": 17,
    "metadata": {},
    "outputs": [
     {
@@ -1245,7 +1221,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 17,
+   "execution_count": 18,
    "metadata": {},
    "outputs": [
     {
@@ -1301,7 +1277,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "# Exercise section 2"
+    "# Exercise section 1"
    ]
   },
   {
@@ -1334,7 +1310,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 18,
+   "execution_count": 19,
    "metadata": {},
    "outputs": [
     {
@@ -1368,8 +1344,10 @@
     "\n",
     "<div class=\"alert alert-block alert-info\">\n",
     "<i class=\"fa fa-info-circle\"></i>\n",
-    "Better re-classification does not indicate here that <code>SVC</code> is better than <code>LogisticRegression</code>. At most it seems to fit better to our training data. We will learn later that this may be actually not a good thing.\n",
-    "</div>\n"
+    "Better re-classification in our example does not indicate here that <code>SVC</code> is better than <code>LogisticRegression</code> in all cases. The performance of a classifier strongly depends on the data set.\n",
+    "</div>\n",
+    "\n",
+    "\n"
    ]
   },
   {
@@ -1383,12 +1361,15 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "Both `LogisticRegression` and `SVC` classifiers have a parameter `C` which allows to enforce a \"simplification\" (often called **regularization**) of the resulting model. Test the beers data \"re-classification\" with different values of this parameter.\n"
+    "Both `LogisticRegression` and `SVC` classifiers have a parameter `C` which allows to enforce a \"simplification\" (often called **regularization**) of the resulting model. Test the beers data \"re-classification\" with different values of this parameter.\n",
+    "\n",
+    "\n",
+    "**TOBE discussed**: is \"regularization\" to technical here ? decision surfaces and details of classifers come later. Original purpose (Uwe) was to demonstrate that classifiers can be tuned to the data set."
    ]
   },
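
Reviewer note: a minimal sketch of the kind of loop this exercise could use, to keep the "tuned to the data set" point concrete. The `make_classification` data is a stand-in assumption; in the notebook the beers data (`input_features`, `labels`) from the earlier cells would be used instead.

```python
# Sketch: re-classification accuracy for different values of the
# regularization parameter C (synthetic data as a stand-in for the beers data).
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

X, y = make_classification(n_samples=225, n_features=4, random_state=42)

for C in (0.01, 1.0, 100.0):
    for classifier in (LogisticRegression(C=C), SVC(C=C)):
        classifier.fit(X, y)
        score = classifier.score(X, y)  # accuracy on the training data itself
        print(type(classifier).__name__, "C =", C, "accuracy =", round(score, 3))
```
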
   {
    "cell_type": "code",
-   "execution_count": 19,
+   "execution_count": 20,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1409,7 +1390,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "# Exercise section 3 (optional)"
+    "# Exercise section 2 (optional)"
    ]
   },
   {
@@ -1433,7 +1414,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 20,
+   "execution_count": 21,
    "metadata": {},
    "outputs": [
     {
@@ -1459,7 +1440,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 21,
+   "execution_count": 22,
    "metadata": {},
    "outputs": [
     {
@@ -1551,7 +1532,7 @@
        "4      0  "
       ]
      },
-     "execution_count": 21,
+     "execution_count": 22,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -1565,7 +1546,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 22,
+   "execution_count": 23,
    "metadata": {},
    "outputs": [
     {
@@ -1687,7 +1668,7 @@
        "max            2.500000    2.000000  "
       ]
      },
-     "execution_count": 22,
+     "execution_count": 23,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -1698,7 +1679,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 23,
+   "execution_count": 24,
    "metadata": {},
    "outputs": [
     {
@@ -1739,16 +1720,16 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 24,
+   "execution_count": 25,
    "metadata": {
-    "collapsed": true
+    "scrolled": true
    },
    "outputs": [
     {
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "/Users/mikolajr/Workspace/SSDM/machinelearning-introduction-workshop/.venv/lib/python3.7/site-packages/ipykernel_launcher.py:9: UserWarning: get_ipython_dir has moved to the IPython.paths module since IPython 4.0.\n",
+      "/Users/uweschmitt/Projects/machinelearning-introduction-workshop/venv3.6/lib/python3.6/site-packages/ipykernel_launcher.py:9: UserWarning: get_ipython_dir has moved to the IPython.paths module since IPython 4.0.\n",
       "  if __name__ == '__main__':\n"
      ]
     },
@@ -1853,7 +1834,7 @@
        "<IPython.core.display.HTML object>"
       ]
      },
-     "execution_count": 24,
+     "execution_count": 25,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -1983,7 +1964,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.7.1"
+   "version": "3.6.6"
   }
  },
  "nbformat": 4,
-- 
GitLab