From 3d86c8681c5d43f5a0237eb322a1cde068b38b78 Mon Sep 17 00:00:00 2001 From: Uwe Schmitt <uwe.schmitt@id.ethz.ch> Date: Mon, 7 Jan 2019 17:42:25 +0100 Subject: [PATCH] changes after review session with Mik --- 01_introduction.ipynb | 211 +++++++++++++++++++----------------------- 1 file changed, 96 insertions(+), 115 deletions(-) diff --git a/01_introduction.ipynb b/01_introduction.ipynb index 313f532..4d7452a 100644 --- a/01_introduction.ipynb +++ b/01_introduction.ipynb @@ -86,8 +86,8 @@ " 1957-65: \"k-means\" clustering algorithm\n", " 1959: Term \"machine learning\" is coined by Arthur Samuel, an AI pioneer\n", " 1969: Book \"Perceptrons\": Limitations of Neural Networks\n", - " 1984: Book \"Classification And Regression Trees\"\n", " 1974-86: Neural networks learning breakthrough: backpropagation method\n", + " 1984: Book \"Classification And Regression Trees\"\n", " 1995: Randomized Forests and Support Vector Machines methods\n", " 1998: Public appearance: first ML implementations of spam filtering methods; naive Bayes Classifier method\n", " 2006-12: Neural networks learning breakthrough: deep learning\n", @@ -327,6 +327,39 @@ "print(dir(dd))" ] }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "DESCR:\n", + " Optical Recognition of Handwritten Digits Data Set\n", + "===================================================\n", + "\n", + "Notes\n", + "-----\n", + "Data Set Characteristics:\n", + " :Number of Instances: 5620\n", + " :Number of Attributes: 64\n", + " :Attribute Information: 8x8 image of integer pixels in the range 0..16.\n", + " :Missing Attribute Values: None\n", + " :Creator: E. Alpaydin (alpaydin '@' boun.edu.tr)\n", + " :Date: July; 1998\n", + "\n", + "This is a copy of the test set of the UCI ML hand-written digits datasets\n", + "http://archive.ics.uci.edu/ml/datas \n", + "[...]\n" + ] + } + ], + "source": [ + "print(\"DESCR:\\n\", dd.DESCR[:500], \"\\n[...]\") # description of the dataset" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -336,7 +369,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 5, "metadata": {}, "outputs": [ { @@ -371,15 +404,15 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "images.ndim: 3\n", "images[0].shape: (8, 8)\n", + "\n", "images[0]:\n", " [[ 0. 0. 5. 13. 9. 1. 0. 0.]\n", " [ 0. 0. 13. 15. 10. 15. 5. 0.]\n", @@ -388,44 +421,14 @@ " [ 0. 5. 8. 0. 0. 9. 8. 0.]\n", " [ 0. 4. 11. 0. 1. 12. 7. 0.]\n", " [ 0. 2. 14. 5. 10. 12. 0. 0.]\n", - " [ 0. 0. 6. 13. 10. 0. 0. 0.]]\n", - "images.shape: (1797, 8, 8)\n", - "images.size: 115008\n", - "images.dtype: float64\n", - "images.itemsize: 8\n", - "target.size: 1797\n", - "target_names: [0 1 2 3 4 5 6 7 8 9]\n", - "DESCR:\n", - " Optical Recognition of Handwritten Digits Data Set\n", - "===================================================\n", - "\n", - "Notes\n", - "-----\n", - "Data Set Characteristics:\n", - " :Number of Instances: 5620\n", - " :Number of Attributes: 64\n", - " :Attribute Information: 8x8 image of integer pixels in the range 0..16.\n", - " :Missing Attribute Values: None\n", - " :Creator: E. Alpaydin (alpaydin '@' boun.edu.tr)\n", - " :Date: July; 1998\n", - "\n", - "This is a copy of the test set of the UCI ML hand-written digits datasets\n", - "http://archive.ics.uci.edu/ml/datas \n", - "[...]\n" + " [ 0. 0. 6. 13. 10. 0. 0. 
0.]]\n" ] } ], "source": [ - "print(\"images.ndim:\", dd.images.ndim) # number of dimensions of the array\n", "print(\"images[0].shape:\", dd.images[0].shape) # dimensions of a first sample array\n", - "print(\"images[0]:\\n\", dd.images[0]) # first sample array\n", - "print(\"images.shape:\", dd.images.shape) # dimensions of the array of all samples\n", - "print(\"images.size:\", dd.images.size) # total number of elements of the array\n", - "print(\"images.dtype:\", dd.images.dtype) # type of the elements in the array\n", - "print(\"images.itemsize:\", dd.images.itemsize) # size in bytes of each element of the array\n", - "print(\"target.size:\", dd.target.size) # size of the target feature vector (labels of samples)\n", - "print(\"target_names:\", dd.target_names) # classes vector\n", - "print(\"DESCR:\\n\", dd.DESCR[:500], \"\\n[...]\") # description of the dataset" + "print()\n", + "print(\"images[0]:\\n\", dd.images[0]) # first sample array" ] }, { @@ -437,7 +440,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 7, "metadata": {}, "outputs": [ { @@ -458,33 +461,6 @@ "print(\"image_vector:\", image_vector)" ] }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "(1797, 8, 8)\n", - "(1797, 64)\n", - "[ 0. 0. 5. 13. 9. 1. 0. 0. 0. 0. 13. 15. 10. 15. 5. 0. 0. 3.\n", - " 15. 2. 0. 11. 8. 0. 0. 4. 12. 0. 0. 8. 8. 0. 0. 5. 8. 0.\n", - " 0. 9. 8. 0. 0. 4. 11. 0. 1. 12. 7. 0. 0. 2. 14. 5. 10. 12.\n", - " 0. 0. 0. 0. 6. 13. 10. 0. 0. 0.]\n" - ] - } - ], - "source": [ - "print(dd.images.shape)\n", - "\n", - "# reashape to 1797, 64:\n", - "images_flat = dd.images.reshape(-1, 64)\n", - "print(images_flat.shape)\n", - "print(images_flat[0])" - ] - }, { "cell_type": "markdown", "metadata": {}, @@ -496,7 +472,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "If we start a machine learning project for texts, we first have to choose a dictionary - set of words for this project. The final representation of a text as a feature vector depends on this dictionary.\n", + "If we start a machine learning project for texts, we first have to choose a dictionary (a set of words) for this project. The words in the dictionary are enumerated. 
The final representation of a text as a feature vector depends on this dictionary.\n", "\n", "Such a dictionary can be very large, but for the sake of simplicity we use a very small enumerated dictionary to explain the overall procedure:\n", "\n", @@ -582,6 +558,11 @@ "source": [ "## ML lingo: What are the different types of datasets?\n", "\n", + "<div class=\"alert alert-block alert-danger\">\n", + "<strong>TODO:</strong> move to later section about cross validation.</div>\n", + "\n", + "\n", + "\n", "<div class=\"alert alert-block alert-warning\">\n", "<i class=\"fa fa-warning\"></i> <strong>Definitions</strong>\n", "\n", @@ -702,7 +683,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Exercise section 1" + "# Hands-on section" ] }, { @@ -710,8 +691,9 @@ "metadata": {}, "source": [ "<div class=\"alert alert-block alert-danger\">\n", - "<strong>TODO:</strong> transform to az set of small exercises (instead of a tutorial/example as it is now).\n", - "</div>" + "<strong>TODO:</strong> transform to a set of small exercises (instead of a tutorial/example as it is now).\n", + "</div>\n", + "\n" ] }, { @@ -1102,13 +1084,6 @@ "</div>" ] }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, { "cell_type": "code", "execution_count": 14, @@ -1161,42 +1136,43 @@ ] }, { - "cell_type": "markdown", + "cell_type": "code", + "execution_count": 15, "metadata": {}, + "outputs": [ + { + "ename": "NotFittedError", + "evalue": "This LogisticRegression instance is not fitted yet", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mNotFittedError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m<ipython-input-15-9e1ed3d39774>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;31m# Sanity check: can't predict if not fitted (trained)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0mclassifier\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpredict\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0minput_features\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[0;32m~/Projects/machinelearning-introduction-workshop/venv3.6/lib/python3.6/site-packages/sklearn/linear_model/base.py\u001b[0m in \u001b[0;36mpredict\u001b[0;34m(self, X)\u001b[0m\n\u001b[1;32m 322\u001b[0m \u001b[0mPredicted\u001b[0m \u001b[0;32mclass\u001b[0m \u001b[0mlabel\u001b[0m \u001b[0mper\u001b[0m \u001b[0msample\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 323\u001b[0m \"\"\"\n\u001b[0;32m--> 324\u001b[0;31m \u001b[0mscores\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdecision_function\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 325\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mscores\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mshape\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 326\u001b[0m \u001b[0mindices\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mscores\u001b[0m \u001b[0;34m>\u001b[0m 
\u001b[0;36m0\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mastype\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mint\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/Projects/machinelearning-introduction-workshop/venv3.6/lib/python3.6/site-packages/sklearn/linear_model/base.py\u001b[0m in \u001b[0;36mdecision_function\u001b[0;34m(self, X)\u001b[0m\n\u001b[1;32m 296\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mhasattr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'coef_'\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mor\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcoef_\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 297\u001b[0m raise NotFittedError(\"This %(name)s instance is not fitted \"\n\u001b[0;32m--> 298\u001b[0;31m \"yet\" % {'name': type(self).__name__})\n\u001b[0m\u001b[1;32m 299\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 300\u001b[0m \u001b[0mX\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcheck_array\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maccept_sparse\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'csr'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mNotFittedError\u001b[0m: This LogisticRegression instance is not fitted yet" + ] + } + ], "source": [ - "<div class=\"alert alert-block alert-warning\">\n", - "<i class=\"fa fa-warning\"></i> <strong>`scikit-learn` API</strong>\n", - "\n", - "In <code>scikit-learn</code> all classifiers have:\n", - "<ul>\n", - " <li>a <strong><code>fit()</code></strong> method to learn from data, and</li>\n", - " <li>and a subsequent <strong><code>predict()</code></strong> method for predicting classes from input features.</li>\n", - "</ul>\n", - "</div>" + "# Sanity check: can't predict if not fitted (trained)\n", + "classifier.predict(input_features)" ] }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 16, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "train first..\n", "(225,)\n" ] } ], "source": [ - "# Sanity check: can't predict if not fitted (trained)\n", - "from sklearn.exceptions import NotFittedError\n", - "try:\n", - " classifier.predict(input_features)\n", - "except NotFittedError:\n", - " print(\"train first..\")\n", - "\n", "# Fit\n", "classifier.fit(input_features, labels)\n", "\n", @@ -1214,7 +1190,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 17, "metadata": {}, "outputs": [ { @@ -1245,7 +1221,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 18, "metadata": {}, "outputs": [ { @@ -1301,7 +1277,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Exercise section 2" + "# Exercise section 1" ] }, { @@ -1334,7 +1310,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 19, "metadata": {}, "outputs": [ { @@ -1368,8 +1344,10 @@ "\n", "<div class=\"alert alert-block alert-info\">\n", "<i class=\"fa fa-info-circle\"></i>\n", - "Better re-classification does not indicate here that <code>SVC</code> is better than <code>LogisticRegression</code>. At most it seems to fit better to our training data. 
We will learn later that this may be actually not a good thing.\n", - "</div>\n" + "Better re-classification in our example does not indicate here that <code>SVC</code> is better than <code>LogisticRegression</code> in all cases. The performance of a classifier strongly depends on the data set.\n", + "</div>\n", + "\n", + "\n" ] }, { @@ -1383,12 +1361,15 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Both `LogisticRegression` and `SVC` classifiers have a parameter `C` which allows to enforce a \"simplification\" (often called **regularization**) of the resulting model. Test the beers data \"re-classification\" with different values of this parameter.\n" + "Both `LogisticRegression` and `SVC` classifiers have a parameter `C` which allows to enforce a \"simplification\" (often called **regularization**) of the resulting model. Test the beers data \"re-classification\" with different values of this parameter.\n", + "\n", + "\n", + "**TOBE discussed**: is \"regularization\" to technical here ? decision surfaces and details of classifers come later. Original purpose (Uwe) was to demonstrate that classifiers can be tuned to the data set." ] }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 20, "metadata": {}, "outputs": [], "source": [ @@ -1409,7 +1390,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Exercise section 3 (optional)" + "# Exercise section 2 (optional)" ] }, { @@ -1433,7 +1414,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 21, "metadata": {}, "outputs": [ { @@ -1459,7 +1440,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 22, "metadata": {}, "outputs": [ { @@ -1551,7 +1532,7 @@ "4 0 " ] }, - "execution_count": 21, + "execution_count": 22, "metadata": {}, "output_type": "execute_result" } @@ -1565,7 +1546,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 23, "metadata": {}, "outputs": [ { @@ -1687,7 +1668,7 @@ "max 2.500000 2.000000 " ] }, - "execution_count": 22, + "execution_count": 23, "metadata": {}, "output_type": "execute_result" } @@ -1698,7 +1679,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 24, "metadata": {}, "outputs": [ { @@ -1739,16 +1720,16 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 25, "metadata": { - "collapsed": true + "scrolled": true }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "/Users/mikolajr/Workspace/SSDM/machinelearning-introduction-workshop/.venv/lib/python3.7/site-packages/ipykernel_launcher.py:9: UserWarning: get_ipython_dir has moved to the IPython.paths module since IPython 4.0.\n", + "/Users/uweschmitt/Projects/machinelearning-introduction-workshop/venv3.6/lib/python3.6/site-packages/ipykernel_launcher.py:9: UserWarning: get_ipython_dir has moved to the IPython.paths module since IPython 4.0.\n", " if __name__ == '__main__':\n" ] }, @@ -1853,7 +1834,7 @@ "<IPython.core.display.HTML object>" ] }, - "execution_count": 24, + "execution_count": 25, "metadata": {}, "output_type": "execute_result" } @@ -1983,7 +1964,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.1" + "version": "3.6.6" } }, "nbformat": 4, -- GitLab
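
For reference, below is a minimal standalone sketch of the workflow the reorganized notebook cells walk through: flattening the 8x8 digit images into 64-element feature vectors, the predict-before-fit sanity check, and the subsequent fit/predict. This is an illustrative sketch rather than the notebook's exact cells; it assumes only that scikit-learn is installed, and the variable names are chosen for readability rather than copied from the notebook.

    from sklearn.datasets import load_digits
    from sklearn.exceptions import NotFittedError
    from sklearn.linear_model import LogisticRegression

    dd = load_digits()                    # bunch with .images, .data, .target, .DESCR
    print(dd.images.shape)                # (1797, 8, 8): one 8x8 image per sample
    features = dd.images.reshape(-1, 64)  # flatten each image into a 64-long feature vector
    print(features.shape)                 # (1797, 64)

    classifier = LogisticRegression(max_iter=1000)
    try:
        classifier.predict(features)      # sanity check: predicting before fitting fails
    except NotFittedError as err:
        print("expected error:", err)

    classifier.fit(features, dd.target)   # learn from the data
    predicted = classifier.predict(features)
    print("re-classification accuracy:", (predicted == dd.target).mean())

Run as a plain script, this re-classifies the training data itself; as the notebook's warning box points out, good re-classification of the training set alone says little about how well the model generalizes.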