From 5ecec6d53a4f83b7331c3b3f14bf97b9ffa8de2d Mon Sep 17 00:00:00 2001 From: Mikolaj Rybinski <mikolaj.rybinski@id.ethz.ch> Date: Mon, 9 Sep 2019 08:16:54 +0200 Subject: [PATCH] minor corrections post review by Uwe --- 01_introduction.ipynb | 2 +- 02_classification.ipynb | 2 +- 03_overfitting_and_cross_validation.ipynb | 83 +++++++++++++---------- 3 files changed, 48 insertions(+), 39 deletions(-) diff --git a/01_introduction.ipynb b/01_introduction.ipynb index b478166..7ffecd1 100644 --- a/01_introduction.ipynb +++ b/01_introduction.ipynb @@ -1408,7 +1408,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "##### What happened?\n", + "<div style=\"font-weight: bold; font-size: 200%;\">What happened?</div>\n", "\n", "Why were not all labels predicted correctly?\n", "\n", diff --git a/02_classification.ipynb b/02_classification.ipynb index 2a2e431..20ec770 100644 --- a/02_classification.ipynb +++ b/02_classification.ipynb @@ -1353,7 +1353,7 @@ "Most cases have higher dimensions than 2 or 3 and visual inspection can be difficult. Thus, engineering features as we did in the 2D examples becomes tricky.\n", "\n", "<div class=\"alert alert-block alert-warning\"><p><i class=\"fa fa-warning\"></i> \n", - "<span style=\"font-size: large\">General recommendations for feature engineering</span>\n", + "<span style=\"font-weight: bold; font-size: 125%;\">General recommendations for feature engineering</span>\n", "<ul>\n", "<li>use descriptive statistics (mean, standard deviations, higher order statistics), as well as histograms if applicable;</li>\n", "<li>polynomial features (e.g. extend <code>x, y</code> to <code>x, y, x * y, x ** 2, y ** 2</code>) (see below);</li>\n", diff --git a/03_overfitting_and_cross_validation.ipynb b/03_overfitting_and_cross_validation.ipynb index 99b495f..7e3d3f6 100644 --- a/03_overfitting_and_cross_validation.ipynb +++ b/03_overfitting_and_cross_validation.ipynb @@ -1171,14 +1171,17 @@ "name": "stdout", "output_type": "stream", "text": [ - "300\n", - "0.49666666666666665\n", + "# Whole dataset \n", + "number of all samples: 300\n", + "proportion of yummy samples: 0.49666666666666665\n", "\n", - "240\n", - "0.49583333333333335\n", + "# Cross-validation dataset \n", + "number of all samples: 240\n", + "proportion of yummy samples: 0.49583333333333335\n", "\n", - "60\n", - "0.5\n" + "# Validation dataset \n", + "number of all samples: 60\n", + "proportion of yummy samples: 0.5\n" ] } ], @@ -1198,14 +1201,17 @@ "\n", ") = train_test_split(features, labels, test_size=0.2, stratify=labels, random_state=42)\n", "\n", - "print(len(labels))\n", - "print(sum(labels == 1)/len(labels))\n", + "print(\"# Whole dataset \")\n", + "print(\"number of all samples:\", len(labels))\n", + "print(\"proportion of yummy samples:\", sum(labels == 1)/len(labels))\n", "print()\n", - "print(len(labels_crosseval))\n", - "print(sum(labels_crosseval == 1)/len(labels_crosseval))\n", + "print(\"# Cross-validation dataset \")\n", + "print(\"number of all samples:\", len(labels_crosseval))\n", + "print(\"proportion of yummy samples:\", sum(labels_crosseval == 1)/len(labels_crosseval))\n", "print()\n", - "print(len(labels_validation))\n", - "print(sum(labels_validation == 1)/len(labels_validation))" + "print(\"# Validation dataset \")\n", + "print(\"number of all samples:\", len(labels_validation))\n", + "print(\"proportion of yummy samples:\", sum(labels_validation == 1)/len(labels_validation))" ] }, { @@ -1225,21 +1231,21 @@ "output_type": "stream", "text": [ "OPTIMIZE HYPERPARAMETERS\n", - "score = 0.717 +/- 0.108, C = 0.1, gamma = 0.1\n", - "score = 0.796 +/- 0.063, C = 0.1, gamma = 1.0\n", - "score = 0.862 +/- 0.056, C = 0.1, gamma = 10.0\n", + "score = 0.728 +/- 0.107, C = 0.1, gamma = 0.1\n", + "score = 0.791 +/- 0.090, C = 0.1, gamma = 1.0\n", + "score = 0.824 +/- 0.057, C = 0.1, gamma = 10.0\n", "score = 0.504 +/- 0.008, C = 0.1, gamma = 100.0\n", - "score = 0.825 +/- 0.048, C = 1.0, gamma = 0.1\n", - "score = 0.908 +/- 0.036, C = 1.0, gamma = 1.0\n", - "score = 0.900 +/- 0.046, C = 1.0, gamma = 10.0\n", - "score = 0.784 +/- 0.074, C = 1.0, gamma = 100.0\n", - "score = 0.933 +/- 0.050, C = 10.0, gamma = 0.1\n", - "score = 0.958 +/- 0.037, C = 10.0, gamma = 1.0\n", - "score = 0.909 +/- 0.040, C = 10.0, gamma = 10.0\n", - "score = 0.780 +/- 0.072, C = 10.0, gamma = 100.0\n", + "score = 0.837 +/- 0.094, C = 1.0, gamma = 0.1\n", + "score = 0.875 +/- 0.042, C = 1.0, gamma = 1.0\n", + "score = 0.871 +/- 0.054, C = 1.0, gamma = 10.0\n", + "score = 0.775 +/- 0.051, C = 1.0, gamma = 100.0\n", + "score = 0.908 +/- 0.031, C = 10.0, gamma = 0.1\n", + "score = 0.950 +/- 0.031, C = 10.0, gamma = 1.0\n", + "score = 0.891 +/- 0.034, C = 10.0, gamma = 10.0\n", + "score = 0.780 +/- 0.057, C = 10.0, gamma = 100.0\n", "\n", "BEST RESULT CROSS VALIDATION\n", - "score = 0.958 +/- 0.037, C = 10.0, gamma = 1.0\n" + "score = 0.950 +/- 0.031, C = 10.0, gamma = 1.0\n" ] } ], @@ -1263,7 +1269,7 @@ " for gamma in SVC_gamma_values:\n", " classifier = SVC(C=C, gamma=gamma)\n", " test_scores = cross_val_score(classifier, features_crosseval, labels_crosseval,\n", - " scoring=\"accuracy\", cv=cross_validator)\n", + " scoring=\"accuracy\", cv=cross_validator) # cv arg is now different\n", " print(\"score = {:.3f} +/- {:.3f}, C = {:5.1f}, gamma = {:5.1f}\".format(\n", " test_scores.mean(), test_scores.std(), C, gamma))\n", " results.append((\n", @@ -1506,25 +1512,27 @@ "output_type": "stream", "text": [ "OPTIMIZE SETTINGS\n", - "score = 0.921 +/- 0.039, C = 1.0, penalty = l1\n", - "score = 0.925 +/- 0.041, C = 10.0, penalty = l1\n", - "score = 0.925 +/- 0.041, C = 100.0, penalty = l1\n", - "score = 0.929 +/- 0.037, C = 1000.0, penalty = l1\n", - "score = 0.825 +/- 0.083, C = 1.0, penalty = l2\n", - "score = 0.913 +/- 0.039, C = 10.0, penalty = l2\n", - "score = 0.917 +/- 0.037, C = 100.0, penalty = l2\n", - "score = 0.925 +/- 0.041, C = 1000.0, penalty = l2\n", + "score = 0.883 +/- 0.085, C = 1.0, penalty = l1\n", + "score = 0.904 +/- 0.077, C = 10.0, penalty = l1\n", + "score = 0.904 +/- 0.077, C = 100.0, penalty = l1\n", + "score = 0.904 +/- 0.077, C = 1000.0, penalty = l1\n", + "score = 0.804 +/- 0.079, C = 1.0, penalty = l2\n", + "score = 0.888 +/- 0.076, C = 10.0, penalty = l2\n", + "score = 0.908 +/- 0.074, C = 100.0, penalty = l2\n", + "score = 0.904 +/- 0.077, C = 1000.0, penalty = l2\n", "\n", "BEST RESULT CROSS VALIDATION\n", - "score = 0.929 +/- 0.037, C = 1000.0, penalty = l1\n", + "score = 0.908 +/- 0.074, C = 100.0, penalty = l2\n", "\n", "VALIDATION\n", - "score = 0.850\n" + "score = 0.933\n" ] } ], "source": [ "from sklearn.linear_model import LogisticRegression\n", + "from sklearn.svm import SVC\n", + "from sklearn.model_selection import cross_val_score\n", "\n", "beer = pd.read_csv(\"data/beers.csv\")\n", "beer_eval = pd.read_csv(\"data/beers_eval.csv\")\n", @@ -1534,13 +1542,14 @@ "labels = all_beer.iloc[:, -1]\n", "\n", "(\n", - "\n", " features_crosseval, \n", " features_validation, \n", " labels_crosseval, \n", " labels_validation,\n", "\n", - ") = train_test_split(features, labels, test_size=0.2, stratify=labels, random_state=42)\n", + ") = train_test_split(features, labels, test_size=0.2, stratify=labels, random_state=42) \n", + "\n", + "# TRY random_state 43 OR 1 INSTEAD OF 42!\n", "\n", "cross_validator = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)\n", "\n", -- GitLab