"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"# IGNORE THIS CELL WHICH CUSTOMIZES LAYOUT AND STYLING OF THE NOTEBOOK !\n",
"%matplotlib inline\n",
"%config InlineBackend.figure_format = 'retina'\n",
"import warnings\n",
"\n",
"import matplotlib.pyplot as plt\n",
"\n",
"warnings.filterwarnings(\"ignore\", category=FutureWarning)\n",
"warnings.filterwarnings(\n",
" \"ignore\",\n",
" message=\"X does not have valid feature names, but [a-zA-Z]+ was fitted with feature names\",\n",
" category=UserWarning,\n",
")\n",
"\n",
"warnings.filterwarnings = lambda *a, **kw: None\n",
"from IPython.core.display import HTML\n",
"\n",
"HTML(open(\"custom.html\", \"r\").read())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Chapter 6: An overview of classifiers, Part 2\n",
"\n",
"<span style=\"font-size: 150%;\">Decision trees, ensemble methods and summary</span>"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Let's repeat our helper functions from previous part:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"import matplotlib\n",
"import matplotlib.pyplot as plt\n",
"import numpy as np\n",
"\n",
"\n",
"def samples_color(ilabels, colors=[\"steelblue\", \"chocolate\"]):\n",
" \"\"\"Return colors list from labels list given as indices.\"\"\"\n",
" return [colors[int(i)] for i in ilabels]\n",
"\n",
"\n",
"def plot_decision_surface(\n",
" features_2d,\n",
" labels,\n",
" classifier,\n",
" preprocessing=None,\n",
" plt=plt,\n",
" marker=\".\",\n",
" N=100,\n",
" alpha=0.2,\n",
" colors=[\"steelblue\", \"chocolate\"],\n",
" title=None,\n",
" test_features_2d=None,\n",
" test_labels=None,\n",
" test_s=60,\n",
"):\n",
" \"\"\"Plot a 2D decision surface for a already trained classifier.\"\"\"\n",
"\n",
" # sanity check\n",
" assert len(features_2d.columns) == 2\n",
"\n",
" # pandas to numpy array; get min/max values\n",
" xy = np.array(features_2d)\n",
" min_x, min_y = xy.min(axis=0)\n",
" max_x, max_y = xy.max(axis=0)\n",
"\n",
" # create mesh of NxN points; tech: `N*1j` is spec for including max value\n",
" XX, YY = np.mgrid[min_x : max_x : N * 1j, min_y : max_y : N * 1j]\n",
" points = np.c_[XX.ravel(), YY.ravel()] # shape: (N*N)x2\n",
" points = pd.DataFrame(points, columns=[\"x\", \"y\"])\n",
"\n",
" # apply scikit-learn API preprocessing\n",
" if preprocessing is not None:\n",
" points = preprocessing.transform(points)\n",
"\n",
" # classify grid points\n",
" classes = classifier.predict(points)\n",
"\n",
" # plot classes color mesh\n",
" ZZ = classes.reshape(XX.shape) # shape: NxN\n",
" plt.pcolormesh(\n",
" XX,\n",
" YY,\n",
" ZZ,\n",
" alpha=alpha,\n",
" cmap=matplotlib.colors.ListedColormap(colors),\n",
" shading=\"auto\",\n",
" )\n",
" # plot points\n",
" plt.scatter(\n",
" xy[:, 0],\n",
" xy[:, 1],\n",
" marker=marker,\n",
" color=samples_color(labels, colors=colors),\n",
" )\n",
" # set title\n",
" if title:\n",
" if hasattr(plt, \"set_title\"):\n",
" plt.set_title(title)\n",
" else:\n",
" plt.title(title)\n",
" # plot test points\n",
" if test_features_2d is not None:\n",
" assert test_labels is not None\n",
" assert len(test_features_2d.columns) == 2\n",
" test_xy = np.array(test_features_2d)\n",
" plt.scatter(\n",
" test_xy[:, 0],\n",
" test_xy[:, 1],\n",
" s=test_s,\n",
" facecolors=\"none\",\n",
" linewidths=2,\n",
" color=samples_color(test_labels),\n",
" );"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Since the latest version, `sklearn` offers its own method for visualizing decision boundaries: `DecisionBoundaryDisplay`. Documentation for this method can be found here: https://scikit-learn.org/stable/modules/generated/sklearn.inspection.DecisionBoundaryDisplay.html."
]
},
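{
"cell_type": "markdown",
"metadata": {},
"source": [
"As a brief, self-contained sketch (not used in the rest of this notebook, and assuming scikit-learn >= 1.1), this is roughly how `DecisionBoundaryDisplay` can be used; the synthetic two-moons data and the depth-4 tree are just placeholders for illustration:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# A minimal, self-contained sketch of sklearn's DecisionBoundaryDisplay utility.\n",
"# Assumptions: scikit-learn >= 1.1; the synthetic dataset and classifier are placeholders.\n",
"from sklearn.datasets import make_moons\n",
"from sklearn.inspection import DecisionBoundaryDisplay\n",
"from sklearn.tree import DecisionTreeClassifier\n",
"\n",
"X_demo, y_demo = make_moons(n_samples=200, noise=0.25, random_state=0)\n",
"demo_classifier = DecisionTreeClassifier(max_depth=4, random_state=0).fit(X_demo, y_demo)\n",
"\n",
"# plot the predicted class for each point of a grid spanning the data\n",
"display = DecisionBoundaryDisplay.from_estimator(\n",
"    demo_classifier, X_demo, response_method=\"predict\", alpha=0.3\n",
")\n",
"# overlay the training points on the same axes\n",
"display.ax_.scatter(X_demo[:, 0], X_demo[:, 1], c=y_demo, edgecolor=\"k\");"
]
},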
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Decision trees\n",
"\n",
"Let's see what a decision tree is by looking at an (artificial) example: \n",
"\n",
"Following the yes/no decision nodes from the top (root node) you will end up at a **leaf** which concludes if you should work or relax: \n",
"\n",
"<table>\n",
" <tr><td><img src=\"./images/decision_tree-work.png\" width=600px></td></tr>\n",
"</table>"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### How are the decision tree splits selected?\n",
"\n",
"Starting from the top the decision tree is build by selecting **best split of the dataset using a single feature**. \n",
"\n",
"Such a split is done on a single feature and splits your space into two halves.\n",
"\n",
"The optimal feature and its split value are determined for each node such that the resulting **subsets are as pure as possible** .\n",
"\n",
"Purity of a set is measured using so-called [gini-index](https://en.wikipedia.org/wiki/Diversity_index#Gini%E2%80%93Simpson_index). The lower this index the more pure a set is.\n",
"\n",
"The formula to compute this is index is $1 - p_0^2 - p_1^2$, where $p_i$ is the fraction of samples of class $i$ in the set. \n",
"\n",
"Thus for a set with only postive or negative samples either $p_0$ or $p_1$ is 1.0 and the other is 0, resulting in an optimal gini index of $0$.\n",
"\n",
"\n",
"\n",
"<table>\n",
" <tr><td><img src=\"./images/decision_tree-split.png\" width=600px></td></tr>\n",
"</table>"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"In the left tree the gini index of the left split is $0$, and the one from the right split is $1 - 0.4^2 - 0.6^2 = 0.48$. The weighted sum (using the number of elements in each set as weights) is then $0.48 * 5/7 = 0.34$\n",
"\n",
"An similar computation yields a value of $0.21$ for the right split, and thus the right split would be preferred when constructing the tree.\n"
]
},
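{
"cell_type": "markdown",
"metadata": {},
"source": [
"As a quick sanity check of the arithmetic above, here is a small sketch that recomputes the two weighted Gini values. The per-node class counts (2 + 5 samples for the left tree, 3 + 4 for the right tree) are read off the figure and chosen to be consistent with the quoted numbers:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Recompute the Gini values from the text above.\n",
"# Assumption: the left tree splits 7 samples into a pure group of 2 and a 2-vs-3 group;\n",
"# the right tree splits them into a pure group of 3 and a 1-vs-3 group.\n",
"\n",
"\n",
"def gini(counts):\n",
"    \"\"\"Gini index 1 - sum_i p_i^2 for a list of per-class counts.\"\"\"\n",
"    total = sum(counts)\n",
"    return 1 - sum((c / total) ** 2 for c in counts)\n",
"\n",
"\n",
"def weighted_gini(split_counts):\n",
"    \"\"\"Weighted sum of Gini indices, using subset sizes as weights.\"\"\"\n",
"    n = sum(sum(counts) for counts in split_counts)\n",
"    return sum(sum(counts) / n * gini(counts) for counts in split_counts)\n",
"\n",
"\n",
"print(\"left tree: \", round(weighted_gini([[2, 0], [2, 3]]), 2))  # ~0.34\n",
"print(\"right tree:\", round(weighted_gini([[3, 0], [1, 3]]), 2))  # ~0.21"
]
},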
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### XOR decision tree\n",
"\n",
"Let's try out decision trees with the XOR dataset, in which samples have class `True` when the two coordinates `x` and `y` have different sign, otherwise they have class `False`."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"\n",
"df = pd.read_csv(\"data/xor.csv\")\n",
"features_2d = df.loc[:, (\"x\", \"y\")]\n",
"labelv = df[\"label\"]\n",
"\n",
"plt.figure(figsize=(5, 5))\n",
"plt.xlabel(\"x\")\n",
"plt.ylabel(\"y\")\n",
"plt.title(\"Orange is True, blue is False\")\n",
"plt.scatter(features_2d.iloc[:, 0], features_2d.iloc[:, 1], color=samples_color(labelv));"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Decision trees live in the `sklearn.tree` module."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.model_selection import train_test_split\n",
"from sklearn.tree import DecisionTreeClassifier\n",
"\n",
"# Note: split randomness picked manually for educational purpose\n",
"X_train, X_test, y_train, y_test = train_test_split(\n",
" features_2d, labelv, random_state=10\n",
")\n",
"\n",
"# Note: features are permuted reandomly in case equally good splits are found\n",
"# fix randomization for reproduciblity\n",
"classifier = DecisionTreeClassifier(random_state=0)\n",
"classifier.fit(X_train, y_train)\n",
"\n",
"print(\"train score: {:.2f}%\".format(100 * classifier.score(X_train, y_train)))\n",
"print(\"test score: {:.2f}%\".format(100 * classifier.score(X_test, y_test)))\n",
"\n",
"plt.figure(figsize=(5, 5))\n",
"plot_decision_surface(\n",
" features_2d,\n",
" labelv,\n",
" classifier,\n",
" test_features_2d=X_test,\n",
" test_labels=y_test,\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"About the plot: **the points surrounded with a circle are from the test data set** (not used for learning), all other points belong to the training data.\n",
"\n",
"This surface seems a bit rough on edges. One of the biggest advantages of the decision trees is interpretability of the model. Let's **inspect the model by looking at the tree that was built**:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.tree import plot_tree\n",
"\n",
"fig = plt.figure(figsize=(12, 8))\n",
"fig.suptitle(\"XOR Decision Tree\")\n",
"plot_tree(classifier, feature_names=[\"x\", \"y\"], class_names=[\"False\", \"True\"]);"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<span style=\"font-size: 150%\">Whoaaa .. what happened here?</span>\n",
"\n",
"XOR is the **anti-example** for DTs!\n",
"\n",
"They cannot make the \"natural\" split at value `0`:\n",
"Either splitting at $x=0$ or $y=0$ would leave very impure splits with ~50% of samples from both classes. Instead the tree focusses on details resp. noise in the particular data set during construction.\n",
"\n",
"Note: This would be not an issue if a decision tree would not constructed in a greedy manner, split per split.\n",
"\n",
"\n",
"Moreover, the tree is quite deep because, by default, it is built until all nodes are \"pure\" (`gini = 0.0`). This tree is **overfitted**."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### How to avoid overfitting?\n",
"\n",
"There is no regularization penalty like in logistic regression or SVM methods when bulding a decision tree. Instead we can set learning hyperparameters such as:\n",
"or\n",
"* upper limit on tree depth (`max_depth`), or\n",
"* a minimum number of samples required at a node or at a leaf node (`min_samples_split`, `min_samples_leaf`), or\n",
"* an early stopping criteria based on minumum value of impurity or on minimum decrease in impurity (`min_impurity_split`, `min_impurity_decrease`),\n",
"* tree pruning (based on minimal cost-complexity; `ccp_alpha`) after the tree has been built, \n",
"* ... and few more - see `DecisionTreeClassifier` docs.\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Exercise section\n",
"\n",
"1. In theory for the XOR dataset it should suffice to use each feature exactly once with splits at `0`, but the decision tree learning algorithm is unable to find such a solution. Play around with `max_depth` to get a smaller but similarly performing decision tree for the XOR dataset.<br/>\n",
" Bonus question: which other hyperparameter could you have used to get the same result?\n",
"\n",
"2. Build a decision tree for the beers dataset. Use maximum depth and tree pruning strategies to get a much smaller tree that performs as well as the default tree.<br/>\n",
" Note: `classifier.tree_` instance has attributes such as `max_depth`, `node_count`, or `n_leaves`, which measure size of the tree."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.tree import DecisionTreeClassifier, plot_tree\n",
"\n",
"df = pd.read_csv(\"data/xor.csv\")\n",
"features_2d = df.loc[:, (\"x\", \"y\")]\n",
"labelv = df[\"label\"]\n",
"\n",
"max_depths = [2, 3, 4]\n",
"# ..."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.pipeline import make_pipeline\n",
"from sklearn.preprocessing import StandardScaler\n",
"from sklearn.tree import DecisionTreeClassifier, plot_tree\n",
"\n",
"df = pd.read_csv(\"data/beers.csv\")\n",
"features = df.iloc[:, :-1]\n",
"labelv = df.iloc[:, -1]\n",
"# ..."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": [
"solution"
]
},
"outputs": [],
"source": [
"# SOLUTION 1\n",
"import pandas as pd\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.tree import DecisionTreeClassifier, plot_tree\n",
"\n",
"df = pd.read_csv(\"data/xor.csv\")\n",
"features_2d = df.loc[:, (\"x\", \"y\")]\n",
"labelv = df[\"label\"]\n",
"\n",
"X_train, X_test, y_train, y_test = train_test_split(\n",
" features_2d, labelv, random_state=10\n",
")\n",
"\n",
"max_depths = [2, 3, 4]\n",
"\n",
"n_params = len(max_depths)\n",
"fig, ax_arr = plt.subplots(ncols=n_params, nrows=2, figsize=(7 * n_params, 7 * 2))\n",
"fig.suptitle(\"smaller XOR Decision Trees\")\n",
"for i, max_depth in enumerate(max_depths):\n",
"\n",
" classifier = DecisionTreeClassifier(\n",
" max_depth=max_depth,\n",
" random_state=0,\n",
" )\n",
" classifier.fit(X_train, y_train)\n",
"\n",
" ax = ax_arr[0, i]\n",
" plot_tree(\n",
" classifier,\n",
" feature_names=features_2d.columns.values,\n",
" class_names=[\"False\", \"True\"],\n",
" ax=ax,\n",
" fontsize=7,\n",
" )\n",
" ax.set_title(\n",
" (\n",
" f\"max depth = {max_depth}\\n\"\n",
" f\"train score: {100 * classifier.score(X_train, y_train):.2f}%\\n\"\n",
" f\"test score: {100 * classifier.score(X_test, y_test):.2f}%\"\n",
" )\n",
" )\n",
"\n",
" ax = ax_arr[1, i]\n",
" plot_decision_surface(\n",
" features_2d,\n",
" labelv,\n",
" classifier,\n",
" test_features_2d=X_test,\n",
" test_labels=y_test,\n",
" plt=ax,\n",
" )\n",
"\n",
"# We could have used equivalently `min_impurity_split` early stopping criterium with any (gini) value between 0.15 and 0.4"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# SOLUTION 2\n",
"import pandas as pd\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.pipeline import make_pipeline\n",
"from sklearn.preprocessing import StandardScaler\n",
"from sklearn.tree import DecisionTreeClassifier, plot_tree\n",
"\n",
"df = pd.read_csv(\"data/beers.csv\")\n",
"print(df.head(2))\n",
"\n",
"features = df.iloc[:, :-1]\n",
"labelv = df.iloc[:, -1]\n",
"\n",
"X_train, X_test, y_train, y_test = train_test_split(features, labelv, random_state=10)\n",
"\n",
"# default\n",
"classifier = DecisionTreeClassifier(random_state=0)\n",
"pipeline = make_pipeline(StandardScaler(), classifier)\n",
"pipeline.fit(X_train, y_train)\n",
"print()\n",
"print(\"#### default Beers Decision Tree\")\n",
"print(\n",
" f\"depth: {classifier.tree_.max_depth}, \",\n",
" f\"#nodes: {classifier.tree_.node_count}, \",\n",
" f\"#leaves: {classifier.tree_.n_leaves}\",\n",
")\n",
"print(f\"train score: {100 * pipeline.score(X_train, y_train):.2f}%\")\n",
"print(f\" test score: {100 * pipeline.score(X_test, y_test):.2f}%\")\n",
"\n",
"# smaller\n",
"classifier = DecisionTreeClassifier(max_depth=4, ccp_alpha=0.02, random_state=0)\n",
"pipeline = make_pipeline(StandardScaler(), classifier)\n",
"pipeline.fit(X_train, y_train)\n",
"print()\n",
"print(\"#### smaller Beers Decision Tree\")\n",
"print(\n",
" f\"depth: {classifier.tree_.max_depth}, \",\n",
" f\"#nodes: {classifier.tree_.node_count}, \",\n",
" f\"#leaves: {classifier.tree_.n_leaves}\",\n",
")\n",
"print(f\"train score: {100 * pipeline.score(X_train, y_train):.2f}%\")\n",
"print(f\" test score: {100 * pipeline.score(X_test, y_test):.2f}%\")\n",
"\n",
"fig = plt.figure(figsize=(10, 6))\n",
"plot_tree(classifier, feature_names=features.columns.values);"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"One **issue with decision trees is their instability** - a small changes in the training data usually results in a completely different order of splits (different tree structure)."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Ensemble Averaging: Random Forests\n",
"\n",
"The idea of Random Forest method is to generate **ensemble of many \"weak\" decision trees** and by **averaging out their probabilistic predictions**. (The original Random Forests method used voting.)\n",
"\n",
"\n",
"Weak classifier here are **shallow trees with feature-splits picked only out of random subsets of features** (*features bagging*). Random subset of features is selected per each split, not for the whole classifier.\n",
"\n",
"<table>\n",
" <tr><td><img src=\"./images/random_forest.png\" width=800px></td></tr>\n",
" <tr><td><center><sub>Source: <a href=\"https://towardsdatascience.com/random-forests-and-decision-trees-from-scratch-in-python-3e4fa5ae4249\">https://towardsdatascience.com/random-forests-and-decision-trees-from-scratch-in-python-3e4fa5ae4249</a></sub></center></td></tr>\n",
"</table>\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Demonstration\n",
"\n",
"You will find Random Forest method implementation in the `sklearn.ensemble` module.\n",
"\n",
"The main parameters are:\n",
"* number of trees (`n_estimators`),\n",
"* each tree max. depth 2 (`max_depth`), and\n",
"* max. number of randomly selected features to pick from when building each tree (`max_features`).\n",
"\n",
"Let's build a small Random Forest and have a look at its trees, available under `.estimators_` property."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"from sklearn.ensemble import RandomForestClassifier\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.pipeline import make_pipeline\n",
"from sklearn.preprocessing import StandardScaler\n",
"from sklearn.tree import plot_tree\n",
"\n",
"df = pd.read_csv(\"data/beers.csv\")\n",
"print(df.head(2))\n",
"\n",
"features = df.iloc[:, :-1]\n",
"labelv = df.iloc[:, -1]\n",
"\n",
"X_train, X_test, y_train, y_test = train_test_split(features, labelv, random_state=10)\n",
"\n",
"# 4 shallow (depth 2) trees, each using only 3 randomly selected features\n",
"# total: up to 4*3 decision nodes, up to 4*4 class nodes\n",
"n_trees = 4\n",
"classifier = RandomForestClassifier(\n",
" max_depth=2,\n",
" n_estimators=n_trees,\n",
" max_features=3,\n",
" random_state=0,\n",
")\n",
"pipeline = make_pipeline(StandardScaler(), classifier)\n",
"pipeline.fit(X_train, y_train)\n",
"\n",
"print()\n",
"print(\"#### Random Forest\")\n",
"print(f\"train score: {100 * pipeline.score(X_train, y_train):.2f}%\")\n",
"print(f\" test score: {100 * pipeline.score(X_test, y_test):.2f}%\")\n",
"\n",
"# to evaluate ensemble estimators, we need to use transformed data\n",
"X_train_trans = pipeline[:-1].transform(X_train)\n",
"X_test_trans = pipeline[:-1].transform(X_test)\n",
"\n",
"fig, ax_arr = plt.subplots(ncols=n_trees, nrows=1, figsize=(7 * n_trees, 5))\n",
"for i, internal_classifier in enumerate(classifier.estimators_):\n",
" ax = ax_arr[i]\n",
" plot_tree(internal_classifier, feature_names=features.columns.values, ax=ax)\n",
" ax.set_title(\n",
" (\n",
" f\"Tree #{i}\\n\"\n",
" f\"train score: {100 * internal_classifier.score(X_train_trans, y_train):.2f}%\\n\"\n",
" f\" test score: {100 * internal_classifier.score(X_test_trans, y_test):.2f}%\"\n",
" )\n",
" )"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Random forests are fast and shine with high dimensional data (many features).\n",
"\n",
"<div class=\"alert alert-block alert-info\">\n",
" <p><i class=\"fa fa-info-circle\"></i>\n",
" Random Forest can estimate <em>out-of-bag error</em> (OOB) while learning; set <code>oob_score=True</code>. (The out-of-bag (OOB) error is the average error for each data sample, calculated using predictions from the trees that do not contain that sample in their respective bootstrap samples.)\n",
" OOB is a generalisation/predictive error that, together with <code>warm_start=True</code>, can be used for efficient search for a good-enough number of trees, i.e. the <code>n_estimators</code> hyperparameter value (see: <a href=https://scikit-learn.org/stable/auto_examples/ensemble/plot_ensemble_oob.html>OOB Errors for Random Forests</a>).\n",
" </p>\n",
"</div>"
]
},
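{
"cell_type": "markdown",
"metadata": {},
"source": [
"A minimal sketch of such an OOB-based search, assuming `X_train` and `y_train` from the beers demonstration above are still in memory (no scaling is applied here, since trees are insensitive to feature scales):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sketch: grow a random forest incrementally and monitor the out-of-bag score.\n",
"# Assumptions: X_train / y_train from the beers example above; default bootstrap=True.\n",
"from sklearn.ensemble import RandomForestClassifier\n",
"\n",
"forest = RandomForestClassifier(\n",
"    oob_score=True,\n",
"    warm_start=True,  # keep the already fitted trees when n_estimators grows\n",
"    random_state=0,\n",
"    n_estimators=1,\n",
")\n",
"for n_estimators in [10, 25, 50, 100, 200]:\n",
"    forest.set_params(n_estimators=n_estimators)\n",
"    forest.fit(X_train, y_train)  # only the additional trees are fitted\n",
"    print(f\"{n_estimators:4d} trees: OOB score {forest.oob_score_:.3f}\")"
]
},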
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Boosting: AdaBoost\n",
"\n",
"<span style=\"font-size: 125%;\">What is it?</span>\n",
"\n",
"Boosting is another sub-type of ensemble learning. Same as in averaging, the idea is to generate many **weak classifiers to create a single strong classifier**, but in contrast to averaging, the classifiers are learnt **iteratively**.\n",
"\n",
"<span style=\"font-size: 125%;\">How does it work?</span>\n",
"\n",
"Each iteration focuses more on **previously misclassified samples**. To that end, **data samples are weighted**, and after each learning iteration the data weights are readjusted.\n",
"\n",
"<table>\n",
" <tr><td><img src=\"./images/AdaBoost.png\" width=800px></td></tr>\n",
" <tr><td><center><sub>Source: Marsh, B., (2016), <em>Multivariate Analysis of the Vector Boson Fusion Higgs Boson</em>.</sub></center></td></tr>\n",
"</table>\n",
"\n",
"The final prediction is a weighted majority vote or weighted sum of predictions of the weighted weak classifiers.\n",
"\n",
"Boosting works very well out of the box. There is usually no need to fine tune method hyperparameters to get good performance.\n",
"\n",
"<span style=\"font-size: 125%;\">Where do i start?</span>\n",
"\n",
"**AdaBoost (\u201cAdaptive Boosting\u201d) is a baseline boosting algorithm** that originally used decisoin trees as weak classifiers, but, in principle, works with any classification method (`base_estimator` parameter).\n",
"\n",
"In each AdaBoost learning iteration, additionally to samples weights, the **weak classifiers are weighted**. Their weights are readjusted, such that **the more accurate a weak classifier is, the larger its weight is**.\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Demonstration\n",
"\n",
"You will find AdaBoost algorithm implementation in the `sklearn.ensemble` module.\n",
"\n",
"We'll use `n_estimators` parameter to determine number of weak classifiers. These by default are single node decision trees (`base_estimator = DecisionTreeClassifier(max_depth=1)`). We can examine them via `.estimators_` property of a trained method.\n",
"\n",
"For presentation, in order to weight the classifiers, we will use the original discrete AdaBoost learning method (`algorithm=\"SAMME\"`). Because the classifiers learn iteratively on differently weighted samples, to understand the weights we have to look at internal train errors and not at the final scores on the training data."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from math import ceil, floor\n",
"\n",
"import pandas as pd\n",
"from sklearn.ensemble import AdaBoostClassifier\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.pipeline import make_pipeline\n",
"from sklearn.preprocessing import StandardScaler\n",
"from sklearn.tree import plot_tree\n",
"\n",
"df = pd.read_csv(\"data/beers.csv\")\n",
"print(df.head(2))\n",
"\n",
"features = df.iloc[:, :-1]\n",
"labelv = df.iloc[:, -1]\n",
"\n",
"X_train, X_test, y_train, y_test = train_test_split(features, labelv, random_state=10)\n",
"\n",
"# 9 single node decision trees\n",
"# total: 9*1 decision nodes, 9*2 class nodes\n",
"# (Note: with default real AdaBoost \"SAMME.R\" algorithm all weights are 1 at the end)\n",
"n_trees = 9\n",
"classifier = AdaBoostClassifier(n_estimators=n_trees, algorithm=\"SAMME\", random_state=0)\n",
"pipeline = make_pipeline(StandardScaler(), classifier)\n",
"pipeline.fit(X_train, y_train)\n",
"\n",
"print()\n",
"print(\"AdaBoost\")\n",
"print(f\"train score: {100 * pipeline.score(X_train, y_train):.2f}%\")\n",
"print(f\"test score: {100 * pipeline.score(X_test, y_test):.2f}%\")\n",
"\n",
"# to evaluate ensemble estimators, we need to use transformed data\n",
"X_train_trans = pipeline[:-1].transform(X_train)\n",
"X_test_trans = pipeline[:-1].transform(X_test)\n",
"\n",
"fig, ax_arr = plt.subplots(ncols=n_trees, nrows=1, figsize=(5 * n_trees, 4))\n",
"for i, internal_classifier in enumerate(classifier.estimators_):\n",
" ax = ax_arr[i]\n",
" plot_tree(internal_classifier, feature_names=features.columns.values, ax=ax)\n",
" ax.set_title(\n",
" (\n",
" f\"Tree #{i}, weight: {classifier.estimator_weights_[i]:.2f}\\n\"\n",
" f\"train error: {classifier.estimator_errors_[i]:.2f}\\n\"\n",
" f\"(train score: {100 * internal_classifier.score(X_train_trans, y_train):.2f}%)\\n\"\n",
" f\"test score: {100 * internal_classifier.score(X_test_trans, y_test):.2f}%\"\n",
" )\n",
" )"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Other boosting methods\n",
"\n",
"In practice you will mostly want to use other than AdaBoost methods for boosting.\n",
"\n",
"#### Gradient Tree Boosting (GTB)\n",
"\n",
"It re-formulates boosting problem as an optimization problem which is solved with efficient Stochastic Gradient Descent optimization method (more on that in the neuronal networks script).\n",
"\n",
"In contrast to AdaBoost, GTB relies on using decision trees.\n",
"\n",
"In particular, try out [XGboost](https://xgboost.readthedocs.io/en/latest/); it's a package that won many competitions, cf. [XGboost@Kaggle](https://www.kaggle.com/dansbecker/xgboost). It is not part of scikit-learn, but it offers a `scikit-learn` API (see https://www.kaggle.com/stuarthallows/using-xgboost-with-scikit-learn ); a `scikit-learn` equivalent is [`GradientBoostingClassifier`](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingClassifier.html).\n",
"\n",
"#### Histogram-based Gradient Boosting Classification Tree.\n",
"\n",
"A new `scikit-learn` implementation of boosting based on decision trees is [`HistGradientBoostingClassifier`](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.HistGradientBoostingClassifier.html). It is much faster then `GradientBoostingClassifier` for big datasets (`n_samples >= 10 000`).\n",
"\n",
"\n"
]
},
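{
"cell_type": "markdown",
"metadata": {},
"source": [
"For comparison, a minimal sketch of `HistGradientBoostingClassifier` on the beers data, assuming `X_train`, `X_test`, `y_train` and `y_test` from the AdaBoost demonstration above are still defined (the `max_depth` value is an arbitrary choice for illustration):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sketch: histogram-based gradient boosting on the beers data.\n",
"# Assumptions: train/test split from the AdaBoost demonstration above; max_depth chosen arbitrarily.\n",
"from sklearn.ensemble import HistGradientBoostingClassifier\n",
"\n",
"hgb_classifier = HistGradientBoostingClassifier(max_depth=3, random_state=0)\n",
"hgb_classifier.fit(X_train, y_train)  # tree-based, so no feature scaling required\n",
"\n",
"print(f\"train score: {100 * hgb_classifier.score(X_train, y_train):.2f}%\")\n",
"print(f\" test score: {100 * hgb_classifier.score(X_test, y_test):.2f}%\")"
]
},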
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Ensemble Stacking: a honorary mention\n",
"\n",
"Stacking is used often in case of different types of base models, when it's not clear which type of model will perform best.\n",
"\n",
"The stacking architectore combines multiple classfiers which feed into a final meta-model (called also *combiner*, *blender*, or *generalizer*). Here is an example:\n",
"\n",
"\n",
"<table>\n",
" <tr><td><img src=\"./images/ensemble-learning-stacking.png\" width=\"400px\"></td></tr>\n",
" <tr><td><center><sub><a href=\"https://data-science-blog.com/blog/2017/12/03/ensemble-learning/\">https://data-science-blog.com/blog/2017/12/03/ensemble-learning/</a></sub></center></td></tr>\n",
"</table>\n",
"\n",
"The learning procedure is as follows:\n",
"- split your training data (e.g. in an 80-20 split),\n",
"- train the first layer on the 80% of the data,\n",
"- feed the remaining 20% through the first layer,\n",
"- train the meta-model on the output from these 20%.\n",
"\n",
"The idea is that the meta-model also is able to correct overfitting from the first layer.\n",
"\n",
"Stacking combines strengths of different models and usually slightly outperforms the best individual model. In practice often multiple stacking layers are used with groups of different but repeating types of classifiers.\n",
"\n",
"<table>\n",
" <tr><td><center><img src=\"./images/ensemble-learning-stacking-kdd_2015_winner.png\" width=\"800px\"></center></td></tr>\n",
" <tr><td><center><sub>KDD Cup 2015 winner</sub></center></td></tr>\n",
" <tr><td><center><sub>GBM: Gradient Boosting Machine; NN: Neural Network; FM: Factorization Machine; LR: Logistic Regression; KRR: Kernel Ridge Regression; ET: Extra Trees; RF: Random Forests; KNN: K-Nearest Neighbors</sub></center></td></tr>\n",
" <tr><td><center><sub><a href=\"https://www.slideshare.net/jeongyoonlee/winning-data-science-competitions-74391113\"> Jeong-Yoon Lee, <em>Winning Data Science Competitions</em>, Apr 2017</a></sub></center></td></tr>\n",
"</table>\n",
"\n",
"In the `sklearn.ensemble` the stacking is implemented by `StackingClassifier` and `StackingRegressor`."
]
},
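{
"cell_type": "markdown",
"metadata": {},
"source": [
"A minimal sketch of a single stacking layer with `StackingClassifier`, reusing the beers train/test split from above; the particular choice of base models and meta-model here is only for illustration:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sketch: stack a random forest and a (scaled) kernel SVM, blended by logistic regression.\n",
"# Assumptions: X_train / X_test / y_train / y_test from the beers examples above;\n",
"# the base models and the meta-model are arbitrary illustrative choices.\n",
"from sklearn.ensemble import RandomForestClassifier, StackingClassifier\n",
"from sklearn.linear_model import LogisticRegression\n",
"from sklearn.pipeline import make_pipeline\n",
"from sklearn.preprocessing import StandardScaler\n",
"from sklearn.svm import SVC\n",
"\n",
"stack = StackingClassifier(\n",
"    estimators=[\n",
"        (\"forest\", RandomForestClassifier(max_depth=4, random_state=0)),\n",
"        (\"svc\", make_pipeline(StandardScaler(), SVC())),\n",
"    ],\n",
"    final_estimator=LogisticRegression(),\n",
"    cv=5,  # internal cross-validation producing the meta-model's training data\n",
")\n",
"stack.fit(X_train, y_train)\n",
"\n",
"print(f\"train score: {100 * stack.score(X_train, y_train):.2f}%\")\n",
"print(f\" test score: {100 * stack.score(X_test, y_test):.2f}%\")"
]
},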
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Why does ensemble learning work?\n",
"\n",
"* Probability of making an error by majority of the classifiers in the ensemble is much lower then error that each of the weak classifiers makes alone.\n",
"\n",
"* An ensemble classifier is more roboust (has lower variance) with respect to the training data.\n",
"\n",
"* The weak classifiers are small, fast to learn, and, in case of averaging, they can be learnt in parallel.\n",
"\n",
"In general, **usually ensemble classifier performs better than any of the weak classifiers in the ensemble**."
]
},
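{
"cell_type": "markdown",
"metadata": {},
"source": [
"A small back-of-the-envelope illustration of the first point, under the (idealised) assumption that the weak classifiers err independently, each with probability 0.4:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Probability that a majority vote of n weak classifiers is wrong, assuming each\n",
"# classifier errs independently with probability p (an idealised assumption).\n",
"from scipy.stats import binom\n",
"\n",
"p = 0.4  # error rate of a single weak classifier\n",
"for n in [1, 11, 51, 101]:\n",
"    # majority wrong <=> more than half of the n classifiers are wrong\n",
"    majority_error = 1 - binom.cdf(n // 2, n, p)\n",
"    print(f\"{n:3d} classifiers: majority-vote error {majority_error:.3f}\")"
]
},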
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Coding session\n",
"\n",
"Apply all classifiers seen so far to the MNIST images dataset (`from sklearn.datasets import load_digits`). To that end:\n",
"\n",
"1. split your data into training and a test \"holdout\" data (`from sklearn.model_selection import train_test_split`) in 80:20 proporition; stratify the 10 target classes (`stratify=...`);\n",
"2. compare cross validation mean f1 scores; use the stratified k-fold CV strategy (`from sklearn.model_selection import cross_val_score, StratifiedKFold`; attention: this is a non-binary multi-class problem, you will have to use an adjusted f1 score, e.g. unweighted per class mean via `scoring=\"f1_macro\"` keyword arg - see [the `scoring` parameter predefined values](https://scikit-learn.org/stable/modules/model_evaluation.html#the-scoring-parameter-defining-model-evaluation-rules));\n",
"3. train the final model on the whole dataset and, use the test \"holdout\" data to repot a final model f1 performance; report also accuracy, precision and recall scores (`from sklearn.metrics import classification_report`);\n",
"4. next, try to manually tune hyperparameters to minimize the train test gap and to squeeze out at least 90% cross validation f1 score performance out of each classifier; try using PCA preprocessing (`sklearn.pipeline.make_pipeline`, `sklearn.decomposition.PCA`); what about data scaling? which models are most effective and easiest to tune manually?\n",
"5. optionally, once you get a feel of good preprocessing and classifier hyperparameters, define parameters ranges and apply a CV-based search to find optimal values for the hyperparameters (`from sklearn.model_selection import GridSearchCV, RandomizedSearchCV`)."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"from sklearn.datasets import load_digits\n",
"from sklearn.decomposition import PCA\n",
"\n",
"# classifiers\n",
"from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier\n",
"from sklearn.linear_model import LogisticRegression\n",
"from sklearn.metrics import classification_report\n",
"from sklearn.model_selection import StratifiedKFold, cross_val_score, train_test_split\n",
"from sklearn.pipeline import make_pipeline\n",
"from sklearn.svm import SVC, LinearSVC\n",
"from sklearn.tree import DecisionTreeClassifier\n",
"\n",
"digits = load_digits()\n",
"\n",
"labels = digits.target\n",
"n_samples = len(labels)\n",
"\n",
"# flatten images of shape N_SAMPLES x 8 x 8 to N_SAMPLES x 64:\n",
"print(\"digits data set shape:\", digits.images.shape)\n",
"features = digits.images.reshape((n_samples, -1))\n",
"print(\"feature matrix shape:\", features.shape)\n",
"\n",
"# (\n",
"# features_train,\n",
"# features_test,\n",
"# labels_train,\n",
"# labels_test,\n",
"# ) = train_test_split(..., test_size=..., stratify=..., random_state=42)\n",
"#\n",
"# classifiers = [\n",
"# LogisticRegression(...),\n",
"# LinearSVC(...),\n",
"# SVC(...),\n",
"# DecisionTreeClassifier(..., random_state=42), # rather won't do very well when not used in an ensemble method\n",
"# RandomForestClassifier(..., random_state=42),\n",
"# AdaBoostClassifier(..., random_state=42),\n",
"# ]\n",
"#\n",
"# pipeline = make_pipeline(...)\n",
"#\n",
"# cross_validator = StratifiedKFold(...)\n",
"# scores = cross_val_score(..., scoring=..., cv=...)\n",
"# ..."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": [
"solution"
]
},
"outputs": [],
"source": [
"# SOLUTION\n",
"import numpy as np\n",
"from sklearn.datasets import load_digits\n",
"from sklearn.decomposition import PCA\n",
"\n",
"# classifiers\n",
"from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier\n",
"from sklearn.linear_model import LogisticRegression\n",
"from sklearn.metrics import classification_report\n",
"from sklearn.model_selection import StratifiedKFold, cross_val_score, train_test_split\n",
"from sklearn.pipeline import make_pipeline\n",
"from sklearn.svm import SVC, LinearSVC\n",
"from sklearn.tree import DecisionTreeClassifier\n",
"\n",
"digits = load_digits()\n",
"\n",
"labels = digits.target\n",
"n_samples = len(labels)\n",
"\n",
"# flatten images of shape N_SAMPLES x 8 x 8 to N_SAMPLES x 64:\n",
"print(\"digits data set shape:\", digits.images.shape)\n",
"features = digits.images.reshape((n_samples, -1))\n",
"print(\"feature matrix shape:\", features.shape)\n",
"\n",
"(\n",
" features_train,\n",
" features_test,\n",
" labels_train,\n",
" labels_test,\n",
") = train_test_split(features, labels, test_size=0.2, stratify=labels, random_state=42)\n",
"\n",
"classifiers = [\n",
" LogisticRegression(C=10, penalty=\"l1\", solver=\"liblinear\", max_iter=10_000),\n",
" LinearSVC(C=1, penalty=\"l1\", dual=False, max_iter=50_000),\n",
" SVC(C=10, gamma=0.001),\n",
" DecisionTreeClassifier(\n",
" max_depth=9, random_state=42\n",
" ), # rather won't do very well when not used in an ensemble method\n",
" RandomForestClassifier(\n",
" max_depth=6,\n",
" n_estimators=50,\n",
" max_features=\"log2\",\n",
" random_state=42,\n",
" ),\n",
" AdaBoostClassifier(\n",
" base_estimator=DecisionTreeClassifier(\n",
" max_depth=3\n",
" ), # in CV search use: adaboostclassifier__base_estimator__max_depth\n",
" n_estimators=100,\n",
" algorithm=\"SAMME\", # works better than default for a multi-class problem\n",
" random_state=42,\n",
" ),\n",
"]\n",
"\n",
"# We do already account for class imbalances in train/test and CV splits via stratification\n",
"# => take an unweighted f1 score per class mean (\"f1_macro\"), thus, keeping f1 score in between\n",
"# the precision and recall scores.\n",
"scoring = \"f1_macro\"\n",
"pca_n_components = (\n",
" features.shape[1] // 3\n",
") # accuracy-wise does not add much but speeds up training\n",
"cv_n_splits = 10\n",
"\n",
"for classifier in classifiers:\n",
" print()\n",
" print(f\"#### {classifier}\")\n",
" pipeline = make_pipeline(PCA(n_components=pca_n_components), classifier)\n",
" cross_validator = StratifiedKFold(\n",
" n_splits=cv_n_splits, shuffle=True, random_state=42\n",
" )\n",
" scores = cross_val_score(\n",
" pipeline, features_train, labels_train, scoring=scoring, cv=cross_validator\n",
" )\n",
" print()\n",
" print(f\"train {cv_n_splits}-fold CV mean {scoring}:\")\n",
" print(f\"\\t{scores.mean():.2f} +/- {scores.std():.2f}\")\n",
" print()\n",
" # the final model - train on the whole dataset\n",
" classifier.fit(features_train, labels_train)\n",
" # the final \"holdout\" test\n",
" print(f\"test score:\")\n",
" predicted_test = classifier.predict(features_test)\n",
" print(\n",
" classification_report(\n",
" labels_test,\n",
" predicted_test,\n",
" )\n",
" )"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Summary"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Below you will find a table with some guidelines, as well as pros and cons of different classication methods available in scikit-learn.\n",
"\n",
"<div class=\"alert alert-block alert-warning\">\n",
" <p><i class=\"fa fa-warning\"></i> <strong>Summary table</strong></p>\n",
"\n",
"<p>\n",
"<em>Disclaimer</em>: this table is neither a single source of truth nor complete - it's intended only to provide some first considerations when starting out. At the end of the day, you have to try and pick a method that works for your problem/data.\n",
"</p>\n",
"\n",
"<table>\n",
"<thead>\n",
"<tr>\n",
"<th style=\"text-align: center;\">Classifier type</th>\n",
"<th style=\"text-align: center;\">When?</th>\n",
"<th style=\"text-align: center;\">Advantages</th>\n",
"<th style=\"text-align: center;\">Disadvantages</th>\n",
"</tr>\n",
"</thead>\n",
"<tbody>\n",
"<tr>\n",
"<td style=\"text-align: left;\">Nearest Neighbors<br><br><code>KNeighborsClassifier</code></td>\n",
"<td style=\"text-align: left;\">- numeric data<br> - when (fast) linear classifiers do not work</td>\n",
"<td style=\"text-align: left;\">- simple (not many parameters to tweak), hence, a good baseline classifier</td>\n",
"<td style=\"text-align: left;\">- known not to work well for many dimensions (20 or even less features)</td>\n",
"</tr>\n",
"<tr>\n",
"<td style=\"text-align: left;\">Logistic Regression<br><br><code>LogisticRegression</code></td>\n",
"<td style=\"text-align: left;\">- high-dimensional data<br> - a lot of data</td>\n",
"<td style=\"text-align: left;\">- fast, also in high dimensions<br> - weights can be interpreted</td>\n",
"<td style=\"text-align: left;\">- data has to be linearly separable (happens often in higher dimensions)<br> - not very efficient with large number of samples</td>\n",
"</tr>\n",
"<tr>\n",
"<td style=\"text-align: left;\">Linear SVM<br><br><code>LinearSVC</code></td>\n",
"<td style=\"text-align: left;\">same as above but might be better for text analysis (many features)</td>\n",
"<td style=\"text-align: left;\">same as above but might be better with very large number of features</td>\n",
"<td style=\"text-align: left;\">same as above but possibly a bit better with large number of samples</td>\n",
"</tr>\n",
"<tr>\n",
"<td style=\"text-align: left;\">Kernel SVM<br><br><code>SVC</code></td>\n",
"<td style=\"text-align: left;\">same as above but when linear SVM does not work<br>- not too many data points</td>\n",
"<td style=\"text-align: left;\">same as above but learns non-linear boundaries</td>\n",
"<td style=\"text-align: left;\">same as above but much slower and requires data scaling<br>- model is not easily interpretable</td>\n",
"</tr>\n",
"<tr>\n",
"<td style=\"text-align: left;\">Decision Tree<br><br><code>DecisionTreeClassifier</code></td>\n",
"<td style=\"text-align: left;\">- for illustration/insight<br> - with multi-class problems <br> - with categorical or mixed categorical and numerical data</td>\n",
"<td style=\"text-align: left;\">- simple to interpret<br> - good classification speed and performance</td>\n",
"<td style=\"text-align: left;\">- prone to overfitting<br> - unstable: small change in the training data can give very different model</td>\n",
"</tr>\n",
"<tr>\n",
"<td style=\"text-align: left;\">Ensemble Averaging<br><br><code>RandomForestClassifier</code></td>\n",
"<td style=\"text-align: left;\">- when decision tree would be used but for performance</td>\n",
"<td style=\"text-align: left;\">- fixes decision tree issues: does not overfit easily and is stable with respect to training data<br> - takes into account features dependencies<br> - can compute predicitve error when learning<br> ...</td>\n",
"<td style=\"text-align: left;\">- harder to interpret than a single decision tree</td>\n",
"</tr>\n",
"<tr>\n",
"<td style=\"text-align: left;\">Boosting<br><br><code>AdaBoostClassifier</code> (<code>XGBClassifier</code>, <code>HistGradientBoostingClassifier</code>)</td>\n",
"<td style=\"text-align: left;\">same as above</td>\n",
"<td style=\"text-align: left;\">- works very well out-of-the-box<br>- better performance and more interpretable than random forest when using depth 1 trees</td>\n",