{
  "cells": [
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "tags": []
      },
      "outputs": [],
      "source": [
        "# IGNORE THIS CELL WHICH CUSTOMIZES LAYOUT AND STYLING OF THE NOTEBOOK !\n",
        "%matplotlib inline\n",
        "%config InlineBackend.figure_format = 'retina'\n",
        "import warnings\n",
        "\n",
        "import matplotlib.pyplot as plt\n",
        "\n",
        "warnings.filterwarnings(\"ignore\", category=FutureWarning)\n",
        "warnings.filterwarnings(\n",
        "    \"ignore\",\n",
        "    message=\"X does not have valid feature names, but [a-zA-Z]+ was fitted with feature names\",\n",
        "    category=UserWarning,\n",
        ")\n",
        "\n",
        "warnings.filterwarnings = lambda *a, **kw: None\n",
        "from IPython.core.display import HTML\n",
        "\n",
        "HTML(open(\"custom.html\", \"r\").read())"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "# Chapter 6: An overview of classifiers, Part 2\n",
        "\n",
        "<span style=\"font-size: 150%;\">Decision trees, ensemble methods and summary</span>"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "Let's repeat our helper functions from previous part:"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "tags": []
      },
      "outputs": [],
      "source": [
        "import matplotlib\n",
        "import matplotlib.pyplot as plt\n",
        "import numpy as np\n",
        "\n",
        "\n",
        "def samples_color(ilabels, colors=[\"steelblue\", \"chocolate\"]):\n",
        "    \"\"\"Return colors list from labels list given as indices.\"\"\"\n",
        "    return [colors[int(i)] for i in ilabels]\n",
        "\n",
        "\n",
        "def plot_decision_surface(\n",
        "    features_2d,\n",
        "    labels,\n",
        "    classifier,\n",
        "    preprocessing=None,\n",
        "    plt=plt,\n",
        "    marker=\".\",\n",
        "    N=100,\n",
        "    alpha=0.2,\n",
        "    colors=[\"steelblue\", \"chocolate\"],\n",
        "    title=None,\n",
        "    test_features_2d=None,\n",
        "    test_labels=None,\n",
        "    test_s=60,\n",
        "):\n",
        "    \"\"\"Plot a 2D decision surface for a already trained classifier.\"\"\"\n",
        "\n",
        "    # sanity check\n",
        "    assert len(features_2d.columns) == 2\n",
        "\n",
        "    # pandas to numpy array; get min/max values\n",
        "    xy = np.array(features_2d)\n",
        "    min_x, min_y = xy.min(axis=0)\n",
        "    max_x, max_y = xy.max(axis=0)\n",
        "\n",
        "    # create mesh of NxN points; tech: `N*1j` is spec for including max value\n",
        "    XX, YY = np.mgrid[min_x : max_x : N * 1j, min_y : max_y : N * 1j]\n",
        "    points = np.c_[XX.ravel(), YY.ravel()]  # shape: (N*N)x2\n",
        "    points = pd.DataFrame(points, columns=[\"x\", \"y\"])\n",
        "\n",
        "    # apply scikit-learn API preprocessing\n",
        "    if preprocessing is not None:\n",
        "        points = preprocessing.transform(points)\n",
        "\n",
        "    # classify grid points\n",
        "    classes = classifier.predict(points)\n",
        "\n",
        "    # plot classes color mesh\n",
        "    ZZ = classes.reshape(XX.shape)  # shape: NxN\n",
        "    plt.pcolormesh(\n",
        "        XX,\n",
        "        YY,\n",
        "        ZZ,\n",
        "        alpha=alpha,\n",
        "        cmap=matplotlib.colors.ListedColormap(colors),\n",
        "        shading=\"auto\",\n",
        "    )\n",
        "    # plot points\n",
        "    plt.scatter(\n",
        "        xy[:, 0],\n",
        "        xy[:, 1],\n",
        "        marker=marker,\n",
        "        color=samples_color(labels, colors=colors),\n",
        "    )\n",
        "    # set title\n",
        "    if title:\n",
        "        if hasattr(plt, \"set_title\"):\n",
        "            plt.set_title(title)\n",
        "        else:\n",
        "            plt.title(title)\n",
        "    # plot test points\n",
        "    if test_features_2d is not None:\n",
        "        assert test_labels is not None\n",
        "        assert len(test_features_2d.columns) == 2\n",
        "        test_xy = np.array(test_features_2d)\n",
        "        plt.scatter(\n",
        "            test_xy[:, 0],\n",
        "            test_xy[:, 1],\n",
        "            s=test_s,\n",
        "            facecolors=\"none\",\n",
        "            linewidths=2,\n",
        "            color=samples_color(test_labels),\n",
        "        );"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "Since the latest version, `sklearn` offers its own method for visualizing decision boundaries: `DecisionBoundaryDisplay`. Documentation for this method can be found here: https://scikit-learn.org/stable/modules/generated/sklearn.inspection.DecisionBoundaryDisplay.html."
      ]
    },
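    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "Below is a minimal sketch of how `DecisionBoundaryDisplay.from_estimator` could be used. It assumes scikit-learn >= 1.1 and reuses the `data/xor.csv` dataset introduced in the next section together with an arbitrary kernel SVM, purely to illustrate the plotting API:"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "# Minimal sketch (assumes scikit-learn >= 1.1 and the data/xor.csv file used below).\n",
        "# The kernel SVM here is an arbitrary choice just to have a fitted classifier to plot.\n",
        "import pandas as pd\n",
        "from sklearn.inspection import DecisionBoundaryDisplay\n",
        "from sklearn.svm import SVC\n",
        "\n",
        "df = pd.read_csv(\"data/xor.csv\")\n",
        "features_2d = df.loc[:, (\"x\", \"y\")]\n",
        "labelv = df[\"label\"]\n",
        "\n",
        "classifier = SVC().fit(features_2d, labelv)\n",
        "\n",
        "# from_estimator builds the evaluation grid and calls predict for us\n",
        "display = DecisionBoundaryDisplay.from_estimator(\n",
        "    classifier, features_2d, response_method=\"predict\", alpha=0.2\n",
        ")\n",
        "# reuse the samples_color helper defined above for consistent point colors\n",
        "display.ax_.scatter(\n",
        "    features_2d[\"x\"], features_2d[\"y\"], color=samples_color(labelv), marker=\".\"\n",
        ");"
      ]
    },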
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "## Decision trees\n",
        "\n",
        "Let's see what a decision tree is by looking at an (artificial) example: \n",
        "\n",
        "Following the yes/no decision nodes from the top (root node) you will end up at a **leaf** which concludes if you should work or relax: \n",
        "\n",
        "<table>\n",
        "    <tr><td><img src=\"./images/decision_tree-work.png\" width=600px></td></tr>\n",
        "</table>"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "### How are the decision tree splits selected?\n",
        "\n",
        "Starting from the top the decision tree is build by selecting **best split of the dataset using a single feature**. \n",
        "\n",
        "Such a split is done on a single feature and splits your space into two halves.\n",
        "\n",
        "The optimal feature and its split value are determined for each node such that the resulting **subsets are as pure as possible** .\n",
        "\n",
        "Purity of a set is measured using so-called [gini-index](https://en.wikipedia.org/wiki/Diversity_index#Gini%E2%80%93Simpson_index). The lower this index the more pure a set is.\n",
        "\n",
        "The formula to compute this is index is $1 - p_0^2 - p_1^2$, where $p_i$ is the fraction of samples of class $i$ in the set. \n",
        "\n",
        "Thus for a set with only postive or negative samples either $p_0$ or $p_1$ is 1.0 and the other is 0, resulting in an optimal gini index of $0$.\n",
        "\n",
        "\n",
        "\n",
        "<table>\n",
        "    <tr><td><img src=\"./images/decision_tree-split.png\" width=600px></td></tr>\n",
        "</table>"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "In the left tree the gini index of the left split is $0$, and the one from the right split is $1 - 0.4^2 - 0.6^2 = 0.48$. The weighted sum (using the number of elements in each set as weights) is then $0.48 * 5/7 = 0.34$\n",
        "\n",
        "An similar computation yields a value of $0.21$ for the right split, and thus the right split would be preferred when constructing the tree.\n"
      ]
    },
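    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "To make these numbers concrete, here is a small sketch that recomputes the weighted Gini indices. The per-subset class counts are read off the example figure; for the right-hand candidate split we assume subsets with 3 + 0 and 1 + 3 samples, which reproduces the value $0.21$:"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "# Sketch: recompute the Gini indices of the two candidate splits above.\n",
        "# The class counts of the right-hand split (3+0 and 1+3) are an assumption\n",
        "# that reproduces the quoted value of 0.21.\n",
        "\n",
        "\n",
        "def gini(n0, n1):\n",
        "    \"\"\"Gini index 1 - p0^2 - p1^2 of a subset with given class counts.\"\"\"\n",
        "    n = n0 + n1\n",
        "    p0, p1 = n0 / n, n1 / n\n",
        "    return 1 - p0**2 - p1**2\n",
        "\n",
        "\n",
        "def weighted_gini(subsets):\n",
        "    \"\"\"Weighted sum of subset Gini indices (weights = relative subset sizes).\"\"\"\n",
        "    n_total = sum(n0 + n1 for n0, n1 in subsets)\n",
        "    return sum((n0 + n1) / n_total * gini(n0, n1) for n0, n1 in subsets)\n",
        "\n",
        "\n",
        "# left candidate split: a pure subset of 2 samples and a mixed subset with 2 + 3 samples\n",
        "print(\"left split: \", round(weighted_gini([(2, 0), (2, 3)]), 2))\n",
        "# right candidate split (assumed counts): a pure subset of 3 and a mixed subset with 1 + 3\n",
        "print(\"right split:\", round(weighted_gini([(3, 0), (1, 3)]), 2))"
      ]
    },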
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "### XOR decision tree\n",
        "\n",
        "Let's try out decision trees with the XOR dataset, in which samples have class `True` when the two coordinates `x` and `y` have different sign, otherwise they have class `False`."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "import pandas as pd\n",
        "\n",
        "df = pd.read_csv(\"data/xor.csv\")\n",
        "features_2d = df.loc[:, (\"x\", \"y\")]\n",
        "labelv = df[\"label\"]\n",
        "\n",
        "plt.figure(figsize=(5, 5))\n",
        "plt.xlabel(\"x\")\n",
        "plt.ylabel(\"y\")\n",
        "plt.title(\"Orange is True, blue is False\")\n",
        "plt.scatter(features_2d.iloc[:, 0], features_2d.iloc[:, 1], color=samples_color(labelv));"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "Decision trees live in the `sklearn.tree` module."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "from sklearn.model_selection import train_test_split\n",
        "from sklearn.tree import DecisionTreeClassifier\n",
        "\n",
        "# Note: split randomness picked manually for educational purpose\n",
        "X_train, X_test, y_train, y_test = train_test_split(\n",
        "    features_2d, labelv, random_state=10\n",
        ")\n",
        "\n",
        "# Note: features are permuted reandomly in case equally good splits are found\n",
        "# fix randomization for reproduciblity\n",
        "classifier = DecisionTreeClassifier(random_state=0)\n",
        "classifier.fit(X_train, y_train)\n",
        "\n",
        "print(\"train score: {:.2f}%\".format(100 * classifier.score(X_train, y_train)))\n",
        "print(\"test score: {:.2f}%\".format(100 * classifier.score(X_test, y_test)))\n",
        "\n",
        "plt.figure(figsize=(5, 5))\n",
        "plot_decision_surface(\n",
        "    features_2d,\n",
        "    labelv,\n",
        "    classifier,\n",
        "    test_features_2d=X_test,\n",
        "    test_labels=y_test,\n",
        ")"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "About the plot: **the points surrounded with a circle are from the test data set** (not used for learning), all other points belong to the training data.\n",
        "\n",
        "This surface seems a bit rough on edges. One of the biggest advantages of the decision trees is interpretability of the model. Let's **inspect the model by looking at the tree that was built**:"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "from sklearn.tree import plot_tree\n",
        "\n",
        "fig = plt.figure(figsize=(12, 8))\n",
        "fig.suptitle(\"XOR Decision Tree\")\n",
        "plot_tree(classifier, feature_names=[\"x\", \"y\"], class_names=[\"False\", \"True\"]);"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "<span style=\"font-size: 150%\">Whoaaa .. what happened here?</span>\n",
        "\n",
        "XOR is the **anti-example** for DTs!\n",
        "\n",
        "They cannot make the \"natural\" split at value `0`:\n",
        "Either splitting at $x=0$ or $y=0$ would leave very impure splits with ~50% of samples from both classes. Instead the tree focusses on details resp. noise in the particular data set during construction.\n",
        "\n",
        "Note: This would be not an issue if a decision tree would not constructed in a greedy manner, split per split.\n",
        "\n",
        "\n",
        "Moreover, the tree is quite deep because, by default, it is built until all nodes are \"pure\" (`gini = 0.0`). This tree is **overfitted**."
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "### How to avoid overfitting?\n",
        "\n",
        "There is no regularization penalty like in logistic regression or SVM methods when bulding a decision tree. Instead we can set learning hyperparameters such as:\n",
        "or\n",
        "* upper limit on tree depth (`max_depth`), or\n",
        "* a minimum number of samples required at a node or at a leaf node (`min_samples_split`, `min_samples_leaf`), or\n",
        "* an early stopping criteria based on minumum value of impurity or on minimum decrease in impurity (`min_impurity_split`, `min_impurity_decrease`),\n",
        "* tree pruning (based on minimal cost-complexity; `ccp_alpha`) after the tree has been built, \n",
        "* ... and few more - see `DecisionTreeClassifier` docs.\n"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "### Exercise section\n",
        "\n",
        "1. In theory for the XOR dataset it should suffice to use each feature exactly once with splits at `0`, but the decision tree learning algorithm is unable to find such a solution. Play around with `max_depth` to get a smaller but similarly performing decision tree for the XOR dataset.<br/>\n",
        "  Bonus question: which other hyperparameter could you have used to get the same result?\n",
        "\n",
        "2. Build a decision tree for the beers dataset. Use maximum depth and tree pruning strategies to get a much smaller tree that performs as well as the default tree.<br/>\n",
        "  Note: `classifier.tree_` instance has attributes such as `max_depth`, `node_count`, or `n_leaves`, which measure size of the tree."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "import pandas as pd\n",
        "from sklearn.model_selection import train_test_split\n",
        "from sklearn.tree import DecisionTreeClassifier, plot_tree\n",
        "\n",
        "df = pd.read_csv(\"data/xor.csv\")\n",
        "features_2d = df.loc[:, (\"x\", \"y\")]\n",
        "labelv = df[\"label\"]\n",
        "\n",
        "max_depths = [2, 3, 4]\n",
        "# ..."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "import pandas as pd\n",
        "from sklearn.model_selection import train_test_split\n",
        "from sklearn.pipeline import make_pipeline\n",
        "from sklearn.preprocessing import StandardScaler\n",
        "from sklearn.tree import DecisionTreeClassifier, plot_tree\n",
        "\n",
        "df = pd.read_csv(\"data/beers.csv\")\n",
        "features = df.iloc[:, :-1]\n",
        "labelv = df.iloc[:, -1]\n",
        "# ..."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "tags": [
          "solution"
        ]
      },
      "outputs": [],
      "source": [
        "# SOLUTION 1\n",
        "import pandas as pd\n",
        "from sklearn.model_selection import train_test_split\n",
        "from sklearn.tree import DecisionTreeClassifier, plot_tree\n",
        "\n",
        "df = pd.read_csv(\"data/xor.csv\")\n",
        "features_2d = df.loc[:, (\"x\", \"y\")]\n",
        "labelv = df[\"label\"]\n",
        "\n",
        "X_train, X_test, y_train, y_test = train_test_split(\n",
        "    features_2d, labelv, random_state=10\n",
        ")\n",
        "\n",
        "max_depths = [2, 3, 4]\n",
        "\n",
        "n_params = len(max_depths)\n",
        "fig, ax_arr = plt.subplots(ncols=n_params, nrows=2, figsize=(7 * n_params, 7 * 2))\n",
        "fig.suptitle(\"smaller XOR Decision Trees\")\n",
        "for i, max_depth in enumerate(max_depths):\n",
        "\n",
        "    classifier = DecisionTreeClassifier(\n",
        "        max_depth=max_depth,\n",
        "        random_state=0,\n",
        "    )\n",
        "    classifier.fit(X_train, y_train)\n",
        "\n",
        "    ax = ax_arr[0, i]\n",
        "    plot_tree(\n",
        "        classifier,\n",
        "        feature_names=features_2d.columns.values,\n",
        "        class_names=[\"False\", \"True\"],\n",
        "        ax=ax,\n",
        "        fontsize=7,\n",
        "    )\n",
        "    ax.set_title(\n",
        "        (\n",
        "            f\"max depth = {max_depth}\\n\"\n",
        "            f\"train score: {100 * classifier.score(X_train, y_train):.2f}%\\n\"\n",
        "            f\"test score: {100 * classifier.score(X_test, y_test):.2f}%\"\n",
        "        )\n",
        "    )\n",
        "\n",
        "    ax = ax_arr[1, i]\n",
        "    plot_decision_surface(\n",
        "        features_2d,\n",
        "        labelv,\n",
        "        classifier,\n",
        "        test_features_2d=X_test,\n",
        "        test_labels=y_test,\n",
        "        plt=ax,\n",
        "    )\n",
        "\n",
        "# We could have used equivalently `min_impurity_split` early stopping criterium with any (gini) value between 0.15 and 0.4"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "# SOLUTION 2\n",
        "import pandas as pd\n",
        "from sklearn.model_selection import train_test_split\n",
        "from sklearn.pipeline import make_pipeline\n",
        "from sklearn.preprocessing import StandardScaler\n",
        "from sklearn.tree import DecisionTreeClassifier, plot_tree\n",
        "\n",
        "df = pd.read_csv(\"data/beers.csv\")\n",
        "print(df.head(2))\n",
        "\n",
        "features = df.iloc[:, :-1]\n",
        "labelv = df.iloc[:, -1]\n",
        "\n",
        "X_train, X_test, y_train, y_test = train_test_split(features, labelv, random_state=10)\n",
        "\n",
        "# default\n",
        "classifier = DecisionTreeClassifier(random_state=0)\n",
        "pipeline = make_pipeline(StandardScaler(), classifier)\n",
        "pipeline.fit(X_train, y_train)\n",
        "print()\n",
        "print(\"#### default Beers Decision Tree\")\n",
        "print(\n",
        "    f\"depth: {classifier.tree_.max_depth}, \",\n",
        "    f\"#nodes: {classifier.tree_.node_count}, \",\n",
        "    f\"#leaves: {classifier.tree_.n_leaves}\",\n",
        ")\n",
        "print(f\"train score: {100 * pipeline.score(X_train, y_train):.2f}%\")\n",
        "print(f\" test score: {100 * pipeline.score(X_test, y_test):.2f}%\")\n",
        "\n",
        "# smaller\n",
        "classifier = DecisionTreeClassifier(max_depth=4, ccp_alpha=0.02, random_state=0)\n",
        "pipeline = make_pipeline(StandardScaler(), classifier)\n",
        "pipeline.fit(X_train, y_train)\n",
        "print()\n",
        "print(\"#### smaller Beers Decision Tree\")\n",
        "print(\n",
        "    f\"depth: {classifier.tree_.max_depth}, \",\n",
        "    f\"#nodes: {classifier.tree_.node_count}, \",\n",
        "    f\"#leaves: {classifier.tree_.n_leaves}\",\n",
        ")\n",
        "print(f\"train score: {100 * pipeline.score(X_train, y_train):.2f}%\")\n",
        "print(f\" test score: {100 * pipeline.score(X_test, y_test):.2f}%\")\n",
        "\n",
        "fig = plt.figure(figsize=(10, 6))\n",
        "plot_tree(classifier, feature_names=features.columns.values);"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "One **issue with decision trees is their instability** - a small changes in the training data usually results in a completely different order of splits (different tree structure)."
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "## Ensemble Averaging: Random Forests\n",
        "\n",
        "The idea of Random Forest method is to generate **ensemble of many \"weak\" decision trees** and by **averaging out their probabilistic predictions**. (The original Random Forests method used voting.)\n",
        "\n",
        "\n",
        "Weak classifier here are **shallow trees with feature-splits picked only out of random subsets of features** (*features bagging*). Random subset of features is selected per each split, not for the whole classifier.\n",
        "\n",
        "<table>\n",
        "    <tr><td><img src=\"./images/random_forest.png\" width=800px></td></tr>\n",
        "    <tr><td><center><sub>Source: <a href=\"https://towardsdatascience.com/random-forests-and-decision-trees-from-scratch-in-python-3e4fa5ae4249\">https://towardsdatascience.com/random-forests-and-decision-trees-from-scratch-in-python-3e4fa5ae4249</a></sub></center></td></tr>\n",
        "</table>\n"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "### Demonstration\n",
        "\n",
        "You will find Random Forest method implementation in the `sklearn.ensemble` module.\n",
        "\n",
        "The main parameters are:\n",
        "* number of trees (`n_estimators`),\n",
        "* each tree max. depth 2 (`max_depth`), and\n",
        "* max. number of randomly selected features to pick from when building each tree (`max_features`).\n",
        "\n",
        "Let's build a small Random Forest and have a look at its trees, available under `.estimators_` property."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "import pandas as pd\n",
        "from sklearn.ensemble import RandomForestClassifier\n",
        "from sklearn.model_selection import train_test_split\n",
        "from sklearn.pipeline import make_pipeline\n",
        "from sklearn.preprocessing import StandardScaler\n",
        "from sklearn.tree import plot_tree\n",
        "\n",
        "df = pd.read_csv(\"data/beers.csv\")\n",
        "print(df.head(2))\n",
        "\n",
        "features = df.iloc[:, :-1]\n",
        "labelv = df.iloc[:, -1]\n",
        "\n",
        "X_train, X_test, y_train, y_test = train_test_split(features, labelv, random_state=10)\n",
        "\n",
        "# 4 shallow (depth 2) trees, each using only 3 randomly selected features\n",
        "# total: up to 4*3 decision nodes, up to 4*4 class nodes\n",
        "n_trees = 4\n",
        "classifier = RandomForestClassifier(\n",
        "    max_depth=2,\n",
        "    n_estimators=n_trees,\n",
        "    max_features=3,\n",
        "    random_state=0,\n",
        ")\n",
        "pipeline = make_pipeline(StandardScaler(), classifier)\n",
        "pipeline.fit(X_train, y_train)\n",
        "\n",
        "print()\n",
        "print(\"#### Random Forest\")\n",
        "print(f\"train score: {100 * pipeline.score(X_train, y_train):.2f}%\")\n",
        "print(f\" test score: {100 * pipeline.score(X_test, y_test):.2f}%\")\n",
        "\n",
        "# to evaluate ensemble estimators, we need to use transformed data\n",
        "X_train_trans = pipeline[:-1].transform(X_train)\n",
        "X_test_trans = pipeline[:-1].transform(X_test)\n",
        "\n",
        "fig, ax_arr = plt.subplots(ncols=n_trees, nrows=1, figsize=(7 * n_trees, 5))\n",
        "for i, internal_classifier in enumerate(classifier.estimators_):\n",
        "    ax = ax_arr[i]\n",
        "    plot_tree(internal_classifier, feature_names=features.columns.values, ax=ax)\n",
        "    ax.set_title(\n",
        "        (\n",
        "            f\"Tree #{i}\\n\"\n",
        "            f\"train score: {100 * internal_classifier.score(X_train_trans, y_train):.2f}%\\n\"\n",
        "            f\" test score: {100 * internal_classifier.score(X_test_trans, y_test):.2f}%\"\n",
        "        )\n",
        "    )"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "Random forests are fast and shine with high dimensional data (many features).\n",
        "\n",
        "<div class=\"alert alert-block alert-info\">\n",
        "    <p><i class=\"fa fa-info-circle\"></i>\n",
        "        Random Forest can estimate <em>out-of-bag error</em> (OOB) while learning; set <code>oob_score=True</code>. (The out-of-bag (OOB) error is the average error for each data sample, calculated using predictions from the trees that do not contain that sample in their respective bootstrap samples.)\n",
        "    OOB is a generalisation/predictive error that, together with <code>warm_start=True</code>, can be used for efficient search for a good-enough number of trees, i.e. the <code>n_estimators</code> hyperparameter value (see: <a href=https://scikit-learn.org/stable/auto_examples/ensemble/plot_ensemble_oob.html>OOB Errors for Random Forests</a>).\n",
        "    </p>\n",
        "</div>"
      ]
    },
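    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "Here is a minimal sketch of the OOB estimate mentioned in the note, again using the beers data; the hyperparameter values are arbitrary choices for illustration:"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "# Sketch: out-of-bag (OOB) score of a Random Forest on the beers data.\n",
        "# The hyperparameter values below are arbitrary choices for illustration.\n",
        "import pandas as pd\n",
        "from sklearn.ensemble import RandomForestClassifier\n",
        "\n",
        "df = pd.read_csv(\"data/beers.csv\")\n",
        "features = df.iloc[:, :-1]\n",
        "labelv = df.iloc[:, -1]\n",
        "\n",
        "classifier = RandomForestClassifier(\n",
        "    n_estimators=100,\n",
        "    max_depth=3,\n",
        "    oob_score=True,  # compute the OOB estimate while fitting\n",
        "    random_state=0,\n",
        ")\n",
        "classifier.fit(features, labelv)\n",
        "\n",
        "# .oob_score_ estimates the generalization accuracy using, for each sample,\n",
        "# only the trees that did not see that sample in their bootstrap sample\n",
        "print(f\"OOB score: {100 * classifier.oob_score_:.2f}%\")"
      ]
    },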
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "## Boosting: AdaBoost\n",
        "\n",
        "<span style=\"font-size: 125%;\">What is it?</span>\n",
        "\n",
        "Boosting is another sub-type of ensemble learning. Same as in averaging, the idea is to generate many **weak classifiers to create a single strong classifier**, but in contrast to averaging, the classifiers are learnt **iteratively**.\n",
        "\n",
        "<span style=\"font-size: 125%;\">How does it work?</span>\n",
        "\n",
        "Each iteration focuses more on **previously misclassified samples**. To that end, **data samples are weighted**, and after each learning iteration the data weights are readjusted.\n",
        "\n",
        "<table>\n",
        "    <tr><td><img src=\"./images/AdaBoost.png\" width=800px></td></tr>\n",
        "    <tr><td><center><sub>Source: Marsh, B., (2016), <em>Multivariate Analysis of the Vector Boson Fusion Higgs Boson</em>.</sub></center></td></tr>\n",
        "</table>\n",
        "\n",
        "The final prediction is a weighted majority vote or weighted sum of predictions of the weighted weak classifiers.\n",
        "\n",
        "Boosting works very well out of the box. There is usually no need to fine tune method hyperparameters to get good performance.\n",
        "\n",
        "<span style=\"font-size: 125%;\">Where do i start?</span>\n",
        "\n",
        "**AdaBoost (\u201cAdaptive Boosting\u201d) is a baseline boosting algorithm** that originally used decisoin trees as weak classifiers, but, in principle, works with any classification method (`base_estimator` parameter).\n",
        "\n",
        "In each AdaBoost learning iteration, additionally to samples weights, the **weak classifiers are weighted**. Their weights are readjusted, such that **the more accurate a weak classifier is, the larger its weight is**.\n"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "### Demonstration\n",
        "\n",
        "You will find AdaBoost algorithm implementation in the `sklearn.ensemble` module.\n",
        "\n",
        "We'll use `n_estimators` parameter to determine number of weak classifiers. These by default are single node decision trees (`base_estimator = DecisionTreeClassifier(max_depth=1)`). We can examine them via `.estimators_` property of a trained method.\n",
        "\n",
        "For presentation, in order to weight the classifiers, we will use the original discrete AdaBoost learning method (`algorithm=\"SAMME\"`). Because the classifiers learn iteratively on differently weighted samples, to understand the weights we have to look at internal train errors and not at the final scores on the training data."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "from math import ceil, floor\n",
        "\n",
        "import pandas as pd\n",
        "from sklearn.ensemble import AdaBoostClassifier\n",
        "from sklearn.model_selection import train_test_split\n",
        "from sklearn.pipeline import make_pipeline\n",
        "from sklearn.preprocessing import StandardScaler\n",
        "from sklearn.tree import plot_tree\n",
        "\n",
        "df = pd.read_csv(\"data/beers.csv\")\n",
        "print(df.head(2))\n",
        "\n",
        "features = df.iloc[:, :-1]\n",
        "labelv = df.iloc[:, -1]\n",
        "\n",
        "X_train, X_test, y_train, y_test = train_test_split(features, labelv, random_state=10)\n",
        "\n",
        "# 9 single node decision trees\n",
        "# total: 9*1 decision nodes, 9*2 class nodes\n",
        "# (Note: with default real AdaBoost \"SAMME.R\" algorithm all weights are 1 at the end)\n",
        "n_trees = 9\n",
        "classifier = AdaBoostClassifier(n_estimators=n_trees, algorithm=\"SAMME\", random_state=0)\n",
        "pipeline = make_pipeline(StandardScaler(), classifier)\n",
        "pipeline.fit(X_train, y_train)\n",
        "\n",
        "print()\n",
        "print(\"AdaBoost\")\n",
        "print(f\"train score: {100 * pipeline.score(X_train, y_train):.2f}%\")\n",
        "print(f\"test score: {100 * pipeline.score(X_test, y_test):.2f}%\")\n",
        "\n",
        "# to evaluate ensemble estimators, we need to use transformed data\n",
        "X_train_trans = pipeline[:-1].transform(X_train)\n",
        "X_test_trans = pipeline[:-1].transform(X_test)\n",
        "\n",
        "fig, ax_arr = plt.subplots(ncols=n_trees, nrows=1, figsize=(5 * n_trees, 4))\n",
        "for i, internal_classifier in enumerate(classifier.estimators_):\n",
        "    ax = ax_arr[i]\n",
        "    plot_tree(internal_classifier, feature_names=features.columns.values, ax=ax)\n",
        "    ax.set_title(\n",
        "        (\n",
        "            f\"Tree #{i}, weight: {classifier.estimator_weights_[i]:.2f}\\n\"\n",
        "            f\"train error: {classifier.estimator_errors_[i]:.2f}\\n\"\n",
        "            f\"(train score: {100 * internal_classifier.score(X_train_trans, y_train):.2f}%)\\n\"\n",
        "            f\"test score: {100 * internal_classifier.score(X_test_trans, y_test):.2f}%\"\n",
        "        )\n",
        "    )"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "### Other boosting methods\n",
        "\n",
        "In practice you will mostly want to use other than AdaBoost methods for boosting.\n",
        "\n",
        "####  Gradient Tree Boosting (GTB)\n",
        "\n",
        "It re-formulates boosting problem as an optimization problem which is solved with efficient Stochastic Gradient Descent optimization method (more on that in the neuronal networks script).\n",
        "\n",
        "In contrast to AdaBoost, GTB relies on using decision trees.\n",
        "\n",
        "In particular, try out [XGboost](https://xgboost.readthedocs.io/en/latest/); it's a package that won many competitions, cf. [XGboost@Kaggle](https://www.kaggle.com/dansbecker/xgboost). It is not part of scikit-learn, but it offers a `scikit-learn` API (see https://www.kaggle.com/stuarthallows/using-xgboost-with-scikit-learn ); a `scikit-learn` equivalent is [`GradientBoostingClassifier`](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingClassifier.html).\n",
        "\n",
        "#### Histogram-based Gradient Boosting Classification Tree.\n",
        "\n",
        "A new `scikit-learn` implementation of boosting based on decision trees is [`HistGradientBoostingClassifier`](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.HistGradientBoostingClassifier.html). It is much faster then `GradientBoostingClassifier` for big datasets (`n_samples >= 10 000`).\n",
        "\n",
        "\n"
      ]
    },
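    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "As a minimal sketch, `HistGradientBoostingClassifier` can be used as a drop-in classifier; here on the beers data with default hyperparameters:"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "# Sketch: histogram-based gradient boosting on the beers data (default settings).\n",
        "import pandas as pd\n",
        "from sklearn.ensemble import HistGradientBoostingClassifier\n",
        "from sklearn.model_selection import train_test_split\n",
        "\n",
        "df = pd.read_csv(\"data/beers.csv\")\n",
        "features = df.iloc[:, :-1]\n",
        "labelv = df.iloc[:, -1]\n",
        "\n",
        "X_train, X_test, y_train, y_test = train_test_split(features, labelv, random_state=10)\n",
        "\n",
        "classifier = HistGradientBoostingClassifier(random_state=0)\n",
        "classifier.fit(X_train, y_train)\n",
        "\n",
        "print(f\"train score: {100 * classifier.score(X_train, y_train):.2f}%\")\n",
        "print(f\" test score: {100 * classifier.score(X_test, y_test):.2f}%\")"
      ]
    },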
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "## Ensemble Stacking: a honorary mention\n",
        "\n",
        "Stacking is used often in case of different types of base models, when it's not clear which type of model will perform best.\n",
        "\n",
        "The stacking architectore combines multiple classfiers which feed into a final meta-model (called also *combiner*, *blender*, or *generalizer*). Here is an example:\n",
        "\n",
        "\n",
        "<table>\n",
        "    <tr><td><img src=\"./images/ensemble-learning-stacking.png\" width=\"400px\"></td></tr>\n",
        "    <tr><td><center><sub><a href=\"https://data-science-blog.com/blog/2017/12/03/ensemble-learning/\">https://data-science-blog.com/blog/2017/12/03/ensemble-learning/</a></sub></center></td></tr>\n",
        "</table>\n",
        "\n",
        "The learning procedure is as follows:\n",
        "- split your training data (e.g. in an 80-20 split),\n",
        "- train the first layer on the 80% of the data,\n",
        "- feed the remaining 20% through the first layer,\n",
        "- train the meta-model on the output from these 20%.\n",
        "\n",
        "The idea is that the meta-model also is able to correct overfitting from the first layer.\n",
        "\n",
        "Stacking combines strengths of different models and usually slightly outperforms the best individual model. In practice often multiple stacking layers are used with groups of different but repeating types of classifiers.\n",
        "\n",
        "<table>\n",
        "    <tr><td><center><img src=\"./images/ensemble-learning-stacking-kdd_2015_winner.png\" width=\"800px\"></center></td></tr>\n",
        "    <tr><td><center><sub>KDD Cup 2015 winner</sub></center></td></tr>\n",
        "    <tr><td><center><sub>GBM: Gradient Boosting Machine; NN: Neural Network; FM: Factorization Machine; LR: Logistic Regression; KRR: Kernel Ridge Regression; ET: Extra Trees; RF: Random Forests; KNN: K-Nearest Neighbors</sub></center></td></tr>\n",
        "    <tr><td><center><sub><a href=\"https://www.slideshare.net/jeongyoonlee/winning-data-science-competitions-74391113\"> Jeong-Yoon Lee, <em>Winning Data Science Competitions</em>, Apr 2017</a></sub></center></td></tr>\n",
        "</table>\n",
        "\n",
        "In the `sklearn.ensemble` the stacking is implemented by `StackingClassifier` and `StackingRegressor`."
      ]
    },
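    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "Here is a minimal stacking sketch on the beers data; the base classifiers and the logistic-regression combiner are arbitrary choices. Note that scikit-learn's `StackingClassifier` trains the combiner on out-of-fold predictions from internal cross-validation (`cv` parameter) rather than on a single held-out split as in the procedure described above:"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "# Sketch: stack two different base classifiers with a logistic regression combiner.\n",
        "# Base estimators and hyperparameters are arbitrary choices for illustration only.\n",
        "import pandas as pd\n",
        "from sklearn.ensemble import RandomForestClassifier, StackingClassifier\n",
        "from sklearn.linear_model import LogisticRegression\n",
        "from sklearn.model_selection import train_test_split\n",
        "from sklearn.pipeline import make_pipeline\n",
        "from sklearn.preprocessing import StandardScaler\n",
        "from sklearn.svm import SVC\n",
        "\n",
        "df = pd.read_csv(\"data/beers.csv\")\n",
        "features = df.iloc[:, :-1]\n",
        "labelv = df.iloc[:, -1]\n",
        "\n",
        "X_train, X_test, y_train, y_test = train_test_split(features, labelv, random_state=10)\n",
        "\n",
        "classifier = StackingClassifier(\n",
        "    estimators=[\n",
        "        (\"svc\", make_pipeline(StandardScaler(), SVC())),\n",
        "        (\"forest\", RandomForestClassifier(max_depth=3, random_state=0)),\n",
        "    ],\n",
        "    final_estimator=LogisticRegression(),  # the meta-model (\"combiner\")\n",
        ")\n",
        "classifier.fit(X_train, y_train)\n",
        "\n",
        "print(f\"train score: {100 * classifier.score(X_train, y_train):.2f}%\")\n",
        "print(f\" test score: {100 * classifier.score(X_test, y_test):.2f}%\")"
      ]
    },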
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "## Why does ensemble learning work?\n",
        "\n",
        "* Probability of making an error by majority of the classifiers in the ensemble is much lower then error that each of the weak classifiers makes alone.\n",
        "\n",
        "* An ensemble classifier is more roboust (has lower variance) with respect to the training data.\n",
        "\n",
        "* The weak classifiers are small, fast to learn, and, in case of averaging, they can be learnt in parallel.\n",
        "\n",
        "In general, **usually ensemble classifier performs better than any of the weak classifiers in the ensemble**."
      ]
    },
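    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "A quick numerical check of the first point, under the idealized assumption that the weak classifiers make independent errors:"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "# Probability that a majority vote of n independent weak classifiers is wrong,\n",
        "# when each classifier alone is wrong with probability p.\n",
        "# Independence is an idealized assumption; real ensemble members are correlated.\n",
        "from math import comb\n",
        "\n",
        "p = 0.3  # error rate of each weak classifier\n",
        "for n in [1, 5, 11, 21]:\n",
        "    majority_error = sum(\n",
        "        comb(n, k) * p**k * (1 - p) ** (n - k) for k in range(n // 2 + 1, n + 1)\n",
        "    )\n",
        "    print(f\"{n:2d} classifiers: majority vote error = {majority_error:.3f}\")"
      ]
    },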
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "## Coding session\n",
        "\n",
        "Apply all classifiers seen so far to the MNIST images dataset (`from sklearn.datasets import load_digits`). To that end:\n",
        "\n",
        "1. split your data into training and a test \"holdout\" data (`from sklearn.model_selection import train_test_split`) in 80:20 proporition; stratify the 10 target classes (`stratify=...`);\n",
        "2. compare cross validation mean f1 scores; use the stratified k-fold CV strategy (`from sklearn.model_selection import cross_val_score, StratifiedKFold`; attention: this is a non-binary multi-class problem, you will have to use an adjusted f1 score, e.g. unweighted per class mean via `scoring=\"f1_macro\"` keyword arg - see [the `scoring` parameter predefined values](https://scikit-learn.org/stable/modules/model_evaluation.html#the-scoring-parameter-defining-model-evaluation-rules));\n",
        "3. train the final model on the whole dataset and, use the test \"holdout\" data to repot a final model f1 performance; report also accuracy, precision and recall scores (`from sklearn.metrics import classification_report`);\n",
        "4. next, try to manually tune hyperparameters to minimize the train test gap and to squeeze out at least 90% cross validation f1 score performance out of each classifier; try using PCA preprocessing (`sklearn.pipeline.make_pipeline`, `sklearn.decomposition.PCA`); what about data scaling? which models are most effective and easiest to tune manually?\n",
        "5. optionally, once you get a feel of good preprocessing and classifier hyperparameters, define parameters ranges and apply a CV-based search to find optimal values for the hyperparameters (`from sklearn.model_selection import GridSearchCV, RandomizedSearchCV`)."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "import numpy as np\n",
        "from sklearn.datasets import load_digits\n",
        "from sklearn.decomposition import PCA\n",
        "\n",
        "# classifiers\n",
        "from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier\n",
        "from sklearn.linear_model import LogisticRegression\n",
        "from sklearn.metrics import classification_report\n",
        "from sklearn.model_selection import StratifiedKFold, cross_val_score, train_test_split\n",
        "from sklearn.pipeline import make_pipeline\n",
        "from sklearn.svm import SVC, LinearSVC\n",
        "from sklearn.tree import DecisionTreeClassifier\n",
        "\n",
        "digits = load_digits()\n",
        "\n",
        "labels = digits.target\n",
        "n_samples = len(labels)\n",
        "\n",
        "# flatten images of shape N_SAMPLES x 8 x 8 to N_SAMPLES x 64:\n",
        "print(\"digits data set shape:\", digits.images.shape)\n",
        "features = digits.images.reshape((n_samples, -1))\n",
        "print(\"feature matrix shape:\", features.shape)\n",
        "\n",
        "# (\n",
        "#     features_train,\n",
        "#     features_test,\n",
        "#     labels_train,\n",
        "#     labels_test,\n",
        "# ) = train_test_split(..., test_size=..., stratify=..., random_state=42)\n",
        "#\n",
        "# classifiers = [\n",
        "#     LogisticRegression(...),\n",
        "#     LinearSVC(...),\n",
        "#     SVC(...),\n",
        "#     DecisionTreeClassifier(..., random_state=42),  # rather won't do very well when not used in an ensemble method\n",
        "#     RandomForestClassifier(..., random_state=42),\n",
        "#     AdaBoostClassifier(..., random_state=42),\n",
        "# ]\n",
        "#\n",
        "# pipeline = make_pipeline(...)\n",
        "#\n",
        "# cross_validator = StratifiedKFold(...)\n",
        "# scores = cross_val_score(..., scoring=..., cv=...)\n",
        "# ..."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "tags": [
          "solution"
        ]
      },
      "outputs": [],
      "source": [
        "# SOLUTION\n",
        "import numpy as np\n",
        "from sklearn.datasets import load_digits\n",
        "from sklearn.decomposition import PCA\n",
        "\n",
        "# classifiers\n",
        "from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier\n",
        "from sklearn.linear_model import LogisticRegression\n",
        "from sklearn.metrics import classification_report\n",
        "from sklearn.model_selection import StratifiedKFold, cross_val_score, train_test_split\n",
        "from sklearn.pipeline import make_pipeline\n",
        "from sklearn.svm import SVC, LinearSVC\n",
        "from sklearn.tree import DecisionTreeClassifier\n",
        "\n",
        "digits = load_digits()\n",
        "\n",
        "labels = digits.target\n",
        "n_samples = len(labels)\n",
        "\n",
        "# flatten images of shape N_SAMPLES x 8 x 8 to N_SAMPLES x 64:\n",
        "print(\"digits data set shape:\", digits.images.shape)\n",
        "features = digits.images.reshape((n_samples, -1))\n",
        "print(\"feature matrix shape:\", features.shape)\n",
        "\n",
        "(\n",
        "    features_train,\n",
        "    features_test,\n",
        "    labels_train,\n",
        "    labels_test,\n",
        ") = train_test_split(features, labels, test_size=0.2, stratify=labels, random_state=42)\n",
        "\n",
        "classifiers = [\n",
        "    LogisticRegression(C=10, penalty=\"l1\", solver=\"liblinear\", max_iter=10_000),\n",
        "    LinearSVC(C=1, penalty=\"l1\", dual=False, max_iter=50_000),\n",
        "    SVC(C=10, gamma=0.001),\n",
        "    DecisionTreeClassifier(\n",
        "        max_depth=9, random_state=42\n",
        "    ),  # rather won't do very well when not used in an ensemble method\n",
        "    RandomForestClassifier(\n",
        "        max_depth=6,\n",
        "        n_estimators=50,\n",
        "        max_features=\"log2\",\n",
        "        random_state=42,\n",
        "    ),\n",
        "    AdaBoostClassifier(\n",
        "        base_estimator=DecisionTreeClassifier(\n",
        "            max_depth=3\n",
        "        ),  # in CV search use: adaboostclassifier__base_estimator__max_depth\n",
        "        n_estimators=100,\n",
        "        algorithm=\"SAMME\",  # works better than default for a multi-class problem\n",
        "        random_state=42,\n",
        "    ),\n",
        "]\n",
        "\n",
        "# We do already account for class imbalances in train/test and CV splits via stratification\n",
        "# => take an unweighted f1 score per class mean (\"f1_macro\"), thus, keeping f1 score in between\n",
        "#    the precision and recall scores.\n",
        "scoring = \"f1_macro\"\n",
        "pca_n_components = (\n",
        "    features.shape[1] // 3\n",
        ")  # accuracy-wise does not add much but speeds up training\n",
        "cv_n_splits = 10\n",
        "\n",
        "for classifier in classifiers:\n",
        "    print()\n",
        "    print(f\"#### {classifier}\")\n",
        "    pipeline = make_pipeline(PCA(n_components=pca_n_components), classifier)\n",
        "    cross_validator = StratifiedKFold(\n",
        "        n_splits=cv_n_splits, shuffle=True, random_state=42\n",
        "    )\n",
        "    scores = cross_val_score(\n",
        "        pipeline, features_train, labels_train, scoring=scoring, cv=cross_validator\n",
        "    )\n",
        "    print()\n",
        "    print(f\"train {cv_n_splits}-fold CV mean {scoring}:\")\n",
        "    print(f\"\\t{scores.mean():.2f} +/- {scores.std():.2f}\")\n",
        "    print()\n",
        "    # the final model - train on the whole dataset\n",
        "    classifier.fit(features_train, labels_train)\n",
        "    # the final \"holdout\" test\n",
        "    print(f\"test score:\")\n",
        "    predicted_test = classifier.predict(features_test)\n",
        "    print(\n",
        "        classification_report(\n",
        "            labels_test,\n",
        "            predicted_test,\n",
        "        )\n",
        "    )"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "## Summary"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "Below you will find a table with some guidelines, as well as pros and cons of different classication methods available in scikit-learn.\n",
        "\n",
        "<div class=\"alert alert-block alert-warning\">\n",
        "    <p><i class=\"fa fa-warning\"></i>&nbsp;<strong>Summary table</strong></p>\n",
        "\n",
        "<p>\n",
        "<em>Disclaimer</em>: this table is neither a single source of truth nor complete - it's intended only to provide some first considerations when starting out. At the end of the day, you have to try and pick a method that works for your problem/data.\n",
        "</p>\n",
        "\n",
        "<table>\n",
        "<thead>\n",
        "<tr>\n",
        "<th style=\"text-align: center;\">Classifier type</th>\n",
        "<th style=\"text-align: center;\">When?</th>\n",
        "<th style=\"text-align: center;\">Advantages</th>\n",
        "<th style=\"text-align: center;\">Disadvantages</th>\n",
        "</tr>\n",
        "</thead>\n",
        "<tbody>\n",
        "<tr>\n",
        "<td style=\"text-align: left;\">Nearest Neighbors<br><br><code>KNeighborsClassifier</code></td>\n",
        "<td style=\"text-align: left;\">- numeric data<br> - when (fast) linear classifiers do not work</td>\n",
        "<td style=\"text-align: left;\">- simple (not many parameters to tweak), hence, a good baseline classifier</td>\n",
        "<td style=\"text-align: left;\">- known not to work well for many dimensions (20 or even less features)</td>\n",
        "</tr>\n",
        "<tr>\n",
        "<td style=\"text-align: left;\">Logistic Regression<br><br><code>LogisticRegression</code></td>\n",
        "<td style=\"text-align: left;\">- high-dimensional data<br> - a lot of data</td>\n",
        "<td style=\"text-align: left;\">- fast, also in high dimensions<br> - weights can be interpreted</td>\n",
        "<td style=\"text-align: left;\">- data has to be linearly separable (happens often in higher dimensions)<br> - not very efficient with large number of samples</td>\n",
        "</tr>\n",
        "<tr>\n",
        "<td style=\"text-align: left;\">Linear SVM<br><br><code>LinearSVC</code></td>\n",
        "<td style=\"text-align: left;\">same as above but might be better for text analysis (many features)</td>\n",
        "<td style=\"text-align: left;\">same as above but might be better with very large number of features</td>\n",
        "<td style=\"text-align: left;\">same as above but possibly a bit better with large number of samples</td>\n",
        "</tr>\n",
        "<tr>\n",
        "<td style=\"text-align: left;\">Kernel SVM<br><br><code>SVC</code></td>\n",
        "<td style=\"text-align: left;\">same as above but when linear SVM does not work<br>- not too many data points</td>\n",
        "<td style=\"text-align: left;\">same as above but learns non-linear boundaries</td>\n",
        "<td style=\"text-align: left;\">same as above but much slower and requires data scaling<br>- model is not easily interpretable</td>\n",
        "</tr>\n",
        "<tr>\n",
        "<td style=\"text-align: left;\">Decision Tree<br><br><code>DecisionTreeClassifier</code></td>\n",
        "<td style=\"text-align: left;\">- for illustration/insight<br> - with multi-class problems <br> - with categorical or mixed categorical and numerical data</td>\n",
        "<td style=\"text-align: left;\">- simple to interpret<br> - good classification speed and performance</td>\n",
        "<td style=\"text-align: left;\">- prone to overfitting<br> - unstable: small change in the training data can give very different model</td>\n",
        "</tr>\n",
        "<tr>\n",
        "<td style=\"text-align: left;\">Ensemble Averaging<br><br><code>RandomForestClassifier</code></td>\n",
        "<td style=\"text-align: left;\">- when decision tree would be used but for performance</td>\n",
        "<td style=\"text-align: left;\">- fixes decision tree issues: does not overfit easily and is stable with respect to training data<br> - takes into account features dependencies<br> - can compute predicitve error when learning<br> ...</td>\n",
        "<td style=\"text-align: left;\">- harder to interpret than a single decision tree</td>\n",
        "</tr>\n",
        "<tr>\n",
        "<td style=\"text-align: left;\">Boosting<br><br><code>AdaBoostClassifier</code> (<code>XGBClassifier</code>, <code>HistGradientBoostingClassifier</code>)</td>\n",
        "<td style=\"text-align: left;\">same as above</td>\n",
        "<td style=\"text-align: left;\">- works very well out-of-the-box<br>- better performance and more interpretable than random forest when using depth 1 trees</td>\n",