Skip to content
Snippets Groups Projects
00_numpy_pandas_matplotlib_intro.ipynb 250 KiB
Newer Older
  • Learn to ignore specific revisions
  • {
     "cells": [
      {
       "cell_type": "code",
    
    schmittu's avatar
    schmittu committed
       "execution_count": 1,
    
       "metadata": {},
       "outputs": [
        {
         "data": {
          "text/html": [
           "<style>\n",
           "    \n",
           "    @import url('http://fonts.googleapis.com/css?family=Source+Code+Pro');\n",
           "    \n",
           "    @import url('http://fonts.googleapis.com/css?family=Kameron');\n",
           "    @import url('http://fonts.googleapis.com/css?family=Crimson+Text');\n",
           "    \n",
           "    @import url('http://fonts.googleapis.com/css?family=Lato');\n",
           "    @import url('http://fonts.googleapis.com/css?family=Source+Sans+Pro');\n",
           "    \n",
           "    @import url('http://fonts.googleapis.com/css?family=Lora'); \n",
           "\n",
           "    \n",
           "    body {\n",
           "        font-family: 'Lora', Consolas, sans-serif;\n",
           "       \n",
           "        -webkit-print-color-adjust: exact important !;\n",
           "        \n",
           "      \n",
           "       \n",
           "    }\n",
           "    \n",
           "    .alert-block {\n",
           "        width: 95%;\n",
           "        margin: auto;\n",
           "    }\n",
           "    \n",
           "    .rendered_html code\n",
           "    {\n",
           "        color: black;\n",
           "        background: #eaf0ff;\n",
           "        background: #f5f5f5; \n",
           "        padding: 1pt;\n",
           "        font-family:  'Source Code Pro', Consolas, monocco, monospace;\n",
           "    }\n",
           "    \n",
           "    p {\n",
           "      line-height: 140%;\n",
           "    }\n",
           "    \n",
           "    strong code {\n",
           "        background: red;\n",
           "    }\n",
           "    \n",
           "    .rendered_html strong code\n",
           "    {\n",
           "        background: #f5f5f5;\n",
           "    }\n",
           "    \n",
           "    .CodeMirror pre {\n",
           "    font-family: 'Source Code Pro', monocco, Consolas, monocco, monospace;\n",
           "    }\n",
           "    \n",
           "    .cm-s-ipython span.cm-keyword {\n",
           "        font-weight: normal;\n",
           "     }\n",
           "     \n",
           "     strong {\n",
           "         background: #f5f5f5;\n",
           "         margin-top: 4pt;\n",
           "         margin-bottom: 4pt;\n",
           "         padding: 2pt;\n",
           "         border: 0.5px solid #a0a0a0;\n",
           "         font-weight: bold;\n",
           "         color: darkred;\n",
           "     }\n",
           "     \n",
           "    \n",
           "    div #notebook {\n",
           "        # font-size: 10pt; \n",
           "        line-height: 145%;\n",
           "        }\n",
           "        \n",
           "    li {\n",
           "        line-height: 145%;\n",
           "    }\n",
           "\n",
           "    div.output_area pre {\n",
           "        background: #fff9d8 !important;\n",
           "        padding: 5pt;\n",
           "       \n",
           "       -webkit-print-color-adjust: exact; \n",
           "        \n",
           "    }\n",
           " \n",
           "    \n",
           " \n",
           "    h1, h2, h3, h4 {\n",
           "        font-family: Kameron, arial;\n",
           "\n",
           "\n",
           "    }\n",
           "    \n",
           "    div#maintoolbar {display: none !important;}\n",
           "</style>\n",
           "    <script>\n",
           "IPython.OutputArea.prototype._should_scroll = function(lines) {\n",
           "        return false;\n",
           "}\n",
           "    </script>\n"
          ],
          "text/plain": [
           "<IPython.core.display.HTML object>"
          ]
         },
    
    schmittu's avatar
    schmittu committed
         "execution_count": 1,
    
         "metadata": {},
         "output_type": "execute_result"
        }
       ],
       "source": [
        "# IGNORE THIS CELL WHICH CUSTOMIZES LAYOUT AND STYLING OF THE NOTEBOOK !\n",
        "import matplotlib.pyplot as plt\n",
        "%matplotlib inline\n",
        "%config InlineBackend.figure_format = 'retina'\n",
        "import warnings\n",
        "warnings.filterwarnings('ignore', category=FutureWarning)\n",
        "warnings.filterwarnings = lambda *a, **kw: None\n",
        "from IPython.core.display import HTML; HTML(open(\"custom.html\", \"r\").read())"
       ]
      },
      {
       "cell_type": "markdown",
       "metadata": {},
       "source": [
        "# Chapter 0: Introduction \n",
        "\n",
        "\n",
        "<div class=\"alert alert-block alert-warning\">\n",
        "    <i class=\"fa fa-warning\"></i>&nbsp;This script introduces <code>numpy</code>, <code>pandas</code>, <code>matplotlib</code> and <code>seaborn</code> as far as we use them in the following course. \n",
        "\n",
        "\n",
        "Thus it is not a comprehensive introduction to these libraries!\n",
        "    </div>"
       ]
      },
      {
       "cell_type": "markdown",
       "metadata": {},
       "source": [
        "## pandas\n",
        "\n",
        "`pandas` allows handling tabular data as so-called `DataFrame`s. Tabular data means that columns have types. Within a column, values are of the same type, but types can differ between columns."
       ]
      },
      {
       "cell_type": "markdown",
       "metadata": {},
       "source": [
        "### Some basics"
       ]
      },
      {
       "cell_type": "code",
    
    schmittu's avatar
    schmittu committed
       "execution_count": 2,
    
       "metadata": {},
       "outputs": [
        {
         "name": "stdout",
         "output_type": "stream",
         "text": [
          "a,b,c\r\n",
          "0,0.0,one\r\n",
          "1,1.1,two\r\n",
    
    schmittu's avatar
    schmittu committed
          "4,4.4,thee\r\n",
          "9,9.9,four\r\n",
          "16,17.6,five\r\n",
    
          "25,27.5,one\r\n",
    
    schmittu's avatar
    schmittu committed
          "36,39.6,two\r\n"
    
    schmittu's avatar
    schmittu committed
        "# show content of csv file, only works in notebook:\n",
    
        "!cat data/example.csv"
    
       "execution_count": 5,
    
       "metadata": {
        "scrolled": true
       },
       "outputs": [
        {
    
         "name": "stdout",
         "output_type": "stream",
         "text": [
          "    a     b     c\n",
          "0   0   0.0   one\n",
          "1   1   1.1   two\n",
          "2   4   4.4  thee\n",
          "3   9   9.9  four\n",
          "4  16  17.6  five\n",
          "5  25  27.5   one\n",
          "6  36  39.6   two\n"
         ]
    
        }
       ],
       "source": [
        "# read file with pandas\n",
        "\n",
        "import pandas as pd\n",
        "\n",
    
        "df = pd.read_csv(\"data/example.csv\")\n",
    
        "print(df)"
    
       ]
      },
      {
       "cell_type": "markdown",
       "metadata": {},
       "source": [
        "<div class=\"alert alert-block alert-info\">\n",
        "<i class=\"fa fa-warning\"></i>&nbsp;<code>pandas</code> also \n",
        "supports reading and writing of other file formats, like <code>.xlsx</code>, <code>.hdf5</code> or <code>sqlite3</code> files.\n",
        "</div>\n",
        "\n",
        "\n",
        "\n"
       ]
      },
      {
       "cell_type": "code",
    
       "execution_count": 62,
    
       "metadata": {},
       "outputs": [
        {
         "name": "stdout",
         "output_type": "stream",
         "text": [
          "<class 'pandas.core.frame.DataFrame'>\n",
    
    schmittu's avatar
    schmittu committed
          "RangeIndex: 7 entries, 0 to 6\n",
    
          "Data columns (total 4 columns):\n",
    
    schmittu's avatar
    schmittu committed
          "a    7 non-null int64\n",
          "b    7 non-null float64\n",
          "c    7 non-null object\n",
    
          "d    7 non-null int64\n",
          "dtypes: float64(1), int64(2), object(1)\n",
          "memory usage: 304.0+ bytes\n"
    
         ]
        }
       ],
       "source": [
        "df.info()"
       ]
      },
      {
       "cell_type": "markdown",
       "metadata": {},
       "source": [
        "You can see that the columns `a`, `b` and `c` have different types `int64`, `float64` and `object`. The latter can be read as \"anything but a number\"."
       ]
      },
      {
       "cell_type": "code",
    
       "execution_count": 7,
    
       "metadata": {},
       "outputs": [
        {
    
         "name": "stdout",
         "output_type": "stream",
         "text": [
          "(7, 3)\n"
         ]
    
        "# number of rows and columns\n",
        "print(df.shape)"
    
       ]
      },
      {
       "cell_type": "markdown",
       "metadata": {},
       "source": [
        "The `.shape` attribute is a tuple holding the number of rows followed by the number of columns."
       ]
      },
      {
       "cell_type": "markdown",
       "metadata": {},
       "source": [
        "To show the first 5 rows of a data frame we can use `.head()`."
       ]
      },
      {
       "cell_type": "code",
    
       "execution_count": 8,
    
       "metadata": {},
       "outputs": [
        {
    
         "name": "stdout",
         "output_type": "stream",
         "text": [
          "    a     b     c\n",
          "0   0   0.0   one\n",
          "1   1   1.1   two\n",
          "2   4   4.4  thee\n",
          "3   9   9.9  four\n",
          "4  16  17.6  five\n"
         ]
    
        "print(df.head())"
    
       ]
      },
      {
       "cell_type": "markdown",
       "metadata": {},
       "source": [
        "And `.tail()` shows the last 5 rows:"
       ]
      },
      {
       "cell_type": "code",
    
       "execution_count": 9,
    
       "metadata": {},
       "outputs": [
        {
    
         "name": "stdout",
         "output_type": "stream",
         "text": [
          "    a     b     c\n",
          "2   4   4.4  thee\n",
          "3   9   9.9  four\n",
          "4  16  17.6  five\n",
          "5  25  27.5   one\n",
          "6  36  39.6   two\n"
         ]
    
        "print(df.tail())"
    
       ]
      },
      {
       "cell_type": "markdown",
       "metadata": {},
       "source": [
        "Both accept an integer to change the number of rows to show:"
       ]
      },
      {
       "cell_type": "code",
    
       "execution_count": 10,
    
       "metadata": {},
       "outputs": [
        {
    
         "name": "stdout",
         "output_type": "stream",
         "text": [
          "   a    b     c\n",
          "0  0  0.0   one\n",
          "1  1  1.1   two\n",
          "2  4  4.4  thee\n"
         ]
    
        "print(df.head(3))"
    
    schmittu's avatar
    schmittu committed
      {
       "cell_type": "markdown",
       "metadata": {},
       "source": [
    
        "Compute some statistics on the columns"
    
    schmittu's avatar
    schmittu committed
       ]
      },
      {
       "cell_type": "code",
    
       "execution_count": 12,
    
    schmittu's avatar
    schmittu committed
       "metadata": {},
       "outputs": [
        {
    
         "name": "stdout",
         "output_type": "stream",
         "text": [
          "               a          b\n",
          "count   7.000000   7.000000\n",
          "mean   13.000000  14.300000\n",
          "std    13.490738  14.839811\n",
          "min     0.000000   0.000000\n",
          "25%     2.500000   2.750000\n",
          "50%     9.000000   9.900000\n",
          "75%    20.500000  22.550000\n",
          "max    36.000000  39.600000\n"
         ]
    
    schmittu's avatar
    schmittu committed
        }
       ],
       "source": [
    
        "print(df.describe())"
    
    schmittu's avatar
    schmittu committed
       ]
      },
    
      {
       "cell_type": "markdown",
       "metadata": {},
       "source": [
        "###  Accessing parts of a data frame"
       ]
      },
      {
       "cell_type": "markdown",
       "metadata": {},
       "source": [
        "We can access separate columns using a column name:"
       ]
      },
      {
       "cell_type": "code",
    
       "execution_count": 13,
    
       "metadata": {},
       "outputs": [
        {
    
         "name": "stdout",
         "output_type": "stream",
         "text": [
          "0     0\n",
          "1     1\n",
          "2     4\n",
          "3     9\n",
          "4    16\n",
          "5    25\n",
          "6    36\n",
          "Name: a, dtype: int64\n"
         ]
    
        "print(df[\"a\"])"
    
    schmittu's avatar
    schmittu committed
      {
       "cell_type": "markdown",
       "metadata": {},
       "source": [
        "Single columns are `Series` in `pandas`:"
       ]
      },
      {
       "cell_type": "code",
    
       "execution_count": 14,
    
    schmittu's avatar
    schmittu committed
       "metadata": {},
       "outputs": [
        {
    
         "name": "stdout",
         "output_type": "stream",
         "text": [
          "<class 'pandas.core.series.Series'>\n"
         ]
    
    schmittu's avatar
    schmittu committed
        }
       ],
       "source": [
    
        "print(type(df['a']))"
    
    schmittu's avatar
    schmittu committed
       ]
      },
    
       "execution_count": 15,
    
       "metadata": {},
       "outputs": [
        {
         "name": "stdout",
         "output_type": "stream",
         "text": [
    
    schmittu's avatar
    schmittu committed
          "0      0.0\n",
          "1      3.2\n",
          "2     12.8\n",
          "3     28.8\n",
          "4     51.2\n",
          "5     80.0\n",
          "6    115.2\n",
    
          "dtype: float64\n"
         ]
        }
       ],
       "source": [
        "scores = df[\"a\"] + 2 * df[\"b\"]\n",
        "print(scores)"
       ]
      },
    
    schmittu's avatar
    schmittu committed
      {
       "cell_type": "markdown",
       "metadata": {},
       "source": [
        "<div class=\"alert alert-block alert-warning\">\n",
        "<i class=\"fa fa-warning\"></i>&nbsp;Don't forget that\n",
        "    <ul>\n",
        "        <li> Indexing in Python starts with <code>0</code>\n",
        "        </li>\n",
        "        <li> Upper limits are exclusive\n",
        "            </li>\n",
        "        <li> Negative indices start from the right end, <code>-1</code> is the last element, <code>-2</code> the one before, etc.</li>\n",
        "        <li> <code>:</code> refers to all elements.</li>\n",
        "    </ul>\n",
        "</div>\n",
        "\n",
        "\n"
       ]
      },
    
      {
       "cell_type": "markdown",
       "metadata": {},
       "source": [
        "`df.iloc[row_slice, col_slice]` offers index based access:"
       ]
      },
      {
       "cell_type": "code",
    
       "execution_count": 16,
    
       "metadata": {},
       "outputs": [
        {
    
         "name": "stdout",
         "output_type": "stream",
         "text": [
          "0     0\n",
          "1     1\n",
          "2     4\n",
          "3     9\n",
          "4    16\n",
          "5    25\n",
          "6    36\n",
          "Name: a, dtype: int64\n"
         ]
    
        "print(df.iloc[:, 0])"
    
       ]
      },
      {
       "cell_type": "markdown",
       "metadata": {},
       "source": [
        "\n",
        "\n",
        "To extract rows `1` to `2` (included), and all columns except the last one:"
       ]
      },
      {
       "cell_type": "code",
    
       "execution_count": 17,
    
       "metadata": {},
       "outputs": [
        {
    
         "name": "stdout",
         "output_type": "stream",
         "text": [
          "   a    b\n",
          "1  1  1.1\n",
          "2  4  4.4\n"
         ]
    
        "print(df.iloc[1:3, :-1])"
    
       ]
      },
      {
       "cell_type": "markdown",
       "metadata": {},
       "source": [
        "To extract rows `1` to `2` (included) of the last column:"
       ]
      },
      {
       "cell_type": "code",
    
       "execution_count": 18,
    
       "metadata": {},
       "outputs": [
        {
    
         "name": "stdout",
         "output_type": "stream",
         "text": [
          "1     two\n",
          "2    thee\n",
          "Name: c, dtype: object\n"
         ]
    
        "print(df.iloc[1:3, -1])"
    
       ]
      },
      {
       "cell_type": "markdown",
       "metadata": {},
       "source": [
        "### Filtering a data frame"
       ]
      },
      {
       "cell_type": "code",
    
       "execution_count": 19,
    
       "metadata": {},
       "outputs": [
        {
    
         "name": "stdout",
         "output_type": "stream",
         "text": [
          "   a    b     c\n",
          "0  0  0.0   one\n",
          "1  1  1.1   two\n",
          "2  4  4.4  thee\n",
          "3  9  9.9  four\n"
         ]
    
        "# all rows where the value of a is smaller than 10:\n",
        "print(df[df[\"a\"] < 10])"
    
       ]
      },
      {
       "cell_type": "markdown",
       "metadata": {},
       "source": [
        "This works as follows:"
       ]
      },
      {
       "cell_type": "code",
    
       "execution_count": 20,
    
       "metadata": {},
       "outputs": [
        {
    
         "name": "stdout",
         "output_type": "stream",
         "text": [
          "0    False\n",
          "1    False\n",
          "2     True\n",
          "3     True\n",
          "4     True\n",
          "5     True\n",
          "6     True\n",
          "Name: a, dtype: bool\n"
         ]
    
    schmittu's avatar
    schmittu committed
        "flags = df[\"a\"] > 3\n",
    
        "\n",
        "# we see that flags is a vector with logical values depending on\n",
        "# the given condition \"a > 3\":\n",
        "print(flags)"
    
       "execution_count": 22,
    
       "metadata": {},
       "outputs": [
        {
    
         "name": "stdout",
         "output_type": "stream",
         "text": [
          "    a     b     c\n",
          "2   4   4.4  thee\n",
          "3   9   9.9  four\n",
          "4  16  17.6  five\n",
          "5  25  27.5   one\n",
          "6  36  39.6   two\n"
         ]
    
    schmittu's avatar
    schmittu committed
        }
       ],
       "source": [
    
        "# when we pass these logical values to \"df[...]\" only the \"True rows\"\n",
        "# remain:\n",
        "print(df[flags])"
    
    schmittu's avatar
    schmittu committed
       ]
      },
      {
       "cell_type": "markdown",
       "metadata": {},
       "source": [
        "Another example:"
       ]
      },
      {
       "cell_type": "code",
    
       "execution_count": 23,
    
    schmittu's avatar
    schmittu committed
       "metadata": {},
       "outputs": [
        {
    
         "name": "stdout",
         "output_type": "stream",
         "text": [
          "    a     b    c\n",
          "0   0   0.0  one\n",
          "5  25  27.5  one\n"
         ]
    
        "print(df[df[\"c\"] == \"one\"])"
    
       ]
      },
      {
       "cell_type": "markdown",
       "metadata": {},
       "source": [
        "### Extending a dataframe\n",
        "\n",
        "Adding a new, computed column:"
       ]
      },
      {
       "cell_type": "code",
    
       "execution_count": 24,
    
       "metadata": {},
       "outputs": [
        {
    
         "name": "stdout",
         "output_type": "stream",
         "text": [
          "    a     b     c    d\n",
          "0   0   0.0   one    0\n",
          "1   1   1.1   two    1\n",
          "2   4   4.4  thee   16\n",
          "3   9   9.9  four   81\n",
          "4  16  17.6  five  256\n"
         ]
    
        }
       ],
       "source": [
        "# values in new column d will be values from \"a\" squared:\n",
        "df[\"d\"] = df[\"a\"] ** 2\n",
    
    schmittu's avatar
    schmittu committed
        "\n",
    
        "print(df.head())"
    
       ]
      },
      {
       "cell_type": "markdown",
       "metadata": {},
       "source": [
    
    schmittu's avatar
    schmittu committed
        "We can also overwrite a column. Here we use `apply` to apply the same function to all values in the given column:"
    
       "execution_count": 27,
    
       "metadata": {},
       "outputs": [
        {
    
         "name": "stdout",
         "output_type": "stream",
         "text": [
          "    a     b     c    d\n",
          "0   0   0.0   one    2\n",
          "1   1   1.1   two    3\n",
          "2   4   4.4  thee   18\n",
          "3   9   9.9  four   83\n",
          "4  16  17.6  five  258\n"
         ]
    
    schmittu's avatar
    schmittu committed
        "def increment(v):\n",
        "    return v + 1\n",
        "\n",
        "df[\"d\"] = df[\"d\"].apply(increment)\n",
    
        "print(df.head())"
    
       ]
      },
      {
       "cell_type": "markdown",
       "metadata": {},
       "source": [
        "## numpy\n",
        "\n",
        "`numpy` offers data structures from linear algebra, e.g. vectors and matrices. \n",
        "\n",
        "In contrast to a `pd.DataFrame`, matrices contain numbers of the same type."
       ]
      },
      {
       "cell_type": "code",
    
       "execution_count": 33,
    
       "metadata": {},
       "outputs": [
        {
         "name": "stdout",
         "output_type": "stream",
         "text": [
    
          "[3. 5. 8.]\n"
    
         ]
        }
       ],
       "source": [
        "import numpy as np\n",
        "\n",
    
        "x = np.array([3.0, 5.0, 8.0])\n",
    
        "print(x)"
       ]
      },
      {
       "cell_type": "code",
    
       "execution_count": 34,
    
       "metadata": {},
       "outputs": [
        {
         "name": "stdout",
         "output_type": "stream",
         "text": [
          "(3,)\n"
         ]
        }
       ],
       "source": [
        "print(x.shape)"
       ]
      },
      {
       "cell_type": "code",
    
       "execution_count": 35,
    
       "metadata": {},
       "outputs": [
        {
         "name": "stdout",
         "output_type": "stream",
         "text": [
          "[[1. 2. 3.]\n",
          " [3. 4. 5.]\n",
          " [3. 5. 3.]]\n"
         ]
        }
       ],
       "source": [
        "A = np.array([[1.0, 2.0, 3.0],\n",
        "              [3.0, 4.0, 5.0],\n",
        "              [3.0, 5.0, 3.0],\n",
        "             ])\n",
        "print(A)"
       ]
      },
      {
       "cell_type": "code",
    
       "execution_count": 36,
    
       "metadata": {},
       "outputs": [
        {
         "name": "stdout",
         "output_type": "stream",
         "text": [
          "(3, 3)\n"
         ]
        }
       ],
       "source": [
        "print(A.shape)"
       ]
      },
      {
       "cell_type": "markdown",
       "metadata": {},
       "source": [
        "Indexed access works as usual:"
       ]
      },
      {
       "cell_type": "code",
    
       "execution_count": 37,
    
       "metadata": {},
       "outputs": [
        {
         "name": "stdout",
         "output_type": "stream",
         "text": [
    
          "3.0\n",
          "8.0\n",
          "[5. 8.]\n"
    
        "print(x[0])\n",
        "print(x[-1])\n",
        "print(x[1:])"
    
       "execution_count": 39,
    
       "metadata": {},
       "outputs": [
        {
         "name": "stdout",
         "output_type": "stream",
         "text": [
    
          "3.0\n",
          "[2. 4. 5.]\n"
    
        "print(A[1, 0])\n",
        "print(A[:, 1])"
    
       ]
      },
      {
       "cell_type": "markdown",
       "metadata": {},
       "source": [
        "Numpy offers element-wise function application:"
       ]
      },
      {
       "cell_type": "code",
    
       "execution_count": 40,
    
       "metadata": {},
       "outputs": [
        {
    
         "name": "stdout",
         "output_type": "stream",
         "text": [
          "[[ 1.  4.  9.]\n",
          " [ 9. 16. 25.]\n",
          " [ 9. 25.  9.]]\n"
         ]
    
        }
       ],
       "source": [
        "# caveat: this is element-wise multiplication, not matrix-matrix multiplication\n",
    
        "print(A * A)"
    
       "execution_count": 41,
    
       "metadata": {},
       "outputs": [
        {
    
         "name": "stdout",
         "output_type": "stream",
         "text": [
          "[[16. 25. 22.]\n",
          " [30. 47. 44.]\n",
          " [27. 41. 43.]]\n"
         ]
    
        }
       ],
       "source": [
        "# this is matrix-matrix multiplication:\n",
    
        "print(A @ A)"
    
       "execution_count": 50,
    
       "metadata": {},
       "outputs": [
        {
    
         "name": "stdout",
         "output_type": "stream",
         "text": [
          "[[-2. -1.  0.]\n",
          " [ 0.  1.  2.]\n",
          " [ 0.  2.  0.]]\n"
         ]
    
        "# subtract 3 from all elements:\n",
        "print(A - 3)"
    
       "execution_count": 51,
    
       "metadata": {},
       "outputs": [
        {
    
         "name": "stdout",
         "output_type": "stream",
         "text": [
          "[[2. 1. 0.]\n",
          " [0. 1. 2.]\n",
          " [0. 2. 0.]]\n"
         ]
    
        "# subtract 3 from all elements, then compute absolute\n",
        "# values for every element:\n",
        "print(np.abs(A - 3))"