00_numpy_pandas_matplotlib_intro.ipynb

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<style>\n",
       "    \n",
       "    @import url('http://fonts.googleapis.com/css?family=Source+Code+Pro');\n",
       "    \n",
       "    @import url('http://fonts.googleapis.com/css?family=Kameron');\n",
       "    @import url('http://fonts.googleapis.com/css?family=Crimson+Text');\n",
       "    \n",
       "    @import url('http://fonts.googleapis.com/css?family=Lato');\n",
       "    @import url('http://fonts.googleapis.com/css?family=Source+Sans+Pro');\n",
       "    \n",
       "    @import url('http://fonts.googleapis.com/css?family=Lora'); \n",
       "\n",
       "    \n",
       "    body {\n",
       "        font-family: 'Lora', Consolas, sans-serif;\n",
       "       \n",
       "        -webkit-print-color-adjust: exact important !;\n",
       "        \n",
       "      \n",
       "       \n",
       "    }\n",
       "    \n",
       "    .alert-block {\n",
       "        width: 95%;\n",
       "        margin: auto;\n",
       "    }\n",
       "    \n",
       "    .rendered_html code\n",
       "    {\n",
       "        color: black;\n",
       "        background: #eaf0ff;\n",
       "        background: #f5f5f5; \n",
       "        padding: 1pt;\n",
       "        font-family:  'Source Code Pro', Consolas, monocco, monospace;\n",
       "    }\n",
       "    \n",
       "    p {\n",
       "      line-height: 140%;\n",
       "    }\n",
       "    \n",
       "    strong code {\n",
       "        background: red;\n",
       "    }\n",
       "    \n",
       "    .rendered_html strong code\n",
       "    {\n",
       "        background: #f5f5f5;\n",
       "    }\n",
       "    \n",
       "    .CodeMirror pre {\n",
       "    font-family: 'Source Code Pro', monocco, Consolas, monocco, monospace;\n",
       "    }\n",
       "    \n",
       "    .cm-s-ipython span.cm-keyword {\n",
       "        font-weight: normal;\n",
       "     }\n",
       "     \n",
       "     strong {\n",
       "         background: #f5f5f5;\n",
       "         margin-top: 4pt;\n",
       "         margin-bottom: 4pt;\n",
       "         padding: 2pt;\n",
       "         border: 0.5px solid #a0a0a0;\n",
       "         font-weight: bold;\n",
       "         color: darkred;\n",
       "     }\n",
       "     \n",
       "    \n",
       "    div #notebook {\n",
       "        # font-size: 10pt; \n",
       "        line-height: 145%;\n",
       "        }\n",
       "        \n",
       "    li {\n",
       "        line-height: 145%;\n",
       "    }\n",
       "\n",
       "    div.output_area pre {\n",
       "        background: #fff9d8 !important;\n",
       "        padding: 5pt;\n",
       "       \n",
       "       -webkit-print-color-adjust: exact; \n",
       "        \n",
       "    }\n",
       " \n",
       "    \n",
       " \n",
       "    h1, h2, h3, h4 {\n",
       "        font-family: Kameron, arial;\n",
       "\n",
       "\n",
       "    }\n",
       "    \n",
       "    div#maintoolbar {display: none !important;}\n",
       "</style>\n",
       "    <script>\n",
       "IPython.OutputArea.prototype._should_scroll = function(lines) {\n",
       "        return false;\n",
       "}\n",
       "    </script>\n"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "execution_count": 1,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# IGNORE THIS CELL WHICH CUSTOMIZES LAYOUT AND STYLING OF THE NOTEBOOK !\n",
    "import matplotlib.pyplot as plt\n",
    "%matplotlib inline\n",
    "%config InlineBackend.figure_format = 'retina'\n",
    "import warnings\n",
    "warnings.filterwarnings('ignore', category=FutureWarning)\n",
    "warnings.filterwarnings = lambda *a, **kw: None\n",
    "from IPython.core.display import HTML; HTML(open(\"custom.html\", \"r\").read())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Chapter 0: Introduction \n",
    "\n",
    "\n",
    "<div class=\"alert alert-block alert-warning\">\n",
    "    <i class=\"fa fa-warning\"></i>&nbsp;This script introduces <code>numpy</code>, <code>pandas</code> and <code>matplotlib</code> and <code>seaborn</code> as far as we use it in the following course. \n",
    "\n",
    "\n",
    "Thus it is not a comprehensive introduction to these libraries !\n",
    "    </div>"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## pandas\n",
    "\n",
    "`pandas` allows handling tabular data as so called `DataFrame`s. Tabular data means that columns have types. Within a colum values are of the same type, but types can differ between columns."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Some basics"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "a,b,c\r\n",
      "0,0.0,one\r\n",
      "1,1.1,two\r\n",
      "4,4.4,thee\r\n",
      "9,9.9,four\r\n",
      "16,17.6,five\r\n",
      "25,27.5,one\r\n",
      "36,39.6,two\r\n"
     ]
    }
   ],
   "source": [
    "# show content of csv file, only works in notebook:\n",
    "!cat data/example.csv"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "    a     b     c\n",
      "0   0   0.0   one\n",
      "1   1   1.1   two\n",
      "2   4   4.4  thee\n",
      "3   9   9.9  four\n",
      "4  16  17.6  five\n",
      "5  25  27.5   one\n",
      "6  36  39.6   two\n"
     ]
    }
   ],
   "source": [
    "# read file with pandas\n",
    "\n",
    "import pandas as pd\n",
    "\n",
    "df = pd.read_csv(\"data/example.csv\")\n",
    "print(df)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "<div class=\"alert alert-block alert-info\">\n",
    "<i class=\"fa fa-warning\"></i>&nbsp;<code>pandas</code> also \n",
    "supports reading and writing of other file formats, like <code>.xlsx</code>, <code>.hdf5</code> or <code>sqlite3</code> files.\n",
    "</div>\n",
    "\n",
    "\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 62,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 7 entries, 0 to 6\n",
      "Data columns (total 4 columns):\n",
      "a    7 non-null int64\n",
      "b    7 non-null float64\n",
      "c    7 non-null object\n",
      "d    7 non-null int64\n",
      "dtypes: float64(1), int64(2), object(1)\n",
      "memory usage: 304.0+ bytes\n"
     ]
    }
   ],
   "source": [
    "df.info()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "You can see that the colums `a`, `b` and `c` have different types `int64`, `float64` and `object`. The latter can be read as \"anything but a number\"."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(7, 3)\n"
     ]
    }
   ],
   "source": [
    "# number of rows and columns\n",
    "print(df.shape)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The `.shape` is numbers of rows times number of columns."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "To show the first 5 rows of a data frame we can use `.head()`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "    a     b     c\n",
      "0   0   0.0   one\n",
      "1   1   1.1   two\n",
      "2   4   4.4  thee\n",
      "3   9   9.9  four\n",
      "4  16  17.6  five\n"
     ]
    }
   ],
   "source": [
    "print(df.head())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "And `.tail()` shows the last 5 rows:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "    a     b     c\n",
      "2   4   4.4  thee\n",
      "3   9   9.9  four\n",
      "4  16  17.6  five\n",
      "5  25  27.5   one\n",
      "6  36  39.6   two\n"
     ]
    }
   ],
   "source": [
    "print(df.tail())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Both accept an integer to change the number of rows to show:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "   a    b     c\n",
      "0  0  0.0   one\n",
      "1  1  1.1   two\n",
      "2  4  4.4  thee\n"
     ]
    }
   ],
   "source": [
    "print(df.head(3))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Compute some statistics on the columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "               a          b\n",
      "count   7.000000   7.000000\n",
      "mean   13.000000  14.300000\n",
      "std    13.490738  14.839811\n",
      "min     0.000000   0.000000\n",
      "25%     2.500000   2.750000\n",
      "50%     9.000000   9.900000\n",
      "75%    20.500000  22.550000\n",
      "max    36.000000  39.600000\n"
     ]
    }
   ],
   "source": [
    "print(df.describe())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "###  Accessing parts of a data frame"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We can access separate columns using a column name:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0     0\n",
      "1     1\n",
      "2     4\n",
      "3     9\n",
      "4    16\n",
      "5    25\n",
      "6    36\n",
      "Name: a, dtype: int64\n"
     ]
    }
   ],
   "source": [
    "print(df[\"a\"])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Single columns are `Series` in `pandas`:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.series.Series'>\n"
     ]
    }
   ],
   "source": [
    "print(type(df['a']))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0      0.0\n",
      "1      3.2\n",
      "2     12.8\n",
      "3     28.8\n",
      "4     51.2\n",
      "5     80.0\n",
      "6    115.2\n",
      "dtype: float64\n"
     ]
    }
   ],
   "source": [
    "scores = df[\"a\"] + 2 * df[\"b\"]\n",
    "print(scores)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "<div class=\"alert alert-block alert-warning\">\n",
    "<i class=\"fa fa-warning\"></i>&nbsp;Don't forget that\n",
    "    <ul>\n",
    "        <li> Indexing in Python starts with <code>0</code>\n",
    "        </li>\n",
    "        <li> Upper limits are exclusive\n",
    "            </li>\n",
    "        <li> Negative indices start from the right end, <code>-1</code> is the last element, <code>-2</code> the one before, etc.</li>\n",
    "        <li> <code>:</code> refers to all elements.</li>\n",
    "    </ul>\n",
    "</div>\n",
    "\n",
    "\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "`df.iloc[row_slice, col_slice]` offers index based access:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0     0\n",
      "1     1\n",
      "2     4\n",
      "3     9\n",
      "4    16\n",
      "5    25\n",
      "6    36\n",
      "Name: a, dtype: int64\n"
     ]
    }
   ],
   "source": [
    "print(df.iloc[:, 0])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "\n",
    "\n",
    "To extract rows `1` to `2` (included), and all columns up to the last one:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "   a    b\n",
      "1  1  1.1\n",
      "2  4  4.4\n"
     ]
    }
   ],
   "source": [
    "print(df.iloc[1:3, :-1])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "To extract the last column:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1     two\n",
      "2    thee\n",
      "Name: c, dtype: object\n"
     ]
    }
   ],
   "source": [
    "print(df.iloc[1:3, -1])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Filtering a data frame"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "   a    b     c\n",
      "0  0  0.0   one\n",
      "1  1  1.1   two\n",
      "2  4  4.4  thee\n",
      "3  9  9.9  four\n"
     ]
    }
   ],
   "source": [
    "# all rows where the value of a is smaller than 10:\n",
    "print(df[df[\"a\"] < 10])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "This works as follows:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0    False\n",
      "1    False\n",
      "2     True\n",
      "3     True\n",
      "4     True\n",
      "5     True\n",
      "6     True\n",
      "Name: a, dtype: bool\n"
     ]
    }
   ],
   "source": [
    "flags = df[\"a\"] > 3\n",
    "\n",
    "# we see that flags is a vector with logical values depending on\n",
    "# the given condition \"a > 3\":\n",
    "print(flags)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "    a     b     c\n",
      "2   4   4.4  thee\n",
      "3   9   9.9  four\n",
      "4  16  17.6  five\n",
      "5  25  27.5   one\n",
      "6  36  39.6   two\n"
     ]
    }
   ],
   "source": [
    "# when we pass these logical values to \"df[...]\" only the \"True rows\"\n",
    "# remain:\n",
    "print(df[flags])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Another example:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "    a     b    c\n",
      "0   0   0.0  one\n",
      "5  25  27.5  one\n"
     ]
    }
   ],
   "source": [
    "print(df[df[\"c\"] == \"one\"])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Extending a dataframe\n",
    "\n",
    "Adding a new, computed column:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "    a     b     c    d\n",
      "0   0   0.0   one    0\n",
      "1   1   1.1   two    1\n",
      "2   4   4.4  thee   16\n",
      "3   9   9.9  four   81\n",
      "4  16  17.6  five  256\n"
     ]
    }
   ],
   "source": [
    "# values in new column d will be values from \"a\" squared:\n",
    "df[\"d\"] = df[\"a\"] ** 2\n",
    "\n",
    "print(df.head())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We can also overwrite a column, here we use `apply` to apply the same function on all values in the given column:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "    a     b     c    d\n",
      "0   0   0.0   one    2\n",
      "1   1   1.1   two    3\n",
      "2   4   4.4  thee   18\n",
      "3   9   9.9  four   83\n",
      "4  16  17.6  five  258\n"
     ]
    }
   ],
   "source": [
    "def increment(v):\n",
    "    return v + 1\n",
    "\n",
    "df[\"d\"] = df[\"d\"].apply(increment)\n",
    "\n",
    "print(df.head())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## numpy\n",
    "\n",
    "`numpy` offers data structures from linear algebra, e.g. vectors and matrices. \n",
    "\n",
    "In contrast to `pd.DataFrame` matrices contain numbers of the same type."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[3. 5. 8.]\n"
     ]
    }
   ],
   "source": [
    "import numpy as np\n",
    "\n",
    "x = np.array([3.0, 5.0, 8.0])\n",
    "print(x)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(3,)\n"
     ]
    }
   ],
   "source": [
    "print(x.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[[1. 2. 3.]\n",
      " [3. 4. 5.]\n",
      " [3. 5. 3.]]\n"
     ]
    }
   ],
   "source": [
    "A = np.array([[1.0, 2.0, 3.0],\n",
    "              [3.0, 4.0, 5.0],\n",
    "              [3.0, 5.0, 3.0],\n",
    "             ])\n",
    "print(A)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(3, 3)\n"
     ]
    }
   ],
   "source": [
    "print(A.shape)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Indexed access works as usual:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "3.0\n",
      "8.0\n",
      "[5. 8.]\n"
     ]
    }
   ],
   "source": [
    "print(x[0])\n",
    "print(x[-1])\n",
    "print(x[1:])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "3.0\n",
      "[2. 4. 5.]\n"
     ]
    }
   ],
   "source": [
    "print(A[1, 0])\n",
    "print(A[:, 1])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Numpy offers element-wise function application:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[[ 1.  4.  9.]\n",
      " [ 9. 16. 25.]\n",
      " [ 9. 25.  9.]]\n"
     ]
    }
   ],
   "source": [
    "# caveat ! not matrix-matrix multiplication\n",
    "print(A * A)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[[16. 25. 22.]\n",
      " [30. 47. 44.]\n",
      " [27. 41. 43.]]\n"
     ]
    }
   ],
   "source": [
    "# this is matrix-matrix multiplication:\n",
    "print(A @ A)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[[-2. -1.  0.]\n",
      " [ 0.  1.  2.]\n",
      " [ 0.  2.  0.]]\n"
     ]
    }
   ],
   "source": [
    "# substract 3 from all elements:\n",
    "print(A - 3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[[2. 1. 0.]\n",
      " [0. 1. 2.]\n",
      " [0. 2. 0.]]\n"
     ]
    }
   ],
   "source": [
    "# subtract 3 from all elements, then compute absolute\n",
    "# values for every element:\n",
    "print(np.abs(A - 3))"
   ]