diff --git a/08_a-neural_networks.ipynb b/08_a-neural_networks.ipynb
index ef048fda1130d889522f43ff1a4c87b82d09d1e4..cbd2137100b068365e0e6a0b542fe747cd039c4f 100644
--- a/08_a-neural_networks.ipynb
+++ b/08_a-neural_networks.ipynb
@@ -2,51 +2,47 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "ename": "ModuleNotFoundError",
-     "evalue": "No module named 'tensorflow'",
-     "output_type": "error",
-     "traceback": [
-      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
-      "\u001b[0;31mModuleNotFoundError\u001b[0m                       Traceback (most recent call last)",
-      "\u001b[0;32m<ipython-input-2-8815e0f9bf1a>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m      8\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      9\u001b[0m \u001b[0mseed\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m42\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 10\u001b[0;31m \u001b[0;32mimport\u001b[0m \u001b[0mtensorflow\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mtf\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m     11\u001b[0m \u001b[0mtf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrandom\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mset_seed\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m42\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     12\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mmatplotlib\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpyplot\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mplt\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
-      "\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'tensorflow'"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "# IGNORE THIS CELL WHICH CUSTOMIZES LAYOUT AND STYLING OF THE NOTEBOOK !\n",
     "from numpy.random import seed\n",
     "\n",
     "import os, sys\n",
     "\n",
     "if sys.platform == \"win32\":\n",
     "    os.add_dll_directory(os.path.dirname(sys.executable))\n",
     "\n",
     "seed(42)\n",
     "import tensorflow as tf\n",
+    "\n",
     "tf.random.set_seed(42)\n",
-    "import matplotlib.pyplot as plt\n",
     "import matplotlib as mpl\n",
+    "import matplotlib.pyplot as plt\n",
     "import seaborn as sns\n",
+    "\n",
     "sns.set(style=\"darkgrid\")\n",
-    "mpl.rcParams['lines.linewidth'] = 3\n",
+    "mpl.rcParams[\"lines.linewidth\"] = 3\n",
     "%matplotlib inline\n",
     "%config InlineBackend.figure_format = 'retina'\n",
     "%config IPCompleter.greedy=True\n",
     "import warnings\n",
-    "warnings.filterwarnings('ignore', category=FutureWarning)\n",
-    "from IPython.core.display import HTML; HTML(open(\"custom.html\", \"r\").read())"
+    "\n",
+    "warnings.filterwarnings(\"ignore\", category=FutureWarning)\n",
+    "from IPython.core.display import HTML\n",
+    "\n",
+    "HTML(open(\"custom.html\", \"r\").read())"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "# Chapter 8: Introduction to Neural Networks\n",
+    "# Chapter 8a: Introduction to Neural Networks\n",
     "\n",
     "\n",
     "\n",
@@ -71,7 +67,8 @@
     "| 1997 | Long-short term memory (LSTM) model |\n",
     "| 1998 | LeNet-5 |\n",
     "| 2014 | Gated Recurrent Units (GRU), Generative Adversarial Networks (GAN) |\n",
-    "| 2015 | ResNet |"
+    "| 2015 | ResNet |\n",
+    "| 2017 | Transformer model is proposed |"
    ]
   },
   {
@@ -82,7 +79,7 @@
     "* Data\n",
     "* Data\n",
     "* Data\n",
-    "* Availability of Graphic Processing Units (GPUs)\n",
+    "* Availability of Graphics Processing Units (GPUs)\n",
     "* Algorithmic developments which allow for efficient training of networks networks and making them deeper\n",
     "* Development of high-level libraries/APIs have made the field much more accessible than it was a decade ago"
    ]
@@ -95,7 +92,7 @@
     "<center>\n",
     "<figure>\n",
     "<img src=\"./images/neuralnets/neural_net_ex.svg\" width=\"700\"/>\n",
-    "<figcaption>A three layer densely connected Neural Network (By convention the input layer is not counted).</figcaption>\n",
+    "<figcaption>A <b>three layer</b> densely connected Neural Network (By convention the input layer is not counted).</figcaption>\n",
     "</figure>\n",
     "</center>"
    ]
@@ -116,7 +113,7 @@
     "<center>\n",
     "<figure>\n",
     "<img src=\"./images/neuralnets/perceptron_ex.svg\" width=\"400\"/>\n",
-    "<figcaption>A simple perceptron with three inputs and one output.</figcaption>\n",
+    "<figcaption>A simple perceptron with <b>three inputs</b> and <b>one output</b>.</figcaption>\n",
     "</figure>\n",
     "</center>\n",
     "\n",
@@ -150,8 +147,8 @@
    "outputs": [],
    "source": [
     "import matplotlib.pyplot as plt\n",
-    "import seaborn as sns\n",
-    "import numpy as np"
+    "import numpy as np\n",
+    "import seaborn as sns"
    ]
   },
   {
@@ -161,13 +158,13 @@
    "outputs": [],
    "source": [
     "# Plotting the step function\n",
-    "x = np.arange(-2,2.1,0.01)\n",
+    "x = np.arange(-2, 2.1, 0.01)\n",
     "y = np.zeros(len(x))\n",
-    "threshold = 0.\n",
-    "y[x>threshold] = 1.\n",
-    "step_plot = sns.lineplot(x, y).set_title('Step function') ;\n",
-    "plt.xlabel('weighted_sum') ;\n",
-    "plt.ylabel('f(weighted_sum)') ;"
+    "threshold = 0.0\n",
+    "y[x > threshold] = 1.0\n",
+    "step_plot = sns.lineplot(x, y).set_title(\"Step function\")\n",
+    "plt.xlabel(\"weighted_sum\")\n",
+    "plt.ylabel(\"f(weighted_sum)\");"
    ]
   },
   {
@@ -211,12 +208,12 @@
     "w = [1, 1]\n",
     "# (x1, x2) pairs\n",
     "x1 = [0, 1, 0, 1]\n",
+    "\n",
     "x2 = [0, 0, 1, 1]\n",
     "# Calling the perceptron function\n",
     "output = perceptron([x1, x2], w, threshold)\n",
     "for i in range(len(output)):\n",
-    "    print(\"Perceptron output for x1, x2 = \", x1[i], \",\", x2[i],\n",
-    "          \" is \", output[i])"
+    "    print(\"Perceptron output for x1, x2 = \", x1[i], \",\", x2[i], \" is \", output[i])"
    ]
   },
   {
@@ -235,12 +232,12 @@
     "def perceptron_DB(x1, x2, w, threshold):\n",
     "    # Plotting the decision boundary of the perceptron\n",
     "    plt.scatter(x1, x2, color=\"black\")\n",
-    "    plt.xlim(-1,2)\n",
-    "    plt.ylim(-1,2)\n",
+    "    plt.xlim(-1, 2)\n",
+    "    plt.ylim(-1, 2)\n",
     "    # The decision boundary is a line given by\n",
     "    # w_1*x_1+w_2*x_2-threshold=0\n",
     "    x1 = np.arange(-3, 4)\n",
-    "    x2 = (threshold - x1*w[0])/w[1]\n",
+    "    x2 = (threshold - x1 * w[0]) / w[1]\n",
     "    sns.lineplot(x1, x2, **{\"color\": \"black\"})\n",
     "    plt.xlabel(\"x$_1$\", fontsize=16)\n",
     "    plt.ylabel(\"x$_2$\", fontsize=16)\n",
@@ -248,18 +245,22 @@
     "    pts_tmp = np.arange(-2, 2.1, 0.02)\n",
     "    points = np.array(np.meshgrid(pts_tmp, pts_tmp)).T.reshape(-1, 2)\n",
     "    outputs = perceptron(points.T, w, threshold)\n",
-    "    plt.plot(points[:, 0][outputs == 0], points[:, 1][outputs == 0],\n",
-    "             \"o\",\n",
-    "             color=\"steelblue\",\n",
-    "             markersize=1,\n",
-    "             alpha=0.04,\n",
-    "             )\n",
-    "    plt.plot(points[:, 0][outputs == 1], points[:, 1][outputs == 1],\n",
-    "             \"o\",\n",
-    "             color=\"chocolate\",\n",
-    "             markersize=1,\n",
-    "             alpha=0.04,\n",
-    "             )\n",
+    "    plt.plot(\n",
+    "        points[:, 0][outputs == 0],\n",
+    "        points[:, 1][outputs == 0],\n",
+    "        \"o\",\n",
+    "        color=\"steelblue\",\n",
+    "        markersize=1,\n",
+    "        alpha=0.04,\n",
+    "    )\n",
+    "    plt.plot(\n",
+    "        points[:, 0][outputs == 1],\n",
+    "        points[:, 1][outputs == 1],\n",
+    "        \"o\",\n",
+    "        color=\"chocolate\",\n",
+    "        markersize=1,\n",
+    "        alpha=0.04,\n",
+    "    )\n",
     "    plt.title(\"Blue color = 0 and Chocolate = 1\")"
    ]
   },
@@ -277,8 +278,8 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "Exercise section\n",
-    "* Compute a Boolean \"OR\" using a perceptron\n",
+    "## Exercise section\n",
+    "1. Compute a Boolean \"OR\" using a perceptron\n",
     "\n",
     "Hint: copy the code from the \"AND\" example and edit the weights and/or threshold"
    ]
@@ -320,16 +321,15 @@
    "source": [
     "# Solution\n",
     "# Calculating Boolean OR using a perceptron\n",
-    "threshold=0.6\n",
+    "threshold = 0.6\n",
     "# (w1, w2)\n",
-    "w=[1,1]\n",
+    "w = [1, 1]\n",
     "# (x1, x2) pairs\n",
     "x1 = [0, 1, 0, 1]\n",
     "x2 = [0, 0, 1, 1]\n",
     "output = perceptron([x1, x2], w, threshold)\n",
     "for i in range(len(output)):\n",
-    "    print(\"Perceptron output for x1, x2 = \", x1[i], \",\", x2[i],\n",
-    "          \" is \", output[i])\n",
+    "    print(\"Perceptron output for x1, x2 = \", x1[i], \",\", x2[i], \" is \", output[i])\n",
     "perceptron_DB(x1, x2, w, threshold)"
    ]
   },
@@ -337,8 +337,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "Exercise section\n",
-    "* Create a NAND gate using a perceptron\n",
+    "2. Create a NAND gate using a perceptron\n",
     "\n",
     "Boolean NAND\n",
     "\n",
@@ -373,16 +372,16 @@
     "# Solution\n",
     "# Calculating Boolean NAND using a perceptron\n",
     "import matplotlib.pyplot as plt\n",
-    "threshold=-1.5\n",
+    "\n",
+    "threshold = -1.5\n",
     "# (w1, w2)\n",
-    "w=[-1,-1]\n",
+    "w = [-1, -1]\n",
     "# (x1, x2) pairs\n",
     "x1 = [0, 1, 0, 1]\n",
     "x2 = [0, 0, 1, 1]\n",
     "output = perceptron([x1, x2], w, threshold)\n",
     "for i in range(len(output)):\n",
-    "    print(\"Perceptron output for x1, x2 = \", x1[i], \",\", x2[i],\n",
-    "          \" is \", output[i])\n",
+    "    print(\"Perceptron output for x1, x2 = \", x1[i], \",\", x2[i], \" is \", output[i])\n",
     "perceptron_DB(x1, x2, w, threshold)"
    ]
   },
@@ -397,17 +396,17 @@
     "**WHAT CAN WE DO?**\n",
     "\n",
     "\n",
-    "Hint: Think about what is the significance of the NAND gate we have created above?"
+    "**Hint:** Think about what is the significance of the NAND gate we have created above?"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "### Multi-layer perceptrons\n",
+    "## Multi-layer perceptrons\n",
     "\n",
     "\n",
-    "Answer: We said a single perceptron can't compute a \"XOR\" function. We didn't say that about **multiple Perceptrons** put together.\n",
+    "**Answer:** We said a single perceptron can't compute a \"XOR\" function. We didn't say that about **multiple Perceptrons** put together.\n",
     "\n",
     "The normal densely connected neural network is sometimes also called \"Multi-layer\" perceptron.\n",
     "\n",
@@ -423,12 +422,32 @@
   },
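+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "A quick check of that claim (an illustrative sketch, assuming the `perceptron` helper defined above): the classic construction XOR(a, b) = NAND(NAND(a, NAND(a, b)), NAND(b, NAND(a, b))) needs only four NAND perceptrons."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Illustration: building XOR out of four NAND perceptrons\n",
+    "# (assumes the perceptron() helper and the NAND weights shown above)\n",
+    "import numpy as np\n",
+    "\n",
+    "\n",
+    "def nand(a, b):\n",
+    "    # a single perceptron wired as a NAND gate (w = [-1, -1], threshold = -1.5)\n",
+    "    return np.array(perceptron([a, b], [-1, -1], -1.5))\n",
+    "\n",
+    "\n",
+    "x1 = np.array([0, 1, 0, 1])\n",
+    "x2 = np.array([0, 0, 1, 1])\n",
+    "\n",
+    "h = nand(x1, x2)\n",
+    "xor = nand(nand(x1, h), nand(x2, h))\n",
+    "print(\"XOR outputs:\", xor)"
+   ]
+  },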
   {
    "cell_type": "markdown",
-   "metadata": {},
+   "metadata": {
+    "tags": []
+   },
    "source": [
     "## Learning\n",
     "\n",
-    "We know that we can compute complicated functions by combining a number of perceptrons.\n",
-    "\n",
+    "We know that we can compute complicated functions by combining a number of perceptrons."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "<div class=\"alert alert-block alert-info\">\n",
+    "<i class=\"fa fa-info-circle\"></i>\n",
+    "<a href=https://en.wikipedia.org/wiki/Universal_approximation_theorem>Universal Approximation Theorem:</a>\n",
+    "    Universal approximation theorems imply that neural networks can represent a wide variety of interesting functions when given appropriate weights.\n",
+    "    </div>"
+   ]
+  },
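+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "To build some intuition for this (an illustrative sketch, not a proof): even two sigmoid units with hand-picked weights combine into a localized \"bump\", and sums of such bumps can be used to approximate quite general functions."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Illustration: two sigmoid units combine into a localized \"bump\"\n",
+    "import matplotlib.pyplot as plt\n",
+    "import numpy as np\n",
+    "\n",
+    "\n",
+    "def sigmoid(z):\n",
+    "    return 1 / (1 + np.exp(-z))\n",
+    "\n",
+    "\n",
+    "x = np.linspace(-3, 3, 300)\n",
+    "# difference of two shifted, steep sigmoids ~ a bump between -1 and 1\n",
+    "bump = sigmoid(10 * (x + 1)) - sigmoid(10 * (x - 1))\n",
+    "plt.plot(x, bump);"
+   ]
+  },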
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "tags": []
+   },
+   "source": [
     "In the perceptron examples we had set the model parameters (weights and threshold) by hand.\n",
     "\n",
     "This is something we definitely **DO NOT** want to do or even can do for big networks.\n",
@@ -438,7 +457,7 @@
     "<div class=\"alert alert-block alert-warning\">\n",
     "    <i class=\"fa fa-info-circle\"></i>&nbsp; <strong>Threshold -> bias</strong>  \n",
     "    \n",
-    "Before we go further we need to introduce one change. The threshold which we saw in the step activation function above is moved to the left side of the equation and is called **bias**.\n",
+    "Before we go further we need to introduce one change. In the neural networks literature, the threshold which we saw in the step activation function above is moved to the left side of the equation and is called **bias**.\n",
     "\n",
     "$$\n",
     "f = \\left\\{\n",
@@ -455,10 +474,11 @@
   },
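+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "A small sanity check (an illustrative sketch; the helper name is ours): with the bias $b = -\\text{threshold}$, the condition $w \\cdot x + b > 0$ gives exactly the same outputs as the threshold form used in the perceptron examples above."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Illustration: the bias form of the perceptron\n",
+    "import numpy as np\n",
+    "\n",
+    "\n",
+    "def perceptron_with_bias(x, w, b):\n",
+    "    # fires when w.x + b > 0, i.e. w.x > threshold with b = -threshold\n",
+    "    return np.where(np.dot(w, x) + b > 0, 1, 0)\n",
+    "\n",
+    "\n",
+    "# (x1, x2) pairs\n",
+    "x1 = [0, 1, 0, 1]\n",
+    "x2 = [0, 0, 1, 1]\n",
+    "# Boolean AND: weights (1, 1) and threshold 1.5 become bias b = -1.5\n",
+    "print(perceptron_with_bias([x1, x2], [1, 1], -1.5))"
+   ]
+  },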
   {
    "cell_type": "markdown",
-   "metadata": {},
+   "metadata": {
+    "tags": []
+   },
    "source": [
-    "In order to algorithmically set/learn the weights and bias we need to choose an appropriate loss function for the problem at hand and solve an optimization problem.\n",
-    "We will explain below what this means.\n",
+    "In order to algorithmically set/learn the weights and biases we need to choose an appropriate loss function for the problem at hand and solve an optimization problem.\n",
     "\n",
     "\n",
     "### Loss function\n",
@@ -514,6 +534,13 @@
     "</div>"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "In practice modified versions of gradient descent such as <a href=https://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf>RMSprop</a> and <a href=https://arxiv.org/abs/1412.6980> Adam optimizer</a> are used."
+   ]
+  },
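+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "A minimal sketch of the idea behind such modifications (a toy 1-d example, not how the libraries implement it): RMSprop rescales each step by a running average of the squared gradients."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Illustration: plain gradient descent vs. the RMSprop update rule\n",
+    "# on the toy loss L(w) = w**2, whose gradient is 2*w\n",
+    "import numpy as np\n",
+    "\n",
+    "\n",
+    "def grad(w):\n",
+    "    return 2 * w\n",
+    "\n",
+    "\n",
+    "w_gd = w_rms = 5.0\n",
+    "lr, decay, eps, v = 0.1, 0.9, 1e-8, 0.0\n",
+    "for _ in range(50):\n",
+    "    w_gd = w_gd - lr * grad(w_gd)  # plain gradient descent step\n",
+    "    v = decay * v + (1 - decay) * grad(w_rms) ** 2  # running mean of squared gradients\n",
+    "    w_rms = w_rms - lr * grad(w_rms) / (np.sqrt(v) + eps)  # RMSprop step\n",
+    "print(\"gradient descent:\", w_gd, \" RMSprop:\", w_rms)"
+   ]
+  },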
   {
    "cell_type": "code",
    "execution_count": null,
@@ -521,12 +548,12 @@
    "outputs": [],
    "source": [
     "import matplotlib.pyplot as plt\n",
-    "import seaborn as sns\n",
     "import numpy as np\n",
+    "import seaborn as sns\n",
     "\n",
-    "plt.figure(figsize=(10, 4)) ;\n",
+    "plt.figure(figsize=(10, 4))\n",
     "\n",
-    "pts=np.arange(-20,20, 0.1) ;"
+    "pts = np.arange(-20, 20, 0.1);"
    ]
   },
   {
@@ -552,7 +579,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "sns.lineplot(pts, 1/(1+np.exp(-pts))) ;"
+    "sns.lineplot(pts, 1 / (1 + np.exp(-pts)));"
    ]
   },
   {
@@ -572,7 +599,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "sns.lineplot(pts, np.tanh(pts*np.pi)) ;"
+    "sns.lineplot(pts, np.tanh(pts * np.pi));"
    ]
   },
   {
@@ -592,8 +619,8 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "pts_relu=[max(0,i) for i in pts];\n",
-    "plt.plot(pts, pts_relu) ;"
+    "pts_relu = [max(0, i) for i in pts]\n",
+    "plt.plot(pts, pts_relu);"
    ]
   },
   {
@@ -624,11 +651,11 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "alpha=0.1 # Large alpha chosen for plotting purposes\n",
-    "pts_leakyrelu=[max(alpha*i,i) for i in pts];\n",
-    "plt.plot(pts, pts_leakyrelu) ;\n",
-    "plt.xlim(-5,5);\n",
-    "plt.ylim(-1,5);"
+    "alpha = 0.1  # Large alpha chosen for plotting purposes\n",
+    "pts_leakyrelu = [max(alpha * i, i) for i in pts]\n",
+    "plt.plot(pts, pts_leakyrelu)\n",
+    "plt.xlim(-5, 5)\n",
+    "plt.ylim(-1, 5);"
    ]
   },
   {
@@ -654,11 +681,12 @@
    "outputs": [],
    "source": [
     "import math\n",
-    "alpha=1\n",
-    "pts_elu=[alpha*(math.exp(i)-1) if i<0 else i for i in pts]\n",
-    "plt.plot(pts, pts_elu) ;\n",
-    "plt.xlim(-5,5);\n",
-    "plt.ylim(-2,5);"
+    "\n",
+    "alpha = 1\n",
+    "pts_elu = [alpha * (math.exp(i) - 1) if i < 0 else i for i in pts]\n",
+    "plt.plot(pts, pts_elu)\n",
+    "plt.xlim(-5, 5)\n",
+    "plt.ylim(-2, 5);"
    ]
   },
   {
@@ -672,23 +700,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "<div class=\"alert alert-block alert-info\">\n",
-    "<p><i class=\"fa fa-warning\"></i>&nbsp;\n",
-    "Why don't we just use a simple linear activation function?\n",
-    "    \n",
-    "Linear activations are **NOT** used because it can be mathematically shown that if they are used then the output is just a linear function of the input. So we cannot learn interesting and complex functions by adding any number of hidden layers.\n",
-    "\n",
-    "The only exception when we do want to use a linear activation is for the output layer of a network when solving a regression problem.\n",
-    "\n",
-    "</p>\n",
-    "</div>"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "### Exercise section - Google Playground\n",
+    "## Exercise section - Google Playground\n",
     "\n",
     "A great tool from Google to develop a feeling for the workings of neural networks.\n",
     "\n",
@@ -696,1356 +708,34 @@
     "\n",
     "<img src=\"./images/neuralnets/google_playground.png\"/>\n",
     "\n",
-    "**Walkthrough by instructor**\n",
+    "**Walkthrough of the interface by the instructor**\n",
     "\n",
-    "Some concepts to look at:\n",
-    "\n",
-    "* Simple vs Complex models (Effect of network size)\n",
-    "* Optimization results\n",
-    "* Effect of activation functions"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Introduction to TensorFlow (keras api)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "### A bit about Keras?\n",
+    "Try to set up a simple neural network to solve the circle example:\n",
     "\n",
-    "* It is a high level API to create and work with neural networks\n",
-    "* Used to support multiple backends such as **TensorFlow** from Google, **Theano** (Although Theano is dead now) and **CNTK** (Microsoft Cognitive Toolkit), up till release 2.3.0 \n",
-    "* Very good for creating neural nets quickly and hides away a lot of tedious work\n",
-    "* Has been incorporated into official TensorFlow (which obviously only works with tensorflow) and as of TensorFlow 2.0 this is the main api to use it\n"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "<center>\n",
-    "<figure>\n",
-    "<img src=\"./images/neuralnets/neural_net_keras_1.svg\" width=\"700\"/>\n",
-    "<figcaption>Building this model in Keras</figcaption>\n",
-    "</figure>\n",
-    "</center>"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Say hello to Tensorflow\n",
-    "from tensorflow.keras.models import Sequential\n",
-    "from tensorflow.keras.layers import Dense, Activation\n",
-    "\n",
-    "# Creating a model\n",
-    "model = Sequential()\n",
-    "\n",
-    "# Adding layers to this model\n",
-    "# 1st Hidden layer\n",
-    "# A Dense/fully-connected layer which takes as input a \n",
-    "# feature array of shape (samples, num_features)\n",
-    "# Here input_shape = (2,) means that the layer expects an input with num_features = 2\n",
-    "# and the sample size could be anything\n",
-    "# The activation function for this layer is set to \"relu\"\n",
-    "model.add(Dense(units=4, input_shape=(2,), activation=\"relu\"))\n",
-    "\n",
-    "# 2nd Hidden layer\n",
-    "# This is also a fully-connected layer and we do not need to specify the\n",
-    "# shape of the input anymore (We need to do that only for the first layer)\n",
-    "# NOTE: Now we didn't add the activation seperately. Instead we just added it\n",
-    "# while calling Dense(). This and the way used for the first layer are Equivalent!\n",
-    "model.add(Dense(units=4, activation=\"relu\"))\n",
-    "\n",
-    "          \n",
-    "# The output layer\n",
-    "model.add(Dense(units=1))\n",
-    "model.add(Activation(\"sigmoid\"))\n",
-    "\n",
-    "model.summary()"
+    "1. Start with a single hidden layer. What are the minimum number of neurons needed to get a reasonable decision boundary?\n",
+    "2. Add more neurons and hidden layers while choosing a linear activation function. What do you observe? Why?\n"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "### XOR using neural networks"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import pandas as pd\n",
-    "import matplotlib.pyplot as plt\n",
-    "import seaborn as sns\n",
-    "from sklearn.model_selection import train_test_split\n",
-    "from tensorflow.keras.models import Sequential\n",
-    "from tensorflow.keras.layers import Dense\n",
-    "import numpy as np"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Creating a network to solve the XOR problem\n",
-    "\n",
-    "# Loading and plotting the data\n",
-    "xor = pd.read_csv(\"data/xor.csv\")\n",
-    "\n",
-    "# Using x and y coordinates as featues\n",
-    "features = xor.iloc[:, :-1]\n",
-    "# Convert boolean to integer values (True->1 and False->0)\n",
-    "labels = (1-xor.iloc[:, -1].astype(int))\n",
-    "\n",
-    "colors = [[\"steelblue\", \"chocolate\"][i] for i in labels]\n",
-    "plt.figure(figsize=(5, 5))\n",
-    "plt.xlim([-2, 2])\n",
-    "plt.ylim([-2, 2])\n",
-    "plt.title(\"Blue points are False\")\n",
-    "plt.scatter(features[\"x\"], features[\"y\"], color=colors, marker=\"o\") ;"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Building a simple Tensorflow model\n",
-    "\n",
-    "def a_simple_NN():\n",
-    "    \n",
-    "    model = Sequential()\n",
-    "\n",
-    "    model.add(Dense(4, input_shape = (2,), activation = \"relu\"))\n",
-    "\n",
-    "    model.add(Dense(4, activation = \"relu\"))\n",
-    "\n",
-    "    model.add(Dense(1, activation = \"sigmoid\"))\n",
-    "\n",
-    "    model.compile(loss=\"binary_crossentropy\", optimizer=\"rmsprop\", metrics=[\"accuracy\"])\n",
+    "<div class=\"alert alert-block alert-info\">\n",
+    "<p><i class=\"fa fa-warning\"></i>&nbsp;\n",
+    "Why don't we just use a simple linear activation function?\n",
     "    \n",
-    "    return model"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Instantiating the model\n",
-    "model = a_simple_NN()\n",
-    "\n",
-    "# Splitting the dataset into training (70%) and validation sets (30%)\n",
-    "X_train, X_test, y_train, y_test = train_test_split(\n",
-    "    features, labels, test_size=0.3)\n",
-    "\n",
-    "# Setting the number of passes through the entire training set\n",
-    "num_epochs = 300\n",
-    "\n",
-    "# model.fit() is used to train the model\n",
-    "# We can pass validation data while training\n",
-    "model_run = model.fit(X_train, y_train, epochs=num_epochs,\n",
-    "                      validation_data=(X_test, y_test))"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "<div class=\"alert alert-block alert-info\"><p><i class=\"fa fa-info-circle\"></i>&nbsp;\n",
-    "    NOTE: We can pass \"verbose=0\" to model.fit() to suppress the printing of model output on the terminal/notebook.\n",
-    "</p></div>"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Plotting the loss and accuracy on the training and validation sets during the training\n",
-    "# This can be done by using Keras callback \"history\" which is applied by default\n",
-    "history_model = model_run.history\n",
+    "Linear activations are **NOT** used because it can be mathematically shown that if they are used then the output is just a linear function of the input. So we cannot learn interesting and complex functions by adding any number of hidden layers.\n",
     "\n",
-    "print(\"The history has the following data: \", history_model.keys())\n",
+    "The only exception when we do want to use a linear activation is for the output layer of a network when solving a regression problem.\n",
     "\n",
-    "# Plotting the training and validation accuracy during the training\n",
-    "sns.lineplot(np.arange(1, num_epochs+1), history_model[\"accuracy\"], color = \"blue\", label=\"Training set\") ;\n",
-    "sns.lineplot(np.arange(1, num_epochs+1), history_model[\"val_accuracy\"], color = \"red\", label=\"Valdation set\") ;\n",
-    "plt.xlabel(\"epochs\") ;\n",
-    "plt.ylabel(\"accuracy\") ;"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "<div class=\"alert alert-block alert-warning\">\n",
-    "<p><i class=\"fa fa-warning\"></i>&nbsp;\n",
-    "The plots such as above are essential for analyzing the behaviour and performance of the network and to tune it in the right direction. However, for the example above we don't expect to derive a lot of insight from this plot as the function we are trying to fit is quite simple and there is not too much noise. We will see the significance of these curves in a later example.\n",
     "</p>\n",
     "</div>"
    ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Before we move on forward we see how to save and load a keras model\n",
-    "model.save(\"./data/my_first_NN.h5\")\n",
-    "\n",
-    "# Optional: See what is in the hdf5 file we just created above\n",
-    "\n",
-    "from tensorflow.keras.models import load_model\n",
-    "model = load_model(\"./data/my_first_NN.h5\")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "For the training and validation in the example above we split our dataset into a 70-30 train-validation set. We know from previous chapters that to more robustly estimate the accuracy of our model we can use **K-fold cross-validation**.\n",
-    "This is even more important when we have small datasets and cannot afford to reserve a validation set!\n",
-    "\n",
-    "One way to do the cross-validation here would be to write our own function to do this. However, we also know that **scikit-learn** provides several handy functions to evaluate and tune the models. So the question is:\n",
-    "\n",
-    "\n",
-    "<div class=\"alert alert-block alert-warning\">\n",
-    "<p><i class=\"fa fa-warning\"></i>&nbsp;\n",
-    "    Can we somehow use the scikit-learn functions or the ones we wrote ourselves for scikit-learn models to evaluate and tune our Keras models?\n",
-    "\n",
-    "\n",
-    "The Answer is **YES !**\n",
-    "</p>\n",
-    "</div>\n",
-    "\n",
-    "\n",
-    "\n",
-    "We show how to do this in the following section."
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Using scikit-learn functions on keras models\n",
-    "\n",
-    "\n",
-    "<div class=\"alert alert-block alert-warning\">\n",
-    "<p><i class=\"fa fa-warning\"></i>&nbsp;\n",
-    "Keras offers 2 wrappers which allow its Sequential models to be used with scikit-learn. \n",
-    "\n",
-    "There are: **KerasClassifier** and **KerasRegressor**.\n",
-    "\n",
-    "For more information:\n",
-    "https://keras.io/scikit-learn-api/\n",
-    "</p>\n",
-    "</div>\n",
-    "\n",
-    "\n",
-    "\n",
-    "**Now lets see how this works!**"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# We wrap the Keras model we created above with KerasClassifier\n",
-    "from tensorflow.keras.wrappers.scikit_learn import KerasClassifier\n",
-    "from sklearn.model_selection import cross_val_score\n",
-    "# Wrapping Keras model\n",
-    "# NOTE: We pass verbose=0 to suppress the model output\n",
-    "num_epochs = 400\n",
-    "model_scikit = KerasClassifier(\n",
-    "    build_fn=a_simple_NN, epochs=num_epochs, verbose=0)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Let's reuse the function to visualize the decision boundary which we saw in chapter 2 with minimal change\n",
-    "\n",
-    "def list_flatten(list_of_list):\n",
-    "    flattened_list = [i for j in list_of_list for i in j]\n",
-    "    return flattened_list\n",
-    "\n",
-    "def plot_points(plt=plt, marker='o'):\n",
-    "    colors = [[\"steelblue\", \"chocolate\"][i] for i in labels]\n",
-    "    plt.scatter(features.iloc[:, 0], features.iloc[:, 1], color=colors, marker=marker);\n",
-    "\n",
-    "def train_and_plot_decision_surface(\n",
-    "    name, classifier, features_2d, labels, preproc=None, plt=plt, marker='o', N=400\n",
-    "):\n",
-    "\n",
-    "    features_2d = np.array(features_2d)\n",
-    "    xmin, ymin = features_2d.min(axis=0)\n",
-    "    xmax, ymax = features_2d.max(axis=0)\n",
-    "\n",
-    "    x = np.linspace(xmin, xmax, N)\n",
-    "    y = np.linspace(ymin, ymax, N)\n",
-    "    points = np.array(np.meshgrid(x, y)).T.reshape(-1, 2)\n",
-    "\n",
-    "    if preproc is not None:\n",
-    "        points_for_classifier = preproc.fit_transform(points)\n",
-    "        features_2d = preproc.fit_transform(features_2d)\n",
-    "    else:\n",
-    "        points_for_classifier = points\n",
-    "\n",
-    "    classifier.fit(features_2d, labels, verbose=0)\n",
-    "    \n",
-    "    if name == \"Neural Net\":\n",
-    "        #predicted = classifier.predict(features_2d)\n",
-    "        #predicted = list_flatten(predicted)\n",
-    "        predicted = list_flatten((classifier.predict(features_2d) > 0.5).astype(\"int32\"))\n",
-    "    #else:\n",
-    "        #predicted = classifier.predict(features_2d)\n",
-    "    \n",
-    "    \n",
-    "    if preproc is not None:\n",
-    "        name += \" (w/ preprocessing)\"\n",
-    "    print(name + \":\\t\", sum(predicted == labels), \"/\", len(labels), \"correct\")\n",
-    "    \n",
-    "    if name == \"Neural Net\":\n",
-    "        #classes = np.array(list_flatten(classifier.predict(points_for_classifier)), dtype=bool)\n",
-    "        classes = np.array(list_flatten((classifier.predict(points_for_classifier) > 0.5).astype(\"int32\")), dtype=bool)\n",
-    "    #else:\n",
-    "        #classes = np.array(classifier.predict(points_for_classifier), dtype=bool)\n",
-    "    plt.plot(\n",
-    "        points[~classes][:, 0],\n",
-    "        points[~classes][:, 1],\n",
-    "        \"o\",\n",
-    "        color=\"steelblue\",\n",
-    "        markersize=1,\n",
-    "        alpha=0.01,\n",
-    "    )\n",
-    "    plt.plot(\n",
-    "        points[classes][:, 0],\n",
-    "        points[classes][:, 1],\n",
-    "        \"o\",\n",
-    "        color=\"chocolate\",\n",
-    "        markersize=1,\n",
-    "        alpha=0.04,\n",
-    "    )"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "_, ax = plt.subplots(figsize=(6, 6))\n",
-    "\n",
-    "train_and_plot_decision_surface(\"Neural Net\", model_scikit, features, labels, plt=ax)\n",
-    "plot_points(plt=ax)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Applying K-fold cross-validation\n",
-    "# Here we pass the whole dataset, i.e. features and labels, instead of splitting it.\n",
-    "num_folds = 5\n",
-    "cross_validation = cross_val_score(\n",
-    "    model_scikit, features, labels, cv=num_folds, verbose=0)\n",
-    "\n",
-    "print(\"The acuracy on the \", num_folds, \" validation folds:\", cross_validation)\n",
-    "print(\"The Average acuracy on the \", num_folds, \" validation folds:\", np.mean(cross_validation))"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "<div class=\"alert alert-block alert-warning\">\n",
-    "<p><i class=\"fa fa-warning\"></i>&nbsp;\n",
-    "The code above took quite long to finish even though we used only 5  CV folds and the neural network and data size are very small! This gives an indication of the enormous compute requirements of training production-grade deep neural networks.\n",
-    "</p>\n",
-    "</div>"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Hyperparameter optimization"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "We know from chapter 6 that there are 2 types of parameters which need to be tuned for a machine learning model.\n",
-    "* Internal model parameters (weights) which can be learned for e.g. by gradient-descent\n",
-    "* Hyperparameters\n",
-    "\n",
-    "In the model created above we made some arbitrary choices such as the choice of the optimizer we used, optimizer's learning rate, number of hidden units and so on ...\n",
-    "\n",
-    "Now that we have the keras model wrapped as a scikit-learn model we can use the grid search functions we have seen in chapter 6."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from sklearn.model_selection import GridSearchCV\n",
-    "# Just to remember\n",
-    "model_scikit = KerasClassifier(\n",
-    "    build_fn=a_simple_NN, **{\"epochs\": num_epochs, \"verbose\": 0})"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "HP_grid = {'epochs' : [30, 50, 100]}\n",
-    "search = GridSearchCV(estimator=model_scikit, param_grid=HP_grid)\n",
-    "search.fit(features, labels)\n",
-    "print(search.best_score_, search.best_params_)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "HP_grid = {'epochs' : [10, 15, 30], \n",
-    "           'batch_size' : [10, 20, 30] }\n",
-    "search = GridSearchCV(estimator=model_scikit, param_grid=HP_grid)\n",
-    "search.fit(features, labels)\n",
-    "print(search.best_score_, search.best_params_)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# A more general model for further Hyperparameter optimization\n",
-    "from tensorflow.keras import optimizers\n",
-    "\n",
-    "def a_simple_NN(activation='relu', num_hidden_neurons=[4, 4], learning_rate=0.01):\n",
-    "\n",
-    "    model = Sequential()\n",
-    "\n",
-    "    model.add(Dense(num_hidden_neurons[0],\n",
-    "                    input_shape=(2,), activation=activation))\n",
-    "\n",
-    "    model.add(Dense(num_hidden_neurons[1], activation=activation))\n",
-    "\n",
-    "    model.add(Dense(1, activation=\"sigmoid\"))\n",
-    "\n",
-    "    model.compile(loss=\"binary_crossentropy\", optimizer=optimizers.rmsprop(\n",
-    "        lr=learning_rate), metrics=[\"accuracy\"])\n",
-    "\n",
-    "    return model"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "### Exercise section: \n",
-    "* Look at the model above and choose a couple of hyperparameters to optimize. \n",
-    "* **OPTIONAL:** What function from scikit-learn other than GridSearchCV can we use for hyperparameter optimization? Use it."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Code here"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "<div class=\"alert alert-block alert-warning\">\n",
-    "<p><i class=\"fa fa-warning\"></i>&nbsp;\n",
-    "Another library which you should definitely look at for doing hyperparameter optimization with keras models is the <a href=\"https://github.com/maxpumperla/hyperas\">Hyperas library</a> which is a wrapper around the <a href=\"https://github.com/hyperopt/hyperopt\">Hyperopt library</a>. \n",
-    "\n",
-    "</p>\n",
-    "</div>"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "### Exercise section:  \n",
-    "* Create a neural network to classify the 2d points example from chapter 2 learned (Optional: As you create the model read a bit on the different keras commands we have used)."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import pandas as pd\n",
-    "import matplotlib.pyplot as plt\n",
-    "import seaborn as sns\n",
-    "import numpy as np\n",
-    "from sklearn.model_selection import train_test_split, cross_val_score\n",
-    "from tensorflow.keras.models import Sequential\n",
-    "from tensorflow.keras.layers import Dense\n",
-    "from tensorflow.keras import optimizers\n",
-    "from tensorflow.keras.wrappers.scikit_learn import KerasClassifier"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "circle = pd.read_csv(\"data/circle.csv\")\n",
-    "# Using x and y coordinates as featues\n",
-    "features = circle.iloc[:, :-1]\n",
-    "# Convert boolean to integer values (True->1 and False->0)\n",
-    "labels = circle.iloc[:, -1].astype(int)\n",
-    "\n",
-    "colors = [[\"steelblue\", \"chocolate\"][i] for i in circle[\"label\"]]\n",
-    "plt.figure(figsize=(5, 5))\n",
-    "plt.xlim([-2, 2])\n",
-    "plt.ylim([-2, 2])\n",
-    "\n",
-    "plt.scatter(features[\"x\"], features[\"y\"], color=colors, marker=\"o\");\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Insert Code here"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "The examples we saw above are really nice to show various features of the Keras library and to understand how we build and train a model. However, they are not the ideal problems one should solve using neural networks. They are too simple and can be solved easily by classical machine learning algorithms. \n",
-    "\n",
-    "Now we show examples where Neural Networks really shine over classical machine learning algorithms."
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Handwritten Digits Classification (multi-class classification)\n",
-    "**MNIST Dataset**\n",
-    "\n",
-    "MNIST datasets is a very common dataset used in machine learning. It is widely used to train and validate models.\n",
-    "\n",
-    "\n",
-    ">The MNIST database of handwritten digits, available from this page, has a training set of 60,000 examples, and a >test set of 10,000 examples. It is a subset of a larger set available from NIST. The digits have been size->normalized and centered in a fixed-size image.\n",
-    ">It is a good database for people who want to try learning techniques and pattern recognition methods on real-world >data while spending minimal efforts on preprocessing and formatting.\n",
-    ">source: http://yann.lecun.com/exdb/mnist/\n",
-    "\n",
-    "This dataset consists of images of handwritten digits between 0-9 and their corresponsing labels. We want to train a neural network which is able to predict the correct digit on the image. \n",
-    "This is a multi-class classification problem. Unlike binary classification which we have seen till now we will classify data into 10 different classes."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import matplotlib.pyplot as plt\n",
-    "import numpy as np\n",
-    "import seaborn as sns"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Loading the dataset in keras\n",
-    "# Later you can explore and play with other datasets with come with Keras\n",
-    "from tensorflow.keras.datasets import mnist\n",
-    "\n",
-    "# Loading the train and test data\n",
-    "\n",
-    "(X_train, y_train), (X_test, y_test) = mnist.load_data()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Looking at the dataset\n",
-    "print(X_train.shape)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# We can see that the training set consists of 60,000 images of size 28x28 pixels\n",
-    "i=np.random.randint(0,X_train.shape[0])\n",
-    "sns.set_style(\"white\")\n",
-    "plt.imshow(X_train[i], cmap=\"gray_r\") ;\n",
-    "sns.set(style=\"darkgrid\")\n",
-    "print(\"This digit is: \" , y_train[i])"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Look at the data values for a couple of images\n",
-    "print(X_train[0].min(), X_train[1].max())"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "The data consists of values between 0-255 representing the **grayscale level**"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# The labels are the digit on the image\n",
-    "print(y_train.shape)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Scaling the data\n",
-    "# It is important to normalize the input data to (0-1) before providing it to a neural net\n",
-    "# We could use the previously introduced function from scikit-learn. However, here it is sufficient to\n",
-    "# just divide the input data by 255\n",
-    "X_train_norm = X_train/255.\n",
-    "X_test_norm = X_test/255.\n",
-    "\n",
-    "# Also we need to reshape the input data such that each sample is a vector and not a 2D matrix\n",
-    "X_train_prep = X_train_norm.reshape(X_train_norm.shape[0],28*28)\n",
-    "X_test_prep = X_test_norm.reshape(X_test_norm.shape[0],28*28)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "<div class=\"alert alert-block alert-warning\">\n",
-    "<p><i class=\"fa fa-warning\"></i>&nbsp;\n",
-    "One-Hot encoding\n",
-    "\n",
-    "In multi-class classification problems the labels are provided to the neural network as something called **One-hot encodings**. The categorical labels (0-9 here) are converted to vectors.\n",
-    "\n",
-    "For the MNIST problem where the data has **10 categories** we will convert every label to a vector of length 10. \n",
-    "All the entries of this vector will be zero **except** for the index which is equal to the (integer) value of the label.\n",
-    "\n",
-    "For example:\n",
-    "if label is 4. The one-hot vector will look like **[0 0 0 0 1 0 0 0 0 0]**\n",
-    "\n",
-    "Fortunately, Keras has a built-in function to achieve this and we do not have to write a code for this ourselves.\n",
-    "</p>\n",
-    "</div>"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from tensorflow.keras import utils\n",
-    "\n",
-    "y_train_onehot = utils.to_categorical(y_train, num_classes=10)\n",
-    "y_test_onehot = utils.to_categorical(y_test, num_classes=10)\n",
-    "\n",
-    "print(y_train_onehot.shape)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Building the tensorflow model\n",
-    "from tensorflow.keras.models import Sequential\n",
-    "from tensorflow.keras.layers import Dense\n",
-    "\n",
-    "def mnist_model():\n",
-    "    model = Sequential()\n",
-    "\n",
-    "    model.add(Dense(64, input_shape=(28*28,), activation=\"relu\"))\n",
-    "\n",
-    "    model.add(Dense(64, activation=\"relu\"))\n",
-    "\n",
-    "    model.add(Dense(10, activation=\"softmax\"))\n",
-    "\n",
-    "    model.compile(loss=\"categorical_crossentropy\",\n",
-    "                  optimizer=\"rmsprop\", metrics=[\"accuracy\"])\n",
-    "    return model\n",
-    "\n",
-    "model = mnist_model()\n",
-    "\n",
-    "model_run = model.fit(X_train_prep, y_train_onehot, epochs=20,\n",
-    "                      batch_size=512)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "print(\"The [loss, accuracy] on test dataset are: \" , model.evaluate(X_test_prep, y_test_onehot))"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "### Exercise section\n",
-    "* Reinitialize and run the model again with validation dataset, plot the accuracy as a function of epochs, play with number of epochs and observe what is happening."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Code here"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "tags": [
-     "solution"
-    ]
-   },
-   "outputs": [],
-   "source": [
-    "# Solution:\n",
-    "num_epochs = 20\n",
-    "model = mnist_model()\n",
-    "model_run = model.fit(X_train_prep, y_train_onehot, epochs=num_epochs,\n",
-    "                      batch_size=512, validation_data=(X_test_prep, y_test_onehot))\n",
-    "# Evaluating the model on test dataset\n",
-    "#print(\"The [loss, accuracy] on test dataset are: \" , model.evaluate(X_test_prep, y_test_onehot))\n",
-    "history_model = model_run.history\n",
-    "print(\"The history has the following data: \", history_model.keys())\n",
-    "\n",
-    "# Plotting the training and validation accuracy during the training\n",
-    "sns.lineplot(np.arange(1, num_epochs+1), history_model[\"accuracy\"], color = \"blue\", label=\"Training set\") ;\n",
-    "sns.lineplot(np.arange(1, num_epochs+1), history_model[\"val_accuracy\"], color = \"red\", label=\"Valdation set\") ;\n",
-    "plt.xlabel(\"epochs\") ;\n",
-    "plt.ylabel(\"accuracy\") ;"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "What we see here is **overfitting**. After the first few epochs the training and validation datasets show a similar accuracy but thereafter the network starts to over fit to the training set."
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "<div class=\"alert alert-block alert-warning\">\n",
-    "<p><i class=\"fa fa-warning\"></i>&nbsp;\n",
-    "Keep in mind that neural networks are quite prone to overfitting so always check for it.\n",
-    "</p>\n",
-    "</div>"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "### Adding regularization"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Adding l2 regularization\n",
-    "# Building the keras model\n",
-    "from tensorflow.keras.models import Sequential\n",
-    "from tensorflow.keras.layers import Dense\n",
-    "from tensorflow.keras.regularizers import l2\n",
-    "\n",
-    "def mnist_model():\n",
-    "    \n",
-    "    model = Sequential()\n",
-    "\n",
-    "    model.add(Dense(64, input_shape=(28*28,), activation=\"relu\", \n",
-    "                   kernel_regularizer=l2(0.01)))\n",
-    "\n",
-    "    model.add(Dense(64, activation=\"relu\", \n",
-    "                   kernel_regularizer=l2(0.01)))\n",
-    "\n",
-    "    model.add(Dense(10, activation=\"softmax\"))\n",
-    "\n",
-    "    model.compile(loss=\"categorical_crossentropy\",\n",
-    "                  optimizer=\"rmsprop\", metrics=[\"accuracy\"])\n",
-    "    return model\n",
-    "\n",
-    "model = mnist_model()\n",
-    "\n",
-    "num_epochs = 20\n",
-    "model_run = model.fit(X_train_prep, y_train_onehot, epochs=num_epochs,\n",
-    "                      batch_size=512, validation_data=(X_test_prep, y_test_onehot))"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Evaluating the model on test dataset\n",
-    "history_model = model_run.history\n",
-    "print(\"The history has the following data: \", history_model.keys())\n",
-    "\n",
-    "# Plotting the training and validation accuracy during the training\n",
-    "sns.lineplot(np.arange(1, num_epochs+1), history_model[\"accuracy\"], color = \"blue\", label=\"Training set\") ;\n",
-    "sns.lineplot(np.arange(1, num_epochs+1), history_model[\"val_accuracy\"], color = \"red\", label=\"Valdation set\") ;\n",
-    "plt.xlabel(\"epochs\") ;\n",
-    "plt.ylabel(\"accuracy\") ;"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "<div class=\"alert alert-block alert-warning\">\n",
-    "<p><i class=\"fa fa-warning\"></i>&nbsp;\n",
-    "Another way to add regularization and to make the network more robust is by applying Dropout. When we add dropout to a layer a specified percentage of units in that layer are switched off. \n",
-    "    \n",
-    "Both L2 regularization and Dropout make the model simpler and thus reducing overfitting.\n",
-    "</p>\n",
-    "</div>\n",
-    "\n",
-    "### Exercise section\n",
-    "* Add dropout instead of L2 regularization in the network above"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Adding dropout is easy in keras\n",
-    "# We import a layer called Dropout and add as follows\n",
-    "# model.add(Dropout(0.2)) to randomly drop 20% of the hidden units\n",
-    "\n",
-    "\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "tags": [
-     "solution"
-    ]
-   },
-   "outputs": [],
-   "source": [
-    "# Solution\n",
-    "# Adding Dropout\n",
-    "# Building the tensorflow model\n",
-    "from tensorflow.keras.models import Sequential\n",
-    "from tensorflow.keras.layers import Dense, Dropout\n",
-    "\n",
-    "def mnist_model():\n",
-    "    \n",
-    "    model = Sequential()\n",
-    "\n",
-    "    model.add(Dense(64, input_shape=(28*28,), activation=\"relu\"))\n",
-    "              \n",
-    "    model.add(Dropout(0.15))\n",
-    "\n",
-    "    model.add(Dense(64, activation=\"relu\"))\n",
-    "    \n",
-    "    model.add(Dense(10, activation=\"softmax\"))\n",
-    "\n",
-    "    model.compile(loss=\"categorical_crossentropy\",\n",
-    "                  optimizer=\"rmsprop\", metrics=[\"accuracy\"])\n",
-    "              \n",
-    "    return model\n",
-    "\n",
-    "model = mnist_model()\n",
-    "\n",
-    "num_epochs = 20\n",
-    "model_run = model.fit(X_train_prep, y_train_onehot, epochs=num_epochs,\n",
-    "                      batch_size=512, validation_data=(X_test_prep, y_test_onehot))\n",
-    "\n",
-    "# Evaluating the model on test dataset\n",
-    "history_model = model_run.history\n",
-    "print(\"The history has the following data: \", history_model.keys())\n",
-    "\n",
-    "# Plotting the training and validation accuracy during the training\n",
-    "sns.lineplot(np.arange(1, num_epochs+1), history_model[\"accuracy\"], color = \"blue\", label=\"Training set\") ;\n",
-    "sns.lineplot(np.arange(1, num_epochs+1), history_model[\"val_accuracy\"], color = \"red\", label=\"Valdation set\") ;\n",
-    "plt.xlabel(\"epochs\") ;\n",
-    "plt.ylabel(\"accuracy\") ;"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Network Architectures\n",
-    "\n",
-    "The neural networks which we have seen till now are the simplest kind of neural networks.\n",
-    "There exist more sophisticated network architectures especially designed for specific applications.\n",
-    "Some of them are as follows:\n",
-    "\n",
-    "###  Convolution Neural Networks (CNNs)\n",
-    "\n",
-    "These networks are used mostly for computer vision like tasks such as image classification and object detection. \n",
-    "One of the old CNN networks is shown below.\n",
-    "\n",
-    "<center>\n",
-    "<figure>\n",
-    "<img src=\"./images/neuralnets/CNN_lecun.png\" width=\"800\"/>\n",
-    "<figcaption>source: LeCun et al., Gradient-based learning applied to document recognition (1998).</figcaption>\n",
-    "</figure>\n",
-    "</center>\n",
-    "\n",
-    "CNNs consist of new type of layers such as convolution and pooling layers.\n",
-    "\n",
-    "###  Recurrent Neural Networks (RNNs)\n",
-    "\n",
-    "RNNs are used for problems such as time-series data, speech recognition and translation.\n",
-    "\n",
-    "### Generative adversarial networks (GANs)\n",
-    "\n",
-    "GANs consist of 2 parts, a generative network and a discriminative network. The generative network produces data which is then fed to the discriminative network which judges if the new data belongs to a specified dataset. Then via feedback loops the generative network becomes better and better at creating images similar to the dataset the discriminative network is judging against. At the same time the discriminative network get better and better at identifyig **fake** instances which are not from the reference dataset. "
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## CNN in a bit more detail\n",
-    "\n",
-    "The standard CNN architecture can be seen as 2 parts:\n",
-    "\n",
-    "* Feature extraction\n",
-    "* Classification\n",
-    "\n",
-    "For the **classification** part we use the densly connected network as shown in the keras examples above.\n",
-    "\n",
-    "However, for the **feature extraction** part we use new types of layers called **convolution** layers\n",
-    "\n",
-    "### What is a Convolution?\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import numpy as np\n",
-    "import matplotlib.pyplot as plt\n",
-    "import seaborn as sns\n",
-    "sns.set_style(\"white\")\n",
-    "# Loading the train and test data\n",
-    "digit = np.genfromtxt(\"data/digit_4_14x14.csv\", delimiter=\",\").astype(np.int16) ;\n",
-    "plt.imshow(digit, \"gray_r\")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "This image in matrix form"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "def plot_astable(matrix, hw=0.15):\n",
-    "    matrix = plt.table(cellText=matrix, loc=(0,0), cellLoc='center') ;\n",
-    "    matrix.set_fontsize(14)\n",
-    "    cells=matrix.get_celld() ;\n",
-    "    for i in cells:\n",
-    "        cells[i].set_height(hw) ;\n",
-    "        cells[i].set_width(hw) ;\n",
-    "    plt.axis(\"off\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "plot_astable(digit)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Vertical edge detection\n",
-    "vertical_edge_kernel = np.array([[-1, 2, -1], [-1, 2, -1], [-1, 2, -1]])\n",
-    "plot_astable(vertical_edge_kernel, 0.2)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import numpy as np\n",
-    "\n",
-    "def convolution(matrix, kernel):\n",
-    "    # This function computes a convolution between a matrix and a kernel/filter without any padding\n",
-    "    width_kernel = kernel.shape[0]\n",
-    "    height_kernel = kernel.shape[1]\n",
-    "    convolution = np.zeros((matrix.shape[0] - width_kernel + 1,\n",
-    "                            matrix.shape[1] - height_kernel + 1))\n",
-    "    for i in range(matrix.shape[0] - width_kernel + 1):\n",
-    "        for j in range(matrix.shape[1] - height_kernel + 1):\n",
-    "            convolution[i, j] = np.sum(np.multiply(\n",
-    "                matrix[i:i+width_kernel, j:j+height_kernel], kernel))\n",
-    "    return convolution\n",
-    "\n",
-    "\n",
-    "vertical_detect = convolution(digit, vertical_edge_kernel)\n",
-    "plt.imshow(vertical_detect, cmap=\"gray_r\") ;"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Horizontal edge detection\n",
-    "horizontal_edge_kernel = np.array([[-1, -1, -1], [2, 2, 2], [-1, -1, -1]])\n",
-    "plot_astable(horizontal_edge_kernel, 0.2)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "horizontal_detect = convolution(digit, horizontal_edge_kernel)\n",
-    "plt.imshow(horizontal_detect, cmap=\"gray_r\") ;"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "### Maxpooling\n",
-    "Taking maximum in n x n sized sliding windows"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import numpy as np\n",
-    "def maxpool_2x2(matrix):\n",
-    "    out_dim = np.array([matrix.shape[0]/2, matrix.shape[1]/2]).astype(int)\n",
-    "    subsample = np.zeros((out_dim))\n",
-    "    for i in range(out_dim[0]):\n",
-    "        for j in range(out_dim[1]):\n",
-    "            subsample[i,j] = np.max(matrix[i*2:i*2+2, j*2:j*2+2])\n",
-    "    return subsample"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import matplotlib.pyplot as plt\n",
-    "subsampled_image = maxpool_2x2(vertical_detect)\n",
-    "plt.imshow(subsampled_image, cmap=\"gray_r\")\n",
-    "plt.title(\"Max Pooled vertical edge detection filter\") ;"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "subsampled_image = maxpool_2x2(horizontal_detect)\n",
-    "plt.imshow(subsampled_image, cmap=\"gray_r\") ;\n",
-    "plt.title(\"Max Pooled horizontal edge detection filter\") ;"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "### Let's explore some more of such filters/kernels!!\n",
-    "\n",
-    "http://setosa.io/ev/image-kernels"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## CNN Examples"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "For this example we will work with a dataset called fashion-MNIST which is quite similar to the MNIST data above.\n",
-    "> Fashion-MNIST is a dataset of Zalando's article images—consisting of a training set of 60,000 examples and a test set of 10,000 examples. Each example is a 28x28 grayscale image, associated with a label from 10 classes. We intend Fashion-MNIST to serve as a direct drop-in replacement for the original MNIST dataset for benchmarking machine learning algorithms. It shares the same image size and structure of training and testing splits.\n",
-    "source: https://github.com/zalandoresearch/fashion-mnist\n",
-    "\n",
-    "The 10 classes of this dataset are:\n",
-    "\n",
-    "| Label| Item |\n",
-    "| --- | --- |\n",
-    "| 0 |\tT-shirt/top |\n",
-    "| 1\t| Trouser |\n",
-    "|2|\tPullover|\n",
-    "|3|\tDress|\n",
-    "|4|\tCoat|\n",
-    "|5|\tSandal|\n",
-    "|6|\tShirt|\n",
-    "|7|\tSneaker|\n",
-    "|8|\tBag|\n",
-    "|9|\tAnkle boot|"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Loading the dataset in tensorflow\n",
-    "# Later you can explore and play with other datasets with come with tensorflow\n",
-    "from tensorflow.keras.datasets import fashion_mnist\n",
-    "\n",
-    "# Loading the train and test data\n",
-    "\n",
-    "(X_train, y_train), (X_test, y_test) = fashion_mnist.load_data()\n",
-    "\n",
-    "items =['T-shirt/top', 'Trouser', \n",
-    "        'Pullover', 'Dress', \n",
-    "        'Coat', 'Sandal', \n",
-    "        'Shirt', 'Sneaker',\n",
-    "        'Bag', 'Ankle boot']"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# We can see that the training set consists of 60,000 images of size 28x28 pixels\n",
-    "import matplotlib.pyplot as plt\n",
-    "import numpy as np\n",
-    "i=np.random.randint(0,X_train.shape[0])\n",
-    "plt.imshow(X_train[i], cmap=\"gray_r\") ; \n",
-    "print(\"This item is a: \" , items[y_train[i]])"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Also we need to reshape the input data such that each sample is a 4D matrix of dimension\n",
-    "# (num_samples, width, height, channels). Even though these images are grayscale we need to add\n",
-    "# channel dimension as this is expected by the Conv function\n",
-    "X_train_prep = X_train.reshape(X_train.shape[0],28,28,1)/255.\n",
-    "X_test_prep = X_test.reshape(X_test.shape[0],28,28,1)/255.\n",
-    "\n",
-    "from tensorflow.keras.utils import to_categorical\n",
-    "\n",
-    "y_train_onehot = to_categorical(y_train, num_classes=10)\n",
-    "y_test_onehot = to_categorical(y_test, num_classes=10)\n",
-    "\n",
-    "print(y_train_onehot.shape)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Creating a CNN similar to the one shown in the figure from LeCun paper\n",
-    "# In the original implementation Average pooling was used. However, we will use maxpooling as this \n",
-    "# is what us used in the more recent architectures and is found to be a better choice\n",
-    "# Convolution -> Pooling -> Convolution -> Pooling -> Flatten -> Dense -> Dense -> Output layer\n",
-    "from tensorflow.keras.models import Sequential\n",
-    "from tensorflow.keras.layers import Dense, Conv2D, MaxPool2D, Flatten, Dropout, BatchNormalization\n",
-    "\n",
-    "def simple_CNN():\n",
-    "    \n",
-    "    model = Sequential()\n",
-    "    \n",
-    "    model.add(Conv2D(6, (3,3), input_shape=(28,28,1), activation='relu'))\n",
-    "    \n",
-    "    model.add(MaxPool2D((2,2)))\n",
-    "    \n",
-    "    model.add(Conv2D(16, (3,3), activation='relu'))\n",
-    "    \n",
-    "    model.add(MaxPool2D((2,2)))\n",
-    "    \n",
-    "    model.add(Flatten())\n",
-    "    \n",
-    "    model.add(Dense(120, activation='relu'))\n",
-    "    \n",
-    "    model.add(Dense(84, activation='relu'))\n",
-    "    \n",
-    "    model.add(Dense(10, activation='softmax'))\n",
-    "    \n",
-    "    model.compile(loss=\"categorical_crossentropy\", optimizer=\"rmsprop\", metrics=[\"accuracy\"])\n",
-    "    \n",
-    "    return model\n",
-    "\n",
-    "model = simple_CNN()\n",
-    "model.summary()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "num_epochs = 5\n",
-    "model_run = model.fit(X_train_prep, y_train_onehot, epochs=num_epochs, \n",
-    "                      batch_size=64, validation_data=(X_test_prep, y_test_onehot))"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "### Exercise section\n",
-    "* Use the above model or improve it (change number of filters, add more layers etc. on the MNIST example and see if you can get a better accuracy than what we achieved with a vanilla neural network)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "### Exercise section\n",
-    "* Explore the CIFAR10 (https://www.cs.toronto.edu/~kriz/cifar.html) dataset included with Keras and build+train a simple CNN to classify it"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from tensorflow.keras.datasets import cifar10\n",
-    "(X_train, y_train), (X_test, y_test) = cifar10.load_data()"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "Copyright (C) 2019-2021 ETH Zurich, SIS ID"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
   }
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "Python 3",
+   "display_name": "Python 3 (ipykernel)",
    "language": "python",
    "name": "python3"
   },
diff --git a/08_b-neural_networks.ipynb b/08_b-neural_networks.ipynb
new file mode 100644
index 0000000000000000000000000000000000000000..477aaacd12e5b87d48b4d9d631fcd1547cefa0a1
--- /dev/null
+++ b/08_b-neural_networks.ipynb
@@ -0,0 +1,1406 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# IGNORE THIS CELL WHICH CUSTOMIZES LAYOUT AND STYLING OF THE NOTEBOOK !\n",
+    "from numpy.random import seed\n",
+    "\n",
+    "seed(42)\n",
+    "import tensorflow as tf\n",
+    "\n",
+    "tf.random.set_seed(42)\n",
+    "import matplotlib as mpl\n",
+    "import matplotlib.pyplot as plt\n",
+    "import seaborn as sns\n",
+    "\n",
+    "sns.set(style=\"darkgrid\")\n",
+    "mpl.rcParams[\"lines.linewidth\"] = 3\n",
+    "%matplotlib inline\n",
+    "%config InlineBackend.figure_format = 'retina'\n",
+    "%config IPCompleter.greedy=True\n",
+    "import warnings\n",
+    "\n",
+    "warnings.filterwarnings(\"ignore\", category=FutureWarning)\n",
+    "from IPython.core.display import HTML\n",
+    "\n",
+    "HTML(open(\"custom.html\", \"r\").read())"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Chapter 8b: Introduction to Neural Networks"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Introduction to TensorFlow (keras API)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### A bit about Keras?\n",
+    "\n",
+    "* It is a high level API to create and work with neural networks\n",
+    "* Used to support multiple backends such as **TensorFlow** from Google, **Theano** (Theano is dead now) and **CNTK** (Microsoft Cognitive Toolkit), up till release 2.3.0 \n",
+    "* Very good for creating neural nets quickly and hides away a lot of tedious work\n",
+    "* Has been incorporated into official TensorFlow (which obviously only works with tensorflow) and is its main API as of version 2.0"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "<center>\n",
+    "<figure>\n",
+    "<img src=\"./images/neuralnets/neural_net_keras_1.svg\" width=\"700\"/>\n",
+    "<figcaption>Building this model in TensorFlow (Keras)</figcaption>\n",
+    "</figure>\n",
+    "</center>"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Say hello to Tensorflow\n",
+    "from tensorflow.keras.layers import Activation, Dense\n",
+    "from tensorflow.keras.models import Sequential\n",
+    "\n",
+    "# Creating a model\n",
+    "model = Sequential()\n",
+    "\n",
+    "# Adding layers to this model\n",
+    "# 1st Hidden layer\n",
+    "# A Dense/fully-connected layer which takes as input a\n",
+    "# feature array of shape (samples, num_features)\n",
+    "# Here input_shape = (2,) means that the layer expects an input with num_features = 2\n",
+    "# and the sample size could be anything\n",
+    "# The activation function for this layer is set to \"relu\"\n",
+    "model.add(Dense(units=4, input_shape=(2,), activation=\"relu\"))\n",
+    "\n",
+    "# 2nd Hidden layer\n",
+    "# This is also a fully-connected layer and we do not need to specify the\n",
+    "# shape of the input anymore (We need to do that only for the first layer)\n",
+    "# NOTE: Now we didn't add the activation seperately. Instead we just added it\n",
+    "# while calling Dense(). This and the way used for the first layer are Equivalent!\n",
+    "model.add(Dense(units=4, activation=\"relu\"))\n",
+    "\n",
+    "\n",
+    "# The output layer\n",
+    "model.add(Dense(units=1))\n",
+    "model.add(Activation(\"sigmoid\"))\n",
+    "\n",
+    "model.summary()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### XOR using neural networks"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import matplotlib.pyplot as plt\n",
+    "import numpy as np\n",
+    "import pandas as pd\n",
+    "import seaborn as sns\n",
+    "from sklearn.model_selection import train_test_split\n",
+    "from tensorflow.keras.layers import Dense\n",
+    "from tensorflow.keras.models import Sequential"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Creating a network to solve the XOR problem\n",
+    "\n",
+    "# Loading and plotting the data\n",
+    "xor = pd.read_csv(\"data/xor.csv\")\n",
+    "\n",
+    "# Using x and y coordinates as featues\n",
+    "features = xor.iloc[:, :-1]\n",
+    "# Convert boolean to integer values (True->1 and False->0)\n",
+    "labels = 1 - xor.iloc[:, -1].astype(int)\n",
+    "\n",
+    "colors = [[\"steelblue\", \"chocolate\"][i] for i in labels]\n",
+    "plt.figure(figsize=(5, 5))\n",
+    "plt.xlim([-2, 2])\n",
+    "plt.ylim([-2, 2])\n",
+    "plt.title(\"Blue points are False\")\n",
+    "plt.scatter(features[\"x\"], features[\"y\"], color=colors, marker=\"o\");"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Building a simple Tensorflow model\n",
+    "\n",
+    "\n",
+    "def a_simple_NN():\n",
+    "\n",
+    "    model = Sequential()\n",
+    "\n",
+    "    model.add(Dense(4, input_shape=(2,), activation=\"relu\"))\n",
+    "\n",
+    "    model.add(Dense(4, activation=\"relu\"))\n",
+    "\n",
+    "    model.add(Dense(1, activation=\"sigmoid\"))\n",
+    "\n",
+    "    model.compile(loss=\"binary_crossentropy\", optimizer=\"rmsprop\", metrics=[\"accuracy\"])\n",
+    "\n",
+    "    return model"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Instantiating the model\n",
+    "model = a_simple_NN()\n",
+    "\n",
+    "# Splitting the dataset into training (70%) and validation sets (30%)\n",
+    "X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.3)\n",
+    "\n",
+    "# Setting the number of passes through the entire training set\n",
+    "num_epochs = 300\n",
+    "\n",
+    "# model.fit() is used to train the model\n",
+    "# We can pass validation data while training\n",
+    "model_run = model.fit(\n",
+    "    X_train, y_train, epochs=num_epochs, validation_data=(X_test, y_test)\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "<div class=\"alert alert-block alert-info\"><p><i class=\"fa fa-info-circle\"></i>&nbsp;\n",
+    "    NOTE: We can pass \"verbose=0\" to model.fit() to suppress the printing of model output on the terminal/notebook.\n",
+    "</p></div>"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Plotting the loss and accuracy on the training and validation sets during the training\n",
+    "# This can be done by using TensorFlow (Keras) callback \"history\" which is applied by default\n",
+    "history_model = model_run.history\n",
+    "\n",
+    "print(\"The history has the following data: \", history_model.keys())\n",
+    "\n",
+    "# Plotting the training and validation accuracy during the training\n",
+    "sns.lineplot(\n",
+    "    model_run.epoch, history_model[\"accuracy\"], color=\"blue\", label=\"Training set\"\n",
+    ")\n",
+    "sns.lineplot(\n",
+    "    model_run.epoch, history_model[\"val_accuracy\"], color=\"red\", label=\"Valdation set\"\n",
+    ")\n",
+    "plt.xlabel(\"epochs\")\n",
+    "plt.ylabel(\"accuracy\");"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "tags": []
+   },
+   "source": [
+    "<div class=\"alert alert-block alert-warning\">\n",
+    "<p><i class=\"fa fa-warning\"></i>&nbsp;\n",
+    "The plots such as above are essential for analyzing the behaviour and performance of the network and to tune it in the right direction. However, for the example above we don't expect to derive a lot of insight from this plot as the function we are trying to fit is quite simple and there is not too much noise. We will see the significance of these curves in a later example.\n",
+    "</p>\n",
+    "</div>"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Before we move on forward we see how to save and load a TensorFlow (keras) model\n",
+    "model.save(\"./data/my_first_NN.h5\")\n",
+    "model.save(\"./data/my_first_NN\")\n",
+    "\n",
+    "\n",
+    "# Optional: See what is in the hdf5 file we just created above\n",
+    "\n",
+    "from tensorflow.keras.models import load_model\n",
+    "\n",
+    "model = load_model(\"./data/my_first_NN.h5\")\n",
+    "model_pb = load_model(\"./data/my_first_NN\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "For the training and validation in the example above we split our dataset into a 70-30 train-validation set. We know from previous chapters that to more robustly estimate the accuracy of our model we can use **K-fold cross-validation**.\n",
+    "This is even more important when we have small datasets and cannot afford to reserve a validation set!\n",
+    "\n",
+    "One way to do the cross-validation here would be to write our own function to do this. However, we also know that **scikit-learn** provides several handy functions to evaluate and tune the models. So the question is:\n",
+    "\n",
+    "\n",
+    "<div class=\"alert alert-block alert-warning\">\n",
+    "<p><i class=\"fa fa-warning\"></i>&nbsp;\n",
+    "    Can we somehow use the scikit-learn functions or the ones we wrote ourselves for scikit-learn models to evaluate and tune our TensorFlow (Keras) models?\n",
+    "\n",
+    "\n",
+    "The Answer is **YES !**\n",
+    "</p>\n",
+    "</div>\n",
+    "\n",
+    "\n",
+    "\n",
+    "We show how to do this in the following section."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Using scikit-learn functions on TensorFlow (Keras) models\n",
+    "\n",
+    "\n",
+    "<div class=\"alert alert-block alert-warning\">\n",
+    "<p><i class=\"fa fa-warning\"></i>&nbsp;\n",
+    "TensorFlow (Keras) offers 2 wrappers which allow its Sequential models to be used with scikit-learn. \n",
+    "\n",
+    "There are: **KerasClassifier** and **KerasRegressor**.\n",
+    "\n",
+    "For more information:\n",
+    "https://keras.io/scikit-learn-api/\n",
+    "</p>\n",
+    "</div>\n",
+    "\n",
+    "\n",
+    "\n",
+    "**Now lets see how this works!**"
+   ]
+  },
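+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The examples below only use **KerasClassifier**. As a quick, hedged sketch (added for illustration, not part of the original material), **KerasRegressor** wraps a model with a continuous output in the same way; `a_simple_regressor` is a made-up helper."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Sketch only: the regression counterpart of KerasClassifier\n",
+    "from tensorflow.keras.layers import Dense\n",
+    "from tensorflow.keras.models import Sequential\n",
+    "from tensorflow.keras.wrappers.scikit_learn import KerasRegressor\n",
+    "\n",
+    "\n",
+    "def a_simple_regressor():\n",
+    "    model = Sequential()\n",
+    "    model.add(Dense(4, input_shape=(2,), activation=\"relu\"))\n",
+    "    model.add(Dense(1))  # linear output unit for a continuous target\n",
+    "    model.compile(loss=\"mean_squared_error\", optimizer=\"rmsprop\")\n",
+    "    return model\n",
+    "\n",
+    "\n",
+    "# The wrapped model can be passed to scikit-learn utilities such as cross_val_score\n",
+    "regressor_scikit = KerasRegressor(build_fn=a_simple_regressor, epochs=10, verbose=0)"
+   ]
+  },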
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# We wrap the TensorFlow (Keras) model we created above with KerasClassifier\n",
+    "from sklearn.model_selection import cross_val_score\n",
+    "from tensorflow.keras.wrappers.scikit_learn import KerasClassifier\n",
+    "\n",
+    "# Wrapping TensorFlow (Keras) model\n",
+    "# NOTE: We pass verbose=0 to suppress the model output\n",
+    "num_epochs = 400\n",
+    "model_scikit = KerasClassifier(build_fn=a_simple_NN, epochs=num_epochs, verbose=0)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Let's reuse the function to visualize the decision boundary which we saw in chapter 2 with minimal change\n",
+    "\n",
+    "\n",
+    "def list_flatten(list_of_list):\n",
+    "    flattened_list = [i for j in list_of_list for i in j]\n",
+    "    return flattened_list\n",
+    "\n",
+    "\n",
+    "def plot_points(plt=plt, marker=\"o\"):\n",
+    "    colors = [[\"steelblue\", \"chocolate\"][i] for i in labels]\n",
+    "    plt.scatter(features.iloc[:, 0], features.iloc[:, 1], color=colors, marker=marker)\n",
+    "\n",
+    "\n",
+    "def train_and_plot_decision_surface(\n",
+    "    name, classifier, features_2d, labels, preproc=None, plt=plt, marker=\"o\", N=400\n",
+    "):\n",
+    "\n",
+    "    features_2d = np.array(features_2d)\n",
+    "    xmin, ymin = features_2d.min(axis=0)\n",
+    "    xmax, ymax = features_2d.max(axis=0)\n",
+    "\n",
+    "    x = np.linspace(xmin, xmax, N)\n",
+    "    y = np.linspace(ymin, ymax, N)\n",
+    "    points = np.array(np.meshgrid(x, y)).T.reshape(-1, 2)\n",
+    "\n",
+    "    if preproc is not None:\n",
+    "        points_for_classifier = preproc.fit_transform(points)\n",
+    "        features_2d = preproc.fit_transform(features_2d)\n",
+    "    else:\n",
+    "        points_for_classifier = points\n",
+    "\n",
+    "    classifier.fit(features_2d, labels, verbose=0)\n",
+    "\n",
+    "    if name == \"Neural Net\":\n",
+    "        # predicted = classifier.predict(features_2d)\n",
+    "        # predicted = list_flatten(predicted)\n",
+    "        predicted = list_flatten(\n",
+    "            (classifier.predict(features_2d) > 0.5).astype(\"int32\")\n",
+    "        )\n",
+    "    # else:\n",
+    "    # predicted = classifier.predict(features_2d)\n",
+    "\n",
+    "    if preproc is not None:\n",
+    "        name += \" (w/ preprocessing)\"\n",
+    "    print(name + \":\\t\", sum(predicted == labels), \"/\", len(labels), \"correct\")\n",
+    "\n",
+    "    if name == \"Neural Net\":\n",
+    "        # classes = np.array(list_flatten(classifier.predict(points_for_classifier)), dtype=bool)\n",
+    "        classes = np.array(\n",
+    "            list_flatten(\n",
+    "                (classifier.predict(points_for_classifier) > 0.5).astype(\"int32\")\n",
+    "            ),\n",
+    "            dtype=bool,\n",
+    "        )\n",
+    "    # else:\n",
+    "    # classes = np.array(classifier.predict(points_for_classifier), dtype=bool)\n",
+    "    plt.plot(\n",
+    "        points[~classes][:, 0],\n",
+    "        points[~classes][:, 1],\n",
+    "        \"o\",\n",
+    "        color=\"steelblue\",\n",
+    "        markersize=1,\n",
+    "        alpha=0.01,\n",
+    "    )\n",
+    "    plt.plot(\n",
+    "        points[classes][:, 0],\n",
+    "        points[classes][:, 1],\n",
+    "        \"o\",\n",
+    "        color=\"chocolate\",\n",
+    "        markersize=1,\n",
+    "        alpha=0.04,\n",
+    "    )"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "_, ax = plt.subplots(figsize=(6, 6))\n",
+    "\n",
+    "train_and_plot_decision_surface(\"Neural Net\", model_scikit, features, labels, plt=ax)\n",
+    "plot_points(plt=ax)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Applying K-fold cross-validation\n",
+    "# Here we pass the whole dataset, i.e. features and labels, instead of splitting it.\n",
+    "num_folds = 5\n",
+    "cross_validation = cross_val_score(\n",
+    "    model_scikit, features, labels, cv=num_folds, verbose=0\n",
+    ")\n",
+    "\n",
+    "print(\"The acuracy on the \", num_folds, \" validation folds:\", cross_validation)\n",
+    "print(\n",
+    "    \"The Average acuracy on the \",\n",
+    "    num_folds,\n",
+    "    \" validation folds:\",\n",
+    "    np.mean(cross_validation),\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "<div class=\"alert alert-block alert-warning\">\n",
+    "<p><i class=\"fa fa-warning\"></i>&nbsp;\n",
+    "The code above took quite long to finish even though we used only 5  CV folds and the neural network and data size are very small! This gives an indication of the enormous compute requirements of training production-grade deep neural networks.\n",
+    "</p>\n",
+    "</div>"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Hyperparameter optimization"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "We know from chapter 6 that there are 2 types of parameters which need to be tuned for a machine learning model.\n",
+    "* Internal model parameters (weights) which can be learned for e.g. by gradient-descent\n",
+    "* Hyperparameters\n",
+    "\n",
+    "In the model created above we made some arbitrary choices such as the choice of the optimizer we used, optimizer's learning rate, number of hidden units and so on ...\n",
+    "\n",
+    "Now that we have the TensorFlow (keras) model wrapped as a scikit-learn model we can use the grid search functions we have seen in chapter 6."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from sklearn.model_selection import GridSearchCV\n",
+    "\n",
+    "# Just to remember\n",
+    "model_scikit = KerasClassifier(\n",
+    "    build_fn=a_simple_NN, **{\"epochs\": num_epochs, \"verbose\": 0}\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "HP_grid = {\"epochs\": [30, 50, 100]}\n",
+    "search = GridSearchCV(estimator=model_scikit, param_grid=HP_grid)\n",
+    "search.fit(features, labels)\n",
+    "print(search.best_score_, search.best_params_)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "HP_grid = {\"epochs\": [10, 15, 30], \"batch_size\": [10, 20, 30]}\n",
+    "search = GridSearchCV(estimator=model_scikit, param_grid=HP_grid)\n",
+    "search.fit(features, labels)\n",
+    "print(search.best_score_, search.best_params_)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# A more general model for further Hyperparameter optimization\n",
+    "from tensorflow.keras import optimizers\n",
+    "\n",
+    "\n",
+    "def a_simple_NN(activation=\"relu\", num_hidden_neurons=[4, 4], learning_rate=0.01):\n",
+    "\n",
+    "    model = Sequential()\n",
+    "\n",
+    "    model.add(Dense(num_hidden_neurons[0], input_shape=(2,), activation=activation))\n",
+    "\n",
+    "    model.add(Dense(num_hidden_neurons[1], activation=activation))\n",
+    "\n",
+    "    model.add(Dense(1, activation=\"sigmoid\"))\n",
+    "\n",
+    "    model.compile(\n",
+    "        loss=\"binary_crossentropy\",\n",
+    "        optimizer=optimizers.RMSprop(learning_rate=learning_rate),\n",
+    "        metrics=[\"accuracy\"],\n",
+    "    )\n",
+    "\n",
+    "    return model"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### keras-tuner"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!pip install -q -U keras-tuner"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def a_simple_NN(activation=\"relu\", num_hidden_neurons=[4, 4], learning_rate=0.01):\n",
+    "\n",
+    "    model = Sequential()\n",
+    "\n",
+    "    model.add(Dense(num_hidden_neurons[0], input_shape=(2,), activation=activation))\n",
+    "\n",
+    "    model.add(Dense(num_hidden_neurons[1], activation=activation))\n",
+    "\n",
+    "    model.add(Dense(1, activation=\"sigmoid\"))\n",
+    "\n",
+    "    model.compile(\n",
+    "        loss=\"binary_crossentropy\",\n",
+    "        optimizer=optimizers.RMSprop(learning_rate=learning_rate),\n",
+    "        metrics=[\"accuracy\"],\n",
+    "    )\n",
+    "    return model"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import keras_tuner as kt\n",
+    "\n",
+    "\n",
+    "def model_builder(hp):\n",
+    "\n",
+    "    # Tune the number of units in the first Dense layer\n",
+    "    hp_units = hp.Int(\"units\", min_value=4, max_value=8, step=2)\n",
+    "    hp_units_2 = hp.Int(\"units2\", min_value=4, max_value=16, step=2)\n",
+    "\n",
+    "    # Tune the learning rate for the optimizer\n",
+    "    hp_learning_rate = hp.Choice(\"learning_rate\", values=[1e-2, 1e-3, 1e-4])\n",
+    "\n",
+    "    # Tune the choice of the activation function\n",
+    "    activation = hp.Choice(name=\"activation\", values=[\"relu\", \"sigmoid\"])\n",
+    "\n",
+    "    model = a_simple_NN(activation, [hp_units, hp_units_2], hp_learning_rate)\n",
+    "\n",
+    "    return model\n",
+    "\n",
+    "\n",
+    "# The argument ‘hp’ is an instance of the class HyperParameters."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "tuner = kt.BayesianOptimization(\n",
+    "    model_builder,\n",
+    "    objective=\"val_accuracy\",\n",
+    "    max_trials=10,\n",
+    "    project_name=\"intro_to_kt\",\n",
+    "    overwrite=True,\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "tuner.search(X_train, y_train, epochs=100, validation_data=(X_test, y_test))\n",
+    "best_model = tuner.get_best_models()[0]\n",
+    "print(tuner.get_best_hyperparameters()[0].values)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Exercise section:  \n",
+    "1. Create a neural network to classify the 2d points example from chapter 2 (Optional: As you create the model read a bit on the different TensorFlow (keras) commands we have used)\n",
+    "2. Plot the decision boundary\n",
+    "3. Choose and optimize a couple of hyperparameters\n",
+    "4. **OPTIONAL:** What function from scikit-learn other than GridSearchCV can we use for hyperparameter optimization? Use it (or use the equivalent method from keras-tuner)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import matplotlib.pyplot as plt\n",
+    "import numpy as np\n",
+    "import pandas as pd\n",
+    "import seaborn as sns\n",
+    "from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split\n",
+    "from tensorflow.keras import optimizers\n",
+    "from tensorflow.keras.layers import Dense\n",
+    "from tensorflow.keras.models import Sequential\n",
+    "from tensorflow.keras.wrappers.scikit_learn import KerasClassifier\n",
+    "import keras_tuner as kt"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "circle = pd.read_csv(\"data/circle.csv\")\n",
+    "# Using x and y coordinates as featues\n",
+    "features = circle.iloc[:, :-1]\n",
+    "# Convert boolean to integer values (True->1 and False->0)\n",
+    "labels = circle.iloc[:, -1].astype(int)\n",
+    "\n",
+    "colors = [[\"steelblue\", \"chocolate\"][i] for i in circle[\"label\"]]\n",
+    "plt.figure(figsize=(5, 5))\n",
+    "plt.xlim([-2, 2])\n",
+    "plt.ylim([-2, 2])\n",
+    "\n",
+    "plt.scatter(features[\"x\"], features[\"y\"], color=colors, marker=\"o\");"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "# Insert Code here"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "tags": [
+     "solution"
+    ]
+   },
+   "outputs": [],
+   "source": [
+    "# Solution\n",
+    "def circle_NN(activation=\"relu\", learning_rate=0.01):\n",
+    "\n",
+    "    model = Sequential()\n",
+    "\n",
+    "    model.add(Dense(4, input_shape=(2,), activation=activation))\n",
+    "\n",
+    "    model.add(Dense(4, activation=activation))\n",
+    "\n",
+    "    model.add(Dense(1, activation=\"sigmoid\"))\n",
+    "\n",
+    "    model.compile(\n",
+    "        loss=\"binary_crossentropy\",\n",
+    "        optimizer=optimizers.RMSprop(learning_rate=learning_rate),\n",
+    "        metrics=[\"accuracy\"],\n",
+    "    )\n",
+    "\n",
+    "    return model\n",
+    "\n",
+    "\n",
+    "# Instantiating the model\n",
+    "model = circle_NN()\n",
+    "\n",
+    "# Splitting the dataset into training (70%) and validation sets (30%)\n",
+    "X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.3)\n",
+    "\n",
+    "# Setting the number of passes through the entire training set\n",
+    "num_epochs = 400\n",
+    "\n",
+    "# model.fit() is used to train the model\n",
+    "# We can pass validation data while training\n",
+    "model_run = model.fit(\n",
+    "    X_train, y_train, epochs=num_epochs, validation_data=(X_test, y_test), verbose=0\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "tags": [
+     "solution"
+    ]
+   },
+   "outputs": [],
+   "source": [
+    "# solution\n",
+    "_, ax = plt.subplots(figsize=(6, 6))\n",
+    "\n",
+    "num_epochs = 400\n",
+    "circle_scikit = KerasClassifier(build_fn=circle_NN, epochs=num_epochs, verbose=0)\n",
+    "\n",
+    "train_and_plot_decision_surface(\"Neural Net\", circle_scikit, features, labels, plt=ax)\n",
+    "plot_points(plt=ax)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "tags": [
+     "solution"
+    ]
+   },
+   "outputs": [],
+   "source": [
+    "# solution (older method)\n",
+    "\"\"\"\n",
+    "HP_grid = {\n",
+    "    \"activation\": [\"relu\", \"sigmoid\"],\n",
+    "    \"learning_rate\": [0.01, 0.005, 0.001],\n",
+    "}\n",
+    "search = GridSearchCV(estimator=circle_scikit, param_grid=HP_grid)\n",
+    "search.fit(features, labels)\n",
+    "print(search.best_score_, search.best_params_)\n",
+    "\"\"\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "tags": [
+     "solution"
+    ]
+   },
+   "outputs": [],
+   "source": [
+    "def model_builder(hp):\n",
+    "    # Tune the learning rate for the optimizer\n",
+    "    hp_learning_rate = hp.Choice(\"learning_rate\", values=[1e-2, 1e-3, 1e-4])\n",
+    "    # Tune the choice of the activation function\n",
+    "    activation = hp.Choice(name=\"activation\", values=[\"relu\", \"sigmoid\"])\n",
+    "\n",
+    "    model = circle_NN(activation, hp_learning_rate)\n",
+    "\n",
+    "    return model\n",
+    "\n",
+    "\n",
+    "tuner = kt.BayesianOptimization(\n",
+    "    model_builder,\n",
+    "    objective=\"val_accuracy\",\n",
+    "    max_trials=10,\n",
+    "    project_name=\"circle_exercise\",\n",
+    "    overwrite=True,\n",
+    ")\n",
+    "tuner.search(X_train, y_train, epochs=400, validation_data=(X_test, y_test))\n",
+    "best_model = tuner.get_best_models()[0]\n",
+    "print(tuner.get_best_hyperparameters()[0].values)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "<div class=\"alert alert-block alert-warning\">\n",
+    "<p><i class=\"fa fa-warning\"></i>&nbsp;\n",
+    "Another library which you should definitely look at for doing hyperparameter optimization with keras models is the <a href=\"https://github.com/maxpumperla/hyperas\">Hyperas library</a> which is a wrapper around the <a href=\"https://github.com/hyperopt/hyperopt\">Hyperopt library</a>. \n",
+    "\n",
+    "</p>\n",
+    "</div>"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The examples we saw above are really nice to show various features of the TensorFlow (Keras) library and to understand how we build and train a model. However, they are not the ideal problems one should solve using neural networks. They are too simple and can be solved easily by classical machine learning algorithms. \n",
+    "\n",
+    "Now we show examples where Neural Networks really shine over classical machine learning algorithms."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Handwritten Digits Classification (multi-class classification)\n",
+    "**MNIST Dataset**\n",
+    "\n",
+    "MNIST datasets is a very common dataset used in machine learning. It is widely used to train and validate models.\n",
+    "\n",
+    "\n",
+    "> The MNIST database of handwritten digits, available from this page, has a training set of 60,000 examples, and a > test set of 10,000 examples. It is a subset of a larger set available from NIST. The digits have been size-normalized and centered in a fixed-size image.\n",
+    "> It is a good database for people who want to try learning techniques and pattern recognition methods on real-world \n",
+    "> data while spending minimal efforts on preprocessing and formatting.\n",
+    "> source: http://yann.lecun.com/exdb/mnist/\n",
+    "\n",
+    "This dataset consists of images of handwritten digits between 0-9 and their corresponsing labels. We want to train a neural network which is able to predict the correct digit on the image. \n",
+    "This is a multi-class classification problem. Unlike binary classification which we have seen till now we will classify data into 10 different classes."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import matplotlib.pyplot as plt\n",
+    "import numpy as np\n",
+    "import seaborn as sns"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Loading the dataset in TensorFlow (keras)\n",
+    "# Later you can explore and play with other datasets with come with TensorFlow (Keras)\n",
+    "from tensorflow.keras.datasets import mnist\n",
+    "\n",
+    "# Loading the train and test data\n",
+    "\n",
+    "(X_train, y_train), (X_test, y_test) = mnist.load_data()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Looking at the dataset\n",
+    "print(X_train.shape)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# We can see that the training set consists of 60,000 images of size 28x28 pixels\n",
+    "i = np.random.randint(0, X_train.shape[0])\n",
+    "sns.set_style(\"white\")\n",
+    "plt.imshow(X_train[i], cmap=\"gray_r\")\n",
+    "sns.set(style=\"darkgrid\")\n",
+    "print(\"This digit is: \", y_train[i])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Look at the data values for a couple of images\n",
+    "print(X_train[0].min(), X_train[1].max())"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The data consists of values between 0-255 representing the **grayscale level**"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# The labels are the digit on the image\n",
+    "print(y_train.shape)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Scaling the data\n",
+    "# It is important to normalize the input data to (0-1) before providing it to a neural net\n",
+    "# We could use the previously introduced function from scikit-learn. However, here it is sufficient to\n",
+    "# just divide the input data by 255\n",
+    "X_train_norm = X_train / 255.0\n",
+    "X_test_norm = X_test / 255.0\n",
+    "\n",
+    "# Also we need to reshape the input data such that each sample is a vector and not a 2D matrix\n",
+    "X_train_prep = X_train_norm.reshape(X_train_norm.shape[0], 28 * 28)\n",
+    "X_test_prep = X_test_norm.reshape(X_test_norm.shape[0], 28 * 28)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "<div class=\"alert alert-block alert-warning\">\n",
+    "<p><i class=\"fa fa-warning\"></i>&nbsp;\n",
+    "One-Hot encoding\n",
+    "\n",
+    "In multi-class classification problems the labels are provided to the neural network as something called **One-hot encodings**. The categorical labels (0-9 here) are converted to vectors.\n",
+    "\n",
+    "For the MNIST problem where the data has **10 categories** we will convert every label to a vector of length 10. \n",
+    "All the entries of this vector will be zero **except** for the index which is equal to the (integer) value of the label.\n",
+    "\n",
+    "For example:\n",
+    "if label is 4. The one-hot vector will look like **[0 0 0 0 1 0 0 0 0 0]**\n",
+    "\n",
+    "Fortunately, TensorFlow (Keras) has a built-in function to achieve this and we do not have to write a code for this ourselves.\n",
+    "</p>\n",
+    "</div>"
+   ]
+  },
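+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "As a quick check of the encoding described above (added for illustration):"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# The label 4 becomes a vector of length 10 with a single 1 at index 4\n",
+    "from tensorflow.keras import utils\n",
+    "\n",
+    "print(utils.to_categorical([4], num_classes=10))\n",
+    "# expected output: [[0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]]"
+   ]
+  },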
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from tensorflow.keras import utils\n",
+    "\n",
+    "y_train_onehot = utils.to_categorical(y_train, num_classes=10)\n",
+    "y_test_onehot = utils.to_categorical(y_test, num_classes=10)\n",
+    "\n",
+    "print(y_train_onehot.shape)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Building the tensorflow model\n",
+    "from tensorflow.keras.layers import Dense\n",
+    "from tensorflow.keras.models import Sequential\n",
+    "\n",
+    "\n",
+    "def mnist_model():\n",
+    "    model = Sequential()\n",
+    "\n",
+    "    model.add(Dense(64, input_shape=(28 * 28,), activation=\"relu\"))\n",
+    "\n",
+    "    model.add(Dense(64, activation=\"relu\"))\n",
+    "\n",
+    "    model.add(Dense(10, activation=\"softmax\"))\n",
+    "\n",
+    "    model.compile(\n",
+    "        loss=\"categorical_crossentropy\", optimizer=\"rmsprop\", metrics=[\"accuracy\"]\n",
+    "    )\n",
+    "    return model\n",
+    "\n",
+    "\n",
+    "model = mnist_model()\n",
+    "\n",
+    "model_run = model.fit(X_train_prep, y_train_onehot, epochs=20, batch_size=512)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "print(\n",
+    "    \"The [loss, accuracy] on test dataset are: \",\n",
+    "    model.evaluate(X_test_prep, y_test_onehot),\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Exercise section\n",
+    "* Reinitialize and run the model again with validation dataset, plot the accuracy as a function of epochs, play with number of epochs and observe what is happening."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Code here"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "tags": [
+     "solution"
+    ]
+   },
+   "outputs": [],
+   "source": [
+    "# Solution:\n",
+    "num_epochs = 20\n",
+    "model = mnist_model()\n",
+    "model_run = model.fit(\n",
+    "    X_train_prep,\n",
+    "    y_train_onehot,\n",
+    "    epochs=num_epochs,\n",
+    "    batch_size=512,\n",
+    "    validation_data=(X_test_prep, y_test_onehot),\n",
+    ")\n",
+    "# Evaluating the model on test dataset\n",
+    "# print(\"The [loss, accuracy] on test dataset are: \" , model.evaluate(X_test_prep, y_test_onehot))\n",
+    "history_model = model_run.history\n",
+    "print(\"The history has the following data: \", history_model.keys())\n",
+    "\n",
+    "# Plotting the training and validation accuracy during the training\n",
+    "sns.lineplot(\n",
+    "    model_run.epoch, history_model[\"accuracy\"], color=\"blue\", label=\"Training set\"\n",
+    ")\n",
+    "sns.lineplot(\n",
+    "    model_run.epoch, history_model[\"val_accuracy\"], color=\"red\", label=\"Valdation set\"\n",
+    ")\n",
+    "plt.xlabel(\"epochs\")\n",
+    "plt.ylabel(\"accuracy\");"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "What we see here is **overfitting**. Within the first few epochs the training and validation datasets show similar accuracies but thereafter the network starts to overfit to the training set."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "<div class=\"alert alert-block alert-warning\">\n",
+    "<p><i class=\"fa fa-warning\"></i>&nbsp;\n",
+    "Keep in mind that neural networks are quite prone to overfitting so always check for it.\n",
+    "</p>\n",
+    "</div>"
+   ]
+  },
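+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "One simple guard against overfitting (a hedged sketch added here, not from the original material) is the TensorFlow (Keras) `EarlyStopping` callback, which stops training once the monitored metric stops improving. Callbacks are covered in more detail further below."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Sketch only: stop training when the validation accuracy stops improving\n",
+    "from tensorflow.keras.callbacks import EarlyStopping\n",
+    "\n",
+    "early_stop = EarlyStopping(\n",
+    "    monitor=\"val_accuracy\", patience=3, restore_best_weights=True\n",
+    ")\n",
+    "# pass it to model.fit(..., validation_data=..., callbacks=[early_stop])"
+   ]
+  },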
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Adding regularization"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Adding l2 regularization\n",
+    "# Building the TensorFlow (keras) model\n",
+    "from tensorflow.keras.layers import Dense\n",
+    "from tensorflow.keras.models import Sequential\n",
+    "from tensorflow.keras.regularizers import l2\n",
+    "\n",
+    "\n",
+    "def mnist_model():\n",
+    "\n",
+    "    model = Sequential()\n",
+    "\n",
+    "    model.add(\n",
+    "        Dense(\n",
+    "            64, input_shape=(28 * 28,), activation=\"relu\", kernel_regularizer=l2(0.01)\n",
+    "        )\n",
+    "    )\n",
+    "\n",
+    "    model.add(Dense(64, activation=\"relu\", kernel_regularizer=l2(0.01)))\n",
+    "\n",
+    "    model.add(Dense(10, activation=\"softmax\"))\n",
+    "\n",
+    "    model.compile(\n",
+    "        loss=\"categorical_crossentropy\", optimizer=\"rmsprop\", metrics=[\"accuracy\"]\n",
+    "    )\n",
+    "    return model\n",
+    "\n",
+    "\n",
+    "model = mnist_model()\n",
+    "\n",
+    "num_epochs = 20\n",
+    "model_run = model.fit(\n",
+    "    X_train_prep,\n",
+    "    y_train_onehot,\n",
+    "    epochs=num_epochs,\n",
+    "    batch_size=512,\n",
+    "    validation_data=(X_test_prep, y_test_onehot),\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Evaluating the model on test dataset\n",
+    "history_model = model_run.history\n",
+    "print(\"The history has the following data: \", history_model.keys())\n",
+    "\n",
+    "# Plotting the training and validation accuracy during the training\n",
+    "sns.lineplot(model_run.epoch, history_model[\"accuracy\"], color=\"blue\", label=\"Training set\")\n",
+    "sns.lineplot(\n",
+    "    model_run.epoch, history_model[\"val_accuracy\"], color=\"red\", label=\"Valdation set\"\n",
+    ")\n",
+    "plt.xlabel(\"epochs\")\n",
+    "plt.ylabel(\"accuracy\");"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "<div class=\"alert alert-block alert-warning\">\n",
+    "<p><i class=\"fa fa-warning\"></i>&nbsp;\n",
+    "Another way to add regularization and to make the network more robust is by applying Dropout. When we add dropout to a layer a specified percentage of units in that layer are switched off. \n",
+    "    \n",
+    "Both L2 regularization and Dropout make the model simpler and thus reducing overfitting.\n",
+    "</p>\n",
+    "</div>\n",
+    "\n",
+    "### Exercise section\n",
+    "* Add dropout instead of L2 regularization in the network above"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Adding dropout is easy in TensorFlow (keras)\n",
+    "# We import a layer called Dropout and add as follows\n",
+    "# model.add(Dropout(0.2)) to randomly drop 20% of the hidden units"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "tags": [
+     "solution"
+    ]
+   },
+   "outputs": [],
+   "source": [
+    "# Solution\n",
+    "# Adding Dropout\n",
+    "# Building the tensorflow model\n",
+    "from tensorflow.keras.layers import Dense, Dropout\n",
+    "from tensorflow.keras.models import Sequential\n",
+    "\n",
+    "\n",
+    "def mnist_model():\n",
+    "\n",
+    "    model = Sequential()\n",
+    "\n",
+    "    model.add(Dense(64, input_shape=(28 * 28,), activation=\"relu\"))\n",
+    "\n",
+    "    model.add(Dropout(0.15))\n",
+    "\n",
+    "    model.add(Dense(64, activation=\"relu\"))\n",
+    "\n",
+    "    model.add(Dense(10, activation=\"softmax\"))\n",
+    "\n",
+    "    model.compile(\n",
+    "        loss=\"categorical_crossentropy\", optimizer=\"rmsprop\", metrics=[\"accuracy\"]\n",
+    "    )\n",
+    "\n",
+    "    return model\n",
+    "\n",
+    "\n",
+    "model = mnist_model()\n",
+    "\n",
+    "num_epochs = 20\n",
+    "model_run = model.fit(\n",
+    "    X_train_prep,\n",
+    "    y_train_onehot,\n",
+    "    epochs=num_epochs,\n",
+    "    batch_size=512,\n",
+    "    validation_data=(X_test_prep, y_test_onehot),\n",
+    ")\n",
+    "\n",
+    "# Evaluating the model on test dataset\n",
+    "history_model = model_run.history\n",
+    "print(\"The history has the following data: \", history_model.keys())\n",
+    "\n",
+    "# Plotting the training and validation accuracy during the training\n",
+    "sns.lineplot(\n",
+    "    model_run.epoch, history_model[\"accuracy\"], color=\"blue\", label=\"Training set\"\n",
+    ")\n",
+    "sns.lineplot(\n",
+    "    model_run.epoch, history_model[\"val_accuracy\"], color=\"red\", label=\"Valdation set\"\n",
+    ")\n",
+    "plt.xlabel(\"epochs\")\n",
+    "plt.ylabel(\"accuracy\");"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Callbacks\n",
+    "\n",
+    "Tensorflow offers many useful callbacks which are often used.\n",
+    "Examples:\n",
+    "* ModelCheckpoint - Used to save checkpoints of the model at specified frequency\n",
+    "* TensorBoard - Super useful for monitoring and visualizations\n",
+    "* ReduceLROnPlateau - Reduces the learning rate when the monitored metric has stopped improving\n",
+    "\n",
+    "For a complete list have a look at: https://www.tensorflow.org/api_docs/python/tf/keras/callbacks"
+   ]
+  },
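+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The example below demonstrates TensorBoard. First, a quick hedged sketch (added for illustration; the file name is made up) of how the other two callbacks from the list can be attached:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Sketch only: save the best model so far and lower the learning rate on plateaus\n",
+    "from tensorflow.keras.callbacks import ModelCheckpoint, ReduceLROnPlateau\n",
+    "\n",
+    "checkpoint = ModelCheckpoint(\n",
+    "    \"best_model.h5\", monitor=\"val_accuracy\", save_best_only=True\n",
+    ")\n",
+    "reduce_lr = ReduceLROnPlateau(monitor=\"val_loss\", factor=0.5, patience=2)\n",
+    "# pass them to model.fit(..., callbacks=[checkpoint, reduce_lr])"
+   ]
+  },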
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%load_ext tensorboard"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Example showing the use of tensorboard\n",
+    "from tensorflow.keras.callbacks import TensorBoard\n",
+    "from tensorflow.keras.layers import Dense, Dropout\n",
+    "from tensorflow.keras.models import Sequential\n",
+    "\n",
+    "\n",
+    "def mnist_model():\n",
+    "\n",
+    "    model = Sequential()\n",
+    "\n",
+    "    model.add(Dense(64, input_shape=(28 * 28,), activation=\"relu\"))\n",
+    "\n",
+    "    model.add(Dropout(0.15))\n",
+    "\n",
+    "    model.add(Dense(64, activation=\"relu\"))\n",
+    "\n",
+    "    model.add(Dense(10, activation=\"softmax\"))\n",
+    "\n",
+    "    model.compile(\n",
+    "        loss=\"categorical_crossentropy\", optimizer=\"rmsprop\", metrics=[\"accuracy\"]\n",
+    "    )\n",
+    "\n",
+    "    return model\n",
+    "\n",
+    "\n",
+    "model = mnist_model()\n",
+    "\n",
+    "num_epochs = 20\n",
+    "tensorboard_callback = TensorBoard(log_dir=\"./logs\")\n",
+    "model_run = model.fit(\n",
+    "    X_train_prep,\n",
+    "    y_train_onehot,\n",
+    "    epochs=num_epochs,\n",
+    "    batch_size=512,\n",
+    "    validation_data=(X_test_prep, y_test_onehot),\n",
+    "    callbacks=[tensorboard_callback],\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%tensorboard --logdir logs"
+   ]
+  },
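+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The HParams dashboard mentioned in the note below can record hyperparameter settings alongside the resulting metrics. A minimal, hedged logging sketch (the log directory and values are made up):"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Sketch only: record one run's hyperparameters and its resulting metric\n",
+    "import tensorflow as tf\n",
+    "from tensorboard.plugins.hparams import api as hp\n",
+    "\n",
+    "run_hparams = {\"units\": 64, \"dropout\": 0.15}  # made-up example values\n",
+    "with tf.summary.create_file_writer(\"logs/hparam_demo\").as_default():\n",
+    "    hp.hparams(run_hparams)  # log the hyperparameter values for this run\n",
+    "    tf.summary.scalar(\"accuracy\", 0.97, step=1)  # log the achieved metric"
+   ]
+  },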
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "<div class=\"alert alert-block alert-info\"><p><i class=\"fa fa-info-circle\"></i>&nbsp;\n",
+    "    <a href=https://www.tensorflow.org/tensorboard/hyperparameter_tuning_with_hparams>HParams Dashboard</a>: The HParams dashboard in TensorBoard provides several tools to help with this process of identifying the best experiment or most promising sets of hyperparameters.\n",
+    "</p></div>"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.8"
+  },
+  "latex_envs": {
+   "LaTeX_envs_menu_present": true,
+   "autoclose": false,
+   "autocomplete": true,
+   "bibliofile": "biblio.bib",
+   "cite_by": "apalike",
+   "current_citInitial": 1,
+   "eqLabelWithNumbers": true,
+   "eqNumInitial": 1,
+   "hotkeys": {
+    "equation": "Ctrl-E",
+    "itemize": "Ctrl-I"
+   },
+   "labels_anchors": false,
+   "latex_user_defs": false,
+   "report_style_numbering": false,
+   "user_envs_cfg": false
+  },
+  "toc": {
+   "base_numbering": 1,
+   "nav_menu": {},
+   "number_sections": true,
+   "sideBar": true,
+   "skip_h1_title": true,
+   "title_cell": "Table of Contents",
+   "title_sidebar": "Contents",
+   "toc_cell": false,
+   "toc_position": {},
+   "toc_section_display": true,
+   "toc_window_display": true
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/08_c-neural_networks.ipynb b/08_c-neural_networks.ipynb
new file mode 100644
index 0000000000000000000000000000000000000000..d0d97e3e59524d5b7a4366df49a03b70b2ac63de
--- /dev/null
+++ b/08_c-neural_networks.ipynb
@@ -0,0 +1,655 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# IGNORE THIS CELL WHICH CUSTOMIZES LAYOUT AND STYLING OF THE NOTEBOOK !\n",
+    "from numpy.random import seed\n",
+    "\n",
+    "seed(42)\n",
+    "import tensorflow as tf\n",
+    "\n",
+    "tf.random.set_seed(42)\n",
+    "import matplotlib as mpl\n",
+    "import matplotlib.pyplot as plt\n",
+    "import seaborn as sns\n",
+    "\n",
+    "sns.set(style=\"darkgrid\")\n",
+    "mpl.rcParams[\"lines.linewidth\"] = 3\n",
+    "%matplotlib inline\n",
+    "%config InlineBackend.figure_format = 'retina'\n",
+    "%config IPCompleter.greedy=True\n",
+    "import warnings\n",
+    "\n",
+    "warnings.filterwarnings(\"ignore\", category=FutureWarning)\n",
+    "from IPython.core.display import HTML\n",
+    "\n",
+    "HTML(open(\"custom.html\", \"r\").read())"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Chapter 8c: Introduction to Neural Networks"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "jp-MarkdownHeadingCollapsed": true,
+    "tags": []
+   },
+   "source": [
+    "## Network Architectures\n",
+    "\n",
+    "The neural networks which we have seen till now are the simplest kind of neural networks.\n",
+    "There exist more sophisticated network architectures especially designed for specific applications.\n",
+    "Some of them are as follows:\n",
+    "\n",
+    "###  Convolution Neural Networks (CNNs)\n",
+    "\n",
+    "These networks are used mostly for computer vision like tasks such as image classification and object detection. \n",
+    "One of the old CNN networks is shown below.\n",
+    "\n",
+    "<center>\n",
+    "<figure>\n",
+    "<img src=\"./images/neuralnets/CNN_lecun.png\" width=\"800\"/>\n",
+    "<figcaption>source: LeCun et al., Gradient-based learning applied to document recognition (1998).</figcaption>\n",
+    "</figure>\n",
+    "</center>\n",
+    "\n",
+    "CNNs consist of new type of layers such as convolution and pooling layers.\n",
+    "\n",
+    "###  Recurrent Neural Networks (RNNs)\n",
+    "\n",
+    "RNNs are used for problems such as time-series data, speech recognition and translation.\n",
+    "\n",
+    "### Generative adversarial networks (GANs)\n",
+    "\n",
+    "GANs consist of 2 parts, a generative network and a discriminative network. The generative network produces data which is then fed to the discriminative network which judges if the new data belongs to a specified dataset. Then via feedback loops the generative network becomes better and better at creating images similar to the dataset the discriminative network is judging against. At the same time the discriminative network get better and better at identifyig **fake** instances which are not from the reference dataset. "
+   ]
+  },
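+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Below is a minimal, illustrative sketch of such a two-network setup in Keras. The layer sizes, the latent dimension and the 28x28 image shape are arbitrary assumptions for illustration, not a reference GAN implementation (a real one also needs an alternating training loop):"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import numpy as np\n",
+    "from tensorflow.keras.layers import Dense, Flatten, Reshape\n",
+    "from tensorflow.keras.models import Sequential\n",
+    "\n",
+    "latent_dim = 32\n",
+    "\n",
+    "# The generative network maps random noise vectors to 28x28 \"images\"\n",
+    "generator = Sequential(\n",
+    "    [\n",
+    "        Dense(128, activation=\"relu\", input_shape=(latent_dim,)),\n",
+    "        Dense(28 * 28, activation=\"sigmoid\"),\n",
+    "        Reshape((28, 28)),\n",
+    "    ]\n",
+    ")\n",
+    "\n",
+    "# The discriminative network outputs the probability that an image is real\n",
+    "discriminator = Sequential(\n",
+    "    [\n",
+    "        Flatten(input_shape=(28, 28)),\n",
+    "        Dense(128, activation=\"relu\"),\n",
+    "        Dense(1, activation=\"sigmoid\"),\n",
+    "    ]\n",
+    ")\n",
+    "discriminator.compile(loss=\"binary_crossentropy\", optimizer=\"rmsprop\")\n",
+    "\n",
+    "# Combined model: with the discriminator frozen, training this model\n",
+    "# pushes the generator towards images the discriminator deems real\n",
+    "discriminator.trainable = False\n",
+    "gan = Sequential([generator, discriminator])\n",
+    "gan.compile(loss=\"binary_crossentropy\", optimizer=\"rmsprop\")\n",
+    "\n",
+    "noise = np.random.normal(size=(16, latent_dim))\n",
+    "fake_images = generator.predict(noise)  # data produced by the generator\n",
+    "print(fake_images.shape)"
+   ]
+  },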
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## CNN in a bit more detail\n",
+    "\n",
+    "The standard CNN architecture can be seen as 2 parts:\n",
+    "\n",
+    "* Feature extraction\n",
+    "* Classification\n",
+    "\n",
+    "For the **classification** part we use the densly connected network as shown in the TensorFlow (keras) examples above.\n",
+    "\n",
+    "However, for the **feature extraction** part we use new types of layers called **convolution** layers\n",
+    "\n",
+    "### What is a Convolution?\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import matplotlib.pyplot as plt\n",
+    "import numpy as np\n",
+    "import seaborn as sns\n",
+    "\n",
+    "sns.set_style(\"white\")\n",
+    "# Loading the train and test data\n",
+    "digit = np.genfromtxt(\"data/digit_4_14x14.csv\", delimiter=\",\").astype(np.int16)\n",
+    "plt.imshow(digit, \"gray_r\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "This image in matrix form"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def plot_astable(matrix, hw=0.15):\n",
+    "    matrix = plt.table(cellText=matrix, loc=(0, 0), cellLoc=\"center\")\n",
+    "    matrix.set_fontsize(14)\n",
+    "    cells = matrix.get_celld()\n",
+    "    for i in cells:\n",
+    "        cells[i].set_height(hw)\n",
+    "        cells[i].set_width(hw)\n",
+    "    plt.axis(\"off\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "plot_astable(digit)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Vertical edge detection\n",
+    "vertical_edge_kernel = np.array([[-1, 2, -1], [-1, 2, -1], [-1, 2, -1]])\n",
+    "plot_astable(vertical_edge_kernel, 0.2)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import numpy as np\n",
+    "\n",
+    "\n",
+    "def convolution(matrix, kernel):\n",
+    "    # This function computes a convolution between a matrix and a kernel/filter without any padding\n",
+    "    width_kernel = kernel.shape[0]\n",
+    "    height_kernel = kernel.shape[1]\n",
+    "    convolution = np.zeros(\n",
+    "        (matrix.shape[0] - width_kernel + 1, matrix.shape[1] - height_kernel + 1)\n",
+    "    )\n",
+    "    for i in range(matrix.shape[0] - width_kernel + 1):\n",
+    "        for j in range(matrix.shape[1] - height_kernel + 1):\n",
+    "            convolution[i, j] = np.sum(\n",
+    "                np.multiply(matrix[i : i + width_kernel, j : j + height_kernel], kernel)\n",
+    "            )\n",
+    "    return convolution\n",
+    "\n",
+    "\n",
+    "vertical_detect = convolution(digit, vertical_edge_kernel)\n",
+    "plt.imshow(vertical_detect, cmap=\"gray_r\");"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Horizontal edge detection\n",
+    "horizontal_edge_kernel = np.array([[-1, -1, -1], [2, 2, 2], [-1, -1, -1]])\n",
+    "plot_astable(horizontal_edge_kernel, 0.2)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "horizontal_detect = convolution(digit, horizontal_edge_kernel)\n",
+    "plt.imshow(horizontal_detect, cmap=\"gray_r\");"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Maxpooling\n",
+    "Taking maximum in n x n sized sliding windows"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import numpy as np\n",
+    "\n",
+    "\n",
+    "def maxpool_2x2(matrix):\n",
+    "    out_dim = np.array([matrix.shape[0] / 2, matrix.shape[1] / 2]).astype(int)\n",
+    "    subsample = np.zeros((out_dim))\n",
+    "    for i in range(out_dim[0]):\n",
+    "        for j in range(out_dim[1]):\n",
+    "            subsample[i, j] = np.max(matrix[i * 2 : i * 2 + 2, j * 2 : j * 2 + 2])\n",
+    "    return subsample"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import matplotlib.pyplot as plt\n",
+    "\n",
+    "subsampled_image = maxpool_2x2(vertical_detect)\n",
+    "plt.imshow(subsampled_image, cmap=\"gray_r\")\n",
+    "plt.title(\"Max Pooled vertical edge detection filter\");"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "subsampled_image = maxpool_2x2(horizontal_detect)\n",
+    "plt.imshow(subsampled_image, cmap=\"gray_r\")\n",
+    "plt.title(\"Max Pooled horizontal edge detection filter\");"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Let's explore some more of such filters/kernels!!\n",
+    "\n",
+    "http://setosa.io/ev/image-kernels"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## CNN Examples"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "For this example we will work with a dataset called fashion-MNIST which is quite similar to the MNIST data above.\n",
+    "> Fashion-MNIST is a dataset of Zalando's article images—consisting of a training set of 60,000 examples and a test set of 10,000 examples. Each example is a 28x28 grayscale image, associated with a label from 10 classes. We intend Fashion-MNIST to serve as a direct drop-in replacement for the original MNIST dataset for benchmarking machine learning algorithms. It shares the same image size and structure of training and testing splits.\n",
+    "source: https://github.com/zalandoresearch/fashion-mnist\n",
+    "\n",
+    "The 10 classes of this dataset are:\n",
+    "\n",
+    "| Label| Item |\n",
+    "| --- | --- |\n",
+    "| 0 |\tT-shirt/top |\n",
+    "| 1\t| Trouser |\n",
+    "|2|\tPullover|\n",
+    "|3|\tDress|\n",
+    "|4|\tCoat|\n",
+    "|5|\tSandal|\n",
+    "|6|\tShirt|\n",
+    "|7|\tSneaker|\n",
+    "|8|\tBag|\n",
+    "|9|\tAnkle boot|"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Loading the dataset in tensorflow\n",
+    "# Later you can explore and play with other datasets with come with tensorflow\n",
+    "from tensorflow.keras.datasets import fashion_mnist\n",
+    "\n",
+    "# Loading the train and test data\n",
+    "\n",
+    "(X_train, y_train), (X_test, y_test) = fashion_mnist.load_data()\n",
+    "\n",
+    "items = [\n",
+    "    \"T-shirt/top\",\n",
+    "    \"Trouser\",\n",
+    "    \"Pullover\",\n",
+    "    \"Dress\",\n",
+    "    \"Coat\",\n",
+    "    \"Sandal\",\n",
+    "    \"Shirt\",\n",
+    "    \"Sneaker\",\n",
+    "    \"Bag\",\n",
+    "    \"Ankle boot\",\n",
+    "]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# We can see that the training set consists of 60,000 images of size 28x28 pixels\n",
+    "import matplotlib.pyplot as plt\n",
+    "import numpy as np\n",
+    "\n",
+    "i = np.random.randint(0, X_train.shape[0])\n",
+    "plt.imshow(X_train[i], cmap=\"gray_r\")\n",
+    "print(\"This item is a: \", items[y_train[i]])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Also we need to reshape the input data such that each sample is a 4D matrix of dimension\n",
+    "# (num_samples, width, height, channels). Even though these images are grayscale we need to add\n",
+    "# channel dimension as this is expected by the Conv function\n",
+    "X_train_prep = X_train.reshape(X_train.shape[0], 28, 28, 1) / 255.0\n",
+    "X_test_prep = X_test.reshape(X_test.shape[0], 28, 28, 1) / 255.0\n",
+    "\n",
+    "from tensorflow.keras.utils import to_categorical\n",
+    "\n",
+    "y_train_onehot = to_categorical(y_train, num_classes=10)\n",
+    "y_test_onehot = to_categorical(y_test, num_classes=10)\n",
+    "\n",
+    "print(y_train_onehot.shape)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Creating a CNN similar to the one shown in the figure from LeCun paper\n",
+    "# In the original implementation Average pooling was used. However, we will use maxpooling as this\n",
+    "# is what us used in the more recent architectures and is found to be a better choice\n",
+    "# Convolution -> Pooling -> Convolution -> Pooling -> Flatten -> Dense -> Dense -> Output layer\n",
+    "from tensorflow.keras.layers import (\n",
+    "    BatchNormalization,\n",
+    "    Conv2D,\n",
+    "    Dense,\n",
+    "    Dropout,\n",
+    "    Flatten,\n",
+    "    MaxPool2D,\n",
+    ")\n",
+    "from tensorflow.keras.models import Sequential\n",
+    "\n",
+    "\n",
+    "def simple_CNN():\n",
+    "\n",
+    "    model = Sequential()\n",
+    "\n",
+    "    model.add(Conv2D(6, (3, 3), input_shape=(28, 28, 1), activation=\"relu\"))\n",
+    "\n",
+    "    model.add(MaxPool2D((2, 2)))\n",
+    "\n",
+    "    model.add(Conv2D(16, (3, 3), activation=\"relu\"))\n",
+    "\n",
+    "    model.add(MaxPool2D((2, 2)))\n",
+    "\n",
+    "    model.add(Flatten())\n",
+    "\n",
+    "    model.add(Dense(120, activation=\"relu\"))\n",
+    "\n",
+    "    model.add(Dense(84, activation=\"relu\"))\n",
+    "\n",
+    "    model.add(Dense(10, activation=\"softmax\"))\n",
+    "\n",
+    "    model.compile(\n",
+    "        loss=\"categorical_crossentropy\", optimizer=\"rmsprop\", metrics=[\"accuracy\"]\n",
+    "    )\n",
+    "\n",
+    "    return model\n",
+    "\n",
+    "\n",
+    "model = simple_CNN()\n",
+    "model.summary()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "num_epochs = 5\n",
+    "model_run = model.fit(\n",
+    "    X_train_prep,\n",
+    "    y_train_onehot,\n",
+    "    epochs=num_epochs,\n",
+    "    batch_size=64,\n",
+    "    validation_data=(X_test_prep, y_test_onehot),\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## (optional) Exercise section\n",
+    "* Use the above model or improve it (change number of filters, add more layers etc. on the MNIST example and see if you can get a better accuracy than what we achieved with a vanilla neural network)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Exercise section\n",
+    "* Explore the CIFAR10 (https://www.cs.toronto.edu/~kriz/cifar.html) dataset included with TensorFlow (Keras) and build+train a simple CNN to classify it"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from tensorflow.keras.datasets import cifar10\n",
+    "\n",
+    "(X_train, y_train), (X_test, y_test) = cifar10.load_data()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Functional API\n",
+    "\n",
+    "The Sequential API of TensorFlow (Keras) is good enough for simple models with a linear topology.\n",
+    "However, the functional api is more flexible and allows for more complicated use cases such as:\n",
+    "* models with non-linear topology\n",
+    "* shared layers\n",
+    "* multiple inputs or outputs\n",
+    "\n",
+    "Examples of such models:\n",
+    "\n",
+    "* U-Net for image segmentation (https://lmb.informatik.uni-freiburg.de/people/ronneber/u-net/)\n",
+    "* ResNet https://arxiv.org/pdf/1512.03385.pdf"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Simple example showing the Fashion MNIST example using the functional api\n",
+    "from tensorflow.keras.layers import Conv2D, Dense, Flatten, Input, MaxPool2D\n",
+    "from tensorflow.keras.models import Model\n",
+    "\n",
+    "\n",
+    "def simple_CNN_functional():\n",
+    "\n",
+    "    img_inputs = Input(shape=(28, 28, 1))\n",
+    "\n",
+    "    x = Conv2D(6, (3, 3), activation=\"relu\")(img_inputs)\n",
+    "\n",
+    "    x = MaxPool2D((2, 2))(x)\n",
+    "\n",
+    "    x = Conv2D(16, (3, 3), activation=\"relu\")(x)\n",
+    "\n",
+    "    x = MaxPool2D((2, 2))(x)\n",
+    "\n",
+    "    x = Flatten()(x)\n",
+    "\n",
+    "    x = Dense(120, activation=\"relu\")(x)\n",
+    "\n",
+    "    x = Dense(84, activation=\"relu\")(x)\n",
+    "\n",
+    "    output = Dense(10, activation=\"softmax\")(x)\n",
+    "\n",
+    "    model = Model(inputs=img_inputs, outputs=output, name=\"fashion_mnist_model\")\n",
+    "\n",
+    "    model.compile(\n",
+    "        loss=\"categorical_crossentropy\", optimizer=\"rmsprop\", metrics=[\"accuracy\"]\n",
+    "    )\n",
+    "\n",
+    "    return model\n",
+    "\n",
+    "\n",
+    "model = simple_CNN_functional()\n",
+    "model.summary()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "num_epochs = 5\n",
+    "model_run = model.fit(\n",
+    "    X_train_prep,\n",
+    "    y_train_onehot,\n",
+    "    epochs=num_epochs,\n",
+    "    batch_size=64,\n",
+    "    validation_data=(X_test_prep, y_test_onehot),\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Toy ResNet\n",
+    "(source: https://keras.io/guides/functional_api/)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from tensorflow.keras.layers import (\n",
+    "    Conv2D,\n",
+    "    Dense,\n",
+    "    Flatten,\n",
+    "    GlobalAveragePooling2D,\n",
+    "    Input,\n",
+    "    MaxPool2D,\n",
+    "    add,\n",
+    ")\n",
+    "from tensorflow.keras.models import Model\n",
+    "\n",
+    "\n",
+    "def toy_ResNet():\n",
+    "\n",
+    "    inputs = Input(shape=(32, 32, 3), name=\"img\")\n",
+    "    x = Conv2D(32, 3, activation=\"relu\")(inputs)\n",
+    "    x = Conv2D(64, 3, activation=\"relu\")(x)\n",
+    "    block_1_output = MaxPool2D(3)(x)\n",
+    "\n",
+    "    x = Conv2D(64, 3, activation=\"relu\", padding=\"same\")(block_1_output)\n",
+    "    x = Conv2D(64, 3, activation=\"relu\", padding=\"same\")(x)\n",
+    "    block_2_output = add([x, block_1_output])\n",
+    "\n",
+    "    x = Conv2D(64, 3, activation=\"relu\", padding=\"same\")(block_2_output)\n",
+    "    x = Conv2D(64, 3, activation=\"relu\", padding=\"same\")(x)\n",
+    "    block_3_output = add([x, block_2_output])\n",
+    "\n",
+    "    x = Conv2D(64, 3, activation=\"relu\")(block_3_output)\n",
+    "    x = GlobalAveragePooling2D()(x)\n",
+    "    x = Dense(256, activation=\"relu\")(x)\n",
+    "    x = Dropout(0.5)(x)\n",
+    "    outputs = Dense(10, \"softmax\")(x)\n",
+    "\n",
+    "    model = Model(inputs, outputs, name=\"toy_resnet\")\n",
+    "\n",
+    "    return model\n",
+    "\n",
+    "\n",
+    "model = toy_ResNet()\n",
+    "model.summary()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from tensorflow.keras.utils import plot_model\n",
+    "\n",
+    "plot_model(model, \"mini_resnet.png\", show_shapes=True)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Copyright (C) 2019-2021 ETH Zurich, SIS ID"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.8"
+  },
+  "latex_envs": {
+   "LaTeX_envs_menu_present": true,
+   "autoclose": false,
+   "autocomplete": true,
+   "bibliofile": "biblio.bib",
+   "cite_by": "apalike",
+   "current_citInitial": 1,
+   "eqLabelWithNumbers": true,
+   "eqNumInitial": 1,
+   "hotkeys": {
+    "equation": "Ctrl-E",
+    "itemize": "Ctrl-I"
+   },
+   "labels_anchors": false,
+   "latex_user_defs": false,
+   "report_style_numbering": false,
+   "user_envs_cfg": false
+  },
+  "toc": {
+   "base_numbering": 1,
+   "nav_menu": {},
+   "number_sections": true,
+   "sideBar": true,
+   "skip_h1_title": true,
+   "title_cell": "Table of Contents",
+   "title_sidebar": "Contents",
+   "toc_cell": false,
+   "toc_position": {},
+   "toc_section_display": true,
+   "toc_window_display": true
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/08_d-neural_networks.ipynb b/08_d-neural_networks.ipynb
new file mode 100644
index 0000000000000000000000000000000000000000..306e64557100303b0d90be45b257da5d48ced56c
--- /dev/null
+++ b/08_d-neural_networks.ipynb
@@ -0,0 +1,495 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# IGNORE THIS CELL WHICH CUSTOMIZES LAYOUT AND STYLING OF THE NOTEBOOK !\n",
+    "from numpy.random import seed\n",
+    "\n",
+    "seed(42)\n",
+    "import tensorflow as tf\n",
+    "\n",
+    "tf.random.set_seed(42)\n",
+    "import matplotlib as mpl\n",
+    "import matplotlib.pyplot as plt\n",
+    "import seaborn as sns\n",
+    "\n",
+    "sns.set(style=\"darkgrid\")\n",
+    "mpl.rcParams[\"lines.linewidth\"] = 3\n",
+    "%matplotlib inline\n",
+    "%config InlineBackend.figure_format = 'retina'\n",
+    "%config IPCompleter.greedy=True\n",
+    "import warnings\n",
+    "\n",
+    "warnings.filterwarnings(\"ignore\", category=FutureWarning)\n",
+    "from IPython.core.display import HTML\n",
+    "\n",
+    "HTML(open(\"custom.html\", \"r\").read())"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Chapter 8d: Introduction to Neural Networks\n",
+    "## Using pre-defined models in TensorFlow"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from tensorflow.keras import applications\n",
+    "\n",
+    "help(applications)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### ImageNet \n",
+    "[ImageNet](http://image-net.org/) is a very large (> 14 million!! images) and easily accessible image database. More than 14 million annotated images indicating the object in the image and more than 1 million images with bounding box information.\n",
+    "\n",
+    "Summary and statistics: http://image-net.org/about-stats\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from tensorflow.keras.applications import VGG16"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "?VGG16"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "model = VGG16(weights=\"imagenet\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "model.summary()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from IPython.display import Image as Img\n",
+    "\n",
+    "Img(filename=\"./images/cutepanda.jpg\", width=600)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from tensorflow.keras.applications.vgg16 import decode_predictions, preprocess_input\n",
+    "from tensorflow.keras.preprocessing.image import img_to_array, load_img\n",
+    "\n",
+    "image = load_img(\"./images/cutepanda.jpg\", target_size=(224, 224))\n",
+    "# convert the image pixels to a numpy array\n",
+    "image = img_to_array(image)\n",
+    "# Prepare data for the model\n",
+    "image = image.reshape((1, image.shape[0], image.shape[1], image.shape[2]))\n",
+    "image = preprocess_input(image)\n",
+    "# prediction of probability of belonging to the output classes\n",
+    "prediction = model.predict(image)\n",
+    "# converting the probabilities to class labels\n",
+    "label = decode_predictions(prediction)\n",
+    "# Top 5 classes\n",
+    "label = label[0]\n",
+    "for pred in label:\n",
+    "    # print the classification\n",
+    "    print(\"It is: {} with probability {:.4f}%\".format(pred[1], pred[2] * 100))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Transfering knowledge"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Recap: Convolutional Neural Networks can be seen as being comprised of 2 parts:\n",
+    "**A feature extractor (convolution , Maxpooling layers) and a classifier part (Dense layers)**\n",
+    "\n",
+    "Different possibilities to work with pre-trained/pre-existing models trained on a very large datasets such as Imagenet:\n",
+    "\n",
+    "* Freezing the convolution part and throwing away the classifer part. Adding your own dense layers and training them.\n",
+    "* Freezing only some layers in the convolution part and throwing away the classifer part. Adding your own dense layers and training the unfreezed and the dense layers.\n",
+    "* Only using the architecture and training the whole network again."
+   ]
+  },
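+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "A minimal sketch of the first option: freeze the pre-trained convolution part and train only a freshly added dense classifier. The input shape and the size of the dense head are illustrative assumptions, not a reference solution:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from tensorflow.keras import layers, models\n",
+    "from tensorflow.keras.applications import VGG16\n",
+    "\n",
+    "# Pre-trained feature extractor; include_top=False throws away the classifier part\n",
+    "conv_base = VGG16(weights=\"imagenet\", include_top=False, input_shape=(96, 96, 3))\n",
+    "conv_base.trainable = False  # freeze all convolution layers\n",
+    "\n",
+    "transfer_model = models.Sequential(\n",
+    "    [\n",
+    "        conv_base,\n",
+    "        layers.Flatten(),\n",
+    "        layers.Dense(256, activation=\"relu\"),  # our own, trainable dense layers\n",
+    "        layers.Dense(1, activation=\"sigmoid\"),\n",
+    "    ]\n",
+    ")\n",
+    "transfer_model.compile(\n",
+    "    loss=\"binary_crossentropy\", optimizer=\"rmsprop\", metrics=[\"accuracy\"]\n",
+    ")\n",
+    "transfer_model.summary()"
+   ]
+  },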
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Realistic example\n",
+    "\n",
+    "### Histopathological Cancer Detection\n",
+    "\n",
+    "https://www.kaggle.com/c/histopathologic-cancer-detection/overview\n",
+    "\n",
+    "**Download data**: https://polybox.ethz.ch/index.php/s/ADUFBnaxbNXAxX4\n",
+    "\n",
+    "Identification of metastatic cancer in small image patches taken from larger digital pathology scans."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%matplotlib inline\n",
+    "# Plotting a few images from this dataset\n",
+    "import os\n",
+    "\n",
+    "import matplotlib.pyplot as plt\n",
+    "import numpy as np\n",
+    "from numpy import random\n",
+    "from PIL import Image\n",
+    "\n",
+    "random.seed(42)\n",
+    "import tensorflow as tf\n",
+    "\n",
+    "tf.random.set_seed(42)\n",
+    "\n",
+    "\n",
+    "def plot_data(samples, top_dir):\n",
+    "    sub_directories = [\"benign\", \"malign\"]\n",
+    "    fig, ax = plt.subplots(\n",
+    "        len(sub_directories),\n",
+    "        samples,\n",
+    "        sharex=True,\n",
+    "        sharey=True,\n",
+    "        figsize=(3 * samples, 3 * len(sub_directories)),\n",
+    "    )\n",
+    "    labels = [\"0\", \"1\"]\n",
+    "    assert len(sub_directories) == 2\n",
+    "    for i in range(samples):\n",
+    "        for j, k in enumerate(sub_directories):\n",
+    "            tmp = os.path.join(top_dir, k)\n",
+    "            tmp_img = Image.open(os.path.join(tmp, random.choice(os.listdir(tmp))))\n",
+    "            ax[j, i].imshow(np.asarray(tmp_img))\n",
+    "            ax[j, i].set_title(\"{}: label={}\".format(k, j))\n",
+    "            ax[j, i].grid(False)\n",
+    "\n",
+    "\n",
+    "data_dir = \"PATH_TO_histopathologic_cancer_detection_FOLDER\"\n",
+    "plot_data(4, os.path.join(data_dir, \"train\"))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Data preprocessing\n",
+    "from tensorflow.keras.preprocessing.image import ImageDataGenerator\n",
+    "\n",
+    "train_data = ImageDataGenerator(rescale=1 / 255.0)\n",
+    "\n",
+    "train_directory = os.path.join(data_dir, \"train\")\n",
+    "train_data_generator = train_data.flow_from_directory(\n",
+    "    train_directory, target_size=(96, 96), batch_size=256, class_mode=\"binary\"\n",
+    ")\n",
+    "\n",
+    "validation_data = ImageDataGenerator(rescale=1 / 255.0)\n",
+    "validation_directory = os.path.join(data_dir, \"validation\")\n",
+    "validation_data_generator = validation_data.flow_from_directory(\n",
+    "    validation_directory, target_size=(96, 96), batch_size=256, class_mode=\"binary\"\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import matplotlib.pyplot as plt\n",
+    "import numpy as np\n",
+    "from tensorflow.keras import layers, models, optimizers\n",
+    "from tensorflow.keras.callbacks import ModelCheckpoint, ReduceLROnPlateau"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from tensorflow.keras.applications import VGG16"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "feature_extractor = VGG16(weights=None, include_top=False, input_shape=(96, 96, 3))\n",
+    "# feature_extractor = MobileNetV2(weights=None, include_top=False, input_shape=(96,96,3))\n",
+    "feature_extractor.summary()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "model = models.Sequential()\n",
+    "model.add(feature_extractor)\n",
+    "model.add(layers.Flatten())\n",
+    "model.add(layers.Dropout(0.2))\n",
+    "model.add(layers.Dense(512, activation=\"relu\"))\n",
+    "model.add(layers.Dense(1, activation=\"sigmoid\"))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "model.summary()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "model.compile(\n",
+    "    optimizer=optimizers.RMSprop(lr=0.0001),\n",
+    "    loss=\"binary_crossentropy\",\n",
+    "    metrics=[\"accuracy\"],\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "num_epochs = 10\n",
+    "reduce_lr = ReduceLROnPlateau(\n",
+    "    monitor=\"val_loss\", factor=0.2, patience=2, min_lr=0.000001\n",
+    ")\n",
+    "mcp_save = ModelCheckpoint(\"./test/\", save_freq=\"epoch\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# CPU times: user 1h 21min 11s, sys: 17min 41s, total: 1h 38min 53s\n",
+    "# Wall time: 1h 58min 20s wo dropout\n",
+    "model_run = model.fit(\n",
+    "    train_data_generator,\n",
+    "    steps_per_epoch=len(train_data_generator),\n",
+    "    epochs=num_epochs,\n",
+    "    validation_data=validation_data_generator,\n",
+    "    validation_steps=len(validation_data_generator),\n",
+    "    callbacks=[reduce_lr, mcp_save],\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pickle\n",
+    "\n",
+    "# with open(\"./data/histopathology_run_history\", \"wb\") as filehandler:\n",
+    "#    pickle.dump(model_run.history, filehandler)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "history_file = open(\"./data/histopathology_run_history\", \"rb\")\n",
+    "history = pickle.load(history_file)\n",
+    "num_epochs = 10\n",
+    "plt.plot(\n",
+    "    np.arange(0, num_epochs),\n",
+    "    history[\"val_accuracy\"],\n",
+    "    label=\"Validation accuracy\",\n",
+    ")\n",
+    "plt.plot(np.arange(0, num_epochs), history[\"accuracy\"], label=\"Train accuracy\")\n",
+    "plt.xlabel(\"epoch\")\n",
+    "plt.ylabel(\"Accuracy\")\n",
+    "plt.legend()\n",
+    "plt.ylim([0.6, 1])\n",
+    "plt.grid()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Data Augmentation\n",
+    "train_data = ImageDataGenerator(\n",
+    "    rescale=1 / 255.0,\n",
+    "    rotation_range=90,\n",
+    "    width_shift_range=0.0,\n",
+    "    height_shift_range=0.0,\n",
+    "    shear_range=0.1,\n",
+    "    horizontal_flip=True,\n",
+    "    fill_mode=\"nearest\",\n",
+    ")\n",
+    "# Visualizing what our data generator is doing\n",
+    "# Choosing an image randomly\n",
+    "from numpy import random\n",
+    "\n",
+    "pic_malignant = np.asarray(\n",
+    "    Image.open(\n",
+    "        train_directory\n",
+    "        + \"/malign/\"\n",
+    "        + random.choice(os.listdir(train_directory + \"/malign/\"))\n",
+    "    )\n",
+    ")\n",
+    "fig, ax = plt.subplots(1, 8, sharex=True, sharey=True, figsize=(3 * 8, 3))\n",
+    "ax = ax.flatten()\n",
+    "ax[0].imshow(pic_malignant)\n",
+    "ax[0].grid(False)\n",
+    "pic_malignant = pic_malignant[np.newaxis, :]\n",
+    "for i, img in enumerate(train_data.flow(pic_malignant)):\n",
+    "    ax[i + 1].imshow(img[0])\n",
+    "    ax[i + 1].grid(False)\n",
+    "    if i == 6:\n",
+    "        break"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## TensorFlow Hub\n",
+    "\n",
+    "A great repository of trained machine learning models!\n",
+    "\n",
+    "The models can be downloaded and used with just a few lines of code.\n",
+    "\n",
+    "Find models here: https://tfhub.dev/"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!pip install --upgrade tensorflow_hub"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import tensorflow_hub as hub"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "layer = hub.KerasLayer(\n",
+    "    \"https://tfhub.dev/google/imagenet/resnet_v2_50/classification/4\", trainable=True\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from tensorflow.keras.models import Sequential\n",
+    "\n",
+    "model = Sequential([layer])\n",
+    "model.build([None, 224, 224, 3])\n",
+    "model.summary()"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.8"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/data/histopathology_run_history b/data/histopathology_run_history
new file mode 100644
index 0000000000000000000000000000000000000000..0fb5d259a1127d291ae3caf2eb9fe9a694f98373
Binary files /dev/null and b/data/histopathology_run_history differ
diff --git a/images/cutepanda.jpg b/images/cutepanda.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..46aec7f7277439e0d377a65884cc522123b98352
Binary files /dev/null and b/images/cutepanda.jpg differ