Commit 3f8667c3 authored by schmittu

Merge branch '50-update-installation-instructions-2' into 'master'

Resolve "Update installation instructions"

Closes #50

See merge request sis/courses/machinelearning-introduction-workshop!47
parents 824f8a93 b4e40cbc
%% Cell type:code id: tags:
``` python
# IGNORE THIS CELL WHICH CUSTOMIZES LAYOUT AND STYLING OF THE NOTEBOOK !
from numpy.random import seed
import os, sys
if sys.platform == "win32":
    os.add_dll_directory(os.path.dirname(sys.executable))
seed(42)
import tensorflow as tf
tf.random.set_seed(42)
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
sns.set(style="darkgrid")
mpl.rcParams['lines.linewidth'] = 3
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
%config IPCompleter.greedy=True
import warnings
warnings.filterwarnings('ignore', category=FutureWarning)
from IPython.core.display import HTML; HTML(open("custom.html", "r").read())
```
%% Cell type:markdown id: tags:
# Chapter 8: Introduction to Neural Networks
<img src="./images/3042en.jpg" title="made at imgflip.com" width=35%/>
%% Cell type:markdown id: tags:
## Brief history of neural networks
| | |
| ----------- | ----------- |
| 1943 | Threshold Logic |
| 1940s | Hebbian Learning |
| 1958 | Perceptron |
| 1980s | Neocognitron |
| 1982 | Hopfield Network |
| 1989 | Convolutional neural network (CNN) kernels trained via backpropagation |
| 1997 | Long short-term memory (LSTM) model |
| 1998 | LeNet-5 |
| 2014 | Gated Recurrent Units (GRU), Generative Adversarial Networks (GAN) |
| 2015 | ResNet |
%% Cell type:markdown id: tags:
## Why the boom now?
* Data
* Data
* Data
* Availability of Graphic Processing Units (GPUs)
* Algorithmic developments which allow for efficient training of networks and for making them deeper
* Development of high-level libraries/APIs has made the field much more accessible than it was a decade ago
%% Cell type:markdown id: tags:
## Feed-Forward neural network
<center>
<figure>
<img src="./images/neuralnets/neural_net_ex.svg" width="700"/>
<figcaption>A three-layer densely connected neural network (by convention the input layer is not counted).</figcaption>
</figure>
</center>
%% Cell type:markdown id: tags:
## Building blocks
### Perceptron
The smallest unit of a neural network is a **perceptron**-like node.
**What is a Perceptron?**
It is a simple function which can have multiple inputs and has a single output.
<center>
<figure>
<img src="./images/neuralnets/perceptron_ex.svg" width="400"/>
<figcaption>A simple perceptron with three inputs and one output.</figcaption>
</figure>
</center>
It works as follows:
Step 1: A **weighted sum** of the inputs is calculated
\begin{equation*}
weighted\_sum = w_{1} x_{1} + w_{2} x_{2} + w_{3} x_{3} + ...
\end{equation*}
Step 2: A **step** activation function is applied
$$
f = \left\{
\begin{array}{ll}
0 & \quad weighted\_sum < threshold \\
1 & \quad weighted\_sum \geq threshold
\end{array}
\right.
$$
You can see that this is also a linear classifier, like the ones we introduced in script 02.
%% Cell type:code id: tags:
``` python
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
```
%% Cell type:code id: tags:
``` python
# Plotting the step function
x = np.arange(-2,2.1,0.01)
y = np.zeros(len(x))
threshold = 0.
y[x>threshold] = 1.
step_plot = sns.lineplot(x, y).set_title('Step function') ;
plt.xlabel('weighted_sum') ;
plt.ylabel('f(weighted_sum)') ;
```
%% Cell type:code id: tags:
``` python
def perceptron(X, w, threshold=1):
    # This function computes sum(w_i*x_i) and
    # applies a perceptron activation
    linear_sum = np.dot(np.asarray(X).T, w)
    output = np.zeros(len(linear_sum), dtype=np.int8)
    output[linear_sum >= threshold] = 1
    return output
```
%% Cell type:markdown id: tags:
Boolean AND
| $x_1$ | $x_2$ | output |
| --- | --- | --- |
| 0 | 0 | 0 |
| 1 | 0 | 0 |
| 0 | 1 | 0 |
| 1 | 1 | 1 |
%% Cell type:code id: tags:
``` python
# Calculating Boolean AND using a perceptron
threshold = 1.5
# (w1, w2)
w = [1, 1]
# (x1, x2) pairs
x1 = [0, 1, 0, 1]
x2 = [0, 0, 1, 1]
# Calling the perceptron function
output = perceptron([x1, x2], w, threshold)
for i in range(len(output)):
    print("Perceptron output for x1, x2 = ", x1[i], ",", x2[i],
          " is ", output[i])
```
%% Cell type:markdown id: tags:
In this simple case we can rewrite our equation to $x_2 = \frac{threshold - w_1 x_1}{w_2}$ which describes a line in 2D:
%% Cell type:code id: tags:
``` python
def perceptron_DB(x1, x2, w, threshold):
    # Plotting the decision boundary of the perceptron
    plt.scatter(x1, x2, color="black")
    plt.xlim(-1, 2)
    plt.ylim(-1, 2)
    # The decision boundary is a line given by
    # w_1*x_1+w_2*x_2-threshold=0
    x1 = np.arange(-3, 4)
    x2 = (threshold - x1*w[0])/w[1]
    sns.lineplot(x1, x2, **{"color": "black"})
    plt.xlabel("x$_1$", fontsize=16)
    plt.ylabel("x$_2$", fontsize=16)
    # Coloring the regions
    pts_tmp = np.arange(-2, 2.1, 0.02)
    points = np.array(np.meshgrid(pts_tmp, pts_tmp)).T.reshape(-1, 2)
    outputs = perceptron(points.T, w, threshold)
    plt.plot(points[:, 0][outputs == 0], points[:, 1][outputs == 0],
             "o",
             color="steelblue",
             markersize=1,
             alpha=0.04,
             )
    plt.plot(points[:, 0][outputs == 1], points[:, 1][outputs == 1],
             "o",
             color="chocolate",
             markersize=1,
             alpha=0.04,
             )
    plt.title("Blue color = 0 and Chocolate = 1")
%% Cell type:code id: tags:
``` python
# Plotting the perceptron decision boundary
perceptron_DB(x1, x2, w, threshold)
```
%% Cell type:markdown id: tags:
### Exercise section
* Compute a Boolean "OR" using a perceptron
Hint: copy the code from the "AND" example and edit the weights and/or threshold
%% Cell type:markdown id: tags:
Boolean OR
| $x_1$ | $x_2$ | output |
| --- | --- | --- |
| 0 | 0 | 0 |
| 1 | 0 | 1 |
| 0 | 1 | 1 |
| 1 | 1 | 1 |
%% Cell type:code id: tags:
``` python
# Calculating Boolean OR using a perceptron
# Enter code here
```
%% Cell type:code id: tags:solution
``` python
# Solution
# Calculating Boolean OR using a perceptron
threshold=0.6
# (w1, w2)
w=[1,1]
# (x1, x2) pairs
x1 = [0, 1, 0, 1]
x2 = [0, 0, 1, 1]
output = perceptron([x1, x2], w, threshold)
for i in range(len(output)):
    print("Perceptron output for x1, x2 = ", x1[i], ",", x2[i],
          " is ", output[i])
perceptron_DB(x1, x2, w, threshold)
```
%% Cell type:markdown id: tags:
### Exercise section
* Create a NAND gate using a perceptron
Boolean NAND
| $x_1$ | $x_2$ | output |
| --- | --- | --- |
| 0 | 0 | 1 |
| 1 | 0 | 1 |
| 0 | 1 | 1 |
| 1 | 1 | 0 |
%% Cell type:code id: tags:
``` python
# Calculating Boolean NAND using a perceptron
# Enter code here
```
%% Cell type:code id: tags:solution
``` python
# Solution
# Calculating Boolean NAND using a perceptron
import matplotlib.pyplot as plt
threshold=-1.5
# (w1, w2)
w=[-1,-1]
# (x1, x2) pairs
x1 = [0, 1, 0, 1]
x2 = [0, 0, 1, 1]
output = perceptron([x1, x2], w, threshold)
for i in range(len(output)):
    print("Perceptron output for x1, x2 = ", x1[i], ",", x2[i],
          " is ", output[i])
perceptron_DB(x1, x2, w, threshold)
```
%% Cell type:markdown id: tags:
In fact, a single perceptron can compute "AND", "OR" and "NOT" boolean functions.
However, it cannot compute some other boolean functions such as "XOR".
**WHAT CAN WE DO?**
Hint: Think about the significance of the NAND gate we created above.
%% Cell type:markdown id: tags:
### Multi-layer perceptrons
Answer: We said a single perceptron can't compute a "XOR" function. We didn't say that about **multiple Perceptrons** put together.
The normal densely connected neural network is sometimes also called "Multi-layer" perceptron.
**XOR function using multiple perceptrons**
<center>
<figure>
<img src="./images/neuralnets/perceptron_XOR.svg" width="400"/>
<figcaption>Multiple perceptrons connected together to output a XOR function.</figcaption>
</figure>
</center>
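As an illustration (a sketch, not the only possible wiring), the `perceptron` function defined above can be chained: an OR gate and a NAND gate feed into an AND gate, which together compute XOR.
%% Cell type:code id: tags:
``` python
# Sketch: XOR built by chaining the perceptron function from above
# XOR(x1, x2) = AND( OR(x1, x2), NAND(x1, x2) )
x1 = [0, 1, 0, 1]
x2 = [0, 0, 1, 1]
or_out = perceptron([x1, x2], w=[1, 1], threshold=0.6)
nand_out = perceptron([x1, x2], w=[-1, -1], threshold=-1.5)
xor_out = perceptron([or_out, nand_out], w=[1, 1], threshold=1.5)
for i in range(len(xor_out)):
    print("XOR output for x1, x2 = ", x1[i], ",", x2[i], " is ", xor_out[i])
```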
%% Cell type:markdown id: tags:
## Learning
We know that we can compute complicated functions by combining a number of perceptrons.
In the perceptron examples we had set the model parameters (weights and threshold) by hand.
This is something we definitely **DO NOT** want to do or even can do for big networks.
We want some algorithm to set/learn the model parameters for us!
<div class="alert alert-block alert-warning">
<i class="fa fa-info-circle"></i>&nbsp; <strong>Threshold -> bias</strong>
Before we go further we need to introduce one change. The threshold which we saw in the step activation function above is moved to the left side of the equation and is called **bias**.
$$
f = \left\{
\begin{array}{ll}
0 & \quad weighted\_sum + bias < 0 \\
1 & \quad weighted\_sum + bias \geq 0
\end{array}
\right.
\quad \quad \mathrm{where} \; bias = -threshold
$$
</div>
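As a small illustration (the helper name `perceptron_bias` is ours, not from the script), the perceptron defined earlier can be rewritten with a bias instead of a threshold:
%% Cell type:code id: tags:
``` python
# Sketch: the perceptron from above, rewritten with a bias instead of a threshold
import numpy as np

def perceptron_bias(X, w, bias=0.0):
    # computes sum(w_i * x_i) + bias and applies the step activation at 0
    linear_sum = np.dot(np.asarray(X).T, w) + bias
    output = np.zeros(len(linear_sum), dtype=np.int8)
    output[linear_sum >= 0] = 1
    return output

# Boolean AND as before: threshold 1.5 becomes bias -1.5
print(perceptron_bias([[0, 1, 0, 1], [0, 0, 1, 1]], w=[1, 1], bias=-1.5))
```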
%% Cell type:markdown id: tags:
In order to algorithmically set/learn the weights and bias we need to choose an appropriate loss function for the problem at hand and solve an optimization problem.
We will explain below what this means.
### Loss function
To learn using an algorithm we need to define a quantity/function which allows us to measure how close or far are the predictions of our network/setup from reality or the supplied labels. This is done by choosing a so-called "Loss function" (as in the case for other machine learning algorithms).
Once we have this function, we need an algorithm to update the weights of the network such that this loss function decreases.
As one can already imagine the choice of an appropriate loss function is critical to the success of the model.
Fortunately, for classification and regression (which cover a large variety of problems) these loss functions are well known.
**Crossentropy** and **mean squared error** loss functions are often used for standard classification and regression problems, respectively.
<div class="alert alert-block alert-warning">
<i class="fa fa-info-circle"></i>&nbsp; As we have seen before, <strong>mean squared error</strong> is defined as
$$
\frac{1}{n} \left((y_1 - \hat{y}_1)^2 + (y_2 - \hat{y}_2)^2 + ... + (y_n - \hat{y}_n)^2 \right)
$$
</div>
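As a small numeric illustration (with made-up labels and predictions), both losses can be computed directly with NumPy:
%% Cell type:code id: tags:
``` python
import numpy as np

# Mean squared error for a (hypothetical) regression example
y_true = np.array([1.0, 2.0, 3.0])
y_pred = np.array([1.1, 1.8, 3.5])
mse = np.mean((y_true - y_pred) ** 2)   # (0.01 + 0.04 + 0.25) / 3 = 0.1
print("mean squared error:", mse)

# Binary crossentropy for a (hypothetical) classification example
y_cls = np.array([1, 0, 1])              # true classes
p_cls = np.array([0.9, 0.2, 0.6])        # predicted probabilities for class 1
bce = -np.mean(y_cls * np.log(p_cls) + (1 - y_cls) * np.log(1 - p_cls))
print("binary crossentropy:", bce)
```
%% Cell type:markdown id: tags: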
### Gradient based learning
As mentioned above, once we have chosen a loss function, we want to solve an **optimization problem** which minimizes this loss by updating the parameters (weights and biases) of the network. This is how learning takes place in a NN; the "knowledge" is stored in the weights and biases.
The most popular optimization methods used in Neural Network training are **Gradient-descent (GD)** type methods, such as gradient-descent itself, RMSprop and Adam.
**Gradient-descent** uses partial derivatives of the loss function with respect to the network weights and a learning rate to update the weights such that the loss function decreases and, after some iterations, reaches its (global) minimum value.
First, the loss function and its derivative are computed at the output node, and this signal is propagated backwards, using the chain rule, in the network to compute the partial derivatives. Hence, this method is called **Backpropagation**.
One way to perform a single GD pass is to compute the partial derivatives using **all the samples** in our data, computing average derivatives and using them to update the weights. This is called **Batch gradient descent**. However, in deep learning we mostly work with massive datasets and using batch gradient descent can make the training very slow!
The other extreme is to randomly shuffle the dataset and advance a pass of GD with the gradients computed using only **one sample** at a time. This is called **Stochastic gradient descent**.
<center>
<figure>
<img src="./images/stochastic-vs-batch-gradient-descent.png" width="600"/>
<figcaption>Source: <a href="https://wikidocs.net/3413">https://wikidocs.net/3413</a></figcaption>
</figure>
</center>
In practice, an approach in-between these two is used. The entire dataset is divided into **m batches** and these are used one by one to compute the derivatives and apply GD. This technique is called **Mini-batch gradient descent**.
<div class="alert alert-block alert-warning">
<p><i class="fa fa-warning"></i>&nbsp;
One pass through the entire training dataset is called 1 epoch of training.
</p>
</div>
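To make this concrete, here is a minimal sketch (synthetic data, a single linear neuron and a hand-written update rule, all hypothetical) of mini-batch gradient descent minimizing a mean squared error loss:
%% Cell type:code id: tags:
``` python
# Sketch: mini-batch gradient descent for a single linear neuron y_hat = w*x + b
import numpy as np

rng = np.random.default_rng(0)
x = rng.uniform(-1, 1, size=200)
y = 3 * x + 1 + rng.normal(0, 0.1, size=200)   # data generated from y = 3x + 1 plus noise

w, b = 0.0, 0.0          # parameters to learn
learning_rate = 0.1
batch_size = 20

for epoch in range(100):                   # one epoch = one pass through the data
    order = rng.permutation(len(x))        # shuffle before each epoch
    for start in range(0, len(x), batch_size):
        idx = order[start:start + batch_size]
        error = w * x[idx] + b - y[idx]
        grad_w = 2 * np.mean(error * x[idx])   # partial derivative of MSE w.r.t. w
        grad_b = 2 * np.mean(error)            # partial derivative of MSE w.r.t. b
        w -= learning_rate * grad_w
        b -= learning_rate * grad_b

print("learned w and b:", w, b)   # should end up close to 3 and 1
```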
%% Cell type:code id: tags:
``` python
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
plt.figure(figsize=(10, 4)) ;
pts=np.arange(-20,20, 0.1) ;
```
%% Cell type:markdown id: tags:
### Activation Functions
In order to train the network we need to move away from the Perceptron's **step** activation function: among other drawbacks, its gradient is zero almost everywhere, so it cannot be used with the gradient-descent and backpropagation algorithms.
Non-linear functions such as:
* Sigmoid
\begin{equation*}
f(z) = \frac{1}{1+e^{-z}} \quad \quad \mathrm{where}, z = weighted\_sum + bias
\end{equation*}
%% Cell type:code id: tags:
``` python
sns.lineplot(pts, 1/(1+np.exp(-pts))) ;
```
%% Cell type:markdown id: tags:
* tanh
\begin{equation*}
f(z) = \frac{e^{z} - e^{-z}}{e^{z} + e^{-z}}\quad \quad \mathrm{where}, z = weighted\_sum + bias
\end{equation*}
%% Cell type:code id: tags:
``` python
sns.lineplot(pts, np.tanh(pts*np.pi)) ;
```
%% Cell type:markdown id: tags:
* **ReLU (Rectified linear unit)**
\begin{equation*}
f(z) = \mathrm{max}(0,z) \quad \quad \mathrm{where}, z = weighted\_sum + bias
\end{equation*}
%% Cell type:code id: tags:
``` python
pts_relu=[max(0,i) for i in pts];
plt.plot(pts, pts_relu) ;
```
%% Cell type:markdown id: tags:
<div class="alert alert-block alert-warning">
<p><i class="fa fa-warning"></i>&nbsp;
ReLU is very popular and is widely used nowadays. There exist several other variations of ReLU, e.g. "leaky ReLU", "ELU".
</p>
</div>
%% Cell type:markdown id: tags:
* **Leaky ReLU**
\begin{equation*}
f(z) = \mathrm{max}(\alpha z,z) \quad \quad \mathrm{where}, z = weighted\_sum + bias \text{ and } \alpha \text{ (generally) } = 0.01
\end{equation*}
%% Cell type:code id: tags:
``` python
alpha=0.1 # Large alpha chosen for plotting purposes
pts_leakyrelu=[max(alpha*i,i) for i in pts];
plt.plot(pts, pts_leakyrelu) ;
plt.xlim(-5,5);
plt.ylim(-1,5);
```
%% Cell type:markdown id: tags:
* **ELU (Exponential linear unit)**
\begin{equation*}
f(z) =
\begin{cases}
\alpha(\exp(z)-1) & z<0\\
z & z \geq 0
\end{cases},
\quad \quad \mathrm{where}, z = weighted\_sum + bias \text{ and } \alpha \text{ (generally) } = 1
\end{equation*}
%% Cell type:code id: tags:
``` python
import math
alpha=1
pts_elu=[alpha*(math.exp(i)-1) if i<0 else i for i in pts]
plt.plot(pts, pts_elu) ;
plt.xlim(-5,5);
plt.ylim(-2,5);
```
%% Cell type:markdown id: tags:
are some of the commonly used activation functions. Such non-linear activation functions allow the network to learn complex representations of the data.
%% Cell type:markdown id: tags:
<div class="alert alert-block alert-info">
<p><i class="fa fa-warning"></i>&nbsp;
Why don't we just use a simple linear activation function?
Linear activations are **NOT** used because it can be shown mathematically that if they are used, the output of the whole network is just a linear function of the input, no matter how many hidden layers we add. So we cannot learn interesting and complex functions this way.
The only exception when we do want to use a linear activation is for the output layer of a network when solving a regression problem.
</p>
</div>
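A quick numerical check of this statement (with random, hypothetical weights): two stacked layers with linear activations collapse into a single linear layer.
%% Cell type:code id: tags:
``` python
# Sketch: stacking linear layers without a non-linearity is still a linear map
# W2 @ (W1 @ x + b1) + b2  ==  (W2 @ W1) @ x + (W2 @ b1 + b2)
import numpy as np

rng = np.random.default_rng(0)
W1, b1 = rng.normal(size=(4, 2)), rng.normal(size=4)
W2, b2 = rng.normal(size=(1, 4)), rng.normal(size=1)
x = rng.normal(size=2)

two_layers = W2 @ (W1 @ x + b1) + b2
one_layer = (W2 @ W1) @ x + (W2 @ b1 + b2)
print(np.allclose(two_layers, one_layer))   # True
```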
%% Cell type:markdown id: tags:
### Exercise section - Google Playground
A great tool from Google to develop a feeling for the workings of neural networks.
https://playground.tensorflow.org/
<img src="./images/neuralnets/google_playground.png"/>
**Walkthrough by instructor**
Some concepts to look at:
* Simple vs Complex models (Effect of network size)
* Optimization results
* Effect of activation functions
%% Cell type:markdown id: tags:
## Introduction to TensorFlow (keras api)
%% Cell type:markdown id: tags:
### A bit about Keras
* It is a high-level API to create and work with neural networks
* Up to release 2.3.0 it supported multiple backends such as **TensorFlow** from Google, **Theano** (no longer developed) and **CNTK** (Microsoft Cognitive Toolkit)
* Very good for creating neural nets quickly and hides away a lot of tedious work
* Has been incorporated into official TensorFlow (where it naturally works only with the TensorFlow backend), and as of TensorFlow 2.0 it is the main API to use it
%% Cell type:markdown id: tags:
<center>
<figure>
<img src="./images/neuralnets/neural_net_keras_1.svg" width="700"/>
<figcaption>Building this model in Keras</figcaption>
</figure>
</center>
%% Cell type:code id: tags:
``` python
# Say hello to Tensorflow
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation
# Creating a model
model = Sequential()
# Adding layers to this model
# 1st Hidden layer
# A Dense/fully-connected layer which takes as input a
# feature array of shape (samples, num_features)
# Here input_shape = (2,) means that the layer expects an input with num_features = 2
# and the sample size could be anything
# The activation function for this layer is set to "relu"
model.add(Dense(units=4, input_shape=(2,), activation="relu"))
# 2nd Hidden layer
# This is also a fully-connected layer and we do not need to specify the
# shape of the input anymore (We need to do that only for the first layer)
# NOTE: Here we didn't add the activation separately. Instead we just added it
# while calling Dense(). This and the way used for the first layer are equivalent!
model.add(Dense(units=4, activation="relu"))
# The output layer
model.add(Dense(units=1))
model.add(Activation("sigmoid"))
model.summary()
```
%% Cell type:markdown id: tags:
### XOR using neural networks
%% Cell type:code id: tags:
``` python
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
import numpy as np
```
%% Cell type:code id: tags:
``` python
# Creating a network to solve the XOR problem
# Loading and plotting the data
xor = pd.read_csv("data/xor.csv")
# Using x and y coordinates as features
features = xor.iloc[:, :-1]
# Convert boolean to integer values (True->1 and False->0)
labels = (1-xor.iloc[:, -1].astype(int))
colors = [["steelblue", "chocolate"][i] for i in labels]
plt.figure(figsize=(5, 5))
plt.xlim([-2, 2])
plt.ylim([-2, 2])
plt.title("Blue points are False")
plt.scatter(features["x"], features["y"], color=colors, marker="o") ;
```
%% Cell type:code id: tags:
``` python
# Building a simple Tensorflow model
def a_simple_NN():
    model = Sequential()
    model.add(Dense(4, input_shape=(2,), activation="relu"))
    model.add(Dense(4, activation="relu"))
    model.add(Dense(1, activation="sigmoid"))
    model.compile(loss="binary_crossentropy", optimizer="rmsprop", metrics=["accuracy"])
    return model
```
%% Cell type:code id: tags:
``` python
# Instantiating the model
model = a_simple_NN()
# Splitting the dataset into training (70%) and validation sets (30%)
X_train, X_test, y_train, y_test = train_test_split(
features, labels, test_size=0.3)
# Setting the number of passes through the entire training set
num_epochs = 300
# model.fit() is used to train the model
# We can pass validation data while training
model_run = model.fit(X_train, y_train, epochs=num_epochs,
validation_data=(X_test, y_test))
```
%% Cell type:markdown id: tags:
<div class="alert alert-block alert-info"><p><i class="fa fa-info-circle"></i>&nbsp;
NOTE: We can pass "verbose=0" to model.fit() to suppress the printing of model output on the terminal/notebook.
</p></div>
%% Cell type:code id: tags:
``` python
# Plotting the loss and accuracy on the training and validation sets during the training
# This can be done by using Keras callback "history" which is applied by default
history_model = model_run.history
print("The history has the following data: ", history_model.keys())
# Plotting the training and validation accuracy during the training
sns.lineplot(np.arange(1, num_epochs+1), history_model["accuracy"], color = "blue", label="Training set") ;
sns.lineplot(np.arange(1, num_epochs+1), history_model["val_accuracy"], color = "red", label="Valdation set") ;
plt.xlabel("epochs") ;
plt.ylabel("accuracy") ;
```
%% Cell type:markdown id: tags:
<div class="alert alert-block alert-warning">
<p><i class="fa fa-warning"></i>&nbsp;
The plots such as above are essential for analyzing the behaviour and performance of the network and to tune it in the right direction. However, for the example above we don't expect to derive a lot of insight from this plot as the function we are trying to fit is quite simple and there is not too much noise. We will see the significance of these curves in a later example.
</p>
</div>
%% Cell type:code id: tags:
``` python
# Before we move on, let's see how to save and load a Keras model
model.save("./data/my_first_NN.h5")
# Optional: See what is in the hdf5 file we just created above
from tensorflow.keras.models import load_model
model = load_model("./data/my_first_NN.h5")
```
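%% Cell type:markdown id: tags:
If you are curious what the saved file contains, one way to peek inside (assuming the `h5py` package is available, which TensorFlow installs as a dependency) is to list the groups and datasets stored in the HDF5 file:
%% Cell type:code id: tags:
``` python
# Optional sketch: list what is stored inside the saved HDF5 model file
import h5py

with h5py.File("./data/my_first_NN.h5", "r") as f:
    f.visit(print)   # prints the name of every group/dataset in the file
```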
%% Cell type:markdown id: tags:
For the training and validation in the example above we split our dataset into a 70-30 train-validation set. We know from previous chapters that to more robustly estimate the accuracy of our model we can use **K-fold cross-validation**.
This is even more important when we have small datasets and cannot afford to reserve a validation set!
One way to do the cross-validation here would be to write our own function to do this. However, we also know that **scikit-learn** provides several handy functions to evaluate and tune the models. So the question is:
<div class="alert alert-block alert-warning">
<p><i class="fa fa-warning"></i>&nbsp;
Can we somehow use the scikit-learn functions or the ones we wrote ourselves for scikit-learn models to evaluate and tune our Keras models?
The Answer is **YES !**
</p>
</div>
We show how to do this in the following section.
%% Cell type:markdown id: tags:
## Using scikit-learn functions on keras models
<div class="alert alert-block alert-warning">
<p><i class="fa fa-warning"></i>&nbsp;
Keras offers 2 wrappers which allow its Sequential models to be used with scikit-learn.
These are: **KerasClassifier** and **KerasRegressor**.
For more information:
https://keras.io/scikit-learn-api/
</p>
</div>
**Now let's see how this works!**
%% Cell type:code id: tags:
``` python
# We wrap the Keras model we created above with KerasClassifier
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import cross_val_score
# Wrapping Keras model
# NOTE: We pass verbose=0 to suppress the model output
num_epochs = 400
model_scikit = KerasClassifier(
build_fn=a_simple_NN, epochs=num_epochs, verbose=0)
```
%% Cell type:code id: tags:
``` python
# Let's reuse the function to visualize the decision boundary which we saw in chapter 2 with minimal change
def list_flatten(list_of_list):
    flattened_list = [i for j in list_of_list for i in j]
    return flattened_list


def plot_points(plt=plt, marker='o'):
    colors = [["steelblue", "chocolate"][i] for i in labels]
    plt.scatter(features.iloc[:, 0], features.iloc[:, 1], color=colors, marker=marker);


def train_and_plot_decision_surface(
    name, classifier, features_2d, labels, preproc=None, plt=plt, marker='o', N=400
):
    features_2d = np.array(features_2d)
    xmin, ymin = features_2d.min(axis=0)
    xmax, ymax = features_2d.max(axis=0)
    x = np.linspace(xmin, xmax, N)
    y = np.linspace(ymin, ymax, N)
    points = np.array(np.meshgrid(x, y)).T.reshape(-1, 2)
    if preproc is not None:
        points_for_classifier = preproc.fit_transform(points)
        features_2d = preproc.fit_transform(features_2d)
    else:
        points_for_classifier = points
    classifier.fit(features_2d, labels, verbose=0)
    if name == "Neural Net":
        #predicted = classifier.predict(features_2d)
        #predicted = list_flatten(predicted)
        predicted = list_flatten((classifier.predict(features_2d) > 0.5).astype("int32"))
    #else:
    #    predicted = classifier.predict(features_2d)
    if preproc is not None:
        name += " (w/ preprocessing)"
    print(name + ":\t", sum(predicted == labels), "/", len(labels), "correct")
    if name == "Neural Net":
        #classes = np.array(list_flatten(classifier.predict(points_for_classifier)), dtype=bool)
        classes = np.array(list_flatten((classifier.predict(points_for_classifier) > 0.5).astype("int32")), dtype=bool)
    #else:
    #    classes = np.array(classifier.predict(points_for_classifier), dtype=bool)
    plt.plot(
        points[~classes][:, 0],
        points[~classes][:, 1],
        "o",
        color="steelblue",
        markersize=1,
        alpha=0.01,
    )
    plt.plot(
        points[classes][:, 0],
        points[classes][:, 1],
        "o",
        color="chocolate",
        markersize=1,
        alpha=0.04,
    )
```
%% Cell type:code id: tags:
``` python
_, ax = plt.subplots(figsize=(6, 6))
train_and_plot_decision_surface("Neural Net", model_scikit, features, labels, plt=ax)
plot_points(plt=ax)
```
%% Cell type:code id: tags:
``` python
# Applying K-fold cross-validation
# Here we pass the whole dataset, i.e. features and labels, instead of splitting it.
num_folds = 5
cross_validation = cross_val_score(
model_scikit, features, labels, cv=num_folds, verbose=0)
print("The acuracy on the ", num_folds, " validation folds:", cross_validation)
print("The Average acuracy on the ", num_folds, " validation folds:", np.mean(cross_validation))
```
%% Cell type:markdown id: tags:
<div class="alert alert-block alert-warning">
<p><i class="fa fa-warning"></i>&nbsp;
The code above took quite long to finish even though we used only 5 CV folds and the neural network and data size are very small! This gives an indication of the enormous compute requirements of training production-grade deep neural networks.
</p>
</div>
%% Cell type:markdown id: tags:
## Hyperparameter optimization
%% Cell type:markdown id: tags:
We know from chapter 6 that there are 2 types of parameters which need to be tuned for a machine learning model.
* Internal model parameters (weights) which can be learned for e.g. by gradient-descent
* Hyperparameters
In the model created above we made some arbitrary choices such as the choice of the optimizer, the optimizer's learning rate, the number of hidden units and so on ...
Now that we have the keras model wrapped as a scikit-learn model we can use the grid search functions we have seen in chapter 6.
%% Cell type:code id: tags:
``` python
from sklearn.model_selection import GridSearchCV
# Just to remember
model_scikit = KerasClassifier(
build_fn=a_simple_NN, **{"epochs": num_epochs, "verbose": 0})
```
%% Cell type:code id: tags:
``` python
HP_grid = {'epochs' : [30, 50, 100]}
search = GridSearchCV(estimator=model_scikit, param_grid=HP_grid)
search.fit(features, labels)
print(search.best_score_, search.best_params_)
```
%% Cell type:code id: tags:
``` python
HP_grid = {'epochs' : [10, 15, 30],
'batch_size' : [10, 20, 30] }
search = GridSearchCV(estimator=model_scikit, param_grid=HP_grid)
search.fit(features, labels)
print(search.best_score_, search.best_params_)
```
%% Cell type:code id: tags:
``` python
# A more general model for further Hyperparameter optimization
from tensorflow.keras import optimizers
def a_simple_NN(activation='relu', num_hidden_neurons=[4, 4], learning_rate=0.01):
    model = Sequential()
    model.add(Dense(num_hidden_neurons[0],
                    input_shape=(2,), activation=activation))
    model.add(Dense(num_hidden_neurons[1], activation=activation))
    model.add(Dense(1, activation="sigmoid"))
    model.compile(loss="binary_crossentropy",
                  optimizer=optimizers.RMSprop(learning_rate=learning_rate),
                  metrics=["accuracy"])
    return model
```
%% Cell type:markdown id: tags:
### Exercise section:
* Look at the model above and choose a couple of hyperparameters to optimize.
* **OPTIONAL:** What function from scikit-learn other than GridSearchCV can we use for hyperparameter optimization? Use it.
%% Cell type:code id: tags:
``` python
# Code here
```
%% Cell type:markdown id: tags:
<div class="alert alert-block alert-warning">
<p><i class="fa fa-warning"></i>&nbsp;
Another library which you should definitely look at for doing hyperparameter optimization with keras models is the <a href="https://github.com/maxpumperla/hyperas">Hyperas library</a> which is a wrapper around the <a href="https://github.com/hyperopt/hyperopt">Hyperopt library</a>.
</p>
</div>
%% Cell type:markdown id: tags:
### Exercise section:
* Create a neural network to classify the 2D points example we saw in chapter 2 (Optional: as you create the model, read a bit about the different Keras commands we have used).
%% Cell type:code id: tags:
``` python
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras import optimizers
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
```
%% Cell type:code id: tags:
``` python
circle = pd.read_csv("data/circle.csv")
# Using x and y coordinates as features
features = circle.iloc[:, :-1]
# Convert boolean to integer values (True->1 and False->0)
labels = circle.iloc[:, -1].astype(int)
colors = [["steelblue", "chocolate"][i] for i in circle["label"]]
plt.figure(figsize=(5, 5))
plt.xlim([-2, 2])
plt.ylim([-2, 2])
plt.scatter(features["x"], features["y"], color=colors, marker="o");
```
%% Cell type:code id: tags:
``` python
# Insert Code here
```
%% Cell type:markdown id: tags:
The examples we saw above are really nice to show various features of the Keras library and to understand how we build and train a model. However, they are not the ideal problems one should solve using neural networks. They are too simple and can be solved easily by classical machine learning algorithms.
Now we show examples where Neural Networks really shine over classical machine learning algorithms.
%% Cell type:markdown id: tags:
## Handwritten Digits Classification (multi-class classification)
**MNIST Dataset**
MNIST is a very common dataset used in machine learning. It is widely used to train and validate models.
> The MNIST database of handwritten digits, available from this page, has a training set of 60,000 examples, and a test set of 10,000 examples. It is a subset of a larger set available from NIST. The digits have been size-normalized and centered in a fixed-size image.
> It is a good database for people who want to try learning techniques and pattern recognition methods on real-world data while spending minimal efforts on preprocessing and formatting.
> source: http://yann.lecun.com/exdb/mnist/
This dataset consists of images of handwritten digits between 0-9 and their corresponding labels. We want to train a neural network which is able to predict the correct digit on the image.
This is a multi-class classification problem. Unlike binary classification which we have seen till now we will classify data into 10 different classes.
%% Cell type:code id: tags:
``` python
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
```
%% Cell type:code id: tags:
``` python
# Loading the dataset in keras
# Later you can explore and play with other datasets which come with Keras
from tensorflow.keras.datasets import mnist
# Loading the train and test data
(X_train, y_train), (X_test, y_test) = mnist.load_data()
```
%% Cell type:code id: tags:
``` python
# Looking at the dataset
print(X_train.shape)
```
%% Cell type:code id: tags:
``` python
# We can see that the training set consists of 60,000 images of size 28x28 pixels
i=np.random.randint(0,X_train.shape[0])
sns.set_style("white")
plt.imshow(X_train[i], cmap="gray_r") ;
sns.set(style="darkgrid")
print("This digit is: " , y_train[i])
```
%% Cell type:code id: tags:
``` python
# Look at the data values for a couple of images
print(X_train[0].min(), X_train[1].max())
```
%% Cell type:markdown id: tags:
The data consists of values between 0-255 representing the **grayscale level**
%% Cell type:code id: tags:
``` python
# The labels are the digit on the image
print(y_train.shape)
```
%% Cell type:code id: tags:
``` python
# Scaling the data
# It is important to normalize the input data to (0-1) before providing it to a neural net
# We could use the previously introduced function from scikit-learn. However, here it is sufficient to
# just divide the input data by 255
X_train_norm = X_train/255.
X_test_norm = X_test/255.
# Also we need to reshape the input data such that each sample is a vector and not a 2D matrix
X_train_prep = X_train_norm.reshape(X_train_norm.shape[0],28*28)
X_test_prep = X_test_norm.reshape(X_test_norm.shape[0],28*28)
```
%% Cell type:markdown id: tags:
<div class="alert alert-block alert-warning">
<p><i class="fa fa-warning"></i>&nbsp;
One-Hot encoding
In multi-class classification problems the labels are provided to the neural network as something called **One-hot encodings**. The categorical labels (0-9 here) are converted to vectors.
For the MNIST problem where the data has **10 categories** we will convert every label to a vector of length 10.
All the entries of this vector will be zero **except** for the index which is equal to the (integer) value of the label.
For example:
if the label is 4, the one-hot vector will look like **[0 0 0 0 1 0 0 0 0 0]**
Fortunately, Keras has a built-in function to achieve this and we do not have to write code for this ourselves.
</p>
</div>
%% Cell type:code id: tags:
``` python
from tensorflow.keras import utils
y_train_onehot = utils.to_categorical(y_train, num_classes=10)
y_test_onehot = utils.to_categorical(y_test, num_classes=10)
print(y_train_onehot.shape)
```
%% Cell type:code id: tags:
``` python
# Building the tensorflow model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
def mnist_model():
    model = Sequential()
    model.add(Dense(64, input_shape=(28*28,), activation="relu"))
    model.add(Dense(64, activation="relu"))
    model.add(Dense(10, activation="softmax"))
    model.compile(loss="categorical_crossentropy",
                  optimizer="rmsprop", metrics=["accuracy"])
    return model
model = mnist_model()
model_run = model.fit(X_train_prep, y_train_onehot, epochs=20,
batch_size=512)
```
%% Cell type:code id: tags:
``` python
print("The [loss, accuracy] on test dataset are: " , model.evaluate(X_test_prep, y_test_onehot))
```
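%% Cell type:markdown id: tags:
As a quick sanity check (a sketch, not part of the original workflow), we can ask the trained model for the class probabilities of a single test image and compare the most likely digit with the true label:
%% Cell type:code id: tags:
``` python
# Sketch: predict the softmax probabilities for the first test image
import numpy as np

probs = model.predict(X_test_prep[:1])      # shape (1, 10)
print("predicted digit:", np.argmax(probs), "  true digit:", y_test[0])
```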
%% Cell type:markdown id: tags:
### Exercise section
* Reinitialize and run the model again with a validation dataset, plot the accuracy as a function of epochs, play with the number of epochs and observe what happens.
%% Cell type:code id: tags:
``` python
# Code here
```
%% Cell type:code id: tags:solution
``` python
# Solution:
num_epochs = 20
model = mnist_model()
model_run = model.fit(X_train_prep, y_train_onehot, epochs=num_epochs,
batch_size=512, validation_data=(X_test_prep, y_test_onehot))
# Evaluating the model on test dataset
#print("The [loss, accuracy] on test dataset are: " , model.evaluate(X_test_prep, y_test_onehot))
history_model = model_run.history
print("The history has the following data: ", history_model.keys())
# Plotting the training and validation accuracy during the training
sns.lineplot(np.arange(1, num_epochs+1), history_model["accuracy"], color = "blue", label="Training set") ;
sns.lineplot(np.arange(1, num_epochs+1), history_model["val_accuracy"], color = "red", label="Valdation set") ;
plt.xlabel("epochs") ;
plt.ylabel("accuracy") ;
```
%% Cell type:markdown id: tags:
What we see here is **overfitting**. After the first few epochs the training and validation datasets show a similar accuracy, but thereafter the network starts to overfit to the training set.
%% Cell type:markdown id: tags:
<div class="alert alert-block alert-warning">
<p><i class="fa fa-warning"></i>&nbsp;
Keep in mind that neural networks are quite prone to overfitting so always check for it.
</p>
</div>
%% Cell type:markdown id: tags:
### Adding regularization
%% Cell type:code id: tags:
``` python
# Adding l2 regularization
# Building the keras model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.regularizers import l2
def mnist_model():
    model = Sequential()
    model.add(Dense(64, input_shape=(28*28,), activation="relu",
                    kernel_regularizer=l2(0.01)))
    model.add(Dense(64, activation="relu",
                    kernel_regularizer=l2(0.01)))
    model.add(Dense(10, activation="softmax"))
    model.compile(loss="categorical_crossentropy",
                  optimizer="rmsprop", metrics=["accuracy"])
    return model
model = mnist_model()
num_epochs = 20
model_run = model.fit(X_train_prep, y_train_onehot, epochs=num_epochs,
batch_size=512, validation_data=(X_test_prep, y_test_onehot))
```
%% Cell type:code id: tags:
``` python
# Evaluating the model on test dataset
history_model = model_run.history
print("The history has the following data: ", history_model.keys())
# Plotting the training and validation accuracy during the training
sns.lineplot(np.arange(1, num_epochs+1), history_model["accuracy"], color = "blue", label="Training set") ;
sns.lineplot(np.arange(1, num_epochs+1), history_model["val_accuracy"], color = "red", label="Valdation set") ;
plt.xlabel("epochs") ;
plt.ylabel("accuracy") ;
```
%% Cell type:markdown id: tags:
<div class="alert alert-block alert-warning">
<p><i class="fa fa-warning"></i>&nbsp;
Another way to add regularization and to make the network more robust is by applying Dropout. When we add dropout to a layer, a specified percentage of that layer's units is randomly switched off during training.
Both L2 regularization and Dropout make the model simpler and thus reduce overfitting.
</p>
</div>
### Exercise section
* Add dropout instead of L2 regularization in the network above
%% Cell type:code id: tags:
``` python
# Adding dropout is easy in keras
# We import a layer called Dropout and add as follows
# model.add(Dropout(0.2)) to randomly drop 20% of the hidden units
```
%% Cell type:code id: tags:solution
``` python
# Solution
# Adding Dropout
# Building the tensorflow model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
def mnist_model():
    model = Sequential()
    model.add(Dense(64, input_shape=(28*28,), activation="relu"))
    model.add(Dropout(0.15))
    model.add(Dense(64, activation="relu"))
    model.add(Dense(10, activation="softmax"))
    model.compile(loss="categorical_crossentropy",
                  optimizer="rmsprop", metrics=["accuracy"])
    return model
model = mnist_model()
num_epochs = 20
model_run = model.fit(X_train_prep, y_train_onehot, epochs=num_epochs,
batch_size=512, validation_data=(X_test_prep, y_test_onehot))
# Evaluating the model on test dataset
history_model = model_run.history
print("The history has the following data: ", history_model.keys())
# Plotting the training and validation accuracy during the training
sns.lineplot(np.arange(1, num_epochs+1), history_model["accuracy"], color = "blue", label="Training set") ;
sns.lineplot(np.arange(1, num_epochs+1), history_model["val_accuracy"], color = "red", label="Valdation set") ;
plt.xlabel("epochs") ;
plt.ylabel("accuracy") ;
```
%% Cell type:markdown id: tags:
## Network Architectures
The neural networks which we have seen till now are the simplest kind of neural networks.
There exist more sophisticated network architectures especially designed for specific applications.
Some of them are as follows:
### Convolutional Neural Networks (CNNs)
These networks are mostly used for computer-vision tasks such as image classification and object detection.
One of the early CNN architectures is shown below.
<center>
<figure>
<img src="./images/neuralnets/CNN_lecun.png" width="800"/>
<figcaption>source: LeCun et al., Gradient-based learning applied to document recognition (1998).</figcaption>
</figure>
</center>
CNNs introduce new types of layers such as convolution and pooling layers.
### Recurrent Neural Networks (RNNs)
RNNs are used for problems such as time-series data, speech recognition and translation.
### Generative adversarial networks (GANs)
GANs consist of 2 parts, a generative network and a discriminative network. The generative network produces data which is then fed to the discriminative network, which judges if the new data belongs to a specified dataset. Via feedback loops the generative network becomes better and better at creating images similar to the dataset the discriminative network is judging against. At the same time the discriminative network gets better and better at identifying **fake** instances which are not from the reference dataset.
%% Cell type:markdown id: tags:
## CNN in a bit more detail
The standard CNN architecture can be seen as 2 parts:
* Feature extraction
* Classification
For the **classification** part we use the densely connected network as shown in the Keras examples above.
However, for the **feature extraction** part we use new types of layers called **convolution** layers.
### What is a Convolution?
%% Cell type:code id: tags:
``` python
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("white")
# Loading the train and test data
digit = np.genfromtxt("data/digit_4_14x14.csv", delimiter=",").astype(np.int16) ;
plt.imshow(digit, "gray_r")
```
%% Cell type:markdown id: tags:
This image in matrix form
%% Cell type:code id: tags:
``` python
def plot_astable(matrix, hw=0.15):
    matrix = plt.table(cellText=matrix, loc=(0,0), cellLoc='center') ;
    matrix.set_fontsize(14)
    cells = matrix.get_celld() ;
    for i in cells:
        cells[i].set_height(hw) ;
        cells[i].set_width(hw) ;
    plt.axis("off")
```
%% Cell type:code id: tags:
``` python
plot_astable(digit)
```
%% Cell type:code id: tags:
``` python
# Vertical edge detection
vertical_edge_kernel = np.array([[-1, 2, -1], [-1, 2, -1], [-1, 2, -1]])
plot_astable(vertical_edge_kernel, 0.2)
```
%% Cell type:code id: tags:
``` python
import numpy as np
def convolution(matrix, kernel):
    # This function computes a convolution between a matrix and a kernel/filter without any padding
    width_kernel = kernel.shape[0]
    height_kernel = kernel.shape[1]
    convolution = np.zeros((matrix.shape[0] - width_kernel + 1,
                            matrix.shape[1] - height_kernel + 1))
    for i in range(matrix.shape[0] - width_kernel + 1):
        for j in range(matrix.shape[1] - height_kernel + 1):
            convolution[i, j] = np.sum(np.multiply(
                matrix[i:i+width_kernel, j:j+height_kernel], kernel))
    return convolution
vertical_detect = convolution(digit, vertical_edge_kernel)
plt.imshow(vertical_detect, cmap="gray_r") ;
```
%% Cell type:code id: tags:
``` python
# Horizontal edge detection
horizontal_edge_kernel = np.array([[-1, -1, -1], [2, 2, 2], [-1, -1, -1]])
plot_astable(horizontal_edge_kernel, 0.2)
```
%% Cell type:code id: tags:
``` python
horizontal_detect = convolution(digit, horizontal_edge_kernel)
plt.imshow(horizontal_detect, cmap="gray_r") ;
```
%% Cell type:markdown id: tags:
### Maxpooling
Taking the maximum in n x n sliding windows
%% Cell type:code id: tags:
``` python
import numpy as np
def maxpool_2x2(matrix):
    out_dim = np.array([matrix.shape[0]/2, matrix.shape[1]/2]).astype(int)
    subsample = np.zeros(out_dim)
    for i in range(out_dim[0]):
        for j in range(out_dim[1]):
            subsample[i, j] = np.max(matrix[i*2:i*2+2, j*2:j*2+2])
    return subsample
```
%% Cell type:code id: tags:
``` python
import matplotlib.pyplot as plt
subsampled_image = maxpool_2x2(vertical_detect)
plt.imshow(subsampled_image, cmap="gray_r")
plt.title("Max Pooled vertical edge detection filter") ;
```
%% Cell type:code id: tags:
``` python
subsampled_image = maxpool_2x2(horizontal_detect)
plt.imshow(subsampled_image, cmap="gray_r") ;
plt.title("Max Pooled horizontal edge detection filter") ;
```
%% Cell type:markdown id: tags:
### Let's explore some more of such filters/kernels!!
http://setosa.io/ev/image-kernels
%% Cell type:markdown id: tags:
## CNN Examples
%% Cell type:markdown id: tags:
For this example we will work with a dataset called fashion-MNIST which is quite similar to the MNIST data above.
> Fashion-MNIST is a dataset of Zalando's article images—consisting of a training set of 60,000 examples and a test set of 10,000 examples. Each example is a 28x28 grayscale image, associated with a label from 10 classes. We intend Fashion-MNIST to serve as a direct drop-in replacement for the original MNIST dataset for benchmarking machine learning algorithms. It shares the same image size and structure of training and testing splits.
source: https://github.com/zalandoresearch/fashion-mnist
The 10 classes of this dataset are:
| Label | Item |
| --- | --- |
| 0 | T-shirt/top |
| 1 | Trouser |
| 2 | Pullover |
| 3 | Dress |
| 4 | Coat |
| 5 | Sandal |
| 6 | Shirt |
| 7 | Sneaker |
| 8 | Bag |
| 9 | Ankle boot |
%% Cell type:code id: tags:
``` python
# Loading the dataset in tensorflow
# Later you can explore and play with other datasets which come with TensorFlow
from tensorflow.keras.datasets import fashion_mnist
# Loading the train and test data
(X_train, y_train), (X_test, y_test) = fashion_mnist.load_data()
items =['T-shirt/top', 'Trouser',
'Pullover', 'Dress',
'Coat', 'Sandal',
'Shirt', 'Sneaker',
'Bag', 'Ankle boot']
```
%% Cell type:code id: tags:
``` python
# We can see that the training set consists of 60,000 images of size 28x28 pixels
import matplotlib.pyplot as plt
import numpy as np
i=np.random.randint(0,X_train.shape[0])
plt.imshow(X_train[i], cmap="gray_r") ;
print("This item is a: " , items[y_train[i]])
```
%% Cell type:code id: tags:
``` python
# Also we need to reshape the input data such that each sample is a 4D matrix of dimension
# (num_samples, width, height, channels). Even though these images are grayscale we need to add
# channel dimension as this is expected by the Conv function
X_train_prep = X_train.reshape(X_train.shape[0],28,28,1)/255.
X_test_prep = X_test.reshape(X_test.shape[0],28,28,1)/255.
from tensorflow.keras.utils import to_categorical
y_train_onehot = to_categorical(y_train, num_classes=10)
y_test_onehot = to_categorical(y_test, num_classes=10)
print(y_train_onehot.shape)
```
%% Cell type:code id: tags:
``` python
# Creating a CNN similar to the one shown in the figure from the LeCun paper
# In the original implementation average pooling was used. However, we will use maxpooling as this
# is what is used in the more recent architectures and is found to be a better choice
# Convolution -> Pooling -> Convolution -> Pooling -> Flatten -> Dense -> Dense -> Output layer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Conv2D, MaxPool2D, Flatten, Dropout, BatchNormalization
def simple_CNN():
    model = Sequential()
    model.add(Conv2D(6, (3,3), input_shape=(28,28,1), activation='relu'))
    model.add(MaxPool2D((2,2)))
    model.add(Conv2D(16, (3,3), activation='relu'))
    model.add(MaxPool2D((2,2)))
    model.add(Flatten())
    model.add(Dense(120, activation='relu'))
    model.add(Dense(84, activation='relu'))
    model.add(Dense(10, activation='softmax'))
    model.compile(loss="categorical_crossentropy", optimizer="rmsprop", metrics=["accuracy"])
    return model
model = simple_CNN()
model.summary()
```
%% Cell type:code id: tags:
``` python
num_epochs = 5
model_run = model.fit(X_train_prep, y_train_onehot, epochs=num_epochs,
batch_size=64, validation_data=(X_test_prep, y_test_onehot))
```
%% Cell type:markdown id: tags:
### Exercise section
* Use the above model, or improve it (change the number of filters, add more layers, etc.), on the MNIST example and see if you can get a better accuracy than we achieved with a vanilla neural network
%% Cell type:markdown id: tags:
### Exercise section
* Explore the CIFAR10 (https://www.cs.toronto.edu/~kriz/cifar.html) dataset included with Keras and build+train a simple CNN to classify it
%% Cell type:code id: tags:
``` python
from tensorflow.keras.datasets import cifar10
(X_train, y_train), (X_test, y_test) = cifar10.load_data()
```
%% Cell type:markdown id: tags:
Copyright (C) 2019-2021 ETH Zurich, SIS ID
%% Cell type:code id: tags:
``` python
```
@@ -19,3 +19,9 @@ Launch JupyterLab:
$ jupyter lab
Edit (improve) notebooks.
## Upload course material to gitlab.ethz.ch
Use `update_course_repo.sh` to prepare official repository to be used during
the workshop. This script updates the official repository from the currently
active branch.
# Setting up the Conda environment
## Install conda
https://conda.io/docs/user-guide/install/
## Install the environment
### On Linux:
* Open a terminal and navigate to the workshop git repository
* Run the following command:
~~~~bash
conda env create -f environment.yml
~~~~
### On Mac:
* Open a terminal and navigate to the workshop git repository
* Run the following command:
~~~~bash
conda env create -f environment_mac.yml
~~~~
### On Windows
* Open the Anaconda Navigator
* Go to `Environments`
* Select Import at the bottom and specify the following
* Any name that you would like
* Specification File: Select the `environment.yml` file
# README
# Introduction to Machine Learning using Python
Material related to the SIS machine learning workshop.
Materials for the SIS machine learning workshop.
## Setting up the conda environment on personal computers
* **Install conda**
https://conda.io/docs/user-guide/install/
## Setup Instructions
* **Install the environment**
* On Linux or Mac:
Open a terminal and navigate to the workshop git repository
Run the command: `conda env create -f environment.yml`
* On Windows
* Open the Anaconda Navigator
* Go to Environments
* Select Import at the bottom and specify the following
* Any name that you would like
* Specification File: Select the environment.yml file
The setup is based on the Conda distribution for Python called Anaconda
(https://www.anaconda.com/products/individual).
* **Check installation**
### I. Install Anaconda
* On **Linux**/**Mac** - open a terminal
1. If you don’t have Anaconda yet, download and install Anaconda from
https://www.anaconda.com/products/individual, or, in case you work on an ETH-managed
computer, install Anaconda via the
[ETH AppV Software Kiosk](https://app.ethz.ch/kiosk).
* On **Windows** - open Anaconda prompt
### II. Install Conda environment
and run the following commands:
~~~~
conda activate machine_learning_workshop
jupyter notebook
~~~~
#### on MacOS:
(For some versions of Anaconda on **Windows** you might have to run just **activate machine_learning_workshop** instead of **conda activate ...**)
1. Open Terminal.app, change directory to the directory with workshop materials
(`$ cd path/to/dir`) and run:
$ conda env create -f environment.yml
In case your installation worked, your browser should open Jupyter.
#### on Linux
1. Open Terminal, change directory to the directory with workshop materials
(`$ cd path/to/dir`) and run:
$ conda env create -f environment_linux.yml
#### on Windows
1. Start the Anaconda Navigator and Select "Environments"
2. Press "Import Environment" Button.
3. Click on the folder icon; using the File Selection Dialog navigate to the directory with workshop materials and select the `environment.yml` file.
4. Press "Import" Button.
5. Click on the black triangle and from the dropdown list select "Open Terminal".
### III. Check installation
1. In the Terminal run:
$ conda run -n machine_learning_workshop_2021 python -c "import tensorflow, matplotlib, numpy, pandas; print('OK')"
Your setup is OK if this command returns without any error message.
2. Check if you can start JupyterLab in your Web browser by running in the Terminal:
$ conda run -n machine_learning_workshop_2021 jupyter lab
## Using JupyterLab
The course content is provided as Jupyter Notebooks. Please make sure to familiarize
yourself with
[JupyterLab Interface](https://jupyterlab.readthedocs.io/en/latest/user/interface.html).
The Setup Instructions include information on how to start JupyterLab in a Web browser.
## Preparation Script
The workshop materials contain a `00_numpy_pandas_matplotlib_intro.ipynb`
Jupyter Notebook with an introduction to NumPy, pandas and Matplotlib libraries, which we
are used without further introduction during the workshop. Please prepare yourself and
go through the notebook.
@@ -9,12 +9,12 @@ channels:
- conda-forge
dependencies:
- python==3.8
- python==3.8.8
- pandas
- matplotlib
- scikit-learn==0.24.1
- seaborn
- jupyterlab
- jupyterlab==3.1.10
- pydot
- pillow
- pip
# Open a terminal and execute the following command to create the conda environment
# for the workshop
# 'conda env create -f environment_linux.yml'
name: machine_learning_workshop_2021
channels:
- anaconda
- defaults
dependencies:
- python==3.8.8
- pandas
- matplotlib
- scikit-learn==0.24.1
- seaborn
- jupyterlab==3.1.7
- pydot
- pillow
- pip
- tensorflow==2.4.1
@@ -10,9 +10,11 @@ set -e
NOTEBOOKS=??_*.ipynb
NEEDED="images data"
EXTRA="Install.md LICENSE intro_presentation environment.yml"
EXTRA="README.md LICENSE intro_presentation environment*.yml"
REPOURL=https://gitlab.ethz.ch/sis/machine-learning-workshop
NAME=machinelearning-introduction-workshop
REPOURL=https://gitlab.ethz.ch/schmittu/${NAME}
echo
echo This script will upload the following files and folders to ${REPOURL}:
@@ -37,16 +39,15 @@ done
FLDR=$(mktemp -d)
git -C ${FLDR} clone ${REPOURL}.git
ROOT=${FLDR}/machine-learning-workshop
test -d ${ROOT}/solutions || mkdir -p ${ROOT}/solutions
ROOT=${FLDR}/${NAME}
echo
echo ${ROOT}
echo
NOTEBOOKS=??_*.ipynb
NEEDED="images data"
EXTRA="Install.md LICENSE intro_presentation environment.yml"
test -d ${ROOT}/solutions || mkdir -p ${ROOT}/solutions
cp -R ${NEEDED} ${NOTEBOOKS} ${ROOT}/solutions
cp -R ${NEEDED} ${EXTRA} ${ROOT}
for N in ${NOTEBOOKS}; do