08_e-neural_networks.ipynb

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2023-03-02 09:53:35.448921: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA\n",
      "To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n",
      "2023-03-02 09:53:35.685930: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.\n",
      "2023-03-02 09:53:45.887532: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /cluster/apps/gcc-8.2.0/npm-6.14.9-774crfohwvu6a33ijcow7x5cvonu44oi/lib:/cluster/apps/gcc-8.2.0/r-4.2.2-ydfaklhfrhw5dy6qcfzxlxfviwovcord/rlib/R/lib:/cluster/apps/gcc-8.2.0/nccl-2.11.4-1-pwkiz23vbeac3vt5ykybdwzaykprizb2/lib:/cluster/apps/gcc-8.2.0/cudnn-8.2.1.32-yqvbgr3teq3v6xu5eyc75xhbl2ya343j/lib64:/cluster/apps/gcc-8.2.0/cuda-11.3.1-o54iuxgz6jm4csvkstuj5hjg4tvd44h3/lib64:/cluster/apps/gcc-8.2.0/openblas-0.3.15-huwxbhezdzoo74awrgoz6sd2qndpmdva/lib:/cluster/apps/nss/gcc-8.2.0/python/3.10.4/x86_64/lib64:/cluster/spack/apps/linux-centos7-x86_64/gcc-4.8.5/gcc-8.2.0-6xqov2fhvbmehix42slain67vprec3fs/lib64:/cluster/spack/apps/linux-centos7-x86_64/gcc-4.8.5/gcc-8.2.0-6xqov2fhvbmehix42slain67vprec3fs/lib:/cluster/apps/lsf/10.1/linux2.6-glibc2.3-x86_64/lib::\n",
      "2023-03-02 09:53:45.890685: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /cluster/apps/gcc-8.2.0/npm-6.14.9-774crfohwvu6a33ijcow7x5cvonu44oi/lib:/cluster/apps/gcc-8.2.0/r-4.2.2-ydfaklhfrhw5dy6qcfzxlxfviwovcord/rlib/R/lib:/cluster/apps/gcc-8.2.0/nccl-2.11.4-1-pwkiz23vbeac3vt5ykybdwzaykprizb2/lib:/cluster/apps/gcc-8.2.0/cudnn-8.2.1.32-yqvbgr3teq3v6xu5eyc75xhbl2ya343j/lib64:/cluster/apps/gcc-8.2.0/cuda-11.3.1-o54iuxgz6jm4csvkstuj5hjg4tvd44h3/lib64:/cluster/apps/gcc-8.2.0/openblas-0.3.15-huwxbhezdzoo74awrgoz6sd2qndpmdva/lib:/cluster/apps/nss/gcc-8.2.0/python/3.10.4/x86_64/lib64:/cluster/spack/apps/linux-centos7-x86_64/gcc-4.8.5/gcc-8.2.0-6xqov2fhvbmehix42slain67vprec3fs/lib64:/cluster/spack/apps/linux-centos7-x86_64/gcc-4.8.5/gcc-8.2.0-6xqov2fhvbmehix42slain67vprec3fs/lib:/cluster/apps/lsf/10.1/linux2.6-glibc2.3-x86_64/lib::\n",
      "2023-03-02 09:53:45.890701: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly.\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<style>\n",
       "\n",
       "    @import url('http://fonts.googleapis.com/css?family=Source+Code+Pro');\n",
       "\n",
       "    @import url('http://fonts.googleapis.com/css?family=Kameron');\n",
       "    @import url('http://fonts.googleapis.com/css?family=Crimson+Text');\n",
       "\n",
       "    @import url('http://fonts.googleapis.com/css?family=Lato');\n",
       "    @import url('http://fonts.googleapis.com/css?family=Source+Sans+Pro');\n",
       "\n",
       "    @import url('http://fonts.googleapis.com/css?family=Lora');\n",
       "\n",
       "\n",
       "    body {\n",
       "        font-family: 'Lora', Consolas, sans-serif;\n",
       "\n",
       "        -webkit-print-color-adjust: exact important !;\n",
       "\n",
       "\n",
       "\n",
       "    }\n",
       "\n",
       "    .alert-block {\n",
       "        width: 95%;\n",
       "        margin: auto;\n",
       "    }\n",
       "\n",
       "    .rendered_html code\n",
       "    {\n",
       "        color: black;\n",
       "        background: #eaf0ff;\n",
       "        background: #f5f5f5;\n",
       "        padding: 1pt;\n",
       "        font-family:  'Source Code Pro', Consolas, monocco, monospace;\n",
       "    }\n",
       "\n",
       "    p {\n",
       "      line-height: 140%;\n",
       "    }\n",
       "\n",
       "    strong code {\n",
       "        background: red;\n",
       "    }\n",
       "\n",
       "    .rendered_html strong code\n",
       "    {\n",
       "        background: #f5f5f5;\n",
       "    }\n",
       "\n",
       "    .CodeMirror pre {\n",
       "    font-family: 'Source Code Pro', monocco, Consolas, monocco, monospace;\n",
       "    }\n",
       "\n",
       "    .cm-s-ipython span.cm-keyword {\n",
       "        font-weight: normal;\n",
       "     }\n",
       "\n",
       "     strong {\n",
       "         background: #f5f5f5;\n",
       "         margin-top: 4pt;\n",
       "         margin-bottom: 4pt;\n",
       "         padding: 2pt;\n",
       "         border: 0.5px solid #a0a0a0;\n",
       "         font-weight: bold;\n",
       "         color: darkred;\n",
       "     }\n",
       "\n",
       "\n",
       "    div #notebook {\n",
       "        # font-size: 10pt;\n",
       "        line-height: 145%;\n",
       "        }\n",
       "\n",
       "    li {\n",
       "        line-height: 145%;\n",
       "    }\n",
       "\n",
       "    div.output_area pre {\n",
       "        background: #fff9d8 !important;\n",
       "        padding: 5pt;\n",
       "\n",
       "       -webkit-print-color-adjust: exact;\n",
       "\n",
       "    }\n",
       "\n",
       "\n",
       "\n",
       "    h1, h2, h3, h4 {\n",
       "        font-family: Kameron, arial;\n",
       "\n",
       "    }\n",
       "\n",
       "    div#maintoolbar {display: none !important;}\n",
       "    /*\n",
       "\n",
       "    div#site {\n",
       "        border-top: 20px solid #1F407A;\n",
       "        border-right: 20px solid #1F407A;\n",
       "        margin-bottom: 0;\n",
       "        padding-bottom: 0;\n",
       "    }\n",
       "    div#toc-wrapper {\n",
       "        border-left: 20px solid #1F407A;\n",
       "        border-top: 20px solid #1F407A;\n",
       "\n",
       "    }\n",
       "\n",
       "    body {\n",
       "        margin-botton:10px;\n",
       "    }\n",
       "    */\n",
       "\n",
       "</style>\n",
       "    <script>\n",
       "IPython.OutputArea.prototype._should_scroll = function(lines) {\n",
       "        return false;\n",
       "}\n",
       "    </script>\n",
       "\n",
       "\n",
       "<footer id=\"attribution\" style=\"float:left; color:#1F407A; background:#fff; font-family: helvetica;\">\n",
       "    This script is licensed under CC BY-NC 4.0<br/>\n",
       "    Copyright (C) 2019-2023 Scientific IT Services of ETH Zurich,\n",
       "    <p>\n",
       "    Contributing Authors:\n",
       "    Dr. Tarun Chadha,\n",
       "    Dr. Franziska Oschmann,\n",
       "    Dr. Mikolaj Rybinski,\n",
       "    Dr. Manuel Weberndorfer,\n",
       "    Dr. Uwe Schmitt.\n",
       "    </p<\n",
       "</footer>\n"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "execution_count": 1,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# IGNORE THIS CELL WHICH CUSTOMIZES LAYOUT AND STYLING OF THE NOTEBOOK !\n",
    "from numpy.random import seed\n",
    "\n",
    "seed(42)\n",
    "import tensorflow as tf\n",
    "\n",
    "tf.random.set_seed(42)\n",
    "import matplotlib as mpl\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "\n",
    "sns.set(style=\"darkgrid\")\n",
    "mpl.rcParams[\"lines.linewidth\"] = 3\n",
    "%matplotlib inline\n",
    "%config InlineBackend.figure_format = 'retina'\n",
    "%config IPCompleter.greedy=True\n",
    "import warnings\n",
    "\n",
    "warnings.filterwarnings(\"ignore\", category=FutureWarning)\n",
    "from IPython.core.display import HTML\n",
    "\n",
    "HTML(open(\"custom.html\", \"r\").read())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Chapter 8e: Sequence modeling: Natural language processing\n",
    "## What is Natural language processing?"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "As the name suggests, it refers to processing of data such as text and speech. This involves tasks such as:\n",
    "\n",
    "- Automatic document processing\n",
    "- Topic modeling\n",
    "- Language translation\n",
    "- sentiment analysis\n",
    "\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "As we all know, computers cannot process data in text format. They need numbers. So we need some mechanism to convert our text to numbers.\n",
    "\n",
    "**Important to know libraries:**\n",
    "- [Natural language toolkit](https://www.nltk.org/)\n",
    "- [Gensim](https://radimrehurek.com/gensim/)\n",
    "- [Tomotopy](https://bab2min.github.io/tomotopy/v0.12.3/en/)\n",
    "- [fastext](https://fasttext.cc/)\n",
    "\n",
    "## Text prepocessing\n",
    "\n",
    "### Tokenization\n",
    "\n",
    "Text -> tokens\n",
    "\n",
    "The process of reducing a piece of text to tokens is called tokenization. It is genrally done at a word level but can also be done at other levels such as a sentence."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'/cluster/project/workshops/machine_learning/latest/venv/lib64/python3.10/site-packages/tensorflow/__init__.py'"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tf.__file__"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "[nltk_data] Downloading collection 'all'\n",
      "[nltk_data]    | \n",
      "[nltk_data]    | Downloading package abc to\n",
      "[nltk_data]    |     /cluster/home/oschmanf/nltk_data...\n",
      "[nltk_data]    |   Package abc is already up-to-date!\n",
      "[nltk_data]    | Downloading package alpino to\n",
      "[nltk_data]    |     /cluster/home/oschmanf/nltk_data...\n",
      "[nltk_data]    |   Package alpino is already up-to-date!\n",
      "[nltk_data]    | Downloading package averaged_perceptron_tagger to\n",
      "[nltk_data]    |     /cluster/home/oschmanf/nltk_data...\n",
      "[nltk_data]    |   Package averaged_perceptron_tagger is already up-\n",
      "[nltk_data]    |       to-date!\n",
      "[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to\n",
      "[nltk_data]    |     /cluster/home/oschmanf/nltk_data...\n",
      "[nltk_data]    |   Package averaged_perceptron_tagger_ru is already\n",
      "[nltk_data]    |       up-to-date!\n",
      "[nltk_data]    | Downloading package basque_grammars to\n",
      "[nltk_data]    |     /cluster/home/oschmanf/nltk_data...\n",
      "[nltk_data]    |   Package basque_grammars is already up-to-date!\n",
      "[nltk_data]    | Downloading package bcp47 to\n",
      "[nltk_data]    |     /cluster/home/oschmanf/nltk_data...\n",
      "[nltk_data]    |   Package bcp47 is already up-to-date!\n",
      "[nltk_data]    | Downloading package biocreative_ppi to\n",
      "[nltk_data]    |     /cluster/home/oschmanf/nltk_data...\n",
      "[nltk_data]    |   Package biocreative_ppi is already up-to-date!\n",
      "[nltk_data]    | Downloading package bllip_wsj_no_aux to\n",
      "[nltk_data]    |     /cluster/home/oschmanf/nltk_data...\n",
      "[nltk_data]    |   Package bllip_wsj_no_aux is already up-to-date!\n",
      "[nltk_data]    | Downloading package book_grammars to\n",
      "[nltk_data]    |     /cluster/home/oschmanf/nltk_data...\n",
      "[nltk_data]    |   Package book_grammars is already up-to-date!\n",
      "[nltk_data]    | Downloading package brown to\n",
      "[nltk_data]    |     /cluster/home/oschmanf/nltk_data...\n",
      "[nltk_data]    |   Package brown is already up-to-date!\n",
      "[nltk_data]    | Downloading package brown_tei to\n",
      "[nltk_data]    |     /cluster/home/oschmanf/nltk_data...\n",
      "[nltk_data]    |   Package brown_tei is already up-to-date!\n",
      "[nltk_data]    | Downloading package cess_cat to\n",
      "[nltk_data]    |     /cluster/home/oschmanf/nltk_data...\n",
      "[nltk_data]    |   Package cess_cat is already up-to-date!\n",
      "[nltk_data]    | Downloading package cess_esp to\n",
      "[nltk_data]    |     /cluster/home/oschmanf/nltk_data...\n",
      "[nltk_data]    |   Package cess_esp is already up-to-date!\n",
      "[nltk_data]    | Downloading package chat80 to\n",
      "[nltk_data]    |     /cluster/home/oschmanf/nltk_data...\n",
      "[nltk_data]    |   Package chat80 is already up-to-date!\n",
      "[nltk_data]    | Downloading package city_database to\n",
      "[nltk_data]    |     /cluster/home/oschmanf/nltk_data...\n",
      "[nltk_data]    |   Package city_database is already up-to-date!\n",
      "[nltk_data]    | Downloading package cmudict to\n",
      "[nltk_data]    |     /cluster/home/oschmanf/nltk_data...\n",
      "[nltk_data]    |   Package cmudict is already up-to-date!\n",
      "[nltk_data]    | Downloading package comparative_sentences to\n",
      "[nltk_data]    |     /cluster/home/oschmanf/nltk_data...\n",
      "[nltk_data]    |   Package comparative_sentences is already up-to-\n",
      "[nltk_data]    |       date!\n",
      "[nltk_data]    | Downloading package comtrans to\n",
      "[nltk_data]    |     /cluster/home/oschmanf/nltk_data...\n",
      "[nltk_data]    |   Package comtrans is already up-to-date!\n",
      "[nltk_data]    | Downloading package conll2000 to\n",
      "[nltk_data]    |     /cluster/home/oschmanf/nltk_data...\n",
      "[nltk_data]    |   Package conll2000 is already up-to-date!\n",
      "[nltk_data]    | Downloading package conll2002 to\n",
      "[nltk_data]    |     /cluster/home/oschmanf/nltk_data...\n",
      "[nltk_data]    |   Package conll2002 is already up-to-date!\n",
      "[nltk_data]    | Downloading package conll2007 to\n",
      "[nltk_data]    |     /cluster/home/oschmanf/nltk_data...\n",
      "[nltk_data]    |   Package conll2007 is already up-to-date!\n",
      "[nltk_data]    | Downloading package crubadan to\n",
      "[nltk_data]    |     /cluster/home/oschmanf/nltk_data...\n",
      "[nltk_data]    |   Package crubadan is already up-to-date!\n",
      "[nltk_data]    | Downloading package dependency_treebank to\n",
      "[nltk_data]    |     /cluster/home/oschmanf/nltk_data...\n",
      "[nltk_data]    |   Package dependency_treebank is already up-to-date!\n",
      "[nltk_data]    | Downloading package dolch to\n",
      "[nltk_data]    |     /cluster/home/oschmanf/nltk_data...\n",
      "[nltk_data]    |   Package dolch is already up-to-date!\n",
      "[nltk_data]    | Downloading package europarl_raw to\n",
      "[nltk_data]    |     /cluster/home/oschmanf/nltk_data...\n",
      "[nltk_data]    |   Package europarl_raw is already up-to-date!\n",
      "[nltk_data]    | Downloading package extended_omw to\n",
      "[nltk_data]    |     /cluster/home/oschmanf/nltk_data...\n",
      "[nltk_data]    |   Package extended_omw is already up-to-date!\n",
      "[nltk_data]    | Downloading package floresta to\n",
      "[nltk_data]    |     /cluster/home/oschmanf/nltk_data...\n",
      "[nltk_data]    |   Package floresta is already up-to-date!\n",
      "[nltk_data]    | Downloading package framenet_v15 to\n",
      "[nltk_data]    |     /cluster/home/oschmanf/nltk_data...\n",
      "[nltk_data]    |   Package framenet_v15 is already up-to-date!\n",
      "[nltk_data]    | Downloading package framenet_v17 to\n",
      "[nltk_data]    |     /cluster/home/oschmanf/nltk_data...\n",
      "[nltk_data]    |   Package framenet_v17 is already up-to-date!\n",
      "[nltk_data]    | Downloading package gazetteers to\n",
      "[nltk_data]    |     /cluster/home/oschmanf/nltk_data...\n",
      "[nltk_data]    |   Package gazetteers is already up-to-date!\n",
      "[nltk_data]    | Downloading package genesis to\n",
      "[nltk_data]    |     /cluster/home/oschmanf/nltk_data...\n",
      "[nltk_data]    |   Package genesis is already up-to-date!\n",
      "[nltk_data]    | Downloading package gutenberg to\n",
      "[nltk_data]    |     /cluster/home/oschmanf/nltk_data...\n",
      "[nltk_data]    |   Package gutenberg is already up-to-date!\n",
      "[nltk_data]    | Downloading package ieer to\n",
      "[nltk_data]    |     /cluster/home/oschmanf/nltk_data...\n",
      "[nltk_data]    |   Package ieer is already up-to-date!\n",
      "[nltk_data]    | Downloading package inaugural to\n",
      "[nltk_data]    |     /cluster/home/oschmanf/nltk_data...\n",
      "[nltk_data]    |   Package inaugural is already up-to-date!\n",
      "[nltk_data]    | Downloading package indian to\n",
      "[nltk_data]    |     /cluster/home/oschmanf/nltk_data...\n",
      "[nltk_data]    |   Package indian is already up-to-date!\n",
      "[nltk_data]    | Downloading package jeita to\n",
      "[nltk_data]    |     /cluster/home/oschmanf/nltk_data...\n",
      "[nltk_data]    |   Package jeita is already up-to-date!\n",
      "[nltk_data]    | Downloading package kimmo to\n",
      "[nltk_data]    |     /cluster/home/oschmanf/nltk_data...\n",
      "[nltk_data]    |   Package kimmo is already up-to-date!\n",
      "[nltk_data]    | Downloading package knbc to\n",
      "[nltk_data]    |     /cluster/home/oschmanf/nltk_data...\n",
      "[nltk_data]    |   Package knbc is already up-to-date!\n",
      "[nltk_data]    | Downloading package large_grammars to\n",
      "[nltk_data]    |     /cluster/home/oschmanf/nltk_data...\n",
      "[nltk_data]    |   Package large_grammars is already up-to-date!\n",
      "[nltk_data]    | Downloading package lin_thesaurus to\n",
      "[nltk_data]    |     /cluster/home/oschmanf/nltk_data...\n",
      "[nltk_data]    |   Package lin_thesaurus is already up-to-date!\n",
      "[nltk_data]    | Downloading package mac_morpho to\n",
      "[nltk_data]    |     /cluster/home/oschmanf/nltk_data...\n",
      "[nltk_data]    |   Package mac_morpho is already up-to-date!\n",
      "[nltk_data]    | Downloading package machado to\n",
      "[nltk_data]    |     /cluster/home/oschmanf/nltk_data...\n",
      "[nltk_data]    |   Package machado is already up-to-date!\n",
      "[nltk_data]    | Downloading package masc_tagged to\n",
      "[nltk_data]    |     /cluster/home/oschmanf/nltk_data...\n",
      "[nltk_data]    |   Package masc_tagged is already up-to-date!\n",
      "[nltk_data]    | Downloading package maxent_ne_chunker to\n",
      "[nltk_data]    |     /cluster/home/oschmanf/nltk_data...\n",
      "[nltk_data]    |   Package maxent_ne_chunker is already up-to-date!\n",
      "[nltk_data]    | Downloading package maxent_treebank_pos_tagger to\n",
      "[nltk_data]    |     /cluster/home/oschmanf/nltk_data...\n",
      "[nltk_data]    |   Package maxent_treebank_pos_tagger is already up-\n",
      "[nltk_data]    |       to-date!\n",
      "[nltk_data]    | Downloading package moses_sample to\n",
      "[nltk_data]    |     /cluster/home/oschmanf/nltk_data...\n",
      "[nltk_data]    |   Package moses_sample is already up-to-date!\n",
      "[nltk_data]    | Downloading package movie_reviews to\n",
      "[nltk_data]    |     /cluster/home/oschmanf/nltk_data...\n",
      "[nltk_data]    |   Package movie_reviews is already up-to-date!\n",
      "[nltk_data]    | Downloading package mte_teip5 to\n",
      "[nltk_data]    |     /cluster/home/oschmanf/nltk_data...\n",
      "[nltk_data]    |   Package mte_teip5 is already up-to-date!\n",
      "[nltk_data]    | Downloading package mwa_ppdb to\n",
      "[nltk_data]    |     /cluster/home/oschmanf/nltk_data...\n",
      "[nltk_data]    |   Package mwa_ppdb is already up-to-date!\n",
      "[nltk_data]    | Downloading package names to\n",
      "[nltk_data]    |     /cluster/home/oschmanf/nltk_data...\n",
      "[nltk_data]    |   Package names is already up-to-date!\n",
      "[nltk_data]    | Downloading package nombank.1.0 to\n",
      "[nltk_data]    |     /cluster/home/oschmanf/nltk_data...\n",
      "[nltk_data]    |   Package nombank.1.0 is already up-to-date!\n",
      "[nltk_data]    | Downloading package nonbreaking_prefixes to\n",
      "[nltk_data]    |     /cluster/home/oschmanf/nltk_data...\n",
      "[nltk_data]    |   Package nonbreaking_prefixes is already up-to-date!\n",
      "[nltk_data]    | Downloading package nps_chat to\n",
      "[nltk_data]    |     /cluster/home/oschmanf/nltk_data...\n",
      "[nltk_data]    |   Package nps_chat is already up-to-date!\n",
      "[nltk_data]    | Downloading package omw to\n",
      "[nltk_data]    |     /cluster/home/oschmanf/nltk_data...\n",
      "[nltk_data]    |   Package omw is already up-to-date!\n",
      "[nltk_data]    | Downloading package omw-1.4 to\n",
      "[nltk_data]    |     /cluster/home/oschmanf/nltk_data...\n",
      "[nltk_data]    |   Package omw-1.4 is already up-to-date!\n",
      "[nltk_data]    | Downloading package opinion_lexicon to\n",
      "[nltk_data]    |     /cluster/home/oschmanf/nltk_data...\n",
      "[nltk_data]    |   Package opinion_lexicon is already up-to-date!\n",
      "[nltk_data]    | Downloading package panlex_swadesh to\n",
      "[nltk_data]    |     /cluster/home/oschmanf/nltk_data...\n",
      "[nltk_data]    |   Package panlex_swadesh is already up-to-date!\n",
      "[nltk_data]    | Downloading package paradigms to\n",
      "[nltk_data]    |     /cluster/home/oschmanf/nltk_data...\n",
      "[nltk_data]    |   Package paradigms is already up-to-date!\n",
      "[nltk_data]    | Downloading package pe08 to\n",
      "[nltk_data]    |     /cluster/home/oschmanf/nltk_data...\n",
      "[nltk_data]    |   Package pe08 is already up-to-date!\n",
      "[nltk_data]    | Downloading package perluniprops to\n",
      "[nltk_data]    |     /cluster/home/oschmanf/nltk_data...\n",
      "[nltk_data]    |   Package perluniprops is already up-to-date!\n",
      "[nltk_data]    | Downloading package pil to\n",
      "[nltk_data]    |     /cluster/home/oschmanf/nltk_data...\n",
      "[nltk_data]    |   Package pil is already up-to-date!\n",
      "[nltk_data]    | Downloading package pl196x to\n",
      "[nltk_data]    |     /cluster/home/oschmanf/nltk_data...\n",
      "[nltk_data]    |   Package pl196x is already up-to-date!\n",
      "[nltk_data]    | Downloading package porter_test to\n",
      "[nltk_data]    |     /cluster/home/oschmanf/nltk_data...\n",
      "[nltk_data]    |   Package porter_test is already up-to-date!\n",
      "[nltk_data]    | Downloading package ppattach to\n",
      "[nltk_data]    |     /cluster/home/oschmanf/nltk_data...\n",
      "[nltk_data]    |   Package ppattach is already up-to-date!\n",
      "[nltk_data]    | Downloading package problem_reports to\n",
      "[nltk_data]    |     /cluster/home/oschmanf/nltk_data...\n",
      "[nltk_data]    |   Package problem_reports is already up-to-date!\n",
      "[nltk_data]    | Downloading package product_reviews_1 to\n",
      "[nltk_data]    |     /cluster/home/oschmanf/nltk_data...\n",
      "[nltk_data]    |   Package product_reviews_1 is already up-to-date!\n",
      "[nltk_data]    | Downloading package product_reviews_2 to\n",
      "[nltk_data]    |     /cluster/home/oschmanf/nltk_data...\n",
      "[nltk_data]    |   Package product_reviews_2 is already up-to-date!\n",
      "[nltk_data]    | Downloading package propbank to\n",
      "[nltk_data]    |     /cluster/home/oschmanf/nltk_data...\n",
      "[nltk_data]    |   Package propbank is already up-to-date!\n",
      "[nltk_data]    | Downloading package pros_cons to\n",
      "[nltk_data]    |     /cluster/home/oschmanf/nltk_data...\n",
      "[nltk_data]    |   Package pros_cons is already up-to-date!\n",
      "[nltk_data]    | Downloading package ptb to\n",
      "[nltk_data]    |     /cluster/home/oschmanf/nltk_data...\n",
      "[nltk_data]    |   Package ptb is already up-to-date!\n",
      "[nltk_data]    | Downloading package punkt to\n",
      "[nltk_data]    |     /cluster/home/oschmanf/nltk_data...\n",
      "[nltk_data]    |   Package punkt is already up-to-date!\n",
      "[nltk_data]    | Downloading package qc to\n",
      "[nltk_data]    |     /cluster/home/oschmanf/nltk_data...\n",
      "[nltk_data]    |   Package qc is already up-to-date!\n",
      "[nltk_data]    | Downloading package reuters to\n",
      "[nltk_data]    |     /cluster/home/oschmanf/nltk_data...\n",
      "[nltk_data]    |   Package reuters is already up-to-date!\n",
      "[nltk_data]    | Downloading package rslp to\n",
      "[nltk_data]    |     /cluster/home/oschmanf/nltk_data...\n",
      "[nltk_data]    |   Package rslp is already up-to-date!\n",
      "[nltk_data]    | Downloading package rte to\n",
      "[nltk_data]    |     /cluster/home/oschmanf/nltk_data...\n",
      "[nltk_data]    |   Package rte is already up-to-date!\n",
      "[nltk_data]    | Downloading package sample_grammars to\n",
      "[nltk_data]    |     /cluster/home/oschmanf/nltk_data...\n",
      "[nltk_data]    |   Package sample_grammars is already up-to-date!\n",
      "[nltk_data]    | Downloading package semcor to\n",
      "[nltk_data]    |     /cluster/home/oschmanf/nltk_data...\n",
      "[nltk_data]    |   Package semcor is already up-to-date!\n",
      "[nltk_data]    | Downloading package senseval to\n",
      "[nltk_data]    |     /cluster/home/oschmanf/nltk_data...\n",
      "[nltk_data]    |   Package senseval is already up-to-date!\n",
      "[nltk_data]    | Downloading package sentence_polarity to\n",
      "[nltk_data]    |     /cluster/home/oschmanf/nltk_data...\n",
      "[nltk_data]    |   Package sentence_polarity is already up-to-date!\n",
      "[nltk_data]    | Downloading package sentiwordnet to\n",
      "[nltk_data]    |     /cluster/home/oschmanf/nltk_data...\n",
      "[nltk_data]    |   Package sentiwordnet is already up-to-date!\n",
      "[nltk_data]    | Downloading package shakespeare to\n",
      "[nltk_data]    |     /cluster/home/oschmanf/nltk_data...\n",
      "[nltk_data]    |   Package shakespeare is already up-to-date!\n",
      "[nltk_data]    | Downloading package sinica_treebank to\n",
      "[nltk_data]    |     /cluster/home/oschmanf/nltk_data...\n",
      "[nltk_data]    |   Package sinica_treebank is already up-to-date!\n",
      "[nltk_data]    | Downloading package smultron to\n",
      "[nltk_data]    |     /cluster/home/oschmanf/nltk_data...\n",
      "[nltk_data]    |   Package smultron is already up-to-date!\n",
      "[nltk_data]    | Downloading package snowball_data to\n",
      "[nltk_data]    |     /cluster/home/oschmanf/nltk_data...\n",
      "[nltk_data]    |   Package snowball_data is already up-to-date!\n",
      "[nltk_data]    | Downloading package spanish_grammars to\n",
      "[nltk_data]    |     /cluster/home/oschmanf/nltk_data...\n",
      "[nltk_data]    |   Package spanish_grammars is already up-to-date!\n",
      "[nltk_data]    | Downloading package state_union to\n",
      "[nltk_data]    |     /cluster/home/oschmanf/nltk_data...\n",
      "[nltk_data]    |   Package state_union is already up-to-date!\n",
      "[nltk_data]    | Downloading package stopwords to\n",
      "[nltk_data]    |     /cluster/home/oschmanf/nltk_data...\n",
      "[nltk_data]    |   Package stopwords is already up-to-date!\n",
      "[nltk_data]    | Downloading package subjectivity to\n",
      "[nltk_data]    |     /cluster/home/oschmanf/nltk_data...\n",
      "[nltk_data]    |   Package subjectivity is already up-to-date!\n",
      "[nltk_data]    | Downloading package swadesh to\n",
      "[nltk_data]    |     /cluster/home/oschmanf/nltk_data...\n",
      "[nltk_data]    |   Package swadesh is already up-to-date!\n",
      "[nltk_data]    | Downloading package switchboard to\n",
      "[nltk_data]    |     /cluster/home/oschmanf/nltk_data...\n",
      "[nltk_data]    |   Package switchboard is already up-to-date!\n",
      "[nltk_data]    | Downloading package tagsets to\n",
      "[nltk_data]    |     /cluster/home/oschmanf/nltk_data...\n",
      "[nltk_data]    |   Package tagsets is already up-to-date!\n",
      "[nltk_data]    | Downloading package timit to\n",
      "[nltk_data]    |     /cluster/home/oschmanf/nltk_data...\n",
      "[nltk_data]    |   Package timit is already up-to-date!\n",
      "[nltk_data]    | Downloading package toolbox to\n",
      "[nltk_data]    |     /cluster/home/oschmanf/nltk_data...\n",
      "[nltk_data]    |   Package toolbox is already up-to-date!\n",
      "[nltk_data]    | Downloading package treebank to\n",
      "[nltk_data]    |     /cluster/home/oschmanf/nltk_data...\n",
      "[nltk_data]    |   Package treebank is already up-to-date!\n",
      "[nltk_data]    | Downloading package twitter_samples to\n",
      "[nltk_data]    |     /cluster/home/oschmanf/nltk_data...\n",
      "[nltk_data]    |   Package twitter_samples is already up-to-date!\n",
      "[nltk_data]    | Downloading package udhr to\n",
      "[nltk_data]    |     /cluster/home/oschmanf/nltk_data...\n",
      "[nltk_data]    |   Package udhr is already up-to-date!\n",
      "[nltk_data]    | Downloading package udhr2 to\n",
      "[nltk_data]    |     /cluster/home/oschmanf/nltk_data...\n",
      "[nltk_data]    |   Package udhr2 is already up-to-date!\n",
      "[nltk_data]    | Downloading package unicode_samples to\n",
      "[nltk_data]    |     /cluster/home/oschmanf/nltk_data...\n",
      "[nltk_data]    |   Package unicode_samples is already up-to-date!\n",
      "[nltk_data]    | Downloading package universal_tagset to\n",
      "[nltk_data]    |     /cluster/home/oschmanf/nltk_data...\n",
      "[nltk_data]    |   Package universal_tagset is already up-to-date!\n",
      "[nltk_data]    | Downloading package universal_treebanks_v20 to\n",
      "[nltk_data]    |     /cluster/home/oschmanf/nltk_data...\n",
      "[nltk_data]    |   Package universal_treebanks_v20 is already up-to-\n",
      "[nltk_data]    |       date!\n",
      "[nltk_data]    | Downloading package vader_lexicon to\n",
      "[nltk_data]    |     /cluster/home/oschmanf/nltk_data...\n",
      "[nltk_data]    |   Package vader_lexicon is already up-to-date!\n",
      "[nltk_data]    | Downloading package verbnet to\n",
      "[nltk_data]    |     /cluster/home/oschmanf/nltk_data...\n",
      "[nltk_data]    |   Package verbnet is already up-to-date!\n",
      "[nltk_data]    | Downloading package verbnet3 to\n",
      "[nltk_data]    |     /cluster/home/oschmanf/nltk_data...\n",
      "[nltk_data]    |   Package verbnet3 is already up-to-date!\n",
      "[nltk_data]    | Downloading package webtext to\n",
      "[nltk_data]    |     /cluster/home/oschmanf/nltk_data...\n",
      "[nltk_data]    |   Package webtext is already up-to-date!\n",
      "[nltk_data]    | Downloading package wmt15_eval to\n",
      "[nltk_data]    |     /cluster/home/oschmanf/nltk_data...\n",
      "[nltk_data]    |   Package wmt15_eval is already up-to-date!\n",
      "[nltk_data]    | Downloading package word2vec_sample to\n",
      "[nltk_data]    |     /cluster/home/oschmanf/nltk_data...\n",
      "[nltk_data]    |   Package word2vec_sample is already up-to-date!\n",
      "[nltk_data]    | Downloading package wordnet to\n",
      "[nltk_data]    |     /cluster/home/oschmanf/nltk_data...\n",
      "[nltk_data]    |   Package wordnet is already up-to-date!\n",
      "[nltk_data]    | Downloading package wordnet2021 to\n",
      "[nltk_data]    |     /cluster/home/oschmanf/nltk_data...\n",
      "[nltk_data]    |   Package wordnet2021 is already up-to-date!\n",
      "[nltk_data]    | Downloading package wordnet2022 to\n",
      "[nltk_data]    |     /cluster/home/oschmanf/nltk_data...\n",
      "[nltk_data]    |   Package wordnet2022 is already up-to-date!\n",
      "[nltk_data]    | Downloading package wordnet31 to\n",
      "[nltk_data]    |     /cluster/home/oschmanf/nltk_data...\n",
      "[nltk_data]    |   Package wordnet31 is already up-to-date!\n",
      "[nltk_data]    | Downloading package wordnet_ic to\n",
      "[nltk_data]    |     /cluster/home/oschmanf/nltk_data...\n",
      "[nltk_data]    |   Package wordnet_ic is already up-to-date!\n",
      "[nltk_data]    | Downloading package words to\n",
      "[nltk_data]    |     /cluster/home/oschmanf/nltk_data...\n",
      "[nltk_data]    |   Package words is already up-to-date!\n",
      "[nltk_data]    | Downloading package ycoe to\n",
      "[nltk_data]    |     /cluster/home/oschmanf/nltk_data...\n",
      "[nltk_data]    |   Package ycoe is already up-to-date!\n",
      "[nltk_data]    | \n",
      "[nltk_data]  Done downloading collection all\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import nltk\n",
    "\n",
    "nltk.download(\"all\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "text = \"Is Monty a python or a group of pythons in a flying circus? What about swimming circuses?\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['Is', 'Monty', 'a', 'python', 'or', 'a', 'group', 'of', 'pythons', 'in', 'a', 'flying', 'circus', '?', 'What', 'about', 'swimming', 'circuses', '?']\n"
     ]
    }
   ],
   "source": [
    "from nltk.tokenize import word_tokenize\n",
    "\n",
    "print(word_tokenize(text))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Lemmatization and Stemming\n",
    "\n",
    "Most of the time we want to also reduce the inflectional forms of the same word. For example, consider a text that has (organization, organizational, organizations)\n",
    "\n",
    "`Stemming`: This is a process of reducing a word to a stem form based on some pre-defined rules. The resulting stem might be a non-sensical word.\n",
    "\n",
    "`Lemmatization`: This is a process of reducing a word to a lemma or the dictionary form of the word. This follows lexicon rules and is much more comprehensive than `stemming`. However, it is also more computationally expensive."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Tokens \n",
      "\n",
      "['Is', 'Monty', 'a', 'python', 'or', 'a', 'group', 'of', 'pythons', 'in', 'a', 'flying', 'circus', '?', 'What', 'about', 'swimming', 'circuses', '?']\n",
      "+----------+--------+----------+\n",
      "|   Word   |  Stem  |  Lemma   |\n",
      "+----------+--------+----------+\n",
      "|    Is    |   is   |    Is    |\n",
      "|  Monty   | monti  |  Monty   |\n",
      "|    a     |   a    |    a     |\n",
      "|  python  | python |  python  |\n",
      "|    or    |   or   |    or    |\n",
      "|    a     |   a    |    a     |\n",
      "|  group   | group  |  group   |\n",
      "|    of    |   of   |    of    |\n",
      "| pythons  | python |  python  |\n",
      "|    in    |   in   |    in    |\n",
      "|    a     |   a    |    a     |\n",
      "|  flying  |  fli   |  flying  |\n",
      "|  circus  | circu  |  circus  |\n",
      "|    ?     |   ?    |    ?     |\n",
      "|   What   |  what  |   What   |\n",
      "|  about   | about  |  about   |\n",
      "| swimming |  swim  | swimming |\n",
      "| circuses | circus |  circus  |\n",
      "|    ?     |   ?    |    ?     |\n",
      "+----------+--------+----------+\n"
     ]
    }
   ],
   "source": [
    "from nltk.stem import PorterStemmer, WordNetLemmatizer\n",
    "from nltk.tokenize import word_tokenize\n",
    "from prettytable import PrettyTable\n",
    "\n",
    "words = word_tokenize(text)\n",
    "print(\"Tokens \\n\")\n",
    "print(words)\n",
    "\n",
    "stemmer = PorterStemmer()\n",
    "\n",
    "lemmatizer = WordNetLemmatizer()\n",
    "\n",
    "table = PrettyTable([\"Word\", \"Stem\", \"Lemma\"])\n",
    "\n",
    "for w in words:\n",
    "    table.add_row([w, stemmer.stem(w), lemmatizer.lemmatize(w)])\n",
    "\n",
    "print(table)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'swimming'"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "lemmatizer.lemmatize(\"swimming\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "\u001b[0;31mSignature:\u001b[0m \u001b[0mlemmatizer\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlemmatize\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mword\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mstr\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mpos\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mstr\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m'n'\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m->\u001b[0m \u001b[0mstr\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
       "\u001b[0;31mDocstring:\u001b[0m\n",
       "Lemmatize `word` using WordNet's built-in morphy function.\n",
       "Returns the input word unchanged if it cannot be found in WordNet.\n",
       "\n",
       ":param word: The input word to lemmatize.\n",
       ":type word: str\n",
       ":param pos: The Part Of Speech tag. Valid options are `\"n\"` for nouns,\n",
       "    `\"v\"` for verbs, `\"a\"` for adjectives, `\"r\"` for adverbs and `\"s\"`\n",
       "    for satellite adjectives.\n",
       ":param pos: str\n",
       ":return: The lemma of `word`, for the given `pos`.\n",
       "\u001b[0;31mFile:\u001b[0m      /cluster/project/workshops/machine_learning/latest/venv/lib64/python3.10/site-packages/nltk/stem/wordnet.py\n",
       "\u001b[0;31mType:\u001b[0m      method"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "lemmatizer.lemmatize?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'swim'"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "lemmatizer.lemmatize(\"swimming\", \"v\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+----------+--------+--------+\n",
      "|   Word   |  Stem  | Lemma  |\n",
      "+----------+--------+--------+\n",
      "|    Is    |   is   |   Is   |\n",
      "|  Monty   | monti  | Monty  |\n",
      "|    a     |   a    |   a    |\n",
      "|  python  | python | python |\n",
      "|    or    |   or   |   or   |\n",
      "|    a     |   a    |   a    |\n",
      "|  group   | group  | group  |\n",
      "|    of    |   of   |   of   |\n",
      "| pythons  | python | python |\n",
      "|    in    |   in   |   in   |\n",
      "|    a     |   a    |   a    |\n",
      "|  flying  |  fli   |  fly   |\n",
      "|  circus  | circu  | circus |\n",
      "|    ?     |   ?    |   ?    |\n",
      "|   What   |  what  |  What  |\n",
      "|  about   | about  | about  |\n",
      "| swimming |  swim  |  swim  |\n",
      "| circuses | circus | circus |\n",
      "|    ?     |   ?    |   ?    |\n",
      "+----------+--------+--------+\n"
     ]
    }
   ],
   "source": [
    "# Automatically find POS tag\n",
    "from nltk.corpus import wordnet\n",
    "\n",
    "\n",
    "def get_wordnet_pos(word):\n",
    "    \"\"\"Map POS tag to first character lemmatize() accepts\"\"\"\n",
    "    tag = nltk.pos_tag([word])[0][1][0].upper()\n",
    "    tag_dict = {\n",
    "        \"J\": wordnet.ADJ,\n",
    "        \"N\": wordnet.NOUN,\n",
    "        \"V\": wordnet.VERB,\n",
    "        \"R\": wordnet.ADV,\n",
    "    }\n",
    "\n",
    "    return tag_dict.get(tag, wordnet.NOUN)\n",
    "\n",
    "\n",
    "words = word_tokenize(text)\n",
    "\n",
    "table = PrettyTable([\"Word\", \"Stem\", \"Lemma\"])\n",
    "\n",
    "for w in words:\n",
    "    table.add_row([w, stemmer.stem(w), lemmatizer.lemmatize(w, get_wordnet_pos(w))])\n",
    "\n",
    "print(table)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Other:\n",
    "\n",
    "- Text to lower case\n",
    "- Remove punctuations\n",
    "- Remove stopwords"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "is monty a python or a group of pythons in a flying circus? what about swimming circuses?\n",
      "is monty a python or a group of pythons in a flying circus what about swimming circuses\n"
     ]
    }
   ],
   "source": [
    "# Text to lower case\n",
    "text = text.lower()\n",
    "print(text)\n",
    "\n",
    "# Remove punctuations\n",
    "import string\n",
    "\n",
    "text = text.translate(str.maketrans(\"\", \"\", string.punctuation))\n",
    "print(text)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', \"you're\", \"you've\", \"you'll\", \"you'd\", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', \"she's\", 'her', 'hers', 'herself', 'it', \"it's\", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', \"that'll\", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', \"don't\", 'should', \"should've\", 'now', 'd', 'll', 'm', 'o', 're', 've', 'y', 'ain', 'aren', \"aren't\", 'couldn', \"couldn't\", 'didn', \"didn't\", 'doesn', \"doesn't\", 'hadn', \"hadn't\", 'hasn', \"hasn't\", 'haven', \"haven't\", 'isn', \"isn't\", 'ma', 'mightn', \"mightn't\", 'mustn', \"mustn't\", 'needn', \"needn't\", 'shan', \"shan't\", 'shouldn', \"shouldn't\", 'wasn', \"wasn't\", 'weren', \"weren't\", 'won', \"won't\", 'wouldn', \"wouldn't\"]\n"
     ]
    }
   ],
   "source": [
    "# Remove stopwords\n",
    "from nltk.corpus import stopwords\n",
    "\n",
    "print(stopwords.words(\"english\"))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['monty', 'python', 'group', 'pythons', 'flying', 'circus', 'swimming', 'circuses']\n"
     ]
    }
   ],
   "source": [
    "words = word_tokenize(text)\n",
    "\n",
    "filtered_text = [w for w in words if not w in set(stopwords.words(\"english\"))]\n",
    "\n",
    "print(filtered_text)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Tokens to Vectors\n",
    "\n",
    "Once we have cleaned up our text we have different ways in which we can tokenize them:\n",
    "\n",
    "### Bag-of-Words (BoW)\n",
    "\n",
    "Imagine that all the unique words in our text corpus are put together in one big bag. \n",
    "\n",
    "All or a subset of this bag is then considered as our `vocabulary`.\n",
    "\n",
    "Each unit (document/line/...) in our corpus can now be represented as a vector of length equal to our vocabulary size with each index of the vector representing a word from our `vocabulary`.\n",
    "\n",
    "We count the number of occurences of each word in a unit of text and put this number at the corresponding location in this vector. If the word does not exist in the unit we enter 0."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['monty', 'python', 'group', 'python', 'flying', 'circus']\n",
      "['swimming', 'circus']\n",
      "{'monty': 1, 'python': 2, 'group': 1, 'flying': 1, 'circus': 2, 'swimming': 1}\n",
      "[[1. 0.]\n",
      " [2. 0.]\n",
      " [1. 0.]\n",
      " [1. 0.]\n",
      " [1. 1.]\n",
      " [0. 1.]]\n"
     ]
    }
   ],
   "source": [
    "# Let's consider each sentence of our example text as a document/unit we want to process\n",
    "import numpy as np\n",
    "\n",
    "text = [\n",
    "    \"Is Monty a python or a group of pythons in a flying circus?\",\n",
    "    \"What about swimming circuses?\",\n",
    "]\n",
    "\n",
    "for index, value in enumerate(text):\n",
    "    text[index] = value.lower().translate(str.maketrans(\"\", \"\", string.punctuation))\n",
    "\n",
    "lemmatizer = WordNetLemmatizer()\n",
    "\n",
    "unique_words = {}\n",
    "\n",
    "bow_text = []\n",
    "\n",