In [None]:
# IGNORE THIS CELL WHICH CUSTOMIZES LAYOUT AND STYLING OF THE NOTEBOOK !
from numpy.random import seed

seed(42)
import tensorflow as tf

tf.random.set_seed(42)
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

sns.set(style="darkgrid")
mpl.rcParams["lines.linewidth"] = 3
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
%config IPCompleter.greedy=True
import warnings

warnings.filterwarnings('ignore')
warnings.filterwarnings("ignore", category=FutureWarning)
from IPython.display import HTML

# Read the custom notebook styling inside a context manager so the file handle
# is closed deterministically instead of being left to the garbage collector.
with open("custom.html", "r") as style_file:
    custom_html = style_file.read()

# Keep the bare expression last so the notebook renders the styling.
HTML(custom_html)

# Chapter 8e: Sequence modeling: Natural language processing
## What is Natural language processing?

As the name suggests, it refers to processing of data such as text and speech. This involves tasks such as:

- Automatic document processing
- Topic modeling
- Language translation
- Sentiment analysis



As we all know, computers cannot process data in text format. They need numbers. So we need some mechanism to convert our text to numbers.

**Important to know libraries:**
- [Natural language toolkit](https://www.nltk.org/)
- [Gensim](https://radimrehurek.com/gensim/)
- [Tomotopy](https://bab2min.github.io/tomotopy/v0.12.3/en/)
- [fastText](https://fasttext.cc/)

## Text preprocessing

### Tokenization

Text -> tokens

The process of reducing a piece of text to tokens is called tokenization. It is generally done at the word level but can also be done at other levels, such as the sentence level.

In [None]:
import nltk

# Download only the resources this notebook uses instead of the very large "all"
# collection. Both the older and the newer resource identifiers are requested so
# that different NLTK releases find what they look for.
NLTK_RESOURCES = [
    "punkt",  # word_tokenize
    "punkt_tab",  # word_tokenize (newer NLTK releases)
    "wordnet",  # WordNetLemmatizer
    "omw-1.4",  # WordNetLemmatizer
    "averaged_perceptron_tagger",  # nltk.pos_tag
    "averaged_perceptron_tagger_eng",  # nltk.pos_tag (newer NLTK releases)
    "stopwords",  # stopwords.words("english")
]

for resource in NLTK_RESOURCES:
    nltk.download(resource)

In [None]:
text = "Is Monty a python or a group of pythons in a flying circus? What about swimming circuses?"

In [None]:
from nltk.tokenize import word_tokenize

print(word_tokenize(text))

### Lemmatization and Stemming

Most of the time we want to also reduce the inflectional forms of the same word. For example, consider a text that has (organization, organizational, organizations)

`Stemming`: This is a process of reducing a word to a stem form based on some pre-defined rules. The resulting stem might be a non-sensical word.

`Lemmatization`: This is a process of reducing a word to a lemma or the dictionary form of the word. This follows lexicon rules and is much more comprehensive than `stemming`. However, it is also more computationally expensive.

In [None]:
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize
from prettytable import PrettyTable

# Split the example sentence into word-level tokens
words = word_tokenize(text)
print("Tokens \n")
print(words)

# One stemmer and one lemmatizer, compared side by side below
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

stems = [stemmer.stem(token) for token in words]
lemmas = [lemmatizer.lemmatize(token) for token in words]

table = PrettyTable(["Word", "Stem", "Lemma"])
for row in zip(words, stems, lemmas):
    table.add_row(list(row))

print(table)

In [None]:
lemmatizer.lemmatize("swimming")

In [None]:
lemmatizer.lemmatize?

In [None]:
lemmatizer.lemmatize("swimming", "v")

In [None]:
# Automatically find POS tag
from nltk.corpus import wordnet


def get_wordnet_pos(word):
    """Return the WordNet POS constant to pass to ``lemmatize()`` for ``word``.

    The word is tagged on its own with ``nltk.pos_tag``; the upper-cased first
    letter of the tag selects adjective (J), verb (V) or adverb (R). Every
    other tag, including N, maps to noun.
    """
    _, tag = nltk.pos_tag([word])[0]
    initial = tag[0].upper()

    if initial == "J":
        return wordnet.ADJ
    if initial == "V":
        return wordnet.VERB
    if initial == "R":
        return wordnet.ADV
    return wordnet.NOUN


# Rebuild the comparison table, this time passing each word's POS tag to the lemmatizer
words = word_tokenize(text)

table = PrettyTable(["Word", "Stem", "Lemma"])

for token in words:
    stem = stemmer.stem(token)
    lemma = lemmatizer.lemmatize(token, get_wordnet_pos(token))
    table.add_row([token, stem, lemma])

print(table)

### Other:

- Text to lower case
- Remove punctuations
- Remove stopwords

In [None]:
import string

# Text to lower case
text = text.lower()
print(text)

# Remove punctuations: map every punctuation character to None and translate
punctuation_remover = str.maketrans(dict.fromkeys(string.punctuation))
text = text.translate(punctuation_remover)
print(text)

In [None]:
# Remove stopwords
from nltk.corpus import stopwords

print(stopwords.words("english"))

In [None]:
words = word_tokenize(text)

# Build the stopword set once, outside the comprehension, so that it is not
# re-created for every token.
english_stopwords = set(stopwords.words("english"))

filtered_text = [w for w in words if w not in english_stopwords]

print(filtered_text)

## Tokens to Vectors

Once we have cleaned up our text, there are different ways in which we can convert the tokens to vectors:

### Bag-of-Words (BoW)

Imagine that all the unique words in our text corpus are put together in one big bag. 

All or a subset of this bag is then considered as our `vocabulary`.

Each unit (document/line/...) in our corpus can now be represented as a vector of length equal to our vocabulary size with each index of the vector representing a word from our `vocabulary`.

We count the number of occurrences of each word in a unit of text and put this number at the corresponding location in this vector. If the word does not exist in the unit we enter 0.

In [None]:
# Let's consider each sentence of our example text as a document/unit we want to process
import numpy as np

text = [
 "Is Monty a python or a group of pythons in a flying circus?",
 "What about swimming circuses?",
]

for index, value in enumerate(text):
 text[index] = value.lower().translate(str.maketrans("", "", string.punctuation))

lemmatizer = WordNetLemmatizer()

unique_words = {}

bow_text = []

for index, value in enumerate(text):
 words = word_tokenize(value)
 words = [w for w in words if not w in set(stopwords.words("english"))]
 words = [lemmatizer.lemmatize(w) for w in words]
 print(words)
 for token in words:
 if token not in unique_words.keys():
 unique_words[token] = 1
 else:
 unique_words[token] += 1
 bow_text.append(words)

print(unique_words)

unique_words = list(unique_words.keys())

bow_vectors = np.zeros((len(unique_words), len(text)))

for column, value in enumerate(bow_text):
 for _, word in enumerate(value):
 if word in unique_words:
 bow_vectors[unique_words.index(word), column] += 1
print(bow_vectors)

A much better way of doing this is:

In [None]:
from string import punctuation

from sklearn.feature_extraction.text import CountVectorizer

# CountVectorizer automatically makes the text lowercase

text = [
    "Is Monty a python or a group of python in a flying circus?",
    "What about swimming circuses?",
]


class LemmaTokenizer:
    """Callable tokenizer for ``CountVectorizer``: word-tokenize, then lemmatize."""

    def __init__(self):
        self.wnl = WordNetLemmatizer()

    def __call__(self, doc):
        """Return the lemmatized tokens of the document ``doc``."""
        return [self.wnl.lemmatize(t) for t in word_tokenize(doc)]


# English stopwords and single punctuation characters are both dropped
vectorizer = CountVectorizer(
    stop_words=list(set(stopwords.words("english")).union(set(punctuation))),
    tokenizer=LemmaTokenizer(),
)

bow_vectors = vectorizer.fit_transform(text)

print(f"The vocabulary of our corpus is: \n {vectorizer.vocabulary_}\n")

print(f"Vectorizer from Scikit learn creates sparse matrices: {type(bow_vectors)} \n")

# toarray() yields a plain ndarray; todense() would return the discouraged np.matrix
print(f"The created vectors are: {bow_vectors.toarray()}")

In [None]:
# Other tokenizers
from string import punctuation

from nltk.tokenize import TweetTokenizer
from sklearn.feature_extraction.text import CountVectorizer

tokenizer = TweetTokenizer()

text = [
    "Is Monty a python or a group of python's in a flying circus?",
    "What about swimming circuses?",
]


class LemmaTokenizer:
    """Callable tokenizer for ``CountVectorizer``: tokenize, then lemmatize.

    ``tokenize`` defaults to the ``tokenize`` method of the ``TweetTokenizer``
    created above. Binding it at construction keeps an instance working even if
    the global name ``tokenizer`` is reassigned afterwards.
    """

    def __init__(self, tokenize=tokenizer.tokenize):
        self.wnl = WordNetLemmatizer()
        self.tokenize = tokenize

    def __call__(self, doc):
        """Return the lemmatized tokens of the document ``doc``."""
        return [self.wnl.lemmatize(t) for t in self.tokenize(doc)]


# English stopwords and single punctuation characters are both dropped
vectorizer = CountVectorizer(
    stop_words=list(set(stopwords.words("english")).union(set(punctuation))),
    tokenizer=LemmaTokenizer(),
)

bow_vectors = vectorizer.fit_transform(text)

print(f"The vocabulary of our corpus is: \n {vectorizer.vocabulary_}\n")

print(f"Vectorizer from Scikit learn creates sparse matrices: {type(bow_vectors)} \n")

# toarray() yields a plain ndarray; todense() would return the discouraged np.matrix
print(f"The created vectors are: {bow_vectors.toarray()}")

### Term frequency inverse document frequency (Tf-idf)

A numerical statistic that is intended to reflect how important a word is to a document in a collection or corpus.

A survey conducted in 2015 showed that 83% of text-based recommender systems in digital libraries use tf–idf(*)

*[Research-paper recommender systems : a literature survey](https://link.springer.com/article/10.1007/s00799-015-0156-0)

$\text{TF-IDF} = \text{TF} \times \text{IDF}$

**TF = Term frequency**

**IDF = Inverse document/text frequency**

$T_{t',d}$ = Number of occurrences of a particular term ($t'$) in a document ($d$).

$\sum_{t \in d} T_{t,d}$ = Total number of terms in the document ($d$).

$N_T$ = Total number of documents/text samples.

$N_{t'}$ = Number of documents/text samples that contain the term $t'$.

$\text{TF-IDF} = \dfrac{T_{t',d}}{\sum_{t \in d} T_{t,d}} \times \log \dfrac{N_T}{N_{t'}}$


### IMDB dataset

Let's have a look at a sample dataset.

The IMDB dataset comprises 50,000 movie reviews. Each of them has a label `0` or `1` representing a bad or a good review, respectively.

`Note`: This dataset is also contained in tensorflow.keras.datasets, however that data is already preprocessed. Therefore, we import it from tensorflow_datasets instead.

### Exercise: Explore the IMDB dataset and vectorize the tokens

In [None]:
import tensorflow_datasets as tfds

# Request the train and test splits in a single call
imdb_splits = tfds.load(
    name="imdb_reviews",
    split=["train", "test"],
    batch_size=-1,
    as_supervised=True,
)
train_data, test_data = imdb_splits

# Convert each split to NumPy and unpack it into (reviews, labels)
(X_train, y_train), (X_test, y_test) = map(tfds.as_numpy, (train_data, test_data))

In [None]:
print(f"Number of: training samples - {len(y_train)}, test_samples - {len(y_test)}")

In [None]:
print(X_train[:5])
print(y_train[:5])

### Exercise: Apply tokenization and vectorization (e.g. CountVectorizer) to the imdb dataset

In [None]:
# Create a vectorizer e.g. CountVectorizer
# Pass max_features=10000 to the vectorizer to avoid running out of memory


# train it on the training set (HINT: one can pass an array of texts)


# Look at the resulting vocabulary
# vectorizer.vocabulary_


# Transform the test data

In [None]:
# Build a 3 layer simple vanilla neural network
# Don't forget to add dropout




model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

# We need to convert the sparse vectors to dense. toarray() gives a plain
# ndarray, whereas todense() would hand the discouraged np.matrix type to Keras.
results = model.fit(
    train.toarray(),
    y_train,
    epochs=10,
    batch_size=512,
    validation_data=(test.toarray(), y_test),
)

In [None]:
# Solution
from sklearn.feature_extraction.text import CountVectorizer

# max_features matches the 10000 limit stated in the exercise: the count
# matrices are converted to dense arrays for training, so the vocabulary size
# directly drives memory use.
vectorizer = CountVectorizer(
    stop_words=list(set(stopwords.words("english")).union(set(punctuation))),
    tokenizer=LemmaTokenizer(),
    max_features=10000,
)

# Learn the vocabulary on the training reviews and vectorize them
train = vectorizer.fit_transform(X_train)

# Look at the resulting vocabulary
# print(vectorizer.vocabulary_)

# Vectorize the test reviews with the vocabulary learned on the training set
test = vectorizer.transform(X_test)

In [None]:
# Build a 3 layer simple vanilla neural network
# Don't forget to add dropout

from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.models import Sequential

# Width of the input layer = number of columns of the vectorized reviews
n_features = test.shape[1]

model = Sequential(
    [
        # Three hidden layers, with dropout after the first two
        Dense(50, activation="relu", input_shape=(n_features,)),
        Dropout(0.5),
        Dense(30, activation="relu"),
        Dropout(0.5),
        Dense(20, activation="relu"),
        # Output layer: a single sigmoid unit for the binary label
        Dense(1, activation="sigmoid"),
    ]
)
model.summary()

In [None]:
model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

# We need to convert the sparse vectors to dense. toarray() gives a plain
# ndarray, whereas todense() would hand the discouraged np.matrix type to Keras.
results = model.fit(
    train.toarray(),
    y_train,
    epochs=10,
    batch_size=512,
    validation_data=(test.toarray(), y_test),
)

### Word embeddings: Featurized representation of words

<center>
<figure>
<img src="./images/neuralnets/word_embedding.png" width="700"/>
<figcaption>Embedding words in a higher dimensional feature space</figcaption>
</figure>
</center>

| <div style="width:150px"></div> | <div style="width:150px"></div> Apple | <div style="width:150px"></div> Orange | <div style="width:150px"></div> Pants | <div style="width:150px"></div> Tiger |
| :-----------: | :-----------: | :-----------: | :-----------: | :-----------: |
| Animal |0.01 |0.015 |0.006 | 0.96 |
| Fruit | 0.99 | 0.97 | -0.001 | -0.01 |
| Clothing | 0.02 | 0.07 | 0.97 | 0.002 |
| FeatureX | - | - | - | - |
| FeatureY | - | - | - | - |


Some models to compute word embeddings:
- Word2Vec
- GloVe
- fastText
- BERT

### Pretrained embeddings

Example:
https://fasttext.cc/docs/en/crawl-vectors.html

In [None]:
import fasttext

ft = fasttext.load_model("./data/cc.en.100.bin")

In [None]:
words = ["cat", "dog", "cream", "pizza", "car", "tractor"]

# Map every example word to the vector the loaded fastText model returns for it
word_vectors = {token: ft.get_word_vector(token) for token in words}

In [None]:
import pandas as pd
from scipy import spatial


def compute_similarity(a, b):
    """Return the cosine similarity of the vectors ``a`` and ``b``.

    Computed as one minus the cosine distance given by
    ``scipy.spatial.distance.cosine``.
    """
    cosine_distance = spatial.distance.cosine(a, b)
    return 1 - cosine_distance

In [None]:
# Pairwise cosine similarities: row = first word, column = second word.
# Building the frame in one go with dtype=float avoids the object-dtype columns
# that result from filling an initially empty DataFrame cell by cell.
similarities = pd.DataFrame(
    [
        [compute_similarity(word_vectors[word_a], word_vectors[word_b]) for word_b in words]
        for word_a in words
    ],
    index=words,
    columns=words,
    dtype=float,
)

In [None]:
similarities

### Recurrent Neural Networks (RNNs)

RNNs are used for problems such as time-series data, speech recognition and translation.

<center>
<figure>
<img src="./images/neuralnets/RNNs.png" width="700"/>
<figcaption>Recurrent neural network</figcaption>
</figure>
</center>

There are newer variants that overcome some issues with a vanilla RNN:
- Gated Recurrent Unit (GRU)
- Long Short Term Memory (LSTM)

**Example walkthrough** : https://keras.io/examples/vision/video_classification/

### Transformer models

Transformers are models based on an encoder-decoder architecture that rely mainly on the attention mechanism.

<center>
<table>
 <tr><td>
 <figure>
 <img src="./images/neuralnets/transformer.png" width="400"/>
 <figcaption>Transformer architecture</figcaption>
 </figure>
 </td></tr>
 <tr><td><center><sub>Source: <a href="https://arxiv.org/abs/1706.03762/">"Attention is all you need": https://arxiv.org/abs/1706.03762/</a></sub></center></td></tr>
</table>
</center>

### Putting it all together

https://paperswithcode.com/sota/sentiment-analysis-on-imdb