moderation_classifier/train_MNB.py: Clean code

4c110768 · Franziska Oschmann · e8e6d084 · 4c110768
Commit 4c110768 authored 1 year ago by Franziska Oschmann
--- a/moderation_classifier/train_MNB.py
+++ b/moderation_classifier/train_MNB.py
 from sklearn.model_selection import train_test_split
-from sklearn.metrics import f1_score, precision_recall_fscore_support
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.naive_bayes import MultinomialNB
 from sklearn.pipeline import Pipeline

 import click
+import datetime
+from joblib import dump
 from nltk.corpus import stopwords
 import pandas as pd
 import spacy
@@ -15,37 +16,14 @@ import os
 from src.preprocessing_text import TextLoader, TextProcessor


-# ToDo
-# clean code
-# add contional storage of results (check if rows (train-test pair) exists and store results in this row)
-# train on diff corpi and generateb table with predictions
-# notebook with example of preprocessing?
-# Tests
-
-
-def preprocess(text: str, nlp) -> str:
-    """
-    Applies preprocessing to text
-    :param text: Input text
-    :param nlp: Loaded nlp model
-    """
-
-    tp = TextProcessor(nlp)
-
-    text_proc = tp.remove_spaces(text)
-    text_proc = tp.remove_punctuation(text_proc)
-    text_proc = tp.remove_mentions(text_proc)
-    # text_proc = tp.lemmatize_text(text_proc)
-    text_proc = tp.fold_case(text_proc)
-
-    return text_proc
-
-
 def create_pipeline():
    """
    Creates classification pipeline
    """

+    # define preprocessor
+    tp = TextProcessor()
+
    # define vectorizer
    stop_words_ge = stopwords.words("german")
    vectorizer = TfidfVectorizer(
@@ -56,43 +34,22 @@ def create_pipeline():
    mnb = MultinomialNB(alpha=0.01)

    # set pipeline
-    pipe = Pipeline([("vectorizer", vectorizer), ("mnb", mnb)])
+    pipe = Pipeline([("processor", tp), ("vectorizer", vectorizer), ("mnb", mnb)])

    return pipe


-def save_results(
-    train_spec: str, test_spec: str, prec: float, recall: float, f1: float
-) -> None:
+def save_model(pipe: Pipeline) -> None:
     """
-    Saves results to csv file
+    Saves the trained pipeline to saved_models/MNB/<timestamp>.joblib
+    :param pipe: Trained pipeline
     """
-    path_results = "MNB_results.csv"
-    if os.path.exists(path_results):
-        df_res = pd.read_csv(path_results, index_col="Unnamed: 0")
-    else:
-        df_res = pd.DataFrame(
-            {
-                "train: newspaper": pd.Series(dtype="str"),
-                "test: newspaper": pd.Series(dtype="str"),
-                "precision": pd.Series(dtype="float"),
-                "recall": pd.Series(dtype="float"),
-                "f1": pd.Series(dtype="float"),
-            }
-        )
-
-    df_res_tmp = pd.DataFrame(
-        {
-            "train: newspaper": [train_spec],
-            "test: newspaper": [test_spec],
-            "precision": [prec],
-            "recall": [recall],
-            "f1": [recall],
-        }
-    )

-    df_res = pd.concat([df_res, df_res_tmp])
-    df_res.to_csv(path_results)
+    # exist_ok=True creates the folder only when missing, without a separate exists() check
+    os.makedirs("saved_models/MNB/", exist_ok=True)
+
+    timestamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
+    dump(pipe, "saved_models/MNB/" + timestamp + ".joblib")


 @click.argument("input_data")
@@ -102,70 +59,28 @@ def main(input_data: Union[str, os.PathLike]):
    :param input_data: Path to input dataframe.
    """

-    # load data and extract only german text from tagesanzeiger
-    print("Load text")
+    # Load data and extract only text from tagesanzeiger
+    print("Load and preprocess text")
    tl = TextLoader(input_data)
+    df_de = tl.load_text_csv(
+        newspaper="tagesanzeiger",
+        load_subset=False,
+        remove_duplicates=False,
+        min_num_words=3,
+    )
+    #df_de = df_de.sample(50000)

-    df_de = tl.load_text_csv(lang="de")
-
-    df_de = df_de[
-        (
-            (df_de.originTenantId == "tagesanzeiger")
-            | (df_de.originTenantId == "bazonline")
-            | (df_de.originTenantId == "derbund")
-            | (df_de.originTenantId == "bernerzeitung")
-        )
-    ]
-
-    # apply preprocessing
-    print("Preprocess text")
-    df_de_sub = df_de  # .sample(50000)
-    nlp = spacy.load("de_core_news_sm")
-
-    text_proc = df_de_sub.text.apply(preprocess, nlp=nlp)
-    df_de_sub.text = text_proc
-    # # add test: other lang/other newspaper in df
-    # add test: lowercase/spec char in text stc
-
-    newspapers = df_de_sub.originTenantId.unique()
-
-    for n_train in newspapers:
-        text_n1 = df_de_sub[df_de_sub.originTenantId == n_train].text
-        label_n1 = df_de_sub[df_de_sub.originTenantId == n_train].label
-
-        X_train, X_test, y_train, y_test = train_test_split(
-            text_n1, label_n1, stratify=label_n1
-        )
-
-        print("Train model")
-        pipe = create_pipeline()
-        pipe.fit(X_train, y_train)
-
-        for n_test in newspapers:
-            text_n2 = df_de_sub[df_de_sub.originTenantId == n_test].text
-            label_n2 = df_de_sub[df_de_sub.originTenantId == n_test].label
-
-            # train pipeline
-
-            if n_train != n_test:
-                X_test = text_n2
-                y_test = label_n2
-
-            y_pred = pipe.predict(X_test)
-
-            precision, recall, *_ = precision_recall_fscore_support(
-                y_test, y_pred, average="weighted"
-            )
-            f1 = f1_score(y_test, y_pred)
+    # Prepare data for modeling
+    text = df_de.text
+    label = df_de.label

-            save_results((n_train), (n_test), precision, recall, f1)
+    X_train, X_val, y_train, y_val = train_test_split(text, label, stratify=label)

-    print("Accuracy is:")
-    print(pipe.score(X_test, y_test))
-    print("     ")
-    print("Precision, Recall:")
-    print(precision, recall)
-    print("     ")
+    # Training
+    print("Train model")
+    pipe = create_pipeline()
+    pipe.fit(X_train, y_train)
+    save_model(pipe)


 if __name__ == "__main__":