Skip to content
Snippets Groups Projects
Commit 4c110768 authored by Franziska Oschmann's avatar Franziska Oschmann
Browse files

moderation_classifier/train_MNB.py: Clean code

parent e8e6d084
No related branches found
No related tags found
1 merge request!2Dev train models
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, precision_recall_fscore_support
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
import click
import datetime
from joblib import dump
from nltk.corpus import stopwords
import pandas as pd
import spacy
......@@ -15,37 +16,14 @@ import os
from src.preprocessing_text import TextLoader, TextProcessor
# ToDo
# clean code
# add conditional storage of results (check if a row for the train-test pair exists and store the results in that row)
# train on different corpora and generate a table with predictions
# notebook with example of preprocessing?
# Tests
def preprocess(text: str, nlp) -> str:
    """
    Runs the text cleaning steps of TextProcessor on a single text
    :param text: Input text
    :param nlp: Loaded nlp model, handed to TextProcessor
    :return: Text after all cleaning steps were applied
    """
    processor = TextProcessor(nlp)
    # Steps are applied in this order; lemmatization (lemmatize_text) is
    # currently left out of the chain.
    steps = (
        processor.remove_spaces,
        processor.remove_punctuation,
        processor.remove_mentions,
        processor.fold_case,
    )
    cleaned = text
    for step in steps:
        cleaned = step(cleaned)
    return cleaned
def create_pipeline():
"""
Creates classification pipeline
"""
# define preprocessor
tp = TextProcessor()
# define vectorizer
stop_words_ge = stopwords.words("german")
vectorizer = TfidfVectorizer(
......@@ -56,43 +34,22 @@ def create_pipeline():
mnb = MultinomialNB(alpha=0.01)
# set pipeline
pipe = Pipeline([("vectorizer", vectorizer), ("mnb", mnb)])
pipe = Pipeline([("processor", tp), ("vectorizer", vectorizer), ("mnb", mnb)])
return pipe
def save_results(
    train_spec: str, test_spec: str, prec: float, recall: float, f1: float
) -> None:
    """
    Saves results to csv file
    Appends one row to "MNB_results.csv" in the current working directory;
    the file is created if it does not exist yet.
    :param train_spec: Value for the "train: newspaper" column
    :param test_spec: Value for the "test: newspaper" column
    :param prec: Precision score
    :param recall: Recall score
    :param f1: F1 score
    """
    path_results = "MNB_results.csv"

    if os.path.exists(path_results):
        # to_csv writes the index without a header, pandas reads that
        # column back under the name "Unnamed: 0"
        df_res = pd.read_csv(path_results, index_col="Unnamed: 0")
    else:
        df_res = pd.DataFrame(
            {
                "train: newspaper": pd.Series(dtype="str"),
                "test: newspaper": pd.Series(dtype="str"),
                "precision": pd.Series(dtype="float"),
                "recall": pd.Series(dtype="float"),
                "f1": pd.Series(dtype="float"),
            }
        )

    df_res_tmp = pd.DataFrame(
        {
            "train: newspaper": [train_spec],
            "test: newspaper": [test_spec],
            "precision": [prec],
            "recall": [recall],
            # store the f1 score itself (the recall value was written here before)
            "f1": [f1],
        }
    )

    df_res = pd.concat([df_res, df_res_tmp])
    df_res.to_csv(path_results)


def save_model(pipe: Pipeline):
    """
    Saves trained model
    The pipeline is dumped to "saved_models/MNB/<timestamp>.joblib".
    :param pipe: Trained pipeline
    """
    model_dir = "saved_models/MNB/"
    # exist_ok replaces the separate os.path.exists check
    os.makedirs(model_dir, exist_ok=True)

    timestamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
    dump(pipe, model_dir + timestamp + ".joblib")
@click.argument("input_data")
......@@ -102,70 +59,28 @@ def main(input_data: Union[str, os.PathLike]):
:param input_data: Path to input dataframe.
"""
# load data and extract only german text from tagesanzeiger
print("Load text")
# Load data and extract only text from tagesanzeiger
print("Load and preprocess text")
tl = TextLoader(input_data)
df_de = tl.load_text_csv(
newspaper="tagesanzeiger",
load_subset=False,
remove_duplicates=False,
min_num_words=3,
)
#df_de = df_de.sample(50000)
df_de = tl.load_text_csv(lang="de")
df_de = df_de[
(
(df_de.originTenantId == "tagesanzeiger")
| (df_de.originTenantId == "bazonline")
| (df_de.originTenantId == "derbund")
| (df_de.originTenantId == "bernerzeitung")
)
]
# apply preprocessing
print("Preprocess text")
df_de_sub = df_de # .sample(50000)
nlp = spacy.load("de_core_news_sm")
text_proc = df_de_sub.text.apply(preprocess, nlp=nlp)
df_de_sub.text = text_proc
# # add test: other lang/other newspaper in df
# add test: lowercase/spec char in text stc
newspapers = df_de_sub.originTenantId.unique()
for n_train in newspapers:
text_n1 = df_de_sub[df_de_sub.originTenantId == n_train].text
label_n1 = df_de_sub[df_de_sub.originTenantId == n_train].label
X_train, X_test, y_train, y_test = train_test_split(
text_n1, label_n1, stratify=label_n1
)
print("Train model")
pipe = create_pipeline()
pipe.fit(X_train, y_train)
for n_test in newspapers:
text_n2 = df_de_sub[df_de_sub.originTenantId == n_test].text
label_n2 = df_de_sub[df_de_sub.originTenantId == n_test].label
# train pipeline
if n_train != n_test:
X_test = text_n2
y_test = label_n2
y_pred = pipe.predict(X_test)
precision, recall, *_ = precision_recall_fscore_support(
y_test, y_pred, average="weighted"
)
f1 = f1_score(y_test, y_pred)
# Prepare data for modeling
text = df_de.text
label = df_de.label
save_results((n_train), (n_test), precision, recall, f1)
X_train, X_val, y_train, y_val = train_test_split(text, label, stratify=label)
print("Accuracy is:")
print(pipe.score(X_test, y_test))
print(" ")
print("Precision, Recall:")
print(precision, recall)
print(" ")
# Training
print("Train model")
pipe = create_pipeline()
pipe.fit(X_train, y_train)
save_model(pipe)
if __name__ == "__main__":
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment