Skip to content
Snippets Groups Projects
Commit 828cec5f authored by Franziska Oschmann's avatar Franziska Oschmann
Browse files

MNB: Add saving of logs to training and load them in eval-script

parent 4c110768
No related branches found
No related tags found
1 merge request!2Dev train models
import click
from collections import Counter
from joblib import load
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.metrics import f1_score, precision_recall_fscore_support
from typing import Union
import os
from src.preprocessing_text import TextLoader
def load_model(path):
    """
    Load a previously trained model pipeline from disk.

    :param path: path to the joblib file written by the train script
    :return: the deserialized, fitted pipeline
    """
    return load(path)
@click.command()
@click.argument("train_logs")
def main(train_logs: Union[str, os.PathLike]):
    """
    Prepares data and evaluates a trained MNB model.

    Reads the training logs written by the train script, loads the referenced
    model and the matching test split, and writes overall and per-topic
    metrics to results/results_eval_MNB/<model-name>.csv.

    :param train_logs: path to csv-file containing train logs
    """
    # Load logs: one value per row, indexed by the log key.
    df = pd.read_csv(train_logs, index_col="Unnamed: 0")
    # The train script's save_logs() stores the model path under "path_model".
    path_model = df.loc["path_model"].values[0]
    # Test data is the train csv with "train" replaced by "test" in its name.
    input_data = df.loc["input_data"].values[0].replace("train", "test")

    # Load model
    pipe = load_model(path_model)

    # Load test data
    tl = TextLoader(input_data)
    df_test = tl.load_text_csv(
        newspaper="tagesanzeiger",
        load_subset=False,
        remove_duplicates=False,
        min_num_words=3,
    )
    X_test = df_test.text
    y_test = df_test.label

    # Overall metrics on the full test set.
    y_pred = pipe.predict(X_test)
    precision, recall, *_ = precision_recall_fscore_support(
        y_test, y_pred, average="weighted"
    )
    # NOTE(review): f1_score uses its default (binary) average while
    # precision/recall are weighted -- kept as-is to preserve behavior.
    results_all = {
        "precision": precision,
        "recall": recall,
        "f1": f1_score(y_test, y_pred),
        "score": pipe.score(X_test, y_test),
    }

    # Metrics per topic, restricted to the ten most frequent topics.
    topics = [t for t, _ in Counter(df_test["topic"]).most_common(10)]
    results_t = dict()
    for topic in topics:
        in_topic = df_test.topic == topic
        X_test_t = df_test[in_topic].text
        y_test_t = df_test[in_topic].label
        y_pred_t = pipe.predict(X_test_t)
        precision, recall, *_ = precision_recall_fscore_support(
            y_test_t, y_pred_t, average="weighted"
        )
        results_t[topic] = {
            "precision": precision,
            "recall": recall,
            "f1": f1_score(y_test_t, y_pred_t),
            "score": pipe.score(X_test_t, y_test_t),
        }

    # Rejection rate in percent (assumes label==1 marks a rejected
    # comment -- TODO confirm against TextLoader).
    reject_rate_all = np.round(df_test.label.mean(), 4) * 100
    reject_rate_topic = [
        np.round(df_test[df_test.topic == k].label.mean(), 4) * 100 for k in topics
    ]

    # Number of comments overall and per topic.
    num_comm_all = df_test.shape[0]
    num_comm_topic = [df_test[df_test.topic == k].shape[0] for k in topics]

    # Assemble one results table: first column "all", then one column per topic.
    df_res_all = pd.DataFrame().from_dict(results_all, orient="index", columns=["all"])
    df_res_all.loc["rejection rate"] = reject_rate_all
    df_res_all.loc["number comments"] = num_comm_all

    df_res_topic = pd.DataFrame.from_dict(results_t)
    df_res_topic.loc["rejection rate"] = reject_rate_topic
    df_res_topic.loc["number comments"] = num_comm_topic

    df_res = df_res_all.join(df_res_topic)
    df_res.loc["data"] = [input_data] * df_res.shape[1]

    # Make sure the output directory exists before writing.
    out_dir = Path("results/results_eval_MNB")
    out_dir.mkdir(parents=True, exist_ok=True)
    df_res.to_csv(out_dir / (Path(path_model).stem + ".csv"))
# Script entry point: delegate to the click CLI command.
if __name__ == "__main__":
    main()
......@@ -7,6 +7,7 @@ import click
import datetime
from joblib import dump
from nltk.corpus import stopwords
from pathlib import Path
import pandas as pd
import spacy
......@@ -39,17 +40,53 @@ def create_pipeline():
return pipe
def create_path() -> Union[str, os.PathLike]:
    """
    Creates a timestamped path under saved_models/MNB/ for storing a trained model.

    :return: Path of the form saved_models/MNB/<YYYYmmdd-HHMMSS>.joblib
    """
    model_dir = Path("saved_models/MNB")
    # Create the target directory on first use; no-op when it already exists.
    model_dir.mkdir(parents=True, exist_ok=True)
    # Timestamp makes every training run write to a fresh file.
    timestamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
    return model_dir / f"{timestamp}.joblib"
def save_model(pipe: Pipeline, path: Union[str, os.PathLike]) -> None:
    """
    Saves trained model

    :param pipe: Trained pipeline
    :param path: Destination file for the serialized pipeline
        (as produced by create_path)
    """
    dump(pipe, path)
def save_logs(
    path_repo: Union[str, os.PathLike],
    path: Union[str, os.PathLike],
    input_data: Union[str, os.PathLike],
    text_preprocessing: bool,
    val_score: float,
):
    """
    Saves training logs which can be used during evaluation.

    Writes one csv per model file to <path_repo>/saved_models/MNB_logs/,
    named after the model file's stem.

    :param path_repo: Path to repository
    :param path: Path to trained model
    :param input_data: Path to used train data
    :param text_preprocessing: Boolean flag whether preprocessing was used or not
    :param val_score: Score of the trained pipeline on the validation split
    """
    # Accept plain strings as well as Path objects, as the annotations promise.
    path = Path(path)
    logs = {
        "path_model": path,
        "input_data": input_data,
        "text_preprocessing": text_preprocessing,
        "val_score": val_score,
    }
    path_logs = Path(path_repo).joinpath("saved_models/MNB_logs/")
    path_logs.mkdir(parents=True, exist_ok=True)
    df_logs = pd.DataFrame.from_dict(logs, orient="index", columns=["logs"])
    # One row per log key; the eval script reads these back via df.loc[<key>].
    df_logs.to_csv(path_logs.joinpath(path.stem).with_suffix(".csv"))
@click.argument("input_data")
......@@ -59,16 +96,22 @@ def main(input_data: Union[str, os.PathLike]):
:param input_data: Path to input dataframe.
"""
# Extract path
p = Path(input_data)
p_repo = p.parent.parent
# Load data and extract only text from tagesanzeiger
print("Load and preprocess text")
remove_duplicates = False
min_num_words = 3
tl = TextLoader(input_data)
df_de = tl.load_text_csv(
newspaper="tagesanzeiger",
load_subset=False,
remove_duplicates=False,
min_num_words=3,
remove_duplicates=remove_duplicates,
min_num_words=min_num_words,
)
#df_de = df_de.sample(50000)
df_de = df_de.sample(50)
# Prepare data for modeling
text = df_de.text
......@@ -80,7 +123,18 @@ def main(input_data: Union[str, os.PathLike]):
print("Train model")
pipe = create_pipeline()
pipe.fit(X_train, y_train)
save_model(pipe)
val_score = pipe.score(X_val, y_val)
# Save model and training logs
path = create_path()
save_model(pipe, path)
save_logs(
path_repo=p_repo,
path=path,
input_data=input_data,
text_preprocessing=True,
val_score=val_score,
)
if __name__ == "__main__":
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment