import os
from collections import Counter
from pathlib import Path
from typing import Union

import click
import numpy as np
import pandas as pd
from sklearn.metrics import precision_recall_fscore_support

from src.eval_utils import gen_scores_dict
from src.MNB_utils import load_model
from src.preprocessing_text import TextLoader
from src.train_logs import load_logs


# `click.command()` turns `main` into a CLI entry point, so the bare `main()`
# call in the `__main__` guard reads `train_logs` from the command line.
@click.command()
@click.argument("train_logs")
def main(train_logs: Union[str, os.PathLike]) -> None:
    """
    Prepares data and evaluates trained MNB model

    Loads the settings stored in the train logs, loads the model and the
    test data, computes weighted precision/recall/F1 and accuracy for the
    whole test set and for each of the (at most) 10 most frequent topics,
    and writes all scores together with rejection rates and comment counts
    to ``<path_repo>/results/results_eval_MNB/<model stem>.csv``.

    :param train_logs: path to csv-file containing train logs
    """
    # Load logs (the fourth entry returned by `load_logs` is not used here)
    (
        path_repo,
        path_model,
        input_data,
        _,
        newspaper,
        lang,
        topic,
        remove_duplicates,
        min_num_words,
    ) = load_logs(train_logs)

    # Load model
    pipe = load_model(path_model)

    # Load test data
    tl = TextLoader(input_data)
    df_test = tl.load_text_csv(
        newspaper=newspaper,
        lang=lang,
        topic=topic,
        load_subset=False,
        remove_duplicates=remove_duplicates,
        min_num_words=min_num_words,
    )
    X_test = df_test.text
    y_test = df_test.label

    # Make prediction
    y_pred = pipe.predict(X_test)

    # Compute scores and add to dict
    precision, recall, f1, _ = precision_recall_fscore_support(
        y_test, y_pred, average="weighted"
    )
    accuracy = pipe.score(X_test, y_test)
    results_all = gen_scores_dict(precision, recall, f1, accuracy)

    # Get results per topic (restricted to the 10 most frequent topics)
    topics = [name for name, _ in Counter(df_test["topic"]).most_common(10)]

    results_t = {}
    reject_rate_topic = []
    num_comm_topic = []
    for t in topics:
        # Filter once per topic and reuse the subset for all statistics.
        df_t = df_test[df_test.topic == t]
        X_test_t = df_t.text
        y_test_t = df_t.label

        y_pred_t = pipe.predict(X_test_t)
        precision_t, recall_t, f1_t, _ = precision_recall_fscore_support(
            y_test_t, y_pred_t, average="weighted"
        )
        accuracy_t = pipe.score(X_test_t, y_test_t)
        results_t[t] = gen_scores_dict(precision_t, recall_t, f1_t, accuracy_t)

        # Rejection rate: mean of the label column, expressed in percent
        reject_rate_topic.append(np.round(y_test_t.mean(), 4) * 100)
        # Number of comments for this topic
        num_comm_topic.append(df_t.shape[0])

    # Compute rejection rate and number of comments over the full test set
    reject_rate_all = np.round(df_test.label.mean(), 4) * 100
    num_comm_all = df_test.shape[0]

    # Save results
    df_res_all = pd.DataFrame.from_dict(results_all, orient="index", columns=["all"])
    df_res_all.loc["rejection rate"] = reject_rate_all
    df_res_all.loc["number comments"] = num_comm_all

    # Columns follow the insertion order of `results_t`, i.e. the order of
    # `topics`, which is also the order of the two per-topic lists.
    df_res_topic = pd.DataFrame.from_dict(results_t)
    df_res_topic.loc["rejection rate"] = reject_rate_topic
    df_res_topic.loc["number comments"] = num_comm_topic

    df_res = df_res_all.join(df_res_topic)
    # Record the evaluated data source in every column
    df_res.loc["data"] = [input_data] * df_res.shape[1]

    out_file = (
        Path(path_repo)
        / "results"
        / "results_eval_MNB"
        / (Path(path_model).stem + ".csv")
    )
    df_res.to_csv(out_file)


if __name__ == "__main__":
    # Script entry point: `main` is called without explicit arguments;
    # `train_logs` is declared as a click argument on `main`.
    main()