import os
from collections import Counter
from pathlib import Path
from typing import Union

import click
import numpy as np
import pandas as pd
from joblib import load
from sklearn.metrics import precision_recall_fscore_support

from src.preprocessing_text import TextLoader


def load_model(path):
    """
    Loads trained model from a joblib file
    """
    pipe = load(path)
    return pipe


@click.command()
@click.argument("train_logs")
def main(train_logs: Union[str, os.PathLike]):
"""
Prepares data and evaluates trained MNB model
:param train_logs: path to csv-file containing train logs
"""
    # Load logs
    df = pd.read_csv(train_logs, index_col="Unnamed: 0")
    path_model = df.loc["path_model"].values[0]
    input_data = df.loc["input_data"].values[0].replace("train", "test")

    # Load model
    pipe = load_model(path_model)

    # Load test data
    tl = TextLoader(input_data)
    df_test = tl.load_text_csv(
        newspaper="tagesanzeiger",
        load_subset=False,
        remove_duplicates=False,
        min_num_words=3,
    )
    X_test = df_test.text
    y_test = df_test.label

    # Make prediction on the full test set
    y_pred = pipe.predict(X_test)

    # Compute overall metrics (weighted across classes)
    precision, recall, f1, _ = precision_recall_fscore_support(
        y_test, y_pred, average="weighted"
    )
    results_all = dict()
    results_all["precision"] = precision
    results_all["recall"] = recall
    results_all["f1"] = f1

    # Get results per topic (10 most frequent topics)
    count_topics = Counter(df_test["topic"]).most_common(10)
    topics = [t[0] for t in count_topics]
    results_t = dict()
    for t in topics:
        X_test_t = df_test[df_test.topic == t].text
        y_test_t = df_test[df_test.topic == t].label
        y_pred_t = pipe.predict(X_test_t)
        precision, recall, f1, _ = precision_recall_fscore_support(
            y_test_t, y_pred_t, average="weighted"
        )
        accuracy = pipe.score(X_test_t, y_test_t)
        results_t[t] = dict()
        results_t[t]["accuracy"] = accuracy
        results_t[t]["f1"] = f1
results_t[t]["precision"] = precision
results_t[t]["recall"] = recall

    # Compute rejection rate
    reject_rate_all = np.round(df_test.label.mean(), 4) * 100
    reject_rate_topic = [
        np.round(df_test[df_test.topic == k].label.mean(), 4) * 100 for k in topics
    ]

    # Compute number of comments
    num_comm_all = df_test.shape[0]
    num_comm_topic = [df_test[df_test.topic == k].shape[0] for k in topics]

    # Save results
    df_res_all = pd.DataFrame.from_dict(results_all, orient="index", columns=["all"])
    df_res_all.loc["rejection rate"] = reject_rate_all
    df_res_all.loc["number comments"] = num_comm_all
    df_res_topic = pd.DataFrame.from_dict(results_t)
    df_res_topic.loc["rejection rate"] = reject_rate_topic
    df_res_topic.loc["number comments"] = num_comm_topic
    df_res = df_res_all.join(df_res_topic)
    df_res.loc["data"] = [input_data] * df_res.shape[1]
    df_res.to_csv("results/results_eval_MNB/" + Path(path_model).stem + ".csv")


if __name__ == "__main__":
    main()
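

# Example invocation (a sketch only: the script name and the log-file path below are
# hypothetical and depend on where the training step writes its log CSV):
#   python evaluate_mnb.py results/train_logs/<run_name>.csv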