import os
from collections import Counter
from pathlib import Path
from typing import Union

import click
import numpy as np
import pandas as pd
from sklearn.metrics import precision_recall_fscore_support

from src.MNB_utils import load_model
from src.preprocessing_text import TextLoader
from src.train_logs import load_logs
from src.eval_utils import gen_scores_dict

# Number of most frequent topics that get their own column in the results.
NUM_TOP_TOPICS = 10


def _score_subset(pipe, texts, labels):
    """
    Scores the loaded model on one set of texts

    :param pipe: object returned by ``load_model``; must provide ``predict``
        and ``score``
    :param texts: texts to classify
    :param labels: true labels aligned with ``texts``
    :return: result of ``gen_scores_dict`` for the weighted precision, recall
        and F1 together with the value returned by ``pipe.score``
    """
    predictions = pipe.predict(texts)
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, predictions, average="weighted"
    )
    # NOTE(review): pipe.score is treated as accuracy here - this holds for
    # scikit-learn classifiers; confirm for the object load_model returns.
    accuracy = pipe.score(texts, labels)
    return gen_scores_dict(precision, recall, f1, accuracy)


@click.argument("train_logs")
def main(train_logs: Union[str, os.PathLike]) -> None:
    """
    Prepares data and evaluates trained MNB model

    Scores are computed on the whole test set and separately for each of the
    ``NUM_TOP_TOPICS`` most frequent topics. The result table is written to
    ``<path_repo>/results/results_eval_MNB/<model file stem>.csv``; the target
    directory is created if it does not exist.

    :param train_logs: path to csv-file containing train logs
    """
    # Load logs
    (
        path_repo,
        path_model,
        input_data,
        _,
        newspaper,
        lang,
        topic,
        remove_duplicates,
        min_num_words,
    ) = load_logs(train_logs)

    # Load model
    pipe = load_model(path_model)

    # Load test data
    tl = TextLoader(input_data)
    df_test = tl.load_text_csv(
        newspaper=newspaper,
        lang=lang,
        topic=topic,
        load_subset=False,
        remove_duplicates=remove_duplicates,
        min_num_words=min_num_words,
    )

    # Scores on the full test set
    results_all = _score_subset(pipe, df_test.text, df_test.label)

    # Slice the test set once per topic and reuse the slice for scores,
    # rejection rate and comment count.
    topics = [t for t, _ in Counter(df_test["topic"]).most_common(NUM_TOP_TOPICS)]
    topic_frames = {t: df_test[df_test.topic == t] for t in topics}
    results_t = {
        t: _score_subset(pipe, subset.text, subset.label)
        for t, subset in topic_frames.items()
    }

    # Compute rejection rate (mean of the label column, in percent)
    # NOTE(review): presumably label == 1 marks a rejected comment - confirm.
    reject_rate_all = np.round(df_test.label.mean(), 4) * 100
    reject_rate_topic = [
        np.round(subset.label.mean(), 4) * 100 for subset in topic_frames.values()
    ]

    # Compute number comments
    num_comm_all = df_test.shape[0]
    num_comm_topic = [subset.shape[0] for subset in topic_frames.values()]

    # Assemble results: one "all" column followed by one column per topic
    df_res_all = pd.DataFrame.from_dict(results_all, orient="index", columns=["all"])
    df_res_all.loc["rejection rate"] = reject_rate_all
    df_res_all.loc["number comments"] = num_comm_all
    df_res_topic = pd.DataFrame.from_dict(results_t)
    df_res_topic.loc["rejection rate"] = reject_rate_topic
    df_res_topic.loc["number comments"] = num_comm_topic
    df_res = df_res_all.join(df_res_topic)
    df_res.loc["data"] = [input_data] * df_res.shape[1]

    # Save results; pathlib accepts str and path-like inputs, and the
    # directory is created so a fresh checkout does not fail on to_csv.
    out_dir = Path(path_repo) / "results" / "results_eval_MNB"
    out_dir.mkdir(parents=True, exist_ok=True)
    df_res.to_csv(out_dir / (Path(path_model).stem + ".csv"))


if __name__ == "__main__":
    # ``main`` only carries click parameter metadata; it has to be wrapped in
    # a command to parse sys.argv. Wrapping here (instead of decorating the
    # function) keeps ``main(path)`` directly callable for importers.
    click.command()(main)()