diff --git a/moderation_classifier/eval_BERT.py b/moderation_classifier/eval_BERT.py
index 0528ba300587cb94f498aef7c526d0028f7e549c..e1c99a14640ad6008ca857685de085b9fec844b9 100644
--- a/moderation_classifier/eval_BERT.py
+++ b/moderation_classifier/eval_BERT.py
@@ -1,6 +1,7 @@
 from datasets import load_dataset
 from evaluate import evaluator
-from transformers import pipeline
+from transformers import pipeline, AutoTokenizer, TFAutoModelForSequenceClassification
+import tensorflow as tf
 import click
 import evaluate
 
@@ -8,8 +9,12 @@
 import numpy as np
 import os
 import pandas as pd
 from pathlib import Path
+import timeit
+from tqdm import tqdm
 from typing import Union
+from sklearn.metrics import precision_recall_fscore_support, accuracy_score
+
 
 from src.preprocessing_text import TextLoader, TextProcessor
 from src.prepare_bert_tf import df2dict
@@ -23,6 +28,7 @@ def main(train_logs: Union[str, os.PathLike]):
 
     # Load logs
     df = pd.read_csv(train_logs, index_col="Unnamed: 0")
+    path_repo = df.loc["path_repo"].values[0]
    path_model = df.loc["path_model"].values[0]
     input_data = df.loc["input_data"].values[0].replace("train", "test")
     text_preprocessing = df.loc["text_preprocessing"].values[0]
@@ -32,6 +38,7 @@ def main(train_logs: Union[str, os.PathLike]):
     tl = TextLoader(input_data)
     df_de = tl.load_text_csv(
         newspaper="tagesanzeiger",
+        lang="de",
         load_subset=False,
         remove_duplicates=True,
         min_num_words=3,
@@ -41,62 +48,90 @@
     tp = TextProcessor()
     text_proc = tp.fit_transform(df_de.text)
     df_de.text = text_proc
-    comm_per_topic = tl.get_comments_per_topic(df_de)
+    common_topics = tl.get_comments_per_topic(df_de)
+
-    # Prepare data for evaluation
-    data = df2dict(df_de, split_data=False)
-    data_topic = {
-        k: df2dict(comm_per_topic[k], split_data=False) for k in comm_per_topic.keys()
-    }
+    df_de = df_de[:10000]  # evaluate on the first 10,000 comments only
 
-    task_evaluator = evaluator("text-classification")
-    pipe = pipeline("text-classification", model=path_model)
+
+    # Load tokenizer and model
+    start = timeit.default_timer()
+    tokenizer = AutoTokenizer.from_pretrained("bert-base-german-cased")
+    model = TFAutoModelForSequenceClassification.from_pretrained(pretrained_model_name_or_path=path_model)
 
-    # Evaluate results for all data and per topic
-    metrics = ['accuracy', 'f1', 'precision', 'recall']
-    eval_results_all = task_evaluator.compute(
-        model_or_pipeline=pipe,
-        data=data,
-        metric=evaluate.combine(metrics),
-        label_mapping={"NEGATIVE": 0, "POSITIVE": 1}
+    # Split text into batches
+    text_list = list(df_de.text.values)
+    n = 500  # batch size for inference
+    batches = [text_list[idx:idx + n] for idx in range(0, len(text_list), n)]
+
+    # Evaluate all comments
+    y_pred_all = []
+    for batch in tqdm(batches):
+        inputs = tokenizer(batch, return_tensors="tf", padding=True, truncation=True)
+        logits = model(**inputs).logits
+        y_pred_batch = tf.argmax(logits, axis=1)
+        y_pred_all.append(y_pred_batch)
+
+    y_pred_all = np.concatenate(y_pred_all)
+
+    precision, recall, f1, _ = precision_recall_fscore_support(
+        df_de.label, y_pred_all, average="weighted"
     )
+    accuracy = accuracy_score(df_de.label, y_pred_all)
 
-    eval_results_topic = {
-        k: task_evaluator.compute(
-            model_or_pipeline=pipe,
-            data=data_topic[k],
-            metric=evaluate.combine(metrics),
-            label_mapping={"NEGATIVE": 0, "POSITIVE": 1},
+    results_all = dict()
+    results_all["precision"] = precision
+    results_all["recall"] = recall
+    results_all["f1"] = f1
+    results_all["accuracy"] = accuracy
+
+    # Evaluate per topic
+    topics = [t[0] for t in common_topics]
+    results_t = dict()
+
+    for t in topics:
+        y_test_t = df_de[df_de.topic == t].label
+        y_pred_t = y_pred_all[df_de.topic == t]
+
+        precision, recall, f1, _ = precision_recall_fscore_support(
+            y_test_t, y_pred_t, average="weighted"
         )
-        for k in data_topic.keys()
-    }
+        accuracy = accuracy_score(y_test_t, y_pred_t)
+        results_t[t] = dict()
+        results_t[t]["accuracy"] = accuracy
+        results_t[t]["f1"] = f1
+        results_t[t]["precision"] = precision
+        results_t[t]["recall"] = recall
+
+    end = timeit.default_timer()
+    print(f"Evaluation time: {end - start:.1f}s")
 
     # Compute rejection rate
     reject_rate_all = np.round(df_de.label.mean(), 4) * 100
     reject_rate_topic = [
         np.round(df_de[df_de.topic == k].label.mean(), 4) * 100
-        for k in data_topic.keys()
+        for k in topics
     ]
 
     # Compute number comments
     num_comm_all = df_de.shape[0]
-    num_comm_topic = [df_de[df_de.topic == k].shape[0] for k in data_topic.keys()]
+    num_comm_topic = [df_de[df_de.topic == k].shape[0] for k in topics]
 
     # Save results
     df_res_all = pd.DataFrame().from_dict(
-        eval_results_all, orient="index", columns=["all"]
+        results_all, orient="index", columns=["all"]
    )
     df_res_all.loc["rejection rate"] = reject_rate_all
     df_res_all.loc["number comments"] = num_comm_all
 
-    df_res_topic = pd.DataFrame.from_dict(eval_results_topic)
+    df_res_topic = pd.DataFrame.from_dict(results_t)
     df_res_topic.loc["rejection rate"] = reject_rate_topic
     df_res_topic.loc["number comments"] = num_comm_topic
 
     df_res = df_res_all.join(df_res_topic)
     df_res.loc["data"] = [input_data] * df_res.shape[1]
 
-    df_res.to_csv("results/results_eval_BERT/" + Path(path_model).stem + ".csv")
+    df_res.to_csv(path_repo + "/results/results_eval_BERT/" + Path(path_model).stem + ".csv")
 
 
 if __name__ == "__main__":
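Note: the diff replaces the `evaluator`/`pipeline` API with a manual tokenize-predict loop. Below is a minimal, self-contained sketch of that batched-inference pattern, assuming a fine-tuned two-label checkpoint; the path "model/bert_fine_tuned", the toy comments, and the label convention are hypothetical placeholders, not part of this change.

    # Sketch of the batched TF inference pattern used in eval_BERT.py.
    # "model/bert_fine_tuned" and the toy data are placeholders.
    import numpy as np
    import tensorflow as tf
    from sklearn.metrics import accuracy_score, precision_recall_fscore_support
    from transformers import AutoTokenizer, TFAutoModelForSequenceClassification

    tokenizer = AutoTokenizer.from_pretrained("bert-base-german-cased")
    model = TFAutoModelForSequenceClassification.from_pretrained("model/bert_fine_tuned")

    texts = ["Guter Beitrag, danke.", "So ein Unsinn!"]  # toy comments
    labels = [0, 1]                                      # 0 = accept, 1 = reject

    preds = []
    for i in range(0, len(texts), 500):  # batch size 500, as in the script
        inputs = tokenizer(texts[i:i + 500], return_tensors="tf",
                           padding=True, truncation=True)
        logits = model(**inputs).logits                  # shape (batch_size, 2)
        preds.append(tf.argmax(logits, axis=1).numpy())
    preds = np.concatenate(preds)

    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average="weighted"
    )
    print(accuracy_score(labels, preds), precision, recall, f1)

Padding per batch rather than over the whole corpus keeps each tensor only as wide as that batch's longest comment, which bounds peak memory during inference.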