import os
import timeit
from pathlib import Path
from typing import Union

import click
import numpy as np
import pandas as pd
import tensorflow as tf
from datasets import load_dataset
from evaluate import evaluator
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from tqdm import tqdm
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification, pipeline

from src.prepare_bert_tf import df2dict
from src.preprocessing_text import TextLoader, TextProcessor
@click.command()
@click.argument("train_logs")
def main(train_logs: Union[str, os.PathLike]):
    """
    Prepares data and evaluates a trained BERT model with TensorFlow.

    Reads the training-log CSV to recover model/data paths, runs the saved
    classifier over the matching test split in batches, and writes overall
    and per-topic metrics (precision, recall, f1, accuracy, rejection rate,
    comment counts) to ``<path_repo>/results/results_eval_BERT/<model>.csv``.

    :param train_logs: path to csv-file containing train logs
    """
    # Load logs; the first (unnamed) column holds the parameter names.
    df = pd.read_csv(train_logs, index_col="Unnamed: 0")

    path_repo = df.loc["path_repo"].values[0]
    path_model = df.loc["path_model"].values[0]
    # Evaluate on the test split corresponding to the training data.
    input_data = df.loc["input_data"].values[0].replace("train", "test")
    text_preprocessing = df.loc["text_preprocessing"].values[0]

    # Load data and extract only text from tagesanzeiger
    print("Load and preprocess text")
    tl = TextLoader(input_data)
    df_de = tl.load_text_csv(
        newspaper="tagesanzeiger",
        lang='de',
        load_subset=False,
        min_num_words=3,
    )

    # Apply the same text preprocessing that was used during training.
    if text_preprocessing:
        tp = TextProcessor()
        text_proc = tp.fit_transform(df_de.text)
        df_de.text = text_proc

    common_topics = tl.get_comments_per_topic(df_de)

    # Load tokenizer and model
    tokenizer = AutoTokenizer.from_pretrained("bert-base-german-cased")
    model = TFAutoModelForSequenceClassification.from_pretrained(
        pretrained_model_name_or_path=path_model
    )

    # Split text into batches to keep tokenization/inference memory bounded.
    n = 100  # batch size (comments per forward pass)
    text_list = list(df_de.text.values)
    batches = [text_list[idx:idx + n] for idx in range(0, len(text_list), n)]

    # Predict a label for every comment, batch by batch.
    y_pred_all = []
    for batch in tqdm(batches):
        inputs = tokenizer(batch, return_tensors="tf", padding=True, truncation=True)
        logits = model(**inputs).logits
        y_pred_batch = tf.argmax(logits, axis=1)
        y_pred_all.append(y_pred_batch)
    y_pred_all = np.concatenate(y_pred_all)

    # Overall metrics (weighted to account for class imbalance).
    precision, recall, f1, _ = precision_recall_fscore_support(
        df_de.label, y_pred_all, average='weighted'
    )
    accuracy = accuracy_score(df_de.label, y_pred_all)

    results_all = {
        "precision": precision,
        "recall": recall,
        "f1": f1,
        "accuracy": accuracy,
    }

    # Metrics per topic (restricted to the most common topics).
    topics = [t[0] for t in common_topics]
    results_t = dict()
    for t in topics:
        topic_mask = df_de.topic == t
        y_test_t = df_de[topic_mask].label
        y_pred_t = y_pred_all[topic_mask]
        precision, recall, f1, _ = precision_recall_fscore_support(
            y_test_t, y_pred_t, average='weighted'
        )
        accuracy = accuracy_score(y_test_t, y_pred_t)
        results_t[t] = {
            "accuracy": accuracy,
            "f1": f1,
            "precision": precision,
            "recall": recall,
        }

    # Compute rejection rate as a percentage
    # (mean of the label column — assumes label==1 marks a rejected comment;
    # TODO confirm against TextLoader's labeling convention).
    reject_rate_all = np.round(df_de.label.mean(), 4) * 100
    reject_rate_topic = [
        np.round(df_de[df_de.topic == k].label.mean(), 4) * 100
        for k in topics
    ]

    # Compute number comments
    num_comm_all = df_de.shape[0]
    num_comm_topic = [df_de[df_de.topic == k].shape[0] for k in topics]

    # Save results: one column per scope ("all" plus one per topic).
    df_res_all = pd.DataFrame.from_dict(
        results_all, orient="index", columns=["all"]
    )
    df_res_all.loc["rejection rate"] = reject_rate_all
    df_res_all.loc["number comments"] = num_comm_all

    df_res_topic = pd.DataFrame.from_dict(results_t)
    df_res_topic.loc["rejection rate"] = reject_rate_topic
    df_res_topic.loc["number comments"] = num_comm_topic

    df_res = df_res_all.join(df_res_topic)
    df_res.loc["data"] = [input_data] * df_res.shape[1]

    out_path = (
        Path(path_repo) / "results" / "results_eval_BERT" / (Path(path_model).stem + ".csv")
    )
    df_res.to_csv(out_path)


if __name__ == "__main__":
    main()