diff --git a/moderation_classifier/eval_BERT.py b/moderation_classifier/eval_BERT.py
index 0528ba300587cb94f498aef7c526d0028f7e549c..e1c99a14640ad6008ca857685de085b9fec844b9 100644
--- a/moderation_classifier/eval_BERT.py
+++ b/moderation_classifier/eval_BERT.py
@@ -1,6 +1,7 @@
 from datasets import load_dataset
 from evaluate import evaluator
-from transformers import pipeline
+from transformers import pipeline, AutoTokenizer, TFAutoModelForSequenceClassification
+import tensorflow as tf
 
 import click
 import evaluate
@@ -8,8 +9,12 @@ import numpy as np
 import os
 import pandas as pd
 from pathlib import Path
+import timeit
+from tqdm import tqdm
 from typing import Union
 
+from sklearn.metrics import precision_recall_fscore_support, accuracy_score
+
 from src.preprocessing_text import TextLoader, TextProcessor
 from src.prepare_bert_tf import df2dict
 
@@ -23,6 +28,7 @@ def main(train_logs: Union[str, os.PathLike]):
 
     # Load logs
     df = pd.read_csv(train_logs, index_col="Unnamed: 0")
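+    # path_repo was recorded at training time; it anchors where results are written below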
+    path_repo = df.loc["path_repo"].values[0]
     path_model = df.loc["path_model"].values[0]
     input_data = df.loc["input_data"].values[0].replace("train", "test")
     text_preprocessing = df.loc["text_preprocessing"].values[0]
@@ -32,6 +38,7 @@ def main(train_logs: Union[str, os.PathLike]):
     tl = TextLoader(input_data)
     df_de = tl.load_text_csv(
         newspaper="tagesanzeiger",
+        lang="de",
         load_subset=False,
         remove_duplicates=True,
         min_num_words=3,
@@ -41,62 +48,90 @@ def main(train_logs: Union[str, os.PathLike]):
         tp = TextProcessor()
         text_proc = tp.fit_transform(df_de.text)
         df_de.text = text_proc
-    comm_per_topic = tl.get_comments_per_topic(df_de)
+    common_topics = tl.get_comments_per_topic(df_de)
 
-    # Prepare data for evaluation
-    data = df2dict(df_de, split_data=False)
-    data_topic = {
-        k: df2dict(comm_per_topic[k], split_data=False) for k in comm_per_topic.keys()
-    }
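+    # NOTE: evaluation is capped at the first 10'000 test comments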
+    df_de = df_de[:10000]
 
-    task_evaluator = evaluator("text-classification")
-    pipe = pipeline("text-classification", model=path_model)
+    # Start a wall-clock timer for the whole evaluation
+    start = timeit.default_timer()
+
+    # Load tokenizer and the fine-tuned model
+    tokenizer = AutoTokenizer.from_pretrained("bert-base-german-cased")
+    model = TFAutoModelForSequenceClassification.from_pretrained(
+        pretrained_model_name_or_path=path_model
+    )
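+    # NOTE: the tokenizer is loaded from the base checkpoint, not from path_model;
+    # this assumes the fine-tuned model kept the bert-base-german-cased vocabulary.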
     
-    # Evaluate results for all data and per topic
-    metrics = ['accuracy', 'f1', 'precision', 'recall']
-    eval_results_all = task_evaluator.compute(
-            model_or_pipeline=pipe, 
-            data=data, 
-            metric=evaluate.combine(metrics), 
-            label_mapping={"NEGATIVE": 0, "POSITIVE": 1}
+    # Split text into batches
+    text_list = list(df_de.text.values)
+    batch_size = 500
+    batches = [
+        text_list[idx : idx + batch_size]
+        for idx in range(0, len(text_list), batch_size)
+    ]
+
+    # Predict over the full test set, batch by batch
+    y_pred_all = []
+    for batch in tqdm(batches):
+        inputs = tokenizer(batch, return_tensors="tf", padding=True, truncation=True)
+        logits = model(**inputs).logits
+        y_pred_batch = tf.argmax(logits, axis=1)
+        y_pred_all.append(y_pred_batch)
+
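+    # Stack the per-batch predictions into one array aligned with df_de rows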
+    y_pred_all = np.concatenate(y_pred_all)
+
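+    # Weighted averaging weights each class by its support, reflecting the class imbalance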
+    precision, recall, f1, _ = precision_recall_fscore_support(
+        df_de.label, y_pred_all, average="weighted"
     )
+    accuracy = accuracy_score(df_de.label, y_pred_all)
 
-    eval_results_topic = {
-        k: task_evaluator.compute(
-            model_or_pipeline=pipe,
-            data=data_topic[k],
-            metric=evaluate.combine(metrics),
-            label_mapping={"NEGATIVE": 0, "POSITIVE": 1},
+    results_all = {
+        "precision": precision,
+        "recall": recall,
+        "f1": f1,
+        "accuracy": accuracy,
+    }
+
+    # Evaluate separately for the most common topics
+    topics = [t[0] for t in common_topics]
+    results_t = {}
+
+    for t in topics:
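+        # Boolean masks over df_de pick out this topic's labels and predictions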
+        y_test_t = df_de[df_de.topic == t].label
+        y_pred_t = y_pred_all[df_de.topic == t]
+
+        precision, recall, f1, _ = precision_recall_fscore_support(
+            y_test_t, y_pred_t, average="weighted"
         )
-        for k in data_topic.keys()
-    }
+        accuracy = accuracy_score(y_test_t, y_pred_t)
+        results_t[t] = {
+            "precision": precision,
+            "recall": recall,
+            "f1": f1,
+            "accuracy": accuracy,
+        }
+
+    end = timeit.default_timer()
+    print(f"Evaluation time: {end - start:.2f} s")
 
     # Compute rejection rate
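+    # label == 1 marks a rejected comment, so the label mean is the rejection rate (scaled to %)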
     reject_rate_all = np.round(df_de.label.mean(), 4) * 100
     reject_rate_topic = [
         np.round(df_de[df_de.topic == k].label.mean(), 4) * 100
-        for k in data_topic.keys()
+        for k in topics
     ]
 
     # Compute number comments
     num_comm_all = df_de.shape[0]
-    num_comm_topic = [df_de[df_de.topic == k].shape[0] for k in data_topic.keys()]
+    num_comm_topic = [df_de[df_de.topic == k].shape[0] for k in topics]
 
     # Save results
     df_res_all = pd.DataFrame().from_dict(
-        eval_results_all, orient="index", columns=["all"]
+        results_all, orient="index", columns=["all"]
     )
     df_res_all.loc["rejection rate"] = reject_rate_all
     df_res_all.loc["number comments"] = num_comm_all
 
-    df_res_topic = pd.DataFrame.from_dict(eval_results_topic)
+    df_res_topic = pd.DataFrame.from_dict(results_t)
     df_res_topic.loc["rejection rate"] = reject_rate_topic
     df_res_topic.loc["number comments"] = num_comm_topic
 
     df_res = df_res_all.join(df_res_topic)
     df_res.loc["data"] = [input_data] * df_res.shape[1]
 
-    df_res.to_csv("results/results_eval_BERT/" + Path(path_model).stem + ".csv")
+    df_res.to_csv(
+        Path(path_repo) / "results" / "results_eval_BERT" / (Path(path_model).stem + ".csv")
+    )
 
 
 if __name__ == "__main__":