Only return most common topics

9fcfed88 · Franziska Oschmann · c70cbc71 · 9fcfed88 · 9fcfed88
Commit 9fcfed88 authored 1 year ago by Franziska Oschmann
--- a/moderation_classifier/eval_BERT.py
+++ b/moderation_classifier/eval_BERT.py
@@ -50,12 +50,7 @@ def main(train_logs: Union[str, os.PathLike]):
        df_de.text = text_proc
    comon_topics = tl.get_comments_per_topic(df_de)

-
-    df_de = df_de[:10000]
-
-
    # Load tokenizer and model
-    start = timeit.timeit()
    tokenizer = AutoTokenizer.from_pretrained("bert-base-german-cased")
    model = TFAutoModelForSequenceClassification.from_pretrained(pretrained_model_name_or_path=path_model)
    
@@ -102,9 +97,6 @@ def main(train_logs: Union[str, os.PathLike]):
        results_t[t]["f1"] = f1
        results_t[t]["precision"] = precision
        results_t[t]["recall"] = recall
-    
-    end = timeit.timeit()
-    print(end - start)

    # Compute rejection rate
    reject_rate_all = np.round(df_de.label.mean(), 4) * 100

--- a/src/preprocessing_text.py
+++ b/src/preprocessing_text.py
@@ -133,12 +133,8 @@ class TextLoader(object):
        #df = df.rename(columns={"rejected": "label"})

        topics = Counter(df["topic"]).most_common(num_topic)
-        comm_per_topic = dict()
-        for t in topics:
-            df_topic = df[df.topic == t[0]]
-            comm_per_topic[t[0]] = df_topic

-        return comm_per_topic
+        return topics

    def find_duplicate_comments(self, df: pd.DataFrame) -> np.ndarray:
        """ "