diff --git a/moderation_classifier/eval_BERT.py b/moderation_classifier/eval_BERT.py
index e1c99a14640ad6008ca857685de085b9fec844b9..b72a9d5ca7328771bcdd5ae2429cd25e9385bab7 100644
--- a/moderation_classifier/eval_BERT.py
+++ b/moderation_classifier/eval_BERT.py
@@ -50,12 +50,7 @@ def main(train_logs: Union[str, os.PathLike]):
     df_de.text = text_proc
 
     comon_topics = tl.get_comments_per_topic(df_de)
-
-    df_de = df_de[:10000]
-
-    # Load tokenizer and model
-    start = timeit.timeit()
     tokenizer = AutoTokenizer.from_pretrained("bert-base-german-cased")
     model = TFAutoModelForSequenceClassification.from_pretrained(
         pretrained_model_name_or_path=path_model
     )
@@ -102,9 +97,6 @@ def main(train_logs: Union[str, os.PathLike]):
         results_t[t]["f1"] = f1
         results_t[t]["precision"] = precision
         results_t[t]["recall"] = recall
-
-        end = timeit.timeit()
-        print(end - start)
 
     # Compute rejection rate
     reject_rate_all = np.round(df_de.label.mean(), 4) * 100
diff --git a/src/preprocessing_text.py b/src/preprocessing_text.py
index 43f38bdbc1d8e852d396c5fbd6be6ddd1a07decf..0829b04a28bd3697e6dafda6d9438c2eb0de52d9 100644
--- a/src/preprocessing_text.py
+++ b/src/preprocessing_text.py
@@ -133,12 +133,8 @@ class TextLoader(object):
 
         #df = df.rename(columns={"rejected": "label"})
         topics = Counter(df["topic"]).most_common(num_topic)
-        comm_per_topic = dict()
-        for t in topics:
-            df_topic = df[df.topic == t[0]]
-            comm_per_topic[t[0]] = df_topic
-        return comm_per_topic
+        return topics
 
     def find_duplicate_comments(self, df: pd.DataFrame) -> np.ndarray:
         """ "
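
Behavior note: after this patch, TextLoader.get_comments_per_topic returns the output of Counter(df["topic"]).most_common(num_topic) directly, i.e. a list of (topic, count) tuples instead of the old dict of per-topic DataFrames, so callers such as eval_BERT.py must slice the frame themselves. Below is a minimal sketch of the old vs. new contract; the sample data is hypothetical, only the "topic" column name comes from the diff. The sketch also notes why dropping the timing lines loses nothing: timeit.timeit() called with no arguments benchmarks the statement "pass", so end - start never measured the evaluation run; time.perf_counter() shown at the end is a suggested replacement, not part of this patch.

    from collections import Counter
    import time

    import pandas as pd

    # Hypothetical sample data; the real frame is loaded by TextLoader.
    df = pd.DataFrame({"topic": ["sport", "politik", "sport", "sport", "kultur"]})

    topics = Counter(df["topic"]).most_common(2)  # [('sport', 3), ('politik', 1)]

    # Old contract: dict mapping each frequent topic to its sub-DataFrame.
    comm_per_topic = {t: df[df.topic == t] for t, _ in topics}

    # New contract: callers receive the (topic, count) tuples and filter as needed.
    for topic, count in topics:
        subset = df[df.topic == topic]
        print(topic, count, len(subset))

    # Suggested wall-clock timing, if it is still wanted (not part of the patch):
    start = time.perf_counter()
    _ = {t: df[df.topic == t] for t, _ in topics}  # stand-in for the evaluation run
    elapsed = time.perf_counter() - start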