Skip to content
Snippets Groups Projects
Commit 9fcfed88 authored by Franziska Oschmann
Browse files

Only return most common topics

parent c70cbc71
No related branches found
No related tags found
1 merge request: !2 "Dev train models"
......@@ -50,12 +50,7 @@ def main(train_logs: Union[str, os.PathLike]):
df_de.text = text_proc
comon_topics = tl.get_comments_per_topic(df_de)
df_de = df_de[:10000]
# Load tokenizer and model
start = timeit.timeit()
tokenizer = AutoTokenizer.from_pretrained("bert-base-german-cased")
model = TFAutoModelForSequenceClassification.from_pretrained(pretrained_model_name_or_path=path_model)
......@@ -102,9 +97,6 @@ def main(train_logs: Union[str, os.PathLike]):
results_t[t]["f1"] = f1
results_t[t]["precision"] = precision
results_t[t]["recall"] = recall
end = timeit.timeit()
print(end - start)
# Compute rejection rate
reject_rate_all = np.round(df_de.label.mean(), 4) * 100
......
......@@ -133,12 +133,8 @@ class TextLoader(object):
#df = df.rename(columns={"rejected": "label"})
topics = Counter(df["topic"]).most_common(num_topic)
comm_per_topic = dict()
for t in topics:
df_topic = df[df.topic == t[0]]
comm_per_topic[t[0]] = df_topic
return comm_per_topic
return topics
def find_duplicate_comments(self, df: pd.DataFrame) -> np.ndarray:
""" "
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment