From 2cd7c834770f84163b1ed00d8eb7956104c0597c Mon Sep 17 00:00:00 2001
From: Franziska Oschmann <franziskaoschmann@staff-net-oct-dock-1-a-dhcp-100.intern.ethz.ch>
Date: Thu, 13 Jul 2023 11:24:28 +0200
Subject: [PATCH] Smaller changes in train scripts

---
 moderation_classifier/train_BERT.py | 1 +
 moderation_classifier/train_MNB.py  | 6 +++---
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/moderation_classifier/train_BERT.py b/moderation_classifier/train_BERT.py
index eca1f85..1fd2ee1 100644
--- a/moderation_classifier/train_BERT.py
+++ b/moderation_classifier/train_BERT.py
@@ -32,6 +32,7 @@ def save_logs(
     :param text_preprocessing: Boolean flag whether preprocessing was used or not
     """
     logs = dict()
+    logs["path_repo"] = path_repo
     logs["path_model"] = path
     logs["input_data"] = input_data
     logs["text_preprocessing"] = text_preprocessing
diff --git a/moderation_classifier/train_MNB.py b/moderation_classifier/train_MNB.py
index 7d81813..5b1a5e3 100644
--- a/moderation_classifier/train_MNB.py
+++ b/moderation_classifier/train_MNB.py
@@ -32,7 +32,7 @@ def create_pipeline():
     )
 
     # define model
-    mnb = MultinomialNB(alpha=0.01)
+    mnb = MultinomialNB(alpha=0.1)
 
     # set pipeline
     pipe = Pipeline([("processor", tp), ("vectorizer", vectorizer), ("mnb", mnb)])
@@ -102,16 +102,16 @@ def main(input_data: Union[str, os.PathLike]):
 
     # Load data and extract only text from tagesanzeiger
     print("Load and preprocess text")
-    remove_duplicates = False
+    remove_duplicates = True
     min_num_words = 3
     tl = TextLoader(input_data)
     df_de = tl.load_text_csv(
         newspaper="tagesanzeiger",
+        lang='de',
         load_subset=False,
         remove_duplicates=remove_duplicates,
         min_num_words=min_num_words,
     )
-    df_de = df_de.sample(50)
 
     # Prepare data for modeling
     text = df_de.text
-- 
GitLab