diff --git a/moderation_classifier/train_BERT.py b/moderation_classifier/train_BERT.py
index eca1f859b5d11d1182657e0b014cf9f97889bad4..1fd2ee1596b6d9cb70d0d2815e4a47a65339c5f2 100644
--- a/moderation_classifier/train_BERT.py
+++ b/moderation_classifier/train_BERT.py
@@ -32,6 +32,7 @@ def save_logs(
     :param text_preprocessing: Boolean flag whether preprocessing was used or not
     """
     logs = dict()
+    logs["path_repo"] = path_repo
     logs["path_model"] = path
     logs["input_data"] = input_data
     logs["text_preprocessing"] = text_preprocessing
diff --git a/moderation_classifier/train_MNB.py b/moderation_classifier/train_MNB.py
index 7d818130a767d77dca87e122ed7365e0adbc4763..5b1a5e31545d5d02147ace854474193a9dee6fb8 100644
--- a/moderation_classifier/train_MNB.py
+++ b/moderation_classifier/train_MNB.py
@@ -32,7 +32,7 @@ def create_pipeline():
     )
 
     # define model
-    mnb = MultinomialNB(alpha=0.01)
+    mnb = MultinomialNB(alpha=0.1)
 
     # set pipeline
     pipe = Pipeline([("processor", tp), ("vectorizer", vectorizer), ("mnb", mnb)])
@@ -102,16 +102,16 @@ def main(input_data: Union[str, os.PathLike]):
 
     # Load data and extract only text from tagesanzeiger
     print("Load and preprocess text")
-    remove_duplicates = False
+    remove_duplicates = True
     min_num_words = 3
     tl = TextLoader(input_data)
     df_de = tl.load_text_csv(
         newspaper="tagesanzeiger",
+        lang='de',
         load_subset=False,
         remove_duplicates=remove_duplicates,
         min_num_words=min_num_words,
     )
-    df_de = df_de.sample(50)
 
     # Prepare data for modeling
     text = df_de.text