diff --git a/moderation_classifier/eval_BERT.py b/moderation_classifier/eval_BERT.py
index 2ff388f35cdd3b9d8043f9d3efe79a42aa28331e..db645557710da7b5beccb304f461c761c7ef0bbb 100644
--- a/moderation_classifier/eval_BERT.py
+++ b/moderation_classifier/eval_BERT.py
@@ -1,22 +1,19 @@
-from datasets import load_dataset
-from evaluate import evaluator
-from transformers import pipeline, AutoTokenizer, TFAutoModelForSequenceClassification
+from transformers import AutoTokenizer, TFAutoModelForSequenceClassification
 import tensorflow as tf
 import click
-import evaluate
 import numpy as np
 import os
 import pandas as pd
 from pathlib import Path
-import timeit
-from tqdm import tqdm
 from typing import Union
 
 from sklearn.metrics import precision_recall_fscore_support, accuracy_score
 
 from src.preprocessing_text import TextLoader, TextProcessor
-from src.prepare_bert_tf import df2dict
+from src.train_logs import load_logs
+from src.BERT_utils import predict_batches
+from src.eval_utils import gen_scores_dict
 
 
 @click.argument("train_logs")
@@ -27,21 +24,28 @@ def main(train_logs: Union[str, os.PathLike]):
     """
 
     # Load logs
-    df = pd.read_csv(train_logs, index_col="Unnamed: 0")
-    path_repo = df.loc["path_repo"].values[0]
-    path_model = df.loc["path_model"].values[0]
-    input_data = df.loc["input_data"].values[0].replace("train", "test")
-    text_preprocessing = df.loc["text_preprocessing"].values[0]
+    (
+        path_repo,
+        path_model,
+        input_data,
+        text_preprocessing,
+        newspaper,
+        lang,
+        topic,
+        remove_duplicates,
+        min_num_words,
+    ) = load_logs(train_logs)
 
     # Load data and extract only text from tagesanzeiger
     print("Load and preprocess text")
     tl = TextLoader(input_data)
     df_de = tl.load_text_csv(
-        newspaper="tagesanzeiger",
-        lang='de',
+        newspaper=newspaper,
+        lang=lang,
+        topic=topic,
         load_subset=False,
-        remove_duplicates=True,
-        min_num_words=3,
+        remove_duplicates=remove_duplicates,
+        min_num_words=min_num_words,
     )
 
     if text_preprocessing:
@@ -49,37 +53,24 @@ def main(train_logs: Union[str, os.PathLike]):
         text_proc = tp.fit_transform(df_de.text)
         df_de.text = text_proc
 
     comon_topics = tl.get_comments_per_topic(df_de)
-    
+
     # Load tokenizer and model
     tokenizer = AutoTokenizer.from_pretrained("bert-base-german-cased")
-    model = TFAutoModelForSequenceClassification.from_pretrained(pretrained_model_name_or_path=path_model)
-
-    # Split text into batches
-    text_list = list(df_de.text.values)
-    n = 100
-    results=[text_list[idx:idx+n] for idx in range(0, len(text_list), n)]
-    import pdb; pdb.set_trace()
-    # eval all
-    y_pred_all = []
-    for batch in tqdm(results):
-        inputs = tokenizer(batch, return_tensors="tf", padding=True, truncation=True)
-        logits = model(**inputs).logits
-        y_pred_batch = tf.argmax(logits,axis=1)
-        y_pred_all.append(y_pred_batch)
+    model = TFAutoModelForSequenceClassification.from_pretrained(
+        pretrained_model_name_or_path=path_model
+    )
 
-    y_pred_all = np.concatenate(y_pred_all)
+    # Predict in batches
+    y_pred_all = predict_batches(df_de.text.values, model, tokenizer)
 
+    # eval all
     precision, recall, f1, _ = precision_recall_fscore_support(
-        df_de.label, y_pred_all, average='weighted'
+        df_de.label, y_pred_all, average="weighted"
     )
     accuracy = accuracy_score(df_de.label, y_pred_all)
-    results_all = dict()
-    results_all["precision"] = precision
-    results_all["recall"] = recall
-    results_all["f1"] = f1
-    results_all["accuracy"] = accuracy
-
+    results_all = gen_scores_dict(precision, recall, f1, accuracy)
+
     # eval per topic
     topics = [t[0] for t in comon_topics]
     results_t = dict()
@@ -89,20 +80,16 @@ def main(train_logs: Union[str, os.PathLike]):
         y_pred_t = y_pred_all[df_de.topic == t]
 
         precision, recall, f1, _ = precision_recall_fscore_support(
-            y_test_t, y_pred_t, average='weighted'
+            y_test_t, y_pred_t, average="weighted"
         )
         accuracy = accuracy_score(y_test_t, y_pred_t)
-        results_t[t] = dict()
-        results_t[t]["accuracy"] = accuracy
-        results_t[t]["f1"] = f1
-        results_t[t]["precision"] = precision
-        results_t[t]["recall"] = recall
+
+        results_t[t] = gen_scores_dict(precision, recall, f1, accuracy)
 
     # Compute rejection rate
     reject_rate_all = np.round(df_de.label.mean(), 4) * 100
     reject_rate_topic = [
-        np.round(df_de[df_de.topic == k].label.mean(), 4) * 100
-        for k in topics
+        np.round(df_de[df_de.topic == k].label.mean(), 4) * 100 for k in topics
     ]
 
     # Compute number comments
@@ -110,9 +97,7 @@ def main(train_logs: Union[str, os.PathLike]):
     num_comm_topic = [df_de[df_de.topic == k].shape[0] for k in topics]
 
     # Save results
-    df_res_all = pd.DataFrame().from_dict(
-        results_all, orient="index", columns=["all"]
-    )
+    df_res_all = pd.DataFrame().from_dict(results_all, orient="index", columns=["all"])
     df_res_all.loc["rejection rate"] = reject_rate_all
     df_res_all.loc["number comments"] = num_comm_all
 
@@ -123,7 +108,9 @@ def main(train_logs: Union[str, os.PathLike]):
     df_res = df_res_all.join(df_res_topic)
     df_res.loc["data"] = [input_data] * df_res.shape[1]
 
-    df_res.to_csv(path_repo + "/results/results_eval_BERT/" + Path(path_model).stem + ".csv")
+    df_res.to_csv(
+        path_repo + "/results/results_eval_BERT/" + Path(path_model).stem + ".csv"
+    )
 
 
 if __name__ == "__main__":
diff --git a/moderation_classifier/eval_MNB.py b/moderation_classifier/eval_MNB.py
index 622e12d2eedceb6974efe68748d4d39d6b1fd4c1..3b1697fa4a6aaf749c5574de07bb6d369350a9bc 100644
--- a/moderation_classifier/eval_MNB.py
+++ b/moderation_classifier/eval_MNB.py
@@ -1,24 +1,17 @@
 import click
 from collections import Counter
-from joblib import load
 import numpy as np
 import pandas as pd
 from pathlib import Path
-from sklearn.metrics import f1_score, precision_recall_fscore_support
+from sklearn.metrics import precision_recall_fscore_support
 from typing import Union
 import os
 
+from src.MNB_utils import load_model
 from src.preprocessing_text import TextLoader
-
-
-def load_model(path):
-    """
-    Loads trained model
-    """
-    pipe = load(path)
-
-    return pipe
+from src.train_logs import load_logs
+from src.eval_utils import gen_scores_dict
 
 
 @click.argument("train_logs")
@@ -29,9 +22,17 @@ def main(train_logs: Union[str, os.PathLike]):
     """
 
     # Load logs
-    df = pd.read_csv(train_logs, index_col="Unnamed: 0")
-    path_model = df.loc["path_model"].values[0]
-    input_data = df.loc["input_data"].values[0].replace("train", "test")
+    (
+        path_repo,
+        path_model,
+        input_data,
+        _,
+        newspaper,
+        lang,
+        topic,
+        remove_duplicates,
+        min_num_words,
+    ) = load_logs(train_logs)
 
     # Load model
     pipe = load_model(path_model)
@@ -39,32 +40,27 @@ def main(train_logs: Union[str, os.PathLike]):
     # Load test data
     tl = TextLoader(input_data)
     df_test = tl.load_text_csv(
-        newspaper="tagesanzeiger",
-        lang='de',
+        newspaper=newspaper,
+        lang=lang,
+        topic=topic,
         load_subset=False,
-        remove_duplicates=False,
-        min_num_words=3,
+        remove_duplicates=remove_duplicates,
+        min_num_words=min_num_words,
     )
-    
+
     X_test = df_test.text
     y_test = df_test.label
 
     # Make prediction
     y_pred = pipe.predict(X_test)
-    y_pred_t = pipe.predict(X_test)
 
+    # Compute scores and add to dict
     precision, recall, f1, _ = precision_recall_fscore_support(
-        y_test,
-        y_pred, average='weighted'
+        y_test, y_pred, average="weighted"
    )
    accuracy = pipe.score(X_test, y_test)
-    results_all = dict()
-    results_all["precision"] = precision
-    results_all["recall"] = recall
-    results_all["f1"] = f1
-    results_all["accuracy"] = accuracy
-
-    #import pdb; pdb.set_trace()
+    results_all = gen_scores_dict(precision, recall, f1, accuracy)
 
     # Get results per topic
     count_topics = Counter(df_test["topic"]).most_common(10)
@@ -77,15 +73,11 @@ def main(train_logs: Union[str, os.PathLike]):
         y_pred_t = pipe.predict(X_test_t)
 
         precision, recall, f1, _ = precision_recall_fscore_support(
-            y_test_t, y_pred_t, average='weighted'
+            y_test_t, y_pred_t, average="weighted"
         )
-        #f1 = f1_score(y_test_t, y_pred_t)
         accuracy = pipe.score(X_test_t, y_test_t)
-        results_t[t] = dict()
-        results_t[t]["accuracy"] = accuracy
-        results_t[t]["f1"] = f1
-        results_t[t]["precision"] = precision
-        results_t[t]["recall"] = recall
+
+        results_t[t] = gen_scores_dict(precision, recall, f1, accuracy)
 
     # Compute rejection rate
     reject_rate_all = np.round(df_test.label.mean(), 4) * 100
@@ -109,7 +101,9 @@ def main(train_logs: Union[str, os.PathLike]):
     df_res = df_res_all.join(df_res_topic)
     df_res.loc["data"] = [input_data] * df_res.shape[1]
 
-    df_res.to_csv("results/results_eval_MNB/" + Path(path_model).stem + ".csv")
+    df_res.to_csv(
+        path_repo + "/results/results_eval_MNB/" + Path(path_model).stem + ".csv"
+    )
 
 
 if __name__ == "__main__":
diff --git a/moderation_classifier/train_BERT.py b/moderation_classifier/train_BERT.py
index 1fd2ee1596b6d9cb70d0d2815e4a47a65339c5f2..49a25ee27079058af05707317ce8dd091f976d1f 100644
--- a/moderation_classifier/train_BERT.py
+++ b/moderation_classifier/train_BERT.py
@@ -16,34 +16,7 @@ from typing import Union
 
 from src.preprocessing_text import TextLoader, TextProcessor
 from src.prepare_bert_tf import df2dict, compute_metrics, prepare_training
-
-
-def save_logs(
-    path_repo: Union[str, os.PathLike],
-    path: Union[str, os.PathLike],
-    input_data: Union[str, os.PathLike],
-    text_preprocessing: bool,
-):
-    """
-    Saves training logs whch can be used during evaluation
-    :param path_repo: Path to repository
-    :param path: Path to trained model
-    :param input_data: Path to used train data
-    :param text_preprocessing: Boolean flag whether preprocessing was used or not
-    """
-    logs = dict()
-    logs["path_repo"] = path_repo
-    logs["path_model"] = path
-    logs["input_data"] = input_data
-    logs["text_preprocessing"] = text_preprocessing
-
-    path_logs = (path_repo).joinpath("saved_models/BERT_logs/")
-    if not os.path.exists(path_logs):
-        os.makedirs(path_logs)
-
-    df_logs = pd.DataFrame.from_dict(logs, orient="index", columns=["logs"])
-
-    df_logs.to_csv(path_logs.joinpath(path.stem).with_suffix(".csv"))
+from src.train_logs import save_logs
 
 
 @click.argument("input_data")
@@ -66,12 +39,19 @@ def main(input_data: Union[str, os.PathLike], text_preprocessing: bool):
 
     # Load data and extract only text from tagesanzeiger
     print("Load and preprocess text")
+    newspaper = "tagesanzeiger"
+    lang = "de"
+    topic = "Wissen"
+    remove_duplicates = True
+    min_num_words = 3
     tl = TextLoader(input_data)
     df_de = tl.load_text_csv(
-        newspaper="tagesanzeiger",
+        newspaper=newspaper,
+        lang=lang,
+        topic=topic,
         load_subset=False,
-        remove_duplicates=True,
-        min_num_words=3,
+        remove_duplicates=remove_duplicates,
+        min_num_words=min_num_words,
     )
 
     if text_preprocessing:
@@ -133,13 +113,17 @@ def main(input_data: Union[str, os.PathLike], text_preprocessing: bool):
     callbacks = [metric_callback, checkpoint_callback, tensorboard_callback]
 
     # Fit model
-    print('Train model')
+    print("Train model")
     model.fit(
-        x=tf_train_set, validation_data=tf_validation_set, epochs=5, verbose=2, callbacks=callbacks
+        x=tf_train_set,
+        validation_data=tf_validation_set,
+        epochs=5,
+        verbose=2,
+        callbacks=callbacks,
     )
 
     # Save model
-    print('Save model')
+    print("Save model")
     path_model = (p_repo).joinpath("saved_models/" + time_stemp)
     model.save_pretrained(path_model)
     tokenizer.save_pretrained(path_model)
@@ -147,12 +131,18 @@ def main(input_data: Union[str, os.PathLike], text_preprocessing: bool):
     # Save model logs
     save_logs(
         path_repo=p_repo,
-        path=path_model,
+        path_model=path_model,
         input_data=input_data,
         text_preprocessing=text_preprocessing,
+        newspaper=newspaper,
+        lang=lang,
+        topic=topic,
+        remove_duplicates=remove_duplicates,
+        min_num_words=min_num_words,
+        model_name="BERT",
     )
 
-    print('Done')
+    print("Done")
 
 
 if __name__ == "__main__":
diff --git a/moderation_classifier/train_MNB.py b/moderation_classifier/train_MNB.py
index 5b1a5e31545d5d02147ace854474193a9dee6fb8..7f6c350e1349acff53bc12ff44857e280f08a0b5 100644
--- a/moderation_classifier/train_MNB.py
+++ b/moderation_classifier/train_MNB.py
@@ -1,92 +1,13 @@
 from sklearn.model_selection import train_test_split
-from sklearn.feature_extraction.text import TfidfVectorizer
-from sklearn.naive_bayes import MultinomialNB
-from sklearn.pipeline import Pipeline
 
 import click
-import datetime
-from joblib import dump
-from nltk.corpus import stopwords
 from pathlib import Path
-import pandas as pd
-import spacy
-
 from typing import Union
 import os
 
-from src.preprocessing_text import TextLoader, TextProcessor
-
-
-def create_pipeline():
-    """
-    Creates classification pipeline
-    """
-
-    # define preprocessor
-    tp = TextProcessor()
-
-    # define vectorizer
-    stop_words_ge = stopwords.words("german")
-    vectorizer = TfidfVectorizer(
-        stop_words=stop_words_ge, ngram_range=(1, 4), max_features=3000
-    )
-
-    # define model
-    mnb = MultinomialNB(alpha=0.1)
-
-    # set pipeline
-    pipe = Pipeline([("processor", tp), ("vectorizer", vectorizer), ("mnb", mnb)])
-
-    return pipe
-
-
-def create_path() -> Union[str, os.PathLike]:
-    """
-    Creates path to store trained model
-    """
-    if not os.path.exists("saved_models/MNB/"):
-        os.makedirs("saved_models/MNB/")
-
-    timestemp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
-
-    return Path("saved_models/MNB/" + timestemp + ".joblib")
-
-
-def save_model(pipe: Pipeline, path):
-    """
-    Saves trained model
-    :param pipe: Trained pipeline
-    """
-    dump(pipe, path)
-
-
-def save_logs(
-    path_repo: Union[str, os.PathLike],
-    path: Union[str, os.PathLike],
-    input_data: Union[str, os.PathLike],
-    text_preprocessing: bool,
-    val_score: float,
-):
-    """
-    Saves training logs whch can be used during evaluation
-    :param path_repo: Path to repository
-    :param path: Path to trained model
-    :param input_data: Path to used train data
-    :param text_preprocessing: Boolean flag whether preprocessing was used or not
-    """
-    logs = dict()
-    logs["path_model"] = path
-    logs["input_data"] = input_data
-    logs["text_preprocessing"] = text_preprocessing
-    logs["val_score"] = val_score
-
-    path_logs = (path_repo).joinpath("saved_models/MNB_logs/")
-    if not os.path.exists(path_logs):
-        os.makedirs(path_logs)
-
-    df_logs = pd.DataFrame.from_dict(logs, orient="index", columns=["logs"])
-
-    df_logs.to_csv(path_logs.joinpath(path.stem).with_suffix(".csv"))
+from src.MNB_utils import create_pipeline, create_path, save_model
+from src.preprocessing_text import TextLoader
+from src.train_logs import save_logs
 
 
 @click.argument("input_data")
@@ -102,12 +23,16 @@ def main(input_data: Union[str, os.PathLike]):
 
     # Load data and extract only text from tagesanzeiger
     print("Load and preprocess text")
+    newspaper = "tagesanzeiger"
+    lang = "de"
+    topic = "Wissen"
     remove_duplicates = True
     min_num_words = 3
     tl = TextLoader(input_data)
     df_de = tl.load_text_csv(
-        newspaper="tagesanzeiger",
-        lang='de',
+        newspaper=newspaper,
+        lang=lang,
+        topic=topic,
         load_subset=False,
         remove_duplicates=remove_duplicates,
         min_num_words=min_num_words,
     )
@@ -130,9 +55,15 @@ def main(input_data: Union[str, os.PathLike]):
     save_model(pipe, path)
     save_logs(
         path_repo=p_repo,
-        path=path,
+        path_model=path,
         input_data=input_data,
         text_preprocessing=True,
+        newspaper=newspaper,
+        lang=lang,
+        topic=topic,
+        remove_duplicates=remove_duplicates,
+        min_num_words=min_num_words,
+        model_name="MNB",
         val_score=val_score,
     )
diff --git a/src/BERT_utils.py b/src/BERT_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..728c40b6b92558705b75adaf75a0c8b31f31690a
--- /dev/null
+++ b/src/BERT_utils.py
@@ -0,0 +1,39 @@
+import numpy as np
+import tensorflow as tf
+from tqdm import tqdm
+
+from typing import List
+
+
+def split_batches(text: np.ndarray, batch_size: int = 100) -> List[List[str]]:
+    """
+    Splits an array of comments into batches
+    :param text: Array containing comments
+    :param batch_size: Number of comments per batch
+    """
+    text_list = list(text)
+    text_batches = [
+        text_list[idx : idx + batch_size]
+        for idx in range(0, len(text_list), batch_size)
+    ]
+    return text_batches
+
+
+def predict_batches(text: np.ndarray, model, tokenizer) -> np.ndarray:
+    """
+    Makes a prediction for every batch and concatenates the results
+    :param text: Array containing comments
+    :param model: Trained TF sequence-classification model
+    :param tokenizer: Tokenizer matching the model
+    """
+    text_batches = split_batches(text)
+    y_pred_all = []
+    for batch in tqdm(text_batches):
+        # Tokenize the batch and take the argmax over the class logits
+        inputs = tokenizer(batch, return_tensors="tf", padding=True, truncation=True)
+        logits = model(**inputs).logits
+        y_pred_batch = tf.argmax(logits, axis=1)
+        y_pred_all.append(y_pred_batch)
+    y_pred_all = np.concatenate(y_pred_all)
+
+    return y_pred_all
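Usage sketch for the new prediction helpers: this mirrors how eval_BERT.py calls predict_batches after the refactor; the checkpoint path below is hypothetical.

    from transformers import AutoTokenizer, TFAutoModelForSequenceClassification
    from src.BERT_utils import predict_batches

    tokenizer = AutoTokenizer.from_pretrained("bert-base-german-cased")
    model = TFAutoModelForSequenceClassification.from_pretrained(
        "saved_models/20230101-120000"  # hypothetical fine-tuned checkpoint
    )
    # One predicted class id per comment, concatenated over all batches
    y_pred = predict_batches(df_de.text.values, model, tokenizer)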
timestemp + ".joblib") + + +def save_model(pipe: Pipeline, path): + """ + Saves trained model + :param pipe: Trained pipeline + """ + dump(pipe, path) + + +def load_model(path: Union[str, os.PathLike]) -> Pipeline: + """ + Loads trained model + :param path: Path to pipeline + """ + pipe = load(path) + + return pipe diff --git a/src/eval_utils.py b/src/eval_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..6ac84a1eecb3ab9dbee79581239dce9c8b112569 --- /dev/null +++ b/src/eval_utils.py @@ -0,0 +1,15 @@ +def gen_scores_dict(precision: float, recall: float, f1: float, accuracy: float): + """ + Generates dictionary containing most important scores + :param precision: Precision score + :param recall: Recall score + :param f1: F1 score + :param accuracy: Accuracy score + """ + results = dict() + results["precision"] = precision + results["recall"] = recall + results["f1"] = f1 + results["accuracy"] = accuracy + + return results diff --git a/src/prepare_bert_tf.py b/src/prepare_bert_tf.py index 0b96b5ab950b37af2f9a7e36f6cd0e13ef151fe2..e5e315f4de03b22afb16694fb4f3d797f4a5e0de 100644 --- a/src/prepare_bert_tf.py +++ b/src/prepare_bert_tf.py @@ -13,7 +13,7 @@ def df2dict(df: pd.DataFrame, test_size: float = 0.2, split_data: bool = True): :param test_size: size of test set :param split_data: whether data should be split or not """ - #df = df.sample(200) + #df.sample(10000, replace=True) if split_data: train, test = train_test_split(df, test_size=test_size) diff --git a/src/preprocessing_text.py b/src/preprocessing_text.py index 0829b04a28bd3697e6dafda6d9438c2eb0de52d9..aaa2491faad70d0a71c5f0241e6deaee04bfc50e 100644 --- a/src/preprocessing_text.py +++ b/src/preprocessing_text.py @@ -35,6 +35,7 @@ class TextLoader(object): self, newspaper: str = None, lang: str = None, + topic: str = None, load_subset: bool = False, remove_duplicates: bool = False, min_num_words: int = None, @@ -63,7 +64,7 @@ class TextLoader(object): df = df.rename(columns={"rejected": "label"}) df_filter = self.filter_df( - df, min_num_words, remove_duplicates, newspaper, lang + df, min_num_words, remove_duplicates, newspaper, lang, topic, ) return df_filter @@ -75,6 +76,7 @@ class TextLoader(object): remove_duplicates: bool, newspaper: str, lang: str, + topic: str, ) -> pd.DataFrame: """ Filters data depending on given arguments. 
diff --git a/src/prepare_bert_tf.py b/src/prepare_bert_tf.py
index 0b96b5ab950b37af2f9a7e36f6cd0e13ef151fe2..e5e315f4de03b22afb16694fb4f3d797f4a5e0de 100644
--- a/src/prepare_bert_tf.py
+++ b/src/prepare_bert_tf.py
@@ -13,7 +13,6 @@ def df2dict(df: pd.DataFrame, test_size: float = 0.2, split_data: bool = True):
     :param test_size: size of test set
     :param split_data: whether data should be split or not
     """
-    #df = df.sample(200)
 
     if split_data:
         train, test = train_test_split(df, test_size=test_size)
diff --git a/src/preprocessing_text.py b/src/preprocessing_text.py
index 0829b04a28bd3697e6dafda6d9438c2eb0de52d9..aaa2491faad70d0a71c5f0241e6deaee04bfc50e 100644
--- a/src/preprocessing_text.py
+++ b/src/preprocessing_text.py
@@ -35,6 +35,7 @@ class TextLoader(object):
         self,
         newspaper: str = None,
         lang: str = None,
+        topic: str = None,
         load_subset: bool = False,
         remove_duplicates: bool = False,
         min_num_words: int = None,
@@ -63,7 +64,7 @@ class TextLoader(object):
         df = df.rename(columns={"rejected": "label"})
 
         df_filter = self.filter_df(
-            df, min_num_words, remove_duplicates, newspaper, lang
+            df, min_num_words, remove_duplicates, newspaper, lang, topic,
         )
 
         return df_filter
@@ -75,6 +76,7 @@ class TextLoader(object):
         remove_duplicates: bool,
         newspaper: str,
         lang: str,
+        topic: str,
     ) -> pd.DataFrame:
         """
         Filters data depending on given arguments.
@@ -94,6 +96,9 @@ class TextLoader(object):
         if lang:
             df = self.filter_language(df, lang=lang)
 
+        if topic:
+            df = self.filter_topic(df, topic=topic)
+
         if remove_duplicates:
             df = self.remove_duplicate_comments(df)
 
@@ -116,6 +121,14 @@ class TextLoader(object):
         :param lang: Language
         """
         return df.loc[(df.language == lang)]
+
+    def filter_topic(self, df: pd.DataFrame, topic: str):
+        """
+        Keeps only comments with the given topic
+        :param df: Input dataframe
+        :param topic: Topic
+        """
+        return df.loc[(df.topic == topic)]
 
     def filter_min_words(self, df: pd.DataFrame, min_words: int = 3):
         """Filters out comments with less than min words
@@ -146,12 +159,9 @@ class TextLoader(object):
             np.where(np.array(list(c_comm.values())) > 1)
         ]
 
         indices_repetitions = np.concatenate(
             [
-            np.where(df.text == d)[0][
-                np.argsort(df.createdAt.iloc[np.where(df.text == d)[0]].values)[:-1]
-            ]
-            for d in tqdm(duplicate_comments)
+                np.where(df.text == d)[0] for d in tqdm(duplicate_comments)
             ]
         )
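Caller-side sketch of the new topic filter, mirroring how the updated train/eval scripts invoke TextLoader (the CSV path is a placeholder):

    from src.preprocessing_text import TextLoader

    tl = TextLoader("data/comments_train.csv")  # placeholder path
    df_de = tl.load_text_csv(
        newspaper="tagesanzeiger",
        lang="de",
        topic="Wissen",  # new argument: keep only comments on this topic
        load_subset=False,
        remove_duplicates=True,
        min_num_words=3,
    )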
df.loc["text_preprocessing"].values[0] + newspaper = df.loc["newspaper"].values[0] + lang = df.loc["lang"].values[0] + topic = df.loc["topic"].values[0] + remove_duplicates = df.loc["remove_duplicates"].values[0] + min_num_words = df.loc["min_num_words"].values[0] + + return ( + path_repo, + path_model, + input_data, + text_preprocessing, + newspaper, + lang, + topic, + remove_duplicates, + min_num_words, + )