From 567cbb18fc4dec46f804fd940770b2719212a7c8 Mon Sep 17 00:00:00 2001
From: Franziska Oschmann <franziskaoschmann@Franziskas-MBP.fritz.box>
Date: Thu, 29 Jun 2023 13:01:48 +0200
Subject: [PATCH] Add optional text preprocessing

---
 moderation_classifier/main.py           |  12 +-
 moderation_classifier/train_BERT.py     | 123 ++++++++------------
 moderation_classifier/train_BERT_tmp.py | 148 ------------------------
 src/prepare_bert_tf.py                  |  44 +++++++
 4 files changed, 103 insertions(+), 224 deletions(-)
 delete mode 100644 moderation_classifier/train_BERT_tmp.py
 create mode 100644 src/prepare_bert_tf.py

diff --git a/moderation_classifier/main.py b/moderation_classifier/main.py
index 281b89a..0bb8d16 100644
--- a/moderation_classifier/main.py
+++ b/moderation_classifier/main.py
@@ -13,15 +13,23 @@ import os
 @click.command()
 @click.option('-p', '--prepare_data', is_flag=True)
+@click.option('-tp', '--text_preprocessing', is_flag=True)
 @click.option('-tm', '--train_mnb', is_flag=True)
 @click.option('-tb', '--train_bert', is_flag=True)
 @click.option('-tbto', '--train_bert_torch', is_flag=True)
 @click.argument('input_data')
-def main(prepare_data: bool, train_mnb: bool, train_bert: bool, train_bert_torch: bool, input_data: Union[str, os.PathLike]):
+def main(prepare_data: bool,
+         text_preprocessing: bool,
+         train_mnb: bool,
+         train_bert: bool,
+         train_bert_torch: bool,
+         input_data: Union[str, os.PathLike]):
     """
     Run moderation classifier.
     :param prepare_data: Binary flag to specify if data should be prepared.
+    :param text_preprocessing: Binary flag to specify whether text preprocessing should be applied.
     :param train_mnb: Binary flag to specify whether MNB should be trained.
+    :param train_bert: Binary flag to specify whether BERT should be trained.
     :param input_data: Path to input dataframe.
     """
@@ -35,7 +43,7 @@ def main(prepare_data: bool, train_mnb: bool, train_bert: bool, train_bert_torch
         train_MNB.main(input_data)
 
     if train_bert:
-        train_BERT.main(input_data)
+        train_BERT.main(input_data, text_preprocessing)
 
     if train_bert_torch:
         train_BERT_torch.main(input_data)
diff --git a/moderation_classifier/train_BERT.py b/moderation_classifier/train_BERT.py
index dcd9c52..089782f 100644
--- a/moderation_classifier/train_BERT.py
+++ b/moderation_classifier/train_BERT.py
@@ -1,111 +1,81 @@
-from datasets import Dataset, DatasetDict
-
-import evaluate
 from transformers import AutoTokenizer
 from transformers import DataCollatorWithPadding
-
-from transformers import create_optimizer
 from transformers import TFAutoModelForSequenceClassification
-
-import numpy as np
-
 from transformers.keras_callbacks import KerasMetricCallback
-from transformers.keras_callbacks import PushToHubCallback
 
 import tensorflow as tf
 from tensorflow.keras.callbacks import ModelCheckpoint
 from tensorflow.keras.callbacks import TensorBoard
 
-import pandas as pd
+import click
 import datetime
-from typing import Union
 import os
+from pathlib import Path
+import spacy
+from typing import Union
 
-import click
-
-from sklearn.model_selection import train_test_split
-from sklearn.metrics import f1_score, precision_recall_fscore_support
-
-from src.preprocessing_text import TextLoader
-
-def load_text(path: Union[str, os.PathLike], newspaper: str = 'tagesanzeiger', lang: str = 'de') -> pd.DataFrame:
-    """
-    Loads daraframe and extracts text depending on newspaper and langugae
-    """
-    df = pd.read_csv(path)
-    df = df.loc[(df.originTenantId==newspaper) & (df.language==lang)]
-    df = df[['text','rejected']]
-    df = df.rename(columns={"rejected": "label"})
-
-    return df
+from src.preprocessing_text import TextLoader, TextProcessor
+from src.prepare_bert_tf import df2dict, compute_metrics, prepare_training
 
-def df2dict(df: pd.DataFrame):
+def preprocess(text: str, nlp) -> str:
     """
-    Converts Dataframe into Huggingface Dataset
+    Applies preprocessing to text
+    :param text: Input text
+    :param nlp: Loaded nlp model
     """
-
-    df = df.sample(100)
-    train, test = train_test_split(df, test_size=0.2)
-
-
-    ds_train = Dataset.from_pandas(train)
-    ds_test = Dataset.from_pandas(test)
-
-    ds = DatasetDict()
-    ds['train'] = ds_train
-    ds['test'] = ds_test
+    tp = TextProcessor(nlp)
 
-    return ds
+    text_proc = tp.remove_spaces(text)
+    text_proc = tp.remove_punctuation(text_proc)
+    text_proc = tp.remove_mentions(text_proc)
+    text_proc = tp.fold_case(text_proc)
 
-def compute_metrics(eval_pred):
-    accuracy = evaluate.load("accuracy")
-    predictions, labels = eval_pred
-    predictions = np.argmax(predictions, axis=1)
-    return accuracy.compute(predictions=predictions, references=labels)
+    return text_proc
 
-def prepare_training(dataset, batch_size: int = 16, num_epochs: int = 5):
+@click.argument('input_data')
+@click.argument('text_preprocessing', required=False)
+def main(input_data: Union[str, os.PathLike], text_preprocessing: bool):
     """
-    Prepares training and sets params
+    Prepares data and trains BERT model with TF
+    :param input_data: Path to input data
     """
-
-    batches_per_epoch = len(dataset["train"]) // batch_size
-    total_train_steps = int(batches_per_epoch * num_epochs)
-    optimizer, schedule = create_optimizer(init_lr=2e-5, num_warmup_steps=0, num_train_steps=total_train_steps)
-
-    return optimizer, schedule
-
+    def preprocess_function(examples):
+        """
+        Prepares tokenizer for mapping
+        """
+        return tokenizer(examples["text"], truncation=True)
+
+    # Extract path
+    p = Path(input_data)
+    p_repo = p.parent.parent
 
-@click.argument('input_data')
-def main(input_data: Union[str, os.PathLike]):
-    # load data and extract only german text from tagesanzeiger
-    print('Load text')
+    # Load data and extract only German text from tagesanzeiger
+    print('Load and preprocess text')
     tl = TextLoader(input_data)
     df_de = tl.load_text_csv(newspaper = 'tagesanzeiger', load_subset=True)
+
+    if text_preprocessing:
+        nlp = spacy.load('de_core_news_sm')
+        text_proc = df_de.text.apply(preprocess, nlp=nlp)
+        df_de.text = text_proc
 
-    ## Dataframe to dict/Train-test split
-    ds = df2dict(df_de)
-
-    ## Preprocessing/Tokenization
+    ## Prepare data for modeling
+    ds = df2dict(df_de)
     tokenizer = AutoTokenizer.from_pretrained("bert-base-german-cased")
-
-    def preprocess_function(examples):
-        return tokenizer(examples["text"], truncation=True)
-
-    #truncate sequences to be no longer than the models maximum input length
     tokenized_text = ds.map(preprocess_function)
-
-    #dynamically padding of sentences to the longest length in a batch
     data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="tf")
 
-    ## Evaluation
-    accuracy = evaluate.load("accuracy")
-
     # Training
+    print('Train model')
     id2label = {0: "NEGATIVE", 1: "POSITIVE"}
     label2id = {"NEGATIVE": 0, "POSITIVE": 1}
 
-    optimizer, schedule = prepare_training(tokenized_text)
+    optimizer, _ = prepare_training(tokenized_text)
     model = TFAutoModelForSequenceClassification.from_pretrained(
         "bert-base-german-cased", num_labels=2, id2label=id2label, label2id=label2id
     )
@@ -126,7 +96,8 @@ def main(input_data: Union[str, os.PathLike]):
 
     model.compile(optimizer=optimizer)
 
-    checkpoint_filepath = '/Users/franziskaoschmann/Documents/public_policy/tmp/checkpoint/'
+    path_checkpoint = p_repo.joinpath('tmp/checkpoint/')
+    checkpoint_filepath = path_checkpoint
     metric_callback = KerasMetricCallback(metric_fn=compute_metrics, eval_dataset=tf_validation_set)
     checkpoint_callback = ModelCheckpoint(checkpoint_filepath, monitor = 'val_loss', save_best_only = True,
@@ -140,8 +111,12 @@ def main(input_data: Union[str, os.PathLike]):
     callbacks = [metric_callback, checkpoint_callback, tensorboard_callback]
 
     model.fit(x=tf_train_set, validation_data=tf_validation_set, epochs=5, callbacks=callbacks)
-    model.save_pretrained('/Users/franziskaoschmann/Documents/public_policy/saved_models/')
-    tokenizer.save_pretrained('/Users/franziskaoschmann/Documents/public_policy/saved_models/')
+
+
+    # Save model
+    path_model = p_repo.joinpath('saved_models/')
+    model.save_pretrained(path_model)
+    tokenizer.save_pretrained(path_model)
diff --git a/moderation_classifier/train_BERT_tmp.py b/moderation_classifier/train_BERT_tmp.py
deleted file mode 100644
index 662bc78..0000000
--- a/moderation_classifier/train_BERT_tmp.py
+++ /dev/null
@@ -1,148 +0,0 @@
-from datasets import Dataset, DatasetDict
-
-import evaluate
-from transformers import AutoTokenizer
-from transformers import DataCollatorWithPadding
-
-from transformers import create_optimizer
-from transformers import TFAutoModelForSequenceClassification
-
-import numpy as np
-
-from transformers.keras_callbacks import KerasMetricCallback
-from transformers.keras_callbacks import PushToHubCallback
-
-import tensorflow as tf
-from tf.keras.callbacks import ModelCheckpoint
-
-import pandas as pd
-
-from typing import Union
-import os
-
-import click
-
-from sklearn.model_selection import train_test_split
-from sklearn.metrics import f1_score, precision_recall_fscore_support
-
-from src.preprocessing_text import TextLoader
-
-def load_text(path: Union[str, os.PathLike], newspaper: str = 'tagesanzeiger', lang: str = 'de') -> pd.DataFrame:
-    """
-    Loads daraframe and extracts text depending on newspaper and langugae
-    """
-    df = pd.read_csv(path)
-    df = df.loc[(df.originTenantId==newspaper) & (df.language==lang)]
-    df = df[['text','rejected']]
-    df = df.rename(columns={"rejected": "label"})
-
-    return df
-
-def df2dict(df: pd.DataFrame):
-    """
-    Converts Dataframe into Huggingface Dataset
-    """
-
-    df = df.sample(100)
-    train, test = train_test_split(df, test_size=0.2)
-
-
-    ds_train = Dataset.from_pandas(train)
-    ds_test = Dataset.from_pandas(test)
-
-    ds = DatasetDict()
-    ds['train'] = ds_train
-    ds['test'] = ds_test
-
-    return ds
-
-def compute_metrics(eval_pred):
-    accuracy = evaluate.load("accuracy")
-    predictions, labels = eval_pred
-    predictions = np.argmax(predictions, axis=1)
-    return accuracy.compute(predictions=predictions, references=labels)
-
-def prepare_training(dataset, batch_size: int = 16, num_epochs: int = 5):
-    """
-    Prepares training and sets params
-    """
-
-    batches_per_epoch = len(dataset["train"]) // batch_size
-    total_train_steps = int(batches_per_epoch * num_epochs)
-    optimizer, schedule = create_optimizer(init_lr=2e-5, num_warmup_steps=0, num_train_steps=total_train_steps)
-
-    return optimizer, schedule
-
-
-@click.argument('input_data')
-def main(input_data: Union[str, os.PathLike]):
-
-    # load data and extract only german text from tagesanzeiger
-    print('Load text')
-    tl = TextLoader(input_data)
-    df_de = tl.load_text_csv(newspaper = 'tagesanzeiger', load_subset=True)
-
-    ## Dataframe to dict/Train-test split
-    ds = df2dict(df_de)
-
-    ## Preprocessing/Tokenization
-    tokenizer = AutoTokenizer.from_pretrained("bert-base-german-cased")
AutoTokenizer.from_pretrained("bert-base-german-cased") - - def preprocess_function(examples): - return tokenizer(examples["text"], truncation=True) - - #truncate sequences to be no longer than the models maximum input length - tokenized_text = ds.map(preprocess_function) - - #dynamically padding of sentences to the longest length in a batch - data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="tf") - - ## Evaluation - accuracy = evaluate.load("accuracy") - - # Training - id2label = {0: "NEGATIVE", 1: "POSITIVE"} - label2id = {"NEGATIVE": 0, "POSITIVE": 1} - - optimizer, schedule = prepare_training(tokenized_text) - model = TFAutoModelForSequenceClassification.from_pretrained( - "bert-base-german-cased", num_labels=2, id2label=id2label, label2id=label2id - ) - - tf_train_set = model.prepare_tf_dataset( - tokenized_text["train"], - shuffle=True, - batch_size=16, - collate_fn=data_collator, - ) - - tf_validation_set = model.prepare_tf_dataset( - tokenized_text["test"], - shuffle=False, - batch_size=16, - collate_fn=data_collator, - ) - - model.compile(optimizer=optimizer) - - checkpoint_filepath = '/cluster/work/sis/cdss/oschmanf/ppg-moderation-classifier/tmp/checkpoint/' - metric_callback = KerasMetricCallback(metric_fn=compute_metrics, eval_dataset=tf_validation_set) - checkpoint_callback = ModelCheckpoint(checkpoint_filepath, - monitor: str = 'val_loss', - save_best_only: bool = True, - save_weights_only: bool = False, - mode: str = 'min', - save_freq='epoch', - initial_value_threshold=None) - - callbacks = [metric_callback, checkpoint_callback] - - model.fit(x=tf_train_set, validation_data=tf_validation_set, epochs=5, callbacks=callbacks) - model.save_pretrained('/cluster/work/sis/cdss/oschmanf/ppg-moderation-classifier/tmp/saved_models/') - tokenizer.save_pretrained('/cluster/work/sis/cdss/oschmanf/ppg-moderation-classifier/tmp/saved_models/') - - - - -if __name__ == "__main__": - main() diff --git a/src/prepare_bert_tf.py b/src/prepare_bert_tf.py new file mode 100644 index 0000000..7d902b8 --- /dev/null +++ b/src/prepare_bert_tf.py @@ -0,0 +1,44 @@ +import pandas as pd +from datasets import Dataset, DatasetDict +from sklearn.model_selection import train_test_split +import evaluate +import numpy as np +from transformers import create_optimizer + +def df2dict(df: pd.DataFrame, test_size: float = 0.2): + """ + Converts Dataframe into Huggingface Dataset + """ + df = df.sample(100) + train, test = train_test_split(df, test_size=test_size) + + ds_train = Dataset.from_pandas(train) + ds_test = Dataset.from_pandas(test) + + ds = DatasetDict() + ds['train'] = ds_train + ds['test'] = ds_test + + return ds + + +def compute_metrics(eval_pred): + """ + Computes metrics during training + """ + accuracy = evaluate.load("accuracy") + predictions, labels = eval_pred + predictions = np.argmax(predictions, axis=1) + + return accuracy.compute(predictions=predictions, references=labels) + + +def prepare_training(dataset, batch_size: int = 16, num_epochs: int = 5): + """ + Prepares training and sets params + """ + batches_per_epoch = len(dataset["train"]) // batch_size + total_train_steps = int(batches_per_epoch * num_epochs) + optimizer, schedule = create_optimizer(init_lr=2e-5, num_warmup_steps=0, num_train_steps=total_train_steps) + + return optimizer, schedule \ No newline at end of file -- GitLab