Commit 567cbb18 authored by Franziska Oschmann

Add optional text preprocessing

parent 64a06473
Merge request !2: Dev train models
@@ -13,15 +13,23 @@ import os
@click.command()
@click.option('-p', '--prepare_data', is_flag=True)
+@click.option('-tp', '--text_preprocessing', is_flag=True)
@click.option('-tm', '--train_mnb', is_flag=True)
@click.option('-tb', '--train_bert', is_flag=True)
@click.option('-tbto', '--train_bert_torch', is_flag=True)
@click.argument('input_data')
-def main(prepare_data: bool, train_mnb: bool, train_bert: bool, train_bert_torch: bool, input_data: Union[str, os.PathLike]):
+def main(prepare_data: bool,
+         text_preprocessing: bool,
+         train_mnb: bool,
+         train_bert: bool,
+         train_bert_torch: bool,
+         input_data: Union[str, os.PathLike]):
"""
Run moderation classifier.
:param prepare_data: Binary flag to specify if data should be prepared.
:param text_preprocessing: Binary flag to set text preprocessing.
:param train_mnb: Binary flag to specify whether MNB should be trained.
:param train_mnb: Binary flag to specify whether BERT should be trained.
:param input_data: Path to input dataframe.
"""
@@ -35,7 +43,7 @@ def main(prepare_data: bool, train_mnb: bool, train_bert: bool, train_bert_torch
        train_MNB.main(input_data)

    if train_bert:
-        train_BERT.main(input_data)
+        train_BERT.main(input_data, text_preprocessing)

    if train_bert_torch:
        train_BERT_torch.main(input_data)
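
# Usage sketch (hedged: the entry-point filename is not shown in this diff;
# `main.py` is assumed). The new `-tp` flag toggles text preprocessing for the
# BERT trainer:
#
#   python main.py -tb -tp data/comments.csv   # BERT with preprocessing
#   python main.py -tb data/comments.csv       # BERT on raw text, as before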
......
from datasets import Dataset, DatasetDict
import evaluate
from transformers import AutoTokenizer
from transformers import DataCollatorWithPadding
from transformers import create_optimizer
from transformers import TFAutoModelForSequenceClassification
import numpy as np
from transformers.keras_callbacks import KerasMetricCallback
from transformers.keras_callbacks import PushToHubCallback
import tensorflow as tf
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.callbacks import TensorBoard
import pandas as pd
import click
import datetime
from typing import Union
import os
from pathlib import Path
import spacy
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, precision_recall_fscore_support
from src.preprocessing_text import TextLoader
-def load_text(path: Union[str, os.PathLike], newspaper: str = 'tagesanzeiger', lang: str = 'de') -> pd.DataFrame:
-    """
-    Loads dataframe and extracts text depending on newspaper and language
-    """
-    df = pd.read_csv(path)
-    df = df.loc[(df.originTenantId == newspaper) & (df.language == lang)]
-    df = df[['text', 'rejected']]
-    df = df.rename(columns={"rejected": "label"})
-    return df
+from src.preprocessing_text import TextLoader, TextProcessor
+from src.prepare_bert_tf import df2dict, compute_metrics, prepare_training
-def df2dict(df: pd.DataFrame):
-    """
-    Converts Dataframe into Huggingface Dataset
-    """
-    df = df.sample(100)
-    train, test = train_test_split(df, test_size=0.2)
-    ds_train = Dataset.from_pandas(train)
-    ds_test = Dataset.from_pandas(test)
-    ds = DatasetDict()
-    ds['train'] = ds_train
-    ds['test'] = ds_test
-    return ds
-
-def compute_metrics(eval_pred):
-    accuracy = evaluate.load("accuracy")
-    predictions, labels = eval_pred
-    predictions = np.argmax(predictions, axis=1)
-    return accuracy.compute(predictions=predictions, references=labels)
-
-def prepare_training(dataset, batch_size: int = 16, num_epochs: int = 5):
-    """
-    Prepares training and sets params
-    """
-    batches_per_epoch = len(dataset["train"]) // batch_size
-    total_train_steps = int(batches_per_epoch * num_epochs)
-    optimizer, schedule = create_optimizer(init_lr=2e-5, num_warmup_steps=0, num_train_steps=total_train_steps)
-    return optimizer, schedule
-
-def preprocess_function(examples):
-    """
-    Prepares tokenizer for mapping
-    """
-    return tokenizer(examples["text"], truncation=True)
-
+def preprocess(text: str, nlp) -> str:
+    """
+    Applies preprocessing to text
+
+    :param text: Input text
+    :param nlp: Loaded nlp model
+    """
+    tp = TextProcessor(nlp)
+    text_proc = tp.remove_spaces(text)
+    text_proc = tp.remove_punctuation(text_proc)
+    text_proc = tp.remove_mentions(text_proc)
+    text_proc = tp.fold_case(text_proc)
+    return text_proc
+
 @click.argument('input_data')
-def main(input_data: Union[str, os.PathLike]):
+@click.argument('text_preprocessing', required=False)
+def main(input_data: Union[str, os.PathLike], text_preprocessing: bool):
+    """
+    Prepares data and trains BERT model with TF
+
+    :param input_data: path to input data
+    """
+    # Extract path
+    p = Path(input_data)
+    p_repo = p.parent.parent
+
-    # load data and extract only german text from tagesanzeiger
-    print('Load text')
+    # Load data and extract only german text from tagesanzeiger
+    print('Load and preprocess text')
    tl = TextLoader(input_data)
    df_de = tl.load_text_csv(newspaper='tagesanzeiger', load_subset=True)
+    if text_preprocessing:
+        nlp = spacy.load('de_core_news_sm')
+        text_proc = df_de.text.apply(preprocess, nlp=nlp)
+        df_de.text = text_proc
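
# Aside: `TextProcessor` is defined in src/preprocessing_text, which this diff
# does not show. A minimal regex-based stand-in for the four steps applied
# above could look like the sketch below (illustrative only; the real
# implementation may differ):

import re

class TextProcessorSketch:
    def __init__(self, nlp=None):
        self.nlp = nlp  # spaCy pipeline; unused in this sketch

    def remove_spaces(self, text: str) -> str:
        return ' '.join(text.split())  # collapse runs of whitespace

    def remove_punctuation(self, text: str) -> str:
        return re.sub(r'[^\w\s@]', '', text)  # keep '@' so mentions survive for the next step

    def remove_mentions(self, text: str) -> str:
        return re.sub(r'@\w+', '', text)  # strip @username mentions

    def fold_case(self, text: str) -> str:
        return text.lower()  # lowercase everything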
-    ## Dataframe to dict/Train-test split
-    ds = df2dict(df_de)
-    ## Preprocessing/Tokenization
+    ## Prepare data for modeling
+    ds = df2dict(df_de)
    tokenizer = AutoTokenizer.from_pretrained("bert-base-german-cased")

    def preprocess_function(examples):
        return tokenizer(examples["text"], truncation=True)

    # truncate sequences to be no longer than the model's maximum input length
    tokenized_text = ds.map(preprocess_function)

    # dynamically pad sentences to the longest length in a batch
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="tf")

    ## Evaluation
    accuracy = evaluate.load("accuracy")

    # Training
    print('Train model')
    id2label = {0: "NEGATIVE", 1: "POSITIVE"}
    label2id = {"NEGATIVE": 0, "POSITIVE": 1}

-    optimizer, schedule = prepare_training(tokenized_text)
+    optimizer, _ = prepare_training(tokenized_text)
    model = TFAutoModelForSequenceClassification.from_pretrained(
        "bert-base-german-cased", num_labels=2, id2label=id2label, label2id=label2id
    )
@@ -126,7 +96,8 @@ def main(input_data: Union[str, os.PathLike]):
    model.compile(optimizer=optimizer)

-    checkpoint_filepath = '/Users/franziskaoschmann/Documents/public_policy/tmp/checkpoint/'
+    path_checkpoint = (p_repo).joinpath('tmp/checkpoint/')
+    checkpoint_filepath = path_checkpoint

    metric_callback = KerasMetricCallback(metric_fn=compute_metrics, eval_dataset=tf_validation_set)
    checkpoint_callback = ModelCheckpoint(checkpoint_filepath, monitor='val_loss',
                                          save_best_only=True,
@@ -140,8 +111,12 @@ def main(input_data: Union[str, os.PathLike]):
    callbacks = [metric_callback, checkpoint_callback, tensorboard_callback]
    model.fit(x=tf_train_set, validation_data=tf_validation_set, epochs=5, callbacks=callbacks)

-    model.save_pretrained('/Users/franziskaoschmann/Documents/public_policy/saved_models/')
-    tokenizer.save_pretrained('/Users/franziskaoschmann/Documents/public_policy/saved_models/')
+    # Save model
+    path_model = (p_repo).joinpath('saved_models/')
+    model.save_pretrained(path_model)
+    tokenizer.save_pretrained(path_model)
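
# Reload sketch (standard Hugging Face API; uses the `path_model` written above):
#   model = TFAutoModelForSequenceClassification.from_pretrained(path_model)
#   tokenizer = AutoTokenizer.from_pretrained(path_model)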
......
from datasets import Dataset, DatasetDict
import evaluate
from transformers import AutoTokenizer
from transformers import DataCollatorWithPadding
from transformers import create_optimizer
from transformers import TFAutoModelForSequenceClassification
import numpy as np
from transformers.keras_callbacks import KerasMetricCallback
from transformers.keras_callbacks import PushToHubCallback
import tensorflow as tf
from tensorflow.keras.callbacks import ModelCheckpoint
import pandas as pd
from typing import Union
import os
import click
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, precision_recall_fscore_support
from src.preprocessing_text import TextLoader
def load_text(path: Union[str, os.PathLike], newspaper: str = 'tagesanzeiger', lang: str = 'de') -> pd.DataFrame:
    """
    Loads dataframe and extracts text depending on newspaper and language
    """
    df = pd.read_csv(path)
    df = df.loc[(df.originTenantId == newspaper) & (df.language == lang)]
    df = df[['text', 'rejected']]
    df = df.rename(columns={"rejected": "label"})
    return df

def df2dict(df: pd.DataFrame):
    """
    Converts Dataframe into Huggingface Dataset
    """
    df = df.sample(100)
    train, test = train_test_split(df, test_size=0.2)
    ds_train = Dataset.from_pandas(train)
    ds_test = Dataset.from_pandas(test)
    ds = DatasetDict()
    ds['train'] = ds_train
    ds['test'] = ds_test
    return ds

def compute_metrics(eval_pred):
    accuracy = evaluate.load("accuracy")
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

def prepare_training(dataset, batch_size: int = 16, num_epochs: int = 5):
    """
    Prepares training and sets params
    """
    batches_per_epoch = len(dataset["train"]) // batch_size
    total_train_steps = int(batches_per_epoch * num_epochs)
    optimizer, schedule = create_optimizer(init_lr=2e-5, num_warmup_steps=0, num_train_steps=total_train_steps)
    return optimizer, schedule
@click.argument('input_data')
def main(input_data: Union[str, os.PathLike]):
    # load data and extract only german text from tagesanzeiger
    print('Load text')
    tl = TextLoader(input_data)
    df_de = tl.load_text_csv(newspaper='tagesanzeiger', load_subset=True)

    ## Dataframe to dict/Train-test split
    ds = df2dict(df_de)

    ## Preprocessing/Tokenization
    tokenizer = AutoTokenizer.from_pretrained("bert-base-german-cased")

    def preprocess_function(examples):
        return tokenizer(examples["text"], truncation=True)

    # truncate sequences to be no longer than the model's maximum input length
    tokenized_text = ds.map(preprocess_function)

    # dynamically pad sentences to the longest length in a batch
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="tf")

    ## Evaluation
    accuracy = evaluate.load("accuracy")

    # Training
    id2label = {0: "NEGATIVE", 1: "POSITIVE"}
    label2id = {"NEGATIVE": 0, "POSITIVE": 1}

    optimizer, schedule = prepare_training(tokenized_text)
    model = TFAutoModelForSequenceClassification.from_pretrained(
        "bert-base-german-cased", num_labels=2, id2label=id2label, label2id=label2id
    )

    tf_train_set = model.prepare_tf_dataset(
        tokenized_text["train"],
        shuffle=True,
        batch_size=16,
        collate_fn=data_collator,
    )

    tf_validation_set = model.prepare_tf_dataset(
        tokenized_text["test"],
        shuffle=False,
        batch_size=16,
        collate_fn=data_collator,
    )
    model.compile(optimizer=optimizer)

    checkpoint_filepath = '/cluster/work/sis/cdss/oschmanf/ppg-moderation-classifier/tmp/checkpoint/'
    metric_callback = KerasMetricCallback(metric_fn=compute_metrics, eval_dataset=tf_validation_set)
    checkpoint_callback = ModelCheckpoint(checkpoint_filepath,
                                          monitor='val_loss',
                                          save_best_only=True,
                                          save_weights_only=False,
                                          mode='min',
                                          save_freq='epoch',
                                          initial_value_threshold=None)

    callbacks = [metric_callback, checkpoint_callback]
    model.fit(x=tf_train_set, validation_data=tf_validation_set, epochs=5, callbacks=callbacks)

    model.save_pretrained('/cluster/work/sis/cdss/oschmanf/ppg-moderation-classifier/tmp/saved_models/')
    tokenizer.save_pretrained('/cluster/work/sis/cdss/oschmanf/ppg-moderation-classifier/tmp/saved_models/')
if __name__ == "__main__":
    main()
import pandas as pd
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split
import evaluate
import numpy as np
from transformers import create_optimizer
def df2dict(df: pd.DataFrame, test_size: float = 0.2):
    """
    Converts Dataframe into Huggingface Dataset
    """
    df = df.sample(100)  # note: subsamples 100 random rows before splitting
    train, test = train_test_split(df, test_size=test_size)
    ds_train = Dataset.from_pandas(train)
    ds_test = Dataset.from_pandas(test)
    ds = DatasetDict()
    ds['train'] = ds_train
    ds['test'] = ds_test
    return ds
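
# Usage sketch with toy data (hypothetical values; df2dict subsamples 100 rows,
# so the input needs at least that many):
#
#   toy = pd.DataFrame({'text': [f'Kommentar {i}' for i in range(200)],
#                       'label': [i % 2 for i in range(200)]})
#   ds = df2dict(toy)
#   print(ds['train'].num_rows, ds['test'].num_rows)  # -> 80 20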
def compute_metrics(eval_pred):
    """
    Computes metrics during training
    """
    accuracy = evaluate.load("accuracy")
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)
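
# Sanity-check sketch with dummy logits (illustrative values only):
#
#   logits = np.array([[0.1, 0.9], [2.0, -1.0]])
#   labels = np.array([1, 0])
#   compute_metrics((logits, labels))  # -> {'accuracy': 1.0}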
def prepare_training(dataset, batch_size: int = 16, num_epochs: int = 5):
    """
    Prepares training and sets params
    """
    batches_per_epoch = len(dataset["train"]) // batch_size
    total_train_steps = int(batches_per_epoch * num_epochs)
    optimizer, schedule = create_optimizer(init_lr=2e-5, num_warmup_steps=0, num_train_steps=total_train_steps)
    return optimizer, schedule
\ No newline at end of file
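
# Worked example of the step arithmetic (assumed sizes): with 8000 training
# rows, batch_size=16 and num_epochs=5, prepare_training derives
# 8000 // 16 = 500 batches per epoch and 500 * 5 = 2500 total steps, which
# create_optimizer uses for its linear learning-rate decay from 2e-5.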