Commit 567cbb18 authored by Franziska Oschmann

Add optional text preprocessing

parent 64a06473
Merge request !2: Dev train models
@@ -13,15 +13,23 @@ import os
@click.command()
@click.option('-p', '--prepare_data', is_flag=True)
+@click.option('-tp', '--text_preprocessing', is_flag=True)
@click.option('-tm', '--train_mnb', is_flag=True)
@click.option('-tb', '--train_bert', is_flag=True)
@click.option('-tbto', '--train_bert_torch', is_flag=True)
@click.argument('input_data')
-def main(prepare_data: bool, train_mnb: bool, train_bert: bool, train_bert_torch: bool, input_data: Union[str, os.PathLike]):
+def main(prepare_data: bool,
+         text_preprocessing: bool,
+         train_mnb: bool,
+         train_bert: bool,
+         train_bert_torch: bool,
+         input_data: Union[str, os.PathLike]):
"""
Run moderation classifier.
:param prepare_data: Binary flag to specify if data should be prepared.
:param text_preprocessing: Binary flag to set text preprocessing.
:param train_mnb: Binary flag to specify whether MNB should be trained.
:param train_mnb: Binary flag to specify whether BERT should be trained.
:param input_data: Path to input dataframe.
"""
@@ -35,7 +43,7 @@ def main(prepare_data: bool, train_mnb: bool, train_bert: bool, train_bert_torch
        train_MNB.main(input_data)

    if train_bert:
-        train_BERT.main(input_data)
+        train_BERT.main(input_data, text_preprocessing)

    if train_bert_torch:
        train_BERT_torch.main(input_data)
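
# Usage sketch (hedged: the entry-point filename is not shown in this diff;
# `main.py` is assumed). The new `-tp` flag toggles text preprocessing for the
# BERT trainer:
#
#   python main.py -tb -tp data/comments.csv   # BERT with preprocessing
#   python main.py -tb data/comments.csv       # BERT on raw text, as before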
......
from datasets import Dataset, DatasetDict
import evaluate
from transformers import AutoTokenizer
from transformers import DataCollatorWithPadding
from transformers import create_optimizer
from transformers import TFAutoModelForSequenceClassification
import numpy as np
from transformers.keras_callbacks import KerasMetricCallback
from transformers.keras_callbacks import PushToHubCallback
import tensorflow as tf
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.callbacks import TensorBoard
import pandas as pd
import click
import datetime
from typing import Union
import os
from pathlib import Path
import spacy
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, precision_recall_fscore_support
from src.preprocessing_text import TextLoader
-def load_text(path: Union[str, os.PathLike], newspaper: str = 'tagesanzeiger', lang: str = 'de') -> pd.DataFrame:
-    """
-    Loads dataframe and extracts text depending on newspaper and language
-    """
-    df = pd.read_csv(path)
-    df = df.loc[(df.originTenantId == newspaper) & (df.language == lang)]
-    df = df[['text', 'rejected']]
-    df = df.rename(columns={"rejected": "label"})
-    return df
+from src.preprocessing_text import TextLoader, TextProcessor
+from src.prepare_bert_tf import df2dict, compute_metrics, prepare_training
-def df2dict(df: pd.DataFrame):
-    """
-    Converts Dataframe into Huggingface Dataset
-    """
-    df = df.sample(100)
-    train, test = train_test_split(df, test_size=0.2)
-    ds_train = Dataset.from_pandas(train)
-    ds_test = Dataset.from_pandas(test)
-    ds = DatasetDict()
-    ds['train'] = ds_train
-    ds['test'] = ds_test
-    return ds
-
-def compute_metrics(eval_pred):
-    accuracy = evaluate.load("accuracy")
-    predictions, labels = eval_pred
-    predictions = np.argmax(predictions, axis=1)
-    return accuracy.compute(predictions=predictions, references=labels)
-
-def prepare_training(dataset, batch_size: int = 16, num_epochs: int = 5):
-    """
-    Prepares training and sets params
-    """
-    batches_per_epoch = len(dataset["train"]) // batch_size
-    total_train_steps = int(batches_per_epoch * num_epochs)
-    optimizer, schedule = create_optimizer(init_lr=2e-5, num_warmup_steps=0, num_train_steps=total_train_steps)
-    return optimizer, schedule
-
-def preprocess_function(examples):
-    """
-    Prepares tokenizer for mapping
-    """
-    return tokenizer(examples["text"], truncation=True)
-
+def preprocess(text: str, nlp) -> str:
+    """
+    Applies preprocessing to text
+
+    :param text: Input text
+    :param nlp: Loaded nlp model
+    """
+    tp = TextProcessor(nlp)
+    text_proc = tp.remove_spaces(text)
+    text_proc = tp.remove_punctuation(text_proc)
+    text_proc = tp.remove_mentions(text_proc)
+    text_proc = tp.fold_case(text_proc)
+    return text_proc
+
 @click.argument('input_data')
-def main(input_data: Union[str, os.PathLike]):
+@click.argument('text_preprocessing', required=False)
+def main(input_data: Union[str, os.PathLike], text_preprocessing: bool):
+    """
+    Prepares data and trains BERT model with TF
+
+    :param input_data: path to input data
+    """
+    # Extract path
+    p = Path(input_data)
+    p_repo = p.parent.parent
+
-    # load data and extract only german text from tagesanzeiger
-    print('Load text')
+    # Load data and extract only german text from tagesanzeiger
+    print('Load and preprocess text')
    tl = TextLoader(input_data)
    df_de = tl.load_text_csv(newspaper='tagesanzeiger', load_subset=True)
+    if text_preprocessing:
+        nlp = spacy.load('de_core_news_sm')
+        text_proc = df_de.text.apply(preprocess, nlp=nlp)
+        df_de.text = text_proc
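
# Aside: `TextProcessor` is defined in src/preprocessing_text, which this diff
# does not show. A minimal regex-based stand-in for the four steps applied
# above could look like the sketch below (illustrative only; the real
# implementation may differ):

import re

class TextProcessorSketch:
    def __init__(self, nlp=None):
        self.nlp = nlp  # spaCy pipeline; unused in this sketch

    def remove_spaces(self, text: str) -> str:
        return ' '.join(text.split())  # collapse runs of whitespace

    def remove_punctuation(self, text: str) -> str:
        return re.sub(r'[^\w\s@]', '', text)  # keep '@' so mentions survive for the next step

    def remove_mentions(self, text: str) -> str:
        return re.sub(r'@\w+', '', text)  # strip @username mentions

    def fold_case(self, text: str) -> str:
        return text.lower()  # lowercase everything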
-    ## Dataframe to dict/Train-test split
-    ds = df2dict(df_de)
-    ## Preprocessing/Tokenization
+    ## Prepare data for modeling
+    ds = df2dict(df_de)
    tokenizer = AutoTokenizer.from_pretrained("bert-base-german-cased")

    def preprocess_function(examples):
        return tokenizer(examples["text"], truncation=True)

    # truncate sequences to be no longer than the model's maximum input length
    tokenized_text = ds.map(preprocess_function)

    # dynamically pad sentences to the longest length in a batch
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="tf")

    ## Evaluation
    accuracy = evaluate.load("accuracy")

    # Training
    print('Train model')
    id2label = {0: "NEGATIVE", 1: "POSITIVE"}
    label2id = {"NEGATIVE": 0, "POSITIVE": 1}

-    optimizer, schedule = prepare_training(tokenized_text)
+    optimizer, _ = prepare_training(tokenized_text)
    model = TFAutoModelForSequenceClassification.from_pretrained(
        "bert-base-german-cased", num_labels=2, id2label=id2label, label2id=label2id
    )
@@ -126,7 +96,8 @@ def main(input_data: Union[str, os.PathLike]):
    model.compile(optimizer=optimizer)

-    checkpoint_filepath = '/Users/franziskaoschmann/Documents/public_policy/tmp/checkpoint/'
+    path_checkpoint = (p_repo).joinpath('tmp/checkpoint/')
+    checkpoint_filepath = path_checkpoint

    metric_callback = KerasMetricCallback(metric_fn=compute_metrics, eval_dataset=tf_validation_set)
    checkpoint_callback = ModelCheckpoint(checkpoint_filepath, monitor='val_loss',
                                          save_best_only=True,
@@ -140,8 +111,12 @@ def main(input_data: Union[str, os.PathLike]):
    callbacks = [metric_callback, checkpoint_callback, tensorboard_callback]
    model.fit(x=tf_train_set, validation_data=tf_validation_set, epochs=5, callbacks=callbacks)

-    model.save_pretrained('/Users/franziskaoschmann/Documents/public_policy/saved_models/')
-    tokenizer.save_pretrained('/Users/franziskaoschmann/Documents/public_policy/saved_models/')
+    # Save model
+    path_model = (p_repo).joinpath('saved_models/')
+    model.save_pretrained(path_model)
+    tokenizer.save_pretrained(path_model)
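
# Reload sketch (standard Hugging Face API; uses the `path_model` written above):
#   model = TFAutoModelForSequenceClassification.from_pretrained(path_model)
#   tokenizer = AutoTokenizer.from_pretrained(path_model)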
......
from datasets import Dataset, DatasetDict
import evaluate
from transformers import AutoTokenizer
from transformers import DataCollatorWithPadding
from transformers import create_optimizer
from transformers import TFAutoModelForSequenceClassification
import numpy as np
from transformers.keras_callbacks import KerasMetricCallback
from transformers.keras_callbacks import PushToHubCallback
import tensorflow as tf
from tensorflow.keras.callbacks import ModelCheckpoint
import pandas as pd
from typing import Union
import os
import click
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, precision_recall_fscore_support
from src.preprocessing_text import TextLoader
def load_text(path: Union[str, os.PathLike], newspaper: str = 'tagesanzeiger', lang: str = 'de') -> pd.DataFrame:
    """
    Loads dataframe and extracts text depending on newspaper and language
    """
    df = pd.read_csv(path)
    df = df.loc[(df.originTenantId == newspaper) & (df.language == lang)]
    df = df[['text', 'rejected']]
    df = df.rename(columns={"rejected": "label"})
    return df

def df2dict(df: pd.DataFrame):
    """
    Converts Dataframe into Huggingface Dataset
    """
    df = df.sample(100)
    train, test = train_test_split(df, test_size=0.2)
    ds_train = Dataset.from_pandas(train)
    ds_test = Dataset.from_pandas(test)
    ds = DatasetDict()
    ds['train'] = ds_train
    ds['test'] = ds_test
    return ds

def compute_metrics(eval_pred):
    accuracy = evaluate.load("accuracy")
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

def prepare_training(dataset, batch_size: int = 16, num_epochs: int = 5):
    """
    Prepares training and sets params
    """
    batches_per_epoch = len(dataset["train"]) // batch_size
    total_train_steps = int(batches_per_epoch * num_epochs)
    optimizer, schedule = create_optimizer(init_lr=2e-5, num_warmup_steps=0, num_train_steps=total_train_steps)
    return optimizer, schedule
@click.argument('input_data')
def main(input_data: Union[str, os.PathLike]):
    # load data and extract only german text from tagesanzeiger
    print('Load text')
    tl = TextLoader(input_data)
    df_de = tl.load_text_csv(newspaper='tagesanzeiger', load_subset=True)

    ## Dataframe to dict/Train-test split
    ds = df2dict(df_de)

    ## Preprocessing/Tokenization
    tokenizer = AutoTokenizer.from_pretrained("bert-base-german-cased")

    def preprocess_function(examples):
        return tokenizer(examples["text"], truncation=True)

    # truncate sequences to be no longer than the model's maximum input length
    tokenized_text = ds.map(preprocess_function)

    # dynamically pad sentences to the longest length in a batch
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="tf")

    ## Evaluation
    accuracy = evaluate.load("accuracy")

    # Training
    id2label = {0: "NEGATIVE", 1: "POSITIVE"}
    label2id = {"NEGATIVE": 0, "POSITIVE": 1}

    optimizer, schedule = prepare_training(tokenized_text)
    model = TFAutoModelForSequenceClassification.from_pretrained(
        "bert-base-german-cased", num_labels=2, id2label=id2label, label2id=label2id
    )

    tf_train_set = model.prepare_tf_dataset(
        tokenized_text["train"],
        shuffle=True,
        batch_size=16,
        collate_fn=data_collator,
    )

    tf_validation_set = model.prepare_tf_dataset(
        tokenized_text["test"],
        shuffle=False,
        batch_size=16,
        collate_fn=data_collator,
    )
    model.compile(optimizer=optimizer)

    checkpoint_filepath = '/cluster/work/sis/cdss/oschmanf/ppg-moderation-classifier/tmp/checkpoint/'
    metric_callback = KerasMetricCallback(metric_fn=compute_metrics, eval_dataset=tf_validation_set)
    checkpoint_callback = ModelCheckpoint(checkpoint_filepath,
                                          monitor='val_loss',
                                          save_best_only=True,
                                          save_weights_only=False,
                                          mode='min',
                                          save_freq='epoch',
                                          initial_value_threshold=None)

    callbacks = [metric_callback, checkpoint_callback]
    model.fit(x=tf_train_set, validation_data=tf_validation_set, epochs=5, callbacks=callbacks)

    model.save_pretrained('/cluster/work/sis/cdss/oschmanf/ppg-moderation-classifier/tmp/saved_models/')
    tokenizer.save_pretrained('/cluster/work/sis/cdss/oschmanf/ppg-moderation-classifier/tmp/saved_models/')
if __name__ == "__main__":
    main()
import pandas as pd
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split
import evaluate
import numpy as np
from transformers import create_optimizer
def df2dict(df: pd.DataFrame, test_size: float = 0.2):
    """
    Converts Dataframe into Huggingface Dataset
    """
    df = df.sample(100)  # note: subsamples 100 random rows before splitting
    train, test = train_test_split(df, test_size=test_size)
    ds_train = Dataset.from_pandas(train)
    ds_test = Dataset.from_pandas(test)
    ds = DatasetDict()
    ds['train'] = ds_train
    ds['test'] = ds_test
    return ds
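
# Usage sketch with toy data (hypothetical values; df2dict subsamples 100 rows,
# so the input needs at least that many):
#
#   toy = pd.DataFrame({'text': [f'Kommentar {i}' for i in range(200)],
#                       'label': [i % 2 for i in range(200)]})
#   ds = df2dict(toy)
#   print(ds['train'].num_rows, ds['test'].num_rows)  # -> 80 20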
def compute_metrics(eval_pred):
    """
    Computes metrics during training
    """
    accuracy = evaluate.load("accuracy")
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)
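
# Sanity-check sketch with dummy logits (illustrative values only):
#
#   logits = np.array([[0.1, 0.9], [2.0, -1.0]])
#   labels = np.array([1, 0])
#   compute_metrics((logits, labels))  # -> {'accuracy': 1.0}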
def prepare_training(dataset, batch_size: int = 16, num_epochs: int = 5):
    """
    Prepares training and sets params
    """
    batches_per_epoch = len(dataset["train"]) // batch_size
    total_train_steps = int(batches_per_epoch * num_epochs)
    optimizer, schedule = create_optimizer(init_lr=2e-5, num_warmup_steps=0, num_train_steps=total_train_steps)
    return optimizer, schedule
\ No newline at end of file
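
# Worked example of the step arithmetic (assumed sizes): with 8000 training
# rows, batch_size=16 and num_epochs=5, prepare_training derives
# 8000 // 16 = 500 batches per epoch and 500 * 5 = 2500 total steps, which
# create_optimizer uses for its linear learning-rate decay from 2e-5.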