from datasets import Dataset, DatasetDict
import evaluate
from transformers import AutoTokenizer
from transformers import DataCollatorWithPadding
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
from transformers import create_optimizer  # TF optimizer helper used by prepare_training; requires TensorFlow
import numpy as np
import pandas as pd
from typing import Union
import os
import click
from sklearn.model_selection import train_test_split
from src.preprocessing_text import TextLoader
def load_text(
    path: Union[str, os.PathLike], newspaper: str = "tagesanzeiger", lang: str = "de"
) -> pd.DataFrame:
    """
    Loads dataframe and extracts text depending on newspaper and language
    """
    df = pd.read_csv(path)
    df = df.loc[(df.originTenantId == newspaper) & (df.language == lang)]
    df = df[["text", "rejected"]]
    df = df.rename(columns={"rejected": "label"})
    return df
def df2dict(df: pd.DataFrame) -> DatasetDict:
    """
    Converts Dataframe into a Huggingface DatasetDict with train and test splits
    """
    df = df.sample(10000)
    train, test = train_test_split(df, test_size=0.2)
    ds_train = Dataset.from_pandas(train)
    ds_test = Dataset.from_pandas(test)
    ds = DatasetDict({"train": ds_train, "test": ds_test})
    return ds
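# metric callback for the Trainer: converts logits to class predictions and reports accuracy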
def compute_metrics(eval_pred):
    accuracy = evaluate.load("accuracy")
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)
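# note: prepare_training builds a TF optimizer/schedule and is not called in main() below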
def prepare_training(dataset, batch_size: int = 16, num_epochs: int = 5):
    """
    Creates the optimizer and learning-rate schedule for training
    """
    # total optimization steps = batches per epoch * number of epochs
    batches_per_epoch = len(dataset["train"]) // batch_size
    total_train_steps = int(batches_per_epoch * num_epochs)
    optimizer, schedule = create_optimizer(
        init_lr=2e-5, num_warmup_steps=0, num_train_steps=total_train_steps
    )
    return optimizer, schedule
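# end-to-end fine-tuning pipeline: load data, tokenize, and train a BERT classifier with the Trainer API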
@click.command()
@click.argument("input_data", type=click.Path(exists=True))
def main(input_data: Union[str, os.PathLike]):
    # load data and extract only German text from tagesanzeiger
    tl = TextLoader(input_data)
    df_de = tl.load_text_csv(newspaper="tagesanzeiger", load_subset=True)
    ds = df2dict(df_de)
    tokenizer = AutoTokenizer.from_pretrained("bert-base-german-cased")

    # truncate sequences to be no longer than the model's maximum input length
    def preprocess_function(examples):
        return tokenizer(examples["text"], truncation=True)

    print("Tokenizing dataset")
    tokenized_text = ds.map(preprocess_function)

    # dynamic padding of sentences to the longest sequence in each batch
    # data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="tf")
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    # Training
    id2label = {0: "NEGATIVE", 1: "POSITIVE"}
    label2id = {"NEGATIVE": 0, "POSITIVE": 1}
    model = AutoModelForSequenceClassification.from_pretrained(
        "bert-base-german-cased", num_labels=2, id2label=id2label, label2id=label2id
    )
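    # evaluate and save once per epoch; load_best_model_at_end reloads the best checkpoint after training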
    training_args = TrainingArguments(
        output_dir="my_awesome_model",
        learning_rate=2e-5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        num_train_epochs=2,
        weight_decay=0.01,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        push_to_hub=False,
    )
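    # the Trainer wires together model, data, tokenizer, collator, and metrics, and runs the training loop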
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_text["train"],
        eval_dataset=tokenized_text["test"],
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )
    trainer.train()
if __name__ == "__main__":
    main()
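# example usage (hypothetical script name and data path): python train_model.py data/comments.csv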