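"""
Fine-tunes bert-base-german-cased for binary sequence classification on
newspaper comments (the "rejected" flag) using the Hugging Face Trainer API.

Usage (hypothetical input path):
    python train_BERT_torch.py data/comments.csv
"""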
from datasets import Dataset, DatasetDict

import evaluate
from transformers import AutoTokenizer
from transformers import DataCollatorWithPadding

from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
from transformers import create_optimizer  # TF-style helper used by prepare_training

import numpy as np

import pandas as pd

from typing import Union
import os

import click

from sklearn.model_selection import train_test_split

from src.preprocessing_text import TextLoader


def load_text(
    path: Union[str, os.PathLike], newspaper: str = "tagesanzeiger", lang: str = "de"
) -> pd.DataFrame:
    """
    Loads a dataframe and extracts text depending on newspaper and language
    """
    df = pd.read_csv(path)
    df = df.loc[(df.originTenantId == newspaper) & (df.language == lang)]
    df = df[["text", "rejected"]]
    df = df.rename(columns={"rejected": "label"})

    return df
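
# Example (hypothetical path):
#   df = load_text("data/comments.csv", newspaper="tagesanzeiger", lang="de")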


def df2dict(df: pd.DataFrame):
    """
    Converts a dataframe into a Hugging Face DatasetDict with train/test splits
    """

    # subsample for faster experimentation; hold out 20% for evaluation
    df = df.sample(10000)
    train, test = train_test_split(df, test_size=0.2)

    ds_train = Dataset.from_pandas(train)
    ds_test = Dataset.from_pandas(test)

    ds = DatasetDict()
    ds["train"] = ds_train
    ds["test"] = ds_test
Franziska Oschmann's avatar
Franziska Oschmann committed

def compute_metrics(eval_pred):
    """
    Computes accuracy from the (logits, labels) pair the Trainer passes in.
    """
    accuracy = evaluate.load("accuracy")
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)


def prepare_training(dataset, batch_size: int = 16, num_epochs: int = 5):
    """
    Computes the training schedule and builds an optimizer for a given dataset.

    Note: create_optimizer is the TF-style helper from transformers; the
    PyTorch Trainer below manages its own optimizer, so this helper is
    currently unused in main().
    """

    batches_per_epoch = len(dataset["train"]) // batch_size
    total_train_steps = int(batches_per_epoch * num_epochs)

    optimizer, schedule = create_optimizer(
        init_lr=2e-5, num_warmup_steps=0, num_train_steps=total_train_steps
    )

    return optimizer, schedule


@click.command()
@click.argument("input_data")
def main(input_data: Union[str, os.PathLike]):
    # load data and extract only German text from tagesanzeiger
    print("Load text")
    tl = TextLoader(input_data)
    df_de = tl.load_text_csv(newspaper="tagesanzeiger", load_subset=True)

    # Dataframe to DatasetDict / train-test split
    ds = df2dict(df_de)

    # Preprocessing/Tokenization
    print("tokenize")
    tokenizer = AutoTokenizer.from_pretrained("bert-base-german-cased")

    def preprocess_function(examples):
        return tokenizer(examples["text"], truncation=True)

    # truncate sequences to be no longer than the model's maximum input length
    print("map")
    tokenized_text = ds.map(preprocess_function)

    # dynamically pad sequences to the longest length in a batch
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    # Training
    id2label = {0: "NEGATIVE", 1: "POSITIVE"}
    label2id = {"NEGATIVE": 0, "POSITIVE": 1}

    model = AutoModelForSequenceClassification.from_pretrained(
        "bert-base-german-cased", num_labels=2, id2label=id2label, label2id=label2id
    )

    training_args = TrainingArguments(
        output_dir="my_awesome_model",
        learning_rate=2e-5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        num_train_epochs=2,
        weight_decay=0.01,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        push_to_hub=False,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_text["train"],
        eval_dataset=tokenized_text["test"],
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )

    trainer.train()
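
    # A quick post-training check (a sketch, not part of the original script):
    # Trainer.predict returns logits for the held-out split; argmax plus the
    # accuracy metric give a final test score.
    preds = trainer.predict(tokenized_text["test"])
    print(compute_metrics((preds.predictions, preds.label_ids)))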

if __name__ == "__main__":
    main()