diff --git a/README.md b/README.md index dac397921c188b966d7e9fead294b65c03a112af..95fc854d4c54c326b5625a1f17597f14b6f1f274 100644 --- a/README.md +++ b/README.md @@ -1,21 +1,110 @@ # Moderation classifier -## Installation +# Installation local ``` python -m venv pp_env source pp_env/bin/activate pip install -r requirements.txt ``` +# Installation Euler -## Usage +## TensorFlow -### 1. Activation of environment +``` +module load gcc/8.2.0 python_gpu/3.10.4 eth_proxy +python -m venv --system-site-packages pp_env_tf_python310 +source pp_env_tf_python310/bin/activate +pip install -r requirements.txt +``` + +# Activation of environment + +## Local ``` source pp_env/bin/activate ``` -### 2. Preprocessing of dataframe (adding language field) +## On Euler + +### TensorFlow +``` +srun --pty --mem-per-cpu=3g --gpus=1 --gres=gpumem:12g bash +module load gcc/8.2.0 python_gpu/3.10.4 eth_proxy +source pp_env_tf_python310/bin/activate +``` + +# Usage + +## 1. Preprocessing of dataframe (adding language field) ``` moderation_classifier --prepare_data path_to_csv ``` + +## 2. Model training + +For the model training, several options can be chosen: + +``` +Usage: moderation_classifier [OPTIONS] INPUT_DATA + + Run moderation classifier. + :param split_data: Binary flag to specify if data should be split. + :param prepare_data: Binary flag to specify if data should be prepared. + :param text_preprocessing: Binary flag to set text preprocessing. + :param newspaper: Name of newspaper selected for training. + :param topic: Topic selected for training. + :param pretrained_model: Name of pretrained BERT model to use for finetuning. + :param train_mnb: Binary flag to specify whether MNB should be trained. + :param train_bert: Binary flag to specify whether BERT should be trained. + :param eval_mnb: Binary flag to specify whether MNB should be evaluated. + :param eval_bert: Binary flag to specify whether BERT should be evaluated. + :param input_data: Path to input dataframe. 
+ +Options: + -s, --split + -p, --prepare_data + -tp, --text_preprocessing + -n, --newspaper TEXT + -t, --topic TEXT + -pm, --pretrained_model TEXT + -tm, --train_mnb + -tb, --train_bert + -em, --eval_mnb + -eb, --eval_bert + -tbto, --train_bert_torch +``` + +The most important options during training are the model type (MNB or BERT) and the newspaper and topic selected for training. + +### MNB +Training for all newspapers and topics is started with the following command: +``` +moderation_classifier --train_mnb INPUT_DATA +``` + +Training for one newspaper (here: tagesanzeiger) and one topic (here: Wissen) is started with the following command: +``` +moderation_classifier --newspaper tagesanzeiger --topic Wissen --train_mnb INPUT_DATA +``` + +After the training is finished, a log file with all relevant information (path to train data, params for filtering, ...) is stored in `saved_models/MNB_logs`. For the evaluation of the training, only the path to this log file is needed. The evaluation of the training run is started with: +``` +moderation_classifier --eval_mnb LOG_FILE +``` + +### BERT +Training for all newspapers and topics is started with the following command: +``` +moderation_classifier --text_preprocessing --pretrained_model "bert-base-german-cased" --train_bert INPUT_DATA +``` + +Training for one newspaper (here: tagesanzeiger) and one topic (here: Wissen) is started with the following command: +``` +moderation_classifier --text_preprocessing --pretrained_model "bert-base-german-cased" --newspaper tagesanzeiger --topic Wissen --train_bert INPUT_DATA +``` + +After the training is finished, a log file with all relevant information (path to train data, params for filtering, ...) is stored in `saved_models/BERT_logs`. For the evaluation of the training, only the path to this log file is needed. 
The evaluation of the training run is started with: +``` +moderation_classifier --eval_bert LOG_FILE +``` diff --git a/euler/prepare_data.sh b/euler/prepare_data.sh new file mode 100755 index 0000000000000000000000000000000000000000..3fab9c0258a9e35519c872661b2455a3966cdec5 --- /dev/null +++ b/euler/prepare_data.sh @@ -0,0 +1,6 @@ +#!/bin/bash + +module load gcc/8.2.0 python_gpu/3.10.4 eth_proxy +source ../pp_env_tf_python310/bin/activate + +sbatch --mem-per-cpu=4g --time=6:00:00 --wrap "moderation_classifier --prepare_data ../data/tamedia_for_classifier_v3.csv" diff --git a/euler/prepare_data_cluster.sh b/euler/prepare_data_cluster.sh new file mode 100755 index 0000000000000000000000000000000000000000..3fab9c0258a9e35519c872661b2455a3966cdec5 --- /dev/null +++ b/euler/prepare_data_cluster.sh @@ -0,0 +1,6 @@ +#!/bin/bash + +module load gcc/8.2.0 python_gpu/3.10.4 eth_proxy +source ../pp_env_tf_python310/bin/activate + +sbatch --mem-per-cpu=4g --time=6:00:00 --wrap "moderation_classifier --prepare_data ../data/tamedia_for_classifier_v3.csv" diff --git a/euler/train_model_cluster_germbert_alltopic_allhsprob.sh b/euler/train_model_cluster_germbert_alltopic_allhsprob.sh new file mode 100755 index 0000000000000000000000000000000000000000..87b6f743183958f6a08abd579faa3fe089096b54 --- /dev/null +++ b/euler/train_model_cluster_germbert_alltopic_allhsprob.sh @@ -0,0 +1,15 @@ +#!/bin/bash + +module load gcc/8.2.0 python_gpu/3.10.4 eth_proxy +source ../pp_env_tf_python310/bin/activate +# Submit the training job; the --wrap string must form ONE command line, hence the trailing backslashes +sbatch --mem-per-cpu=12g\ + --gpus=1\ + --gres=gpumem:12g\ + --time=30:00:00\ + --wrap "moderation_classifier --newspaper tagesanzeiger \ + --pretrained_model 'bert-base-german-cased' \ + --text_preprocessing \ + --train_bert ../data/tamedia_for_classifier_v4_preproc_train.csv" + + diff --git a/euler/train_model_cluster_germbert_alltopic_highhsprob.sh b/euler/train_model_cluster_germbert_alltopic_highhsprob.sh new file mode 100755 index 
0000000000000000000000000000000000000000..a74b17c9d1f107cdb8d25dc0c22ea37919475d5b --- /dev/null +++ b/euler/train_model_cluster_germbert_alltopic_highhsprob.sh @@ -0,0 +1,16 @@ +#!/bin/bash + +module load gcc/8.2.0 python_gpu/3.10.4 eth_proxy +source ../pp_env_tf_python310/bin/activate +# Submit the training job; the --wrap string must form ONE command line, hence the trailing backslashes +sbatch --mem-per-cpu=12g\ + --gpus=1\ + --gres=gpumem:12g\ + --time=30:00:00\ + --wrap "moderation_classifier --newspaper tagesanzeiger \ + --pretrained_model 'bert-base-german-cased' \ + --text_preprocessing \ + --hsprob '[0.7,1]' \ + --train_bert ../data/tamedia_for_classifier_v4_preproc_train.csv" + + diff --git a/euler/train_model_cluster_germbert_alltopic_lowhsprob.sh b/euler/train_model_cluster_germbert_alltopic_lowhsprob.sh new file mode 100755 index 0000000000000000000000000000000000000000..5dbc36a0d667c0ba2d349b383fec76b1d0c27cb5 --- /dev/null +++ b/euler/train_model_cluster_germbert_alltopic_lowhsprob.sh @@ -0,0 +1,16 @@ +#!/bin/bash + +module load gcc/8.2.0 python_gpu/3.10.4 eth_proxy +source ../pp_env_tf_python310/bin/activate +# Submit the training job; the --wrap string must form ONE command line, hence the trailing backslashes +sbatch --mem-per-cpu=12g\ + --gpus=1\ + --gres=gpumem:12g\ + --time=30:00:00\ + --wrap "moderation_classifier --newspaper tagesanzeiger \ + --pretrained_model 'bert-base-german-cased' \ + --text_preprocessing \ + --hsprob '[0.0,0.3]' \ + --train_bert ../data/tamedia_for_classifier_v4_preproc_train.csv" + + diff --git a/euler/train_model_cluster_germbert_wissentopic_allhsprob.sh b/euler/train_model_cluster_germbert_wissentopic_allhsprob.sh new file mode 100755 index 0000000000000000000000000000000000000000..276034dafb5b740276c04d3a6c64b47fc7d678dd --- /dev/null +++ b/euler/train_model_cluster_germbert_wissentopic_allhsprob.sh @@ -0,0 +1,16 @@ +#!/bin/bash + +module load gcc/8.2.0 python_gpu/3.10.4 eth_proxy +source ../pp_env_tf_python310/bin/activate +# Submit the training job; the --wrap string must form ONE command line, hence the trailing backslashes +sbatch --mem-per-cpu=12g\ + --gpus=1\ + --gres=gpumem:12g\ + --time=30:00:00\ + --wrap "moderation_classifier --newspaper tagesanzeiger \ + --pretrained_model 'bert-base-german-cased' \ + --text_preprocessing \ + 
--topic 'Wissen' \ + --train_bert ../data/tamedia_for_classifier_v4_preproc_train.csv" + + diff --git a/euler/train_model_cluster_hsbert_alltopic_allhsprob.sh b/euler/train_model_cluster_hsbert_alltopic_allhsprob.sh new file mode 100755 index 0000000000000000000000000000000000000000..86aa89bc1328464bce42d4c0fcdab3fd76fb92df --- /dev/null +++ b/euler/train_model_cluster_hsbert_alltopic_allhsprob.sh @@ -0,0 +1,15 @@ +#!/bin/bash + +module load gcc/8.2.0 python_gpu/3.10.4 eth_proxy +source ../pp_env_tf_python310/bin/activate +# Submit the training job; the --wrap string must form ONE command line, hence the trailing backslashes +sbatch --mem-per-cpu=12g\ + --gpus=1\ + --gres=gpumem:12g\ + --time=30:00:00\ + --wrap "moderation_classifier --newspaper tagesanzeiger \ + --pretrained_model 'deepset/bert-base-german-cased-hatespeech-GermEval18Coarse' \ + --text_preprocessing \ + --train_bert ../data/tamedia_for_classifier_v4_preproc_train.csv" + + diff --git a/euler/train_model_cluster_hsbert_alltopic_highhsprob.sh b/euler/train_model_cluster_hsbert_alltopic_highhsprob.sh new file mode 100755 index 0000000000000000000000000000000000000000..165e8c4817a40970c050127c5ca43158fe24f1fc --- /dev/null +++ b/euler/train_model_cluster_hsbert_alltopic_highhsprob.sh @@ -0,0 +1,16 @@ +#!/bin/bash + +module load gcc/8.2.0 python_gpu/3.10.4 eth_proxy +source ../pp_env_tf_python310/bin/activate +# Submit the training job; the --wrap string must form ONE command line, hence the trailing backslashes +sbatch --mem-per-cpu=12g\ + --gpus=1\ + --gres=gpumem:12g\ + --time=30:00:00\ + --wrap "moderation_classifier --newspaper tagesanzeiger \ + --pretrained_model 'deepset/bert-base-german-cased-hatespeech-GermEval18Coarse' \ + --text_preprocessing \ + --hsprob '[0.7,1]' \ + --train_bert ../data/tamedia_for_classifier_v4_preproc_train.csv" + + diff --git a/euler/train_model_cluster_hsbert_alltopic_lowhsprob.sh b/euler/train_model_cluster_hsbert_alltopic_lowhsprob.sh new file mode 100755 index 0000000000000000000000000000000000000000..aea1bc6bfe851c5f1b51c02edcadbb11e9fce542 --- /dev/null +++ b/euler/train_model_cluster_hsbert_alltopic_lowhsprob.sh @@ -0,0 +1,16 @@ +#!/bin/bash + +module load gcc/8.2.0 python_gpu/3.10.4 
eth_proxy +source ../pp_env_tf_python310/bin/activate +# Submit the training job; the --wrap string must form ONE command line, hence the trailing backslashes +sbatch --mem-per-cpu=12g\ + --gpus=1\ + --gres=gpumem:12g\ + --time=30:00:00\ + --wrap "moderation_classifier --newspaper tagesanzeiger \ + --pretrained_model 'deepset/bert-base-german-cased-hatespeech-GermEval18Coarse' \ + --text_preprocessing \ + --hsprob '[0.0,0.3]' \ + --train_bert ../data/tamedia_for_classifier_v4_preproc_train.csv" + + diff --git a/euler/train_model_cluster_hsbert_wissentopic_allhsprob.sh b/euler/train_model_cluster_hsbert_wissentopic_allhsprob.sh new file mode 100755 index 0000000000000000000000000000000000000000..9363b3106e49dacba8e03cca68703312b1d3f7e2 --- /dev/null +++ b/euler/train_model_cluster_hsbert_wissentopic_allhsprob.sh @@ -0,0 +1,16 @@ +#!/bin/bash + +module load gcc/8.2.0 python_gpu/3.10.4 eth_proxy +source ../pp_env_tf_python310/bin/activate +# Submit the training job; the --wrap string must form ONE command line, hence the trailing backslashes +sbatch --mem-per-cpu=12g\ + --gpus=1\ + --gres=gpumem:12g\ + --time=30:00:00\ + --wrap "moderation_classifier --newspaper tagesanzeiger \ + --pretrained_model 'deepset/bert-base-german-cased-hatespeech-GermEval18Coarse' \ + --text_preprocessing \ + --topic 'Wissen' \ + --train_bert ../data/tamedia_for_classifier_v4_preproc_train.csv" + + diff --git a/moderation_classifier/eval_BERT.py b/moderation_classifier/eval_BERT.py new file mode 100644 index 0000000000000000000000000000000000000000..84ff6ec60fa9a9c2c8282f953712f9f5ffc80a98 --- /dev/null +++ b/moderation_classifier/eval_BERT.py @@ -0,0 +1,130 @@ +from transformers import AutoTokenizer, TFAutoModelForSequenceClassification +import tensorflow as tf + +import click +import numpy as np +import os +import pandas as pd +from pathlib import Path +from typing import List, Union + +from sklearn.metrics import precision_recall_fscore_support, accuracy_score + +from src.preprocessing_text import TextLoader, TextProcessor +from src.train_logs import load_logs +from src.BERT_utils import predict_batches +from src.eval_utils import gen_scores_dict + + +@click.argument("train_logs") +def main(train_logs: 
Union[str, os.PathLike]): + """ + Prepares data and evaluates trained BERT model with TF + :param train_logs: path to csv-file containing train logs + """ + + # Load logs + ( + path_repo, + path_model, + input_data, + text_preprocessing, + newspaper, + lang, + topic, + hsprob, + remove_duplicates, + min_num_words, + pretrained_model, + ) = load_logs(train_logs) + + + # Load data and extract only text from tagesanzeiger + print("Load and preprocess text") + tl = TextLoader(input_data) + df_de = tl.load_text_csv( + newspaper=newspaper, + lang=lang, + topic=topic, + hsprob=hsprob, + load_subset=False, + remove_duplicates=remove_duplicates, + min_num_words=min_num_words, + ) + + if text_preprocessing: + tp = TextProcessor() + text_proc = tp.fit_transform(df_de.text) + df_de.text = text_proc + comon_topics = tl.get_comments_per_topic(df_de) + + # Load tokenizer and model + tokenizer = AutoTokenizer.from_pretrained(pretrained_model) + model = TFAutoModelForSequenceClassification.from_pretrained( + pretrained_model_name_or_path=path_model + ) + + # Split text into batches + y_pred_all, y_prob_all = predict_batches(df_de.text.values, model, tokenizer) + + # NOTE(review): y_pred_all is indexed with a boolean mask below, so it is presumably a NumPy array aligned row-by-row with df_de - confirm in predict_batches + + # eval all + precision, recall, f1, _ = precision_recall_fscore_support( + df_de.label, y_pred_all, average="weighted" + ) + accuracy = accuracy_score(df_de.label, y_pred_all) + + results_all = gen_scores_dict(precision, recall, f1, accuracy) + + # eval per topic + topics = [t[0] for t in comon_topics] + results_t = dict() + + for t in topics: + y_test_t = df_de[df_de.topic == t].label + y_pred_t = y_pred_all[df_de.topic == t] + + precision, recall, f1, _ = precision_recall_fscore_support( + y_test_t, y_pred_t, average="weighted" + ) + accuracy = accuracy_score(y_test_t, y_pred_t) + + results_t[t] = gen_scores_dict(precision, recall, f1, accuracy) + + # Compute rejection rate + reject_rate_all = np.round(df_de.label.mean(), 4) * 100 + reject_rate_topic = [ + np.round(df_de[df_de.topic == 
k].label.mean(), 4) * 100 for k in topics + ] + + # Compute number comments + num_comm_all = df_de.shape[0] + num_comm_topic = [df_de[df_de.topic == k].shape[0] for k in topics] + + # Save results labels + df_res_all = pd.DataFrame().from_dict(results_all, orient="index", columns=["all"]) + df_res_all.loc["rejection rate"] = reject_rate_all + df_res_all.loc["number comments"] = num_comm_all + + df_res_topic = pd.DataFrame.from_dict(results_t) + df_res_topic.loc["rejection rate"] = reject_rate_topic + df_res_topic.loc["number comments"] = num_comm_topic + + df_res = df_res_all.join(df_res_topic) + df_res.loc["data"] = [input_data] * df_res.shape[1] + + df_res.to_csv( + path_repo + "/results/results_eval_BERT/" + Path(path_model).stem + ".csv" + ) + + # Save results probs + df_prob_all = df_de.copy() + df_prob_all['bert_probability'] = y_prob_all + df_prob_all.to_csv( + path_repo + "/results/results_eval_BERT/" + Path(path_model).stem + "_bert_probability.csv" + ) + + +if __name__ == "__main__": + main() diff --git a/moderation_classifier/eval_MNB.py b/moderation_classifier/eval_MNB.py new file mode 100644 index 0000000000000000000000000000000000000000..d510863549480b0c3851407da7d130a2b5b5cf06 --- /dev/null +++ b/moderation_classifier/eval_MNB.py @@ -0,0 +1,110 @@ +import click +from collections import Counter +import numpy as np +import pandas as pd +from pathlib import Path +from sklearn.metrics import precision_recall_fscore_support + +from typing import Union +import os + +from src.MNB_utils import load_model +from src.preprocessing_text import TextLoader +from src.train_logs import load_logs +from src.eval_utils import gen_scores_dict + + +@click.argument("train_logs") +def main(train_logs: Union[str, os.PathLike]): + """ + Prepares data and evaluates trained MNB model + :param train_logs: path to csv-file containing train logs + """ + + # Load logs + ( + path_repo, + path_model, + input_data, + _, + newspaper, + lang, + topic, + remove_duplicates, + 
min_num_words, + ) = load_logs(train_logs) + + # Load model + pipe = load_model(path_model) + + # Load test data + tl = TextLoader(input_data) + df_test = tl.load_text_csv( + newspaper=newspaper, + lang=lang, + topic=topic, + load_subset=False, + remove_duplicates=remove_duplicates, + min_num_words=min_num_words, + ) + + X_test = df_test.text + y_test = df_test.label + + # Make prediction + y_pred = pipe.predict(X_test) + + # Compute scores and add to dict + precision, recall, f1, _ = precision_recall_fscore_support( + y_test, y_pred, average="weighted" + ) + accuracy = pipe.score(X_test, y_test) + + results_all = gen_scores_dict(precision, recall, f1, accuracy) + + # Get results per topic + count_topics = Counter(df_test["topic"]).most_common(10) + topics = [t[0] for t in count_topics] + results_t = dict() + + for t in topics: + X_test_t = df_test[df_test.topic == t].text + y_test_t = df_test[df_test.topic == t].label + + y_pred_t = pipe.predict(X_test_t) + precision, recall, f1, _ = precision_recall_fscore_support( + y_test_t, y_pred_t, average="weighted" + ) + accuracy = pipe.score(X_test_t, y_test_t) + + results_t[t] = gen_scores_dict(precision, recall, f1, accuracy) + + # Compute rejection rate + reject_rate_all = np.round(df_test.label.mean(), 4) * 100 + reject_rate_topic = [ + np.round(df_test[df_test.topic == k].label.mean(), 4) * 100 for k in topics + ] + + # Compute number comments + num_comm_all = df_test.shape[0] + num_comm_topic = [df_test[df_test.topic == k].shape[0] for k in topics] + + # Save results + df_res_all = pd.DataFrame().from_dict(results_all, orient="index", columns=["all"]) + df_res_all.loc["rejection rate"] = reject_rate_all + df_res_all.loc["number comments"] = num_comm_all + + df_res_topic = pd.DataFrame.from_dict(results_t) + df_res_topic.loc["rejection rate"] = reject_rate_topic + df_res_topic.loc["number comments"] = num_comm_topic + + df_res = df_res_all.join(df_res_topic) + df_res.loc["data"] = [input_data] * df_res.shape[1] + + 
df_res.to_csv( + path_repo + "/results/results_eval_MNB/" + Path(path_model).stem + ".csv" + ) + + +if __name__ == "__main__": + main() diff --git a/moderation_classifier/main.py b/moderation_classifier/main.py index 449af44b889a55146be5b1653bfe57b156155637..b8a7746bd0fd751f342f44a42059d34f894b3ca9 100644 --- a/moderation_classifier/main.py +++ b/moderation_classifier/main.py @@ -1,29 +1,93 @@ # imports -from pathlib import Path - import click -from src.preprocessing import DataProcessor +from src.preprocessing_df import DataProcessor +import moderation_classifier.split_data as split_data +import moderation_classifier.train_MNB as train_MNB +import moderation_classifier.train_BERT as train_BERT +import moderation_classifier.eval_MNB as eval_MNB +import moderation_classifier.eval_BERT as eval_BERT +import moderation_classifier.train_BERT_torch as train_BERT_torch from typing import Union import os + @click.command() -@click.option('-p', '--prepare_data', is_flag=True) -@click.argument('input_data') -def main(prepare_data: bool, input_data: Union[str, os.PathLike]): +@click.option("-s", "--split", is_flag=True) +@click.option("-p", "--prepare_data", is_flag=True) +@click.option("-tp", "--text_preprocessing", is_flag=True) +@click.option("-n", "--newspaper", default=None) +@click.option("-t", "--topic", default=None) +@click.option("-h", "--hsprob", default=None) +@click.option("-pm", "--pretrained_model", default=None) +@click.option("-tm", "--train_mnb", is_flag=True) +@click.option("-tb", "--train_bert", is_flag=True) +@click.option("-em", "--eval_mnb", is_flag=True) +@click.option("-eb", "--eval_bert", is_flag=True) +@click.option("-tbto", "--train_bert_torch", is_flag=True) +@click.argument("input_data") +def main( + split: bool, + prepare_data: bool, + text_preprocessing: bool, + newspaper: str, + topic: str, + hsprob: list, + pretrained_model: str, + train_mnb: bool, + train_bert: bool, + eval_mnb: bool, + eval_bert: bool, + train_bert_torch: bool, + 
input_data: Union[str, os.PathLike], +): """ Run moderation classifier. + :param split: Binary flag to specify if data should be split. :param prepare_data: Binary flag to specify if data should be prepared. + :param text_preprocessing: Binary flag to set text preprocessing. + :param newspaper: Name of newspaper selected for training. + :param topic: Topic selected for training. + :param hsprob: Min/max bounds for hate speech probability, given as a list literal such as '[0.7,1]'. + :param pretrained_model: Name of pretrained BERT model to use for finetuning. + :param train_mnb: Binary flag to specify whether MNB should be trained. + :param train_bert: Binary flag to specify whether BERT should be trained. + :param eval_mnb: Binary flag to specify whether MNB should be evaluated. + :param eval_bert: Binary flag to specify whether BERT should be evaluated. :param input_data: Path to input dataframe. """ - + + if split: + split_data.main(input_data) + if prepare_data: dp = DataProcessor(input_data) dp.add_language() print(input_data) - print('Prepare data') + print("Prepare data") + + if train_mnb: + train_MNB.main(input_data, newspaper, topic) + + if train_bert: + if hsprob is not None: + import ast + # literal_eval only parses Python literals such as "[0.7,1]"; unlike eval() it cannot execute code passed on the command line + hsprob = ast.literal_eval(hsprob) + train_BERT.main( + input_data, text_preprocessing, newspaper, topic, hsprob, pretrained_model + ) + + if eval_mnb: + eval_MNB.main(input_data) + + if eval_bert: + eval_BERT.main(input_data) + + if train_bert_torch: + train_BERT_torch.main(input_data) if __name__ == "__main__": - main() \ No newline at end of file + main() diff --git a/moderation_classifier/predict_BERT.py b/moderation_classifier/predict_BERT.py new file mode 100644 index 0000000000000000000000000000000000000000..242b874d36e5880b13a4146b339095bb662f8113 --- /dev/null +++ b/moderation_classifier/predict_BERT.py @@ -0,0 +1,19 @@ +from datasets import load_dataset +from evaluate import evaluator +from transformers import pipeline + +data = load_dataset("imdb", split="test").shuffle(seed=42).select(range(10)) + 
+task_evaluator = evaluator("text-classification") + +pipe = pipeline("text-classification", model="../saved_models/20230630-103946/") + +eval_results = task_evaluator.compute( + model_or_pipeline=pipe, + data=data, + label_mapping={"NEGATIVE": 0, "POSITIVE": 1} +) + +# Report the computed metrics; no debugger breakpoint here so the script can run unattended +print(eval_results) + diff --git a/moderation_classifier/split_data.py b/moderation_classifier/split_data.py new file mode 100644 index 0000000000000000000000000000000000000000..5adce77edd9f6874f6b7729c2ab36b28ddcf6d7e --- /dev/null +++ b/moderation_classifier/split_data.py @@ -0,0 +1,33 @@ +import os +import pandas as pd +from pathlib import Path +from typing import Union + +from sklearn.model_selection import train_test_split + + +def main(input_data: Union[str, os.PathLike]): + """ + Performs train-test split with respect to newspaper count + """ + df = pd.read_csv(input_data) + + df_train, df_test = train_test_split(df, test_size=0.3, stratify=df.originTenantId) + + path_train = ( + Path(input_data) + # with_name keeps dots inside the stem; with_suffix would cut the new name at its last dot + .with_name(Path(input_data).stem + "_train.csv") + ) + path_test = ( + Path(input_data) + # same naming rule as path_train, so the two outputs can never collapse onto one file + .with_name(Path(input_data).stem + "_test.csv") + ) + + df_train.to_csv(path_train) + df_test.to_csv(path_test) + + +if __name__ == "__main__": + main() diff --git a/moderation_classifier/train_BERT.py b/moderation_classifier/train_BERT.py new file mode 100644 index 0000000000000000000000000000000000000000..d69708715d93259e362e6570f06f83f544e46d61 --- /dev/null +++ b/moderation_classifier/train_BERT.py @@ -0,0 +1,167 @@ +from transformers import AutoTokenizer +from transformers import DataCollatorWithPadding +from transformers import TFAutoModelForSequenceClassification +from transformers.keras_callbacks import KerasMetricCallback + +from tensorflow.keras.callbacks import ModelCheckpoint +from tensorflow.keras.callbacks import TensorBoard + +import click +import datetime +import os +import pandas as pd +from pathlib import Path 
+import spacy +from typing import Union + +from src.preprocessing_text import TextLoader, TextProcessor +from src.prepare_bert_tf import df2dict, compute_metrics, prepare_training +from src.train_logs import save_logs + + +@click.argument("input_data", required=True) +@click.argument("text_preprocessing", required=False) +@click.argument("newspaper", required=False) +@click.argument("topic", required=False) +@click.argument("pretrained_model", required=True) +def main( + input_data: Union[str, os.PathLike], + text_preprocessing: bool, + newspaper: str, + topic: str, + hsprob: list, + pretrained_model: str, +): + """ + Prepares data and trains BERT model with TF + :param input_data: path to input data + :param text_preprocessing: Binary flag to set text preprocessing. + :param newspaper: Name of newspaper selected for training. + :param topic: Topic selected for training. + :param hsprob: List with min max values for hate speech probability + :param pretrained_model: Name of pretrained BERT model to use for finetuning. 
+ """ + + def preprocess_function(examples): + """ + Prepares tokenizer for mapping + """ + return tokenizer(examples["text"], truncation=True) + + # Extract path + p = Path(input_data) + p_repo = p.parent.parent + + # Load data and extract only text from tagesanzeiger + print("Load and preprocess text") + lang = "de" + remove_duplicates = True + min_num_words = 3 + tl = TextLoader(input_data) + df_de = tl.load_text_csv( + newspaper=newspaper, + lang=lang, + topic=topic, + hsprob=hsprob, + load_subset=False, + remove_duplicates=remove_duplicates, + min_num_words=min_num_words, + ) + + if text_preprocessing: + tp = TextProcessor(lowercase=False) + text_proc = tp.fit_transform(df_de.text) + df_de.text = text_proc + #df_de = df_de.sample(100) + + # Prepare data for modeling + ds = df2dict(df_de) + # pretrained_model = "bert-base-german-cased" + tokenizer = AutoTokenizer.from_pretrained(pretrained_model) + tokenized_text = ds.map(preprocess_function) + data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="tf") + + # Training + print("Train model") + id2label = {0: "NEGATIVE", 1: "POSITIVE"} + label2id = {"NEGATIVE": 0, "POSITIVE": 1} + + optimizer, _ = prepare_training(tokenized_text) + model = TFAutoModelForSequenceClassification.from_pretrained( + pretrained_model, num_labels=2, id2label=id2label, label2id=label2id + ) + + tf_train_set = model.prepare_tf_dataset( + tokenized_text["train"], + shuffle=True, + batch_size=16, + collate_fn=data_collator, + ) + + tf_validation_set = model.prepare_tf_dataset( + tokenized_text["test"], + shuffle=False, + batch_size=16, + collate_fn=data_collator, + ) + + model.compile(optimizer=optimizer) + + # Define checkpoint + time_stemp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") + path_checkpoint = (p_repo).joinpath("tmp/checkpoint/" + time_stemp) + checkpoint_filepath = path_checkpoint + metric_callback = KerasMetricCallback( + metric_fn=compute_metrics, eval_dataset=tf_validation_set + ) + 
checkpoint_callback = ModelCheckpoint( + checkpoint_filepath, + monitor="val_loss", + save_best_only=True, + save_weights_only=False, + mode="min", + save_freq="epoch", + initial_value_threshold=None, + ) + log_dir = "logs/fit/" + time_stemp + tensorboard_callback = TensorBoard(log_dir=log_dir, histogram_freq=1) + + callbacks = [metric_callback, checkpoint_callback, tensorboard_callback] + + # Fit model + print("Train model") + model.fit( + x=tf_train_set, + validation_data=tf_validation_set, + epochs=5, + verbose=2, + callbacks=callbacks, + ) + + # Save model + print("Save model") + path_model = (p_repo).joinpath("saved_models/" + time_stemp) + model.save_pretrained(path_model) + tokenizer.save_pretrained(path_model) + + # Save model logs + save_logs( + path_repo=p_repo, + path_model=path_model, + input_data=input_data, + text_preprocessing=text_preprocessing,  # log the flag actually used so evaluation repeats the same preprocessing + newspaper=newspaper, + lang=lang, + topic=topic, + hsprob=hsprob, + remove_duplicates=remove_duplicates, + min_num_words=min_num_words, + model_name="BERT", + pretrained_model=pretrained_model, + ) + + print("Done") + + +if __name__ == "__main__": + main() diff --git a/moderation_classifier/train_BERT_torch.py b/moderation_classifier/train_BERT_torch.py new file mode 100644 index 0000000000000000000000000000000000000000..5be8e6997533ceb8f8ba69b187f37dddd8bd525b --- /dev/null +++ b/moderation_classifier/train_BERT_torch.py @@ -0,0 +1,135 @@ +from datasets import Dataset, DatasetDict + +import evaluate +from transformers import AutoTokenizer +from transformers import DataCollatorWithPadding + +from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer + +import numpy as np + +import pandas as pd + +from typing import Union +import os + +import click + +from sklearn.model_selection import train_test_split + +from src.preprocessing_text import TextLoader + + +def load_text( + path: Union[str, os.PathLike], newspaper: str = "tagesanzeiger", lang: str = "de" +) -> pd.DataFrame: + """ + Loads 
dataframe and extracts text depending on newspaper and language + """ + df = pd.read_csv(path) + df = df.loc[(df.originTenantId == newspaper) & (df.language == lang)] + df = df[["text", "rejected"]] + df = df.rename(columns={"rejected": "label"}) + + return df + + +def df2dict(df: pd.DataFrame): + """ + Converts Dataframe into Huggingface Dataset + """ + # Cap the subsample at the frame size: sample(10000) raises on frames with fewer rows + df = df.sample(min(len(df), 10000)) + train, test = train_test_split(df, test_size=0.2) + + ds_train = Dataset.from_pandas(train) + ds_test = Dataset.from_pandas(test) + + ds = DatasetDict() + ds["train"] = ds_train + ds["test"] = ds_test + + return ds + + +def compute_metrics(eval_pred): + accuracy = evaluate.load("accuracy") + predictions, labels = eval_pred + predictions = np.argmax(predictions, axis=1) + return accuracy.compute(predictions=predictions, references=labels) + + +def prepare_training(dataset, batch_size: int = 16, num_epochs: int = 5): + """ + Prepares training and sets params + """ + from transformers import create_optimizer  # was never imported here (NameError on call); kept local because it is the TensorFlow optimizer helper + batches_per_epoch = len(dataset["train"]) // batch_size + total_train_steps = int(batches_per_epoch * num_epochs) + optimizer, schedule = create_optimizer( + init_lr=2e-5, num_warmup_steps=0, num_train_steps=total_train_steps + ) + + return optimizer, schedule + + +@click.argument("input_data") +def main(input_data: Union[str, os.PathLike]): + # load data and extract only german text from tagesanzeiger + print("Load text") + tl = TextLoader(input_data) + df_de = tl.load_text_csv(newspaper="tagesanzeiger", load_subset=True) + + # Dataframe to dict/Train-test split + ds = df2dict(df_de) + + # Preprocessing/Tokenization + print("tokenize") + tokenizer = AutoTokenizer.from_pretrained("bert-base-german-cased") + + def preprocess_function(examples): + return tokenizer(examples["text"], truncation=True) + + # truncate sequences to be no longer than the models maximum input length + print("map") + tokenized_text = ds.map(preprocess_function) + + # dynamically padding of sentences to the longest length in a batch + # data_collator = 
DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="tf") + data_collator = DataCollatorWithPadding(tokenizer=tokenizer) + + # Training + id2label = {0: "NEGATIVE", 1: "POSITIVE"} + label2id = {"NEGATIVE": 0, "POSITIVE": 1} + + model = AutoModelForSequenceClassification.from_pretrained( + "bert-base-german-cased", num_labels=2, id2label=id2label, label2id=label2id + ) + + training_args = TrainingArguments( + output_dir="my_awesome_model", + learning_rate=2e-5, + per_device_train_batch_size=16, + per_device_eval_batch_size=16, + num_train_epochs=2, + weight_decay=0.01, + evaluation_strategy="epoch", + save_strategy="epoch", + load_best_model_at_end=True, + push_to_hub=False, + ) + + trainer = Trainer( + model=model, + args=training_args, + train_dataset=tokenized_text["train"], + eval_dataset=tokenized_text["test"], + tokenizer=tokenizer, + data_collator=data_collator, + compute_metrics=compute_metrics, + ) + + trainer.train() + +if __name__ == "__main__": + main() diff --git a/moderation_classifier/train_MNB.py b/moderation_classifier/train_MNB.py new file mode 100644 index 0000000000000000000000000000000000000000..44bddadbdd1aeddb3cc57d66d25e7ab18d1eddac --- /dev/null +++ b/moderation_classifier/train_MNB.py @@ -0,0 +1,72 @@ +from sklearn.model_selection import train_test_split + +import click +from pathlib import Path +from typing import Union +import os + +from src.MNB_utils import create_pipeline, create_path, save_model +from src.preprocessing_text import TextLoader +from src.train_logs import save_logs + + +@click.argument("input_data") +@click.argument("newspaper") +@click.argument("topic") +def main(input_data: Union[str, os.PathLike], newspaper: str, topic: str): + """ + Runs training of MNB. + :param input_data: Path to input dataframe. 
+ """ + + # Extract path + p = Path(input_data) + p_repo = p.parent.parent + + # Load data and extract only text from tagesanzeiger + print("Load and preprocess text") + lang = "de" + remove_duplicates = True + min_num_words = 3 + tl = TextLoader(input_data) + df_de = tl.load_text_csv( + newspaper=newspaper, + lang=lang, + topic=topic, + load_subset=False, + remove_duplicates=remove_duplicates, + min_num_words=min_num_words, + ) + + # Prepare data for modeling + text = df_de.text + label = df_de.label + + X_train, X_val, y_train, y_val = train_test_split(text, label, stratify=label) + + # Training + print("Train model") + pipe = create_pipeline() + pipe.fit(X_train, y_train) + val_score = pipe.score(X_val, y_val) + + # Save model and training logs + path = create_path() + save_model(pipe, path) + save_logs( + path_repo=p_repo, + path_model=path, + input_data=input_data, + text_preprocessing=True, + newspaper=newspaper, + lang=lang, + topic=topic, + remove_duplicates=remove_duplicates, + min_num_words=min_num_words, + model_name="MNB", + val_score=val_score, + ) + + +if __name__ == "__main__": + main() diff --git a/notebooks/data-exploration.ipynb b/notebooks/data-exploration.ipynb index f209771b25cb8884dc4407d569b90cfbe6fbc77c..cbf67ec62b6fe302c85b0a3eba015381e1cd3d38 100644 --- a/notebooks/data-exploration.ipynb +++ b/notebooks/data-exploration.ipynb @@ -38,12 +38,29 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 11, "id": "6efcb560", "metadata": {}, - "outputs": [], + "outputs": [ + { + "ename": "FileNotFoundError", + "evalue": "[Errno 2] No such file or directory: '../data/tamedia_for_classifier_v4_preproc.csv'", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mFileNotFoundError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[11], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m df \u001b[38;5;241m=\u001b[39m 
\u001b[43mpd\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mread_csv\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43m../data/tamedia_for_classifier_v4_preproc.csv\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/Documents/public_policy/pp_env/lib/python3.10/site-packages/pandas/io/parsers/readers.py:912\u001b[0m, in \u001b[0;36mread_csv\u001b[0;34m(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, date_format, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, encoding_errors, dialect, on_bad_lines, delim_whitespace, low_memory, memory_map, float_precision, storage_options, dtype_backend)\u001b[0m\n\u001b[1;32m 899\u001b[0m kwds_defaults \u001b[38;5;241m=\u001b[39m _refine_defaults_read(\n\u001b[1;32m 900\u001b[0m dialect,\n\u001b[1;32m 901\u001b[0m delimiter,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 908\u001b[0m dtype_backend\u001b[38;5;241m=\u001b[39mdtype_backend,\n\u001b[1;32m 909\u001b[0m )\n\u001b[1;32m 910\u001b[0m kwds\u001b[38;5;241m.\u001b[39mupdate(kwds_defaults)\n\u001b[0;32m--> 912\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43m_read\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfilepath_or_buffer\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mkwds\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/Documents/public_policy/pp_env/lib/python3.10/site-packages/pandas/io/parsers/readers.py:577\u001b[0m, in \u001b[0;36m_read\u001b[0;34m(filepath_or_buffer, kwds)\u001b[0m\n\u001b[1;32m 574\u001b[0m 
_validate_names(kwds\u001b[38;5;241m.\u001b[39mget(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mnames\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;28;01mNone\u001b[39;00m))\n\u001b[1;32m 576\u001b[0m \u001b[38;5;66;03m# Create the parser.\u001b[39;00m\n\u001b[0;32m--> 577\u001b[0m parser \u001b[38;5;241m=\u001b[39m \u001b[43mTextFileReader\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfilepath_or_buffer\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwds\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 579\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m chunksize \u001b[38;5;129;01mor\u001b[39;00m iterator:\n\u001b[1;32m 580\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m parser\n", + "File \u001b[0;32m~/Documents/public_policy/pp_env/lib/python3.10/site-packages/pandas/io/parsers/readers.py:1407\u001b[0m, in \u001b[0;36mTextFileReader.__init__\u001b[0;34m(self, f, engine, **kwds)\u001b[0m\n\u001b[1;32m 1404\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39moptions[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mhas_index_names\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m kwds[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mhas_index_names\u001b[39m\u001b[38;5;124m\"\u001b[39m]\n\u001b[1;32m 1406\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mhandles: IOHandles \u001b[38;5;241m|\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[0;32m-> 1407\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_engine \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_make_engine\u001b[49m\u001b[43m(\u001b[49m\u001b[43mf\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mengine\u001b[49m\u001b[43m)\u001b[49m\n", + "File 
\u001b[0;32m~/Documents/public_policy/pp_env/lib/python3.10/site-packages/pandas/io/parsers/readers.py:1661\u001b[0m, in \u001b[0;36mTextFileReader._make_engine\u001b[0;34m(self, f, engine)\u001b[0m\n\u001b[1;32m 1659\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mb\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;129;01min\u001b[39;00m mode:\n\u001b[1;32m 1660\u001b[0m mode \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mb\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m-> 1661\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mhandles \u001b[38;5;241m=\u001b[39m \u001b[43mget_handle\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 1662\u001b[0m \u001b[43m \u001b[49m\u001b[43mf\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1663\u001b[0m \u001b[43m \u001b[49m\u001b[43mmode\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1664\u001b[0m \u001b[43m \u001b[49m\u001b[43mencoding\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43moptions\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mencoding\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1665\u001b[0m \u001b[43m \u001b[49m\u001b[43mcompression\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43moptions\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mcompression\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m 
\u001b[49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1666\u001b[0m \u001b[43m \u001b[49m\u001b[43mmemory_map\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43moptions\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mmemory_map\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1667\u001b[0m \u001b[43m \u001b[49m\u001b[43mis_text\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mis_text\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1668\u001b[0m \u001b[43m \u001b[49m\u001b[43merrors\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43moptions\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mencoding_errors\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mstrict\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1669\u001b[0m \u001b[43m \u001b[49m\u001b[43mstorage_options\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43moptions\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mstorage_options\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1670\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1671\u001b[0m 
\u001b[38;5;28;01massert\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mhandles \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m 1672\u001b[0m f \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mhandles\u001b[38;5;241m.\u001b[39mhandle\n", + "File \u001b[0;32m~/Documents/public_policy/pp_env/lib/python3.10/site-packages/pandas/io/common.py:859\u001b[0m, in \u001b[0;36mget_handle\u001b[0;34m(path_or_buf, mode, encoding, compression, memory_map, is_text, errors, storage_options)\u001b[0m\n\u001b[1;32m 854\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(handle, \u001b[38;5;28mstr\u001b[39m):\n\u001b[1;32m 855\u001b[0m \u001b[38;5;66;03m# Check whether the filename is to be opened in binary mode.\u001b[39;00m\n\u001b[1;32m 856\u001b[0m \u001b[38;5;66;03m# Binary mode does not support 'encoding' and 'newline'.\u001b[39;00m\n\u001b[1;32m 857\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m ioargs\u001b[38;5;241m.\u001b[39mencoding \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mb\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;129;01min\u001b[39;00m ioargs\u001b[38;5;241m.\u001b[39mmode:\n\u001b[1;32m 858\u001b[0m \u001b[38;5;66;03m# Encoding\u001b[39;00m\n\u001b[0;32m--> 859\u001b[0m handle \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mopen\u001b[39;49m\u001b[43m(\u001b[49m\n\u001b[1;32m 860\u001b[0m \u001b[43m \u001b[49m\u001b[43mhandle\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 861\u001b[0m \u001b[43m \u001b[49m\u001b[43mioargs\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmode\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 862\u001b[0m \u001b[43m \u001b[49m\u001b[43mencoding\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mioargs\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mencoding\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 
863\u001b[0m \u001b[43m \u001b[49m\u001b[43merrors\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43merrors\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 864\u001b[0m \u001b[43m \u001b[49m\u001b[43mnewline\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 865\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 866\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 867\u001b[0m \u001b[38;5;66;03m# Binary mode\u001b[39;00m\n\u001b[1;32m 868\u001b[0m handle \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mopen\u001b[39m(handle, ioargs\u001b[38;5;241m.\u001b[39mmode)\n", + "\u001b[0;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: '../data/tamedia_for_classifier_v4_preproc.csv'" + ] + } + ], "source": [ - "df = pd.read_csv('../data/tamedia_for_classifier_v2_preproc.csv')" + "df = pd.read_csv('../data/tamedia_for_classifier_v4_preproc.csv')" ] }, { @@ -73,7 +90,6 @@ " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", - " <th>Unnamed: 0</th>\n", " <th>ID</th>\n", " <th>createdAt</th>\n", " <th>text</th>\n", @@ -81,13 +97,14 @@ " <th>state</th>\n", " <th>originTenantId</th>\n", " <th>replyTo</th>\n", - " <th>language</th>\n", + " <th>asset.risk</th>\n", + " <th>topic</th>\n", + " <th>hsprob</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", - " <td>0</td>\n", " <td>5fee66486ef49d0033d97e4c</td>\n", " <td>2021-01-01T00:01:12Z</td>\n", " <td>Hat schon welche, möchte aber lieber nicht erw...</td>\n", @@ -95,11 +112,12 @@ " <td>rejected</td>\n", " <td>tagesanzeiger</td>\n", " <td>5f537bbdd2abdd0032ec12ad</td>\n", - " <td>de</td>\n", + " <td>high</td>\n", + " <td>Zürich</td>\n", + " <td>0.051257</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", - " <td>1</td>\n", " <td>5fee66b7e9b26b00322cc53e</td>\n", " <td>2021-01-01T00:03:03Z</td>\n", " <td>Wieso nicht? 
Absolut kein Argument.</td>\n", @@ -107,11 +125,12 @@ " <td>rejected</td>\n", " <td>tagesanzeiger</td>\n", " <td>NaN</td>\n", - " <td>de</td>\n", + " <td>high</td>\n", + " <td>Bundeshaus</td>\n", + " <td>0.012496</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", - " <td>2</td>\n", " <td>5fee66bfe9b26b00322cc543</td>\n", " <td>2021-01-01T00:03:11Z</td>\n", " <td>Eine Impfung kostet vergleichsweise wenig. Und...</td>\n", @@ -119,11 +138,12 @@ " <td>approved</td>\n", " <td>derbund</td>\n", " <td>5fee4bccb3aa6d0032c3c1f0</td>\n", - " <td>de</td>\n", + " <td>high</td>\n", + " <td>Bundeshaus</td>\n", + " <td>0.027282</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", - " <td>3</td>\n", " <td>5fee66dca0dd250033ef02ea</td>\n", " <td>2021-01-01T00:03:40Z</td>\n", " <td>Sind Sie einfach nur etwas einfach oder hochgr...</td>\n", @@ -131,11 +151,12 @@ " <td>approved</td>\n", " <td>tagesanzeiger</td>\n", " <td>5fee1998e9b26b00322caaad</td>\n", - " <td>de</td>\n", + " <td>low</td>\n", + " <td>Meinungen</td>\n", + " <td>0.020309</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", - " <td>4</td>\n", " <td>5fee66ec6ef49d0033d97e7e</td>\n", " <td>2021-01-01T00:03:56Z</td>\n", " <td>Hä??? 
Von welchem Paralleluniversum ist hier m...</td>\n", @@ -143,19 +164,21 @@ " <td>rejected</td>\n", " <td>tagesanzeiger</td>\n", " <td>5fedfcbdf31d260033d38738</td>\n", - " <td>de</td>\n", + " <td>low</td>\n", + " <td>Schweiz</td>\n", + " <td>0.018285</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ - " Unnamed: 0 ID createdAt \n", - "0 0 5fee66486ef49d0033d97e4c 2021-01-01T00:01:12Z \\\n", - "1 1 5fee66b7e9b26b00322cc53e 2021-01-01T00:03:03Z \n", - "2 2 5fee66bfe9b26b00322cc543 2021-01-01T00:03:11Z \n", - "3 3 5fee66dca0dd250033ef02ea 2021-01-01T00:03:40Z \n", - "4 4 5fee66ec6ef49d0033d97e7e 2021-01-01T00:03:56Z \n", + " ID createdAt \n", + "0 5fee66486ef49d0033d97e4c 2021-01-01T00:01:12Z \\\n", + "1 5fee66b7e9b26b00322cc53e 2021-01-01T00:03:03Z \n", + "2 5fee66bfe9b26b00322cc543 2021-01-01T00:03:11Z \n", + "3 5fee66dca0dd250033ef02ea 2021-01-01T00:03:40Z \n", + "4 5fee66ec6ef49d0033d97e7e 2021-01-01T00:03:56Z \n", "\n", " text rejected state \n", "0 Hat schon welche, möchte aber lieber nicht erw... 1 rejected \\\n", @@ -164,12 +187,12 @@ "3 Sind Sie einfach nur etwas einfach oder hochgr... 0 approved \n", "4 Hä??? Von welchem Paralleluniversum ist hier m... 
1 rejected \n", "\n", - " originTenantId replyTo language \n", - "0 tagesanzeiger 5f537bbdd2abdd0032ec12ad de \n", - "1 tagesanzeiger NaN de \n", - "2 derbund 5fee4bccb3aa6d0032c3c1f0 de \n", - "3 tagesanzeiger 5fee1998e9b26b00322caaad de \n", - "4 tagesanzeiger 5fedfcbdf31d260033d38738 de " + " originTenantId replyTo asset.risk topic hsprob \n", + "0 tagesanzeiger 5f537bbdd2abdd0032ec12ad high Zürich 0.051257 \n", + "1 tagesanzeiger NaN high Bundeshaus 0.012496 \n", + "2 derbund 5fee4bccb3aa6d0032c3c1f0 high Bundeshaus 0.027282 \n", + "3 tagesanzeiger 5fee1998e9b26b00322caaad low Meinungen 0.020309 \n", + "4 tagesanzeiger 5fedfcbdf31d260033d38738 low Schweiz 0.018285 " ] }, "execution_count": 4, @@ -204,18 +227,29 @@ "metadata": {}, "outputs": [ { - "data": { - "text/plain": [ - "<Axes: xlabel='language', ylabel='Count'>" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" + "ename": "KeyError", + "evalue": "'language'", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)", + "File \u001b[0;32m~/Documents/public_policy/pp_env/lib/python3.10/site-packages/pandas/core/indexes/base.py:3652\u001b[0m, in \u001b[0;36mIndex.get_loc\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m 3651\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m-> 3652\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_engine\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget_loc\u001b[49m\u001b[43m(\u001b[49m\u001b[43mcasted_key\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 3653\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m err:\n", + "File \u001b[0;32m~/Documents/public_policy/pp_env/lib/python3.10/site-packages/pandas/_libs/index.pyx:147\u001b[0m, 
in \u001b[0;36mpandas._libs.index.IndexEngine.get_loc\u001b[0;34m()\u001b[0m\n", + "File \u001b[0;32m~/Documents/public_policy/pp_env/lib/python3.10/site-packages/pandas/_libs/index.pyx:176\u001b[0m, in \u001b[0;36mpandas._libs.index.IndexEngine.get_loc\u001b[0;34m()\u001b[0m\n", + "File \u001b[0;32mpandas/_libs/hashtable_class_helper.pxi:7080\u001b[0m, in \u001b[0;36mpandas._libs.hashtable.PyObjectHashTable.get_item\u001b[0;34m()\u001b[0m\n", + "File \u001b[0;32mpandas/_libs/hashtable_class_helper.pxi:7088\u001b[0m, in \u001b[0;36mpandas._libs.hashtable.PyObjectHashTable.get_item\u001b[0;34m()\u001b[0m\n", + "\u001b[0;31mKeyError\u001b[0m: 'language'", + "\nThe above exception was the direct cause of the following exception:\n", + "\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[5], line 2\u001b[0m\n\u001b[1;32m 1\u001b[0m fig, axes \u001b[38;5;241m=\u001b[39m plt\u001b[38;5;241m.\u001b[39msubplots(\u001b[38;5;241m1\u001b[39m, \u001b[38;5;241m1\u001b[39m, figsize\u001b[38;5;241m=\u001b[39m(\u001b[38;5;241m10\u001b[39m,\u001b[38;5;241m5\u001b[39m))\n\u001b[0;32m----> 2\u001b[0m sns\u001b[38;5;241m.\u001b[39mhistplot(\u001b[43mdf\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mlanguage\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m]\u001b[49m)\n", + "File \u001b[0;32m~/Documents/public_policy/pp_env/lib/python3.10/site-packages/pandas/core/frame.py:3761\u001b[0m, in \u001b[0;36mDataFrame.__getitem__\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m 3759\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcolumns\u001b[38;5;241m.\u001b[39mnlevels \u001b[38;5;241m>\u001b[39m \u001b[38;5;241m1\u001b[39m:\n\u001b[1;32m 3760\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_getitem_multilevel(key)\n\u001b[0;32m-> 3761\u001b[0m indexer \u001b[38;5;241m=\u001b[39m 
\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcolumns\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget_loc\u001b[49m\u001b[43m(\u001b[49m\u001b[43mkey\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 3762\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m is_integer(indexer):\n\u001b[1;32m 3763\u001b[0m indexer \u001b[38;5;241m=\u001b[39m [indexer]\n", + "File \u001b[0;32m~/Documents/public_policy/pp_env/lib/python3.10/site-packages/pandas/core/indexes/base.py:3654\u001b[0m, in \u001b[0;36mIndex.get_loc\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m 3652\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_engine\u001b[38;5;241m.\u001b[39mget_loc(casted_key)\n\u001b[1;32m 3653\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m err:\n\u001b[0;32m-> 3654\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m(key) \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01merr\u001b[39;00m\n\u001b[1;32m 3655\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mTypeError\u001b[39;00m:\n\u001b[1;32m 3656\u001b[0m \u001b[38;5;66;03m# If we have a listlike key, _check_indexing_error will raise\u001b[39;00m\n\u001b[1;32m 3657\u001b[0m \u001b[38;5;66;03m# InvalidIndexError. 
Otherwise we fall through and re-raise\u001b[39;00m\n\u001b[1;32m 3658\u001b[0m \u001b[38;5;66;03m# the TypeError.\u001b[39;00m\n\u001b[1;32m 3659\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_check_indexing_error(key)\n", + "\u001b[0;31mKeyError\u001b[0m: 'language'" + ] }, { "data": { - "image/png": "", + "image/png": "", "text/plain": [ "<Figure size 1000x500 with 1 Axes>" ] @@ -239,24 +273,20 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 7, "id": "3bbbe6f6", "metadata": {}, "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "Rejected comments:\n", - "ganz im Gegensatz zu den Mit\"arbeitern\" im Steueramt\n", - "\n", - "\n", - "Sie armer Mensch. Immer müssen Sie sich beklagen, weil Ihnen die Welt so furchtbar ungerecht erscheint.\n", - "\n", - "\n", - "Es ist interessant wie es funktioniert: Patrizia Dänzi vom IKRK wird Chefin der Deza im EDA. Frau Spojlaric Egger vom EDA wird IKRK Chefin. MitarbeiterInnen scheinen bessere Opportunitäten ausserhalb Ihrer Karriereinstitution zu bekommen. 
Geht da Know-How nicht verloren?\n", - "\n", - "\n" + "ename": "AttributeError", + "evalue": "'DataFrame' object has no attribute 'language'", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m/var/folders/bn/hrm9f3gs76z5zb1bxxc4g_s00000gn/T/ipykernel_92361/1030998729.py\u001b[0m in \u001b[0;36m?\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mdf_rejected_de\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdf\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlanguage\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;34m'de'\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m&\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mdf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrejected\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;32mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2\u001b[0m \u001b[0mdf_accepted_de\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdf\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlanguage\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;34m'de'\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m&\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mdf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrejected\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;32mFalse\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'Rejected comments:'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/Documents/public_policy/pp_env/lib/python3.10/site-packages/pandas/core/generic.py\u001b[0m in 
\u001b[0;36m?\u001b[0;34m(self, name)\u001b[0m\n\u001b[1;32m 5985\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0mname\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_accessors\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5986\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_info_axis\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_can_hold_identifiers_and_holds_name\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mname\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5987\u001b[0m ):\n\u001b[1;32m 5988\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mname\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 5989\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mobject\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__getattribute__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mname\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[0;31mAttributeError\u001b[0m: 'DataFrame' object has no attribute 'language'" ] } ], @@ -272,7 +302,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 8, "id": "173770c7", "metadata": {}, "outputs": [ @@ -280,19 +310,18 @@ "name": "stdout", "output_type": "stream", "text": [ - "Accepted comments:\n", - "In Art. 23e Abs.2 PTM kommt der Ausdruck „Leib und Leben“ nicht vor, sondern es wird von der „Verbreitung von Furcht und Schrecken“ gesprochen, ohne näher zu definieren, durch welche Aktionen diese Art von Furcht und Schrecken verbreitet wird und nach welchen Kriterien festgestellt wird, ob jemand in Furcht und Schrecken versetzt worden ist. Damit ist der Deliktekatalog, der zu Furcht und Schrecken im Sinn des PTM führt und als terroristische Aktivität gilt, nicht abschliessend definiert. 
Es ist damit unklar, welche Verhaltensarten dazu führen, als terroristisch aktive Person eingestuft zu werden. Das ist nach meinem Verständnis der Vorbehalt, den Nils Melzer anbringt, der angesichts seines CV nicht als Leichtgewichtjurist eingestuft werden kann.\n", - "Was strafbare Handlungen gegen Leib und Leben sind, ist in Art. 111 bis 136 StGB definiert und es braucht kein PTM, die Verfolgung solcher Vergehen zu ermöglichen.\n", - "Das PTM ist meines Erachtens ein Gesetz für präventive Massnahmen, die basierend auf den geltenden Gesetzen nicht eingeleitet werden dürfen.\n", - "Angesichts dieses Sachverhaltes ist Ihre Darstellung, freundlich ausgedrückt, laienhaft.\n", - "\n", - "\n", - "Ach ja. Und wie meinen Sie, Frau Baus, ist die Menschheit die fürchterliche Krankheit Pocken los geworden? Genau, mit Impfzwang. Die obligatorische Pockenimpfung wurde z.B. in Deutschland Mitte der 70er Jahre abgeschafft, in Österreich Anfang der 80er. Auch Polio, Diphterie und viele üble Infektionskrankheiten mehr wurden dank der Impfung von fast der ganzen Bevölkerung richtiggehend ausgerottet. Nur weil die Impfungen in der Vergangenheit so verdammt erfolgreich waren, müssen wir diese schreckliche Krankheiten nicht mehr erleben. Aber eben, wir vergessen leider (zu) schnell.\n", - "\n", - "\n", - "Abstand halten, Maske tragen und Hände desinfizieren hilft auch gegen Grippeviren. 
Deshalb nichts als logisch, gibt es aktuell keine Grippewelle.\n", - "\n", - "\n" + "Accepted comments:\n" + ] + }, + { + "ename": "NameError", + "evalue": "name 'df_accepted_de' is not defined", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[8], line 2\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mAccepted comments:\u001b[39m\u001b[38;5;124m'\u001b[39m)\n\u001b[0;32m----> 2\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m c \u001b[38;5;129;01min\u001b[39;00m \u001b[43mdf_accepted_de\u001b[49m\u001b[38;5;241m.\u001b[39mtext\u001b[38;5;241m.\u001b[39msample(\u001b[38;5;241m3\u001b[39m):\n\u001b[1;32m 3\u001b[0m \u001b[38;5;28mprint\u001b[39m(c)\n\u001b[1;32m 4\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m'\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;124m'\u001b[39m)\n", + "\u001b[0;31mNameError\u001b[0m: name 'df_accepted_de' is not defined" ] } ], @@ -313,7 +342,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 9, "id": "078ad528", "metadata": {}, "outputs": [], @@ -323,19 +352,20 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 10, "id": "484f790d", "metadata": {}, "outputs": [ { - "data": { - "image/png": "", - "text/plain": [ - "<Figure size 640x480 with 1 Axes>" - ] - }, - "metadata": {}, - "output_type": "display_data" + "ename": "NameError", + "evalue": "name 'df_accepted_de' is not defined", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[10], line 2\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;66;03m# Create and generate a word cloud 
image:\u001b[39;00m\n\u001b[0;32m----> 2\u001b[0m text_de_accepted \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m \u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;241m.\u001b[39mjoin(\u001b[43mdf_accepted_de\u001b[49m\u001b[38;5;241m.\u001b[39mtext)\n\u001b[1;32m 3\u001b[0m wordcloud \u001b[38;5;241m=\u001b[39m WordCloud(stopwords\u001b[38;5;241m=\u001b[39mgerman_stop_words)\u001b[38;5;241m.\u001b[39mgenerate(text_de_accepted)\n\u001b[1;32m 5\u001b[0m \u001b[38;5;66;03m# Display the generated image:\u001b[39;00m\n", + "\u001b[0;31mNameError\u001b[0m: name 'df_accepted_de' is not defined" + ] } ], "source": [ @@ -351,21 +381,10 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": null, "id": "5cc400e0", "metadata": {}, - "outputs": [ - { - "data": { - "image/png": "", - "text/plain": [ - "<Figure size 640x480 with 1 Axes>" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "# Create and generate a word cloud image:\n", "text_de_rejected = ' '.join(df_rejected_de.text)\n", @@ -387,7 +406,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": null, "id": "58e91383", "metadata": {}, "outputs": [], @@ -398,31 +417,10 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": null, "id": "00cc3793", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[Text(0.5, 1.0, 'accepted'), (0.0, 400.0)]" - ] - }, - "execution_count": 12, - "metadata": {}, - "output_type": "execute_result" - }, - { - "data": { - "image/png": "", - "text/plain": [ - "<Figure size 1000x500 with 2 Axes>" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "fig, axes = plt.subplots(1, 2, figsize=(10,5))\n", "fig.suptitle('Number of words per sentence')\n", @@ -440,29 +438,21 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 6, "id": "974e3104", "metadata": {}, "outputs": [ { - "data": { - 
"text/plain": [ - "[Text(0.5, 1.0, 'German comments')]" - ] - }, - "execution_count": 13, - "metadata": {}, - "output_type": "execute_result" - }, - { - "data": { - "image/png": "", - "text/plain": [ - "<Figure size 640x480 with 1 Axes>" - ] - }, - "metadata": {}, - "output_type": "display_data" + "ename": "AttributeError", + "evalue": "'DataFrame' object has no attribute 'language'", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m/var/folders/bn/hrm9f3gs76z5zb1bxxc4g_s00000gn/T/ipykernel_92361/38630575.py\u001b[0m in \u001b[0;36m?\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0msns\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mhistplot\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdf\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mdf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlanguage\u001b[0m\u001b[0;34m==\u001b[0m\u001b[0;34m'de'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrejected\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mset\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtitle\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'German comments'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[0;32m~/Documents/public_policy/pp_env/lib/python3.10/site-packages/pandas/core/generic.py\u001b[0m in \u001b[0;36m?\u001b[0;34m(self, name)\u001b[0m\n\u001b[1;32m 5985\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0mname\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_accessors\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5986\u001b[0m \u001b[0;32mand\u001b[0m 
\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_info_axis\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_can_hold_identifiers_and_holds_name\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mname\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5987\u001b[0m ):\n\u001b[1;32m 5988\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mname\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 5989\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mobject\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__getattribute__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mname\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[0;31mAttributeError\u001b[0m: 'DataFrame' object has no attribute 'language'" + ] } ], "source": [ diff --git a/notebooks/hate-speech-score.ipynb b/notebooks/hate-speech-score.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..53b0308aa57b60992fd03f19421d57cd6f33f423 --- /dev/null +++ b/notebooks/hate-speech-score.ipynb @@ -0,0 +1,290 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "2f0ca1cd", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import seaborn as sns\n", + "\n", + "from datetime import date\n", + "from wordcloud import WordCloud \n", + "\n", + "from nltk.corpus import stopwords\n", + "\n", + "import matplotlib.pyplot as plt" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "e3567f43", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + 
"<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>ID</th>\n", + " <th>createdAt</th>\n", + " <th>text</th>\n", + " <th>rejected</th>\n", + " <th>state</th>\n", + " <th>originTenantId</th>\n", + " <th>replyTo</th>\n", + " <th>asset.risk</th>\n", + " <th>topic</th>\n", + " <th>hsprob</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>5fee66486ef49d0033d97e4c</td>\n", + " <td>2021-01-01T00:01:12Z</td>\n", + " <td>Hat schon welche, möchte aber lieber nicht erw...</td>\n", + " <td>1</td>\n", + " <td>rejected</td>\n", + " <td>tagesanzeiger</td>\n", + " <td>5f537bbdd2abdd0032ec12ad</td>\n", + " <td>high</td>\n", + " <td>Zürich</td>\n", + " <td>0.051257</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>5fee66b7e9b26b00322cc53e</td>\n", + " <td>2021-01-01T00:03:03Z</td>\n", + " <td>Wieso nicht? Absolut kein Argument.</td>\n", + " <td>1</td>\n", + " <td>rejected</td>\n", + " <td>tagesanzeiger</td>\n", + " <td>NaN</td>\n", + " <td>high</td>\n", + " <td>Bundeshaus</td>\n", + " <td>0.012496</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>5fee66bfe9b26b00322cc543</td>\n", + " <td>2021-01-01T00:03:11Z</td>\n", + " <td>Eine Impfung kostet vergleichsweise wenig. 
Und...</td>\n", + " <td>0</td>\n", + " <td>approved</td>\n", + " <td>derbund</td>\n", + " <td>5fee4bccb3aa6d0032c3c1f0</td>\n", + " <td>high</td>\n", + " <td>Bundeshaus</td>\n", + " <td>0.027282</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>5fee66dca0dd250033ef02ea</td>\n", + " <td>2021-01-01T00:03:40Z</td>\n", + " <td>Sind Sie einfach nur etwas einfach oder hochgr...</td>\n", + " <td>0</td>\n", + " <td>approved</td>\n", + " <td>tagesanzeiger</td>\n", + " <td>5fee1998e9b26b00322caaad</td>\n", + " <td>low</td>\n", + " <td>Meinungen</td>\n", + " <td>0.020309</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>5fee66ec6ef49d0033d97e7e</td>\n", + " <td>2021-01-01T00:03:56Z</td>\n", + " <td>Hä??? Von welchem Paralleluniversum ist hier m...</td>\n", + " <td>1</td>\n", + " <td>rejected</td>\n", + " <td>tagesanzeiger</td>\n", + " <td>5fedfcbdf31d260033d38738</td>\n", + " <td>low</td>\n", + " <td>Schweiz</td>\n", + " <td>0.018285</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " ID createdAt \n", + "0 5fee66486ef49d0033d97e4c 2021-01-01T00:01:12Z \\\n", + "1 5fee66b7e9b26b00322cc53e 2021-01-01T00:03:03Z \n", + "2 5fee66bfe9b26b00322cc543 2021-01-01T00:03:11Z \n", + "3 5fee66dca0dd250033ef02ea 2021-01-01T00:03:40Z \n", + "4 5fee66ec6ef49d0033d97e7e 2021-01-01T00:03:56Z \n", + "\n", + " text rejected state \n", + "0 Hat schon welche, möchte aber lieber nicht erw... 1 rejected \\\n", + "1 Wieso nicht? Absolut kein Argument. 1 rejected \n", + "2 Eine Impfung kostet vergleichsweise wenig. Und... 0 approved \n", + "3 Sind Sie einfach nur etwas einfach oder hochgr... 0 approved \n", + "4 Hä??? Von welchem Paralleluniversum ist hier m... 
1 rejected \n", + "\n", + " originTenantId replyTo asset.risk topic hsprob \n", + "0 tagesanzeiger 5f537bbdd2abdd0032ec12ad high Zürich 0.051257 \n", + "1 tagesanzeiger NaN high Bundeshaus 0.012496 \n", + "2 derbund 5fee4bccb3aa6d0032c3c1f0 high Bundeshaus 0.027282 \n", + "3 tagesanzeiger 5fee1998e9b26b00322caaad low Meinungen 0.020309 \n", + "4 tagesanzeiger 5fedfcbdf31d260033d38738 low Schweiz 0.018285 " + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Load data\n", + "df = pd.read_csv('../data/tamedia_for_classifier_v4.csv')\n", + "df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "ee505e09", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "<Axes: ylabel='Frequency'>" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "", + "text/plain": [ + "<Figure size 640x480 with 1 Axes>" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Distribution of probabilities\n", + "df.hsprob.plot.hist()" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "b6ae7880", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "<Axes: >" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "", + "text/plain": [ + "<Figure size 640x480 with 1 Axes>" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Number of rejected and eccepted comments for high hsprob\n", + "df[(df.hsprob>0.7)&(df.originTenantId=='tagesanzeiger')].state.hist()" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "4b775b9b", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "<Axes: >" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "", + "text/plain": [ + 
"<Figure size 640x480 with 1 Axes>" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Number of rejected and eccepted comments for low hsprob\n", + "df[(df.hsprob<0.3)&(df.originTenantId=='tagesanzeiger')].state.hist()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d019c140", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "pp_env", + "language": "python", + "name": "pp_env" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.9" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/notebooks/text-processing.ipynb b/notebooks/text-processing.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..a27fec0845255d75c0a581e11a0dfc1e6c44a0dc --- /dev/null +++ b/notebooks/text-processing.ipynb @@ -0,0 +1,272 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "a7eee72f", + "metadata": {}, + "outputs": [], + "source": [ + "from src.preprocessing_text import TextLoader, TextProcessor\n", + "import spacy" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "1cad5dc2", + "metadata": {}, + "outputs": [], + "source": [ + "input_data = '/Users/franziskaoschmann/Documents/public_policy/data/tamedia_for_classifier_v2_preproc.csv'\n", + "\n", + "tl = TextLoader(input_data)\n", + "\n", + "df_de = tl.load_text_csv(newspaper = 'tagesanzeiger', lang ='de')\n", + "\n", + "nlp = spacy.load('de_core_news_sm')" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "71ddb0ba", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(array([ 566, 1061, 1381, 1445, 1496, 1589, 1723, 1934, 2221, 2641, 2947,\n", + " 3628, 3899, 4007, 4280, 4650, 4852, 5202, 5656, 5770, 5985, 
6260,\n", + " 7141, 7204, 7804, 7972, 8005, 8261, 8504, 8846, 8857]),)" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import numpy as np\n", + "np.where(['@' in t for t in df_de.text[:10000]])" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "28887dee", + "metadata": {}, + "outputs": [], + "source": [ + "sample_text = df_de.iloc[566].text" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "163a698c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'Die Übersterblichkeit wird aufgrund der Todesfallzahlen des 7-Tage-Durchschnitts und nicht eines Jahresdurchschnitts berechnet. So sind übrigens auch saisonale Unterschiede berücksichtigt.\\nUnd nein @John Zürcher, es gab keine statistisch relevante Untersterblichkeit im Sommer, sondern nur eine, die sich am unteren Band der durchschnittlichen Sterblichkeit der letzten 5 Jahre bewegt hat. Den Grund dafür habe ich erwähnt.\\n@Martin Mader: was Sie schreiben ist Unsinn. Die durchschnittliche Sterblichkeit wird anhand der tatsächlichen Todesfälle über die letzten 5 Jahre festgehalten. Da sind alle massgebenden Parameter automatisch enthalten. Logisch, dass dieser so berechnete Durchschnitt sich jährlich aufgrund der Lebenserwartung verändert.\\n@Mark Keller: Ich habe nie etwas anderes behauptet. Man kann sogar soweit gehen, nachträglich die durchschnittliche Lebenszeit zu berechnen, die einem an dieser Pandemie verstorbenen Menschen genommen wurde. Sobald sich die Todesfallzahlen nach einer Phase der Übersterblichkeit durch eine unterdurchschnittliche Sterblichkeit oder durch eine Untersterblichkeit ausgeglichen haben, erhält man diesen Wert (ich gehe von rund 6 Monaten aus). Da es aber nur ein Durchschnittswert ist, sagt er nicht über die Lebenszeit aus, die einer an Corona verstorbenen Person wirklich weggenommen wurde. 
Bei den einen mögen es ein paar Tage oder Wochen sein, bei anderen aber mehrere Jahre oder gar Jahrzehnte.'" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sample_text" + ] + }, + { + "cell_type": "markdown", + "id": "3babd6fc", + "metadata": {}, + "source": [ + "### Remove spaces" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "d114f6a2", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'Die Übersterblichkeit wird aufgrund der Todesfallzahlen des 7-Tage-Durchschnitts und nicht eines Jahresdurchschnitts berechnet. So sind übrigens auch saisonale Unterschiede berücksichtigt. Und nein @John Zürcher, es gab keine statistisch relevante Untersterblichkeit im Sommer, sondern nur eine, die sich am unteren Band der durchschnittlichen Sterblichkeit der letzten 5 Jahre bewegt hat. Den Grund dafür habe ich erwähnt. @Martin Mader: was Sie schreiben ist Unsinn. Die durchschnittliche Sterblichkeit wird anhand der tatsächlichen Todesfälle über die letzten 5 Jahre festgehalten. Da sind alle massgebenden Parameter automatisch enthalten. Logisch, dass dieser so berechnete Durchschnitt sich jährlich aufgrund der Lebenserwartung verändert. @Mark Keller: Ich habe nie etwas anderes behauptet. Man kann sogar soweit gehen, nachträglich die durchschnittliche Lebenszeit zu berechnen, die einem an dieser Pandemie verstorbenen Menschen genommen wurde. Sobald sich die Todesfallzahlen nach einer Phase der Übersterblichkeit durch eine unterdurchschnittliche Sterblichkeit oder durch eine Untersterblichkeit ausgeglichen haben, erhält man diesen Wert (ich gehe von rund 6 Monaten aus). Da es aber nur ein Durchschnittswert ist, sagt er nicht über die Lebenszeit aus, die einer an Corona verstorbenen Person wirklich weggenommen wurde. 
Bei den einen mögen es ein paar Tage oder Wochen sein, bei anderen aber mehrere Jahre oder gar Jahrzehnte.'" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tp = TextProcessor(nlp)\n", + "\n", + "text_proc = tp.remove_spaces(sample_text)\n", + "\n", + "text_proc" + ] + }, + { + "cell_type": "markdown", + "id": "bfc38df2", + "metadata": {}, + "source": [ + "### Remove punctuation" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "216b53db", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'Die Übersterblichkeit wird aufgrund der Todesfallzahlen des 7TageDurchschnitts und nicht eines Jahresdurchschnitts berechnet So sind übrigens auch saisonale Unterschiede berücksichtigt Und nein @John Zürcher es gab keine statistisch relevante Untersterblichkeit im Sommer sondern nur eine die sich am unteren Band der durchschnittlichen Sterblichkeit der letzten 5 Jahre bewegt hat Den Grund dafür habe ich erwähnt @Martin Mader was Sie schreiben ist Unsinn Die durchschnittliche Sterblichkeit wird anhand der tatsächlichen Todesfälle über die letzten 5 Jahre festgehalten Da sind alle massgebenden Parameter automatisch enthalten Logisch dass dieser so berechnete Durchschnitt sich jährlich aufgrund der Lebenserwartung verändert @Mark Keller Ich habe nie etwas anderes behauptet Man kann sogar soweit gehen nachträglich die durchschnittliche Lebenszeit zu berechnen die einem an dieser Pandemie verstorbenen Menschen genommen wurde Sobald sich die Todesfallzahlen nach einer Phase der Übersterblichkeit durch eine unterdurchschnittliche Sterblichkeit oder durch eine Untersterblichkeit ausgeglichen haben erhält man diesen Wert ich gehe von rund 6 Monaten aus Da es aber nur ein Durchschnittswert ist sagt er nicht über die Lebenszeit aus die einer an Corona verstorbenen Person wirklich weggenommen wurde Bei den einen mögen es ein paar Tage oder Wochen sein bei anderen aber mehrere Jahre oder 
gar Jahrzehnte'" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "text_proc = tp.remove_punctuation(text_proc)\n", + "\n", + "text_proc" + ] + }, + { + "cell_type": "markdown", + "id": "0df7f111", + "metadata": {}, + "source": [ + "### Remove @-mentions" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "67edcb18", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'Die Übersterblichkeit wird aufgrund der Todesfallzahlen des 7TageDurchschnitts und nicht eines Jahresdurchschnitts berechnet So sind übrigens auch saisonale Unterschiede berücksichtigt Und nein Zürcher es gab keine statistisch relevante Untersterblichkeit im Sommer sondern nur eine die sich am unteren Band der durchschnittlichen Sterblichkeit der letzten 5 Jahre bewegt hat Den Grund dafür habe ich erwähnt Mader was Sie schreiben ist Unsinn Die durchschnittliche Sterblichkeit wird anhand der tatsächlichen Todesfälle über die letzten 5 Jahre festgehalten Da sind alle massgebenden Parameter automatisch enthalten Logisch dass dieser so berechnete Durchschnitt sich jährlich aufgrund der Lebenserwartung verändert Keller Ich habe nie etwas anderes behauptet Man kann sogar soweit gehen nachträglich die durchschnittliche Lebenszeit zu berechnen die einem an dieser Pandemie verstorbenen Menschen genommen wurde Sobald sich die Todesfallzahlen nach einer Phase der Übersterblichkeit durch eine unterdurchschnittliche Sterblichkeit oder durch eine Untersterblichkeit ausgeglichen haben erhält man diesen Wert ich gehe von rund 6 Monaten aus Da es aber nur ein Durchschnittswert ist sagt er nicht über die Lebenszeit aus die einer an Corona verstorbenen Person wirklich weggenommen wurde Bei den einen mögen es ein paar Tage oder Wochen sein bei anderen aber mehrere Jahre oder gar Jahrzehnte'" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "text_proc = 
tp.remove_mentions(text_proc)\n", + "\n", + "text_proc" + ] + }, + { + "cell_type": "markdown", + "id": "6783795b", + "metadata": {}, + "source": [ + "### Lemmatization" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "2421eeb4", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'der Übersterblichkeit werden aufgrund der Todesfallzahle der 7TageDurchschnitts und nicht ein Jahresdurchschnitt berechnen so sein übrigens auch saisonal Unterschied berücksichtigen und nein Zürcher es geben kein statistisch relevant Untersterblichkeit in Sommer sondern nur einer der sich an unterer Band der durchschnittlich Sterblichkeit der letzter 5 Jahr bewegen haben der Grund dafür haben ich erwähnen Mader was sie schreiben sein Unsinn der durchschnittlich Sterblichkeit werden anhand der tatsächlich Todesfall über der letzter 5 Jahr festhalten da sein aller massgebend Parameter automatisch enthalt Logisch dass dieser so berechnen Durchschnitt sich jährlich aufgrund der Lebenserwartung verändern Keller ich haben nie etwas anderer behaupten man können sogar soweit gehen nachträglich der durchschnittlich Lebenszeit zu berechnen der ein an dieser Pandemie verstorben Mensch nehmen werden Sobald sich der Todesfallzahle nach ein Phase der Übersterblichkeit durch ein unterdurchschnittlich Sterblichkeit oder durch ein Untersterblichkeit ausgleichen haben erhalten man dieser Wert ich gehen von rund 6 Monat aus da es aber nur ein Durchschnittswert sein sagen er nicht über der Lebenszeit aus der ein an Corona verstorben Person wirklich wegnommen werden bei der einer mögen es ein paar Tag oder Woche sein bei anderer aber mehrere Jahr oder gar Jahrzehnt'" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "text_proc = tp.lemmatize_text(text_proc)\n", + "\n", + "text_proc" + ] + }, + { + "cell_type": "markdown", + "id": "7ebfb311", + "metadata": {}, + "source": [ + "### Lowercase" + ] + }, + { + 
"cell_type": "code", + "execution_count": 17, + "id": "d7414b09", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'der übersterblichkeit werden aufgrund der todesfallzahle der 7tagedurchschnitts und nicht ein jahresdurchschnitt berechnen so sein übrigens auch saisonal unterschied berücksichtigen und nein zürcher es geben kein statistisch relevant untersterblichkeit in sommer sondern nur einer der sich an unterer band der durchschnittlich sterblichkeit der letzter 5 jahr bewegen haben der grund dafür haben ich erwähnen mader was sie schreiben sein unsinn der durchschnittlich sterblichkeit werden anhand der tatsächlich todesfall über der letzter 5 jahr festhalten da sein aller massgebend parameter automatisch enthalt logisch dass dieser so berechnen durchschnitt sich jährlich aufgrund der lebenserwartung verändern keller ich haben nie etwas anderer behaupten man können sogar soweit gehen nachträglich der durchschnittlich lebenszeit zu berechnen der ein an dieser pandemie verstorben mensch nehmen werden sobald sich der todesfallzahle nach ein phase der übersterblichkeit durch ein unterdurchschnittlich sterblichkeit oder durch ein untersterblichkeit ausgleichen haben erhalten man dieser wert ich gehen von rund 6 monat aus da es aber nur ein durchschnittswert sein sagen er nicht über der lebenszeit aus der ein an corona verstorben person wirklich wegnommen werden bei der einer mögen es ein paar tag oder woche sein bei anderer aber mehrere jahr oder gar jahrzehnt'" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "text_proc = tp.fold_case(text_proc)\n", + "\n", + "text_proc" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d9c3cfee", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "pp_env", + "language": "python", + "name": "pp_env" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + 
"version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.9" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/notebooks/visualize-embeddings.ipynb b/notebooks/visualize-embeddings.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..9592e974af63f2f07a9ef9cbdd83372516fed408 --- /dev/null +++ b/notebooks/visualize-embeddings.ipynb @@ -0,0 +1,153 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "f192e924", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/franziskaoschmann/Documents/public_policy/pp_env/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n" + ] + } + ], + "source": [ + "from transformers import AutoTokenizer, TFAutoModelForSequenceClassification\n", + "import tensorflow as tf\n", + "from sklearn.manifold import TSNE\n", + "import pandas as pd\n", + "\n", + "import numpy as np\n", + "import numpy\n", + "\n", + "import matplotlib.pyplot as plt" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "017a5f0c", + "metadata": {}, + "outputs": [], + "source": [ + "# Load subset of data\n", + "df = pd.read_csv('../data/tamedia_for_classifier_v4_preproc_test.csv')\n", + "df_hsprob = df[df.hsprob > 0.7]\n", + "df_sub = df_hsprob.sample(100)\n", + "text = list(df_sub.text)\n", + "label = df_sub.rejected" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "24d7835a", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "All PyTorch model weights were used when initializing TFBertForSequenceClassification.\n", + "\n", + "All the weights of 
TFBertForSequenceClassification were initialized from the PyTorch model.\n", + "If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForSequenceClassification for predictions without further training.\n", + "Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.\n" + ] + } + ], + "source": [ + "# Load tokenizer and model\n", + "tokenizer = AutoTokenizer.from_pretrained('deepset/bert-base-german-cased-hatespeech-GermEval18Coarse')\n", + "model = TFAutoModelForSequenceClassification.from_pretrained('deepset/bert-base-german-cased-hatespeech-GermEval18Coarse')\n", + "inputs = tokenizer(list(text), return_tensors=\"tf\", padding=True, truncation=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "b5e09595", + "metadata": {}, + "outputs": [], + "source": [ + "# Get hidden states of model\n", + "model_out = model(**inputs, output_hidden_states=True,return_dict=True)\n", + "hidden_states = model_out.hidden_states[1:]" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "14888084", + "metadata": {}, + "outputs": [], + "source": [ + "# Reduce embedding\n", + "layer_embed_reduced = tf.reduce_sum(hidden_states[10], axis = 1).numpy()\n", + "dim_reducer = TSNE(n_components=2)\n", + "two_dim_embed = dim_reducer.fit_transform(layer_embed_reduced)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "fc730de1", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "<matplotlib.legend.Legend at 0x35f8d78e0>" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "", + "text/plain": [ + "<Figure size 640x480 with 1 Axes>" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Visualize embedding\n", + "ix_rej = np.where(df_sub.rejected == 1)\n", + "ix_acc = 
np.where(df_sub.rejected == 0)\n", + "plt.scatter(two_dim_embed[ix_rej,0], two_dim_embed[ix_rej,1], label = 'rejected')\n", + "plt.scatter(two_dim_embed[ix_acc,0], two_dim_embed[ix_acc,1], label = 'accepted')\n", + "plt.legend()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "pp_env", + "language": "python", + "name": "pp_env" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.9" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/saved_models/BERT_logs/eval_pretrained_germ_bert.csv b/saved_models/BERT_logs/eval_pretrained_germ_bert.csv new file mode 100644 index 0000000000000000000000000000000000000000..9c87b2ec0dcb875d9cc2017111cac6218409ee68 --- /dev/null +++ b/saved_models/BERT_logs/eval_pretrained_germ_bert.csv @@ -0,0 +1,13 @@ +,logs +path_repo,. +path_model,bert-base-german-cased +input_data,data/tamedia_for_classifier_v4_preproc_test.csv +text_preprocessing,TRUE +newspaper,tagesanzeiger +lang,de +topic, +remove_duplicates,TRUE +min_num_words,3 +val_score, +hsprob,"[0.0,1.0]" +pretrained_model,bert-base-german-cased diff --git a/saved_models/BERT_logs/eval_pretrained_hs_bert.csv b/saved_models/BERT_logs/eval_pretrained_hs_bert.csv new file mode 100644 index 0000000000000000000000000000000000000000..3703b92caf0ff6c7eeff0de3d02ec3466d7f5a9e --- /dev/null +++ b/saved_models/BERT_logs/eval_pretrained_hs_bert.csv @@ -0,0 +1,13 @@ +,logs +path_repo,. 
+path_model,deepset/bert-base-german-cased-hatespeech-GermEval18Coarse +input_data,data/tamedia_for_classifier_v4_preproc_test.csv +text_preprocessing,TRUE +newspaper,tagesanzeiger +lang,de +topic, +remove_duplicates,TRUE +min_num_words,3 +val_score, +hsprob,"[0.0, 1.0]" +pretrained_model,deepset/bert-base-german-cased-hatespeech-GermEval18Coarse diff --git a/src/BERT_utils.py b/src/BERT_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..990948da813360ca1c198c87ce836a4982a1051b --- /dev/null +++ b/src/BERT_utils.py @@ -0,0 +1,37 @@ +import numpy as np +import tensorflow as tf +from tqdm import tqdm + +from typing import List + +def split_batches(text: np.ndarray, batch_size: int=100) -> List: + """ + Splits list with comments into batches + :param text: Array containing comments + :param batch_size: Number of comments per batch + """ + text_list = list(text) + text_batches=[text_list[idx:idx+batch_size] for idx in range(0, len(text_list), batch_size)] + return text_batches + +def predict_batches(text: np.ndarray, model, tokenizer) -> np.ndarray: + """ + Makes prediction for all batches and combines all predictions + :param text: Array containing comments + :param model: + :param tokenizer: + """ + text_batches = split_batches(text) + y_pred_all = [] + y_prob_all = [] + for batch in tqdm(text_batches): + inputs = tokenizer(batch, return_tensors="tf", padding=True, truncation=True) + logits = model(**inputs).logits + y_pred_batch = tf.argmax(logits,axis=1) + y_prob_batch = tf.math.softmax(logits, axis=-1)[:,1] + y_pred_all.append(y_pred_batch) + y_prob_all.append(y_prob_batch) + y_pred_all = np.concatenate(y_pred_all) + y_prob_all = np.concatenate(y_prob_all) + + return y_pred_all, y_prob_all \ No newline at end of file diff --git a/src/MNB_utils.py b/src/MNB_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..c6db6889735594c494a1f3680a5e72d60eff09a9 --- /dev/null +++ b/src/MNB_utils.py @@ -0,0 +1,67 @@ +from 
sklearn.naive_bayes import MultinomialNB +from sklearn.pipeline import Pipeline +from sklearn.feature_extraction.text import TfidfVectorizer + +from nltk.corpus import stopwords + +import datetime +from joblib import dump, load +import os +from pathlib import Path + +from typing import Union + +from src.preprocessing_text import TextProcessor + + +def create_pipeline() -> Pipeline: + """ + Creates classification pipeline + """ + + # define preprocessor + tp = TextProcessor() + + # define vectorizer + stop_words_ge = stopwords.words("german") + vectorizer = TfidfVectorizer( + stop_words=stop_words_ge, ngram_range=(1, 4), max_features=3000 + ) + + # define model + mnb = MultinomialNB(alpha=0.1) + + # set pipeline + pipe = Pipeline([("processor", tp), ("vectorizer", vectorizer), ("mnb", mnb)]) + + return pipe + + +def create_path() -> Union[str, os.PathLike]: + """ + Creates path to store trained model + """ + if not os.path.exists("saved_models/MNB/"): + os.makedirs("saved_models/MNB/") + + timestemp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") + + return Path("saved_models/MNB/" + timestemp + ".joblib") + + +def save_model(pipe: Pipeline, path): + """ + Saves trained model + :param pipe: Trained pipeline + """ + dump(pipe, path) + + +def load_model(path: Union[str, os.PathLike]) -> Pipeline: + """ + Loads trained model + :param path: Path to pipeline + """ + pipe = load(path) + + return pipe diff --git a/src/eval_utils.py b/src/eval_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..6ac84a1eecb3ab9dbee79581239dce9c8b112569 --- /dev/null +++ b/src/eval_utils.py @@ -0,0 +1,15 @@ +def gen_scores_dict(precision: float, recall: float, f1: float, accuracy: float): + """ + Generates dictionary containing most important scores + :param precision: Precision score + :param recall: Recall score + :param f1: F1 score + :param accuracy: Accuracy score + """ + results = dict() + results["precision"] = precision + results["recall"] = recall + 
results["f1"] = f1 + results["accuracy"] = accuracy + + return results diff --git a/src/prepare_bert_tf.py b/src/prepare_bert_tf.py new file mode 100644 index 0000000000000000000000000000000000000000..e5e315f4de03b22afb16694fb4f3d797f4a5e0de --- /dev/null +++ b/src/prepare_bert_tf.py @@ -0,0 +1,58 @@ +import pandas as pd +from datasets import Dataset, DatasetDict +from sklearn.model_selection import train_test_split +import evaluate +import numpy as np +from transformers import create_optimizer + + +def df2dict(df: pd.DataFrame, test_size: float = 0.2, split_data: bool = True): + """ + Converts Dataframe into Huggingface Dataset + :param df: input dataframe + :param test_size: size of test set + :param split_data: whether data should be split or not + """ + #df.sample(10000, replace=True) + + if split_data: + train, test = train_test_split(df, test_size=test_size) + + ds_train = Dataset.from_pandas(train) + ds_test = Dataset.from_pandas(test) + + ds = DatasetDict() + ds["train"] = ds_train + ds["test"] = ds_test + + else: + ds = Dataset.from_pandas(df) + + #ds = DatasetDict() + #ds["test"] = ds_all + + return ds + + +def compute_metrics(eval_pred): + """ + Computes metrics during training + """ + accuracy = evaluate.load("accuracy") + predictions, labels = eval_pred + predictions = np.argmax(predictions, axis=1) + + return accuracy.compute(predictions=predictions, references=labels) + + +def prepare_training(dataset, batch_size: int = 16, num_epochs: int = 5): + """ + Prepares training and sets params + """ + batches_per_epoch = len(dataset["train"]) // batch_size + total_train_steps = int(batches_per_epoch * num_epochs) + optimizer, schedule = create_optimizer( + init_lr=2e-5, num_warmup_steps=0, num_train_steps=total_train_steps + ) + + return optimizer, schedule diff --git a/src/preprocessing.py b/src/preprocessing.py index 59f7476f855218a69ee9dfd70b8dce1be8c478d5..421eb48713cb3b44c7ba55687907ab4edbe419b5 100644 --- a/src/preprocessing.py +++ 
b/src/preprocessing.py @@ -7,34 +7,28 @@ import pandas as pd from pathlib import Path from typing import Union -import time class DataProcessor(object): - def __init__(self, path_data: Union[str, os.PathLike]): - """ :param path_data: Path to input dataframe. """ self.path_data = path_data - def get_lang_detector(self, nlp, name): """ Gets language detector. """ return LanguageDetector(seed=42) - - def detect_language(self, text: str, nlp_model): + def detect_language(self, text: str, nlp_model): """Detect language per comment. :param text: Text of comment. """ doc = nlp_model(text) language = doc._.language - return language['language'] - + return language["language"] def init_nlp_model(self): """ @@ -42,12 +36,10 @@ class DataProcessor(object): """ self.nlp_model = spacy.load("en_core_web_sm") Language.factory("language_detector", func=self.get_lang_detector) - self.nlp_model.add_pipe('language_detector', last=True) - + self.nlp_model.add_pipe("language_detector", last=True) def add_language(self): - """Add language column to dataframe and saves new file. 
class DataProcessor:
    """
    Adds a detected-language column to a CSV file of comments.
    """

    def __init__(self, path_data: Union[str, os.PathLike]):
        """
        :param path_data: Path to input dataframe.
        """
        self.path_data = path_data

    def get_lang_detector(self, nlp, name):
        """
        Factory callback returning the language detector component.
        :param nlp: Not used by this factory.
        :param name: Not used by this factory.
        """
        # Fixed seed so that detection results are reproducible.
        detector = LanguageDetector(seed=42)
        return detector

    def detect_language(self, text: str, nlp_model):
        """Detects the language of a single comment.
        :param text: Text of comment.
        :param nlp_model: Callable pipeline whose result exposes
            ``._.language`` as a mapping with a "language" entry.
        :return: Value stored under "language" in the detection result.
        """
        parsed = nlp_model(text)
        return parsed._.language["language"]

    def init_nlp_model(self):
        """
        Loads the spaCy pipeline and appends the language detector to it.
        """
        self.nlp_model = spacy.load("en_core_web_sm")
        # Register the factory under the name that add_pipe looks up below.
        Language.factory("language_detector", func=self.get_lang_detector)
        self.nlp_model.add_pipe("language_detector", last=True)

    def add_language(self):
        """
        Reads the input CSV, adds a "language" column and writes the result
        next to the input file as ``<input name>_preproc.csv``.
        """
        source = self.path_data

        # Load data
        frame = pd.read_csv(source).copy()

        # Detect language of every entry of the "text" column
        self.init_nlp_model()
        frame["language"] = frame.text.apply(
            self.detect_language, nlp_model=self.nlp_model
        )

        # Save new file in the directory of the input file
        stem, _extension = os.path.splitext(os.path.basename(source))
        target = Path(source).parent.joinpath(f"{stem}_preproc.csv")
        frame.to_csv(target)
False, + min_num_words: int = None, + ) -> pd.DataFrame: + """ + Loads dataframe and extracts text depending on newspaper and langugae + """ + if load_subset: + newspaper_col = self.load_col(col_name="originTenantId") + language_col = self.load_col(col_name="language") + text_col = self.load_col(col_name="text") + rejected_col = self.load_col(col_name="rejected") + + df = pd.DataFrame( + { + "text": text_col, + "originTenantId": newspaper_col, + "language": language_col, + "rejected": rejected_col, + } + ) + + else: + df = pd.read_csv(self.path) + # df = df.sample(100000) + + df = df.rename(columns={"rejected": "label"}) + df_filter = self.filter_df( + df, + min_num_words, + remove_duplicates, + newspaper, + lang, + topic, + hsprob + ) + + return df_filter + + def filter_df( + self, + df: pd.DataFrame, + min_num_words: int, + remove_duplicates: bool, + newspaper: str, + lang: str, + topic: str, + hsprob: list, + ) -> pd.DataFrame: + """ + Filters data depending on given arguments. + :param df: Input dataframe + :param min_words: minimal amount of words per topic + :param remove_duplicates: Boolean flag whether or not to remove duplicates. + :param newspaper: Name of newspaper + :param lang: Language + :param topic: Topic + :param hsprob: List with min max values for hate speech probability + """ + + if min_num_words: + df = self.filter_min_words(df) + + if newspaper: + df = self.filter_newspaper(df, newspaper=newspaper) + + if lang: + df = self.filter_language(df, lang=lang) + + if topic: + df = self.filter_topic(df, topic=topic) + + if hsprob: + df = self.filter_hsprob(df, thresh=hsprob) + + if remove_duplicates: + df = self.remove_duplicate_comments(df) + + #df = df[["text", "originTenantId", "label", "topic"]] + df = df[["text", "originTenantId", "label", "topic", "hsprob"]] + + return df + + def filter_newspaper(self, df: pd.DataFrame, newspaper: str): + """ + Filters out comments from specific newspaper. 
def filter_min_words(self, df: pd.DataFrame, min_words: int = 3):
    """Keeps only comments with strictly more than ``min_words`` words.

    A word is every match of the regular expression ``\\w+`` in the "text"
    column.
    :param df: Input dataframe
    :param min_words: Word count a comment has to exceed to be kept
    :return: Rows of ``df`` whose text contains more than ``min_words`` words
    """
    word_counts = np.array(
        [len(re.findall(r"\w+", comment)) for comment in df.text]
    )
    keep_mask = word_counts > min_words

    return df[keep_mask]
class TextProcessor(BaseEstimator, TransformerMixin):
    """
    Scikit-learn style transformer that cleans raw comment text.

    preprocess applies, in this order: emoji transcription, whitespace
    normalisation, punctuation removal (keeping "@"), removal of @-mentions
    and, if ``lowercase`` is set, case folding.
    """

    def __init__(self, lowercase=True):  # params setting single steps to True or False
        # Every ASCII punctuation character except "@": the "@" has to survive
        # remove_punctuation so that remove_mentions can still find mentions.
        self.punctuation = "".join(ch for ch in string.punctuation if ch != "@")
        self.lowercase = lowercase

    def fit(self, X, y=None):
        """
        Fits preprocessing to data (nothing is learned; returns self)
        """
        return self

    def transform(self, X):
        """
        Transforms data after fitting
        :param X: Texts to clean; a pandas Series (or any object with an
            ``apply`` method) is used as is, other iterables are wrapped
            in a Series first
        :return: Result of applying preprocess to every text
        """
        if not hasattr(X, "apply"):
            # Plain lists/tuples have no .apply; wrap them so the transformer
            # also works on plain Python sequences.
            X = pd.Series(list(X), dtype=object)

        return X.apply(self.preprocess)

    def preprocess(self, text) -> str:
        """
        Applies preprocessing to text
        :param text: Input text
        :return: Cleaned text
        """
        text_proc = self.transcripe_emojis(text)
        # Every step continues from the previous step's result; passing the raw
        # `text` here would throw away the emoji transcription.
        text_proc = self.remove_spaces(text_proc)
        text_proc = self.remove_punctuation(text_proc)
        text_proc = self.remove_mentions(text_proc)
        if self.lowercase:
            text_proc = self.fold_case(text_proc)

        return text_proc

    def remove_spaces(self, text: str) -> str:
        """
        Removes extra white spaces and linebreaks
        :param text: Input text
        """
        return " ".join(text.split())

    def remove_punctuation(self, text: str) -> str:
        """
        Removes punctuation from text except @
        :param text: Input text
        """
        return text.translate(str.maketrans("", "", self.punctuation))

    def fold_case(self, text: str) -> str:
        """
        Transforms text to lowercase
        :param text: Input text
        """
        return text.casefold()

    def remove_mentions(self, text: str) -> str:
        """
        Removes @-mentions from text
        :param text: Input text
        """
        # "@" followed by 1 to 15 ASCII letters or digits
        return re.sub(r"@([a-zA-Z0-9]{1,15})", "", text)

    def lemmatize_text(self, text: str) -> str:
        """
        Lemmatizes text
        :param text: Input text
        :raises AttributeError: if no ``nlp`` pipeline has been assigned to
            this instance
        """
        # __init__ does not create `nlp`; it has to be assigned by the caller.
        nlp = getattr(self, "nlp", None)
        if nlp is None:
            raise AttributeError(
                "TextProcessor.nlp is not set; assign a loaded NLP pipeline "
                "to the `nlp` attribute before calling lemmatize_text"
            )

        doc = nlp(text)
        return " ".join(word.lemma_ for word in doc)

    def transcripe_emojis(self, text: str) -> str:
        """
        Transcripes emojis into words
        :param text: Input text
        :return: Text with emojis replaced by their German names, with
            underscores turned into spaces
        """
        return emoji.demojize(text, language="de", delimiters=("", "")).replace(
            "_", " "
        )
def load_logs(
    train_logs: Union[str, os.PathLike]
) -> Tuple[
    Union[str, os.PathLike],
    Union[str, os.PathLike],
    str,
    bool,
    str,
    str,
    str,
    list,
    bool,
    int,
]:
    """
    Loads training logs
    :param train_logs: Path to csv-file containing logs
    :return: Tuple (path_repo, path_model, input_data, text_preprocessing,
        newspaper, lang, topic, hsprob, remove_duplicates, min_num_words);
        pretrained_model is appended as an eleventh element when its stored
        value is truthy. In input_data every occurrence of "train" is
        replaced by "test". topic and hsprob are None when not stored.
    :raises ValueError: if the stored hsprob entry is not a plain Python
        literal (SyntaxError if it cannot be parsed at all)
    """
    # Local import: only needed to parse the stored hsprob literal.
    import ast

    df = pd.read_csv(train_logs, index_col="Unnamed: 0")

    def _entry(key):
        # First (only) value of the row labelled `key`.
        return df.loc[key].values[0]

    path_repo = _entry("path_repo")
    path_model = _entry("path_model")
    input_data = _entry("input_data").replace("train", "test")
    # NOTE(review): the remaining values are returned exactly as pandas reads
    # them from the csv-file (presumably strings such as "True") - confirm
    # that callers convert them where needed.
    text_preprocessing = _entry("text_preprocessing")
    newspaper = _entry("newspaper")
    lang = _entry("lang")
    topic = _entry("topic")
    remove_duplicates = _entry("remove_duplicates")
    min_num_words = _entry("min_num_words")
    pretrained_model = _entry("pretrained_model")

    # The log file is external input: literal_eval only accepts Python
    # literals and, unlike eval, cannot execute code stored in the file.
    hsprob_raw = _entry("hsprob")
    if isinstance(hsprob_raw, str):
        hsprob = ast.literal_eval(hsprob_raw)
    else:
        # Empty cell (read back as NaN): no hsprob filter was stored.
        hsprob = None

    # check whether topic is str or NaN (NaN is the only value unequal to itself)
    if topic != topic:
        topic = None

    logs = (
        path_repo,
        path_model,
        input_data,
        text_preprocessing,
        newspaper,
        lang,
        topic,
        hsprob,
        remove_duplicates,
        min_num_words,
    )

    # NOTE(review): an empty pretrained_model cell is read back as NaN, which
    # is truthy, so the eleven-element form is returned in that case too.
    # Left unchanged because callers may rely on it - confirm.
    if pretrained_model:
        return logs + (pretrained_model,)

    return logs