Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
P
ppg-moderation-classifier
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Package Registry
Container Registry
Model registry
Operate
Environments
Terraform modules
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
oschmanf
ppg-moderation-classifier
Commits
4c110768
Commit
4c110768
authored
1 year ago
by
Franziska Oschmann
Browse files
Options
Downloads
Patches
Plain Diff
moderation_classifier/train_MNB.py: Clean code
parent
e8e6d084
No related branches found
No related tags found
1 merge request
!2
Dev train models
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
moderation_classifier/train_MNB.py
+32
-117
32 additions, 117 deletions
moderation_classifier/train_MNB.py
with
32 additions
and
117 deletions
moderation_classifier/train_MNB.py
+
32
−
117
View file @
4c110768
from
sklearn.model_selection
import
train_test_split
from
sklearn.metrics
import
f1_score
,
precision_recall_fscore_support
from
sklearn.feature_extraction.text
import
TfidfVectorizer
from
sklearn.naive_bayes
import
MultinomialNB
from
sklearn.pipeline
import
Pipeline
import
click
import
datetime
from
joblib
import
dump
from
nltk.corpus
import
stopwords
import
pandas
as
pd
import
spacy
...
...
@@ -15,37 +16,14 @@ import os
from
src.preprocessing_text
import
TextLoader
,
TextProcessor
# ToDo
# clean code
# add conditional storage of results (check if a row (train-test pair) exists and store results in this row)
# train on different corpora and generate table with predictions
# notebook with example of preprocessing?
# Tests
def preprocess(text: str, nlp) -> str:
    """
    Runs a text through the TextProcessor cleaning steps.

    The steps are applied in this order: remove_spaces, remove_punctuation,
    remove_mentions, fold_case.

    :param text: Input text
    :param nlp: Loaded nlp model
    :return: Processed text
    """
    processor = TextProcessor(nlp)
    steps = (
        processor.remove_spaces,
        processor.remove_punctuation,
        processor.remove_mentions,
        # lemmatization (processor.lemmatize_text) is currently disabled
        processor.fold_case,
    )
    for step in steps:
        text = step(text)
    return text
def
create_pipeline
():
"""
Creates classification pipeline
"""
# define preprocessor
tp
=
TextProcessor
()
# define vectorizer
stop_words_ge
=
stopwords
.
words
(
"
german
"
)
vectorizer
=
TfidfVectorizer
(
...
...
@@ -56,43 +34,22 @@ def create_pipeline():
mnb
=
MultinomialNB
(
alpha
=
0.01
)
# set pipeline
pipe
=
Pipeline
([(
"
vectorizer
"
,
vectorizer
),
(
"
mnb
"
,
mnb
)])
pipe
=
Pipeline
([
(
"
processor
"
,
tp
),
(
"
vectorizer
"
,
vectorizer
),
(
"
mnb
"
,
mnb
)])
return
pipe
def save_results(
    train_spec: str, test_spec: str, prec: float, recall: float, f1: float
) -> None:
    """
    Saves results to csv file

    Appends one row to "MNB_results.csv" in the current working directory;
    the file is created if it does not exist yet.

    :param train_spec: Value stored in the "train: newspaper" column
    :param test_spec: Value stored in the "test: newspaper" column
    :param prec: Precision score
    :param recall: Recall score
    :param f1: F1 score
    """
    path_results = "MNB_results.csv"

    df_row = pd.DataFrame(
        {
            "train: newspaper": [train_spec],
            "test: newspaper": [test_spec],
            "precision": [prec],
            "recall": [recall],
            # the f1 column must hold the f1 score (it used to be filled with recall)
            "f1": [f1],
        }
    )

    if os.path.exists(path_results):
        # to_csv writes the index as an unnamed first column, which read_csv
        # exposes under the name "Unnamed: 0"
        df_prev = pd.read_csv(path_results, index_col="Unnamed: 0")
        # ignore_index keeps the row labels unique across appends
        df_res = pd.concat([df_prev, df_row], ignore_index=True)
    else:
        df_res = df_row

    df_res.to_csv(path_results)


def save_model(pipe: Pipeline):
    """
    Saves trained model

    The pipeline is dumped to "saved_models/MNB/<timestamp>.joblib", where the
    timestamp has the format YYYYmmdd-HHMMSS.

    :param pipe: Trained pipeline
    """
    model_dir = "saved_models/MNB/"
    os.makedirs(model_dir, exist_ok=True)

    timestamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
    dump(pipe, model_dir + timestamp + ".joblib")
@click.argument
(
"
input_data
"
)
...
...
@@ -102,70 +59,28 @@ def main(input_data: Union[str, os.PathLike]):
:param input_data: Path to input dataframe.
"""
#
l
oad data and extract only
german
text from tagesanzeiger
print
(
"
Load text
"
)
#
L
oad data and extract only text from tagesanzeiger
print
(
"
Load
and preprocess
text
"
)
tl
=
TextLoader
(
input_data
)
df_de
=
tl
.
load_text_csv
(
newspaper
=
"
tagesanzeiger
"
,
load_subset
=
False
,
remove_duplicates
=
False
,
min_num_words
=
3
,
)
#df_de = df_de.sample(50000)
df_de
=
tl
.
load_text_csv
(
lang
=
"
de
"
)
df_de
=
df_de
[
(
(
df_de
.
originTenantId
==
"
tagesanzeiger
"
)
|
(
df_de
.
originTenantId
==
"
bazonline
"
)
|
(
df_de
.
originTenantId
==
"
derbund
"
)
|
(
df_de
.
originTenantId
==
"
bernerzeitung
"
)
)
]
# apply preprocessing
print
(
"
Preprocess text
"
)
df_de_sub
=
df_de
# .sample(50000)
nlp
=
spacy
.
load
(
"
de_core_news_sm
"
)
text_proc
=
df_de_sub
.
text
.
apply
(
preprocess
,
nlp
=
nlp
)
df_de_sub
.
text
=
text_proc
# # add test: other lang/other newspaper in df
# add test: lowercase/spec char in text stc
newspapers
=
df_de_sub
.
originTenantId
.
unique
()
for
n_train
in
newspapers
:
text_n1
=
df_de_sub
[
df_de_sub
.
originTenantId
==
n_train
].
text
label_n1
=
df_de_sub
[
df_de_sub
.
originTenantId
==
n_train
].
label
X_train
,
X_test
,
y_train
,
y_test
=
train_test_split
(
text_n1
,
label_n1
,
stratify
=
label_n1
)
print
(
"
Train model
"
)
pipe
=
create_pipeline
()
pipe
.
fit
(
X_train
,
y_train
)
for
n_test
in
newspapers
:
text_n2
=
df_de_sub
[
df_de_sub
.
originTenantId
==
n_test
].
text
label_n2
=
df_de_sub
[
df_de_sub
.
originTenantId
==
n_test
].
label
# train pipeline
if
n_train
!=
n_test
:
X_test
=
text_n2
y_test
=
label_n2
y_pred
=
pipe
.
predict
(
X_test
)
precision
,
recall
,
*
_
=
precision_recall_fscore_support
(
y_test
,
y_pred
,
average
=
"
weighted
"
)
f1
=
f1_score
(
y_test
,
y_pred
)
# Prepare data for modeling
text
=
df_de
.
text
label
=
df_de
.
label
save_results
((
n
_train
)
,
(
n_test
),
precision
,
recall
,
f1
)
X_train
,
X_val
,
y
_train
,
y_val
=
train_test_split
(
text
,
label
,
stratify
=
label
)
print
(
"
Accuracy is:
"
)
print
(
pipe
.
score
(
X_test
,
y_test
))
print
(
"
"
)
print
(
"
Precision, Recall:
"
)
print
(
precision
,
recall
)
print
(
"
"
)
# Training
print
(
"
Train model
"
)
pipe
=
create_pipeline
()
pipe
.
fit
(
X_train
,
y_train
)
save_model
(
pipe
)
if
__name__
==
"
__main__
"
:
...
...
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment