oschmanf / ppg-moderation-classifier / Commits / dc1a5289

Commit dc1a5289
Authored 1 year ago by Franziska Oschmann

    Split Loading into reading csv and filtering

Parent: 3f4a8f21
No related branches or tags found.
Part of 1 merge request: !2 Dev train models
Changes: 1 changed file

src/preprocessing_text.py (+83 additions, −23 deletions)
@@ -30,7 +30,12 @@ class TextLoader(object):
         return col
 
     def load_text_csv(
-        self, newspaper: str = None, lang: str = None, load_subset: bool = False, remove_duplicates: bool = False,
+        self,
+        newspaper: str = None,
+        lang: str = None,
+        load_subset: bool = False,
+        remove_duplicates: bool = False,
+        min_num_words: int = None,
     ) -> pd.DataFrame:
         """
         Loads dataframe and extracts text depending on newspaper and langugae
@@ -52,23 +57,70 @@ class TextLoader(object):
         else:
             df = pd.read_csv(self.path)
-            #df = df.sample(100000)
-
-        if remove_duplicates:
-            df = self.remove_duplicate_comments(df)
-
-        if newspaper is None and lang is not None:
-            df = df[["text", "originTenantId", "label", "topic"]]
-            df_filter = df.loc[(df.language == lang)]
-        elif newspaper is not None and lang is None:
-            df_filter = df.loc[(df.originTenantId == newspaper)]
-        else:
-            df_filter = df.loc[(df.originTenantId == newspaper) & (df.language == lang)]
-
-        df_filter = df_filter[["text", "originTenantId", "rejected", "topic"]]
-        df_filter = df_filter.rename(columns={"rejected": "label"})
-
-        return df_filter
+            # df = df.sample(100000)
+
+        df = df.rename(columns={"rejected": "label"})
+        df_filter = self.filter_df(df, min_num_words, remove_duplicates, newspaper, lang)
+
+        return df_filter
+
+    def filter_df(
+        self,
+        df: pd.DataFrame,
+        min_num_words: int,
+        remove_duplicates: bool,
+        newspaper: str,
+        lang: str,
+    ) -> pd.DataFrame:
+        """
+        Filters data depending on given arguments.
+        :param df: Input dataframe
+        :param min_words: minimal amount of words per topic
+        :param remove_duplicates: Boolean flag whether or not to remove duplicates.
+        :param newspaper: Name of newspaper
+        :param lang: Language
+        """
+        if min_num_words:
+            df = self.filter_min_words(df)
+        if newspaper:
+            df = self.filter_newspaper(df, newspaper=newspaper)
+        if lang:
+            df = self.filter_language(df, lang=lang)
+        if remove_duplicates:
+            df = self.remove_duplicate_comments(df)
+
+        return df
+
+    def filter_newspaper(self, df: pd.DataFrame, newspaper: str):
+        """
+        Filters out comments from specific newspaper.
+        :param df: Input dataframe
+        :param newspaper: Name of newspaper
+        """
+        return df.loc[(df.originTenantId == newspaper)]
+
+    def filter_language(self, df: pd.DataFrame, lang: str):
+        """
+        Filters out comments with specific language
+        :param df: Input dataframe
+        :param lang: Language
+        """
+        return df.loc[(df.language == lang)]
+
+    def filter_min_words(self, df: pd.DataFrame, min_words: int = 3):
+        """
+        Filters out comments with less than min words
+        :param df: Input dataframe
+        :param min_words: minimal amount of words per topic
+        """
+        return df[np.array([len((re.findall(r"\w+", t))) for t in df.text]) > min_words]
 
     def get_comments_per_topic(self, num_topic: int = 10) -> dict:
         """
@@ -78,33 +130,40 @@ class TextLoader(object):
         df = pd.read_csv(self.path)
         df = df.rename(columns={"rejected": "label"})
-        topics = Counter(df['topic']).most_common(num_topic)
+        topics = Counter(df["topic"]).most_common(num_topic)
         comm_per_topic = dict()
         for t in topics:
             df_topic = df[df.topic == t[0]]
             comm_per_topic[t[0]] = df_topic
         return comm_per_topic
 
     def find_duplicate_comments(self, df: pd.DataFrame) -> np.ndarray:
-        """"
+        """
         Find duplicate comments in dataframe
         :param df: Input dataframe
         """
         c_comm = Counter(df.text.values)
-        duplicate_comments = np.array(list(c_comm.keys()))[np.where(np.array(list(c_comm.values())) > 1)]
-        indices_repetitions = np.concatenate([np.where(df.text == d)[0][np.argsort(df.createdAt[np.where(df.text == d)[0]].values)[:-1]] for d in tqdm(duplicate_comments)])
+        duplicate_comments = np.array(list(c_comm.keys()))[
+            np.where(np.array(list(c_comm.values())) > 1)
+        ]
+        indices_repetitions = np.concatenate(
+            [
+                np.where(df.text == d)[0][
+                    np.argsort(df.createdAt.iloc[np.where(df.text == d)[0]].values)[:-1]
+                ]
+                for d in tqdm(duplicate_comments)
+            ]
+        )
         return indices_repetitions
 
     def remove_duplicate_comments(self, df: pd.DataFrame) -> pd.DataFrame:
         """
         Removes duplicates from dataframe
         :param df: Input dataframe
         """
-        print('Find and remove duplicates')
+        print("Find and remove duplicates")
         indices = self.find_duplicate_comments(df)
         return df.drop(indices)
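For every text that occurs more than once, find_duplicate_comments collects the positions of all occurrences except the most recent one (by createdAt), and remove_duplicate_comments drops those rows. A minimal sketch of that index logic on a toy dataframe, assuming a default RangeIndex so the positional indices from np.where coincide with the labels DataFrame.drop expects:

    import numpy as np
    import pandas as pd

    # Toy frame with a default RangeIndex, so positions and drop labels match.
    df = pd.DataFrame(
        {
            "text": ["good point", "spam spam", "good point", "good point"],
            "createdAt": ["2023-01-01", "2023-01-03", "2023-01-02", "2023-01-04"],
        }
    )

    d = "good point"
    pos = np.where(df.text == d)[0]                    # positions of the repeated text: [0 2 3]
    order = np.argsort(df.createdAt.iloc[pos].values)  # sort those rows by creation time
    drop_idx = pos[order[:-1]]                         # every occurrence except the newest

    print(drop_idx)           # [0 2]
    print(df.drop(drop_idx))  # keeps "spam spam" and the 2023-01-04 "good point"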
@@ -157,10 +216,11 @@ class TextProcessor(object):
         """
         doc = self.nlp(text)
         return " ".join([word.lemma_ for word in doc])
 
     def transcripe_emojis(self, text: str) -> str:
         """
         Transcripes emojis into words
         """
-        return emoji.demojize(text, language='de', delimiters=("", "")).replace(
-            '_', ' ')
+        return emoji.demojize(text, language="de", delimiters=("", "")).replace(
+            "_", " "
+        )
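For reference, a quick check of what the emoji transcription above produces; this assumes the emoji package with German (CLDR) emoji names installed, and the exact wording can differ between package versions:

    import emoji

    # Transcribe emojis into German words, as transcripe_emojis does.
    text = "Toller Beitrag 👍"
    print(emoji.demojize(text, language="de", delimiters=("", "")).replace("_", " "))
    # e.g. "Toller Beitrag Daumen hoch"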