Skip to content
Snippets Groups Projects
Commit dc1a5289 authored by Franziska Oschmann's avatar Franziska Oschmann
Browse files

Split Loading into reading csv and filtering

parent 3f4a8f21
No related branches found
No related tags found
1 merge request!2Dev train models
...@@ -30,7 +30,12 @@ class TextLoader(object): ...@@ -30,7 +30,12 @@ class TextLoader(object):
return col return col
def load_text_csv( def load_text_csv(
self, newspaper: str = None, lang: str = None, load_subset: bool = False, remove_duplicates: bool = False, self,
newspaper: str = None,
lang: str = None,
load_subset: bool = False,
remove_duplicates: bool = False,
min_num_words: int = None,
) -> pd.DataFrame: ) -> pd.DataFrame:
""" """
Loads dataframe and extracts text depending on newspaper and langugae Loads dataframe and extracts text depending on newspaper and langugae
...@@ -52,23 +57,70 @@ class TextLoader(object): ...@@ -52,23 +57,70 @@ class TextLoader(object):
else: else:
df = pd.read_csv(self.path) df = pd.read_csv(self.path)
#df = df.sample(100000) # df = df.sample(100000)
df = df.rename(columns={"rejected": "label"})
df_filter = self.filter_df(
df, min_num_words, remove_duplicates, newspaper, lang
)
return df_filter
def filter_df(
    self,
    df: pd.DataFrame,
    min_num_words: int,
    remove_duplicates: bool,
    newspaper: str,
    lang: str,
) -> pd.DataFrame:
    """
    Filters data depending on given arguments.

    :param df: Input dataframe
    :param min_num_words: minimal amount of words per comment; filter is
        skipped when None/0
    :param remove_duplicates: Boolean flag whether or not to remove duplicates.
    :param newspaper: Name of newspaper to keep (all newspapers if None)
    :param lang: Language to keep (all languages if None)
    :return: filtered dataframe reduced to the relevant columns
    """
    if min_num_words:
        # BUGFIX: pass the caller's threshold through -- previously the
        # helper was called without it, so its default was silently used.
        df = self.filter_min_words(df, min_num_words)
    if newspaper:
        df = self.filter_newspaper(df, newspaper=newspaper)
    if lang:
        df = self.filter_language(df, lang=lang)
    if remove_duplicates:
        df = self.remove_duplicate_comments(df)
    # Keep only the columns the downstream training code consumes.
    df = df[["text", "originTenantId", "label", "topic"]]
    return df
df_filter = df_filter.rename(columns={"rejected": "label"})
return df_filter def filter_newspaper(self, df: pd.DataFrame, newspaper: str):
"""
Filters out comments from specific newspaper.
:param df: Input dataframe
:param newspaper: Name of newspaper
"""
return df.loc[(df.originTenantId == newspaper)]
def filter_language(self, df: pd.DataFrame, lang: str):
    """
    Keep only the comments written in one specific language.

    :param df: Input dataframe
    :param lang: Language
    """
    is_lang = df.language == lang
    return df.loc[is_lang]
def filter_min_words(self, df: pd.DataFrame, min_words: int = 3):
    """Keep only comments containing strictly more than ``min_words`` words.

    :param df: Input dataframe
    :param min_words: word-count threshold (rows with a count <= min_words
        are dropped)
    """
    word_counts = np.array([len(re.findall(r"\w+", text)) for text in df.text])
    return df[word_counts > min_words]
def get_comments_per_topic(self, num_topic: int = 10) -> dict:
    """
    Load the csv and group comments by the most frequent topics.

    :param num_topic: number of most common topics to return
    :return: dict mapping topic name to the dataframe of its comments,
        ordered from most to least frequent topic
    """
    df = pd.read_csv(self.path)
    df = df.rename(columns={"rejected": "label"})
    comm_per_topic = {}
    for topic, _count in Counter(df["topic"]).most_common(num_topic):
        comm_per_topic[topic] = df[df.topic == topic]
    return comm_per_topic
def find_duplicate_comments(self, df: pd.DataFrame) -> np.ndarray:
    """
    Find positional indices of redundant duplicate comments.

    For every comment text that occurs more than once, returns the
    positional indices of all but the most recent occurrence (ordered by
    ``createdAt``), so that dropping them keeps the newest copy.

    :param df: Input dataframe with ``text`` and ``createdAt`` columns
    :return: array of positional indices of the redundant rows
    """
    c_comm = Counter(df.text.values)
    duplicate_comments = np.array(list(c_comm.keys()))[
        np.where(np.array(list(c_comm.values())) > 1)
    ]
    # BUGFIX: np.concatenate([]) raises ValueError -- return an empty index
    # array when there are no duplicates at all.
    if duplicate_comments.size == 0:
        return np.array([], dtype=int)
    indices_repetitions = []
    for d in tqdm(duplicate_comments):
        # Hoisted: previously np.where(df.text == d) was evaluated twice.
        positions = np.where(df.text == d)[0]
        order = np.argsort(df.createdAt.iloc[positions].values)
        indices_repetitions.append(positions[order[:-1]])
    # NOTE(review): these are positional indices, but remove_duplicate_comments
    # feeds them to df.drop, which drops by label -- only safe for a default
    # RangeIndex. Confirm callers never pass a re-indexed frame.
    return np.concatenate(indices_repetitions)
def remove_duplicate_comments(self, df: pd.DataFrame) -> pd.DataFrame:
    """Drop all duplicate comments found in the dataframe.

    :param df: Input dataframe
    """
    print("Find and remove duplicates")
    duplicate_indices = self.find_duplicate_comments(df)
    return df.drop(duplicate_indices)
...@@ -157,10 +216,11 @@ class TextProcessor(object): ...@@ -157,10 +216,11 @@ class TextProcessor(object):
""" """
doc = self.nlp(text) doc = self.nlp(text)
return " ".join([word.lemma_ for word in doc]) return " ".join([word.lemma_ for word in doc])
def transcripe_emojis(self, text: str) -> str:
    """
    Transcribes emojis into their German word descriptions.

    :param text: input text possibly containing emojis
    :return: text with each emoji replaced by its spelled-out name
    """
    demojized = emoji.demojize(text, language="de", delimiters=("", ""))
    return demojized.replace("_", " ")
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment