From dc1a52890e3492c9de29eb1c63c642aa66f4bfb1 Mon Sep 17 00:00:00 2001
From: Franziska Oschmann
 <franziskaoschmann@staff-net-oct-dock-1-a-dhcp-024.intern.ethz.ch>
Date: Thu, 6 Jul 2023 15:13:07 +0200
Subject: [PATCH] Split Loading into reading csv and filtering

---
 src/preprocessing_text.py | 106 +++++++++++++++++++++++++++++---------
 1 file changed, 83 insertions(+), 23 deletions(-)

diff --git a/src/preprocessing_text.py b/src/preprocessing_text.py
index b0164ba..18d40d6 100644
--- a/src/preprocessing_text.py
+++ b/src/preprocessing_text.py
@@ -30,7 +30,12 @@ class TextLoader(object):
         return col
 
     def load_text_csv(
-        self, newspaper: str = None, lang: str = None, load_subset: bool = False, remove_duplicates: bool = False,
+        self,
+        newspaper: str = None,
+        lang: str = None,
+        load_subset: bool = False,
+        remove_duplicates: bool = False,
+        min_num_words: int = None,
     ) -> pd.DataFrame:
         """
         Loads dataframe and extracts text depending on newspaper and langugae
@@ -52,23 +57,70 @@ class TextLoader(object):
 
         else:
             df = pd.read_csv(self.path)
-            #df = df.sample(100000)
+            # df = df.sample(100000)
+
+        df = df.rename(columns={"rejected": "label"})
+        df_filter = self.filter_df(
+            df, min_num_words, remove_duplicates, newspaper, lang
+        )
+
+        return df_filter
+
+    def filter_df(
+        self,
+        df: pd.DataFrame,
+        min_num_words: int,
+        remove_duplicates: bool,
+        newspaper: str,
+        lang: str,
+    ) -> pd.DataFrame:
+        """
+        Filters data depending on given arguments.
+        :param df: Input dataframe
+        :param min_num_words: minimal number of words per comment
+        :param remove_duplicates: Boolean flag whether or not to remove duplicates.
+        :param newspaper: Name of newspaper
+        :param lang: Language
+        """
+
+        if min_num_words:
+            df = self.filter_min_words(df, min_words=min_num_words)
+
+        if newspaper:
+            df = self.filter_newspaper(df, newspaper=newspaper)
+
+        if lang:
+            df = self.filter_language(df, lang=lang)
 
         if remove_duplicates:
             df = self.remove_duplicate_comments(df)
 
-        if newspaper is None and lang is not None:
-            df_filter = df.loc[(df.language == lang)]
-        elif newspaper is not None and lang is None:
-            df_filter = df.loc[(df.originTenantId == newspaper)]
-        else:
-            df_filter = df.loc[(df.originTenantId == newspaper) & (df.language == lang)]
+        df = df[["text", "originTenantId", "label", "topic"]]
 
-        df_filter = df_filter[["text", "originTenantId", "rejected", "topic"]]
-        df_filter = df_filter.rename(columns={"rejected": "label"})
+        return df
 
-        return df_filter
-    
+    def filter_newspaper(self, df: pd.DataFrame, newspaper: str):
+        """
+        Keeps only comments from the specified newspaper.
+        :param df: Input dataframe
+        :param newspaper: Name of newspaper
+        """
+        return df.loc[(df.originTenantId == newspaper)]
+
+    def filter_language(self, df: pd.DataFrame, lang: str):
+        """
+        Keeps only comments in the specified language.
+        :param df: Input dataframe
+        :param lang: Language
+        """
+        return df.loc[(df.language == lang)]
+
+    def filter_min_words(self, df: pd.DataFrame, min_words: int = 3):
+        """Filters out comments with fewer than min_words words.
+        :param df: Input dataframe
+        :param min_words: minimal number of words per comment
+        """
+        return df[np.array([len((re.findall(r"\w+", t))) for t in df.text]) > min_words]
 
     def get_comments_per_topic(self, num_topic: int = 10) -> dict:
         """
@@ -78,33 +130,40 @@ class TextLoader(object):
         df = pd.read_csv(self.path)
         df = df.rename(columns={"rejected": "label"})
 
-        topics = Counter(df['topic']).most_common(num_topic)
+        topics = Counter(df["topic"]).most_common(num_topic)
         comm_per_topic = dict()
         for t in topics:
             df_topic = df[df.topic == t[0]]
             comm_per_topic[t[0]] = df_topic
 
         return comm_per_topic
-    
 
     def find_duplicate_comments(self, df: pd.DataFrame) -> np.ndarray:
-        """"
+        """
         Find duplicate comments in dataframe
         :param df: Input dataframe
         """
         c_comm = Counter(df.text.values)
-        duplicate_comments = np.array(list(c_comm.keys()))[np.where(np.array(list(c_comm.values())) > 1)]
-
-        indices_repetitions = np.concatenate([np.where(df.text == d)[0][np.argsort(df.createdAt[np.where(df.text == d)[0]].values)[:-1]] for d in tqdm(duplicate_comments)])
+        duplicate_comments = np.array(list(c_comm.keys()))[
+            np.where(np.array(list(c_comm.values())) > 1)
+        ]
+
+        indices_repetitions = np.concatenate(
+            [
+                np.where(df.text == d)[0][
+                    np.argsort(df.createdAt.iloc[np.where(df.text == d)[0]].values)[:-1]
+                ]
+                for d in tqdm(duplicate_comments)
+            ]
+        )
 
         return indices_repetitions
 
-
     def remove_duplicate_comments(self, df: pd.DataFrame) -> pd.DataFrame:
         """Removes duplicates from dataframe
         :param df: Input dataframe
         """
-        print('Find and remove duplicates')
+        print("Find and remove duplicates")
         indices = self.find_duplicate_comments(df)
 
         return df.drop(indices)
@@ -157,10 +216,11 @@ class TextProcessor(object):
         """
         doc = self.nlp(text)
         return " ".join([word.lemma_ for word in doc])
-    
+
     def transcripe_emojis(self, text: str) -> str:
         """
         Transcripes emojis into words
         """
-        return emoji.demojize(text, language='de', delimiters=("", "")).replace('_', ' ')
-
+        return emoji.demojize(text, language="de", delimiters=("", "")).replace(
+            "_", " "
+        )
-- 
GitLab