From dc1a52890e3492c9de29eb1c63c642aa66f4bfb1 Mon Sep 17 00:00:00 2001 From: Franziska Oschmann <franziskaoschmann@staff-net-oct-dock-1-a-dhcp-024.intern.ethz.ch> Date: Thu, 6 Jul 2023 15:13:07 +0200 Subject: [PATCH] Split Loading into reading csv and filtering --- src/preprocessing_text.py | 106 +++++++++++++++++++++++++++++--------- 1 file changed, 83 insertions(+), 23 deletions(-) diff --git a/src/preprocessing_text.py b/src/preprocessing_text.py index b0164ba..18d40d6 100644 --- a/src/preprocessing_text.py +++ b/src/preprocessing_text.py @@ -30,7 +30,12 @@ class TextLoader(object): return col def load_text_csv( - self, newspaper: str = None, lang: str = None, load_subset: bool = False, remove_duplicates: bool = False, + self, + newspaper: str = None, + lang: str = None, + load_subset: bool = False, + remove_duplicates: bool = False, + min_num_words: int = None, ) -> pd.DataFrame: """ Loads dataframe and extracts text depending on newspaper and langugae @@ -52,23 +57,70 @@ class TextLoader(object): else: df = pd.read_csv(self.path) - #df = df.sample(100000) + # df = df.sample(100000) + + df = df.rename(columns={"rejected": "label"}) + df_filter = self.filter_df( + df, min_num_words, remove_duplicates, newspaper, lang + ) + + return df_filter + + def filter_df( + self, + df: pd.DataFrame, + min_num_words: int, + remove_duplicates: bool, + newspaper: str, + lang: str, + ) -> pd.DataFrame: + """ + Filters data depending on given arguments. + :param df: Input dataframe + :param min_words: minimal amount of words per topic + :param remove_duplicates: Boolean flag whether or not to remove duplicates. + :param newspaper: Name of newspaper + :param lang: Language + """ + + if min_num_words: + df = self.filter_min_words(df) + + if newspaper: + df = self.filter_newspaper(df, newspaper=newspaper) + + if lang: + df = self.filter_language(df, lang=lang) if remove_duplicates: df = self.remove_duplicate_comments(df) - if newspaper is None and lang is not None: - df_filter = df.loc[(df.language == lang)] - elif newspaper is not None and lang is None: - df_filter = df.loc[(df.originTenantId == newspaper)] - else: - df_filter = df.loc[(df.originTenantId == newspaper) & (df.language == lang)] + df = df[["text", "originTenantId", "label", "topic"]] - df_filter = df_filter[["text", "originTenantId", "rejected", "topic"]] - df_filter = df_filter.rename(columns={"rejected": "label"}) + return df - return df_filter - + def filter_newspaper(self, df: pd.DataFrame, newspaper: str): + """ + Filters out comments from specific newspaper. + :param df: Input dataframe + :param newspaper: Name of newspaper + """ + return df.loc[(df.originTenantId == newspaper)] + + def filter_language(self, df: pd.DataFrame, lang: str): + """ + Filters out comments with specific language + :param df: Input dataframe + :param lang: Language + """ + return df.loc[(df.language == lang)] + + def filter_min_words(self, df: pd.DataFrame, min_words: int = 3): + """Filters out comments with less than min words + :param df: Input dataframe + :param min_words: minimal amount of words per topic + """ + return df[np.array([len((re.findall(r"\w+", t))) for t in df.text]) > min_words] def get_comments_per_topic(self, num_topic: int = 10) -> dict: """ @@ -78,33 +130,40 @@ class TextLoader(object): df = pd.read_csv(self.path) df = df.rename(columns={"rejected": "label"}) - topics = Counter(df['topic']).most_common(num_topic) + topics = Counter(df["topic"]).most_common(num_topic) comm_per_topic = dict() for t in topics: df_topic = df[df.topic == t[0]] comm_per_topic[t[0]] = df_topic return comm_per_topic - def find_duplicate_comments(self, df: pd.DataFrame) -> np.ndarray: - """" + """ " Find duplicate comments in dataframe :param df: Input dataframe """ c_comm = Counter(df.text.values) - duplicate_comments = np.array(list(c_comm.keys()))[np.where(np.array(list(c_comm.values())) > 1)] - - indices_repetitions = np.concatenate([np.where(df.text == d)[0][np.argsort(df.createdAt[np.where(df.text == d)[0]].values)[:-1]] for d in tqdm(duplicate_comments)]) + duplicate_comments = np.array(list(c_comm.keys()))[ + np.where(np.array(list(c_comm.values())) > 1) + ] + + indices_repetitions = np.concatenate( + [ + np.where(df.text == d)[0][ + np.argsort(df.createdAt.iloc[np.where(df.text == d)[0]].values)[:-1] + ] + for d in tqdm(duplicate_comments) + ] + ) return indices_repetitions - def remove_duplicate_comments(self, df: pd.DataFrame) -> pd.DataFrame: """Removes duplicates from dataframe :param df: Input dataframe """ - print('Find and remove duplicates') + print("Find and remove duplicates") indices = self.find_duplicate_comments(df) return df.drop(indices) @@ -157,10 +216,11 @@ class TextProcessor(object): """ doc = self.nlp(text) return " ".join([word.lemma_ for word in doc]) - + def transcripe_emojis(self, text: str) -> str: """ Transcripes emojis into words """ - return emoji.demojize(text, language='de', delimiters=("", "")).replace('_', ' ') - + return emoji.demojize(text, language="de", delimiters=("", "")).replace( + "_", " " + ) -- GitLab