Add fit and transform methods to TextProcessor to allow integration into pipelines

e8e6d084 · Franziska Oschmann · dc1a5289 · e8e6d084
Commit e8e6d084 authored 1 year ago by Franziska Oschmann
--- a/src/preprocessing_text.py
+++ b/src/preprocessing_text.py
@@ -7,6 +7,8 @@ import numpy as np
 import os
 import pandas as pd
 import re
+from sklearn.base import BaseEstimator, TransformerMixin
+import spacy
 from tqdm import tqdm
 from typing import Union, List
@@ -169,17 +171,38 @@ class TextLoader(object):
        return df.drop(indices)
-class TextProcessor(object):
+class TextProcessor(BaseEstimator, TransformerMixin):
-    def __init__(self, nlp):  # params setting single steps to True or False
+    def __init__(self):  # params setting single steps to True or False
+        # Build the punctuation string without "@" (NOTE(review): presumably so mentions stay detectable - confirm)
        self.punctuation = list(string.punctuation)
        self.punctuation.remove("@")
        self.punctuation = "".join(self.punctuation)
-        self.nlp = nlp
+    def fit(self, X, y=None):
+        """
+        No-op fit: nothing is computed from the data
+        :param X: Input data (not used)
+        :param y: Targets (not used)
+        :return: The instance itself
+        """
+        return self
+    def transform(self, X):
+        """
+        Runs preprocess over X by calling X.apply
+        :param X: Input exposing an ``apply`` method (NOTE(review): presumably a pandas Series of texts - confirm)
+        :return: Whatever X.apply(self.preprocess) returns
+        """
+        return X.apply(self.preprocess)
+    def preprocess(self, text: str) -> str:
+        """
+        Applies all preprocessing steps to a single text, in this order:
+        transcripe_emojis, remove_spaces, remove_punctuation,
+        remove_mentions, fold_case
+        :param text: Input text
+        :return: Preprocessed text
+        """
+        text_proc = self.transcripe_emojis(text)
+        # Feed the previous step's output forward; passing the raw `text`
+        # here would throw away the emoji transcription done above.
+        text_proc = self.remove_spaces(text_proc)
+        text_proc = self.remove_punctuation(text_proc)
+        text_proc = self.remove_mentions(text_proc)
+        text_proc = self.fold_case(text_proc)
-    # multiple quotation marks
+        return text_proc
-    # htmls ,
-    # transcription of emojis to words,
    def remove_spaces(self, text: str) -> str:
        """