From e8e6d0845ac25057bb40b8e6f37c4989164ccf68 Mon Sep 17 00:00:00 2001
From: Franziska Oschmann
 <franziskaoschmann@staff-net-vpn-dhcp-7902.intern.ethz.ch>
Date: Fri, 7 Jul 2023 15:48:45 +0200
Subject: [PATCH] Add fit and transform methods to TextProcessor to allow
 integration into pipelines

---
 src/preprocessing_text.py | 35 +++++++++++++++++++++++++++++------
 1 file changed, 29 insertions(+), 6 deletions(-)

diff --git a/src/preprocessing_text.py b/src/preprocessing_text.py
index 18d40d6..a6a2042 100644
--- a/src/preprocessing_text.py
+++ b/src/preprocessing_text.py
@@ -7,6 +7,8 @@ import numpy as np
 import os
 import pandas as pd
 import re
+from sklearn.base import BaseEstimator, TransformerMixin
+import spacy
 from tqdm import tqdm
 
 from typing import Union, List
@@ -169,17 +171,38 @@ class TextLoader(object):
         return df.drop(indices)
 
 
-class TextProcessor(object):
-    def __init__(self, nlp):  # params setting single steps to True or False
+class TextProcessor(BaseEstimator, TransformerMixin):
+    def __init__(self):  # params setting single steps to True or False
         self.punctuation = list(string.punctuation)
         self.punctuation.remove("@")
         self.punctuation = "".join(self.punctuation)
 
-        self.nlp = nlp
+    def fit(self, X, y=None):
+        """
+        No-op fit: nothing is learned from X or y; returns self unchanged
+        """
+        return self
+
+    def transform(self, X):
+        """
+        Returns X.apply(self.preprocess); X is presumably a pandas Series - confirm
+        """
+        text_proc = X.apply(self.preprocess)
+        return text_proc
+
+    def preprocess(self, text) -> str:
+        """
+        Applies the preprocessing steps in order, each on the previous result
+        :param text: Input text
+        :return: Preprocessed text
+        """
+        text_proc = self.transcripe_emojis(text)
+        text_proc = self.remove_spaces(text_proc)
+        text_proc = self.remove_punctuation(text_proc)
+        text_proc = self.remove_mentions(text_proc)
+        text_proc = self.fold_case(text_proc)
 
-    # multiple quotation marks
-    # htmls ,
-    # transcription of emojis to words,
+        return text_proc
 
     def remove_spaces(self, text: str) -> str:
         """
-- 
GitLab