From e8e6d0845ac25057bb40b8e6f37c4989164ccf68 Mon Sep 17 00:00:00 2001
From: Franziska Oschmann <franziskaoschmann@staff-net-vpn-dhcp-7902.intern.ethz.ch>
Date: Fri, 7 Jul 2023 15:48:45 +0200
Subject: [PATCH] Add fit and transform methods to TextProcessor to allow integration into pipelines

---
Review notes (this section is ignored when the patch is applied):
 * preprocess() now passes the emoji-transcribed text on to remove_spaces();
   the previous revision passed the raw input again, which discarded the
   result of transcripe_emojis().
 * The preprocess() docstring no longer lists an `nlp` parameter, because
   this patch removes that argument from the class.

 src/preprocessing_text.py | 35 +++++++++++++++++++++++++++++------
 1 file changed, 29 insertions(+), 6 deletions(-)

diff --git a/src/preprocessing_text.py b/src/preprocessing_text.py
index 18d40d6..a6a2042 100644
--- a/src/preprocessing_text.py
+++ b/src/preprocessing_text.py
@@ -7,6 +7,8 @@ import numpy as np
 import os
 import pandas as pd
 import re
+from sklearn.base import BaseEstimator, TransformerMixin
+import spacy
 from tqdm import tqdm
 from typing import Union, List
 
@@ -169,17 +171,38 @@ class TextLoader(object):
         return df.drop(indices)
 
 
-class TextProcessor(object):
-    def __init__(self, nlp):  # params setting single steps to True or False
+class TextProcessor(BaseEstimator, TransformerMixin):
+    def __init__(self):  # params setting single steps to True or False
         self.punctuation = list(string.punctuation)
         self.punctuation.remove("@")
         self.punctuation = "".join(self.punctuation)
 
-        self.nlp = nlp
+    def fit(self, X, y=None):
+        """
+        No-op fit: nothing is learned from X or y, returns self unchanged
+        """
+        return self
+
+    def transform(self, X):
+        """
+        Applies preprocess to every element of X via X.apply
+        """
+        text_proc = X.apply(self.preprocess)
+        return text_proc
+
+    def preprocess(self, text) -> str:
+        """
+        Applies all preprocessing steps to text, each on the previous result
+        :param text: Input text
+        :return: Preprocessed text
+        """
+        text_proc = self.transcripe_emojis(text)
+        text_proc = self.remove_spaces(text_proc)
+        text_proc = self.remove_punctuation(text_proc)
+        text_proc = self.remove_mentions(text_proc)
+        text_proc = self.fold_case(text_proc)
 
-        # multiple quotation marks
-        # htmls ,
-        # transcription of emojis to words,
+        return text_proc
 
     def remove_spaces(self, text: str) -> str:
         """
-- 
GitLab