diff --git a/src/preprocessing_text.py b/src/preprocessing_text.py
index 18d40d656be4f1cce5c1e7700c9621ea79788d33..a6a20428291c435d3e5298390103f3803a2e95ba 100644
--- a/src/preprocessing_text.py
+++ b/src/preprocessing_text.py
@@ -7,6 +7,8 @@ import numpy as np
 import os
 import pandas as pd
 import re
+from sklearn.base import BaseEstimator, TransformerMixin
+import spacy
 from tqdm import tqdm
 from typing import Union, List
 
@@ -169,17 +171,38 @@ class TextLoader(object):
         return df.drop(indices)
 
 
-class TextProcessor(object):
-    def __init__(self, nlp):  # params setting single steps to True or False
+class TextProcessor(BaseEstimator, TransformerMixin):
+    def __init__(self):  # params setting single steps to True or False
         self.punctuation = list(string.punctuation)
         self.punctuation.remove("@")
         self.punctuation = "".join(self.punctuation)
-        self.nlp = nlp
 
+    def fit(self, X, y=None):
+        """
+        Nothing is learned from the data; returns self unchanged
+        """
+        return self
+
+    def transform(self, X):
+        """
+        Applies preprocess to every element of X (via X.apply)
+        """
+        text_proc = X.apply(self.preprocess)
+        return text_proc
+
+    def preprocess(self, text) -> str:
+        """
+        Runs the cleaning steps in order, each on the previous step's output
+        :param text: Input text
+        :return: Preprocessed text
+        """
+        text_proc = self.transcripe_emojis(text)
+        text_proc = self.remove_spaces(text_proc)
+        text_proc = self.remove_punctuation(text_proc)
+        text_proc = self.remove_mentions(text_proc)
+        text_proc = self.fold_case(text_proc)
 
-        # multiple quotation marks
-        # htmls ,
-        # transcription of emojis to words,
+        return text_proc
 
     def remove_spaces(self, text: str) -> str:
         """