Skip to content
Snippets Groups Projects
Commit e8e6d084 authored by Franziska Oschmann's avatar Franziska Oschmann
Browse files

Add fit and transform methods to TextProcessor to allow integration into pipelines

parent dc1a5289
No related branches found
No related tags found
1 merge request: !2 "Dev train models"
......@@ -7,6 +7,8 @@ import numpy as np
import os
import pandas as pd
import re
from sklearn.base import BaseEstimator, TransformerMixin
import spacy
from tqdm import tqdm
from typing import Union, List
......@@ -169,17 +171,38 @@ class TextLoader(object):
return df.drop(indices)
class TextProcessor(object):
def __init__(self, nlp): # params setting single steps to True or False
class TextProcessor(BaseEstimator, TransformerMixin):
def __init__(self): # params setting single steps to True or False
self.punctuation = list(string.punctuation)
self.punctuation.remove("@")
self.punctuation = "".join(self.punctuation)
self.nlp = nlp
def fit(self, X, y=None):
    """
    No-op fitting step

    Neither ``X`` nor ``y`` is read; the method exists so the object
    offers the fit/transform pair needed for integration into pipelines.

    :param X: Input data (unused)
    :param y: Targets (unused)
    :return: The instance itself
    """
    return self
def transform(self, X):
    """
    Runs ``self.preprocess`` on every element of the input

    :param X: Object exposing ``apply`` (presumably a pandas Series of
        texts - confirm against callers)
    :return: Result of ``X.apply(self.preprocess)``
    """
    return X.apply(self.preprocess)
def preprocess(self, text) -> str:
    """
    Applies preprocessing to a single text

    The text is passed through the helper methods in this order, each
    one receiving the output of the previous one: transcripe_emojis,
    remove_spaces, remove_punctuation, remove_mentions, fold_case.

    :param text: Input text
    :return: Preprocessed text
    """
    text_proc = self.transcripe_emojis(text)
    # Continue from the emoji-transcribed text; passing the raw ``text``
    # here would throw away the result of the previous step
    text_proc = self.remove_spaces(text_proc)
    text_proc = self.remove_punctuation(text_proc)
    text_proc = self.remove_mentions(text_proc)
    text_proc = self.fold_case(text_proc)
    # NOTE(review): the notes below look like a list of further planned
    # steps - confirm which of them are still open
    # multiple quotation marks
    # htmls ,
    # transcription of emojis to words,
    return text_proc
def remove_spaces(self, text: str) -> str:
"""
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment