Skip to content
Snippets Groups Projects
Commit e8e6d084 authored by Franziska Oschmann
Browse files

Add fit and transform methods to TextProcessor to allow integration into pipelines

parent dc1a5289
No related branches found
No related tags found
1 merge request: !2 "Dev train models"
...@@ -7,6 +7,8 @@ import numpy as np ...@@ -7,6 +7,8 @@ import numpy as np
import os import os
import pandas as pd import pandas as pd
import re import re
from sklearn.base import BaseEstimator, TransformerMixin
import spacy
from tqdm import tqdm from tqdm import tqdm
from typing import Union, List from typing import Union, List
...@@ -169,17 +171,38 @@ class TextLoader(object): ...@@ -169,17 +171,38 @@ class TextLoader(object):
return df.drop(indices) return df.drop(indices)
class TextProcessor(object): class TextProcessor(BaseEstimator, TransformerMixin):
def __init__(self, nlp): # params setting single steps to True or False def __init__(self): # params setting single steps to True or False
self.punctuation = list(string.punctuation) self.punctuation = list(string.punctuation)
self.punctuation.remove("@") self.punctuation.remove("@")
self.punctuation = "".join(self.punctuation) self.punctuation = "".join(self.punctuation)
self.nlp = nlp def fit(self, X, y=None):
"""
Fits preprocessing to data
"""
return self
def transform(self, X):
    """
    Runs the preprocessing on every entry of the input

    :param X: Data to transform; it must offer an ``apply`` method
        (presumably a pandas Series of texts -- confirm against callers)
    :return: Whatever ``X.apply`` yields when given ``self.preprocess``
    """
    return X.apply(self.preprocess)
def preprocess(self, text: str) -> str:
    """
    Applies preprocessing to text

    The helpers are chained, each one receiving the result of the
    previous one, in this order: ``transcripe_emojis``, ``remove_spaces``,
    ``remove_punctuation``, ``remove_mentions``, ``fold_case``.

    :param text: Input text
    :return: Text after all preprocessing steps
    """
    text_proc = self.transcripe_emojis(text)
    # Continue from text_proc rather than the raw text; passing the raw
    # text here would throw away the emoji transcription done above.
    text_proc = self.remove_spaces(text_proc)
    text_proc = self.remove_punctuation(text_proc)
    text_proc = self.remove_mentions(text_proc)
    text_proc = self.fold_case(text_proc)
    return text_proc
# multiple quotation marks
# htmls ,
# transcription of emojis to words,
def remove_spaces(self, text: str) -> str: def remove_spaces(self, text: str) -> str:
""" """
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment