Skip to content
Snippets Groups Projects
Commit 49e4d81c authored by oschmanf's avatar oschmanf
Browse files

Merge branch 'dev' into 'main'

Dev

See merge request !1
parents 41d54851 a0530799
No related branches found
No related tags found
1 merge request!1Dev
*.egg-info
data
pp_env
# Moderation classifier
## Installation
```
python -m venv pp_env
source pp_env/bin/activate
pip install -r requirements.txt
```
## Usage
### 1. Activation of environment
```
source pp_env/bin/activate
```
### 2. Preprocessing of the dataframe (adds a `language` column)
```
moderation_classifier --prepare_data path_to_csv
```
# imports
from pathlib import Path
import click
from src.preprocessing import DataProcessor
from typing import Union
import os
@click.command()
@click.option('-p', '--prepare_data', is_flag=True,
              help='Add a language column to the input CSV and save a copy.')
@click.argument('input_data', type=click.Path(exists=True, dir_okay=False))
def main(prepare_data: bool, input_data: Union[str, os.PathLike]):
    """
    Run moderation classifier.

    The only action implemented here is data preparation: when
    ``--prepare_data`` is given, a ``DataProcessor`` is created for the input
    file and its ``add_language`` method is run. Without the flag the command
    does nothing.

    :param prepare_data: Binary flag to specify if data should be prepared.
    :param input_data: Path to input dataframe. Must point to an existing
        file; click rejects anything else with a usage error before any
        processing starts.
    """
    if not prepare_data:
        return

    # Announce the work before starting it: language detection can take a
    # while, and a message printed afterwards would be misleading.
    click.echo(input_data)
    click.echo('Prepare data')

    dp = DataProcessor(input_data)
    dp.add_language()


if __name__ == "__main__":
    main()
\ No newline at end of file
This diff is collapsed.
contourpy==1.0.7
cycler==0.11.0
fonttools==4.39.3
kiwisolver==1.4.4
matplotlib==3.7.1
numpy==1.24.3
packaging==23.1
pandas==2.0.1
Pillow==9.5.0
pyparsing==3.0.9
python-dateutil==2.8.2
pytz==2023.3
seaborn==0.12.2
six==1.16.0
tzdata==2023.3
blis==0.7.9
catalogue==2.0.8
certifi==2023.5.7
charset-normalizer==3.1.0
click==8.1.3
confection==0.0.4
contourpy==1.0.7
cycler==0.11.0
cymem==2.0.7
exceptiongroup==1.1.1
fonttools==4.39.3
idna==3.4
iniconfig==2.0.0
Jinja2==3.1.2
kiwisolver==1.4.4
langcodes==3.3.0
langdetect==1.0.7
MarkupSafe==2.1.2
matplotlib==3.7.1
-e git+https://sissource.ethz.ch/oschmanf/ppg-moderation-classifier.git@88174086e04736f9260972c2baef121e8f0dee61#egg=moderation_classifier
murmurhash==1.0.9
numpy==1.24.3
packaging==23.1
pandas==2.0.1
pathy==0.10.1
Pillow==9.5.0
pluggy==1.0.0
preshed==3.0.8
pydantic==1.10.7
pyparsing==3.0.9
pytest==7.3.1
python-dateutil==2.8.2
pytz==2023.3
requests==2.30.0
seaborn==0.12.2
six==1.16.0
smart-open==6.3.0
spacy==3.5.2
spacy-langdetect==0.1.2
spacy-legacy==3.0.12
spacy-loggers==1.0.4
srsly==2.4.6
thinc==8.1.10
tomli==2.0.1
tqdm==4.65.0
typer==0.7.0
typing_extensions==4.5.0
tzdata==2023.3
urllib3==2.0.2
wasabi==1.1.1
setup.py 0 → 100644
from setuptools import find_packages, setup
# Console commands installed with the package, written as
# "<command> = <module path>:<callable>".
_console_scripts = [
    "moderation_classifier = moderation_classifier.main:main",
]

setup(
    name="moderation_classifier",
    python_requires=">=3.10",
    packages=find_packages(),
    entry_points={"console_scripts": _console_scripts},
)
import spacy
from spacy.language import Language
from spacy_language_detection import LanguageDetector
import os
import pandas as pd
from pathlib import Path
from typing import Union
import time
class DataProcessor(object):
    """Add a detected-language column to a CSV file of comments.

    The input file is read with pandas and must provide a ``text`` column.
    The result is written next to the input file as
    ``<input name without extension>_preproc.csv``.
    """

    def __init__(self, path_data: Union[str, os.PathLike]):
        """
        :param path_data: Path to input dataframe.
        """
        self.path_data = path_data

    def get_lang_detector(self, nlp, name):
        """
        Gets language detector.

        Registered as the ``language_detector`` factory in
        ``init_nlp_model``. Neither argument is used by this method.

        :param nlp: Pipeline the component is created for (unused).
        :param name: Name of the component instance (unused).
        :return: A ``LanguageDetector`` seeded with 42.
        """
        return LanguageDetector(seed=42)

    def detect_language(self, text: str, nlp_model) -> Union[str, None]:
        """Detect language per comment.

        :param text: Text of comment.
        :param nlp_model: Callable pipeline; the document it returns must
            expose the detection result as ``doc._.language``.
        :return: The value stored under the ``'language'`` key of the
            detection result, or ``None`` when ``text`` is not a string.
        """
        # Empty CSV cells arrive from pandas as NaN (a float). The pipeline is
        # only meant to be fed text, so report "no language" for such rows
        # instead of letting a single missing comment abort the whole run.
        if not isinstance(text, str):
            return None
        doc = nlp_model(text)
        return doc._.language['language']

    def init_nlp_model(self):
        """
        Initializes NLP model for language detection.

        Loads ``en_core_web_sm``, registers ``get_lang_detector`` as the
        ``language_detector`` factory and appends that component to the end of
        the pipeline. The pipeline is stored in ``self.nlp_model``.
        """
        self.nlp_model = spacy.load("en_core_web_sm")
        Language.factory("language_detector", func=self.get_lang_detector)
        self.nlp_model.add_pipe('language_detector', last=True)

    def _preproc_path(self) -> Path:
        """Return the output path: same folder, ``<stem>_preproc.csv``."""
        source = Path(self.path_data)
        return source.with_name(f"{source.stem}_preproc.csv")

    def add_language(self):
        """Add language column to dataframe and saves new file.

        Reads ``self.path_data``, runs language detection on every entry of
        the ``text`` column, stores the result in a new ``language`` column
        and writes the dataframe to the path given by ``_preproc_path``.
        """
        # Load data first so that an unreadable file fails before the
        # NLP model is loaded.
        df = pd.read_csv(self.path_data)

        # Detect language
        self.init_nlp_model()
        df['language'] = df.text.apply(
            self.detect_language, nlp_model=self.nlp_model
        )

        # Save new file
        df.to_csv(self._preproc_path())
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment