-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathpreprocess.py
More file actions
45 lines (35 loc) · 1.42 KB
/
preprocess.py
File metadata and controls
45 lines (35 loc) · 1.42 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
# preprocess_pipeline.py
import re
import spacy # type: ignore
from spacy.lang.en.stop_words import STOP_WORDS # type: ignore
import html
# Load the small English pipeline with the parser and NER disabled:
# only the tokenizer/tagger/lemmatizer are needed for preprocessing,
# and skipping the other components makes loading and inference faster.
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
def preprocess_text(texts, lemmatize=True, handle_html=True):
    """Clean and normalize an iterable of raw text documents.

    For each input string this optionally unescapes HTML entities and
    strips tags, lowercases, removes all non-ASCII characters, tokenizes
    with spaCy, and drops stopwords and punctuation (optionally emitting
    lemmas instead of surface forms).

    Parameters
    ----------
    texts : iterable of str
        Raw documents to preprocess.
    lemmatize : bool, default True
        If True, output each kept token's lemma; otherwise its raw text.
    handle_html : bool, default True
        If True, decode HTML entities and strip ``<...>`` tags first.

    Returns
    -------
    list of str
        One cleaned, space-joined string per input document, in order.
    """
    cleaned = []
    for text in texts:
        if handle_html:
            # Decode entities (e.g. "&amp;" -> "&"), then strip tags.
            text = html.unescape(text)
            text = re.sub(r'<[^>]+>', '', text)
        # Lowercase so stopword and lemma matching is case-insensitive.
        text = text.lower()
        # Remove all non-ASCII characters (curly quotes, accents, emoji).
        text = re.sub(r'[^\x00-\x7F]+', '', text)
        cleaned.append(text)

    preprocessed_texts = []
    # nlp.pipe batches documents through the pipeline, which is
    # substantially faster than calling nlp() once per document.
    for doc in nlp.pipe(cleaned):
        # Keep non-stopword, non-punctuation tokens only.
        words = [
            token.lemma_ if lemmatize else token.text
            for token in doc
            if not (token.is_stop or token.is_punct)
        ]
        preprocessed_texts.append(' '.join(words).strip())
    return preprocessed_texts