import argparse
import re

import nltk
from bs4 import BeautifulSoup
from nltk.stem.porter import PorterStemmer
from nltk.tokenize.toktok import ToktokTokenizer
# Default running parameters
MODEL_NAME = 'logistic_regression'
VECTORISER = 'count_vectoriser'
MAX_FEATURES = None
MAX_DF = 1.0
MIN_DF = 1
# Argument parser
def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument('--model_name', default=MODEL_NAME, type=str, help='Machine learning model to use; choose one of "logistic_regression", "random_forest", "svm" and "multinomial_nb"')
    parser.add_argument('--vectoriser', default=VECTORISER, type=str, help='Vectoriser type; choose one of "count_vectoriser", "tfidf" and "word2vec"')
    parser.add_argument('--max_features', default=MAX_FEATURES, type=int, help='If not None, build a vocabulary that only considers the top max_features ordered by term frequency across the corpus; does not apply to word2vec.')
    parser.add_argument('--max_df', default=MAX_DF, type=float, help='Ignore terms that have a document frequency strictly higher than the given threshold; does not apply to word2vec.')
    parser.add_argument('--min_df', default=MIN_DF, type=int, help='Ignore terms that have a document frequency strictly lower than the given threshold; does not apply to word2vec.')
    # Parse the aforementioned arguments
    opt = parser.parse_args()
    return opt
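# Example (hypothetical usage, not part of this module): the parsed options
# map directly onto scikit-learn vectoriser parameters, e.g.
#   opt = parse_args()
#   vec = CountVectorizer(max_features=opt.max_features, max_df=opt.max_df, min_df=opt.min_df)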
# Tokeniser and English stopword list used below
tokenizer = ToktokTokenizer()
# Download the NLTK stopwords corpus if it is not already present
def download_stopwords():
    try:
        nltk.data.find('corpora/stopwords')
    except LookupError:
        nltk.download('stopwords')
download_stopwords()
stopword_list = nltk.corpus.stopwords.words('english')
# Removing the HTML tags (if any)
def strip_html(text):
    soup = BeautifulSoup(text, "html.parser")
    return soup.get_text()
# Removing text between square brackets
def remove_between_square_brackets(text):
    return re.sub(r'\[[^]]*\]', '', text)
# Removing the noisy text
def denoise_text(text):
    text = strip_html(text)
    text = remove_between_square_brackets(text)
    return text
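# e.g. denoise_text('<b>Great</b> movie [spoiler]!') -> 'Great movie !'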
# Removing special characters (hyphens, dollar signs and whitespace are kept;
# digits are removed as well when remove_digits is True)
def remove_special_characters(text, remove_digits=True):
    pattern = r'[^a-zA-Z\-$\s]' if remove_digits else r'[^a-zA-Z0-9\-$\s]'
    text = re.sub(pattern, ' ', text)
    return text
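# e.g. remove_special_characters('A #great movie!') -> 'A  great movie '
# (each removed character is replaced by a space, so extra whitespace remains)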
# Stemming the text with the Porter stemmer
def simple_stemmer(text):
    ps = PorterStemmer()
    text = ' '.join([ps.stem(word) for word in text.split()])
    return text
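# e.g. simple_stemmer('running runs easily') -> 'run run easili'
# (note that PorterStemmer.stem also lower-cases each token)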
# Removing the stopwords (pass is_lower_case=True if the text is already lower-cased)
def remove_stopwords(text, is_lower_case=False):
    tokens = tokenizer.tokenize(text)
    tokens = [token.strip() for token in tokens]
    if is_lower_case:
        filtered_tokens = [token for token in tokens if token not in stopword_list]
    else:
        filtered_tokens = [token for token in tokens if token.lower() not in stopword_list]
    filtered_text = ' '.join(filtered_tokens)
    return filtered_text
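# A minimal end-to-end sketch (hypothetical convenience wrapper, not part of
# the original pipeline): apply the cleaning steps above in order.
def preprocess(text):
    text = denoise_text(text)               # strip HTML and [bracketed] fragments
    text = remove_special_characters(text)  # drop punctuation (and digits by default)
    text = simple_stemmer(text)             # Porter-stem each token (also lower-cases)
    text = remove_stopwords(text, is_lower_case=True)
    return text
# e.g. preprocess('<b>Loved</b> it, really great!') -> 'love realli great'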