setup_dataset.py
"""Loads the raw dataset,cleans it and saves the proocessed dataset"""
import os
from tqdm import trange
import pandas as pd
from sentanalyzer import locations, version_manager
from sentanalyzer.data import TEST, TRAIN, TRAIN_TEST
from sentanalyzer.data.preprocessing import (TextPreprocessPipeline,
remove_non_word,
remove_url_pattern,
replace_chat_words,
replace_emoticons,
replace_short_forms,
space_remover, tokenize)
from sentanalyzer.data.utils import load_dataset
from sentanalyzer.utils import logger

TRAIN_TEST_DATASET = locations.get_dataset_path(dataset_type=TRAIN_TEST, version=version_manager.VERSION_2)
TRAIN_DATASET = locations.get_dataset_path(dataset_type=TRAIN, version=version_manager.VERSION_2)
TEST_DATASET = locations.get_dataset_path(dataset_type=TEST, version=version_manager.VERSION_2)
TEST_PERCENT = 0.3
TOTAL_SAMPLES = 100_000  # alternatively 200_000
TRAIN_SIZE = int((1 / 2) * TOTAL_SAMPLES * (1 - TEST_PERCENT))  # per each target
TEST_SIZE = (TOTAL_SAMPLES // 2) - TRAIN_SIZE  # per each target
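# Worked example with the defaults above: TOTAL_SAMPLES = 100_000 and
# TEST_PERCENT = 0.3 give TRAIN_SIZE = int(0.5 * 100_000 * 0.7) = 35_000
# tweets per sentiment class, leaving 50_000 - 35_000 = 15_000 per class
# for the test split.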


def process_dataset(df: pd.DataFrame, pipeline: list) -> dict:
    """Preprocesses the dataframe with a TextPreprocessPipeline.

    Args:
        df (pd.DataFrame): dataframe to be preprocessed
        pipeline (list): preprocessing functions, applied in order

    Returns:
        dict: contains
            + cleaned_df
            + train_df
            + test_df
    """
    pipeline = TextPreprocessPipeline(pipeline)
    cleaned_rows = []
    for i in trange(len(df), desc="Processing tweets ", ascii=True):
        tweet = df.loc[i, 'text']
        processed_tweet = pipeline.apply_on(tweet)
        if processed_tweet.strip():
            cleaned_rows.append({'text': processed_tweet.strip(),
                                 'sentiment': df.loc[i, 'sentiment']})
    # DataFrame.append was removed in pandas 2.0; collect rows in a list and
    # build the frame once instead.
    cleaned_df = pd.DataFrame(cleaned_rows)
    cleaned_df.dropna(inplace=True)
    cleaned_df.reset_index(drop=True, inplace=True)
    # Split each sentiment class (4 = positive, 0 = negative) separately so
    # both splits stay balanced: the first TRAIN_SIZE rows per class go to
    # train, the remainder to test.
    positives = cleaned_df[cleaned_df['sentiment'] == 4]
    negatives = cleaned_df[cleaned_df['sentiment'] == 0]
    train_df = pd.concat([positives.iloc[:TRAIN_SIZE, :],
                          negatives.iloc[:TRAIN_SIZE, :]], ignore_index=True)
    test_df = pd.concat([positives.iloc[TRAIN_SIZE:, :],
                         negatives.iloc[TRAIN_SIZE:, :]], ignore_index=True)
    return {
        "cleaned_df": cleaned_df,
        "train_df": train_df,
        "test_df": test_df
    }


def setup_dataset_v1():
    """Creates the text-preprocessed dataset."""
    print("Process started...")
    df = load_dataset(location=locations.get_raw_dataset_path(),
                      req_cols=['sentiment', 'text'],
                      total_len=TOTAL_SAMPLES,
                      targets=[4, 0],
                      encoding='latin-1')
    print("Dataset loaded... Starting preprocessing...")
    tweet_cleaner_fns = [tokenize,
                         replace_short_forms,
                         replace_chat_words,
                         remove_url_pattern,
                         replace_emoticons,
                         remove_non_word,
                         space_remover]
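    # Illustrative only (hypothetical input/output, not taken from the
    # dataset): a raw tweet such as "omg check https://t.co/x :) idk" would,
    # after this pipeline, come out roughly as
    # "oh my god check smiley i do not know". The exact output depends on the
    # sentanalyzer preprocessing implementations.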
    processed_dataset = process_dataset(df, pipeline=tweet_cleaner_fns)
    cleaned_df, train_df, test_df = (processed_dataset['cleaned_df'],
                                     processed_dataset['train_df'],
                                     processed_dataset['test_df'])
    cleaned_df.to_csv(TRAIN_TEST_DATASET)
    train_df.to_csv(TRAIN_DATASET)
    test_df.to_csv(TEST_DATASET)
    print("Dataset successfully created")
    log_data = "\nDataset details :"
    log_data += f"\n+ Total Dataset size : {len(cleaned_df)} | file_name : {os.path.basename(TRAIN_TEST_DATASET)}"
    log_data += f"\n+ Train Dataset size : {len(train_df)} | file_name : {os.path.basename(TRAIN_DATASET)}"
    log_data += f"\n+ Test Dataset size : {len(test_df)} | file_name : {os.path.basename(TEST_DATASET)}\n"
    print(log_data)
    logger.log(log_data, locations.get_dataset_logs_path())


if __name__ == '__main__':
    setup_dataset_v1()
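
# Usage sketch (assumes the sentanalyzer package is importable and the raw
# dataset exists at locations.get_raw_dataset_path()):
#   $ python setup_dataset.py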