# trainer.py
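# Trains an ensemble capsule network for 4-class classification of tagged
# Sinhala news comments (Lankadeepa and Gossip Lanka), using pretrained
# word embeddings.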
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras import utils
import ensemble_capsule_network
import network_test
from config import Config
from preprocessing import text_preprocessing, load_word_embedding_matrix
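# Local paths to the tagged datasets and pretrained word-embedding artifacts.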
folder_path = "D:\\deep_learning_experiments"
lankadeepa_data_path = folder_path + "\\sinhala_data\\lankadeepa_tagged_comments.csv"
gossip_lanka_data_path = folder_path + "\\sinhala_data\\gossip_lanka_tagged_comments.csv"
word_embedding_keyed_vectors_path = 'D:\\deep_learning_experiments\\word_vectors_sinhala\\keyed.kv'
word_embedding_matrix_path = 'D:\\deep_learning_experiments\\word_embedding_matrix'
EMBEDDING_SIZE = 300
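# Load both tagged datasets; the Lankadeepa file is truncated to its first
# 9059 rows (presumably the labeled portion), and the stray 'Unnamed: 3'
# column (likely an artifact of trailing commas in the CSV) is dropped.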
lankadeepa_data = pd.read_csv(lankadeepa_data_path)[:9059]
gossipLanka_data = pd.read_csv(gossip_lanka_data_path)
gossipLanka_data = gossipLanka_data.drop(columns=['Unnamed: 3'])
all_data = pd.concat([lankadeepa_data, gossipLanka_data], ignore_index=True)
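# Shift labels down by 2, presumably remapping the original tag range
# (2..5) to zero-based classes (0..3) for the 4-class model.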
all_data['label'] = all_data['label'] - 2
print(all_data)
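# Clean the raw comments, then fit a Keras Tokenizer to build the vocabulary.
# vocab_size is offset by 1 because Keras reserves index 0 for padding.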
comments_text, labels = text_preprocessing(all_data)
t = Tokenizer()
t.fit_on_texts(comments_text)
vocab_size = len(t.word_index) + 1
print(vocab_size)
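# Encode each comment as a sequence of vocabulary indices.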
encoded_docs = t.texts_to_sequences(comments_text)
lengths = [len(seq) for seq in encoded_docs]
print('###########################################')
print(lengths)
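# Pad every sequence to the longest comment so the inputs form a uniform matrix.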
max_length = max(lengths)
padded_docs = pad_sequences(encoded_docs, maxlen=max_length, padding='post')
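# One-hot encode the integer class labels.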
comment_labels = np.array(labels)
comment_labels = utils.to_categorical(comment_labels)
padded_docs = np.array(padded_docs)
print("Shape of all comments: ", padded_docs.shape)
print("Shape of labels: ", comment_labels.shape)
X_train, X_test, y_train, y_test = train_test_split(padded_docs, comment_labels, test_size=0.1, random_state=42,
shuffle=True)
print("Train lables shape: ", y_train.shape)
# generate embedding matrix
# embedding_matrix = generate_embedding_matrix(word_embedding_keyed_vectors_path, word_embedding_matrix_path, vocab_size,
# EMBEDDING_SIZE, t)
# load embedding matrix
embedding_matrix = load_word_embedding_matrix(word_embedding_matrix_path)
# print(embedding_matrix[1])
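# Bundle hyper-parameters, data splits, and the pretrained embedding matrix
# into a single Config object consumed by the network builders.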
config = Config(
seq_len=max_length,
num_classes=4,
vocab_size=vocab_size,
embedding_size=EMBEDDING_SIZE,
dropout_rate=0.8,
x_train=X_train,
y_train=y_train,
x_test=X_test,
y_test=y_test,
pretrain_vec=embedding_matrix)
model = ensemble_capsule_network.ensemble_capsule_network(config)
# model = network_test.get_model_from_text_layer(config)
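# Train the capsule network, validating on the held-out split after each epoch.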
model.fit(x=X_train, y=y_train, validation_data=(X_test, y_test), epochs=100)