-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathCADOCS.py
143 lines (118 loc) · 5.28 KB
/
CADOCS.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
import logging, json, warnings
logging.basicConfig(level="INFO")
warnings.filterwarnings('ignore')
import os
import pandas as pd
from rasa_nlu.training_data import load_data
from rasa_nlu.model import Trainer, Interpreter
from rasa_nlu import config
# Path where the dataset is stored (semicolon-separated, no header row)
DATASET_PATH = "dataset.csv"
# Path where the project is located (the current working directory)
PROJECT_PATH = os.getcwd()
# DataFrame containing the dataset; column 0 holds the sentence, column 1 its intent label
df = pd.read_csv(DATASET_PATH, sep=";", header=None)
# Print a JSON-serializable object with two-space indentation.
def pprint(o):
    rendered = json.dumps(o, indent=2)
    print(rendered)
# This class represents the logic of the ML model: building training data,
# training the Rasa NLU model, making predictions, and active-learning updates.
class CADOCSModel:
    # Number of dataset updates required to trigger a re-train of the model.
    TO_RETRAIN = 10
    # Number of updates performed so far; reset when it reaches TO_RETRAIN.
    # NOTE(review): incrementing via `self.UPDATE_COUNT += 1` creates a
    # per-instance counter shadowing the class attribute (original behavior kept).
    UPDATE_COUNT = 0

    # Intents recognized by the model, in the order they are written to nlu.md.
    _INTENTS = ("get_smells", "get_smells_date", "report", "info")

    def create_training_data_md(self):
        """Create a Markdown file (nlu.md) containing the training data.

        Reads the module-level DataFrame `df` (column 0 = sentence,
        column 1 = intent label) and writes one `## intent:` section per
        known intent, each followed by its sentences as bullet lines.
        """
        # Group sentences by intent label; rows with unknown labels are skipped,
        # exactly as in the original chain of equality checks.
        grouped = {intent: [] for intent in self._INTENTS}
        for _, row in df.iterrows():
            if row[1] in grouped:
                grouped[row[1]].append(row[0])
        with open(os.path.join(PROJECT_PATH, "nlu.md"), "wt", encoding="utf-8") as f:
            for intent in self._INTENTS:
                f.write(f"## intent: {intent}\n")
                for question in grouped[intent]:
                    f.write(f"- {question}\n")

    def train_model(self):
        """Train the Rasa NLU model and persist it under the fixed name 'current'."""
        # Regenerate the Markdown training data from the dataset.
        self.create_training_data_md()
        print("MD files created")
        training_data = load_data(os.path.join(PROJECT_PATH, "nlu.md"))
        # A Trainer is configured from config.yml and trained on the data.
        trainer = Trainer(config.load(os.path.join(PROJECT_PATH, "config.yml")))
        print("Starting training...")
        trainer.train(training_data)
        # Persist the model so it can be loaded again by give_prediction().
        print("Storing...")
        trainer.persist(PROJECT_PATH, fixed_model_name="current")
        print("Stored!")

    def give_prediction(self, message):
        """Return the model's prediction (intent + entities) for `message`.

        Loads the pre-trained model from 'default/current' on every call and
        parses the given user sentence with it.
        """
        print("Loading interpreter...")
        interpreter = Interpreter.load(model_dir="default/current")
        return interpreter.parse(message)

    def update_dataset(self, message, intent):
        """Append a confirmed (message, intent) pair to the dataset on disk.

        After TO_RETRAIN updates have accumulated, the counter is reset and
        the model is re-trained (active learning).
        """
        dataset = pd.read_csv(DATASET_PATH, sep=";", header=None)
        # Build a one-row frame matching the dataset's two unnamed columns.
        new_row = pd.DataFrame({0: [message], 1: [intent]})
        # DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
        # pd.concat is the supported replacement and yields the same CSV output.
        dataset = pd.concat([dataset, new_row], ignore_index=True)
        dataset.to_csv(os.path.join(PROJECT_PATH, "dataset.csv"), sep=";", header=None, index=False)
        # An update has been performed, so the counter is increased.
        self.UPDATE_COUNT += 1
        # Active learning: once enough updates have accumulated, re-train.
        if self.TO_RETRAIN == self.UPDATE_COUNT:
            self.UPDATE_COUNT = 0
            self.train_model()
        return
# Script entry point: run one sample prediction as a quick smoke test.
if __name__ == "__main__":
    cadocs = CADOCSModel()
    prediction = cadocs.give_prediction("hello CADOCS, show me the community smells in the repository https://github.com/tensorflow/ranking from 21/05/2020")
    print(prediction)