Skip to content

Commit

Permalink
Feat: Added SVM
Browse files Browse the repository at this point in the history
  • Loading branch information
SebastianCB-dev committed Nov 10, 2022
1 parent e72a873 commit ba7b103
Show file tree
Hide file tree
Showing 4 changed files with 106 additions and 69 deletions.
2 changes: 1 addition & 1 deletion BECK/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,4 +52,4 @@
class_comment += 1
continue

df_cve.to_csv('./datasets/coseno.csv', index=False, encoding="utf-8")
df_cve.to_csv('./datasets/coseno.csv', index=False, encoding="utf-8")
15 changes: 15 additions & 0 deletions BECK/helpers/change_class.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
import pandas as pd

# Re-label every row of the distance dataset with the class stored for the
# same comment text in the full comments dataset, then write it back in place.
df_completo = pd.read_csv(
    './comentarios_coseno_distancia.csv', encoding='utf-8', low_memory=False)
df_comentarios = pd.read_csv(
    './datasets/comentarios_full.csv', encoding='utf-8', low_memory=False)

# Build the text -> class lookup once instead of scanning df_comentarios for
# every row. keep='first' mirrors the original `.iloc[0]` choice when the same
# text appears more than once; rows without a text can never match.
class_by_text = (
    df_comentarios.dropna(subset=['text'])
    .drop_duplicates(subset='text', keep='first')
    .set_index('text')['class']
)

# `found` distinguishes "no such comment" from "comment found but its class is
# empty", so only genuinely unmatched rows keep their previous value.
found = df_completo['Comentario'].isin(class_by_text.index)
matched = df_completo['Comentario'].map(class_by_text)
df_completo.loc[found, 'Clase'] = matched[found]

# The original indexed `.iloc[0]` on an empty selection here and crashed with
# IndexError before anything was saved; report the gap instead.
missing = int((~found).sum())
if missing:
    print(f'{missing} comments have no match in comentarios_full.csv; '
          'their current Clase value was kept')

df_completo.to_csv('./comentarios_coseno_distancia.csv',
                   encoding='utf-8', index=False)
108 changes: 40 additions & 68 deletions BECK/helpers/get_item_dataset.py
Original file line number Diff line number Diff line change
@@ -1,70 +1,42 @@
from cmath import inf
import pandas as pd
from pprint import pprint

# --- Inputs shared by the scoring loops below --------------------------------
# NOTE(review): this first `df` is overwritten a few lines down before any
# other statement reads it.
df = pd.read_csv('./datasets/coseno_vs_euclidian.csv')
# Value stored for each of the 7 positions of a 14-wide score group
# (used by the cosine/Euclidean loop when it maps the winning position).
values_items_largos = [0, 1, 1, 2, 2, 3, 3]
df_results = pd.read_csv('./results.csv')
import numpy as np
# The sibling helper change_class.py reads and writes this file as
# './comentarios_coseno_distancia.csv'; the extension was missing here, which
# makes read_csv fail unless an extension-less copy happens to exist.
df = pd.read_csv('./comentarios_coseno_distancia.csv', encoding='utf-8')
df_coseno = pd.read_csv(
    'drive/MyDrive/word_embedding_spanish/dataset_entrenamiento_coseno.csv',
    encoding='utf-8', delimiter=';')
# NOTE(review): `array` is never read in the visible code.
array = []
# Column labels of the training dataset; the scoring loop uses them as keys.
items = list(df_coseno.columns)
# Class label of every row of `df`, in row order.
classes = list(df['Clase'])
class_idx = 0
# Result columns, skipping the first two columns of results.csv.
columns = list(df_results.columns)[2:]
def _first_argmin(values):
    """Return the position of the first strictly smallest value.

    Falls back to position 0 when no value compares below +inf
    (empty input, or only NaN/inf values), exactly like the inline loops
    this helper replaces.
    """
    best_idx = 0
    best = float('inf')
    for idx, value in enumerate(values):
        if value < best:
            best = value
            best_idx = idx
    return best_idx


def _closest_options(scores, width, option_values=None):
    """Yield one (cosine, Euclidean) choice per `width`-wide group of scores.

    Inside a group the even positions are the cosine scores and the odd
    positions the Euclidean ones; each choice is the position of the smallest
    score. When `option_values` is given, the position is translated through
    it before being yielded.
    """
    for start in range(0, len(scores), width):
        group = scores[start:start + width]
        coseno_idx = _first_argmin(group[0::2])
        euclidian_idx = _first_argmin(group[1::2])
        if option_values is None:
            yield coseno_idx, euclidian_idx
        else:
            yield option_values[coseno_idx], option_values[euclidian_idx]


# Number of rows to score, kept exactly as hard-coded in the original loop.
# NOTE(review): presumably the size of the dataset at the time - confirm
# before replacing it with len(df.index).
ROWS_TO_PROCESS = 7068

result_rows = []
for row_idx in range(0, ROWS_TO_PROCESS):
    row = list(df.iloc[row_idx])
    result = {
        'Comentario': row[0],
        'Comentario Preprocesado': row[1],
    }
    # The 28 values before the last column are scored in groups of 14;
    # everything between the two text columns and that tail in groups of 8.
    long_scores = row[-29:-1]
    regular_scores = row[2:-29]

    choices = list(_closest_options(regular_scores, 8))
    choices.extend(_closest_options(long_scores, 14, values_items_largos))

    # Each group fills two consecutive result columns: cosine, then Euclidean.
    column = 0
    for coseno_choice, euclidian_choice in choices:
        result[columns[column]] = coseno_choice
        result[columns[column + 1]] = euclidian_choice
        column += 2

    result['Clase'] = classes[row_idx]
    result_rows.append(result)

# DataFrame.append was removed in pandas 2.0 and re-copied the whole frame on
# every iteration; collect the rows and concatenate once instead.
df_results = pd.concat(
    [df_results, pd.DataFrame(result_rows)], ignore_index=True)


df_results.to_csv('results_coseno_euclidian.csv', index=False)
# Value stored for each of the 7 positions of a 7-wide score group.
items_raros = [0, 1, 1, 2, 2, 3, 3]


def _first_argmax_above_zero(values):
    """Return the position of the first strictly largest value above 0.

    Falls back to position 0 when no value is greater than 0, exactly like
    the inline loops this helper replaces.
    """
    best_idx = 0
    best = 0
    for idx, value in enumerate(values):
        if value > best:
            best = value
            best_idx = idx
    return best_idx


training_rows = []
for row_idx in range(len(df.index)):
    # Drop the first two columns and the last one; what remains are scores.
    # (The original copied the whole DataFrame on every iteration to do this.)
    scores = list(df.iloc[row_idx])[2:-1]
    # Positions 60-66 and 71-77 are scored in groups of 7, the rest in
    # groups of 4.
    normal = scores[:60] + scores[67:71] + scores[78:]
    special = scores[60:67] + scores[71:78]

    picks = [
        _first_argmax_above_zero(normal[start:start + 4])
        for start in range(0, len(normal), 4)
    ]
    picks.extend(
        items_raros[_first_argmax_above_zero(special[start:start + 7])]
        for start in range(0, len(special), 7)
    )

    # NOTE(review): labels are taken from `items` in this order (all 4-wide
    # groups first, then the 7-wide ones) - confirm that this matches the
    # column order of the training dataset.
    comment_data = {items[pos]: pick for pos, pick in enumerate(picks)}
    comment_data['Clase'] = classes[row_idx]
    training_rows.append(comment_data)

# DataFrame.append was removed in pandas 2.0 and re-copied the whole frame on
# every iteration; collect the rows and concatenate once instead.
df_coseno = pd.concat(
    [df_coseno, pd.DataFrame(training_rows)], ignore_index=True)
df_coseno.to_csv('dataset_entrenamiento.csv', index=False, encoding="utf-8")
50 changes: 50 additions & 0 deletions BECK/models/SVM/test1.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
from sklearn.svm import SVC
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, f1_score, accuracy_score, precision_score, recall_score
def train_val_test_split(df, rstate=42, shuffle=True, stratify=None):
    """Split ``df`` into train, validation and test subsets.

    A quarter of the rows is held out first; that hold-out is then halved
    into the validation and test subsets.

    Args:
        df: DataFrame to split.
        rstate: Random state passed to both splits.
        shuffle: Whether to shuffle before each split.
        stratify: Column name(s) to stratify on, or a falsy value to disable
            stratification.

    Returns:
        Tuple ``(train_set, val_set, test_set)``.
    """
    def _labels(frame):
        # Stratification labels are re-read from whichever frame is being split.
        return frame[stratify] if stratify else None

    common = {'random_state': rstate, 'shuffle': shuffle}
    train_part, holdout = train_test_split(
        df, test_size=0.25, stratify=_labels(df), **common)
    val_part, test_part = train_test_split(
        holdout, test_size=0.5, stratify=_labels(holdout), **common)
    return (train_part, val_part, test_part)


def _print_metrics(title, y_true, y_pred):
    """Print the banner, classification report, confusion matrix and scores."""
    print(title.center(50, '#'))
    print(classification_report(y_true, y_pred))
    print(confusion_matrix(y_true, y_pred))
    print('F1_Score: ', f1_score(y_true, y_pred))
    print('Precision: ', precision_score(y_true, y_pred))
    print('Accuracy: ', accuracy_score(y_true, y_pred))
    print('Recall: ', recall_score(y_true, y_pred))


df = pd.read_csv('../../dataset_entrenamiento.csv')

# Stratified split on the label column. The original computed this split a
# second time further down with identical arguments and never used the
# result; that redundant call is removed.
train_set, val_set, test_set = train_val_test_split(
    df, stratify='Clase')
x_train = train_set.drop(labels='Clase', axis=1)
y_train = train_set['Clase']
x_test = test_set.drop(labels='Clase', axis=1)
y_test = test_set['Clase']
x_val = val_set.drop(labels='Clase', axis=1)
y_val = val_set['Clase']

# NOTE(review): per the scikit-learn SVC documentation `degree` and `coef0`
# are ignored by the linear kernel; they are kept so the estimator's
# parameters stay as the author wrote them.
svm_clf = SVC(kernel="linear", degree=3, coef0=10, C=20, probability=True)
svm_clf.fit(x_train, y_train)

# Test
y_pred = svm_clf.predict(x_test)
_print_metrics(' TEST ', y_test, y_pred)

# Validation
y_pred2 = svm_clf.predict(x_val)
_print_metrics(' VALIDATION ', y_val, y_pred2)

0 comments on commit ba7b103

Please sign in to comment.