diff --git a/BECK/app.py b/BECK/app.py
index 51e9b7d..e219344 100644
--- a/BECK/app.py
+++ b/BECK/app.py
@@ -52,4 +52,4 @@
         class_comment += 1
         continue

-df_cve.to_csv('./datasets/coseno.csv', index=False, encoding="utf-8")
\ No newline at end of file
+df_cve.to_csv('./datasets/coseno.csv', index=False, encoding="utf-8")
diff --git a/BECK/helpers/change_class.py b/BECK/helpers/change_class.py
new file mode 100644
index 0000000..3411d08
--- /dev/null
+++ b/BECK/helpers/change_class.py
@@ -0,0 +1,15 @@
+import pandas as pd
+
+df_completo = pd.read_csv(
+    './comentarios_coseno_distancia.csv', encoding='utf-8', low_memory=False)
+df_comentarios = pd.read_csv(
+    './datasets/comentarios_full.csv', encoding='utf-8', low_memory=False)
+for i in range(0, len(df_completo['Comentario'].index)):
+    # for i in range(0, 3):
+    print(i)
+    clase = df_comentarios[df_comentarios['text'] ==
+                           df_completo.iloc[i]['Comentario']]['class']
+    df_completo.at[i, 'Clase'] = clase.iloc[0]
+
+df_completo.to_csv('./comentarios_coseno_distancia.csv',
+                   encoding='utf-8', index=False)
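Note on change_class.py: the loop above rescans comentarios_full.csv for every comment in order to copy its class label across. The same transfer can be written as a single vectorized lookup; the snippet below is only a sketch, uses the paths and column names from the diff, and assumes (as the loop already does) that every Comentario value appears verbatim in the text column. clase_por_texto is a made-up name.

import pandas as pd

# Sketch: same class transfer as change_class.py, via one text -> class lookup
# instead of a per-row scan. Duplicate texts keep their first class.
df_completo = pd.read_csv(
    './comentarios_coseno_distancia.csv', encoding='utf-8', low_memory=False)
df_comentarios = pd.read_csv(
    './datasets/comentarios_full.csv', encoding='utf-8', low_memory=False)

clase_por_texto = df_comentarios.drop_duplicates('text').set_index('text')['class']
df_completo['Clase'] = df_completo['Comentario'].map(clase_por_texto)

df_completo.to_csv('./comentarios_coseno_distancia.csv',
                   encoding='utf-8', index=False)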
diff --git a/BECK/helpers/get_item_dataset.py b/BECK/helpers/get_item_dataset.py
index af30c65..6f6d97c 100644
--- a/BECK/helpers/get_item_dataset.py
+++ b/BECK/helpers/get_item_dataset.py
@@ -1,70 +1,42 @@
-from cmath import inf
 import pandas as pd
-from pprint import pprint
-
-df = pd.read_csv('./datasets/coseno_vs_euclidian.csv')
-values_items_largos = [0, 1, 1, 2, 2, 3, 3]
-df_results = pd.read_csv('./results.csv')
+import numpy as np
+df = pd.read_csv('./comentarios_coseno_distancia', encoding='utf-8')
+df_coseno = pd.read_csv(
+    'drive/MyDrive/word_embedding_spanish/dataset_entrenamiento_coseno.csv', encoding='utf-8', delimiter=';')
+array = []
+items = list(df_coseno.columns)
 classes = list(df['Clase'])
-class_idx = 0
-columns = list(df_results.columns)[2:]
-for i in range(0, 7068):
-    column = 0
-    comment = list(df.iloc[i])
-    result = {}
-    result['Comentario'] = comment[0]
-    result['Comentario Preprocesado'] = comment[1]
-    rest_comment = comment[-29:-1]
-    comment = comment[2:-29]
-    for i in range(0, len(comment), 8):
-        data = comment[i:(i+8)]
-        # Euclidian
-        menor_euclidian = inf
-        menor_euclidian_idx = 0
-        #Coseno
-        menor_coseno = inf
-        menor_coseno_idx = 0
-        # Coseno
-        for idx, i2 in enumerate(range(0, len(data), 2)):
-            if data[i2] < menor_coseno:
-                menor_coseno = data[i2]
-                menor_coseno_idx = idx
-        # Euclidian
-        for idx, i3 in enumerate(range(1, len(data), 2)):
-            if data[i3] < menor_euclidian:
-                menor_euclidian = data[i3]
-                menor_euclidian_idx = idx
-        result[columns[column]] = menor_coseno_idx
-        column += 1
-        result[columns[column]] = menor_euclidian_idx
-        column += 1
-    # Anormal items
-    for i in range(0, len(rest_comment), 14):
-        data = rest_comment[i:(i+14)]
-        # Euclidian
-        menor_euclidian = inf
-        menor_euclidian_idx = 0
-        #Coseno
-        menor_coseno = inf
-        menor_coseno_idx = 0
-        # Coseno
-        for idx, i2 in enumerate(range(0, len(data), 2)):
-            if data[i2] < menor_coseno:
-                menor_coseno = data[i2]
-                menor_coseno_idx = idx
-        # Euclidian
-        for idx, i3 in enumerate(range(1, len(data), 2)):
-            if data[i3] < menor_euclidian:
-                menor_euclidian = data[i3]
-                menor_euclidian_idx = idx
-        result[columns[column]] = values_items_largos[menor_coseno_idx]
-        column += 1
-        result[columns[column]] = values_items_largos[menor_euclidian_idx]
-        column += 1
-
-    result['Clase'] = classes[class_idx]
-    class_idx += 1
-    df_results = df_results.append(result, ignore_index=True)
-
-
-df_results.to_csv('results_coseno_euclidian.csv', index=False)
\ No newline at end of file
+items_raros = [0, 1, 1, 2, 2, 3, 3]
+comment_count = 0
+for i in range(0, len(df.index)):
+    comment = list(df.copy().iloc[i])[2:][:-1]
+    normal = list(
+        np.array(comment.copy()[:60] + comment.copy()[67:71] + comment.copy()[78:]))
+    special = list(np.array(comment.copy()[60:67] + comment.copy()[71:78]))
+    comment_data = {}
+    item_count = 0
+    for index in range(0, len(normal), 4):
+        item = normal[index: index + 4]
+        mayor = 0
+        mayor_idx = 0
+        # print(item)
+        for index, result in enumerate(item):
+            if result > mayor:
+                mayor = result
+                mayor_idx = index
+        comment_data[items[item_count]] = mayor_idx
+        item_count += 1
+    for index2 in range(0, len(special), 7):
+        item2 = special[index2: index2 + 7]
+        mayor2 = 0
+        mayor_idx2 = 0
+        for index2, result2 in enumerate(item2):
+            if result2 > mayor2:
+                mayor2 = result2
+                mayor_idx2 = index2
+        comment_data[items[item_count]] = items_raros[mayor_idx2]
+        item_count += 1
+    comment_data['Clase'] = classes[comment_count]
+    comment_count += 1
+    df_coseno = df_coseno.append(comment_data, ignore_index=True)
+df_coseno.to_csv('dataset_entrenamiento.csv', index=False, encoding="utf-8")
diff --git a/BECK/models/SVM/test1.py b/BECK/models/SVM/test1.py
new file mode 100644
index 0000000..219ad47
--- /dev/null
+++ b/BECK/models/SVM/test1.py
@@ -0,0 +1,50 @@
+from sklearn.svm import SVC
+import pandas as pd
+from sklearn.model_selection import train_test_split
+from sklearn.metrics import classification_report, confusion_matrix, f1_score, accuracy_score, precision_score, recall_score
+def train_val_test_split(df, rstate=42, shuffle=True, stratify=None):
+    strat = df[stratify] if stratify else None
+    train_set, test_set = train_test_split(
+        df, test_size=0.25, random_state=rstate, shuffle=shuffle, stratify=strat)
+    strat = test_set[stratify] if stratify else None
+    val_set, test_set = train_test_split(
+        test_set, test_size=0.5, random_state=rstate, shuffle=shuffle, stratify=strat)
+    return (train_set, val_set, test_set)
+
+
+df = pd.read_csv('../../dataset_entrenamiento.csv')
+train_set, val_set, test_set = train_val_test_split(
+    df, stratify='Clase')
+x_train = train_set.drop(labels='Clase', axis=1)
+y_train = train_set['Clase']
+x_test = test_set.drop(labels='Clase', axis=1)
+y_test = test_set['Clase']
+x_val = val_set.drop(labels='Clase', axis=1)
+y_val = val_set['Clase']
+
+
+train_set, val_set, test_set = train_val_test_split(
+    df, stratify='Clase')
+
+svm_clf = SVC(kernel="linear", degree=3, coef0=10, C=20, probability=True)
+svm_clf.fit(x_train, y_train)
+
+#Test
+print(' TEST '.center(50, '#'))
+y_pred = svm_clf.predict(x_test)
+print(classification_report(y_test, y_pred))
+print(confusion_matrix(y_test, y_pred))
+print('F1_Score: ', f1_score(y_test, y_pred))
+print('Precision: ', precision_score(y_test, y_pred))
+print('Accuracy: ', accuracy_score(y_test, y_pred))
+print('Recall: ', recall_score(y_test, y_pred))
+
+#Validation
+print(' VALIDATION '.center(50, '#'))
+y_pred2 = svm_clf.predict(x_val)
+print(classification_report(y_val, y_pred2))
+print(confusion_matrix(y_val, y_pred2))
+print('F1_Score: ', f1_score(y_val, y_pred2))
+print('Precision: ', precision_score(y_val, y_pred2))
+print('Accuracy: ', accuracy_score(y_val, y_pred2))
+print('Recall: ', recall_score(y_val, y_pred2))
\ No newline at end of file
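Two follow-up notes on the new scripts, given as sketches rather than drop-in patches.

In get_item_dataset.py, the read_csv path './comentarios_coseno_distancia' has no extension while change_class.py reads and writes './comentarios_coseno_distancia.csv', so the path is worth double-checking. The script also grows df_coseno with DataFrame.append inside the loop; that method was removed in pandas 2.0, so on current pandas the per-comment dicts need to be collected and concatenated once. write_training_set and rows are made-up names for the pattern:

import pandas as pd


def write_training_set(df_coseno, rows, path='dataset_entrenamiento.csv'):
    # rows would be filled inside the existing loop with rows.append(comment_data)
    # wherever df_coseno = df_coseno.append(comment_data, ...) is called now.
    out = pd.concat([df_coseno, pd.DataFrame(rows)], ignore_index=True)
    out.to_csv(path, index=False, encoding='utf-8')
    return out

In test1.py, the second train_val_test_split call returns splits that are never used (the x_/y_ frames are already built from the first call), and the bare f1_score, precision_score and recall_score calls default to average='binary' with pos_label=1, so they only work if 'Clase' is binary with 1 as one of its labels; for any other label set they raise and an explicit average= is needed. A sketch of the evaluation with macro averaging (macro is an assumption; 'weighted' or 'micro' plug in the same way), where report is a made-up helper:

from sklearn.metrics import (accuracy_score, classification_report,
                             confusion_matrix, f1_score, precision_score,
                             recall_score)


def report(tag, y_true, y_pred):
    # Prints the same block as test1.py, but with an explicit averaging mode
    # so the scores also work when 'Clase' has more than two labels.
    print((' ' + tag + ' ').center(50, '#'))
    print(classification_report(y_true, y_pred))
    print(confusion_matrix(y_true, y_pred))
    print('F1_Score: ', f1_score(y_true, y_pred, average='macro'))
    print('Precision: ', precision_score(y_true, y_pred, average='macro', zero_division=0))
    print('Accuracy: ', accuracy_score(y_true, y_pred))
    print('Recall: ', recall_score(y_true, y_pred, average='macro'))

Usage would then be report('TEST', y_test, svm_clf.predict(x_test)) and report('VALIDATION', y_val, svm_clf.predict(x_val)).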