diff --git a/BECK/app.py b/BECK/app.py
index 08920e0..ea4028b 100644
--- a/BECK/app.py
+++ b/BECK/app.py
@@ -28,7 +28,21 @@ comment_test = preprocesamiento.preprocesamiento_con_ortografia(
     comments_array[0])
 
 
-
+item_string = 'Pensamiento o deseos suicidas'
 # Get Vector Beck
+# Preprocessed data of every answer of the selected BECK item.
+array_item = [
+    answer["data"]
+    for answer in beck_data_preprocessing[item_string].values()
+]
+
+print(comment_test)
+for i, item in enumerate(array_item):
+    coseno = w2v.get_cosine_distance(item, comment_test)
+    print(f'{item_string} - Item BECK {i} distancia coseno: {coseno}')
+    euclidian = w2v.get_euclidian_distance(item, comment_test)
+    print(f'{item_string} - Item BECK {i} distancia euclidiana: {euclidian}')
+
+
 result = w2v.getVectorBeck(comment_test, beck_data_preprocessing)
 print(result)
diff --git a/BECK/coseno_vs_euclidian.py b/BECK/coseno_vs_euclidian.py
new file mode 100644
index 0000000..8143028
--- /dev/null
+++ b/BECK/coseno_vs_euclidian.py
@@ -0,0 +1,68 @@
+"""Build a CSV comparing cosine and Euclidean distances to the BECK items.
+
+For every comment read from DATASET_ENTRENAMIENTO.xlsx, the preprocessed
+comment is compared against the preprocessed data of every answer of every
+BECK item, and one row with all the distances plus the comment's class is
+written to ``test.csv``.
+"""
+import json
+import pprint
+
+import pandas as pd
+
+from model_word2vec_service import ModelWord2Vec
+from preprocessing_service import Preprocesamiento
+
+df = pd.read_excel(
+    "./datasets/DATASET_ENTRENAMIENTO.xlsx", index_col=[1, 2]).reset_index()
+comments = list(df["text"])
+classes = list(df["class"])
+
+preprocesamiento = Preprocesamiento()
+w2v = ModelWord2Vec()
+
+# Every column after the first two of the template CSV receives one distance,
+# in the order in which the distances are computed below.
+df_cve = pd.read_csv('./coseno_vs_euclidian.csv')
+columns = list(df_cve.columns)[2:]
+
+# Load the preprocessed BECK items; keep the empty dict if that fails.
+beck_data_preprocessing = {}
+try:
+    with open('./JSON/items_preprocessing.json', 'r', encoding='utf-8') as f:
+        beck_data_preprocessing = json.load(f)
+except (OSError, ValueError) as e:
+    # ValueError covers both invalid JSON and undecodable bytes.
+    print(f'Error: {e}')
+
+rows = []
+for class_comment, (comment, clase) in enumerate(zip(comments, classes)):
+    print(f'Comentario: {class_comment + 1}/{len(comments)}')
+    try:
+        comment_preprocesado = preprocesamiento.preprocesamiento_sin_ortografia(
+            comment)
+        new_comment = {
+            "Comentario": comment,
+            "Comentario Preprocesado": comment_preprocesado,
+        }
+        w2v.add_corpus(comment_preprocesado)
+        contador = 0
+        for item in beck_data_preprocessing.values():
+            for result in item.values():
+                data = result["data"]
+                new_comment[columns[contador]] = w2v.get_cosine_distance(
+                    comment_preprocesado, data)
+                new_comment[columns[contador + 1]] = (
+                    w2v.get_euclidian_distance(comment_preprocesado, data))
+                contador += 2
+        new_comment["Clase"] = clase
+    except Exception as e:
+        # Best effort: a comment that cannot be processed is reported and
+        # skipped instead of aborting the whole run.
+        print(f'Error en el comentario {class_comment} omitiendo... ({e})')
+        continue
+    rows.append(new_comment)
+
+# DataFrame.append was removed in pandas 2.0; add all rows in one concat.
+df_cve = pd.concat([df_cve, pd.DataFrame(rows)], ignore_index=True)
+df_cve.to_csv('test.csv', index=False, encoding="utf-8")
diff --git a/BECK/datasets/DATASET_ENTRENAMIENTO.xlsx b/BECK/datasets/DATASET_ENTRENAMIENTO.xlsx
new file mode 100644
index 0000000..119cf82
Binary files /dev/null and b/BECK/datasets/DATASET_ENTRENAMIENTO.xlsx differ
diff --git a/BECK/model_word2vec_service.py b/BECK/model_word2vec_service.py
index 701c2f2..98a6b2e 100644
--- a/BECK/model_word2vec_service.py
+++ b/BECK/model_word2vec_service.py
@@ -1,3 +1,4 @@
+from pprint import pprint
 from gensim.models import Word2Vec
 import numpy as np
 
@@ -42,22 +43,19 @@ def get_euclidian_distance(self, corpus_a, corpus_b):
         :param corpus_b: El segundo corpus a comparar
         :return: La distancia euclidiana entre dos vectores.
         """
-        if len(corpus_a) !=len(corpus_b) :
-            diferencia= -(len(corpus_a) -len(corpus_b))
-            if len(corpus_a) > len(corpus_b):
-                i = 0
-                while i < diferencia:
-                    corpus_b.append(0)
-                    i += 1
-                return np.linalg.norm(self.model.wv[corpus_a] - self.model.wv[corpus_b])
-            if len(corpus_a) < len(corpus_b):
-                i = 0
-                while i < diferencia:
-                    corpus_a.append(0)
-                    i += 1
-                return np.linalg.norm(self.model.wv[corpus_a] - self.model.wv[corpus_b])
-        else:
-            return np.linalg.norm(self.model.wv[corpus_a] - self.model.wv[corpus_b])
+        vector_corpus_a = np.array(self.get_word_vector(corpus_a)).tolist()
+        vector_corpus_b = np.array(self.get_word_vector(corpus_b)).tolist()
+        # Pad the shorter corpus with zero vectors so that both matrices
+        # have the same number of rows before they are subtracted.
+        diferencia = len(vector_corpus_a) - len(vector_corpus_b)
+        if diferencia > 0:
+            vector_corpus_b.extend(
+                self.getVector250() for _ in range(diferencia))
+        elif diferencia < 0:
+            vector_corpus_a.extend(
+                self.getVector250() for _ in range(-diferencia))
+        return np.linalg.norm(
+            np.array(vector_corpus_a) - np.array(vector_corpus_b))
 
 
     def get_cosine_distance(self, corpus_a, corpus_b):
@@ -90,4 +88,12 @@ def getVectorBeck(self, commentVector, beck):
                 itemBeck = beck[item][result]
                 array.append(itemBeck['value'])
         return array
+
+    def getVector250(self):
+        """Return a list of 250 zeros, used as a padding row.
+
+        NOTE(review): 250 is hard-coded; it presumably has to match the
+        length of the vectors returned by ``get_word_vector`` -- confirm.
+        """
+        return list(np.zeros(250))
     
\ No newline at end of file
diff --git a/BECK/word2vec.model b/BECK/word2vec.model
index a45b468..b2ffef3 100644
Binary files a/BECK/word2vec.model and b/BECK/word2vec.model differ