Skip to content

Commit

Permalink
Feat: Added data
Browse files Browse the repository at this point in the history
  • Loading branch information
SebastianCB-dev committed Oct 11, 2022
1 parent db50e0c commit 828e43e
Show file tree
Hide file tree
Showing 5 changed files with 92 additions and 10 deletions.
16 changes: 15 additions & 1 deletion BECK/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,21 @@

# Preprocess (with spelling correction) the first comment; it is the sample
# compared against the BECK item phrases below.
comment_test = preprocesamiento.preprocesamiento_con_ortografia(
    comments_array[0])

# Collect the preprocessed token data of every answer option of one BECK item.
item_string = 'Pensamiento o deseos suicidas'
array_item = [
    option["data"]
    for option in beck_data_preprocessing[item_string].values()
]

print(comment_test)
for i, item in enumerate(array_item):
    coseno = w2v.get_cosine_distance(item, comment_test)
    # The original f-strings used JavaScript-style "${...}", which printed a
    # stray literal "$" in front of every value.
    print(f'{item_string} - Item BECK {i} distancia coseno: {coseno}')
    # NOTE(review): this call compares two hard-coded word lists, so it prints
    # the same value on every iteration even though the label says "Item BECK
    # {i}". It looks like leftover debugging; confirm whether it should be
    # get_euclidian_distance(item, comment_test) instead.
    euclidian = w2v.get_euclidian_distance(
        ["hoy", "me", "quiero", "morir"], ["no", "morir"])
    print(f'{item_string} - Item BECK {i} distancia euclidiana: {euclidian}')


result = w2v.getVectorBeck(comment_test, beck_data_preprocessing)
print(result)
56 changes: 56 additions & 0 deletions BECK/coseno_vs_euclidian.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
import pandas as pd
from preprocessing_service import Preprocesamiento
import json
import pprint
from model_word2vec_service import ModelWord2Vec
# Build a table with the cosine and Euclidean distance between every training
# comment and every BECK item answer, then write it to test.csv.
df = pd.read_excel(
    "./datasets/DATASET_ENTRENAMIENTO.xlsx", index_col=[1, 2]).reset_index()
comments = list(df["text"])
classes = list(df["class"])

preprocesamiento = Preprocesamiento()
w2v = ModelWord2Vec()

# Template CSV: its header defines the output columns. The first two columns
# are skipped here; presumably they are "Comentario" and
# "Comentario Preprocesado" -- verify against the CSV header.
df_cve = pd.read_csv('./coseno_vs_euclidian.csv')
columns = list(df_cve.columns)[2:]

# Load the preprocessed BECK items. The file is opened once, with an explicit
# encoding, and closed by the context manager (the original opened it twice
# and never closed either handle).
beck_data_preprocessing = {}
try:
    with open('./JSON/items_preprocessing.json', 'r',
              encoding='utf-8') as items_file:
        beck_data_preprocessing = json.load(items_file)
except (OSError, ValueError) as e:
    # OSError: missing/unreadable file. ValueError: invalid JSON or encoding.
    print(f'Error: {e}')

# Rows are collected in a list and concatenated once at the end.
# DataFrame.append was removed in pandas 2.0; because the call lived inside
# the try/except below, every comment was silently skipped on modern pandas.
rows = []
total_comments = len(comments)
for class_comment, comment in enumerate(comments):
    try:
        print(f'Comentario: {class_comment + 1}/{total_comments}')
        new_comment = {"Comentario": comment}
        comment_preprocesado = preprocesamiento.preprocesamiento_sin_ortografia(
            comment)
        new_comment["Comentario Preprocesado"] = comment_preprocesado
        w2v.add_corpus(comment_preprocesado)

        # Two consecutive columns per BECK answer: cosine, then Euclidean.
        contador = 0
        for item_results in beck_data_preprocessing.values():
            for answer in item_results.values():
                answer_data = answer["data"]
                new_comment[columns[contador]] = w2v.get_cosine_distance(
                    comment_preprocesado, answer_data)
                new_comment[columns[contador + 1]] = w2v.get_euclidian_distance(
                    comment_preprocesado, answer_data)
                contador += 2

        new_comment["Clase"] = classes[class_comment]
        # Only fully computed rows reach the output, as before.
        rows.append(new_comment)
    except Exception as e:
        # Best-effort batch job: skip the failing comment, but report why.
        print(f'Error en el comentario {class_comment} omitiendo...')
        print(f'Error: {e}')

if rows:
    df_cve = pd.concat([df_cve, pd.DataFrame(rows)], ignore_index=True)

df_cve.to_csv('test.csv', index=False, encoding="utf-8")
Binary file added BECK/datasets/DATASET_ENTRENAMIENTO.xlsx
Binary file not shown.
30 changes: 21 additions & 9 deletions BECK/model_word2vec_service.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from pprint import pprint
from gensim.models import Word2Vec
import numpy as np

Expand Down Expand Up @@ -42,22 +43,27 @@ def get_euclidian_distance(self, corpus_a, corpus_b):
:param corpus_b: El segundo corpus a comparar
:return: La distancia euclidiana entre dos vectores.
"""
if len(corpus_a) !=len(corpus_b) :
diferencia= -(len(corpus_a) -len(corpus_b))
if len(corpus_a) > len(corpus_b):
vector_corpus_a = self.get_word_vector(corpus_a)
vector_corpus_a = list(np.array(vector_corpus_a).tolist())
vector_corpus_b = self.get_word_vector(corpus_b)
vector_corpus_b = list(np.array(vector_corpus_b).tolist())
#Diferencia entre ambos
if len(vector_corpus_a) != len( vector_corpus_b):
diferencia = abs(len(vector_corpus_a) - len( vector_corpus_b))
if len(vector_corpus_a) > len( vector_corpus_b):
i = 0
while i < diferencia:
corpus_b.append(0)
vector_corpus_b.append(self.getVector250())
i += 1
return np.linalg.norm(self.model.wv[corpus_a] - self.model.wv[corpus_b])
if len(corpus_a) < len(corpus_b):
return np.linalg.norm(np.array(vector_corpus_a) - np.array(vector_corpus_b))
if len(vector_corpus_a) < len(vector_corpus_b):
i = 0
while i < diferencia:
corpus_a.append(0)
vector_corpus_a.append(self.getVector250())
i += 1
return np.linalg.norm(self.model.wv[corpus_a] - self.model.wv[corpus_b])
return np.linalg.norm(np.array(vector_corpus_a) - np.array(vector_corpus_b))
else:
return np.linalg.norm(self.model.wv[corpus_a] - self.model.wv[corpus_b])
return np.linalg.norm(np.array(vector_corpus_a) - np.array(vector_corpus_b))


def get_cosine_distance(self, corpus_a, corpus_b):
Expand Down Expand Up @@ -90,4 +96,10 @@ def getVectorBeck(self, commentVector, beck):
itemBeck = beck[item][result]
array.append(itemBeck['value'])
return array

def getVector250(self):
    """Return a list of 250 zeros (NumPy float scalars).

    Used by the Euclidean-distance code in this file as a padding entry
    when the two vector lists differ in length.
    """
    zero_vector = np.zeros(shape=250)
    return [component for component in zero_vector]




Binary file modified BECK/word2vec.model
Binary file not shown.

0 comments on commit 828e43e

Please sign in to comment.