Skip to content

Commit

Permalink
Feat: Add functions to fill in the BECK (BDI-II) depression inventory
Browse files Browse the repository at this point in the history
  • Loading branch information
SebastianCB-dev committed Dec 4, 2022
1 parent ba7b103 commit 10fdb7e
Show file tree
Hide file tree
Showing 4 changed files with 95 additions and 69 deletions.
62 changes: 12 additions & 50 deletions BECK/app.py
Original file line number Diff line number Diff line change
@@ -1,55 +1,17 @@
"""Fill the cosine-similarity dataset and run the BECK (BDI-II) pipeline.

NOTE(review): this script appears to contain merged old/new diff residue —
two `Preprocesamiento` instances are created (`preprocesamiento` and `pp`)
and `comentario` is assigned twice; confirm which paths are still live.

Flow:
  1. read raw comments and class labels from CSV,
  2. preprocess each comment and append its cosine similarity against every
     BECK answer as a new row of ./datasets/coseno.csv,
  3. run one sample comment through the full BECK pipeline and print the
     chosen inventory answers.
"""
import pandas as pd
from preprocessing_service import Preprocesamiento
import json
import pprint
from model_word2vec_service import ModelWord2Vec
import nltk
nltk.download('punkt')  # tokenizer data required by the preprocessing step

# Raw comments plus their class labels.
df = pd.read_csv("./datasets/comentarios_full.csv", encoding='utf-8')
comments = list(df["text"])
classes = list(df["class"])

# df_prueba_chelsea = df_prueba_chelsea.append({'Nombre': 'Vaughn', 'Numero de camiseta':'33'}, ignore_index=True)
preprocesamiento = Preprocesamiento()

pp = Preprocesamiento()
w2v = ModelWord2Vec()
# Existing similarity dataset; new rows are appended below and written back.
df_cve = pd.read_csv('./datasets/coseno.csv', encoding='utf-8')
columns = list(df_cve.columns)[2:]  # skip the two comment-text columns
inicio = 6000  # slice of `comments` processed in this run
fin = 8000
class_comment = 0  # index into `classes`, advanced once per comment
# Load the preprocessed BECK inventory items.
beck_data_preprocessing = {}
try:
    # NOTE(review): the file is opened twice and neither handle is closed;
    # a single `with open(...)` would be safer.
    if open('./JSON/items_preprocessing.json', 'r'):
        beck_data_preprocessing = json.loads(
            open('./JSON/items_preprocessing.json', 'r', encoding='utf-8').read())
except Exception as e:
    print(f'Error: {e}')

for comment in comments[inicio:fin]:
    try:
        print(f'Comentario: {class_comment + 1}/{len(comments)}')
        new_comment = {}
        contador = 0  # walks `columns` in step with the BECK answers
        new_comment["Comentario"] = comment
        comment_preprocesado = preprocesamiento.preprocesamiento_con_ortografia(
            comment)
        new_comment["Comentario Preprocesado"] = comment_preprocesado
        w2v.add_corpus(comment_preprocesado)
        # One similarity column per BECK answer, in JSON iteration order.
        for item in beck_data_preprocessing.keys():
            for result in beck_data_preprocessing[item].keys():
                new_comment[columns[contador]] = w2v.get_cosine_similarity(comment_preprocesado, beck_data_preprocessing[item][result]["data"])
                contador += 1

        # Add to dataframe
        new_comment["Clase"] = classes[class_comment]
        df_cve = df_cve.append(new_comment, ignore_index=True)
        class_comment += 1
    except Exception as e:
        # Best-effort: skip a failing comment but keep the class index in sync.
        print(f'Error en el comentario {class_comment} omitiendo...')
        print(e)
        class_comment += 1
        continue

df_cve.to_csv('./datasets/coseno.csv', index=False, encoding="utf-8")
comentario = "Estoy muy triste y no se que hacer"
# A longer sad sample comment (overwrites the assignment above).
comentario = "Estoy muy triste y no se que hacer es un día nublado y esta lloviendo"
# Preprocess the sample comment.
comentario_procesado = pp.preprocesamiento_con_ortografia(comentario)
# Cosine similarity between the comment and every answer of the
# Beck Depression Inventory (BDI-II).
cosine_similarity_beck = w2v.get_cosine_similarity_BECK(comentario_procesado)
# Pick the best-matching answer per inventory item from those similarities.
results_beck = w2v.get_result_beck(cosine_similarity_beck)
print("El comentario lleno el inventario BECK de esta manera:", results_beck)
88 changes: 70 additions & 18 deletions BECK/model_word2vec_service.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from pprint import pprint
from gensim.models import Word2Vec
import numpy as np

import json
class ModelWord2Vec:

def __init__(self):
Expand All @@ -10,6 +10,15 @@ def __init__(self):
"""
self.model = Word2Vec.load('./depresion.model')

def get_beck(self):
    """Load the preprocessed BECK (BDI-II) inventory items from JSON.

    :return: dict of items -> answers as stored in
        ./JSON/items_preprocessing.json; an empty dict if the file is
        missing or unreadable (errors are printed, never raised, so
        callers always receive a dict).
    """
    beck_data_preprocessing = {}
    try:
        # `with` closes the handle; the original opened the file twice
        # and never closed either handle.
        with open('./JSON/items_preprocessing.json', 'r', encoding='utf-8') as fh:
            beck_data_preprocessing = json.load(fh)
    except Exception as e:  # best-effort by design: report and fall through
        print(f'Error: {e}')
    return beck_data_preprocessing
def get_model(self):
"""
me devuelve el modelo
Expand Down Expand Up @@ -44,30 +53,73 @@ def get_cosine_similarity(self, corpus_a, corpus_b):
"""
return self.model.wv.n_similarity(corpus_a, corpus_b)

def get_word_vectors(self, corpus):
    """Map each word in *corpus* to its embedding vector.

    :param corpus: list of tokens (strings)
    :return: list of vectors, one per token; words missing from the
        model's vocabulary fall back to a 250-dimensional zero vector
    """
    array_result = []
    # TODO: on a vocabulary miss, add the word to the vocabulary and
    # retry instead of silently substituting a zero vector.
    for word in corpus:
        try:
            array_result.append(self.model.wv[word])
        except KeyError:  # out-of-vocabulary word
            array_result.append(self.getVector250())
    return array_result

def get_cosine_similarity_BECK(self, corpus):
    """Cosine similarity of *corpus* against every BECK (BDI-II) answer.

    :param corpus: preprocessed token list for one comment
    :return: flat list of similarities, ordered item-by-item and then
        answer-by-answer exactly as stored in the BECK JSON
    """
    beck = self.get_beck()
    data = []
    for item in beck.keys():
        for result in beck[item].keys():
            similarity = self.get_cosine_similarity(corpus, beck[item][result]["data"])
            data.append(similarity)
    return data

def get_result_beck(self, cosine_similarities):
    """Pick one answer per BECK (BDI-II) item from the flat similarity list.

    The 90 similarities split into five sections: items 1-15 (4 answers
    each), item 16 (7 answers), item 17 (4), item 18 (7) and items 19-21
    (4 each).  Each section is resolved by the matching picker and the
    per-item choices are returned as one flat list.
    """
    sections = [
        (cosine_similarities[:60], self.getMaxBeck4Items),    # items 1-15
        (cosine_similarities[60:67], self.getMaxBeck7Items),  # item 16
        (cosine_similarities[67:71], self.getMaxBeck4Items),  # item 17
        (cosine_similarities[71:78], self.getMaxBeck7Items),  # item 18
        (cosine_similarities[78:], self.getMaxBeck4Items),    # items 19-21
    ]
    flat_results = []
    for chunk, picker in sections:
        flat_results.extend(picker(chunk))
    return flat_results

def getMaxBeck4Items(self, array):
    """Choose the best answer index for each 4-answer BECK item.

    :param array: flat similarity scores in consecutive groups of 4
    :return: one answer index (0-3) per group; strict '>' against an
        initial 0 means ties keep the earliest answer and a group of
        all non-positive scores yields 0
    """
    results = []
    # The original shadowed the outer loop variable `index` inside the
    # inner loop; distinct names make the intent explicit.
    for start in range(0, len(array), 4):
        group = array[start:start + 4]
        best_score = 0
        best_answer = 0
        for answer, score in enumerate(group):
            if score > best_score:
                best_score = score
                best_answer = answer
        results.append(best_answer)
    return results

def getMaxBeck7Items(self, array):
    """Choose the best answer for each 7-answer BECK item.

    Seven candidate answers collapse onto the BDI-II score scale via
    the fixed mapping [0, 1, 1, 2, 2, 3, 3] (answer position -> score).

    :param array: flat similarity scores in consecutive groups of 7
    :return: one mapped score (0-3) per group; strict '>' against an
        initial 0 keeps the earliest answer on ties, and a group of all
        non-positive scores yields 0
    """
    score_map = [0, 1, 1, 2, 2, 3, 3]
    results = []
    # The original shadowed the outer loop variable `index` inside the
    # inner loop; distinct names make the intent explicit.
    for start in range(0, len(array), 7):
        group = array[start:start + 7]
        best_similarity = 0
        best_score = 0
        for answer, similarity in enumerate(group):
            if similarity > best_similarity:
                best_similarity = similarity
                best_score = score_map[answer]
        results.append(best_score)
    return results

def getVector250(self, size=250):
    """Return a zero vector used as the out-of-vocabulary fallback.

    :param size: vector dimensionality; defaults to 250, the model's
        embedding size, so existing callers are unaffected
    :return: list of `size` zeros
    """
    return list(np.zeros(size))




6 changes: 5 additions & 1 deletion BECK/preprocessing_service.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
import emoji
import re
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize
import spacy
import stanza
Expand Down Expand Up @@ -47,8 +49,10 @@ def preprocesamiento_con_ortografia(self, texto):
texto = self.eliminacion_data_inutil(texto)
texto = self.correccion_ortografica(texto)
texto = self.normalizar(texto)
texto = self.stop_words(texto)
texto = texto.split(" ")
texto = self.lematizacion(texto)
texto = " ".join(texto)
texto = self.stop_words(texto)
texto = self.eliminar_duplicados(texto)
return texto

Expand Down
8 changes: 8 additions & 0 deletions BECK/test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
import numpy as np

# Scratch check of slicing a 21-element sequence (one entry per BECK item).
# NOTE: the slice bounds here differ from the 90-similarity split used by
# get_result_beck; this file only exercises the slicing syntax itself.
array = list(np.arange(1, 22))  # values 1..21
primera_parte = array[:15]      # elements 1-15
segunda_parte = array[15:16]    # element 16
tercera_parte = array[16:17]    # element 17
cuarta_parte = array[17:18]     # element 18
quinta_parte = array[18:]       # elements 19-21

0 comments on commit 10fdb7e

Please sign in to comment.