Feat: Added Model construction

SebastianCB-dev · Nov 3, 2022 · 86e75d0 · 86e75d0
1 parent 7aba0bc
commit 86e75d0
Show file tree

Hide file tree

Showing 2 changed files with 18 additions and 10 deletions.
diff --git a/BECK/app.py b/BECK/app.py
@@ -2,15 +2,23 @@
 import nltk
 nltk.download('punkt')
 from preprocessing_service import Preprocesamiento
+from gensim.models import Word2Vec
 
-df = pd.read_csv('./comentarios_español_depresivos.csv', encoding='utf-8')
+df_positivo = pd.read_csv('./comentarios_español_depresivos.csv', encoding='utf-8')
+df_negativo = pd.read_csv('./comentarios_español_no_depresivos.csv', encoding='utf-8')
 pp = Preprocesamiento()
+tokens = []
 
-comentarios = list(df['text'])
-clases = list(df['class'])
+comentarios_depresivos = list(df_positivo['text'])
+clases_depresivos = list(df_positivo['class'])
 
-print('-- Comentario antes --')
-print(comentarios[0])
-print('-- Comentario despúes --')
-comentario_preprocesado = pp.preprocesamiento_sin_ortografia(comentarios[0])
-print(comentario_preprocesado)
+
+
+# Training
+# vector_size = 200: each word embedding has 200 dimensions
+# window = 7: maximum distance between the current word and the neighbouring words used as its context
+# For example, take "stackoverflow great website for programmers", which has 5 words (suppose we keep the stop words "great" and "for" here). If the window size is 2, the vector of the word "stackoverflow" is directly affected by the words "great" and "website"; if the window size is 5, "stackoverflow" can also be directly affected by two more words, "for" and "programmers". "Affected" here means that training pulls the vectors of the two words closer together.
+
+
+model = Word2Vec(sentences=tokens, vector_size=200,
+                 window=7, workers=4, sg=1, epochs=20)
diff --git a/BECK/preprocessing_service.py b/BECK/preprocessing_service.py
@@ -49,7 +49,7 @@ def preprocesamiento_con_ortografia(self, texto):
     texto = self.correccion_ortografica(texto)
     texto = self.stop_words(texto)
     texto = self.lematizacion(texto)
-    texto = self.eliminar_duplicados(texto)
+    #texto = self.eliminar_duplicados(texto)
     return texto
 
 
@@ -67,7 +67,7 @@ def preprocesamiento_sin_ortografia(self, texto):
     texto = self.eliminacion_data_inutil(texto)
     texto = self.stop_words(texto)
     texto = self.lematizacion(texto)
-    texto = self.eliminar_duplicados(texto)
+    #texto = self.eliminar_duplicados(texto)
     return texto
 
   def eliminar_etiquetados(self, texto):