-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
86e75d0
commit 0394d6d
Showing
3 changed files
with
44 additions
and
24 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,24 +0,0 @@ | ||
import pandas as pd | ||
import nltk | ||
nltk.download('punkt') | ||
from preprocessing_service import Preprocesamiento | ||
from gensim.models import Word2Vec | ||
|
||
df_positivo = pd.read_csv('./comentarios_español_depresivos.csv', encoding='utf-8') | ||
df_negativo = pd.read_csv('./comentarios_español_no_depresivos.csv', encoding='utf-8') | ||
pp = Preprocesamiento() | ||
tokens = [] | ||
|
||
comentarios_depresivos = list(df_positivo['text']) | ||
clases_depresivos = list(df_positivo['class']) | ||
|
||
|
||
|
||
# Entrenamiento | ||
# vector size = 200 dimensiones | ||
# window = Ventana referente a las palabras siguientes | ||
# Example: take the sentence "stackoverflow great website for programmers" (5 words, assuming the stop words "great" and "for" are kept). With a window size of 2, the vector for "stackoverflow" is directly affected only by "great" and "website"; with a window size of 5 it is also directly affected by "for" and "programmers". "Affected" here means that training pulls the vectors of the two words closer together. | ||
|
||
|
||
model = Word2Vec(sentences=tokens, vector_size=200, | ||
window=7, workers=4, sg=1, epochs=20) | ||
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,44 @@ | ||
import pandas as pd | ||
import nltk | ||
nltk.download('punkt') | ||
from preprocessing_service import Preprocesamiento | ||
from gensim.models import Word2Vec | ||
|
||
# Load the two labelled corpora (depressive / non-depressive comments) from
# CSV files located next to the script.
df_positivo, df_negativo = (
    pd.read_csv(ruta, encoding='utf-8')
    for ruta in (
        './comentarios_español_depresivos.csv',
        './comentarios_español_no_depresivos.csv',
    )
)

# Preprocessing service applied to every comment, and the accumulator that
# will hold one preprocessed entry per successfully processed comment.
pp = Preprocesamiento()
tokens = []

# Only the 'text' column of each corpus is used from here on.
comentarios_depresivos, comentarios_no_depresivos = (
    list(df['text']) for df in (df_positivo, df_negativo)
)
|
||
# Preprocess every comment (depressive ones first, then non-depressive ones)
# and collect the results in `tokens`. A comment whose preprocessing raises is
# reported and skipped so that a single bad row does not abort the whole run.
#
# The total is loop-invariant, so it is computed once up front.
total = len(comentarios_depresivos) + len(comentarios_no_depresivos)

# 1-based progress counter shared by both corpora.
count = 1
for comentario in comentarios_depresivos + comentarios_no_depresivos:
    print(f'Preprocesando comentario: {count}/{total}')
    try:
        comentario_preprocesado = pp.preprocesamiento_sin_ortografia(comentario)
    except Exception as e:
        # Best-effort by design: report the failure, including its cause,
        # and move on to the next comment.
        print(f'Error preprocesando el comentario {count}/{total}: {e!r}')
    else:
        tokens.append(comentario_preprocesado)
    # Advance on success and on failure alike. The original second loop only
    # advanced on failure, which froze the progress display.
    count += 1
|
||
# Word2Vec hyperparameters (meanings per the gensim documentation).
hiperparametros = {
    'vector_size': 200,  # dimensionality of the word vectors
    'window': 7,         # max distance between current and predicted word
    'workers': 4,        # worker threads used for training
    'sg': 1,             # 1 selects skip-gram (0 would be CBOW)
    'epochs': 20,        # passes over the corpus
}

# Train on the preprocessed comments collected in `tokens`, then persist the
# model to the working directory.
model = Word2Vec(sentences=tokens, **hiperparametros)

model.save('depresion.model')