Merge pull request #16 from KEKDATA/feature/Auth-profile
Feature/auth profile
TchernyavskyDaniil authored Jun 2, 2020
2 parents 62bdd3d + 594f670 commit c24bb4e
Showing 53 changed files with 4,857 additions and 1,535 deletions.
3 changes: 3 additions & 0 deletions README.md
@@ -3,6 +3,9 @@
1) frontend - Directory for the web client
2) backend - Directory for the console application (or not strictly console, since it can also be run via Electron) that currently parses Twitter, plus the WebSocket API

TODO:
1) Error handlers

## SUPER KEKSPONSOR

<table>
19 changes: 19 additions & 0 deletions backend/README.md
@@ -1,2 +1,21 @@
Process diagram for the API and parsing
<img src="https://d.radikal.ru/d32/2005/e4/57263917df9b.jpg" />

Download Node.js: https://nodejs.org/en/download/
Download Python 3: https://www.python.org/downloads/

Open the backend directory and run:

```bash
$ npm i
```
```bash
$ pip install dostoevsky
```
```bash
$ python -m dostoevsky download fasttext-social-network-model
```
```bash
$ npm run start
```
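
To sanity-check the Python side before starting the server, here is a minimal sketch (a hypothetical helper, not part of the repo; it assumes `python3` is on your PATH and the fasttext model was downloaded above):

```ts
// verify_dostoevsky.ts (illustrative only, not part of the repo)
import { spawnSync } from 'child_process';

// Try to import dostoevsky from the same interpreter the backend will spawn.
const probe = spawnSync('python3', ['-c', "import dostoevsky; print('ok')"]);

console.log(
  probe.status === 0
    ? 'dostoevsky is importable'
    : `dostoevsky missing: ${probe.stderr.toString()}`,
);
```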

2,306 changes: 1,809 additions & 497 deletions backend/package-lock.json

Large diffs are not rendered by default.

28 changes: 14 additions & 14 deletions backend/package.json
@@ -21,33 +21,33 @@
"dependencies": {
"bull": "^3.14.0",
"cheerio": "^1.0.0-rc.3",
"effector": "^20.15.6",
"effector": "^20.15.8",
"env-cmd": "^10.1.0",
"foreman": "^3.0.1",
"nanoid": "^3.1.9",
"natural": "^2.1.5",
"playwright": "^1.0.2",
"ramda": "^0.27.0",
"redis": "^3.0.2",
"sequelize": "^5.21.10",
"sequelize": "^5.21.11",
"spelling-corrector": "^3.0.0",
"sqlite3": "^4.2.0",
"stopword": "^1.0.1",
"ws": "^7.3.0"
},
"devDependencies": {
"@babel/cli": "^7.8.4",
"@babel/core": "^7.9.6",
"@babel/node": "^7.8.7",
"@babel/plugin-transform-typescript": "^7.9.6",
"@babel/preset-env": "^7.9.6",
"@babel/preset-typescript": "^7.9.0",
"@babel/cli": "^7.10.1",
"@babel/core": "^7.10.2",
"@babel/node": "^7.10.1",
"@babel/plugin-transform-typescript": "^7.10.1",
"@babel/preset-env": "^7.10.2",
"@babel/preset-typescript": "^7.10.1",
"@types/bull": "^3.13.0",
"@types/node": "^14.0.5",
"@types/redis": "^2.8.21",
"@types/node": "^14.0.6",
"@types/redis": "^2.8.22",
"@types/sequelize": "^4.28.9",
"@types/ws": "^7.2.4",
"@typescript-eslint/parser": "^3.0.0",
"@typescript-eslint/parser": "^3.1.0",
"babel-eslint": "^10.1.0",
"eslint": "^7.1.0",
"eslint-config-prettier": "^6.11.0",
@@ -56,14 +56,14 @@
"eslint-plugin-node": "^11.1.0",
"eslint-plugin-prettier": "^3.1.3",
"husky": "^4.2.5",
"lint-staged": "^10.2.6",
"lint-staged": "^10.2.7",
"prettier": "1.19.1",
"ts-node": "^8.10.1",
"ts-node": "^8.10.2",
"typescript": "^3.9.3",
"@types/cheerio": "^0.22.18",
"@types/natural": "^0.6.3",
"@types/ramda": "^0.27.6",
"electron": "^9.0.0"
"electron": "^9.0.1"
},
"husky": {
"hooks": {
2 changes: 2 additions & 0 deletions backend/src/constants/language.ts
@@ -0,0 +1,2 @@
export const RU = 'ru';
export const ENG = 'eng';
33 changes: 31 additions & 2 deletions backend/src/lib/normalizers/alphabet.ts
@@ -1,9 +1,38 @@
import { aposToLexForm } from '../lex_form_convert/apos_to_lex_form';
+import { ENG, RU } from '../../constants/language';

export const getTextWithAlphaOnly = (text: string) => {
const textLexicalForm = aposToLexForm(text);
const casedText = textLexicalForm.toLowerCase();
-const textWithAlphaOnly = casedText.replace(/[^a-zA-Z\s]+/g, '');
+const withoutLinks = casedText.replace(/(https?:\/\/[^\s]+)/g, '');
+const russianText = withoutLinks.replace(/[^а-яА-Я0-9\s]+/g, '');
+const englishText = withoutLinks.replace(/[^a-zA-Z0-9\s]+/g, '');

-return textWithAlphaOnly;
+let language = '';
+
+if (russianText.length > englishText.length) {
+language = RU;
+} else {
+language = ENG;
+}
+
+let normalizedText = '';
+
+switch (language) {
+case RU: {
+normalizedText = russianText;
+break;
+}
+
+case ENG: {
+normalizedText = englishText;
+break;
+}
+
+default: {
+break;
+}
+}
+
+return { text: normalizedText, language };
};
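
For reference, a minimal usage sketch of the new return shape (import path and sample string are illustrative):

```ts
// Hypothetical consumer of the updated normalizer.
import { getTextWithAlphaOnly } from './lib/normalizers/alphabet';

const { text, language } = getTextWithAlphaOnly('Привет, мир! https://t.co/abc');

// More characters survive the Cyrillic filter than the Latin one,
// so language === 'ru'; `text` is lowercased with the link stripped.
console.log(language, text); // ru привет мир
```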
19 changes: 19 additions & 0 deletions backend/src/lib/ru_social_sentiment/README.md
@@ -0,0 +1,19 @@
```
@inproceedings{rogers-etal-2018-rusentiment,
title = "{R}u{S}entiment: An Enriched Sentiment Analysis Dataset for Social Media in {R}ussian",
author = "Rogers, Anna and
Romanov, Alexey and
Rumshisky, Anna and
Volkova, Svitlana and
Gronas, Mikhail and
Gribov, Alex",
booktitle = "Proceedings of the 27th International Conference on Computational Linguistics",
month = aug,
year = "2018",
address = "Santa Fe, New Mexico, USA",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/C18-1064",
pages = "755--763",
}
```
82 changes: 82 additions & 0 deletions backend/src/lib/ru_social_sentiment/index.ts
@@ -0,0 +1,82 @@
import { spawn } from 'child_process';

export const getRuSentiment = async (
data: Array<{ text: string; textIndex: number }>,
) => {
// The library does not accept empty values and crashes on them
const notEmptyData = data
.map(({ text }) => text)
.filter(text => {
const withoutSpaces = text.replace(/\s/g, '');

return withoutSpaces.length > 0;
});

let indexOfSentiment = 0;

const pythonProcess = spawn('python3', [
'src/lib/ru_social_sentiment/sentiment_coefficient.py',
JSON.stringify(notEmptyData),
]);

const getSentiments = () => {
return new Promise<{
dataWithSentiments: { [key: string]: number };
countOfSentimentCoefficients: number;
}>((resolve, reject) => {
pythonProcess.stdout.on('data', (sentiments: ArrayBuffer) => {
const result = sentiments.toString();

const normalizedSentiments: { [key: string]: number } = {};

let countOfSentimentCoefficients = 0;

if (result !== undefined && result.length > 0) {
const parsedResult = JSON.parse(result.replace(/'/g, '"'));

data.forEach(({ text, textIndex }) => {
const withoutSpaces = text.replace(/\s/g, '');
let coefficient = 0;

if (withoutSpaces.length > 0) {
const sentiment: { [key: string]: number } =
parsedResult[indexOfSentiment];

const sentimentKeys = Object.keys(sentiment);
const isNegative = sentimentKeys.includes('negative');
const isPositive = sentimentKeys.includes('positive');

if (isNegative) {
coefficient = -1 * sentiment.negative;
} else if (isPositive) {
coefficient = sentiment.positive;
} else {
const [_, valueSentiment] = Object.entries(sentiment)[0];

coefficient = valueSentiment;
}

indexOfSentiment++;

countOfSentimentCoefficients =
countOfSentimentCoefficients + coefficient;
}

normalizedSentiments[textIndex] = coefficient;
});
}

resolve({
dataWithSentiments: normalizedSentiments,
countOfSentimentCoefficients,
});

pythonProcess.kill('SIGTERM');
});
});
};

const result = await getSentiments();

return result;
};
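
A usage sketch for the new helper (tweet texts invented; the import path follows the diff above but is illustrative):

```ts
import { getRuSentiment } from './src/lib/ru_social_sentiment';

const tweets = [
  { text: 'Отличный день!', textIndex: 0 },
  { text: '   ', textIndex: 1 }, // whitespace-only: filtered out, coefficient stays 0
];

getRuSentiment(tweets).then(({ dataWithSentiments, countOfSentimentCoefficients }) => {
  // dataWithSentiments maps textIndex -> signed coefficient
  // (negative probabilities are negated, positive kept as-is).
  console.log(dataWithSentiments, countOfSentimentCoefficients);
});
```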
21 changes: 21 additions & 0 deletions backend/src/lib/ru_social_sentiment/sentiment_coefficient.py
@@ -0,0 +1,21 @@
# This library ships a proprietary dataset tuned to social-network sentiment, which has shown decent results.
# Hence this Python 3 script was dropped in.
# "Quick-and-dirty hack"

from dostoevsky.tokenization import RegexTokenizer
from dostoevsky.models import FastTextSocialNetworkModel

import sys
import json

tokenizer = RegexTokenizer()

model = FastTextSocialNetworkModel(tokenizer=tokenizer)

texts = json.loads(sys.argv[1])

results = model.predict(texts, k=2)

print(results)

sys.stdout.flush()
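
For orientation, the TypeScript consumer above expects `print(results)` to emit a Python list of label-to-score dicts, which `index.ts` rewrites to JSON by swapping the single quotes before `JSON.parse`. A sketch of the parsed shape (scores invented; `negative`, `positive`, `neutral`, `skip`, and `speech` are dostoevsky's labels):

```ts
// Hypothetical shape of parsedResult in index.ts after JSON.parse.
type SentimentPrediction = { [label: string]: number };

const parsedResult: SentimentPrediction[] = [
  { positive: 0.81, neutral: 0.12 },
  { negative: 0.64, speech: 0.2 },
];
```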
@@ -7,24 +7,25 @@ import {
//@ts-ignore
import stopword from 'stopword';

-export const getTextWithSentimentAnalysis = (data: Array<string>) => {
+export const getTextWithSentimentAnalysis = (
+data: Array<{ text: string; textIndex: number }>,
+) => {
const tokenizer = new WordTokenizer();
const analyzer = new SentimentAnalyzer('English', PorterStemmer, 'afinn');

-const dataWithSentiments = [];
+const dataWithSentiments: { [key: string]: number } = {};

let countOfSentimentCoefficients = 0;
-let lengthOfData = data.length;

-for (let i = 0; i < lengthOfData; i++) {
-const elementOfData = data[i];
+for (let i = 0; i < data.length; i++) {
+const { text, textIndex } = data[i];

-const tokenizedData = tokenizer.tokenize(elementOfData);
+const tokenizedData = tokenizer.tokenize(text);

const dataWithoutStopWords = stopword.removeStopwords(tokenizedData);

if (dataWithoutStopWords.length === 0) {
-dataWithSentiments.push(0);
+dataWithSentiments[textIndex] = 0;
continue;
}

@@ -35,10 +35,8 @@ export const getTextWithSentimentAnalysis = (data: Array<string>) => {
countOfSentimentCoefficients =
sentimentCoefficient + countOfSentimentCoefficients;

-dataWithSentiments.push(sentimentCoefficient);
+dataWithSentiments[textIndex] = sentimentCoefficient;
}

-const meanSentiment = countOfSentimentCoefficients / lengthOfData;
-
-return { dataWithSentiments, meanSentiment };
+return { dataWithSentiments, countOfSentimentCoefficients };
};
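
A matching usage sketch for the English analyzer (module path assumed, since the file header for this diff is missing above; sample data invented):

```ts
// Hypothetical call site for the reworked analyzer.
import { getTextWithSentimentAnalysis } from './text_analysis/sentiment';

const { dataWithSentiments, countOfSentimentCoefficients } = getTextWithSentimentAnalysis([
  { text: 'I love this library', textIndex: 0 },
  { text: 'the', textIndex: 1 }, // only stop words, so removeStopwords leaves nothing -> 0
]);

// Results are now keyed by textIndex instead of positional order,
// e.g. { '0': 0.75, '1': 0 } (values illustrative).
console.log(dataWithSentiments, countOfSentimentCoefficients);
```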
4 changes: 2 additions & 2 deletions backend/src/queues/bayes.ts
@@ -7,7 +7,7 @@ import { OPTIONS, MAX_JOBS_PER_WORKER } from './config';
console.info('Bayes connected');

const bayesQueue = new Queue('bayes', OPTIONS);
-const callbackQueue = new Queue('callback', OPTIONS);
+const mergeQueue = new Queue('merge', OPTIONS);

bayesQueue.process(MAX_JOBS_PER_WORKER, job => {
const {
@@ -19,7 +19,7 @@ bayesQueue.process(MAX_JOBS_PER_WORKER, job => {
normalizedTweetsForAnalysis,
);

-callbackQueue.add({
+mergeQueue.add({
jobId: id,
options: { tweetsWithBayesClassifier, isBayes: true },
});
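
With the rename, Bayes results now land in the shared `merge` queue instead of a dedicated callback queue. A sketch of the enqueued payload (connection options and the job id are invented; `OPTIONS` in the diff presumably carries the real Redis config):

```ts
import Queue from 'bull';

// Illustrative: mirrors what the bayes worker enqueues for the merge step.
const mergeQueue = new Queue('merge', { redis: { host: '127.0.0.1', port: 6379 } });

mergeQueue.add({
  jobId: 'job-42', // id of the originating parse job (invented)
  options: { tweetsWithBayesClassifier: {}, isBayes: true },
});
```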
6 changes: 4 additions & 2 deletions backend/src/queues/merge.ts
@@ -6,13 +6,15 @@ import { insertionSentimentTweetsSort } from '../twitter/tweets/lib/insertion_se
import { OPTIONS, MAX_JOBS_PER_WORKER } from './config';
import { Send } from '../types';

-const callbackQueue = new Queue('callback', OPTIONS);
+console.info('Merge connected');
+
+const mergeQueue = new Queue('merge', OPTIONS);
const webQueue = new Queue('web', OPTIONS);

// TODO: Clean this up over time, or once the relevant jobs have finished.
const jobsProgress = new Map();

-callbackQueue.process(MAX_JOBS_PER_WORKER, job => {
+mergeQueue.process(MAX_JOBS_PER_WORKER, job => {
const { jobId, options }: { jobId: string; options: Send } = job.data;

const jobOptions = jobsProgress.get(jobId);
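
The `jobsProgress` map accumulates partial results per `jobId` until every analyzer has reported, after which the merged payload can move on to `webQueue`. A minimal sketch of that accumulation pattern (field names beyond `isBayes` are assumptions):

```ts
// Simplified model of the jobsProgress bookkeeping in merge.ts.
const jobsProgress = new Map<string, { isBayes?: boolean; isSentiment?: boolean }>();

function markProgress(
  jobId: string,
  update: { isBayes?: boolean; isSentiment?: boolean },
) {
  const next = { ...(jobsProgress.get(jobId) ?? {}), ...update };
  jobsProgress.set(jobId, next);

  // Once all analyzers have reported, drop the entry:
  // one way to address the TODO about cleaning the map.
  if (next.isBayes && next.isSentiment) {
    jobsProgress.delete(jobId);
  }
}
```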