Added Russian sentiment for tweets
TchernyavskyDaniil committed Jun 1, 2020
1 parent bf0c26b commit f7ea822
Showing 7 changed files with 220 additions and 30 deletions.
32 changes: 30 additions & 2 deletions backend/src/lib/normalizers/alphabet.ts
@@ -3,7 +3,35 @@ import { aposToLexForm } from '../lex_form_convert/apos_to_lex_form';
 export const getTextWithAlphaOnly = (text: string) => {
   const textLexicalForm = aposToLexForm(text);
   const casedText = textLexicalForm.toLowerCase();
-  const textWithAlphaOnly = casedText.replace(/[^a-zA-Z\s]+/g, '');
+  const withoutLinks = casedText.replace(/(https?:\/\/[^\s]+)/g, '');
+  const russianText = withoutLinks.replace(/[^а-яА-Я0-9\s]+/g, '');
+  const englishText = withoutLinks.replace(/[^a-zA-Z0-9\s]+/g, '');
 
-  return textWithAlphaOnly;
+  let language = '';
+
+  if (russianText.length > englishText.length) {
+    language = 'ru';
+  } else {
+    language = 'eng';
+  }
+
+  let normalizedText = '';
+
+  switch (language) {
+    case 'ru': {
+      normalizedText = russianText;
+      break;
+    }
+
+    case 'eng': {
+      normalizedText = englishText;
+      break;
+    }
+
+    default: {
+      break;
+    }
+  }
+
+  return { text: normalizedText, language };
 };
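
For orientation, a hypothetical usage sketch (not part of the commit); the import path and sample strings are illustrative, and the exact whitespace of the cleaned text may differ:

// Hypothetical usage of the new normalizer return shape.
import { getTextWithAlphaOnly } from './lib/normalizers/alphabet';

const ru = getTextWithAlphaOnly('Привет, мир! https://example.com');
// roughly { text: 'привет мир ', language: 'ru' } — the link and punctuation are stripped

const en = getTextWithAlphaOnly('Hello world 42');
// { text: 'hello world 42', language: 'eng' } — digits and spaces are now kept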
82 changes: 82 additions & 0 deletions backend/src/lib/ru_social_sentiment/index.ts
@@ -0,0 +1,82 @@
import { spawn } from 'child_process';

export const getRuSentiment = async (
  data: Array<{ text: string; textIndex: number }>,
) => {
  // The library does not accept empty values and crashes on them
  const notEmptyData = data
    .map(({ text }) => text)
    .filter(text => {
      const withoutSpaces = text.replace(/\s/g, '');

      return withoutSpaces.length > 0;
    });

  let indexOfSentiment = 0;

  const pythonProcess = spawn('python3', [
    'src/lib/ru_social_sentiment/sentiment_coefficient.py',
    JSON.stringify(notEmptyData),
  ]);

  const getSentiments = () => {
    return new Promise<{
      dataWithSentiments: { [key: string]: number };
      countOfSentimentCoefficients: number;
    }>((resolve, reject) => {
      pythonProcess.stdout.on('data', sentiments => {
        const result = sentiments.toString();

        const normalizedSentiments: { [key: string]: number } = {};

        let countOfSentimentCoefficients = 0;

        if (result !== undefined && result.length > 0) {
          const parsedResult = JSON.parse(result.replace(/'/g, '"'));

          data.forEach(({ text, textIndex }) => {
            const withoutSpaces = text.replace(/\s/g, '');
            let coefficient = 0;

            if (withoutSpaces.length > 0) {
              const sentiment: { [key: string]: number } =
                parsedResult[indexOfSentiment];

              const sentimentKeys = Object.keys(sentiment);
              const isNegative = sentimentKeys.includes('negative');
              const isPositive = sentimentKeys.includes('positive');

              if (isNegative) {
                coefficient = -1 * sentiment.negative;
              } else if (isPositive) {
                coefficient = sentiment.positive;
              } else {
                const [_, valueSentiment] = Object.entries(sentiment)[0];

                coefficient = valueSentiment;
              }

              indexOfSentiment++;

              countOfSentimentCoefficients =
                countOfSentimentCoefficients + coefficient;
            }

            normalizedSentiments[textIndex] = coefficient;
          });
        }

        resolve({
          dataWithSentiments: normalizedSentiments,
          countOfSentimentCoefficients,
        });

        pythonProcess.kill('SIGTERM');
      });
    });
  };

  const result = await getSentiments();

  return result;
};
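
A minimal call sketch (not part of the commit), assuming the Node process runs from backend/ so the relative script path resolves and python3 with dostoevsky is installed; the texts and resulting scores are made up:

// Hypothetical usage; the coefficients shown are illustrative.
import { getRuSentiment } from './lib/ru_social_sentiment';

const demo = async () => {
  const { dataWithSentiments, countOfSentimentCoefficients } = await getRuSentiment([
    { text: 'все отлично', textIndex: 0 },
    { text: 'все плохо', textIndex: 1 },
  ]);

  // dataWithSentiments maps textIndex to a signed coefficient, e.g. { '0': 0.76, '1': -0.88 };
  // negative labels are flipped to negative numbers, as in the code above.
  console.log(dataWithSentiments, countOfSentimentCoefficients);
};

demo();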
17 changes: 17 additions & 0 deletions backend/src/lib/ru_social_sentiment/sentiment_coefficient.py
@@ -0,0 +1,17 @@
from dostoevsky.tokenization import RegexTokenizer
from dostoevsky.models import FastTextSocialNetworkModel

import sys
import json

tokenizer = RegexTokenizer()

model = FastTextSocialNetworkModel(tokenizer=tokenizer)

texts = json.loads(sys.argv[1])

results = model.predict(texts, k=2)

print(results)

sys.stdout.flush()
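
Note: model.predict comes from the dostoevsky package and returns a list of label-to-probability dicts (for example {'negative': 0.9, 'skip': 0.05} with k=2). Because print emits the Python repr with single quotes, the Node caller above swaps them for double quotes before JSON.parse. The library also expects its fastText model to have been downloaded beforehand (python -m dostoevsky download fasttext-social-network-model, per the dostoevsky documentation).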
@@ -7,24 +7,25 @@ import {
 //@ts-ignore
 import stopword from 'stopword';
 
-export const getTextWithSentimentAnalysis = (data: Array<string>) => {
+export const getTextWithSentimentAnalysis = (
+  data: Array<{ text: string; textIndex: number }>,
+) => {
   const tokenizer = new WordTokenizer();
   const analyzer = new SentimentAnalyzer('English', PorterStemmer, 'afinn');
 
-  const dataWithSentiments = [];
+  const dataWithSentiments: { [key: string]: number } = {};
 
   let countOfSentimentCoefficients = 0;
-  let lengthOfData = data.length;
 
-  for (let i = 0; i < lengthOfData; i++) {
-    const elementOfData = data[i];
+  for (let i = 0; i < data.length; i++) {
+    const { text, textIndex } = data[i];
 
-    const tokenizedData = tokenizer.tokenize(elementOfData);
+    const tokenizedData = tokenizer.tokenize(text);
 
     const dataWithoutStopWords = stopword.removeStopwords(tokenizedData);
 
     if (dataWithoutStopWords.length === 0) {
-      dataWithSentiments.push(0);
+      dataWithSentiments[textIndex] = 0;
       continue;
     }
 
@@ -35,10 +36,8 @@ export const getTextWithSentimentAnalysis = (data: Array<string>) => {
     countOfSentimentCoefficients =
       sentimentCoefficient + countOfSentimentCoefficients;
 
-    dataWithSentiments.push(sentimentCoefficient);
+    dataWithSentiments[textIndex] = sentimentCoefficient;
   }
 
-  const meanSentiment = countOfSentimentCoefficients / lengthOfData;
-
-  return { dataWithSentiments, meanSentiment };
+  return { dataWithSentiments, countOfSentimentCoefficients };
 };
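
A hypothetical call of the reworked English analyzer; the import path is taken from the updated import in sentiment.ts below, and the scores are illustrative:

// The result is now keyed by the original tweet index instead of being a flat array.
import { getTextWithSentimentAnalysis } from './lib/sentiment_analysis';

const { dataWithSentiments, countOfSentimentCoefficients } = getTextWithSentimentAnalysis([
  { text: 'great service', textIndex: 1 },
  { text: 'terrible delay', textIndex: 3 },
]);
// e.g. dataWithSentiments === { '1': 1.5, '3': -1.5 }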
45 changes: 39 additions & 6 deletions backend/src/queues/parser.ts
@@ -31,12 +31,45 @@ parserQueue.process(MAX_JOBS_PER_WORKER, async job => {

   callbackQueue.add({ jobId: id, options: { parsedTweets } });
 
-  const normalizedTweetsForAnalysis = parsedTweets.map(({ tweetContent }) => {
-    const tweetWithAlphaOnly = getTextWithAlphaOnly(tweetContent);
+  const normalizedTweetsForBayes = [];
 
-    return tweetWithAlphaOnly;
-  });
+  const russianTweets = [];
+  const englishTweets = [];
+
+  for (let i = 0; i < parsedTweets.length; i++) {
+    const { tweetContent } = parsedTweets[i];
+
+    const {
+      text,
+      language,
+    }: {
+      text: string;
+      language: string;
+    } = getTextWithAlphaOnly(tweetContent);
+
+    switch (language) {
+      case 'ru': {
+        russianTweets.push({ text, textIndex: i });
+        break;
+      }
+
+      case 'eng': {
+        englishTweets.push({ text, textIndex: i });
+        break;
+      }
 
-  sentimentQueue.add({ normalizedTweetsForAnalysis, id });
-  bayesQueue.add({ normalizedTweetsForAnalysis, id });
+      default: {
+        break;
+      }
+    }
+
+    normalizedTweetsForBayes.push(text);
+  }
+
+  sentimentQueue.add({
+    russianTweets,
+    englishTweets,
+    id,
+  });
+  bayesQueue.add({ normalizedTweetsForAnalysis: normalizedTweetsForBayes, id });
 });
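
To illustrate the new job payloads (all values are made up, not taken from the commit):

// Hypothetical payload shapes for parsedTweets like
// [{ tweetContent: 'Всем привет!' }, { tweetContent: 'Hello world' }].
const exampleSentimentJob = {
  russianTweets: [{ text: 'всем привет', textIndex: 0 }],
  englishTweets: [{ text: 'hello world', textIndex: 1 }],
  id: 'job-42', // hypothetical job id
};

// The bayes queue still receives a flat list of cleaned texts.
const exampleBayesJob = {
  normalizedTweetsForAnalysis: ['всем привет', 'hello world'],
  id: 'job-42',
};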
47 changes: 39 additions & 8 deletions backend/src/queues/sentiment.ts
@@ -1,24 +1,55 @@
 import Queue from 'bull';
 
-import { getTextWithSentimentAnalysis } from '../lib/sentiment_analysis/sentiment_analysis';
+import { getTextWithSentimentAnalysis } from '../lib/sentiment_analysis';
 
 import { OPTIONS, MAX_JOBS_PER_WORKER } from './config';
+import { getRuSentiment } from '../lib/ru_social_sentiment';
 
 console.info('Sentiment connected');
 
 const sentimentQueue = new Queue('sentiment', OPTIONS);
 const callbackQueue = new Queue('callback', OPTIONS);
 
-sentimentQueue.process(MAX_JOBS_PER_WORKER, job => {
+sentimentQueue.process(MAX_JOBS_PER_WORKER, async job => {
   const {
-    normalizedTweetsForAnalysis,
+    englishTweets,
+    russianTweets,
     id,
-  }: { normalizedTweetsForAnalysis: Array<string>; id: string } = job.data;
+  }: {
+    englishTweets: Array<{ text: string; textIndex: number }>;
+    russianTweets: Array<{ text: string; textIndex: number }>;
+    id: string;
+  } = job.data;
 
-  const {
-    dataWithSentiments: tweetsWithSentiments,
-    meanSentiment,
-  } = getTextWithSentimentAnalysis(normalizedTweetsForAnalysis);
+  let tweetsWithSentiments: number[] = [];
+  let meanSentiment = 0;
+  let totalSentimentCoefficient = 0;
+
+  const analyzedRussianTweets = await getRuSentiment(russianTweets);
+  const analyzedEnglishTweets = getTextWithSentimentAnalysis(englishTweets);
+
+  const russianSentiments = analyzedRussianTweets.dataWithSentiments;
+  const englishSentiments = analyzedEnglishTweets.dataWithSentiments;
+
+  const lengthOfRuSentiment = Object.values(russianSentiments).length;
+  const lengthOfEnglishTweets = Object.values(englishSentiments).length;
+
+  const summaryLength = lengthOfRuSentiment + lengthOfEnglishTweets;
+
+  for (let i = 0; i < summaryLength; i++) {
+    const russianSentimentCoefficient = russianSentiments[i];
+    const englishSentimentCoefficient = englishSentiments[i];
+
+    const actualSentimentCoefficient =
+      russianSentimentCoefficient || englishSentimentCoefficient;
+
+    tweetsWithSentiments.push(actualSentimentCoefficient);
+
+    totalSentimentCoefficient =
+      actualSentimentCoefficient + totalSentimentCoefficient;
+  }
+
+  meanSentiment = totalSentimentCoefficient / summaryLength;
 
   callbackQueue.add({
     jobId: id,
6 changes: 3 additions & 3 deletions backend/src/twitter/profile/profile_info.ts
@@ -99,20 +99,20 @@ const parseProfileInfoFx = createEffect<{ browser: Browser; page: Page }, any>({
   let classifierData = null;
 
   if (description.length > 0) {
-    const descriptionWithAlphaOnly = getTextWithAlphaOnly(description);
+    const { text } = getTextWithAlphaOnly(description);
 
     const analyzer = new SentimentAnalyzer('English', PorterStemmer, 'afinn');
     const tokenizer = new WordTokenizer();
 
-    const tokenizedData = tokenizer.tokenize(descriptionWithAlphaOnly);
+    const tokenizedData = tokenizer.tokenize(text);
     const dataWithoutStopWords = stopword.removeStopwords(tokenizedData);
 
     sentimentCoefficient = Number(
       analyzer.getSentiment(dataWithoutStopWords),
     );
 
     const bayesClassifier = getWordsTrigramsBayesClassifier();
-    classifierData = bayesClassifier.classify(descriptionWithAlphaOnly);
+    classifierData = bayesClassifier.classify(text);
   }
 
   await browser.close();