-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathindex.js
147 lines (120 loc) · 5.42 KB
/
index.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
const inquirer = require('inquirer');
const ora = require('ora');
const puppeteer = require('puppeteer');
/**
 * Prompts the user for the Dev.to tag they want to scrape.
 *
 * A single free-text `tag` question is asked; an empty answer is allowed and
 * is interpreted by the caller.
 *
 * @returns {*} Whatever `inquirer.prompt` returns for the question list.
 */
function askQuestions() {
  const tagQuestion = {
    message: 'Tag to scrape ( Defaults to none ):',
    type: 'input',
    name: 'tag',
  };

  return inquirer.prompt([tagQuestion]);
}
/**
 * Builds the listing URL to scrape: the site root, or the tag page when a
 * non-blank tag was given.
 *
 * @param {string} rootURL - Site origin without a trailing slash.
 * @param {*} tag - Raw tag answer; anything that is not a non-blank string means "no tag".
 * @returns {string} URL of the page to open.
 */
function buildListingURL(rootURL, tag) {
  const trimmedTag = typeof tag === 'string' ? tag.trim() : '';
  if (trimmedTag === '') {
    return rootURL;
  }
  // Encode so characters such as spaces, '/', '?' or '#' cannot alter the URL structure.
  return `${rootURL}/t/${encodeURIComponent(trimmedTag)}`;
}

/**
 * Reads the numeric engagement counter inside the given container of a story.
 *
 * @param {object} story - Element handle of one story.
 * @param {string} containerSelector - Selector of the counter's container within the story.
 * @returns {Promise<number>} The counter as a number, or 0 when the container is absent.
 */
async function readEngagementCount(story, containerSelector) {
  const container = await story.$(containerSelector);
  if (!container) {
    return 0;
  }
  const rawCount = await container.$eval('span.engagement-count-number', e => e.innerHTML.trim());
  return Number(rawCount);
}

/**
 * Scrapes the featured (`.big-article`) entry of the page.
 *
 * @param {object} page - Puppeteer page that has already navigated to the listing.
 * @param {string} rootURL - Site origin used to make the article link absolute.
 * @returns {Promise<object|null>} Title, author, publication date/time, tags and URL,
 *   or null when the page has no featured article.
 */
async function scrapeFeaturedArticle(page, rootURL) {
  const featured = await page.$('.big-article');
  if (!featured) {
    return null;
  }

  const title = await featured.$eval('.content-wrapper > h3', e => e.textContent.trim());

  // The user line is split on '・' into author and publication info.
  const userInformation = await featured.$eval('.featured-user-name > a', e => e.textContent.split('・'));
  const author = (userInformation[0] || '').trim();

  // The publication info is split on a newline into the date and a parenthesised time.
  const publicationInfo = (userInformation[1] || '').split('\n');
  const publicationDate = (publicationInfo[0] || '').trim();
  const publicationTime = (publicationInfo[1] || '').trim().replace(/\(|\)/g, '');

  const tags = await featured.$$eval('.featured-tags > a', e => e.map(x => x.textContent));
  const href = await featured.$eval('a.index-article-link', e => e.getAttribute('href'));

  return {
    title,
    author,
    publicationDate,
    publicationTime,
    tags,
    url: `${rootURL}${href || ''}`,
  };
}

/**
 * Scrapes the regular stories listed under `#substories`.
 *
 * @param {object} page - Puppeteer page that has already navigated to the listing.
 * @param {string} rootURL - Site origin used to make article links absolute.
 * @returns {Promise<object[]>} One record per story that carries at least one tag.
 */
async function scrapeSubArticles(page, rootURL) {
  const stories = await page.$$('#substories > div.single-article.single-article-small-pic');
  const articles = [];

  for (const story of stories) {
    const tags = await story.$$eval('.tags > a', e => e.map(x => x.textContent));
    // Entries without tags are not reported (presumably non-article entries; verify),
    // so skip them before doing any further queries.
    if (!tags || tags.length === 0) {
      continue;
    }

    const title = await story.$eval('h3', e => e.textContent) || '';

    let author = '';
    let publishDate = '';
    if (await story.$('h4')) {
      const userInformation = await story.$eval('h4', e => e.textContent.split('・'));
      author = (userInformation[0] || '').trim();
      publishDate = (userInformation[1] || '').trim();
    }

    let url = '';
    if (await story.$('.index-article-link')) {
      const href = await story.$eval('.index-article-link', e => e.getAttribute('href'));
      // Resolve against the site root, not the tag listing URL: prefixing the
      // tag listing URL would produce https://dev.to/t/<tag>/<author>/<slug>.
      url = href ? `${rootURL}${href}` : '';
    }

    const numberOfReactions = await readEngagementCount(story, '.reactions-count');
    const numberOfComments = await readEngagementCount(story, '.comments-count');

    let readingTime = '';
    // Use the same selector for the existence check and the read so the read
    // cannot throw after a successful check.
    if (await story.$('a.article-reading-time')) {
      readingTime = await story.$eval('a.article-reading-time', e => e.textContent.trim());
    }

    articles.push({
      title: title.trim(),
      author,
      publishDate,
      url,
      tags,
      numberOfReactions,
      numberOfComments,
      readingTime,
    });
  }

  return articles;
}

/**
 * CLI entry point: asks for a tag, scrapes the matching Dev.to listing with a
 * headless browser and prints the featured article and the other articles.
 *
 * Scraping failures are reported through the spinner and `console.error`
 * rather than thrown; the browser is closed whether scraping succeeds or not.
 *
 * @returns {Promise<void>}
 */
async function main() {
  const answers = await askQuestions();
  const throbber = ora('Scraping Dev.to for your articles...').start();
  const rootURL = 'https://dev.to';
  const baseURL = buildListingURL(rootURL, answers ? answers.tag : '');

  try {
    const browser = await puppeteer.launch({ headless: true });
    let featuredArticleResult;
    let subArticleResult;
    try {
      const page = await browser.newPage();
      await page.goto(baseURL);
      [featuredArticleResult, subArticleResult] = await Promise.all([
        scrapeFeaturedArticle(page, rootURL),
        scrapeSubArticles(page, rootURL),
      ]);
    } finally {
      // Always release the browser process, even when navigation or scraping fails.
      await browser.close();
    }

    throbber.stopAndPersist({
      text: 'All done scraping your articles!'
    });
    console.log('#### FEATURED ARTICLE ####');
    console.table(featuredArticleResult);
    console.log('#### OTHER ARTICLES ####');
    console.log(subArticleResult);
  } catch (error) {
    throbber.stopAndPersist({ text: 'Oopps, something bad happened :( '});
    console.error(error);
  }
}
// Run the CLI. main() reports scraping errors itself, but the prompt and the
// spinner start run outside its try block, so a rejection there would
// otherwise be an unhandled promise rejection.
main().catch((error) => {
  console.error(error);
  process.exitCode = 1;
});