Commit: update scraper
squidgetx committed Jan 10, 2023
1 parent 281b39c commit f3fa7d7
Showing 1 changed file with 37 additions and 34 deletions.
server/db/scraper.mjs
@@ -17,7 +17,7 @@ import { default as config } from "../lib/config.js";

const BEARER_TOKEN = config.twitterBearerToken;

-const WRITE_DB = false;
+const WRITE_DB = true;

async function fetch_for_id(id, time) {
  const client = new Client(BEARER_TOKEN);
@@ -183,6 +183,15 @@ async function parse_response(response) {
  // TODO: deal with > 100 tweets, but I don't think that really matters for now.
  // A bit complicated because RT/quote tweets exist :P
  // So for RT/quote tweets, we add 2 rows to the DB: one for the RT/quote itself and one for the OG tweet
+  if (response.data == undefined) {
+    // No new tweets in the DB
+    return {
+      tweets: [],
+      users: [],
+      media: [],
+      links: [],
+    };
+  }
  const tweets = response.data
    .map((t) => parse_tweet(t))
    // filter duplicate ids, which apparently happens sometimes (keep first entry)
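As a rough illustration of the two-row scheme the comment above describes — using a simplified, hypothetical tweet shape rather than the repo's actual parse_tweet output — an RT/quote could be expanded like this:

// Hypothetical sketch: expand one API tweet into two DB rows when it
// retweets or quotes another tweet. Field names here are assumptions.
function expand_tweet(apiTweet) {
  const rows = [{ id: apiTweet.id, author_id: apiTweet.author_id, text: apiTweet.text }];
  for (const ref of apiTweet.referenced_tweets ?? []) {
    if (ref.type === "retweeted" || ref.type === "quoted") {
      // second row for the original (OG) tweet being RT'd/quoted
      rows.push({ id: ref.id, referenced_by: apiTweet.id, ref_type: ref.type });
    }
  }
  return rows;
}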
@@ -329,42 +338,36 @@ async function update_db(data) {
  });
}

-async function test_fetch(id) {
-  const response = await fetch_for_id(id, "2022-11-11T00:00:00.000Z");
+/* Function to get the list of elites from the db
+ * that we want to get the tweets for
+ */
+async function fetchElites() {
+  // Get the top 50 center left and center right accounts
+  const query = `
+    select elites.id, max(created_at) as date
+    from elites left join tweets
+    on elites.id = tweets.author_id
+    where rank < 50
+    group by elites.id;
+  `;
+  const results = await DB.any(query);
+  return results;
+}
+
+async function fetchTweetsForElites() {
+  const elites = await fetchElites();
+  for (let e of elites) {
+    console.log("fetching for ", e);
+    fetch_and_write_for_id(e.id, e.date.toISOString());
+  }
+}
+
+async function fetch_and_write_for_id(id, date) {
+  const response = await fetch_for_id(id, date);
  const data = await parse_response(response);
  if (WRITE_DB) {
    const result = await update_db(data);
  }
}
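One wrinkle in the new code: because fetchElites uses a left join, an elite with no rows in tweets comes back with date = null, and e.date.toISOString() would throw. The design note below mentions a fallback date; a minimal sketch of that idea, where FALLBACK_DATE is a hypothetical constant not defined in this file:

// Hypothetical fallback, per the "With a fallback date" design note below.
const FALLBACK_DATE = "2022-11-01T00:00:00.000Z"; // assumed horizon

async function fetchTweetsForElitesWithFallback() {
  const elites = await fetchElites();
  for (const e of elites) {
    // max(created_at) is null for elites with no tweets yet (left join)
    const since = e.date ? e.date.toISOString() : FALLBACK_DATE;
    fetch_and_write_for_id(e.id, since);
  }
}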

-// Maddow
-//test_fetch("16129920").then((result) => {
-//console.log("fetched")
-//});
-
-// Fox news
-
-for (const i of [
-  "20402945",
-  "2922928743",
-  "807095",
-  "16467567",
-  "3108351",
-  "16467567",
-]) {
-  test_fetch(i).then((result) => {
-    console.log("fetched", i);
-  });
-}

/*
Design: have a fixed table of elites that doesn't need any maintenance
Then, we query for the most recent tweet we have for each elite
With a fallback date
Then we kick off all the fetch async jobs (2k or so?)
If jobs fail we need to manage retry logic as well, but the update op should be idempotent I guess
I wonder also if batching the db writes is necessary? or we can just not care
Also remember that we will need to migrate/re-run in production
*/
+fetchTweetsForElites().then(() => console.log("fetched"));
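The retry point in the design note could look something like the sketch below — withRetry is a hypothetical helper, not part of the repo, and it assumes the update op really is idempotent (as the note suggests), so re-running a failed fetch job is safe:

// Hypothetical retry wrapper around a fetch job. Safe to re-run only
// because the DB update is (assumed) idempotent.
async function withRetry(fn, attempts = 3, delayMs = 1000) {
  for (let i = 0; i < attempts; i++) {
    try {
      return await fn();
    } catch (err) {
      if (i === attempts - 1) throw err;
      console.log(`retrying after error: ${err.message}`);
      await new Promise((r) => setTimeout(r, delayMs * 2 ** i)); // exponential backoff
    }
  }
}

// Usage: withRetry(() => fetch_and_write_for_id(e.id, e.date.toISOString()));

Batching the DB writes (the note's other open question) could be layered on top, but the idempotency property is what makes the retries safe in the first place.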
