Commit 6948306: add Twitter profiles and summary statistics
1 parent: 4ad37d6
Showing 13 changed files with 15,898 additions and 1 deletion.
@@ -0,0 +1,4 @@
.Rproj.user
.Rhistory
.RData
.Ruserdata
@@ -1,2 +1,5 @@
# journals-on-twitter
Dataset of the Twitter accounts of scientific journals (based on the Web of Science indices SCIE, SSCI and AHCI)

Here is a dataset of 3,485 Twitter accounts pertaining to a sample of 13,821 distinct journals listed in Web of Science's three major indices (SCIE, SSCI and AHCI).

The dataset is available under a Creative Commons license (CC0).
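
For quick orientation, here is a minimal sketch of loading the account list with the tidyverse. The file name and the columns used (twitter, plus the 0/1 index flags ahci, ssci and scie) are taken from the analysis scripts added in this commit; everything else about the file layout is assumed. Journals without a matched account appear to carry twitter = NA, which is how the scripts derive has_twitter.

library(tidyverse)

# read the journal/account table shipped with this commit
accounts <- read_csv("twitter_accounts_of_journals.csv")

# count distinct accounts per Web of Science index (assumes 0/1 flag columns ahci, ssci, scie)
accounts %>%
  pivot_longer(cols = c("ahci", "ssci", "scie"), names_to = "wos_index") %>%
  filter(value == 1, !is.na(twitter)) %>%
  distinct(twitter, wos_index) %>%
  count(wos_index)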
@@ -0,0 +1,168 @@
library(tidyverse)

# read data
DFF <- arrow::read_parquet("Summary_Statistics\\twitter_data.parquet")

# function for modal value
Mode <- function(x) {
  ux <- unique(x)
  ux[which.max(tabulate(match(x, ux)))]
}

# keep the two full years of observation; tweets_total is summed across
# 2020 and 2021, so per-year figures below divide by 2
DF <- DFF %>%
  filter(year == 2020 | year == 2021)
# library(plyr)
# ALLNAMES <- plyr::ddply(DF, .(twitter), summarize, allnames = paste(uniquenames, collapse=";")) %>%
#   separate_rows(allnames) %>%
#   distinct() %>%
#   select(twitter) %>%
#   group_by(twitter) %>%
#   count()
# colnames(ALLNAMES) <- c("twitter", "total_names")
# ALLREP <- plyr::ddply(DF, .(twitter), summarize, allnames_rep = paste(uniquenames_rep, collapse=";")) %>%
#   separate_rows(allnames_rep) %>%
#   distinct() %>%
#   select(twitter) %>%
#   group_by(twitter) %>%
#   count()
# colnames(ALLREP) <- c("twitter", "total_names_rep")
# ALLRT <- plyr::ddply(DF, .(twitter), summarize, allnames_rt = paste(uniquenames_rt, collapse=";")) %>%
#   separate_rows(allnames_rt) %>%
#   distinct() %>%
#   select(twitter) %>%
#   group_by(twitter) %>%
#   count()
# colnames(ALLRT) <- c("twitter", "total_names_rt")
# ALLMEN <- plyr::ddply(DF, .(twitter), summarize, allnames_mentions = paste(uniquenames_mentions, collapse=";")) %>%
#   separate_rows(allnames_mentions) %>%
#   distinct() %>%
#   select(twitter) %>%
#   group_by(twitter) %>%
#   count()
# colnames(ALLMEN) <- c("twitter", "total_names_mentions")
# detach(package:plyr)
DF$tweets_total <- as.numeric(DF$tweets_total)
DF <- DF %>%
  select(-account_created_date, -year, -starts_with("uniquenames")) %>%
  group_by(twitter) %>%
  mutate(totaltw = sum(tweets_total))

# journal list, one row per journal and Web of Science index
JJ <- read_csv("twitter_accounts_of_journals.csv")
JJ <- JJ %>%
  pivot_longer(cols = c("ahci", "ssci", "scie"),
               names_to = "wos_index") %>%
  filter(value == 1) %>%
  select(-value)
TT <- left_join(JJ, DF)
TT <- TT %>%
  mutate(has_twitter = ifelse(is.na(twitter), 0, 1))

TT <- TT %>%
  mutate(wos_index = case_when(
    wos_index == "ahci" ~ "AHCI",
    wos_index == "ssci" ~ "SSCI",
    wos_index == "scie" ~ "SCIE"
  ))


# ===========
# Summary Statistics
# ===========
# by index (all figures divided by 2 to express tweets per year)
TT %>%
  filter(!is.na(totaltw)) %>%
  select(twitter, totaltw, wos_index) %>%
  distinct() %>%
  group_by(wos_index) %>%
  summarise(
    avg = mean(totaltw, na.rm = TRUE) / 2,
    med = median(totaltw, na.rm = TRUE) / 2,
    mode = Mode(totaltw) / 2,
    sd = sd(totaltw, na.rm = TRUE) / 2,
    min = min(totaltw, na.rm = TRUE) / 2,
    q1 = quantile(totaltw, probs = 0.25, na.rm = TRUE) / 2,
    q3 = quantile(totaltw, probs = 0.75, na.rm = TRUE) / 2,
    max = max(totaltw, na.rm = TRUE) / 2
  )

# total
TT %>%
  filter(!is.na(totaltw)) %>%
  select(twitter, totaltw) %>%
  distinct() %>%
  summarise(
    avg = mean(totaltw, na.rm = TRUE) / 2,
    med = median(totaltw, na.rm = TRUE) / 2,
    mode = Mode(totaltw) / 2,
    sd = sd(totaltw, na.rm = TRUE) / 2,
    min = min(totaltw, na.rm = TRUE) / 2,
    q1 = quantile(totaltw, probs = 0.25, na.rm = TRUE) / 2,
    q3 = quantile(totaltw, probs = 0.75, na.rm = TRUE) / 2,
    max = max(totaltw, na.rm = TRUE) / 2
  )

# most prolific accounts
TT %>%
  select(journal_title, totaltw) %>%
  mutate(totaltw = totaltw / 2) %>%
  arrange(desc(totaltw)) %>%
  distinct() %>%
  View()

# tweets per year across all accounts
TT %>%
  select(twitter, totaltw) %>%
  mutate(totaltw = totaltw / 2) %>%
  distinct() %>%
  filter(!is.na(totaltw)) %>%
  summarise(totaltw = sum(totaltw))

# ===========
# graph
# ===========

TT <- TT %>%
  distinct(twitter, totaltw, wos_index)

ann_label <- data.frame(wos_index = c("AHCI", "SCIE", "SSCI"),
                        label = c(
                          median(TT$totaltw[TT$wos_index == "AHCI"], na.rm = T) / 2,
                          median(TT$totaltw[TT$wos_index == "SCIE"], na.rm = T) / 2,
                          median(TT$totaltw[TT$wos_index == "SSCI"], na.rm = T) / 2
                        ))
ann_label$label = paste0("Median: ", ann_label$label)

TT %>%
  select(twitter, totaltw, wos_index) %>%
  distinct() %>%
  ggplot(aes(y = totaltw / 2)) +
  geom_histogram(binwidth = 5) +
  # geom_hline(yintercept = median(TT$totaltw, na.rm = T),
  #            linetype = "dashed") +
  geom_hline(data = filter(TT, wos_index == "AHCI"),
             aes(yintercept = median(totaltw / 2, na.rm = T)),
             linetype = "dashed") +
  geom_hline(data = filter(TT, wos_index == "SCIE"),
             aes(yintercept = median(totaltw / 2, na.rm = T)),
             linetype = "dashed") +
  geom_hline(data = filter(TT, wos_index == "SSCI"),
             aes(yintercept = median(totaltw / 2, na.rm = T)),
             linetype = "dashed") +
  geom_text(data = ann_label,
            aes(label = label), x = 50, y = ifelse(ann_label$wos_index == "SCIE", 270,
                                                   ifelse(ann_label$wos_index == "SSCI", 200, 180)),
            size = 3) +
  xlab("Journals") +
  ylab("Tweets per Year") +
  theme_minimal() +
  facet_wrap(~ wos_index, nrow = 3) +
  theme(strip.text.x = element_text(size = 10, face = "bold",
                                    margin = margin(t = 5))) +
  scale_y_continuous(breaks = seq(0, 1500, by = 100)) +
  coord_flip()

ggsave("Graph\\active_journals_histogram_by_index.png",
       width = 6.8,
       height = 5.0,
       units = "in",
       dpi = 300)
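
Since the by-index and overall tables above repeat the same summarise() block, one optional refactor is a small wrapper. This is a sketch only, not part of the commit; summarise_tweets is a hypothetical name, and it assumes a data frame that already carries the 2020-2021 totaltw column.

# hypothetical helper: per-year summary of 2020-2021 tweet totals for an (optionally grouped) data frame
summarise_tweets <- function(d) {
  d %>%
    summarise(
      avg = mean(totaltw, na.rm = TRUE) / 2,
      med = median(totaltw, na.rm = TRUE) / 2,
      mode = Mode(totaltw) / 2,
      sd = sd(totaltw, na.rm = TRUE) / 2,
      min = min(totaltw, na.rm = TRUE) / 2,
      q1 = quantile(totaltw, probs = 0.25, na.rm = TRUE) / 2,
      q3 = quantile(totaltw, probs = 0.75, na.rm = TRUE) / 2,
      max = max(totaltw, na.rm = TRUE) / 2
    )
}

# usage: the "by index" table above becomes
# TT %>% filter(!is.na(totaltw)) %>% distinct(twitter, totaltw, wos_index) %>%
#   group_by(wos_index) %>% summarise_tweets()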
@@ -0,0 +1,152 @@
library(tidyverse)

# read data
DFF <- arrow::read_parquet("Summary_Statistics\\twitter_data.parquet")

DF <- DFF %>%
  filter(year == 2020 | year == 2021)
# plyr is attached only for ddply and detached below, since it masks dplyr verbs
library(plyr)
ALLNAMES <- plyr::ddply(DF, .(twitter), summarize, allnames = paste(uniquenames, collapse = ";")) %>%
  separate_rows(allnames) %>%
  distinct() %>%
  select(twitter) %>%
  group_by(twitter) %>%
  count()
colnames(ALLNAMES) <- c("twitter", "total_names")
ALLREP <- plyr::ddply(DF, .(twitter), summarize, allnames_rep = paste(uniquenames_rep, collapse = ";")) %>%
  separate_rows(allnames_rep) %>%
  distinct() %>%
  select(twitter) %>%
  group_by(twitter) %>%
  count()
colnames(ALLREP) <- c("twitter", "total_names_rep")
ALLRT <- plyr::ddply(DF, .(twitter), summarize, allnames_rt = paste(uniquenames_rt, collapse = ";")) %>%
  separate_rows(allnames_rt) %>%
  distinct() %>%
  select(twitter) %>%
  group_by(twitter) %>%
  count()
colnames(ALLRT) <- c("twitter", "total_names_rt")
ALLMEN <- plyr::ddply(DF, .(twitter), summarize, allnames_mentions = paste(uniquenames_mentions, collapse = ";")) %>%
  separate_rows(allnames_mentions) %>%
  distinct() %>%
  select(twitter) %>%
  group_by(twitter) %>%
  count()
colnames(ALLMEN) <- c("twitter", "total_names_mentions")
detach(package:plyr)
DF$tweets_total <- as.numeric(DF$tweets_total)
DF <- DF %>%
  select(-account_created_date, -year, -starts_with("uniquenames")) %>%
  group_by(twitter) %>%
  mutate(totaltw = sum(tweets_total),
         total_mentions = sum(unique_mentions),
         total_rt = sum(unique_rt),
         total_rep = sum(unique_rep)
  ) %>%
  # filter(totaltw >= 50) %>%
  select(-tweets_total, -unique_mentions, -unique_rt, -unique_rep) %>%
  distinct() %>%
  left_join(ALLNAMES) %>%
  left_join(ALLMEN) %>%
  left_join(ALLREP) %>%
  left_join(ALLRT) %>%
  group_by(twitter) %>%
  # engagement ratios: distinct user names per tweet, overall and by interaction type
  mutate(ratio = round(total_names / totaltw, 2),
         ratio_mentions = round(total_names_mentions / totaltw, 2),
         ratio_rep = round(total_names_rep / totaltw, 2),
         ratio_rt = round(total_names_rt / totaltw, 2)
  )
rm(ALLNAMES, ALLMEN, ALLREP, ALLRT)

JJ <- read_csv("twitter_accounts_of_journals.csv")
JJ <- JJ %>%
  pivot_longer(cols = c("ahci", "ssci", "scie"),
               names_to = "wos_index") %>%
  filter(value == 1) %>%
  select(-value)
TT <- left_join(JJ, DF)
TT <- TT %>%
  mutate(has_twitter = ifelse(is.na(twitter), 0, 1))

Reg <- DFF %>%
  select(twitter, account_created_date) %>%
  distinct()
TT <- left_join(TT, Reg)
rm(Reg)

# prepare data
ahci <- TT %>%
  filter(wos_index == "ahci") %>%
  select(twitter, ratio, wos_index) %>%
  filter(!is.na(twitter))
ssci <- TT %>%
  filter(wos_index == "ssci") %>%
  select(twitter, ratio, wos_index) %>%
  filter(!is.na(twitter))
scie <- TT %>%
  filter(wos_index == "scie") %>%
  select(twitter, ratio, wos_index) %>%
  filter(!is.na(twitter))

# create graph
rbind(ahci, ssci) %>%
  rbind(scie) %>%
  mutate(wos_index = case_when(
    wos_index == "ahci" ~ "AHCI",
    wos_index == "ssci" ~ "SSCI",
    wos_index == "scie" ~ "SCIE"
  )) %>%
  ggplot(aes(x = ratio)) +
  geom_boxplot() +
  scale_fill_manual(values = c("grey", "black", "white")) +
  scale_x_continuous(limits = c(0, 5)) +
  theme_minimal() +
  theme(axis.text.y = element_blank(),
        legend.title = element_blank()) +
  xlab("community engagement ratio") +
  facet_wrap(~ wos_index, ncol = 1) +
  theme(strip.text.x = element_text(size = 10, face = "bold",
                                    margin = margin(t = 5)))

ggsave("Graph\\community_engagement.png",
       width = 6.5,
       height = 3,
       units = "in",
       dpi = 300)

# Summary Statistics
Mode <- function(x) {
  ux <- unique(x)
  ux[which.max(tabulate(match(x, ux)))]
}

# by index
TT %>%
  select(twitter, ratio, wos_index) %>%
  filter(!is.na(twitter)) %>%
  group_by(wos_index) %>%
  filter(!is.na(ratio)) %>%
  summarise(
    avg = mean(ratio, na.rm = TRUE),
    med = median(ratio, na.rm = TRUE),
    mode = Mode(ratio),
    sd = sd(ratio, na.rm = TRUE),
    min = min(ratio, na.rm = TRUE),
    q1 = quantile(ratio, probs = 0.25, na.rm = TRUE),
    q3 = quantile(ratio, probs = 0.75, na.rm = TRUE),
    max = max(ratio, na.rm = TRUE)
  )
# total
TT %>%
  select(twitter, ratio) %>%
  filter(!is.na(twitter)) %>%
  filter(!is.na(ratio)) %>%
  summarise(
    avg = mean(ratio, na.rm = TRUE),
    med = median(ratio, na.rm = TRUE),
    mode = Mode(ratio),
    sd = sd(ratio, na.rm = TRUE),
    min = min(ratio, na.rm = TRUE),
    q1 = quantile(ratio, probs = 0.25, na.rm = TRUE),
    q3 = quantile(ratio, probs = 0.75, na.rm = TRUE),
    max = max(ratio, na.rm = TRUE)
  )
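
As a design note, the ALLNAMES/ALLREP/ALLRT/ALLMEN counts above can also be computed without attaching plyr. A minimal sketch for the first of them, assuming uniquenames stores the interacted-with handles as a delimited string (as the paste/collapse step implies) and DF is the data frame right after the year filter:

# plyr-free equivalent of the ALLNAMES step
ALLNAMES <- DF %>%
  select(twitter, uniquenames) %>%
  separate_rows(uniquenames) %>%   # same default separator as in the script above
  distinct() %>%
  count(twitter, name = "total_names")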
@@ -0,0 +1,18 @@
library(tidyverse)

# read data
df <- read_csv("twitter_accounts_of_journals.csv") %>%
  select(twitter, account_description) %>%
  distinct()

# accounts mentioning open access: 231 (6%)
df %>%
  filter(grepl("open access|\\boa\\b", account_description, ignore.case = T))

# accounts mentioning peer review or refereeing: 595 (15.6%)
df %>%
  filter(grepl("peer.review|\\breviewed\\b|refereed", account_description, ignore.case = T))

# accounts mentioning impact metrics (JIF, Impact Factor, CiteScore, citation claims, or a numeric score): 263 (6.8%)
df %>%
  filter(grepl("\\bJIF\\b|Impact.Factor|CiteScore|[0-9](\\.|,)[0-9][0-9]|most.cited|highly.cited", account_description, ignore.case = T))