Skip to content

Commit

Permalink
add Twitter profiles and summary statistics
Browse files Browse the repository at this point in the history
  • Loading branch information
andreaspacher committed Jun 9, 2022
1 parent 4ad37d6 commit 6948306
Show file tree
Hide file tree
Showing 13 changed files with 15,898 additions and 1 deletion.
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
.Rproj.user
.Rhistory
.RData
.Ruserdata
5 changes: 4 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
@@ -1,2 +1,5 @@
# journals-on-twitter
Dataset of the Twitter accounts of scientific journals (based on the Web of Science indices SCIE, SSCI and AHCI)

Here is a dataset of 3,485 Twitter accounts pertaining to a sample of 13,821 distinct journals listed in Web of Science’s three major indices (SCIE, SSCI and AHCI).

The dataset is available under a Creative Commons license (CC0).
168 changes: 168 additions & 0 deletions Summary_Statistics/activity_of_journals.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,168 @@
library(tidyverse)

# read data
# The path is assembled with file.path() so the script also runs on systems
# where a backslash is not a directory separator.
DFF <- arrow::read_parquet(
  file.path("Summary_Statistics", "twitter_data.parquet")
)

# Modal value: the most frequent element of x.
# On ties, the value that appears first in x is returned.
Mode <- function(x) {
  distinct_values <- unique(x)
  # position of every element of x within the distinct values, then tallied
  occurrences <- tabulate(match(x, distinct_values))
  distinct_values[which.max(occurrences)]
}

# keep only the rows belonging to the years 2020 and 2021
DF <- filter(DFF, year %in% c(2020, 2021))
# library(plyr)
# ALLNAMES <- plyr::ddply(DF, .(twitter), summarize, allnames = paste(uniquenames, collapse=";")) %>%
# separate_rows(allnames) %>%
# distinct() %>%
# select(twitter) %>%
# group_by(twitter) %>%
# count()
# colnames(ALLNAMES) <- c("twitter", "total_names")
# ALLREP <- plyr::ddply(DF, .(twitter), summarize, allnames_rep = paste(uniquenames_rep, collapse=";")) %>%
# separate_rows(allnames_rep) %>%
# distinct() %>%
# select(twitter) %>%
# group_by(twitter) %>%
# count()
# colnames(ALLREP) <- c("twitter", "total_names_rep")
# ALLRT <- plyr::ddply(DF, .(twitter), summarize, allnames_rt = paste(uniquenames_rt, collapse=";")) %>%
# separate_rows(allnames_rt) %>%
# distinct() %>%
# select(twitter) %>%
# group_by(twitter) %>%
# count()
# colnames(ALLRT) <- c("twitter", "total_names_rt")
# ALLMEN <- plyr::ddply(DF, .(twitter), summarize, allnames_mentions = paste(uniquenames_mentions, collapse=";")) %>%
# separate_rows(allnames_mentions) %>%
# distinct() %>%
# select(twitter) %>%
# group_by(twitter) %>%
# count()
# colnames(ALLMEN) <- c("twitter", "total_names_mentions")
# detach(package:plyr)
# Add totaltw: the sum of tweets_total over all retained rows of each account.
# tweets_total is coerced to numeric before it is summed.
DF <- DF %>%
  mutate(tweets_total = as.numeric(tweets_total)) %>%
  select(-account_created_date, -year, -starts_with("uniquenames")) %>%
  group_by(twitter) %>%
  mutate(totaltw = sum(tweets_total)) %>%
  # drop the grouping so it does not leak into later steps
  ungroup()

# Journal list in long format: one row per journal and per Web of Science
# index column (ahci / ssci / scie) whose value is 1.
JJ <- read_csv("twitter_accounts_of_journals.csv") %>%
  pivot_longer(
    cols = c("ahci", "ssci", "scie"),
    names_to = "wos_index"
  ) %>%
  filter(value == 1) %>%
  select(-value)

# Attach the tweet data, flag journals that have a Twitter handle, and
# write the index names in upper case.
TT <- JJ %>%
  left_join(DF) %>%
  mutate(
    has_twitter = if_else(is.na(twitter), 0, 1),
    # wos_index can only be "ahci", "ssci" or "scie" here (see pivot_longer)
    wos_index = toupper(wos_index)
  )


# ===========
# Summary Statistics
# ===========

# Descriptive statistics of totaltw for the (possibly grouped) data frame
# `data`. Every figure is halved to express the two-year total (2020 and
# 2021) as tweets per year.
describe_totaltw <- function(data) {
  summarise(
    data,
    avg = mean(totaltw, na.rm = TRUE) / 2,
    med = median(totaltw, na.rm = TRUE) / 2,
    mode = Mode(totaltw) / 2,
    sd = sd(totaltw, na.rm = TRUE) / 2,
    min = min(totaltw, na.rm = TRUE) / 2,
    q1 = quantile(totaltw, probs = 0.25, na.rm = TRUE) / 2,
    q3 = quantile(totaltw, probs = 0.75, na.rm = TRUE) / 2,
    max = max(totaltw, na.rm = TRUE) / 2
  )
}

# by index (each account counted once per index)
TT %>%
  filter(!is.na(totaltw)) %>%
  distinct(twitter, totaltw, wos_index) %>%
  group_by(wos_index) %>%
  describe_totaltw()

# total (each account counted once)
TT %>%
  filter(!is.na(totaltw)) %>%
  distinct(twitter, totaltw) %>%
  describe_totaltw()

# most prolific accounts: journal titles ordered by halved totaltw,
# largest first, shown in the data viewer
TT %>%
  transmute(journal_title, totaltw = totaltw / 2) %>%
  arrange(desc(totaltw)) %>%
  distinct() %>%
  View()

# Tweets per year: sum of the halved totals, each account counted once
TT %>%
  filter(!is.na(totaltw)) %>%
  distinct(twitter, totaltw) %>%
  summarise(totaltw = sum(totaltw / 2))

# ===========
# graph
# ===========

# one row per account and index
TT <- TT %>%
  distinct(twitter, totaltw, wos_index)

# Per-index median of tweets per year (totaltw / 2), together with the label
# text and the fixed position of the label along the tweets axis.
ann_label <- TT %>%
  group_by(wos_index) %>%
  summarise(median_tw = median(totaltw / 2, na.rm = TRUE)) %>%
  mutate(
    label = paste0("Median: ", median_tw),
    label_y = case_when(
      wos_index == "SCIE" ~ 270,
      wos_index == "SSCI" ~ 200,
      TRUE ~ 180
    )
  )

# Histogram of tweets per year, one facet per index. The histogram is built
# on the y aesthetic and then flipped, so tweets end up on the horizontal axis.
TT %>%
  ggplot(aes(y = totaltw / 2)) +
  geom_histogram(binwidth = 5) +
  # a single dashed median line per facet (one row per index in ann_label)
  geom_hline(
    data = ann_label,
    aes(yintercept = median_tw),
    linetype = "dashed"
  ) +
  geom_text(
    data = ann_label,
    aes(y = label_y, label = label),
    x = 50,
    size = 3
  ) +
  xlab("Journals") +
  ylab("Tweets per Year") +
  theme_minimal() +
  facet_wrap(~ wos_index, nrow = 3) +
  theme(
    strip.text.x = element_text(
      size = 10,
      face = "bold",
      margin = margin(t = 5)
    )
  ) +
  scale_y_continuous(breaks = seq(0, 1500, by = 100)) +
  coord_flip()

# Save the last plot. file.path() keeps the path valid on systems where a
# backslash is not a directory separator.
ggsave(
  file.path("Graph", "active_journals_histogram_by_index.png"),
  width = 6.8,
  height = 5.0,
  units = "in",
  dpi = 300
)
152 changes: 152 additions & 0 deletions Summary_Statistics/community_engagement.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,152 @@
library(tidyverse)

# read data
# The path is assembled with file.path() so the script also runs on systems
# where a backslash is not a directory separator.
DFF <- arrow::read_parquet(
  file.path("Summary_Statistics", "twitter_data.parquet")
)

# keep only the rows belonging to the years 2020 and 2021
DF <- DFF %>%
  filter(year %in% c(2020, 2021))
# Count the distinct entries found in one of the "uniquenames" columns for
# every account. Written with dplyr/tidyr only, so plyr does not have to be
# attached (attaching it after dplyr masks verbs such as summarise() and
# count()).
#
# data       data frame with a `twitter` column and the column `names_col`
# names_col  name (string) of the column whose entries are counted
# out_col    name (string) given to the resulting count column
# Returns one row per `twitter` value with the count stored in `out_col`.
count_unique_names <- function(data, names_col, out_col) {
  data %>%
    group_by(twitter) %>%
    # concatenate all rows of an account into one string
    summarise(allnames = paste(.data[[names_col]], collapse = ";")) %>%
    # NOTE(review): separate_rows() runs with its default separator, which
    # splits on every run of characters that are neither alphanumeric nor
    # ".", so an entry containing "_" is split into several pieces. Confirm
    # this matches the format of the uniquenames columns, or pass sep = ";".
    separate_rows(allnames) %>%
    distinct() %>%
    count(twitter, name = out_col)
}

ALLNAMES <- count_unique_names(DF, "uniquenames", "total_names")
ALLREP <- count_unique_names(DF, "uniquenames_rep", "total_names_rep")
ALLRT <- count_unique_names(DF, "uniquenames_rt", "total_names_rt")
ALLMEN <- count_unique_names(
  DF,
  "uniquenames_mentions",
  "total_names_mentions"
)
# Collapse DF to per-account totals, attach the distinct-name counts and
# derive the ratios (distinct names divided by total tweets, rounded to two
# decimals).
DF <- DF %>%
  mutate(tweets_total = as.numeric(tweets_total)) %>%
  select(-account_created_date, -year, -starts_with("uniquenames")) %>%
  group_by(twitter) %>%
  mutate(
    totaltw = sum(tweets_total),
    total_mentions = sum(unique_mentions),
    total_rt = sum(unique_rt),
    total_rep = sum(unique_rep)
  ) %>%
  # the remaining steps are row-wise, so the grouping is dropped here and
  # does not leak into later code
  ungroup() %>%
  # filter(totaltw >= 50) %>%
  # remove the per-row columns so that distinct() can collapse duplicates
  select(-tweets_total, -unique_mentions, -unique_rt, -unique_rep) %>%
  distinct() %>%
  left_join(ALLNAMES, by = "twitter") %>%
  left_join(ALLMEN, by = "twitter") %>%
  left_join(ALLREP, by = "twitter") %>%
  left_join(ALLRT, by = "twitter") %>%
  mutate(
    ratio = round(total_names / totaltw, 2),
    ratio_mentions = round(total_names_mentions / totaltw, 2),
    ratio_rep = round(total_names_rep / totaltw, 2),
    ratio_rt = round(total_names_rt / totaltw, 2)
  )
rm(ALLNAMES, ALLMEN, ALLREP, ALLRT)

# Journal list in long format: one row per journal and per Web of Science
# index column (ahci / ssci / scie) whose value is 1.
JJ <- read_csv("twitter_accounts_of_journals.csv") %>%
  pivot_longer(
    cols = c("ahci", "ssci", "scie"),
    names_to = "wos_index"
  ) %>%
  filter(value == 1) %>%
  select(-value)

# Attach the per-account figures, flag journals that have a Twitter handle,
# and add the account creation date taken from the unfiltered data.
TT <- JJ %>%
  left_join(DF) %>%
  mutate(has_twitter = if_else(is.na(twitter), 0, 1)) %>%
  left_join(distinct(DFF, twitter, account_created_date))

# create graph: one boxplot of the ratio per index, for all rows of TT that
# have a Twitter handle
TT %>%
  filter(!is.na(twitter)) %>%
  select(twitter, ratio, wos_index) %>%
  # wos_index can only be "ahci", "ssci" or "scie" here (see pivot_longer)
  mutate(wos_index = toupper(wos_index)) %>%
  ggplot(aes(x = ratio)) +
  geom_boxplot() +
  # NOTE(review): scale limits remove ratios outside [0, 5] before the
  # boxplot statistics are computed; use coord_cartesian(xlim = c(0, 5))
  # instead if the intent is only to zoom.
  scale_x_continuous(limits = c(0, 5)) +
  theme_minimal() +
  theme(axis.text.y = element_blank()) +
  xlab("community engagement ratio") +
  facet_wrap(~ wos_index, ncol = 1) +
  theme(
    strip.text.x = element_text(
      size = 10,
      face = "bold",
      margin = margin(t = 5)
    )
  )

# Save the last plot. file.path() keeps the path valid on systems where a
# backslash is not a directory separator.
ggsave(
  file.path("Graph", "community_engagement.png"),
  width = 6.5,
  height = 3,
  units = "in",
  dpi = 300
)

# Summary Statistics

# Modal value: the most frequent element of x.
# On ties, the value that appears first in x is returned.
Mode <- function(x) {
  candidates <- unique(x)
  # index of every element of x within the candidates, then tallied
  tally <- tabulate(match(x, candidates))
  candidates[which.max(tally)]
}

# Descriptive statistics of `ratio` for the (possibly grouped) data frame
# `data`.
describe_ratio <- function(data) {
  summarise(
    data,
    avg = mean(ratio, na.rm = TRUE),
    med = median(ratio, na.rm = TRUE),
    mode = Mode(ratio),
    sd = sd(ratio, na.rm = TRUE),
    min = min(ratio, na.rm = TRUE),
    q1 = quantile(ratio, probs = 0.25, na.rm = TRUE),
    q3 = quantile(ratio, probs = 0.75, na.rm = TRUE),
    max = max(ratio, na.rm = TRUE)
  )
}

# NOTE(review): TT holds one row per journal and index and is not
# de-duplicated by account here, so an account attached to several journals
# or indices contributes several times -- confirm this is intended.

# by index
TT %>%
  filter(!is.na(twitter), !is.na(ratio)) %>%
  group_by(wos_index) %>%
  describe_ratio()

# total
TT %>%
  filter(!is.na(twitter), !is.na(ratio)) %>%
  describe_ratio()
18 changes: 18 additions & 0 deletions Summary_Statistics/profile_descriptions.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
library(tidyverse)

# read data: distinct combinations of account and profile description
df <- read_csv("twitter_accounts_of_journals.csv") %>%
  select(twitter, account_description) %>%
  distinct()

# Descriptions mentioning open access
# 231 (6%)
df %>%
  filter(
    grepl(
      "open access|\\boa\\b",
      account_description,
      ignore.case = TRUE
    )
  )

# Descriptions mentioning peer review
# 595 (15.6%)
# NOTE(review): this search previously passed an empty `ignore.case =`, which
# falls back to the case-sensitive default; it is now case-insensitive like
# the other two searches, so the count above should be re-checked.
df %>%
  filter(
    grepl(
      "peer.review|\\breviewed\\b|refereed",
      account_description,
      ignore.case = TRUE
    )
  )

# Descriptions mentioning impact or citation metrics
# 263 (6.8%)
df %>%
  filter(
    grepl(
      "\\bJIF\\b|Impact.Factor|CiteScore|[0-9](\\.|,)[0-9][0-9]|most.cited|highly.cited",
      account_description,
      ignore.case = TRUE
    )
  )
Loading

0 comments on commit 6948306

Please sign in to comment.