add Twitter profiles and summary statistics

andreaspacher · Jun 9, 2022 · 6948306 · 6948306
1 parent 4ad37d6
commit 6948306
Show file tree

Hide file tree

Showing 13 changed files with 15,898 additions and 1 deletion.
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,4 @@
+.Rproj.user
+.Rhistory
+.RData
+.Ruserdata
diff --git a/README.md b/README.md
@@ -1,2 +1,5 @@
 # journals-on-twitter
-Dataset of the Twitter accoutns of scientific journals (based on the Web of Science indices SCIE, SSCI and AHCI)
+
+This repository contains a dataset of 3,485 Twitter accounts belonging to a sample of 13,821 distinct journals listed in Web of Science’s three major indices (SCIE, SSCI and AHCI).
+
+The dataset is available under a Creative Commons license (CC0).
diff --git a/Summary_Statistics/activity_of_journals.R b/Summary_Statistics/activity_of_journals.R
@@ -0,0 +1,168 @@
+library(tidyverse)
+
+# read data (file.path() keeps the relative path portable across platforms)
+DFF <- arrow::read_parquet(file.path("Summary_Statistics", "twitter_data.parquet"))
+
+Mode <- function(x) {  # modal value of x; the first-seen value wins ties
+  candidates <- unique(x)
+  hits <- tabulate(match(x, candidates))
+  candidates[which.max(hits)]
+}
+
+# keep only the rows for the years 2020 and 2021
+DF <- filter(DFF, year %in% c(2020, 2021))
+# library(plyr)
+# ALLNAMES <- plyr::ddply(DF, .(twitter), summarize, allnames = paste(uniquenames, collapse=";")) %>%
+#   separate_rows(allnames) %>%
+#   distinct() %>%
+#   select(twitter) %>%
+#   group_by(twitter) %>%
+#   count()
+# colnames(ALLNAMES) <- c("twitter", "total_names")
+# ALLREP <- plyr::ddply(DF, .(twitter), summarize, allnames_rep = paste(uniquenames_rep, collapse=";")) %>%
+#   separate_rows(allnames_rep) %>%
+#   distinct() %>%
+#   select(twitter) %>%
+#   group_by(twitter) %>%
+#   count()
+# colnames(ALLREP) <- c("twitter", "total_names_rep")
+# ALLRT <- plyr::ddply(DF, .(twitter), summarize, allnames_rt = paste(uniquenames_rt, collapse=";")) %>%
+#   separate_rows(allnames_rt) %>%
+#   distinct() %>%
+#   select(twitter) %>%
+#   group_by(twitter) %>%
+#   count()
+# colnames(ALLRT) <- c("twitter", "total_names_rt")
+# ALLMEN <- plyr::ddply(DF, .(twitter), summarize, allnames_mentions = paste(uniquenames_mentions, collapse=";")) %>%
+#   separate_rows(allnames_mentions) %>%
+#   distinct() %>%
+#   select(twitter) %>%
+#   group_by(twitter) %>%
+#   count()
+# colnames(ALLMEN) <- c("twitter", "total_names_mentions")
+# detach(package:plyr)
+DF <- DF %>%
+  mutate(tweets_total = as.numeric(tweets_total)) %>%
+  select(-account_created_date, -year, -starts_with("uniquenames")) %>%
+  group_by(twitter) %>%
+  mutate(totaltw = sum(tweets_total))  # per-account sum over the kept rows
+
+# long format: one row per journal and WoS index whose flag column equals 1
+JJ <- read_csv("twitter_accounts_of_journals.csv") %>%
+  pivot_longer(cols = c("ahci", "ssci", "scie"),
+               names_to = "wos_index") %>%
+  filter(value == 1) %>%
+  select(-value)
+
+# attach the tweet totals, flag journals with an account, upper-case the index
+TT <- JJ %>%
+  left_join(DF) %>%
+  mutate(has_twitter = ifelse(is.na(twitter), 0, 1),
+         wos_index = case_when(
+           wos_index == "ahci" ~ "AHCI",
+           wos_index == "ssci" ~ "SSCI",
+           wos_index == "scie" ~ "SCIE",
+         ))
+
+
+# ===========
+# Summary Statistics
+# ===========
+# by index
+TT %>%
+  filter(!is.na(totaltw)) %>%
+  distinct(twitter, totaltw, wos_index) %>%
+  mutate(per_year = totaltw / 2) %>%  # halve the 2020-2021 total
+  group_by(wos_index) %>%
+  summarise(
+    avg = mean(per_year, na.rm = TRUE),
+    med = median(per_year, na.rm = TRUE),
+    mode = Mode(per_year),
+    sd = sd(per_year, na.rm = TRUE),
+    min = min(per_year, na.rm = TRUE),
+    q1 = quantile(per_year, probs = 0.25, na.rm = TRUE),
+    q3 = quantile(per_year, probs = 0.75, na.rm = TRUE),
+    max = max(per_year, na.rm = TRUE)
+  )
+
+# total
+TT %>%
+  filter(!is.na(totaltw)) %>%
+  distinct(twitter, totaltw) %>%
+  mutate(per_year = totaltw / 2) %>%  # halve the 2020-2021 total
+  summarise(
+    avg = mean(per_year, na.rm = TRUE),
+    med = median(per_year, na.rm = TRUE),
+    mode = Mode(per_year),
+    sd = sd(per_year, na.rm = TRUE),
+    min = min(per_year, na.rm = TRUE),
+    q1 = quantile(per_year, probs = 0.25, na.rm = TRUE),
+    q3 = quantile(per_year, probs = 0.75, na.rm = TRUE),
+    max = max(per_year, na.rm = TRUE)
+  )
+
+# most prolific accounts (tweets per year, highest first; opens the viewer)
+TT %>%
+  select(journal_title, totaltw) %>%
+  distinct() %>%
+  mutate(totaltw = totaltw / 2) %>%
+  arrange(desc(totaltw)) %>%
+  View()
+
+# Tweets per year, summed over the distinct accounts with a known total
+TT %>%
+  select(twitter, totaltw) %>%
+  filter(!is.na(totaltw)) %>%
+  distinct() %>%
+  mutate(totaltw = totaltw / 2) %>%
+  summarise(totaltw = sum(totaltw))
+
+# ===========
+# graph
+# ===========
+
+TT <- TT %>%
+  distinct(twitter, totaltw, wos_index)  # one row per account and index
+# facet annotations: median of totaltw / 2 (tweets per year) for each index
+ann_label <- data.frame(wos_index = c("AHCI", "SCIE", "SSCI"),
+                        label = c(
+                          median(TT$totaltw[TT$wos_index == "AHCI"], na.rm = TRUE) / 2,
+                          median(TT$totaltw[TT$wos_index == "SCIE"], na.rm = TRUE) / 2,
+                          median(TT$totaltw[TT$wos_index == "SSCI"], na.rm = TRUE) / 2
+                        ))
+ann_label$label <- paste0("Median: ", ann_label$label)
+
+TT %>%
+  # TT was reduced to distinct (twitter, totaltw, wos_index) rows just above,
+  # so it is plotted directly; totaltw / 2 converts the total to tweets per year
+  ggplot(aes(y = totaltw / 2)) +
+  geom_histogram(binwidth = 5) +
+  # geom_hline(yintercept = median(TT$totaltw, na.rm = T),
+  #            linetype = "dashed") +
+  geom_hline(data = filter(TT, wos_index == "AHCI"),  # dashed median per facet
+             aes(yintercept = median(totaltw / 2, na.rm = TRUE)),
+             linetype = "dashed") +
+  geom_hline(data = filter(TT, wos_index == "SCIE"),
+             aes(yintercept = median(totaltw / 2, na.rm = TRUE)),
+             linetype = "dashed") +
+  geom_hline(data = filter(TT, wos_index == "SSCI"),
+             aes(yintercept = median(totaltw / 2, na.rm = TRUE)),
+             linetype = "dashed") +
+  geom_text(data = ann_label,  # hand-placed label position for each facet
+            aes(label = label), x = 50, y = ifelse(ann_label$wos_index == "SCIE", 270,
+                                                   ifelse(ann_label$wos_index == "SSCI", 200, 180)),
+            size = 3) +
+  xlab("Journals") +
+  ylab("Tweets per Year") +
+  theme_minimal() +
+  facet_wrap(~ wos_index, nrow = 3) +
+  theme(strip.text.x = element_text(size = 10, face = "bold",
+                                    margin = margin(t = 5))) +
+  scale_y_continuous(breaks = seq(0, 1500, by = 100)) +
+  coord_flip()
+
+# save the most recent plot; file.path() keeps the relative path portable
+ggsave(file.path("Graph", "active_journals_histogram_by_index.png"),
+       width = 6.8, height = 5.0,
+       units = "in",
+       dpi = 300)
diff --git a/Summary_Statistics/community_engagement.R b/Summary_Statistics/community_engagement.R
@@ -0,0 +1,152 @@
+library(tidyverse)
+
+# read data (file.path() keeps the relative path portable across platforms)
+DFF <- arrow::read_parquet(file.path("Summary_Statistics", "twitter_data.parquet"))
+
+# keep only the rows for the years 2020 and 2021
+DF <- filter(DFF, year %in% c(2020, 2021))
+library(plyr)  # NOTE(review): attached after tidyverse, so it presumably masks dplyr verbs such as count() until the detach() below -- confirm
+ALLNAMES <- plyr::ddply(DF, .(twitter), summarize, allnames = paste(uniquenames, collapse=";")) %>%  # per account: join its uniquenames strings with ";"
+  separate_rows(allnames) %>%  # NOTE(review): no sep given, so tidyr's default separator applies -- verify that handles containing "_" are not split
+  distinct() %>%  # keep each (twitter, name) pair once
+  select(twitter) %>%
+  group_by(twitter) %>%
+  count()  # rows per account, i.e. distinct names per account
+colnames(ALLNAMES) <- c("twitter", "total_names")
+ALLREP <- plyr::ddply(DF, .(twitter), summarize, allnames_rep = paste(uniquenames_rep, collapse=";")) %>%  # same steps for uniquenames_rep
+  separate_rows(allnames_rep) %>%
+  distinct() %>%
+  select(twitter) %>%
+  group_by(twitter) %>%
+  count()
+colnames(ALLREP) <- c("twitter", "total_names_rep")
+ALLRT <- plyr::ddply(DF, .(twitter), summarize, allnames_rt = paste(uniquenames_rt, collapse=";")) %>%  # same steps for uniquenames_rt
+  separate_rows(allnames_rt) %>%
+  distinct() %>%
+  select(twitter) %>%
+  group_by(twitter) %>%
+  count()
+colnames(ALLRT) <- c("twitter", "total_names_rt")
+ALLMEN <- plyr::ddply(DF, .(twitter), summarize, allnames_mentions = paste(uniquenames_mentions, collapse=";")) %>%  # same steps for uniquenames_mentions
+  separate_rows(allnames_mentions) %>%
+  distinct() %>%
+  select(twitter) %>%
+  group_by(twitter) %>%
+  count()
+colnames(ALLMEN) <- c("twitter", "total_names_mentions")
+detach(package:plyr)  # drop plyr from the search path again before the dplyr pipeline below
+DF$tweets_total <- as.numeric(DF$tweets_total)  # coerce the tweet counts to numeric before summing
+DF <- DF %>%
+  select(-account_created_date, -year, -starts_with("uniquenames")) %>%
+  group_by(twitter) %>%
+  mutate(totaltw = sum(tweets_total),  # per-account sums over the rows kept by the year filter
+         total_mentions = sum(unique_mentions),
+         total_rt = sum(unique_rt),
+         total_rep = sum(unique_rep)
+  ) %>%
+  #filter(totaltw >= 50) %>%
+  select(-tweets_total, -unique_mentions, -unique_rt, -unique_rep) %>%  # drop the per-row inputs of the sums
+  distinct() %>%  # presumably leaves one row per account once the per-row columns are gone -- confirm
+  left_join(ALLNAMES) %>%  # no `by` given: joins on the columns shared by both tables
+  left_join(ALLMEN) %>%
+  left_join(ALLREP) %>%
+  left_join(ALLRT) %>%
+  group_by(twitter) %>%
+  mutate(ratio = round(total_names / totaltw, 2),  # name count divided by tweet total, rounded to 2 decimals
+         ratio_mentions = round(total_names_mentions / totaltw, 2),
+         ratio_rep = round(total_names_rep / totaltw, 2),
+         ratio_rt = round(total_names_rt / totaltw, 2)
+  )
+rm(ALLNAMES, ALLMEN, ALLREP, ALLRT)  # the helper tables are no longer needed
+
+# long format: one row per journal and WoS index whose flag column equals 1
+JJ <- read_csv("twitter_accounts_of_journals.csv") %>%
+  pivot_longer(cols = c("ahci", "ssci", "scie"),
+               names_to = "wos_index") %>%
+  filter(value == 1) %>%
+  select(-value)
+
+# attach the per-account metrics, flag journals that have an account, then
+# add each account's creation date taken from the unfiltered data
+TT <- JJ %>%
+  left_join(DF) %>%
+  mutate(has_twitter = ifelse(is.na(twitter), 0, 1)) %>%
+  left_join(
+    distinct(select(DFF, twitter, account_created_date))
+  )
+
+# prepare data: for each index, the journals that have a Twitter account,
+# reduced to the columns used by the plot
+ahci <- TT %>%
+  filter(wos_index == "ahci", !is.na(twitter)) %>%
+  select(twitter, ratio, wos_index)
+
+ssci <- TT %>%
+  filter(wos_index == "ssci", !is.na(twitter)) %>%
+  select(twitter, ratio, wos_index)
+
+scie <- TT %>%
+  filter(wos_index == "scie", !is.na(twitter)) %>%
+  select(twitter, ratio, wos_index)
+
+# create graph: one boxplot of the ratio per WoS index, stacked in one column
+# NOTE(review): limits = c(0, 5) likely drops ratios > 5 before stats -- confirm
+rbind(ahci, ssci, scie) %>%
+  mutate(wos_index = case_when(
+    wos_index == "ahci" ~ "AHCI",
+    wos_index == "ssci" ~ "SSCI",
+    wos_index == "scie" ~ "SCIE",
+  )) %>%
+  ggplot(aes(x = ratio)) +
+  geom_boxplot() +
+  scale_fill_manual(values = c("grey", "black", "white")) +
+  scale_x_continuous(limits = c(0, 5)) +
+  labs(x = "community engagement ratio") +
+  facet_wrap(~ wos_index, ncol = 1) +
+  theme_minimal() +
+  theme(axis.text.y = element_blank(),
+        legend.title = element_blank(),
+        strip.text.x = element_text(size = 10, face = "bold",
+                                    margin = margin(t = 5)))
+
+# save the most recent plot; file.path() keeps the relative path portable
+ggsave(file.path("Graph", "community_engagement.png"),
+       width = 6.5, height = 3,
+       units = "in",
+       dpi = 300)
+
+# Summary Statistics
+Mode <- function(x) {  # modal value of x; the first-seen value wins ties
+  counts <- tabulate(match(x, unique(x)))
+  unique(x)[which.max(counts)]
+}
+
+# by index: distribution of the engagement ratio over rows that have both
+# an account and a non-missing ratio
+TT %>%
+  filter(!is.na(twitter), !is.na(ratio)) %>%
+  group_by(wos_index) %>%
+  summarise(
+    avg = mean(ratio, na.rm = TRUE),
+    med = median(ratio, na.rm = TRUE),
+    mode = Mode(ratio),
+    sd = sd(ratio, na.rm = TRUE),
+    min = min(ratio, na.rm = TRUE),
+    q1 = quantile(ratio, probs = 0.25, na.rm = TRUE),
+    q3 = quantile(ratio, probs = 0.75, na.rm = TRUE),
+    max = max(ratio, na.rm = TRUE)
+  )
+# total: the same statistics over all rows that have both an account and
+# a non-missing ratio, without splitting by index
+TT %>%
+  filter(!is.na(twitter), !is.na(ratio)) %>%
+  summarise(
+    avg = mean(ratio, na.rm = TRUE),
+    med = median(ratio, na.rm = TRUE),
+    mode = Mode(ratio),
+    sd = sd(ratio, na.rm = TRUE),
+    min = min(ratio, na.rm = TRUE),
+    q1 = quantile(ratio, probs = 0.25, na.rm = TRUE),
+    q3 = quantile(ratio, probs = 0.75, na.rm = TRUE),
+    max = max(ratio, na.rm = TRUE)
+  )
diff --git a/Summary_Statistics/profile_descriptions.R b/Summary_Statistics/profile_descriptions.R
@@ -0,0 +1,18 @@
+library(tidyverse)
+
+# read data: distinct (account, profile description) pairs
+df <- read_csv("twitter_accounts_of_journals.csv") %>%
+  select(twitter, account_description) %>%
+  distinct()
+
+# 231 (6%): descriptions mentioning open access
+df %>%
+  filter(grepl("open access|\\boa\\b", account_description, ignore.case = TRUE))
+
+# 595 (15.6%): peer review -- NOTE(review): count predates ignore.case = TRUE (it had no value); re-check
+df %>%
+  filter(grepl("peer.review|\\breviewed\\b|refereed", account_description, ignore.case = TRUE))
+
+# 263 (6.8%): descriptions mentioning impact or citation metrics
+df %>%
+  filter(grepl("\\bJIF\\b|Impact.Factor|CiteScore|[0-9](\\.|,)[0-9][0-9]|most.cited|highly.cited", account_description, ignore.case = TRUE))