-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathePANDDA_app.R
254 lines (206 loc) · 12 KB
/
ePANDDA_app.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
# Record the script start time (printed for logging purposes)
Start<-print(Sys.time())
######################################### Load Required Libraries ###########################################
# Install (if necessary) and attach each required package.
# A single loop replaces five copies of the same require/install/library pattern.
RequiredPackages<-c("RCurl","RJSONIO","stringdist","doParallel","plyr")
for (Package in RequiredPackages) {
    # require() returns FALSE (rather than erroring) when the package is unavailable,
    # which is what allows the install-then-attach fallback below
    if (require(Package,character.only=TRUE,warn.conflicts=FALSE)==FALSE) {
        install.packages(Package,repos="http://cran.cnr.berkeley.edu/");
        library(Package,character.only=TRUE);
    }
}
# Start a cluster for multicore; 3 workers by default, or the number passed as the first command line argument
CommandArgument<-commandArgs(TRUE)
if (length(CommandArgument)==0) {
Cluster<-makeCluster(3)
} else {
Cluster<-makeCluster(as.numeric(CommandArgument[1]))
}
#############################################################################################################
######################################### DATA DOWNLOAD, EPANDDA ############################################
#############################################################################################################
# No functions at this time.
############################################ Download Datasets from API ####################################
# Status update with a timestamp
print(paste("download PBDB refs",Sys.time()))
# Allow up to five minutes for the (large) reference download to complete
options(timeout=300)
# Pull every collection reference from the Paleobiology Database API as raw csv text
RawPBDBCSV<-RCurl::getURL("https://paleobiodb.org/data1.2/colls/refs.csv?all_records")
PBDBRefs<-read.csv(text=RawPBDBCSV,header=TRUE)
# Keep only the needed columns and map the PBDB names onto the pbdb_* names used throughout this script
PBDBColumns<-c(pbdb_no="reference_no",pbdb_author="author1last",pbdb_year="pubyr",pbdb_title="reftitle",pbdb_pubtitle="pubtitle")
PBDBRefs<-PBDBRefs[,unname(PBDBColumns)]
colnames(PBDBRefs)<-names(PBDBColumns)
# Coerce each column to its expected type (read.csv may have produced factors)
for (NumericColumn in c("pbdb_no","pbdb_year")) {
    PBDBRefs[,NumericColumn]<-as.numeric(as.character(PBDBRefs[,NumericColumn]))
}
for (CharacterColumn in c("pbdb_author","pbdb_title","pbdb_pubtitle")) {
    PBDBRefs[,CharacterColumn]<-as.character(PBDBRefs[,CharacterColumn])
}
# Drop references whose title is too short (fewer than 3 characters) to be meaningful
PBDBRefs<-subset(PBDBRefs,nchar(PBDBRefs[,"pbdb_title"])>2)
# Remember the starting directory so we can return to it for output later
CurrentDirectory<-getwd()
# Status update with a timestamp
print(paste("download DD refs",Sys.time()))
# Move to the input folder
setwd(paste(CurrentDirectory,"/input",sep=""))
# Parse the GeoDeepDive bibliography from the input.bibjson file into a list
GDDRefs<-fromJSON("input.bibjson") # if testing: "~/Documents/DeepDive/ePANDDA/EPANDDA_app-master/input/input.bibjson"
# Helper to pull one single-valued field out of every BibJSON record in parallel
ExtractGDDField<-function(Refs,Field) {
    parSapply(Cluster,Refs,function(x) x[[Field]])
}
gdd_id<-ExtractGDDField(GDDRefs,"_gddid")
# Authors are stored as a list per record, so collapse each into one space-separated string
gdd_author<-parSapply(Cluster,GDDRefs,function(x) paste(unlist(x[["author"]]),collapse=" "))
gdd_year<-ExtractGDDField(GDDRefs,"year")
gdd_title<-ExtractGDDField(GDDRefs,"title")
gdd_pubtitle<-ExtractGDDField(GDDRefs,"journal")
gdd_publisher<-ExtractGDDField(GDDRefs,"publisher")
# Assemble the extracted vectors into a data.frame formatted like PBDBRefs (overwrites the raw GDDRefs list)
GDDRefs<-as.data.frame(cbind(gdd_id,gdd_author,gdd_year,gdd_title,gdd_pubtitle, gdd_publisher),stringsAsFactors=FALSE)
# Coerce each column to its expected type
for (CharacterColumn in c("gdd_id","gdd_author","gdd_title","gdd_pubtitle","gdd_publisher")) {
    GDDRefs[,CharacterColumn]<-as.character(GDDRefs[,CharacterColumn])
}
GDDRefs[,"gdd_year"]<-as.numeric(as.character(GDDRefs[,"gdd_year"]))
# Relabel USGS "Bulletin" entries as "usgs bulletin" so they can match the PBDB convention
print(paste("usgs bulletin update",Sys.time()))
USGS_Bulletin<-which(GDDRefs[,"gdd_pubtitle"]=="Bulletin"&GDDRefs[,"gdd_publisher"]=="USGS")
GDDRefs[USGS_Bulletin,"gdd_pubtitle"]<-"usgs bulletin"
# Refresh the standalone gdd_pubtitle vector to reflect the USGS relabeling
gdd_pubtitle<-GDDRefs[,"gdd_pubtitle"]
# stringsim is case-sensitive (unlike grep with ignore.case), so lower-case all titles up front
for (TitleColumn in c("pbdb_title","pbdb_pubtitle")) {
    PBDBRefs[,TitleColumn]<-tolower(PBDBRefs[,TitleColumn])
}
for (TitleColumn in c("gdd_title","gdd_pubtitle")) {
    GDDRefs[,TitleColumn]<-tolower(GDDRefs[,TitleColumn])
}
# Record stats: the initial document counts for each source, reported in Stats.csv later
PBDBDocs<-nrow(PBDBRefs)
GDDDocs<-nrow(GDDRefs)
############################### HARMONIZE JOURNAL TITLES BETWEEN PBDB AND GDD ################################
print(paste("harmonize publication titles between pbdb and gdd",Sys.time()))
# Replace every spelling variant of "United States Geological Survey" in PBDB with "usgs";
# variants are applied in the original order, since gsub runs sequentially
USGSVariants<-c("u.s. geological survey","u. s. geological survey","u.s.g.s.","us geological survey","united states geological survey")
for (Variant in USGSVariants) {
    PBDBRefs[,"pbdb_pubtitle"]<-gsub(Variant,"usgs",PBDBRefs[,"pbdb_pubtitle"])
}
# Standardize Geobios titles to match GeoDeepDive
geobios<-which(PBDBRefs[,"pbdb_pubtitle"] %in% c("géobios","geobios mémoire spécial"))
PBDBRefs[geobios,"pbdb_pubtitle"]<-"geobios"
# Standardize Canadian Journal of Earth Sciences titles to match GeoDeepDive
canadian_journal<-which(PBDBRefs[,"pbdb_pubtitle"] %in% c("canadian journal of earth science","canadian journal earth science"))
PBDBRefs[canadian_journal,"pbdb_pubtitle"]<-"canadian journal of earth sciences"
#############################################################################################################
########################################## MATCH TITLES, EPANDDA ############################################
#############################################################################################################
# Given one PBDB title x and the full vector of GDD titles y, return c(index, score):
# the position of the most similar GDD title and its stringsim similarity (0-1).
matchTitle<-function(x,y) {
    Similarity<-stringdist::stringsim(x,y)
    return(c(which.max(Similarity),max(Similarity)))
}
############################################ Initial Title Match Script #####################################
# Status update
print(paste("perform title matches",Sys.time()))
# Make matchTitle and stringsim visible on the cluster workers
clusterExport(cl=Cluster,varlist=c("matchTitle","stringsim"))
# For each PBDB title, find the closest GDD title; parSapply returns a 2 x n matrix of index/score pairs
RawSimilarity<-parSapply(Cluster,PBDBRefs[,"pbdb_title"],matchTitle,GDDRefs[,"gdd_title"])
# Transpose into an n x 2 data.frame (V1 = GDD row index, V2 = similarity score)
TitleSimilarity<-as.data.frame(t(unname(RawSimilarity)))
# Attach the PBDB reference numbers, then translate GDD row indices into gdd_id values
InitialMatches<-cbind(PBDBRefs[,"pbdb_no"],TitleSimilarity)
InitialMatches[,"V1"]<-GDDRefs[InitialMatches[,"V1"],"gdd_id"]
colnames(InitialMatches)<-c("pbdb_no","gdd_id","title_sim")
# Attach the full GDD and PBDB bibliographic fields to every match
InitialMatches<-merge(InitialMatches,GDDRefs,by="gdd_id",all.x=TRUE)
InitialMatches<-merge(InitialMatches,PBDBRefs,by="pbdb_no",all.x=TRUE)
# Status update
print(paste("finish title matches",Sys.time()))
#############################################################################################################
########################################## MATCH FIELDS, EPANDDA ############################################
#############################################################################################################
# Compare additional bibliographic fields for a single candidate match. Receives one
# row of InitialMatches as a named character vector (how apply/parApply delivers rows).
# Returns a named vector: pbdb_no, gdd_id, title_sim, author_in, year_match, pubtitle_sim.
matchAdditional<-function(InitialMatches) {
    # Whether the publication year is identical
    Year<-InitialMatches["pbdb_year"]==InitialMatches["gdd_year"]
    # The similarity of the journal names; namespaced so cluster workers do not
    # depend on stringsim having been exported separately beforehand
    Journal<-stringdist::stringsim(InitialMatches["pbdb_pubtitle"],InitialMatches["gdd_pubtitle"])
    # Whether the first author's surname appears in the GDD author string.
    # Lower-case both sides and match with fixed=TRUE so surnames containing regex
    # metacharacters (periods, parentheses, etc.) are treated literally; the previous
    # perl-regex match could error or mismatch on such names.
    Author<-grepl(tolower(InitialMatches["pbdb_author"]),tolower(InitialMatches["gdd_author"]),fixed=TRUE)
    # Assemble the output vector with stable, named fields
    FinalOutput<-setNames(c(InitialMatches["pbdb_no"],InitialMatches["gdd_id"],InitialMatches["title_sim"],Author,Year,Journal),c("pbdb_no","gdd_id","title_sim","author_in","year_match","pubtitle_sim"))
    return(FinalOutput)
}
######################################### Match Additional Fields Script ####################################
print(paste("perform additional matches",Sys.time()))
# Reset the data types; columns are sometimes coerced to the incorrect data type for unknown reasons
NumericColumns<-c("pbdb_no","title_sim","gdd_year","pbdb_year")
CharacterColumns<-c("gdd_id","gdd_author","gdd_title","gdd_pubtitle","gdd_publisher","pbdb_author","pbdb_title","pbdb_pubtitle") # gdd_publisher is where the break was happening
for (Column in NumericColumns) {
    InitialMatches[,Column]<-as.numeric(InitialMatches[,Column])
}
for (Column in CharacterColumns) {
    InitialMatches[,Column]<-as.character(InitialMatches[,Column])
}
# export matchAdditional to the cluster
clusterExport(cl=Cluster,varlist=c("matchAdditional"))
# Compare the remaining bibliographic fields for every candidate match, one row at a time
MatchReferences<-parApply(Cluster, InitialMatches, 1, matchAdditional)
# Shut down the worker cluster now that all parallel work is done
stopCluster(Cluster)
# parApply returns one column per input row, so transpose back into row-per-match form
MatchReferences<-as.data.frame(t(MatchReferences),stringsAsFactors=FALSE)
print(paste("organize stats",Sys.time()))
# Record Stats
# Create a numeric vector of the title_sim column of MatchReferences
TitleSim<-as.numeric(as.character(MatchReferences[,"title_sim"]))
# Tally the similarity scores, binned by rounding DOWN to the nearest 0.1 (plyr::round_any with floor)
TitleSimTable<-table(round_any(TitleSim,0.1,floor))
# Collapse the bin labels and their counts into single space-separated strings for the stats file
RoundedTitleSim<-paste(names(TitleSimTable), collapse=" ")
Refs<-paste(TitleSimTable, collapse=" ")
# Create stat descriptions (one per row of the Stats table below, in the same order)
Descriptions<-c("Date","Initial number of PBDBRefs","Initial number of GDDRefs","Rounded title similarities","Number of references")
# Create date and time record for stats file
Date<-as.character(Sys.time())
# Bind stats into one column; rbind coerces all values to character
Stats<-rbind(Date,PBDBDocs,GDDDocs,RoundedTitleSim,Refs)
Stats<-as.data.frame(cbind(Stats,Descriptions),row.names=FALSE)
colnames(Stats)<-c("Stats","Descriptions")
print(Stats)
# Status Updates
print(paste("finish matches",Sys.time()))
print(paste("Writing Outputs",Sys.time()))
# Move to the output folder
setwd(file.path(CurrentDirectory,"output"))
# Clear any old output files
unlink("*")
# Write each results table to csv (with default row names, as before)
OutputTables<-list("DDRefs.csv"=GDDRefs,"PBDBRefs.csv"=PBDBRefs,"MatchReferences.csv"=MatchReferences)
for (FileName in names(OutputTables)) {
    write.csv(OutputTables[[FileName]],FileName)
}
# The stats table is written without row names
write.csv(Stats,"Stats.csv",row.names=FALSE)
# Print the completion notice
print(paste("Complete",Sys.time()))