-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathbib.py
469 lines (392 loc) · 15.8 KB
/
bib.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
#! /usr/bin/python3
# System imports
from datetime import datetime
import argparse
import warnings
import sys
# Package imports
from bibtexparser.bparser import BibTexParser
from bibtexparser.customization import convert_to_unicode
# Inline markup delimiters used when building references. The output is
# kramdown, so Markdown delimiters are used instead of HTML tags: `em`
# wraps journal titles and `strong` wraps the "DOI:" label.
em = '_'
strong = '**'
# HTML tags placed around each part of a reference (title, authors,
# venue, DOI, comment) so that a kramdown class can be attached to it.
open_span = '<span>'
close_span = '</span>'
# First is to define a function to format the names we get from BibTeX,
# since this task will be the same for every paper type.
def reorder(names, faname):
    """Format the string of author names and return a string.

    Adapted from one of the `customization` functions in
    `bibtexparser`.

    INPUT:
    names -- string of names to be formatted. The expected BibTeX style
             is "Last, First Middle and Last, First Middle". Names
             without a comma are also accepted: "First Middle Last" is
             split at the last word, and a name wrapped entirely in
             braces (e.g. "{Some Organization}") is kept verbatim.
    faname -- string of the initialized name of the author to whom
              formatting will be applied (e.g. "F.M. Last"), or None to
              highlight nobody.
    OUTPUT:
    nameout -- string of formatted names. The current format is
               "F.M. Last", "F.M. Last and F.M. Last", or
               "F.M. Last, F.M. Last, and F.M. Last". An empty string is
               returned when `names` contains no names.

    A warning is issued (via `warnings.warn`) when `faname` is given but
    does not match any formatted name.
    """
    # Set the format tag for the website's owner, to highlight where on
    # the author list the website owner is. Default is **
    my_name_format_tag = '**'

    # Split the author string at the standalone word "and". Working on
    # whitespace-separated tokens (instead of splitting the raw string
    # at "and ") keeps names such as "Bertrand Arthur" intact and also
    # copes with newlines and repeated spaces in the BibTeX field.
    nameslist = []
    current = []
    for token in names.split():
        if token == 'and':
            nameslist.append(' '.join(current))
            current = []
        else:
            current.append(token)
    nameslist.append(' '.join(current))

    tidynames = []
    for namestring in nameslist:
        # Skip blank names (e.g. from a doubled or trailing "and").
        if not namestring:
            continue

        if ',' in namestring:
            # "Last, First Middle": split once at the right-most comma
            # so the family-name part may itself contain a comma.
            last, given = namestring.rsplit(',', 1)
        elif namestring.startswith('{') and namestring.endswith('}'):
            # A fully braced name is a literal (corporate) author and
            # must not be broken into given/family parts.
            last, given = namestring, ''
        else:
            # "First Middle Last": the final word is the family name.
            *given_words, last = namestring.split()
            given = ' '.join(given_words)

        last = last.strip().strip('{}')

        # Every given name contributes one initial. Periods and hyphens
        # both separate given names, so "Jean-Paul C." yields "J.P.C."
        # and "J.R.R." yields "J.R.R.".
        pieces = given.replace('.', ' ').replace('-', ' ').split()
        initials = ''.join(piece[0] + '.' for piece in pieces)

        if initials and last:
            tidynames.append(initials + ' ' + last)
        elif initials or last:
            tidynames.append(initials or last)

    # Find the case of the website author and set the format for that
    # name. Only the first matching name is highlighted.
    if faname is not None:
        try:
            i = tidynames.index(faname)
        except ValueError:
            warnings.warn("Couldn't find {} in the names list. Sorry!".format(faname))
        else:
            tidynames[i] = my_name_format_tag + tidynames[i] + my_name_format_tag

    # Join the names: a serial comma list for three or more authors, a
    # plain "A and B" for two, and the bare name (or '') otherwise.
    if len(tidynames) > 2:
        nameout = ', '.join(tidynames[:-1] + ['and ' + tidynames[-1]])
    elif len(tidynames) == 2:
        nameout = tidynames[0] + ' and ' + tidynames[1]
    else:
        nameout = ''.join(tidynames)

    return nameout
def journal_article(ref, faname):
    """Format one journal article as a kramdown reference block.

    INPUT:
    ref -- dictionary of BibTeX fields for the entry. "author", "title",
           "journal" and "year" are required (a KeyError is raised if
           one is missing); "volume", "number", "pages", "month", "doi"
           and "annote" are used when present.
    faname -- initialized name of the author to highlight, passed
              through to `reorder`; may be None.
    OUTPUT:
    reference -- string of kramdown markup for the article.
    """
    authors = reorder(ref["author"], faname)
    title = ref["title"]
    journal = ref["journal"]
    # Mendeley inserts an extra '\' before '&' in journal titles such as
    # 'Energy & Fuels'. Remove every escaped ampersand, not just the
    # first one.
    if '\\&' in journal:
        journal = journal.strip().replace('\\&', '&')

    # Start building the string containing the formatted reference.
    # Each bit is surrounded by a span. The {:.XYZ} is the kramdown
    # notation to add that class to the HTML tag. Each line ends with
    # two spaces before the newline so that kramdown inserts an HTML
    # <br> there.
    reference = (
        '\n{{:.paper}}\n{open}{title}{close}{{:.papertitle}}  \n'
        '{open}{authors}{close}{{:.authors}}  \n'
        '{open}{em}{journal}{em}, '.format(
            open=open_span, close=close_span, title=title,
            authors=authors, em=em, journal=journal,
        )
    )

    # Not all journal articles will have vol., no., and pp. because
    # some may be "In Press".
    for field, label in (("volume", 'vol. '), ("number", 'no. '),
                         ("pages", 'pp. ')):
        if field in ref:
            reference += label + ref[field] + ', '

    # The month is treated as an abbreviation and gets a trailing
    # period, except "May", which is already the full month name.
    date = ref["year"]
    if "month" in ref:
        month = ref["month"].title()
        date = month + (' ' if month == "May" else '. ') + date
    reference += '{date}{close}{{:.journal}}  \n'.format(
        date=date, close=close_span,
    )

    if "doi" in ref:
        reference += (
            '{open}{strong}DOI:{strong} [{doi}]'
            '(https://dx.doi.org/{doi}){close}{{:.doi}}  \n'.format(
                open=open_span, close=close_span, strong=strong,
                doi=ref["doi"],
            )
        )

    # Extra comments, such as links to files, should be stored as
    # "Notes" for each reference in Mendeley. Mendeley will export this
    # field with the tag "annote" in BibTeX. Backslashes are dropped
    # from the note before it is written.
    if "annote" in ref:
        reference += (
            '{open}{annote}{close}{{:.comment}}  \n'.format(
                open=open_span, close=close_span,
                annote=ref["annote"].replace('\\', ''),
            )
        )

    return reference
def in_proceedings(ref, faname):
    """Format one conference paper or poster as a kramdown reference block.

    INPUT:
    ref -- dictionary of BibTeX fields for the entry. "author", "title",
           "year" and "booktitle" are required (a KeyError is raised if
           one is missing); "pages", "organization", "address", "month",
           "doi" and "annote" are used when present.
    faname -- initialized name of the author to highlight, passed
              through to `reorder`; may be None.
    OUTPUT:
    reference -- string of kramdown markup for the paper.
    """
    authors = reorder(ref["author"], faname)
    title = ref["title"]
    year = ref["year"]

    # Start building the reference string. Each part is wrapped in a
    # span with a kramdown class, and each line ends with two spaces
    # before the newline so that kramdown inserts an HTML <br> there.
    reference = (
        '\n{{:.paper}}\n{open}{title}{close}{{:.papertitle}}  \n'
        '{open}{authors}{close}{{:.authors}}  \n'
        '{open}'.format(
            open=open_span, close=close_span, title=title, authors=authors,
        )
    )

    # Since Mendeley doesn't allow customization of BibTeX output, the
    # "pages" field is hacked to contain the paper number for the
    # conference paper. Not every reference of this type has one.
    if "pages" in ref:
        reference += ref["pages"] + ', '

    # The conference title is stored in the "booktitle" field.
    reference += ref["booktitle"] + ', '

    for field in ("organization", "address"):
        if field in ref:
            reference += ref[field] + ', '

    # The month is treated as an abbreviation and gets a trailing
    # period, except "May", which is already the full month name.
    date = year
    if "month" in ref:
        month = ref["month"].title()
        date = month + (' ' if month == "May" else '. ') + year
    reference += '{date}{close}{{:.journal}}  \n'.format(
        date=date, close=close_span,
    )

    if "doi" in ref:
        reference += (
            '{open}{strong}DOI:{strong} [{doi}]'
            '(https://dx.doi.org/{doi}){close}{{:.doi}}  \n'.format(
                open=open_span, strong=strong, doi=ref["doi"],
                close=close_span,
            )
        )

    # Extra comments, such as links to files, should be stored as
    # "Notes" for each reference in Mendeley. Mendeley will export this
    # field with the tag "annote" in BibTeX. Backslashes are dropped
    # from the note before it is written.
    if "annote" in ref:
        reference += (
            '{open}{annote}{close}{{:.comment}}  \n'.format(
                open=open_span, annote=ref["annote"].replace('\\', ''),
                close=close_span,
            )
        )

    return reference
def thesis(ref, faname):
    """Format one thesis or dissertation as a kramdown reference block.

    INPUT:
    ref -- dictionary of BibTeX fields for the entry. "author", "title"
           and "year" are required (a KeyError is raised if one is
           missing); "school", "month" and "annote" are used when
           present.
    faname -- initialized name of the author to highlight, passed
              through to `reorder`; may be None.
    OUTPUT:
    reference -- string of kramdown markup for the thesis.
    """
    authors = reorder(ref["author"], faname)
    title = ref["title"]
    year = ref["year"]

    # Each part is wrapped in a span with a kramdown class, and each
    # line ends with two spaces before the newline so that kramdown
    # inserts an HTML <br> there.
    reference = (
        '\n{{:.paper}}\n{open}{title}{close}{{:.papertitle}}  \n'
        '{open}{authors}{close}{{:.authors}}  \n'
        '{open}'.format(
            open=open_span, close=close_span, title=title, authors=authors,
        )
    )

    if "school" in ref:
        reference += ref["school"] + ', '

    # The month is treated as an abbreviation and gets a trailing
    # period, except "May", which is already the full month name.
    if "month" in ref:
        month = ref["month"].title()
        reference += month + (' ' if month == "May" else '. ')

    # Plain concatenation here, so the kramdown class uses single braces.
    reference += year + close_span + '{:.journal}  \n'

    # Extra comments are exported by Mendeley in the "annote" field.
    # Backslashes are dropped from the note before it is written.
    if "annote" in ref:
        reference += (
            '{open}{annote}{close}{{:.comment}}  \n'.format(
                open=open_span, annote=ref["annote"].replace('\\', ''),
                close=close_span,
            )
        )

    return reference
def load_bibtex(bib_file_name):
    """Parse a BibTeX file and group its entries by entry type.

    INPUT:
    bib_file_name -- path of the BibTeX file to read. The file is read
                     as UTF-8, matching the encoding used for the
                     kramdown output.
    OUTPUT:
    sort_dict -- dictionary whose keys are the entry types found in the
                 file (typically `article`, `inproceedings`, and
                 `phdthesis`) and whose values are lists of the entry
                 dictionaries of that type, ordered newest year first
                 and, within a year, by descending BibTeX ID.

    A KeyError is raised for an entry without a "year" field and a
    ValueError for a year that is not an integer.
    """
    # Open and parse the BibTeX file using `bibtexparser`.
    with open(bib_file_name, 'r', encoding='utf-8') as bib_file:
        bp = BibTexParser(bib_file.read(), customization=convert_to_unicode)

    # Get a dictionary of dictionaries of key, value pairs from the
    # BibTeX file. The structure is
    # {ID:{authors:...},ID:{authors:...}}.
    refsdict = bp.get_entry_dict()

    # Group the entries by their type in a single pass.
    sort_dict = {}
    for ref in refsdict.values():
        sort_dict.setdefault(ref["ENTRYTYPE"], []).append(ref)

    # Sort each group by year, then ID, both descending. One sort on a
    # composite key gives the same order as sorting by ID and then
    # stably re-sorting by year.
    for refs in sort_dict.values():
        refs.sort(key=lambda r: (int(r["year"]), r["ID"]), reverse=True)

    return sort_dict
def _write_section(out_file, refs, formatter, faname):
    """Write `refs` with `formatter`, adding a year heading whenever the year changes."""
    # Dummy starting value so the first reference always gets a heading.
    pubyear = ''
    for ref in refs:
        year = ref["year"]
        if year != pubyear:
            pubyear = year
            out_file.write('\n{{:.year}}\n### {}\n'.format(year))
        out_file.write(formatter(ref, faname))


def main(argv):
    """Convert a BibTeX file to a kramdown publications page.

    INPUT:
    argv -- list of command line arguments, without the program name.
            Recognized options are -b/--bibfile (default "refs.bib"),
            -o/--output (default "pubs.md") and -a/--author (the
            initialized name to highlight, e.g. "F.M. Last").

    The output file is overwritten. Journal articles are written first,
    then conference papers, then Ph.D. dissertations; a section whose
    entry type does not occur in the BibTeX file is skipped.
    """
    arg_parser = argparse.ArgumentParser(
        description=(
            "Convert a BibTeX file to kramdown output with optional author highlighting."
        ),
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    arg_parser.add_argument(
        "-b", "--bibfile",
        help="Set the filename of the BibTeX reference file.",
        default="refs.bib",
        type=str,
    )
    arg_parser.add_argument(
        "-o", "--output",
        help="Set the filename of the kramdown output.",
        default="pubs.md",
        type=str,
    )
    arg_parser.add_argument(
        "-a", "--author",
        help="Set the name of the author to be highlighted.",
        type=str,
    )
    args = arg_parser.parse_args(argv)
    faname = args.author

    sort_dict = load_bibtex(args.bibfile)

    # Open the output file with utf-8 encoding and write mode.
    # newline='' disables newline translation, so the '\n' written
    # below stays a Unix newline on every platform.
    with open(args.output, encoding='utf-8', mode='w',
              newline='') as out_file:
        # Front matter for the generated page.
        out_file.write("---\n")
        out_file.write("layout: page\n")
        out_file.write("title: Publications\n")
        out_file.write("permalink: /publications/\n")
        out_file.write("---\n\n")

        # Journal articles first, then conference papers and posters.
        # `.get` keeps a missing entry type from raising a KeyError.
        _write_section(out_file, sort_dict.get("article", []),
                       journal_article, faname)
        _write_section(out_file, sort_dict.get("inproceedings", []),
                       in_proceedings, faname)

        # Finally the theses and dissertations. The section heading is
        # written once, and only when there is something to list.
        theses = sort_dict.get("phdthesis", [])
        if theses:
            out_file.write("\nPh.D. Dissertation\n---\n")
            _write_section(out_file, theses, thesis, faname)
# Run the converter only when this file is executed as a script, passing
# along the command line arguments without the program name.
if __name__ == "__main__":
    main(argv=sys.argv[1:])