forked from mutronic/marcaroni
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathupdate-data.py
executable file
·58 lines (52 loc) · 2.31 KB
/
update-data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
#!/usr/local/bin/python3
# vim: set expandtab:
# vim: tabstop=2:
# vim: ai:
# vim: shiftwidth=2:
from marcaroni import db
import re
# Bibliographic rows whose 020 / 035 subfields a / z may be used for matching.
QUERY = (
  "SELECT bre.id, bre.source, rfr.value, rfr.tag, rfr.subfield "
  "FROM biblio.record_entry bre "
  "JOIN metabib.real_full_rec rfr ON bre.id = rfr.record "
  "WHERE not bre.deleted "
  "AND (rfr.tag = '020' OR rfr.tag = '035') "
  "AND (rfr.subfield = 'a' OR rfr.subfield = 'z') "
  "and bre.source is not NULL"
)


def clean_identifier(value, tag):
  """Return the cleaned identifier for one row, or None to skip the row.

  value -- raw field value from the query; None (SQL NULL) is skipped.
  tag   -- field tag; compared as a string ('020', '856', anything else).

  Rules:
    * Every value is stripped of surrounding whitespace, then of
      leading/trailing commas.
    * '020': keep the text before the first space, strip surrounding
      hyphens, then cut at the first 'ü', '(' or backslash.  Results
      shorter than 9 or longer than 14 characters are rejected.
    * '856': remove everything up to and including ' ca login url '.
      NOTE(review): QUERY currently selects only 020 and 035, so this
      branch is not exercised by this script.
    * any other tag: rejected unless it contains at least one digit.
    * An empty result is always rejected.
  """
  if value is None:
    return None

  identifier = value.strip().strip(',')
  tag = str(tag)

  if tag == '020':
    identifier = identifier.split(' ')[0]
    identifier = identifier.strip('-')
    identifier = identifier.split('ü')[0]  ## Delete me when umlauts are fixed
    identifier = identifier.split('(')[0]
    identifier = identifier.split('\\')[0]
    ## Only consider matchable isbn strings that are between 9 and 14 chars.
    if len(identifier) < 9 or len(identifier) > 14:
      return None
  elif tag == '856':
    identifier = re.sub(r'.* ca login url ', '', identifier)
  elif not any(ch.isdigit() for ch in identifier):
    return None

  return identifier or None


def main():
  """Export cleaned identifiers from the database to bib-data.txt.

  Writes a header line followed by one
  'identifier,id,source,tag,subfield' line per usable row.  Exits with
  status 1 (after printing a message) if the connection or cursor cannot
  be obtained.
  """
  try:
    conn = db.connect()
    cur = conn.cursor()
  except Exception:
    print("Update data could not connect.")
    raise SystemExit(1)

  try:
    # shitty-isbns.txt is still created/truncated as before, but nothing is
    # written to it: the bad-ISBN logging that used it is disabled.
    with open('bib-data.txt', 'w') as output, open('shitty-isbns.txt', 'w'):
      output.write('identifier,id,source,tag,subfield\n')

      #debug
      print('Starting query...')
      cur.execute(QUERY)
      #debug
      print('Query finished. Cleaning data...')

      for row in cur:
        identifier = clean_identifier(row[2], row[3])
        if identifier is None:
          continue
        # NOTE(review): fields are joined without CSV quoting, so an
        # identifier containing a comma yields extra columns.
        fields = (identifier, row[0], row[1], row[3], row[4])
        output.write(','.join(map(str, fields)) + '\n')
  finally:
    # Release database resources even if the export fails part-way.
    cur.close()
    conn.close()

  #debug
  print('Done.')


if __name__ == "__main__":
  main()