-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathparse-html-response-codes.py
157 lines (137 loc) · 5.16 KB
/
parse-html-response-codes.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Nov 22 14:35:41 2022
Original/inspiration from https://github.com/mbelvadi/lcc_from_isbn, Copyright 2021 Melissa Belvadi Open Source License CC-BY-NC-SA granted
Changes: used OCLC Classify portion, simpliefied ISBN cleaning, and simplified to use an input file of ISBNs. Can use within Binder.
Examples for different response codes for an ISBN query:
http://classify.oclc.org/classify2/Classify?isbn=978157506752&summary=true
<response code="101"/>
Meaning: no match
http://classify.oclc.org/classify2/Classify?isbn=9789004335462&summary=true
<response code="0"/> or <response code="2"/>
Retrieve most popular LCC from <recommendations><lcc><mostPopular holdings="673" nsfa="HX11.I46" sfa="HX11.I46"/>
http://classify.oclc.org/classify2/Classify?isbn=9781526109538&summary=true
<response code="4"/>
need to navigate to a subpage to retrieve the most popular LCC
If the response code is 102, OCLC is reporting an unexpected error; can check the record by hand.
"""
import logging
import isbnlib #https://pypi.org/project/isbnlib/
import requests #https://pypi.org/project/requests/
from requests.utils import requote_uri
import xml.dom.pulldom
import xml.dom.minidom
import xml.sax.saxutils
import pandas as pd
from isbnlib import canonical
UA = 'isbnlib (gzip)'
myheaders = {'User-Agent': UA}
base = 'http://classify.oclc.org/classify2/Classify?'
summaryBase = '&summary=true'
summaryInd = 'false'
logger = logging.getLogger(__name__)
global parmvalue
global parmtype
parmtype="isbn"
# OCLC Classify2 method
def get_oclc_data(parmtype, parmvalue=""):
global lcc_value
lcc_value = None
try:
nexturl = base + parmtype+"=" + requote_uri(parmvalue)+"&summary=true"
logger.debug("OCLC URL: {} ".format(nexturl))
except Exception as ue:
logger.error("OCLC URL encode failed: {}".format(ue))
return None
else:
try:
r = requests.get(nexturl, headers=myheaders)
if not r.ok:
logger.error("OCLC Request returned http error: {}".format(r.status_code))
return None
except Exception as e:
logger.error("OCLC URL request failed: {}".format(e))
return None
else:
wq = r.text
xdoc = xml.dom.minidom.parseString(wq)
response = xdoc.getElementsByTagName('response')[0]
respCode = response.attributes["code"].value
if respCode == '0' or respCode == '2':
recommendations = xdoc.getElementsByTagName('recommendations')[0]
if recommendations:
if len(xdoc.getElementsByTagName('lcc')) > 0:
local_lcc = recommendations.getElementsByTagName('lcc')[0]
if local_lcc:
for mostPopular in local_lcc.getElementsByTagName('mostPopular'):
nsfa = mostPopular.attributes["nsfa"].value
lcc_value = nsfa
elif respCode == '4':
works = xdoc.getElementsByTagName('works')[0]
logger.debug('Works found: ' + str(len(works.getElementsByTagName('work'))))
for work in works.getElementsByTagName('work'):
try:
m_wi = work.attributes["wi"].value
except:
continue
else:
try:
schemes = work.attributes["schemes"].value
except:
continue
if 'LCC' in schemes:
logger.debug(f'going to try to get lcc using wi {m_wi}')
lcc_value = get_oclc_data('wi',m_wi)
break
elif respCode != '102':
logger.error("OCLC reporting odd error {}, check by hand: {}".format(respCode,nexturl))
if lcc_value:
return lcc_value
else:
return None
def validate_json(data):
if str(data) == "":
logger.error("validate_json: returns False because no data in passed string: {}".format(str(data)))
return False
return True
def fix_isbn(isbn):
lib_isbn = isbnlib.canonical(isbn)
if len(lib_isbn) in (10, 13):
if len(lib_isbn) == 10:
isgood = isbnlib.is_isbn10(lib_isbn)
else:
isgood = isbnlib.is_isbn13(lib_isbn)
if isgood:
return lib_isbn
if len(lib_isbn) < 10:
return None
lib_isbn = isbnlib.get_isbnlike(isbn)
if len(lib_isbn) < 10:
return None
lib_isbn = isbnlib.clean(lib_isbn)
if len(lib_isbn) < 10:
return None
lib_isbn = isbnlib.get_canonical_isbn(lib_isbn)
if len(lib_isbn) < 10:
return None
if not lib_isbn:
return None
if len(lib_isbn) in (10, 13):
if len(lib_isbn) == 10:
isgood = isbnlib.is_isbn10(lib_isbn)
else:
isgood = isbnlib.is_isbn13(lib_isbn)
else:
return None
if isgood:
return lib_isbn
else:
return None
# create new function to be used only if ISBNs need validation.
def clean_isbn(row):
x = canonical(row[0])
return get_oclc_data(parmtype, x)
df = pd.read_csv('isbns.csv', dtype="string", names=['isbn'])
df['lcc'] = df.apply(clean_isbn, axis=1)
df.to_csv('isbn-lcc.csv', encoding='utf-8', index=False)