-
Notifications
You must be signed in to change notification settings - Fork 355
/
Copy pathuri.py
410 lines (335 loc) · 12.4 KB
/
uri.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
"""
URIs are Unicode strings that represent the canonical name for any object in
ConceptNet. These can be used with the ConceptNet Web API, or referred to in a
Semantic Web application, by attaching the prefix:
http://api.conceptnet.io
For example, the English concept "book" has the URI '/c/en/book'. This concept
can be referred to, or retrieved, using this complete URI:
http://api.conceptnet.io/c/en/book
"""
def standardize_text(text, lowercase=True):
    """
    Retired entry point that is kept only so that old callers fail loudly.

    Both arguments are ignored. Calling this function always raises
    NotImplementedError with a message naming the replacement,
    `conceptnet5.nodes.preprocess_and_tokenize_text`.
    """
    message = (
        "This function has been superseded by "
        "conceptnet5.nodes.preprocess_and_tokenize_text."
    )
    raise NotImplementedError(message)
def join_uri(*pieces):
    """
    Glue `pieces` together into a URI, using slashes (/) as the separator.

    Leading and trailing slashes on each piece are acceptable; they are
    stripped before joining. The result always begins with a slash, and the
    stripped pieces are separated by a single slash. Calling `join_uri` with
    no pieces returns '/'.

    No text normalization happens here: the pieces do not have
    `preprocess_and_tokenize_text` applied to them. To get URIs in normal
    form, run `preprocess_and_tokenize_text` on every piece that represents
    arbitrary text before passing it in.

    >>> join_uri('/c', 'en', 'cat')
    '/c/en/cat'
    >>> join_uri('c', 'en', ' spaces ')
    '/c/en/ spaces '
    >>> join_uri('/r/', 'AtLocation/')
    '/r/AtLocation'
    >>> join_uri('/test')
    '/test'
    >>> join_uri('test')
    '/test'
    >>> join_uri('/test', '/more/')
    '/test/more'
    """
    trimmed = (piece.strip('/') for piece in pieces)
    return '/' + '/'.join(trimmed)
def concept_uri(lang, text, *more):
    """
    `concept_uri` builds a representation of a concept, which is a word or
    phrase of a particular language, which can participate in relations with
    other concepts, and may be linked to concepts in other languages.

    Every concept has an ISO language code and a text. It may also have a part
    of speech (pos), which is typically a single letter. If it does, it may
    have a disambiguation, a string that distinguishes it from other concepts
    with the same text.

    This function should be called as follows, where arguments after `text`
    are optional:

        concept_uri(lang, text, pos, disambiguation...)

    `text` and `disambiguation` should be strings that have already been run
    through `preprocess_and_tokenize_text`. If the first extra argument is not
    exactly one character long, it is assumed not to be a part of speech and
    every extra argument is dropped.

    Raises AssertionError if `text` or any disambiguation component contains
    a space. The error is raised explicitly (not with an `assert` statement)
    so that the check still runs when Python is started with `-O`.

    This is a low-level interface. See `standardized_concept_uri` in nodes.py
    for a more generally applicable function that also deals with special
    per-language handling.

    >>> concept_uri('en', 'cat')
    '/c/en/cat'
    >>> concept_uri('en', 'cat', 'n')
    '/c/en/cat/n'
    >>> concept_uri('en', 'cat', 'n', 'feline')
    '/c/en/cat/n/feline'
    >>> concept_uri('en', 'this is wrong')
    Traceback (most recent call last):
        ...
    AssertionError: 'this is wrong' is not in normalized form
    """
    if ' ' in text:
        raise AssertionError("%r is not in normalized form" % text)
    if more and len(more[0]) != 1:
        # We misparsed a part of speech; everything after the text is
        # probably junk
        more = ()
    for disambiguation in more[1:]:
        if ' ' in disambiguation:
            raise AssertionError(
                "%r is not in normalized form" % disambiguation
            )
    return join_uri('/c', lang, text, *more)
def compound_uri(op, args):
    """
    Build the URI of a compound structure from its operator and arguments.

    Some URIs represent an operator applied to a number of arguments.
    Examples are '/and' and '/or', which stand for a conjunction or
    disjunction over two or more URIs (which may themselves be compound), and
    the assertion structure '/a', whose arguments are a relation and two
    URIs.

    `op` is the operator with its slash included; `args` is an iterable of
    argument URIs, of any length. The argument list is wrapped in square
    bracket segments, which look like `/[/` and `/]/`, and the arguments are
    separated by `/,/` segments, so that compound URIs can be nested inside
    other compound URIs without ambiguity.

    >>> compound_uri('/nothing', [])
    '/nothing/[/]'
    >>> compound_uri('/a', ['/r/CapableOf', '/c/en/cat', '/c/en/sleep'])
    '/a/[/r/CapableOf/,/c/en/cat/,/c/en/sleep/]'
    """
    segments = [op, '[']
    for position, arg in enumerate(args):
        # A comma segment goes before every argument except the first.
        if position > 0:
            segments.append(',')
        segments.append(arg)
    segments.append(']')
    return join_uri(*segments)
def split_uri(uri):
    """
    Break a URI into the pieces found between its slashes.

    A string that does not begin with a slash is returned unchanged as the
    only element of the list. Otherwise all leading slashes are dropped
    before splitting, and a URI made of nothing but slashes gives an empty
    list.

    >>> split_uri('/c/en/cat/n/animal')
    ['c', 'en', 'cat', 'n', 'animal']
    >>> split_uri('/')
    []
    """
    if uri.startswith('/'):
        body = uri.lstrip('/')
        return body.split('/') if body else []
    return [uri]
def uri_prefix(uri, max_pieces=3):
    """
    Shorten a ConceptNet URI to at most its first `max_pieces` components.

    With the default of 3 components, this converts a disambiguated concept
    such as '/c/en/cat/n/animal' into its more general, ambiguous form
    '/c/en/cat'. URIs that already have `max_pieces` components or fewer keep
    all of them.

    A fully qualified URL (as judged by `is_absolute_url`) is returned
    untouched.

    >>> uri_prefix('/c/en/cat/n/animal')
    '/c/en/cat'
    >>> uri_prefix('/c/en/cat/n')
    '/c/en/cat'
    >>> uri_prefix('/c/en/cat')
    '/c/en/cat'
    >>> uri_prefix('/c/en')
    '/c/en'
    >>> uri_prefix('/c/en/cat', 2)
    '/c/en'
    >>> uri_prefix('http://en.wikipedia.org/wiki/Example')
    'http://en.wikipedia.org/wiki/Example'
    """
    if not is_absolute_url(uri):
        leading = split_uri(uri)[:max_pieces]
        uri = join_uri(*leading)
    return uri
def uri_prefixes(uri, min_pieces=2):
    """
    List the URIs that are prefixes of `uri`, shortest first.

    A prefix begins with the same path components as `uri` and must have at
    least `min_pieces` of them (2 by default). The full URI is itself
    included when it qualifies.

    If the URI has sub-parts grouped by square brackets, a prefix is only
    produced where every opened bracket has been closed, so only complete
    sub-parts appear in prefixes.

    An absolute URL (as judged by `is_absolute_url`) is returned as the only
    element of the list.

    >>> list(uri_prefixes('/c/en/cat/n/animal'))
    ['/c/en', '/c/en/cat', '/c/en/cat/n', '/c/en/cat/n/animal']
    >>> list(uri_prefixes('/test/[/group/one/]/[/group/two/]'))
    ['/test/[/group/one/]', '/test/[/group/one/]/[/group/two/]']
    >>> list(uri_prefixes('http://en.wikipedia.org/wiki/Example'))
    ['http://en.wikipedia.org/wiki/Example']
    """
    if is_absolute_url(uri):
        return [uri]

    seen = []
    found = []
    # Number of '[' pieces seen so far minus the number of ']' pieces; it is
    # zero exactly when both kinds have occurred equally often.
    bracket_balance = 0
    for piece in split_uri(uri):
        seen.append(piece)
        if piece == '[':
            bracket_balance += 1
        elif piece == ']':
            bracket_balance -= 1
        if len(seen) >= min_pieces and bracket_balance == 0:
            found.append(join_uri(*seen))
    return found
def parse_compound_uri(uri):
    """
    Given a compound URI, extract its operator and its list of arguments.

    Returns a tuple `(op, args)`: `op` is the part of the URI before the
    first `/[/` segment, and `args` is the list of argument URIs found
    between that bracket and the final `/]`. Arguments are separated by `/,/`
    segments; commas inside nested brackets belong to the nested compound URI
    and do not split the outer list.

    Raises ValueError if the URI has no components, does not end with `/]`,
    or contains no `/[/` segment. Raises AssertionError if the brackets
    inside the argument list are unbalanced; that error is raised explicitly
    so the check still runs when Python is started with `-O`.

    >>> parse_compound_uri('/nothing/[/]')
    ('/nothing', [])
    >>> parse_compound_uri('/a/[/r/CapableOf/,/c/en/cat/,/c/en/sleep/]')
    ('/a', ['/r/CapableOf', '/c/en/cat', '/c/en/sleep'])
    >>> parse_compound_uri('/or/[/and/[/s/one/,/s/two/]/,/and/[/s/three/,/s/four/]/]')
    ('/or', ['/and/[/s/one/,/s/two/]', '/and/[/s/three/,/s/four/]'])
    """
    pieces = split_uri(uri)
    # `split_uri` returns [] for a URI made only of slashes; without the
    # emptiness check, `pieces[-1]` would raise IndexError instead of the
    # ValueError used for every other malformed input.
    if not pieces or pieces[-1] != ']':
        raise ValueError("Compound URIs must end with /]")
    if '[' not in pieces:
        raise ValueError(
            "Compound URIs must contain /[/ at the beginning of the argument list"
        )
    list_start = pieces.index('[')
    op = join_uri(*pieces[:list_start])

    chunks = []
    current = []
    depth = 0

    # Split on commas, but not if they're within additional pairs of brackets.
    for piece in pieces[(list_start + 1) : -1]:
        if piece == ',' and depth == 0:
            chunks.append('/' + ('/'.join(current)).strip('/'))
            current = []
        else:
            current.append(piece)
            if piece == '[':
                depth += 1
            elif piece == ']':
                depth -= 1

    if depth != 0:
        raise AssertionError("Unmatched brackets in %r" % uri)
    if current:
        chunks.append('/' + ('/'.join(current)).strip('/'))
    return op, chunks
def parse_possible_compound_uri(op, uri):
    """
    Return the things that `uri` combines under the operator `op`.

    The AND and OR conjunctions can be expressed as compound URIs, but a
    conjunction of a single thing is written as just that thing's URI, not
    as a compound. This function handles both shapes: if `uri` starts with
    the operator (`op` is given without its leading slash, as in 'or'), the
    result is the argument list of the compound URI; otherwise the result is
    a one-element list holding `uri` itself.

    >>> parse_possible_compound_uri(
    ...    'or', '/or/[/and/[/s/one/,/s/two/]/,/and/[/s/three/,/s/four/]/]'
    ... )
    ['/and/[/s/one/,/s/two/]', '/and/[/s/three/,/s/four/]']
    >>> parse_possible_compound_uri('or', '/s/contributor/omcs/dev')
    ['/s/contributor/omcs/dev']
    """
    operator_prefix = '/' + op + '/'
    if not uri.startswith(operator_prefix):
        return [uri]
    _operator, arguments = parse_compound_uri(uri)
    return arguments
def conjunction_uri(*sources):
    """
    Make a URI representing a conjunction of sources that work together to
    provide an assertion. Duplicate sources are merged, and the remaining
    sources are sorted in lexicographic order.

    If only one distinct source remains, that source's URI is returned as is
    rather than being wrapped in an '/and' compound. This holds whether the
    source was passed once or several times.

    Raises ValueError when called with no sources.

    >>> conjunction_uri('/s/contributor/omcs/dev')
    '/s/contributor/omcs/dev'
    >>> conjunction_uri('/s/contributor/omcs/dev', '/s/contributor/omcs/dev')
    '/s/contributor/omcs/dev'
    >>> conjunction_uri('/s/rule/some_kind_of_parser', '/s/contributor/omcs/dev')
    '/and/[/s/contributor/omcs/dev/,/s/rule/some_kind_of_parser/]'
    """
    # De-duplicate before counting, so that repeated copies of one source do
    # not produce a degenerate one-element '/and' compound.
    unique_sources = sorted(set(sources))
    if not unique_sources:
        # Logically, a conjunction with 0 inputs represents 'True', a
        # proposition that cannot be denied. This could be useful as a
        # justification for, say, mathematical axioms, but when it comes to
        # ConceptNet, that kind of thing makes us uncomfortable and shouldn't
        # appear in the data.
        raise ValueError("Conjunctions of 0 things are not allowed")
    if len(unique_sources) == 1:
        return unique_sources[0]
    return compound_uri('/and', unique_sources)
def assertion_uri(rel, start, end):
    """
    Build the URI of an assertion: an '/a' compound URI whose arguments are
    the relation, the start node, and the end node, in that order.

    `rel` is checked with an `assert` to begin with '/r'.

    >>> assertion_uri('/r/CapableOf', '/c/en/cat', '/c/en/sleep')
    '/a/[/r/CapableOf/,/c/en/cat/,/c/en/sleep/]'
    """
    assert rel.startswith('/r'), rel
    triple = (rel, start, end)
    return compound_uri('/a', triple)
def is_concept(uri):
    """
    Tell whether `uri` is a concept URI, that is, whether it starts with
    '/c/'.

    >>> is_concept('/c/sv/klänning')
    True
    >>> is_concept('/x/en/ly')
    False
    >>> is_concept('/a/[/r/Synonym/,/c/ro/funcția_beta/,/c/en/beta_function/]')
    False
    """
    concept_prefix = '/c/'
    return uri.startswith(concept_prefix)
def is_relation(uri):
    """
    Tell whether `uri` is a relation URI, that is, whether it starts with
    '/r/'.

    >>> is_relation('/r/IsA')
    True
    >>> is_relation('/c/sv/klänning')
    False
    """
    relation_prefix = '/r/'
    return uri.startswith(relation_prefix)
def is_term(uri):
    """
    Tell whether `uri` is a term URI, that is, whether it starts with '/c/'
    or with '/x/'.

    >>> is_term('/c/sv/kostym')
    True
    >>> is_term('/x/en/ify')
    True
    >>> is_term('/a/[/r/RelatedTo/,/c/en/cake/,/c/en/flavor/]')
    False
    """
    term_prefixes = ('/c/', '/x/')
    return uri.startswith(term_prefixes)
def is_absolute_url(uri):
    """
    Tell whether `uri` should be treated as an absolute URL: it starts with
    either 'http' or 'cc:'.

    The 'cc:' case covers URLs pointing to Creative Commons licenses, which
    for Linked Data purposes are absolute URLs because they'll be resolved
    into full URLs.

    >>> is_absolute_url('http://fr.wiktionary.org/wiki/mįkká’e_uxpáðe')
    True
    >>> is_absolute_url('/c/fr/nouveau')
    False
    """
    absolute_prefixes = ('http', 'cc:')
    return uri.startswith(absolute_prefixes)
def get_uri_language(uri):
    """
    Extract the language code from a URI.

    For a term URI (see `is_term`), the language is the second component of
    the URI. For an assertion URI (one starting with '/a/'), the answer is
    the language of the assertion's second argument, which is its first
    concept. Any other kind of URI gives None.

    >>> get_uri_language('/a/[/r/RelatedTo/,/c/en/orchestra/,/c/en/symphony/]')
    'en'
    >>> get_uri_language('/c/pl/cześć')
    'pl'
    >>> get_uri_language('/x/en/able')
    'en'
    """
    if uri.startswith('/a/'):
        arguments = parse_possible_compound_uri('a', uri)
        # Argument 0 is the relation; argument 1 is the start node.
        return get_uri_language(arguments[1])
    if is_term(uri):
        return split_uri(uri)[1]
    return None
def uri_to_label(uri):
    """
    Turn a ConceptNet URI into a label to be used in nodes.

    The label is the last component of the URI with underscores replaced by
    spaces, so '/c/en/example' becomes 'example' and '/c/en/canary_islands'
    becomes 'canary islands'. Term URIs are first shortened with
    `uri_prefix`, which drops anything after the text. For an absolute URL,
    the label comes from whatever follows the final slash.

    A URI with fewer than three components has no label and gives '', unless
    it is a relation URI.

    >>> uri_to_label('/c/en/example')
    'example'
    >>> uri_to_label('/c/en/canary_islands')
    'canary islands'
    >>> uri_to_label('/c/en')
    ''
    >>> uri_to_label('/r/RelatedTo')
    'RelatedTo'
    >>> uri_to_label('http://wikidata.dbpedia.org/resource/Q89')
    'Q89'
    """
    if is_absolute_url(uri):
        last_segment = uri.split('/')[-1]
        return last_segment.replace('_', ' ')

    path = uri_prefix(uri) if is_term(uri) else uri
    components = split_uri(path)
    if len(components) >= 3 or is_relation(path):
        return components[-1].replace('_', ' ')
    return ''
class Licenses:
    """
    Namespace for license identifiers written in the 'cc:' form, which
    `is_absolute_url` treats as absolute URLs pointing to Creative Commons
    licenses.
    """

    # Judging by the name and value: Creative Commons Attribution ("by"), 4.0.
    cc_attribution = 'cc:by/4.0'
    # Judging by the name and value: Creative Commons Attribution-ShareAlike
    # ("by-sa"), 4.0.
    cc_sharealike = 'cc:by-sa/4.0'