# demo.py
# Demo script showing how to use TTL (texttaglib) with a SQLite-backed corpus.
import nltk
from texttaglib import ttl
from texttaglib.sqlite import TTLSQLite
# ------------------------------------------------------------------------------
# Helper functions
# ------------------------------------------------------------------------------
def dump_sent(sent):
    """Print a sentence to the console.

    Writes the raw text, the token list and the concept list, then each
    concept on its own line, and finally whatever ``sent.to_json()`` returns.

    Args:
        sent: An object exposing ``text``, ``tokens``, an iterable
            ``concepts`` and a ``to_json()`` method (in this script, a
            ``ttl.Sentence``).
    """
    print("Raw: {}".format(sent.text))
    print("Tokens: {}".format(sent.tokens))
    print("Concepts: {}".format(sent.concepts))
    # One line per concept, in addition to the summary line above.
    for c in sent.concepts:
        print(" > {}".format(c))
    # The JSON form is printed once per sentence, after the concept listing.
    print(sent.to_json())
# ------------------------------------------------------------------------------
# Demo script
# ------------------------------------------------------------------------------
# Open the TTL SQLite database used by this demo.
db = TTLSQLite('data/demo.db')
# Create a sample corpus (if needed).
encor = db.ensure_corpus(name='eng', title='English sentences')
# Create a sample document in corpus 'eng' (if needed).
endoc = db.ensure_doc(name='eng1', title='English sample sentences #1', lang='eng', corpus=encor)
# Fetch the document by name; its ID is used to attach sentences below.
doc = db.doc.select_single('name=?', ('eng1',))

# If the document has no sentences yet, create a sample sentence inside it.
if not db.sent.select('docID=?', (doc.ID,)):
    sent = ttl.Sentence("I am a short sentence.")
    # Tokenize the sentence with the NLTK tokenizer.
    tokens = nltk.word_tokenize(sent.text)
    sent.import_tokens(tokens)
    # Add concepts. NOTE(review): the token numbers look like 0-based
    # positions in the imported token list ('short' -> 3, 'sentence' -> 4)
    # -- confirm against the TTL API.
    sent.new_concept('01436003-a', 'short', tokens=[3])
    sent.new_concept('06285090-n', 'sentence', tokens=[4])
    # Attach a comment to the sentence.
    sent.comment = 'This is just an example to demonstrate how to use TTL.'
    # Print it out.
    dump_sent(sent)
    # Save it to document 'eng1'.
    sent.docID = doc.ID
    db.save_sent(sent)

# Create a second sentence containing a multi-word expression (MWE).
# These names stay at module level because the guard below reads them.
calico_text = 'I like calico cat.'
calico_cat_synset = '02123242-n'
# Only insert the sentence when no sentence with this exact text is stored.
if not db.sent.select('text = ?', (calico_text,)):
    sent = ttl.Sentence(calico_text)
    # Sentence-level tag carrying Japanese text, tagged with type 'jpn'.
    sent.new_tag('三毛猫が好きです。', tagtype='jpn')
    sent.import_tokens(nltk.word_tokenize(sent.text))
    # Create concepts.
    sent.new_concept('01777210-v', 'like', tokens=[1])
    sent.new_concept(calico_cat_synset, 'calico cat', tokens=[2, 3])  # MWE -> tokens=[2,3]
    # Mark both tokens of the multi-word expression with an 'MWE' tag.
    sent[2].new_tag('+', tagtype='MWE')
    sent[3].new_tag('+', tagtype='MWE')
    dump_sent(sent)
    # Save it to the database under document 'eng1'.
    sent.docID = doc.ID
    db.save_sent(sent)

print("Done!")