-
Notifications
You must be signed in to change notification settings - Fork 7
/
Copy pathLexTo.py
34 lines (27 loc) · 896 Bytes
/
LexTo.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
#!/usr/bin/python
import jpype
import os
class LexTo(object):
    """Python wrapper around the Java ``LongLexTo`` word segmenter.

    The Java class is loaded through JPype from the ``LongLexTo`` directory
    that sits next to this module, and it is constructed with the
    ``lexitron.txt`` file from that same directory.

    Attributes:
        lexto: JPype proxy of the ``LongLexTo`` Java instance.
        typeString: dict mapping the integer codes returned by the Java
            ``getTypeList()`` call to human-readable labels.
    """

    def __init__(self):
        """Start the JVM if needed and create the ``LongLexTo`` instance."""
        base_dir = os.path.abspath(os.path.dirname(__file__))

        # JPype can start the JVM only once per process; starting it again
        # raises.  Guarding the call lets several LexTo objects coexist.
        # NOTE(review): if some other code started the JVM first, the class
        # path set here is not applied and the JClass lookup below may fail.
        if not jpype.isJVMStarted():
            jpype.startJVM(
                jpype.getDefaultJVMPath(),
                '-ea',
                '-Djava.class.path=%s' % os.path.join(base_dir, 'LongLexTo')
            )

        LongLexTo = jpype.JClass("LongLexTo")
        self.lexto = LongLexTo(os.path.join(base_dir, 'lexitron.txt'))

        # Labels for the integer type codes reported by getTypeList().
        self.typeString = {
            0: "unknown",
            1: "known",
            2: "ambiguous",
            3: "English/Digits",
            4: "special",
        }

    def tokenize(self, line):
        """Split *line* into words and label each reported type code.

        Leading and trailing whitespace is stripped from *line* before it is
        handed to the Java segmenter.

        Args:
            line: the text to segment.

        Returns:
            A ``(wordList, typeList)`` tuple: ``wordList`` holds the slices
            of the stripped text between consecutive boundaries reported by
            the segmenter, and ``typeList`` holds the ``typeString`` label
            for every code returned by ``getTypeList()``.

        Raises:
            KeyError: if the segmenter reports a type code that is not a
                key of ``typeString``.
        """
        line = line.strip()
        self.lexto.wordInstance(line)
        type_list = [self.typeString[code] for code in self.lexto.getTypeList()]

        # first()/next() yield successive boundary positions; each word is
        # the slice between two neighbouring boundaries.
        # NOTE(review): presumably these are offsets into the stripped text
        # as Python indexes it -- confirm for non-BMP / byte-string input.
        word_list = []
        begin = self.lexto.first()
        while self.lexto.hasNext():
            end = self.lexto.next()
            word_list.append(line[begin:end])
            begin = end
        return word_list, type_list