This repository has been archived by the owner on Jun 14, 2022. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtrain.py
71 lines (64 loc) · 1.41 KB
/
train.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
import spacy
from spacy.tokens import DocBin
import pandas as pd
import json
import ast
# Entity labels for the address tagger. The conversion script below reads
# each name as a column of the training CSV and reuses the same name as the
# entity label of the resulting span, in the order listed here.
labels = [
    "Country",
    "RegionType", "Region",
    "CountyType", "County",
    "Included",
    "LocalityType", "Locality",
    "StreetType", "Street",
    "HousingType", "Housing",
    "HostelType", "Hostel",
    "HouseNumberType", "HouseNumber", "HouseNumberAdditionally",
    "SectionType", "Section",
    "ApartmentType", "Apartment",
    "RoomType", "Room",
    "Sector",
    "FloorType", "Floor",
    "PostCode",
    "Manually",
    "NotAddress",
    "Comment",
    "AdditionalData",
]
# Convert the annotated CSV into spaCy's binary DocBin training format.
#
# Each CSV row holds the raw text in the 'Raw' column and, for every name in
# `labels`, either an empty cell or a Python-literal list of
# {'start': ..., 'end': ...} character offsets into that text.

# Blank Ukrainian pipeline: used here only to tokenize the raw text.
nlp = spacy.blank('uk')

data = pd.read_csv('training/pretrain.csv', sep=";")
db = DocBin()
# Use a dedicated row variable so the DataFrame bound to `data` is not
# overwritten while it is being iterated.
for _, row in data.iterrows():
    # NOTE(review): offsets are applied to the lower-cased text; this assumes
    # lower-casing never changes the string length - confirm for the dataset.
    doc = nlp(row['Raw'].lower())
    ents = []
    for item in labels:
        cell = row[item]
        if pd.isna(cell):
            # No annotation of this type for the current row.
            continue
        positions = ast.literal_eval(cell)
        if len(positions) > 1:
            # Several fragments for one label are merged into a single span
            # running from the first start to the last end.
            positions = [{'start': positions[0]['start'], 'end': positions[-1]['end']}]
        for position in positions:
            start, end = int(position['start']), int(position['end'])
            span = doc.char_span(start, end, label=item)
            if span is None:
                # char_span returns None when the offsets do not fall on token
                # boundaries; assigning None to doc.ents would raise and abort
                # the whole conversion, so skip the span and report it.
                print(f"Skipping misaligned {item} span [{start}:{end}] in {row['Raw']!r}")
                continue
            ents.append(span)
    print(ents)
    doc.ents = ents
    db.add(doc)

# Serialize the corpus.
# NOTE(review): the same documents are written to both files, so the "test"
# set is identical to the training set - confirm this is intended.
db.to_disk('training/train.spacy')
db.to_disk('training/test.spacy')