-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathveld_convert.yaml
71 lines (66 loc) · 2.31 KB
/
veld_convert.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
# VELD metadata for the conversion step. It describes the code, the volumes
# it reads from and writes to, and the environment variables that configure it.
x-veld:
  code:
    description: >-
      prepare data for spacy NER training, since spacy expects the entity
      annotation indices to be precisely at the beginning and end of the words,
      and also no overlapping entity annotations. Then it converts the data to
      spaCy docbin, and prepares it for training by splitting it into train,
      dev, eval subsets, and shuffling them randomly.
    topic:
      - "ETL"
      - "NLP"
      - "Machine Learning"
    input:
      # The file name inside the input volume is passed via `in_json_file`.
      - volume: /veld/input/
        file_type: "json"
        description: "name of the json file, containing NER gold data"
        environment_var: in_json_file
        content: "NER gold data"
    output:
      # No environment variable here: the docbin file names are fixed
      # (see the description below).
      - volume: /veld/output/docbin/
        description: >-
          path to folder where spacy docbin files will be stored with file names
          `train.spacy, dev.spacy, eval.spacy`
        file_type: "spaCy docbin"
        content: "NER gold data"
      # The log file name inside the log volume is passed via `out_log_file`.
      - volume: /veld/output/log/
        environment_var: out_log_file
        description: "log file of conversion"
        file_type: "txt"
        content: "log"
    config:
      # No default is declared for `model_base`.
      # NOTE(review): presumably it must always be set by the caller; confirm
      # against convert.py.
      - environment_var: model_base
        description: "spacy model to be used for conversion."
        var_type: "str"
      # The three percentage defaults below add up to 100 (80 + 10 + 10).
      - environment_var: percentage_train
        description: "percentage of data allocated to training set"
        var_type: "int"
        default: 80
      - environment_var: percentage_dev
        description: "percentage of data allocated to dev set"
        var_type: "int"
        default: 10
      - environment_var: percentage_eval
        description: "percentage of data allocated to eval set"
        var_type: "int"
        default: 10
      - environment_var: seed
        description: "seed for initial random shuffling of training data"
        var_type: "int"
        default: 42
# Compose service that runs the conversion script inside a locally built image.
services:
  veld_convert:
    # Build context is the directory containing this compose file.
    build: .
    command: python /veld/code/convert.py
    # Disabled debugging command, kept for reference:
    # command: bash -c 'ls /root/ && touch /root/bla && ls /root/'
    environment:
      # These three carry no value in this file.
      # NOTE(review): per the Compose spec a null value is resolved from the
      # invoking environment; confirm that is how callers supply them.
      in_json_file: null
      out_log_file: null
      model_base: null
      # Split percentages and shuffle seed; same values as the defaults
      # declared under x-veld.
      percentage_train: 80
      percentage_dev: 10
      percentage_eval: 10
      seed: 42
    volumes:
      # Script directory, mounted where `command` looks for convert.py.
      - './src/:/veld/code/'
      # NOTE(review): presumably a cache for downloaded base models; confirm
      # against convert.py.
      - './data/models_base_cache/:/tmp/models_base_cache/'
      - './data/training_data_json/:/veld/input/'
      - './data/docbin/:/veld/output/docbin/'
      # NOTE(review): this binds the whole project directory as the log
      # folder; confirm that is intended and not a dedicated log directory.
      - './:/veld/output/log/'