forked from ArneBinder/pytorch-ie
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathconll2003.py
53 lines (37 loc) · 1.6 KB
/
conll2003.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
from dataclasses import dataclass
import datasets
import pytorch_ie.data.builder
from pytorch_ie.annotations import LabeledSpan
from pytorch_ie.core import AnnotationList, annotation_field
from pytorch_ie.documents import TextDocument
from pytorch_ie.utils.span import tokens_and_tags_to_text_and_labeled_spans
class CoNLL2003Config(datasets.BuilderConfig):
"""BuilderConfig for CoNLL2003"""
def __init__(self, **kwargs):
"""BuilderConfig for CoNLL2003.
Args:
**kwargs: keyword arguments forwarded to super.
"""
super().__init__(**kwargs)
@dataclass
class CoNLL2003Document(TextDocument):
entities: AnnotationList[LabeledSpan] = annotation_field(target="text")
class Conll2003(pytorch_ie.data.builder.GeneratorBasedBuilder):
DOCUMENT_TYPE = CoNLL2003Document
BASE_DATASET_PATH = "conll2003"
BUILDER_CONFIGS = [
CoNLL2003Config(
name="conll2003", version=datasets.Version("1.0.0"), description="CoNLL2003 dataset"
),
]
def _generate_document_kwargs(self, dataset):
return {"int_to_str": dataset.features["ner_tags"].feature.int2str}
def _generate_document(self, example, int_to_str):
doc_id = example["id"]
tokens = example["tokens"]
ner_tags = [int_to_str(tag) for tag in example["ner_tags"]]
text, ner_spans = tokens_and_tags_to_text_and_labeled_spans(tokens=tokens, tags=ner_tags)
document = CoNLL2003Document(text=text, id=doc_id)
for span in sorted(ner_spans, key=lambda span: span.start):
document.entities.append(span)
return document