-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathconvertBNC.java
149 lines (139 loc) · 5.64 KB
/
convertBNC.java
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
// Convert a BNC document loaded as XML to what we want it to look like.
// This does the following main steps:
// * convert information from the header to document features whose names start with "bnc."
// * remove the document header
// * create Token annotations in the Key set based on the w and c annotations from the original XML
// * create Sentence annotations in the Key set based on the s annotations from the original XML
import gate.*;
import gate.corpora.*;
import java.util.regex.*;
import java.util.*;
// Precompiled once and reused for every w annotation; never reassigned, hence private final.
// Matches a run of whitespace at the end of the token text.
private final Pattern whiteSpaceEnd = Pattern.compile("\\s+$");
// Matches a run of whitespace at the beginning of the token text.
private final Pattern whiteSpaceBeg = Pattern.compile("^\\s+");
/**
 * Converts the current BNC document in place: copies header information into
 * document features, deletes the header text, and creates Sentence, Token and
 * SpaceToken annotations in the "Key" set from the "Original markups" set.
 *
 * @throws RuntimeException if the document has neither a wtext nor an stext annotation
 */
@Override
public void execute() {
  AnnotationSet oms = doc.getAnnotations("Original markups");
  copyHeaderToFeatures(oms);
  removeHeader(oms);
  AnnotationSet keySet = doc.getAnnotations("Key");
  addSentences(oms, keySet);
  addWordTokens(oms, keySet);
  addPunctuationTokens(oms, keySet);
}

// Store the header fields, ids, change log and tag usage counts as "bnc.*" document features.
private void copyHeaderToFeatures(AnnotationSet oms) {
  FeatureMap dfm = doc.getFeatures();
  // Plain header fields: the cleaned text of each becomes feature "bnc.<field>".
  // "keywords" is kept last to preserve the original processing order.
  String[] headerFields = {
    "availability", "bibl", "classCode", "date", "distributor", "edition",
    "extent", "imprint", "profileDesc", "pubPlace", "publicationStmt",
    "publisher", "respStmt", "sourceDesc", "titleStmt", "keywords"
  };
  for (String field : headerFields) {
    store4field(doc, oms, field);
  }
  dfm.put("bnc.imprint.n", getFeature(doc, oms, "imprint", "n"));
  // getOnlyAnn is used here, so exactly one bncDoc and one catRef annotation are expected.
  dfm.put("bnc.id", gate.Utils.getOnlyAnn(oms.get("bncDoc")).getFeatures().get("xml:id"));
  dfm.put("bnc.catRef.targets", gate.Utils.getOnlyAnn(oms.get("catRef")).getFeatures().get("targets"));

  // One pair of features per change entry, keyed by the entry's date.
  // NOTE(review): two change entries with the same date overwrite each other - confirm this is acceptable.
  for (Annotation ann : oms.get("change")) {
    String text = gate.Utils.cleanStringFor(doc, ann);
    String date = (String) ann.getFeatures().get("date");
    String who = (String) ann.getFeatures().get("who");
    dfm.put("bnc.change.what." + date, text);
    dfm.put("bnc.change.who." + date, who);
  }

  for (Annotation ann : oms.get("tagUsage")) {
    String gi = (String) ann.getFeatures().get("gi");
    String occurs = (String) ann.getFeatures().get("occurs");
    try {
      dfm.put("bnc.tagUsage." + gi, Integer.parseInt(occurs));
    } catch (NumberFormatException ex) {
      // A missing or malformed count must not abort the whole conversion;
      // report it like the other header problems and carry on.
      System.err.println(doc.getName() + ": invalid occurs value '" + occurs + "' for tagUsage " + gi);
    }
  }
}

// Delete the document text from offset 0 up to the start of the wtext (or, failing that, stext) annotation.
private void removeHeader(AnnotationSet oms) {
  AnnotationSet spanAnns = oms.get("wtext");
  if (spanAnns.size() == 0) {
    spanAnns = oms.get("stext");
  }
  if (spanAnns.size() == 0) {
    throw new RuntimeException("Neither wtext nor stext annotation in document " + doc.getName());
  }
  Annotation spanAnn = gate.Utils.getOnlyAnn(spanAnns);
  Long startDoc = gate.Utils.start(spanAnn);
  try {
    // NOTE(review): the end offset is start+1, so the first character of the text
    // element is deleted as well - presumably a leading line break; confirm.
    doc.edit(0L, startDoc + 1, new DocumentContentImpl(""));
  } catch (Exception ex) {
    // Best effort: report the failure and continue with the unedited document.
    System.err.println(doc.getName() + ": could not edit");
    ex.printStackTrace(System.err);
  }
}

// Create a Sentence annotation in keySet for every s annotation, carrying over its "n" feature.
private void addSentences(AnnotationSet oms, AnnotationSet keySet) {
  for (Annotation ann : oms.get("s")) {
    gate.Utils.addAnn(keySet, ann, "Sentence", gate.Utils.featureMap("n", ann.getFeatures().get("n")));
  }
}

// Convert the w annotations to Token annotations; leading/trailing whitespace is split off into SpaceToken annotations.
private void addWordTokens(AnnotationSet oms, AnnotationSet keySet) {
  for (Annotation ann : oms.get("w")) {
    String text = gate.Utils.stringFor(doc, ann);
    String wsAfter = "";
    String wsBefore = "";
    Matcher m = whiteSpaceEnd.matcher(text);
    if (m.find()) {
      wsAfter = m.group(0);
    }
    m = whiteSpaceBeg.matcher(text);
    if (m.find()) {
      wsBefore = m.group(0);
    }
    Long start = gate.Utils.start(ann);
    Long end = gate.Utils.end(ann);
    // add the actual token, but make sure we did not find something that is
    // entirely white space! Oddly this exists a few times in the corpus!
    if (wsBefore.length() == text.length()) {
      gate.Utils.addAnn(keySet, start, end, "SpaceToken", gate.Utils.featureMap());
    } else {
      FeatureMap fmAnn = ann.getFeatures();
      // The original "hw" (headword) feature is stored as "lemma".
      gate.Utils.addAnn(keySet, start + wsBefore.length(), end - wsAfter.length(), "Token",
          gate.Utils.featureMap("c5", fmAnn.get("c5"), "pos", fmAnn.get("pos"), "lemma", fmAnn.get("hw"), "kind", "word"));
      if (!wsAfter.isEmpty()) {
        gate.Utils.addAnn(keySet, end - wsAfter.length(), end, "SpaceToken", gate.Utils.featureMap());
      }
      if (!wsBefore.isEmpty()) {
        gate.Utils.addAnn(keySet, start, start + wsBefore.length(), "SpaceToken", gate.Utils.featureMap());
      }
    }
  }
}

// Convert the c annotations to punctuation Token annotations whose lemma is the cleaned text.
private void addPunctuationTokens(AnnotationSet oms, AnnotationSet keySet) {
  for (Annotation ann : oms.get("c")) {
    String text = gate.Utils.cleanStringFor(doc, ann);
    // NOTE(review): unlike the w tokens, the span of the c annotation is reused
    // without trimming whitespace - confirm this is intended.
    gate.Utils.addAnn(keySet, ann, "Token", gate.Utils.featureMap("kind", "punctuation", "lemma", text));
  }
}
// Store the cleaned text of the first annotation of type "name" (in document order)
// as document feature "bnc.<name>"; report on stderr if there is none or more than one.
private void store4field(Document doc, AnnotationSet set, String name) {
  List<Annotation> matches = set.get(name).inDocumentOrder();
  if (matches.isEmpty()) {
    System.err.println(doc.getName()+": no field "+name);
    return;
  }
  if (matches.size() != 1) {
    System.err.println(doc.getName()+": not exactly one annotation for "+name+", taking first");
  }
  String cleaned = gate.Utils.cleanStringFor(doc, matches.get(0));
  doc.getFeatures().put("bnc."+name, cleaned);
}
// Look up feature "fname" on the first annotation of the given type (in document order).
// Returns null when no such annotation exists; both the missing case and the
// more-than-one case are reported on stderr.
private String getFeature(Document doc, AnnotationSet set, String type, String fname) {
  List<Annotation> candidates = set.get(type).inDocumentOrder();
  if (candidates.isEmpty()) {
    System.err.println(doc.getName()+": no field "+type);
    return null;
  }
  if (candidates.size() != 1) {
    System.err.println(doc.getName()+": not exactly one annotation for "+type+", taking first");
  }
  Annotation first = candidates.get(0);
  return (String) first.getFeatures().get(fname);
}