Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Ensure that multi-line TREC topic titles are fully parsed. Update the… #1482

Merged
merged 3 commits into from
Feb 24, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 6 additions & 6 deletions docs/regressions-disk12.md
Original file line number Diff line number Diff line change
Expand Up @@ -148,13 +148,13 @@ With the above commands, you should be able to replicate the following results:

MAP | BM25 | +RM3 | +Ax | QL | +RM3 | +Ax |
:---------------------------------------|-----------|-----------|-----------|-----------|-----------|-----------|
[TREC-1 Ad Hoc Topics 51-100](../src/main/resources/topics-and-qrels/topics.adhoc.51-100.txt)| 0.2273 | 0.2634 | 0.2640 | 0.2189 | 0.2435 | 0.2501 |
[TREC-2 Ad Hoc Topics 101-150](../src/main/resources/topics-and-qrels/topics.adhoc.101-150.txt)| 0.2010 | 0.2587 | 0.2722 | 0.2015 | 0.2442 | 0.2593 |
[TREC-3 Ad Hoc Topics 151-200](../src/main/resources/topics-and-qrels/topics.adhoc.151-200.txt)| 0.2580 | 0.3390 | 0.3318 | 0.2518 | 0.3042 | 0.3103 |
[TREC-1 Ad Hoc Topics 51-100](../src/main/resources/topics-and-qrels/topics.adhoc.51-100.txt)| 0.2277 | 0.2628 | 0.2648 | 0.2188 | 0.2465 | 0.2502 |
[TREC-2 Ad Hoc Topics 101-150](../src/main/resources/topics-and-qrels/topics.adhoc.101-150.txt)| 0.2003 | 0.2578 | 0.2698 | 0.2010 | 0.2429 | 0.2596 |
[TREC-3 Ad Hoc Topics 151-200](../src/main/resources/topics-and-qrels/topics.adhoc.151-200.txt)| 0.2634 | 0.3345 | 0.3407 | 0.2580 | 0.3037 | 0.3129 |


P30 | BM25 | +RM3 | +Ax | QL | +RM3 | +Ax |
:---------------------------------------|-----------|-----------|-----------|-----------|-----------|-----------|
[TREC-1 Ad Hoc Topics 51-100](../src/main/resources/topics-and-qrels/topics.adhoc.51-100.txt)| 0.4533 | 0.4800 | 0.5067 | 0.4520 | 0.4627 | 0.4953 |
[TREC-2 Ad Hoc Topics 101-150](../src/main/resources/topics-and-qrels/topics.adhoc.101-150.txt)| 0.4280 | 0.4593 | 0.4753 | 0.4207 | 0.4420 | 0.4740 |
[TREC-3 Ad Hoc Topics 151-200](../src/main/resources/topics-and-qrels/topics.adhoc.151-200.txt)| 0.4740 | 0.5273 | 0.5100 | 0.4580 | 0.4913 | 0.5167 |
[TREC-1 Ad Hoc Topics 51-100](../src/main/resources/topics-and-qrels/topics.adhoc.51-100.txt)| 0.4540 | 0.4860 | 0.5127 | 0.4553 | 0.4680 | 0.4947 |
[TREC-2 Ad Hoc Topics 101-150](../src/main/resources/topics-and-qrels/topics.adhoc.101-150.txt)| 0.4253 | 0.4580 | 0.4720 | 0.4193 | 0.4400 | 0.4760 |
[TREC-3 Ad Hoc Topics 151-200](../src/main/resources/topics-and-qrels/topics.adhoc.151-200.txt)| 0.4860 | 0.5260 | 0.5273 | 0.4753 | 0.4967 | 0.5187 |
18 changes: 4 additions & 14 deletions src/main/java/io/anserini/search/topicreader/TrecTopicReader.java
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ public TrecTopicReader(Path topicFile) {
protected StringBuilder read(BufferedReader reader, String prefix, StringBuilder sb,
boolean collectMatchLine, boolean collectAll) throws IOException {
sb = (sb == null ? new StringBuilder() : sb);
String sep = "";
String sep = (sb == null ? "" : newline);
while (true) {
String line = reader.readLine();
if (line == null) {
Expand Down Expand Up @@ -86,24 +86,14 @@ public SortedMap<Integer, Map<String, String>> read(BufferedReader bRdr) throws

// title
sb = read(bRdr, "<title>", null, true, false);
sb = read(bRdr, "<desc>", sb, false, true);
k = sb.indexOf(":");
if (k == -1) {
k = sb.indexOf(">");
}
String title = sb.substring(k + 1).trim();

//malformed titles, read again
if (title.isEmpty()) {
sb = read(bRdr, "", null, true, false);
k = sb.indexOf(":");
if (k == -1) {
k = sb.indexOf(">");
}
title = sb.substring(k + 1).trim();
}
String title = sb.substring(k + 1).replaceAll("\\s+", " ").trim();

// Read the description...
read(bRdr, "<desc>", null, false, false);
sb.setLength(0);
String line = null;
while ((line = bRdr.readLine()) != null) {
Expand Down Expand Up @@ -133,7 +123,7 @@ public SortedMap<Integer, Map<String, String>> read(BufferedReader bRdr) throws
// we got a topic!
// this is for core track 2018 fix
id = id.replaceAll("</num>", "").trim();
title = title.replaceAll("</title>", "");
title = title.replaceAll("</title>", "").trim();
description = description.replaceAll("</desc>", "");
narrative = narrative.replaceAll("</narr>", "");
// this is for core track 2018 fix
Expand Down
72 changes: 36 additions & 36 deletions src/main/resources/regression/disk12.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -58,27 +58,27 @@ models:
- -bm25
results:
map:
- 0.2273
- 0.2010
- 0.2580
- 0.2277
- 0.2003
- 0.2634
p30:
- 0.4533
- 0.4280
- 0.4740
- 0.4540
- 0.4253
- 0.4860
- name: bm25+rm3
display: +RM3
params:
- -bm25
- -rm3
results:
map:
- 0.2634
- 0.2587
- 0.3390
- 0.2628
- 0.2578
- 0.3345
p30:
- 0.4800
- 0.4593
- 0.5273
- 0.4860
- 0.4580
- 0.5260
- name: bm25+ax
display: +Ax
params:
Expand All @@ -88,40 +88,40 @@ models:
- -rerankCutoff 20
results:
map:
- 0.2640
- 0.2722
- 0.3318
- 0.2648
- 0.2698
- 0.3407
p30:
- 0.5067
- 0.4753
- 0.5100
- 0.5127
- 0.4720
- 0.5273
- name: ql
display: QL
params:
- -qld
results:
map:
- 0.2189
- 0.2015
- 0.2518
- 0.2188
- 0.2010
- 0.2580
p30:
- 0.4520
- 0.4207
- 0.4580
- 0.4553
- 0.4193
- 0.4753
- name: ql+rm3
display: +RM3
params:
- -qld
- -rm3
results:
map:
- 0.2435
- 0.2442
- 0.3042
- 0.2465
- 0.2429
- 0.3037
p30:
- 0.4627
- 0.4420
- 0.4913
- 0.4680
- 0.4400
- 0.4967
- name: ql+ax
display: +Ax
params:
Expand All @@ -131,10 +131,10 @@ models:
- -rerankCutoff 20
results:
map:
- 0.2501
- 0.2593
- 0.3103
- 0.2502
- 0.2596
- 0.3129
p30:
- 0.4953
- 0.4740
- 0.5167
- 0.4947
- 0.4760
- 0.5187
33 changes: 31 additions & 2 deletions src/test/java/io/anserini/search/topicreader/TopicReaderTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ public void testNewswireTopics() {
assertEquals(151, (int) topics.firstKey());
assertEquals("Coping with overcrowded prisons", topics.get(topics.firstKey()).get("title"));
assertEquals(200, (int) topics.lastKey());
assertEquals("Impact of foreign textile imports on U.S.", topics.get(topics.lastKey()).get("title"));
assertEquals("Impact of foreign textile imports on U.S. textile industry", topics.get(topics.lastKey()).get("title"));

topics = TopicReader.getTopics(Topics.ROBUST04);
assertNotNull(topics);
Expand Down Expand Up @@ -115,6 +115,35 @@ public void testNewswireTopics() {
assertEquals(825, (int) topics.lastKey());
assertEquals("ethanol and food prices", topics.get(topics.lastKey()).get("title"));
}

/**
 * Checks that the titles of the TREC-1 and TREC-2 ad hoc topics are returned in full,
 * covering both single-line titles and titles that span several lines in the topic file.
 */
@Test
public void testTrecTitleParsing() {
  // TREC-1 ad hoc topics.
  SortedMap<Integer, Map<String, String>> trec1Topics = TopicReader.getTopics(Topics.TREC1_ADHOC);
  assertNotNull(trec1Topics);
  assertEquals(50, trec1Topics.size());

  // Titles that occupy a single line.
  assertEquals("Airbus Subsidies", trec1Topics.get(51).get("title"));
  assertEquals("Controlling the Transfer of High Technology", trec1Topics.get(100).get("title"));

  // Titles that are split across multiple lines.
  assertEquals("Financial crunch for televangelists in the wake of the PTL scandal",
      trec1Topics.get(81).get("title"));
  assertEquals("Criminal Actions Against Officers of Failed Financial Institutions",
      trec1Topics.get(87).get("title"));
  assertEquals("What Backing Does the National Rifle Association Have?",
      trec1Topics.get(93).get("title"));

  // TREC-2 ad hoc topics.
  SortedMap<Integer, Map<String, String>> trec2Topics = TopicReader.getTopics(Topics.TREC2_ADHOC);
  assertNotNull(trec2Topics);
  assertEquals(50, trec2Topics.size());

  assertEquals("Industrial Espionage", trec2Topics.get(149).get("title"));

  assertEquals("Laser Research Applicable to the U.S.'s Strategic Defense Initiative",
      trec2Topics.get(102).get("title"));
  assertEquals("Impact of Government Regulated Grain Farming on International Relations",
      trec2Topics.get(142).get("title"));
}



@Test
public void testNewswireTopics_TopicIdsAsStrings() {
Expand All @@ -136,7 +165,7 @@ public void testNewswireTopics_TopicIdsAsStrings() {
assertNotNull(topics);
assertEquals(50, topics.size());
assertEquals("Coping with overcrowded prisons", topics.get("151").get("title"));
assertEquals("Impact of foreign textile imports on U.S.", topics.get("200").get("title"));
assertEquals("Impact of foreign textile imports on U.S. textile industry", topics.get("200").get("title"));

topics = TopicReader.getTopicsWithStringIds(Topics.ROBUST04);
assertNotNull(topics);
Expand Down