diff --git a/docs/regressions-disk12.md b/docs/regressions-disk12.md index 419e9d01dc..6fa31049da 100644 --- a/docs/regressions-disk12.md +++ b/docs/regressions-disk12.md @@ -148,13 +148,13 @@ With the above commands, you should be able to replicate the following results: MAP | BM25 | +RM3 | +Ax | QL | +RM3 | +Ax | :---------------------------------------|-----------|-----------|-----------|-----------|-----------|-----------| -[TREC-1 Ad Hoc Topics 51-100](../src/main/resources/topics-and-qrels/topics.adhoc.51-100.txt)| 0.2273 | 0.2634 | 0.2640 | 0.2189 | 0.2435 | 0.2501 | -[TREC-2 Ad Hoc Topics 101-150](../src/main/resources/topics-and-qrels/topics.adhoc.101-150.txt)| 0.2010 | 0.2587 | 0.2722 | 0.2015 | 0.2442 | 0.2593 | -[TREC-3 Ad Hoc Topics 151-200](../src/main/resources/topics-and-qrels/topics.adhoc.151-200.txt)| 0.2580 | 0.3390 | 0.3318 | 0.2518 | 0.3042 | 0.3103 | +[TREC-1 Ad Hoc Topics 51-100](../src/main/resources/topics-and-qrels/topics.adhoc.51-100.txt)| 0.2277 | 0.2628 | 0.2648 | 0.2188 | 0.2465 | 0.2502 | +[TREC-2 Ad Hoc Topics 101-150](../src/main/resources/topics-and-qrels/topics.adhoc.101-150.txt)| 0.2003 | 0.2578 | 0.2698 | 0.2010 | 0.2429 | 0.2596 | +[TREC-3 Ad Hoc Topics 151-200](../src/main/resources/topics-and-qrels/topics.adhoc.151-200.txt)| 0.2634 | 0.3345 | 0.3407 | 0.2580 | 0.3037 | 0.3129 | P30 | BM25 | +RM3 | +Ax | QL | +RM3 | +Ax | :---------------------------------------|-----------|-----------|-----------|-----------|-----------|-----------| -[TREC-1 Ad Hoc Topics 51-100](../src/main/resources/topics-and-qrels/topics.adhoc.51-100.txt)| 0.4533 | 0.4800 | 0.5067 | 0.4520 | 0.4627 | 0.4953 | -[TREC-2 Ad Hoc Topics 101-150](../src/main/resources/topics-and-qrels/topics.adhoc.101-150.txt)| 0.4280 | 0.4593 | 0.4753 | 0.4207 | 0.4420 | 0.4740 | -[TREC-3 Ad Hoc Topics 151-200](../src/main/resources/topics-and-qrels/topics.adhoc.151-200.txt)| 0.4740 | 0.5273 | 0.5100 | 0.4580 | 0.4913 | 0.5167 | +[TREC-1 Ad Hoc Topics 
51-100](../src/main/resources/topics-and-qrels/topics.adhoc.51-100.txt)| 0.4540 | 0.4860 | 0.5127 | 0.4553 | 0.4680 | 0.4947 | +[TREC-2 Ad Hoc Topics 101-150](../src/main/resources/topics-and-qrels/topics.adhoc.101-150.txt)| 0.4253 | 0.4580 | 0.4720 | 0.4193 | 0.4400 | 0.4760 | +[TREC-3 Ad Hoc Topics 151-200](../src/main/resources/topics-and-qrels/topics.adhoc.151-200.txt)| 0.4860 | 0.5260 | 0.5273 | 0.4753 | 0.4967 | 0.5187 | diff --git a/src/main/java/io/anserini/search/topicreader/TrecTopicReader.java b/src/main/java/io/anserini/search/topicreader/TrecTopicReader.java index 9bedcf371e..9d2350e805 100644 --- a/src/main/java/io/anserini/search/topicreader/TrecTopicReader.java +++ b/src/main/java/io/anserini/search/topicreader/TrecTopicReader.java @@ -39,7 +39,7 @@ public TrecTopicReader(Path topicFile) { protected StringBuilder read(BufferedReader reader, String prefix, StringBuilder sb, boolean collectMatchLine, boolean collectAll) throws IOException { sb = (sb == null ? new StringBuilder() : sb); - String sep = ""; + String sep = (sb.length() == 0 ? "" : newline); while (true) { String line = reader.readLine(); if (line == null) { @@ -86,24 +86,14 @@ public SortedMap<Integer, Map<String, String>> read(BufferedReader bRdr) throws // title sb = read(bRdr, "<title>", null, true, false); + sb = read(bRdr, "<desc>", sb, false, true); k = sb.indexOf(":"); if (k == -1) { - k = sb.indexOf(">"); - } - String title = sb.substring(k + 1).trim(); - - //malformed titles, read again - if (title.isEmpty()) { - sb = read(bRdr, "<title>", null, true, false); - k = sb.indexOf(":"); - if (k == -1) { k = sb.indexOf(">"); - } - title = sb.substring(k + 1).trim(); } + String title = sb.substring(k + 1).replaceAll("\\s+", " ").trim(); // Read the description... - read(bRdr, "<desc>", null, false, false); sb.setLength(0); String line = null; while ((line = bRdr.readLine()) != null) { @@ -133,7 +123,7 @@ public SortedMap<Integer, Map<String, String>> read(BufferedReader bRdr) throws // we got a topic! 
// this is for core track 2018 fix id = id.replaceAll("</num>", "").trim(); - title = title.replaceAll("</title>", ""); + title = title.replaceAll("</title>", "").trim(); description = description.replaceAll("</desc>", ""); narrative = narrative.replaceAll("</narr>", ""); // this is for core track 2018 fix diff --git a/src/main/resources/regression/disk12.yaml b/src/main/resources/regression/disk12.yaml index 146a80218e..70c37a0a93 100644 --- a/src/main/resources/regression/disk12.yaml +++ b/src/main/resources/regression/disk12.yaml @@ -58,13 +58,13 @@ models: - -bm25 results: map: - - 0.2273 - - 0.2010 - - 0.2580 + - 0.2277 + - 0.2003 + - 0.2634 p30: - - 0.4533 - - 0.4280 - - 0.4740 + - 0.4540 + - 0.4253 + - 0.4860 - name: bm25+rm3 display: +RM3 params: @@ -72,13 +72,13 @@ models: - -rm3 results: map: - - 0.2634 - - 0.2587 - - 0.3390 + - 0.2628 + - 0.2578 + - 0.3345 p30: - - 0.4800 - - 0.4593 - - 0.5273 + - 0.4860 + - 0.4580 + - 0.5260 - name: bm25+ax display: +Ax params: @@ -88,26 +88,26 @@ models: - -rerankCutoff 20 results: map: - - 0.2640 - - 0.2722 - - 0.3318 + - 0.2648 + - 0.2698 + - 0.3407 p30: - - 0.5067 - - 0.4753 - - 0.5100 + - 0.5127 + - 0.4720 + - 0.5273 - name: ql display: QL params: - -qld results: map: - - 0.2189 - - 0.2015 - - 0.2518 + - 0.2188 + - 0.2010 + - 0.2580 p30: - - 0.4520 - - 0.4207 - - 0.4580 + - 0.4553 + - 0.4193 + - 0.4753 - name: ql+rm3 display: +RM3 params: @@ -115,13 +115,13 @@ models: - -rm3 results: map: - - 0.2435 - - 0.2442 - - 0.3042 + - 0.2465 + - 0.2429 + - 0.3037 p30: - - 0.4627 - - 0.4420 - - 0.4913 + - 0.4680 + - 0.4400 + - 0.4967 - name: ql+ax display: +Ax params: @@ -131,10 +131,10 @@ models: - -rerankCutoff 20 results: map: - - 0.2501 - - 0.2593 - - 0.3103 + - 0.2502 + - 0.2596 + - 0.3129 p30: - - 0.4953 - - 0.4740 - - 0.5167 + - 0.4947 + - 0.4760 + - 0.5187 diff --git a/src/test/java/io/anserini/search/topicreader/TopicReaderTest.java b/src/test/java/io/anserini/search/topicreader/TopicReaderTest.java index 193c4d9c96..a894faaa00 100755 --- 
a/src/test/java/io/anserini/search/topicreader/TopicReaderTest.java +++ b/src/test/java/io/anserini/search/topicreader/TopicReaderTest.java @@ -81,7 +81,7 @@ public void testNewswireTopics() { assertEquals(151, (int) topics.firstKey()); assertEquals("Coping with overcrowded prisons", topics.get(topics.firstKey()).get("title")); assertEquals(200, (int) topics.lastKey()); - assertEquals("Impact of foreign textile imports on U.S.", topics.get(topics.lastKey()).get("title")); + assertEquals("Impact of foreign textile imports on U.S. textile industry", topics.get(topics.lastKey()).get("title")); topics = TopicReader.getTopics(Topics.ROBUST04); assertNotNull(topics); @@ -115,6 +115,35 @@ public void testNewswireTopics() { assertEquals(825, (int) topics.lastKey()); assertEquals("ethanol and food prices", topics.get(topics.lastKey()).get("title")); } + + @Test + public void testTrecTitleParsing() { + SortedMap<Integer, Map<String, String>> topics; + + topics = TopicReader.getTopics(Topics.TREC1_ADHOC); + assertNotNull(topics); + assertEquals(50, topics.size()); + // + // Single line titles. + assertEquals("Airbus Subsidies", topics.get(51).get("title")); + assertEquals("Controlling the Transfer of High Technology", topics.get(100).get("title")); + // + // Multi-line titles. 
+ assertEquals("Financial crunch for televangelists in the wake of the PTL scandal", topics.get(81).get("title")); + assertEquals("Criminal Actions Against Officers of Failed Financial Institutions", topics.get(87).get("title")); + assertEquals("What Backing Does the National Rifle Association Have?", topics.get(93).get("title")); + + topics = TopicReader.getTopics(Topics.TREC2_ADHOC); + assertNotNull(topics); + assertEquals(50, topics.size()); + + assertEquals("Industrial Espionage", topics.get(149).get("title")); + + assertEquals("Laser Research Applicable to the U.S.'s Strategic Defense Initiative", topics.get(102).get("title")); + assertEquals("Impact of Government Regulated Grain Farming on International Relations", topics.get(142).get("title")); + } + + @Test public void testNewswireTopics_TopicIdsAsStrings() { @@ -136,7 +165,7 @@ public void testNewswireTopics_TopicIdsAsStrings() { assertNotNull(topics); assertEquals(50, topics.size()); assertEquals("Coping with overcrowded prisons", topics.get("151").get("title")); - assertEquals("Impact of foreign textile imports on U.S.", topics.get("200").get("title")); + assertEquals("Impact of foreign textile imports on U.S. textile industry", topics.get("200").get("title")); topics = TopicReader.getTopicsWithStringIds(Topics.ROBUST04); assertNotNull(topics);