From f86e4e193eee806e935720c2ea8c549babb02c53 Mon Sep 17 00:00:00 2001 From: Jimmy Lin Date: Thu, 2 Sep 2021 14:39:06 -0400 Subject: [PATCH] jsoup upgrade to address security vulnerabilities (#1625) Ref #1624 Note that this impacts a bunch of regressions that use jsoup for document processing. All score changes are minor. --- README.md | 16 +-- docs/regressions-backgroundlinking19.md | 2 +- docs/regressions-backgroundlinking20.md | 4 +- docs/regressions-core18.md | 4 +- docs/regressions-cw09b.md | 24 ++-- docs/regressions-cw12.md | 16 +-- docs/regressions-cw12b13.md | 14 +-- docs/regressions-gov2.md | 12 +- docs/regressions-wt10g.md | 4 +- pom.xml | 2 +- .../regression/backgroundlinking18.yaml | 2 +- .../regression/backgroundlinking19.yaml | 8 +- .../regression/backgroundlinking20.yaml | 8 +- src/main/resources/regression/core18.yaml | 20 +-- src/main/resources/regression/cw09b.yaml | 118 +++++++++--------- src/main/resources/regression/cw12.yaml | 52 ++++---- src/main/resources/regression/cw12b13.yaml | 66 +++++----- src/main/resources/regression/disk12.yaml | 2 +- src/main/resources/regression/gov2.yaml | 56 ++++----- src/main/resources/regression/wt10g.yaml | 24 ++-- 20 files changed, 227 insertions(+), 227 deletions(-) diff --git a/README.md b/README.md index 759134c7e7..606a920e5e 100644 --- a/README.md +++ b/README.md @@ -52,14 +52,14 @@ For the most part, these runs are based on [_default_ parameter settings](https: + Regressions for [Tweets2011 (MB11 & MB12)](docs/regressions-mb11.md), [Tweets2013 (MB13 & MB14)](docs/regressions-mb13.md) + Regressions for Complex Answer Retrieval (CAR17): [[v1.5](docs/regressions-car17v1.5.md)] [[v2.0](docs/regressions-car17v2.0.md)] [[v2.0 with doc2query](docs/regressions-car17v2.0-doc2query.md)] + Regressions for MS MARCO Passage Ranking: [[base](docs/regressions-msmarco-passage.md)] [[doc2query](docs/regressions-msmarco-passage-doc2query.md)] [[docTTTTTquery](docs/regressions-msmarco-passage-docTTTTTquery.md)] -+ 
Regressions for MS MARCO Document Ranking - Per Doc: [[base](docs/regressions-msmarco-doc.md)] [[docTTTTTquery](docs/regressions-msmarco-doc-docTTTTTquery-per-doc.md)] -+ Regressions for MS MARCO Document Ranking - Per Passage: [[base](docs/regressions-msmarco-doc-per-passage.md)] [[docTTTTTquery](docs/regressions-msmarco-doc-docTTTTTquery-per-passage.md)] -+ Regressions for the TREC 2019 Deep Learning Track (Passage Ranking): [[base](docs/regressions-dl19-passage.md)] [[docTTTTTquery](docs/regressions-dl19-passage-docTTTTTquery.md)] -+ Regressions for the TREC 2019 Deep Learning Track (Document Ranking) - Per Doc: [[base](docs/regressions-dl19-doc.md)] [[docTTTTTquery](docs/regressions-dl19-doc-docTTTTTquery-per-doc.md)] -+ Regressions for the TREC 2019 Deep Learning Track (Document Ranking) - Per Passage: [[base](docs/regressions-dl19-doc-per-passage.md)] [[docTTTTTquery](docs/regressions-dl19-doc-docTTTTTquery-per-passage.md)] -+ Regressions for the TREC 2020 Deep Learning Track (Passage Ranking): [[base](docs/regressions-dl20-passage.md)] [[docTTTTTquery](docs/regressions-dl20-passage-docTTTTTquery.md)] -+ Regressions for the TREC 2020 Deep Learning Track (Document Ranking) - Per Doc: [[base](docs/regressions-dl20-doc.md)] [[docTTTTTquery](docs/regressions-dl20-doc-docTTTTTquery-per-doc.md)] -+ Regressions for the TREC 2020 Deep Learning Track (Document Ranking) - Per Passage: [[base](docs/regressions-dl20-doc-per-passage.md)] [[docTTTTTquery](docs/regressions-dl20-doc-docTTTTTquery-per-passage.md)] ++ Regressions for MS MARCO Document Ranking, Per Doc: [[base](docs/regressions-msmarco-doc.md)] [[docTTTTTquery](docs/regressions-msmarco-doc-docTTTTTquery-per-doc.md)] ++ Regressions for MS MARCO Document Ranking, Per Passage: [[base](docs/regressions-msmarco-doc-per-passage.md)] [[docTTTTTquery](docs/regressions-msmarco-doc-docTTTTTquery-per-passage.md)] ++ Regressions for the TREC 2019 Deep Learning Track (Passage): [[base](docs/regressions-dl19-passage.md)] 
[[docTTTTTquery](docs/regressions-dl19-passage-docTTTTTquery.md)] ++ Regressions for the TREC 2019 Deep Learning Track (Document), Per Doc: [[base](docs/regressions-dl19-doc.md)] [[docTTTTTquery](docs/regressions-dl19-doc-docTTTTTquery-per-doc.md)] ++ Regressions for the TREC 2019 Deep Learning Track (Document), Per Passage: [[base](docs/regressions-dl19-doc-per-passage.md)] [[docTTTTTquery](docs/regressions-dl19-doc-docTTTTTquery-per-passage.md)] ++ Regressions for the TREC 2020 Deep Learning Track (Passage): [[base](docs/regressions-dl20-passage.md)] [[docTTTTTquery](docs/regressions-dl20-passage-docTTTTTquery.md)] ++ Regressions for the TREC 2020 Deep Learning Track (Document), Per Doc: [[base](docs/regressions-dl20-doc.md)] [[docTTTTTquery](docs/regressions-dl20-doc-docTTTTTquery-per-doc.md)] ++ Regressions for the TREC 2020 Deep Learning Track (Document), Per Passage: [[base](docs/regressions-dl20-doc-per-passage.md)] [[docTTTTTquery](docs/regressions-dl20-doc-docTTTTTquery-per-passage.md)] + Regressions for the TREC News Track (Background Linking Task): [[2018](docs/regressions-backgroundlinking18.md)] [[2019](docs/regressions-backgroundlinking19.md)] [[2020](docs/regressions-backgroundlinking20.md)] + Regressions for [FEVER Fact Verification](docs/regressions-fever.md) + Regressions for [NTCIR-8 ACLIA (IR4QA subtask, Monolingual Chinese)](docs/regressions-ntcir8-zh.md) diff --git a/docs/regressions-backgroundlinking19.md b/docs/regressions-backgroundlinking19.md index 24c57f37fe..bef02cbbc4 100644 --- a/docs/regressions-backgroundlinking19.md +++ b/docs/regressions-backgroundlinking19.md @@ -69,5 +69,5 @@ NCDG@5 | BM25 | +RM3 | +RM3+DF | AP | BM25 | +RM3 | +RM3+DF | :---------------------------------------|-----------|-----------|-----------| -[TREC 2019 Topics](../src/main/resources/topics-and-qrels/topics.backgroundlinking19.txt)| 0.3027 | 0.3790 | 0.3158 | +[TREC 2019 Topics](../src/main/resources/topics-and-qrels/topics.backgroundlinking19.txt)| 0.3029 | 
0.3786 | 0.3154 | diff --git a/docs/regressions-backgroundlinking20.md b/docs/regressions-backgroundlinking20.md index 7983411fde..ff8651f856 100644 --- a/docs/regressions-backgroundlinking20.md +++ b/docs/regressions-backgroundlinking20.md @@ -64,10 +64,10 @@ With the above commands, you should be able to reproduce the following results: NCDG@5 | BM25 | +RM3 | +RM3+DF | :---------------------------------------|-----------|-----------|-----------| -[TREC 2020 Topics](../src/main/resources/topics-and-qrels/topics.backgroundlinking20.txt)| 0.5231 | 0.5673 | 0.5279 | +[TREC 2020 Topics](../src/main/resources/topics-and-qrels/topics.backgroundlinking20.txt)| 0.5231 | 0.5673 | 0.5316 | AP | BM25 | +RM3 | +RM3+DF | :---------------------------------------|-----------|-----------|-----------| -[TREC 2020 Topics](../src/main/resources/topics-and-qrels/topics.backgroundlinking20.txt)| 0.3286 | 0.4504 | 0.3421 | +[TREC 2020 Topics](../src/main/resources/topics-and-qrels/topics.backgroundlinking20.txt)| 0.3286 | 0.4519 | 0.3438 | diff --git a/docs/regressions-core18.md b/docs/regressions-core18.md index f2a9a38316..7ab78c75df 100644 --- a/docs/regressions-core18.md +++ b/docs/regressions-core18.md @@ -85,12 +85,12 @@ With the above commands, you should be able to reproduce the following results: MAP | BM25 | +RM3 | +Ax | QL | +RM3 | +Ax | :---------------------------------------|-----------|-----------|-----------|-----------|-----------|-----------| -[TREC 2018 Common Core Track Topics](../src/main/resources/topics-and-qrels/topics.core18.txt)| 0.2495 | 0.3135 | 0.2841 | 0.2526 | 0.3073 | 0.2919 | +[TREC 2018 Common Core Track Topics](../src/main/resources/topics-and-qrels/topics.core18.txt)| 0.2496 | 0.3139 | 0.2840 | 0.2527 | 0.3074 | 0.2920 | P30 | BM25 | +RM3 | +Ax | QL | +RM3 | +Ax | :---------------------------------------|-----------|-----------|-----------|-----------|-----------|-----------| -[TREC 2018 Common Core Track 
Topics](../src/main/resources/topics-and-qrels/topics.core18.txt)| 0.3567 | 0.4200 | 0.3947 | 0.3653 | 0.4000 | 0.4020 | +[TREC 2018 Common Core Track Topics](../src/main/resources/topics-and-qrels/topics.core18.txt)| 0.3573 | 0.4200 | 0.3947 | 0.3653 | 0.3993 | 0.4013 | ## Reproduction Log[*](reproducibility.md) diff --git a/docs/regressions-cw09b.md b/docs/regressions-cw09b.md index 82390d5eae..93b1a91822 100644 --- a/docs/regressions-cw09b.md +++ b/docs/regressions-cw09b.md @@ -168,27 +168,27 @@ With the above commands, you should be able to reproduce the following results: MAP | BM25 | +RM3 | +Ax | QL | +RM3 | +Ax | :---------------------------------------|-----------|-----------|-----------|-----------|-----------|-----------| -[TREC 2010 Web Track (Topics 51-100)](../src/main/resources/topics-and-qrels/topics.web.51-100.txt)| 0.1126 | 0.0933 | 0.0929 | 0.1060 | 0.1019 | 0.1086 | -[TREC 2011 Web Track (Topics 101-150)](../src/main/resources/topics-and-qrels/topics.web.101-150.txt)| 0.1094 | 0.1085 | 0.0975 | 0.0958 | 0.0839 | 0.0879 | -[TREC 2012 Web Track (Topics 151-200)](../src/main/resources/topics-and-qrels/topics.web.151-200.txt)| 0.1105 | 0.1107 | 0.1315 | 0.1069 | 0.1058 | 0.1212 | +[TREC 2010 Web Track (Topics 51-100)](../src/main/resources/topics-and-qrels/topics.web.51-100.txt)| 0.1126 | 0.0931 | 0.0961 | 0.1060 | 0.1019 | 0.1088 | +[TREC 2011 Web Track (Topics 101-150)](../src/main/resources/topics-and-qrels/topics.web.101-150.txt)| 0.1094 | 0.1085 | 0.0986 | 0.0959 | 0.0839 | 0.0860 | +[TREC 2012 Web Track (Topics 151-200)](../src/main/resources/topics-and-qrels/topics.web.151-200.txt)| 0.1106 | 0.1108 | 0.1356 | 0.1070 | 0.1058 | 0.1224 | P30 | BM25 | +RM3 | +Ax | QL | +RM3 | +Ax | :---------------------------------------|-----------|-----------|-----------|-----------|-----------|-----------| -[TREC 2010 Web Track (Topics 51-100)](../src/main/resources/topics-and-qrels/topics.web.51-100.txt)| 0.2694 | 0.2389 | 0.2354 | 0.2431 | 0.2312 | 0.2618 | 
-[TREC 2011 Web Track (Topics 101-150)](../src/main/resources/topics-and-qrels/topics.web.101-150.txt)| 0.2513 | 0.2480 | 0.2387 | 0.2147 | 0.2047 | 0.2173 | -[TREC 2012 Web Track (Topics 151-200)](../src/main/resources/topics-and-qrels/topics.web.151-200.txt)| 0.2167 | 0.1920 | 0.2553 | 0.2080 | 0.1980 | 0.2147 | +[TREC 2010 Web Track (Topics 51-100)](../src/main/resources/topics-and-qrels/topics.web.51-100.txt)| 0.2681 | 0.2382 | 0.2535 | 0.2438 | 0.2312 | 0.2625 | +[TREC 2011 Web Track (Topics 101-150)](../src/main/resources/topics-and-qrels/topics.web.101-150.txt)| 0.2513 | 0.2487 | 0.2367 | 0.2147 | 0.2053 | 0.2120 | +[TREC 2012 Web Track (Topics 151-200)](../src/main/resources/topics-and-qrels/topics.web.151-200.txt)| 0.2167 | 0.1927 | 0.2547 | 0.2080 | 0.1980 | 0.2220 | NDCG20 | BM25 | +RM3 | +Ax | QL | +RM3 | +Ax | :---------------------------------------|-----------|-----------|-----------|-----------|-----------|-----------| -[TREC 2010 Web Track (Topics 51-100)](../src/main/resources/topics-and-qrels/topics.web.51-100.txt)| 0.1354 | 0.1369 | 0.1632 | 0.1143 | 0.1182 | 0.1454 | -[TREC 2011 Web Track (Topics 101-150)](../src/main/resources/topics-and-qrels/topics.web.101-150.txt)| 0.1890 | 0.1916 | 0.1835 | 0.1619 | 0.1449 | 0.1517 | -[TREC 2012 Web Track (Topics 151-200)](../src/main/resources/topics-and-qrels/topics.web.151-200.txt)| 0.1014 | 0.0918 | 0.1441 | 0.0868 | 0.0896 | 0.1037 | +[TREC 2010 Web Track (Topics 51-100)](../src/main/resources/topics-and-qrels/topics.web.51-100.txt)| 0.1351 | 0.1368 | 0.1767 | 0.1143 | 0.1182 | 0.1495 | +[TREC 2011 Web Track (Topics 101-150)](../src/main/resources/topics-and-qrels/topics.web.101-150.txt)| 0.1894 | 0.1915 | 0.1854 | 0.1631 | 0.1449 | 0.1537 | +[TREC 2012 Web Track (Topics 151-200)](../src/main/resources/topics-and-qrels/topics.web.151-200.txt)| 0.1015 | 0.0918 | 0.1388 | 0.0875 | 0.0896 | 0.1091 | ERR20 | BM25 | +RM3 | +Ax | QL | +RM3 | +Ax | 
:---------------------------------------|-----------|-----------|-----------|-----------|-----------|-----------| -[TREC 2010 Web Track (Topics 51-100)](../src/main/resources/topics-and-qrels/topics.web.51-100.txt)| 0.0733 | 0.0747 | 0.0977 | 0.0599 | 0.0592 | 0.0742 | -[TREC 2011 Web Track (Topics 101-150)](../src/main/resources/topics-and-qrels/topics.web.101-150.txt)| 0.0959 | 0.0960 | 0.1091 | 0.0849 | 0.0787 | 0.0821 | -[TREC 2012 Web Track (Topics 151-200)](../src/main/resources/topics-and-qrels/topics.web.151-200.txt)| 0.1303 | 0.1494 | 0.2355 | 0.1305 | 0.1334 | 0.1558 | +[TREC 2010 Web Track (Topics 51-100)](../src/main/resources/topics-and-qrels/topics.web.51-100.txt)| 0.0733 | 0.0747 | 0.1019 | 0.0599 | 0.0592 | 0.0751 | +[TREC 2011 Web Track (Topics 101-150)](../src/main/resources/topics-and-qrels/topics.web.101-150.txt)| 0.0959 | 0.0959 | 0.0950 | 0.0850 | 0.0787 | 0.0861 | +[TREC 2012 Web Track (Topics 151-200)](../src/main/resources/topics-and-qrels/topics.web.151-200.txt)| 0.1304 | 0.1494 | 0.2399 | 0.1306 | 0.1333 | 0.1564 | diff --git a/docs/regressions-cw12.md b/docs/regressions-cw12.md index 50c4b415be..c90432c103 100644 --- a/docs/regressions-cw12.md +++ b/docs/regressions-cw12.md @@ -100,23 +100,23 @@ With the above commands, you should be able to reproduce the following results: MAP | BM25 | +RM3 | QL | +RM3 | :---------------------------------------|-----------|-----------|-----------|-----------| -[TREC 2013 Web Track (Topics 201-250)](../src/main/resources/topics-and-qrels/topics.web.201-250.txt)| 0.1694 | 0.1464 | 0.1494 | 0.1290 | -[TREC 2014 Web Track (Topics 251-300)](../src/main/resources/topics-and-qrels/topics.web.251-300.txt)| 0.2469 | 0.2324 | 0.2466 | 0.2177 | +[TREC 2013 Web Track (Topics 201-250)](../src/main/resources/topics-and-qrels/topics.web.201-250.txt)| 0.1695 | 0.1465 | 0.1493 | 0.1290 | +[TREC 2014 Web Track (Topics 251-300)](../src/main/resources/topics-and-qrels/topics.web.251-300.txt)| 0.2470 | 0.2330 | 0.2467 | 
0.2178 | P30 | BM25 | +RM3 | QL | +RM3 | :---------------------------------------|-----------|-----------|-----------|-----------| -[TREC 2013 Web Track (Topics 201-250)](../src/main/resources/topics-and-qrels/topics.web.201-250.txt)| 0.2773 | 0.2393 | 0.2607 | 0.2347 | -[TREC 2014 Web Track (Topics 251-300)](../src/main/resources/topics-and-qrels/topics.web.251-300.txt)| 0.4547 | 0.4080 | 0.4380 | 0.3800 | +[TREC 2013 Web Track (Topics 201-250)](../src/main/resources/topics-and-qrels/topics.web.201-250.txt)| 0.2767 | 0.2393 | 0.2607 | 0.2347 | +[TREC 2014 Web Track (Topics 251-300)](../src/main/resources/topics-and-qrels/topics.web.251-300.txt)| 0.4547 | 0.4080 | 0.4380 | 0.3813 | NDCG20 | BM25 | +RM3 | QL | +RM3 | :---------------------------------------|-----------|-----------|-----------|-----------| -[TREC 2013 Web Track (Topics 201-250)](../src/main/resources/topics-and-qrels/topics.web.201-250.txt)| 0.2088 | 0.2033 | 0.1993 | 0.1725 | -[TREC 2014 Web Track (Topics 251-300)](../src/main/resources/topics-and-qrels/topics.web.251-300.txt)| 0.2572 | 0.2530 | 0.2218 | 0.2083 | +[TREC 2013 Web Track (Topics 201-250)](../src/main/resources/topics-and-qrels/topics.web.201-250.txt)| 0.2085 | 0.2033 | 0.1993 | 0.1725 | +[TREC 2014 Web Track (Topics 251-300)](../src/main/resources/topics-and-qrels/topics.web.251-300.txt)| 0.2572 | 0.2516 | 0.2220 | 0.2093 | ERR20 | BM25 | +RM3 | QL | +RM3 | :---------------------------------------|-----------|-----------|-----------|-----------| -[TREC 2013 Web Track (Topics 201-250)](../src/main/resources/topics-and-qrels/topics.web.201-250.txt)| 0.1284 | 0.1264 | 0.1233 | 0.1008 | -[TREC 2014 Web Track (Topics 251-300)](../src/main/resources/topics-and-qrels/topics.web.251-300.txt)| 0.1616 | 0.1655 | 0.1322 | 0.1245 | +[TREC 2013 Web Track (Topics 201-250)](../src/main/resources/topics-and-qrels/topics.web.201-250.txt)| 0.1283 | 0.1265 | 0.1233 | 0.1007 | +[TREC 2014 Web Track (Topics 
251-300)](../src/main/resources/topics-and-qrels/topics.web.251-300.txt)| 0.1616 | 0.1652 | 0.1323 | 0.1249 | diff --git a/docs/regressions-cw12b13.md b/docs/regressions-cw12b13.md index 3adaeea0ee..3c16398953 100644 --- a/docs/regressions-cw12b13.md +++ b/docs/regressions-cw12b13.md @@ -128,26 +128,26 @@ With the above commands, you should be able to reproduce the following results: MAP | BM25 | +RM3 | +Ax | QL | +RM3 | +Ax | :---------------------------------------|-----------|-----------|-----------|-----------|-----------|-----------| -[TREC 2013 Web Track (Topics 201-250)](../src/main/resources/topics-and-qrels/topics.web.201-250.txt)| 0.0468 | 0.0408 | 0.0435 | 0.0397 | 0.0322 | 0.0358 | -[TREC 2014 Web Track (Topics 251-300)](../src/main/resources/topics-and-qrels/topics.web.251-300.txt)| 0.0224 | 0.0210 | 0.0180 | 0.0235 | 0.0203 | 0.0183 | +[TREC 2013 Web Track (Topics 201-250)](../src/main/resources/topics-and-qrels/topics.web.201-250.txt)| 0.0468 | 0.0408 | 0.0432 | 0.0397 | 0.0322 | 0.0356 | +[TREC 2014 Web Track (Topics 251-300)](../src/main/resources/topics-and-qrels/topics.web.251-300.txt)| 0.0224 | 0.0210 | 0.0181 | 0.0235 | 0.0203 | 0.0179 | P30 | BM25 | +RM3 | +Ax | QL | +RM3 | +Ax | :---------------------------------------|-----------|-----------|-----------|-----------|-----------|-----------| -[TREC 2013 Web Track (Topics 201-250)](../src/main/resources/topics-and-qrels/topics.web.201-250.txt)| 0.2113 | 0.1673 | 0.1833 | 0.1780 | 0.1513 | 0.1507 | +[TREC 2013 Web Track (Topics 201-250)](../src/main/resources/topics-and-qrels/topics.web.201-250.txt)| 0.2107 | 0.1673 | 0.1780 | 0.1773 | 0.1513 | 0.1567 | [TREC 2014 Web Track (Topics 251-300)](../src/main/resources/topics-and-qrels/topics.web.251-300.txt)| 0.1273 | 0.1207 | 0.1107 | 0.1373 | 0.1173 | 0.1147 | NDCG20 | BM25 | +RM3 | +Ax | QL | +RM3 | +Ax | :---------------------------------------|-----------|-----------|-----------|-----------|-----------|-----------| -[TREC 2013 Web Track (Topics 
201-250)](../src/main/resources/topics-and-qrels/topics.web.201-250.txt)| 0.1286 | 0.1119 | 0.1287 | 0.1106 | 0.0920 | 0.1141 | -[TREC 2014 Web Track (Topics 251-300)](../src/main/resources/topics-and-qrels/topics.web.251-300.txt)| 0.1183 | 0.1081 | 0.0963 | 0.1177 | 0.1004 | 0.0989 | +[TREC 2013 Web Track (Topics 201-250)](../src/main/resources/topics-and-qrels/topics.web.201-250.txt)| 0.1289 | 0.1114 | 0.1311 | 0.1104 | 0.0921 | 0.1113 | +[TREC 2014 Web Track (Topics 251-300)](../src/main/resources/topics-and-qrels/topics.web.251-300.txt)| 0.1183 | 0.1075 | 0.0974 | 0.1176 | 0.1004 | 0.0984 | ERR20 | BM25 | +RM3 | +Ax | QL | +RM3 | +Ax | :---------------------------------------|-----------|-----------|-----------|-----------|-----------|-----------| -[TREC 2013 Web Track (Topics 201-250)](../src/main/resources/topics-and-qrels/topics.web.201-250.txt)| 0.0838 | 0.0753 | 0.0941 | 0.0768 | 0.0553 | 0.0780 | -[TREC 2014 Web Track (Topics 251-300)](../src/main/resources/topics-and-qrels/topics.web.251-300.txt)| 0.1201 | 0.1066 | 0.0928 | 0.1092 | 0.0928 | 0.0900 | +[TREC 2013 Web Track (Topics 201-250)](../src/main/resources/topics-and-qrels/topics.web.201-250.txt)| 0.0838 | 0.0752 | 0.0949 | 0.0767 | 0.0552 | 0.0720 | +[TREC 2014 Web Track (Topics 251-300)](../src/main/resources/topics-and-qrels/topics.web.251-300.txt)| 0.1198 | 0.1055 | 0.0925 | 0.1091 | 0.0928 | 0.0879 | ## Reproduction Log[*](reproducibility.md) diff --git a/docs/regressions-gov2.md b/docs/regressions-gov2.md index 9fa0bc0962..b176f697ee 100644 --- a/docs/regressions-gov2.md +++ b/docs/regressions-gov2.md @@ -148,13 +148,13 @@ With the above commands, you should be able to reproduce the following results: MAP | BM25 | +RM3 | +Ax | QL | +RM3 | +Ax | :---------------------------------------|-----------|-----------|-----------|-----------|-----------|-----------| -[TREC 2004 Terabyte Track (Topics 701-750)](../src/main/resources/topics-and-qrels/topics.terabyte04.701-750.txt)| 0.2689 | 0.2844 | 
0.2669 | 0.2681 | 0.2708 | 0.2666 | -[TREC 2005 Terabyte Track (Topics 751-800)](../src/main/resources/topics-and-qrels/topics.terabyte05.751-800.txt)| 0.3390 | 0.3820 | 0.3666 | 0.3303 | 0.3559 | 0.3646 | -[TREC 2006 Terabyte Track (Topics 801-850)](../src/main/resources/topics-and-qrels/topics.terabyte06.801-850.txt)| 0.3080 | 0.3377 | 0.3069 | 0.2997 | 0.3154 | 0.3084 | +[TREC 2004 Terabyte Track (Topics 701-750)](../src/main/resources/topics-and-qrels/topics.terabyte04.701-750.txt)| 0.2689 | 0.2844 | 0.2730 | 0.2681 | 0.2709 | 0.2678 | +[TREC 2005 Terabyte Track (Topics 751-800)](../src/main/resources/topics-and-qrels/topics.terabyte05.751-800.txt)| 0.3391 | 0.3812 | 0.3649 | 0.3304 | 0.3550 | 0.3614 | +[TREC 2006 Terabyte Track (Topics 801-850)](../src/main/resources/topics-and-qrels/topics.terabyte06.801-850.txt)| 0.3081 | 0.3378 | 0.3129 | 0.2997 | 0.3154 | 0.3109 | P30 | BM25 | +RM3 | +Ax | QL | +RM3 | +Ax | :---------------------------------------|-----------|-----------|-----------|-----------|-----------|-----------| -[TREC 2004 Terabyte Track (Topics 701-750)](../src/main/resources/topics-and-qrels/topics.terabyte04.701-750.txt)| 0.4864 | 0.5190 | 0.4993 | 0.4755 | 0.4925 | 0.4932 | -[TREC 2005 Terabyte Track (Topics 751-800)](../src/main/resources/topics-and-qrels/topics.terabyte05.751-800.txt)| 0.5540 | 0.5920 | 0.5933 | 0.5347 | 0.5620 | 0.5840 | -[TREC 2006 Terabyte Track (Topics 801-850)](../src/main/resources/topics-and-qrels/topics.terabyte06.801-850.txt)| 0.4907 | 0.5160 | 0.5033 | 0.4720 | 0.4847 | 0.4920 | +[TREC 2004 Terabyte Track (Topics 701-750)](../src/main/resources/topics-and-qrels/topics.terabyte04.701-750.txt)| 0.4864 | 0.5190 | 0.5156 | 0.4755 | 0.4932 | 0.4925 | +[TREC 2005 Terabyte Track (Topics 751-800)](../src/main/resources/topics-and-qrels/topics.terabyte05.751-800.txt)| 0.5540 | 0.5913 | 0.5873 | 0.5340 | 0.5567 | 0.5867 | +[TREC 2006 Terabyte Track (Topics 
801-850)](../src/main/resources/topics-and-qrels/topics.terabyte06.801-850.txt)| 0.4907 | 0.5160 | 0.5073 | 0.4727 | 0.4840 | 0.4960 | diff --git a/docs/regressions-wt10g.md b/docs/regressions-wt10g.md index 56460886e5..b8969a9e47 100644 --- a/docs/regressions-wt10g.md +++ b/docs/regressions-wt10g.md @@ -84,9 +84,9 @@ With the above commands, you should be able to reproduce the following results: MAP | BM25 | +RM3 | +Ax | QL | +RM3 | +Ax | :---------------------------------------|-----------|-----------|-----------|-----------|-----------|-----------| -[Wt10g (Topics 451-550)](../src/main/resources/topics-and-qrels/topics.adhoc.451-550.txt)| 0.1992 | 0.2276 | 0.2200 | 0.2021 | 0.2188 | 0.2275 | +[Wt10g (Topics 451-550)](../src/main/resources/topics-and-qrels/topics.adhoc.451-550.txt)| 0.1991 | 0.2270 | 0.2196 | 0.2021 | 0.2188 | 0.2268 | P30 | BM25 | +RM3 | +Ax | QL | +RM3 | +Ax | :---------------------------------------|-----------|-----------|-----------|-----------|-----------|-----------| -[Wt10g (Topics 451-550)](../src/main/resources/topics-and-qrels/topics.adhoc.451-550.txt)| 0.2214 | 0.2398 | 0.2483 | 0.2180 | 0.2310 | 0.2514 | +[Wt10g (Topics 451-550)](../src/main/resources/topics-and-qrels/topics.adhoc.451-550.txt)| 0.2211 | 0.2401 | 0.2466 | 0.2180 | 0.2306 | 0.2469 | diff --git a/pom.xml b/pom.xml index b6c1e38d09..1948f153f0 100644 --- a/pom.xml +++ b/pom.xml @@ -337,7 +337,7 @@ org.jsoup jsoup - 1.8.3 + 1.14.2 jar diff --git a/src/main/resources/regression/backgroundlinking18.yaml b/src/main/resources/regression/backgroundlinking18.yaml index e98f243721..5b9310d1a7 100644 --- a/src/main/resources/regression/backgroundlinking18.yaml +++ b/src/main/resources/regression/backgroundlinking18.yaml @@ -23,7 +23,7 @@ collection: WashingtonPostCollection index_stats: documents: 595031 documents (non-empty): 595030 - total terms: 318219945 + total terms: 318219870 topics: - name: "[TREC 2018 
Topics](../src/main/resources/topics-and-qrels/topics.backgroundlinking18.txt)" path: topics.backgroundlinking18.txt diff --git a/src/main/resources/regression/backgroundlinking19.yaml b/src/main/resources/regression/backgroundlinking19.yaml index 7c49bd446f..7c6d80a328 100644 --- a/src/main/resources/regression/backgroundlinking19.yaml +++ b/src/main/resources/regression/backgroundlinking19.yaml @@ -23,7 +23,7 @@ collection: WashingtonPostCollection index_stats: documents: 595031 documents (non-empty): 595030 - total terms: 318219945 + total terms: 318219870 topics: - name: "[TREC 2019 Topics](../src/main/resources/topics-and-qrels/topics.backgroundlinking19.txt)" path: topics.backgroundlinking19.txt @@ -52,7 +52,7 @@ models: - -backgroundlinking -backgroundlinking.k 100 -bm25 -hits 100 results: AP: - - 0.3027 + - 0.3029 NCDG@5: - 0.4785 - name: bm25+rm3 @@ -61,7 +61,7 @@ models: - -backgroundlinking -backgroundlinking.k 100 -bm25 -rm3 -hits 100 results: AP: - - 0.3790 + - 0.3786 NCDG@5: - 0.5217 - name: bm25+rm3+df @@ -70,6 +70,6 @@ models: - -backgroundlinking -backgroundlinking.datefilter -backgroundlinking.k 100 -bm25 -rm3 -hits 100 results: AP: - - 0.3158 + - 0.3154 NCDG@5: - 0.5051 diff --git a/src/main/resources/regression/backgroundlinking20.yaml b/src/main/resources/regression/backgroundlinking20.yaml index 35c1d061a2..2c5a6a2aad 100644 --- a/src/main/resources/regression/backgroundlinking20.yaml +++ b/src/main/resources/regression/backgroundlinking20.yaml @@ -23,7 +23,7 @@ collection: WashingtonPostCollection index_stats: documents: 671945 documents (non-empty): 671945 - total terms: 366108299 + total terms: 366108177 topics: - name: "[TREC 2020 Topics](../src/main/resources/topics-and-qrels/topics.backgroundlinking20.txt)" path: topics.backgroundlinking20.txt @@ -61,7 +61,7 @@ models: - -backgroundlinking -backgroundlinking.k 100 -bm25 -rm3 -hits 100 results: AP: - - 0.4504 + - 0.4519 NCDG@5: - 0.5673 - name: bm25+rm3+df @@ -70,6 +70,6 @@ models: - 
-backgroundlinking -backgroundlinking.datefilter -backgroundlinking.k 100 -bm25 -rm3 -hits 100 results: AP: - - 0.3421 + - 0.3438 NCDG@5: - - 0.5279 + - 0.5316 diff --git a/src/main/resources/regression/core18.yaml b/src/main/resources/regression/core18.yaml index 66fc8a7ead..d36310f5fc 100644 --- a/src/main/resources/regression/core18.yaml +++ b/src/main/resources/regression/core18.yaml @@ -23,7 +23,7 @@ collection: WashingtonPostCollection index_stats: documents: 595031 documents (non-empty): 595030 - total terms: 318219945 + total terms: 318219870 topics: - name: "[TREC 2018 Common Core Track Topics](../src/main/resources/topics-and-qrels/topics.core18.txt)" path: topics.core18.txt @@ -52,9 +52,9 @@ models: - -bm25 results: map: - - 0.2495 + - 0.2496 p30: - - 0.3567 + - 0.3573 - name: bm25+rm3 display: +RM3 params: @@ -62,7 +62,7 @@ models: - -rm3 results: map: - - 0.3135 + - 0.3139 p30: - 0.4200 - name: bm25+ax @@ -74,7 +74,7 @@ models: - -rerankCutoff 20 results: map: - - 0.2841 + - 0.2840 p30: - 0.3947 - name: ql @@ -83,7 +83,7 @@ models: - -qld results: map: - - 0.2526 + - 0.2527 p30: - 0.3653 - name: ql+rm3 @@ -93,9 +93,9 @@ models: - -rm3 results: map: - - 0.3073 + - 0.3074 p30: - - 0.4000 + - 0.3993 - name: ql+ax display: +Ax params: @@ -105,6 +105,6 @@ models: - -rerankCutoff 20 results: map: - - 0.2919 + - 0.2920 p30: - - 0.4020 + - 0.4013 diff --git a/src/main/resources/regression/cw09b.yaml b/src/main/resources/regression/cw09b.yaml index 38477ed554..d258e54f03 100644 --- a/src/main/resources/regression/cw09b.yaml +++ b/src/main/resources/regression/cw09b.yaml @@ -21,9 +21,9 @@ index_options: - -storeRaw topic_reader: Webxml index_stats: - documents: 50220189 - documents (non-empty): 50220159 - total terms: 31302554269 + documents: 50220186 + documents (non-empty): 50220156 + total terms: 31300822176 topics: - name: "[TREC 2010 Web Track (Topics 51-100)](../src/main/resources/topics-and-qrels/topics.web.51-100.txt)" path: topics.web.51-100.txt @@ 
-72,19 +72,19 @@ models: map: - 0.1126 - 0.1094 - - 0.1105 + - 0.1106 p30: - - 0.2694 + - 0.2681 - 0.2513 - 0.2167 ndcg20: - - 0.13537 - - 0.18900 - - 0.10139 + - 0.13509 + - 0.18944 + - 0.10145 err20: - - 0.07335 + - 0.07330 - 0.09592 - - 0.13031 + - 0.13043 - name: bm25+rm3 display: +RM3 params: @@ -92,21 +92,21 @@ models: - -rm3 results: map: - - 0.0933 + - 0.0931 - 0.1085 - - 0.1107 + - 0.1108 p30: - - 0.2389 - - 0.2480 - - 0.1920 + - 0.2382 + - 0.2487 + - 0.1927 ndcg20: - - 0.13693 - - 0.19160 - - 0.09182 + - 0.13683 + - 0.19153 + - 0.09183 err20: - - 0.07473 - - 0.09596 - - 0.14936 + - 0.07469 + - 0.09590 + - 0.14937 - name: bm25+ax display: +Ax params: @@ -117,21 +117,21 @@ models: - -rerankCutoff 20 results: map: - - 0.0929 - - 0.0975 - - 0.1315 + - 0.0961 + - 0.0986 + - 0.1356 p30: - - 0.2354 - - 0.2387 - - 0.2553 + - 0.2535 + - 0.2367 + - 0.2547 ndcg20: - - 0.16319 - - 0.18348 - - 0.14413 + - 0.17665 + - 0.18536 + - 0.13878 err20: - - 0.09771 - - 0.10912 - - 0.23551 + - 0.10191 + - 0.09502 + - 0.23994 - name: ql display: QL params: @@ -139,20 +139,20 @@ models: results: map: - 0.1060 - - 0.0958 - - 0.1069 + - 0.0959 + - 0.1070 p30: - - 0.2431 + - 0.2438 - 0.2147 - 0.2080 ndcg20: - - 0.11432 - - 0.16191 - - 0.08682 + - 0.11431 + - 0.16311 + - 0.08755 err20: - 0.05994 - - 0.08486 - - 0.13052 + - 0.08502 + - 0.13063 - name: ql+rm3 display: +RM3 params: @@ -165,16 +165,16 @@ models: - 0.1058 p30: - 0.2312 - - 0.2047 + - 0.2053 - 0.1980 ndcg20: - - 0.11823 - - 0.14487 - - 0.08959 + - 0.11824 + - 0.14488 + - 0.08958 err20: - - 0.05917 + - 0.05918 - 0.07872 - - 0.13336 + - 0.13332 - name: ql+ax display: +Ax params: @@ -185,18 +185,18 @@ models: - -rerankCutoff 20 results: map: - - 0.1086 - - 0.0879 - - 0.1212 + - 0.1088 + - 0.0860 + - 0.1224 p30: - - 0.2618 - - 0.2173 - - 0.2147 + - 0.2625 + - 0.2120 + - 0.2220 ndcg20: - - 0.14541 - - 0.15174 - - 0.10373 + - 0.14950 + - 0.15366 + - 0.10911 err20: - - 0.07424 - - 0.08205 - - 0.15577 + - 0.07515 + - 0.08610 + - 
0.15644 diff --git a/src/main/resources/regression/cw12.yaml b/src/main/resources/regression/cw12.yaml index 01c1285263..7266d8aa28 100644 --- a/src/main/resources/regression/cw12.yaml +++ b/src/main/resources/regression/cw12.yaml @@ -21,9 +21,9 @@ index_options: - -storeRaw topic_reader: Webxml index_stats: - documents: 731705088 - documents (non-empty): 731556853 - total terms: 429328271635 + documents: 731645141 + documents (non-empty): 731542236 + total terms: 429234508918 topics: - name: "[TREC 2013 Web Track (Topics 201-250)](../src/main/resources/topics-and-qrels/topics.web.201-250.txt)" path: topics.web.201-250.txt @@ -67,17 +67,17 @@ models: - -bm25 results: map: - - 0.1694 - - 0.2469 + - 0.1695 + - 0.2470 p30: - - 0.2773 + - 0.2767 - 0.4547 ndcg20: - - 0.20881 - - 0.25719 + - 0.20848 + - 0.25720 err20: - - 0.12838 - - 0.16162 + - 0.12829 + - 0.16163 - name: bm25+rm3 display: +RM3 params: @@ -85,34 +85,34 @@ models: - -rm3 results: map: - - 0.1464 - - 0.2324 + - 0.1465 + - 0.2330 p30: - 0.2393 - 0.4080 ndcg20: - - 0.20327 - - 0.25303 + - 0.20325 + - 0.25163 err20: - - 0.12637 - - 0.16550 + - 0.12645 + - 0.16518 - name: ql display: QL params: - -qld results: map: - - 0.1494 - - 0.2466 + - 0.1493 + - 0.2467 p30: - 0.2607 - 0.4380 ndcg20: - 0.19935 - - 0.22184 + - 0.22201 err20: - 0.12325 - - 0.13218 + - 0.13234 - name: ql+rm3 display: +RM3 params: @@ -121,14 +121,14 @@ models: results: map: - 0.1290 - - 0.2177 + - 0.2178 p30: - 0.2347 - - 0.3800 + - 0.3813 ndcg20: - - 0.17253 - - 0.20829 + - 0.17248 + - 0.20926 err20: - - 0.10083 - - 0.12450 + - 0.10073 + - 0.12492 diff --git a/src/main/resources/regression/cw12b13.yaml b/src/main/resources/regression/cw12b13.yaml index b2578040af..0103f4d442 100644 --- a/src/main/resources/regression/cw12b13.yaml +++ b/src/main/resources/regression/cw12b13.yaml @@ -21,9 +21,9 @@ index_options: - -storeRaw topic_reader: Webxml index_stats: - documents: 52249039 - documents (non-empty): 52238526 - total terms: 30666923268 + 
documents: 52244809 + documents (non-empty): 52237520 + total terms: 30660015721 topics: - name: "[TREC 2013 Web Track (Topics 201-250)](../src/main/resources/topics-and-qrels/topics.web.201-250.txt)" path: topics.web.201-250.txt @@ -70,14 +70,14 @@ models: - 0.0468 - 0.0224 p30: - - 0.2113 + - 0.2107 - 0.1273 ndcg20: - - 0.12862 - - 0.11835 + - 0.12887 + - 0.11831 err20: - - 0.08378 - - 0.12006 + - 0.08377 + - 0.11980 - name: bm25+rm3 display: +RM3 params: @@ -91,11 +91,11 @@ models: - 0.1673 - 0.1207 ndcg20: - - 0.11192 - - 0.10809 + - 0.11139 + - 0.10754 err20: - - 0.07530 - - 0.10662 + - 0.07525 + - 0.10551 - name: bm25+ax display: +Ax params: @@ -106,17 +106,17 @@ models: - -rerankCutoff 20 results: map: - - 0.0435 - - 0.0180 + - 0.0432 + - 0.0181 p30: - - 0.1833 + - 0.1780 - 0.1107 ndcg20: - - 0.12867 - - 0.09627 + - 0.13111 + - 0.09735 err20: - - 0.09413 - - 0.09285 + - 0.09489 + - 0.09246 - name: ql display: QL params: @@ -126,14 +126,14 @@ models: - 0.0397 - 0.0235 p30: - - 0.1780 + - 0.1773 - 0.1373 ndcg20: - - 0.11059 - - 0.11765 + - 0.11038 + - 0.11762 err20: - - 0.07679 - - 0.10917 + - 0.07674 + - 0.10909 - name: ql+rm3 display: +RM3 params: @@ -147,10 +147,10 @@ models: - 0.1513 - 0.1173 ndcg20: - - 0.09199 + - 0.09211 - 0.10036 err20: - - 0.05525 + - 0.05522 - 0.09284 - name: ql+ax display: +Ax @@ -162,14 +162,14 @@ models: - -rerankCutoff 20 results: map: - - 0.0358 - - 0.0183 + - 0.0356 + - 0.0179 p30: - - 0.1507 + - 0.1567 - 0.1147 ndcg20: - - 0.11407 - - 0.09891 + - 0.11128 + - 0.09844 err20: - - 0.07803 - - 0.09002 + - 0.07195 + - 0.08795 diff --git a/src/main/resources/regression/disk12.yaml b/src/main/resources/regression/disk12.yaml index 70c37a0a93..66410efc8d 100644 --- a/src/main/resources/regression/disk12.yaml +++ b/src/main/resources/regression/disk12.yaml @@ -40,7 +40,7 @@ index_path: indexes/lucene-index.disk12.pos+docvectors+raw index_stats: documents: 741676 documents (non-empty): 741675 - total terms: 217199327 + total terms: 
217199384 topics: - name: "[TREC-1 Ad Hoc Topics 51-100](../src/main/resources/topics-and-qrels/topics.adhoc.51-100.txt)" path: topics.adhoc.51-100.txt diff --git a/src/main/resources/regression/gov2.yaml b/src/main/resources/regression/gov2.yaml index e4235d27f9..d96fddf659 100644 --- a/src/main/resources/regression/gov2.yaml +++ b/src/main/resources/regression/gov2.yaml @@ -38,9 +38,9 @@ evals: metric_precision: 4 can_combine: true index_stats: - documents: 25172934 - documents (non-empty): 25170664 - total terms: 17345062322 + documents: 25170853 + documents (non-empty): 25170665 + total terms: 17345663488 topics: - name: "[TREC 2004 Terabyte Track (Topics 701-750)](../src/main/resources/topics-and-qrels/topics.terabyte04.701-750.txt)" path: topics.terabyte04.701-750.txt @@ -59,8 +59,8 @@ models: results: map: - 0.2689 - - 0.3390 - - 0.3080 + - 0.3391 + - 0.3081 p30: - 0.4864 - 0.5540 @@ -73,11 +73,11 @@ models: results: map: - 0.2844 - - 0.3820 - - 0.3377 + - 0.3812 + - 0.3378 p30: - 0.5190 - - 0.5920 + - 0.5913 - 0.5160 - name: bm25+ax display: +Ax @@ -89,13 +89,13 @@ models: - -rerankCutoff 20 results: map: - - 0.2669 - - 0.3666 - - 0.3069 + - 0.2730 + - 0.3649 + - 0.3129 p30: - - 0.4993 - - 0.5933 - - 0.5033 + - 0.5156 + - 0.5873 + - 0.5073 - name: ql display: QL params: @@ -103,12 +103,12 @@ models: results: map: - 0.2681 - - 0.3303 + - 0.3304 - 0.2997 p30: - 0.4755 - - 0.5347 - - 0.4720 + - 0.5340 + - 0.4727 - name: ql+rm3 display: +RM3 params: @@ -116,13 +116,13 @@ models: - -rm3 results: map: - - 0.2708 - - 0.3559 + - 0.2709 + - 0.3550 - 0.3154 p30: - - 0.4925 - - 0.5620 - - 0.4847 + - 0.4932 + - 0.5567 + - 0.4840 - name: ql+ax display: +Ax params: @@ -133,10 +133,10 @@ models: - -rerankCutoff 20 results: map: - - 0.2666 - - 0.3646 - - 0.3084 + - 0.2678 + - 0.3614 + - 0.3109 p30: - - 0.4932 - - 0.5840 - - 0.4920 + - 0.4925 + - 0.5867 + - 0.4960 diff --git a/src/main/resources/regression/wt10g.yaml b/src/main/resources/regression/wt10g.yaml index 
e224bf3e7d..b177b2bc0a 100644 --- a/src/main/resources/regression/wt10g.yaml +++ b/src/main/resources/regression/wt10g.yaml @@ -39,9 +39,9 @@ input_roots: input: collections/web/wt10g/ index_path: indexes/lucene-index.wt10g.pos+docvectors+raw index_stats: - documents: 1688402 - documents (non-empty): 1688291 - total terms: 752790242 + documents: 1688390 + documents (non-empty): 1688299 + total terms: 752785964 topics: - name: "[Wt10g (Topics 451-550)](../src/main/resources/topics-and-qrels/topics.adhoc.451-550.txt)" path: topics.adhoc.451-550.txt @@ -53,9 +53,9 @@ models: - -bm25 results: map: - - 0.1992 + - 0.1991 p30: - - 0.2214 + - 0.2211 - name: bm25+rm3 display: +RM3 params: @@ -63,9 +63,9 @@ models: - -rm3 results: map: - - 0.2276 + - 0.2270 p30: - - 0.2398 + - 0.2401 - name: bm25+ax display: +Ax params: @@ -76,9 +76,9 @@ models: - -rerankCutoff 20 results: map: - - 0.2200 + - 0.2196 p30: - - 0.2483 + - 0.2466 - name: ql display: QL params: @@ -97,7 +97,7 @@ models: map: - 0.2188 p30: - - 0.2310 + - 0.2306 - name: ql+ax display: +Ax params: @@ -108,6 +108,6 @@ models: - -rerankCutoff 20 results: map: - - 0.2275 + - 0.2268 p30: - - 0.2514 + - 0.2469