fix: Filling missing metadata for leaderboard release (#1895)

* Update ArxivClusteringS2S.py * fill some metadata for retrieval * fill in the rest of missing metadata * fix metadata * fix climatefever metadata * fix: Added CQADupstack annotations * removed annotation for non-existent task * format * Added financial to other financial dataset * Moved ArguAna annotation to derivative datasets --------- Co-authored-by: Kenneth Enevoldsen <kennethcenevoldsen@gmail.com>
embeddings-benchmark · Jan 30, 2025 · 938e90f · 938e90f
1 parent b7e412d
commit 938e90f
Show file tree

Hide file tree

Showing 42 changed files with 242 additions and 213 deletions.
diff --git a/mteb/abstasks/TaskMetadata.py b/mteb/abstasks/TaskMetadata.py
@@ -71,6 +71,7 @@
     "Written",
     "Programming",
     "Chemistry",
+    "Financial",
 ]
 
 SAMPLE_CREATION_METHOD = Literal[
@@ -171,6 +172,7 @@
         "gpl-3.0",
         "cdla-sharing-1.0",
         "mpl-2.0",
+        "msr-la-nc",
         "multiple",
     ]
 )

diff --git a/mteb/models/arctic_models.py b/mteb/models/arctic_models.py
@@ -110,7 +110,8 @@
         # in MTEB
         "NQ": ["test"],
         "NQHardNegatives": ["test"],
-        "HotPotQA": ["test"],
+        "NQ-PL": ["test"],  # translated, not trained on
+        "HotPotQA": ["test"],
         "HotPotQAHardNegatives": ["test"],
         "HotPotQA-PL": ["test"],  # translated from hotpotQA (not trained on)
         "FEVER": ["test"],

diff --git a/mteb/models/e5_instruct.py b/mteb/models/e5_instruct.py
@@ -19,7 +19,6 @@
     **E5_TRAINING_DATA,
     "FEVER": ["train"],
     "FEVERHardNegatives": ["train"],
-    "FEVER-PL": ["train"],  # translation not trained on
     "HotpotQA": ["train"],
     "HotpotQAHardNegatives": ["train"],
     "HotpotQA-PL": ["train"],  # translation not trained on

diff --git a/mteb/models/e5_models.py b/mteb/models/e5_models.py
@@ -130,7 +130,6 @@
     **E5_TRAINING_DATA,
     "FEVER": ["train"],
     "FEVERHardNegatives": ["train"],
-    "FEVER-PL": ["train"],  # translation not trained on
     "HotpotQA": ["train"],
     "HotpotQAHardNegatives": ["train"],
     "HotpotQA-PL": ["train"],  # translation not trained on

diff --git a/mteb/models/gritlm_models.py b/mteb/models/gritlm_models.py
@@ -16,7 +16,6 @@
     # also uses medi2 which contains fever and hotpotqa:
     "FEVER": ["train"],
     "FEVERHardNegatives": ["train"],
-    "FEVER-PL": ["train"],  # translation not trained on
     "HotpotQA": ["train"],
     "HotpotQAHardNegatives": ["train"],
     "HotpotQA-PL": ["train"],  # translation not trained on

diff --git a/mteb/models/salesforce_models.py b/mteb/models/salesforce_models.py
@@ -22,7 +22,6 @@ def instruction_template(
     "FiQA2018-PL": ["train"],
     "FEVER": ["train"],
     "FEVERHardNegatives": ["train"],
-    "FEVER-PL": ["train"],  # translation not trained on
     "HotpotQA": ["train"],
     "HotpotQAHardNegatives": ["train"],
     "HotpotQA-PL": ["train"],  # translation not trained on

diff --git a/mteb/tasks/Classification/eng/FinancialPhrasebankClassification.py b/mteb/tasks/Classification/eng/FinancialPhrasebankClassification.py
@@ -22,7 +22,7 @@ class FinancialPhrasebankClassification(AbsTaskClassification):
         eval_langs=["eng-Latn"],
         main_score="accuracy",
         date=("2013-11-01", "2013-11-01"),
-        domains=["News", "Written"],
+        domains=["News", "Written", "Financial"],
         task_subtypes=["Sentiment/Hate speech"],
         license="cc-by-nc-sa-3.0",
         annotations_creators="expert-annotated",

diff --git a/mteb/tasks/Classification/kor/KorFin.py b/mteb/tasks/Classification/kor/KorFin.py
@@ -25,7 +25,7 @@ class KorFin(AbsTaskClassification):
             "2022-01-01",
             "2022-12-31",
         ),  # Assumed date based on the citations in the paper
-        domains=["News", "Written"],
+        domains=["News", "Written", "Financial"],
         task_subtypes=["Sentiment/Hate speech"],
         license="cc-by-sa-4.0",
         annotations_creators="expert-annotated",

diff --git a/mteb/tasks/Clustering/eng/ArxivClusteringS2S.py b/mteb/tasks/Clustering/eng/ArxivClusteringS2S.py
@@ -21,13 +21,13 @@ class ArxivClusteringS2S(AbsTaskClustering):
         eval_splits=["test"],
         eval_langs=["eng-Latn"],
         main_score="v_measure",
-        date=None,
-        domains=None,
-        task_subtypes=None,
-        license=None,
-        annotations_creators=None,
-        dialect=None,
-        sample_creation=None,
+        date=("1991-01-01", "2021-01-01"),  # 1991-01-01 is the first arxiv paper
+        domains=["Academic", "Written"],
+        task_subtypes=[],
+        license="cc0-1.0",
+        annotations_creators="derived",
+        dialect=[],
+        sample_creation="found",
         bibtex_citation="""@misc{arxiv_org_submitters_2024,
     title={arXiv Dataset},
     url={https://www.kaggle.com/dsv/7548853},

diff --git a/mteb/tasks/Clustering/eng/RedditClustering.py b/mteb/tasks/Clustering/eng/RedditClustering.py
@@ -85,14 +85,13 @@ class RedditClustering(AbsTaskClustering):
         eval_splits=["test"],
         eval_langs=["eng-Latn"],
         main_score="v_measure",
-        date=None,
-        form=None,
-        domains=None,
-        task_subtypes=None,
-        license=None,
-        annotations_creators=None,
-        dialect=None,
-        sample_creation=None,
+        date=("2021-01-01", "2021-04-14"),
+        domains=["Web", "Social", "Written"],
+        task_subtypes=["Thematic clustering"],
+        license="not specified",  # derived from pushshift
+        annotations_creators="derived",
+        dialect=[],
+        sample_creation="found",
         bibtex_citation="""@article{geigle:2021:arxiv,
         author    = {Gregor Geigle and 
                         Nils Reimers and 

diff --git a/mteb/tasks/Clustering/eng/RedditClusteringP2P.py b/mteb/tasks/Clustering/eng/RedditClusteringP2P.py
@@ -29,14 +29,13 @@ class RedditClusteringP2P(AbsTaskClustering):
         eval_splits=["test"],
         eval_langs=["eng-Latn"],
         main_score="v_measure",
-        date=None,
-        form=None,
-        domains=None,
-        task_subtypes=None,
-        license=None,
-        annotations_creators=None,
-        dialect=None,
-        sample_creation=None,
+        date=("2021-01-01", "2021-04-14"),
+        domains=["Web", "Social", "Written"],
+        task_subtypes=["Thematic clustering"],
+        license="not specified",  # derived from pushshift
+        annotations_creators="derived",
+        dialect=[],
+        sample_creation="found",
         bibtex_citation="""@article{geigle:2021:arxiv,
         author    = {Gregor Geigle and 
                         Nils Reimers and 

diff --git a/mteb/tasks/Clustering/eng/StackExchangeClustering.py b/mteb/tasks/Clustering/eng/StackExchangeClustering.py
@@ -87,14 +87,13 @@ class StackExchangeClustering(AbsTaskClustering):
         eval_splits=["test"],
         eval_langs=["eng-Latn"],
         main_score="v_measure",
-        date=None,
-        form=None,
-        domains=None,
-        task_subtypes=None,
-        license=None,
-        annotations_creators=None,
-        dialect=None,
-        sample_creation=None,
+        date=("2021-01-01", "2021-04-14"),
+        domains=["Web", "Written"],
+        task_subtypes=["Thematic clustering"],
+        license="not specified",
+        annotations_creators="derived",
+        dialect=[],
+        sample_creation="found",
         bibtex_citation="""@article{geigle:2021:arxiv,
         author    = {Gregor Geigle and 
                         Nils Reimers and 

diff --git a/mteb/tasks/Clustering/eng/StackExchangeClusteringP2P.py b/mteb/tasks/Clustering/eng/StackExchangeClusteringP2P.py
@@ -91,13 +91,13 @@ class StackExchangeClusteringP2P(AbsTaskClustering):
         eval_splits=["test"],
         eval_langs=["eng-Latn"],
         main_score="v_measure",
-        date=None,
-        domains=None,
-        task_subtypes=None,
-        license=None,
-        annotations_creators=None,
-        dialect=None,
-        sample_creation=None,
+        date=("2021-01-01", "2021-04-14"),
+        domains=["Web", "Written"],
+        task_subtypes=["Thematic clustering"],
+        license="not specified",
+        annotations_creators="derived",
+        dialect=[],
+        sample_creation="found",
         bibtex_citation="""@article{geigle:2021:arxiv,
         author    = {Gregor Geigle and 
                         Nils Reimers and 

diff --git a/mteb/tasks/PairClassification/eng/TwitterSemEval2015PC.py b/mteb/tasks/PairClassification/eng/TwitterSemEval2015PC.py
@@ -21,12 +21,12 @@ class TwitterSemEval2015PC(AbsTaskPairClassification):
         eval_langs=["eng-Latn"],
         main_score="max_ap",
         date=None,
-        domains=None,
-        task_subtypes=None,
-        license=None,
-        annotations_creators=None,
-        dialect=None,
-        sample_creation=None,
+        domains=["Social", "Written"],
+        task_subtypes=[],
+        license="not specified",
+        annotations_creators="human-annotated",
+        dialect=[],
+        sample_creation="found",
         bibtex_citation="""@inproceedings{xu-etal-2015-semeval,
         title = "{S}em{E}val-2015 Task 1: Paraphrase and Semantic Similarity in {T}witter ({PIT})",
         author = "Xu, Wei  and

diff --git a/mteb/tasks/PairClassification/eng/TwitterURLCorpusPC.py b/mteb/tasks/PairClassification/eng/TwitterURLCorpusPC.py
@@ -21,12 +21,12 @@ class TwitterURLCorpusPC(AbsTaskPairClassification):
         eval_langs=["eng-Latn"],
         main_score="max_ap",
         date=None,
-        domains=None,
-        task_subtypes=None,
-        license=None,
-        annotations_creators=None,
-        dialect=None,
-        sample_creation=None,
+        domains=["Social", "Written"],
+        task_subtypes=[],
+        license="not specified",
+        annotations_creators="derived",
+        dialect=[],
+        sample_creation="found",
         bibtex_citation="""@inproceedings{lan-etal-2017-continuously,
             title = "A Continuously Growing Dataset of Sentential Paraphrases",
             author = "Lan, Wuwei  and

diff --git a/mteb/tasks/Reranking/eng/AskUbuntuDupQuestions.py b/mteb/tasks/Reranking/eng/AskUbuntuDupQuestions.py
@@ -21,12 +21,12 @@ class AskUbuntuDupQuestions(AbsTaskReranking):
         eval_langs=["eng-Latn"],
         main_score="map",
         date=None,
-        domains=None,
+        domains=["Programming", "Web"],
         task_subtypes=None,
         license=None,
-        annotations_creators=None,
-        dialect=None,
-        sample_creation=None,
+        annotations_creators="human-annotated",
+        dialect=[],
+        sample_creation="found",
         prompt="Retrieve duplicate questions from AskUbuntu forum",
         bibtex_citation="""@article{wang-2021-TSDAE,
     title = "TSDAE: Using Transformer-based Sequential Denoising Auto-Encoderfor Unsupervised Sentence Embedding Learning",

diff --git a/mteb/tasks/Reranking/eng/StackOverflowDupQuestions.py b/mteb/tasks/Reranking/eng/StackOverflowDupQuestions.py
@@ -20,13 +20,13 @@ class StackOverflowDupQuestions(AbsTaskReranking):
         eval_splits=["test"],
         eval_langs=["eng-Latn"],
         main_score="map",
-        date=None,
-        domains=None,
-        task_subtypes=None,
-        license=None,
-        annotations_creators=None,
-        dialect=None,
-        sample_creation=None,
+        date=("2014-01-21", "2018-01-01"),
+        domains=["Written", "Blog", "Programming"],
+        task_subtypes=["Question answering"],
+        license="cc-by-nc-sa-4.0",
+        annotations_creators="derived",
+        dialect=[],
+        sample_creation="found",
         prompt="Retrieve duplicate questions from StackOverflow forum",
         bibtex_citation="""@article{Liu2018LinkSOAD,
   title={LinkSO: a dataset for learning to retrieve similar question answer pairs on software development forums},

diff --git a/mteb/tasks/Retrieval/eng/CQADupstackAndroidRetrieval.py b/mteb/tasks/Retrieval/eng/CQADupstackAndroidRetrieval.py
@@ -21,12 +21,12 @@ class CQADupstackAndroidRetrieval(AbsTaskRetrieval):
         eval_langs=["eng-Latn"],
         main_score="ndcg_at_10",
         date=None,
-        domains=None,
-        task_subtypes=None,
-        license=None,
-        annotations_creators=None,
-        dialect=None,
-        sample_creation=None,
+        domains=["Programming", "Web", "Written", "Non-fiction"],
+        task_subtypes=["Question answering", "Duplicate Detection"],
+        license="apache-2.0",
+        annotations_creators="derived",
+        dialect=[],
+        sample_creation="found",
         bibtex_citation="""@inproceedings{hoogeveen2015,
 author = {Hoogeveen, Doris and Verspoor, Karin M. and Baldwin, Timothy},
 title = {CQADupStack: A Benchmark Data Set for Community Question-Answering Research},

diff --git a/mteb/tasks/Retrieval/eng/CQADupstackEnglishRetrieval.py b/mteb/tasks/Retrieval/eng/CQADupstackEnglishRetrieval.py
@@ -21,12 +21,12 @@ class CQADupstackEnglishRetrieval(AbsTaskRetrieval):
         eval_langs=["eng-Latn"],
         main_score="ndcg_at_10",
         date=None,
-        domains=None,
-        task_subtypes=None,
-        license=None,
-        annotations_creators=None,
-        dialect=None,
-        sample_creation=None,
+        domains=["Written"],
+        task_subtypes=["Question answering", "Duplicate Detection"],
+        license="apache-2.0",
+        annotations_creators="derived",
+        dialect=[],
+        sample_creation="found",
         bibtex_citation="""@inproceedings{hoogeveen2015,
 author = {Hoogeveen, Doris and Verspoor, Karin M. and Baldwin, Timothy},
 title = {CQADupStack: A Benchmark Data Set for Community Question-Answering Research},

diff --git a/mteb/tasks/Retrieval/eng/CQADupstackGamingRetrieval.py b/mteb/tasks/Retrieval/eng/CQADupstackGamingRetrieval.py
@@ -21,12 +21,12 @@ class CQADupstackGamingRetrieval(AbsTaskRetrieval):
         eval_langs=["eng-Latn"],
         main_score="ndcg_at_10",
         date=None,
-        domains=None,
-        task_subtypes=None,
-        license=None,
-        annotations_creators=None,
-        dialect=None,
-        sample_creation=None,
+        domains=["Web", "Written"],
+        task_subtypes=["Question answering", "Duplicate Detection"],
+        license="apache-2.0",
+        annotations_creators="derived",
+        dialect=[],
+        sample_creation="found",
         bibtex_citation="""@inproceedings{hoogeveen2015,
 author = {Hoogeveen, Doris and Verspoor, Karin M. and Baldwin, Timothy},
 title = {CQADupStack: A Benchmark Data Set for Community Question-Answering Research},

diff --git a/mteb/tasks/Retrieval/eng/CQADupstackGisRetrieval.py b/mteb/tasks/Retrieval/eng/CQADupstackGisRetrieval.py
@@ -21,12 +21,12 @@ class CQADupstackGisRetrieval(AbsTaskRetrieval):
         eval_langs=["eng-Latn"],
         main_score="ndcg_at_10",
         date=None,
-        domains=None,
-        task_subtypes=None,
-        license=None,
-        annotations_creators=None,
-        dialect=None,
-        sample_creation=None,
+        domains=["Written", "Non-fiction"],
+        task_subtypes=["Question answering", "Duplicate Detection"],
+        license="apache-2.0",
+        annotations_creators="derived",
+        dialect=[],
+        sample_creation="found",
         bibtex_citation="""@inproceedings{hoogeveen2015,
 author = {Hoogeveen, Doris and Verspoor, Karin M. and Baldwin, Timothy},
 title = {CQADupStack: A Benchmark Data Set for Community Question-Answering Research},

diff --git a/mteb/tasks/Retrieval/eng/CQADupstackMathematicaRetrieval.py b/mteb/tasks/Retrieval/eng/CQADupstackMathematicaRetrieval.py
@@ -21,12 +21,12 @@ class CQADupstackMathematicaRetrieval(AbsTaskRetrieval):
         eval_langs=["eng-Latn"],
         main_score="ndcg_at_10",
         date=None,
-        domains=None,
-        task_subtypes=None,
-        license=None,
-        annotations_creators=None,
-        dialect=None,
-        sample_creation=None,
+        domains=["Written", "Academic", "Non-fiction"],
+        task_subtypes=["Question answering", "Duplicate Detection"],
+        license="apache-2.0",
+        annotations_creators="derived",
+        dialect=[],
+        sample_creation="found",
         bibtex_citation="""@inproceedings{hoogeveen2015,
 author = {Hoogeveen, Doris and Verspoor, Karin M. and Baldwin, Timothy},
 title = {CQADupStack: A Benchmark Data Set for Community Question-Answering Research},

diff --git a/mteb/tasks/Retrieval/eng/CQADupstackPhysicsRetrieval.py b/mteb/tasks/Retrieval/eng/CQADupstackPhysicsRetrieval.py
@@ -21,12 +21,12 @@ class CQADupstackPhysicsRetrieval(AbsTaskRetrieval):
         eval_langs=["eng-Latn"],
         main_score="ndcg_at_10",
         date=None,
-        domains=None,
-        task_subtypes=None,
-        license=None,
-        annotations_creators=None,
-        dialect=None,
-        sample_creation=None,
+        domains=["Written", "Academic", "Non-fiction"],
+        task_subtypes=["Question answering", "Duplicate Detection"],
+        license="apache-2.0",
+        annotations_creators="derived",
+        dialect=[],
+        sample_creation="found",
         bibtex_citation="""@inproceedings{hoogeveen2015,
 author = {Hoogeveen, Doris and Verspoor, Karin M. and Baldwin, Timothy},
 title = {CQADupStack: A Benchmark Data Set for Community Question-Answering Research},

diff --git a/mteb/tasks/Retrieval/eng/CQADupstackProgrammersRetrieval.py b/mteb/tasks/Retrieval/eng/CQADupstackProgrammersRetrieval.py
@@ -23,7 +23,7 @@ class CQADupstackProgrammersRetrieval(AbsTaskRetrieval):
         date=None,
         domains=["Programming", "Written", "Non-fiction"],
         task_subtypes=[],
-        license="cc-by-sa-4.0",
+        license="apache-2.0",
         annotations_creators="derived",
         dialect=[],
         sample_creation="found",