fix: remove * imports #1569

Merged: 78 commits, Dec 9, 2024

Commits
dd5d226
fix: Count unique texts, data leaks in calculate metrics (#1438)
Samoed Nov 14, 2024
04ac3f2
fix: update task metadata to allow for null (#1448)
KennethEnevoldsen Nov 14, 2024
f6a49fe
Update tasks table
github-actions[bot] Nov 14, 2024
78c0e4e
1.19.5
invalid-email-address Nov 14, 2024
4e86cea
Fix: Made data parsing in the leaderboard figure more robust (#1450)
x-tabdeveloping Nov 14, 2024
039d010
Fixed task loading (#1451)
x-tabdeveloping Nov 14, 2024
feb1ab7
fix: publish (#1452)
x-tabdeveloping Nov 14, 2024
3397633
1.19.6
invalid-email-address Nov 14, 2024
14d7523
fix: Fix load external results with `None` mteb_version (#1453)
Samoed Nov 14, 2024
68eb498
1.19.7
invalid-email-address Nov 14, 2024
58c459b
WIP: Polishing up leaderboard UI (#1461)
x-tabdeveloping Nov 15, 2024
1b920ac
fix: loading pre 1.11.0 (#1460)
Samoed Nov 15, 2024
a988fef
1.19.8
invalid-email-address Nov 15, 2024
9b2aece
fix: swap touche2020 to maintain compatibility (#1469)
isaac-chung Nov 17, 2024
8bb4a29
1.19.9
invalid-email-address Nov 17, 2024
2fb6fe7
docs: Add sum per language for task counts (#1468)
isaac-chung Nov 18, 2024
fde124a
fix: pinned datasets to <3.0.0 (#1470)
Napuh Nov 19, 2024
7186e04
1.19.10
invalid-email-address Nov 19, 2024
1cc6c9e
feat: add CUREv1 retrieval dataset (#1459)
dbuades Nov 21, 2024
4408717
Update tasks table
github-actions[bot] Nov 21, 2024
3ff38ec
1.20.0
invalid-email-address Nov 21, 2024
917ad7f
fix: check if `model` attr of model exists (#1499)
Samoed Nov 26, 2024
cde720e
1.20.1
invalid-email-address Nov 26, 2024
0affa31
fix: Leaderboard demo data loading (#1507)
x-tabdeveloping Nov 27, 2024
594f643
1.20.2
invalid-email-address Nov 27, 2024
35245d3
fix: leaderboard only shows models that have ModelMeta (#1508)
x-tabdeveloping Nov 27, 2024
9282796
1.20.3
invalid-email-address Nov 27, 2024
942f212
fix: align readme with current mteb (#1493)
Samoed Nov 27, 2024
09f004c
1.20.4
invalid-email-address Nov 27, 2024
cfd43ac
docs: Add lang family mapping and map to task table (#1486)
isaac-chung Nov 28, 2024
377a63d
Update tasks table
github-actions[bot] Nov 28, 2024
e3d2b54
fix: Ensure that models match the names on embedding-benchmarks/resul…
KennethEnevoldsen Nov 29, 2024
9980c60
1.20.5
invalid-email-address Nov 29, 2024
b02ae82
fix: Adding missing metadata on models and mathcing names up with the…
x-tabdeveloping Nov 29, 2024
ba09b11
1.20.6
invalid-email-address Nov 29, 2024
8e12250
feat: Evaluate missing splits (#1525)
isaac-chung Nov 29, 2024
ee1edac
1.21.0
invalid-email-address Nov 29, 2024
343b6e0
fix: Correct typos superseeded -> superseded (#1532)
isaac-chung Nov 30, 2024
e949d2a
1.21.1
invalid-email-address Nov 30, 2024
5b6f20f
fix: Task load data error for SICK-BR-STS and XStance (#1534)
isaac-chung Dec 1, 2024
ec9413a
1.21.2
invalid-email-address Dec 1, 2024
39349ff
fix: Proprietary models now get correctly shown in leaderboard (#1530)
x-tabdeveloping Dec 2, 2024
d07c29b
1.21.3
invalid-email-address Dec 2, 2024
5fa7b7b
docs: Add Model Meta parameters and metadata (#1536)
isaac-chung Dec 2, 2024
36bab4d
fix: add more model meta (jina, e5) (#1537)
isaac-chung Dec 4, 2024
ac4a706
1.21.4
invalid-email-address Dec 4, 2024
c2f4c26
Add cohere models (#1538)
KennethEnevoldsen Dec 4, 2024
5013df8
fix: add nomic models (#1543)
KennethEnevoldsen Dec 4, 2024
97ab272
fix: Added all-minilm-l12-v2 (#1542)
KennethEnevoldsen Dec 4, 2024
df11c38
fix: Added arctic models (#1541)
KennethEnevoldsen Dec 4, 2024
37fdfa1
fix: add sentence trimming to OpenAIWrapper (#1526)
yjoonjang Dec 4, 2024
1e62184
1.21.5
invalid-email-address Dec 4, 2024
a44a46c
fix: Fixed metadata errors (#1547)
x-tabdeveloping Dec 4, 2024
d713525
1.21.6
invalid-email-address Dec 4, 2024
279a4ee
fix: remove curev1 from multlingual (#1552)
KennethEnevoldsen Dec 5, 2024
e339735
1.21.7
invalid-email-address Dec 5, 2024
2ee8d44
fix: Add Model2vec (#1546)
x-tabdeveloping Dec 6, 2024
2905813
Made result loading more permissive, changed eval splits for HotPotQA…
x-tabdeveloping Dec 6, 2024
a6ce6f9
1.21.8
invalid-email-address Dec 6, 2024
fc64791
docs: Correction of SICK-R metadata (#1558)
rafalposwiata Dec 7, 2024
611b6a1
feat(google_models): fix issues and add support for `text-embedding-0…
dbuades Dec 7, 2024
5e7e033
1.22.0
invalid-email-address Dec 7, 2024
ac44e58
fix(bm25s): search implementation (#1566)
dbuades Dec 7, 2024
346179f
Merge branch 'refs/heads/main' into update_cli
Samoed Dec 7, 2024
b8ff89c
1.22.1
invalid-email-address Dec 7, 2024
03347eb
docs: Fix dependency library name for bm25s (#1568)
isaac-chung Dec 7, 2024
6489fca
fix: Add training dataset to model meta (#1561)
KennethEnevoldsen Dec 8, 2024
1d21818
feat: (cohere_models) cohere_task_type issue, batch requests and tqdm…
dbuades Dec 8, 2024
68bd8ac
fix(publichealth-qa): ignore rows with `None` values in `question` o…
dbuades Dec 8, 2024
2550a27
1.23.0
invalid-email-address Dec 8, 2024
d474451
fix wongnai
Samoed Dec 8, 2024
2015ee5
update inits
Samoed Dec 8, 2024
23fb642
fix tests
Samoed Dec 8, 2024
54a7f5c
lint
Samoed Dec 8, 2024
07f1391
Merge branch 'refs/heads/main' into update_imports
Samoed Dec 8, 2024
d67225b
update imports
Samoed Dec 9, 2024
8653c27
fix tests
Samoed Dec 9, 2024
4ba6ff5
lint
Samoed Dec 9, 2024
31 changes: 27 additions & 4 deletions README.md
@@ -46,10 +46,8 @@ from sentence_transformers import SentenceTransformer

# Define the sentence-transformers model name
model_name = "average_word_embeddings_komninos"
# or directly from huggingface:
# model_name = "sentence-transformers/all-MiniLM-L6-v2"

model = SentenceTransformer(model_name)
model = mteb.get_model(model_name) # if the model is not implemented in MTEB, this is equivalent to SentenceTransformer(model_name)
tasks = mteb.get_tasks(tasks=["Banking77Classification"])
evaluation = mteb.MTEB(tasks=tasks)
results = evaluation.run(model, output_folder=f"results/{model_name}")
@@ -221,7 +219,10 @@ Note that the public leaderboard uses the test splits for all datasets except MS
Models should implement the following interface: an `encode` function that takes a list of sentences as input and returns a list of embeddings (embeddings can be `np.array`, `torch.tensor`, etc.). For inspiration, you can look at the [mteb/mtebscripts repo](https://github.com/embeddings-benchmark/mtebscripts) used for running diverse models via SLURM scripts for the paper.

```python
import mteb
from mteb.encoder_interface import PromptType
import numpy as np


class CustomModel:
def encode(
@@ -245,7 +246,7 @@ class CustomModel:
pass

model = CustomModel()
tasks = mteb.get_task("Banking77Classification")
tasks = mteb.get_tasks(tasks=["Banking77Classification"])
evaluation = MTEB(tasks=tasks)
evaluation.run(model)
```
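For reference, a minimal runnable sketch of a model satisfying this interface. This is an illustrative toy, not the repository's implementation: the random embeddings and the 384-dimension choice are placeholders, and the keyword names follow the `encode` signature shown above.

```python
from __future__ import annotations

import numpy as np

from mteb.encoder_interface import PromptType


class RandomEncoder:
    """Toy encoder returning random vectors; shows the expected `encode` signature."""

    def encode(
        self,
        sentences: list[str],
        task_name: str,
        prompt_type: PromptType | None = None,
        **kwargs,
    ) -> np.ndarray:
        # One vector per input sentence; 384 dimensions is an arbitrary placeholder.
        rng = np.random.default_rng(seed=42)
        return rng.normal(size=(len(sentences), 384))
```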
@@ -379,6 +380,28 @@ results = mteb.load_results(models=models, tasks=tasks)
df = results_to_dataframe(results)
```

</details>


<details>
<summary> Annotate Contamination in the training data of a model </summary>

### Annotate Contamination

Have you found contamination in the training data of a model? Please let us know, either by opening an issue or, ideally, by submitting a PR annotating the training datasets of the model:

```py
model_w_contamination = ModelMeta(
    name="model-with-contamination",
    ...  # other metadata fields
    training_datasets={
        "ArguAna": ["test"],  # name of the dataset within MTEB -> splits trained on
    },
    ...
)
```


</details>

<details>
22 changes: 17 additions & 5 deletions docs/create_tasks_table.py
@@ -8,6 +8,7 @@

import mteb
from mteb.abstasks.TaskMetadata import PROGRAMMING_LANGS, TASK_TYPE
from mteb.languages import ISO_TO_FAM_LEVEL0, ISO_TO_LANGUAGE


def author_from_bibtex(bibtex: str | None) -> str:
@@ -82,10 +83,21 @@ def create_task_lang_table(tasks: list[mteb.AbsTask], sort_by_sum=False) -> str:
## Wrangle for polars
pl_table_dict = []
for lang, d in table_dict.items():
d.update({"0-lang": lang}) # for sorting columns
d.update({"0-lang-code": lang}) # for sorting columns
pl_table_dict.append(d)

df = pl.DataFrame(pl_table_dict).sort(by="0-lang")
df = pl.DataFrame(pl_table_dict).sort(by="0-lang-code")
df = df.with_columns(
pl.col("0-lang-code")
.replace_strict(ISO_TO_LANGUAGE, default="unknown")
.alias("1-lang-name")
)
df = df.with_columns(
pl.col("0-lang-code")
.replace_strict(ISO_TO_FAM_LEVEL0, default="Unclassified")
.alias("2-lang-fam")
)

df = df.with_columns(sum=pl.sum_horizontal(get_args(TASK_TYPE)))
df = df.select(sorted(df.columns))
if sort_by_sum:
@@ -96,7 +108,7 @@ def create_task_lang_table(tasks: list[mteb.AbsTask], sort_by_sum=False) -> str:
task_names_md = " | ".join(sorted(get_args(TASK_TYPE)))
horizontal_line_md = "---|---" * (len(sorted(get_args(TASK_TYPE))) + 1)
table = f"""
| Language | {task_names_md} | Sum |
| ISO Code | Language | Family | {task_names_md} | Sum |
|{horizontal_line_md}|
"""

@@ -119,14 +131,14 @@ def insert_tables(
file_path: str, tables: list[str], tags: list[str] = ["TASKS TABLE"]
) -> None:
"""Insert tables within <!-- TABLE START --> and <!-- TABLE END --> or similar tags."""
md = Path(file_path).read_text()
md = Path(file_path).read_text(encoding="utf-8")

for table, tag in zip(tables, tags):
start = f"<!-- {tag} START -->"
end = f"<!-- {tag} END -->"
md = md.replace(md[md.index(start) + len(start) : md.index(end)], table)

Path(file_path).write_text(md)
Path(file_path).write_text(md, encoding="utf-8")


def main():
16 changes: 13 additions & 3 deletions mteb/__init__.py
@@ -10,17 +10,23 @@
MTEB_RETRIEVAL_WITH_INSTRUCTIONS,
CoIR,
)
from mteb.evaluation import *
from mteb.encoder_interface import Encoder
from mteb.evaluation import MTEB
from mteb.load_results import BenchmarkResults, load_results
from mteb.models import get_model, get_model_meta, get_model_metas
from mteb.load_results.task_results import TaskResult
from mteb.models import (
SentenceTransformerWrapper,
get_model,
get_model_meta,
get_model_metas,
)
from mteb.overview import TASKS_REGISTRY, get_task, get_tasks

from .benchmarks.benchmarks import Benchmark
from .benchmarks.get_benchmark import BENCHMARK_REGISTRY, get_benchmark, get_benchmarks

__version__ = version("mteb") # fetch version from install metadata


__all__ = [
"MTEB_ENG_CLASSIC",
"MTEB_MAIN_RU",
@@ -40,4 +46,8 @@
"get_benchmarks",
"BenchmarkResults",
"BENCHMARK_REGISTRY",
"MTEB",
"TaskResult",
"SentenceTransformerWrapper",
"Encoder",
]
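This `__init__.py` rewrite is the core of the PR: the `from mteb.evaluation import *` star import is replaced with explicit imports plus an `__all__` list, so the package's public surface is deterministic and lint-checkable. A short sketch of what the explicit exports enable, assuming an installed `mteb` at this revision:

```python
import mteb
from mteb import MTEB, TaskResult  # now explicit re-exports, not star-import side effects

# With __all__ defined, the public API is enumerable and machine-checkable:
assert "MTEB" in mteb.__all__
assert "SentenceTransformerWrapper" in mteb.__all__

# `from mteb import *` now binds exactly the names listed in __all__, nothing incidental.
print(sorted(mteb.__all__))
```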
6 changes: 3 additions & 3 deletions mteb/abstasks/AbsTask.py
@@ -72,11 +72,11 @@ def __init__(self, seed: int = 42, **kwargs: Any):
torch.manual_seed(self.seed)
torch.cuda.manual_seed_all(self.seed)

def check_if_dataset_is_superseeded(self):
"""Check if the dataset is superseeded by a newer version"""
def check_if_dataset_is_superseded(self):
"""Check if the dataset is superseded by a newer version"""
if self.superseded_by:
logger.warning(
f"Dataset '{self.metadata.name}' is superseeded by '{self.superseded_by}', you might consider using the newer version of the dataset."
f"Dataset '{self.metadata.name}' is superseded by '{self.superseded_by}', you might consider using the newer version of the dataset."
)

def dataset_transform(self):
1 change: 1 addition & 0 deletions mteb/abstasks/TaskMetadata.py
@@ -168,6 +168,7 @@
"cc0-1.0",
"bsd-3-clause",
"gpl-3.0",
"lgpl-3.0",
"cdla-sharing-1.0",
"mpl-2.0",
]
44 changes: 31 additions & 13 deletions mteb/abstasks/__init__.py
@@ -1,15 +1,33 @@
from __future__ import annotations

from ..evaluation.LangMapping import *
from .AbsTask import *
from .AbsTaskBitextMining import *
from .AbsTaskClassification import *
from .AbsTaskClustering import *
from .AbsTaskMultilabelClassification import *
from .AbsTaskPairClassification import *
from .AbsTaskReranking import *
from .AbsTaskRetrieval import *
from .AbsTaskSpeedTask import *
from .AbsTaskSTS import *
from .AbsTaskSummarization import *
from .MultilingualTask import *
from .AbsTask import AbsTask
from .AbsTaskBitextMining import AbsTaskBitextMining
from .AbsTaskClassification import AbsTaskClassification
from .AbsTaskClustering import AbsTaskClustering
from .AbsTaskClusteringFast import AbsTaskClusteringFast
from .AbsTaskMultilabelClassification import AbsTaskMultilabelClassification
from .AbsTaskPairClassification import AbsTaskPairClassification
from .AbsTaskReranking import AbsTaskReranking
from .AbsTaskRetrieval import AbsTaskRetrieval
from .AbsTaskSpeedTask import AbsTaskSpeedTask
from .AbsTaskSTS import AbsTaskSTS
from .AbsTaskSummarization import AbsTaskSummarization
from .MultilingualTask import MultilingualTask
from .TaskMetadata import TaskMetadata

__all__ = [
"AbsTask",
"AbsTaskBitextMining",
"AbsTaskClassification",
"AbsTaskClustering",
"AbsTaskClusteringFast",
"AbsTaskMultilabelClassification",
"AbsTaskPairClassification",
"AbsTaskReranking",
"AbsTaskRetrieval",
"AbsTaskSpeedTask",
"AbsTaskSTS",
"AbsTaskSummarization",
"MultilingualTask",
"TaskMetadata",
]
57 changes: 55 additions & 2 deletions mteb/benchmarks/__init__.py
@@ -1,4 +1,57 @@
from __future__ import annotations

from mteb.benchmarks.benchmarks import *
from mteb.benchmarks.get_benchmark import *
from mteb.benchmarks.benchmarks import (
BRIGHT,
LONG_EMBED,
MTEB_DEU,
MTEB_EN,
MTEB_ENG_CLASSIC,
MTEB_EU,
MTEB_FRA,
MTEB_INDIC,
MTEB_JPN,
MTEB_KOR,
MTEB_MAIN_RU,
MTEB_MINERS_BITEXT_MINING,
MTEB_POL,
MTEB_RETRIEVAL_LAW,
MTEB_RETRIEVAL_MEDICAL,
MTEB_RETRIEVAL_WITH_INSTRUCTIONS,
SEB,
Benchmark,
CoIR,
MTEB_code,
MTEB_multilingual,
)
from mteb.benchmarks.get_benchmark import (
BENCHMARK_REGISTRY,
get_benchmark,
get_benchmarks,
)

__all__ = [
"Benchmark",
"MTEB_EN",
"MTEB_ENG_CLASSIC",
"MTEB_MAIN_RU",
"MTEB_RETRIEVAL_WITH_INSTRUCTIONS",
"MTEB_RETRIEVAL_LAW",
"MTEB_RETRIEVAL_MEDICAL",
"MTEB_MINERS_BITEXT_MINING",
"SEB",
"CoIR",
"MTEB_FRA",
"MTEB_DEU",
"MTEB_KOR",
"MTEB_POL",
"MTEB_code",
"MTEB_multilingual",
"MTEB_JPN",
"MTEB_INDIC",
"MTEB_EU",
"LONG_EMBED",
"BRIGHT",
"BENCHMARK_REGISTRY",
"get_benchmarks",
"get_benchmark",
]
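The benchmarks subpackage gets the same treatment; a brief usage sketch of the now-explicit registry (again assuming an installed `mteb` at this revision):

```python
from mteb.benchmarks import MTEB_ENG_CLASSIC, get_benchmarks

# Every registered benchmark is reachable through the explicit exports.
for benchmark in get_benchmarks():
    print(benchmark.name)

# Individual benchmark objects can also be imported directly:
print(len(MTEB_ENG_CLASSIC.tasks))
```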
44 changes: 44 additions & 0 deletions mteb/descriptive_stats/Classification/Ddisco.json
@@ -0,0 +1,44 @@
{
"test": {
"num_samples": 201,
"number_of_characters": 200062,
"number_texts_intersect_with_train": 1,
"min_text_length": 529,
"average_text_length": 995.3333333333334,
"max_text_length": 2050,
"unique_text": 201,
"unique_labels": 3,
"labels": {
"2": {
"count": 76
},
"3": {
"count": 115
},
"1": {
"count": 10
}
}
},
"train": {
"num_samples": 801,
"number_of_characters": 779241,
"number_texts_intersect_with_train": null,
"min_text_length": 492,
"average_text_length": 972.8352059925094,
"max_text_length": 2411,
"unique_text": 796,
"unique_labels": 3,
"labels": {
"1": {
"count": 30
},
"2": {
"count": 325
},
"3": {
"count": 446
}
}
}
}
@@ -0,0 +1,38 @@
{
"test": {
"num_samples": 1200,
"number_of_characters": 141679,
"number_texts_intersect_with_train": 0,
"min_text_length": 25,
"average_text_length": 118.06583333333333,
"max_text_length": 566,
"unique_text": 1200,
"unique_labels": 2,
"labels": {
"1": {
"count": 600
},
"0": {
"count": 600
}
}
},
"train": {
"num_samples": 330,
"number_of_characters": 37706,
"number_texts_intersect_with_train": null,
"min_text_length": 19,
"average_text_length": 114.26060606060607,
"max_text_length": 315,
"unique_text": 330,
"unique_labels": 2,
"labels": {
"1": {
"count": 165
},
"0": {
"count": 165
}
}
}
}