Skip to content

Commit

Permalink
fix: Added Misc Chinese models (#1819)
Browse files Browse the repository at this point in the history
* Added moka and piccolo models to overview file

* Added Text2Vec models

* Added various Chinese embedding models

---------

Co-authored-by: Isaac Chung <chungisaac1217@gmail.com>
  • Loading branch information
x-tabdeveloping and isaac-chung authored Jan 17, 2025
1 parent 3b2d074 commit 9823529
Show file tree
Hide file tree
Showing 3 changed files with 196 additions and 0 deletions.
88 changes: 88 additions & 0 deletions mteb/models/misc_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -1738,3 +1738,91 @@
training_datasets=None, # They don't specify
superseded_by=None,
)
# Metadata-only registry entry (loader=None) for lier007/xiaobu-embedding.
xiaobu_embedding = ModelMeta(
    # --- identity / provenance ---
    name="lier007/xiaobu-embedding",
    reference="https://huggingface.co/lier007/xiaobu-embedding",
    revision="59c79d82eb5223cd9895f6eb8e825c7fa10e4e92",
    release_date="2024-01-09",
    adapted_from="thenlper/gte-large-zh",
    superseded_by=None,
    # --- licensing / openness ---
    license="not specified",
    open_weights=True,
    public_training_code=None,
    public_training_data=False,
    # Fine-tuned from GTE; neither this model nor its base discloses training data.
    training_datasets=None,
    # --- architecture / usage ---
    languages=["zho_Hans"],
    n_parameters=326 * 1e6,
    memory_usage=None,
    embed_dim=1024,
    max_tokens=512,
    similarity_fn_name="cosine",
    use_instructions=None,
    framework=["PyTorch", "Sentence Transformers"],
    loader=None,
)
# Metadata-only registry entry (loader=None) for lier007/xiaobu-embedding-v2.
xiaobu_embedding_v2 = ModelMeta(
    # --- identity / provenance ---
    name="lier007/xiaobu-embedding-v2",
    reference="https://huggingface.co/lier007/xiaobu-embedding-v2",
    revision="1912f2e59a5c2ef802a471d735a38702a5c9485e",
    release_date="2024-06-30",
    adapted_from="sensenova/piccolo-base-zh",
    superseded_by=None,
    # --- licensing / openness ---
    license="not specified",
    open_weights=True,
    public_training_code=None,
    public_training_data=False,
    # Fine-tuned from piccolo-embedding; neither model card states the training data.
    training_datasets=None,
    # --- architecture / usage ---
    languages=["zho_Hans"],
    n_parameters=326 * 1e6,
    memory_usage=None,
    embed_dim=768,
    max_tokens=512,
    similarity_fn_name="cosine",
    use_instructions=None,
    framework=["PyTorch", "Sentence Transformers"],
    loader=None,
)
# Metadata-only registry entry (loader=None) for Classical/Yinka.
yinka_embedding = ModelMeta(
    # --- identity / provenance ---
    name="Classical/Yinka",
    reference="https://huggingface.co/Classical/Yinka",
    # NOTE(review): revision and release_date are identical to the values used
    # for lier007/xiaobu-embedding in this file -- possibly a copy-paste;
    # confirm against the Classical/Yinka repository.
    revision="59c79d82eb5223cd9895f6eb8e825c7fa10e4e92",
    release_date="2024-01-09",
    adapted_from="dunzhang/stella-mrl-large-zh-v3.5-1792d",
    superseded_by=None,
    # --- licensing / openness ---
    license="not specified",
    open_weights=True,
    public_training_code=None,
    public_training_data=False,
    training_datasets=None,  # Training data is not disclosed
    # --- architecture / usage ---
    languages=["zho_Hans"],
    n_parameters=326 * 1e6,
    memory_usage=None,
    embed_dim=1024,
    max_tokens=512,
    similarity_fn_name="cosine",
    use_instructions=None,
    framework=["PyTorch", "Sentence Transformers"],
    loader=None,
)
# Metadata-only registry entry (loader=None) for TencentBAC/Conan-embedding-v1.
conan_embedding = ModelMeta(
    name="TencentBAC/Conan-embedding-v1",
    revision="bb9749a57d4f02fd71722386f8d0f5a9398d7eeb",
    release_date="2024-08-22",
    languages=["zho_Hans"],
    loader=None,
    n_parameters=326 * 1e6,
    memory_usage=None,
    max_tokens=512,
    embed_dim=768,
    license="cc-by-nc-4.0",
    open_weights=True,
    public_training_data=False,
    public_training_code=None,
    framework=["PyTorch", "Sentence Transformers"],
    # Must point at this model's own page; it previously linked to
    # Classical/Yinka, copied from the preceding entry.
    reference="https://huggingface.co/TencentBAC/Conan-embedding-v1",
    similarity_fn_name="cosine",
    use_instructions=None,
    # source: https://arxiv.org/pdf/2408.15710
    # The authors scraped data from the internet; the exact contents are
    # unknown, so benchmark leakage cannot be ruled out.
    training_datasets=None,
    superseded_by=None,
)
5 changes: 5 additions & 0 deletions mteb/models/overview.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@
nomic_models,
nvidia_models,
openai_models,
piccolo_models,
promptriever_models,
repllama_models,
rerankers_custom,
Expand All @@ -44,6 +45,7 @@
salesforce_models,
sentence_transformers_models,
stella_models,
text2vec_models,
uae_models,
voyage_models,
)
Expand All @@ -69,11 +71,13 @@
llm2vec_models,
mxbai_models,
model2vec_models,
moka_models,
misc_models,
nomic_models,
no_instruct_sentence_models,
nvidia_models,
openai_models,
piccolo_models,
promptriever_models,
repllama_models,
rerankers_custom,
Expand All @@ -88,6 +92,7 @@
jina_models,
jasper_models,
uae_models,
text2vec_models,
stella_models,
uae_models,
voyage_models,
Expand Down
103 changes: 103 additions & 0 deletions mteb/models/text2vec_models.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
"""Implementation of Text2Vec models"""

from __future__ import annotations

from mteb.model_meta import ModelMeta

# I couldn't find the large model on HF for some reason
# Registry entry for shibing624/text2vec-base-chinese.
text2vec_base_chinese = ModelMeta(
    # --- identity / provenance ---
    name="shibing624/text2vec-base-chinese",
    reference="https://huggingface.co/shibing624/text2vec-base-chinese",
    revision="183bb99aa7af74355fb58d16edf8c13ae7c5433e",
    release_date="2022-01-23",
    adapted_from=None,
    superseded_by=None,
    # --- licensing / openness ---
    license="apache-2.0",
    open_weights=True,
    public_training_code=False,  # No training code could be located
    public_training_data=True,
    training_datasets={
        # source: https://huggingface.co/shibing624/text2vec-base-chinese
        # Trained on a dataset that is not part of MTEB:
        #   - shibing624/nli-zh-all/text2vec-base-chinese-sentence-dataset
        # (Overlap with MTEB tasks has not been ruled out.)
    },
    # --- architecture / usage ---
    # NOTE(review): hyphenated code here, whereas the misc_models.py entries
    # added in the same commit use "zho_Hans" -- confirm the expected form.
    languages=["zho-Hans"],
    n_parameters=102 * 1e6,
    memory_usage=None,
    embed_dim=768,
    max_tokens=512,
    similarity_fn_name="cosine",
    use_instructions=False,
    framework=["Sentence Transformers", "PyTorch"],
)

# Registry entry for shibing624/text2vec-base-chinese-paraphrase.
text2vec_base_chinese_paraphrase = ModelMeta(
    # --- identity / provenance ---
    name="shibing624/text2vec-base-chinese-paraphrase",
    reference="https://huggingface.co/shibing624/text2vec-base-chinese-paraphrase",
    revision="e90c150a9c7fb55a67712a766d6820c55fb83cdd",
    release_date="2023-06-19",
    adapted_from=None,
    superseded_by=None,
    # --- licensing / openness ---
    license="apache-2.0",
    open_weights=True,
    public_training_code=False,  # No training code could be located
    public_training_data=True,
    training_datasets={
        # source: https://huggingface.co/shibing624/text2vec-base-chinese
        # Trained on a dataset that is not part of MTEB:
        #   - shibing624/nli-zh-all/text2vec-base-chinese-paraphrase
        # (Overlap with MTEB tasks has not been ruled out.)
    },
    # --- architecture / usage ---
    languages=["zho-Hans"],
    n_parameters=118 * 1e6,
    memory_usage=None,
    embed_dim=768,
    max_tokens=512,
    similarity_fn_name="cosine",
    use_instructions=False,
    framework=["Sentence Transformers", "PyTorch"],
)


# Language codes passed as `languages` for text2vec-base-multilingual, written
# as "<language>-<script>"; the trailing comment gives the two-letter code.
# NOTE(review): these use a hyphen, whereas the misc_models.py entries added
# in the same commit use an underscore ("zho_Hans") -- confirm the expected form.
text2vec_multi_langs: list[str] = [
    "deu-Latn",  # de: German
    "eng-Latn",  # en: English
    "spa-Latn",  # es: Spanish
    "fra-Latn",  # fr: French
    "ita-Latn",  # it: Italian
    "nld-Latn",  # nl: Dutch
    "pol-Latn",  # pl: Polish
    "por-Latn",  # pt: Portuguese
    "rus-Cyrl",  # ru: Russian
    "zho-Hans",  # zh: Chinese (Simplified)
]
# Metadata-only registry entry for shibing624/text2vec-base-multilingual.
text2vec_base_multilingual = ModelMeta(
    name="shibing624/text2vec-base-multilingual",
    languages=text2vec_multi_langs,
    open_weights=True,
    revision="6633dc49e554de7105458f8f2e96445c6598e9d1",
    release_date="2023-06-22",
    # While it can be loaded with SBERT, Hugging Face flags one file in the
    # repository as suspicious, so no loader is registered.
    loader=None,
    n_parameters=118 * 1e6,
    memory_usage=None,
    embed_dim=384,
    license="apache-2.0",
    max_tokens=256,
    # Must point at this model's own page; it previously linked to
    # text2vec-base-chinese-paraphrase, copied from the preceding entry.
    reference="https://huggingface.co/shibing624/text2vec-base-multilingual",
    similarity_fn_name="cosine",
    framework=["Sentence Transformers", "PyTorch"],
    use_instructions=False,
    superseded_by=None,
    adapted_from="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
    public_training_code=False,  # Couldn't find it
    public_training_data=True,
    training_datasets={
        # source: https://huggingface.co/shibing624/text2vec-base-chinese
        # Not in MTEB
        #   - shibing624/nli-zh-all/tree/main/text2vec-base-multilingual-dataset
        # (Could have overlaps I'm not aware of)
    },
)

0 comments on commit 9823529

Please sign in to comment.