-
Notifications
You must be signed in to change notification settings - Fork 307
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
fix: Added Misc Chinese models (#1819)
* Added moka and piccolo models to overview file * Added Text2Vec models * Added various Chinese embedding models --------- Co-authored-by: Isaac Chung <chungisaac1217@gmail.com>
- Loading branch information
1 parent
3b2d074
commit 9823529
Showing
3 changed files
with
196 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,103 @@ | ||
"""Implementation of Text2Vec models""" | ||
|
||
from __future__ import annotations | ||
|
||
from mteb.model_meta import ModelMeta | ||
|
||
# I couldn't find the large model on HF for some reason | ||
text2vec_base_chinese = ModelMeta( | ||
name="shibing624/text2vec-base-chinese", | ||
languages=["zho-Hans"], | ||
open_weights=True, | ||
revision="183bb99aa7af74355fb58d16edf8c13ae7c5433e", | ||
release_date="2022-01-23", | ||
n_parameters=102 * 1e6, | ||
memory_usage=None, | ||
embed_dim=768, | ||
license="apache-2.0", | ||
max_tokens=512, | ||
reference="https://huggingface.co/shibing624/text2vec-base-chinese", | ||
similarity_fn_name="cosine", | ||
framework=["Sentence Transformers", "PyTorch"], | ||
use_instructions=False, | ||
superseded_by=None, | ||
adapted_from=None, | ||
public_training_code=False, # Couldn't find it | ||
public_training_data=True, | ||
training_datasets={ | ||
# source: https://huggingface.co/shibing624/text2vec-base-chinese | ||
# Not in MTEB | ||
# - shibing624/nli-zh-all/text2vec-base-chinese-sentence-dataset | ||
# (Could have overlaps I'm not aware of) | ||
}, | ||
) | ||
|
||
text2vec_base_chinese_paraphrase = ModelMeta( | ||
name="shibing624/text2vec-base-chinese-paraphrase", | ||
languages=["zho-Hans"], | ||
open_weights=True, | ||
revision="e90c150a9c7fb55a67712a766d6820c55fb83cdd", | ||
release_date="2023-06-19", | ||
n_parameters=118 * 1e6, | ||
memory_usage=None, | ||
embed_dim=768, | ||
license="apache-2.0", | ||
max_tokens=512, | ||
reference="https://huggingface.co/shibing624/text2vec-base-chinese-paraphrase", | ||
similarity_fn_name="cosine", | ||
framework=["Sentence Transformers", "PyTorch"], | ||
use_instructions=False, | ||
superseded_by=None, | ||
adapted_from=None, | ||
public_training_code=False, # Couldn't find it | ||
public_training_data=True, | ||
training_datasets={ | ||
# source: https://huggingface.co/shibing624/text2vec-base-chinese-paraphrase | ||
# Not in MTEB | ||
# - shibing624/nli-zh-all/text2vec-base-chinese-paraphrase-dataset | ||
# (Could have overlaps I'm not aware of) | ||
}, | ||
) | ||
|
||
|
||
text2vec_multi_langs = [ | ||
"deu-Latn", # German (de) | ||
"eng-Latn", # English (en) | ||
"spa-Latn", # Spanish (es) | ||
"fra-Latn", # French (fr) | ||
"ita-Latn", # Italian (it) | ||
"nld-Latn", # Dutch (nl) | ||
"pol-Latn", # Polish (pl) | ||
"por-Latn", # Portuguese (pt) | ||
"rus-Cyrl", # Russian (ru) | ||
"zho-Hans", # Chinese (Simplified, zh) | ||
] | ||
text2vec_base_multilingual = ModelMeta( | ||
name="shibing624/text2vec-base-multilingual", | ||
languages=text2vec_multi_langs, | ||
open_weights=True, | ||
revision="6633dc49e554de7105458f8f2e96445c6598e9d1", | ||
release_date="2023-06-22", | ||
# While it can be loaded with SBERT, Hugging Face flags one file in the repository as suspicious, | ||
# so it is probably best not to load it. | ||
loader=None, | ||
n_parameters=118 * 1e6, | ||
memory_usage=None, | ||
embed_dim=384, | ||
license="apache-2.0", | ||
max_tokens=256, | ||
reference="https://huggingface.co/shibing624/text2vec-base-multilingual", | ||
similarity_fn_name="cosine", | ||
framework=["Sentence Transformers", "PyTorch"], | ||
use_instructions=False, | ||
superseded_by=None, | ||
adapted_from="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2", | ||
public_training_code=False, # Couldn't find it | ||
public_training_data=True, | ||
training_datasets={ | ||
# source: https://huggingface.co/shibing624/text2vec-base-multilingual | ||
# Not in MTEB | ||
# - shibing624/nli-zh-all/tree/main/text2vec-base-multilingual-dataset | ||
# (Could have overlaps I'm not aware of) | ||
}, | ||
) |