
Commit 413000e (merge, 2 parents: 3aa10c0 + 7ccaf64)

Merge branch 'new-flagembedding-v1' of https://github.com/hanhainebula/FlagEmbedding into new-flagembedding-v1

22 files changed: 222 additions & 156 deletions (7 files shown below)

FlagEmbedding/abc/evaluation/arguments.py

Lines changed: 2 additions & 2 deletions
```diff
@@ -81,7 +81,7 @@ class AbsEvalModelArgs:
         metadata={"help": "The embedder name or path.", "required": True}
     )
     embedder_model_class: Optional[str] = field(
-        default="auto", metadata={"help": "The embedder model class. Available classes: ['auto', 'encoder-only-base', 'encoder-only-m3', 'decoder-only-base', 'decoder-only-icl']. Default: auto.", "choices": ["auto", "encoder-only-base", "encoder-only-m3", "decoder-only-base", "decoder-only-icl"]}
+        default=None, metadata={"help": "The embedder model class. Available classes: ['encoder-only-base', 'encoder-only-m3', 'decoder-only-base', 'decoder-only-icl']. Default: None. For a custom model, you need to specify the model class.", "choices": ["encoder-only-base", "encoder-only-m3", "decoder-only-base", "decoder-only-icl"]}
     )
     normalize_embeddings: bool = field(
         default=True, metadata={"help": "whether to normalize the embeddings"}
@@ -114,7 +114,7 @@ class AbsEvalModelArgs:
         default=None, metadata={"help": "The reranker name or path."}
     )
     reranker_model_class: Optional[str] = field(
-        default="auto", metadata={"help": "The reranker model class. Available classes: ['auto', 'encoder-only-base', 'decoder-only-base', 'decoder-only-layerwise', 'decoder-only-lightweight']. Default: auto.", "choices": ["auto", "encoder-only-base", "decoder-only-base", "decoder-only-layerwise", "decoder-only-lightweight"]}
+        default=None, metadata={"help": "The reranker model class. Available classes: ['encoder-only-base', 'decoder-only-base', 'decoder-only-layerwise', 'decoder-only-lightweight']. Default: None. For a custom model, you need to specify the model class.", "choices": ["encoder-only-base", "decoder-only-base", "decoder-only-layerwise", "decoder-only-lightweight"]}
     )
     reranker_peft_path: Optional[str] = field(
        default=None, metadata={"help": "The reranker peft path."}
```
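The practical effect of this change: `embedder_model_class` and `reranker_model_class` no longer default to `'auto'`; leaving them unset (`None`) triggers name-based resolution, and custom models must name a class explicitly. A minimal sketch of both paths, assuming the field names match the help strings above and that `HfArgumentParser` is used to parse `AbsEvalModelArgs` (the custom checkpoint path is hypothetical):

```python
# Hedged sketch: field names (embedder_name_or_path, embedder_model_class)
# are inferred from the help strings above; the parser setup is an assumption.
from transformers import HfArgumentParser
from FlagEmbedding.abc.evaluation.arguments import AbsEvalModelArgs

parser = HfArgumentParser(AbsEvalModelArgs)

# Known BGE checkpoints: the class may be left unset and resolved by name.
(model_args,) = parser.parse_args_into_dataclasses(
    args=["--embedder_name_or_path", "BAAI/bge-large-en-v1.5"]
)
assert model_args.embedder_model_class is None  # was "auto" before this commit

# Custom checkpoints: the class must now be given explicitly.
(model_args,) = parser.parse_args_into_dataclasses(
    args=[
        "--embedder_name_or_path", "./my-custom-embedder",  # hypothetical path
        "--embedder_model_class", "encoder-only-base",
    ]
)
```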

FlagEmbedding/inference/__init__.py

Lines changed: 6 additions & 2 deletions
```diff
@@ -2,17 +2,21 @@
 from .auto_reranker import FlagAutoReranker
 from .embedder import (
     FlagModel, BGEM3FlagModel,
-    FlagICLModel, FlagLLMModel
+    FlagICLModel, FlagLLMModel,
+    EmbedderModelClass
 )
 from .reranker import (
     FlagReranker,
-    FlagLLMReranker, LayerWiseFlagLLMReranker, LightWeightFlagLLMReranker
+    FlagLLMReranker, LayerWiseFlagLLMReranker, LightWeightFlagLLMReranker,
+    RerankerModelClass
 )


 __all__ = [
     "FlagAutoModel",
     "FlagAutoReranker",
+    "EmbedderModelClass",
+    "RerankerModelClass",
     "FlagModel",
     "BGEM3FlagModel",
     "FlagICLModel",
```
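With these exports in place, the model-class enums become importable from the package root. A quick usage sketch, assuming (as the by-value `EmbedderModelClass(model_class)` lookups elsewhere in this commit suggest) that both are string-valued `Enum`s:

```python
# Hedged sketch: assumes EmbedderModelClass/RerankerModelClass are
# string-valued Enums, as the lookups in auto_embedder.py and
# auto_reranker.py suggest.
from FlagEmbedding.inference import EmbedderModelClass, RerankerModelClass

print([m.value for m in EmbedderModelClass])
# expected to include: 'encoder-only-base', 'encoder-only-m3',
# 'decoder-only-base', 'decoder-only-icl'
print([m.value for m in RerankerModelClass])
```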

FlagEmbedding/inference/auto_embedder.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -34,7 +34,7 @@ def from_finetuned(
         if model_name.startswith("checkpoint-"):
             model_name = os.path.basename(os.path.dirname(model_name_or_path))

-        if model_class is not None and model_class != 'auto':
+        if model_class is not None:
             _model_class = EMBEDDER_CLASS_MAPPING[EmbedderModelClass(model_class)]
             if pooling_method is None:
                 pooling_method = _model_class.DEFAULT_POOLING_METHOD
```
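With `'auto'` removed as a sentinel, the dispatch is simpler: `model_class=None` falls through to name-based resolution, and any explicit value is looked up in `EMBEDDER_CLASS_MAPPING`. A hedged sketch of how a caller exercises both paths (the top-level re-export of `FlagAutoModel` and the custom checkpoint path are assumptions):

```python
# Hedged sketch; the import path and checkpoint path are assumptions,
# the parameter names follow the diff above.
from FlagEmbedding import FlagAutoModel

# Known model name: model_class stays None and is resolved from the name.
model = FlagAutoModel.from_finetuned("BAAI/bge-m3")

# Custom checkpoint: the class is given explicitly; pooling_method, if left
# None, falls back to the class's DEFAULT_POOLING_METHOD.
custom = FlagAutoModel.from_finetuned(
    "./checkpoints/my-embedder",      # hypothetical path
    model_class="encoder-only-base",
)
```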

FlagEmbedding/inference/auto_reranker.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -30,7 +30,7 @@ def from_finetuned(
         if model_name.startswith("checkpoint-"):
             model_name = os.path.basename(os.path.dirname(model_name_or_path))

-        if model_class is not None and model_class != 'auto':
+        if model_class is not None:
             _model_class = RERANKER_CLASS_MAPPING[RerankerModelClass(model_class)]
             if trust_remote_code is None:
                 trust_remote_code = False
```
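The reranker side mirrors the embedder change, with `trust_remote_code` defaulting to `False` when an explicit class is given but the flag is left unset. A hedged usage sketch under the same assumptions:

```python
# Hedged sketch; the import path and checkpoint path are assumptions,
# parameter names follow the diff above.
from FlagEmbedding import FlagAutoReranker

reranker = FlagAutoReranker.from_finetuned(
    "./checkpoints/my-reranker",           # hypothetical path
    model_class="decoder-only-layerwise",
    trust_remote_code=True,                # defaults to False if left None
)
# compute_score is the standard FlagEmbedding reranker entry point.
scores = reranker.compute_score([["what is BGE?", "BGE is an embedding model."]])
```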
FlagEmbedding/inference/embedder/__init__.py

Lines changed: 2 additions & 0 deletions

```diff
@@ -1,9 +1,11 @@
 from .encoder_only import FlagModel, BGEM3FlagModel
 from .decoder_only import FlagICLModel, FlagLLMModel
+from .model_mapping import EmbedderModelClass

 __all__ = [
     "FlagModel",
     "BGEM3FlagModel",
     "FlagICLModel",
     "FlagLLMModel",
+    "EmbedderModelClass",
 ]
```
FlagEmbedding/inference/reranker/__init__.py

Lines changed: 2 additions & 0 deletions

```diff
@@ -1,9 +1,11 @@
 from .decoder_only import FlagLLMReranker, LayerWiseFlagLLMReranker, LightWeightFlagLLMReranker
 from .encoder_only import FlagReranker
+from .model_mapping import RerankerModelClass

 __all__ = [
     "FlagReranker",
     "FlagLLMReranker",
     "LayerWiseFlagLLMReranker",
     "LightWeightFlagLLMReranker",
+    "RerankerModelClass",
 ]
```
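Both subpackages now surface their `model_mapping` enums, which gives callers a by-value validation hook before dispatch. A small sketch of that pattern, assuming standard `Enum` by-value semantics:

```python
# Hedged sketch: assumes standard Enum-by-value behavior, matching the
# EmbedderModelClass(model_class) lookup in auto_embedder.py.
from FlagEmbedding.inference.embedder import EmbedderModelClass

cls = EmbedderModelClass("encoder-only-base")   # valid value -> enum member
try:
    EmbedderModelClass("no-such-class")         # invalid value
except ValueError as err:
    print(f"rejected: {err}")
```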

README.md

Lines changed: 19 additions & 19 deletions
```diff
@@ -14,7 +14,7 @@
     <a href="https://huggingface.co/C-MTEB">
         <img alt="Build" src="https://img.shields.io/badge/C_MTEB-🤗-yellow">
     </a>
-    <a href="https://github.com/hanhainebula/FlagEmbedding/tree/new-flagembedding-v1/research/baai_general_embedding">
+    <a href="https://github.com/FlagOpen/FlagEmbedding/tree/master/research/baai_general_embedding">
         <img alt="Build" src="https://img.shields.io/badge/FlagEmbedding-1.1-red">
     </a>
 </p>
@@ -24,7 +24,7 @@
     <a href=#installation>Installation</a> |
     <a href=#quick-start>Quick Start</a> |
     <a href=#community>Community</a> |
-    <a href="https://github.com/hanhainebula/FlagEmbedding/tree/new-flagembedding-v1/research">Projects</a> |
+    <a href="https://github.com/FlagOpen/FlagEmbedding/tree/master/research">Projects</a> |
     <a href=#model-list>Model List</a> |
     <a href="#contributor">Contributor</a> |
     <a href="#citation">Citation</a> |
@@ -40,12 +40,12 @@ BGE (BAAI General Embedding) focuses on retrieval-augmented LLMs, consisting of

 ![projects](./imgs/projects.png)

-- **Inference**: [Embedder](https://github.com/hanhainebula/FlagEmbedding/tree/new-flagembedding-v1/examples/inference/embedder), [Reranker](https://github.com/hanhainebula/FlagEmbedding/tree/new-flagembedding-v1/examples/inference/reranker)
-- **Finetune**: [Embedder](https://github.com/hanhainebula/FlagEmbedding/tree/new-flagembedding-v1/examples/finetune/embedder), [Reranker](https://github.com/hanhainebula/FlagEmbedding/tree/new-flagembedding-v1/examples/finetune/reranker)
-- **[Evaluation](https://github.com/hanhainebula/FlagEmbedding/tree/new-flagembedding-v1/examples/evaluation)**
-- **[Dataset](https://github.com/hanhainebula/FlagEmbedding/tree/new-flagembedding-v1/dataset)**
-- **[Tutorials](https://github.com/hanhainebula/FlagEmbedding/tree/new-flagembedding-v1/Tutorials)**
-- **[research](https://github.com/hanhainebula/FlagEmbedding/tree/new-flagembedding-v1/research)**
+- **Inference**: [Embedder](https://github.com/FlagOpen/FlagEmbedding/tree/master/examples/inference/embedder), [Reranker](https://github.com/FlagOpen/FlagEmbedding/tree/master/examples/inference/reranker)
+- **Finetune**: [Embedder](https://github.com/FlagOpen/FlagEmbedding/tree/master/examples/finetune/embedder), [Reranker](https://github.com/FlagOpen/FlagEmbedding/tree/master/examples/finetune/reranker)
+- **[Evaluation](https://github.com/FlagOpen/FlagEmbedding/tree/master/examples/evaluation)**
+- **[Dataset](https://github.com/FlagOpen/FlagEmbedding/tree/master/dataset)**
+- **[Tutorials](https://github.com/FlagOpen/FlagEmbedding/tree/master/Tutorials)**
+- **[research](https://github.com/FlagOpen/FlagEmbedding/tree/master/research)**

 ## News

@@ -65,26 +65,26 @@ BGE (BAAI General Embedding) focuses on retrieval-augmented LLMs, consisting of

 - 6/7/2024: Release a new benchmark [MLVU](https://github.com/JUNJIE99/MLVU), the first comprehensive benchmark specifically designed for long video understanding. MLVU features an extensive range of video durations, a diverse collection of video sources, and a set of evaluation tasks uniquely tailored for long-form video understanding. :fire:
 - 5/21/2024: Release a new benchmark [AIR-Bench](https://github.com/AIR-Bench/AIR-Bench) together with Jina AI, Zilliz, HuggingFace, and other partners. AIR-Bench focuses on a fair out-of-distribution evaluation for Neural IR & RAG. It generates synthetic data for benchmarking w.r.t. diverse domains and languages. It is dynamic and will be updated on a regular basis. [Leaderboard](https://huggingface.co/spaces/AIR-Bench/leaderboard) :fire:
-- 4/30/2024: Release [Llama-3-8B-Instruct-80K-QLoRA](https://huggingface.co/namespace-Pt/Llama-3-8B-Instruct-80K-QLoRA), extending the context length of Llama-3-8B-Instruct from 8K to 80K via QLoRA training on a few synthesized long-context data. The model achieves remarkable performance on various long-context benchmarks. [Code](https://github.com/hanhainebula/FlagEmbedding/tree/new-flagembedding-v1/research/Long_LLM/longllm_qlora) :fire:
-- 3/18/2024: Release new [rerankers](https://github.com/hanhainebula/FlagEmbedding/tree/new-flagembedding-v1/research/llm_reranker), built upon powerful M3 and LLM (GEMMA and MiniCPM, not so large actually :smiley:) backbones, supporting multi-lingual processing and larger inputs, massive improvements of ranking performances on BEIR, C-MTEB/Retrieval, MIRACL, LlamaIndex Evaluation :fire:
-- 3/18/2024: Release [Visualized-BGE](https://github.com/hanhainebula/FlagEmbedding/tree/new-flagembedding-v1/research/visual_bge), equipping BGE with visual capabilities. Visualized-BGE can be utilized to generate embeddings for hybrid image-text data. :fire:
+- 4/30/2024: Release [Llama-3-8B-Instruct-80K-QLoRA](https://huggingface.co/namespace-Pt/Llama-3-8B-Instruct-80K-QLoRA), extending the context length of Llama-3-8B-Instruct from 8K to 80K via QLoRA training on a few synthesized long-context data. The model achieves remarkable performance on various long-context benchmarks. [Code](https://github.com/FlagOpen/FlagEmbedding/tree/master/research/Long_LLM/longllm_qlora) :fire:
+- 3/18/2024: Release new [rerankers](https://github.com/FlagOpen/FlagEmbedding/tree/master/research/llm_reranker), built upon powerful M3 and LLM (GEMMA and MiniCPM, not so large actually :smiley:) backbones, supporting multi-lingual processing and larger inputs, massive improvements of ranking performances on BEIR, C-MTEB/Retrieval, MIRACL, LlamaIndex Evaluation :fire:
+- 3/18/2024: Release [Visualized-BGE](https://github.com/FlagOpen/FlagEmbedding/tree/master/research/visual_bge), equipping BGE with visual capabilities. Visualized-BGE can be utilized to generate embeddings for hybrid image-text data. :fire:
 - 1/30/2024: Release **BGE-M3**, a new member to BGE model series! M3 stands for **M**ulti-linguality (100+ languages), **M**ulti-granularities (input length up to 8192), **M**ulti-Functionality (unification of dense, lexical, multi-vec/colbert retrieval).
 It is the first embedding model which supports all three retrieval methods, achieving new SOTA on multi-lingual (MIRACL) and cross-lingual (MKQA) benchmarks.
-[Technical Report](https://arxiv.org/pdf/2402.03216.pdf) and [Code](https://github.com/hanhainebula/FlagEmbedding/tree/new-flagembedding-v1/research/BGE_M3). :fire:
-- 1/9/2024: Release [Activation-Beacon](https://github.com/hanhainebula/FlagEmbedding/tree/new-flagembedding-v1/research/Long_LLM/activation_beacon), an effective, efficient, compatible, and low-cost (training) method to extend the context length of LLM. [Technical Report](https://arxiv.org/abs/2401.03462)
-- 12/24/2023: Release **LLaRA**, a LLaMA-7B based dense retriever, leading to state-of-the-art performances on MS MARCO and BEIR. Model and code will be open-sourced. Please stay tuned. [Technical Report](https://arxiv.org/abs/2312.15503) and [Code](https://github.com/hanhainebula/FlagEmbedding/tree/new-flagembedding-v1/research/LLARA)
-- 11/23/2023: Release [LM-Cocktail](https://github.com/hanhainebula/FlagEmbedding/tree/new-flagembedding-v1/research/LM_Cocktail), a method to maintain general capabilities during fine-tuning by merging multiple language models. [Technical Report](https://arxiv.org/abs/2311.13534)
-- 10/12/2023: Release [LLM-Embedder](https://github.com/hanhainebula/FlagEmbedding/tree/new-flagembedding-v1/research/llm_embedder), a unified embedding model to support diverse retrieval augmentation needs for LLMs. [Technical Report](https://arxiv.org/pdf/2310.07554.pdf)
+[Technical Report](https://arxiv.org/pdf/2402.03216.pdf) and [Code](https://github.com/FlagOpen/FlagEmbedding/tree/master/research/BGE_M3). :fire:
+- 1/9/2024: Release [Activation-Beacon](https://github.com/FlagOpen/FlagEmbedding/tree/master/research/Long_LLM/activation_beacon), an effective, efficient, compatible, and low-cost (training) method to extend the context length of LLM. [Technical Report](https://arxiv.org/abs/2401.03462)
+- 12/24/2023: Release **LLaRA**, a LLaMA-7B based dense retriever, leading to state-of-the-art performances on MS MARCO and BEIR. Model and code will be open-sourced. Please stay tuned. [Technical Report](https://arxiv.org/abs/2312.15503) and [Code](https://github.com/FlagOpen/FlagEmbedding/tree/master/research/LLARA)
+- 11/23/2023: Release [LM-Cocktail](https://github.com/FlagOpen/FlagEmbedding/tree/master/research/LM_Cocktail), a method to maintain general capabilities during fine-tuning by merging multiple language models. [Technical Report](https://arxiv.org/abs/2311.13534)
+- 10/12/2023: Release [LLM-Embedder](https://github.com/FlagOpen/FlagEmbedding/tree/master/research/llm_embedder), a unified embedding model to support diverse retrieval augmentation needs for LLMs. [Technical Report](https://arxiv.org/pdf/2310.07554.pdf)
 - 09/15/2023: The [technical report](https://arxiv.org/pdf/2309.07597.pdf) of BGE has been released
 - 09/15/2023: The [massive training data](https://data.baai.ac.cn/details/BAAI-MTP) of BGE has been released
 - 09/12/2023: New models:
     - **New reranker model**: release cross-encoder models `BAAI/bge-reranker-base` and `BAAI/bge-reranker-large`, which are more powerful than embedding models. We recommend using/fine-tuning them to re-rank the top-k documents returned by embedding models.
     - **update embedding model**: release `bge-*-v1.5` embedding model to alleviate the issue of the similarity distribution, and enhance its retrieval ability without instruction.
-- 09/07/2023: Update [fine-tune code](https://github.com/hanhainebula/FlagEmbedding/tree/new-flagembedding-v1/research/baai_general_embedding): Add script to mine hard negatives and support adding instruction during fine-tuning.
+- 09/07/2023: Update [fine-tune code](https://github.com/FlagOpen/FlagEmbedding/tree/master/research/baai_general_embedding): Add script to mine hard negatives and support adding instruction during fine-tuning.
 - 08/09/2023: BGE Models are integrated into **Langchain**, you can use it like [this](#using-langchain); C-MTEB **leaderboard** is [available](https://huggingface.co/spaces/mteb/leaderboard).
 - 08/05/2023: Release base-scale and small-scale models, **best performance among the models of the same size 🤗**
 - 08/02/2023: Release `bge-large-*` (short for BAAI General Embedding) Models, **rank 1st on MTEB and C-MTEB benchmark!** :tada: :tada:
-- 08/01/2023: We release the [Chinese Massive Text Embedding Benchmark](https://github.com/hanhainebula/FlagEmbedding/tree/new-flagembedding-v1/research/C_MTEB) (**C-MTEB**), consisting of 31 test datasets.
+- 08/01/2023: We release the [Chinese Massive Text Embedding Benchmark](https://github.com/FlagOpen/FlagEmbedding/tree/master/research/C_MTEB) (**C-MTEB**), consisting of 31 test datasets.


 </details>
@@ -156,7 +156,7 @@ The following contents are releasing in the upcoming weeks:
 | [BAAI/bge-multilingual-gemma2](https://huggingface.co/BAAI/bge-multilingual-gemma2) | Multilingual | A LLM-based multilingual embedding model, trained on a diverse range of languages and tasks. | Provide instructions based on the given task. |
 | [BAAI/bge-m3](https://huggingface.co/BAAI/bge-m3) | Multilingual | Multi-Functionality (dense retrieval, sparse retrieval, multi-vector (colbert)), Multi-Linguality, and Multi-Granularity (8192 tokens) | |
 | [LM-Cocktail](https://huggingface.co/Shitao) | English | fine-tuned models (Llama and BGE) which can be used to reproduce the results of LM-Cocktail | |
-| [BAAI/llm-embedder](https://huggingface.co/BAAI/llm-embedder) | English | a unified embedding model to support diverse retrieval augmentation needs for LLMs | See [README](https://github.com/hanhainebula/FlagEmbedding/tree/new-flagembedding-v1/research/llm_embedder) |
+| [BAAI/llm-embedder](https://huggingface.co/BAAI/llm-embedder) | English | a unified embedding model to support diverse retrieval augmentation needs for LLMs | See [README](https://github.com/FlagOpen/FlagEmbedding/tree/master/research/llm_embedder) |
 | [BAAI/bge-reranker-v2-m3](https://huggingface.co/BAAI/bge-reranker-v2-m3) | Multilingual | a lightweight cross-encoder model, possesses strong multilingual capabilities, easy to deploy, with fast inference. | |
 | [BAAI/bge-reranker-v2-gemma](https://huggingface.co/BAAI/bge-reranker-v2-gemma) | Multilingual | a cross-encoder model which is suitable for multilingual contexts, performs well in both English proficiency and multilingual capabilities. | |
 | [BAAI/bge-reranker-v2-minicpm-layerwise](https://huggingface.co/BAAI/bge-reranker-v2-minicpm-layerwise) | Multilingual | a cross-encoder model which is suitable for multilingual contexts, performs well in both English and Chinese proficiency, allows freedom to select layers for output, facilitating accelerated inference. | |
```
