Skip to content

Commit 035bbf5

Browse files
authored
Merge branch 'FlagOpen:master' into master
2 parents d104abb + dd7d32b commit 035bbf5

11 files changed

Lines changed: 128 additions & 63 deletions

File tree

FlagEmbedding/abc/finetune/embedder/AbsDataset.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -416,8 +416,9 @@ def _create_batch_data(self, batch_raw_data):
416416

417417
passages.extend(tmp_passages)
418418

419-
if len(teacher_scores) > 0 and len(passages) > 0:
420-
assert len(teacher_scores) == len(passages)
419+
if teacher_scores is not None:
420+
if len(teacher_scores) > 0 and len(passages) > 0:
421+
assert len(teacher_scores) == len(passages)
421422

422423
return queries, passages, teacher_scores
423424

FlagEmbedding/finetune/reranker/decoder_only/layerwise/load_model.py

Lines changed: 15 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
from FlagEmbedding.finetune.reranker.decoder_only.layerwise.arguments import RerankerModelArguments
99

1010
from .modeling_minicpm_reranker import LayerWiseMiniCPMForCausalLM, LayerWiseHead
11+
from .configuration_minicpm_reranker import LayerWiseMiniCPMConfig
1112

1213
logger = logging.getLogger(__name__)
1314

@@ -41,7 +42,7 @@ def get_model(model_args: RerankerModelArguments, only_for_one_logit):
4142
config = AutoConfig.from_pretrained(
4243
model_args.model_name_or_path,
4344
trust_remote_code=model_args.trust_remote_code,
44-
token=model_args,
45+
token=model_args.token,
4546
cache_dir=model_args.cache_dir
4647
)
4748
else:
@@ -61,7 +62,7 @@ def get_model(model_args: RerankerModelArguments, only_for_one_logit):
6162
trust_remote_code=model_args.trust_remote_code,
6263
# torch_dtype=torch.float16 if training_args.fp16 else torch.bfloat16,
6364
use_flash_attention_2=True if model_args.use_flash_attn else False,
64-
token=model_args,
65+
token=model_args.token,
6566
cache_dir=model_args.cache_dir,
6667
from_tf=bool(".ckpt" in model_args.model_name_or_path),
6768
config=config,
@@ -115,7 +116,7 @@ def get_model(model_args: RerankerModelArguments, only_for_one_logit):
115116
model_args.model_name_or_path,
116117
# torch_dtype=torch.float16 if training_args.fp16 else torch.bfloat16,
117118
use_flash_attention_2=True if model_args.use_flash_attn else False,
118-
token=model_args,
119+
token=model_args.token,
119120
cache_dir=model_args.cache_dir,
120121
from_tf=bool(".ckpt" in model_args.model_name_or_path),
121122
config=config,
@@ -155,14 +156,14 @@ def save_merged_model(model_args: RerankerModelArguments, output_dir: str):
155156
config = AutoConfig.from_pretrained(
156157
model_args.config_name,
157158
trust_remote_code=model_args.trust_remote_code,
158-
token=model_args,
159+
token=model_args.token,
159160
cache_dir=model_args.cache_dir
160161
)
161162
elif model_args.model_name_or_path:
162163
config = AutoConfig.from_pretrained(
163164
model_args.model_name_or_path,
164165
trust_remote_code=model_args.trust_remote_code,
165-
token=model_args,
166+
token=model_args.token,
166167
cache_dir=model_args.cache_dir
167168
)
168169
else:
@@ -172,19 +173,19 @@ def save_merged_model(model_args: RerankerModelArguments, output_dir: str):
172173
config.use_cache = False
173174

174175
if model_args.model_type == 'from_raw_model':
175-
config = AutoConfig.from_pretrained('BAAI/bge-reranker-v2-minicpm-layerwise',
176-
cache_dir=model_args.cache_dir,
177-
token=model_args,
178-
trust_remote_code=model_args.trust_remote_code)
176+
config = LayerWiseMiniCPMConfig.from_pretrained('BAAI/bge-reranker-v2-minicpm-layerwise',
177+
cache_dir=model_args.cache_dir,
178+
token=model_args.token,
179+
trust_remote_code=model_args.trust_remote_code)
179180
config.start_layer = model_args.start_layer
180181
config.head_multi = model_args.head_multi
181182
config.head_type = model_args.head_type
182183

183-
model = AutoModelForCausalLM.from_pretrained(model_args.model_name_or_path,
184-
config=config,
185-
cache_dir=model_args.cache_dir,
186-
token=model_args,
187-
trust_remote_code=model_args.trust_remote_code)
184+
model = LayerWiseMiniCPMForCausalLM.from_pretrained(model_args.model_name_or_path,
185+
config=config,
186+
cache_dir=model_args.cache_dir,
187+
token=model_args.token,
188+
trust_remote_code=model_args.trust_remote_code)
188189

189190
if model_args.raw_peft is not None:
190191
for peft_path in model_args.raw_peft:

FlagEmbedding/inference/reranker/decoder_only/models/__init__.py

Whitespace-only changes.

FlagEmbedding/inference/reranker/encoder_only/base.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88

99

1010
def sigmoid(x):
11-
return 1 / (1 + np.exp(-x))
11+
return float(1 / (1 + np.exp(-x)))
1212

1313

1414
class BaseReranker(AbsReranker):

README.md

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -92,20 +92,31 @@ It is the first embedding model which supports all three retrieval methods, achi
9292

9393
## Installation
9494
### Using pip:
95+
If you do not want to finetune the models, you can install the package without the finetune dependency:
9596
```
9697
pip install -U FlagEmbedding
9798
```
99+
If you want to finetune the models, you can install the package with the finetune dependency:
100+
```
101+
pip install -U FlagEmbedding[finetune]
102+
```
98103
### Install from sources:
99104

100105
Clone the repository and install
101106
```
102107
git clone https://github.com/FlagOpen/FlagEmbedding.git
103108
cd FlagEmbedding
109+
# If you do not want to finetune the models, you can install the package without the finetune dependency:
104110
pip install .
111+
# If you want to finetune the models, you can install the package with the finetune dependency:
112+
# pip install .[finetune]
105113
```
106114
For development in editable mode:
107115
```
116+
# If you do not want to finetune the models, you can install the package without the finetune dependency:
108117
pip install -e .
118+
# If you want to finetune the models, you can install the package with the finetune dependency:
119+
# pip install -e .[finetune]
109120
```
110121

111122
## Quick Start

README_zh.md

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,8 @@
3333
<a href="#license">License</a>
3434
<p>
3535
</h4>
36-
[English](README.md) | [中文](README_zh.md)
36+
37+
[English](https://github.com/FlagOpen/FlagEmbedding/blob/master/README.md) | [中文](https://github.com/FlagOpen/FlagEmbedding/blob/master/README_zh.md)
3738

3839
BGE (BAAI General Embedding) 专注于检索增强LLM领域,目前包括以下项目:
3940

@@ -85,20 +86,31 @@ BGE (BAAI General Embedding) 专注于检索增强llm领域,目前包括以下
8586

8687
## 安装
8788
### 使用pip:
89+
如果你不想微调模型,你可以直接安装包,不用finetune依赖:
8890
```
8991
pip install -U FlagEmbedding
9092
```
93+
如果你想微调模型,你可以用finetune依赖安装:
94+
```
95+
pip install -U FlagEmbedding[finetune]
96+
```
9197
### 从源文件安装部署:
9298

9399
克隆并安装FlagEmbedding:
94100
```
95101
git clone https://github.com/FlagOpen/FlagEmbedding.git
96102
cd FlagEmbedding
103+
# 如果你不想微调模型,你可以直接安装包,不用finetune依赖:
97104
pip install .
105+
# 如果你想微调模型,你可以用finetune依赖安装:
106+
# pip install .[finetune]
98107
```
99108
在可编辑模式下安装:
100109
```
110+
# 如果你不想微调模型,你可以直接安装包,不用finetune依赖:
101111
pip install -e .
112+
# 如果你想微调模型,你可以用finetune依赖安装:
113+
# pip install -e .[finetune]
102114
```
103115

104116
## 快速开始

examples/README.md

Lines changed: 27 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,30 +1,42 @@
1-
# 1. Introduction
1+
# Examples
2+
3+
- [1. Introduction](#1-Introduction)
4+
- [2. Installation](#2-Installation)
5+
- [3. Inference](#3-Inference)
6+
- [4. Finetune](#4-Finetune)
7+
- [5. Evaluation](#5-Evaluation)
8+
9+
## 1. Introduction
210

311
In this example, we show how to run **inference** with, **finetune**, and **evaluate** the baai-general-embedding models.
412

5-
# 2. Installation
13+
## 2. Installation
614

715
* **with pip**
16+
817
```shell
918
pip install -U FlagEmbedding
1019
```
1120

1221
* **from source**
22+
1323
```shell
1424
git clone https://github.com/FlagOpen/FlagEmbedding.git
1525
cd FlagEmbedding
1626
pip install .
1727
```
28+
1829
For development, install as editable:
30+
1931
```shell
2032
pip install -e .
2133
```
2234

23-
# 3. Inference
35+
## 3. Inference
2436

2537
We have provided the inference code for two types of models: the **embedder** and the **reranker**. These can be loaded using `FlagAutoModel` and `FlagAutoReranker`, respectively. For more detailed instructions on their use, please refer to the documentation for the [embedder](https://github.com/FlagOpen/FlagEmbedding/tree/master/examples/inference/embedder) and [reranker](https://github.com/FlagOpen/FlagEmbedding/tree/master/examples/inference/reranker).
2638

27-
## 1. Embedder
39+
### 1. Embedder
2840

2941
```python
3042
from FlagEmbedding import FlagAutoModel
@@ -49,7 +61,7 @@ scores = q_embeddings @ p_embeddings.T
4961
print(scores)
5062
```
5163

52-
## 2. Reranker
64+
### 2. Reranker
5365

5466
```python
5567
from FlagEmbedding import FlagAutoReranker
@@ -65,11 +77,17 @@ scores = model.compute_score(pairs)
6577
print(scores)
6678
```
6779

68-
# 4. Finetune
80+
## 4. Finetune
6981

7082
We support fine-tuning a variety of BGE series models, including `bge-large-en-v1.5`, `bge-m3`, `bge-en-icl`, `bge-multilingual-gemma2`, `bge-reranker-v2-m3`, `bge-reranker-v2-gemma`, and `bge-reranker-v2-minicpm-layerwise`, among others. As examples, we use the basic models `bge-large-en-v1.5` and `bge-reranker-large`. For more details, please refer to the [embedder](https://github.com/FlagOpen/FlagEmbedding/tree/master/examples/finetune/embedder) and [reranker](https://github.com/FlagOpen/FlagEmbedding/tree/master/examples/finetune/reranker) sections.
7183

72-
## 1. Embedder
84+
If you do not have the `deepspeed` and `flash-attn` packages installed, you can install them with the following commands:
85+
```shell
86+
pip install deepspeed
87+
pip install flash-attn --no-build-isolation
88+
```
89+
90+
### 1. Embedder
7391

7492
```shell
7593
torchrun --nproc_per_node 2 \
@@ -104,7 +122,7 @@ torchrun --nproc_per_node 2 \
104122
--kd_loss_type kl_div
105123
```
106124

107-
## 2. Reranker
125+
### 2. Reranker
108126

109127
```shell
110128
torchrun --nproc_per_node 2 \
@@ -134,16 +152,13 @@ torchrun --nproc_per_node 2 \
134152
--save_steps 1000
135153
```
136154

137-
# 5. Evaluation
155+
## 5. Evaluation
138156

139157
We support evaluations on [MTEB](https://github.com/embeddings-benchmark/mteb), [BEIR](https://github.com/beir-cellar/beir), [MSMARCO](https://microsoft.github.io/msmarco/), [MIRACL](https://github.com/project-miracl/miracl), [MLDR](https://huggingface.co/datasets/Shitao/MLDR), [MKQA](https://github.com/apple/ml-mkqa), [AIR-Bench](https://github.com/AIR-Bench/AIR-Bench), and custom datasets. Below is an example of evaluating MSMARCO passages. For more details, please refer to the [evaluation examples](https://github.com/FlagOpen/FlagEmbedding/tree/master/examples/evaluation).
140158

141159
```shell
142160
pip install pytrec_eval
143161
pip install https://github.com/kyamagu/faiss-wheels/releases/download/v1.7.3/faiss_gpu-1.7.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
144-
```
145-
146-
```shell
147162
python -m FlagEmbedding.evaluation.msmarco \
148163
--eval_name msmarco \
149164
--dataset_dir ./data/msmarco \

0 commit comments

Comments
 (0)