
Commit a34654a

Merge branch 'FlagOpen:master' into master
2 parents 0216c33 + 65cd70d commit a34654a

142 files changed: 15030 additions & 86039 deletions


C_MTEB/MKQA/dense_retrieval/step0-generate_embedding.py

Lines changed: 0 additions & 11 deletions
@@ -5,7 +5,6 @@
 --max_passage_length 512 \
 --batch_size 256 \
 --fp16 \
---add_instruction False \
 --pooling_method cls \
 --normalize_embeddings True
 """
@@ -35,14 +34,6 @@ class ModelArgs:
         default=True,
         metadata={'help': 'Use fp16 in inference?'}
     )
-    add_instruction: bool = field(
-        default=False,
-        metadata={'help': 'Add instruction?'}
-    )
-    passage_instruction_for_retrieval: str = field(
-        default=None,
-        metadata={'help': 'passage instruction for retrieval'}
-    )
     pooling_method: str = field(
         default='cls',
         metadata={'help': "Pooling method. Avaliable methods: 'cls', 'mean'"}
@@ -78,8 +69,6 @@ def get_model(model_args: ModelArgs):
         model_args.encoder,
         pooling_method=model_args.pooling_method,
         normalize_embeddings=model_args.normalize_embeddings,
-        # query_instruction_for_retrieval=model_args.query_instruction_for_retrieval if model_args.add_instruction else None,
-        passage_instruction_for_retrieval=model_args.passage_instruction_for_retrieval if model_args.add_instruction else None,
         use_fp16=model_args.fp16
     )
     return model

C_MTEB/MLDR/dense_retrieval/step0-generate_embedding.py

Lines changed: 0 additions & 11 deletions
@@ -6,7 +6,6 @@
 --max_passage_length 8192 \
 --batch_size 4 \
 --fp16 \
---add_instruction False \
 --pooling_method cls \
 --normalize_embeddings True
 """
@@ -30,14 +29,6 @@ class ModelArgs:
         default=True,
         metadata={'help': 'Use fp16 in inference?'}
     )
-    add_instruction: bool = field(
-        default=False,
-        metadata={'help': 'Add instruction?'}
-    )
-    passage_instruction_for_retrieval: str = field(
-        default=None,
-        metadata={'help': 'passage instruction for retrieval'}
-    )
     pooling_method: str = field(
         default='cls',
         metadata={'help': "Pooling method. Avaliable methods: 'cls', 'mean'"}
@@ -78,8 +69,6 @@ def get_model(model_args: ModelArgs):
         model_args.encoder,
         pooling_method=model_args.pooling_method,
         normalize_embeddings=model_args.normalize_embeddings,
-        # query_instruction_for_retrieval=model_args.query_instruction_for_retrieval if model_args.add_instruction else None,
-        passage_instruction_for_retrieval=model_args.passage_instruction_for_retrieval if model_args.add_instruction else None,
        use_fp16=model_args.fp16
     )
     return model

C_MTEB/setup.py

Lines changed: 2 additions & 2 deletions
@@ -5,14 +5,14 @@
 
 setup(
     name='C_MTEB',
-    version='1.1.0',
+    version='1.1.1',
     description='Chinese Massive Text Embedding Benchmark',
     long_description=readme,
     long_description_content_type="text/markdown",
     author_email='2906698981@qq.com',
     url='https://github.com/FlagOpen/FlagEmbedding/tree/master/C_MTEB',
     packages=find_packages(),
     install_requires=[
-        'mteb[beir]',
+        'mteb[beir]==1.1.1',
     ],
 )

FlagEmbedding/BGE_M3/modeling.py

Lines changed: 6 additions & 3 deletions
@@ -252,7 +252,7 @@ def forward(self, query: Dict[str, Tensor] = None, passage: Dict[str, Tensor] =
 
             else:
                 idxs = torch.arange(q_dense_vecs.size(0), device=q_dense_vecs.device, dtype=torch.long)
-                targets = idxs * (p_sparse_vecs.size(0) // q_sparse_vecs.size(0))
+                targets = idxs * (p_dense_vecs.size(0) // q_dense_vecs.size(0))
 
             # dense loss
             dense_scores = self.dense_score(q_dense_vecs, p_dense_vecs) # B, B * N
@@ -325,8 +325,11 @@ def _trans_state_dict(state_dict):
 
         self.model.save_pretrained(output_dir, state_dict=_trans_state_dict(self.model.state_dict()))
 
-        torch.save(_trans_state_dict(self.colbert_linear.state_dict()), os.path.join(output_dir, 'colbert_linear.pt'))
-        torch.save(_trans_state_dict(self.sparse_linear.state_dict()), os.path.join(output_dir, 'sparse_linear.pt'))
+        if self.unified_finetuning:
+            torch.save(_trans_state_dict(self.colbert_linear.state_dict()),
+                       os.path.join(output_dir, 'colbert_linear.pt'))
+            torch.save(_trans_state_dict(self.sparse_linear.state_dict()),
+                       os.path.join(output_dir, 'sparse_linear.pt'))
 
     def load_pooler(self, model_dir):
         colbert_state_dict = torch.load(os.path.join(model_dir, 'colbert_linear.pt'), map_location='cpu')
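For the first hunk, the in-batch targets are now derived from the dense tensors, which exist in every training mode, rather than the sparse ones. A minimal standalone sketch (hypothetical shapes, not the repository's forward pass) of how the corrected line maps each query to the index of its positive passage:

```python
import torch
import torch.nn.functional as F

# Hypothetical batch: 4 queries, 4 passages per query (1 positive + 3 negatives),
# flattened into 16 passage embeddings.
q_dense_vecs = torch.randn(4, 8)
p_dense_vecs = torch.randn(16, 8)

group_size = p_dense_vecs.size(0) // q_dense_vecs.size(0)   # 4
idxs = torch.arange(q_dense_vecs.size(0), dtype=torch.long)
targets = idxs * group_size                                  # tensor([0, 4, 8, 12])

# Each query's positive sits at the start of its group, so cross-entropy over
# the B x (B * group_size) score matrix uses these strided target indices.
scores = q_dense_vecs @ p_dense_vecs.t()
loss = F.cross_entropy(scores, targets)
print(targets.tolist(), float(loss))
```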

FlagEmbedding/baai_general_embedding/finetune/hn_mine.py

Lines changed: 3 additions & 1 deletion
@@ -100,7 +100,9 @@ def find_knn_neg(model, input_file, candidate_pool, output_file, sample_range, n
     with open(output_file, 'w') as f:
         for data in train_data:
             if len(data['neg']) < negative_number:
-                data['neg'].extend(random.sample(corpus, negative_number - len(data['neg'])))
+                samples = random.sample(corpus, negative_number - len(data['neg']) + len(data['pos']))
+                samples = [sent for sent in samples if sent not in data['pos']]
+                data['neg'].extend(samples[: negative_number - len(data['neg'])])
             f.write(json.dumps(data, ensure_ascii=False) + '\n')
 
 
FlagEmbedding/baai_general_embedding/finetune/modeling.py

Lines changed: 3 additions & 0 deletions
@@ -43,6 +43,9 @@ def __init__(self,
         if not normlized:
             self.temperature = 1.0
             logger.info("reset temperature = 1.0 due to using inner product to compute similarity")
+        if normlized:
+            if self.temperature > 0.5:
+                raise ValueError("Temperature should be smaller than 1.0 when use cosine similarity (i.e., normlized=True). Recommend to set it 0.01-0.1")
 
         self.negatives_cross_device = negatives_cross_device
         if self.negatives_cross_device:

FlagEmbedding/flag_reranker.py

Lines changed: 1 addition & 0 deletions
@@ -406,6 +406,7 @@ def __init__(
         self.model = AutoModelForCausalLM.from_pretrained(model_name_or_path,
                                                            cache_dir=cache_dir,
                                                            trust_remote_code=True,
+                                                           local_files_only=True,
                                                            torch_dtype=torch.bfloat16 if use_bf16 else torch.float32)
 
         self.model_name_or_path = model_name_or_path

FlagEmbedding/llm_reranker/README.md

Lines changed: 29 additions & 2 deletions
@@ -314,7 +314,8 @@ torchrun --nproc_per_node {number of gpus} \
 --start_layer 8 \
 --head_multi True \
 --head_type simple \
---lora_extra_parameters linear_head
+--lora_extra_parameters linear_head \
+--finetune_type from_raw_model # should be one of ['from_raw_model', 'from_finetuned_model']
 ```
 
 Our rerankers are initialized from [google/gemma-2b](https://huggingface.co/google/gemma-2b) (for llm-based reranker) and [openbmb/MiniCPM-2B-dpo-bf16](https://huggingface.co/openbmb/MiniCPM-2B-dpo-bf16) (for llm-based layerwise reranker), and we train it on a mixture of multilingual datasets:
@@ -323,6 +324,33 @@ Our rerankers are initialized from [google/gemma-2b](https://huggingface.co/goog
 - [quora train data](https://huggingface.co/datasets/quora)
 - [fever train data](https://fever.ai/dataset/fever.html)
 
+### Merge Model
+
+After finetune, you need to merge the model
+
+**For llm-based reranker**
+
+```python
+from FlagEmbedding.llm_reranker.merge import merge_llm
+merge_llm('google/gemma-2b', 'lora_llm_output_path', 'merged_model_output_paths')
+```
+
+**For llm-based layerwise reranker**
+
+If you finetune the raw model (openbmb/MiniCPM-2B-dpo-bf16)
+
+```shell
+from FlagEmbedding.llm_reranker.merge import merge_layerwise_raw_llm
+merge_layerwise_raw_llm('openbmb/MiniCPM-2B-dpo-bf16', 'lora_llm_output_path', 'merged_model_output_paths')
+```
+
+If you finetune the finetuned model (BAAI/bge-reranker-v2-minicpm-layerwise)
+
+```shell
+from FlagEmbedding.llm_reranker.merge import merge_layerwise_finetuned_llm
+merge_layerwise_finetuned_llm('BAAI/bge-reranker-v2-minicpm-layerwise', 'lora_llm_output_path', 'merged_model_output_paths')
+```
+
 ## Evaluation
 
 - llama-index.
@@ -351,7 +379,6 @@ It rereank the top 100 results from bge-m3.
 ![image-20240317173117639](./evaluation/miracl-bge-m3.png)
 
 
-
 ## Citation
 
 If you find this repository useful, please consider giving a star :star: and citation

FlagEmbedding/llm_reranker/finetune_for_instruction/modeling.py

Lines changed: 2 additions & 0 deletions
@@ -85,4 +85,6 @@ def save(self, output_dir: str):
         self.model.save_pretrained(output_dir, state_dict=state_dict)
 
     def save_pretrained(self, **kwargs):
+        self.tokenizer.save_pretrained(**kwargs)
         return self.model.save_pretrained(**kwargs)
+

FlagEmbedding/llm_reranker/finetune_for_layerwise/arguments.py

Lines changed: 5 additions & 0 deletions
@@ -86,6 +86,11 @@ class ModelArguments:
         default='simple',
         metadata={"help": "the type of the classifier"}
     )
+    finetune_type: str = field(
+        default='from_raw_model' # should be one of ['from_raw_model', 'from_finetuned_model']
+        # from_raw_model -- openbmb/MiniCPM-2B-dpo-bf16
+        # from_finetuned_model -- BAAI/bge-reranker-v2-minicpm-layerwise
+    )
 
 
 @dataclass
