
Commit e1a3b6a

upload LLARA code
1 parent 5c92602 commit e1a3b6a

18 files changed

Lines changed: 1917 additions & 0 deletions

LLARA/README.md

Lines changed: 200 additions & 0 deletions
@@ -0,0 +1,200 @@
<div align="center">
<h1> Llama2Vec: Unsupervised Adaptation of Large Language Models for Dense Retrieval (LLARA) [<a href="https://arxiv.org/abs/2312.15503">paper</a>]</h1>
</div>

Llama2Vec consists of two pretext tasks:

- **EBAE** (Embedding-Based Auto-Encoding)
- **EBAR** (Embedding-Based Auto-Regression)

The LLM is prompted to **reconstruct the input sentence** and **predict the next sentence** based on its text embeddings (see the prompt sketch below).

It is known for the following features:

- simple
- lightweight
- highly effective

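Concretely, each pretext task wraps the input text in a short instruction followed by a set of special placeholder tokens, and the hidden states at those placeholder positions serve as the text embedding. Below is a minimal sketch of the two templates, mirroring the prompts used in the Usage section further down; the exact training-time templates live in the pretrain code.

```python
# Illustration only: EBAE uses placeholders <s1>-<s8>, EBAR uses <s9>-<s16>.
EBAE_PROMPT = '"{text}", summarize the above passage within eight words: <s1><s2><s3><s4><s5><s6><s7><s8>'
EBAR_PROMPT = '"{text}", predict the following passage within eight words: <s9><s10><s11><s12><s13><s14><s15><s16>'

print(EBAE_PROMPT.format(text="The llama is a domesticated South American camelid."))
```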
## Environment
```bash
conda create -n llara python=3.10

conda activate llara

# You may need to adjust the CUDA version
conda install pytorch pytorch-cuda=12.1 -c pytorch -c nvidia
pip install transformers==4.41.0 deepspeed accelerate datasets peft pandas
pip install flash-attn --no-build-isolation
```

## Model List
| Model | Introduction |
| ------------------------------------------------------------ | ------------------------------------------------------------ |
| [BAAI/LLARA-pretrain](https://huggingface.co/BAAI/LLARA-pretrain) | LLARA that has undergone unsupervised adaptation on Wikipedia |
| [BAAI/LLARA-passage](https://huggingface.co/BAAI/LLARA-passage) | The LLARA-pretrain model fine-tuned on MS MARCO passage (the hard negatives come from a dense retriever) |
| [BAAI/LLARA-document](https://huggingface.co/BAAI/LLARA-document) | The LLARA-pretrain model fine-tuned on MS MARCO document |
| [BAAI/LLARA-beir](https://huggingface.co/BAAI/LLARA-beir) | The LLARA-pretrain model fine-tuned on MS MARCO passage (the hard negatives come from BM25) |

## Usage
```python
import torch
from transformers import AutoModel, AutoTokenizer, LlamaModel

def get_query_inputs(queries, tokenizer, max_length=512):
    prefix = '"'
    suffix = '", predict the following passage within eight words: <s9><s10><s11><s12><s13><s14><s15><s16>'
    prefix_ids = tokenizer(prefix, return_tensors=None)['input_ids']
    suffix_ids = tokenizer(suffix, return_tensors=None)['input_ids'][1:]
    queries_inputs = []
    for query in queries:
        inputs = tokenizer(query,
                           return_tensors=None,
                           max_length=max_length,
                           truncation=True,
                           add_special_tokens=False)
        inputs['input_ids'] = prefix_ids + inputs['input_ids'] + suffix_ids
        inputs['attention_mask'] = [1] * len(inputs['input_ids'])
        queries_inputs.append(inputs)
    return tokenizer.pad(
        queries_inputs,
        padding=True,
        max_length=max_length,
        pad_to_multiple_of=8,
        return_tensors='pt',
    )

def get_passage_inputs(passages, tokenizer, max_length=512):
    prefix = '"'
    suffix = '", summarize the above passage within eight words: <s1><s2><s3><s4><s5><s6><s7><s8>'
    prefix_ids = tokenizer(prefix, return_tensors=None)['input_ids']
    suffix_ids = tokenizer(suffix, return_tensors=None)['input_ids'][1:]
    passages_inputs = []
    for passage in passages:
        inputs = tokenizer(passage,
                           return_tensors=None,
                           max_length=max_length,
                           truncation=True,
                           add_special_tokens=False)
        inputs['input_ids'] = prefix_ids + inputs['input_ids'] + suffix_ids
        inputs['attention_mask'] = [1] * len(inputs['input_ids'])
        passages_inputs.append(inputs)
    return tokenizer.pad(
        passages_inputs,
        padding=True,
        max_length=max_length,
        pad_to_multiple_of=8,
        return_tensors='pt',
    )

# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained('BAAI/LLARA-passage')
model = AutoModel.from_pretrained('BAAI/LLARA-passage')

# Define query and passage inputs
query = "What is llama?"
title = "Llama"
passage = "The llama is a domesticated South American camelid, widely used as a meat and pack animal by Andean cultures since the pre-Columbian era."
query_input = get_query_inputs([query], tokenizer)
passage_input = get_passage_inputs([passage], tokenizer)

with torch.no_grad():
    # compute query embedding
    query_outputs = model(**query_input, return_dict=True, output_hidden_states=True)
    query_embedding = query_outputs.hidden_states[-1][:, -8:, :]
    query_embedding = torch.mean(query_embedding, dim=1)
    query_embedding = torch.nn.functional.normalize(query_embedding, dim=-1)

    # compute passage embedding
    passage_outputs = model(**passage_input, return_dict=True, output_hidden_states=True)
    passage_embeddings = passage_outputs.hidden_states[-1][:, -8:, :]
    passage_embeddings = torch.mean(passage_embeddings, dim=1)
    passage_embeddings = torch.nn.functional.normalize(passage_embeddings, dim=-1)

    # compute similarity score
    score = query_embedding @ passage_embeddings.T
    print(score)
```

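Note that the embedding is read from the last-layer hidden states of the final eight positions, i.e. the `<s*>` placeholder tokens appended by the prompt; these are mean-pooled and L2-normalized, so the printed score is simply the inner product of two unit vectors.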
## Unsupervised Adaptation (pretrain)
1. You can get the complete data here: [cfli/pretrain_wiki](https://huggingface.co/datasets/cfli/pretrain_wiki)
2. Here is an example of pretraining:
```shell
cd ./pretrain
torchrun --nproc_per_node 8 \
run.py \
--output_dir ./output \
--model_name_or_path meta-llama/Llama-2-7b-hf \
--train_data ../data/pretrain/toy_pretrain_data.jsonl \
--learning_rate 1e-5 \
--num_train_epochs 1 \
--per_device_train_batch_size 1 \
--gradient_accumulation_steps 1 \
--dataloader_drop_last True \
--cutoff_len 128 \
--logging_steps 1 \
--save_steps 500 \
--save_total_limit 20 \
--gradient_checkpointing \
--ddp_find_unused_parameters False \
--use_flash_attn False \
--deepspeed ../stage1.json \
--warmup_ratio 0.1 \
--remove_stop_words True \
--use_lora False \
--bf16 \
--cache_dir ./LMs \
--token ...
```
If you want to pretrain on the complete data, please use the hyper-parameters from our paper.

## Fine-tune

Here is an example of fine-tuning:
```shell
cd ./finetune
torchrun --nproc_per_node 8 \
run.py \
--output_dir ./output \
--model_name_or_path BAAI/LLARA-pretrain \
--train_data ../data/finetune/toy_finetune_data.jsonl \
--learning_rate 3e-4 \
--num_train_epochs 1 \
--per_device_train_batch_size 1 \
--dataloader_drop_last True \
--normlized True \
--temperature 0.01 \
--query_max_len 64 \
--passage_max_len 160 \
--train_group_size 16 \
--logging_steps 10 \
--save_steps 500 \
--save_total_limit 3 \
--ddp_find_unused_parameters False \
--negatives_cross_device \
--gradient_checkpointing \
--deepspeed ../stage1.json \
--warmup_ratio 0.1 \
--fp16 \
--cache_dir ./LMs \
--token ...
```

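The flags above map onto the fields defined in `finetune/arguments.py` (included in this commit), e.g. `--query_max_len`, `--passage_max_len`, `--train_group_size`, `--temperature`, `--normlized`, and `--negatives_cross_device`.
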
## Citation

If you find this repository useful, please give us a star ⭐.

To cite our work:

```
@misc{li2023makinglargelanguagemodels,
      title={Making Large Language Models A Better Foundation For Dense Retrieval},
      author={Chaofan Li and Zheng Liu and Shitao Xiao and Yingxia Shao},
      year={2023},
      eprint={2312.15503},
      archivePrefix={arXiv},
      primaryClass={cs.CL},
      url={https://arxiv.org/abs/2312.15503},
}
```

LLARA/data/finetune/toy_finetune_data.jsonl

Lines changed: 11 additions & 0 deletions
Large diffs are not rendered by default.

LLARA/data/pretrain/toy_pretrain_data.jsonl

Lines changed: 11 additions & 0 deletions
Large diffs are not rendered by default.

LLARA/finetune/__init__.py

Whitespace-only changes.

LLARA/finetune/arguments.py

Lines changed: 165 additions & 0 deletions
@@ -0,0 +1,165 @@
import os
from dataclasses import dataclass, field
from typing import Optional, List

from transformers import TrainingArguments


def default_list() -> List[str]:
    return ['v_proj', 'q_proj', 'k_proj', 'gate_proj', 'down_proj', 'o_proj', 'up_proj']


@dataclass
class ModelArguments:
    """
    Arguments pertaining to which model/config/tokenizer we are going to fine-tune from.
    """

    model_name_or_path: str = field(
        metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"}
    )

    peft_model_path: str = field(
        default=''
    )
    config_name: Optional[str] = field(
        default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"}
    )
    tokenizer_name: Optional[str] = field(
        default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}
    )
    # cache_dir: Optional[str] = field(
    #     default=None, metadata={"help": "Where do you want to store the pretrained models downloaded from s3"}
    # )
    use_lora: bool = field(
        default=True,
        metadata={"help": "If passed, will use LoRA (low-rank parameter-efficient training) to train the model."}
    )
    lora_rank: int = field(
        default=64,
        metadata={"help": "The rank of LoRA."}
    )
    lora_alpha: float = field(
        default=16,
        metadata={"help": "The alpha parameter of LoRA."}
    )
    lora_dropout: float = field(
        default=0.1,
        metadata={"help": "The dropout rate of LoRA modules."}
    )
    target_modules: List[str] = field(
        default_factory=default_list
    )
    save_merged_lora_model: bool = field(
        default=False,
        metadata={"help": "If passed, will merge the LoRA modules and save the entire model."}
    )
    use_flash_attn: bool = field(
        default=True,
        metadata={"help": "If passed, will use flash attention to train the model."}
    )
    use_slow_tokenizer: bool = field(
        default=False,
        metadata={"help": "If passed, will use a slow tokenizer (not backed by the 🤗 Tokenizers library)."}
    )
    low_cpu_mem_usage: bool = field(
        default=False,
        metadata={"help": "Create the model as an empty shell and only materialize its parameters "
                          "when the pretrained weights are loaded. "
                          "If passed, LLM loading time and RAM consumption will benefit."}
    )
    token: str = field(
        default=""
    )
    cache_dir: str = field(
        default="./LMs"
    )
    from_peft: Optional[str] = field(
        default=None
    )


@dataclass
class DataArguments:
    train_data: str = field(
        default='./toy_finetune_data.jsonl', metadata={"help": "Path to train data"}
    )
    train_group_size: int = field(default=8)

    query_max_len: int = field(
        default=32,
        metadata={
            "help": "The maximum total input sequence length after tokenization for the query. Sequences longer "
                    "than this will be truncated, sequences shorter will be padded."
        },
    )

    passage_max_len: int = field(
        default=128,
        metadata={
            "help": "The maximum total input sequence length after tokenization for the passage. Sequences longer "
                    "than this will be truncated, sequences shorter will be padded."
        },
    )

    max_example_num_per_dataset: int = field(
        default=100000000, metadata={"help": "the max number of examples for each dataset"}
    )

    query_instruction_for_retrieval: str = field(
        default="query: ", metadata={"help": "instruction prepended to the query, e.g. 'query: '"}
    )
    passage_instruction_for_retrieval: str = field(
        default="passage: ", metadata={"help": "instruction prepended to the passage, e.g. 'passage: '"}
    )

    cache_path: str = field(
        default='./data_dir'
    )

    load_from_disk: bool = field(
        default=False, metadata={"help": "whether to load the data from disk"}
    )

    load_disk_path: Optional[str] = field(
        default=None, metadata={"help": "the path to load the data from", "nargs": "+"}
    )

    save_to_disk: bool = field(
        default=False, metadata={"help": "whether to save the data to disk"}
    )

    save_disk_path: Optional[str] = field(
        default=None, metadata={"help": "the path to save the data to"}
    )

    num_shards: int = field(
        default=0, metadata={
            "help": "number of shards to write; takes priority over `save_max_shard_size`; by default it is determined by `save_max_shard_size`"}
    )

    save_max_shard_size: str = field(
        default="50GB", metadata={"help": "the max size of each shard"}
    )

    exit_after_save: bool = field(
        default=False, metadata={"help": "whether to exit after saving the data"}
    )

    shuffle_ratio: float = field(
        default=0.0, metadata={"help": "The ratio of shuffling the text"}
    )

    def __post_init__(self):
        if not os.path.exists(self.train_data):
            raise FileNotFoundError(f"cannot find file: {self.train_data}, please set a correct path")

@dataclass
class RetrieverTrainingArguments(TrainingArguments):
    negatives_cross_device: bool = field(default=False, metadata={"help": "share negatives across devices"})
    temperature: Optional[float] = field(default=0.02)
    fix_position_embedding: bool = field(default=False, metadata={"help": "Freeze the parameters of position embeddings"})
    sentence_pooling_method: str = field(default='cls', metadata={"help": "the pooling method, should be cls or mean"})
    normlized: bool = field(default=True)
    sub_batch_size: Optional[int] = field(default=None)
    cache_chunk_size: int = field(default=-1, metadata={"help": "chunk size used to cache the execution of each step"})
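
These dataclasses are filled from the command-line flags shown in the README. A minimal sketch of how they could be parsed with `transformers.HfArgumentParser` follows; the actual entry point is `finetune/run.py`, which is not shown in this diff, so treat this as an illustration rather than the repo's code.

```python
# Hypothetical illustration: parse the three dataclasses above from CLI flags.
# Flags such as --model_name_or_path, --train_group_size, --temperature and
# --normlized from the README map directly onto these fields.
from transformers import HfArgumentParser

from arguments import ModelArguments, DataArguments, RetrieverTrainingArguments

parser = HfArgumentParser((ModelArguments, DataArguments, RetrieverTrainingArguments))
model_args, data_args, training_args = parser.parse_args_into_dataclasses()

print(model_args.model_name_or_path, data_args.train_group_size, training_args.temperature)
```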
