from typing import List, Union

import numpy as np
import torch
import torch.nn.functional as F
from torch import Tensor
from tqdm import tqdm
from transformers import AutoModel, AutoTokenizer, AutoModelForSequenceClassification, is_torch_npu_available
11+
def last_token_pool(last_hidden_states: Tensor,
                    attention_mask: Tensor) -> Tensor:
    """Pool each sequence down to the hidden state of its final real token.

    If every row's mask ends in 1 the batch is left-padded, so the last
    position already holds the final token for all rows and that column is
    returned directly. Otherwise (right padding) the index of the last
    non-padding token is derived per row from the attention mask.
    """
    n_rows = attention_mask.shape[0]
    if attention_mask[:, -1].sum() == n_rows:  # left-padded batch
        return last_hidden_states[:, -1]
    last_idx = attention_mask.sum(dim=1) - 1
    rows = torch.arange(n_rows, device=last_hidden_states.device)
    return last_hidden_states[rows, last_idx]
21+
22+
def get_detailed_instruct(task_description: str, query: str) -> str:
    """Wrap a query with its task instruction in the model's ICL template."""
    return '<instruct>{}\n<query>{}'.format(task_description, query)
25+
def get_detailed_example(task_description: str, query: str, response: str) -> str:
    """Render one in-context example (instruction, query, response) in the ICL template."""
    parts = ('<instruct>' + task_description,
             '<query>' + query,
             '<response>' + response)
    return '\n'.join(parts)
28+
29+
class FlagICLModel:
    """Embedding model that prepends in-context-learning (ICL) examples to
    queries before encoding, pooling the last non-padding token as the
    sentence embedding.

    Queries are wrapped as ``<instruct>...<query>...`` with an optional
    few-shot example prefix and a ``<response>`` suffix; corpus passages are
    encoded as plain text. Fix vs. previous revision: `__init__` contained a
    duplicated device-selection block whose first copy called ``.half()``
    unconditionally (even with ``use_fp16=False`` / on CPU) and moved the
    model twice; only the complete (CUDA/MPS/NPU/CPU-aware) copy is kept.
    """

    def __init__(
            self,
            model_name_or_path: str = None,
            normalize_embeddings: bool = True,
            query_instruction_for_retrieval: str = 'Given a query, retrieval relevant passage that answer the query.',
            examples_for_task: List[dict] = None,
            use_fp16: bool = True
    ) -> None:
        """Load tokenizer/model and move the model to the best available device.

        :param model_name_or_path: HF hub id or local path of the model.
        :param normalize_embeddings: L2-normalize output embeddings if True.
        :param query_instruction_for_retrieval: instruction prepended to queries.
        :param examples_for_task: optional few-shot examples, each a dict with
            (optional) keys ``instruct``, ``query``, ``response``.
        :param use_fp16: run the model in half precision (disabled on CPU).
        """
        self.tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
        self.model = AutoModel.from_pretrained(model_name_or_path)
        self.query_instruction_for_retrieval = query_instruction_for_retrieval
        self.examples_for_task = examples_for_task

        self.set_examples()
        self.suffix = '\n<response>'

        self.normalize_embeddings = normalize_embeddings

        # Single device-selection pass; fp16 is only meaningful on accelerators.
        if torch.cuda.is_available():
            self.device = torch.device("cuda")
        elif torch.backends.mps.is_available():
            self.device = torch.device("mps")
        elif is_torch_npu_available():
            self.device = torch.device("npu")
        else:
            self.device = torch.device("cpu")
            use_fp16 = False
        if use_fp16:
            self.model.half()
        self.model = self.model.to(self.device)

        self.num_gpus = torch.cuda.device_count()
        if self.num_gpus > 1:
            print(f"----------using {self.num_gpus}*GPUs----------")
            self.model = torch.nn.DataParallel(self.model)

    def set_examples(self, examples_for_task: List[dict] = None):
        """Build ``self.prefix`` (the few-shot block) from the given examples.

        Falls back to the examples passed at construction time when called
        with no argument; with no examples at all the prefix is empty.
        Missing ``instruct`` keys default to the retrieval instruction.
        """
        if examples_for_task is None:
            examples_for_task = self.examples_for_task
        if examples_for_task is None:
            self.prefix = ''
            return
        eg_pairs = [
            get_detailed_example(
                example.get('instruct', self.query_instruction_for_retrieval),
                example.get('query', ''),
                example.get('response', '')
            )
            for example in examples_for_task
        ]
        self.prefix = '\n\n'.join(eg_pairs) + '\n\n'

    @torch.no_grad()
    def encode_queries(self, queries: Union[List[str], str],
                       batch_size: int = 256,
                       max_length: int = 512) -> np.ndarray:
        '''
        Encode queries for retrieval.

        Each query is wrapped with the retrieval instruction, truncated so
        that prefix + query + suffix fits the token budget, surrounded by the
        ICL prefix/suffix, then embedded via last-token pooling.

        :param queries: a single query string or a list of queries.
        :param batch_size: number of sentences encoded per forward pass.
        :param max_length: token budget for the bare (instructed) query.
        :return: float32 array of shape (n_queries, hidden_size).
        '''
        self.model.eval()

        if isinstance(queries, str):
            sentences = [get_detailed_instruct(self.query_instruction_for_retrieval, queries)]
        else:
            sentences = [get_detailed_instruct(self.query_instruction_for_retrieval, q) for q in queries]

        prefix_ids = self.tokenizer(self.prefix, add_special_tokens=False)['input_ids']
        suffix_ids = self.tokenizer(self.suffix, add_special_tokens=False)['input_ids']
        # Loop-invariant: tokens reserved for BOS and the response/EOS tail.
        reserved = (len(self.tokenizer('<s>', add_special_tokens=False)['input_ids'])
                    + len(self.tokenizer('\n<response></s>', add_special_tokens=False)['input_ids']))

        all_embeddings = []
        # Sort by descending length so each batch pads to a similar length.
        length_sorted_idx = np.argsort([-self._text_length(sen) for sen in sentences])
        sentences_sorted = [sentences[idx] for idx in length_sorted_idx]

        for start_index in tqdm(range(0, len(sentences_sorted), batch_size), desc="Inference Embeddings",
                                disable=len(sentences_sorted) < 256):
            sentences_batch = sentences_sorted[start_index:start_index + batch_size]
            # First pass: truncate the bare query to leave room for special tokens.
            inputs = self.tokenizer(
                sentences_batch,
                max_length=max_length - reserved,
                return_token_type_ids=False,
                truncation=True,
                return_tensors=None,
                add_special_tokens=False
            )
            # Round the full-sequence budget up to a multiple of 8.
            new_max_length = (len(prefix_ids) + len(suffix_ids) + max_length) // 8 * 8 + 8
            # Re-assemble text with the few-shot prefix and response suffix.
            sentences_batch = self.tokenizer.batch_decode(inputs['input_ids'])
            sentences_batch = [self.prefix + sent + self.suffix for sent in sentences_batch]
            inputs = self.tokenizer(
                sentences_batch,
                padding=True,
                truncation=True,
                return_tensors='pt',
                max_length=new_max_length,
                add_special_tokens=True
            ).to(self.device)

            outputs = self.model(**inputs, return_dict=True)
            embeddings = last_token_pool(outputs.last_hidden_state, inputs['attention_mask'])

            if self.normalize_embeddings:
                embeddings = F.normalize(embeddings, p=2, dim=1)
            all_embeddings.extend(embeddings.float().cpu())

        # Undo the length sort to restore the caller's ordering.
        all_embeddings = [all_embeddings[idx] for idx in np.argsort(length_sorted_idx)]
        return np.asarray([emb.numpy() for emb in all_embeddings])

    @torch.no_grad()
    def encode_corpus(self,
                      corpus: Union[List[str], str],
                      batch_size: int = 256,
                      max_length: int = 512) -> np.ndarray:
        '''
        Encode corpus passages for retrieval.

        Passages are embedded as-is (no instruction, prefix, or suffix) with
        last-token pooling.

        :param corpus: a single passage or a list of passages.
        :param batch_size: number of passages encoded per forward pass.
        :param max_length: maximum token length per passage.
        :return: float32 array of shape (n_passages, hidden_size).
        '''
        self.model.eval()

        if isinstance(corpus, str):
            sentences = [corpus]
        else:
            sentences = corpus

        all_embeddings = []
        # Sort by descending length so each batch pads to a similar length.
        length_sorted_idx = np.argsort([-self._text_length(sen) for sen in sentences])
        sentences_sorted = [sentences[idx] for idx in length_sorted_idx]

        for start_index in tqdm(range(0, len(sentences_sorted), batch_size), desc="Inference Embeddings",
                                disable=len(sentences_sorted) < 256):
            sentences_batch = sentences_sorted[start_index:start_index + batch_size]
            inputs = self.tokenizer(
                sentences_batch,
                padding=True,
                truncation=True,
                return_tensors='pt',
                max_length=max_length,
                add_special_tokens=True
            ).to(self.device)
            outputs = self.model(**inputs, return_dict=True)
            embeddings = last_token_pool(outputs.last_hidden_state, inputs['attention_mask'])

            if self.normalize_embeddings:
                embeddings = F.normalize(embeddings, p=2, dim=1)
            all_embeddings.extend(embeddings.float().cpu())

        # Undo the length sort to restore the caller's ordering.
        all_embeddings = [all_embeddings[idx] for idx in np.argsort(length_sorted_idx)]
        return np.asarray([emb.numpy() for emb in all_embeddings])

    def _text_length(self, text: Union[List[int], List[List[int]]]):
        """
        Helper to get the length of the input text. Text can be a string, a
        list of ints (a single tokenized text), a list of strings/lists, or a
        dict of features; objects without ``__len__`` count as length 1.
        """
        if isinstance(text, dict):  # {key: value} case
            return len(next(iter(text.values())))
        elif not hasattr(text, '__len__'):  # Object has no len() method
            return 1
        elif len(text) == 0 or isinstance(text[0], int):  # Empty string or list of ints
            return len(text)
        else:
            return sum([len(t) for t in text])  # Sum of length of individual strings
216+
8217
9218class FlagModel :
10219 def __init__ (
@@ -185,7 +394,7 @@ def encode_queries(self, queries: Union[List[str], str],
185394 max_length : int = 256 ,
186395 task : str = 'qa' ) -> np .ndarray :
187396 '''
188- Encode queries into dense vectors.
397+ Encode queries into dense vectors.
189398 Automatically add instructions according to given task.
190399 '''
191400 instruction = self .instructions [task ]["query" ]
@@ -202,7 +411,7 @@ def encode_keys(self, keys: Union[List[str], str],
202411 max_length : int = 512 ,
203412 task : str = 'qa' ) -> np .ndarray :
204413 '''
205- Encode keys into dense vectors.
414+ Encode keys into dense vectors.
206415 Automatically add instructions according to given task.
207416 '''
208417 instruction = self .instructions [task ]["key" ]
0 commit comments