
Commit 5e0a2d7

upload embedder and reranker

1 parent f3a2472 commit 5e0a2d7

1 file changed

Lines changed: 169 additions & 5 deletions

File tree

FlagEmbedding/flag_reranker.py

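In brief, the diff below does three things: it removes a stray blank line above LayerWiseFlagLLMReranker, replaces a commented-out early-return block at the end of LayerWiseFlagLLMReranker.compute_score with a plain return of all_scores, and adds a _text_length helper plus a new LightWeightFlagLLMReranker class that scores query-passage pairs with a causal LM and supports layer cutoff (cutoff_layers) and token compression (compress_layer, compress_ratio).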
@@ -389,7 +389,6 @@ def _text_length(self, text: Union[List[int], List[List[int]]]):
         else:
             return sum([len(t) for t in text])  # Sum of length of individual strings
 
-
 class LayerWiseFlagLLMReranker:
     def __init__(
         self,
@@ -561,10 +560,175 @@ def compute_score(self, sentence_pairs: Union[List[Tuple[str, str]], Tuple[str, str]],
             if normalize:
                 all_scores[i] = [sigmoid(score) for score in all_scores[i]]
 
-        # if len(all_scores) == 1:
-        #     if len(all_scores[0]) == 1:
-        #         return all_scores[0][0]
-        #     return all_scores[0]
+        return all_scores
+
+
+    def _text_length(self, text: Union[List[int], List[List[int]]]):
+        """
+        Helper function to get the length of the input text. Text can be either a
+        list of ints (a single tokenized text) or a list of lists of ints
+        (several tokenized texts).
+        """
+
+        if isinstance(text, dict):  # {key: value} case
+            return len(next(iter(text.values())))
+        elif not hasattr(text, '__len__'):  # Object has no len() method
+            return 1
+        elif len(text) == 0 or isinstance(text[0], int):  # Empty list or list of ints
+            return len(text)
+        else:
+            return sum([len(t) for t in text])  # Sum of lengths of individual texts
+
+
+class LightWeightFlagLLMReranker:
+    def __init__(
+        self,
+        model_name_or_path: str = None,
+        peft_path: str = None,
+        use_fp16: bool = False,
+        use_bf16: bool = False,
+        cache_dir: str = None,
+        device: Union[str, int] = None
+    ) -> None:
+        self.tokenizer = AutoTokenizer.from_pretrained(model_name_or_path,
+                                                       cache_dir=cache_dir,
+                                                       trust_remote_code=True)
+
+        if use_bf16 is False and use_fp16 is False:
+            warnings.warn("Due to model constraints, `use_bf16` and `use_fp16` cannot both be `False`; defaulting `use_fp16` to `True`.", UserWarning)
+            use_fp16 = True
+
+        self.model = AutoModelForCausalLM.from_pretrained(model_name_or_path,
+                                                          cache_dir=cache_dir,
+                                                          trust_remote_code=True,
+                                                          local_files_only=True,
+                                                          torch_dtype=torch.bfloat16 if use_bf16 else torch.float32)
+        if peft_path:
+            self.model = PeftModel.from_pretrained(self.model, peft_path)
+            self.model = self.model.merge_and_unload()
+        self.model_name_or_path = model_name_or_path
+        self.cache_dir = cache_dir
+
+        if device and isinstance(device, str):
+            if device == 'cpu':
+                warnings.warn('The LLM-based lightweight reranker does not support CPU; it has been set to CUDA.')
+                device = 'cuda'
+            self.device = torch.device(device)
+        else:
+            device = 0 if device is None else device
+            if torch.cuda.is_available():
+                torch.cuda.set_device(device)
+                self.device = torch.device("cuda")
+            elif torch.backends.mps.is_available():
+                self.device = torch.device("mps")
+            elif is_torch_npu_available():
+                self.device = torch.device("npu")
+            else:
+                self.device = torch.device("cpu")
+                use_fp16 = False
+
+        if use_fp16 and use_bf16 is False:
+            self.model.half()
+
+        self.model = self.model.to(self.device)
+
+        self.model.eval()
+
+        self.yes_loc = self.tokenizer('Yes', add_special_tokens=False)['input_ids'][0]
+
+    @torch.no_grad()
+    def compute_score(self, sentence_pairs: Union[List[Tuple[str, str]], Tuple[str, str]], batch_size: int = 16,
+                      max_length: int = 512,
+                      cutoff_layers: List[int] = None, compress_layer: List[int] = [8], compress_ratio: int = 1,
+                      prompt: str = None, normalize: bool = False) -> Union[float, List[float], List[List[float]]]:
+        assert isinstance(sentence_pairs, list)
+        if isinstance(sentence_pairs[0], str):
+            sentence_pairs = [sentence_pairs]
+
+        length_sorted_idx = np.argsort([-self._text_length(q) - self._text_length(p) for q, p in sentence_pairs])
+        sentences_sorted = [sentence_pairs[idx] for idx in length_sorted_idx]
+
+        if prompt is None:
+            prompt = "Predict whether passage B contains an answer to query A."
+        prompt_inputs = self.tokenizer(prompt,
+                                       return_tensors=None,
+                                       add_special_tokens=False)['input_ids']
+        sep = "\n"
+        sep_inputs = self.tokenizer(sep,
+                                    return_tensors=None,
+                                    add_special_tokens=False)['input_ids']
+        encode_max_length = max_length + len(sep_inputs) + len(prompt_inputs)
+        all_scores = []
+        for batch_start in trange(0, len(sentences_sorted), batch_size):
+            batch_sentences = sentences_sorted[batch_start:batch_start + batch_size]
+            batch_sentences = [(f'A: {q}', f'B: {p}') for q, p in batch_sentences]
+            queries = [s[0] for s in batch_sentences]
+            passages = [s[1] for s in batch_sentences]
+            queries_inputs = self.tokenizer(queries,
+                                            return_tensors=None,
+                                            add_special_tokens=False,
+                                            max_length=max_length * 3 // 4,
+                                            truncation=True)
+            passages_inputs = self.tokenizer(passages,
+                                             return_tensors=None,
+                                             add_special_tokens=False,
+                                             max_length=max_length,
+                                             truncation=True)
+            query_lengths = []
+            prompt_lengths = []
+            batch_inputs = []
+            for query_inputs, passage_inputs in zip(queries_inputs['input_ids'], passages_inputs['input_ids']):
+                item = self.tokenizer.prepare_for_model(
+                    [self.tokenizer.bos_token_id] + query_inputs,
+                    sep_inputs + passage_inputs,
+                    truncation='only_second',
+                    max_length=encode_max_length,
+                    padding=False,
+                    return_attention_mask=False,
+                    return_token_type_ids=False,
+                    add_special_tokens=False
+                )
+                item['input_ids'] = item['input_ids'] + sep_inputs + prompt_inputs
+                item['attention_mask'] = [1] * len(item['input_ids'])
+                item.pop('token_type_ids', None)
+                if 'position_ids' in item.keys():
+                    item['position_ids'] = list(range(len(item['input_ids'])))
+                batch_inputs.append(item)
+                query_lengths.append(len([self.tokenizer.bos_token_id] + query_inputs + sep_inputs))
+                prompt_lengths.append(len(sep_inputs + prompt_inputs))
+
+            collater_instance = collater(self.tokenizer, max_length)
+            batch_inputs = collater_instance(
+                [
+                    [{'input_ids': item['input_ids'], 'attention_mask': item['attention_mask']} for item in
+                     batch_inputs],
+                    query_lengths,
+                    prompt_lengths
+                ])[0]
+
+            batch_inputs = {key: val.to(self.device) for key, val in batch_inputs.items()}
+
+            outputs = self.model(**batch_inputs,
+                                 output_hidden_states=True,
+                                 compress_layer=compress_layer,
+                                 compress_ratio=compress_ratio,
+                                 query_lengths=query_lengths,
+                                 prompt_lengths=prompt_lengths,
+                                 cutoff_layers=cutoff_layers)
+            scores = []
+            for i in range(len(outputs.logits)):
+                logits = last_logit_pool(outputs.logits[i], outputs.attention_masks[i])
+                scores.append(logits.cpu().float().tolist())
+            if len(all_scores) == 0:
+                for i in range(len(scores)):
+                    all_scores.append([])
+            for i in range(len(scores)):
+                all_scores[i].extend(scores[i])
+
+        for i in range(len(all_scores)):
+            all_scores[i] = [all_scores[i][idx] for idx in np.argsort(length_sorted_idx)]
+            if normalize:
+                all_scores[i] = [sigmoid(score) for score in all_scores[i]]
 
         return all_scores
 
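For orientation, a minimal usage sketch of the new class follows. The import path comes from the changed file; the checkpoint name and the cutoff layer value are illustrative assumptions, not taken from this commit, and the remaining keyword arguments mirror the compute_score signature in the diff.

# Hypothetical usage of the class added by this commit.
from FlagEmbedding.flag_reranker import LightWeightFlagLLMReranker

reranker = LightWeightFlagLLMReranker(
    'BAAI/bge-reranker-v2.5-gemma2-lightweight',  # assumed checkpoint name
    use_bf16=True,
)
scores = reranker.compute_score(
    [('what is a panda?', 'The giant panda is a bear species endemic to China.'),
     ('what is a panda?', 'Paris is the capital of France.')],
    cutoff_layers=[28],  # assumed valid layer index for the chosen model
    compress_layer=[8],  # diff default
    compress_ratio=1,    # diff default
    normalize=True,      # map raw logits through sigmoid
)
print(scores)  # one list of per-pair scores per requested cutoff layer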
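One detail of compute_score worth calling out: pairs are sorted by descending total length before batching (so similarly sized inputs are padded together), and the original order is restored with np.argsort(length_sorted_idx), using the fact that the argsort of a permutation is its inverse permutation. A self-contained sketch of that idiom:

import numpy as np

# Sort by descending length, then undo the sort with argsort of the permutation.
lengths = [5, 2, 9, 1]
order = np.argsort([-l for l in lengths])     # longest first: [2, 0, 1, 3]
sorted_vals = [lengths[i] for i in order]     # [9, 5, 2, 1]
inverse = np.argsort(order)                   # inverse permutation: [1, 2, 0, 3]
restored = [sorted_vals[i] for i in inverse]  # back to [5, 2, 9, 1]
assert restored == lengths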