Skip to content

Commit 60ddaf0

Browse files
committed
Merge branch 'new-flagembedding-v1' of https://github.com/hanhainebula/FlagEmbedding into new-flagembedding-v1
2 parents 071caeb + a7bbf09 commit 60ddaf0

6 files changed

Lines changed: 131 additions & 9 deletions

File tree

FlagEmbedding/abc/inference/AbsEmbedder.py

Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,17 @@ def __init__(
8383

8484
@staticmethod
8585
def get_target_devices(devices: Union[str, int, List[str], List[int]]) -> List[str]:
86+
"""
87+
88+
Args:
89+
devices (Union[str, int, List[str], List[int]]): specified devices, can be `str`, `int`, list of `str`, or list of `int`.
90+
91+
Raises:
92+
ValueError: Devices should be a string or an integer or a list of strings or a list of integers.
93+
94+
Returns:
95+
List[str]: A list of target devices in string format.
96+
"""
8697
if devices is None:
8798
if torch.cuda.is_available():
8899
return [f"cuda:{i}" for i in range(torch.cuda.device_count())]
@@ -108,6 +119,16 @@ def get_target_devices(devices: Union[str, int, List[str], List[int]]) -> List[s
108119

109120
@staticmethod
110121
def get_detailed_instruct(instruction_format: str, instruction: str, sentence: str):
122+
"""Combine the instruction and sentence along with the instruction format.
123+
124+
Args:
125+
instruction_format (str): Format for instruction.
126+
instruction (str): The text of instruction.
127+
sentence (str): The sentence to concatenate with.
128+
129+
Returns:
130+
str: The complete sentence with instruction.
131+
"""
111132
return instruction_format.format(instruction, sentence)
112133

113134
def encode_queries(
@@ -118,6 +139,18 @@ def encode_queries(
118139
convert_to_numpy: Optional[bool] = None,
119140
**kwargs: Any
120141
):
142+
"""encode the queries using the instruction if provided.
143+
144+
Args:
145+
queries (Union[List[str], str]): Input queries to encode.
146+
batch_size (Optional[int], optional): Number of sentences for each iter. Defaults to None.
147+
max_length (Optional[int], optional): Maximum length of tokens. Defaults to None.
148+
convert_to_numpy (Optional[bool], optional): If True, the output embedding will be a Numpy array. Otherwise, it will
149+
be a Torch Tensor. Defaults to None.
150+
151+
Returns:
152+
Union[torch.Tensor, np.ndarray]: Return the embedding vectors in a numpy array or tensor.
153+
"""
121154
if batch_size is None: batch_size = self.batch_size
122155
if max_length is None: max_length = self.query_max_length
123156
if convert_to_numpy is None: convert_to_numpy = self.convert_to_numpy
@@ -140,6 +173,18 @@ def encode_corpus(
140173
convert_to_numpy: Optional[bool] = None,
141174
**kwargs: Any
142175
):
176+
"""encode the corpus using the instruction if provided.
177+
178+
Args:
179+
corpus (Union[List[str], str]): Input corpus to encode.
180+
batch_size (Optional[int], optional): Number of sentences for each iter. Defaults to None.
181+
max_length (Optional[int], optional): Maximum length of tokens. Defaults to None.
182+
convert_to_numpy (Optional[bool], optional): If True, the output embedding will be a Numpy array. Otherwise, it will
183+
be a Torch Tensor. Defaults to None.
184+
185+
Returns:
186+
Union[torch.Tensor, np.ndarray]: Return the embedding vectors in a numpy array or tensor.
187+
"""
143188
passage_instruction_for_retrieval = self.kwargs.get("passage_instruction_for_retrieval", None)
144189
passage_instruction_format = self.kwargs.get("passage_instruction_format", "{}{}")
145190

@@ -167,6 +212,20 @@ def encode(
167212
instruction_format: Optional[str] = None,
168213
**kwargs: Any
169214
):
215+
"""encode the input sentences with the embedding model.
216+
217+
Args:
218+
sentences (Union[List[str], str]): Input sentences to encode.
219+
batch_size (Optional[int], optional): Number of sentences for each iter. Defaults to None.
220+
max_length (Optional[int], optional): Maximum length of tokens. Defaults to None.
221+
convert_to_numpy (Optional[bool], optional): If True, the output embedding will be a Numpy array. Otherwise, it will
222+
be a Torch Tensor. Defaults to None.
223+
instruction (Optional[str], optional): The text of instruction. Defaults to None.
224+
instruction_format (Optional[str], optional): Format for instruction. Defaults to None.
225+
226+
Returns:
227+
Union[torch.Tensor, np.ndarray]: Return the embedding vectors in a numpy array or tensor.
228+
"""
170229
if batch_size is None: batch_size = self.batch_size
171230
if max_length is None: max_length = self.passage_max_length
172231
if convert_to_numpy is None: convert_to_numpy = self.convert_to_numpy
@@ -338,6 +397,17 @@ def encode_multi_process(
338397
return embeddings
339398

340399
def _concatenate_results_from_multi_process(self, results_list: List[Union[torch.Tensor, np.ndarray, Any]]):
400+
"""concatenate and return the results from all the processes
401+
402+
Args:
403+
results_list (List[Union[torch.Tensor, np.ndarray, Any]]): a list of results from all the processes
404+
405+
Raises:
406+
NotImplementedError: Unsupported type for results_list
407+
408+
Returns:
409+
Union[torch.Tensor, np.ndarray]: return the embedding vectors in a numpy array or tensor.
410+
"""
341411
if isinstance(results_list[0], torch.Tensor):
342412
return torch.cat(results_list, dim=0)
343413
elif isinstance(results_list[0], np.ndarray):

FlagEmbedding/abc/inference/AbsReranker.py

Lines changed: 56 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,8 +16,26 @@
1616

1717
class AbsReranker(ABC):
1818
"""
19-
Base class for embedder.
19+
Base class for Reranker.
2020
Extend this class and implement `compute_score_single_gpu` for custom rerankers.
21+
22+
Args:
23+
model_name_or_path (str): If it's a path to a local model, it loads the model from the path. Otherwise tries to download and
24+
load a model from HuggingFace Hub with the name.
25+
use_fp16 (bool, optional): If true, use half-precision floating-point to speed up computation with a slight performance
26+
degradation. Default: `False`.
27+
query_instruction_for_rerank (Optional[str], optional): Query instruction for reranking, which will be used with
28+
`query_instruction_format`. Default: `None`.
29+
query_instruction_format (str, optional): The template for `query_instruction_for_rerank`. Default: `"{}{}"`.
30+
passage_instruction_for_rerank (Optional[str], optional): Passage instruction for reranking. Default: `None`.
31+
passage_instruction_format (str, optional): Passage instruction format when using `passage_instruction_for_rerank`.
32+
Default: `"{}{}"`.
33+
devices (Optional[Union[str, int, List[str], List[int]]], optional): Devices to use for model inference. Default: `None`.
34+
batch_size (int, optional): Batch size for inference. Default: `128`.
35+
query_max_length (Optional[int], optional): Maximum length for query. Default: `None`.
36+
passage_max_length (int, optional): Maximum length for passage. Default: `512`.
37+
normalize (bool, optional): If true, normalize the result. Default: `False`.
38+
kwargs (Dict[str, Any], optional): Additional parameters for HuggingFace Transformers config or children classes.
2139
"""
2240

2341
def __init__(
@@ -61,6 +79,17 @@ def __init__(
6179

6280
@staticmethod
6381
def get_target_devices(devices: Union[str, int, List[str], List[int]]) -> List[str]:
82+
"""
83+
84+
Args:
85+
devices (Union[str, int, List[str], List[int]]): Specified devices, can be `str`, `int`, list of `str`, or list of `int`.
86+
87+
Raises:
88+
ValueError: Devices should be a string or an integer or a list of strings or a list of integers.
89+
90+
Returns:
91+
List[str]: A list of target devices in string format.
92+
"""
6493
if devices is None:
6594
if torch.cuda.is_available():
6695
return [f"cuda:{i}" for i in range(torch.cuda.device_count())]
@@ -85,9 +114,27 @@ def get_target_devices(devices: Union[str, int, List[str], List[int]]) -> List[s
85114
raise ValueError("devices should be a string or an integer or a list of strings or a list of integers.")
86115

87116
def get_detailed_instruct(self, instruction_format: str, instruction: str, sentence: str):
117+
"""Combine the instruction and sentence along with the instruction format.
118+
119+
Args:
120+
instruction_format (str): Format for instruction.
121+
instruction (str): The text of instruction.
122+
sentence (str): The sentence to concatenate with.
123+
124+
Returns:
125+
str: The complete sentence with instruction.
126+
"""
88127
return instruction_format.format(instruction, sentence)
89128

90129
def get_detailed_inputs(self, sentence_pairs: Union[str, List[str]]):
130+
"""get detailed instruct for all the inputs
131+
132+
Args:
133+
sentence_pairs (Union[str, List[str]]): Input sentence pairs
134+
135+
Returns:
136+
list[list[str]]: The complete sentence pairs with instruction
137+
"""
91138
if isinstance(sentence_pairs, str):
92139
sentence_pairs = [sentence_pairs]
93140

@@ -127,6 +174,14 @@ def compute_score(
127174
sentence_pairs: Union[List[Tuple[str, str]], Tuple[str, str]],
128175
**kwargs
129176
):
177+
"""Compute score for each sentence pair
178+
179+
Args:
180+
sentence_pairs (Union[List[Tuple[str, str]], Tuple[str, str]]): Input sentence pairs to compute.
181+
182+
Returns:
183+
numpy.ndarray: scores of all the sentence pairs.
184+
"""
130185
if isinstance(sentence_pairs[0], str):
131186
sentence_pairs = [sentence_pairs]
132187
sentence_pairs = self.get_detailed_inputs(sentence_pairs)

docs/source/API/evaluation.rst

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,2 @@
1-
Visualized BGE
2-
==============
1+
Evaluation
2+
==========

docs/source/API/inference.rst

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,2 @@
1-
Evaluation
2-
==========
1+
Inference
2+
=========

docs/source/bge/llm_embedder.rst

Lines changed: 0 additions & 2 deletions
This file was deleted.

docs/source/index.rst

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,6 @@ We are aiming to enhance text and multi-modal retrieval by leveraging advanced e
5151

5252
bge/introduction
5353
bge/bge_v1
54-
bge/llm_embedder
5554
bge/bge_m3
5655
bge/bge_icl
5756
bge/bge_reranker
@@ -62,8 +61,8 @@ We are aiming to enhance text and multi-modal retrieval by leveraging advanced e
6261
:caption: API
6362

6463
API/abc
65-
API/evaluation
6664
API/inference
65+
API/evaluation
6766

6867
.. toctree::
6968
:hidden:

0 commit comments

Comments
 (0)