Skip to content

Commit 6f693a2

Browse files
committed
base & m3
1 parent ac3a9ce commit 6f693a2

3 files changed

Lines changed: 247 additions & 6 deletions

File tree

FlagEmbedding/abc/inference/AbsEmbedder.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -394,7 +394,7 @@ def _concatenate_results_from_multi_process(self, results_list: List[Union[torch
394394
"""concatenate and return the results from all the processes
395395
396396
Args:
397-
results_list (List[Union[torch.Tensor, np.ndarray, Any]]): a list of results from all the processes
397+
results_list (List[Union[torch.Tensor, np.ndarray, Any]]): A list of results from all the processes.
398398
399399
Raises:
400400
NotImplementedError: Unsupported type for results_list

FlagEmbedding/inference/embedder/encoder_only/base.py

Lines changed: 89 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,34 @@
99

1010

1111
class BaseEmbedder(AbsEmbedder):
12+
"""
13+
Base embedder for encoder only models.
14+
15+
Args:
16+
model_name_or_path (str): If it's a path to a local model, it loads the model from the path. Otherwise tries to download and
17+
load a model from HuggingFace Hub with the name.
18+
normalize_embeddings (bool, optional): If True, normalize the embedding vector. Defaults to :data:`True`.
19+
use_fp16 (bool, optional): If true, use half-precision floating-point to speed up computation with a slight performance
20+
degradation. Defaults to :data:`True`.
21+
query_instruction_for_retrieval (Optional[str], optional): Query instruction for retrieval tasks, which will be used
22+
with :attr:`query_instruction_format`. Defaults to :data:`None`.
23+
query_instruction_format (str, optional): The template for :attr:`query_instruction_for_retrieval`. Defaults to :data:`"{}{}"`.
24+
devices (Optional[Union[str, int, List[str], List[int]]], optional): Devices to use for model inference. Defaults to :data:`None`.
25+
pooling_method (str, optional): Pooling method to get embedding vector from the last hidden state. Defaults to :data:`"cls"`.
26+
trust_remote_code (bool, optional): Whether to trust and execute remote code when loading HF datasets or models. Defaults to :data:`False`.
27+
cache_dir (Optional[str], optional): Cache directory for the model. Defaults to :data:`None`.
28+
batch_size (int, optional): Batch size for inference. Defaults to :data:`256`.
29+
query_max_length (int, optional): Maximum length for query. Defaults to :data:`512`.
30+
passage_max_length (int, optional): Maximum length for passage. Defaults to :data:`512`.
31+
instruction (Optional[str], optional): Instruction for embedding with :attr:`instruction_format`. Defaults to :data:`None`.
32+
instruction_format (str, optional): Instruction format when using :attr:`instruction`. Defaults to :data:`"{}{}"`.
33+
convert_to_numpy (bool, optional): If True, the output embedding will be a Numpy array. Otherwise, it will be a Torch Tensor.
34+
Defaults to :data:`True`.
35+
36+
Attributes:
37+
DEFAULT_POOLING_METHOD: The default pooling method when running the model.
38+
"""
39+
1240
DEFAULT_POOLING_METHOD = "cls"
1341

1442
def __init__(
@@ -68,6 +96,18 @@ def encode_queries(
6896
convert_to_numpy: Optional[bool] = None,
6997
**kwargs: Any
7098
) -> Union[np.ndarray, torch.Tensor]:
99+
"""Encode the queries using the instruction if provided.
100+
101+
Args:
102+
queries (Union[List[str], str]): Input queries to encode.
103+
batch_size (Optional[int], optional): Number of sentences for each iter. Defaults to :data:`None`.
104+
max_length (Optional[int], optional): Maximum length of tokens. Defaults to :data:`None`.
105+
convert_to_numpy (Optional[bool], optional): If True, the output embedding will be a Numpy array. Otherwise, it will
106+
be a Torch Tensor. Defaults to :data:`None`.
107+
108+
Returns:
109+
Union[torch.Tensor, np.ndarray]: Return the embedding vectors in a numpy array or tensor.
110+
"""
71111
return super().encode_queries(
72112
queries,
73113
batch_size=batch_size,
@@ -84,6 +124,18 @@ def encode_corpus(
84124
convert_to_numpy: Optional[bool] = None,
85125
**kwargs: Any
86126
) -> Union[np.ndarray, torch.Tensor]:
127+
"""Encode the corpus using the instruction if provided.
128+
129+
Args:
130+
corpus (Union[List[str], str]): Input corpus to encode.
131+
batch_size (Optional[int], optional): Number of sentences for each iter. Defaults to :data:`None`.
132+
max_length (Optional[int], optional): Maximum length of tokens. Defaults to :data:`None`.
133+
convert_to_numpy (Optional[bool], optional): If True, the output embedding will be a Numpy array. Otherwise, it will
134+
be a Torch Tensor. Defaults to :data:`None`.
135+
136+
Returns:
137+
Union[torch.Tensor, np.ndarray]: Return the embedding vectors in a numpy array or tensor.
138+
"""
87139
return super().encode_corpus(
88140
corpus,
89141
batch_size=batch_size,
@@ -100,6 +152,18 @@ def encode(
100152
convert_to_numpy: Optional[bool] = None,
101153
**kwargs: Any
102154
) -> Union[np.ndarray, torch.Tensor]:
155+
"""Encode the input sentences with the embedding model.
156+
157+
Args:
158+
sentences (Union[List[str], str]): Input sentences to encode.
159+
batch_size (Optional[int], optional): Number of sentences for each iter. Defaults to :data:`None`.
160+
max_length (Optional[int], optional): Maximum length of tokens. Defaults to :data:`None`.
161+
convert_to_numpy (Optional[bool], optional): If True, the output embedding will be a Numpy array. Otherwise, it will
162+
be a Torch Tensor. Defaults to :data:`None`.
163+
164+
Returns:
165+
Union[torch.Tensor, np.ndarray]: Return the embedding vectors in a numpy array or tensor.
166+
"""
103167
return super().encode(
104168
sentences,
105169
batch_size=batch_size,
@@ -118,6 +182,19 @@ def encode_single_device(
118182
device: Optional[str] = None,
119183
**kwargs: Any
120184
):
185+
"""Encode input sentences on a single device.
186+
187+
Args:
188+
sentences (Union[List[str], str]): Input sentences to encode.
189+
batch_size (int, optional): Number of sentences for each iter. Defaults to :data:`256`.
190+
max_length (int, optional): Maximum length of tokens. Defaults to :data:`512`.
191+
convert_to_numpy (bool, optional): If True, the output embedding will be a Numpy array. Otherwise, it will
192+
be a Torch Tensor. Defaults to :data:`True`.
193+
device (Optional[str], optional): Device to use for encoding. Defaults to :data:`None`.
194+
195+
Returns:
196+
Union[torch.Tensor, np.ndarray]: Return the embedding vectors in a numpy array or tensor.
197+
"""
121198
if device is None:
122199
device = self.target_devices[0]
123200

@@ -214,6 +291,18 @@ def pooling(
214291
last_hidden_state: torch.Tensor,
215292
attention_mask: Optional[torch.Tensor] = None
216293
):
294+
"""The pooling function.
295+
296+
Args:
297+
last_hidden_state (torch.Tensor): The last hidden state of the model.
298+
attention_mask (Optional[torch.Tensor], optional): Attention mask. Defaults to :data:`None`.
299+
300+
Raises:
301+
NotImplementedError: pooling method not implemented.
302+
303+
Returns:
304+
torch.Tensor: The embedding vectors after pooling.
305+
"""
217306
if self.pooling_method == 'cls':
218307
return last_hidden_state[:, 0]
219308
elif self.pooling_method == 'mean':

0 commit comments

Comments
 (0)