# Evaluation

After finetuning, the model needs to be evaluated. To facilitate this, we provide scripts for assessing it on various datasets, including **MTEB**, **BEIR**, **MSMARCO**, **MIRACL**, **MLDR**, **MKQA**, and **AIR-Bench**. You can find the specific bash scripts in the respective folders. This document provides an overview of these evaluations.

We first introduce the commonly used variables, and then the variables specific to each dataset.

## Introduction

### 1. EvalArgs

**Parameters for evaluation setup** (a command sketch follows this list):

- **`eval_name`**: Name of the evaluation task (e.g., msmarco, beir, miracl).

- **`dataset_dir`**: Path to the dataset directory. This can be:
  1. A local path to perform evaluation on your dataset (must exist). It should contain:
     - `corpus.jsonl`
     - `<split>_queries.jsonl`
     - `<split>_qrels.jsonl`
  2. Path to store datasets downloaded via API. Provide `None` to use the cache directory.

- **`force_redownload`**: Set to `true` to force redownload of the dataset.

- **`dataset_names`**: List of dataset names to evaluate or `None` to evaluate all available datasets.

- **`splits`**: Dataset splits to evaluate. Default is `test`.

- **`corpus_embd_save_dir`**: Directory to save corpus embeddings. If `None`, embeddings will not be saved.

- **`output_dir`**: Directory to save evaluation results.

- **`search_top_k`**: Top-K results for initial retrieval.

- **`rerank_top_k`**: Top-K results for reranking.

- **`cache_path`**: Cache directory for datasets.

- **`token`**: Token used for accessing the model.

- **`overwrite`**: Set to `true` to overwrite existing evaluation results.

- **`ignore_identical_ids`**: Set to `true` to ignore identical IDs in search results.

- **`k_values`**: List of K values for evaluation (e.g., [1, 3, 5, 10, 100, 1000]).

- **`eval_output_method`**: Format for outputting evaluation results (options: 'json', 'markdown'). Default is `markdown`.

- **`eval_output_path`**: Path to save the evaluation output.

- **`eval_metrics`**: Metrics used for evaluation (e.g., ['ndcg_at_10', 'recall_at_10']).

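To see how these arguments fit together, here is a minimal command sketch. The module path (`FlagEmbedding.evaluation.beir`) and all file paths are illustrative assumptions; check the bash script in the corresponding folder for the exact invocation.

```bash
# Minimal sketch using only the evaluation arguments described above.
# Module path and data paths are assumptions; adapt them to the provided bash scripts.
python -m FlagEmbedding.evaluation.beir \
    --eval_name beir \
    --dataset_dir ./beir/data \
    --dataset_names nfcorpus \
    --splits test \
    --corpus_embd_save_dir ./beir/corpus_embd \
    --output_dir ./beir/search_results \
    --search_top_k 1000 \
    --rerank_top_k 100 \
    --cache_path ./cache/data \
    --overwrite False \
    --k_values 10 100 \
    --eval_output_method markdown \
    --eval_output_path ./beir/eval_results.md \
    --eval_metrics ndcg_at_10 recall_at_100
```

The model to evaluate is specified through the model arguments described in the next section.
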
### 2. ModelArgs

**Parameters for model configuration** (a command sketch follows this list):

- **`embedder_name_or_path`**: The name or path to the embedder.

- **`embedder_model_class`**: Class of the model used for embedding (options include 'auto', 'encoder-only-base', etc.). Default is `auto`.

- **`normalize_embeddings`**: Set to `true` to normalize embeddings.

- **`use_fp16`**: Use FP16 precision for inference.

- **`devices`**: List of devices used for inference.

- **`query_instruction_for_retrieval`**, **`query_instruction_format_for_retrieval`**: Instruction and instruction format for queries during retrieval.

- **`examples_for_task`**, **`examples_instruction_format`**: Examples for the task and the format of their instructions.

- **`trust_remote_code`**: Set to `true` to trust remote code execution.

- **`reranker_name_or_path`**: Name or path to the reranker.

- **`reranker_model_class`**: Reranker model class (options include 'auto', 'decoder-only-base', etc.). Default is `auto`.

- **`reranker_peft_path`**: Path to the PEFT (parameter-efficient fine-tuning) adapter of the reranker.

- **`use_bf16`**: Use BF16 precision for inference.

- **`query_instruction_for_rerank`**, **`query_instruction_format_for_rerank`**: Instruction and instruction format for queries during reranking.

- **`passage_instruction_for_rerank`**, **`passage_instruction_format_for_rerank`**: Instruction and instruction format for passages during reranking.

- **`cache_dir`**: Cache directory for models.

- **`embedder_batch_size`**, **`reranker_batch_size`**: Batch sizes for embedding and reranking.

- **`embedder_query_max_length`**, **`embedder_passage_max_length`**: Maximum lengths for embedding queries and passages.

- **`reranker_query_max_length`**, **`reranker_max_length`**: Maximum length for reranker queries and maximum total input length for reranking.

- **`normalize`**: Whether to normalize the reranking scores.

- **`prompt`**: Prompt for the reranker.

- **`cutoff_layers`**, **`compress_ratio`**, **`compress_layers`**: Parameters for configuring the output and compression of layerwise or lightweight rerankers.

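Model arguments are passed on the same command line as the evaluation arguments. The sketch below shows only the model-related flags; the checkpoint names, instruction string, and devices are placeholders for illustration, so substitute your own fine-tuned embedder and reranker.

```bash
# Minimal sketch of model-related flags appended to an evaluation command.
# Checkpoints, instruction string, and devices are placeholders for illustration.
python -m FlagEmbedding.evaluation.beir \
    --eval_name beir \
    --dataset_dir ./beir/data \
    --output_dir ./beir/search_results \
    --embedder_name_or_path BAAI/bge-large-en-v1.5 \
    --normalize_embeddings True \
    --use_fp16 True \
    --devices cuda:0 cuda:1 \
    --query_instruction_for_retrieval "Represent this sentence for searching relevant passages: " \
    --embedder_batch_size 1024 \
    --embedder_query_max_length 512 \
    --embedder_passage_max_length 512 \
    --reranker_name_or_path BAAI/bge-reranker-v2-m3 \
    --reranker_batch_size 1024 \
    --reranker_max_length 1024 \
    --cache_dir ./cache/model
```
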
## Usage

### 1. MTEB

For the MTEB evaluation, we primarily rely on the official [MTEB](https://github.com/embeddings-benchmark/mteb) code, which supports only the assessment of embedders and restricts the output format of the evaluation results to JSON. It also introduces several MTEB-specific variables; refer to the bash script in the corresponding folder for the full list.

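As a rough sketch, an MTEB run using only the common arguments might look like the following. The module path `FlagEmbedding.evaluation.mteb` is an assumption, and the MTEB-specific variables mentioned above are omitted here.

```bash
# Minimal sketch of an MTEB run; evaluation results are saved as JSON.
# The module path is an assumption; see the bash script in the mteb folder for the exact command.
python -m FlagEmbedding.evaluation.mteb \
    --eval_name mteb \
    --output_dir ./mteb/search_results \
    --eval_output_path ./mteb/eval_results.json \
    --embedder_name_or_path BAAI/bge-large-en-v1.5 \
    --devices cuda:0 \
    --use_fp16 True \
    --embedder_batch_size 512
```
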
### 2. BEIR

### 3. MSMARCO

### 4. MIRACL

### 5. MLDR

### 6. MKQA

### 7. AIR-Bench

### 8. Custom Dataset