Skip to content

Commit 3f07173

Browse files
committed
evaluation
1 parent 1df8b36 commit 3f07173

15 files changed

Lines changed: 342 additions & 2 deletions

File tree

FlagEmbedding/abc/evaluation/data_loader.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -113,7 +113,7 @@ def load_corpus(self, dataset_name: Optional[str] = None) -> datasets.DatasetDic
113113
return self._load_remote_corpus(dataset_name=dataset_name)
114114

115115
def load_qrels(self, dataset_name: Optional[str] = None, split: str = 'test') -> datasets.DatasetDict:
116-
"""Load the corpus from the dataset.
116+
"""Load the qrels from the dataset.
117117
118118
Args:
119119
dataset_name (Optional[str], optional): Name of the dataset. Defaults to :data:`None`.

FlagEmbedding/evaluation/miracl/data_loader.py

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,10 +11,28 @@
1111

1212

1313
class MIRACLEvalDataLoader(AbsEvalDataLoader):
14+
"""
15+
Data loader class for MIRACL.
16+
"""
1417
def available_dataset_names(self) -> List[str]:
18+
"""
19+
Get the available dataset names.
20+
21+
Returns:
22+
List[str]: All the available dataset names.
23+
"""
1524
return ["ar", "bn", "en", "es", "fa", "fi", "fr", "hi", "id", "ja", "ko", "ru", "sw", "te", "th", "zh", "de", "yo"]
1625

1726
def available_splits(self, dataset_name: str) -> List[str]:
27+
"""
28+
Get the available splits.
29+
30+
Args:
31+
dataset_name (str): Dataset name.
32+
33+
Returns:
34+
List[str]: All the available splits for the dataset.
35+
"""
1836
if dataset_name in ["de", "yo"]:
1937
return ["dev"]
2038
else:
@@ -25,6 +43,15 @@ def _load_remote_corpus(
2543
dataset_name: str,
2644
save_dir: Optional[str] = None
2745
) -> datasets.DatasetDict:
46+
"""Load the corpus dataset from HF.
47+
48+
Args:
49+
dataset_name (str): Name of the dataset.
50+
save_dir (Optional[str], optional): Directory to save the dataset. Defaults to ``None``.
51+
52+
Returns:
53+
datasets.DatasetDict: Loaded datasets instance of corpus.
54+
"""
2855
corpus = datasets.load_dataset(
2956
"miracl/miracl-corpus", dataset_name,
3057
cache_dir=self.cache_dir,
@@ -60,6 +87,16 @@ def _load_remote_qrels(
6087
split: str = 'dev',
6188
save_dir: Optional[str] = None
6289
) -> datasets.DatasetDict:
90+
"""Load the qrels from HF.
91+
92+
Args:
93+
dataset_name (str): Name of the dataset.
94+
split (str, optional): Split of the dataset. Defaults to ``'dev'``.
95+
save_dir (Optional[str], optional): Directory to save the dataset. Defaults to ``None``.
96+
97+
Returns:
98+
datasets.DatasetDict: Loaded datasets instance of qrel.
99+
"""
63100
endpoint = f"{os.getenv('HF_ENDPOINT', 'https://huggingface.co')}/datasets/miracl/miracl"
64101
qrels_download_url = f"{endpoint}/resolve/main/miracl-v1.0-{dataset_name}/qrels/qrels.miracl-v1.0-{dataset_name}-{split}.tsv"
65102

@@ -101,6 +138,16 @@ def _load_remote_queries(
101138
split: str = 'dev',
102139
save_dir: Optional[str] = None
103140
) -> datasets.DatasetDict:
141+
"""Load the queries from HF.
142+
143+
Args:
144+
dataset_name (str): Name of the dataset.
145+
split (str, optional): Split of the dataset. Defaults to ``'dev'``.
146+
save_dir (Optional[str], optional): Directory to save the dataset. Defaults to ``None``.
147+
148+
Returns:
149+
datasets.DatasetDict: Loaded datasets instance of queries.
150+
"""
104151
endpoint = f"{os.getenv('HF_ENDPOINT', 'https://huggingface.co')}/datasets/miracl/miracl"
105152
queries_download_url = f"{endpoint}/resolve/main/miracl-v1.0-{dataset_name}/topics/topics.miracl-v1.0-{dataset_name}-{split}.tsv"
106153

FlagEmbedding/evaluation/miracl/runner.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,15 @@
44

55

66
class MIRACLEvalRunner(AbsEvalRunner):
7+
"""
8+
Evaluation runner of MIRACL.
9+
"""
710
def load_data_loader(self) -> MIRACLEvalDataLoader:
11+
"""Load the data loader instance by args.
12+
13+
Returns:
14+
MIRACLEvalDataLoader: The MIRACL data loader instance.
15+
"""
816
data_loader = MIRACLEvalDataLoader(
917
eval_name=self.eval_args.eval_name,
1018
dataset_dir=self.eval_args.dataset_dir,

FlagEmbedding/evaluation/mkqa/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,11 +4,13 @@
44
)
55

66
from .data_loader import MKQAEvalDataLoader
7+
from .evaluator import MKQAEvaluator
78
from .runner import MKQAEvalRunner
89

910
__all__ = [
1011
"MKQAEvalArgs",
1112
"MKQAEvalModelArgs",
1213
"MKQAEvalRunner",
1314
"MKQAEvalDataLoader",
15+
"MKQAEvaluator"
1416
]

FlagEmbedding/evaluation/mkqa/data_loader.py

Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,13 +13,39 @@
1313

1414

1515
class MKQAEvalDataLoader(AbsEvalDataLoader):
16+
"""
17+
Data loader class for MKQA.
18+
"""
1619
def available_dataset_names(self) -> List[str]:
20+
"""
21+
Get the available dataset names.
22+
23+
Returns:
24+
List[str]: All the available dataset names.
25+
"""
1726
return ['en', 'ar', 'fi', 'ja', 'ko', 'ru', 'es', 'sv', 'he', 'th', 'da', 'de', 'fr', 'it', 'nl', 'pl', 'pt', 'hu', 'vi', 'ms', 'km', 'no', 'tr', 'zh_cn', 'zh_hk', 'zh_tw']
1827

1928
def available_splits(self, dataset_name: Optional[str] = None) -> List[str]:
29+
"""
30+
Get the available splits.
31+
32+
Args:
33+
dataset_name (Optional[str], optional): Dataset name. Defaults to ``None``.
34+
35+
Returns:
36+
List[str]: All the available splits for the dataset.
37+
"""
2038
return ["test"]
2139

2240
def load_corpus(self, dataset_name: Optional[str] = None) -> datasets.DatasetDict:
41+
"""Load the corpus.
42+
43+
Args:
44+
dataset_name (Optional[str], optional): Name of the dataset. Defaults to None.
45+
46+
Returns:
47+
datasets.DatasetDict: Loaded datasets instance of corpus.
48+
"""
2349
if self.dataset_dir is not None:
2450
# same corpus for all languages
2551
save_dir = self.dataset_dir
@@ -28,6 +54,19 @@ def load_corpus(self, dataset_name: Optional[str] = None) -> datasets.DatasetDic
2854
return self._load_remote_corpus(dataset_name=dataset_name)
2955

3056
def _load_local_qrels(self, save_dir: str, dataset_name: Optional[str] = None, split: str = 'test') -> datasets.DatasetDict:
57+
"""Try to load qrels from local datasets.
58+
59+
Args:
60+
save_dir (str): Directory that saves the data files.
61+
dataset_name (Optional[str], optional): Name of the dataset. Defaults to ``None``.
62+
split (str, optional): Split of the dataset. Defaults to ``'test'``.
63+
64+
Raises:
65+
ValueError: No local qrels found, will try to download from remote.
66+
67+
Returns:
68+
datasets.DatasetDict: Loaded datasets instance of qrels.
69+
"""
3170
checked_split = self.check_splits(split)
3271
if len(checked_split) == 0:
3372
raise ValueError(f"Split {split} not found in the dataset.")
@@ -96,6 +135,16 @@ def _load_remote_qrels(
96135
split: str = 'test',
97136
save_dir: Optional[str] = None
98137
) -> datasets.DatasetDict:
138+
"""Load remote qrels from HF.
139+
140+
Args:
141+
dataset_name (str): Name of the dataset.
142+
split (str, optional): Split of the dataset. Defaults to ``'test'``.
143+
save_dir (Optional[str], optional): Directory to save the dataset. Defaults to ``None``.
144+
145+
Returns:
146+
datasets.DatasetDict: Loaded datasets instance of qrel.
147+
"""
99148
endpoint = f"{os.getenv('HF_ENDPOINT', 'https://huggingface.co')}/datasets/Shitao/bge-m3-data"
100149
queries_download_url = f"{endpoint}/resolve/main/MKQA_test-data.zip"
101150

@@ -137,6 +186,16 @@ def _load_remote_queries(
137186
split: str = 'test',
138187
save_dir: Optional[str] = None
139188
) -> datasets.DatasetDict:
189+
"""Load the queries from HF.
190+
191+
Args:
192+
dataset_name (str): Name of the dataset.
193+
split (str, optional): Split of the dataset. Defaults to ``'test'``.
194+
save_dir (Optional[str], optional): Directory to save the dataset. Defaults to ``None``.
195+
196+
Returns:
197+
datasets.DatasetDict: Loaded datasets instance of queries.
198+
"""
140199
endpoint = f"{os.getenv('HF_ENDPOINT', 'https://huggingface.co')}/datasets/Shitao/bge-m3-data"
141200
queries_download_url = f"{endpoint}/resolve/main/MKQA_test-data.zip"
142201

FlagEmbedding/evaluation/mkqa/evaluator.py

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,12 +8,25 @@
88

99

1010
class MKQAEvaluator(AbsEvaluator):
11+
"""
12+
The evaluator class of MKQA.
13+
"""
1114
def get_corpus_embd_save_dir(
1215
self,
1316
retriever_name: str,
1417
corpus_embd_save_dir: Optional[str] = None,
1518
dataset_name: Optional[str] = None
1619
):
20+
"""Get the directory to save the corpus embedding.
21+
22+
Args:
23+
retriever_name (str): Name of the retriever.
24+
corpus_embd_save_dir (Optional[str], optional): Directory to save the corpus embedding. Defaults to ``None``.
25+
dataset_name (Optional[str], optional): Name of the dataset. Defaults to ``None``.
26+
27+
Returns:
28+
str: The final directory to save the corpus embedding.
29+
"""
1730
if corpus_embd_save_dir is not None:
1831
# Save the corpus embeddings in the same directory for all dataset_name
1932
corpus_embd_save_dir = os.path.join(corpus_embd_save_dir, retriever_name)
@@ -24,6 +37,15 @@ def evaluate_results(
2437
search_results_save_dir: str,
2538
k_values: List[int] = [1, 3, 5, 10, 100, 1000]
2639
):
40+
"""Compute the metrics and get the eval results.
41+
42+
Args:
43+
search_results_save_dir (str): Directory that saves the search results.
44+
k_values (List[int], optional): Cutoffs. Defaults to ``[1, 3, 5, 10, 100, 1000]``.
45+
46+
Returns:
47+
dict: The evaluation results.
48+
"""
2749
eval_results_dict = {}
2850

2951
corpus = self.data_loader.load_corpus()
@@ -70,6 +92,14 @@ def compute_metrics(
7092
):
7193
"""
7294
Compute Recall@k for QA task. The definition of recall in QA task is different from the one in IR task. Please refer to the paper of RocketQA: https://aclanthology.org/2021.naacl-main.466.pdf.
95+
96+
Args:
97+
corpus_dict (Dict[str, str]): Dictionary of the corpus with doc id and contents.
98+
qrels (Dict[str, List[str]]): Relevance labels mapping each query to its relevant passages.
99+
search_results (Dict[str, Dict[str, float]]): Search results of the model to evaluate.
100+
101+
Returns:
102+
dict: The model's scores of the metrics.
73103
"""
74104
contexts = []
75105
answers = []

FlagEmbedding/evaluation/mkqa/runner.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,15 @@
55

66

77
class MKQAEvalRunner(AbsEvalRunner):
8+
"""
9+
Evaluation runner of MKQA.
10+
"""
811
def load_data_loader(self) -> MKQAEvalDataLoader:
12+
"""Load the data loader instance by args.
13+
14+
Returns:
15+
MKQAEvalDataLoader: The MKQA data loader instance.
16+
"""
917
data_loader = MKQAEvalDataLoader(
1018
eval_name=self.eval_args.eval_name,
1119
dataset_dir=self.eval_args.dataset_dir,
@@ -16,6 +24,11 @@ def load_data_loader(self) -> MKQAEvalDataLoader:
1624
return data_loader
1725

1826
def load_evaluator(self) -> MKQAEvaluator:
27+
"""Load the evaluator instance by args.
28+
29+
Returns:
30+
MKQAEvaluator: The MKQA evaluator instance.
31+
"""
1932
evaluator = MKQAEvaluator(
2033
eval_name=self.eval_args.eval_name,
2134
data_loader=self.data_loader,

docs/source/API/evaluation.rst

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,6 @@
11
Evaluation
2-
==========
2+
==========
3+
4+
.. toctree::
5+
evaluation/miracl
6+
evaluation/mkqa
Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
MIRACL
2+
======
3+
4+
`MIRACL <https://project-miracl.github.io/>`_ (Multilingual Information Retrieval Across a Continuum of Languages)
5+
is a WSDM 2023 Cup challenge that focuses on search across 18 different languages.
6+
They release a multilingual retrieval dataset containing the train and dev set for 16 "known languages" and only dev set for 2 "surprise languages".
7+
The topics are generated by native speakers of each language, who also label the relevance between the topics and a given document list.
8+
You can find the `dataset <https://huggingface.co/datasets/miracl/miracl-corpus>`_ on HuggingFace.
9+
10+
You can evaluate model's performance on MIRACL simply by running our provided shell script:
11+
12+
.. code:: bash
13+
14+
chmod +x ./examples/evaluation/miracl/eval_miracl.sh
15+
./examples/evaluation/miracl/eval_miracl.sh
16+
17+
Or by running:
18+
19+
.. code:: bash
20+
21+
python -m FlagEmbedding.evaluation.miracl \
22+
--eval_name miracl \
23+
--dataset_dir ./miracl/data \
24+
--dataset_names bn hi sw te th yo \
25+
--splits dev \
26+
--corpus_embd_save_dir ./miracl/corpus_embd \
27+
--output_dir ./miracl/search_results \
28+
--search_top_k 1000 \
29+
--rerank_top_k 100 \
30+
--cache_path /root/.cache/huggingface/hub \
31+
--overwrite False \
32+
--k_values 10 100 \
33+
--eval_output_method markdown \
34+
--eval_output_path ./miracl/miracl_eval_results.md \
35+
--eval_metrics ndcg_at_10 recall_at_100 \
36+
--embedder_name_or_path BAAI/bge-m3 \
37+
--reranker_name_or_path BAAI/bge-reranker-v2-m3 \
38+
--devices cuda:0 cuda:1 \
39+
--cache_dir /root/.cache/huggingface/hub \
40+
--reranker_max_length 1024
41+
42+
Change the embedder, reranker, devices, and cache directory to your preference.
43+
44+
.. toctree::
45+
:hidden:
46+
47+
miracl/data_loader
48+
miracl/runner
Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
data_loader
2+
===========
3+
4+
.. autoclass:: FlagEmbedding.evaluation.miracl.MIRACLEvalDataLoader
5+
6+
Methods
7+
-------
8+
9+
.. automethod:: FlagEmbedding.evaluation.miracl.MIRACLEvalDataLoader.available_dataset_names
10+
.. automethod:: FlagEmbedding.evaluation.miracl.MIRACLEvalDataLoader.available_splits
11+
.. automethod:: FlagEmbedding.evaluation.miracl.MIRACLEvalDataLoader._load_remote_corpus
12+
.. automethod:: FlagEmbedding.evaluation.miracl.MIRACLEvalDataLoader._load_remote_qrels
13+
.. automethod:: FlagEmbedding.evaluation.miracl.MIRACLEvalDataLoader._load_remote_queries

0 commit comments

Comments
 (0)