1313
1414
1515class MKQAEvalDataLoader (AbsEvalDataLoader ):
16+ """
17+ Data loader class for MKQA.
18+ """
def available_dataset_names(self) -> List[str]:
    """
    Get the available dataset names.

    Each name selects one subset of MKQA (they look like language/locale
    codes). A new list is built on every call, so callers may mutate the
    result freely.

    Returns:
        List[str]: All the available dataset names.
    """
    return [
        'en', 'ar', 'fi', 'ja', 'ko', 'ru', 'es', 'sv', 'he', 'th',
        'da', 'de', 'fr', 'it', 'nl', 'pl', 'pt', 'hu', 'vi', 'ms',
        'km', 'no', 'tr', 'zh_cn', 'zh_hk', 'zh_tw',
    ]
1827
def available_splits(self, dataset_name: Optional[str] = None) -> List[str]:
    """
    Get the available splits.

    Args:
        dataset_name (Optional[str], optional): Dataset name. It is not
            consulted here; the same splits are returned for every
            dataset. Defaults to ``None``.

    Returns:
        List[str]: All the available splits for the dataset.
    """
    return ["test"]
2139
2240 def load_corpus (self , dataset_name : Optional [str ] = None ) -> datasets .DatasetDict :
41+ """Load the corpus.
42+
43+ Args:
44+ dataset_name (Optional[str], optional): Name of the dataset. Defaults to None.
45+
46+ Returns:
47+ datasets.DatasetDict: Loaded datasets instance of corpus.
48+ """
2349 if self .dataset_dir is not None :
2450 # same corpus for all languages
2551 save_dir = self .dataset_dir
@@ -28,6 +54,19 @@ def load_corpus(self, dataset_name: Optional[str] = None) -> datasets.DatasetDic
2854 return self ._load_remote_corpus (dataset_name = dataset_name )
2955
3056 def _load_local_qrels (self , save_dir : str , dataset_name : Optional [str ] = None , split : str = 'test' ) -> datasets .DatasetDict :
57+ """Try to load qrels from local datasets.
58+
59+ Args:
60+ save_dir (str): Directory that save the data files.
61+ dataset_name (Optional[str], optional): Name of the dataset. Defaults to ``None``.
62+ split (str, optional): Split of the dataset. Defaults to ``'test'``.
63+
64+ Raises:
65+ ValueError: No local qrels found, will try to download from remote.
66+
67+ Returns:
68+ datasets.DatasetDict: Loaded datasets instance of qrels.
69+ """
3170 checked_split = self .check_splits (split )
3271 if len (checked_split ) == 0 :
3372 raise ValueError (f"Split { split } not found in the dataset." )
@@ -96,6 +135,16 @@ def _load_remote_qrels(
96135 split : str = 'test' ,
97136 save_dir : Optional [str ] = None
98137 ) -> datasets .DatasetDict :
138+ """Load remote qrels from HF.
139+
140+ Args:
141+ dataset_name (str): Name of the dataset.
142+ split (str, optional): Split of the dataset. Defaults to ``'test'``.
143+ save_dir (Optional[str], optional): Directory to save the dataset. Defaults to ``None``.
144+
145+ Returns:
146+ datasets.DatasetDict: Loaded datasets instance of qrel.
147+ """
99148 endpoint = f"{ os .getenv ('HF_ENDPOINT' , 'https://huggingface.co' )} /datasets/Shitao/bge-m3-data"
100149 queries_download_url = f"{ endpoint } /resolve/main/MKQA_test-data.zip"
101150
@@ -137,6 +186,16 @@ def _load_remote_queries(
137186 split : str = 'test' ,
138187 save_dir : Optional [str ] = None
139188 ) -> datasets .DatasetDict :
189+ """Load the queries from HF.
190+
191+ Args:
192+ dataset_name (str): Name of the dataset.
193+ split (str, optional): Split of the dataset. Defaults to ``'test'``.
194+ save_dir (Optional[str], optional): Directory to save the dataset. Defaults to ``None``.
195+
196+ Returns:
197+ datasets.DatasetDict: Loaded datasets instance of queries.
198+ """
140199 endpoint = f"{ os .getenv ('HF_ENDPOINT' , 'https://huggingface.co' )} /datasets/Shitao/bge-m3-data"
141200 queries_download_url = f"{ endpoint } /resolve/main/MKQA_test-data.zip"
142201
0 commit comments