1313
1414
1515class BEIREvalDataLoader (AbsEvalDataLoader ):
16+ """
17+ Data loader class for BEIR.
18+ """
def available_dataset_names(self) -> List[str]:
    """
    List the dataset names supported by this data loader.

    Returns:
        List[str]: A newly built list containing every supported dataset name.
    """
    dataset_names = [
        'arguana',
        'climate-fever',
        'cqadupstack',
        'dbpedia-entity',
        'fever',
        'fiqa',
        'hotpotqa',
        'msmarco',
        'nfcorpus',
        'nq',
        'quora',
        'scidocs',
        'scifact',
        'trec-covid',
        'webis-touche2020',
    ]
    return dataset_names
1827
def available_sub_dataset_names(self, dataset_name: Optional[str] = None) -> Optional[List[str]]:
    """
    Get the available sub-dataset names of a dataset.

    Args:
        dataset_name (Optional[str], optional): Name of the dataset to look up. Defaults to ``None``.

    Returns:
        Optional[List[str]]: The sub-dataset names when ``dataset_name`` is ``'cqadupstack'``;
            ``None`` for any other value (including ``None``).
    """
    # 'cqadupstack' is the only name for which a list is returned; callers
    # receive None (not an empty list) for everything else.
    if dataset_name != 'cqadupstack':
        return None
    return [
        'android',
        'english',
        'gaming',
        'gis',
        'mathematica',
        'physics',
        'programmers',
        'stats',
        'tex',
        'unix',
        'webmasters',
        'wordpress',
    ]
2341
def available_splits(self, dataset_name: Optional[str] = None) -> List[str]:
    """
    Get the available splits of a dataset.

    Args:
        dataset_name (Optional[str], optional): Dataset name. Defaults to ``None``.

    Returns:
        List[str]: ``['dev']`` when ``dataset_name`` is ``'msmarco'``, otherwise ``['test']``.
    """
    return ['dev'] if dataset_name == 'msmarco' else ['test']
@@ -32,6 +59,16 @@ def _load_remote_corpus(
3259 sub_dataset_name : Optional [str ] = None ,
3360 save_dir : Optional [str ] = None
3461 ) -> datasets .DatasetDict :
62+ """Load the corpus dataset from HF.
63+
64+ Args:
65+ dataset_name (str): Name of the dataset.
66+ sub_dataset_name (Optional[str]): Name of the sub-dataset. Defaults to ``None``.
67+ save_dir (Optional[str], optional): Directory to save the dataset. Defaults to ``None``.
68+
69+ Returns:
70+ datasets.DatasetDict: Loaded datasets instance of corpus.
71+ """
3572 if dataset_name != 'cqadupstack' :
3673 corpus = datasets .load_dataset (
3774 'BeIR/{d}' .format (d = dataset_name ),
@@ -94,6 +131,17 @@ def _load_remote_qrels(
94131 split : str = 'dev' ,
95132 save_dir : Optional [str ] = None
96133 ) -> datasets .DatasetDict :
134+ """Load the qrels from HF.
135+
136+ Args:
137+ dataset_name (str): Name of the dataset.
138+ sub_dataset_name (Optional[str]): Name of the sub-dataset. Defaults to ``None``.
139+ split (str, optional): Split of the dataset. Defaults to ``'dev'``.
140+ save_dir (Optional[str], optional): Directory to save the dataset. Defaults to ``None``.
141+
142+ Returns:
143+ datasets.DatasetDict: Loaded datasets instance of qrel.
144+ """
97145 if dataset_name != 'cqadupstack' :
98146 qrels = datasets .load_dataset (
99147 'BeIR/{d}-qrels' .format (d = dataset_name ),
@@ -168,6 +216,17 @@ def _load_remote_queries(
168216 split : str = 'test' ,
169217 save_dir : Optional [str ] = None
170218 ) -> datasets .DatasetDict :
219+ """Load the queries from HF.
220+
221+ Args:
222+ dataset_name (str): Name of the dataset.
223+ sub_dataset_name (Optional[str]): Name of the sub-dataset. Defaults to ``None``.
224+ split (str, optional): Split of the dataset. Defaults to ``'test'``.
225+ save_dir (Optional[str], optional): Directory to save the dataset. Defaults to ``None``.
226+
227+ Returns:
228+ datasets.DatasetDict: Loaded datasets instance of queries.
229+ """
171230 qrels = self .load_qrels (dataset_name = dataset_name , sub_dataset_name = sub_dataset_name , split = split )
172231
173232 if dataset_name != 'cqadupstack' :
@@ -230,6 +289,15 @@ def _load_remote_queries(
230289 return datasets .DatasetDict (queries_dict )
231290
232291 def load_corpus (self , dataset_name : Optional [str ] = None , sub_dataset_name : Optional [str ] = None ) -> datasets .DatasetDict :
292+ """Load the corpus from the dataset.
293+
294+ Args:
295+ dataset_name (Optional[str], optional): Name of the dataset. Defaults to ``None``.
296+ sub_dataset_name (Optional[str], optional): Name of the sub-dataset. Defaults to ``None``.
297+
298+ Returns:
299+ datasets.DatasetDict: A dict of corpus with id as key, title and text as value.
300+ """
233301 if self .dataset_dir is not None :
234302 if dataset_name is None :
235303 save_dir = self .dataset_dir
@@ -240,6 +308,19 @@ def load_corpus(self, dataset_name: Optional[str] = None, sub_dataset_name: Opti
240308 return self ._load_remote_corpus (dataset_name = dataset_name , sub_dataset_name = sub_dataset_name )
241309
242310 def load_qrels (self , dataset_name : Optional [str ] = None , sub_dataset_name : Optional [str ] = None , split : str = 'test' ) -> datasets .DatasetDict :
311+ """Load the qrels from the dataset.
312+
313+ Args:
314+ dataset_name (Optional[str], optional): Name of the dataset. Defaults to ``None``.
315+ sub_dataset_name (Optional[str], optional): Name of the sub-dataset. Defaults to ``None``.
316+ split (str, optional): The split to load relevance from. Defaults to ``'test'``.
317+
318+ Raises:
319+ ValueError
320+
321+ Returns:
322+ datasets.DatasetDict: A dict of relevance of query and document.
323+ """
243324 if self .dataset_dir is not None :
244325 if dataset_name is None :
245326 save_dir = self .dataset_dir
@@ -256,6 +337,19 @@ def load_qrels(self, dataset_name: Optional[str] = None, sub_dataset_name: Optio
256337 return self ._load_remote_qrels (dataset_name = dataset_name , sub_dataset_name = sub_dataset_name , split = split )
257338
258339 def load_queries (self , dataset_name : Optional [str ] = None , sub_dataset_name : Optional [str ] = None , split : str = 'test' ) -> datasets .DatasetDict :
340+ """Load the queries from the dataset.
341+
342+ Args:
343+ dataset_name (Optional[str], optional): Name of the dataset. Defaults to ``None``.
344+ sub_dataset_name (Optional[str], optional): Name of the sub-dataset. Defaults to ``None``.
345+ split (str, optional): The split to load queries from. Defaults to ``'test'``.
346+
347+ Raises:
348+ ValueError
349+
350+ Returns:
351+ datasets.DatasetDict: A dict of queries with id as key, query text as value.
352+ """
259353 if self .dataset_dir is not None :
260354 if dataset_name is None :
261355 save_dir = self .dataset_dir
@@ -272,6 +366,16 @@ def load_queries(self, dataset_name: Optional[str] = None, sub_dataset_name: Opt
272366 return self ._load_remote_queries (dataset_name = dataset_name , sub_dataset_name = sub_dataset_name , split = split )
273367
274368 def _load_local_corpus (self , save_dir : str , dataset_name : Optional [str ] = None , sub_dataset_name : Optional [str ] = None ) -> datasets .DatasetDict :
369+ """Load corpus from local dataset.
370+
371+ Args:
372+ save_dir (str): Path to save the loaded corpus.
373+ dataset_name (Optional[str], optional): Name of the dataset. Defaults to ``None``.
374+ sub_dataset_name (Optional[str], optional): Name of the sub-dataset. Defaults to ``None``.
375+
376+ Returns:
377+ datasets.DatasetDict: A dict of corpus with id as key, title and text as value.
378+ """
275379 if sub_dataset_name is None :
276380 corpus_path = os .path .join (save_dir , 'corpus.jsonl' )
277381 else :
@@ -291,6 +395,20 @@ def _load_local_corpus(self, save_dir: str, dataset_name: Optional[str] = None,
291395 return datasets .DatasetDict (corpus )
292396
293397 def _load_local_qrels (self , save_dir : str , dataset_name : Optional [str ] = None , sub_dataset_name : Optional [str ] = None , split : str = 'test' ) -> datasets .DatasetDict :
398+ """Load relevance from local dataset.
399+
400+ Args:
401+ save_dir (str): Path to save the loaded relevance.
402+ dataset_name (Optional[str], optional): Name of the dataset. Defaults to ``None``.
403+ sub_dataset_name (Optional[str], optional): Name of the sub-dataset. Defaults to ``None``.
404+ split (str, optional): Split to load from the local dataset. Defaults to ``'test'``.
405+
406+ Raises:
407+ ValueError
408+
409+ Returns:
410+ datasets.DatasetDict: A dict of relevance of query and document.
411+ """
294412 checked_split = self .check_splits (split )
295413 if len (checked_split ) == 0 :
296414 raise ValueError (f"Split { split } not found in the dataset." )
@@ -318,6 +436,20 @@ def _load_local_qrels(self, save_dir: str, dataset_name: Optional[str] = None, s
318436 return datasets .DatasetDict (qrels )
319437
320438 def _load_local_queries (self , save_dir : str , dataset_name : Optional [str ] = None , sub_dataset_name : Optional [str ] = None , split : str = 'test' ) -> datasets .DatasetDict :
439+ """Load queries from local dataset.
440+
441+ Args:
442+ save_dir (str): Path to save the loaded queries.
443+ dataset_name (Optional[str], optional): Name of the dataset. Defaults to ``None``.
444+ sub_dataset_name (Optional[str], optional): Name of the sub-dataset. Defaults to ``None``.
445+ split (str, optional): Split to load from the local dataset. Defaults to ``'test'``.
446+
447+ Raises:
448+ ValueError
449+
450+ Returns:
451+ datasets.DatasetDict: A dict of queries with id as key, query text as value.
452+ """
321453 checked_split = self .check_splits (split )
322454 if len (checked_split ) == 0 :
323455 raise ValueError (f"Split { split } not found in the dataset." )
0 commit comments