@@ -45,19 +45,33 @@ def _load_remote_corpus(
4545 corpus_dict = {}
4646 with open (save_path , "w" , encoding = "utf-8" ) as f :
4747 for data in tqdm (corpus , desc = "Loading and Saving corpus" ):
48- _data = {
49- "id" : data ["docid" ],
50- "title" : data ["title" ],
51- "text" : data .get ("text" , data .get ("body" , "" ))
52- }
53- corpus_dict [data ["docid" ]] = {
54- "title" : data ["title" ],
55- "text" : data .get ("text" , data .get ("body" , "" ))
56- }
48+ if dataset_name == 'passage' :
49+ _data = {
50+ "id" : data ["docid" ],
51+ "title" : data ["title" ],
52+ "text" : data ["text" ]
53+ }
54+ corpus_dict [data ["docid" ]] = {
55+ "title" : data ["title" ],
56+ "text" : data ["text" ]
57+ }
58+ else :
59+ _data = {
60+ "id" : data ["doc_id" ],
61+ "title" : data ["title" ],
62+ "text" : data ["body" ]
63+ }
64+ corpus_dict [data ["doc_id" ]] = {
65+ "title" : data ["title" ],
66+ "text" : data ["body" ]
67+ }
5768 f .write (json .dumps (_data , ensure_ascii = False ) + "\n " )
5869 logging .info (f"{ self .eval_name } { dataset_name } corpus saved to { save_path } " )
5970 else :
60- corpus_dict = {data ["docid" ]: {"title" : data ["title" ], "text" : data .get ("text" , data .get ("body" , "" ))} for data in tqdm (corpus , desc = "Loading corpus" )}
71+ if dataset_name == 'passage' :
72+ corpus_dict = {data ["docid" ]: {"title" : data ["title" ], "text" : data ["text" ]} for data in tqdm (corpus , desc = "Loading corpus" )}
73+ else :
74+ corpus_dict = {data ["doc_id" ]: {"title" : data ["title" ], "text" : data ["body" ]} for data in tqdm (corpus , desc = "Loading corpus" )}
6175 return datasets .DatasetDict (corpus_dict )
6276
6377 def _load_remote_qrels (
0 commit comments