Skip to content

Commit eeb44dd

Browse files
committed
update msmarco eval
1 parent cfbf506 commit eeb44dd

1 file changed

Lines changed: 24 additions & 10 deletions

File tree

FlagEmbedding/evaluation/msmarco/data_loader.py

Lines changed: 24 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -45,19 +45,33 @@ def _load_remote_corpus(
4545
corpus_dict = {}
4646
with open(save_path, "w", encoding="utf-8") as f:
4747
for data in tqdm(corpus, desc="Loading and Saving corpus"):
48-
_data = {
49-
"id": data["docid"],
50-
"title": data["title"],
51-
"text": data.get("text", data.get("body", ""))
52-
}
53-
corpus_dict[data["docid"]] = {
54-
"title": data["title"],
55-
"text": data.get("text", data.get("body", ""))
56-
}
48+
if dataset_name == 'passage':
49+
_data = {
50+
"id": data["docid"],
51+
"title": data["title"],
52+
"text": data["text"]
53+
}
54+
corpus_dict[data["docid"]] = {
55+
"title": data["title"],
56+
"text": data["text"]
57+
}
58+
else:
59+
_data = {
60+
"id": data["doc_id"],
61+
"title": data["title"],
62+
"text": data["body"]
63+
}
64+
corpus_dict[data["doc_id"]] = {
65+
"title": data["title"],
66+
"text": data["body"]
67+
}
5768
f.write(json.dumps(_data, ensure_ascii=False) + "\n")
5869
logging.info(f"{self.eval_name} {dataset_name} corpus saved to {save_path}")
5970
else:
60-
corpus_dict = {data["docid"]: {"title": data["title"], "text": data.get("text", data.get("body", ""))} for data in tqdm(corpus, desc="Loading corpus")}
71+
if dataset_name == 'passage':
72+
corpus_dict = {data["docid"]: {"title": data["title"], "text": data["text"]} for data in tqdm(corpus, desc="Loading corpus")}
73+
else:
74+
corpus_dict = {data["doc_id"]: {"title": data["title"], "text": data["body"]} for data in tqdm(corpus, desc="Loading corpus")}
6175
return datasets.DatasetDict(corpus_dict)
6276

6377
def _load_remote_qrels(

0 commit comments

Comments (0)