 import math
 import time
 import argparse
+import datasets
 from tqdm import tqdm
 from pprint import pprint
 from transformers import AutoTokenizer
@@ -54,8 +55,7 @@ def _map_func(examples):
     results['idx'] = []
     results['max_length'] = []
     for i in range(len(examples['query'])):
-        results['idx'].append(i)
-
+        idx = examples['idx'][i]
         query = examples['query'][i]
         pos, neg = examples['pos'][i], examples['neg'][i]
         all_texts = [query] + pos + neg
@@ -65,6 +65,8 @@ def _map_func(examples):
             tokenized_x = self.tokenizer(x)['input_ids']
             if len(tokenized_x) > max_len:
                 max_len = len(tokenized_x)
+
+        results['idx'].append(idx)
         results['max_length'].append(max_len)
     return results

@@ -120,8 +122,15 @@ def _process_file(self, file_path: str, output_path: str):
         dataset = load_dataset('json', data_files=file_path, cache_dir=self.cache_dir, features=features)['train']
     except:
         dataset = load_dataset('json', data_files=file_path, cache_dir=self.cache_dir, features=kd_features)['train']
-    mapped_dataset = dataset.map(self._map_func, batched=True, num_proc=self.num_proc)

+    dataset_with_idx_list = []
+    for i, data in enumerate(dataset):
+        data['idx'] = i
+        dataset_with_idx_list.append(data)
+    dataset_with_idx = datasets.Dataset.from_list(dataset_with_idx_list)
+
+    mapped_dataset = dataset_with_idx.map(self._map_func, batched=True, num_proc=self.num_proc)
+
     split_info_dict = {}
     for length_l, length_r in self.length_ranges_list:
         save_path = output_path + f'_len-{length_l}-{length_r}.jsonl'
@@ -130,7 +139,8 @@ def _process_file(self, file_path: str, output_path: str):
             continue

         idxs = mapped_dataset.filter(lambda x: length_l <= x['max_length'] < length_r, num_proc=self.num_proc)
-        split_dataset = dataset.select(idxs['idx'])
+        split_dataset = dataset_with_idx.select(idxs['idx'])
+        split_dataset = split_dataset.remove_columns('idx')

         split_info_dict[f'len-{length_l}-{length_r}'] = len(split_dataset)

0 commit comments