Fix np.sum overflow on Windows by summing with dtype=np.uint64

LaihoE · LaihoE · commit 6649b299ebba · 2023-05-09T16:36:59.000+03:00
diff --git a/data/openwebtext/prepare.py b/data/openwebtext/prepare.py
@@ -50,7 +50,7 @@ def process(example):
 
 # concatenate all the ids in each dataset into one large file we can use for training
 for split, dset in tokenized.items():
-    arr_len = np.sum(dset['len'])
+    arr_len = np.sum(dset['len'], dtype=np.uint64)
     filename = os.path.join(os.path.dirname(__file__), f'{split}.bin')
     dtype = np.uint16 # (can do since enc.max_token_value == 50256 is < 2**16)
     arr = np.memmap(filename, dtype=dtype, mode='w+', shape=(arr_len,))