We read every piece of feedback, and take your input very seriously.
To see all available qualifiers, see our documentation.
1 parent 7fe4a09, commit 6649b29 (Copy full SHA for 6649b29)
1 file changed
data/openwebtext/prepare.py
@@ -50,7 +50,7 @@ def process(example):
50
51    # concatenate all the ids in each dataset into one large file we can use for training
52    for split, dset in tokenized.items():
53 -      arr_len = np.sum(dset['len'])
53 +      arr_len = np.sum(dset['len'], dtype=np.uint64)
54        filename = os.path.join(os.path.dirname(__file__), f'{split}.bin')
55        dtype = np.uint16 # (can do since enc.max_token_value == 50256 is < 2**16)
56        arr = np.memmap(filename, dtype=dtype, mode='w+', shape=(arr_len,))
0 commit comments