2 parents a022d02 + dccf362, commit 325be85
1 file changed
data/openwebtext/prepare.py
@@ -16,6 +16,8 @@
 # it is better than 1 usually though
 num_proc_load_dataset = num_proc
 
+enc = tiktoken.get_encoding("gpt2")
+
 if __name__ == '__main__':
     # takes 54GB in huggingface .cache dir, about 8M documents (8,013,769)
     dataset = load_dataset("openwebtext", num_proc=num_proc_load_dataset)
@@ -38,7 +40,6 @@
     # })
 
     # we now want to tokenize the dataset. first define the encoding function (gpt2 bpe)
-    enc = tiktoken.get_encoding("gpt2")
     def process(example):
         ids = enc.encode_ordinary(example['text']) # encode_ordinary ignores any special tokens
         ids.append(enc.eot_token) # add the end of text token, e.g. 50256 for gpt2 bpe
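The commit carries no message here, but the diff's effect is clear: `enc` moves from inside the `if __name__ == '__main__':` guard to module level. A likely motivation (an assumption, not stated in the diff) is the multiprocessing used by `dataset.map(..., num_proc=...)`: under the "spawn" start method (the default on Windows and macOS), worker processes re-import the module rather than inheriting the parent's memory, so only module-level names are visible to them, and a guard-local `enc` would raise NameError inside `process`. A minimal, self-contained sketch of that pattern follows; the file name `spawn_demo.py` and function `count_tokens` are hypothetical, not part of the commit.

# spawn_demo.py -- illustrative sketch, not part of the commit.
import multiprocessing as mp

import tiktoken

# Module level: re-executed when a "spawn" worker re-imports this file,
# so `enc` exists in every worker's globals.
enc = tiktoken.get_encoding("gpt2")

def count_tokens(text: str) -> int:
    # Resolves `enc` from module globals; works in the parent and in
    # spawn workers. If `enc` were bound under the __main__ guard
    # instead, spawn workers would raise NameError here.
    return len(enc.encode_ordinary(text))

if __name__ == '__main__':
    # Force "spawn" so the behavior matches the Windows/macOS defaults
    # even on Linux, where "fork" would mask the problem.
    mp.set_start_method("spawn", force=True)
    with mp.Pool(processes=2) as pool:
        print(pool.map(count_tokens, ["hello world", "tokenize me"]))

Under "fork", children inherit the parent's memory, so the pre-fix placement happens to work; moving the binding to module level makes the script correct under both start methods.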