config/train_gpt2.py:

 # 12 batch size * 1024 block size * 5 gradaccum * 8 GPUs = 491,520
 batch_size = 12
 block_size = 1024
-gradient_accumulation_steps = 5
+gradient_accumulation_steps = 5 * 8

 # this makes total number of tokens be 300B
 max_iters = 600000
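Under the new convention, `gradient_accumulation_steps` counts the total micro-steps across all GPUs, so the tokens-per-iteration comment above works out as follows. A minimal sketch of that arithmetic (variable names mirror the config; `tokens_per_iter` is illustrative, not part of the diff):

```python
# Tokens seen per optimizer step, with gradient_accumulation_steps now
# meaning the total across all processes rather than per GPU.
batch_size = 12                      # micro-batch size per forward/backward pass
block_size = 1024                    # sequence length
gradient_accumulation_steps = 5 * 8  # 40 micro-steps in total

tokens_per_iter = gradient_accumulation_steps * batch_size * block_size
print(tokens_per_iter)  # 40 * 12 * 1024 = 491,520, i.e. ~0.5M tokens
```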
config/train_shakespeare_char.py:

 wandb_run_name = 'mini-gpt'

 dataset = 'shakespeare_char'
+gradient_accumulation_steps = 1
 batch_size = 64
 block_size = 256 # context of up to 256 previous characters
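Pinning `gradient_accumulation_steps = 1` here matters because configs layer on top of the defaults in train.py, where the value is now 5 * 8 = 40; without the override, the tiny single-GPU character-level run would accumulate 40 micro-steps per iteration. A quick check of the resulting iteration size (illustrative, not part of the diff):

```python
# With the override, one iteration of the shakespeare_char run is:
gradient_accumulation_steps = 1
batch_size = 64
block_size = 256

tokens_per_iter = gradient_accumulation_steps * batch_size * block_size
print(tokens_per_iter)  # 1 * 64 * 256 = 16,384 tokens
```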
train.py:

 wandb_run_name = 'gpt2' # 'run' + str(time.time())
 # data
 dataset = 'openwebtext'
-gradient_accumulation_steps = 5 # used to simulate larger batch sizes
+gradient_accumulation_steps = 5 * 8 # used to simulate larger batch sizes
 batch_size = 12 # if gradient_accumulation_steps > 1, this is the micro-batch size
 block_size = 1024
 # model
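This is the same default change as in config/train_gpt2.py: the optimizer-level batch is now stated once, in total, while `batch_size` stays the per-forward micro-batch that bounds GPU memory. A hypothetical illustration of the two quantities:

```python
# Optimizer-level batch vs. per-forward micro-batch (values from the defaults above).
gradient_accumulation_steps = 5 * 8  # 40 micro-steps per optimizer step, in total
batch_size = 12                      # sequences per forward/backward pass

effective_batch = gradient_accumulation_steps * batch_size
print(effective_batch)  # 480 sequences per optimizer step; per-GPU memory still scales with 12
```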
train.py:

     torch.cuda.set_device(device)
     master_process = ddp_rank == 0 # this process will do logging, checkpointing etc.
     seed_offset = ddp_rank # each process gets a different seed
+    assert gradient_accumulation_steps % torch.cuda.device_count() == 0
+    gradient_accumulation_steps //= torch.cuda.device_count()
 else:
     # if not ddp, we are running on a single gpu, and one process
     master_process = True
     seed_offset = 0
-    gradient_accumulation_steps *= 8 # simulate 8 gpus

 if master_process:
     os.makedirs(out_dir, exist_ok=True)
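With the division moved into the DDP branch, both launch modes consume the same total batch. A minimal sketch of the new flow, with hypothetical stand-in values (the real code derives `ddp` from the environment and uses `torch.cuda.device_count()` as above):

```python
# How the total accumulation count is split across DDP processes.
gradient_accumulation_steps = 5 * 8  # total across all GPUs (new convention)

ddp = True       # hypothetical: launched with torchrun on 8 GPUs
num_gpus = 8     # stands in for torch.cuda.device_count()

if ddp:
    # the total must divide evenly so each rank runs an integer number of micro-steps
    assert gradient_accumulation_steps % num_gpus == 0
    gradient_accumulation_steps //= num_gpus  # 40 // 8 = 5 micro-steps per rank
# a single-GPU run keeps all 40 micro-steps

print(gradient_accumulation_steps)
```

Either way, each optimizer step still covers 40 * 12 * 1024 = 491,520 tokens, which is why the removed `gradient_accumulation_steps *= 8` fudge in the single-GPU branch is no longer needed.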