
Commit 01e48ec

Merge pull request karpathy#240 from YassineYousfi/master
don't dropout in eval mode
2 parents: 7840a66 + 7399dfe

1 file changed: model.py (1 addition, 1 deletion)
@@ -69,7 +69,7 @@ def forward(self, x):
         # causal self-attention; Self-attend: (B, nh, T, hs) x (B, nh, hs, T) -> (B, nh, T, T)
         if self.flash:
             # efficient attention using Flash Attention CUDA kernels
-            y = torch.nn.functional.scaled_dot_product_attention(q, k, v, attn_mask=None, dropout_p=self.dropout, is_causal=True)
+            y = torch.nn.functional.scaled_dot_product_attention(q, k, v, attn_mask=None, dropout_p=self.dropout if self.training else 0, is_causal=True)
         else:
             # manual implementation of attention
             att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
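For context on why the gate is needed: torch.nn.functional.scaled_dot_product_attention is a stateless functional API, so unlike the nn.Dropout module it has no access to the owning module's train/eval state and applies dropout_p unconditionally, even at evaluation time. Gating on self.training restores the usual module semantics. A minimal sketch of the before/after behavior (the toy tensor shapes and the standalone training flag standing in for self.training are illustrative, not from the repo):

import torch
import torch.nn.functional as F

q = k = v = torch.randn(1, 4, 8, 16)  # (B, nh, T, hs), toy shapes

# Before the fix: the functional API applies dropout regardless of
# train/eval mode, so repeated "eval" calls disagree.
y1 = F.scaled_dot_product_attention(q, k, v, dropout_p=0.1, is_causal=True)
y2 = F.scaled_dot_product_attention(q, k, v, dropout_p=0.1, is_causal=True)
print(torch.equal(y1, y2))  # almost surely False: dropout still active

# After the fix: gate dropout on the module's training flag, mirroring
# what nn.Dropout does internally.
training = False  # stands in for self.training in CausalSelfAttention
p = 0.1 if training else 0.0
y1 = F.scaled_dot_product_attention(q, k, v, dropout_p=p, is_causal=True)
y2 = F.scaled_dot_product_attention(q, k, v, dropout_p=p, is_causal=True)
print(torch.equal(y1, y2))  # True: deterministic with dropout disabled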
