This is a [PyTorch](https://pytorch.org) implementation of *Sophia-G* from the paper
[Sophia: A Scalable Stochastic Second-order Optimizer for Language Model Pre-training](https://papers.labml.ai/paper/2305.14342).
+ The official implementation is available at [Liuhong99/Sophia](https://github.com/Liuhong99/Sophia).
+
+ Sophia is more adaptive to heterogeneous curvatures than Adam, more resistant to
+ non-convexity and rapid changes of the Hessian than Newton's method, and uses a
+ low-cost pre-conditioner.
+
+ Sophia keeps an exponential moving average (EMA) of diagonal Hessian estimates across iterations.
+ The diagonal Hessian estimate $\hat{h}_t$ is calculated every $k$ steps.
+
+ \begin{align}
+ h_t = \beta_2 h_{t-k} + (1 - \beta_2) \hat{h}_t \ \ \ \ \text{ if } t \text{ mod } k = 1; \text{ else } h_t = h_{t-1}
+ \end{align}
+
+ Sophia uses an EMA of the gradients $m_t$, considers only positive entries of
+ the diagonal Hessian, and applies per-coordinate clipping to the update.
+
+ \begin{align}
+ m_t &\leftarrow \beta_1 m_{t-1} + (1 - \beta_1) g_t \\
+ \theta_{t + 1} &\leftarrow \theta_t - \eta \cdot \operatorname{clip} \bigg(\frac{m_t}{\max \{h_t, \epsilon \}}, \rho \bigg)
+ \end{align}
+
+ where $\epsilon$ is a very small value that prevents division by $0$.
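+
+ As a rough illustration (a sketch only, not the implementation below), the clipping is a
+ per-coordinate clamp of $\frac{m_t}{\max \{h_t, \epsilon \}}$ to $[-\rho, \rho]$. Here `m`, `h`,
+ `lr` and `rho` are assumed to be already computed, and `lr` plays the role of $\eta$:
+
+ ```python
+ import torch
+
+ def sophia_update(param: torch.Tensor, m: torch.Tensor, h: torch.Tensor,
+                   lr: float, rho: float, eps: float = 1e-12):
+     # clip(m / max(h, eps), rho), applied per coordinate
+     update = (m / torch.clamp(h, min=eps)).clamp(-rho, rho)
+     # Step by -lr along the clipped direction
+     param.data.add_(update, alpha=-lr)
+ ```
+
+ The `Sophia` class below implements an equivalent form in which its `lr` parameter is
+ the maximum per-coordinate step $\eta \rho$.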
+
+ ### Gauss-Newton-Bartlett (GNB) estimator
+
+ \begin{align}
+ \hat{L}(\theta) &= \frac{1}{B} \sum^{B}_{b=1} \ell_{CE} \big( f(\theta, x_b), \hat{y}_b \big) \\
+ \hat{h}_t &= B \cdot \nabla_\theta \hat{L} (\theta) \odot \nabla_\theta \hat{L} (\theta)
+ \end{align}
+
+ where $x_b$ are the inputs,
+ $B$ is the batch size (number of inputs/tokens),
+ $\ell_{CE}$ is the cross entropy loss, and
+ $\hat{y}_b$ are labels sampled from the logits $f(\theta, x_b)$.
+
+ Note that this Hessian estimate is always non-negative, so we
+ can replace $\max \{h_t, \epsilon \}$ with $h_t + \epsilon$.
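+
+ A minimal self-contained sketch of this estimate for a toy linear model (the names and
+ shapes are illustrative assumptions, not part of this implementation):
+
+ ```python
+ import torch
+ import torch.nn.functional as F
+
+ w = torch.randn(16, 8, requires_grad=True)  # parameters $\theta$
+ x = torch.randn(32, 16)                     # $B = 32$ inputs/tokens
+ logits = x @ w                              # $f(\theta, x_b)$
+
+ # Sample labels $\hat{y}_b$ from the model's own logits
+ y_hat = torch.distributions.Categorical(logits=logits).sample()
+
+ # Mini-batch cross entropy loss on the sampled labels, and its gradient
+ loss = F.cross_entropy(logits, y_hat)
+ (g,) = torch.autograd.grad(loss, [w])
+
+ # GNB estimate of the Hessian diagonal: $B \cdot g \odot g$ (elementwise, hence never negative)
+ h_hat = x.shape[0] * g * g
+ ```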
+
+ Sophia with the Gauss-Newton-Bartlett (GNB) estimator is called **Sophia-G**.
+
+ Here is an [experiment](../transformers/basic/with_sophia.html) that uses Sophia-G to train a transformer.
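+
+ A rough, illustrative usage sketch (not taken from that experiment; `model`, the `(x, y)`
+ batch iterator `data`, and the update interval `k` are assumptions):
+
+ ```python
+ import torch
+ import torch.nn.functional as F
+
+ optimizer = Sophia(model.parameters(), lr=1e-4, betas=(0.9, 0.95), rho=0.03)
+ k = 10  # re-estimate the Hessian diagonal every k steps
+
+ for step, (x, y) in enumerate(data):
+     # Standard training step on the true labels
+     logits = model(x)
+     loss = F.cross_entropy(logits.view(-1, logits.shape[-1]), y.view(-1))
+     optimizer.zero_grad()
+     loss.backward()
+     optimizer.step()
+
+     # GNB Hessian estimate on labels sampled from the model's own logits
+     if step % k == k - 1:
+         logits = model(x)
+         y_hat = torch.distributions.Categorical(logits=logits).sample()
+         gnb_loss = F.cross_entropy(logits.view(-1, logits.shape[-1]), y_hat.view(-1))
+         optimizer.zero_grad()
+         gnb_loss.backward()
+         optimizer.update_hessian(n_tokens_training_batch=y_hat.numel())
+         optimizer.zero_grad()
+ ```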
1152"""
1253
1354from typing import Dict , Any , Tuple , Optional
@@ -27,15 +68,15 @@ class Sophia(GenericAdaptiveOptimizer):
2768 """
2869
2970 def __init__ (self , params ,
30- lr : float = 1e-4 , betas : Tuple [float , float ] = (0.965 , 0.99 ), eps : float = 1e-16 ,
31- rho : float = 0.04 ,
71+ lr : float = 1e-4 , betas : Tuple [float , float ] = (0.9 , 0.95 ), eps : float = 1e-12 ,
72+ rho : float = 0.03 ,
3273 weight_decay : WeightDecay = WeightDecay (),
3374 defaults : Optional [Dict [str , Any ]] = None ):
3475 """
3576 ### Initialize the optimizer
3677
3778 * `params` is the list of parameters
38- * `lr` is the learning rate $\a lpha $
79+ * `lr` is the maximum learning rate $\eta \r ho $
3980 * `betas` is a tuple of ($\b eta_1$, $\b eta_2$)
4081 * `eps` is $\epsilon$
4182 * `pho` is $\r ho$
@@ -61,23 +102,46 @@ def init_state(self, state: Dict[str, any], group: Dict[str, any], param: nn.Par
        # This is the number of optimizer steps taken on the parameter, $t$
        state['step'] = 0
-         # state['hessian_updates']
        # Exponential moving average of gradients, $m_t$
        state['exp_avg'] = torch.zeros_like(param, memory_format=torch.preserve_format)
-         # Exponential moving average of Hessian
+         # Exponential moving average of Hessian diagonal, $h_t$
        state['hessian'] = torch.zeros_like(param, memory_format=torch.preserve_format)

    def update_hessian(self, n_tokens_training_batch):
+         """
+         ### Update the EMA of Hessian diagonal $h_t$
+
+         * `n_tokens_training_batch` is the number of tokens/inputs in the batch $B$
+
+         \begin{align}
+         \hat{h}_t &= B \cdot \nabla_\theta \hat{L} (\theta) \odot \nabla_\theta \hat{L} (\theta) \\
+         h_t &= \beta_2 h_{t-k} + (1 - \beta_2) \hat{h}_t
+         \end{align}
+         """
+
+         # Iterate through parameter groups
        for group in self.param_groups:
-             beta1, beta2 = group['betas']
+             # $\beta_2$
+             _, beta2 = group['betas']
+             # Iterate through parameters
            for p in group['params']:
+                 # Skip parameters without gradients
                if p.grad is None:
                    continue
+
+                 # Get optimizer state
                state = self.state[p]

+                 # Initialize state if empty
                if len(state) == 0:
                    self.init_state(state, group, p)

+                 # Update EMA Hessian diagonal
+                 #
+                 # \begin{align}
+                 # \hat{h}_t &= B \cdot \nabla_\theta \hat{L} (\theta) \odot \nabla_\theta \hat{L} (\theta) \\
+                 # h_t &= \beta_2 h_{t-k} + (1 - \beta_2) \hat{h}_t
+                 # \end{align}
                state['hessian'].mul_(beta2).addcmul_(p.grad, p.grad, value=(1 - beta2) * n_tokens_training_batch)

    def step_param(self, state: Dict[str, any], group: Dict[str, any], grad: torch.Tensor, param: torch.nn.Parameter):
@@ -88,17 +152,24 @@ def step_param(self, state: Dict[str, any], group: Dict[str, any], grad: torch.T
        * `group` stores optimizer attributes of the parameter group
        * `grad` is the current gradient tensor $g_t$ for the parameter $\theta_{t-1}$
        * `param` is the parameter tensor $\theta_{t-1}$
+
+         We make the following parameter update,
+
+         \begin{align}
+         \theta_{t + 1} &\leftarrow \theta_t - \eta \cdot \operatorname{clip} \bigg(\frac{m_t}{h_t + \epsilon}, \rho \bigg) \\
+         \theta_{t + 1} &\leftarrow \theta_t - \eta \rho \cdot \operatorname{clip} \bigg(\frac{m_t}{\rho h_t + \epsilon}, 1 \bigg)
+         \end{align}
+
+         The second form is equivalent to the first up to a rescaling of $\epsilon$,
+         since $\operatorname{clip}(u, \rho) = \rho \cdot \operatorname{clip}(u / \rho, 1)$;
+         it is the form implemented here, with `lr` holding $\eta \rho$.
        """

        # Calculate weight decay
        grad = self.weight_decay(param, grad, group)

        # Get $\beta_1$ and $\beta_2$
        beta1, beta2 = group['betas']
-
+         # Get $\rho$
        rho = group['rho']

-         # Get $m_{t-1}$ and $v_{t-1}$
+         # Get $m_{t-1}$ and $h_t$
        m, hessian = state['exp_avg'], state['hessian']

        # In-place calculation of $m_t$
@@ -108,9 +179,11 @@ def step_param(self, state: Dict[str, any], group: Dict[str, any], grad: torch.T
        # Increment $t$ the number of optimizer steps
        state['step'] += 1

-         # Get learning rate
+         # Get maximum learning rate $\eta \rho$
        lr = group['lr']

-         ratio = (m.abs() / (rho * hessian + group['eps'])).clamp(None, 1)
+         # $$\operatorname{clip} \bigg(\frac{m_t}{\rho h_t + \epsilon}, 1 \bigg)$$
+         ratio = (m / (rho * hessian + group['eps'])).clamp(-1, 1)

-         param.data.addcmul_(m.sign(), ratio, value=-lr)
+         # $$\theta_{t + 1} \leftarrow \theta_t - \eta \rho \cdot \operatorname{clip} \bigg(\frac{m_t}{\rho h_t + \epsilon}, 1 \bigg)$$
+         param.data.add_(ratio, alpha=-lr)