sgl-project · maocheng23 · Jun 28, 2026 · Jun 29, 2026
@@ -0,0 +1,49 @@
+{
+  "architectures": [
+    "DSparkDraftModel"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "auto_map": {
+    "AutoModel": "dspark.DSparkDraftModel"
+  },
+  "block_size": 16,
+  "bos_token_id": 151643,
+  "dflash_config": {
+    "mask_token_id": 151669,
+    "target_layer_ids": [1, 9, 17, 25, 33]
+  },
+  "dtype": "bfloat16",
+  "enable_confidence_head": true,
+  "confidence_head_with_markov": true,
+  "eos_token_id": 151645,
+  "head_dim": 128,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "initializer_range": 0.02,
+  "intermediate_size": 12288,
+  "layer_types": [
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention"
+  ],
+  "markov_head_type": "vanilla",
+  "markov_rank": 256,
+  "max_position_embeddings": 40960,
+  "max_window_layers": 5,
+  "model_type": "qwen3",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 5,
+  "num_key_value_heads": 8,
+  "num_target_layers": 36,
+  "rms_norm_eps": 1e-06,
+  "rope_scaling": null,
+  "rope_theta": 1000000,
+  "sliding_window": null,
+  "tie_word_embeddings": false,
+  "use_cache": true,
+  "use_sliding_window": false,
+  "vocab_size": 151936
+}
@@ -0,0 +1,51 @@
+#!/bin/bash
+# DSpark online training for Qwen3-8B.
+#
+# DSpark = DFlash block-diffusion drafter + EAGLE-style Markov & confidence heads,
+# trained with cross-entropy + L1 distribution distillation + confidence BCE.
+# The L1 / confidence losses need the target model's FINAL hidden state, so the
+# target backend must surface it. The 'hf' backend (default below) always does;
+# the 'sglang' backend does when its runner returns both the captured aux stream
+# and the final hidden state. To train CE-only (no target final hidden state),
+# edit the torchrun flags below (extra CLI arguments are not forwarded) to use:
+#   --l1-loss-alpha 0 --no-confidence-head --ce-loss-alpha 1.0
+
+SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
+ROOT_DIR=$(dirname $SCRIPT_DIR)
+export TORCHINDUCTOR_CACHE_DIR=$ROOT_DIR/cache/compiled_kernels
+export SPECFORGE_DATA_NUM_PROC=32
+NUM_GPUS=${1:-8}
+
+ATTENTION_BACKEND=${2:-flex_attention}
+TARGET_BACKEND=${3:-hf}
+
+# Launch single-node training with torchrun, one process per requested GPU.
+# Every variable expansion is quoted so that a checkout path or argument
+# containing spaces or glob characters is passed as exactly one argument.
+torchrun \
+    --standalone \
+    --nproc_per_node "$NUM_GPUS" \
+    "$ROOT_DIR/scripts/train_dspark.py" \
+    --target-model-path Qwen/Qwen3-8B \
+    --draft-config-path "$ROOT_DIR/configs/qwen3-8b-dspark.json" \
+    --train-data-path "$ROOT_DIR/cache/dataset/perfectblend_qwen3-8b_regen.jsonl" \
+    --output-dir "$ROOT_DIR/outputs/qwen3-8b-dspark" \
+    --num-epochs 6 \
+    --batch-size 4 \
+    --learning-rate 6e-4 \
+    --warmup-ratio 0.04 \
+    --max-grad-norm 1.0 \
+    --max-length 3072 \
+    --chat-template qwen \
+    --attention-backend "$ATTENTION_BACKEND" \
+    --loss-decay-gamma 4.0 \
+    --log-interval 50 \
+    --save-interval 1000 \
+    --report-to wandb \
+    --wandb-project specforge-qwen3-8b-dspark \
+    --target-model-backend "$TARGET_BACKEND" \
+    --block-size 16 \
+    --num-anchors 512 \
+    --markov-rank 256 \
+    --enable-confidence-head \
+    --confidence-head-with-markov \
+    --ce-loss-alpha 0.1 \
+    --l1-loss-alpha 0.9 \
+    --confidence-head-alpha 1.0 \
+    --wandb-name qwen3-8b-dspark-perfectblend