Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
49 changes: 49 additions & 0 deletions configs/qwen3-8b-dspark.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
{
"architectures": [
"DSparkDraftModel"
],
"attention_bias": false,
"attention_dropout": 0.0,
"auto_map": {
"AutoModel": "dspark.DSparkDraftModel"
},
"block_size": 16,
"bos_token_id": 151643,
"dflash_config": {
"mask_token_id": 151669,
"target_layer_ids": [1, 9, 17, 25, 33]
},
"dtype": "bfloat16",
"enable_confidence_head": true,
"confidence_head_with_markov": true,
"eos_token_id": 151645,
"head_dim": 128,
"hidden_act": "silu",
"hidden_size": 4096,
"initializer_range": 0.02,
"intermediate_size": 12288,
"layer_types": [
"full_attention",
"full_attention",
"full_attention",
"full_attention",
"full_attention"
],
"markov_head_type": "vanilla",
"markov_rank": 256,
"max_position_embeddings": 40960,
"max_window_layers": 5,
"model_type": "qwen3",
"num_attention_heads": 32,
"num_hidden_layers": 5,
"num_key_value_heads": 8,
"num_target_layers": 36,
"rms_norm_eps": 1e-06,
"rope_scaling": null,
"rope_theta": 1000000,
"sliding_window": null,
"tie_word_embeddings": false,
"use_cache": true,
"use_sliding_window": false,
"vocab_size": 151936
}
51 changes: 51 additions & 0 deletions examples/run_qwen3_8b_dspark_online.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
#!/bin/bash
# DSpark online training for Qwen3-8B.
#
# DSpark = DFlash block-diffusion drafter + EAGLE-style Markov & confidence heads,
# trained with cross-entropy + L1 distribution distillation + confidence BCE.
# The L1 / confidence losses need the target model's FINAL hidden state, so the
# target backend must surface it. The 'hf' backend (default below) always does;
# the 'sglang' backend does when its runner returns both the captured aux stream
# and the final hidden state. To train CE-only (no target final hidden state),
# pass: --l1-loss-alpha 0 --no-confidence-head --ce-loss-alpha 1.0
#
# Usage:
#   run_qwen3_8b_dspark_online.sh [NUM_GPUS] [ATTENTION_BACKEND] [TARGET_BACKEND]
#
# Positional arguments (all optional):
#   NUM_GPUS           value passed to torchrun --nproc_per_node (default: 8)
#   ATTENTION_BACKEND  value passed to --attention-backend (default: flex_attention)
#   TARGET_BACKEND     value passed to --target-model-backend (default: hf)

# Abort on the first failing command or unset variable so a broken path
# resolution never launches training against the wrong directories.
set -euo pipefail

# Resolve the repository root as the parent of the directory holding this script.
SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd)
ROOT_DIR=$(dirname -- "$SCRIPT_DIR")

export TORCHINDUCTOR_CACHE_DIR="$ROOT_DIR/cache/compiled_kernels"
export SPECFORGE_DATA_NUM_PROC=32

NUM_GPUS=${1:-8}
ATTENTION_BACKEND=${2:-flex_attention}
TARGET_BACKEND=${3:-hf}

# Every expansion is quoted so that a checkout path containing whitespace or
# glob characters is passed to torchrun as a single argument.
torchrun \
  --standalone \
  --nproc_per_node "$NUM_GPUS" \
  "$ROOT_DIR/scripts/train_dspark.py" \
  --target-model-path Qwen/Qwen3-8B \
  --draft-config-path "$ROOT_DIR/configs/qwen3-8b-dspark.json" \
  --train-data-path "$ROOT_DIR/cache/dataset/perfectblend_qwen3-8b_regen.jsonl" \
  --output-dir "$ROOT_DIR/outputs/qwen3-8b-dspark" \
  --num-epochs 6 \
  --batch-size 4 \
  --learning-rate 6e-4 \
  --warmup-ratio 0.04 \
  --max-grad-norm 1.0 \
  --max-length 3072 \
  --chat-template qwen \
  --attention-backend "$ATTENTION_BACKEND" \
  --loss-decay-gamma 4.0 \
  --log-interval 50 \
  --save-interval 1000 \
  --report-to wandb \
  --wandb-project specforge-qwen3-8b-dspark \
  --target-model-backend "$TARGET_BACKEND" \
  --block-size 16 \
  --num-anchors 512 \
  --markov-rank 256 \
  --enable-confidence-head \
  --confidence-head-with-markov \
  --ce-loss-alpha 0.1 \
  --l1-loss-alpha 0.9 \
  --confidence-head-alpha 1.0 \
  --wandb-name qwen3-8b-dspark-perfectblend
Loading
Loading