
Commit b06814e

update new code for activation beacon
1 parent 70cb0f0 commit b06814e

36 files changed

Lines changed: 94672 additions & 2 deletions

Long_LLM/activation_beacon/README.md

Lines changed: 9 additions & 0 deletions
@@ -19,6 +19,15 @@ This is the codebase for Activation Beacon, an effective, efficient, compatible,
 - train with 80000 texts within 9 hours;
 - most training samples are shorter than 4096.
 
+## Note
+Activation Beacon is a working project. We have released newer code in the [new folder](./new/), which supports:
+- deepspeed-3 training
+- fine-tuning and evaluating with chat templates
+- needle-in-a-haystack evaluation
+
+You can use the code there if you're interested. The code in this folder will be deprecated in the future.
+
 ## Environment
 The main dependencies are:
 ```

Long_LLM/activation_beacon/docs/training.md

Lines changed: 1 addition & 1 deletion
@@ -1,4 +1,4 @@
-# Evaluation
+# Training
 
 ## Environment
 The main dependencies are:
Lines changed: 76 additions & 0 deletions
@@ -0,0 +1,76 @@
# Activation-Beacon

This folder contains the newer code for Activation Beacon, with support for deepspeed-3 training. This project is under development and subject to change in the future.

## Environment
The main dependencies are:
```
pytorch==2.1.2 transformers==4.36.1 accelerate==0.25.0 datasets==2.14.7 numpy==1.26.2 flash-attn==2.4.2
```
You can install our environment with:
```bash
conda env create -f environment.yaml --name activation-beacon
```
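As an optional sanity check (our own snippet, not part of the repo), you can verify that the installed versions match the list above:

```python
# Optional sanity check (not from the repo): print the versions of the main
# dependencies and compare them against the pinned versions listed above.
import torch, transformers, accelerate, datasets, numpy, flash_attn

for name, module in [
    ("pytorch", torch),
    ("transformers", transformers),
    ("accelerate", accelerate),
    ("datasets", datasets),
    ("numpy", numpy),
    ("flash-attn", flash_attn),
]:
    print(f"{name}=={module.__version__}")
# expected: 2.1.2, 4.36.1, 0.25.0, 2.14.7, 1.26.2, 2.4.2
```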
## Data
You should download the data for fine-tuning & evaluation, then untar the file anywhere you prefer, e.g. `/data`, which results in a folder `/data/activation-beacon`:
```bash
# feel free to change /data to your preferred location
wget https://huggingface.co/datasets/namespace-Pt/projects/resolve/main/activation-beacon.tar.gz?download=true -O /data/activation-beacon.tar.gz

cd /data
tar -xzvf activation-beacon.tar.gz
```

**IMPORTANT NOTE**
- For any path specified for `train_data` and `eval_data`: if it is prefixed with `activation-beacon:`, it will be resolved relative to [`data_root`](../src/args.py) (see the sketch below).
  - e.g. `activation-beacon:lm/pg19.json` becomes `${data_root}/lm/pg19.json`
- You can modify the default value of [`data_root`](../src/args.py) so that you don't need to type it for each command.
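The following is a hypothetical sketch of the resolution rule described above (the function name and default `data_root` are ours; the actual logic lives in [`src/args.py`](../src/args.py)):

```python
import os

# Hypothetical default for illustration; the real default is defined in src/args.py.
DATA_ROOT = "/data/activation-beacon"

def resolve_data_path(path: str, data_root: str = DATA_ROOT) -> str:
    """Resolve an 'activation-beacon:' prefixed path against data_root."""
    prefix = "activation-beacon:"
    if path.startswith(prefix):
        return os.path.join(data_root, path[len(prefix):])
    return path

# e.g. resolve_data_path("activation-beacon:lm/pg19.json")
# -> "/data/activation-beacon/lm/pg19.json"
```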
## Command
```bash
cd new

torchrun --nproc_per_node 8 -m main.train \
--output_dir data/outputs/activation-beacon-llama2-chat-7b \
--model_name_or_path meta-llama/Llama-2-7b-chat-hf \
--train_data activation-beacon:pretrain/redpajama-sample.json activation-beacon:finetune/longalpaca.json \
--max_length 8192 \
--min_length 1200 \
--max_train_num_per_data 200000 \
--num_train_epochs 1 \
--enable_beacon \
--beacon_window 1024 \
--beacon_stride 1024 \
--beacon_attn step-expansion \
--beacon_sink_size 1 \
--beacon_ratio 2 4 8 16 32 64 128 \
--beacon_ratio_mix step-random \
--beacon_param q k v o \
--gradient_checkpointing \
--save_strategy steps \
--max_steps 10000 \
--save_steps 10000 \
--logging_steps 50 \
--chat_template llama-2 \
--group_by_stride strict \
--deepspeed data/deepspeed/stage3.json


# Evaluation
for model in data/outputs/activation-beacon-llama2-chat-7b/*
do
COMMAND="--beacon_sink_size 1"

# 100K perplexity
torchrun --nproc_per_node 8 -m main.eval_lm --model_name_or_path $model --max_length 100000 --beacon_ratio 32 --min_length 400000 --enable_beacon --stride 0 $COMMAND
# 400K perplexity
torchrun --nproc_per_node 8 -m main.eval_lm --model_name_or_path $model --max_length 400000 --beacon_ratio 128 --min_length 400000 --enable_beacon --stride 0 $COMMAND
# LongBench
torchrun --nproc_per_node 8 -m main.eval_longbench --model_name_or_path $model --max_length 15500 --enable_beacon $COMMAND
# Topic Retrieval
torchrun --nproc_per_node 8 -m main.eval_longeval --model_name_or_path $model --enable_beacon $COMMAND
done
```
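As a rough, back-of-the-envelope illustration of what these settings imply (our own sketch, assuming `beacon_ratio` is the condensing ratio of raw tokens to beacon activations and `beacon_window = beacon_stride = 1024` as above):

```python
# Back-of-the-envelope sketch (ours, not repo code): with condensing ratio r,
# each past 1024-token window is condensed into 1024 / r beacon activations,
# so the cache for a long context shrinks roughly by a factor of r.
def approx_cached_activations(context_len: int, window: int = 1024, ratio: int = 32) -> int:
    past_windows = max(context_len - window, 0) // window   # windows already condensed
    return past_windows * (window // ratio) + window        # plus the current raw window

print(approx_cached_activations(100_000, ratio=32))    # 4096 entries instead of ~100K
print(approx_cached_activations(400_000, ratio=128))   # 4136 entries instead of ~400K
```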
Lines changed: 45 additions & 0 deletions
@@ -0,0 +1,45 @@
{
    "fp16": {
        "enabled": "auto",
        "loss_scale": 0,
        "loss_scale_window": 1000,
        "initial_scale_power": 16,
        "hysteresis": 2,
        "min_loss_scale": 1
    },

    "bf16": {
        "enabled": "auto"
    },

    "optimizer": {
        "type": "AdamW",
        "params": {
            "lr": "auto",
            "betas": "auto",
            "eps": "auto",
            "weight_decay": "auto"
        }
    },

    "scheduler": {
        "type": "WarmupDecayLR",
        "params": {
            "warmup_min_lr": "auto",
            "warmup_max_lr": "auto",
            "warmup_num_steps": "auto",
            "total_num_steps": "auto"
        }
    },

    "zero_optimization": {
        "stage": 0
    },

    "gradient_accumulation_steps": "auto",
    "gradient_clipping": "auto",
    "steps_per_print": 100,
    "train_batch_size": "auto",
    "train_micro_batch_size_per_gpu": "auto",
    "wall_clock_breakdown": false
}
Lines changed: 45 additions & 0 deletions
@@ -0,0 +1,45 @@
{
    "fp16": {
        "enabled": "auto",
        "loss_scale": 0,
        "loss_scale_window": 1000,
        "initial_scale_power": 16,
        "hysteresis": 2,
        "min_loss_scale": 1
    },

    "bf16": {
        "enabled": "auto"
    },

    "optimizer": {
        "type": "AdamW",
        "params": {
            "lr": "auto",
            "betas": "auto",
            "eps": "auto",
            "weight_decay": "auto"
        }
    },

    "scheduler": {
        "type": "WarmupDecayLR",
        "params": {
            "warmup_min_lr": "auto",
            "warmup_max_lr": "auto",
            "warmup_num_steps": "auto",
            "total_num_steps": "auto"
        }
    },

    "zero_optimization": {
        "stage": 1
    },

    "gradient_accumulation_steps": "auto",
    "gradient_clipping": "auto",
    "steps_per_print": 100,
    "train_batch_size": "auto",
    "train_micro_batch_size_per_gpu": "auto",
    "wall_clock_breakdown": false
}
Lines changed: 43 additions & 0 deletions
@@ -0,0 +1,43 @@
{
    "optimizer": {
        "type": "AdamW",
        "params": {
            "lr": "auto",
            "betas": "auto",
            "eps": "auto",
            "weight_decay": "auto"
        }
    },
    "scheduler": {
        "type": "WarmupDecayLR",
        "params": {
            "total_num_steps": "auto",
            "warmup_min_lr": "auto",
            "warmup_max_lr": "auto",
            "warmup_num_steps": "auto"
        }
    },
    "bf16": {
        "enabled": "auto",
        "loss_scale": 0,
        "initial_scale_power": 16,
        "loss_scale_window": 1000,
        "hysteresis": 2,
        "min_loss_scale": 1
    },
    "zero_optimization": {
        "stage": 2,
        "allgather_partitions": true,
        "allgather_bucket_size": 1e9,
        "reduce_scatter": true,
        "reduce_bucket_size": 1e9,
        "overlap_comm": true,
        "contiguous_gradients": true
    },
    "gradient_accumulation_steps": "auto",
    "gradient_clipping": "auto",
    "steps_per_print": 2000,
    "train_batch_size": "auto",
    "train_micro_batch_size_per_gpu": "auto",
    "wall_clock_breakdown": false
}
Lines changed: 47 additions & 0 deletions
@@ -0,0 +1,47 @@
{
    "optimizer": {
        "type": "AdamW",
        "params": {
            "lr": "auto",
            "betas": "auto",
            "eps": "auto",
            "weight_decay": "auto"
        }
    },
    "scheduler": {
        "type": "WarmupDecayLR",
        "params": {
            "total_num_steps": "auto",
            "warmup_min_lr": "auto",
            "warmup_max_lr": "auto",
            "warmup_num_steps": "auto"
        }
    },
    "bf16": {
        "enabled": "auto",
        "loss_scale": 0,
        "initial_scale_power": 16,
        "loss_scale_window": 1000,
        "hysteresis": 2,
        "min_loss_scale": 1
    },
    "zero_optimization": {
        "stage": 2,
        "allgather_partitions": true,
        "allgather_bucket_size": 1e8,
        "reduce_scatter": true,
        "reduce_bucket_size": 1e8,
        "overlap_comm": true,
        "contiguous_gradients": true,
        "offload_optimizer": {
            "device": "cpu"
        },
        "round_robin_gradients": true
    },
    "gradient_accumulation_steps": "auto",
    "gradient_clipping": "auto",
    "steps_per_print": 2000,
    "train_batch_size": "auto",
    "train_micro_batch_size_per_gpu": "auto",
    "wall_clock_breakdown": false
}
Lines changed: 43 additions & 0 deletions
@@ -0,0 +1,43 @@
{
    "optimizer": {
        "type": "AdamW",
        "params": {
            "lr": "auto",
            "betas": "auto",
            "eps": "auto",
            "weight_decay": "auto"
        }
    },
    "scheduler": {
        "type": "WarmupDecayLR",
        "params": {
            "total_num_steps": "auto",
            "warmup_min_lr": "auto",
            "warmup_max_lr": "auto",
            "warmup_num_steps": "auto"
        }
    },
    "bf16": {
        "enabled": "auto",
        "loss_scale": 0,
        "initial_scale_power": 16,
        "loss_scale_window": 1000,
        "hysteresis": 2,
        "min_loss_scale": 1
    },
    "zero_optimization": {
        "stage": 2,
        "allgather_partitions": true,
        "allgather_bucket_size": 1e8,
        "reduce_scatter": true,
        "reduce_bucket_size": 1e8,
        "overlap_comm": true,
        "contiguous_gradients": true
    },
    "gradient_accumulation_steps": "auto",
    "gradient_clipping": "auto",
    "steps_per_print": 2000,
    "train_batch_size": "auto",
    "train_micro_batch_size_per_gpu": "auto",
    "wall_clock_breakdown": false
}
Lines changed: 56 additions & 0 deletions
@@ -0,0 +1,56 @@
{
    "zero_optimization": {
        "stage": 3,
        "overlap_comm": true,
        "contiguous_gradients": true,
        "sub_group_size": 1e9,
        "reduce_bucket_size": "auto",
        "stage3_prefetch_bucket_size": "auto",
        "stage3_param_persistence_threshold": "auto",
        "stage3_max_live_parameters": 1e9,
        "stage3_max_reuse_distance": 1e9,
        "stage3_gather_16bit_weights_on_model_save": true
    },
    "fp16": {
        "enabled": "auto",
        "loss_scale": 0,
        "initial_scale_power": 10,
        "loss_scale_window": 1000,
        "hysteresis": 2,
        "min_loss_scale": 1
    },
    "bf16": {
        "enabled": "auto",
        "loss_scale": 0,
        "initial_scale_power": 10,
        "loss_scale_window": 1000,
        "hysteresis": 2,
        "min_loss_scale": 1
    },
    "optimizer": {
        "type": "AdamW",
        "params": {
            "lr": "auto",
            "betas": "auto",
            "eps": "auto",
            "weight_decay": "auto"
        }
    },

    "scheduler": {
        "type": "WarmupDecayLR",
        "params": {
            "warmup_min_lr": "auto",
            "warmup_max_lr": "auto",
            "warmup_num_steps": "auto",
            "total_num_steps": "auto"
        }
    },

    "gradient_accumulation_steps": "auto",
    "gradient_clipping": "auto",
    "steps_per_print": 1000,
    "train_batch_size": "auto",
    "train_micro_batch_size_per_gpu": "auto",
    "wall_clock_breakdown": false
}
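These configs leave most values as "auto". As an illustration of how they get filled (our own sketch, not code from this repo, with hyperparameter values chosen purely for demonstration): when a config like the stage-3 one above is passed via `--deepspeed`, the HuggingFace `Trainer` resolves every "auto" field from the corresponding `TrainingArguments` value before initializing the DeepSpeed engine.

```python
# Minimal illustration (not from this repo) of how the "auto" fields above are
# resolved: the HuggingFace Trainer maps its own TrainingArguments onto the
# DeepSpeed config when a config path is passed via `deepspeed=...`.
from transformers import TrainingArguments

args = TrainingArguments(
    output_dir="data/outputs/activation-beacon-llama2-chat-7b",
    deepspeed="data/deepspeed/stage3.json",   # the stage-3 config above
    per_device_train_batch_size=1,            # -> train_micro_batch_size_per_gpu
    gradient_accumulation_steps=4,            # -> gradient_accumulation_steps
    learning_rate=5e-5,                       # -> optimizer.params.lr
    max_steps=10000,                          # -> scheduler.params.total_num_steps
    warmup_steps=100,                         # -> scheduler.params.warmup_num_steps
    max_grad_norm=1.0,                        # -> gradient_clipping
    bf16=True,                                # -> bf16.enabled
)
# Passing `args` to a Trainer and launching with torchrun (as in the training
# command above) initializes ZeRO-3 with these resolved values.
```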
