from transformers import AutoModelForMaskedLM, AutoTokenizer
import torch


def initial_emb(model, output_dir):
    """Extend the model's position embeddings to target_len using a hierarchical
    decomposition of the pretrained embeddings, then save the result to output_dir."""
    target_len = 8194

    # Position ids for the extended context, shape [1, target_len].
    position_ids = torch.arange(target_len, dtype=torch.long)
    position_ids = position_ids.unsqueeze(0)
    # Create the hierarchical embedding: each extended position i is decomposed
    # into a high component i // L and a low component i % L, where L is the
    # pretrained max_position_embeddings, and the two are blended with weight alpha.
    alpha = 0.4
    pos_ids = torch.arange(model.config.max_position_embeddings, dtype=torch.long)
    if hasattr(model, 'roberta'):
        position_embeddings = model.roberta.embeddings.position_embeddings(pos_ids)
        model.roberta.embeddings.position_ids = torch.arange(target_len).expand((1, -1))
    else:
        position_embeddings = model.embeddings.position_embeddings(pos_ids)
        model.embeddings.position_ids = torch.arange(target_len).expand((1, -1))

    # Re-anchor the pretrained embeddings around position 0 before blending.
    position_embeddings = position_embeddings - alpha * position_embeddings[:1]
    position_embeddings = position_embeddings / (1 - alpha)
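    # Why the re-anchoring preserves the pretrained embeddings: with
    #   e'(j) = (e(j) - alpha * e(0)) / (1 - alpha),
    # a position i < L decomposes into (0, i), so the blend below gives
    #   alpha * e'(0) + (1 - alpha) * e'(i)
    #     = alpha * e(0) + (e(i) - alpha * e(0))
    #     = e(i),
    # i.e. the original embedding is recovered, which the diff check further down verifies.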

    embedding_x = []
    embedding_y = []
    for i in range(position_ids.size(0)):
        # High component: which length-L block the position falls into.
        pos_embedding_x = torch.index_select(position_embeddings, 0, position_ids[i, :] // model.config.max_position_embeddings)
        # Low component: the offset within that block.
        pos_embedding_y = torch.index_select(position_embeddings, 0, position_ids[i, :] % model.config.max_position_embeddings)
        embedding_x.append(pos_embedding_x.unsqueeze(0))
        embedding_y.append(pos_embedding_y.unsqueeze(0))

    pos_embedding_x = torch.cat(embedding_x, 0)
    pos_embedding_y = torch.cat(embedding_y, 0)

    # Blend the two components into the extended position embedding table.
    position_embeddings = alpha * pos_embedding_x + (1 - alpha) * pos_embedding_y
    position_embeddings = position_embeddings.squeeze(dim=0)

    # Sanity check: for positions below the original limit, the reconstructed
    # embeddings should match the pretrained ones (diff should be ~0).
    if hasattr(model, 'roberta'):
        diff = torch.sum(torch.abs(position_embeddings[:model.config.max_position_embeddings] - model.roberta.embeddings.position_embeddings(pos_ids)), dim=-1)
    else:
        diff = torch.sum(torch.abs(position_embeddings[:model.config.max_position_embeddings] - model.embeddings.position_embeddings(pos_ids)), dim=-1)
    print(diff.size())
    print(diff)
    print(position_embeddings.size())
    # Swap in the extended position embedding table and persist the model.
    model.config.max_position_embeddings = target_len
    embedding_new = torch.nn.Embedding(target_len, model.config.hidden_size)
    embedding_new.weight = torch.nn.Parameter(position_embeddings)
    if hasattr(model, 'roberta'):
        model.roberta.embeddings.position_embeddings = embedding_new
    else:
        model.embeddings.position_embeddings = embedding_new
    model.save_pretrained(output_dir)
    print(model.config)
    print(model)


model_name = 'xlm-roberta-large'
model = AutoModelForMaskedLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
print(tokenizer)
tokenizer.model_max_length = 8192
initial_emb(model, output_dir='/share/models/xlm-roberta-large-8194')
tokenizer.save_pretrained('/share/models/xlm-roberta-large-8194')
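
# Optional sanity check (a minimal sketch, not a required step): reload the saved
# checkpoint and run one forward pass on an input longer than the original 512-token
# limit. Assumes the output directory above was written successfully and that a
# ~1k-token forward pass fits in memory.
reloaded = AutoModelForMaskedLM.from_pretrained('/share/models/xlm-roberta-large-8194')
reloaded_tokenizer = AutoTokenizer.from_pretrained('/share/models/xlm-roberta-large-8194')
print(reloaded.config.max_position_embeddings)  # expected: 8194
encoded = reloaded_tokenizer('hello ' * 700, return_tensors='pt', truncation=True, max_length=1024)
with torch.no_grad():
    output = reloaded(**encoded)
print(output.logits.shape)  # [1, sequence_length, vocab_size]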