
Commit c7cefcd

update pypi
1 parent 53cfac4 commit c7cefcd

3 files changed

Lines changed: 68 additions & 3 deletions


C_MTEB/setup.py

Lines changed: 2 additions & 2 deletions
@@ -5,14 +5,14 @@
 
 setup(
     name='C_MTEB',
-    version='1.1.0',
+    version='1.1.1',
     description='Chinese Massive Text Embedding Benchmark',
     long_description=readme,
     long_description_content_type="text/markdown",
     author_email='2906698981@qq.com',
     url='https://github.com/FlagOpen/FlagEmbedding/tree/master/C_MTEB',
     packages=find_packages(),
     install_requires=[
-        'mteb[beir]',
+        'mteb[beir]==1.1.1',
     ],
 )
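The dependency pin keeps mteb at 1.1.1, the same version as this C_MTEB release. For orientation, a hedged sketch of how the packaged benchmark is typically driven through mteb; the model checkpoint and output folder are illustrative assumptions, not part of this commit:

# Minimal sketch, assuming `pip install C_MTEB` pulls in mteb[beir]==1.1.1.
# The checkpoint and output folder below are placeholders, not taken from this diff.
from mteb import MTEB
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('BAAI/bge-large-zh')  # any model exposing .encode()
evaluation = MTEB(task_langs=['zh'])              # select the Chinese tasks
evaluation.run(model, output_folder='zh_results')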

extend_position.py

Lines changed: 65 additions & 0 deletions
@@ -0,0 +1,65 @@
from transformers import AutoModel, AutoConfig, AutoModelForMaskedLM, AutoTokenizer
import torch


def initial_emb(model, output_dir):
    # Extend the learned absolute position embeddings from the original length L
    # (model.config.max_position_embeddings) to target_len via hierarchical
    # decomposition: with u(i) = (e(i) - alpha * e(0)) / (1 - alpha), position p gets
    # alpha * u(p // L) + (1 - alpha) * u(p % L). For p < L this reproduces e(p)
    # exactly, which the `diff` printout below verifies.
    target_len = 8194

    position_ids = torch.arange(target_len, dtype=torch.long)
    position_ids = position_ids.unsqueeze(0)
    # create hierarchical embedding
    alpha = 0.4
    pos_ids = torch.arange(model.config.max_position_embeddings, dtype=torch.long)
    if hasattr(model, 'roberta'):
        position_embeddings = model.roberta.embeddings.position_embeddings(pos_ids)
        model.roberta.embeddings.position_ids = torch.arange(target_len).expand((1, -1))
    else:
        position_embeddings = model.embeddings.position_embeddings(pos_ids)
        model.embeddings.position_ids = torch.arange(target_len).expand((1, -1))

    # u(i) = (e(i) - alpha * e(0)) / (1 - alpha)
    position_embeddings = position_embeddings - alpha * position_embeddings[:1]
    position_embeddings = position_embeddings / (1 - alpha)

    embedding_x = []
    embedding_y = []
    for i in range(position_ids.size(0)):
        pos_embedding_x = torch.index_select(position_embeddings, 0, position_ids[i, :] // model.config.max_position_embeddings)
        pos_embedding_y = torch.index_select(position_embeddings, 0, position_ids[i, :] % model.config.max_position_embeddings)
        embedding_x.append(pos_embedding_x.unsqueeze(0))
        embedding_y.append(pos_embedding_y.unsqueeze(0))

    pos_embedding_x = torch.cat(embedding_x, 0)
    pos_embedding_y = torch.cat(embedding_y, 0)

    position_embeddings = alpha * pos_embedding_x + (1 - alpha) * pos_embedding_y
    position_embeddings = position_embeddings.squeeze(dim=0)

    # Sanity check: the first L extended embeddings should match the original table.
    if hasattr(model, 'roberta'):
        diff = torch.sum(torch.abs(position_embeddings[:model.config.max_position_embeddings] - model.roberta.embeddings.position_embeddings(pos_ids)), dim=-1)
    else:
        diff = torch.sum(torch.abs(position_embeddings[:model.config.max_position_embeddings] - model.embeddings.position_embeddings(pos_ids)), dim=-1)
    print(diff.size())
    print(diff)
    print(position_embeddings.size())

    # Swap in the enlarged embedding table and save the extended checkpoint.
    model.config.max_position_embeddings = target_len
    embedding_new = torch.nn.Embedding(target_len, 1024)
    embedding_new.weight = torch.nn.Parameter(position_embeddings)
    if hasattr(model, 'roberta'):
        model.roberta.embeddings.position_embeddings = embedding_new
    else:
        model.embeddings.position_embeddings = embedding_new
    model.save_pretrained(output_dir)
    print(model.config)
    print(model)


model_name = 'xlm-roberta-large'
model = AutoModelForMaskedLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
print(tokenizer)
tokenizer.model_max_length = 8192
initial_emb(model, output_dir='/share/models/xlm-roberta-large-8194')
tokenizer.save_pretrained('/share/models/xlm-roberta-large-8194')
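The script itself does not exercise the extended model. A minimal sketch of reloading the saved checkpoint and running a sequence longer than the original 512-token window; the path follows the script above, and the verification code is an assumption, not part of this commit:

# Sketch only: reload the checkpoint written by initial_emb() and check the
# enlarged position table. The input length here is an illustrative assumption.
import torch
from transformers import AutoModelForMaskedLM, AutoTokenizer

path = '/share/models/xlm-roberta-large-8194'
model = AutoModelForMaskedLM.from_pretrained(path)
tokenizer = AutoTokenizer.from_pretrained(path)

assert model.config.max_position_embeddings == 8194
print(model.roberta.embeddings.position_embeddings)  # Embedding(8194, 1024)

# Run an input well beyond the original 512-token limit.
text = 'FlagEmbedding extends position embeddings. ' * 400
inputs = tokenizer(text, return_tensors='pt', truncation=True, max_length=2048)
with torch.no_grad():
    out = model(**inputs, output_hidden_states=True)
print(out.hidden_states[-1].shape)  # (1, sequence_length, 1024)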

setup.py

Lines changed: 1 addition & 1 deletion
@@ -5,7 +5,7 @@
 
 setup(
     name='FlagEmbedding',
-    version='1.2.8',
+    version='1.2.9',
     description='FlagEmbedding',
     long_description=readme,
     long_description_content_type="text/markdown",
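The companion bump publishes FlagEmbedding 1.2.9 to PyPI. A brief hedged sketch of the released package's basic usage, based on the library's public FlagModel interface rather than anything in this diff:

# Minimal sketch, assuming `pip install FlagEmbedding==1.2.9`.
# The checkpoint name is an example and is not referenced by this commit.
from FlagEmbedding import FlagModel

model = FlagModel('BAAI/bge-large-zh-v1.5', use_fp16=True)
embeddings = model.encode(['a sample passage', 'another sample passage'])
print(embeddings.shape)  # (2, embedding_dim)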
