Skip to content

Commit c9cfa7c

Browse files
committed
update tutorials
1 parent 5790d94 commit c9cfa7c

2 files changed

Lines changed: 522 additions & 44 deletions

File tree

Tutorials/1_Embedding/1.2.1_BGE_Series.ipynb

Lines changed: 125 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,7 @@
5555
},
5656
{
5757
"cell_type": "code",
58-
"execution_count": 4,
58+
"execution_count": 1,
5959
"id": "a2376217",
6060
"metadata": {},
6161
"outputs": [],
@@ -123,10 +123,38 @@
123123
},
124124
{
125125
"cell_type": "code",
126-
"execution_count": null,
126+
"execution_count": 2,
127127
"id": "89e07751",
128128
"metadata": {},
129-
"outputs": [],
129+
"outputs": [
130+
{
131+
"name": "stderr",
132+
"output_type": "stream",
133+
"text": [
134+
"/root/anaconda3/envs/dev/lib/python3.12/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
135+
" from .autonotebook import tqdm as notebook_tqdm\n",
136+
"pre tokenize: 100%|██████████| 1/1 [00:00<00:00, 93.88it/s]\n",
137+
"/root/anaconda3/envs/dev/lib/python3.12/site-packages/_distutils_hack/__init__.py:54: UserWarning: Reliance on distutils from stdlib is deprecated. Users must rely on setuptools to provide the distutils module. Avoid importing distutils or import setuptools first, and avoid setting SETUPTOOLS_USE_DISTUTILS=stdlib. Register concerns at https://github.com/pypa/setuptools/issues/new?template=distutils-deprecation.yml\n",
138+
" warnings.warn(\n",
139+
"pre tokenize: 100%|██████████| 1/1 [00:00<00:00, 2418.86it/s]"
140+
]
141+
},
142+
{
143+
"name": "stdout",
144+
"output_type": "stream",
145+
"text": [
146+
"[[0.8486 0.7944]\n",
147+
" [0.7607 0.8545]]\n"
148+
]
149+
},
150+
{
151+
"name": "stderr",
152+
"output_type": "stream",
153+
"text": [
154+
"\n"
155+
]
156+
}
157+
],
130158
"source": [
131159
"from FlagEmbedding import FlagModel\n",
132160
"\n",
@@ -209,17 +237,32 @@
209237
},
210238
{
211239
"cell_type": "code",
212-
"execution_count": 6,
240+
"execution_count": 3,
213241
"id": "9b17afcc",
214242
"metadata": {},
215243
"outputs": [
244+
{
245+
"name": "stderr",
246+
"output_type": "stream",
247+
"text": [
248+
"pre tokenize: 100%|██████████| 1/1 [00:00<00:00, 2252.58it/s]\n",
249+
"pre tokenize: 100%|██████████| 1/1 [00:00<00:00, 3575.71it/s]"
250+
]
251+
},
216252
{
217253
"name": "stdout",
218254
"output_type": "stream",
219255
"text": [
220256
"[[0.76 0.6714]\n",
221257
" [0.6177 0.7603]]\n"
222258
]
259+
},
260+
{
261+
"name": "stderr",
262+
"output_type": "stream",
263+
"text": [
264+
"\n"
265+
]
223266
}
224267
],
225268
"source": [
@@ -274,10 +317,18 @@
274317
},
275318
{
276319
"cell_type": "code",
277-
"execution_count": null,
320+
"execution_count": 4,
278321
"id": "d4647625",
279322
"metadata": {},
280-
"outputs": [],
323+
"outputs": [
324+
{
325+
"name": "stderr",
326+
"output_type": "stream",
327+
"text": [
328+
"Fetching 30 files: 100%|██████████| 30/30 [00:00<00:00, 194180.74it/s]\n"
329+
]
330+
}
331+
],
281332
"source": [
282333
"from FlagEmbedding import BGEM3FlagModel\n",
283334
"\n",
@@ -313,10 +364,18 @@
313364
},
314365
{
315366
"cell_type": "code",
316-
"execution_count": 6,
367+
"execution_count": 5,
317368
"id": "f0b11cf0",
318369
"metadata": {},
319-
"outputs": [],
370+
"outputs": [
371+
{
372+
"name": "stderr",
373+
"output_type": "stream",
374+
"text": [
375+
"pre tokenize: 100%|██████████| 1/1 [00:00<00:00, 1148.18it/s]\n"
376+
]
377+
}
378+
],
320379
"source": [
321380
"# If you don't need the full input length of 8192 tokens, you can set max_length to a smaller value to speed up encoding.\n",
322381
"embeddings = model.encode(\n",
@@ -330,7 +389,7 @@
330389
},
331390
{
332391
"cell_type": "code",
333-
"execution_count": 8,
392+
"execution_count": 6,
334393
"id": "72cba126",
335394
"metadata": {},
336395
"outputs": [
@@ -339,38 +398,36 @@
339398
"output_type": "stream",
340399
"text": [
341400
"dense embedding:\n",
342-
"[[-0.03411707 -0.04707828 -0.00089447 ... 0.04828531 0.00755427\n",
343-
" -0.02961654]\n",
344-
" [-0.01041734 -0.04479263 -0.02429199 ... -0.00819298 0.01503995\n",
345-
" 0.01113793]]\n",
401+
"[[-0.03412 -0.04706 -0.00087 ... 0.04822 0.007614 -0.02957 ]\n",
402+
" [-0.01035 -0.04483 -0.02434 ... -0.008224 0.01497 0.011055]]\n",
346403
"sparse embedding:\n",
347-
"[defaultdict(<class 'int'>, {'4865': 0.08362077, '83': 0.081469566, '335': 0.12964639, '11679': 0.25186998, '276': 0.17001738, '363': 0.26957875, '32': 0.040755156}), defaultdict(<class 'int'>, {'262': 0.050144322, '5983': 0.13689369, '2320': 0.045134712, '111': 0.06342201, '90017': 0.25167602, '2588': 0.33353207})]\n",
404+
"[defaultdict(<class 'int'>, {'4865': np.float16(0.0836), '83': np.float16(0.0814), '335': np.float16(0.1296), '11679': np.float16(0.2517), '276': np.float16(0.1699), '363': np.float16(0.2695), '32': np.float16(0.04077)}), defaultdict(<class 'int'>, {'262': np.float16(0.05014), '5983': np.float16(0.1367), '2320': np.float16(0.04517), '111': np.float16(0.0634), '90017': np.float16(0.2517), '2588': np.float16(0.3333)})]\n",
348405
"multi-vector:\n",
349-
"[array([[-8.6726490e-03, -4.8921868e-02, -3.0449261e-03, ...,\n",
350-
" -2.2082448e-02, 5.7268854e-02, 1.2811369e-02],\n",
351-
" [-8.8765034e-03, -4.6860173e-02, -9.5845405e-03, ...,\n",
352-
" -3.1404708e-02, 5.3911421e-02, 6.8714428e-03],\n",
353-
" [ 1.8445771e-02, -4.2359587e-02, 8.6754939e-04, ...,\n",
354-
" -1.9803897e-02, 3.8384371e-02, 7.6852231e-03],\n",
406+
"[array([[-8.68966337e-03, -4.89266850e-02, -3.03634931e-03, ...,\n",
407+
" -2.21243706e-02, 5.72856329e-02, 1.28355855e-02],\n",
408+
" [-8.92937183e-03, -4.67235669e-02, -9.52814799e-03, ...,\n",
409+
" -3.14785317e-02, 5.39088845e-02, 6.96671568e-03],\n",
410+
" [ 1.84195358e-02, -4.22310382e-02, 8.55499704e-04, ...,\n",
411+
" -1.97946690e-02, 3.84313315e-02, 7.71250250e-03],\n",
355412
" ...,\n",
356-
" [-2.5543230e-02, -1.6561864e-02, -4.2125367e-02, ...,\n",
357-
" -4.5030322e-02, 4.4091221e-02, -1.0043185e-02],\n",
358-
" [ 4.9905590e-05, -5.5475257e-02, 8.4884483e-03, ...,\n",
359-
" -2.2911752e-02, 6.0379632e-02, 9.3577225e-03],\n",
360-
" [ 2.5895271e-03, -2.9331330e-02, -1.8961012e-02, ...,\n",
361-
" -8.0389353e-03, 3.2842189e-02, 4.3894034e-02]], dtype=float32), array([[ 0.01715658, 0.03835309, -0.02311821, ..., 0.00146474,\n",
362-
" 0.02993429, -0.05985384],\n",
363-
" [ 0.00996143, 0.039217 , -0.03855301, ..., 0.00599566,\n",
364-
" 0.02722942, -0.06509776],\n",
365-
" [ 0.01777726, 0.03919311, -0.01709837, ..., 0.00805702,\n",
366-
" 0.03988946, -0.05069073],\n",
413+
" [-2.55824160e-02, -1.65533274e-02, -4.21357416e-02, ...,\n",
414+
" -4.50234264e-02, 4.41286489e-02, -1.00052059e-02],\n",
415+
" [ 5.90990965e-07, -5.53734899e-02, 8.51499755e-03, ...,\n",
416+
" -2.29209941e-02, 6.04418293e-02, 9.39912070e-03],\n",
417+
" [ 2.57394509e-03, -2.92690992e-02, -1.89342294e-02, ...,\n",
418+
" -8.04431178e-03, 3.28964666e-02, 4.38723788e-02]], dtype=float32), array([[ 0.01724418, 0.03835401, -0.02309308, ..., 0.00141706,\n",
419+
" 0.02995041, -0.05990082],\n",
420+
" [ 0.00996325, 0.03922409, -0.03849588, ..., 0.00591671,\n",
421+
" 0.02722516, -0.06510868],\n",
422+
" [ 0.01781915, 0.03925728, -0.01710397, ..., 0.00801776,\n",
423+
" 0.03987768, -0.05070014],\n",
367424
" ...,\n",
368-
" [ 0.05474931, 0.0075684 , 0.00329455, ..., -0.01651684,\n",
369-
" 0.02397249, 0.00368039],\n",
370-
" [ 0.0093503 , 0.05022853, -0.02385841, ..., 0.02575599,\n",
371-
" 0.00786822, -0.03260205],\n",
372-
" [ 0.01805054, 0.01337725, 0.00016697, ..., 0.01843987,\n",
373-
" 0.01374448, 0.00310114]], dtype=float32)]\n"
425+
" [ 0.05478653, 0.00755799, 0.00328444, ..., -0.01648209,\n",
426+
" 0.02405782, 0.00363262],\n",
427+
" [ 0.00936953, 0.05028074, -0.02388872, ..., 0.02567679,\n",
428+
" 0.00791224, -0.03257877],\n",
429+
" [ 0.01803976, 0.0133922 , 0.00019365, ..., 0.0184015 ,\n",
430+
" 0.01373822, 0.00315539]], dtype=float32)]\n"
374431
]
375432
}
376433
],
@@ -396,25 +453,37 @@
396453
"BGE Multilingual Gemma2 is an LLM-based multilingual embedding model."
397454
]
398455
},
456+
{
457+
"cell_type": "markdown",
458+
"id": "abdca22e",
459+
"metadata": {},
460+
"source": [
461+
"| Model | Language | Parameters | Model Size | Description | Base Model |\n",
462+
"|:-------|:--------:|:--------------:|:--------------:|:-----------------:|:----------------:|\n",
463+
"| [BAAI/bge-multilingual-gemma2](https://huggingface.co/BAAI/bge-multilingual-gemma2) | Multilingual | 9.24B | 37 GB | LLM-based multilingual embedding model with SOTA results on multilingual benchmarks | Gemma2-9B |"
464+
]
465+
},
399466
{
400467
"cell_type": "code",
401-
"execution_count": 8,
468+
"execution_count": 7,
402469
"id": "8ec545bc",
403470
"metadata": {},
404471
"outputs": [
405472
{
406473
"name": "stderr",
407474
"output_type": "stream",
408475
"text": [
409-
"Loading checkpoint shards: 100%|██████████| 4/4 [00:00<00:00, 4.45it/s]\n"
476+
"Loading checkpoint shards: 100%|██████████| 4/4 [00:00<00:00, 6.34it/s]\n",
477+
"pre tokenize: 100%|██████████| 1/1 [00:00<00:00, 816.49it/s]\n",
478+
"pre tokenize: 100%|██████████| 1/1 [00:00<00:00, 718.33it/s]\n"
410479
]
411480
},
412481
{
413482
"name": "stdout",
414483
"output_type": "stream",
415484
"text": [
416-
"[[0.5596 0.01743 ]\n",
417-
" [0.001761 0.502 ]]\n"
485+
"[[0.559 0.01685 ]\n",
486+
" [0.0008683 0.5015 ]]\n"
418487
]
419488
}
420489
],
@@ -453,9 +522,19 @@
453522
"In BGE ICL, ICL stands for in-context learning. By providing few-shot examples in the query, you can significantly enhance the model's ability to handle new tasks."
454523
]
455524
},
525+
{
526+
"cell_type": "markdown",
527+
"id": "cf6c9345",
528+
"metadata": {},
529+
"source": [
530+
"| Model | Language | Parameters | Model Size | Description | Base Model |\n",
531+
"|:-------|:--------:|:--------------:|:--------------:|:-----------------:|:----------------:|\n",
532+
"| [BAAI/bge-en-icl](https://huggingface.co/BAAI/bge-en-icl) | English | 7.11B | 28.5 GB | LLM-based English embedding model with excellent in-context learning ability. | Mistral-7B |"
533+
]
534+
},
456535
{
457536
"cell_type": "code",
458-
"execution_count": 9,
537+
"execution_count": 8,
459538
"id": "4595bae7",
460539
"metadata": {},
461540
"outputs": [],
@@ -483,15 +562,17 @@
483562
},
484563
{
485564
"cell_type": "code",
486-
"execution_count": 10,
565+
"execution_count": 9,
487566
"id": "ffb586c6",
488567
"metadata": {},
489568
"outputs": [
490569
{
491570
"name": "stderr",
492571
"output_type": "stream",
493572
"text": [
494-
"Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 4.34it/s]\n"
573+
"Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 6.55it/s]\n",
574+
"pre tokenize: 100%|██████████| 1/1 [00:00<00:00, 366.09it/s]\n",
575+
"pre tokenize: 100%|██████████| 1/1 [00:00<00:00, 623.69it/s]\n"
495576
]
496577
},
497578
{

0 commit comments

Comments
 (0)