|
55 | 55 | }, |
56 | 56 | { |
57 | 57 | "cell_type": "code", |
58 | | - "execution_count": 4, |
| 58 | + "execution_count": 1, |
59 | 59 | "id": "a2376217", |
60 | 60 | "metadata": {}, |
61 | 61 | "outputs": [], |
|
123 | 123 | }, |
124 | 124 | { |
125 | 125 | "cell_type": "code", |
126 | | - "execution_count": null, |
| 126 | + "execution_count": 2, |
127 | 127 | "id": "89e07751", |
128 | 128 | "metadata": {}, |
129 | | - "outputs": [], |
| 129 | + "outputs": [ |
| 130 | + { |
| 131 | + "name": "stderr", |
| 132 | + "output_type": "stream", |
| 133 | + "text": [ |
| 134 | + "/root/anaconda3/envs/dev/lib/python3.12/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", |
| 135 | + " from .autonotebook import tqdm as notebook_tqdm\n", |
| 136 | + "pre tokenize: 100%|██████████| 1/1 [00:00<00:00, 93.88it/s]\n", |
| 137 | + "/root/anaconda3/envs/dev/lib/python3.12/site-packages/_distutils_hack/__init__.py:54: UserWarning: Reliance on distutils from stdlib is deprecated. Users must rely on setuptools to provide the distutils module. Avoid importing distutils or import setuptools first, and avoid setting SETUPTOOLS_USE_DISTUTILS=stdlib. Register concerns at https://github.com/pypa/setuptools/issues/new?template=distutils-deprecation.yml\n", |
| 138 | + " warnings.warn(\n", |
| 139 | + "pre tokenize: 100%|██████████| 1/1 [00:00<00:00, 2418.86it/s]" |
| 140 | + ] |
| 141 | + }, |
| 142 | + { |
| 143 | + "name": "stdout", |
| 144 | + "output_type": "stream", |
| 145 | + "text": [ |
| 146 | + "[[0.8486 0.7944]\n", |
| 147 | + " [0.7607 0.8545]]\n" |
| 148 | + ] |
| 149 | + }, |
| 150 | + { |
| 151 | + "name": "stderr", |
| 152 | + "output_type": "stream", |
| 153 | + "text": [ |
| 154 | + "\n" |
| 155 | + ] |
| 156 | + } |
| 157 | + ], |
130 | 158 | "source": [ |
131 | 159 | "from FlagEmbedding import FlagModel\n", |
132 | 160 | "\n", |
|
209 | 237 | }, |
210 | 238 | { |
211 | 239 | "cell_type": "code", |
212 | | - "execution_count": 6, |
| 240 | + "execution_count": 3, |
213 | 241 | "id": "9b17afcc", |
214 | 242 | "metadata": {}, |
215 | 243 | "outputs": [ |
| 244 | + { |
| 245 | + "name": "stderr", |
| 246 | + "output_type": "stream", |
| 247 | + "text": [ |
| 248 | + "pre tokenize: 100%|██████████| 1/1 [00:00<00:00, 2252.58it/s]\n", |
| 249 | + "pre tokenize: 100%|██████████| 1/1 [00:00<00:00, 3575.71it/s]" |
| 250 | + ] |
| 251 | + }, |
216 | 252 | { |
217 | 253 | "name": "stdout", |
218 | 254 | "output_type": "stream", |
219 | 255 | "text": [ |
220 | 256 | "[[0.76 0.6714]\n", |
221 | 257 | " [0.6177 0.7603]]\n" |
222 | 258 | ] |
| 259 | + }, |
| 260 | + { |
| 261 | + "name": "stderr", |
| 262 | + "output_type": "stream", |
| 263 | + "text": [ |
| 264 | + "\n" |
| 265 | + ] |
223 | 266 | } |
224 | 267 | ], |
225 | 268 | "source": [ |
|
274 | 317 | }, |
275 | 318 | { |
276 | 319 | "cell_type": "code", |
277 | | - "execution_count": null, |
| 320 | + "execution_count": 4, |
278 | 321 | "id": "d4647625", |
279 | 322 | "metadata": {}, |
280 | | - "outputs": [], |
| 323 | + "outputs": [ |
| 324 | + { |
| 325 | + "name": "stderr", |
| 326 | + "output_type": "stream", |
| 327 | + "text": [ |
| 328 | + "Fetching 30 files: 100%|██████████| 30/30 [00:00<00:00, 194180.74it/s]\n" |
| 329 | + ] |
| 330 | + } |
| 331 | + ], |
281 | 332 | "source": [ |
282 | 333 | "from FlagEmbedding import BGEM3FlagModel\n", |
283 | 334 | "\n", |
|
313 | 364 | }, |
314 | 365 | { |
315 | 366 | "cell_type": "code", |
316 | | - "execution_count": 6, |
| 367 | + "execution_count": 5, |
317 | 368 | "id": "f0b11cf0", |
318 | 369 | "metadata": {}, |
319 | | - "outputs": [], |
| 370 | + "outputs": [ |
| 371 | + { |
| 372 | + "name": "stderr", |
| 373 | + "output_type": "stream", |
| 374 | + "text": [ |
| 375 | + "pre tokenize: 100%|██████████| 1/1 [00:00<00:00, 1148.18it/s]\n" |
| 376 | + ] |
| 377 | + } |
| 378 | + ], |
320 | 379 | "source": [ |
321 | 380 | "# If you don't need such a long length of 8192 input tokens, you can set max_length to a smaller value to speed up encoding.\n", |
322 | 381 | "embeddings = model.encode(\n", |
|
330 | 389 | }, |
331 | 390 | { |
332 | 391 | "cell_type": "code", |
333 | | - "execution_count": 8, |
| 392 | + "execution_count": 6, |
334 | 393 | "id": "72cba126", |
335 | 394 | "metadata": {}, |
336 | 395 | "outputs": [ |
|
339 | 398 | "output_type": "stream", |
340 | 399 | "text": [ |
341 | 400 | "dense embedding:\n", |
342 | | - "[[-0.03411707 -0.04707828 -0.00089447 ... 0.04828531 0.00755427\n", |
343 | | - " -0.02961654]\n", |
344 | | - " [-0.01041734 -0.04479263 -0.02429199 ... -0.00819298 0.01503995\n", |
345 | | - " 0.01113793]]\n", |
| 401 | + "[[-0.03412 -0.04706 -0.00087 ... 0.04822 0.007614 -0.02957 ]\n", |
| 402 | + " [-0.01035 -0.04483 -0.02434 ... -0.008224 0.01497 0.011055]]\n", |
346 | 403 | "sparse embedding:\n", |
347 | | - "[defaultdict(<class 'int'>, {'4865': 0.08362077, '83': 0.081469566, '335': 0.12964639, '11679': 0.25186998, '276': 0.17001738, '363': 0.26957875, '32': 0.040755156}), defaultdict(<class 'int'>, {'262': 0.050144322, '5983': 0.13689369, '2320': 0.045134712, '111': 0.06342201, '90017': 0.25167602, '2588': 0.33353207})]\n", |
| 404 | + "[defaultdict(<class 'int'>, {'4865': np.float16(0.0836), '83': np.float16(0.0814), '335': np.float16(0.1296), '11679': np.float16(0.2517), '276': np.float16(0.1699), '363': np.float16(0.2695), '32': np.float16(0.04077)}), defaultdict(<class 'int'>, {'262': np.float16(0.05014), '5983': np.float16(0.1367), '2320': np.float16(0.04517), '111': np.float16(0.0634), '90017': np.float16(0.2517), '2588': np.float16(0.3333)})]\n", |
348 | 405 | "multi-vector:\n", |
349 | | - "[array([[-8.6726490e-03, -4.8921868e-02, -3.0449261e-03, ...,\n", |
350 | | - " -2.2082448e-02, 5.7268854e-02, 1.2811369e-02],\n", |
351 | | - " [-8.8765034e-03, -4.6860173e-02, -9.5845405e-03, ...,\n", |
352 | | - " -3.1404708e-02, 5.3911421e-02, 6.8714428e-03],\n", |
353 | | - " [ 1.8445771e-02, -4.2359587e-02, 8.6754939e-04, ...,\n", |
354 | | - " -1.9803897e-02, 3.8384371e-02, 7.6852231e-03],\n", |
| 406 | + "[array([[-8.68966337e-03, -4.89266850e-02, -3.03634931e-03, ...,\n", |
| 407 | + " -2.21243706e-02, 5.72856329e-02, 1.28355855e-02],\n", |
| 408 | + " [-8.92937183e-03, -4.67235669e-02, -9.52814799e-03, ...,\n", |
| 409 | + " -3.14785317e-02, 5.39088845e-02, 6.96671568e-03],\n", |
| 410 | + " [ 1.84195358e-02, -4.22310382e-02, 8.55499704e-04, ...,\n", |
| 411 | + " -1.97946690e-02, 3.84313315e-02, 7.71250250e-03],\n", |
355 | 412 | " ...,\n", |
356 | | - " [-2.5543230e-02, -1.6561864e-02, -4.2125367e-02, ...,\n", |
357 | | - " -4.5030322e-02, 4.4091221e-02, -1.0043185e-02],\n", |
358 | | - " [ 4.9905590e-05, -5.5475257e-02, 8.4884483e-03, ...,\n", |
359 | | - " -2.2911752e-02, 6.0379632e-02, 9.3577225e-03],\n", |
360 | | - " [ 2.5895271e-03, -2.9331330e-02, -1.8961012e-02, ...,\n", |
361 | | - " -8.0389353e-03, 3.2842189e-02, 4.3894034e-02]], dtype=float32), array([[ 0.01715658, 0.03835309, -0.02311821, ..., 0.00146474,\n", |
362 | | - " 0.02993429, -0.05985384],\n", |
363 | | - " [ 0.00996143, 0.039217 , -0.03855301, ..., 0.00599566,\n", |
364 | | - " 0.02722942, -0.06509776],\n", |
365 | | - " [ 0.01777726, 0.03919311, -0.01709837, ..., 0.00805702,\n", |
366 | | - " 0.03988946, -0.05069073],\n", |
| 413 | + " [-2.55824160e-02, -1.65533274e-02, -4.21357416e-02, ...,\n", |
| 414 | + " -4.50234264e-02, 4.41286489e-02, -1.00052059e-02],\n", |
| 415 | + " [ 5.90990965e-07, -5.53734899e-02, 8.51499755e-03, ...,\n", |
| 416 | + " -2.29209941e-02, 6.04418293e-02, 9.39912070e-03],\n", |
| 417 | + " [ 2.57394509e-03, -2.92690992e-02, -1.89342294e-02, ...,\n", |
| 418 | + " -8.04431178e-03, 3.28964666e-02, 4.38723788e-02]], dtype=float32), array([[ 0.01724418, 0.03835401, -0.02309308, ..., 0.00141706,\n", |
| 419 | + " 0.02995041, -0.05990082],\n", |
| 420 | + " [ 0.00996325, 0.03922409, -0.03849588, ..., 0.00591671,\n", |
| 421 | + " 0.02722516, -0.06510868],\n", |
| 422 | + " [ 0.01781915, 0.03925728, -0.01710397, ..., 0.00801776,\n", |
| 423 | + " 0.03987768, -0.05070014],\n", |
367 | 424 | " ...,\n", |
368 | | - " [ 0.05474931, 0.0075684 , 0.00329455, ..., -0.01651684,\n", |
369 | | - " 0.02397249, 0.00368039],\n", |
370 | | - " [ 0.0093503 , 0.05022853, -0.02385841, ..., 0.02575599,\n", |
371 | | - " 0.00786822, -0.03260205],\n", |
372 | | - " [ 0.01805054, 0.01337725, 0.00016697, ..., 0.01843987,\n", |
373 | | - " 0.01374448, 0.00310114]], dtype=float32)]\n" |
| 425 | + " [ 0.05478653, 0.00755799, 0.00328444, ..., -0.01648209,\n", |
| 426 | + " 0.02405782, 0.00363262],\n", |
| 427 | + " [ 0.00936953, 0.05028074, -0.02388872, ..., 0.02567679,\n", |
| 428 | + " 0.00791224, -0.03257877],\n", |
| 429 | + " [ 0.01803976, 0.0133922 , 0.00019365, ..., 0.0184015 ,\n", |
| 430 | + " 0.01373822, 0.00315539]], dtype=float32)]\n" |
374 | 431 | ] |
375 | 432 | } |
376 | 433 | ], |
|
396 | 453 | "BGE Multilingual Gemma2 is an LLM-based multilingual embedding model." |
397 | 454 | ] |
398 | 455 | }, |
| 456 | + { |
| 457 | + "cell_type": "markdown", |
| 458 | + "id": "abdca22e", |
| 459 | + "metadata": {}, |
| 460 | + "source": [ |
| 461 | + "| Model | Language | Parameters | Model Size | Description | Base Model |\n", |
| 462 | + "|:-------|:--------:|:--------------:|:--------------:|:-----------------:|:----------------:|\n", |
| 463 | + "| [BAAI/bge-multilingual-gemma2](https://huggingface.co/BAAI/bge-multilingual-gemma2) | Multilingual | 9.24B | 37 GB | LLM-based multilingual embedding model with SOTA results on multilingual benchmarks | Gemma2-9B |" |
| 464 | + ] |
| 465 | + }, |
399 | 466 | { |
400 | 467 | "cell_type": "code", |
401 | | - "execution_count": 8, |
| 468 | + "execution_count": 7, |
402 | 469 | "id": "8ec545bc", |
403 | 470 | "metadata": {}, |
404 | 471 | "outputs": [ |
405 | 472 | { |
406 | 473 | "name": "stderr", |
407 | 474 | "output_type": "stream", |
408 | 475 | "text": [ |
409 | | - "Loading checkpoint shards: 100%|██████████| 4/4 [00:00<00:00, 4.45it/s]\n" |
| 476 | + "Loading checkpoint shards: 100%|██████████| 4/4 [00:00<00:00, 6.34it/s]\n", |
| 477 | + "pre tokenize: 100%|██████████| 1/1 [00:00<00:00, 816.49it/s]\n", |
| 478 | + "pre tokenize: 100%|██████████| 1/1 [00:00<00:00, 718.33it/s]\n" |
410 | 479 | ] |
411 | 480 | }, |
412 | 481 | { |
413 | 482 | "name": "stdout", |
414 | 483 | "output_type": "stream", |
415 | 484 | "text": [ |
416 | | - "[[0.5596 0.01743 ]\n", |
417 | | - " [0.001761 0.502 ]]\n" |
| 485 | + "[[0.559 0.01685 ]\n", |
| 486 | + " [0.0008683 0.5015 ]]\n" |
418 | 487 | ] |
419 | 488 | } |
420 | 489 | ], |
|
453 | 522 | "BGE ICL stands for in-context learning. By providing few-shot examples in the query, it can significantly enhance the model's ability to handle new tasks." |
454 | 523 | ] |
455 | 524 | }, |
| 525 | + { |
| 526 | + "cell_type": "markdown", |
| 527 | + "id": "cf6c9345", |
| 528 | + "metadata": {}, |
| 529 | + "source": [ |
| 530 | + "| Model | Language | Parameters | Model Size | Description | Base Model |\n", |
| 531 | + "|:-------|:--------:|:--------------:|:--------------:|:-----------------:|:----------------:|\n", |
| 532 | + "| [BAAI/bge-en-icl](https://huggingface.co/BAAI/bge-en-icl) | English | 7.11B | 28.5 GB | LLM-based English embedding model with excellent in-context learning ability. | Mistral-7B |" |
| 533 | + ] |
| 534 | + }, |
456 | 535 | { |
457 | 536 | "cell_type": "code", |
458 | | - "execution_count": 9, |
| 537 | + "execution_count": 8, |
459 | 538 | "id": "4595bae7", |
460 | 539 | "metadata": {}, |
461 | 540 | "outputs": [], |
|
483 | 562 | }, |
484 | 563 | { |
485 | 564 | "cell_type": "code", |
486 | | - "execution_count": 10, |
| 565 | + "execution_count": 9, |
487 | 566 | "id": "ffb586c6", |
488 | 567 | "metadata": {}, |
489 | 568 | "outputs": [ |
490 | 569 | { |
491 | 570 | "name": "stderr", |
492 | 571 | "output_type": "stream", |
493 | 572 | "text": [ |
494 | | - "Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 4.34it/s]\n" |
| 573 | + "Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 6.55it/s]\n", |
| 574 | + "pre tokenize: 100%|██████████| 1/1 [00:00<00:00, 366.09it/s]\n", |
| 575 | + "pre tokenize: 100%|██████████| 1/1 [00:00<00:00, 623.69it/s]\n" |
495 | 576 | ] |
496 | 577 | }, |
497 | 578 | { |
|
0 commit comments