Skip to content

Commit 11ae46d

Browse files
committed
delete hybrid example
1 parent 140ca6f commit 11ae46d

1 file changed

Lines changed: 0 additions & 98 deletions

File tree

FlagEmbedding/BGE_M3/README.md

Lines changed: 0 additions & 98 deletions
Original file line numberDiff line numberDiff line change
@@ -260,105 +260,7 @@ If you have no enough resource to fine-tuning model with long text, the method i
260260
Refer to our [report](https://arxiv.org/pdf/2402.03216.pdf) for more details.
261261

262262

263-
## Examples of hybrid retrieval
264263

265-
### Milvus
266-
267-
```python
268-
# A demo showing semantic search with dense and sparse vectors, implemented
269-
# with BGE-M3 embedding model and the Milvus vector database.
270-
271-
# The overall steps are as follows:
272-
# 1. embed the text as dense and sparse vectors using BGE-M3 model
273-
# 2. set up a Milvus collection to store the dense and sparse vectors
274-
# 3. insert the data into Milvus
275-
# 4. search and inspect the result!
276-
277-
# 1. prepare a small corpus to search
278-
docs = [
279-
"Artificial intelligence was founded as an academic discipline in 1956.",
280-
"Alan Turing was the first person to conduct substantial research in AI.",
281-
"Born in Maida Vale, London, Turing was raised in southern England.",
282-
]
283-
query = "Who started AI research?"
284-
285-
# BGE-M3 model can embed texts as dense and sparse vectors.
286-
# It is included in the optional `model` module in pymilvus; to install it,
287-
# simply run "pip install pymilvus[model]".
288-
from pymilvus.model.hybrid import BGEM3EmbeddingFunction
289-
290-
bge_m3_ef = BGEM3EmbeddingFunction(use_fp16=False, device="cpu")
291-
292-
docs_embeddings = bge_m3_ef(docs)
293-
query_embeddings = bge_m3_ef([query])
294-
295-
# 2. set up the Milvus collection and indexes
296-
from pymilvus import (
297-
utility,
298-
FieldSchema, CollectionSchema, DataType,
299-
Collection, AnnSearchRequest, RRFRanker, connections,
300-
)
301-
connections.connect("default", host="localhost", port="19530")
302-
303-
# Specify the data schema for the new Collection.
304-
fields = [
305-
# Use auto generated id as primary key
306-
FieldSchema(name="pk", dtype=DataType.VARCHAR,
307-
is_primary=True, auto_id=True, max_length=100),
308-
# Store the original text to retrieve based on semantic distance
309-
FieldSchema(name="text", dtype=DataType.VARCHAR, max_length=512),
310-
# Milvus now supports both sparse and dense vectors; we can store each in
311-
# a separate field to conduct hybrid search on both vectors.
312-
FieldSchema(name="sparse_vector", dtype=DataType.SPARSE_FLOAT_VECTOR),
313-
FieldSchema(name="dense_vector", dtype=DataType.FLOAT_VECTOR,
314-
dim=bge_m3_ef.dim["dense"]),
315-
]
316-
schema = CollectionSchema(fields, "")
317-
col_name = 'hybrid_demo'
318-
# Now we can create the new collection with the above name and schema.
319-
col = Collection(col_name, schema, consistency_level="Strong")
320-
321-
# We need to create indices for the vector fields. The indices will be loaded
322-
# into memory for efficient search.
323-
sparse_index = {"index_type": "SPARSE_INVERTED_INDEX", "metric_type": "IP"}
324-
col.create_index("sparse_vector", sparse_index)
325-
dense_index = {"index_type": "FLAT", "metric_type": "L2"}
326-
col.create_index("dense_vector", dense_index)
327-
col.load()
328-
329-
# 3. insert text and sparse/dense vector representations into the collection
330-
entities = [docs, docs_embeddings["sparse"], docs_embeddings["dense"]]
331-
col.insert(entities)
332-
col.flush()
333-
334-
# 4. search and inspect the result!
335-
k = 2 # we want to get the top 2 docs closest to the query
336-
337-
# Prepare the search requests for both vector fields
338-
sparse_search_params = {"metric_type": "IP"}
339-
sparse_req = AnnSearchRequest(query_embeddings["sparse"],
340-
"sparse_vector", sparse_search_params, limit=k)
341-
dense_search_params = {"metric_type": "L2"}
342-
dense_req = AnnSearchRequest(query_embeddings["dense"],
343-
"dense_vector", dense_search_params, limit=k)
344-
345-
# Search topK docs based on dense and sparse vectors and rerank with RRF.
346-
res = col.hybrid_search([sparse_req, dense_req], rerank=RRFRanker(),
347-
limit=k, output_fields=['text'])
348-
349-
# Currently Milvus only supports 1 query in the same hybrid search request, so
350-
# we inspect res[0] directly. In a future release, Milvus will accept batch
351-
# hybrid search queries in the same call.
352-
for hit in res[0]:
353-
# print out the data of topK search results
354-
print(f'text: {hit.fields["text"]} distance {hit.distance}')
355-
# Output is:
356-
# text: Alan Turing was the first person to conduct substantial research in AI. distance 0.032786883413791656
357-
# text: Artificial intelligence was founded as an academic discipline in 1956. distance 0.016129031777381897
358-
359-
# Drop the collection to clean up the data.
360-
utility.drop_collection(col_name)
361-
```
362264

363265

364266

0 commit comments

Comments
 (0)