@@ -260,105 +260,7 @@ If you have no enough resource to fine-tuning model with long text, the method i
260260Refer to our [report](https://arxiv.org/pdf/2402.03216.pdf) for more details.
261261
262262
263- ## Examples of hybrid retrieval
264263
265- ### Milvus
266-
267- ``` python
268- # A demo showing semantic search with dense and sparse vectors, implemented
269- # with BGE-M3 embedding model and the Milvus vector database.
270-
271- # The overall steps are as follows:
272- # 1. embed the text as dense and sparse vectors using the BGE-M3 model
273- # 2. set up a Milvus collection to store the dense and sparse vectors
274- # 3. insert the data into Milvus
275- # 4. search and inspect the result!
276-
277- # 1. prepare a small corpus to search
278- docs = [
279- " Artificial intelligence was founded as an academic discipline in 1956." ,
280- " Alan Turing was the first person to conduct substantial research in AI." ,
281- " Born in Maida Vale, London, Turing was raised in southern England." ,
282- ]
283- query = " Who started AI research?"
284-
285- # The BGE-M3 model can embed texts as dense and sparse vectors.
286- # It is included in the optional `model` module of pymilvus; to install it,
287- # simply run "pip install pymilvus[model]".
288- from pymilvus.model.hybrid import BGEM3EmbeddingFunction
289-
290- bge_m3_ef = BGEM3EmbeddingFunction(use_fp16 = False , device = " cpu" )
291-
292- docs_embeddings = bge_m3_ef(docs)
293- query_embeddings = bge_m3_ef([query])
294-
295- # 2. set up the Milvus collection and indices
296- from pymilvus import (
297- utility,
298- FieldSchema, CollectionSchema, DataType,
299- Collection, AnnSearchRequest, RRFRanker, connections,
300- )
301- connections.connect(" default" , host = " localhost" , port = " 19530" )
302-
303- # Specify the data schema for the new Collection.
304- fields = [
305- # Use auto generated id as primary key
306- FieldSchema(name = " pk" , dtype = DataType.VARCHAR ,
307- is_primary = True , auto_id = True , max_length = 100 ),
308- # Store the original text so it can be retrieved based on semantic distance
309- FieldSchema(name = " text" , dtype = DataType.VARCHAR , max_length = 512 ),
310- # Milvus now supports both sparse and dense vectors; we can store each in
311- # a separate field to conduct hybrid search on both vectors.
312- FieldSchema(name = " sparse_vector" , dtype = DataType.SPARSE_FLOAT_VECTOR ),
313- FieldSchema(name = " dense_vector" , dtype = DataType.FLOAT_VECTOR ,
314- dim = bge_m3_ef.dim[" dense" ]),
315- ]
316- schema = CollectionSchema(fields, " " )
317- col_name = ' hybrid_demo'
318- # Now we can create the new collection with the above name and schema.
319- col = Collection(col_name, schema, consistency_level = " Strong" )
320-
321- # We need to create indices for the vector fields. The indices will be loaded
322- # into memory for efficient search.
323- sparse_index = {" index_type" : " SPARSE_INVERTED_INDEX" , " metric_type" : " IP" }
324- col.create_index(" sparse_vector" , sparse_index)
325- dense_index = {" index_type" : " FLAT" , " metric_type" : " L2" }
326- col.create_index(" dense_vector" , dense_index)
327- col.load()
328-
329- # 3. insert text and sparse/dense vector representations into the collection
330- entities = [docs, docs_embeddings[" sparse" ], docs_embeddings[" dense" ]]
331- col.insert(entities)
332- col.flush()
333-
334- # 4. search and inspect the result!
335- k = 2 # we want to get the top 2 docs closest to the query
336-
337- # Prepare the search requests for both vector fields
338- sparse_search_params = {" metric_type" : " IP" }
339- sparse_req = AnnSearchRequest(query_embeddings[" sparse" ],
340- " sparse_vector" , sparse_search_params, limit = k)
341- dense_search_params = {" metric_type" : " L2" }
342- dense_req = AnnSearchRequest(query_embeddings[" dense" ],
343- " dense_vector" , dense_search_params, limit = k)
344-
345- # Search topK docs based on dense and sparse vectors and rerank with RRF.
346- res = col.hybrid_search([sparse_req, dense_req], rerank = RRFRanker(),
347- limit = k, output_fields = [' text' ])
348-
349- # Currently Milvus only supports 1 query in the same hybrid search request, so
350- # we inspect res[0] directly. In a future release, Milvus will accept batch
351- # hybrid search queries in the same call.
352- for hit in res[0 ]:
353- # print out the data of topK search results
354- print (f ' text: { hit.fields[" text" ]} distance { hit.distance} ' )
355- # Output is:
356- # text: Alan Turing was the first person to conduct substantial research in AI. distance 0.032786883413791656
357- # text: Artificial intelligence was founded as an academic discipline in 1956. distance 0.016129031777381897
358-
359- # Drop the collection to clean up the data.
360- utility.drop_collection(col_name)
361- ```
362264
363265
364266
0 commit comments