FlagEmbedding/examples/inference/embedder/encoder_only/m3_single_device_ensemble.py at e3ad344012aa385a8e84101085dbebc33bee1f87 · FlagOpen/FlagEmbedding · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
import os
import torch
import numpy as np
from FlagEmbedding import BGEM3FlagModel


def pad_colbert_vecs(colbert_vecs_list, device):
    """
    Stack variable-length ColBERT token embeddings into one zero-padded tensor.

    ColBERT embeddings are computed per token, so each query or passage may
    produce a different number of rows. This function right-pads every matrix
    with zero rows up to the longest one in the batch so that the whole batch
    can be handled with a single tensor operation.

    Args:
        colbert_vecs_list: Non-empty sequence of 2-D arrays or tensors, each of
            shape (num_tokens_i, dim). All entries must share the same ``dim``.
        device: Device (string or ``torch.device``) on which the result is placed.

    Returns:
        A ``torch.float`` tensor of shape (batch, max_num_tokens, dim) on
        ``device``. Rows beyond an entry's own token count are all zeros.

    Raises:
        ValueError: If ``colbert_vecs_list`` is empty.

    Note:
        A zero row has a dot product of exactly 0 with every vector. It is
        therefore not fully neutral under a max-over-tokens reduction: a padded
        row can win the max when every real similarity is negative.
    """
    if len(colbert_vecs_list) == 0:
        raise ValueError("colbert_vecs_list must contain at least one embedding matrix")

    # as_tensor accepts both numpy arrays and tensors without the extra copy
    # (and the UserWarning) that torch.tensor() incurs on tensor inputs.
    token_matrices = [
        torch.as_tensor(vec, dtype=torch.float, device=device)
        for vec in colbert_vecs_list
    ]

    # pad_sequence allocates a fresh (batch, max_len, dim) tensor and copies
    # each matrix into it, so the result never aliases the inputs.
    return torch.nn.utils.rnn.pad_sequence(token_matrices, batch_first=True, padding_value=0.0)


def compute_colbert_scores(query_colbert_vecs, passage_colbert_vecs):
    """
    Score every query against every passage with ColBERT-style late interaction.

    Rather than comparing one vector per text, late interaction compares the
    texts token by token and aggregates the best matches.

    Args:
        query_colbert_vecs: Tensor of shape (Q, Tq, D), holding Q queries with
            Tq token embeddings each, of dimension D.
        passage_colbert_vecs: Tensor of shape (P, Tp, D), holding P passages
            with Tp token embeddings each, of dimension D.

    Returns:
        Tensor of shape (Q, P); entry [q, p] is the score of query q against
        passage p.

    Computation:
        1. ``einsum("qrd,pcd->qprc")`` contracts the embedding axis ``d`` and
           yields a (Q, P, Tq, Tp) tensor of dot products between every query
           token ``r`` and every passage token ``c``.
        2. The max over the passage-token axis keeps, for each query token,
           its best-matching passage token.
        3. The sum over the query-token axis folds those best matches into
           one score per (query, passage) pair.

    Note:
        If the inputs were zero-padded, a padded query row adds exactly 0 to
        the sum. A padded passage row yields a similarity of 0, so it is
        selected by the max whenever all real similarities for that query
        token are negative.
    """

    # (Q, P, Tq, Tp): similarity of each query token to each passage token.
    token_similarities = torch.einsum("qrd,pcd->qprc", query_colbert_vecs, passage_colbert_vecs)

    # (Q, P, Tq): best passage-token match for each query token.
    best_match_per_query_token = token_similarities.max(dim=-1).values

    # (Q, P): aggregate over query tokens.
    return best_match_per_query_token.sum(dim=-1)


def hybrid_dbfs_ensemble_simple_linear_combination(dense_scores, sparse_scores, colbert_scores, weights=(0.45, 0.45, 0.1)):
    """
    Blend dense, sparse and ColBERT scores with a fixed weighted sum.

    Args:
        dense_scores: Dense-retrieval scores.
        sparse_scores: Sparse (lexical) scores, broadcast-compatible with
            ``dense_scores``.
        colbert_scores: ColBERT scores, broadcast-compatible with the others.
        weights: Three-item iterable ``(dense, sparse, colbert)`` of
            multipliers. The scores are used as given; no normalization is
            applied here.

    Returns:
        ``weights[0] * dense_scores + weights[1] * sparse_scores
        + weights[2] * colbert_scores``.
    """
    dense_weight, sparse_weight, colbert_weight = weights

    dense_term = dense_weight * dense_scores
    sparse_term = sparse_weight * sparse_scores
    colbert_term = colbert_weight * colbert_scores

    return dense_term + sparse_term + colbert_term


def test_m3_single_device():
    """
    Encode sample queries and passages with BGE-M3 on one device and print
    the dense, sparse, ColBERT and combined score matrices.

    The model and all score tensors use the same device: ``cuda:0`` when CUDA
    is available, otherwise ``cpu``. Only the top-left 2x2 corner of each
    score matrix is printed.
    """
    # Choose the device once so the model and the score tensors always agree.
    # Previously the model was pinned to "cuda:0" while the tensors used a
    # CUDA-or-CPU fallback.
    device_name = "cuda:0" if torch.cuda.is_available() else "cpu"
    device = torch.device(device_name)

    model = BGEM3FlagModel(
        'BAAI/bge-m3',
        devices=device_name,
        pooling_method='cls',
        cache_dir=os.getenv('HF_HUB_CACHE', None),
    )

    # Two distinct texts repeated 100 times each give 200 queries and 200 passages.
    queries = [
                  "What is Sionic AI?",
                  "Try https://sionicstorm.ai today!"
              ] * 100
    passages = [
                   "Sionic AI delivers more accessible and cost-effective AI technology addressing the various needs to boost productivity and drive innovation.",
                   "The Large Language Model (LLM) is not for research and experimentation. We offer solutions that leverage LLM to add value to your business. Anyone can easily train and control AI."
               ] * 100

    # Request all three representations in a single encoding pass per side.
    queries_embeddings = model.encode_queries(
        queries,
        return_dense=True,
        return_sparse=True,
        return_colbert_vecs=True,
    )

    passages_embeddings = model.encode_corpus(
        passages,
        return_dense=True,
        return_sparse=True,
        return_colbert_vecs=True,
    )

    # Dense: all-pairs inner product between query and passage vectors.
    q_dense = torch.tensor(queries_embeddings["dense_vecs"], dtype=torch.float, device=device)
    p_dense = torch.tensor(passages_embeddings["dense_vecs"], dtype=torch.float, device=device)
    dense_scores = q_dense @ p_dense.T

    # Sparse: lexical matching scores computed by the model helper, then moved
    # to the same device and dtype as the other score matrices.
    sparse_scores_np = model.compute_lexical_matching_score(
        queries_embeddings["lexical_weights"],
        passages_embeddings["lexical_weights"]
    )

    sparse_scores = torch.tensor(sparse_scores_np, dtype=torch.float, device=device)

    # ColBERT: pad the per-token embeddings to a common length, then score.
    query_colbert_vecs = pad_colbert_vecs(queries_embeddings["colbert_vecs"], device)
    passage_colbert_vecs = pad_colbert_vecs(passages_embeddings["colbert_vecs"], device)
    colbert_scores = compute_colbert_scores(query_colbert_vecs, passage_colbert_vecs)

    hybrid_scores = hybrid_dbfs_ensemble_simple_linear_combination(dense_scores, sparse_scores, colbert_scores)

    print("Dense score:\n", dense_scores[:2, :2])
    print("Sparse score:\n", sparse_scores[:2, :2])
    print("ColBERT score:\n", colbert_scores[:2, :2])
    print("Hybrid DBSF Ensemble score:\n", hybrid_scores[:2, :2])


if __name__ == '__main__':
    # Run the demo, then print the reference values for side-by-side comparison.
    test_m3_single_device()

    separator = "--------------------------------"
    reference_lines = (
        "Expected Vector Scores",
        separator,
        "Dense score:",
        " [[0.626  0.3477]\n [0.3496 0.678 ]]",
        "Sparse score:",
        " [[0.19554901 0.00880432]\n [0.         0.18036556]]",
        "ColBERT score:",
        "[[5.8061, 3.1195] \n [5.6822, 4.6513]]",
        "Hybrid DBSF Ensemble score:",
        "[[0.9822, 0.5125] \n [0.8127, 0.6958]]",
        separator,
    )
    for reference_line in reference_lines:
        print(reference_line)