Skip to content

Commit 39e30ea

Browse files
committed
add 1bit vector type
1 parent 83d029d commit 39e30ea

8 files changed

Lines changed: 190 additions & 42 deletions

File tree

libsql-sqlite3/Makefile.in

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -195,7 +195,7 @@ LIBOBJS0 = alter.lo analyze.lo attach.lo auth.lo \
195195
sqlite3session.lo select.lo sqlite3rbu.lo status.lo stmt.lo \
196196
table.lo threads.lo tokenize.lo treeview.lo trigger.lo \
197197
update.lo userauth.lo upsert.lo util.lo vacuum.lo \
198-
vector.lo vectorfloat32.lo vectorfloat64.lo \
198+
vector.lo vectorfloat32.lo vectorfloat64.lo vector1bit.lo \
199199
vectorIndex.lo vectordiskann.lo vectorvtab.lo \
200200
vdbe.lo vdbeapi.lo vdbeaux.lo vdbeblob.lo vdbemem.lo vdbesort.lo \
201201
vdbetrace.lo vdbevtab.lo \
@@ -302,6 +302,7 @@ SRC = \
302302
$(TOP)/src/util.c \
303303
$(TOP)/src/vacuum.c \
304304
$(TOP)/src/vector.c \
305+
$(TOP)/src/vector1bit.c \
305306
$(TOP)/src/vectorInt.h \
306307
$(TOP)/src/vectorfloat32.c \
307308
$(TOP)/src/vectorfloat64.c \
@@ -1138,6 +1139,9 @@ vacuum.lo: $(TOP)/src/vacuum.c $(HDR)
11381139
vector.lo: $(TOP)/src/vector.c $(HDR)
11391140
$(LTCOMPILE) $(TEMP_STORE) -c $(TOP)/src/vector.c
11401141

1142+
vector1bit.lo: $(TOP)/src/vector1bit.c $(HDR)
1143+
$(LTCOMPILE) $(TEMP_STORE) -c $(TOP)/src/vector1bit.c
1144+
11411145
vectorfloat32.lo: $(TOP)/src/vectorfloat32.c $(HDR)
11421146
$(LTCOMPILE) $(TEMP_STORE) -c $(TOP)/src/vectorfloat32.c
11431147

libsql-sqlite3/src/vector.c

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,8 @@ size_t vectorDataSize(VectorType type, VectorDims dims){
4141
return dims * sizeof(float);
4242
case VECTOR_TYPE_FLOAT64:
4343
return dims * sizeof(double);
44+
case VECTOR_TYPE_1BIT:
45+
return (dims + 7) / 8;
4446
default:
4547
assert(0);
4648
}
@@ -111,6 +113,8 @@ float vectorDistanceCos(const Vector *pVector1, const Vector *pVector2){
111113
return vectorF32DistanceCos(pVector1, pVector2);
112114
case VECTOR_TYPE_FLOAT64:
113115
return vectorF64DistanceCos(pVector1, pVector2);
116+
case VECTOR_TYPE_1BIT:
117+
return vector1BitDistanceHamming(pVector1, pVector2);
114118
default:
115119
assert(0);
116120
}
@@ -381,6 +385,9 @@ void vectorDump(const Vector *pVector){
381385
case VECTOR_TYPE_FLOAT64:
382386
vectorF64Dump(pVector);
383387
break;
388+
case VECTOR_TYPE_1BIT:
389+
vector1BitDump(pVector);
390+
break;
384391
default:
385392
assert(0);
386393
}
@@ -451,6 +458,8 @@ size_t vectorSerializeToBlob(const Vector *pVector, unsigned char *pBlob, size_t
451458
return vectorF32SerializeToBlob(pVector, pBlob, nBlobSize);
452459
case VECTOR_TYPE_FLOAT64:
453460
return vectorF64SerializeToBlob(pVector, pBlob, nBlobSize);
461+
case VECTOR_TYPE_1BIT:
462+
return vector1BitSerializeToBlob(pVector, pBlob, nBlobSize);
454463
default:
455464
assert(0);
456465
}

libsql-sqlite3/src/vector1bit.c

Lines changed: 110 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,110 @@
1+
/*
2+
** 2024-07-04
3+
**
4+
** Copyright 2024 the libSQL authors
5+
**
6+
** Permission is hereby granted, free of charge, to any person obtaining a copy of
7+
** this software and associated documentation files (the "Software"), to deal in
8+
** the Software without restriction, including without limitation the rights to
9+
** use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
10+
** the Software, and to permit persons to whom the Software is furnished to do so,
11+
** subject to the following conditions:
12+
**
13+
** The above copyright notice and this permission notice shall be included in all
14+
** copies or substantial portions of the Software.
15+
**
16+
** THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17+
** IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
18+
** FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
19+
** COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
20+
** IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
21+
** CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
22+
**
23+
******************************************************************************
24+
**
25+
** 1-bit vector format utilities.
26+
*/
27+
#ifndef SQLITE_OMIT_VECTOR
28+
#include "sqliteInt.h"
29+
30+
#include "vectorInt.h"
31+
32+
#include <math.h>
33+
34+
/**************************************************************************
35+
** Utility routines for debugging
36+
**************************************************************************/
37+
38+
void vector1BitDump(const Vector *pVec){
39+
u8 *elems = pVec->data;
40+
unsigned i;
41+
42+
assert( pVec->type == VECTOR_TYPE_1BIT );
43+
44+
for(i = 0; i < pVec->dims; i++){
45+
printf("%d ", ((elems[i / 8] >> (i & 7)) & 1) ? +1 : -1);
46+
}
47+
printf("\n");
48+
}
49+
50+
/**************************************************************************
51+
** Utility routines for vector serialization and deserialization
52+
**************************************************************************/
53+
54+
size_t vector1BitSerializeToBlob(
55+
const Vector *pVector,
56+
unsigned char *pBlob,
57+
size_t nBlobSize
58+
){
59+
float *elems = pVector->data;
60+
unsigned char *pPtr = pBlob;
61+
size_t len = 0;
62+
unsigned i;
63+
64+
assert( pVector->type == VECTOR_TYPE_1BIT );
65+
assert( pVector->dims <= MAX_VECTOR_SZ );
66+
assert( nBlobSize >= (pVector->dims + 7) / 8 );
67+
68+
for(i = 0; i < pVector->dims; i++){
69+
elems[i] = pPtr[i];
70+
}
71+
return (pVector->dims + 7) / 8;
72+
}
73+
74+
/*
** BitsCount[b] is the number of set bits (population count) in the byte
** value b.  The table is read-only and was generated with:
**
**   [sum(map(int, bin(i)[2:])) for i in range(256)]
*/
static const int BitsCount[256] = {
  0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
  1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
  1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
  2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
  1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
  2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
  2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
  3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
  1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
  2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
  2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
  3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
  2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
  3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
  3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
  4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8,
};
93+
94+
int vector1BitDistanceHamming(const Vector *v1, const Vector *v2){
95+
int sum = 0;
96+
u8 *e1 = v1->data;
97+
u8 *e2 = v2->data;
98+
int i;
99+
100+
assert( v1->dims == v2->dims );
101+
assert( v1->type == VECTOR_TYPE_1BIT );
102+
assert( v2->type == VECTOR_TYPE_1BIT );
103+
104+
for(i = 0; i < v1->dims; i++){
105+
sum += BitsCount[e1[i]&e2[i]];
106+
}
107+
return sum;
108+
}
109+
110+
#endif /* !defined(SQLITE_OMIT_VECTOR) */

libsql-sqlite3/src/vectorIndex.c

Lines changed: 15 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -396,13 +396,14 @@ struct VectorParamName {
396396
};
397397

398398
static struct VectorParamName VECTOR_PARAM_NAMES[] = {
399-
{ "type", VECTOR_INDEX_TYPE_PARAM_ID, 0, "diskann", VECTOR_INDEX_TYPE_DISKANN },
400-
{ "metric", VECTOR_METRIC_TYPE_PARAM_ID, 0, "cosine", VECTOR_METRIC_TYPE_COS },
401-
{ "metric", VECTOR_METRIC_TYPE_PARAM_ID, 0, "l2", VECTOR_METRIC_TYPE_L2 },
402-
{ "alpha", VECTOR_PRUNING_ALPHA_PARAM_ID, 2, 0, 0 },
403-
{ "search_l", VECTOR_SEARCH_L_PARAM_ID, 1, 0, 0 },
404-
{ "insert_l", VECTOR_INSERT_L_PARAM_ID, 1, 0, 0 },
405-
{ "max_neighbors", VECTOR_MAX_NEIGHBORS_PARAM_ID, 1, 0, 0 },
399+
{ "type", VECTOR_INDEX_TYPE_PARAM_ID, 0, "diskann", VECTOR_INDEX_TYPE_DISKANN },
400+
{ "metric", VECTOR_METRIC_TYPE_PARAM_ID, 0, "cosine", VECTOR_METRIC_TYPE_COS },
401+
{ "metric", VECTOR_METRIC_TYPE_PARAM_ID, 0, "l2", VECTOR_METRIC_TYPE_L2 },
402+
{ "compress_neighbors", VECTOR_COMPRESS_NEIGHBORS_PARAM_ID, 0, "1bit", VECTOR_TYPE_1BIT }, /* own param id: must not alias the metric parameter */
403+
{ "alpha", VECTOR_PRUNING_ALPHA_PARAM_ID, 2, 0, 0 },
404+
{ "search_l", VECTOR_SEARCH_L_PARAM_ID, 1, 0, 0 },
405+
{ "insert_l", VECTOR_INSERT_L_PARAM_ID, 1, 0, 0 },
406+
{ "max_neighbors", VECTOR_MAX_NEIGHBORS_PARAM_ID, 1, 0, 0 },
406407
};
407408

408409
static int parseVectorIdxParam(const char *zParam, VectorIdxParams *pParams, const char **pErrMsg) {
@@ -802,7 +803,7 @@ int vectorIndexCreate(Parse *pParse, const Index *pIdx, const char *zDbSName, co
802803
int i, rc = SQLITE_OK;
803804
int dims, type;
804805
int hasLibsqlVectorIdxFn = 0, hasCollation = 0;
805-
const char *pzErrMsg;
806+
const char *pzErrMsg = NULL;
806807

807808
assert( zDbSName != NULL );
808809

@@ -914,9 +915,13 @@ int vectorIndexCreate(Parse *pParse, const Index *pIdx, const char *zDbSName, co
914915
sqlite3ErrorMsg(pParse, "vector index: unsupported for tables without ROWID and composite primary key");
915916
return CREATE_FAIL;
916917
}
917-
rc = diskAnnCreateIndex(db, zDbSName, pIdx->zName, &idxKey, &idxParams);
918+
rc = diskAnnCreateIndex(db, zDbSName, pIdx->zName, &idxKey, &idxParams, &pzErrMsg);
918919
if( rc != SQLITE_OK ){
919-
sqlite3ErrorMsg(pParse, "vector index: unable to initialize diskann");
920+
if( pzErrMsg != NULL ){
921+
sqlite3ErrorMsg(pParse, "vector index: unable to initialize diskann: %s", pzErrMsg);
922+
}else{
923+
sqlite3ErrorMsg(pParse, "vector index: unable to initialize diskann");
924+
}
920925
return CREATE_FAIL;
921926
}
922927
rc = insertIndexParameters(db, zDbSName, pIdx->zName, &idxParams);

libsql-sqlite3/src/vectorIndexInt.h

Lines changed: 22 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -100,43 +100,45 @@ typedef u8 MetricType;
100100
*/
101101

102102
/* format version which can help to upgrade vector on-disk format without breaking older version of the db */
103-
#define VECTOR_FORMAT_PARAM_ID 1
103+
#define VECTOR_FORMAT_PARAM_ID 1
104104
/*
105105
* 1 - initial version
106106
*/
107-
#define VECTOR_FORMAT_DEFAULT 1
107+
#define VECTOR_FORMAT_DEFAULT 1
108108

109109
/* type of the vector index */
110-
#define VECTOR_INDEX_TYPE_PARAM_ID 2
111-
#define VECTOR_INDEX_TYPE_DISKANN 1
110+
#define VECTOR_INDEX_TYPE_PARAM_ID 2
111+
#define VECTOR_INDEX_TYPE_DISKANN 1
112112

113113
/* type of the underlying vector for the vector index */
114-
#define VECTOR_TYPE_PARAM_ID 3
114+
#define VECTOR_TYPE_PARAM_ID 3
115115
/* dimension of the underlying vector for the vector index */
116-
#define VECTOR_DIM_PARAM_ID 4
116+
#define VECTOR_DIM_PARAM_ID 4
117117

118118
/* metric type used for comparing two vectors */
119-
#define VECTOR_METRIC_TYPE_PARAM_ID 5
120-
#define VECTOR_METRIC_TYPE_COS 1
121-
#define VECTOR_METRIC_TYPE_L2 2
119+
#define VECTOR_METRIC_TYPE_PARAM_ID 5
120+
#define VECTOR_METRIC_TYPE_COS 1
121+
#define VECTOR_METRIC_TYPE_L2 2
122122

123123
/* block size */
124-
#define VECTOR_BLOCK_SIZE_PARAM_ID 6
125-
#define VECTOR_BLOCK_SIZE_DEFAULT 128
124+
#define VECTOR_BLOCK_SIZE_PARAM_ID 6
125+
#define VECTOR_BLOCK_SIZE_DEFAULT 128
126126

127-
#define VECTOR_PRUNING_ALPHA_PARAM_ID 7
128-
#define VECTOR_PRUNING_ALPHA_DEFAULT 1.2
127+
#define VECTOR_PRUNING_ALPHA_PARAM_ID 7
128+
#define VECTOR_PRUNING_ALPHA_DEFAULT 1.2
129129

130-
#define VECTOR_INSERT_L_PARAM_ID 8
131-
#define VECTOR_INSERT_L_DEFAULT 70
130+
#define VECTOR_INSERT_L_PARAM_ID 8
131+
#define VECTOR_INSERT_L_DEFAULT 70
132132

133-
#define VECTOR_SEARCH_L_PARAM_ID 9
134-
#define VECTOR_SEARCH_L_DEFAULT 200
133+
#define VECTOR_SEARCH_L_PARAM_ID 9
134+
#define VECTOR_SEARCH_L_DEFAULT 200
135135

136-
#define VECTOR_MAX_NEIGHBORS_PARAM_ID 10
136+
#define VECTOR_MAX_NEIGHBORS_PARAM_ID 10
137+
138+
#define VECTOR_COMPRESS_NEIGHBORS_PARAM_ID 11
137139

138140
/* total amount of vector index parameters */
139-
#define VECTOR_PARAM_IDS_COUNT 9
141+
#define VECTOR_PARAM_IDS_COUNT 11
140142

141143
/*
142144
* Vector index parameters are stored in simple binary format (1 byte tag + 8 byte u64 integer / f64 float)
@@ -218,7 +220,7 @@ int vectorOutRowsPut(VectorOutRows *, int, int, const u64 *, sqlite3_value *);
218220
void vectorOutRowsGet(sqlite3_context *, const VectorOutRows *, int, int);
219221
void vectorOutRowsFree(sqlite3 *, VectorOutRows *);
220222

221-
int diskAnnCreateIndex(sqlite3 *, const char *, const char *, const VectorIdxKey *, VectorIdxParams *);
223+
int diskAnnCreateIndex(sqlite3 *, const char *, const char *, const VectorIdxKey *, VectorIdxParams *, const char **);
222224
int diskAnnClearIndex(sqlite3 *, const char *, const char *);
223225
int diskAnnDropIndex(sqlite3 *, const char *, const char *);
224226
int diskAnnOpenIndex(sqlite3 *, const char *, const char *, const VectorIdxParams *, DiskAnnIndex **);

libsql-sqlite3/src/vectorInt.h

Lines changed: 13 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ typedef u32 VectorDims;
2424
*/
2525
#define VECTOR_TYPE_FLOAT32 1
2626
#define VECTOR_TYPE_FLOAT64 2
27+
#define VECTOR_TYPE_1BIT 3
2728

2829
#define VECTOR_FLAGS_STATIC 1
2930

@@ -48,8 +49,9 @@ void vectorInit(Vector *, VectorType, VectorDims, void *);
4849
* Dumps vector on the console (used only for debugging)
4950
*/
5051
void vectorDump (const Vector *v);
51-
void vectorF32Dump(const Vector *v);
52-
void vectorF64Dump(const Vector *v);
52+
void vectorF32Dump (const Vector *v);
53+
void vectorF64Dump (const Vector *v);
54+
void vector1BitDump(const Vector *v);
5355

5456
/*
5557
* Converts vector to the text representation and write the result to the sqlite3_context
@@ -61,9 +63,10 @@ void vectorF64MarshalToText(sqlite3_context *, const Vector *);
6163
/*
6264
* Serializes vector to the blob in little-endian format according to the IEEE-754 standard
6365
*/
64-
size_t vectorSerializeToBlob (const Vector *, unsigned char *, size_t);
65-
size_t vectorF32SerializeToBlob(const Vector *, unsigned char *, size_t);
66-
size_t vectorF64SerializeToBlob(const Vector *, unsigned char *, size_t);
66+
size_t vectorSerializeToBlob (const Vector *, unsigned char *, size_t);
67+
size_t vectorF32SerializeToBlob (const Vector *, unsigned char *, size_t);
68+
size_t vectorF64SerializeToBlob (const Vector *, unsigned char *, size_t);
69+
size_t vector1BitSerializeToBlob(const Vector *, unsigned char *, size_t);
6770

6871
/*
6972
* Calculates cosine distance between two vectors (vector must have same type and same dimensions)
@@ -72,6 +75,11 @@ float vectorDistanceCos (const Vector *, const Vector *);
7275
float vectorF32DistanceCos (const Vector *, const Vector *);
7376
double vectorF64DistanceCos(const Vector *, const Vector *);
7477

78+
/*
79+
* Calculates hamming distance between two 1-bit vectors (vector must have same dimensions)
80+
*/
81+
int vector1BitDistanceHamming(const Vector *, const Vector *);
82+
7583
/*
7684
* Calculates L2 distance between two vectors (vector must have same type and same dimensions)
7785
*/

libsql-sqlite3/src/vectordiskann.c

Lines changed: 15 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -437,10 +437,11 @@ int diskAnnCreateIndex(
437437
const char *zDbSName,
438438
const char *zIdxName,
439439
const VectorIdxKey *pKey,
440-
VectorIdxParams *pParams
440+
VectorIdxParams *pParams,
441+
const char **pzErrMsg
441442
){
442443
int rc;
443-
int type, dims;
444+
int type, dims, metric, neighbours;
444445
u64 maxNeighborsParam, blockSizeBytes;
445446
char *zSql;
446447
char columnSqlDefs[VECTOR_INDEX_SQL_RENDER_LIMIT]; // definition of columns (e.g. index_key INTEGER BINARY, index_key1 TEXT, ...)
@@ -477,11 +478,19 @@ int diskAnnCreateIndex(
477478
if( vectorIdxParamsPutU64(pParams, VECTOR_BLOCK_SIZE_PARAM_ID, MAX(256, blockSizeBytes)) != 0 ){
478479
return SQLITE_ERROR;
479480
}
480-
if( vectorIdxParamsGetU64(pParams, VECTOR_METRIC_TYPE_PARAM_ID) == 0 ){
481-
if( vectorIdxParamsPutU64(pParams, VECTOR_METRIC_TYPE_PARAM_ID, VECTOR_METRIC_TYPE_COS) != 0 ){
481+
metric = vectorIdxParamsGetU64(pParams, VECTOR_METRIC_TYPE_PARAM_ID);
482+
if( metric == 0 ){
483+
metric = VECTOR_METRIC_TYPE_COS;
484+
if( vectorIdxParamsPutU64(pParams, VECTOR_METRIC_TYPE_PARAM_ID, metric) != 0 ){
482485
return SQLITE_ERROR;
483486
}
484487
}
488+
neighbours = vectorIdxParamsGetU64(pParams, VECTOR_COMPRESS_NEIGHBORS_PARAM_ID);
489+
if( neighbours == VECTOR_TYPE_1BIT && metric != VECTOR_METRIC_TYPE_COS ){
490+
*pzErrMsg = "1-bit compression available only for cosine metric";
491+
return SQLITE_ERROR;
492+
}
493+
485494
if( vectorIdxParamsGetF64(pParams, VECTOR_PRUNING_ALPHA_PARAM_ID) == 0 ){
486495
if( vectorIdxParamsPutF64(pParams, VECTOR_PRUNING_ALPHA_PARAM_ID, VECTOR_PRUNING_ALPHA_DEFAULT) != 0 ){
487496
return SQLITE_ERROR;
@@ -1544,8 +1553,8 @@ int diskAnnOpenIndex(
15441553
pIndex->nEdgeVectorType = pIndex->nNodeVectorType;
15451554
pIndex->nEdgeVectorSize = pIndex->nNodeVectorSize;
15461555
}else if( compressNeighbours == VECTOR_TYPE_1BIT ){
1547-
pIndex->nEdgeVectorType = VECTOR_TYPE_1BIT;
1548-
pIndex->nEdgeVectorSize = vectorDataSize(VECTOR_TYPE_1BIT, pIndex->nVectorDims);
1556+
pIndex->nEdgeVectorType = compressNeighbours;
1557+
pIndex->nEdgeVectorSize = vectorDataSize(compressNeighbours, pIndex->nVectorDims);
15491558
}else{
15501559
return SQLITE_ERROR;
15511560
}

libsql-sqlite3/tool/mksqlite3c.tcl

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -468,6 +468,7 @@ set flist {
468468

469469
json.c
470470
vector.c
471+
vector1bit.c
471472
vectordiskann.c
472473
vectorfloat32.c
473474
vectorfloat64.c

0 commit comments

Comments
 (0)