Vector Search

Copy-paste examples of using & searching vectors in SQL & Python

Creating a Table

A vector column can optionally be configured to normalize the vector data inserted into it, giving each vector a magnitude (L2 norm) of 1. This can improve the performance of some vector operations with minimal overhead.

Vector Column without Normalization

SQL
1
2
3
4
5
CREATE OR REPLACE TABLE example.vs
(
    name VARCHAR(16) NOT NULL,
    embedding VECTOR(3) NOT NULL
)
Python (native API)
1
2
3
4
5
6
7
table_name = 'example.vs'
columns = [
    ['name', 'string', 'char16'],
    ['embedding', 'bytes', 'vector(3)']
]

gpudb.GPUdbTable(columns, table_name, db = kinetica)
Python (from DataFrame)
1
2
table_name = 'example.vs'
vs = gpudb.GPUdbTable.from_df(df, kinetica, table_name, clear_table = True)

Vector Column with Normalization

SQL
1
2
3
4
5
CREATE OR REPLACE TABLE example.vs_norm
(
    name VARCHAR(16) NOT NULL,
    embedding VECTOR(3, NORMALIZE) NOT NULL
)
Python (native API)
1
2
3
4
5
6
7
table_name = 'example.vs_norm'
columns = [
    ['name', 'string', 'char16'],
    ['embedding', 'bytes', 'vector(3)', 'normalize']
]

vs = gpudb.GPUdbTable(columns, table_name, db = kinetica)
Python (from DataFrame)
1
2
3
4
5
6
table_name = 'example.vs_norm'

vs = gpudb.GPUdbTable.from_df(
        df, kinetica, table_name, clear_table = True,
        column_types = { 'embedding': [ 'vector(3)', 'normalize' ] }
)

Inserting Vector Data

SQL
1
2
3
4
5
6
7
INSERT INTO example.vs
VALUES
    ('fun','[-0.23764,0.43119,-0.72154]'),
    ('play','[-0.73571,0.19937,-0.89408]'),
    ('food','[0.47222,-0.44545,-0.51833]'),
    ('money','[0.59784,-0.057026,0.97746]'),
    ('work','[0.51359,0.19695,-0.51944]')
Python (as strings)
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
embs = [
    ['fun', '[-0.23764,0.43119,-0.72154]'],
    ['play', '[-0.73571,0.19937,-0.89408]'],
    ['food', '[0.47222,-0.44545,-0.51833]'],
    ['money', '[0.59784,-0.057026,0.97746]'],
    ['work', '[0.51359,0.19695,-0.51944]']
]

vs = gpudb.GPUdbTable(name = 'example.vs', db = kinetica)

vs.insert_records(embs)
Python (as native types)
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
embs = [
    ['fun', [-0.23764,0.43119,-0.72154]],
    ['play', [-0.73571,0.19937,-0.89408]],
    ['food', [0.47222,-0.44545,-0.51833]],
    ['money', [0.59784,-0.057026,0.97746]],
    ['work', [0.51359,0.19695,-0.51944]]
]

vs = gpudb.GPUdbTable(name = 'example.vs', db = kinetica)

vs.insert_records(embs)
Python (as DataFrame)
1
2
3
vs = gpudb.GPUdbTable(name = 'example.vs', db = kinetica)

vs.insert_df(df, batch_size = 10000)

Retrieving Vector Data

SQL
1
2
SELECT name, embedding
FROM example.vs
Python (w/o native type conversions)
1
2
3
4
5
6
7
8
9
vs = gpudb.GPUdbTable(name = 'example.vs', db = kinetica)

records = vs.get_records()

dim = 3

print('%-16s %s' % ('Name', 'Embedding'))
for record in records:
    print('%-16s %s' % (record[0], list(struct.unpack('%df'%dim, record[1]))))
Python (w/ native type conversions)
1
2
3
4
5
6
7
vs = gpudb.GPUdbTable(name = 'example.vs', db = kinetica, convert_special_types_on_retrieval = True)

records = vs.get_records(encoding = 'json')

print('%-16s %s' % ('Name', 'Embedding'))
for record in records:
    print('%-16s %s' % (record["name"], record["embedding"]))
Python (as DataFrame)
1
2
3
4
5
vs = gpudb.GPUdbTable(name = 'example.vs', db = kinetica)

df = vs.to_df()

print(df)

Indexing Vector Data

CAGRA Vector Index

SQL
1
2
ALTER TABLE example.employee
ADD CAGRA INDEX (profile)
Python
1
2
3
4
5
6
7
8
retobj = kinetica.alter_table(
    table_name = "example.employee",
    action = "create_index",
    value = "profile",
    options = {
            "index_type": "cagra"
    }
)

HNSW Vector Index

SQL
1
2
ALTER TABLE example.employee
ADD HNSW INDEX (profile)
Python
1
2
3
4
5
6
7
8
retobj = kinetica.alter_table(
    table_name = "example.employee",
    action = "create_index",
    value = "profile",
    options = {
            "index_type": "hnsw"
    }
)

Searching Vector Data

Vector Search by Operator

SQL Literal
1
2
3
SELECT TOP 5 name, embedding <-> VECTOR('[-0.23764,0.43119,-0.72154]', 3) as distance 
FROM example.vs
ORDER BY distance
SQL Lookup
1
2
3
4
5
6
SELECT TOP 5 name, embedding <-> search_embedding as distance 
FROM
    example.vs,
    (SELECT embedding AS search_embedding FROM example.vs WHERE name = 'fun')
WHERE name <> 'fun'
ORDER BY distance
Python Literal
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
sql = """
    SELECT TOP 5 name, embedding <-> VECTOR('[-0.23764,0.43119,-0.72154]', 3) as distance 
    FROM example.vs
    ORDER BY distance
"""

with gpudb.GPUdbSqlIterator(kinetica, sql) as records:
    
    print('%-16s %s' % ('Name', 'Distance'))
    for record in records:
        print('%-16s %f' % (record[0], record[1]))
Python Lookup w/ SQL
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
sql = """
    SELECT TOP 5 name, embedding <-> search_embedding as distance 
    FROM
        example.vs,
        (SELECT embedding AS search_embedding FROM example.vs WHERE name = 'fun')
    WHERE name <> 'fun'
    ORDER BY distance
"""

with gpudb.GPUdbSqlIterator(kinetica, sql) as records:
    
    print('%-16s %s' % ('Name', 'Distance'))
    for record in records:
        print('%-16s %f' % (record[0], record[1]))
Python Literal to DataFrame
1
2
3
4
5
6
7
8
9
sql = """
    SELECT TOP 5 name, embedding <-> VECTOR('[-0.23764,0.43119,-0.72154]', 3) as distance 
    FROM example.vs
    ORDER BY distance
"""

df = kinetica.to_df(sql)

print(df)
Python Lookup w/ SQL to DataFrame
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
sql = """
    SELECT TOP 5 name, embedding <-> search_embedding as distance 
    FROM
        example.vs,
        (SELECT embedding AS search_embedding FROM example.vs WHERE name = 'fun')
    WHERE name <> 'fun'
    ORDER BY distance
"""

df = kinetica.to_df(sql)

print(df)

Vector Search by Function

SQL Literal
1
2
3
SELECT TOP 5 name, L2_DISTANCE(embedding,'[-0.23764,0.43119,-0.72154]') as distance 
FROM example.vs
ORDER BY distance
SQL Lookup
1
2
3
4
5
6
SELECT TOP 5 name, L2_DISTANCE(embedding, search_embedding) as distance 
FROM
    example.vs,
    (SELECT embedding AS search_embedding FROM example.vs WHERE name = 'fun')
WHERE name <> 'fun'
ORDER BY distance
Python Literal
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
vs = gpudb.GPUdbTable(name = 'example.vs', db = kinetica)

records = vs.get_records_by_column(
        ["name", "L2_DISTANCE(embedding,'[-0.23764,0.43119,-0.72154]') as distance"],
        limit = 5,
        options = {"sort_by": "distance"}
)

print('%-16s %s' % ('Name', 'Distance'))
for record in zip(records['name'], records['distance']):
    print('%-16s %f' % (record[0], record[1]))
Python Lookup w/ SQL
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
sql = """
    SELECT TOP 5 name, L2_DISTANCE(embedding, search_embedding) as distance 
    FROM
        example.vs,
        (SELECT embedding AS search_embedding FROM example.vs WHERE name = 'fun')
    WHERE name <> 'fun'
    ORDER BY distance
"""

with gpudb.GPUdbSqlIterator(kinetica, sql) as records:
    
    print('%-16s %s' % ('Name', 'Distance'))
    for record in records:
        print('%-16s %f' % (record[0], record[1]))
Python Literal to DataFrame
1
2
3
4
5
6
7
8
9
sql = """
    SELECT TOP 5 name, L2_DISTANCE(embedding,'[-0.23764,0.43119,-0.72154]') as distance
    FROM example.vs
    ORDER BY distance
"""

df = kinetica.to_df(sql)

print(df)
Python Lookup w/ SQL to DataFrame
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
sql = """
    SELECT TOP 5 name, L2_DISTANCE(embedding, search_embedding) as distance 
    FROM
        example.vs,
        (SELECT embedding AS search_embedding FROM example.vs WHERE name = 'fun')
    WHERE name <> 'fun'
    ORDER BY distance
"""

df = kinetica.to_df(sql)

print(df)