-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtestvectordb.py
153 lines (128 loc) · 4.91 KB
/
testvectordb.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
"""
Roger Erismann 2024
PostgreSQL Vector Database Test Script
This script demonstrates the functionality of a PostgreSQL database with the
pgvector extension for handling vector similarity operations. It performs a series
of tests to validate the capabilities of vector storage, similarity search, and
basic database operations in a PostgreSQL environment.
Features:
1. **Create a Table with Vector Support**:
- Ensures the `vector` extension is enabled.
- Creates a table `embeddings` with a `VECTOR` column for storing 3-dimensional vectors.
2. **Insert Rows with Vector Data**:
- Populates the table with sample vectors and associated metadata.
3. **Retrieve All Rows**:
- Queries and displays all rows from the `embeddings` table.
4. **Perform Similarity Search**:
- Searches for the most similar vectors to a given query vector using the `<->` operator.
- Orders results by similarity and limits the output.
5. **Batch Similarity Search**:
- Performs similarity searches for multiple query vectors.
- Retrieves the most similar row for each query vector.
6. **Delete a Row**:
- Removes a specific row from the table based on its `id` and confirms the deletion.
7. **Clean Up**:
- Drops the `embeddings` table to clean up the database.
Database Configuration:
- The script connects to a PostgreSQL database using the `DB_CONFIG` dictionary, which
contains host, port, user, password, and database name.
Usage:
1. Set up a PostgreSQL database with the pgvector extension.
- Ensure the database `vectordb` is created and accessible.
- Run the PostgreSQL container on port `5433` as per the configuration.
2. Run the script:
love what you do
"""
import psycopg2
from psycopg2.extras import RealDictCursor
# Database configuration
DB_CONFIG = {
"host": "localhost",
"port": 5433,
"user": "hammerdirt",
"password": "hammerdirt",
"dbname": "vectordb"
}
def test_pgvector():
try:
# Connect to the database
conn = psycopg2.connect(**DB_CONFIG)
cursor = conn.cursor(cursor_factory=RealDictCursor)
# 1. Create a Table with a VECTOR Column
cursor.execute("CREATE EXTENSION IF NOT EXISTS vector;")
cursor.execute("""
CREATE TABLE IF NOT EXISTS embeddings (
id SERIAL PRIMARY KEY,
name VARCHAR(100),
embedding VECTOR(3)
);
""")
conn.commit()
print("Table 'embeddings' created successfully.")
# 2. Insert Rows with Embeddings
cursor.executemany(
"INSERT INTO embeddings (name, embedding) VALUES (%s, %s);",
[
("Vector 1", [0.1, 0.2, 0.3]),
("Vector 2", [0.4, 0.5, 0.6]),
("Vector 3", [0.7, 0.8, 0.9]),
("Vector 4", [1.0, 1.1, 1.2])
]
)
conn.commit()
print("Inserted rows into 'embeddings'.")
# 3. Query All Rows
cursor.execute("SELECT * FROM embeddings;")
rows = cursor.fetchall()
print("All Rows:")
for row in rows:
print(row)
# 4. Perform a Similarity Search
query_vector = [0.1, 0.2, 0.3]
cursor.execute("""
SELECT *, embedding <-> %s::vector AS distance
FROM embeddings
ORDER BY embedding <-> %s::vector
LIMIT 3;
""", (query_vector, query_vector))
similar_rows = cursor.fetchall()
print("Similarity Search Results:")
for row in similar_rows:
print(row)
# 5. Test: Find the Most Similar Vector to a Given Array of Vectors
query_vectors = [
[0.05, 0.1, 0.2],
[0.6, 0.7, 0.8],
[1.2, 1.3, 1.4]
]
most_similar_results = []
for query_vector in query_vectors:
cursor.execute("""
SELECT *, embedding <-> %s::vector AS distance
FROM embeddings
ORDER BY embedding <-> %s::vector
LIMIT 1;
""", (query_vector, query_vector))
result = cursor.fetchone()
most_similar_results.append({"query_vector": query_vector, "most_similar": result})
print("Most Similar Vectors for Given Array of Query Vectors:")
for result in most_similar_results:
print(f"Query Vector: {result['query_vector']}, Most Similar: {result['most_similar']}")
# 6. Delete a Row
cursor.execute("DELETE FROM embeddings WHERE id = %s RETURNING *;", (2,))
deleted_row = cursor.fetchone()
conn.commit()
print(f"Deleted Row: {deleted_row}")
# 7. Drop the Table
cursor.execute("DROP TABLE IF EXISTS embeddings;")
conn.commit()
print("Table 'embeddings' dropped successfully.")
except Exception as e:
print(f"Error: {e}")
finally:
if conn:
cursor.close()
conn.close()
print("Database connection closed.")
if __name__ == "__main__":
test_pgvector()