In this hands-on lecture, we will implement a vector database for an AI application. We’ll focus on how to convert unstructured data, such as text or images, into vectors, and store them in a vector database for efficient similarity-based search. By the end of this lecture, you will have a functional vector database that can be used to store, retrieve, and search for similar data points based on their vector representations.
import torch
from transformers import AutoTokenizer, AutoModel
import numpy as np
import faiss
import os
import pickle
from typing import List, Dict, Tuple, Any, Optional
import time
class VectorDatabase:
    """Small in-memory vector store: transformer embeddings plus a FAISS index.

    Texts are embedded by mean-pooling the encoder's last hidden states and
    stored in a flat L2 FAISS index. The raw texts and their metadata live in
    two parallel lists, so row ``i`` of the index corresponds to
    ``documents[i]`` and ``metadata[i]``.
    """

    def __init__(self, model_name: str = "distilbert-base-uncased"):
        """
        Initialize the vector database with a specific transformer model.

        Args:
            model_name: Name of the pre-trained model to use for embeddings
        """
        self.model_name = model_name
        print(f"Loading model: {model_name}")
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name)
        # The index is created lazily on the first add_documents() call,
        # because its dimension is taken from the first batch of embeddings.
        self.index = None
        self.documents = []
        self.metadata = []

    def generate_embeddings(self, texts: List[str]) -> np.ndarray:
        """
        Generate embeddings for a list of texts.

        Each embedding is the mean of the last hidden states over the text's
        real tokens only. Padding positions are excluded via the attention
        mask, so a text gets the same embedding whether it is encoded alone
        or in a batch with longer texts.

        Args:
            texts: List of strings to embed

        Returns:
            C-contiguous float32 NumPy array with one row per text
        """
        # Tokenize and encode the texts (inputs longer than 512 tokens are truncated)
        inputs = self.tokenizer(texts, padding=True, truncation=True,
                                max_length=512, return_tensors="pt")
        with torch.no_grad():
            hidden = self.model(**inputs).last_hidden_state
            # Broadcast the mask over the hidden dimension so padded positions
            # contribute nothing to the sum.
            mask = inputs["attention_mask"].unsqueeze(-1).to(hidden.dtype)
            # clamp guards against division by zero for an all-padding row.
            token_counts = mask.sum(dim=1).clamp(min=1)
            pooled = (hidden * mask).sum(dim=1) / token_counts
        # FAISS expects contiguous float32 input.
        return np.ascontiguousarray(pooled.numpy(), dtype=np.float32)

    def add_documents(self, documents: List[str], metadata: Optional[List[Dict[str, Any]]] = None) -> None:
        """
        Add documents to the database and build or update the index.

        Args:
            documents: List of document texts
            metadata: Optional list of metadata dictionaries for each document.
                The dictionaries are copied; the caller's objects are not modified.

        Raises:
            ValueError: If ``metadata`` is given and its length differs from
                ``documents``. Nothing is added in that case.
        """
        if not documents:
            print("No documents provided")
            return
        # Validate before touching any state so a bad call cannot leave the
        # index, documents and metadata out of step with each other.
        if metadata is None:
            metadata = [{} for _ in documents]
        elif len(metadata) != len(documents):
            raise ValueError("Metadata list must match documents list length")
        # Generate embeddings for the documents
        print(f"Generating embeddings for {len(documents)} documents...")
        start_time = time.time()
        embeddings = self.generate_embeddings(documents)
        print(f"Embeddings generated in {time.time() - start_time:.2f} seconds")
        # Create the index on first use
        if self.index is None:
            dimension = embeddings.shape[1]
            print(f"Creating new FAISS index with dimension {dimension}")
            self.index = faiss.IndexFlatL2(dimension)
        # Add embeddings to the index
        self.index.add(embeddings)
        # Store the documents and metadata
        start_idx = len(self.documents)
        self.documents.extend(documents)
        # Store a copy of each metadata dict, tagged with its position in the database
        self.metadata.extend(
            {**meta, 'document_idx': start_idx + i}
            for i, meta in enumerate(metadata)
        )
        print(f"Added {len(documents)} documents to index. Total documents: {len(self.documents)}")

    def search(self, query: str, k: int = 5) -> List[Dict[str, Any]]:
        """
        Search for similar documents based on a query string.

        Args:
            query: Query string to search for
            k: Number of results to return; capped at the number of stored
                documents. A non-positive value yields an empty list.

        Returns:
            List of dictionaries with the keys ``rank``, ``document_idx``,
            ``distance``, ``text`` and ``metadata``, in the order FAISS
            returned them
        """
        if self.index is None or not self.documents or k <= 0:
            return []
        # Generate embedding for the query
        query_embedding = self.generate_embeddings([query])
        # Ensure k is not larger than the number of documents
        k = min(k, len(self.documents))
        distances, indices = self.index.search(query_embedding, k)
        # Format results
        results = []
        for idx, distance in zip(indices[0], distances[0]):
            position = int(idx)
            # FAISS uses -1 for "no neighbour found"; indexing a list with it
            # would silently return the last document.
            if position < 0:
                continue
            results.append({
                'rank': len(results) + 1,
                'document_idx': position,
                'distance': float(distance),
                'text': self.documents[position],
                'metadata': self.metadata[position]
            })
        return results

    def save(self, directory: str) -> None:
        """
        Save the vector database to disk.

        Writes ``index.faiss`` (only if an index exists), ``documents.pkl``,
        ``metadata.pkl`` and ``model_name.txt`` into ``directory``.

        Args:
            directory: Directory to save the database; created if missing
        """
        os.makedirs(directory, exist_ok=True)
        # Save the index
        if self.index is not None:
            faiss.write_index(self.index, os.path.join(directory, "index.faiss"))
        # Save documents and metadata
        with open(os.path.join(directory, "documents.pkl"), 'wb') as f:
            pickle.dump(self.documents, f)
        with open(os.path.join(directory, "metadata.pkl"), 'wb') as f:
            pickle.dump(self.metadata, f)
        # Save model name
        with open(os.path.join(directory, "model_name.txt"), 'w', encoding="utf-8") as f:
            f.write(self.model_name)
        print(f"Vector database saved to {directory}")

    @classmethod
    def load(cls, directory: str) -> 'VectorDatabase':
        """
        Load a vector database from disk.

        Args:
            directory: Directory containing the saved database

        Returns:
            Loaded VectorDatabase instance
        """
        # Load model name
        with open(os.path.join(directory, "model_name.txt"), 'r', encoding="utf-8") as f:
            model_name = f.read().strip()
        # Create instance with the same model
        db = cls(model_name)
        # SECURITY: pickle.load can execute arbitrary code while deserializing.
        # Only load directories written by save() from a trusted source.
        with open(os.path.join(directory, "documents.pkl"), 'rb') as f:
            db.documents = pickle.load(f)
        with open(os.path.join(directory, "metadata.pkl"), 'rb') as f:
            db.metadata = pickle.load(f)
        # Load the index (absent if the database was saved before any documents were added)
        index_path = os.path.join(directory, "index.faiss")
        if os.path.exists(index_path):
            db.index = faiss.read_index(index_path)
        print(f"Loaded vector database from {directory} with {len(db.documents)} documents")
        return db
def batch_process_documents(documents: List[str], batch_size: int = 8,
                            db: Optional["VectorDatabase"] = None) -> np.ndarray:
    """
    Process a large number of documents in batches to avoid memory issues.

    Args:
        documents: List of document texts
        batch_size: Number of documents to process at once; must be >= 1
        db: Optional existing VectorDatabase whose model is used for embedding.
            When omitted, a new VectorDatabase with the default model is
            created, which loads the model again.

    Returns:
        A single 2-D NumPy array with one embedding row per document, in input
        order. For an empty ``documents`` list an empty array of shape (0, 0)
        is returned without loading any model.

    Raises:
        ValueError: If ``batch_size`` is less than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")
    if not documents:
        # np.vstack([]) would raise; the embedding width is unknown without a model.
        return np.empty((0, 0), dtype=np.float32)
    if db is None:
        db = VectorDatabase()
    total_batches = (len(documents) - 1) // batch_size + 1
    all_embeddings = []
    for batch_number, start in enumerate(range(0, len(documents), batch_size), start=1):
        print(f"Processing batch {batch_number}/{total_batches}")
        batch = documents[start:start + batch_size]
        all_embeddings.append(db.generate_embeddings(batch))
    return np.vstack(all_embeddings)
def main():
    """Demonstrate the VectorDatabase end to end.

    Indexes ten short quotes with their metadata, runs three example
    queries, saves the database to ``vector_db_example``, reloads it and
    runs one more query against the reloaded copy.
    """
    store = VectorDatabase()

    # Each entry pairs a quote with the metadata stored alongside it.
    corpus = [
        ("The quick brown fox jumps over the lazy dog.",
         {"source": "proverb", "category": "animals"}),
        ("A journey of a thousand miles begins with a single step.",
         {"source": "Lao Tzu", "category": "philosophy"}),
        ("To be or not to be, that is the question.",
         {"source": "Shakespeare", "category": "literature"}),
        ("All that glitters is not gold.",
         {"source": "proverb", "category": "wisdom"}),
        ("The early bird catches the worm.",
         {"source": "proverb", "category": "animals"}),
        ("Actions speak louder than words.",
         {"source": "proverb", "category": "behavior"}),
        ("Don't judge a book by its cover.",
         {"source": "proverb", "category": "wisdom"}),
        ("The pen is mightier than the sword.",
         {"source": "Edward Bulwer-Lytton", "category": "literature"}),
        ("Fortune favors the bold.",
         {"source": "proverb", "category": "courage"}),
        ("Knowledge is power.",
         {"source": "Francis Bacon", "category": "wisdom"}),
    ]
    texts = [text for text, _ in corpus]
    details = [info for _, info in corpus]
    store.add_documents(texts, details)

    questions = (
        "What is the meaning of life?",
        "Tell me about courage and boldness",
        "I need some wisdom about appearances",
    )
    for question in questions:
        print(f"\nQuery: {question}")
        hits = store.search(question)
        print("Search Results:")
        for hit in hits:
            info = hit['metadata']
            source = info.get('source', 'Unknown')
            category = info.get('category', 'Uncategorized')
            print(f"{hit['rank']}. (Distance: {hit['distance']:.4f}) '{hit['text']}'")
            print(f" Source: {source}, Category: {category}")

    # Round-trip through disk to show that persistence works.
    target_dir = "vector_db_example"
    store.save(target_dir)
    restored = VectorDatabase.load(target_dir)

    print("\nTesting loaded database:")
    hits = restored.search("Tell me about wisdom")
    print("Search Results:")
    for hit in hits:
        print(f"{hit['rank']}. (Distance: {hit['distance']:.4f}) '{hit['text']}'")
if __name__ == "__main__":
    main()

Running the script produces the following output:

Loading model: distilbert-base-uncased
Generating embeddings for 10 documents...
Embeddings generated in 0.44 seconds
Creating new FAISS index with dimension 768
Added 10 documents to index. Total documents: 10
Query: What is the meaning of life?
Search Results:
1. (Distance: 28.9064) 'Knowledge is power.'
Source: Francis Bacon, Category: wisdom
2. (Distance: 29.8704) 'To be or not to be, that is the question.'
Source: Shakespeare, Category: literature
3. (Distance: 35.8834) 'Actions speak louder than words.'
Source: proverb, Category: behavior
4. (Distance: 37.3288) 'All that glitters is not gold.'
Source: proverb, Category: wisdom
5. (Distance: 39.9361) 'A journey of a thousand miles begins with a single step.'
Source: Lao Tzu, Category: philosophy
Query: Tell me about courage and boldness
Search Results:
1. (Distance: 29.0239) 'Knowledge is power.'
Source: Francis Bacon, Category: wisdom
2. (Distance: 29.1098) 'Actions speak louder than words.'
Source: proverb, Category: behavior
3. (Distance: 32.2485) 'Fortune favors the bold.'
Source: proverb, Category: courage
4. (Distance: 33.7360) 'The pen is mightier than the sword.'
Source: Edward Bulwer-Lytton, Category: literature
5. (Distance: 34.9861) 'Don't judge a book by its cover.'
Source: proverb, Category: wisdom
Query: I need some wisdom about appearances
Search Results:
1. (Distance: 26.9088) 'Don't judge a book by its cover.'
Source: proverb, Category: wisdom
2. (Distance: 28.1751) 'Actions speak louder than words.'
Source: proverb, Category: behavior
3. (Distance: 29.9099) 'Knowledge is power.'
Source: Francis Bacon, Category: wisdom
4. (Distance: 31.9937) 'Fortune favors the bold.'
Source: proverb, Category: courage
5. (Distance: 34.1386) 'The pen is mightier than the sword.'
Source: Edward Bulwer-Lytton, Category: literature
Vector database saved to vector_db_example
Loading model: distilbert-base-uncased
Loaded vector database from vector_db_example with 10 documents
Testing loaded database:
Search Results:
1. (Distance: 30.5364) 'Knowledge is power.'
2. (Distance: 32.7921) 'Don't judge a book by its cover.'
3. (Distance: 33.9872) 'Actions speak louder than words.'
4. (Distance: 35.6948) 'Fortune favors the bold.'
5. (Distance: 35.7062) 'All that glitters is not gold.'
When the script starts, it loads the distilbert-base-uncased model and tokenizer from Hugging Face. This model transforms text into numerical vectors (embeddings) that capture semantic meaning.
The 10 input documents (proverbs and quotes) are tokenized and passed through DistilBERT. For each document, the model returns a sequence of hidden states — the script takes the mean of these hidden states to create a single 768-dimensional embedding per document.
This embedding represents the “meaning” of the text numerically.
A FAISS index is created to enable fast nearest-neighbor search using L2 (Euclidean) distance between embeddings. This means documents with similar embeddings will be closer together in this vector space.
The 10 document embeddings are added to this index.
Each document is paired with metadata (e.g., author and category). This info is stored in memory and later used to enrich search result outputs.
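You run three queries:
- "What is the meaning of life?"
- "Tell me about courage and boldness"
- "I need some wisdom about appearances"
For each query:
- The query text is embedded with the same DistilBERT model used for the documents.
- The FAISS index returns the 5 nearest document embeddings (the default k) together with their distances.
- Each result is printed with its rank, distance, text, and the stored source and category metadata.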
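A smaller distance means the document's embedding is closer to the query's embedding, so the results are listed from most to least similar. The values in this run (roughly 27–40) are squared L2 distances as reported by the flat L2 index; they have no absolute meaning, so compare them only against each other within one query. The ranking is also not perfect: for "Tell me about courage and boldness", the quote "Fortune favors the bold." comes third, behind two less related quotes, which shows the limits of mean-pooled embeddings from a general-purpose model that was not trained for sentence similarity.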
The FAISS index, documents, metadata, and model name are saved to the disk in a folder named vector_db_example.
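The database is reloaded using the saved files:
- The model name is read from model_name.txt and the same model and tokenizer are loaded again.
- The documents and metadata are restored from documents.pkl and metadata.pkl.
- The FAISS index is read back from index.faiss.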
A test query is run: “Tell me about wisdom”
The results again show “Knowledge is power” and similar wisdom-related quotes, proving that the loaded index works just like the original.