Login Sign Up

Hands-On – Implementing a Vector Database for AI

In this hands-on lecture, we will implement a vector database for an AI application. We’ll focus on how to convert unstructured data, such as text or images, into vectors, and store them in a vector database for efficient similarity-based search. By the end of this lecture, you will have a functional vector database that can be used to store, retrieve, and search for similar data points based on their vector representations.

import torch
from transformers import AutoTokenizer, AutoModel
import numpy as np
import faiss
import os
import pickle
from typing import List, Dict, Tuple, Any, Optional
import time


class VectorDatabase:
    """
    Minimal in-memory vector store backed by a FAISS flat L2 index.

    Texts are embedded with a Hugging Face transformer (mean-pooled last
    hidden states), the vectors are added to a ``faiss.IndexFlatL2``, and the
    original texts plus per-document metadata are kept in parallel Python
    lists whose positions match the row ids inside the FAISS index.
    """

    def __init__(self, model_name: str = "distilbert-base-uncased"):
        """
        Initialize the vector database with a specific transformer model.

        Args:
            model_name: Name of the pre-trained model to use for embeddings
        """
        self.model_name = model_name
        print(f"Loading model: {model_name}")
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name)
        # The index is created lazily on the first add_documents() call,
        # because the embedding dimension is only known once vectors exist.
        self.index = None
        # documents[i] and metadata[i] describe the vector stored at row i.
        self.documents = []
        self.metadata = []

    def generate_embeddings(self, texts: List[str]) -> np.ndarray:
        """
        Generate embeddings for a list of texts.

        Each text is embedded as the mean of its last-layer hidden states.
        Padding positions are excluded from the mean (using the tokenizer's
        attention mask) so that a text's embedding does not depend on the
        length of the other texts it happens to be batched with.

        Args:
            texts: List of strings to embed

        Returns:
            Float32 NumPy array with one row per input text
        """
        # Tokenize and encode the texts
        inputs = self.tokenizer(texts, padding=True, truncation=True,
                                max_length=512, return_tensors="pt")

        # Generate embeddings
        with torch.no_grad():
            outputs = self.model(**inputs)
            hidden = outputs.last_hidden_state
            mask = inputs.get("attention_mask")
            if mask is None:
                # No mask available: fall back to a plain mean over tokens.
                pooled = hidden.mean(dim=1)
            else:
                # Zero out padded positions, then divide by the number of
                # real tokens per row (clamped to avoid division by zero).
                weights = mask.unsqueeze(-1).to(hidden.dtype)
                token_counts = weights.sum(dim=1).clamp(min=1.0)
                pooled = (hidden * weights).sum(dim=1) / token_counts
            embeddings = pooled.numpy()

        # FAISS expects C-contiguous float32 input.
        return np.ascontiguousarray(embeddings, dtype=np.float32)

    def add_documents(self, documents: List[str], metadata: Optional[List[Dict[str, Any]]] = None) -> None:
        """
        Add documents to the database and build or update the index.

        The metadata dictionaries are copied before a ``document_idx`` key is
        added, so the caller's dictionaries are left untouched.

        Args:
            documents: List of document texts
            metadata: Optional list of metadata dictionaries for each document

        Raises:
            ValueError: If ``metadata`` is given but its length differs from
                ``documents``. Nothing is added in that case.
        """
        if not documents:
            print("No documents provided")
            return

        # Validate before doing any work so a bad call cannot leave the
        # index, documents and metadata out of sync with each other.
        if metadata is None:
            metadata = [{} for _ in documents]
        elif len(metadata) != len(documents):
            raise ValueError("Metadata list must match documents list length")

        # Generate embeddings for the documents
        print(f"Generating embeddings for {len(documents)} documents...")
        start_time = time.time()
        embeddings = self.generate_embeddings(documents)
        print(f"Embeddings generated in {time.time() - start_time:.2f} seconds")

        # Create or update the index
        if self.index is None:
            dimension = embeddings.shape[1]
            print(f"Creating new FAISS index with dimension {dimension}")
            self.index = faiss.IndexFlatL2(dimension)

        # Add embeddings to the index
        self.index.add(embeddings)

        # Store the documents and metadata at the same positions as the
        # rows that were just appended to the index.
        start_idx = len(self.documents)
        self.documents.extend(documents)
        self.metadata.extend(
            {**meta, 'document_idx': start_idx + offset}
            for offset, meta in enumerate(metadata)
        )

        print(f"Added {len(documents)} documents to index. Total documents: {len(self.documents)}")

    def search(self, query: str, k: int = 5) -> List[Dict[str, Any]]:
        """
        Search for similar documents based on a query string.

        Args:
            query: Query string to search for
            k: Number of results to return; values below 1 yield no results

        Returns:
            List of dictionaries (``rank``, ``document_idx``, ``distance``,
            ``text``, ``metadata``) ordered as returned by the index. Empty
            if the database holds no documents.
        """
        if self.index is None or not self.documents or k <= 0:
            return []

        # Generate embedding for the query
        query_embedding = self.generate_embeddings([query])

        # Perform the search
        k = min(k, len(self.documents))  # Ensure k is not larger than number of documents
        distances, indices = self.index.search(query_embedding, k)

        # Format results
        results = []
        for raw_idx, distance in zip(indices[0], distances[0]):
            idx = int(raw_idx)
            if idx < 0:
                # FAISS uses -1 for "no neighbour found"; as a Python list
                # index that would silently return the last document.
                continue
            results.append({
                'rank': len(results) + 1,
                'document_idx': idx,
                'distance': float(distance),
                'text': self.documents[idx],
                'metadata': self.metadata[idx]
            })

        return results

    def save(self, directory: str) -> None:
        """
        Save the vector database to disk.

        Writes ``index.faiss`` (only if an index exists), ``documents.pkl``,
        ``metadata.pkl`` and ``model_name.txt`` into ``directory``.

        Args:
            directory: Directory to save the database
        """
        # exist_ok avoids the race between checking for and creating the dir.
        os.makedirs(directory, exist_ok=True)

        # Save the index
        if self.index is not None:
            faiss.write_index(self.index, os.path.join(directory, "index.faiss"))

        # Save documents and metadata
        with open(os.path.join(directory, "documents.pkl"), 'wb') as f:
            pickle.dump(self.documents, f)

        with open(os.path.join(directory, "metadata.pkl"), 'wb') as f:
            pickle.dump(self.metadata, f)

        # Save model name
        with open(os.path.join(directory, "model_name.txt"), 'w', encoding="utf-8") as f:
            f.write(self.model_name)

        print(f"Vector database saved to {directory}")

    @classmethod
    def load(cls, directory: str) -> 'VectorDatabase':
        """
        Load a vector database from disk.

        Args:
            directory: Directory containing the saved database

        Returns:
            Loaded VectorDatabase instance
        """
        # Load model name
        with open(os.path.join(directory, "model_name.txt"), 'r', encoding="utf-8") as f:
            model_name = f.read().strip()

        # Create instance with the same model
        db = cls(model_name)

        # Load documents and metadata
        # NOTE(security): pickle.load can execute arbitrary code embedded in
        # the file. Only load directories written by save() from a trusted
        # source.
        with open(os.path.join(directory, "documents.pkl"), 'rb') as f:
            db.documents = pickle.load(f)

        with open(os.path.join(directory, "metadata.pkl"), 'rb') as f:
            db.metadata = pickle.load(f)

        # Load the index (absent if the database was saved while empty)
        index_path = os.path.join(directory, "index.faiss")
        if os.path.exists(index_path):
            db.index = faiss.read_index(index_path)

        print(f"Loaded vector database from {directory} with {len(db.documents)} documents")
        return db


def batch_process_documents(documents: List[str], batch_size: int = 8,
                            db: Optional["VectorDatabase"] = None) -> np.ndarray:
    """
    Embed a large number of documents in batches to avoid memory issues.

    Args:
        documents: List of document texts
        batch_size: Number of documents to process at once (must be >= 1)
        db: Existing VectorDatabase whose model should be used. If omitted, a
            new VectorDatabase with the default model is created, which loads
            the model again.

    Returns:
        A single 2-D NumPy array with one embedding row per document, in the
        same order as ``documents``. For an empty ``documents`` list an empty
        array of shape (0, 0) is returned without loading any model.

    Raises:
        ValueError: If ``batch_size`` is not a positive integer.
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")

    if not documents:
        # np.vstack([]) would raise; return an empty result instead.
        return np.empty((0, 0), dtype=np.float32)

    if db is None:
        db = VectorDatabase()

    total_batches = (len(documents) - 1) // batch_size + 1
    all_embeddings = []

    for batch_number, start in enumerate(range(0, len(documents), batch_size), start=1):
        batch = documents[start:start + batch_size]
        print(f"Processing batch {batch_number}/{total_batches}")
        all_embeddings.append(db.generate_embeddings(batch))

    # Stack the per-batch (batch, dim) arrays into one (n_documents, dim) array.
    return np.vstack(all_embeddings)


def main():
    """
    Demonstrate the VectorDatabase end to end.

    Indexes ten short quotes with metadata, runs three example queries,
    saves the database to ``vector_db_example``, reloads it, and runs one
    more query against the reloaded copy.
    """
    # Each entry pairs a quote with its metadata so the two cannot drift apart.
    corpus = [
        ("The quick brown fox jumps over the lazy dog.",
         {"source": "proverb", "category": "animals"}),
        ("A journey of a thousand miles begins with a single step.",
         {"source": "Lao Tzu", "category": "philosophy"}),
        ("To be or not to be, that is the question.",
         {"source": "Shakespeare", "category": "literature"}),
        ("All that glitters is not gold.",
         {"source": "proverb", "category": "wisdom"}),
        ("The early bird catches the worm.",
         {"source": "proverb", "category": "animals"}),
        ("Actions speak louder than words.",
         {"source": "proverb", "category": "behavior"}),
        ("Don't judge a book by its cover.",
         {"source": "proverb", "category": "wisdom"}),
        ("The pen is mightier than the sword.",
         {"source": "Edward Bulwer-Lytton", "category": "literature"}),
        ("Fortune favors the bold.",
         {"source": "proverb", "category": "courage"}),
        ("Knowledge is power.",
         {"source": "Francis Bacon", "category": "wisdom"}),
    ]
    texts = [text for text, _ in corpus]
    infos = [info for _, info in corpus]

    def show(hits, with_meta):
        # Print one ranked line per hit, optionally followed by its metadata.
        print("Search Results:")
        for hit in hits:
            print(f"{hit['rank']}. (Distance: {hit['distance']:.4f}) '{hit['text']}'")
            if with_meta:
                info = hit['metadata']
                print(
                    f"   Source: {info.get('source', 'Unknown')}, Category: {info.get('category', 'Uncategorized')}")

    # Build the database and index the corpus
    db = VectorDatabase()
    db.add_documents(texts, infos)

    # Run the example queries against the fresh database
    for question in (
        "What is the meaning of life?",
        "Tell me about courage and boldness",
        "I need some wisdom about appearances",
    ):
        print(f"\nQuery: {question}")
        show(db.search(question), with_meta=True)

    # Round-trip through disk
    target_dir = "vector_db_example"
    db.save(target_dir)
    restored = VectorDatabase.load(target_dir)

    # Confirm the reloaded database still answers queries
    print("\nTesting loaded database:")
    show(restored.search("Tell me about wisdom"), with_meta=False)

# Run the demo only when this file is executed as a script, not when it is
# imported as a module.
if __name__ == "__main__":
    main()

Output:

Loading model: distilbert-base-uncased
Generating embeddings for 10 documents...
Embeddings generated in 0.44 seconds
Creating new FAISS index with dimension 768
Added 10 documents to index. Total documents: 10

Query: What is the meaning of life?
Search Results:
1. (Distance: 28.9064) 'Knowledge is power.'
   Source: Francis Bacon, Category: wisdom
2. (Distance: 29.8704) 'To be or not to be, that is the question.'
   Source: Shakespeare, Category: literature
3. (Distance: 35.8834) 'Actions speak louder than words.'
   Source: proverb, Category: behavior
4. (Distance: 37.3288) 'All that glitters is not gold.'
   Source: proverb, Category: wisdom
5. (Distance: 39.9361) 'A journey of a thousand miles begins with a single step.'
   Source: Lao Tzu, Category: philosophy

Query: Tell me about courage and boldness
Search Results:
1. (Distance: 29.0239) 'Knowledge is power.'
   Source: Francis Bacon, Category: wisdom
2. (Distance: 29.1098) 'Actions speak louder than words.'
   Source: proverb, Category: behavior
3. (Distance: 32.2485) 'Fortune favors the bold.'
   Source: proverb, Category: courage
4. (Distance: 33.7360) 'The pen is mightier than the sword.'
   Source: Edward Bulwer-Lytton, Category: literature
5. (Distance: 34.9861) 'Don't judge a book by its cover.'
   Source: proverb, Category: wisdom

Query: I need some wisdom about appearances
Search Results:
1. (Distance: 26.9088) 'Don't judge a book by its cover.'
   Source: proverb, Category: wisdom
2. (Distance: 28.1751) 'Actions speak louder than words.'
   Source: proverb, Category: behavior
3. (Distance: 29.9099) 'Knowledge is power.'
   Source: Francis Bacon, Category: wisdom
4. (Distance: 31.9937) 'Fortune favors the bold.'
   Source: proverb, Category: courage
5. (Distance: 34.1386) 'The pen is mightier than the sword.'
   Source: Edward Bulwer-Lytton, Category: literature
Vector database saved to vector_db_example
Loading model: distilbert-base-uncased
Loaded vector database from vector_db_example with 10 documents

Testing loaded database:
Search Results:
1. (Distance: 30.5364) 'Knowledge is power.'
2. (Distance: 32.7921) 'Don't judge a book by its cover.'
3. (Distance: 33.9872) 'Actions speak louder than words.'
4. (Distance: 35.6948) 'Fortune favors the bold.'
5. (Distance: 35.7062) 'All that glitters is not gold.'

Code Explanation

1. Model Initialization

When the script starts, it loads the distilbert-base-uncased model and tokenizer from Hugging Face. This model transforms text into numerical vectors (embeddings) that capture semantic meaning.

2. Embedding Generation

The 10 input documents (proverbs and quotes) are tokenized and passed through DistilBERT. For each document, the model returns a sequence of hidden states — the script takes the mean of these hidden states to create a single 768-dimensional embedding per document.

This embedding represents the “meaning” of the text numerically.

3. Index Creation with FAISS

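A FAISS index (IndexFlatL2) is created to enable nearest-neighbor search using L2 (Euclidean) distance between embeddings. This is a flat, brute-force index: each query is compared against every stored vector, so the results are exact. Documents with similar embeddings are closer together in this vector space.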

The 10 document embeddings are added to this index.

4. Metadata Storage

Each document is paired with metadata (here, a source and a category). This information is kept in memory alongside the documents and is later used to enrich the search results.

5. Semantic Search

You run three queries:

  1. What is the meaning of life?
  2. Tell me about courage and boldness
  3. I need some wisdom about appearances

For each query:

  • The query is embedded into the same 768-dimensional space.
  • FAISS returns the top 5 closest document embeddings based on Euclidean distance.
  • These are shown ranked from most to least relevant.

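The distances shown (roughly 27–40) are squared L2 distances between unnormalized 768-dimensional embeddings, so their absolute size has no fixed meaning. What matters is the ordering: a smaller distance means that document is closer to the query than the other candidates.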

6. Save to Disk

The FAISS index, documents, metadata, and model name are saved to disk in a folder named vector_db_example.

7. Reloading and Persistence

The database is reloaded using the saved files:

  • It reinstantiates the model.
  • Reloads the saved FAISS index and document data.
  • Ensures that search behavior works identically post-reload.

8. Final Search Validation

A test query is run: “Tell me about wisdom”

The results again show “Knowledge is power” and similar wisdom-related quotes, proving that the loaded index works just like the original.

Summary of Output Behavior:

  • Distances reflect semantic closeness (lower = more similar).
  • Matching is based on meaning rather than on keywords (thanks to embeddings).
  • Save/load functionality preserves all search capabilities.
  • Great for scalable semantic search use cases.