
Tutorial: Building a Vector Store Workflow

Let's walk through creating a full working embedding + vector store pipeline.

Step 1: Initialize the Vector Store with your chosen provider and embedding function

from railtracks.vector_stores.chroma import ChromaVectorStore
from railtracks.rag.embedding_service import EmbeddingService

embedding_function = EmbeddingService().embed

store = ChromaVectorStore(
    collection_name="my_collection",
    embedding_function=embedding_function,
    path="./chroma-data"
)

Choosing where your vector store is located

Take a look at our different integrations to choose where your vectors are stored.
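
For the Chroma integration used in this tutorial, passing a path persists the collection on disk, while omitting it (as in the examples below) keeps the collection temporary. Whether a path-less store is purely in-memory depends on the integration, so treat this as a sketch to verify against your chosen backend.

# Persist vectors on disk at ./chroma-data
persistent_store = ChromaVectorStore(
    collection_name="my_collection",
    embedding_function=embedding_function,
    path="./chroma-data",
)

# Omitting path keeps the collection temporary (assumed ephemeral/in-memory)
temporary_store = ChromaVectorStore(
    collection_name="scratch_collection",
    embedding_function=embedding_function,
)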

Step 2: Insert Your Documents

You can upsert:

  • A string or list of strings

  • A Chunk object or list of Chunk objects (if you want a custom ID or need to attach document data or metadata)

Plain Text Inserts

upserted_item = store.upsert("Oranges are orange")
upserted_list = store.upsert(["Bananas are yellow.", "Apples can be red or green."])

Insert With Metadata or Custom ID

from railtracks.vector_stores.vector_store_base import Chunk

meta_data_chunk = Chunk(
    content="The Eiffel Tower is located in Paris.",
    document="france_guide.txt",
    metadata={"category": "travel"}
)

custom_id_chunk = Chunk(
    id="important_id_i_need_access_to",
    content="My favourite ai library is Railtracks",
)

custom_id_meta_chunk = Chunk(
    id="other_important_id_i_need_access_to",
    content="big ben is in London",
    document="england_guide.txt",
    metadata={"category": "travel"}
)

travel_chunks = [meta_data_chunk, custom_id_meta_chunk]

upserted_chunk = store.upsert(custom_id_chunk)
upserted_chunk_list = store.upsert(travel_chunks)

Step 3: Search Your Data

# Searching with a single query returns a ranked list of results
results = store.search("Where is the Eiffel Tower?", top_k=5)
print("Question: Where is the Eiffel Tower?")
print("Answer: " + results[0].content)

# Searching with a list of queries returns one result list per query
search_queries = ["Where is the Eiffel Tower?", "what is the best ai library?"]
results2 = store.search(search_queries, top_k=1)
eiffel_tower_location = results2[0][0]
best_ai_library = results2[1][0]
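
Since each entry in results2 is the ranked result list for the matching query, you can walk the queries and their hits together. This short sketch only relies on the content attribute already used above.

# Walk each query alongside its ranked hits
for query, hits in zip(search_queries, results2):
    print("Question: " + query)
    for hit in hits:
        print("Answer: " + hit.content)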

Step 4: Fetch By ID

# Fetch a single chunk by ID
important_result = store.fetch("important_id_i_need_access_to")[0]
# Print the fetched chunk's content
print(important_result.content)

# Fetch multiple chunks using the IDs returned by upsert in Step 2
more_results = store.fetch(upserted_chunk_list)

print(more_results[0].content)
print(more_results[1].content)

Step 5: Delete

# Delete a single chunk by ID
store.delete("important_id_i_need_access_to")

# Delete multiple chunks by passing a list of IDs
store.delete(upserted_chunk_list)
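
To confirm the deletes took effect, count() (used in the examples below) reports how many chunks remain in the store.

# The removed chunks should no longer be counted
print(store.count())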

Examples

Simple Vector Store Insertion

from railtracks.vector_stores.chroma import ChromaVectorStore
from railtracks.rag.embedding_service import EmbeddingService

# Initialize embedding function
embedding_function = EmbeddingService().embed

# Step 1: Initialize vector store
store = ChromaVectorStore(
    collection_name="temporary_knowledge_base",
    embedding_function=embedding_function,
)

# Step 2: Insert text
text = [
    "Python is a high-level programming language.",
    "Machine learning is a subset of artificial intelligence.",
    "Neural networks are inspired by biological neural networks.",
    "Deep learning uses multiple layers of neural networks."
]

text_ids = store.upsert(text)

# Check that the store now holds one chunk per inserted text
print(len(text) == store.count())

# Step 3: Search for relevant information
query = "What is machine learning?"
results = store.search(query, top_k=3)


print("Question: What is machine learning?")
print("Answer: " + results[0].content)

Vector Store Searching and Manipulation With Metadata

from railtracks.vector_stores.vector_store_base import Chunk

store = ChromaVectorStore(
    collection_name="article_archive",
    embedding_function=embedding_function,
    path="./chroma-data"
)

articles_data = [
    {"title": "AI Advances in 2024", "content": "Artificial intelligence saw major breakthroughs this year...", "author": "Jane Doe", "date": "2024-01-15"},
    {"title": "Climate Change Report", "content": "New studies show accelerating impacts of climate change...", "author": "John Smith", "date": "2024-02-20"},
    {"title": "Space Exploration Updates", "content": "Mars mission successfully lands new rover...", "author": "Alice Johnson", "date": "2024-03-10"},
    {"title": "Healthcare Innovations", "content": "Revolutionary new treatment for rare diseases approved...", "author": "Bob Williams", "date": "2024-04-05"},
    {"title": "Quantum Computing Milestone", "content": "Scientists achieve quantum advantage in practical application...", "author": "Jane Doe", "date": "2024-04-05"}
]


chunks = []
for article in articles_data:
    chunk = Chunk(
        content=f"{article['title']}: {article['content']}",
        document=f"article_{article['date']}.txt",
        metadata={
            "title": article['title'],
            "author": article['author'],
            "date": article['date']
        }
    )
    chunks.append(chunk)

article_ids = store.upsert(chunks)

search_queries = [
    "artificial intelligence",
    "space and planets",
    "medical breakthroughs",
]

# Batched search restricted to articles by Jane Doe via a metadata filter
results2 = store.search(search_queries, top_k=3, where={"author": "Jane Doe"})

# Delete only the upserted articles whose metadata date matches 2024-04-05
store.delete(article_ids, where={"date": "2024-04-05"})
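
After the filtered delete, count() (as in the first example) shows how many chunks remain; exactly which chunks a combined ID list plus where filter removes is worth verifying against your chosen integration.

# Check how many article chunks remain after the filtered delete
print(f"{store.count()} article chunks remain")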