llama-index

name: llama-index description: "[Applies to: **/*.py] Definitive guidelines for building production-ready LlamaIndex applications, emphasizing modularity, type safety, cloud services, and robust testing." source: "cursor_mdc"

LlamaIndex Best Practices

LlamaIndex is the definitive framework for building context-augmented LLM applications. Follow these rules to ensure your RAG pipelines and agents are modular, type-safe, performant, and production-ready in 2025.

1. Code Organization and Structure

Organize your LlamaIndex components into distinct, testable functions or classes. Leverage the Workflow API for explicit data flow.

✅ GOOD: Modular Functions & Workflow API

# core_components.py
from llama_index.core import Document, VectorStoreIndex, Settings
from llama_index.core.readers import SimpleDirectoryReader
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core.workflow import Workflow, LlamaAgentWorker, AgentInput, AgentOutput
from typing import List

def configure_global_settings() -> None:
    """Set the process-wide default LLM and embedding model on ``Settings``.

    Call this once, before any index or query engine is built, so every
    component picks up the same models.
    """
    llm = OpenAI(model="gpt-4o-mini", temperature=0.1)
    Settings.llm = llm

    embed_model = OpenAIEmbedding(model="text-embedding-3-small")
    Settings.embed_model = embed_model

def load_documents(data_path: str) -> List[Document]:
    """Read every file under ``data_path`` into ``Document`` objects.

    Args:
        data_path: Directory handed to ``SimpleDirectoryReader``.

    Returns:
        The documents produced by the reader's ``load_data()`` call.
    """
    reader = SimpleDirectoryReader(data_path)
    return reader.load_data()

def build_vector_index(documents: List[Document]) -> VectorStoreIndex:
    """Create a ``VectorStoreIndex`` over the given documents.

    Args:
        documents: Documents to index.

    Returns:
        The index returned by ``VectorStoreIndex.from_documents``.
    """
    vector_index = VectorStoreIndex.from_documents(documents)
    return vector_index

def create_query_engine(index: VectorStoreIndex):
    """Build a query engine from ``index`` using the "compact" response mode.

    Args:
        index: The vector index to query.

    Returns:
        The engine returned by ``index.as_query_engine`` with verbose output
        enabled.
    """
    engine_options = {"response_mode": "compact", "verbose": True}
    return index.as_query_engine(**engine_options)

# main_app.py
from core_components import (
    configure_global_settings, load_documents, build_vector_index, create_query_engine
)

def main_rag_workflow():
    """Run the document-querying RAG pipeline as a one-step LlamaIndex Workflow.

    Configures the global models, loads and indexes ``./data``, wraps the
    resulting query engine in a ``Workflow`` subclass with a single ``@step``,
    runs it with a fixed question, and prints the answer.
    """
    # Imported locally so this function is self-contained: a Workflow is
    # defined by subclassing and decorating async steps, not by passing a
    # worker object to the Workflow constructor.
    import asyncio

    from llama_index.core.workflow import StartEvent, StopEvent, Workflow, step

    configure_global_settings()  # Always configure settings first

    docs = load_documents("./data")
    index = build_vector_index(docs)
    query_engine = create_query_engine(index)

    class SimpleRAGWorkflow(Workflow):
        """A basic RAG pipeline for document querying."""

        @step
        async def answer(self, ev: StartEvent) -> StopEvent:
            # Keyword arguments given to `run()` arrive as attributes of the
            # StartEvent; the StopEvent result becomes the value of `run()`.
            response = await query_engine.aquery(ev.query)
            return StopEvent(result=response)

    async def _run_workflow():
        # Generous timeout: a step that calls an LLM can take a while.
        rag_workflow = SimpleRAGWorkflow(timeout=120)
        return await rag_workflow.run(
            query="What are the main themes across these documents?"
        )

    result = asyncio.run(_run_workflow())
    print(result.response)


if __name__ == "__main__":
    main_rag_workflow()

❌ BAD: Monolithic Script

# app.py (avoid this structure for anything beyond quickstarts)
# Everything below is a flat sequence of top-level statements: configuration,
# loading, indexing, and querying all run as soon as the file is executed or
# imported, and none of the steps can be reused or tested on its own.
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, Settings
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding

# Global models are configured as a side effect of loading this module.
Settings.llm = OpenAI(model="gpt-4o-mini")
Settings.embed_model = OpenAIEmbedding(model="text-embedding-3-small")

# The data path and the question are hard-coded, so changing either means
# editing this file.
docs = SimpleDirectoryReader("data").load_data()
index = VectorStoreIndex.from_documents(docs)
qe = index.as_query_engine()
ans = qe.query("What are the main themes?")
print(ans)

2. Common Patterns and Anti-patterns

✅ GOOD: Leverage LlamaCloud Services & LlamaHub Connectors

For production-grade RAG, prefer managed services like LlamaParse and LlamaCloud, and utilize LlamaHub for data connectors.

import os
from typing import List

from llama_index.core import Document
from llama_index.indices.managed.llama_cloud import LlamaCloudIndex
from llama_parse import LlamaParse

# LlamaParse and LlamaCloud both authenticate with LLAMA_CLOUD_API_KEY.
# Check for it up front so a missing key fails here with a clear message.
# (Re-assigning os.environ[key] = os.getenv(key) does nothing when the key is
# set and raises TypeError when it is not.)
if not os.getenv("LLAMA_CLOUD_API_KEY"):
    raise RuntimeError(
        "LLAMA_CLOUD_API_KEY is not set; export it before running this script."
    )

# Use LlamaParse for robust PDF/Office document parsing (preserves structure)
parser = LlamaParse(result_type="markdown", parsing_instruction="Extract all tables and figures.")
documents: List[Document] = parser.load_data("path/to/complex_report.pdf")

# Store the index in LlamaCloud so it is managed outside this process
cloud_index = LlamaCloudIndex.from_documents(
    documents,
    name="my-production-rag-index",
    project_name="my-enterprise-rag",
)
query_engine = cloud_index.as_query_engine()
response = query_engine.query("Summarize the key findings in the report.")
print(response)

❌ BAD: Custom Scraping & Basic Readers for Complex Data

# Avoid custom web scraping when LlamaHub offers a robust connector.
# Avoid SimpleDirectoryReader for complex PDFs with tables/figures; it will lose structure.
from llama_index.core.readers import SimpleDirectoryReader
# docs = SimpleDirectoryReader("./complex_pdfs").load_data() # Leads to poor retrieval

3. Performance Considerations

Optimize ingestion, indexing, and retrieval for speed and cost.

✅ GOOD: Asynchronous Operations & Efficient Chunking

import asyncio
from llama_index.core import VectorStoreIndex, Document, Settings
from llama_index.core.node_parser import SentenceSplitter
from typing import List

async def build_index_async(documents: List[Document]) -> VectorStoreIndex:
    """Build a ``VectorStoreIndex`` without blocking the running event loop.

    The synchronous ``VectorStoreIndex.from_documents`` call is handed to
    ``asyncio.to_thread`` and awaited.

    Args:
        documents: Documents to index. For very large datasets, consider
            streaming or batching document processing instead.

    Returns:
        The index produced by ``VectorStoreIndex.from_documents``.
    """
    return await asyncio.to_thread(VectorStoreIndex.from_documents, documents)

async def main_performance_example():
    """Index 100 synthetic documents and run one query, all without blocking.

    Sets a global ``SentenceSplitter`` (512-character-budget chunks with an
    overlap of 50), builds the index via ``build_index_async``, then awaits
    the query engine's coroutine API and prints the response.
    """
    Settings.text_splitter = SentenceSplitter(chunk_size=512, chunk_overlap=50)
    # Load documents (can be async if using async readers)
    docs = [Document(text=f"Sample document {i} content. " * 100) for i in range(100)]

    index = await build_index_async(docs)
    query_engine = index.as_query_engine()
    # Query engines expose a native coroutine (`aquery`); awaiting it directly
    # avoids occupying a worker thread just to wait on network I/O.
    response = await query_engine.aquery("What is in the documents?")
    print("Query completed:", response)


if __name__ == "__main__":
    asyncio.run(main_performance_example())

❌ BAD: Synchronous, Unoptimized Processing

# Synchronous processing of many documents is slow and blocks the event loop.
# Default chunking without consideration for content structure also hurts performance and quality.
from llama_index.core import SimpleDirectoryReader, VectorStoreIndex
# docs = SimpleDirectoryReader("./large_data_folder").load_data() # Can block
# index = VectorStoreIndex.from_documents(docs) # Can be memory intensive and slow

4. Common Pitfalls and Gotchas

4.1. Environment Variables for Secrets

Always use environment variables for API keys and sensitive credentials.

✅ GOOD: Load from Environment

import os
from typing import Iterable

# Set these in your shell or a .env file, e.g., OPENAI_API_KEY="sk-...".
# LlamaParse and LlamaCloud read their key from LLAMA_CLOUD_API_KEY.
REQUIRED_ENV_VARS = ("OPENAI_API_KEY", "LLAMA_CLOUD_API_KEY")


def require_env_vars(names: Iterable[str] = REQUIRED_ENV_VARS) -> None:
    """Fail fast with a clear error when required secrets are missing.

    Writing ``os.environ[name] = os.getenv(name)`` does nothing when the
    variable is set and raises ``TypeError`` when it is not, so validate
    explicitly instead.

    Args:
        names: Environment variable names that must be set and non-empty.

    Raises:
        RuntimeError: If any variable is unset or empty; the message lists
            every missing name.
    """
    missing = [name for name in names if not os.getenv(name)]
    if missing:
        raise RuntimeError(
            "Missing required environment variables: " + ", ".join(missing)
        )


if __name__ == "__main__":
    # Run the check at startup, before any client is constructed.
    require_env_vars()

❌ BAD: Hardcoding Secrets

# NEVER hardcode API keys directly in code
OPENAI_API_KEY = "sk-YOUR_HARDCODED_KEY_HERE"

4.2. Chunking Strategy

Effective chunking is critical for retrieval quality.

✅ GOOD: Sentence-Aware Chunking with Overlap

from llama_index.core.node_parser import SentenceSplitter
from llama_index.core import Settings

# One splitter configured on Settings keeps chunking consistent everywhere.
_splitter_options = {
    "chunk_size": 512,
    "chunk_overlap": 50,
    "separator": " ",  # Prefer space separator for better word boundaries
}
Settings.text_splitter = SentenceSplitter(**_splitter_options)
# For complex documents, parse with LlamaParse first so document structure is
# preserved before the text is split.

❌ BAD: Default or Naive Chunking

# Relying solely on default chunking can lead to poor context and retrieval.
# Default is often fine for simple text, but not for structured documents or long texts.
# Settings.text_splitter = None # Don't explicitly disable if you need control

5. Type Hints

Enforce type safety for maintainable and robust LlamaIndex applications.

✅ GOOD: Comprehensive Type Annotations

from llama_index.core import Document, VectorStoreIndex
from llama_index.core.query_engine import BaseQueryEngine
from typing import List

def process_and_query_documents(documents: List[Document], query_str: str) -> str:
    """Index ``documents``, run ``query_str`` against them, and return the answer.

    Args:
        documents: Documents to build the vector index from.
        query_str: The question to ask the query engine.

    Returns:
        The query response converted with ``str()``.
    """
    vector_index: VectorStoreIndex = VectorStoreIndex.from_documents(documents)
    engine: BaseQueryEngine = vector_index.as_query_engine()
    answer = engine.query(query_str)
    return str(answer)

❌ BAD: Untyped Functions

def process_and_query_untyped(documents, query_str): # Lacks clarity on expected types
    # Builds an index from `documents`, queries it with `query_str`, and
    # returns the response converted with str(). None of that is visible in
    # the signature, so callers and type checkers have to read the body.
    index = VectorStoreIndex.from_documents(documents)
    query_engine = index.as_query_engine()
    response = query_engine.query(query_str)
    return str(response)

6. Virtual Environments

Isolate project dependencies to prevent conflicts and ensure reproducible builds.

✅ GOOD: Use `venv` or `poetry`

# Using venv (standard Python approach)
python -m venv .venv
source .venv/bin/activate
pip install -r requirements.txt

# Using Poetry (recommended for robust dependency management)
poetry init --name my-llamaindex-app --python ">=3.9,<3.12"
poetry add llama-index llama-index-llms-openai llama-index-embeddings-openai
# `poetry shell` is not bundled with Poetry 2.x; `poetry run` executes a
# command inside the project's virtual environment on every Poetry version.
poetry run python main_app.py

❌ BAD: Global `pip install`

# Avoid installing directly into your system's Python environment.
# This leads to dependency conflicts and "works on my machine" problems.
pip install llama-index # Can break other projects

7. Packaging

Define project dependencies clearly for reproducibility and deployment.

✅ GOOD: `pyproject.toml` (Poetry/Rye) or `requirements.txt`

# pyproject.toml (Poetry example - preferred for modern Python projects)
[tool.poetry]
name = "my-llamaindex-app"
version = "0.1.0"
description = "A production-ready LlamaIndex RAG application."
authors = ["Your Name <you@example.com>"]
readme = "README.md"

[tool.poetry.dependencies]
python = ">=3.9,<3.12"
llama-index = "^0.10.0" # Use appropriate, pinned version ranges
llama-index-llms-openai = "^0.1.0"
llama-index-embeddings-openai = "^0.1.0"
# Add other necessary packages, e.g. llama-parse and llama-index-indices-managed-llama-cloud

❌ BAD: Undocumented Dependencies

# No explicit dependency list makes it hard for others to set up the project.
# (Implicitly relying on a global environment or manual installs is a recipe for errors.)

8. Testing Approaches

Implement unit and integration tests for reliability. Mock LLM calls for faster, cheaper unit tests.

✅ GOOD: Unit Tests with Mocking

# tests/test_rag_components.py
import pytest
from unittest.mock import MagicMock
from llama_index.core import Document, Settings
from core_components import load_documents, build_vector_index, create_query_engine

@pytest.fixture
def mock_documents():
    """Provide two short in-memory documents for tests that need an index."""
    texts = (
        "LlamaIndex is a data framework for LLM applications.",
        "It connects LLMs to your private data sources.",
    )
    return [Document(text=body) for body in texts]

def test_load_documents_from_dir(tmp_path):
    """``load_documents`` should return one document per file in the directory."""
    # Write two small text files for SimpleDirectoryReader to pick up.
    fixture_files = {
        "doc1.txt": "Content of document one.",
        "doc2.txt": "Content of document two.",
    }
    for filename, body in fixture_files.items():
        (tmp_path / filename).write_text(body)

    loaded = load_documents(str(tmp_path))

    assert len(loaded) == 2
    assert "Content of document one." in loaded[0].text

def test_build_vector_index(mock_documents):
    """``build_vector_index`` should return an index without calling real models.

    NOTE: this assigns to the global ``Settings`` object; reset it in a
    fixture if other tests in the session rely on different models.
    """
    from llama_index.core import MockEmbedding
    from llama_index.core.llms import MockLLM

    # A bare MagicMock is not an LLM/embedding object and returns MagicMocks
    # rather than float vectors, so index construction cannot rely on it.
    # The library's own mock models are fast, offline, and type-correct.
    Settings.llm = MockLLM()
    Settings.embed_model = MockEmbedding(embed_dim=8)

    index = build_vector_index(mock_documents)

    assert index is not None
    # Further assertions can check index properties if needed

def test_query_engine_response_mocked():
    """A stubbed query engine returns its canned answer and records the call."""
    question = "What is LlamaIndex?"
    canned_answer = "Mocked response: LlamaIndex connects LLMs to your data."

    engine = MagicMock()
    engine.query.return_value = canned_answer

    answer = engine.query(question)

    assert "Mocked response" in str(answer)
    engine.query.assert_called_once_with(question)

❌ BAD: No Tests or Relying on End-to-End Only

# No test files, or only manual testing.
# Relying solely on slow, expensive end-to-end tests makes development cycles long and debugging difficult.
# Unit tests are crucial for isolating issues in specific components.