Prompt Caching Demo
Chat with Claude — cache metrics update after each response
Implementation:
demos/agno-prompt-caching/main.py (225 lines)
"""
Agno prompt caching demo.

Demonstrates four caching rules for Agno agents backed by claude-sonnet-4-20250514:
  Rule 1 — add_datetime_to_instructions=False, cache_system_prompt=True
  Rule 2 — all tools passed once at Agent() construction
  Rule 3 — dynamic context injected via <system-reminder> in user message
  Rule 4 — compaction clones agent with same description/instructions/tools
"""

import datetime
import os

from agno.agent import Agent
from agno.models.anthropic import Claude


# Single source of truth for the model id: a model switch invalidates the
# cache, so every agent built by this demo must use the same value.
MODEL_ID = "claude-sonnet-4-20250514"

AGENT_DESCRIPTION = "Expert software engineering assistant"


# ---------------------------------------------------------------------------
# Mock tools — defined once, never changed
# ---------------------------------------------------------------------------
# The docstrings below are kept verbatim: they are presumably what Agno sends
# to the model as tool descriptions, which would make them part of the cached
# prefix — TODO confirm.

def search_docs(query: str) -> str:
    """Search internal documentation."""
    return f"[search_docs] Found 3 results for '{query}': doc_a.md, doc_b.md, doc_c.md"


def read_file(path: str) -> str:
    """Read a file from the project."""
    return f"[read_file] Contents of {path}: <mock file content for {path}>"


def write_file(path: str, content: str) -> str:
    """Write content to a file."""
    return f"[write_file] Wrote {len(content)} bytes to {path}"


def run_tests(suite: str = "all") -> str:
    """Run the test suite."""
    return f"[run_tests] Suite '{suite}': 42 passed, 0 failed"


def enter_plan_mode(goal: str) -> str:
    """Enter structured planning mode for a goal."""
    return f"[enter_plan_mode] Planning mode activated for: {goal}"


def exit_plan_mode(summary: str) -> str:
    """Exit planning mode with a summary."""
    return f"[exit_plan_mode] Plan committed: {summary}"


ALL_TOOLS = [search_docs, read_file, write_file, run_tests, enter_plan_mode, exit_plan_mode]

SYSTEM_INSTRUCTIONS = """You are an expert software engineering assistant.
You help developers understand codebases, write code, run tests, and plan work.

You have access to tools for searching documentation, reading and writing files,
running tests, and entering structured planning mode.

Always reason step by step. Prefer reading before writing. Run tests after changes."""


# ---------------------------------------------------------------------------
# Agent factory — Rule 1 and Rule 2
# ---------------------------------------------------------------------------

def _build_agent(description: str = AGENT_DESCRIPTION) -> Agent:
    """Build an agent with the demo's fixed, cache-friendly configuration.

    Both create_agent() and compact_history() go through this helper so the
    model id, instructions and tool list can never drift apart between the
    original agent and its compacted clone.
    """
    return Agent(
        model=Claude(
            id=MODEL_ID,
            # Agno documents cache_system_prompt as an option of the Claude
            # model, not of Agent, so it is set here.
            # NOTE(review): confirm against the installed Agno version.
            cache_system_prompt=True,  # Rule 1
        ),
        description=description,
        instructions=SYSTEM_INSTRUCTIONS,
        tools=ALL_TOOLS,  # Rule 2
        add_datetime_to_instructions=False,  # Rule 1
        markdown=False,
    )


def create_agent() -> Agent:
    """
    Create the demo agent.

    Rule 1: add_datetime_to_instructions=False keeps the system prompt stable
            so Anthropic can cache it. cache_system_prompt=True tells Agno to
            send cache_control breakpoints on the system prompt.
    Rule 2: All tools passed once at construction. Never mutate agent.tools
            mid-session — that breaks the cache prefix.

    NOTE(review): Anthropic only caches prefixes above a minimum token count;
    this short prompt plus six small tools may fall below it, in which case
    every turn reports 0 cached tokens — confirm against the provider docs.
    """
    return _build_agent()


# ---------------------------------------------------------------------------
# Rule 3 — inject dynamic context into the user message
# ---------------------------------------------------------------------------

def build_message(user_input: str) -> str:
    """
    Wrap dynamic runtime context (timestamp, cwd, env) in a <system-reminder>
    block prepended to the user message. This keeps the cacheable system prompt
    clean while still giving the model fresh context each turn.

    Returns the reminder block followed by a blank line and `user_input`.
    """
    # Timezone-aware replacement for the deprecated datetime.utcnow(); the
    # rendered text keeps the same "YYYY-MM-DDTHH:MM:SSZ" shape.
    now = datetime.datetime.now(datetime.timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
    reminder = (
        "<system-reminder>\n"
        f"current_time={now}\n"
        "cwd=/home/user/project\n"
        "git_branch=main\n"
        "</system-reminder>\n\n"
    )
    return reminder + user_input


# ---------------------------------------------------------------------------
# Rule 4 — compaction preserves the cache prefix
# ---------------------------------------------------------------------------

def compact_history(agent: Agent) -> Agent:
    """
    Rather than appending a growing history, clone the agent with the same
    description, instructions, and tools so the system-prompt cache prefix is
    preserved. The conversation history is dropped (compacted).

    Only `agent.description` is read from the argument; instructions and tools
    come from the module constants used by create_agent().
    """
    return _build_agent(description=agent.description)


# ---------------------------------------------------------------------------
# Metrics printer
# ---------------------------------------------------------------------------

# Accepted names per counter, checked in order. The first entry of each tuple
# is the name this demo originally read; the others are aliases.
# NOTE(review): Agno's docs show `cached_tokens` / `cache_write_tokens` —
# confirm which names the installed version reports.
_CACHE_READ_KEYS = ("cache_read_input_tokens", "cached_tokens", "cache_read_tokens")
_CACHE_WRITE_KEYS = ("cache_creation_input_tokens", "cache_write_tokens")
_INPUT_KEYS = ("input_tokens",)

# Rough prices in USD per million tokens.
_USD_PER_MTOK_CACHE_WRITE = 3.75
_USD_PER_MTOK_CACHE_READ = 0.30
_USD_PER_MTOK_UNCACHED = 3.0


def _metric_total(metrics, *names: str):
    """Return the summed value of the first of `names` present in `metrics`.

    `metrics` may be a dict or an object with attributes. A list/tuple value
    (one entry per LLM call) is summed; missing or None values count as 0.
    """
    for name in names:
        if isinstance(metrics, dict):
            value = metrics.get(name)
        else:
            value = getattr(metrics, name, None)
        if value is None:
            continue
        if isinstance(value, (list, tuple)):
            return sum(item or 0 for item in value)
        return value
    return 0


def print_metrics(turn: int, response) -> None:
    """Print cache write/read/uncached token counts, hit rate and rough cost.

    Reads `response.metrics` (dict or attribute object; absent means all
    zeros) and warns when the hit rate is below 80% on any turn after the
    first.
    """
    metrics = getattr(response, "metrics", None) or {}

    cache_read = _metric_total(metrics, *_CACHE_READ_KEYS)
    cache_write = _metric_total(metrics, *_CACHE_WRITE_KEYS)
    input_tokens = _metric_total(metrics, *_INPUT_KEYS)

    total = cache_read + cache_write + input_tokens
    hit_rate = cache_read / total if total > 0 else 0.0

    cost = (
        cache_write * _USD_PER_MTOK_CACHE_WRITE
        + cache_read * _USD_PER_MTOK_CACHE_READ
        + input_tokens * _USD_PER_MTOK_UNCACHED
    ) / 1_000_000

    print(f"\n [Turn {turn} metrics]")
    print(f" cache_write : {cache_write:>6} tokens")
    print(f" cache_read : {cache_read:>6} tokens")
    print(f" uncached : {input_tokens:>6} tokens")
    print(f" hit_rate : {hit_rate:.1%}")
    print(f" est. cost : ${cost:.4f}")

    if turn > 1 and hit_rate < 0.8:
        print(f" !! WARNING: hit_rate {hit_rate:.1%} below 80% on turn {turn}. "
              "Check for cache-busting (timestamps, tool mutations, model switches).")


# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------

PROMPTS = [
    "Search the docs for information about our authentication system.",
    "Read the file src/auth/middleware.py and explain what it does.",
    "The middleware looks fine. Run the test suite to confirm everything passes.",
    "Enter plan mode: refactor the auth middleware to support OAuth2.",
]


def _response_text(response) -> str:
    """Flatten `response.content` (string or list of blocks) into one string."""
    content = getattr(response, "content", "") or ""
    if isinstance(content, list):
        content = " ".join(getattr(block, "text", str(block)) for block in content)
    return str(content)


def _preview(text: str, limit: int) -> str:
    """Return `text` cut to `limit` chars, adding "..." only when truncated."""
    return text if len(text) <= limit else text[:limit] + "..."


def main() -> None:
    """Run the scripted four-turn conversation, then the compaction demo."""
    print("=" * 60)
    print("Agno Prompt Caching Demo")
    print(f"Model: {MODEL_ID}")
    print("=" * 60)

    agent = create_agent()

    for turn, prompt in enumerate(PROMPTS, start=1):
        print(f"\n--- Turn {turn} ---")
        message = build_message(prompt)  # Rule 3
        print(f" User: {_preview(prompt, 80)}")

        response = agent.run(message)

        print(f" Agent: {_preview(_response_text(response), 120)}")
        print_metrics(turn, response)

    # Rule 4: compaction demo
    print("\n" + "=" * 60)
    print("Compaction demo (Rule 4)")
    print("=" * 60)
    compacted = compact_history(agent)
    followup = build_message("What is the status of the OAuth2 refactor plan?")
    post_turn = len(PROMPTS) + 1
    print(f"\n--- Turn {post_turn} (post-compaction) ---")
    response = compacted.run(followup)
    print(f" Agent: {_preview(_response_text(response), 120)}")
    print_metrics(post_turn, response)
    print("\nNote: cache_write may spike again after compaction (new session).")
    print(" But system prompt + tools prefix is preserved, so subsequent")
    print(" turns in the compacted agent will hit cache again.")


if __name__ == "__main__":
    # Fail fast with a readable message instead of an SDK auth error mid-run.
    if not os.environ.get("ANTHROPIC_API_KEY"):
        raise SystemExit("Set ANTHROPIC_API_KEY before running this demo.")
    main()
Ask anything. Turn 1 shows 0% cache hits.
Ask a follow-up and watch the hit rate climb.
Press Enter to send