Merged
30 changes: 30 additions & 0 deletions agent-schema.json
@@ -398,6 +398,10 @@
"$ref": "#/definitions/HooksConfig",
"description": "Lifecycle hooks for executing shell commands at various points in the agent's execution"
},
"cache": {
"$ref": "#/definitions/CacheConfig",
"description": "Optional response cache: when the same user question is asked again, replay the previous answer instead of calling the model."
},
"skills": {
"description": "Enable skills discovery for this agent. Set to true to load all discovered skills from local filesystem sources; false disables skills. A list can mix sources (\"local\" or an HTTP/HTTPS URL) and/or skill names to include. If only names are given, local sources are loaded and filtered to just those skills.",
"oneOf": [
@@ -480,6 +484,32 @@
},
"additionalProperties": false
},
"CacheConfig": {
"type": "object",
"description": "Configuration for the agent's response cache. When enabled, the assistant response produced for a given user question is stored and replayed verbatim the next time the same question is asked, skipping the model entirely. Two normalization options control what 'same question' means: when case_sensitive is false (the default), matching ignores case; when trim_spaces is true (default false), leading and trailing whitespace is stripped before comparison. Set 'path' to persist entries to a JSON file (relative paths resolve against the agent config directory); leave it empty to keep entries in memory only.",
"properties": {
"enabled": {
"type": "boolean",
"description": "Set to true to enable the cache. When false (or when the cache section is omitted), no caching is performed.",
"default": false
},
"case_sensitive": {
"type": "boolean",
"description": "When true, questions must match exactly (including case) to hit the cache. Default: false (case-insensitive matching).",
"default": false
},
"trim_spaces": {
"type": "boolean",
"description": "When true, leading and trailing whitespace is stripped from questions before they are compared. Default: false.",
"default": false
},
"path": {
"type": "string",
"description": "Path to a JSON file used to persist cache entries across runs. Relative paths are resolved against the agent's config directory. When empty, the cache lives only in memory."
}
},
"additionalProperties": false
},
"HooksConfig": {
"type": "object",
"description": "Lifecycle hooks configuration for an agent. Hooks allow running shell commands at various points in the agent's execution lifecycle.",
47 changes: 47 additions & 0 deletions docs/configuration/agents/index.md
@@ -48,6 +48,11 @@ agents:
structured_output: # Optional: constrain output format
name: string
schema: object
cache: # Optional: response cache (skip the model on repeat questions)
enabled: boolean
case_sensitive: boolean
trim_spaces: boolean
path: string
```

<div class="callout callout-tip" markdown="1">
@@ -83,6 +88,7 @@
| `handoffs` | array | ✗ | List of agent names this agent can hand off the conversation to. Enables the `handoff` tool. See [Handoffs Routing]({{ '/concepts/multi-agent/#handoffs-routing' | relative_url }}). |
| `hooks` | object | ✗ | Lifecycle hooks for running commands at various points. See [Hooks]({{ '/configuration/hooks/' | relative_url }}). |
| `structured_output` | object | ✗ | Constrain agent output to match a JSON schema. See [Structured Output]({{ '/configuration/structured-output/' | relative_url }}). |
| `cache` | object | ✗ | Response cache. When the same user question is asked again, the previous answer is replayed verbatim and the model is not called. See [Response Cache](#response-cache) below. |

<div class="callout callout-warning" markdown="1">
<div class="callout-title">⚠️ max_iterations
@@ -91,6 +97,47 @@

</div>

## Response Cache

The response cache short-circuits the model when the same user question is asked again. The first time a question is asked, the agent calls the model normally and stores the assistant's reply. Subsequent identical questions skip the model entirely and replay the stored reply verbatim.

```yaml
agents:
root:
model: openai/gpt-5-mini
description: Cached assistant
instruction: You are a helpful assistant.
cache:
enabled: true # required to turn the cache on
case_sensitive: false # default: false ("Hello" == "hello")
trim_spaces: true # default: false (" hello " == "hello")
path: ./cache.json # optional: persist to disk; omit for in-memory
```

| Property | Type | Default | Description |
| ---------------- | ------- | ------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `enabled` | boolean | `false` | Master switch. When `false` (or when the `cache` section is omitted), no caching is performed. |
| `case_sensitive` | boolean | `false` | When `true`, questions must match exactly (including case) to hit the cache. |
| `trim_spaces` | boolean | `false` | When `true`, leading and trailing whitespace is stripped from the question before it is compared. |
| `path` | string | _empty_ | When set, cache entries are persisted to a JSON file at the given path and reloaded on startup so the cache survives restarts. Relative paths resolve against the agent config directory. When empty, the cache lives in memory only. |

**How it works**

- The cache key is the latest user message in the session, normalized according to `case_sensitive` and `trim_spaces`.
- On a hit, the cached reply is added to the session as the assistant message and stop hooks fire normally — the rest of the agent (tools, sub-agents, the model) is bypassed.
- On a miss, the agent runs normally; the final assistant message produced by the first stop of the run is then stored under the question's key.
- Only the response to the original user question of a run is cached; follow-up turns inside the same `RunStream` are not.
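The normalization step described above can be sketched in Go. This is a minimal illustration of the documented rules, not the actual implementation; the function name `normalizeKey` is made up for the example.

```go
package main

import (
	"fmt"
	"strings"
)

// normalizeKey applies the two documented normalization options:
// trim_spaces strips leading/trailing whitespace, and when
// case_sensitive is false the question is lowercased so "Hello"
// and "hello" produce the same cache key.
func normalizeKey(question string, caseSensitive, trimSpaces bool) string {
	if trimSpaces {
		question = strings.TrimSpace(question)
	}
	if !caseSensitive {
		question = strings.ToLower(question)
	}
	return question
}

func main() {
	fmt.Println(normalizeKey("  Hello ", false, true)) // "hello"
	fmt.Println(normalizeKey("Hello", true, false))    // "Hello"
}
```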

**File-backed storage**

When `path` is set, every `Store` rewrites the entire cache file. Writes are **atomic**: the new content is written to a sibling temp file, `fsync`'d, and renamed over the destination, so a concurrent reader (or a process that crashes mid-write) will always see either the previous content or the new content in full — never a partially written file. The parent directory is also `fsync`'d after the rename so the rename itself is durable.

**Cross-process sharing**

Multiple processes can share the same `path:` cache file safely. Every `Store` takes an exclusive advisory lock on a sibling `<path>.lock` file (POSIX `flock(2)` on Unix, `LockFileEx` on Windows), reloads the current on-disk state under the lock, merges the new entry, and writes back atomically. Two processes that store *different* keys at the same time both see their writes preserved on disk; the lock window is short (one read + one fsync'd write).

`Lookup` watches the file's modification time and reloads the in-memory map when the file has advanced since its last load, so writes from a sibling process become visible without a restart. The `<path>.lock` sentinel file is created on first write and never deleted: removing it would let two processes lock different inodes and lose mutual exclusion.

## Welcome Message

Display a message when users start a session:
30 changes: 30 additions & 0 deletions examples/cached_responses.yaml
@@ -0,0 +1,30 @@
#!/usr/bin/env docker agent run

# Demonstrates the response cache.
#
# The first time a question is asked, the agent calls the model normally and
# stores the answer. The second time the same question is asked, the answer is
# replayed verbatim and the model is not invoked at all.
#
# Two normalization options control what "same question" means:
# - case_sensitive: when false (the default), "Hello" and "hello" match.
# - trim_spaces: when true, leading and trailing whitespace is ignored.
#
# Storage is in-memory by default. Set `path` to persist entries to a JSON
# file that is reloaded on startup so the cache survives restarts. Multiple
# processes can safely share the same file: an advisory lock on
# `<path>.lock` serializes writes, and Lookup reloads the in-memory map
# when the file changes externally.

agents:
root:
model: openai/gpt-5-mini
description: A helpful AI assistant with a response cache
instruction: |
You are a knowledgeable assistant that helps users with various tasks.
Be helpful, accurate, and concise in your responses.
cache:
enabled: true
case_sensitive: false # "Hello" == "hello"
trim_spaces: true # " hello " == "hello"
path: ./cache.json # remove this line to keep the cache in memory only
8 changes: 8 additions & 0 deletions pkg/agent/agent.go
@@ -9,6 +9,7 @@ import (
"sync/atomic"
"time"

"github.com/docker/docker-agent/pkg/cache"
"github.com/docker/docker-agent/pkg/config/latest"
"github.com/docker/docker-agent/pkg/config/types"
"github.com/docker/docker-agent/pkg/model/provider"
@@ -41,6 +42,7 @@ type Agent struct {
tools []tools.Tool
commands types.Commands
hooks *latest.HooksConfig
cache *cache.Cache

// warningsMu guards pendingWarnings. addToolWarning and DrainWarnings
// may be called concurrently from the runtime loop, the MCP server,
@@ -254,6 +256,12 @@ func (a *Agent) Hooks() *latest.HooksConfig {
return a.hooks
}

// Cache returns the response cache configured for this agent, or nil when
// caching is disabled.
func (a *Agent) Cache() *cache.Cache {
return a.cache
}

// Tools returns the tools available to this agent
func (a *Agent) Tools(ctx context.Context) ([]tools.Tool, error) {
a.ensureToolSetsAreStarted(ctx)
8 changes: 8 additions & 0 deletions pkg/agent/opts.go
@@ -3,6 +3,7 @@ package agent
import (
"time"

"github.com/docker/docker-agent/pkg/cache"
"github.com/docker/docker-agent/pkg/config/latest"
"github.com/docker/docker-agent/pkg/config/types"
"github.com/docker/docker-agent/pkg/model/provider"
@@ -172,3 +173,10 @@ func WithHooks(hooks *latest.HooksConfig) Opt {
a.hooks = hooks
}
}

// WithCache attaches a response cache to the agent. Pass nil to disable.
func WithCache(c *cache.Cache) Opt {
return func(a *Agent) {
a.cache = c
}
}
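`WithCache` follows the same functional-options pattern as `WithHooks`. A self-contained illustration of how such an option is applied (using local stub types, not the real `pkg/agent` and `pkg/cache` types):

```go
package main

import "fmt"

// Cache and Agent are stand-in stubs for this illustration only.
type Cache struct{ entries map[string]string }

type Agent struct{ cache *Cache }

// Opt mutates an Agent during construction.
type Opt func(*Agent)

// WithCache attaches a response cache; passing nil leaves caching off.
func WithCache(c *Cache) Opt {
	return func(a *Agent) { a.cache = c }
}

// NewAgent applies each option in order to a zero-valued Agent.
func NewAgent(opts ...Opt) *Agent {
	a := &Agent{}
	for _, o := range opts {
		o(a)
	}
	return a
}

func main() {
	a := NewAgent(WithCache(&Cache{entries: map[string]string{}}))
	fmt.Println(a.cache != nil) // true
}
```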