Skip to content

Commit 4ca9713

Browse files
authored
Merge pull request #2546 from dgageot/board/llm-as-judge-for-auto-approving-tool-1563505f
feat(hooks): add 'type: model' hook and integrate pre_tool_use into approval flow
2 parents 27f98ce + 3a6990e commit 4ca9713

18 files changed

Lines changed: 1549 additions & 53 deletions

‎agent-schema.json‎

Lines changed: 32 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -701,15 +701,16 @@
701701
},
702702
"type": {
703703
"type": "string",
704-
"description": "Type of hook. 'command' executes a shell command; 'builtin' invokes a named in-process Go function registered by the runtime. The docker-agent runtime ships these builtins: 'add_date' (turn_start: today's date), 'add_environment_info' (session_start: cwd, git, OS, arch), 'add_prompt_files' (turn_start: contents of named files looked up in the workdir hierarchy and the home directory), 'add_git_status' (turn_start: `git status --short --branch`), 'add_git_diff' (turn_start: `git diff --stat`, or full diff with args=['full']), 'add_directory_listing' (session_start: top-level entries of cwd), 'add_user_info' (session_start: current OS user and hostname), 'add_recent_commits' (session_start: `git log --oneline -n N`, default N=10, override via args=['<N>']), 'max_iterations' (before_llm_call: hard stop after N model calls; args=['<N>'] required).",
704+
"description": "Type of hook. 'command' executes a shell command; 'builtin' invokes a named in-process Go function registered by the runtime; 'model' asks an LLM and translates its reply into the hook's native output (used for LLM-as-a-judge pre_tool_use, summarizers, etc., with no Go code). The docker-agent runtime ships these builtins: 'add_date' (turn_start: today's date), 'add_environment_info' (session_start: cwd, git, OS, arch), 'add_prompt_files' (turn_start: contents of named files looked up in the workdir hierarchy and the home directory), 'add_git_status' (turn_start: `git status --short --branch`), 'add_git_diff' (turn_start: `git diff --stat`, or full diff with args=['full']), 'add_directory_listing' (session_start: top-level entries of cwd), 'add_user_info' (session_start: current OS user and hostname), 'add_recent_commits' (session_start: `git log --oneline -n N`, default N=10, override via args=['<N>']), 'max_iterations' (before_llm_call: hard stop after N model calls; args=['<N>'] required).",
705705
"enum": [
706706
"command",
707-
"builtin"
707+
"builtin",
708+
"model"
708709
]
709710
},
710711
"command": {
711712
"type": "string",
712-
"description": "Shell command (type=command) or builtin name (type=builtin) to invoke. Command hooks receive JSON input via stdin with tool/session information."
713+
"description": "Shell command (type=command) or builtin name (type=builtin) to invoke. Command hooks receive JSON input via stdin with tool/session information. Ignored when type=model."
713714
},
714715
"args": {
715716
"type": "array",
@@ -744,13 +745,38 @@
744745
"block"
745746
],
746747
"default": "warn"
748+
},
749+
"model": {
750+
"type": "string",
751+
"description": "Model spec ('provider/model', e.g. 'openai/gpt-4o-mini') invoked by type=model hooks. Required for that type, ignored otherwise."
752+
},
753+
"prompt": {
754+
"type": "string",
755+
"description": "User-message template rendered for each invocation of a type=model hook. Parsed as a Go text/template with the hook Input as the data context: {{ .ToolName }}, {{ .ToolInput }}, {{ .StopResponse }}, etc. Required for type=model."
756+
},
757+
"schema": {
758+
"type": "string",
759+
"description": "Well-known response interpretation for type=model hooks. Empty: return the model's reply as additional_context. 'pre_tool_use_decision': ask the model for {decision, reason} JSON and produce a permission_decision verdict (allow|ask|deny)."
747760
}
748761
},
749762
"required": [
750-
"type",
751-
"command"
763+
"type"
752764
],
753-
"additionalProperties": false
765+
"additionalProperties": false,
766+
"allOf": [
767+
{
768+
"if": {"properties": {"type": {"const": "command"}}, "required": ["type"]},
769+
"then": {"required": ["command"]}
770+
},
771+
{
772+
"if": {"properties": {"type": {"const": "builtin"}}, "required": ["type"]},
773+
"then": {"required": ["command"]}
774+
},
775+
{
776+
"if": {"properties": {"type": {"const": "model"}}, "required": ["type"]},
777+
"then": {"required": ["model", "prompt"]}
778+
}
779+
]
754780
},
755781
"ModelConfig": {
756782
"type": "object",

‎docs/configuration/hooks/index.md‎

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -471,6 +471,66 @@ hooks:
471471

472472
Return nothing to fall through to the usual interactive confirmation.
473473

474+
### LLM as a Judge (Auto-Approving Tool Calls)
475+
476+
The `model` hook type asks an LLM and translates its reply into the
477+
hook's native output — no Go code, no shell glue, no JSON parsing on
478+
your side. Combined with the well-known `pre_tool_use_decision`
479+
schema it gives you a fully-configurable LLM judge that decides
480+
`allow` / `ask` / `deny` per tool call.
481+
482+
```yaml
483+
hooks:
484+
pre_tool_use:
485+
- matcher: "shell|edit_file|mcp:.*"
486+
hooks:
487+
- type: model
488+
model: openai/gpt-4o-mini
489+
timeout: 15
490+
schema: pre_tool_use_decision
491+
prompt: |
492+
You are a security judge for an autonomous agent.
493+
Decide whether this tool call is safe to auto-approve.
494+
495+
Tool: {{ .ToolName }}
496+
Args: {{ .ToolInput | toJSON }}
497+
498+
Project rules:
499+
- Reads under the working directory are safe.
500+
- Any write under ~/.ssh, ~/.aws, or ~/.docker is `deny`.
501+
```
502+
503+
| Field | Required | Description |
504+
| -------- | ----------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
505+
| `model` | yes | Model spec (`provider/model`, e.g. `openai/gpt-4o-mini`). The judge model — small/cheap is recommended. |
506+
| `prompt` | yes | Go [`text/template`](https://pkg.go.dev/text/template) body. Sees the hook [Input](#hook-input) as data, plus the `toJSON` and `truncate <n>` helpers. |
507+
| `schema` | no | Well-known response interpretation. `pre_tool_use_decision` produces a `permission_decision` verdict; omit for free-form text injected as `additional_context`. |
508+
| `timeout`| no (default 60s) | Per-call timeout. **Timeouts fail closed (deny) for `pre_tool_use`** regardless of any other setting. Match it to your judge model's typical latency plus a small buffer. |
509+
510+
The `pre_tool_use_decision` schema constrains the judge to reply with
511+
strict `{decision, reason}` JSON. Providers that honor structured
512+
output (OpenAI, ...) are asked to emit that shape directly; on
513+
providers that ignore it the framework still parses tolerant
514+
JSON-in-text. Anything unparseable propagates as a hook error and the
515+
executor fails closed (deny) on `pre_tool_use`.
516+
517+
Pair it with deterministic `permissions:` rules so destructive calls
518+
(e.g. `sudo`, `rm -rf`) are blocked even if the judge is misled, and
519+
obvious read-only calls bypass the LLM entirely. See
520+
[`examples/llm_judge.yaml`](https://github.com/docker/docker-agent/blob/main/examples/llm_judge.yaml)
521+
for a complete configuration.
522+
523+
**Security considerations**:
524+
525+
- **Sensitive data**: Tool arguments (including file paths, command
526+
arguments, and any other parameters) are sent to the judge LLM. Avoid
527+
using the judge on tools that handle secrets, or ensure your judge
528+
model is self-hosted.
529+
- **Defense in depth**: The judge should not be your only security
530+
layer. Use deterministic `permissions:` rules to block obviously
531+
dangerous operations (e.g., `sudo`, `rm -rf`) before the judge sees
532+
them, as shown in the example configuration.
533+
474534
</div>
475535

476536
## CLI Flags

‎examples/llm_judge.yaml‎

Lines changed: 90 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,90 @@
1+
#!/usr/bin/env docker agent run
2+
#
3+
# LLM-as-a-judge — auto-approve tool calls with a small model
4+
# ==========================================================================
5+
#
6+
# This example wires three layers of tool-call control:
7+
#
8+
# 1. Deterministic permissions — `permissions:` block. Hard deny rules
9+
# catch obvious abuse (sudo, rm -rf, etc) regardless of what the
10+
# judge says, and obvious read-only ops are allow-listed so the
11+
# judge isn't paid for every harmless call.
12+
#
13+
# 2. LLM judge (this file's centerpiece) — a `pre_tool_use` hook of
14+
# type `model` that asks a small LLM to judge every shell / edit
15+
# / MCP call that fell through the deterministic rules. The
16+
# `pre_tool_use_decision` schema constrains the model to reply
17+
# with strict {decision, reason} JSON; the framework translates
18+
# that into the runtime's permission_decision pipeline.
19+
#
20+
# 3. User confirmation — anything the judge said `ask` on falls
21+
# through to the normal TUI prompt. So the human is still in the
22+
# loop on ambiguous calls.
23+
#
24+
# This config uses ZERO Go code: the `type: model` hook is provided by
25+
# the framework, no builtin to register, no shell glue, no jq+curl.
26+
#
27+
28+
agents:
29+
root:
30+
model: openai/gpt-4o
31+
description: Agent supervised by an LLM judge for tool calls.
32+
instruction: |
33+
You are a helpful assistant with access to shell and filesystem tools.
34+
Use them to help the user. The user's machine is supervised by an
35+
LLM judge that decides which of your tool calls are auto-approved.
36+
37+
toolsets:
38+
- type: shell
39+
- type: filesystem
40+
41+
hooks:
42+
# Layer 2: LLM judge. Scope it via the matcher so we don't pay
43+
# judge latency on cheap tools.
44+
pre_tool_use:
45+
- matcher: "shell|edit_file|mcp:.*"
46+
hooks:
47+
- type: model
48+
model: openai/gpt-4o-mini
49+
# 15s is plenty for a small model; the judge fails closed
50+
# (deny) on timeout. Pair with permissions.allow for
51+
# read-only ops to keep latency budget under control.
52+
timeout: 15
53+
schema: pre_tool_use_decision
54+
prompt: |
55+
You are a security judge for an autonomous agent.
56+
Decide whether this tool call is safe to auto-approve.
57+
58+
Tool: {{ .ToolName }}
59+
Args: {{ .ToolInput | toJSON }}
60+
61+
Project rules:
62+
- Reads under the working directory are safe.
63+
- Network egress to anything other than github.com or
64+
pypi.org should be `ask`.
65+
- Any write under ~/.ssh, ~/.aws, or ~/.docker is `deny`.
66+
- When in doubt, prefer `ask` (the user is then asked).
67+
68+
# Audit trail (in addition to the three layers above): log every completed tool call so the policy can be tuned.
69+
post_tool_use:
70+
- matcher: "*"
71+
hooks:
72+
- type: command
73+
timeout: 5
74+
command: |
75+
INPUT=$(cat)
76+
TS=$(date -u +"%Y-%m-%dT%H:%M:%SZ")
77+
TOOL=$(echo "$INPUT" | jq -r '.tool_name')
78+
echo "[$TS] $TOOL completed" >> /tmp/agent-judge-audit.log
79+
80+
# Layer 1: deterministic rules. Evaluated BEFORE any hook fires, so
81+
# they short-circuit the judge entirely for obvious cases.
82+
permissions:
83+
deny:
84+
- "shell:cmd=*sudo*"
85+
- "shell:cmd=*rm -rf*"
86+
- "shell:cmd=*mkfs*"
87+
- "shell:cmd=*dd if=*"
88+
- "edit_file:path=/etc/*"
89+
allow:
90+
- "read_*"

‎pkg/config/latest/types.go‎

Lines changed: 31 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1816,6 +1816,10 @@ type HookDefinition struct {
18161816
// is owned by the runtime; the docker-agent runtime
18171817
// ships add_date, add_environment_info, and
18181818
// add_prompt_files.
1819+
// - "model": ask an LLM and translate its reply into the hook's
1820+
// native output. See Model / Prompt / Schema. Used to
1821+
// implement "LLM as a judge" pre_tool_use hooks,
1822+
// turn-start summarizers, etc., with no Go code.
18191823
Type string `json:"type" yaml:"type"`
18201824

18211825
// Command is the shell command (Type==command) or the builtin name
@@ -1839,6 +1843,25 @@ type HookDefinition struct {
18391843

18401844
// OnError controls non-fail-closed hook failures: warn (default), ignore, or block.
18411845
OnError string `json:"on_error,omitempty" yaml:"on_error,omitempty"`
1846+
1847+
// Model is the model spec ("provider/model", e.g. "openai/gpt-4o-mini")
1848+
// invoked by Type==model hooks. Required for that type, ignored
1849+
// otherwise.
1850+
Model string `json:"model,omitempty" yaml:"model,omitempty"`
1851+
1852+
// Prompt is the user-message template rendered for each invocation
1853+
// of a Type==model hook. It is parsed as a Go text/template with the
1854+
// hook [Input] as the data context (so {{ .ToolName }},
1855+
// {{ .ToolInput }}, etc. work). Required for Type==model.
1856+
Prompt string `json:"prompt,omitempty" yaml:"prompt,omitempty"`
1857+
1858+
// Schema selects a well-known response interpretation for Type==model
1859+
// hooks. The empty value means "return the model's reply as
1860+
// additional_context". Other values (registered by the runtime) ask
1861+
// the provider for strict-JSON output and translate the result into
1862+
// the right Output shape (e.g. "pre_tool_use_decision" produces a
1863+
// permission_decision verdict).
1864+
Schema string `json:"schema,omitempty" yaml:"schema,omitempty"`
18421865
}
18431866

18441867
// GetTimeout returns the per-hook execution timeout, defaulting to 60
@@ -2044,8 +2067,15 @@ func (h *HookDefinition) validate(prefix string, index int) error {
20442067
if h.Command == "" {
20452068
return fmt.Errorf("hooks.%s[%d]: command must name the builtin to invoke", prefix, index)
20462069
}
2070+
case "model":
2071+
if h.Model == "" {
2072+
return fmt.Errorf("hooks.%s[%d]: model is required for model hooks (e.g. 'openai/gpt-4o-mini')", prefix, index)
2073+
}
2074+
if h.Prompt == "" {
2075+
return fmt.Errorf("hooks.%s[%d]: prompt is required for model hooks", prefix, index)
2076+
}
20472077
default:
2048-
return fmt.Errorf("hooks.%s[%d]: unsupported hook type '%s' (supported: 'command', 'builtin')", prefix, index, h.Type)
2078+
return fmt.Errorf("hooks.%s[%d]: unsupported hook type '%s' (supported: 'command', 'builtin', 'model')", prefix, index, h.Type)
20492079
}
20502080

20512081
return nil

‎pkg/hooks/aggregate_test.go‎

Lines changed: 106 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,106 @@
1+
package hooks
2+
3+
import (
4+
"testing"
5+
6+
"github.com/stretchr/testify/assert"
7+
)
8+
9+
// TestAggregateTracksMostRestrictiveDecision pins the new
10+
// Result.Decision contract: when multiple pre_tool_use hooks fire on a
11+
// single tool call, the aggregated verdict is the most-restrictive
12+
// (Deny > Ask > Allow). The runtime's tool-approval flow consults this
13+
// to short-circuit the user prompt for Allow and to escalate Ask, so
14+
// the ordering must be stable.
15+
func TestAggregateTracksMostRestrictiveDecision(t *testing.T) {
16+
t.Parallel()
17+
18+
mk := func(d Decision, reason string) hookResult {
19+
return hookResult{HandlerResult: HandlerResult{Output: &Output{
20+
HookSpecificOutput: &HookSpecificOutput{
21+
HookEventName: EventPreToolUse,
22+
PermissionDecision: d,
23+
PermissionDecisionReason: reason,
24+
},
25+
}}}
26+
}
27+
28+
cases := []struct {
29+
name string
30+
results []hookResult
31+
wantVerdict Decision
32+
wantReason string
33+
wantAllowed bool
34+
}{
35+
{
36+
name: "no decision: Allowed=true, Decision empty",
37+
results: []hookResult{{}},
38+
wantVerdict: "",
39+
wantAllowed: true,
40+
},
41+
{
42+
name: "single allow",
43+
results: []hookResult{mk(DecisionAllow, "safe")},
44+
wantVerdict: DecisionAllow,
45+
wantReason: "safe",
46+
wantAllowed: true,
47+
},
48+
{
49+
name: "single ask escalates over no decision",
50+
results: []hookResult{{}, mk(DecisionAsk, "unclear")},
51+
wantVerdict: DecisionAsk,
52+
wantReason: "unclear",
53+
wantAllowed: true, // Ask doesn't flip Allowed; the runtime handles the prompt.
54+
},
55+
{
56+
name: "deny beats ask beats allow",
57+
results: []hookResult{
58+
mk(DecisionAllow, "looks fine"),
59+
mk(DecisionAsk, "second-guess"),
60+
mk(DecisionDeny, "destructive"),
61+
},
62+
wantVerdict: DecisionDeny,
63+
wantReason: "destructive",
64+
wantAllowed: false,
65+
},
66+
{
67+
name: "first reason wins on ties",
68+
results: []hookResult{
69+
mk(DecisionAsk, "first ask"),
70+
mk(DecisionAsk, "second ask"),
71+
},
72+
wantVerdict: DecisionAsk,
73+
wantReason: "first ask",
74+
wantAllowed: true,
75+
},
76+
}
77+
for _, tc := range cases {
78+
t.Run(tc.name, func(t *testing.T) {
79+
t.Parallel()
80+
final := aggregate(tc.results, EventPreToolUse)
81+
assert.Equal(t, tc.wantVerdict, final.Decision)
82+
assert.Equal(t, tc.wantReason, final.DecisionReason)
83+
assert.Equal(t, tc.wantAllowed, final.Allowed)
84+
})
85+
}
86+
}
87+
88+
// TestAggregateDecisionEmptyForNonPreToolUse documents that
89+
// Result.Decision is meaningful only for pre_tool_use events. Other
90+
// events (turn_start, post_tool_use, ...) MUST leave it empty so a
91+
// runtime that consults it can't accidentally act on a stale verdict
92+
// from an unrelated hook.
93+
func TestAggregateDecisionEmptyForNonPreToolUse(t *testing.T) {
94+
t.Parallel()
95+
96+
results := []hookResult{{HandlerResult: HandlerResult{Output: &Output{
97+
HookSpecificOutput: &HookSpecificOutput{
98+
HookEventName: EventTurnStart,
99+
PermissionDecision: DecisionAllow, // misconfigured but possible
100+
},
101+
}}}}
102+
103+
final := aggregate(results, EventTurnStart)
104+
assert.Equal(t, Decision(""), final.Decision)
105+
assert.Empty(t, final.DecisionReason)
106+
}

‎pkg/hooks/builtins/builtins.go‎

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,10 @@
2323
// stable for its duration. max_iterations is stateful: its
2424
// per-session counter lives on the [State] returned by [Register];
2525
// the runtime clears it via [State.ClearSession] from session_end.
26+
//
27+
// LLM-as-a-judge hooks are NOT shipped here: write `type: model` with
28+
// `schema: pre_tool_use_decision` instead — see
29+
// pkg/hooks/shape_pre_tool_use_decision.go and examples/llm_judge.yaml.
2630
package builtins
2731

2832
import (

0 commit comments

Comments
 (0)