forked from confident-ai/deepeval
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtest_pydantic_agent.py
More file actions
61 lines (44 loc) · 1.89 KB
/
Copy pathtest_pydantic_agent.py
File metadata and controls
61 lines (44 loc) · 1.89 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
"""test_pydantic_agent.py — pytest analog of ``pydantic_after_evals_iterator.py``.
Run with::
deepeval test run test_pydantic_agent.py
Same 3 goldens, same agent setup, but driven by pytest + ``assert_test``
instead of ``dataset.evals_iterator``. The deepeval pytest plugin
(``deepeval test run``) wraps each test in an eval session so the agent's
OTel spans route through REST and the trace gets evaluated against the
metrics passed to ``assert_test``.
Requirements:
- ``CONFIDENT_API_KEY`` in env (or ``deepeval login``)
- ``OPENAI_API_KEY`` in env
- ``pip install pydantic-ai pytest``
"""
import asyncio
import uuid
from pathlib import Path
import pytest
from pydantic_ai import Agent
from deepeval import assert_test
from deepeval.dataset import EvaluationDataset, Golden
from deepeval.integrations.pydantic_ai import DeepEvalInstrumentationSettings
from deepeval.metrics import AnswerRelevancyMetric
from deepeval.tracing.context import next_agent_span
RUN_ID = f"{Path(__file__).stem}-{uuid.uuid4().hex[:8]}"
agent = Agent(
"openai:gpt-4o-mini",
system_prompt="Be concise. Reply with one short sentence.",
instrument=DeepEvalInstrumentationSettings(),
)
async def run_agent(prompt: str) -> str:
# Span-level metric attached to the agent span via next_agent_span;
# trace-level metric is passed to assert_test below. Mirrors the
# split used in pydantic_after_evals_iterator.py.
with next_agent_span(metrics=[AnswerRelevancyMetric(threshold=0.2)]):
result = await agent.run(prompt)
return result.output
dataset = EvaluationDataset()
dataset.pull(alias="Single Turn QA")
@pytest.mark.parametrize("golden", dataset.goldens)
async def test_pydantic_agent(golden: Golden):
# await agent.run(golden.input)
await run_agent(golden.input)
# asyncio.run(run_agent(golden.input))
assert_test(golden=golden, metrics=[AnswerRelevancyMetric(threshold=0.8)])