Files
Stirling-PDF/engine/tests/test_pdf_edit_agent.py
James Brunton 3e94157137 Add document context for edit agent (#6152)
# Description of Changes
Adds the ability for the Edit agent to request the content of the
document before it decides which parameters it needs. This makes it able
to process requests like `Split the document after the page containing
the "My Section" section`, allowing for document context-based requests
for all[^1] tools.

I had to make a few changes elsewhere to make this work, including:
- Moved the logic for requesting content out of the Question Agent and
into a common location
- Added specific API docs for the Split param because the generic ones
were not specific enough for the AI to be able to reliably perform the
correct operation
- Fixed an issue in the tool models generator which caused the Redact
params to only be half-generated (causing Pydantic to crash when the AI
tried to run Redact)
- Added missing logging to a bunch of tools and hooked it up properly so
it'll print to stderr
- Made the limits for the max pages/chars to extract from PDFs
configurable via env var

[^1]: Many of the tools can't actually do anything useful with the
context at this stage. Once the tool API is extended with new features
such as page-specific operations, they will automatically be able to
perform smart operations without any changes to the Edit agent itself.
2026-04-23 13:19:27 +00:00

278 lines
9.1 KiB
Python

from __future__ import annotations
from dataclasses import dataclass
import pytest
from stirling.agents import PdfEditAgent, PdfEditParameterSelector, PdfEditPlanSelection
from stirling.agents.pdf_edit import PdfEditPlanOutput
from stirling.contracts import (
EditCannotDoResponse,
EditClarificationRequest,
EditPlanResponse,
ExtractedFileText,
NeedContentFileRequest,
NeedContentResponse,
PdfContentType,
PdfEditRequest,
PdfTextSelection,
SupportedCapability,
ToolOperationStep,
)
from stirling.models.tool_models import Angle, FlattenParams, RotatePdfParams, ToolEndpoint
from stirling.services.runtime import AppRuntime
@dataclass(frozen=True)
class ParameterSelectorCall:
    """Immutable record of the arguments supplied to one ``select`` call."""

    # The edit request that was handed to the selector.
    request: PdfEditRequest
    # The list of tool endpoints that was passed along with the request.
    operation_plan: list[ToolEndpoint]
    # The index argument the selector was called with.
    operation_index: int
    # The steps argument the selector was called with.
    generated_steps: list[ToolOperationStep]
class RecordingParameterSelector:
    """Test double that logs every ``select`` call and returns canned parameters."""

    def __init__(self) -> None:
        # Chronological log of the arguments from each ``select`` invocation.
        self.calls: list[ParameterSelectorCall] = []

    async def select(
        self,
        request: PdfEditRequest,
        operation_plan: list[ToolEndpoint],
        operation_index: int,
        generated_steps: list[ToolOperationStep],
    ) -> RotatePdfParams | FlattenParams:
        """Record the call, then return fixed parameters chosen by ``operation_index``.

        Index 0 yields ``RotatePdfParams`` built with ``Angle(90)``; any other
        index yields ``FlattenParams``.
        """
        # Store a copy of the step list so the record is decoupled from the
        # list object the caller passed in.
        steps_snapshot = list(generated_steps)
        recorded = ParameterSelectorCall(
            request=request,
            operation_plan=operation_plan,
            operation_index=operation_index,
            generated_steps=steps_snapshot,
        )
        self.calls.append(recorded)

        if operation_index != 0:
            return FlattenParams(flatten_only_forms=False, render_dpi=None)
        return RotatePdfParams(angle=Angle(90))
class StubPdfEditAgent(PdfEditAgent):
    """``PdfEditAgent`` whose plan selection is replaced by a fixed, pre-built result."""

    def __init__(
        self,
        runtime: AppRuntime,
        selection: PdfEditPlanOutput,
        parameter_selector: RecordingParameterSelector | PdfEditParameterSelector | None = None,
    ) -> None:
        """Store the canned ``selection`` and optionally override the parameter selector."""
        super().__init__(runtime)
        self.selection = selection
        # With no override supplied, ``parameter_selector`` is left exactly as
        # the base initializer left it.
        if parameter_selector is None:
            return
        self.parameter_selector = parameter_selector

    async def _select_plan(
        self,
        request: PdfEditRequest,
        allow_need_content: bool = True,
    ) -> PdfEditPlanOutput:
        """Return the canned selection; both arguments are ignored."""
        return self.selection
@pytest.mark.anyio
async def test_pdf_edit_agent_builds_multi_step_plan(runtime: AppRuntime) -> None:
    """A two-operation selection yields a plan response with one typed step per operation."""
    # Arrange: a stubbed selection of two operations plus a recording selector.
    recorder = RecordingParameterSelector()
    selection = PdfEditPlanSelection(
        operations=[ToolEndpoint.ROTATE_PDF, ToolEndpoint.FLATTEN],
        summary="Rotate the PDF, then compress it.",
        rationale="The pages need reorientation before reducing file size.",
    )
    agent = StubPdfEditAgent(runtime, selection, parameter_selector=recorder)
    edit_request = PdfEditRequest(
        user_message="Rotate the PDF clockwise and then compress it.",
        file_names=["scan.pdf"],
    )

    # Act
    response = await agent.handle(edit_request)

    # Assert: summary and rationale pass through, steps follow the selection order.
    assert isinstance(response, EditPlanResponse)
    assert response.summary == "Rotate the PDF, then compress it."
    assert response.rationale == "The pages need reorientation before reducing file size."
    step_tools = [step.tool for step in response.steps]
    assert step_tools == [ToolEndpoint.ROTATE_PDF, ToolEndpoint.FLATTEN]
    first_step, second_step = response.steps[0], response.steps[1]
    assert isinstance(first_step.parameters, RotatePdfParams)
    assert isinstance(second_step.parameters, FlattenParams)
@pytest.mark.anyio
async def test_pdf_edit_agent_passes_previous_steps_to_parameter_selector(runtime: AppRuntime) -> None:
    """The second selector call receives the step produced for the first operation."""
    recorder = RecordingParameterSelector()
    selection = PdfEditPlanSelection(
        operations=[ToolEndpoint.ROTATE_PDF, ToolEndpoint.FLATTEN],
        summary="Rotate the PDF, then compress it.",
    )
    agent = StubPdfEditAgent(runtime, selection, parameter_selector=recorder)
    request = PdfEditRequest(
        user_message="Rotate the PDF clockwise and then compress it.",
        file_names=["scan.pdf"],
    )

    response = await agent.handle(request)

    assert isinstance(response, EditPlanResponse)
    assert len(recorder.calls) == 2

    # Exactly two calls were recorded, so unpacking is safe here.
    first_call, second_call = recorder.calls
    assert first_call.operation_index == 0
    assert first_call.generated_steps == []
    assert second_call.operation_index == 1
    expected_rotate_step = ToolOperationStep(
        tool=ToolEndpoint.ROTATE_PDF,
        parameters=RotatePdfParams(angle=Angle(90)),
    )
    assert second_call.generated_steps == [expected_rotate_step]
@pytest.mark.anyio
async def test_pdf_edit_agent_returns_clarification_without_partial_plan(runtime: AppRuntime) -> None:
    """When selection yields a clarification request, ``handle`` returns a clarification request."""
    clarification = EditClarificationRequest(
        question="Which pages should be rotated?",
        reason="The request does not say which pages to change.",
    )
    agent = StubPdfEditAgent(runtime, clarification)
    vague_request = PdfEditRequest(user_message="Rotate some pages.")

    response = await agent.handle(vague_request)

    assert isinstance(response, EditClarificationRequest)
@pytest.mark.anyio
async def test_pdf_edit_agent_returns_cannot_do_without_partial_plan(runtime: AppRuntime) -> None:
    """When selection yields a cannot-do response, ``handle`` returns a cannot-do response."""
    refusal = EditCannotDoResponse(
        reason="This request requires OCR, which is not part of PDF edit planning.",
    )
    agent = StubPdfEditAgent(runtime, refusal)
    unsupported_request = PdfEditRequest(user_message="Read this scan and summarize it.")

    response = await agent.handle(unsupported_request)

    assert isinstance(response, EditCannotDoResponse)
@pytest.mark.anyio
async def test_pdf_edit_agent_returns_need_content_without_building_plan(runtime: AppRuntime) -> None:
    """A need-content selection is returned with files and limits filled in, and no parameters are selected."""
    recorder = RecordingParameterSelector()
    # The stubbed selection carries empty/zero placeholders; the assertions
    # below check that the response holds real values instead.
    placeholder_selection = NeedContentResponse(
        resume_with=SupportedCapability.PDF_EDIT,
        reason="Need page text to locate the NEW PAGE markers.",
        files=[],
        max_pages=0,
        max_characters=0,
    )
    agent = StubPdfEditAgent(runtime, placeholder_selection, parameter_selector=recorder)
    edit_request = PdfEditRequest(
        user_message="Split after every page that says 'NEW PAGE'.",
        file_names=["report.pdf"],
    )

    response = await agent.handle(edit_request)

    assert isinstance(response, NeedContentResponse)
    assert response.resume_with == SupportedCapability.PDF_EDIT
    expected_files = [NeedContentFileRequest(file_name="report.pdf", content_types=[PdfContentType.PAGE_TEXT])]
    assert response.files == expected_files
    assert response.max_pages == runtime.settings.max_pages
    assert response.max_characters == runtime.settings.max_characters
    # The parameter selector must not have been consulted.
    assert recorder.calls == []
@pytest.mark.anyio
async def test_pdf_edit_agent_builds_selection_agent_matching_content_availability(runtime: AppRuntime) -> None:
    """``_select_plan`` passes ``allow_need_content=True`` only without page text and with the caller's consent."""
    from stirling.agents.pdf_edit import PdfEditSelectionAgent

    agent = PdfEditAgent(runtime)
    seen_flags: list[bool] = []

    def capture_flag(*, allow_need_content: bool) -> PdfEditSelectionAgent:
        # Note the flag, then abort so no real selection agent is built or run.
        seen_flags.append(allow_need_content)
        raise _StopSelectionError()

    agent._build_selection_agent = capture_flag

    bare_request = PdfEditRequest(user_message="Rotate.")
    extracted_text = ExtractedFileText(
        file_name="report.pdf",
        pages=[PdfTextSelection(page_number=1, text="content")],
    )
    request_with_text = PdfEditRequest(user_message="Rotate.", page_text=[extracted_text])
    request_without_consent = PdfEditRequest(user_message="Rotate.")

    # 1) No page text, default arguments.
    with pytest.raises(_StopSelectionError):
        await agent._select_plan(bare_request)
    # 2) Page text already attached to the request.
    with pytest.raises(_StopSelectionError):
        await agent._select_plan(request_with_text)
    # 3) Caller explicitly disallows requesting content.
    with pytest.raises(_StopSelectionError):
        await agent._select_plan(request_without_consent, allow_need_content=False)

    assert seen_flags == [True, False, False]
@pytest.mark.anyio
async def test_pdf_edit_selection_agent_excludes_need_content_from_schema_when_not_allowed(
    runtime: AppRuntime,
) -> None:
    """``NeedContentResponse`` is among the output types only when ``allow_need_content`` is true."""
    from stirling.agents.pdf_edit import PdfEditSelectionAgent

    permitted = PdfEditSelectionAgent(runtime, "base", allow_need_content=True)
    forbidden = PdfEditSelectionAgent(runtime, "base", allow_need_content=False)

    permitted_outputs = _agent_output_types(permitted)
    assert NeedContentResponse in permitted_outputs

    forbidden_outputs = _agent_output_types(forbidden)
    assert NeedContentResponse not in forbidden_outputs
def _agent_output_types(agent: object) -> list[type]:
    """Return ``agent.agent.output_type.outputs`` as a list, or ``[]`` if ``outputs`` is absent.

    Attribute access goes through ``getattr`` because ``agent`` is typed as a
    plain ``object``. A missing ``agent`` or ``output_type`` attribute raises
    ``AttributeError``.
    """
    wrapped_agent = getattr(agent, "agent")
    output_spec = getattr(wrapped_agent, "output_type")
    declared_outputs = getattr(output_spec, "outputs", [])
    return list(declared_outputs)
class _StopSelectionError(Exception):
    """Sentinel exception raised by a test stub to abort plan selection early."""
@pytest.mark.anyio
async def test_pdf_edit_agent_passes_page_text_to_parameter_selector(runtime: AppRuntime) -> None:
    """Page text attached to the request is visible on the request the selector receives."""
    recorder = RecordingParameterSelector()
    selection = PdfEditPlanSelection(
        operations=[ToolEndpoint.ROTATE_PDF],
        summary="Rotate the PDF.",
    )
    agent = StubPdfEditAgent(runtime, selection, parameter_selector=recorder)

    only_page = PdfTextSelection(page_number=1, text="NEW PAGE")
    page_text = [ExtractedFileText(file_name="report.pdf", pages=[only_page])]
    edit_request = PdfEditRequest(
        user_message="Rotate clockwise.",
        file_names=["report.pdf"],
        page_text=page_text,
    )

    await agent.handle(edit_request)

    first_call = recorder.calls[0]
    assert first_call.request.page_text == page_text