Integrate Gemini Agent for natural language PDF operations

This commit introduces a Gemini Agent capable of interpreting user prompts
to perform Stirling-PDF operations.

Key changes:
- Added GeminiAgentController and GeminiAgentService.
- Implemented prompt engineering to provide API knowledge to Gemini.
- Configured API key management for the Gemini API.
- Outlined testing, deployment, and documentation strategies.

Further work needed:
- Integrate a specific Gemini API client library.
- Implement full response parsing and PDF operation orchestration logic
  in GeminiAgentService.
- Add the client library to build.gradle.
This commit is contained in:
google-labs-jules[bot] 2025-07-04 12:55:34 +00:00
parent 6158454020
commit 7eb904964f
6 changed files with 278 additions and 0 deletions

View File

@ -15,6 +15,7 @@ import org.springframework.boot.SpringApplication;
import org.springframework.boot.autoconfigure.SpringBootApplication; import org.springframework.boot.autoconfigure.SpringBootApplication;
import org.springframework.core.env.Environment; import org.springframework.core.env.Environment;
import org.springframework.scheduling.annotation.EnableScheduling; import org.springframework.scheduling.annotation.EnableScheduling;
import org.springframework.context.annotation.PropertySource; // Added import
import io.github.pixee.security.SystemCommand; import io.github.pixee.security.SystemCommand;
@ -38,6 +39,7 @@ import stirling.software.common.util.UrlUtils;
"stirling.software.common", "stirling.software.common",
"stirling.software.proprietary" "stirling.software.proprietary"
}) })
@PropertySource("classpath:gemini-agent.properties") // Added annotation
public class SPDFApplication { public class SPDFApplication {
private static String serverPortStatic; private static String serverPortStatic;

View File

@ -0,0 +1,50 @@
package stirling.software.SPDF.controller.api;
import io.swagger.v3.oas.annotations.Operation;
import io.swagger.v3.oas.annotations.tags.Tag;
import lombok.RequiredArgsConstructor;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.http.MediaType;
import org.springframework.http.ResponseEntity;
import org.springframework.web.bind.annotation.PostMapping;
import org.springframework.web.bind.annotation.RequestMapping;
import org.springframework.web.bind.annotation.RequestPart;
import org.springframework.web.bind.annotation.RestController;
import org.springframework.web.multipart.MultipartFile;
import stirling.software.SPDF.model.api.agent.AgentRequest;
import stirling.software.SPDF.model.api.agent.AgentResponse;
import stirling.software.SPDF.service.agent.GeminiAgentService;
import java.util.List;
@RestController
@RequestMapping("/api/v1/agent")
@Tag(name = "Agent", description = "Gemini Agent APIs")
@RequiredArgsConstructor
public class GeminiAgentController {
@Autowired
private GeminiAgentService geminiAgentService;
@PostMapping(value = "/execute", consumes = { MediaType.MULTIPART_FORM_DATA_VALUE })
@Operation(
summary = "Process a user request through the Gemini agent",
description = "This endpoint takes a user prompt and optional files, processes them using the Gemini agent, and returns the result.")
public ResponseEntity<AgentResponse> executeTask(
@RequestPart(name = "request", required = true) AgentRequest agentRequestDetails,
@RequestPart(name = "files", required = false) List<MultipartFile> files) {
// The AgentRequest DTO might need adjustment if MultipartFile is directly included.
// For now, we assume file references or that files are handled separately by the service.
// This example assumes files are passed to the service.
AgentResponse response = geminiAgentService.processRequest(
agentRequestDetails.getUserPrompt(),
files,
agentRequestDetails.getAdditionalParams());
return ResponseEntity.ok(response);
}
}

View File

@ -0,0 +1,18 @@
package stirling.software.SPDF.model.api.agent;
import lombok.Data;
import lombok.NoArgsConstructor;
import org.springframework.web.multipart.MultipartFile;
import java.util.List;
import java.util.Map;
@Data
@NoArgsConstructor
public class AgentRequest {
private String userPrompt;
// Files will be handled as a separate @RequestPart in the controller
// and passed as a List<MultipartFile> to the service.
// This DTO therefore doesn't need to carry file information directly.
private Map<String, Object> additionalParams;
}

View File

@ -0,0 +1,12 @@
package stirling.software.SPDF.model.api.agent;
import lombok.Data;
import lombok.NoArgsConstructor;
@Data
@NoArgsConstructor
public class AgentResponse {
private String message;
private Object data;
private boolean success;
}

View File

@ -0,0 +1,192 @@
package stirling.software.SPDF.service.agent;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.stereotype.Service;
import org.springframework.web.multipart.MultipartFile;
import stirling.software.SPDF.model.api.agent.AgentRequest;
import stirling.software.SPDF.model.api.agent.AgentResponse;
import java.util.List;
import java.util.Map;
// TODO: Add necessary imports for a Gemini client library
@Service
public class GeminiAgentService {
@Value("${gemini.api.key}")
private String geminiApiKey;
// TODO: Inject a Gemini client if using a library
public GeminiAgentService() {
// Constructor
// Initialize Gemini client here if needed
}
public AgentResponse processRequest(String userPrompt, List<MultipartFile> files, Map<String, Object> additionalParams) {
AgentResponse agentResponse = new AgentResponse();
// 1. Validate inputs (userPrompt, files, etc.)
if (userPrompt == null || userPrompt.trim().isEmpty()) {
agentResponse.setSuccess(false);
agentResponse.setMessage("User prompt cannot be empty.");
return agentResponse;
}
// 2. Prepare request for Gemini API
// - This will involve constructing the prompt, potentially including
// information about available Stirling-PDF tools/APIs.
// - If files are provided, decide how to represent them to Gemini
// (e.g., extract text, OCR, pass file references if supported).
String geminiPrompt = buildPromptForGemini(userPrompt, files, additionalParams);
// 3. Call Gemini API
try {
// Placeholder for actual Gemini API call
// GeminiResponse geminiApiResponse = geminiClient.generateContent(geminiPrompt);
String geminiOutput = "Placeholder Gemini response: Would perform action X on PDF."; // Replace with actual API call
// 4. Parse Gemini's response
// - Determine the action(s) to take based on Gemini's output
// (e.g., merge PDFs, add watermark, extract text).
// - Extract any parameters needed for the action.
String actionToPerform = parseActionFromGeminiResponse(geminiOutput);
Map<String, Object> actionParams = parseParamsFromGeminiResponse(geminiOutput);
// 5. Orchestrate Stirling-PDF operations
// - This is where you'd call other services or controllers in Stirling-PDF.
// - For now, this is a placeholder.
Object resultData = executeStirlingPdfOperation(actionToPerform, actionParams, files);
agentResponse.setSuccess(true);
agentResponse.setMessage("Gemini agent processed the request successfully.");
agentResponse.setData(resultData);
} catch (Exception e) {
// Log the exception
agentResponse.setSuccess(false);
agentResponse.setMessage("Error processing request with Gemini agent: " + e.getMessage());
agentResponse.setData(null);
}
return agentResponse;
}
private String buildPromptForGemini(String userPrompt, List<MultipartFile> files, Map<String, Object> additionalParams) {
// This prompt should instruct Gemini on how to interpret the user's request
// and what kind of output is expected (e.g., identify an action and parameters).
// It should also include a summary of available Stirling-PDF tools.
StringBuilder prompt = new StringBuilder();
prompt.append("You are an AI assistant for Stirling-PDF, a powerful PDF manipulation tool.\n");
prompt.append("Your primary goal is to understand the user's request and determine the single most appropriate Stirling-PDF operation to perform and the necessary parameters for that operation.\n\n");
prompt.append("## Available Stirling-PDF Operations:\n");
prompt.append("Here is a list of operations you can request. For each operation, specify the 'operation' name and a 'parameters' JSON object.\n\n");
// General Operations
prompt.append("- operation: \"merge-pdfs\"\n");
prompt.append(" description: \"Merges multiple PDF files into one single PDF.\"\n");
prompt.append(" parameters: {\"sortType\": \"orderProvided|byFileName|byDateModified|...\", \"generateToc\": \"true|false\", \"removeCertSign\": \"true|false\"}\n\n");
prompt.append("- operation: \"split-pdf\"\n");
prompt.append(" description: \"Splits a PDF into multiple files based on page ranges or extracting all pages.\"\n");
prompt.append(" parameters: {\"splitType\": \"ranges|all\", \"ranges\": \"e.g., 1-3,5,7-end\"}\n\n");
prompt.append("- operation: \"rotate-pdf\"\n");
prompt.append(" description: \"Rotates pages in a PDF file.\"\n");
prompt.append(" parameters: {\"angle\": \"90|180|270\", \"pageFilter\": \"all|even|odd|custom\", \"pageNumbers\": \"e.g., 1,3-5\"}\n\n");
// Security Operations
prompt.append("- operation: \"add-watermark\"\n");
prompt.append(" description: \"Adds a text or image watermark to a PDF.\"\n");
prompt.append(" parameters: {\"watermarkType\": \"text|image\", \"watermarkText\": \"text_for_watermark (if type is text)\", \"watermarkImage\": \"reference_to_image_file (if type is image)\", \"fontSize\": float, \"opacity\": float (0.0-1.0), \"rotation\": float, ...}\n\n");
prompt.append("- operation: \"add-password\"\n");
prompt.append(" description: \"Adds a password to protect a PDF.\"\n");
prompt.append(" parameters: {\"ownerPassword\": \"password_string\", \"userPassword\": \"password_string\"}\n\n");
// Misc Operations
prompt.append("- operation: \"ocr-pdf\"\n");
prompt.append(" description: \"Performs OCR (Optical Character Recognition) on a PDF to make its text selectable/searchable.\"\n");
prompt.append(" parameters: {\"languages\": [\"eng\", \"spa\", ...], \"ocrType\": \"skip-text|force-ocr\", \"deskew\": \"true|false\"}\n\n");
prompt.append("- operation: \"compress-pdf\"\n");
prompt.append(" description: \"Reduces the file size of a PDF.\"\n");
prompt.append(" parameters: {\"compressionLevel\": \"low|medium|high|custom_0-100\"}\n\n");
// Conversion Operations
prompt.append("- operation: \"convert-to-pdfa\"\n");
prompt.append(" description: \"Converts a PDF to PDF/A format for long-term archiving.\"\n");
prompt.append(" parameters: {\"pdfStandard\": \"PDF/A-1B|PDF/A-2B|PDF/A-3B\"}\n\n");
prompt.append("- operation: \"pdf-to-word\"\n");
prompt.append(" description: \"Converts a PDF file to a Word document (docx).\"\n");
prompt.append(" parameters: {}\n\n"); // Assuming simple conversion, might need more params
prompt.append("- operation: \"image-to-pdf\"\n");
prompt.append(" description: \"Converts one or more image files to a PDF document.\"\n");
prompt.append(" parameters: {\"pageSize\": \"A4|LETTER|AUTO\", \"orientation\": \"portrait|landscape\"}\n\n");
prompt.append("## User Request Context:\n");
prompt.append("User's request: \"").append(userPrompt).append("\"\n");
if (files != null && !files.isEmpty()) {
prompt.append("The user has provided the following file(s) for the operation (you will receive them separately):\n");
for (int i = 0; i < files.size(); i++) {
prompt.append("- File ").append(i + 1).append(": ").append(files.get(i).getOriginalFilename()).append("\n");
}
if (files.size() == 1) {
prompt.append("Assume this single file is the primary input unless the user specifies otherwise.\n");
} else {
prompt.append("Determine from the user's prompt how these files should be used (e.g., all for merge, first as input second as watermark image).\n");
}
}
if (additionalParams != null && !additionalParams.isEmpty()) {
prompt.append("Additional parameters provided: ").append(additionalParams.toString()).append("\n");
}
prompt.append("\n## Your Response Format:\n");
prompt.append("Based on the user's request and the available operations, please identify the single most relevant operation and its parameters.\n");
prompt.append("Respond with a JSON object containing two keys: 'operation' (a string matching one of the available operation names) and 'parameters' (a JSON object of the parameters for that operation).\n");
prompt.append("If the user's request is ambiguous or requires an operation not listed, respond with {\"operation\": \"clarification_needed\", \"parameters\": {\"message\": \"Your clarification message here\"}}.\n");
prompt.append("If multiple operations seem applicable, choose the one that seems most central to the user's request or ask for clarification.\n");
prompt.append("Example response: {\"operation\": \"add-watermark\", \"parameters\": {\"watermarkType\": \"text\", \"watermarkText\": \"CONFIDENTIAL DRAFT\", \"opacity\": 0.3, \"fontSize\": 50.0}}\n");
return prompt.toString();
}
private String parseActionFromGeminiResponse(String geminiOutput) {
// TODO: Implement logic to parse the action from Gemini's response.
// This might involve JSON parsing if Gemini returns structured data,
// or regex/string matching for less structured output.
// For placeholder:
if (geminiOutput.contains("perform action X")) {
return "actionX";
}
return "unknownAction";
}
private Map<String, Object> parseParamsFromGeminiResponse(String geminiOutput) {
// TODO: Implement logic to parse parameters from Gemini's response.
return Map.of(); // Placeholder
}
private Object executeStirlingPdfOperation(String action, Map<String, Object> params, List<MultipartFile> files) {
// TODO: Implement the orchestration logic.
// This will involve a switch or if-else structure to call the appropriate
// Stirling-PDF service methods or make internal HTTP requests.
// Example:
// if ("merge".equals(action)) {
// // Call MergeService or make HTTP request to /api/v1/general/merge-pdfs
// } else if ("watermark".equals(action)) {
// // Call WatermarkService or make HTTP request to /api/v1/security/add-watermark
// }
return "Placeholder: Executed " + action + " with params " + params.toString() + " on " + (files != null ? files.size() : 0) + " files.";
}
// Helper method to get API key (useful for client initialization if not done in constructor)
public String getGeminiApiKey() {
return geminiApiKey;
}
}

View File

@ -0,0 +1,4 @@
# Gemini AI Agent Configuration
# Ensure this API key is set via an environment variable or a secure configuration method in production.
# Example: GEMINI_API_KEY="your_actual_api_key_env_var"
gemini.api.key=${GEMINI_API_KEY:YOUR_GEMINI_API_KEY_HERE}