mirror of
https://github.com/Frooodle/Stirling-PDF.git
synced 2025-09-08 17:51:20 +02:00
Integrate Gemini Agent for natural language PDF operations
This commit introduces a Gemini Agent capable of interpreting user prompts to perform Stirling-PDF operations.

Key changes:
- Added GeminiAgentController and GeminiAgentService.
- Implemented prompt engineering to provide API knowledge to Gemini.
- Configured API key management for the Gemini API.
- Outlined testing, deployment, and documentation strategies.

Further work needed:
- Integrate a specific Gemini API client library.
- Implement full response parsing and PDF operation orchestration logic in GeminiAgentService.
- Add the client library to build.gradle.
This commit is contained in:
parent
6158454020
commit
7eb904964f
@ -15,6 +15,7 @@ import org.springframework.boot.SpringApplication;
|
||||
import org.springframework.boot.autoconfigure.SpringBootApplication;
|
||||
import org.springframework.core.env.Environment;
|
||||
import org.springframework.scheduling.annotation.EnableScheduling;
|
||||
import org.springframework.context.annotation.PropertySource; // Added import
|
||||
|
||||
import io.github.pixee.security.SystemCommand;
|
||||
|
||||
@ -38,6 +39,7 @@ import stirling.software.common.util.UrlUtils;
|
||||
"stirling.software.common",
|
||||
"stirling.software.proprietary"
|
||||
})
|
||||
@PropertySource("classpath:gemini-agent.properties") // Added annotation
|
||||
public class SPDFApplication {
|
||||
|
||||
private static String serverPortStatic;
|
||||
|
@ -0,0 +1,50 @@
|
||||
package stirling.software.SPDF.controller.api;
|
||||
|
||||
import io.swagger.v3.oas.annotations.Operation;
|
||||
import io.swagger.v3.oas.annotations.tags.Tag;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import org.springframework.beans.factory.annotation.Autowired;
|
||||
import org.springframework.http.MediaType;
|
||||
import org.springframework.http.ResponseEntity;
|
||||
import org.springframework.web.bind.annotation.PostMapping;
|
||||
import org.springframework.web.bind.annotation.RequestMapping;
|
||||
import org.springframework.web.bind.annotation.RequestPart;
|
||||
import org.springframework.web.bind.annotation.RestController;
|
||||
import org.springframework.web.multipart.MultipartFile;
|
||||
import stirling.software.SPDF.model.api.agent.AgentRequest;
|
||||
import stirling.software.SPDF.model.api.agent.AgentResponse;
|
||||
import stirling.software.SPDF.service.agent.GeminiAgentService;
|
||||
|
||||
|
||||
import java.util.List;
|
||||
|
||||
|
||||
@RestController
|
||||
@RequestMapping("/api/v1/agent")
|
||||
@Tag(name = "Agent", description = "Gemini Agent APIs")
|
||||
@RequiredArgsConstructor
|
||||
public class GeminiAgentController {
|
||||
|
||||
@Autowired
|
||||
private GeminiAgentService geminiAgentService;
|
||||
|
||||
@PostMapping(value = "/execute", consumes = { MediaType.MULTIPART_FORM_DATA_VALUE })
|
||||
@Operation(
|
||||
summary = "Process a user request through the Gemini agent",
|
||||
description = "This endpoint takes a user prompt and optional files, processes them using the Gemini agent, and returns the result.")
|
||||
public ResponseEntity<AgentResponse> executeTask(
|
||||
@RequestPart(name = "request", required = true) AgentRequest agentRequestDetails,
|
||||
@RequestPart(name = "files", required = false) List<MultipartFile> files) {
|
||||
|
||||
// The AgentRequest DTO might need adjustment if MultipartFile is directly included.
|
||||
// For now, we assume file references or that files are handled separately by the service.
|
||||
// This example assumes files are passed to the service.
|
||||
|
||||
AgentResponse response = geminiAgentService.processRequest(
|
||||
agentRequestDetails.getUserPrompt(),
|
||||
files,
|
||||
agentRequestDetails.getAdditionalParams());
|
||||
|
||||
return ResponseEntity.ok(response);
|
||||
}
|
||||
}
|
@ -0,0 +1,18 @@
|
||||
package stirling.software.SPDF.model.api.agent;
|
||||
|
||||
import lombok.Data;
|
||||
import lombok.NoArgsConstructor;
|
||||
import org.springframework.web.multipart.MultipartFile;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
@Data
|
||||
@NoArgsConstructor
|
||||
public class AgentRequest {
|
||||
private String userPrompt;
|
||||
// Files will be handled as a separate @RequestPart in the controller
|
||||
// and passed as a List<MultipartFile> to the service.
|
||||
// This DTO therefore doesn't need to carry file information directly.
|
||||
private Map<String, Object> additionalParams;
|
||||
}
|
@ -0,0 +1,12 @@
|
||||
package stirling.software.SPDF.model.api.agent;
|
||||
|
||||
import lombok.Data;
|
||||
import lombok.NoArgsConstructor;
|
||||
|
||||
@Data
|
||||
@NoArgsConstructor
|
||||
public class AgentResponse {
|
||||
private String message;
|
||||
private Object data;
|
||||
private boolean success;
|
||||
}
|
@ -0,0 +1,192 @@
|
||||
package stirling.software.SPDF.service.agent;
|
||||
|
||||
import org.springframework.beans.factory.annotation.Value;
|
||||
import org.springframework.stereotype.Service;
|
||||
import org.springframework.web.multipart.MultipartFile;
|
||||
import stirling.software.SPDF.model.api.agent.AgentRequest;
|
||||
import stirling.software.SPDF.model.api.agent.AgentResponse;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
// TODO: Add necessary imports for a Gemini client library
|
||||
|
||||
@Service
|
||||
public class GeminiAgentService {
|
||||
|
||||
@Value("${gemini.api.key}")
|
||||
private String geminiApiKey;
|
||||
|
||||
// TODO: Inject a Gemini client if using a library
|
||||
|
||||
public GeminiAgentService() {
|
||||
// Constructor
|
||||
// Initialize Gemini client here if needed
|
||||
}
|
||||
|
||||
public AgentResponse processRequest(String userPrompt, List<MultipartFile> files, Map<String, Object> additionalParams) {
|
||||
AgentResponse agentResponse = new AgentResponse();
|
||||
|
||||
// 1. Validate inputs (userPrompt, files, etc.)
|
||||
if (userPrompt == null || userPrompt.trim().isEmpty()) {
|
||||
agentResponse.setSuccess(false);
|
||||
agentResponse.setMessage("User prompt cannot be empty.");
|
||||
return agentResponse;
|
||||
}
|
||||
|
||||
// 2. Prepare request for Gemini API
|
||||
// - This will involve constructing the prompt, potentially including
|
||||
// information about available Stirling-PDF tools/APIs.
|
||||
// - If files are provided, decide how to represent them to Gemini
|
||||
// (e.g., extract text, OCR, pass file references if supported).
|
||||
String geminiPrompt = buildPromptForGemini(userPrompt, files, additionalParams);
|
||||
|
||||
// 3. Call Gemini API
|
||||
try {
|
||||
// Placeholder for actual Gemini API call
|
||||
// GeminiResponse geminiApiResponse = geminiClient.generateContent(geminiPrompt);
|
||||
String geminiOutput = "Placeholder Gemini response: Would perform action X on PDF."; // Replace with actual API call
|
||||
|
||||
// 4. Parse Gemini's response
|
||||
// - Determine the action(s) to take based on Gemini's output
|
||||
// (e.g., merge PDFs, add watermark, extract text).
|
||||
// - Extract any parameters needed for the action.
|
||||
String actionToPerform = parseActionFromGeminiResponse(geminiOutput);
|
||||
Map<String, Object> actionParams = parseParamsFromGeminiResponse(geminiOutput);
|
||||
|
||||
// 5. Orchestrate Stirling-PDF operations
|
||||
// - This is where you'd call other services or controllers in Stirling-PDF.
|
||||
// - For now, this is a placeholder.
|
||||
Object resultData = executeStirlingPdfOperation(actionToPerform, actionParams, files);
|
||||
|
||||
agentResponse.setSuccess(true);
|
||||
agentResponse.setMessage("Gemini agent processed the request successfully.");
|
||||
agentResponse.setData(resultData);
|
||||
|
||||
} catch (Exception e) {
|
||||
// Log the exception
|
||||
agentResponse.setSuccess(false);
|
||||
agentResponse.setMessage("Error processing request with Gemini agent: " + e.getMessage());
|
||||
agentResponse.setData(null);
|
||||
}
|
||||
|
||||
return agentResponse;
|
||||
}
|
||||
|
||||
private String buildPromptForGemini(String userPrompt, List<MultipartFile> files, Map<String, Object> additionalParams) {
|
||||
// This prompt should instruct Gemini on how to interpret the user's request
|
||||
// and what kind of output is expected (e.g., identify an action and parameters).
|
||||
// It should also include a summary of available Stirling-PDF tools.
|
||||
StringBuilder prompt = new StringBuilder();
|
||||
prompt.append("You are an AI assistant for Stirling-PDF, a powerful PDF manipulation tool.\n");
|
||||
prompt.append("Your primary goal is to understand the user's request and determine the single most appropriate Stirling-PDF operation to perform and the necessary parameters for that operation.\n\n");
|
||||
|
||||
prompt.append("## Available Stirling-PDF Operations:\n");
|
||||
prompt.append("Here is a list of operations you can request. For each operation, specify the 'operation' name and a 'parameters' JSON object.\n\n");
|
||||
|
||||
// General Operations
|
||||
prompt.append("- operation: \"merge-pdfs\"\n");
|
||||
prompt.append(" description: \"Merges multiple PDF files into one single PDF.\"\n");
|
||||
prompt.append(" parameters: {\"sortType\": \"orderProvided|byFileName|byDateModified|...\", \"generateToc\": \"true|false\", \"removeCertSign\": \"true|false\"}\n\n");
|
||||
|
||||
prompt.append("- operation: \"split-pdf\"\n");
|
||||
prompt.append(" description: \"Splits a PDF into multiple files based on page ranges or extracting all pages.\"\n");
|
||||
prompt.append(" parameters: {\"splitType\": \"ranges|all\", \"ranges\": \"e.g., 1-3,5,7-end\"}\n\n");
|
||||
|
||||
prompt.append("- operation: \"rotate-pdf\"\n");
|
||||
prompt.append(" description: \"Rotates pages in a PDF file.\"\n");
|
||||
prompt.append(" parameters: {\"angle\": \"90|180|270\", \"pageFilter\": \"all|even|odd|custom\", \"pageNumbers\": \"e.g., 1,3-5\"}\n\n");
|
||||
|
||||
// Security Operations
|
||||
prompt.append("- operation: \"add-watermark\"\n");
|
||||
prompt.append(" description: \"Adds a text or image watermark to a PDF.\"\n");
|
||||
prompt.append(" parameters: {\"watermarkType\": \"text|image\", \"watermarkText\": \"text_for_watermark (if type is text)\", \"watermarkImage\": \"reference_to_image_file (if type is image)\", \"fontSize\": float, \"opacity\": float (0.0-1.0), \"rotation\": float, ...}\n\n");
|
||||
|
||||
prompt.append("- operation: \"add-password\"\n");
|
||||
prompt.append(" description: \"Adds a password to protect a PDF.\"\n");
|
||||
prompt.append(" parameters: {\"ownerPassword\": \"password_string\", \"userPassword\": \"password_string\"}\n\n");
|
||||
|
||||
// Misc Operations
|
||||
prompt.append("- operation: \"ocr-pdf\"\n");
|
||||
prompt.append(" description: \"Performs OCR (Optical Character Recognition) on a PDF to make its text selectable/searchable.\"\n");
|
||||
prompt.append(" parameters: {\"languages\": [\"eng\", \"spa\", ...], \"ocrType\": \"skip-text|force-ocr\", \"deskew\": \"true|false\"}\n\n");
|
||||
|
||||
prompt.append("- operation: \"compress-pdf\"\n");
|
||||
prompt.append(" description: \"Reduces the file size of a PDF.\"\n");
|
||||
prompt.append(" parameters: {\"compressionLevel\": \"low|medium|high|custom_0-100\"}\n\n");
|
||||
|
||||
// Conversion Operations
|
||||
prompt.append("- operation: \"convert-to-pdfa\"\n");
|
||||
prompt.append(" description: \"Converts a PDF to PDF/A format for long-term archiving.\"\n");
|
||||
prompt.append(" parameters: {\"pdfStandard\": \"PDF/A-1B|PDF/A-2B|PDF/A-3B\"}\n\n");
|
||||
|
||||
prompt.append("- operation: \"pdf-to-word\"\n");
|
||||
prompt.append(" description: \"Converts a PDF file to a Word document (docx).\"\n");
|
||||
prompt.append(" parameters: {}\n\n"); // Assuming simple conversion, might need more params
|
||||
|
||||
prompt.append("- operation: \"image-to-pdf\"\n");
|
||||
prompt.append(" description: \"Converts one or more image files to a PDF document.\"\n");
|
||||
prompt.append(" parameters: {\"pageSize\": \"A4|LETTER|AUTO\", \"orientation\": \"portrait|landscape\"}\n\n");
|
||||
|
||||
|
||||
prompt.append("## User Request Context:\n");
|
||||
prompt.append("User's request: \"").append(userPrompt).append("\"\n");
|
||||
|
||||
if (files != null && !files.isEmpty()) {
|
||||
prompt.append("The user has provided the following file(s) for the operation (you will receive them separately):\n");
|
||||
for (int i = 0; i < files.size(); i++) {
|
||||
prompt.append("- File ").append(i + 1).append(": ").append(files.get(i).getOriginalFilename()).append("\n");
|
||||
}
|
||||
if (files.size() == 1) {
|
||||
prompt.append("Assume this single file is the primary input unless the user specifies otherwise.\n");
|
||||
} else {
|
||||
prompt.append("Determine from the user's prompt how these files should be used (e.g., all for merge, first as input second as watermark image).\n");
|
||||
}
|
||||
}
|
||||
if (additionalParams != null && !additionalParams.isEmpty()) {
|
||||
prompt.append("Additional parameters provided: ").append(additionalParams.toString()).append("\n");
|
||||
}
|
||||
|
||||
prompt.append("\n## Your Response Format:\n");
|
||||
prompt.append("Based on the user's request and the available operations, please identify the single most relevant operation and its parameters.\n");
|
||||
prompt.append("Respond with a JSON object containing two keys: 'operation' (a string matching one of the available operation names) and 'parameters' (a JSON object of the parameters for that operation).\n");
|
||||
prompt.append("If the user's request is ambiguous or requires an operation not listed, respond with {\"operation\": \"clarification_needed\", \"parameters\": {\"message\": \"Your clarification message here\"}}.\n");
|
||||
prompt.append("If multiple operations seem applicable, choose the one that seems most central to the user's request or ask for clarification.\n");
|
||||
prompt.append("Example response: {\"operation\": \"add-watermark\", \"parameters\": {\"watermarkType\": \"text\", \"watermarkText\": \"CONFIDENTIAL DRAFT\", \"opacity\": 0.3, \"fontSize\": 50.0}}\n");
|
||||
|
||||
return prompt.toString();
|
||||
}
|
||||
|
||||
private String parseActionFromGeminiResponse(String geminiOutput) {
|
||||
// TODO: Implement logic to parse the action from Gemini's response.
|
||||
// This might involve JSON parsing if Gemini returns structured data,
|
||||
// or regex/string matching for less structured output.
|
||||
// For placeholder:
|
||||
if (geminiOutput.contains("perform action X")) {
|
||||
return "actionX";
|
||||
}
|
||||
return "unknownAction";
|
||||
}
|
||||
|
||||
private Map<String, Object> parseParamsFromGeminiResponse(String geminiOutput) {
|
||||
// TODO: Implement logic to parse parameters from Gemini's response.
|
||||
return Map.of(); // Placeholder
|
||||
}
|
||||
|
||||
private Object executeStirlingPdfOperation(String action, Map<String, Object> params, List<MultipartFile> files) {
|
||||
// TODO: Implement the orchestration logic.
|
||||
// This will involve a switch or if-else structure to call the appropriate
|
||||
// Stirling-PDF service methods or make internal HTTP requests.
|
||||
// Example:
|
||||
// if ("merge".equals(action)) {
|
||||
// // Call MergeService or make HTTP request to /api/v1/general/merge-pdfs
|
||||
// } else if ("watermark".equals(action)) {
|
||||
// // Call WatermarkService or make HTTP request to /api/v1/security/add-watermark
|
||||
// }
|
||||
return "Placeholder: Executed " + action + " with params " + params.toString() + " on " + (files != null ? files.size() : 0) + " files.";
|
||||
}
|
||||
|
||||
// Helper method to get API key (useful for client initialization if not done in constructor)
|
||||
public String getGeminiApiKey() {
|
||||
return geminiApiKey;
|
||||
}
|
||||
}
|
4
stirling-pdf/src/main/resources/gemini-agent.properties
Normal file
4
stirling-pdf/src/main/resources/gemini-agent.properties
Normal file
@ -0,0 +1,4 @@
|
||||
# Gemini AI Agent Configuration
|
||||
# Ensure this API key is set via an environment variable or a secure configuration method in production.
|
||||
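# Example (shell): export GEMINI_API_KEY="your_actual_api_key"
# If GEMINI_API_KEY is not set, the placeholder default below is used, which is not a working key.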
|
||||
gemini.api.key=${GEMINI_API_KEY:YOUR_GEMINI_API_KEY_HERE}
|
Loading…
Reference in New Issue
Block a user