diff --git a/Dockerfile b/Dockerfile index ccb8408a9..46cae3478 100644 --- a/Dockerfile +++ b/Dockerfile @@ -66,6 +66,10 @@ RUN echo "@main https://dl-cdn.alpinelinux.org/alpine/edge/main" | tee -a /etc/a poppler-utils \ # OCR MY PDF (unpaper for deskew and other advanced features) tesseract-ocr-data-eng \ + tesseract-ocr-data-chi_sim \ + tesseract-ocr-data-deu \ + tesseract-ocr-data-fra \ + tesseract-ocr-data-por \ # CV py3-opencv \ python3 \ diff --git a/Dockerfile.fat b/Dockerfile.fat index 8855be6c0..8a4d55d80 100644 --- a/Dockerfile.fat +++ b/Dockerfile.fat @@ -75,7 +75,10 @@ RUN echo "@main https://dl-cdn.alpinelinux.org/alpine/edge/main" | tee -a /etc/a # OCR MY PDF (unpaper for deskew and other advanced features) qpdf \ tesseract-ocr-data-eng \ - + tesseract-ocr-data-chi_sim \ + tesseract-ocr-data-deu \ + tesseract-ocr-data-fra \ + tesseract-ocr-data-por \ font-terminus font-dejavu font-noto font-noto-cjk font-awesome font-noto-extra font-liberation font-linux-libertine \ # CV py3-opencv \ diff --git a/build.gradle b/build.gradle index 775fb9823..3f91a9706 100644 --- a/build.gradle +++ b/build.gradle @@ -25,7 +25,7 @@ ext { } group = "stirling.software" -version = "0.44.2" +version = "0.44.3" java { // 17 is lowest but we support and recommend 21 diff --git a/src/main/java/stirling/software/SPDF/config/EndpointConfiguration.java b/src/main/java/stirling/software/SPDF/config/EndpointConfiguration.java index 12bf2c291..5b4eb2382 100644 --- a/src/main/java/stirling/software/SPDF/config/EndpointConfiguration.java +++ b/src/main/java/stirling/software/SPDF/config/EndpointConfiguration.java @@ -48,6 +48,22 @@ public class EndpointConfiguration { return endpointStatuses.getOrDefault(endpoint, true); } + public boolean isGroupEnabled(String group) { + Set<String> endpoints = endpointGroups.get(group); + if (endpoints == null || endpoints.isEmpty()) { + log.debug("Group '{}' does not exist or has no endpoints", group); + return false; + } + + for (String endpoint : endpoints) { + if (!isEndpointEnabled(endpoint)) { + return false; + } + } + + return true; + } + public void addEndpointToGroup(String group, String endpoint) { endpointGroups.computeIfAbsent(group, k -> new HashSet<>()).add(endpoint); } @@ -176,21 +192,17 @@ public class EndpointConfiguration { addEndpointToGroup("OpenCV", "extract-image-scans"); // LibreOffice - addEndpointToGroup("qpdf", "repair"); addEndpointToGroup("LibreOffice", "file-to-pdf"); addEndpointToGroup("LibreOffice", "pdf-to-word"); addEndpointToGroup("LibreOffice", "pdf-to-presentation"); addEndpointToGroup("LibreOffice", "pdf-to-rtf"); addEndpointToGroup("LibreOffice", "pdf-to-html"); addEndpointToGroup("LibreOffice", "pdf-to-xml"); + addEndpointToGroup("LibreOffice", "pdf-to-pdfa"); // Unoconvert addEndpointToGroup("Unoconvert", "file-to-pdf"); - // qpdf - addEndpointToGroup("qpdf", "compress-pdf"); - addEndpointToGroup("qpdf", "pdf-to-pdfa"); - addEndpointToGroup("tesseract", "ocr-pdf"); // Java @@ -240,8 +252,6 @@ public class EndpointConfiguration { addEndpointToGroup("Javascript", "adjust-contrast"); // qpdf dependent endpoints - addEndpointToGroup("qpdf", "compress-pdf"); - addEndpointToGroup("qpdf", "pdf-to-pdfa"); addEndpointToGroup("qpdf", "repair"); // Weasyprint dependent endpoints
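Reviewer note: the new isGroupEnabled check is all-or-nothing; a group only counts as enabled when every endpoint registered in it is enabled, and a missing or empty group reports false. CompressController below uses it to decide whether qpdf post-processing is available. A minimal standalone sketch of those semantics (plain maps instead of the real Spring-managed class; group and endpoint names are illustrative):

```java
import java.util.*;

public class GroupCheckSketch {
    static Map<String, Set<String>> endpointGroups = new HashMap<>();
    static Map<String, Boolean> endpointStatuses = new HashMap<>();

    // Mirrors the added isGroupEnabled: missing/empty group -> false,
    // any disabled member -> false, otherwise true.
    static boolean isGroupEnabled(String group) {
        Set<String> endpoints = endpointGroups.get(group);
        if (endpoints == null || endpoints.isEmpty()) return false;
        for (String endpoint : endpoints) {
            if (!endpointStatuses.getOrDefault(endpoint, true)) return false;
        }
        return true;
    }

    public static void main(String[] args) {
        endpointGroups.put("qpdf", new HashSet<>(List.of("compress-pdf", "repair")));
        System.out.println(isGroupEnabled("qpdf")); // true: endpoints default to enabled
        endpointStatuses.put("repair", false);      // disabling one member...
        System.out.println(isGroupEnabled("qpdf")); // ...turns the whole group off
    }
}
```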
diff --git a/src/main/java/stirling/software/SPDF/config/EndpointInspector.java b/src/main/java/stirling/software/SPDF/config/EndpointInspector.java new file mode 100644 index 000000000..474606783 --- /dev/null +++ b/src/main/java/stirling/software/SPDF/config/EndpointInspector.java @@ -0,0 +1,216 @@ +package stirling.software.SPDF.config; + +import java.lang.reflect.Method; +import java.util.HashSet; +import java.util.Map; +import java.util.Set; +import java.util.TreeSet; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.context.ApplicationContext; +import org.springframework.context.ApplicationListener; +import org.springframework.context.event.ContextRefreshedEvent; +import org.springframework.stereotype.Component; +import org.springframework.web.bind.annotation.RequestMethod; +import org.springframework.web.method.HandlerMethod; +import org.springframework.web.servlet.mvc.method.RequestMappingInfo; +import org.springframework.web.servlet.mvc.method.annotation.RequestMappingHandlerMapping; + +@Component +public class EndpointInspector implements ApplicationListener<ContextRefreshedEvent> { + private static final Logger logger = LoggerFactory.getLogger(EndpointInspector.class); + + private final ApplicationContext applicationContext; + private final Set<String> validGetEndpoints = new HashSet<>(); + private boolean endpointsDiscovered = false; + + @Autowired + public EndpointInspector(ApplicationContext applicationContext) { + this.applicationContext = applicationContext; + } + + @Override + public void onApplicationEvent(ContextRefreshedEvent event) { + if (!endpointsDiscovered) { + discoverEndpoints(); + endpointsDiscovered = true; + logger.info("Discovered {} valid GET endpoints", validGetEndpoints.size()); + } + } + + private void discoverEndpoints() { + try { + Map<String, RequestMappingHandlerMapping> mappings = + applicationContext.getBeansOfType(RequestMappingHandlerMapping.class); + + for (Map.Entry<String, RequestMappingHandlerMapping> entry : mappings.entrySet()) { + RequestMappingHandlerMapping mapping = entry.getValue(); + Map<RequestMappingInfo, HandlerMethod> handlerMethods = mapping.getHandlerMethods(); + + for (Map.Entry<RequestMappingInfo, HandlerMethod> handlerEntry : + handlerMethods.entrySet()) { + RequestMappingInfo mappingInfo = handlerEntry.getKey(); + HandlerMethod handlerMethod = handlerEntry.getValue(); + + boolean isGetHandler = false; + try { + Set<RequestMethod> methods = mappingInfo.getMethodsCondition().getMethods(); + isGetHandler = methods.isEmpty() || methods.contains(RequestMethod.GET); + } catch (Exception e) { + isGetHandler = true; + } + + if (isGetHandler) { + Set<String> patterns = extractPatternsUsingDirectPaths(mappingInfo); + + if (patterns.isEmpty()) { + patterns = extractPatternsFromString(mappingInfo); + } + + validGetEndpoints.addAll(patterns); + } + } + } + + if (validGetEndpoints.isEmpty()) { + logger.warn("No endpoints discovered. Adding common endpoints as fallback."); + validGetEndpoints.add("/"); + validGetEndpoints.add("/api/**"); + validGetEndpoints.add("/**"); + } + } catch (Exception e) { + logger.error("Error discovering endpoints", e); + } + } + + private Set<String> extractPatternsUsingDirectPaths(RequestMappingInfo mappingInfo) { + Set<String> patterns = new HashSet<>(); + + try { + Method getDirectPathsMethod = mappingInfo.getClass().getMethod("getDirectPaths"); + Object result = getDirectPathsMethod.invoke(mappingInfo); + if (result instanceof Set) { + @SuppressWarnings("unchecked") + Set<String> resultSet = (Set<String>) result; + patterns.addAll(resultSet); + } + } catch (Exception e) { + // Return empty set if method not found or fails + } + + return patterns; + } + + private Set<String> extractPatternsFromString(RequestMappingInfo mappingInfo) { + Set<String> patterns = new HashSet<>(); + try { + String infoString = mappingInfo.toString(); + if (infoString.contains("{")) { + String patternsSection = + infoString.substring(infoString.indexOf("{") + 1, infoString.indexOf("}")); + + for (String pattern : patternsSection.split(",")) { + pattern = pattern.trim(); + if (!pattern.isEmpty()) { + patterns.add(pattern); + } + } + } + } catch (Exception e) { + // Return empty set if parsing fails + } + return patterns; + } + + public boolean isValidGetEndpoint(String uri) { + if (!endpointsDiscovered) { + discoverEndpoints(); + endpointsDiscovered = true; + } + + if (validGetEndpoints.contains(uri)) { + return true; + } + + if (matchesWildcardOrPathVariable(uri)) { + return true; + } + + if (matchesPathSegments(uri)) { + return true; + } + + return false; + } + + private boolean matchesWildcardOrPathVariable(String uri) { + for (String pattern : validGetEndpoints) { + if (pattern.contains("*") || pattern.contains("{")) { + int wildcardIndex = pattern.indexOf('*'); + int variableIndex = pattern.indexOf('{'); + + int cutoffIndex; + if (wildcardIndex < 0) { + cutoffIndex = variableIndex; + } else if (variableIndex < 0) { + cutoffIndex = wildcardIndex; + } else { + cutoffIndex = Math.min(wildcardIndex, variableIndex); + } + + String staticPrefix = pattern.substring(0, cutoffIndex); + + if (uri.startsWith(staticPrefix)) { + return true; + } + } + } + return false; + } + + private boolean matchesPathSegments(String uri) { + for (String pattern : validGetEndpoints) { + if (!pattern.contains("*") && !pattern.contains("{")) { + String[] patternSegments = pattern.split("/"); + String[] uriSegments = uri.split("/"); + + if (uriSegments.length < patternSegments.length) { + continue; + } + + boolean match = true; + for (int i = 0; i < patternSegments.length; i++) { + if (!patternSegments[i].equals(uriSegments[i])) { + match = false; + break; + } + } + + if (match) { + return true; + } + } + } + return false; + } + + public Set<String> getValidGetEndpoints() { + if (!endpointsDiscovered) { + discoverEndpoints(); + endpointsDiscovered = true; + } + return new HashSet<>(validGetEndpoints); + } + + private void logAllEndpoints() { + Set<String> sortedEndpoints = new TreeSet<>(validGetEndpoints); + + logger.info("=== BEGIN: All discovered GET endpoints ==="); + for (String endpoint : sortedEndpoints) { + logger.info("Endpoint: {}", endpoint); + } + logger.info("=== END: All discovered GET endpoints ==="); + } +}
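Reviewer note: isValidGetEndpoint accepts a URI via three progressively looser checks: an exact match, a static-prefix match for patterns containing wildcards or path variables, and a leading-segment match for literal patterns. A standalone sketch of the first two (pattern strings are hypothetical; the segment check is omitted for brevity):

```java
import java.util.*;

public class EndpointMatchSketch {
    static Set<String> patterns =
            new HashSet<>(List.of("/api/users", "/api/files/{id}", "/static/**"));

    static boolean matches(String uri) {
        if (patterns.contains(uri)) return true; // exact match
        for (String p : patterns) {
            if (p.contains("*") || p.contains("{")) {
                // Compare only the static prefix before the first '*' or '{'.
                int star = p.indexOf('*') < 0 ? Integer.MAX_VALUE : p.indexOf('*');
                int brace = p.indexOf('{') < 0 ? Integer.MAX_VALUE : p.indexOf('{');
                if (uri.startsWith(p.substring(0, Math.min(star, brace)))) return true;
            }
        }
        return false;
    }

    public static void main(String[] args) {
        System.out.println(matches("/api/users"));    // true: exact
        System.out.println(matches("/api/files/42")); // true: prefix of "/api/files/{id}"
        System.out.println(matches("/admin"));        // false
    }
}
```

The prefix comparison is deliberately permissive (anything under "/static/" passes), which fits how MetricsAggregatorService uses it: better to keep an extra metric than to drop a real endpoint.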
diff --git a/src/main/java/stirling/software/SPDF/controller/api/misc/CompressController.java b/src/main/java/stirling/software/SPDF/controller/api/misc/CompressController.java index 7d1985cec..ade1b306c 100644 --- a/src/main/java/stirling/software/SPDF/controller/api/misc/CompressController.java +++ b/src/main/java/stirling/software/SPDF/controller/api/misc/CompressController.java @@ -7,6 +7,7 @@ import java.io.IOException; import java.io.InputStream; import java.nio.file.Files; import java.nio.file.Path; +import java.nio.file.StandardCopyOption; import java.security.MessageDigest; import java.security.NoSuchAlgorithmException; import java.util.ArrayList; @@ -29,8 +30,8 @@ import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.pdmodel.PDPage; import org.apache.pdfbox.pdmodel.PDResources; import org.apache.pdfbox.pdmodel.graphics.PDXObject; +import org.apache.pdfbox.pdmodel.graphics.form.PDFormXObject; import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject; -import org.springframework.beans.factory.annotation.Autowired; import org.springframework.http.ResponseEntity; import org.springframework.web.bind.annotation.ModelAttribute; import org.springframework.web.bind.annotation.PostMapping; @@ -44,13 +45,14 @@ import io.swagger.v3.oas.annotations.tags.Tag; import lombok.AllArgsConstructor; import lombok.Data; +import lombok.EqualsAndHashCode; import lombok.NoArgsConstructor; import lombok.extern.slf4j.Slf4j; +import stirling.software.SPDF.config.EndpointConfiguration; import stirling.software.SPDF.model.api.misc.OptimizePdfRequest; import stirling.software.SPDF.service.CustomPDFDocumentFactory; import stirling.software.SPDF.utils.GeneralUtils; -import stirling.software.SPDF.utils.ImageProcessingUtils; import stirling.software.SPDF.utils.ProcessExecutor; import stirling.software.SPDF.utils.ProcessExecutor.ProcessExecutorResult; import stirling.software.SPDF.utils.WebResponseUtils; @@ -62,10 +64,13 @@ import stirling.software.SPDF.utils.WebResponseUtils; public class CompressController { private final CustomPDFDocumentFactory pdfDocumentFactory; + private final boolean qpdfEnabled; - @Autowired - public CompressController(CustomPDFDocumentFactory pdfDocumentFactory) { + public CompressController( + CustomPDFDocumentFactory pdfDocumentFactory, + EndpointConfiguration endpointConfiguration) { this.pdfDocumentFactory = pdfDocumentFactory; + this.qpdfEnabled = endpointConfiguration.isGroupEnabled("qpdf"); } @Data @@ -76,10 +81,30 @@ public class CompressController { COSName name; // The name used to reference this image } + @Data + @EqualsAndHashCode(callSuper = true) + @AllArgsConstructor + @NoArgsConstructor + private static class NestedImageReference extends ImageReference { + COSName formName; // Name of the form XObject containing the image + COSName imageName; // Name of the image within the form + } + + // Tracks compression stats for reporting + private static class CompressionStats { + int totalImages = 0; + int nestedImages = 0; + int uniqueImagesCount = 0; + int compressedImages = 0; + int skippedImages = 0; + long totalOriginalBytes = 0; + long totalCompressedBytes = 0; + } + public Path compressImagesInPDF( Path pdfFile, double scaleFactor, float jpegQuality, boolean convertToGrayscale) throws Exception { - Path newCompressedPDF = Files.createTempFile("compressedPDF", ".pdf"); + Path newCompressedPDF = Files.createTempFile("compressedPDF", ".pdf"); long originalFileSize = Files.size(pdfFile); log.info( "Starting image compression with scale factor: {}, JPEG quality: {}, grayscale: {} on file size: {}", @@ -89,146 +114,29 @@ public class CompressController { scaleFactor, jpegQuality, convertToGrayscale, GeneralUtils.formatBytes(originalFileSize)); try (PDDocument doc = pdfDocumentFactory.load(pdfFile)) { + // Find all
unique images in the document + Map<String, List<ImageReference>> uniqueImages = findImages(doc); - - // Collect all unique images by content hash - Map<String, List<ImageReference>> uniqueImages = new HashMap<>(); - Map<String, PDImageXObject> compressedVersions = new HashMap<>(); + + // Get statistics + CompressionStats stats = new CompressionStats(); + stats.uniqueImagesCount = uniqueImages.size(); + calculateImageStats(uniqueImages, stats); - int totalImages = 0; + // Create compressed versions of unique images + Map<String, PDImageXObject> compressedVersions = + createCompressedImages( + doc, uniqueImages, scaleFactor, jpegQuality, convertToGrayscale, stats); - for (int pageNum = 0; pageNum < doc.getNumberOfPages(); pageNum++) { - PDPage page = doc.getPage(pageNum); - PDResources res = page.getResources(); - if (res == null || res.getXObjectNames() == null) continue; - - for (COSName name : res.getXObjectNames()) { - PDXObject xobj = res.getXObject(name); - if (!(xobj instanceof PDImageXObject)) continue; - - totalImages++; - PDImageXObject image = (PDImageXObject) xobj; - String imageHash = generateImageHash(image); - - // Store only page number and name reference - ImageReference ref = new ImageReference(); - ref.pageNum = pageNum; - ref.name = name; - - uniqueImages.computeIfAbsent(imageHash, k -> new ArrayList<>()).add(ref); - } - } - - int uniqueImagesCount = uniqueImages.size(); - int duplicatedImages = totalImages - uniqueImagesCount; - log.info( - "Found {} unique images and {} duplicated instances across {} pages", - uniqueImagesCount, - duplicatedImages, - doc.getNumberOfPages()); - - // SECOND PASS: Process each unique image exactly once - int compressedImages = 0; - int skippedImages = 0; - long totalOriginalBytes = 0; - long totalCompressedBytes = 0; - - for (Entry<String, List<ImageReference>> entry : uniqueImages.entrySet()) { - String imageHash = entry.getKey(); - List<ImageReference> references = entry.getValue(); - - if (references.isEmpty()) continue; - - // Get the first instance of this image - ImageReference firstRef = references.get(0); - PDPage firstPage = doc.getPage(firstRef.pageNum); - PDResources firstPageResources = firstPage.getResources(); - PDImageXObject originalImage = - (PDImageXObject) firstPageResources.getXObject(firstRef.name); - - // Track original size - int originalSize = (int) originalImage.getCOSObject().getLength(); - totalOriginalBytes += originalSize; - - // Process this unique image once - BufferedImage processedImage = - processAndCompressImage( - originalImage, scaleFactor, jpegQuality, convertToGrayscale); - - if (processedImage != null) { - // Convert to bytes for storage - byte[] compressedData = convertToBytes(processedImage, jpegQuality); - - // Check if compression is beneficial - if (compressedData.length < originalSize || convertToGrayscale) { - // Create a single compressed version - PDImageXObject compressedImage = - PDImageXObject.createFromByteArray( - doc, - compressedData, - originalImage.getCOSObject().toString()); - - // Store the compressed version only once in our map - compressedVersions.put(imageHash, compressedImage); - - // Report compression stats - double reductionPercentage = - 100.0 - ((compressedData.length * 100.0) / originalSize); - log.info( - "Image hash {}: Compressed from {} to {} (reduced by {}%)", - imageHash, - GeneralUtils.formatBytes(originalSize), - GeneralUtils.formatBytes(compressedData.length), - String.format("%.1f", reductionPercentage)); - - // Replace ALL instances with the compressed version - for (ImageReference ref : references) { - // Get the page and resources when needed - PDPage page = doc.getPage(ref.pageNum); - PDResources resources
= page.getResources(); - resources.put(ref.name, compressedImage); - - log.info( - "Replaced image on page {} with compressed version", - ref.pageNum + 1); - } - - totalCompressedBytes += compressedData.length * references.size(); - compressedImages++; - } else { - log.info("Image hash {}: Compression not beneficial, skipping", imageHash); - totalCompressedBytes += originalSize * references.size(); - skippedImages++; - } - } else { - log.info("Image hash {}: Not suitable for compression, skipping", imageHash); - totalCompressedBytes += originalSize * references.size(); - skippedImages++; - } - } + // Replace all instances with compressed versions + replaceImages(doc, uniqueImages, compressedVersions, stats); // Log compression statistics - double overallImageReduction = - totalOriginalBytes > 0 - ? 100.0 - ((totalCompressedBytes * 100.0) / totalOriginalBytes) - : 0; - - log.info( - "Image compression summary - Total unique: {}, Compressed: {}, Skipped: {}, Duplicates: {}", - uniqueImagesCount, - compressedImages, - skippedImages, - duplicatedImages); - log.info( - "Total original image size: {}, compressed: {} (reduced by {}%)", - GeneralUtils.formatBytes(totalOriginalBytes), - GeneralUtils.formatBytes(totalCompressedBytes), - String.format("%.1f", overallImageReduction)); + logCompressionStats(stats, originalFileSize); // Free memory before saving compressedVersions.clear(); uniqueImages.clear(); - // Save the document log.info("Saving compressed PDF to {}", newCompressedPDF.toString()); doc.save(newCompressedPDF.toString()); @@ -242,7 +150,315 @@ public class CompressController { String.format("%.1f", overallReduction)); return newCompressedPDF; } - + } + + // Find all images in the document, both direct and nested within forms + private Map<String, List<ImageReference>> findImages(PDDocument doc) throws IOException { + Map<String, List<ImageReference>> uniqueImages = new HashMap<>(); + + // Scan through all pages in the document + for (int pageNum = 0; pageNum < doc.getNumberOfPages(); pageNum++) { + PDPage page = doc.getPage(pageNum); + PDResources res = page.getResources(); + if (res == null || res.getXObjectNames() == null) continue; + + // Process all XObjects on the page + for (COSName name : res.getXObjectNames()) { + PDXObject xobj = res.getXObject(name); + + // Direct image + if (isImage(xobj)) { + addDirectImage(pageNum, name, (PDImageXObject) xobj, uniqueImages); + log.info( + "Found direct image '{}' on page {} - {}x{}", + name.getName(), + pageNum + 1, + ((PDImageXObject) xobj).getWidth(), + ((PDImageXObject) xobj).getHeight()); + } + // Form XObject that may contain nested images + else if (isForm(xobj)) { + checkFormForImages(pageNum, name, (PDFormXObject) xobj, uniqueImages); + } + } + } + + return uniqueImages; + } + + private boolean isImage(PDXObject xobj) { + return xobj instanceof PDImageXObject; + } + + private boolean isForm(PDXObject xobj) { + return xobj instanceof PDFormXObject; + } + + private ImageReference addDirectImage( + int pageNum, + COSName name, + PDImageXObject image, + Map<String, List<ImageReference>> uniqueImages) + throws IOException { + ImageReference ref = new ImageReference(); + ref.pageNum = pageNum; + ref.name = name; + + String imageHash = generateImageHash(image); + uniqueImages.computeIfAbsent(imageHash, k -> new ArrayList<>()).add(ref); + + return ref; + } + + // Look for images inside form XObjects + private void checkFormForImages( + int pageNum, + COSName formName, + PDFormXObject formXObj, + Map<String, List<ImageReference>> uniqueImages) + throws IOException { + PDResources formResources = formXObj.getResources(); + if (formResources == null ||
formResources.getXObjectNames() == null) { + return; + } + + log.info( + "Checking form XObject '{}' on page {} for nested images", + formName.getName(), + pageNum + 1); + + // Process all XObjects within the form + for (COSName nestedName : formResources.getXObjectNames()) { + PDXObject nestedXobj = formResources.getXObject(nestedName); + + if (isImage(nestedXobj)) { + PDImageXObject nestedImage = (PDImageXObject) nestedXobj; + + log.info( + "Found nested image '{}' in form '{}' on page {} - {}x{}", + nestedName.getName(), + formName.getName(), + pageNum + 1, + nestedImage.getWidth(), + nestedImage.getHeight()); + + // Create specialized reference for the nested image + NestedImageReference nestedRef = new NestedImageReference(); + nestedRef.pageNum = pageNum; + nestedRef.formName = formName; + nestedRef.imageName = nestedName; + + String imageHash = generateImageHash(nestedImage); + uniqueImages.computeIfAbsent(imageHash, k -> new ArrayList<>()).add(nestedRef); + } + } + }
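Reviewer note: this one-level walk into form XObjects is the heart of the nested-image support. The same traversal, extracted into a runnable PDFBox 3.x sketch for inspecting a document by hand (the input file name is hypothetical):

```java
import java.io.File;

import org.apache.pdfbox.Loader;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDResources;
import org.apache.pdfbox.pdmodel.graphics.PDXObject;
import org.apache.pdfbox.pdmodel.graphics.form.PDFormXObject;
import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;

public class ListImagesSketch {
    public static void main(String[] args) throws Exception {
        try (PDDocument doc = Loader.loadPDF(new File("sample.pdf"))) {
            int pageNum = 0;
            for (PDPage page : doc.getPages()) {
                pageNum++;
                PDResources res = page.getResources();
                if (res == null) continue;
                for (COSName name : res.getXObjectNames()) {
                    PDXObject xobj = res.getXObject(name);
                    if (xobj instanceof PDImageXObject img) {
                        // Direct image, as found by findImages
                        System.out.printf("page %d: image %s (%dx%d)%n",
                                pageNum, name.getName(), img.getWidth(), img.getHeight());
                    } else if (xobj instanceof PDFormXObject form
                            && form.getResources() != null) {
                        // One level of nesting, as in checkFormForImages
                        for (COSName nested : form.getResources().getXObjectNames()) {
                            if (form.getResources().getXObject(nested)
                                    instanceof PDImageXObject nestedImg) {
                                System.out.printf(
                                        "page %d: nested image %s in form %s (%dx%d)%n",
                                        pageNum, nested.getName(), name.getName(),
                                        nestedImg.getWidth(), nestedImg.getHeight());
                            }
                        }
                    }
                }
            }
        }
    }
}
```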
+ + // Count total images and nested images + private void calculateImageStats( + Map<String, List<ImageReference>> uniqueImages, CompressionStats stats) { + for (List<ImageReference> references : uniqueImages.values()) { + for (ImageReference ref : references) { + stats.totalImages++; + if (ref instanceof NestedImageReference) { + stats.nestedImages++; + } + } + } + } + + // Create compressed versions of all unique images + private Map<String, PDImageXObject> createCompressedImages( + PDDocument doc, + Map<String, List<ImageReference>> uniqueImages, + double scaleFactor, + float jpegQuality, + boolean convertToGrayscale, + CompressionStats stats) + throws IOException { + + Map<String, PDImageXObject> compressedVersions = new HashMap<>(); + + // Process each unique image exactly once + for (Entry<String, List<ImageReference>> entry : uniqueImages.entrySet()) { + String imageHash = entry.getKey(); + List<ImageReference> references = entry.getValue(); + + if (references.isEmpty()) continue; + + // Get the first instance of this image + PDImageXObject originalImage = getOriginalImage(doc, references.get(0)); + + // Track original size + int originalSize = (int) originalImage.getCOSObject().getLength(); + stats.totalOriginalBytes += originalSize; + + // Process this unique image + PDImageXObject compressedImage = + compressImage( + doc, + originalImage, + originalSize, + scaleFactor, + jpegQuality, + convertToGrayscale); + + if (compressedImage != null) { + // Store the compressed version in our map + compressedVersions.put(imageHash, compressedImage); + stats.compressedImages++; + + // Update compression stats + int compressedSize = (int) compressedImage.getCOSObject().getLength(); + stats.totalCompressedBytes += compressedSize * references.size(); + + double reductionPercentage = 100.0 - ((compressedSize * 100.0) / originalSize); + log.info( + "Image hash {}: Compressed from {} to {} (reduced by {}%)", + imageHash, + GeneralUtils.formatBytes(originalSize), + GeneralUtils.formatBytes(compressedSize), + String.format("%.1f", reductionPercentage)); + } else { + log.info("Image hash {}: Not suitable for compression, skipping", imageHash); + stats.totalCompressedBytes += originalSize * references.size(); + stats.skippedImages++; + } + } + + return compressedVersions; + } + + // Get original image from a reference + private PDImageXObject getOriginalImage(PDDocument doc, ImageReference ref) throws IOException { + if (ref instanceof NestedImageReference) { + // Get the nested image from within a form XObject + NestedImageReference nestedRef = (NestedImageReference) ref; + PDPage page = doc.getPage(nestedRef.pageNum); + PDResources pageResources = page.getResources(); + + // Get the form XObject + PDFormXObject formXObj = (PDFormXObject) pageResources.getXObject(nestedRef.formName); + + // Get the nested image from the form's resources + PDResources formResources = formXObj.getResources(); + return (PDImageXObject) formResources.getXObject(nestedRef.imageName); + } else { + // Get direct image from page resources + PDPage page = doc.getPage(ref.pageNum); + PDResources resources = page.getResources(); + return (PDImageXObject) resources.getXObject(ref.name); + } + } + + // Try to compress an image if it makes sense + private PDImageXObject compressImage( + PDDocument doc, + PDImageXObject originalImage, + int originalSize, + double scaleFactor, + float jpegQuality, + boolean convertToGrayscale) + throws IOException { + + // Process and compress the image + BufferedImage processedImage = + processAndCompressImage( + originalImage, scaleFactor, jpegQuality, convertToGrayscale); + + if (processedImage == null) { + return null; + } + + // Convert to bytes for storage + byte[] compressedData = convertToBytes(processedImage, jpegQuality); + + // Check if compression is beneficial + if (compressedData.length < originalSize || convertToGrayscale) { + // Create a compressed version + return PDImageXObject.createFromByteArray( + doc, compressedData, originalImage.getCOSObject().toString()); + } + + return null; + } + + // Replace all instances of original images with their compressed versions + private void replaceImages( + PDDocument doc, + Map<String, List<ImageReference>> uniqueImages, + Map<String, PDImageXObject> compressedVersions, + CompressionStats stats) + throws IOException { + + for (Entry<String, List<ImageReference>> entry : uniqueImages.entrySet()) { + String imageHash = entry.getKey(); + List<ImageReference> references = entry.getValue(); + + // Skip if no compressed version exists + PDImageXObject compressedImage = compressedVersions.get(imageHash); + if (compressedImage == null) continue; + + // Replace ALL instances with the compressed version + for (ImageReference ref : references) { + replaceImageReference(doc, ref, compressedImage); + } + } + } + + // Replace a specific image reference with a compressed version + private void replaceImageReference( + PDDocument doc, ImageReference ref, PDImageXObject compressedImage) throws IOException { + if (ref instanceof NestedImageReference) { + // Replace nested image within form XObject + NestedImageReference nestedRef = (NestedImageReference) ref; + PDPage page = doc.getPage(nestedRef.pageNum); + PDResources pageResources = page.getResources(); + + // Get the form XObject + PDFormXObject formXObj = (PDFormXObject) pageResources.getXObject(nestedRef.formName); + + // Replace the nested image in the form's resources + PDResources formResources = formXObj.getResources(); + formResources.put(nestedRef.imageName, compressedImage); + + log.info( + "Replaced nested image '{}' in form '{}' on page {} with compressed version", + nestedRef.imageName.getName(), + nestedRef.formName.getName(), + nestedRef.pageNum + 1); + } else { + // Replace direct image in page resources + PDPage page = doc.getPage(ref.pageNum); + PDResources resources = page.getResources(); + resources.put(ref.name, compressedImage); + + log.info("Replaced direct image on page {} with compressed version", ref.pageNum + 1); + } + } + + // Log final stats about the compression + private void logCompressionStats(CompressionStats stats, long originalFileSize) { + // Calculate image reduction percentage + double overallImageReduction = + stats.totalOriginalBytes > 0 + ?
100.0 - ((stats.totalCompressedBytes * 100.0) / stats.totalOriginalBytes) + : 0; + + int duplicatedImages = stats.totalImages - stats.uniqueImagesCount; + + log.info( + "Image compression summary - Total unique: {}, Compressed: {}, Skipped: {}, Duplicates: {}, Nested: {}", + stats.uniqueImagesCount, + stats.compressedImages, + stats.skippedImages, + duplicatedImages, + stats.nestedImages); + log.info( + "Total original image size: {}, compressed: {} (reduced by {}%)", + GeneralUtils.formatBytes(stats.totalOriginalBytes), + GeneralUtils.formatBytes(stats.totalCompressedBytes), + String.format("%.1f", overallImageReduction)); } private BufferedImage convertToGrayscale(BufferedImage image) { @@ -257,10 +473,7 @@ public class CompressController { return grayImage; } - /** - * Processes and compresses an image if beneficial. Returns the processed image if compression - * is worthwhile, null otherwise. - */ + // Resize and optionally convert to grayscale private BufferedImage processAndCompressImage( PDImageXObject image, double scaleFactor, float jpegQuality, boolean convertToGrayscale) throws IOException { @@ -342,10 +555,7 @@ public class CompressController { return scaledImage; } - /** - * Converts a BufferedImage to a byte array with specified JPEG quality. Checks if compression - * is beneficial compared to original. - */ + // Convert image to byte array with quality settings private byte[] convertToBytes(BufferedImage scaledImage, float jpegQuality) throws IOException { String format = scaledImage.getColorModel().hasAlpha() ? "png" : "jpeg"; ByteArrayOutputStream outputStream = new ByteArrayOutputStream(); @@ -376,7 +586,7 @@ public class CompressController { return outputStream.toByteArray(); } - /** Modified hash function to consistently identify identical image content */ + // Hash function to identify identical images private String generateImageHash(PDImageXObject image) { try { // Create a stream for the raw stream data @@ -414,43 +624,26 @@ public class CompressController { } } - private byte[] generateImageMD5(PDImageXObject image) throws IOException { - return generatMD5(ImageProcessingUtils.getImageData(image.getImage())); - } - - /** Generates a hash string from a byte array */ - private String generateHashFromBytes(byte[] data) { - try { - // Use the existing method to generate MD5 hash - byte[] hash = generatMD5(data); - return bytesToHexString(hash); - } catch (Exception e) { - log.error("Error generating hash from bytes", e); - // Return a unique string as fallback - return "fallback-" + System.identityHashCode(data); - } - } - - // Updated scale factor method for levels 4-9 + // Scale factors for different optimization levels private double getScaleFactorForLevel(int optimizeLevel) { return switch (optimizeLevel) { - case 4 -> 0.9; // 90% of original size - lite image compression - case 5 -> 0.8; // 80% of original size - lite image compression - case 6 -> 0.7; // 70% of original size - lite image compression - case 7 -> 0.6; // 60% of original size - intense image compression - case 8 -> 0.5; // 50% of original size - intense image compression - case 9, 10 -> 0.4; // 40% of original size - intense image compression - default -> 1.0; // No image scaling for levels 1-3 + case 4 -> 0.9; // 90% - lite compression + case 5 -> 0.8; // 80% - lite compression + case 6 -> 0.7; // 70% - lite compression + case 7 -> 0.6; // 60% - intense compression + case 8 -> 0.5; // 50% - intense compression + case 9, 10 -> 0.4; // 40% - intense compression + default -> 1.0; // No scaling for 
levels 1-3 }; } - // New method for JPEG quality based on optimization level + // JPEG quality for different optimization levels private float getJpegQualityForLevel(int optimizeLevel) { return switch (optimizeLevel) { - case 7 -> 0.8f; // 80% quality - intense compression - case 8 -> 0.6f; // 60% quality - more intense compression - case 9, 10 -> 0.4f; // 40% quality - most intense compression - default -> 0.7f; // 70% quality for levels 1-6 (higher quality) + case 7 -> 0.8f; // 80% quality + case 8 -> 0.6f; // 60% quality + case 9, 10 -> 0.4f; // 40% quality + default -> 0.7f; // 70% quality for levels 1-6 }; } @@ -478,17 +671,17 @@ } // Create initial input file - Path originalFile = Files.createTempFile("input_", ".pdf"); + Path originalFile = Files.createTempFile("original_", ".pdf"); inputFile.transferTo(originalFile.toFile()); long inputFileSize = Files.size(originalFile); - - // Start with original as current working file - Path currentFile = originalFile; - + + Path currentFile = Files.createTempFile("working_", ".pdf"); + Files.copy(originalFile, currentFile, StandardCopyOption.REPLACE_EXISTING); + // Keep track of all temporary files for cleanup List<Path> tempFiles = new ArrayList<>(); tempFiles.add(originalFile); - + tempFiles.add(currentFile); try { if (autoMode) { double sizeReductionRatio = expectedOutputSize / (double) inputFileSize; @@ -499,93 +692,56 @@ boolean imageCompressionApplied = false; boolean qpdfCompressionApplied = false; + if (qpdfEnabled && optimizeLevel <= 3) { + optimizeLevel = 4; + } + while (!sizeMet && optimizeLevel <= 9) { // Apply image compression for levels 4-9 if ((optimizeLevel >= 4 || Boolean.TRUE.equals(convertToGrayscale)) && !imageCompressionApplied) { double scaleFactor = getScaleFactorForLevel(optimizeLevel); float jpegQuality = getJpegQualityForLevel(optimizeLevel); - - // Use the returned path from compressImagesInPDF - Path compressedImageFile = compressImagesInPDF( - currentFile, - scaleFactor, - jpegQuality, - Boolean.TRUE.equals(convertToGrayscale)); - - // Add to temp files list and update current file + + // Compress images + Path compressedImageFile = + compressImagesInPDF( + currentFile, + scaleFactor, + jpegQuality, + Boolean.TRUE.equals(convertToGrayscale)); + tempFiles.add(compressedImageFile); currentFile = compressedImageFile; imageCompressionApplied = true; } // Apply QPDF compression for all levels - if (!qpdfCompressionApplied) { - long preQpdfSize = Files.size(currentFile); - log.info("Pre-QPDF file size: {}", GeneralUtils.formatBytes(preQpdfSize)); - - // Map optimization levels to QPDF compression levels - int qpdfCompressionLevel = optimizeLevel <= 3 - ?
optimizeLevel * 3 // Level 1->3, 2->6, 3->9 - : 9; // Max compression for levels 4-9 - - // Create output file for QPDF - Path qpdfOutputFile = Files.createTempFile("qpdf_output_", ".pdf"); - tempFiles.add(qpdfOutputFile); - - // Run QPDF optimization - List<String> command = new ArrayList<>(); - command.add("qpdf"); - if (request.getNormalize()) { - command.add("--normalize-content=y"); - } - if (request.getLinearize()) { - command.add("--linearize"); - } - command.add("--recompress-flate"); - command.add("--compression-level=" + qpdfCompressionLevel); - command.add("--compress-streams=y"); - command.add("--object-streams=generate"); - command.add(currentFile.toString()); - command.add(qpdfOutputFile.toString()); - - ProcessExecutorResult returnCode = null; - try { - returnCode = ProcessExecutor.getInstance(ProcessExecutor.Processes.QPDF) - .runCommandWithOutputHandling(command); - qpdfCompressionApplied = true; - - // Update current file to the QPDF output - currentFile = qpdfOutputFile; - - long postQpdfSize = Files.size(currentFile); - double qpdfReduction = 100.0 - ((postQpdfSize * 100.0) / preQpdfSize); - log.info( - "Post-QPDF file size: {} (reduced by {}%)", - GeneralUtils.formatBytes(postQpdfSize), - String.format("%.1f", qpdfReduction)); - - } catch (Exception e) { - if (returnCode != null && returnCode.getRc() != 3) { - throw e; - } - // If QPDF fails, keep using the current file - log.warn("QPDF compression failed, continuing with current file"); + if (!qpdfCompressionApplied && qpdfEnabled) { + applyQpdfCompression(request, optimizeLevel, currentFile, tempFiles); + qpdfCompressionApplied = true; + } else if (!qpdfCompressionApplied) { + // If QPDF is disabled, mark as applied and log + if (!qpdfEnabled) { + log.info("Skipping QPDF compression as QPDF group is disabled"); } + qpdfCompressionApplied = true; } - // Check if file size is within expected size or not auto mode + // Check if target size reached or not in auto mode long outputFileSize = Files.size(currentFile); if (outputFileSize <= expectedOutputSize || !autoMode) { sizeMet = true; } else { - int newOptimizeLevel = incrementOptimizeLevel( - optimizeLevel, outputFileSize, expectedOutputSize); + int newOptimizeLevel = + incrementOptimizeLevel( + optimizeLevel, outputFileSize, expectedOutputSize); // Check if we can't increase the level further if (newOptimizeLevel == optimizeLevel) { if (autoMode) { - log.info("Maximum optimization level reached without meeting target size."); + log.info( + "Maximum optimization level reached without meeting target size."); sizeMet = true; } } else { @@ -597,18 +753,19 @@ } } - // Check if optimized file is larger than the original + // Use original if optimized file is somehow larger long finalFileSize = Files.size(currentFile); - if (finalFileSize > inputFileSize) { - log.warn("Optimized file is larger than the original. Using the original file instead."); - // Use the stored reference to the original file + if (finalFileSize >= inputFileSize) { + log.warn( + "Optimized file is larger than the original.
Using the original file instead."); currentFile = originalFile; } - String outputFilename = Filenames.toSimpleFileName(inputFile.getOriginalFilename()) + String outputFilename = + Filenames.toSimpleFileName(inputFile.getOriginalFilename()) .replaceFirst("[.][^.]+$", "") + "_Optimized.pdf"; - + return WebResponseUtils.pdfDocToWebResponse( pdfDocumentFactory.load(currentFile.toFile()), outputFilename); @@ -624,6 +781,65 @@ } } + // Run QPDF compression + private void applyQpdfCompression( + OptimizePdfRequest request, int optimizeLevel, Path currentFile, List<Path> tempFiles) + throws IOException { + + long preQpdfSize = Files.size(currentFile); + log.info("Pre-QPDF file size: {}", GeneralUtils.formatBytes(preQpdfSize)); + + // Map optimization levels to QPDF compression levels + int qpdfCompressionLevel = + optimizeLevel <= 3 + ? optimizeLevel * 3 // Level 1->3, 2->6, 3->9 + : 9; // Max compression for levels 4-9 + + // Create output file for QPDF + Path qpdfOutputFile = Files.createTempFile("qpdf_output_", ".pdf"); + tempFiles.add(qpdfOutputFile); + + // Build QPDF command + List<String> command = new ArrayList<>(); + command.add("qpdf"); + if (request.getNormalize()) { + command.add("--normalize-content=y"); + } + if (request.getLinearize()) { + command.add("--linearize"); + } + command.add("--recompress-flate"); + command.add("--compression-level=" + qpdfCompressionLevel); + command.add("--compress-streams=y"); + command.add("--object-streams=generate"); + command.add(currentFile.toString()); + command.add(qpdfOutputFile.toString()); + + ProcessExecutorResult returnCode = null; + try { + returnCode = + ProcessExecutor.getInstance(ProcessExecutor.Processes.QPDF) + .runCommandWithOutputHandling(command); + + // Update current file to the QPDF output + Files.copy(qpdfOutputFile, currentFile, StandardCopyOption.REPLACE_EXISTING); + + long postQpdfSize = Files.size(currentFile); + double qpdfReduction = 100.0 - ((postQpdfSize * 100.0) / preQpdfSize); + log.info( + "Post-QPDF file size: {} (reduced by {}%)", + GeneralUtils.formatBytes(postQpdfSize), String.format("%.1f", qpdfReduction)); + + } catch (Exception e) { + if (returnCode != null && returnCode.getRc() != 3) { + throw new IOException("QPDF command failed", e); + } + // If QPDF fails, keep using the current file + log.warn("QPDF compression failed, continuing with current file", e); + } + } + + // Pick optimization level based on target size private int determineOptimizeLevel(double sizeReductionRatio) { if (sizeReductionRatio > 0.9) return 1; if (sizeReductionRatio > 0.8) return 2; @@ -636,6 +852,7 @@ return 9; } + // Increment optimization level if we need more compression private int incrementOptimizeLevel(int currentLevel, long currentSize, long targetSize) { double currentRatio = currentSize / (double) targetSize; log.info("Current compression ratio: {}", String.format("%.2f", currentRatio)); @@ -647,4 +864,4 @@ } return Math.min(9, currentLevel + 1); } -} +} \ No newline at end of file
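Reviewer note: a single optimize level now fans out into three knobs: the image scale factor (levels 4 and up), the JPEG quality (levels 7 and up drop below the 0.7 default), and the qpdf compression level (1 maps to 3, 2 to 6, 3 to 9, everything higher pins to 9). A runnable sketch that tabulates the mapping, with the constants copied from the switch statements above:

```java
public class CompressionLevelTable {
    public static void main(String[] args) {
        for (int level = 1; level <= 9; level++) {
            double scale = switch (level) {
                case 4 -> 0.9; case 5 -> 0.8; case 6 -> 0.7;
                case 7 -> 0.6; case 8 -> 0.5; case 9 -> 0.4;
                default -> 1.0; // levels 1-3: no image scaling
            };
            float jpegQuality = switch (level) {
                case 7 -> 0.8f; case 8 -> 0.6f; case 9 -> 0.4f;
                default -> 0.7f;
            };
            int qpdfLevel = level <= 3 ? level * 3 : 9;
            System.out.printf("level %d: scale=%.1f, jpegQuality=%.1f, qpdf --compression-level=%d%n",
                    level, scale, jpegQuality, qpdfLevel);
        }
    }
}
```

Note that compressPdf bumps levels 1-3 up to 4 whenever the qpdf group is enabled, so in practice applyQpdfCompression always runs with --compression-level=9; the 3/6/9 branch only matters if that bump is ever removed.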
diff --git a/src/main/java/stirling/software/SPDF/controller/api/pipeline/PipelineController.java b/src/main/java/stirling/software/SPDF/controller/api/pipeline/PipelineController.java index 291cd9b45..2ab90cac2 100644 --- a/src/main/java/stirling/software/SPDF/controller/api/pipeline/PipelineController.java +++ b/src/main/java/stirling/software/SPDF/controller/api/pipeline/PipelineController.java @@ -5,6 +5,7 @@ import java.io.InputStream; import java.util.HashMap; import java.util.List; import java.util.Map; +import java.util.stream.Collectors; import java.util.zip.ZipEntry; import java.util.zip.ZipOutputStream; @@ -26,8 +27,10 @@ import io.swagger.v3.oas.annotations.tags.Tag; import lombok.extern.slf4j.Slf4j; import stirling.software.SPDF.model.PipelineConfig; +import stirling.software.SPDF.model.PipelineOperation; import stirling.software.SPDF.model.PipelineResult; import stirling.software.SPDF.model.api.HandleDataRequest; +import stirling.software.SPDF.service.PostHogService; import stirling.software.SPDF.utils.WebResponseUtils; @RestController @@ -40,9 +43,13 @@ public class PipelineController { private final ObjectMapper objectMapper; - public PipelineController(PipelineProcessor processor, ObjectMapper objectMapper) { + private final PostHogService postHogService; + + public PipelineController( + PipelineProcessor processor, ObjectMapper objectMapper, PostHogService postHogService) { this.processor = processor; this.objectMapper = objectMapper; + this.postHogService = postHogService; } @PostMapping("/handleData") @@ -55,6 +62,18 @@ public class PipelineController { } PipelineConfig config = objectMapper.readValue(jsonString, PipelineConfig.class); log.info("Received POST request to /handleData with {} files", files.length); + + List<String> operationNames = + config.getOperations().stream() + .map(PipelineOperation::getOperation) + .collect(Collectors.toList()); + + Map<String, Object> properties = new HashMap<>(); + properties.put("operations", operationNames); + properties.put("fileCount", files.length); + + postHogService.captureEvent("pipeline_api_event", properties); + try { List<Resource> inputFiles = processor.generateInputFiles(files); if (inputFiles == null || inputFiles.size() == 0) { diff --git a/src/main/java/stirling/software/SPDF/controller/api/pipeline/PipelineDirectoryProcessor.java b/src/main/java/stirling/software/SPDF/controller/api/pipeline/PipelineDirectoryProcessor.java index 192bed0e4..a9e1f4103 100644 --- a/src/main/java/stirling/software/SPDF/controller/api/pipeline/PipelineDirectoryProcessor.java +++ b/src/main/java/stirling/software/SPDF/controller/api/pipeline/PipelineDirectoryProcessor.java @@ -17,7 +17,9 @@ import java.time.LocalDate; import java.time.LocalTime; import java.time.format.DateTimeFormatter; import java.util.ArrayList; +import java.util.HashMap; import java.util.List; +import java.util.Map; import java.util.Optional; import java.util.stream.Stream; @@ -34,6 +36,7 @@ import stirling.software.SPDF.config.RuntimePathConfig; import stirling.software.SPDF.model.PipelineConfig; import stirling.software.SPDF.model.PipelineOperation; import stirling.software.SPDF.model.PipelineResult; +import stirling.software.SPDF.service.PostHogService; import stirling.software.SPDF.utils.FileMonitor; @Service @@ -41,15 +44,11 @@ import stirling.software.SPDF.utils.FileMonitor; public class PipelineDirectoryProcessor { private final ObjectMapper objectMapper; - private final ApiDocService apiDocService; - private final PipelineProcessor processor; - private final FileMonitor fileMonitor; - + private final PostHogService postHogService; private final String watchedFoldersDir; - private final String finishedFoldersDir; public PipelineDirectoryProcessor( @@ -57,13 +56,15 @@ public class PipelineDirectoryProcessor { ApiDocService apiDocService, PipelineProcessor processor, FileMonitor fileMonitor, + PostHogService postHogService, RuntimePathConfig runtimePathConfig) { this.objectMapper = objectMapper; this.apiDocService = apiDocService; -
this.watchedFoldersDir = runtimePathConfig.getPipelineWatchedFoldersPath(); - this.finishedFoldersDir = runtimePathConfig.getPipelineFinishedFoldersPath(); this.processor = processor; this.fileMonitor = fileMonitor; + this.postHogService = postHogService; + this.watchedFoldersDir = runtimePathConfig.getPipelineWatchedFoldersPath(); + this.finishedFoldersDir = runtimePathConfig.getPipelineFinishedFoldersPath(); } @Scheduled(fixedRate = 60000) @@ -152,6 +153,14 @@ public class PipelineDirectoryProcessor { log.debug("No files detected for {} ", dir); return; } + + List<String> operationNames = + config.getOperations().stream().map(PipelineOperation::getOperation).toList(); + Map<String, Object> properties = new HashMap<>(); + properties.put("operations", operationNames); + properties.put("fileCount", files.length); + postHogService.captureEvent("pipeline_directory_event", properties); + List<File> filesToProcess = prepareFilesForProcessing(files, processingDir); runPipelineAgainstFiles(filesToProcess, config, dir, processingDir); } @@ -252,8 +261,7 @@ public class PipelineDirectoryProcessor { try { Thread.sleep(retryDelayMs * (int) Math.pow(2, attempt - 1)); } catch (InterruptedException e1) { - // TODO Auto-generated catch block - e1.printStackTrace(); + log.error("prepareFilesForProcessing failure", e); } } } diff --git a/src/main/java/stirling/software/SPDF/model/api/misc/OptimizePdfRequest.java b/src/main/java/stirling/software/SPDF/model/api/misc/OptimizePdfRequest.java index 3fb15791d..147d163e8 100644 --- a/src/main/java/stirling/software/SPDF/model/api/misc/OptimizePdfRequest.java +++ b/src/main/java/stirling/software/SPDF/model/api/misc/OptimizePdfRequest.java @@ -14,7 +14,7 @@ public class OptimizePdfRequest extends PDFFile { @Schema( description = "The level of optimization to apply to the PDF file. Higher values indicate greater compression but may reduce quality.", - allowableValues = {"1", "2", "3", "4", "5"}) + allowableValues = {"1", "2", "3", "4", "5", "6", "7", "8", "9"}) private Integer optimizeLevel; @Schema(description = "The expected output size, e.g. 
'100MB', '25KB', etc.") diff --git a/src/main/java/stirling/software/SPDF/service/CustomPDFDocumentFactory.java b/src/main/java/stirling/software/SPDF/service/CustomPDFDocumentFactory.java index 354324744..92055a76c 100644 --- a/src/main/java/stirling/software/SPDF/service/CustomPDFDocumentFactory.java +++ b/src/main/java/stirling/software/SPDF/service/CustomPDFDocumentFactory.java @@ -77,7 +77,7 @@ public class CustomPDFDocumentFactory { } long fileSize = file.length(); - log.info("Loading PDF from file, size: {}MB", fileSize / (1024 * 1024)); + log.debug("Loading PDF from file, size: {}MB", fileSize / (1024 * 1024)); return loadAdaptively(file, fileSize); } @@ -92,7 +92,7 @@ public class CustomPDFDocumentFactory { } long fileSize = Files.size(path); - log.info("Loading PDF from file, size: {}MB", fileSize / (1024 * 1024)); + log.debug("Loading PDF from file, size: {}MB", fileSize / (1024 * 1024)); return loadAdaptively(path.toFile(), fileSize); } @@ -104,7 +104,7 @@ public class CustomPDFDocumentFactory { } long dataSize = input.length; - log.info("Loading PDF from byte array, size: {}MB", dataSize / (1024 * 1024)); + log.debug("Loading PDF from byte array, size: {}MB", dataSize / (1024 * 1024)); return loadAdaptively(input, dataSize); } @@ -150,7 +150,7 @@ public class CustomPDFDocumentFactory { long actualFreeMemory = maxMemory - usedMemory; // Log memory status - log.info( + log.debug( "Memory status - Free: {}MB ({}%), Used: {}MB, Max: {}MB", actualFreeMemory / (1024 * 1024), String.format("%.2f", freeMemoryPercent), @@ -160,21 +160,21 @@ public class CustomPDFDocumentFactory { // If free memory is critically low, always use file-based caching if (freeMemoryPercent < MIN_FREE_MEMORY_PERCENTAGE || actualFreeMemory < MIN_FREE_MEMORY_BYTES) { - log.info( + log.debug( "Low memory detected ({}%), forcing file-based cache", String.format("%.2f", freeMemoryPercent)); return createScratchFileCacheFunction(MemoryUsageSetting.setupTempFileOnly()); } else if (contentSize < SMALL_FILE_THRESHOLD) { - log.info("Using memory-only cache for small document ({}KB)", contentSize / 1024); + log.debug("Using memory-only cache for small document ({}KB)", contentSize / 1024); return IOUtils.createMemoryOnlyStreamCache(); } else if (contentSize < LARGE_FILE_THRESHOLD) { // For medium files (10-50MB), use a mixed approach - log.info( + log.debug( "Using mixed memory/file cache for medium document ({}MB)", contentSize / (1024 * 1024)); return createScratchFileCacheFunction(MemoryUsageSetting.setupMixed(LARGE_FILE_USAGE)); } else { - log.info("Using file-based cache for large document"); + log.debug("Using file-based cache for large document"); return createScratchFileCacheFunction(MemoryUsageSetting.setupTempFileOnly()); } } @@ -237,7 +237,7 @@ public class CustomPDFDocumentFactory { byte[] bytes, long size, StreamCacheCreateFunction cache, String password) throws IOException { if (size >= SMALL_FILE_THRESHOLD) { - log.info("Writing large byte array to temp file for password-protected PDF"); + log.debug("Writing large byte array to temp file for password-protected PDF"); Path tempFile = createTempFile("pdf-bytes-"); Files.write(tempFile, bytes); @@ -261,7 +261,6 @@ public class CustomPDFDocumentFactory { removePassword(doc); } - private PDDocument loadFromFile(File file, long size, StreamCacheCreateFunction cache) throws IOException { return Loader.loadPDF(new DeletingRandomAccessFile(file), "", null, null, cache); @@ -270,7 +269,7 @@ public class CustomPDFDocumentFactory { private PDDocument 
loadFromBytes(byte[] bytes, long size, StreamCacheCreateFunction cache) throws IOException { if (size >= SMALL_FILE_THRESHOLD) { - log.info("Writing large byte array to temp file"); + log.debug("Writing large byte array to temp file"); Path tempFile = createTempFile("pdf-bytes-"); Files.write(tempFile, bytes); @@ -318,7 +317,7 @@ public class CustomPDFDocumentFactory { // Temp file handling with enhanced logging private Path createTempFile(String prefix) throws IOException { Path file = Files.createTempFile(prefix + tempCounter.incrementAndGet() + "-", ".tmp"); - log.info("Created temp file: {}", file); + log.debug("Created temp file: {}", file); return file; } diff --git a/src/main/java/stirling/software/SPDF/service/MetricsAggregatorService.java b/src/main/java/stirling/software/SPDF/service/MetricsAggregatorService.java index ad911f969..1a61d03bd 100644 --- a/src/main/java/stirling/software/SPDF/service/MetricsAggregatorService.java +++ b/src/main/java/stirling/software/SPDF/service/MetricsAggregatorService.java @@ -4,6 +4,8 @@ import java.util.HashMap; import java.util.Map; import java.util.concurrent.ConcurrentHashMap; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import org.springframework.beans.factory.annotation.Autowired; import org.springframework.scheduling.annotation.Scheduled; import org.springframework.stereotype.Service; @@ -11,22 +13,32 @@ import org.springframework.stereotype.Service; import io.micrometer.core.instrument.MeterRegistry; import io.micrometer.core.instrument.search.Search; +import stirling.software.SPDF.config.EndpointInspector; + @Service public class MetricsAggregatorService { + private static final Logger logger = LoggerFactory.getLogger(MetricsAggregatorService.class); private final MeterRegistry meterRegistry; private final PostHogService postHogService; + private final EndpointInspector endpointInspector; private final Map<String, Double> lastSentMetrics = new ConcurrentHashMap<>(); @Autowired - public MetricsAggregatorService(MeterRegistry meterRegistry, PostHogService postHogService) { + public MetricsAggregatorService( + MeterRegistry meterRegistry, + PostHogService postHogService, + EndpointInspector endpointInspector) { this.meterRegistry = meterRegistry; this.postHogService = postHogService; + this.endpointInspector = endpointInspector; } @Scheduled(fixedRate = 7200000) // Run every 2 hours public void aggregateAndSendMetrics() { Map<String, Double> metrics = new HashMap<>(); + + final boolean validateGetEndpoints = endpointInspector.getValidGetEndpoints().size() != 0; Search.in(meterRegistry) .name("http.requests") .counters() .forEach( counter -> { String method = counter.getId().getTag("method"); String uri = counter.getId().getTag("uri"); - // Skip if either method or uri is null if (method == null || uri == null) { return; } + + // Skip URIs that are 2 characters or shorter + if (uri.length() <= 2) { + return; + } + + // Skip non-GET and non-POST requests if (!"GET".equals(method) && !"POST".equals(method)) { return; } - // Skip URIs that are 2 characters or shorter - if (uri.length() <= 2) { + + // For POST requests, only include URIs that contain api/v1 + if ("POST".equals(method) && !uri.contains("api/v1")) { + return; + } + + if (uri.contains(".txt")) { + return; + } + // For GET requests, validate if we have a list of valid endpoints + if ("GET".equals(method) + && validateGetEndpoints + && !endpointInspector.isValidGetEndpoint(uri)) { + logger.debug("Skipping invalid GET endpoint: {}", uri); return; }
String key = String.format( "http_requests_%s_%s", method, uri.replace("/", "_")); - double currentCount = counter.count(); double lastCount = lastSentMetrics.getOrDefault(key, 0.0); double difference = currentCount - lastCount; - if (difference > 0) { + logger.info("{}, {}", key, difference); metrics.put(key, difference); lastSentMetrics.put(key, currentCount); } }); - // Send aggregated metrics to PostHog if (!metrics.isEmpty()) { + postHogService.captureEvent("aggregated_metrics", metrics); } } diff --git a/src/main/resources/static/js/fileInput.js b/src/main/resources/static/js/fileInput.js index 9f27360f9..e9cdfca46 100644 --- a/src/main/resources/static/js/fileInput.js +++ b/src/main/resources/static/js/fileInput.js @@ -141,7 +141,17 @@ function setupFileInput(chooser) { allFiles = Array.from(isDragAndDrop ? allFiles : [element.files[0]]); } + const originalText = inputContainer.querySelector('#fileInputText').innerHTML; + + inputContainer.querySelector('#fileInputText').innerHTML = window.fileInput.loading; + async function checkZipFile() { + const hasZipFiles = allFiles.some(file => zipTypes.includes(file.type)); + + // Only change to extractPDF message if we actually have zip files + if (hasZipFiles) { + inputContainer.querySelector('#fileInputText').innerHTML = window.fileInput.extractPDF; + } const promises = allFiles.map(async (file, index) => { try { @@ -156,13 +166,10 @@ function setupFileInput(chooser) { }); await Promise.all(promises); - } - const originalText = inputContainer.querySelector('#fileInputText').innerHTML; + const decryptFile = new DecryptFile(); - inputContainer.querySelector('#fileInputText').innerHTML = window.fileInput.extractPDF; - await checkZipFile(); allFiles = await Promise.all( @@ -224,26 +231,26 @@ function setupFileInput(chooser) { .then(function (zip) { var extractionPromises = []; - zip.forEach(function (relativePath, zipEntry) { - - const promise = zipEntry.async('blob').then(function (content) { - // Assuming that folders have size zero - if (content.size > 0) { - const extension = zipEntry.name.split('.').pop().toLowerCase(); - const mimeType = mimeTypes[extension]; - - // Check for file extension - if (mimeType && (mimeType.startsWith(acceptedFileType.split('/')[0]) || acceptedFileType === mimeType)) { - - var file = new File([content], zipEntry.name, { type: mimeType }); - file.uniqueId = UUID.uuidv4(); - allFiles.push(file); - - } else { - console.log(`File ${zipEntry.name} skipped. MIME type (${mimeType}) does not match accepted type (${acceptedFileType})`); - } - } - }); + zip.forEach(function (relativePath, zipEntry) { + const promise = zipEntry.async('blob').then(function (content) { + // Assuming that folders have size zero + if (content.size > 0) { + const extension = zipEntry.name.split('.').pop().toLowerCase(); + const mimeType = mimeTypes[extension] || 'application/octet-stream'; + + // Check if we're accepting ONLY ZIP files (in which case extract everything) + // or if the file type matches the accepted type + if (zipTypes.includes(acceptedFileType) || + acceptedFileType === '*/*' || + (mimeType && (mimeType.startsWith(acceptedFileType.split('/')[0]) || acceptedFileType === mimeType))) { + var file = new File([content], zipEntry.name, { type: mimeType }); + file.uniqueId = UUID.uuidv4(); + allFiles.push(file); + } else { + console.log(`File ${zipEntry.name} skipped. 
MIME type (${mimeType}) does not match accepted type (${acceptedFileType})`); + } + } + }); extractionPromises.push(promise); }); diff --git a/src/main/resources/templates/fragments/common.html b/src/main/resources/templates/fragments/common.html index 2a8e95012..66fa94b58 100644 --- a/src/main/resources/templates/fragments/common.html +++ b/src/main/resources/templates/fragments/common.html @@ -224,15 +224,20 @@ window.fileInput = { dragAndDropPDF: '[[#{fileChooser.dragAndDropPDF}]]', dragAndDropImage: '[[#{fileChooser.dragAndDropImage}]]', - extractPDF: '[[#{fileChooser.extractPDF}]]' + extractPDF: '[[#{fileChooser.extractPDF}]]', + loading: '[[#{loading}]]' };
diff --git a/src/main/resources/templates/pipeline.html b/src/main/resources/templates/pipeline.html index a81d4b91f..34abed18b 100644 --- a/src/main/resources/templates/pipeline.html +++ b/src/main/resources/templates/pipeline.html @@ -64,7 +64,7 @@
@@ -93,7 +93,7 @@