From b4da18659da69894e74c1789e51a90502fc0af9d Mon Sep 17 00:00:00 2001 From: Anthony Stirling <77850077+Frooodle@users.noreply.github.com.> Date: Tue, 18 Mar 2025 23:04:09 +0000 Subject: [PATCH] Default langs, and working on reducing metrics --- Dockerfile | 4 + Dockerfile.fat | 5 +- .../SPDF/config/EndpointConfiguration.java | 8 +- .../SPDF/config/EndpointInspector.java | 291 ++++++++++++++++++ .../service/MetricsAggregatorService.java | 55 +++- 5 files changed, 347 insertions(+), 16 deletions(-) create mode 100644 src/main/java/stirling/software/SPDF/config/EndpointInspector.java diff --git a/Dockerfile b/Dockerfile index ccb8408a9..46cae3478 100644 --- a/Dockerfile +++ b/Dockerfile @@ -66,6 +66,10 @@ RUN echo "@main https://dl-cdn.alpinelinux.org/alpine/edge/main" | tee -a /etc/a poppler-utils \ # OCR MY PDF (unpaper for descew and other advanced features) tesseract-ocr-data-eng \ + tesseract-ocr-data-chi_sim \ + tesseract-ocr-data-deu \ + tesseract-ocr-data-fra \ + tesseract-ocr-data-por \ # CV py3-opencv \ python3 \ diff --git a/Dockerfile.fat b/Dockerfile.fat index 8855be6c0..8a4d55d80 100644 --- a/Dockerfile.fat +++ b/Dockerfile.fat @@ -75,7 +75,10 @@ RUN echo "@main https://dl-cdn.alpinelinux.org/alpine/edge/main" | tee -a /etc/a # OCR MY PDF (unpaper for descew and other advanced featues) qpdf \ tesseract-ocr-data-eng \ - + tesseract-ocr-data-chi_sim \ + tesseract-ocr-data-deu \ + tesseract-ocr-data-fra \ + tesseract-ocr-data-por \ font-terminus font-dejavu font-noto font-noto-cjk font-awesome font-noto-extra font-liberation font-linux-libertine \ # CV py3-opencv \ diff --git a/src/main/java/stirling/software/SPDF/config/EndpointConfiguration.java b/src/main/java/stirling/software/SPDF/config/EndpointConfiguration.java index 12bf2c291..a2f325bd8 100644 --- a/src/main/java/stirling/software/SPDF/config/EndpointConfiguration.java +++ b/src/main/java/stirling/software/SPDF/config/EndpointConfiguration.java @@ -176,21 +176,17 @@ public class 
EndpointConfiguration { addEndpointToGroup("OpenCV", "extract-image-scans"); // LibreOffice - addEndpointToGroup("qpdf", "repair"); addEndpointToGroup("LibreOffice", "file-to-pdf"); addEndpointToGroup("LibreOffice", "pdf-to-word"); addEndpointToGroup("LibreOffice", "pdf-to-presentation"); addEndpointToGroup("LibreOffice", "pdf-to-rtf"); addEndpointToGroup("LibreOffice", "pdf-to-html"); addEndpointToGroup("LibreOffice", "pdf-to-xml"); + addEndpointToGroup("LibreOffice", "pdf-to-pdfa"); // Unoconvert addEndpointToGroup("Unoconvert", "file-to-pdf"); - // qpdf - addEndpointToGroup("qpdf", "compress-pdf"); - addEndpointToGroup("qpdf", "pdf-to-pdfa"); - addEndpointToGroup("tesseract", "ocr-pdf"); // Java @@ -240,8 +236,6 @@ public class EndpointConfiguration { addEndpointToGroup("Javascript", "adjust-contrast"); // qpdf dependent endpoints - addEndpointToGroup("qpdf", "compress-pdf"); - addEndpointToGroup("qpdf", "pdf-to-pdfa"); addEndpointToGroup("qpdf", "repair"); // Weasyprint dependent endpoints diff --git a/src/main/java/stirling/software/SPDF/config/EndpointInspector.java b/src/main/java/stirling/software/SPDF/config/EndpointInspector.java new file mode 100644 index 000000000..fc7e03e7c --- /dev/null +++ b/src/main/java/stirling/software/SPDF/config/EndpointInspector.java @@ -0,0 +1,291 @@ +package stirling.software.SPDF.config; + +import java.lang.reflect.Method; +import java.util.Collections; +import java.util.HashSet; +import java.util.Map; +import java.util.Set; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.context.ApplicationContext; +import org.springframework.context.ApplicationListener; +import org.springframework.context.event.ContextRefreshedEvent; +import org.springframework.stereotype.Component; +import org.springframework.web.bind.annotation.RequestMethod; +import org.springframework.web.method.HandlerMethod; +import 
org.springframework.web.servlet.mvc.method.RequestMappingInfo; +import org.springframework.web.servlet.mvc.method.annotation.RequestMappingHandlerMapping; + +@Component +public class EndpointInspector implements ApplicationListener { + private static final Logger logger = LoggerFactory.getLogger(EndpointInspector.class); + + private final ApplicationContext applicationContext; + private final Set validGetEndpoints = new HashSet<>(); + private boolean endpointsDiscovered = false; + + @Autowired + public EndpointInspector(ApplicationContext applicationContext) { + this.applicationContext = applicationContext; + } + + @Override + public void onApplicationEvent(ContextRefreshedEvent event) { + if (!endpointsDiscovered) { + discoverEndpoints(); + endpointsDiscovered = true; + } + } + + private void discoverEndpoints() { + try { + // Get all request mapping beans from the application context + Map mappings = + applicationContext.getBeansOfType(RequestMappingHandlerMapping.class); + + // Process each mapping bean + for (Map.Entry entry : mappings.entrySet()) { + String beanName = entry.getKey(); + RequestMappingHandlerMapping mapping = entry.getValue(); + + // Get all handler methods registered in this mapping + Map handlerMethods = mapping.getHandlerMethods(); + int methodsWithEmptyMethodsCondition = 0; + int methodsWithGetMethod = 0; + int methodsWithGetOrEmpty = 0; + + // Process each handler method + for (Map.Entry handlerEntry : + handlerMethods.entrySet()) { + RequestMappingInfo mappingInfo = handlerEntry.getKey(); + HandlerMethod handlerMethod = handlerEntry.getValue(); + + // Debug info + logger.debug( + "Examining handler: {} -> {}", + mappingInfo, + handlerMethod.getMethod().getName()); + + boolean hasEmptyMethodsCondition = false; + boolean hasGetMethod = false; + + // Get methods through reflection if standard approach fails + Set methods = Collections.emptySet(); + + try { + methods = mappingInfo.getMethodsCondition().getMethods(); + + // Standard approach + 
hasEmptyMethodsCondition = methods.isEmpty(); + hasGetMethod = methods.contains(RequestMethod.GET); + + logger.debug( + "Standard method detection: methods={}, isEmpty={}, hasGET={}", + methods, + hasEmptyMethodsCondition, + hasGetMethod); + } catch (Exception e) { + logger.warn( + "Error accessing methods through standard API: {}", e.getMessage()); + } + + if (hasEmptyMethodsCondition) { + methodsWithEmptyMethodsCondition++; + } + + if (hasGetMethod) { + methodsWithGetMethod++; + } + + // Count any method that could potentially handle GET requests + if (hasEmptyMethodsCondition || hasGetMethod) { + methodsWithGetOrEmpty++; + + // Try to get patterns using reflection if direct approach fails + Set patterns = extractPatternsUsingReflection(mappingInfo); + + if (patterns.isEmpty()) { + // Fall back to toString parsing + String infoString = mappingInfo.toString(); + // Extract patterns from toString if possible + if (infoString.contains("{")) { + String patternsSection = + infoString.substring( + infoString.indexOf("{") + 1, + infoString.indexOf("}")); + + for (String pattern : patternsSection.split(",")) { + pattern = pattern.trim(); + if (!pattern.isEmpty()) { + patterns.add(pattern); + } + } + } + } + + // Add all patterns + validGetEndpoints.addAll(patterns); + } + } + + } + + if (validGetEndpoints.isEmpty()) { + // If we still couldn't find any endpoints, add some common ones as a fallback + logger.warn("No endpoints discovered. 
Adding common endpoints as fallback."); + validGetEndpoints.add("/"); + validGetEndpoints.add("/api/**"); + validGetEndpoints.add("/**"); + } + } catch (Exception e) { + logger.error("Error discovering endpoints", e); + } + } + + private Set extractPatternsUsingReflection(RequestMappingInfo mappingInfo) { + Set patterns = new HashSet<>(); + + try { + // First try standard API + if (mappingInfo.getPatternsCondition() != null) { + patterns.addAll(mappingInfo.getPatternsCondition().getPatterns()); + } + } catch (Exception e) { + logger.debug("Standard pattern access failed: {}", e.getMessage()); + } + + // If standard approach failed, try reflection + if (patterns.isEmpty()) { + try { + // Try to access patterns through reflection on different Spring versions + Method[] methods = mappingInfo.getClass().getMethods(); + + // Look for methods that might return patterns + for (Method method : methods) { + String methodName = method.getName(); + if ((methodName.contains("pattern") || methodName.contains("Path")) + && method.getParameterCount() == 0) { + + logger.debug("Trying reflection method: {}", methodName); + try { + Object result = method.invoke(mappingInfo); + if (result instanceof Set) { + @SuppressWarnings("unchecked") + Set resultSet = (Set) result; + patterns.addAll(resultSet); + logger.debug( + "Found {} patterns using method {}", + resultSet.size(), + methodName); + } else if (result != null) { + logger.debug( + "Method {} returned non-Set result: {}", + methodName, + result); + } + } catch (Exception e) { + logger.debug( + "Method {} invocation failed: {}", methodName, e.getMessage()); + } + } + } + } catch (Exception e) { + logger.warn("Reflection-based pattern extraction failed: {}", e.getMessage()); + } + } + + return patterns; + } + + /** + * Check if a URI corresponds to a valid GET endpoint - Fixed to handle path variables safely + */ + public boolean isValidGetEndpoint(String uri) { + // Ensure endpoints are discovered + if (!endpointsDiscovered) { + 
discoverEndpoints(); + endpointsDiscovered = true; + } + + // If no endpoints were discovered, assume all endpoints are valid + if (validGetEndpoints.isEmpty()) { + logger.warn( + "No valid endpoints were discovered. Assuming all GET endpoints are valid."); + return true; + } + + // Direct match + if (validGetEndpoints.contains(uri)) { + return true; + } + + // Try simple prefix matching first (safer than regex) + for (String pattern : validGetEndpoints) { + // Handle wildcards and path variables with simple prefix matching + if (pattern.contains("*") || pattern.contains("{")) { + int wildcardIndex = pattern.indexOf('*'); + int variableIndex = pattern.indexOf('{'); + + // Find the earliest special character + int cutoffIndex; + if (wildcardIndex < 0) { + cutoffIndex = variableIndex; + } else if (variableIndex < 0) { + cutoffIndex = wildcardIndex; + } else { + cutoffIndex = Math.min(wildcardIndex, variableIndex); + } + + // Get the static part of the pattern + String staticPrefix = pattern.substring(0, cutoffIndex); + + // If the URI starts with this prefix, consider it a match + if (uri.startsWith(staticPrefix)) { + return true; + } + } + } + + // For patterns without wildcards or variables, try path-segment-by-segment matching + for (String pattern : validGetEndpoints) { + if (!pattern.contains("*") && !pattern.contains("{")) { + // Split the pattern and URI into path segments + String[] patternSegments = pattern.split("/"); + String[] uriSegments = uri.split("/"); + + // Skip root patterns: "/" splits into zero segments and "" into one empty segment, so either would otherwise match every '/'-prefixed URI (an exact root hit is already caught by the direct match above); also skip URIs with fewer segments than the pattern + if (pattern.isEmpty() || "/".equals(pattern) || uriSegments.length < patternSegments.length) { + continue; + } + + // Check each segment + boolean match = true; + for (int i = 0; i < patternSegments.length; i++) { + if (!patternSegments[i].equals(uriSegments[i])) { + match = false; + break; + } + } + + if (match) { + return true; + } + } + } + + // If no match was found, the URI is not valid + return false; + } + + /** Get all discovered valid GET endpoints */ + public Set
getValidGetEndpoints() { + // Ensure endpoints are discovered + if (!endpointsDiscovered) { + discoverEndpoints(); + endpointsDiscovered = true; + } + return new HashSet<>(validGetEndpoints); + } +} diff --git a/src/main/java/stirling/software/SPDF/service/MetricsAggregatorService.java b/src/main/java/stirling/software/SPDF/service/MetricsAggregatorService.java index ad911f969..7415719b9 100644 --- a/src/main/java/stirling/software/SPDF/service/MetricsAggregatorService.java +++ b/src/main/java/stirling/software/SPDF/service/MetricsAggregatorService.java @@ -4,6 +4,8 @@ import java.util.HashMap; import java.util.Map; import java.util.concurrent.ConcurrentHashMap; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import org.springframework.beans.factory.annotation.Autowired; import org.springframework.scheduling.annotation.Scheduled; import org.springframework.stereotype.Service; @@ -11,22 +13,41 @@ import org.springframework.stereotype.Service; import io.micrometer.core.instrument.MeterRegistry; import io.micrometer.core.instrument.search.Search; +import stirling.software.SPDF.config.EndpointInspector; + @Service public class MetricsAggregatorService { + private static final Logger logger = LoggerFactory.getLogger(MetricsAggregatorService.class); private final MeterRegistry meterRegistry; private final PostHogService postHogService; + private final EndpointInspector endpointInspector; private final Map lastSentMetrics = new ConcurrentHashMap<>(); + // Flag to decide behavior if no endpoints are discovered + private boolean allowAllGetEndpointsIfNoneDiscovered = true; + @Autowired - public MetricsAggregatorService(MeterRegistry meterRegistry, PostHogService postHogService) { + public MetricsAggregatorService( + MeterRegistry meterRegistry, + PostHogService postHogService, + EndpointInspector endpointInspector) { this.meterRegistry = meterRegistry; this.postHogService = postHogService; + this.endpointInspector = endpointInspector; } - @Scheduled(fixedRate = 
7200000) // Run every 2 hours + @Scheduled(fixedRate = 7200000) // Run every 2 hours public void aggregateAndSendMetrics() { Map metrics = new HashMap<>(); + + int endpointCount = endpointInspector.getValidGetEndpoints().size(); + + boolean validateGetEndpoints = true; + if (endpointCount == 0 && allowAllGetEndpointsIfNoneDiscovered) { + validateGetEndpoints = false; + } + final boolean validateGetEndpointsFinal = validateGetEndpoints; Search.in(meterRegistry) .name("http.requests") .counters() @@ -34,35 +55,53 @@ public class MetricsAggregatorService { counter -> { String method = counter.getId().getTag("method"); String uri = counter.getId().getTag("uri"); - // Skip if either method or uri is null if (method == null || uri == null) { return; } + + // Skip URIs that are 2 characters or shorter + if (uri.length() <= 2) { + return; + } + + // Skip non-GET and non-POST requests if (!"GET".equals(method) && !"POST".equals(method)) { return; } - // Skip URIs that are 2 characters or shorter - if (uri.length() <= 2) { + + // For POST requests, only include URIs that contain api/v1 + if ("POST".equals(method) && !uri.contains("api/v1")) { + return; + } + + + if(uri.contains(".txt")) { + return; + } + // For GET requests, validate if we have a list of valid endpoints + if ("GET".equals(method) + && validateGetEndpointsFinal + && !endpointInspector.isValidGetEndpoint(uri)) { + logger.debug("Skipping invalid GET endpoint: {}", uri); return; } String key = String.format( "http_requests_%s_%s", method, uri.replace("/", "_")); - double currentCount = counter.count(); double lastCount = lastSentMetrics.getOrDefault(key, 0.0); double difference = currentCount - lastCount; - if (difference > 0) { + logger.info("{}, {}", key, difference); metrics.put(key, difference); lastSentMetrics.put(key, currentCount); } }); - // Send aggregated metrics to PostHog if (!metrics.isEmpty()) { + postHogService.captureEvent("aggregated_metrics", metrics); } }