Stirling 2.0 (#3928)

# Description of Changes  --- ## Checklist ### General - [ ] I have read the [Contribution Guidelines](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/CONTRIBUTING.md) - [ ] I have read the [Stirling-PDF Developer Guide](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/devGuide/DeveloperGuide.md) (if applicable) - [ ] I have read the [How to add new languages to Stirling-PDF](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/devGuide/HowToAddNewLanguage.md) (if applicable) - [ ] I have performed a self-review of my own code - [ ] My changes generate no new warnings ### Documentation - [ ] I have updated relevant docs on [Stirling-PDF's doc repo](https://github.com/Stirling-Tools/Stirling-Tools.github.io/blob/main/docs/) (if functionality has heavily changed) - [ ] I have read the section [Add New Translation Tags](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/devGuide/HowToAddNewLanguage.md#add-new-translation-tags) (for new translation tags only) ### UI Changes (if applicable) - [ ] Screenshots or videos demonstrating the UI changes are attached (e.g., as comments or direct attachments in the PR) ### Testing (if applicable) - [ ] I have tested my changes locally. Refer to the [Testing Guide](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/devGuide/DeveloperGuide.md#6-testing) for more details. --------- Co-authored-by: Anthony Stirling <77850077+Frooodle@users.noreply.github.com>
2026-03-13 02:18:16 +01:00 · 2025-07-16 17:53:50 +01:00
parent 584e2ecee7
commit 922bbc9076
66 changed files with 8728 additions and 2519 deletions
--- a/frontend/src/services/fileAnalyzer.ts
+++ b/frontend/src/services/fileAnalyzer.ts
@@ -0,0 +1,240 @@
+import { getDocument } from 'pdfjs-dist';
+import { FileAnalysis, ProcessingStrategy } from '../types/processing';
+
+/**
+ * Cheap, up-front inspection of PDF files that decides how aggressively each
+ * one should be processed (everything at once, priority pages first, chunked,
+ * or metadata only).
+ *
+ * Every member is static. Internal calls go through `FileAnalyzer.` rather
+ * than `this`, so the public methods keep working when they are passed around
+ * detached from the class (e.g. `const { analyzeFile } = FileAnalyzer`).
+ */
+export class FileAnalyzer {
+  /** File-size boundaries in bytes; every comparison against them is inclusive. */
+  private static readonly SIZE_THRESHOLDS = {
+    SMALL: 10 * 1024 * 1024, // 10MB
+    MEDIUM: 50 * 1024 * 1024, // 50MB
+    LARGE: 200 * 1024 * 1024, // 200MB
+  } as const;
+
+  /** Page-count boundaries; every comparison against them is inclusive. */
+  private static readonly PAGE_THRESHOLDS = {
+    FEW: 10, // <= 10 pages - immediate full processing
+    MANY: 50, // <= 50 pages - priority pages
+    MASSIVE: 100, // <= 100 pages - progressive chunked
+    // > 100 pages = metadata only
+  } as const;
+
+  /** Number of leading pages the 'priority_pages' time estimate covers. */
+  private static readonly PRIORITY_PAGE_COUNT = 10;
+
+  /** Number of pages in the first chunk the 'progressive_chunked' time estimate covers. */
+  private static readonly FIRST_CHUNK_PAGE_COUNT = 20;
+
+  /**
+   * Analyze a file to determine the optimal processing strategy.
+   *
+   * Never rejects: if the analysis itself throws, the file is reported as
+   * corrupted with the 'metadata_only' strategy and an estimated time of 0.
+   *
+   * @param file - The file to inspect.
+   * @returns Size, page count, encryption/corruption flags, the recommended
+   *          strategy and the estimated processing time in milliseconds.
+   */
+  static async analyzeFile(file: File): Promise<FileAnalysis> {
+    // Start from the most conservative result; it is what callers get on failure.
+    const analysis: FileAnalysis = {
+      fileSize: file.size,
+      isEncrypted: false,
+      isCorrupted: false,
+      recommendedStrategy: 'metadata_only',
+      estimatedProcessingTime: 0,
+    };
+
+    try {
+      // Quick validation and page count
+      const quickAnalysis = await FileAnalyzer.quickPDFAnalysis(file);
+      analysis.estimatedPageCount = quickAnalysis.pageCount;
+      analysis.isEncrypted = quickAnalysis.isEncrypted;
+      analysis.isCorrupted = quickAnalysis.isCorrupted;
+
+      // Determine strategy based on file characteristics
+      analysis.recommendedStrategy = FileAnalyzer.determineStrategy(
+        file.size,
+        quickAnalysis.pageCount
+      );
+
+      // Estimate processing time
+      analysis.estimatedProcessingTime = FileAnalyzer.estimateProcessingTime(
+        file.size,
+        quickAnalysis.pageCount,
+        analysis.recommendedStrategy
+      );
+    } catch (error) {
+      console.error('File analysis failed:', error);
+      analysis.isCorrupted = true;
+      analysis.recommendedStrategy = 'metadata_only';
+    }
+
+    return analysis;
+  }
+
+  /**
+   * Quick PDF analysis without full processing: opens the document with
+   * PDF.js only to read its page count and encryption flag.
+   *
+   * The whole file is read into memory, for small and large files alike.
+   *
+   * @returns The page count and flags. A load failure yields `pageCount: 0`
+   *          and is classified as either encrypted or corrupted, never both.
+   */
+  private static async quickPDFAnalysis(file: File): Promise<{
+    pageCount: number;
+    isEncrypted: boolean;
+    isCorrupted: boolean;
+  }> {
+    try {
+      const arrayBuffer = await file.arrayBuffer();
+
+      const loadingTask = getDocument({
+        data: arrayBuffer,
+        stopAtErrors: false, // Don't stop at minor errors
+        verbosity: 0, // Suppress PDF.js warnings
+      });
+
+      try {
+        const pdf = await loadingTask.promise;
+
+        return {
+          pageCount: pdf.numPages,
+          // NOTE(review): confirm the installed pdfjs-dist version really exposes
+          // `isEncrypted` on the document proxy; if it does not, this is undefined.
+          isEncrypted: pdf.isEncrypted,
+          isCorrupted: false,
+        };
+      } finally {
+        // Destroy the loading task on failure as well as success, so a rejected
+        // load (encrypted/corrupt input) does not leave PDF.js resources behind.
+        // Cleanup is best-effort: a failed teardown must not change the result.
+        await loadingTask.destroy().catch(() => undefined);
+      }
+    } catch (error) {
+      // Try to determine if it's corruption vs encryption
+      const errorName = error instanceof Error ? error.name : '';
+      const errorMessage = error instanceof Error ? error.message.toLowerCase() : '';
+      const isEncrypted =
+        errorName === 'PasswordException' ||
+        errorMessage.includes('password') ||
+        errorMessage.includes('encrypted');
+
+      return {
+        pageCount: 0,
+        isEncrypted,
+        isCorrupted: !isEncrypted, // If not encrypted, probably corrupted
+      };
+    }
+  }
+
+  /**
+   * Determine the best processing strategy based on file characteristics.
+   *
+   * A tier applies only when BOTH the size and the page count are within its
+   * limits; exceeding either one pushes the file into the next tier.
+   *
+   * @param fileSize - File size in bytes.
+   * @param pageCount - Page count; missing, zero, negative or NaN means "unknown".
+   */
+  private static determineStrategy(fileSize: number, pageCount?: number): ProcessingStrategy {
+    // Corrupted or encrypted files report no usable page count
+    if (!pageCount || pageCount <= 0) {
+      return 'metadata_only';
+    }
+
+    const { SIZE_THRESHOLDS: sizes, PAGE_THRESHOLDS: pages } = FileAnalyzer;
+
+    // Small file with few pages - process everything immediately
+    if (fileSize <= sizes.SMALL && pageCount <= pages.FEW) {
+      return 'immediate_full';
+    }
+
+    // Medium file with a moderate page count - priority pages first
+    if (fileSize <= sizes.MEDIUM && pageCount <= pages.MANY) {
+      return 'priority_pages';
+    }
+
+    // Large file with a high page count - chunked processing
+    if (fileSize <= sizes.LARGE && pageCount <= pages.MASSIVE) {
+      return 'progressive_chunked';
+    }
+
+    // Anything bigger - metadata only
+    return 'metadata_only';
+  }
+
+  /**
+   * Estimate processing time based on file characteristics and strategy.
+   *
+   * @param fileSize - File size in bytes; currently not used by the estimate.
+   * @param pageCount - Number of pages in the document.
+   * @param strategy - Strategy the estimate is for.
+   * @returns Estimated time in milliseconds for the work the strategy does up
+   *          front (all pages, the priority pages, or the first chunk).
+   */
+  private static estimateProcessingTime(
+    fileSize: number,
+    pageCount: number = 0,
+    strategy: ProcessingStrategy
+  ): number {
+    const baseTimes: Record<ProcessingStrategy, number> = {
+      immediate_full: 200, // 200ms per page
+      priority_pages: 150, // 150ms per page (optimized)
+      progressive_chunked: 100, // 100ms per page (chunked)
+      metadata_only: 50, // 50ms total
+    };
+
+    const baseTime = baseTimes[strategy];
+
+    switch (strategy) {
+      case 'metadata_only':
+        // Flat cost, independent of the page count
+        return baseTime;
+
+      case 'immediate_full':
+        return pageCount * baseTime;
+
+      case 'priority_pages':
+        // Only the priority pages are processed up front
+        return Math.min(pageCount, FileAnalyzer.PRIORITY_PAGE_COUNT) * baseTime;
+
+      case 'progressive_chunked':
+        // Only the first chunk is processed up front
+        return Math.min(pageCount, FileAnalyzer.FIRST_CHUNK_PAGE_COUNT) * baseTime;
+
+      default:
+        return pageCount * baseTime;
+    }
+  }
+
+  /**
+   * Get processing recommendations for a set of files.
+   *
+   * Files are analyzed one after another. Each analysis reads the whole file
+   * into memory, so running them in parallel would hold every file in memory
+   * at the same time.
+   *
+   * @param files - Files to analyze.
+   * @returns The per-file analyses keyed by `File`, plus aggregate
+   *          recommendations derived from the total size and page count.
+   */
+  static async analyzeMultipleFiles(files: File[]): Promise<{
+    analyses: Map<File, FileAnalysis>;
+    recommendations: {
+      totalEstimatedTime: number;
+      suggestedBatchSize: number;
+      shouldUseWebWorker: boolean;
+      memoryWarning: boolean;
+    };
+  }> {
+    const analyses = new Map<File, FileAnalysis>();
+    let totalEstimatedTime = 0;
+    let totalSize = 0;
+    let totalPages = 0;
+
+    // Analyze each file
+    for (const file of files) {
+      const analysis = await FileAnalyzer.analyzeFile(file);
+      analyses.set(file, analysis);
+      totalEstimatedTime += analysis.estimatedProcessingTime;
+      totalSize += file.size;
+      totalPages += analysis.estimatedPageCount ?? 0;
+    }
+
+    const { SIZE_THRESHOLDS: sizes, PAGE_THRESHOLDS: pages } = FileAnalyzer;
+
+    // Generate recommendations
+    const recommendations = {
+      totalEstimatedTime,
+      suggestedBatchSize: FileAnalyzer.calculateBatchSize(files.length, totalSize),
+      shouldUseWebWorker: totalPages > pages.MASSIVE || totalSize > sizes.MEDIUM,
+      memoryWarning: totalSize > sizes.LARGE || totalPages > pages.MASSIVE,
+    };
+
+    return { analyses, recommendations };
+  }
+
+  /**
+   * Calculate the batch size for processing multiple files.
+   *
+   * @param fileCount - Number of files to process.
+   * @param totalSize - Combined size of all files in bytes.
+   * @returns A batch size that never exceeds `fileCount` (so 0 for no files).
+   */
+  private static calculateBatchSize(fileCount: number, totalSize: number): number {
+    let batchSize = fileCount; // Process all at once for smaller total sizes
+
+    if (totalSize > FileAnalyzer.SIZE_THRESHOLDS.LARGE) {
+      // Quarter-sized batches for very large totals
+      batchSize = Math.max(1, Math.floor(fileCount / 4));
+    } else if (totalSize > FileAnalyzer.SIZE_THRESHOLDS.MEDIUM) {
+      // Half-sized batches for large totals
+      batchSize = Math.max(2, Math.floor(fileCount / 2));
+    }
+
+    // The minimums above must not suggest a batch larger than the input,
+    // e.g. a single 60MB file used to yield a batch size of 2.
+    return Math.min(fileCount, batchSize);
+  }
+
+  /**
+   * Check if a file appears to be a valid PDF.
+   *
+   * The file must be declared as a PDF (MIME type or `.pdf` extension) and
+   * its content must start with the `%PDF-` signature.
+   *
+   * @returns `true` when both checks pass; `false` otherwise, including when
+   *          the file cannot be read.
+   */
+  static async isValidPDF(file: File): Promise<boolean> {
+    const declaredAsPdf =
+      file.type === 'application/pdf' || file.name.toLowerCase().endsWith('.pdf');
+    if (!declaredAsPdf) {
+      return false;
+    }
+
+    try {
+      // Only the leading signature bytes are read, never the whole file
+      const signature = '%PDF-';
+      const headerBytes = new Uint8Array(await file.slice(0, signature.length).arrayBuffer());
+
+      return String.fromCharCode(...headerBytes) === signature;
+    } catch {
+      return false;
+    }
+  }
+}