mirror of
https://github.com/Frooodle/Stirling-PDF.git
synced 2026-03-13 02:18:16 +01:00
Stirling 2.0 (#3928)
# Description of Changes <!-- File context for managing files between tools and views Optimisation for large files Updated Split to work with new file system and match Matts stepped design closer --> --- ## Checklist ### General - [ ] I have read the [Contribution Guidelines](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/CONTRIBUTING.md) - [ ] I have read the [Stirling-PDF Developer Guide](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/devGuide/DeveloperGuide.md) (if applicable) - [ ] I have read the [How to add new languages to Stirling-PDF](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/devGuide/HowToAddNewLanguage.md) (if applicable) - [ ] I have performed a self-review of my own code - [ ] My changes generate no new warnings ### Documentation - [ ] I have updated relevant docs on [Stirling-PDF's doc repo](https://github.com/Stirling-Tools/Stirling-Tools.github.io/blob/main/docs/) (if functionality has heavily changed) - [ ] I have read the section [Add New Translation Tags](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/devGuide/HowToAddNewLanguage.md#add-new-translation-tags) (for new translation tags only) ### UI Changes (if applicable) - [ ] Screenshots or videos demonstrating the UI changes are attached (e.g., as comments or direct attachments in the PR) ### Testing (if applicable) - [ ] I have tested my changes locally. Refer to the [Testing Guide](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/devGuide/DeveloperGuide.md#6-testing) for more details. --------- Co-authored-by: Anthony Stirling <77850077+Frooodle@users.noreply.github.com>
This commit is contained in:
240
frontend/src/services/fileAnalyzer.ts
Normal file
240
frontend/src/services/fileAnalyzer.ts
Normal file
@@ -0,0 +1,240 @@
|
||||
import { getDocument } from 'pdfjs-dist';
|
||||
import { FileAnalysis, ProcessingStrategy } from '../types/processing';
|
||||
|
||||
export class FileAnalyzer {
|
||||
private static readonly SIZE_THRESHOLDS = {
|
||||
SMALL: 10 * 1024 * 1024, // 10MB
|
||||
MEDIUM: 50 * 1024 * 1024, // 50MB
|
||||
LARGE: 200 * 1024 * 1024, // 200MB
|
||||
};
|
||||
|
||||
private static readonly PAGE_THRESHOLDS = {
|
||||
FEW: 10, // < 10 pages - immediate full processing
|
||||
MANY: 50, // < 50 pages - priority pages
|
||||
MASSIVE: 100, // < 100 pages - progressive chunked
|
||||
// >100 pages = metadata only
|
||||
};
|
||||
|
||||
/**
|
||||
* Analyze a file to determine optimal processing strategy
|
||||
*/
|
||||
static async analyzeFile(file: File): Promise<FileAnalysis> {
|
||||
const analysis: FileAnalysis = {
|
||||
fileSize: file.size,
|
||||
isEncrypted: false,
|
||||
isCorrupted: false,
|
||||
recommendedStrategy: 'metadata_only',
|
||||
estimatedProcessingTime: 0,
|
||||
};
|
||||
|
||||
try {
|
||||
// Quick validation and page count estimation
|
||||
const quickAnalysis = await this.quickPDFAnalysis(file);
|
||||
analysis.estimatedPageCount = quickAnalysis.pageCount;
|
||||
analysis.isEncrypted = quickAnalysis.isEncrypted;
|
||||
analysis.isCorrupted = quickAnalysis.isCorrupted;
|
||||
|
||||
// Determine strategy based on file characteristics
|
||||
analysis.recommendedStrategy = this.determineStrategy(file.size, quickAnalysis.pageCount);
|
||||
|
||||
// Estimate processing time
|
||||
analysis.estimatedProcessingTime = this.estimateProcessingTime(
|
||||
file.size,
|
||||
quickAnalysis.pageCount,
|
||||
analysis.recommendedStrategy
|
||||
);
|
||||
|
||||
} catch (error) {
|
||||
console.error('File analysis failed:', error);
|
||||
analysis.isCorrupted = true;
|
||||
analysis.recommendedStrategy = 'metadata_only';
|
||||
}
|
||||
|
||||
return analysis;
|
||||
}
|
||||
|
||||
/**
|
||||
* Quick PDF analysis without full processing
|
||||
*/
|
||||
private static async quickPDFAnalysis(file: File): Promise<{
|
||||
pageCount: number;
|
||||
isEncrypted: boolean;
|
||||
isCorrupted: boolean;
|
||||
}> {
|
||||
try {
|
||||
// For small files, read the whole file
|
||||
// For large files, try the whole file first (PDF.js needs the complete structure)
|
||||
const arrayBuffer = await file.arrayBuffer();
|
||||
|
||||
const pdf = await getDocument({
|
||||
data: arrayBuffer,
|
||||
stopAtErrors: false, // Don't stop at minor errors
|
||||
verbosity: 0 // Suppress PDF.js warnings
|
||||
}).promise;
|
||||
|
||||
const pageCount = pdf.numPages;
|
||||
const isEncrypted = pdf.isEncrypted;
|
||||
|
||||
// Clean up
|
||||
pdf.destroy();
|
||||
|
||||
return {
|
||||
pageCount,
|
||||
isEncrypted,
|
||||
isCorrupted: false
|
||||
};
|
||||
|
||||
} catch (error) {
|
||||
// Try to determine if it's corruption vs encryption
|
||||
const errorMessage = error instanceof Error ? error.message.toLowerCase() : '';
|
||||
const isEncrypted = errorMessage.includes('password') || errorMessage.includes('encrypted');
|
||||
|
||||
return {
|
||||
pageCount: 0,
|
||||
isEncrypted,
|
||||
isCorrupted: !isEncrypted // If not encrypted, probably corrupted
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Determine the best processing strategy based on file characteristics
|
||||
*/
|
||||
private static determineStrategy(fileSize: number, pageCount?: number): ProcessingStrategy {
|
||||
// Handle corrupted or encrypted files
|
||||
if (!pageCount || pageCount === 0) {
|
||||
return 'metadata_only';
|
||||
}
|
||||
|
||||
// Small files with few pages - process everything immediately
|
||||
if (fileSize <= this.SIZE_THRESHOLDS.SMALL && pageCount <= this.PAGE_THRESHOLDS.FEW) {
|
||||
return 'immediate_full';
|
||||
}
|
||||
|
||||
// Medium files or many pages - priority pages first, then progressive
|
||||
if (fileSize <= this.SIZE_THRESHOLDS.MEDIUM && pageCount <= this.PAGE_THRESHOLDS.MANY) {
|
||||
return 'priority_pages';
|
||||
}
|
||||
|
||||
// Large files or massive page counts - chunked processing
|
||||
if (fileSize <= this.SIZE_THRESHOLDS.LARGE && pageCount <= this.PAGE_THRESHOLDS.MASSIVE) {
|
||||
return 'progressive_chunked';
|
||||
}
|
||||
|
||||
// Very large files - metadata only
|
||||
return 'metadata_only';
|
||||
}
|
||||
|
||||
/**
|
||||
* Estimate processing time based on file characteristics and strategy
|
||||
*/
|
||||
private static estimateProcessingTime(
|
||||
fileSize: number,
|
||||
pageCount: number = 0,
|
||||
strategy: ProcessingStrategy
|
||||
): number {
|
||||
const baseTimes = {
|
||||
immediate_full: 200, // 200ms per page
|
||||
priority_pages: 150, // 150ms per page (optimized)
|
||||
progressive_chunked: 100, // 100ms per page (chunked)
|
||||
metadata_only: 50 // 50ms total
|
||||
};
|
||||
|
||||
const baseTime = baseTimes[strategy];
|
||||
|
||||
switch (strategy) {
|
||||
case 'metadata_only':
|
||||
return baseTime;
|
||||
|
||||
case 'immediate_full':
|
||||
return pageCount * baseTime;
|
||||
|
||||
case 'priority_pages':
|
||||
// Estimate time for priority pages (first 10)
|
||||
const priorityPages = Math.min(pageCount, 10);
|
||||
return priorityPages * baseTime;
|
||||
|
||||
case 'progressive_chunked':
|
||||
// Estimate time for first chunk (20 pages)
|
||||
const firstChunk = Math.min(pageCount, 20);
|
||||
return firstChunk * baseTime;
|
||||
|
||||
default:
|
||||
return pageCount * baseTime;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Get processing recommendations for a set of files
|
||||
*/
|
||||
static async analyzeMultipleFiles(files: File[]): Promise<{
|
||||
analyses: Map<File, FileAnalysis>;
|
||||
recommendations: {
|
||||
totalEstimatedTime: number;
|
||||
suggestedBatchSize: number;
|
||||
shouldUseWebWorker: boolean;
|
||||
memoryWarning: boolean;
|
||||
};
|
||||
}> {
|
||||
const analyses = new Map<File, FileAnalysis>();
|
||||
let totalEstimatedTime = 0;
|
||||
let totalSize = 0;
|
||||
let totalPages = 0;
|
||||
|
||||
// Analyze each file
|
||||
for (const file of files) {
|
||||
const analysis = await this.analyzeFile(file);
|
||||
analyses.set(file, analysis);
|
||||
totalEstimatedTime += analysis.estimatedProcessingTime;
|
||||
totalSize += file.size;
|
||||
totalPages += analysis.estimatedPageCount || 0;
|
||||
}
|
||||
|
||||
// Generate recommendations
|
||||
const recommendations = {
|
||||
totalEstimatedTime,
|
||||
suggestedBatchSize: this.calculateBatchSize(files.length, totalSize),
|
||||
shouldUseWebWorker: totalPages > 100 || totalSize > this.SIZE_THRESHOLDS.MEDIUM,
|
||||
memoryWarning: totalSize > this.SIZE_THRESHOLDS.LARGE || totalPages > this.PAGE_THRESHOLDS.MASSIVE
|
||||
};
|
||||
|
||||
return { analyses, recommendations };
|
||||
}
|
||||
|
||||
/**
|
||||
* Calculate optimal batch size for processing multiple files
|
||||
*/
|
||||
private static calculateBatchSize(fileCount: number, totalSize: number): number {
|
||||
// Process small batches for large total sizes
|
||||
if (totalSize > this.SIZE_THRESHOLDS.LARGE) {
|
||||
return Math.max(1, Math.floor(fileCount / 4));
|
||||
}
|
||||
|
||||
if (totalSize > this.SIZE_THRESHOLDS.MEDIUM) {
|
||||
return Math.max(2, Math.floor(fileCount / 2));
|
||||
}
|
||||
|
||||
// Process all at once for smaller total sizes
|
||||
return fileCount;
|
||||
}
|
||||
|
||||
/**
|
||||
* Check if a file appears to be a valid PDF
|
||||
*/
|
||||
static async isValidPDF(file: File): Promise<boolean> {
|
||||
if (file.type !== 'application/pdf' && !file.name.toLowerCase().endsWith('.pdf')) {
|
||||
return false;
|
||||
}
|
||||
|
||||
try {
|
||||
// Read first few bytes to check PDF header
|
||||
const header = file.slice(0, 8);
|
||||
const headerBytes = new Uint8Array(await header.arrayBuffer());
|
||||
const headerString = String.fromCharCode(...headerBytes);
|
||||
|
||||
return headerString.startsWith('%PDF-');
|
||||
} catch (error) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user