diff --git a/frontend/src/components/fileEditor/FileEditor.tsx b/frontend/src/components/fileEditor/FileEditor.tsx index 90ee6eebe..7064f297b 100644 --- a/frontend/src/components/fileEditor/FileEditor.tsx +++ b/frontend/src/components/fileEditor/FileEditor.tsx @@ -104,13 +104,22 @@ const FileEditor = ({ // Handle PDF files normally allExtractedFiles.push(file); } else if (file.type === 'application/zip' || file.type === 'application/x-zip-compressed' || file.name.toLowerCase().endsWith('.zip')) { - // Handle ZIP files - only expand if they contain PDFs + // Handle ZIP files - extract all files except HTML try { + // Check if ZIP contains HTML files - if so, don't extract + const containsHtml = await zipFileService.containsHtmlFiles(file); + + if (containsHtml) { + // HTML files should stay zipped + allExtractedFiles.push(file); + continue; + } + // Validate ZIP file first const validation = await zipFileService.validateZipFile(file); - if (validation.isValid && validation.containsPDFs) { - // ZIP contains PDFs - extract them + if (validation.isValid && validation.containsFiles) { + // ZIP contains files - extract them setZipExtractionProgress({ isExtracting: true, currentFile: file.name, @@ -119,7 +128,7 @@ const FileEditor = ({ totalFiles: validation.fileCount }); - const extractionResult = await zipFileService.extractPdfFiles(file, (progress) => { + const extractionResult = await zipFileService.extractAllFiles(file, (progress) => { setZipExtractionProgress({ isExtracting: true, currentFile: progress.currentFile, @@ -148,7 +157,7 @@ const FileEditor = ({ errors.push(`Failed to extract ZIP file "${file.name}": ${extractionResult.errors.join(', ')}`); } } else { - // ZIP doesn't contain PDFs or is invalid - treat as regular file + // ZIP is empty or invalid - treat as regular file allExtractedFiles.push(file); } } catch (zipError) { diff --git a/frontend/src/hooks/tools/shared/useToolResources.ts b/frontend/src/hooks/tools/shared/useToolResources.ts index 366730885..42db6889b 100644 --- a/frontend/src/hooks/tools/shared/useToolResources.ts +++ b/frontend/src/hooks/tools/shared/useToolResources.ts @@ -27,11 +27,11 @@ export const useToolResources = () => { // Cleanup on unmount - use ref to avoid dependency on blobUrls state const blobUrlsRef = useRef([]); - + useEffect(() => { blobUrlsRef.current = blobUrls; }, [blobUrls]); - + useEffect(() => { return () => { blobUrlsRef.current.forEach(url => { @@ -85,6 +85,15 @@ export const useToolResources = () => { const extractZipFiles = useCallback(async (zipBlob: Blob, skipAutoUnzip = false): Promise => { try { + // Check if ZIP contains HTML files - if so, keep as ZIP + const zipFile = new File([zipBlob], 'temp.zip', { type: 'application/zip' }); + const containsHtml = await zipFileService.containsHtmlFiles(zipFile); + + if (containsHtml) { + // HTML files should stay zipped + return [new File([zipBlob], 'result.zip', { type: 'application/zip' })]; + } + // Check if we should extract based on preferences const shouldExtract = await zipFileService.shouldUnzip( zipBlob, @@ -97,8 +106,7 @@ export const useToolResources = () => { return [new File([zipBlob], 'result.zip', { type: 'application/zip' })]; } - const zipFile = new File([zipBlob], 'temp.zip', { type: 'application/zip' }); - const extractionResult = await zipFileService.extractPdfFiles(zipFile); + const extractionResult = await zipFileService.extractAllFiles(zipFile); return extractionResult.success ? extractionResult.extractedFiles : []; } catch (error) { console.error('useToolResources.extractZipFiles - Error:', error); @@ -108,6 +116,15 @@ export const useToolResources = () => { const extractAllZipFiles = useCallback(async (zipBlob: Blob, skipAutoUnzip = false): Promise => { try { + // Check if ZIP contains HTML files - if so, keep as ZIP + const zipFile = new File([zipBlob], 'temp.zip', { type: 'application/zip' }); + const containsHtml = await zipFileService.containsHtmlFiles(zipFile); + + if (containsHtml) { + // HTML files should stay zipped + return [new File([zipBlob], 'result.zip', { type: 'application/zip' })]; + } + // Check if we should extract based on preferences const shouldExtract = await zipFileService.shouldUnzip( zipBlob, @@ -120,7 +137,6 @@ export const useToolResources = () => { return [new File([zipBlob], 'result.zip', { type: 'application/zip' })]; } - const zipFile = new File([zipBlob], 'temp.zip', { type: 'application/zip' }); const extractionResult = await zipFileService.extractAllFiles(zipFile); return extractionResult.success ? extractionResult.extractedFiles : []; } catch (error) { diff --git a/frontend/src/services/zipFileService.ts b/frontend/src/services/zipFileService.ts index 45ec39219..021e7a594 100644 --- a/frontend/src/services/zipFileService.ts +++ b/frontend/src/services/zipFileService.ts @@ -29,6 +29,7 @@ export interface ZipValidationResult { fileCount: number; totalSizeBytes: number; containsPDFs: boolean; + containsFiles: boolean; errors: string[]; } @@ -42,7 +43,6 @@ export interface ZipExtractionProgress { export class ZipFileService { private readonly maxFileSize = 100 * 1024 * 1024; // 100MB per file private readonly maxTotalSize = 500 * 1024 * 1024; // 500MB total extraction limit - private readonly supportedExtensions = ['.pdf']; // ZIP file validation constants private static readonly VALID_ZIP_TYPES = [ @@ -62,6 +62,7 @@ export class ZipFileService { fileCount: 0, totalSizeBytes: 0, containsPDFs: false, + containsFiles: false, errors: [] }; @@ -115,10 +116,13 @@ export class ZipFileService { result.fileCount = fileCount; result.totalSizeBytes = totalSize; result.containsPDFs = containsPDFs; - result.isValid = result.errors.length === 0 && containsPDFs; + result.containsFiles = fileCount > 0; - if (!containsPDFs) { - result.errors.push('ZIP file does not contain any PDF files'); + // ZIP is valid if it has files and no size errors + result.isValid = result.errors.length === 0 && result.containsFiles; + + if (!result.containsFiles) { + result.errors.push('ZIP file does not contain any files'); } return result; @@ -278,6 +282,37 @@ export class ZipFileService { return filename.toLowerCase().endsWith('.pdf'); } + /** + * Check if a filename indicates an HTML file + */ + private isHtmlFile(filename: string): boolean { + const lowerName = filename.toLowerCase(); + return lowerName.endsWith('.html') || lowerName.endsWith('.htm') || lowerName.endsWith('.xhtml'); + } + + /** + * Check if a ZIP file contains HTML files + * Used to determine if the ZIP should be kept intact (HTML) or extracted (other files) + */ + async containsHtmlFiles(file: Blob | File): Promise { + try { + const zip = new JSZip(); + const zipContents = await zip.loadAsync(file); + + // Check if any file is an HTML file + for (const [filename, zipEntry] of Object.entries(zipContents.files)) { + if (!zipEntry.dir && this.isHtmlFile(filename)) { + return true; + } + } + + return false; + } catch (error) { + console.error('Error checking for HTML files:', error); + return false; + } + } + /** * Validate that a file is actually a PDF by checking its header */ @@ -486,9 +521,11 @@ export class ZipFileService { } /** - * Extract PDF files from ZIP and store them in IndexedDB with preserved history metadata + * Extract all files from ZIP and store them in IndexedDB with preserved history metadata * Used by both FileManager and FileEditor to avoid code duplication * + * Note: HTML files will NOT be extracted - the ZIP is kept intact when HTML is detected + * * @param zipFile - The ZIP file to extract from * @param zipStub - The StirlingFileStub for the ZIP (contains metadata to preserve) * @returns Object with success status, extracted stubs, and any errors @@ -504,8 +541,15 @@ export class ZipFileService { }; try { - // Extract PDF files from ZIP - const extractionResult = await this.extractPdfFiles(zipFile); + // Check if ZIP contains HTML files - if so, don't extract + const hasHtml = await this.containsHtmlFiles(zipFile); + if (hasHtml) { + result.errors.push('ZIP contains HTML files and will not be auto-extracted. Download the ZIP to access the files.'); + return result; + } + + // Extract all files from ZIP (not just PDFs) + const extractionResult = await this.extractAllFiles(zipFile); if (!extractionResult.success || extractionResult.extractedFiles.length === 0) { result.errors = extractionResult.errors; @@ -515,7 +559,7 @@ export class ZipFileService { // Process each extracted file for (const extractedFile of extractionResult.extractedFiles) { try { - // Generate thumbnail + // Generate thumbnail (works for PDFs and images) const thumbnail = await generateThumbnailForFile(extractedFile); // Create StirlingFile diff --git a/frontend/src/utils/automationFileProcessor.ts b/frontend/src/utils/automationFileProcessor.ts index d81dd3a1b..4b7417177 100644 --- a/frontend/src/utils/automationFileProcessor.ts +++ b/frontend/src/utils/automationFileProcessor.ts @@ -30,6 +30,7 @@ export class AutomationFileProcessor { /** * Extract files from a ZIP blob during automation execution, with fallback for non-ZIP files + * Extracts all file types (PDFs, images, etc.) except HTML files which stay zipped */ static async extractAutomationZipFiles(blob: Blob): Promise { try { @@ -40,20 +41,26 @@ export class AutomationFileProcessor { 'application/zip' ); - const result = await zipFileService.extractPdfFiles(zipFile); - - if (!result.success || result.extractedFiles.length === 0) { - // Fallback: treat as single PDF file - const fallbackFile = ResourceManager.createTimestampedFile( - blob, - AUTOMATION_CONSTANTS.RESULT_FILE_PREFIX, - '.pdf' - ); - + // Check if ZIP contains HTML files - if so, keep as ZIP + const containsHtml = await zipFileService.containsHtmlFiles(zipFile); + if (containsHtml) { + // HTML files should stay zipped - return ZIP as-is return { success: true, - files: [fallbackFile], - errors: [`ZIP extraction failed, treated as single file: ${result.errors?.join(', ') || 'Unknown error'}`] + files: [zipFile], + errors: [] + }; + } + + // Extract all files (not just PDFs) - handles images from scanner-image-split, etc. + const result = await zipFileService.extractAllFiles(zipFile); + + if (!result.success || result.extractedFiles.length === 0) { + // Fallback: keep as ZIP file (might be valid ZIP with extraction issues) + return { + success: true, + files: [zipFile], + errors: [`ZIP extraction failed, kept as ZIP: ${result.errors?.join(', ') || 'Unknown error'}`] }; } @@ -63,18 +70,19 @@ export class AutomationFileProcessor { errors: [] }; } catch (error) { - console.warn('Failed to extract automation ZIP files, falling back to single file:', error); - // Fallback: treat as single PDF file + console.warn('Failed to extract automation ZIP files, keeping as ZIP:', error); + // Fallback: keep as ZIP file for next automation step to handle const fallbackFile = ResourceManager.createTimestampedFile( blob, - AUTOMATION_CONSTANTS.RESULT_FILE_PREFIX, - '.pdf' + AUTOMATION_CONSTANTS.RESPONSE_ZIP_PREFIX, + '.zip', + 'application/zip' ); return { success: true, files: [fallbackFile], - errors: [`ZIP extraction failed, treated as single file: ${error}`] + errors: [`ZIP extraction failed, kept as ZIP: ${error}`] }; } }