Zip file changes

2026-04-22 23:08:53 +02:00 · 2025-10-10 15:33:33 +01:00
parent b695e3900e
commit b2f0efdcd7
4 changed files with 112 additions and 35 deletions
--- a/frontend/src/components/fileEditor/FileEditor.tsx
+++ b/frontend/src/components/fileEditor/FileEditor.tsx
@@ -104,13 +104,22 @@ const FileEditor = ({
          // Handle PDF files normally
          allExtractedFiles.push(file);
        } else if (file.type === 'application/zip' || file.type === 'application/x-zip-compressed' || file.name.toLowerCase().endsWith('.zip')) {
-          // Handle ZIP files - only expand if they contain PDFs
+          // Handle ZIP files - extract all contents, unless the ZIP contains any HTML file (then keep it zipped)
          try {
+            // Check if ZIP contains HTML files - if so, don't extract
+            const containsHtml = await zipFileService.containsHtmlFiles(file);
+
+            if (containsHtml) {
+              // HTML files should stay zipped
+              allExtractedFiles.push(file);
+              continue;
+            }
+
            // Validate ZIP file first
            const validation = await zipFileService.validateZipFile(file);

-            if (validation.isValid && validation.containsPDFs) {
-              // ZIP contains PDFs - extract them
+            if (validation.isValid && validation.containsFiles) {
+              // ZIP contains files - extract them
              setZipExtractionProgress({
                isExtracting: true,
                currentFile: file.name,
@@ -119,7 +128,7 @@ const FileEditor = ({
                totalFiles: validation.fileCount
              });

-              const extractionResult = await zipFileService.extractPdfFiles(file, (progress) => {
+              const extractionResult = await zipFileService.extractAllFiles(file, (progress) => {
                setZipExtractionProgress({
                  isExtracting: true,
                  currentFile: progress.currentFile,
@@ -148,7 +157,7 @@ const FileEditor = ({
                errors.push(`Failed to extract ZIP file "${file.name}": ${extractionResult.errors.join(', ')}`);
              }
            } else {
-              // ZIP doesn't contain PDFs or is invalid - treat as regular file
+              // ZIP is empty or invalid - treat as regular file
              allExtractedFiles.push(file);
            }
          } catch (zipError) {
--- a/frontend/src/hooks/tools/shared/useToolResources.ts
+++ b/frontend/src/hooks/tools/shared/useToolResources.ts
@@ -27,11 +27,11 @@ export const useToolResources = () => {

  // Cleanup on unmount - use ref to avoid dependency on blobUrls state
  const blobUrlsRef = useRef<string[]>([]);
-  
+
  useEffect(() => {
    blobUrlsRef.current = blobUrls;
  }, [blobUrls]);
-  
+
  useEffect(() => {
    return () => {
      blobUrlsRef.current.forEach(url => {
@@ -85,6 +85,15 @@ export const useToolResources = () => {

  const extractZipFiles = useCallback(async (zipBlob: Blob, skipAutoUnzip = false): Promise<File[]> => {
    try {
+      // Check if ZIP contains HTML files - if so, keep as ZIP
+      const zipFile = new File([zipBlob], 'temp.zip', { type: 'application/zip' });
+      const containsHtml = await zipFileService.containsHtmlFiles(zipFile);
+
+      if (containsHtml) {
+        // HTML files should stay zipped
+        return [new File([zipBlob], 'result.zip', { type: 'application/zip' })];
+      }
+
      // Check if we should extract based on preferences
      const shouldExtract = await zipFileService.shouldUnzip(
        zipBlob,
@@ -97,8 +106,7 @@ export const useToolResources = () => {
        return [new File([zipBlob], 'result.zip', { type: 'application/zip' })];
      }

-      const zipFile = new File([zipBlob], 'temp.zip', { type: 'application/zip' });
-      const extractionResult = await zipFileService.extractPdfFiles(zipFile);
+      const extractionResult = await zipFileService.extractAllFiles(zipFile);
      return extractionResult.success ? extractionResult.extractedFiles : [];
    } catch (error) {
      console.error('useToolResources.extractZipFiles - Error:', error);
@@ -108,6 +116,15 @@ export const useToolResources = () => {

  const extractAllZipFiles = useCallback(async (zipBlob: Blob, skipAutoUnzip = false): Promise<File[]> => {
    try {
+      // Check if ZIP contains HTML files - if so, keep as ZIP
+      const zipFile = new File([zipBlob], 'temp.zip', { type: 'application/zip' });
+      const containsHtml = await zipFileService.containsHtmlFiles(zipFile);
+
+      if (containsHtml) {
+        // HTML files should stay zipped
+        return [new File([zipBlob], 'result.zip', { type: 'application/zip' })];
+      }
+
      // Check if we should extract based on preferences
      const shouldExtract = await zipFileService.shouldUnzip(
        zipBlob,
@@ -120,7 +137,6 @@ export const useToolResources = () => {
        return [new File([zipBlob], 'result.zip', { type: 'application/zip' })];
      }

-      const zipFile = new File([zipBlob], 'temp.zip', { type: 'application/zip' });
      const extractionResult = await zipFileService.extractAllFiles(zipFile);
      return extractionResult.success ? extractionResult.extractedFiles : [];
    } catch (error) {
--- a/frontend/src/services/zipFileService.ts
+++ b/frontend/src/services/zipFileService.ts
@@ -29,6 +29,7 @@ export interface ZipValidationResult {
  fileCount: number;
  totalSizeBytes: number;
  containsPDFs: boolean;
+  containsFiles: boolean;
  errors: string[];
 }

@@ -42,7 +43,6 @@ export interface ZipExtractionProgress {
 export class ZipFileService {
  private readonly maxFileSize = 100 * 1024 * 1024; // 100MB per file
  private readonly maxTotalSize = 500 * 1024 * 1024; // 500MB total extraction limit
-  private readonly supportedExtensions = ['.pdf'];

  // ZIP file validation constants
  private static readonly VALID_ZIP_TYPES = [
@@ -62,6 +62,7 @@ export class ZipFileService {
      fileCount: 0,
      totalSizeBytes: 0,
      containsPDFs: false,
+      containsFiles: false,
      errors: []
    };

@@ -115,10 +116,13 @@ export class ZipFileService {
      result.fileCount = fileCount;
      result.totalSizeBytes = totalSize;
      result.containsPDFs = containsPDFs;
-      result.isValid = result.errors.length === 0 && containsPDFs;
+      result.containsFiles = fileCount > 0;

-      if (!containsPDFs) {
-        result.errors.push('ZIP file does not contain any PDF files');
+      // ZIP is valid if it has files and no size errors
+      result.isValid = result.errors.length === 0 && result.containsFiles;
+
+      if (!result.containsFiles) {
+        result.errors.push('ZIP file does not contain any files');
      }

      return result;
@@ -278,6 +282,37 @@ export class ZipFileService {
    return filename.toLowerCase().endsWith('.pdf');
  }

+  /**
+   * Check if a filename indicates an HTML file
+   */
+  private isHtmlFile(filename: string): boolean {
+    const normalized = filename.toLowerCase();
+    return ['.html', '.htm', '.xhtml'].some((extension) => normalized.endsWith(extension));
+  }
+
+  /**
+   * Check if a ZIP file contains HTML files
+   * Used to determine if the ZIP should be kept intact (HTML) or extracted (other files)
+   */
+  async containsHtmlFiles(file: Blob | File): Promise<boolean> {
+    try {
+      // Load the archive so its entry table can be inspected
+      const archive = await new JSZip().loadAsync(file);
+
+      // One non-directory entry with an HTML extension is enough; stop at the first hit
+      const hasHtmlEntry = Object.entries(archive.files).some(
+        ([entryName, entry]) => !entry.dir && this.isHtmlFile(entryName)
+      );
+
+      return hasHtmlEntry;
+    } catch (error) {
+      // A ZIP that cannot be read is logged and reported as containing no HTML,
+      // so the caller proceeds with its own validation/extraction handling
+      console.error('Error checking for HTML files:', error);
+      return false;
+    }
+  }
+
  /**
   * Validate that a file is actually a PDF by checking its header
   */
@@ -486,9 +521,11 @@ export class ZipFileService {
  }

  /**
-   * Extract PDF files from ZIP and store them in IndexedDB with preserved history metadata
+   * Extract all files from ZIP and store them in IndexedDB with preserved history metadata
   * Used by both FileManager and FileEditor to avoid code duplication
   *
+   * Note: if the ZIP contains any HTML file, nothing is extracted and an explanatory error is returned
+   *
   * @param zipFile - The ZIP file to extract from
   * @param zipStub - The StirlingFileStub for the ZIP (contains metadata to preserve)
   * @returns Object with success status, extracted stubs, and any errors
@@ -504,8 +541,15 @@ export class ZipFileService {
    };

    try {
-      // Extract PDF files from ZIP
-      const extractionResult = await this.extractPdfFiles(zipFile);
+      // Check if ZIP contains HTML files - if so, don't extract
+      const hasHtml = await this.containsHtmlFiles(zipFile);
+      if (hasHtml) {
+        result.errors.push('ZIP contains HTML files and will not be auto-extracted. Download the ZIP to access the files.');
+        return result;
+      }
+
+      // Extract all files from ZIP (not just PDFs)
+      const extractionResult = await this.extractAllFiles(zipFile);

      if (!extractionResult.success || extractionResult.extractedFiles.length === 0) {
        result.errors = extractionResult.errors;
@@ -515,7 +559,7 @@ export class ZipFileService {
      // Process each extracted file
      for (const extractedFile of extractionResult.extractedFiles) {
        try {
-          // Generate thumbnail
+          // Generate thumbnail (works for PDFs and images)
          const thumbnail = await generateThumbnailForFile(extractedFile);

          // Create StirlingFile
--- a/frontend/src/utils/automationFileProcessor.ts
+++ b/frontend/src/utils/automationFileProcessor.ts
@@ -30,6 +30,7 @@ export class AutomationFileProcessor {

  /**
   * Extract files from a ZIP blob during automation execution, with fallback for non-ZIP files
+   * Extracts all file types (PDFs, images, etc.); a ZIP containing any HTML file is returned unextracted
   */
  static async extractAutomationZipFiles(blob: Blob): Promise<AutomationProcessingResult> {
    try {
@@ -40,20 +41,26 @@ export class AutomationFileProcessor {
        'application/zip'
      );

-      const result = await zipFileService.extractPdfFiles(zipFile);
-
-      if (!result.success || result.extractedFiles.length === 0) {
-        // Fallback: treat as single PDF file
-        const fallbackFile = ResourceManager.createTimestampedFile(
-          blob,
-          AUTOMATION_CONSTANTS.RESULT_FILE_PREFIX,
-          '.pdf'
-        );
-
+      // Check if ZIP contains HTML files - if so, keep as ZIP
+      const containsHtml = await zipFileService.containsHtmlFiles(zipFile);
+      if (containsHtml) {
+        // HTML files should stay zipped - return ZIP as-is
        return {
          success: true,
-          files: [fallbackFile],
-          errors: [`ZIP extraction failed, treated as single file: ${result.errors?.join(', ') || 'Unknown error'}`]
+          files: [zipFile],
+          errors: []
+        };
+      }
+
+      // Extract all files (not just PDFs) - handles images from scanner-image-split, etc.
+      const result = await zipFileService.extractAllFiles(zipFile);
+
+      if (!result.success || result.extractedFiles.length === 0) {
+        // Fallback: keep as ZIP file (might be valid ZIP with extraction issues)
+        return {
+          success: true,
+          files: [zipFile],
+          errors: [`ZIP extraction failed, kept as ZIP: ${result.errors?.join(', ') || 'Unknown error'}`]
        };
      }

@@ -63,18 +70,19 @@ export class AutomationFileProcessor {
        errors: []
      };
    } catch (error) {
-      console.warn('Failed to extract automation ZIP files, falling back to single file:', error);
-      // Fallback: treat as single PDF file
+      console.warn('Failed to extract automation ZIP files, keeping as ZIP:', error);
+      // Fallback: keep as ZIP file for next automation step to handle
      const fallbackFile = ResourceManager.createTimestampedFile(
        blob,
-        AUTOMATION_CONSTANTS.RESULT_FILE_PREFIX,
-        '.pdf'
+        AUTOMATION_CONSTANTS.RESPONSE_ZIP_PREFIX,
+        '.zip',
+        'application/zip'
      );

      return {
        success: true,
        files: [fallbackFile],
-        errors: [`ZIP extraction failed, treated as single file: ${error}`]
+        errors: [`ZIP extraction failed, kept as ZIP: ${error}`]
      };
    }
  }