Fix/V2/unzip_images (#4647)

Method Usage by Context

| Context | Method Used | Respects Preferences | HTML Detection |
|---|---|---|---|
| Tools (via useToolResources) | extractZipFiles() → extractWithPreferences() | ✅ Yes | ✅ Yes |
| Automation | extractAutomationZipFiles() → extractAllFiles() | ❌ No (always extracts) | ✅ Yes |
| Manual Unzip | extractAndStoreFilesWithHistory() → extractAllFiles() | ❌ No (always extracts) | ✅ Yes |
| Auto-Upload | extractAllFiles() directly | ❌ No (always extracts) | ✅ Yes |

Detailed Behavior Matrix

| Context | HTML Files | Auto-Unzip OFF | Within Limit | Exceeds Limit | Notes |
|---|---|---|---|---|---|
| Tools (useToolResources) | Keep zipped | Keep zipped | Extract all | Keep zipped | Respects user preferences |
| Automation | Keep zipped | Extract all | Extract all | Extract all | Ignores preferences (automation needs) |
| Manual Unzip | Keep zipped | Extract all | Extract all | Extract all | User explicitly unzipping |
| Auto-Upload | Keep zipped | Extract all | Extract all | Extract all | User dropped files |

Simplified Decision Flow

```
ZIP File Received
│
├─ Contains HTML? → Keep as ZIP (all contexts)
│
└─ No HTML
   │
   ├─ Tools Context
   │  ├─ Auto-unzip OFF? → Keep as ZIP
   │  └─ Auto-unzip ON
   │     ├─ File count ≤ limit? → Extract all
   │     └─ File count > limit? → Keep as ZIP
   │
   └─ Automation/Manual/Auto-Upload
      └─ Extract all (ignore preferences)
```

Key Changes from Previous Version

| Entry Point | Code Path | skipAutoUnzip | Respects Preferences? | HTML Detection? | Extraction Behavior |
|---|---|---|---|---|---|
| Direct File Upload (FileEditor, LandingPage) | FileContext.addRawFiles() → fileActions.addFiles() | true | ❌ No | ✅ Yes | Always extract (except HTML ZIPs) |
| Tool Outputs (Split, Merge, etc.) | useToolResources.extractZipFiles() → zipFileService.extractWithPreferences() | false | ✅ Yes | ✅ Yes | Conditional: Only if autoUnzip=true AND file count ≤ autoUnzipFileLimit |
| Load from Storage (FileManager) | fileActions.addStirlingFileStubs() | N/A | N/A | N/A | No extraction - files already processed |
| Automation Outputs | AutomationFileProcessor.extractAutomationZipFiles() → zipFileService.extractAllFiles() | N/A | ❌ No | ✅ Yes | Always extract (except HTML ZIPs) |
| Manual Unzip Action (FileEditor context menu) | zipFileService.extractAndStoreFilesWithHistory() → extractAllFiles() | N/A | ❌ No | ✅ Yes (blocks extraction) | Always extract (except HTML ZIPs) - explicit user action |

---------

Co-authored-by: Connor Yoh <connor@stirlingpdf.com>
2025-12-18 20:04:17 +01:00 · 2025-10-15 15:17:44 +01:00 · 2025-10-15 15:17:44 +01:00 · 43887c8179
commit 43887c8179
parent bcd7762594
9 changed files with 239 additions and 241 deletions
--- a/frontend/src/components/fileEditor/FileEditor.tsx
+++ b/frontend/src/components/fileEditor/FileEditor.tsx
@ -1,6 +1,6 @@
 import React, { useState, useCallback, useRef, useMemo, useEffect } from 'react';
 import {
-  Text, Center, Box, LoadingOverlay, Stack, Group
+  Text, Center, Box, LoadingOverlay, Stack
 } from '@mantine/core';
 import { Dropzone } from '@mantine/dropzone';
 import { useFileSelection, useFileState, useFileManagement, useFileActions } from '../../contexts/FileContext';
@ -10,7 +10,6 @@ import { detectFileExtension } from '../../utils/fileUtils';
 import FileEditorThumbnail from './FileEditorThumbnail';
 import AddFileCard from './AddFileCard';
 import FilePickerModal from '../shared/FilePickerModal';
-import SkeletonLoader from '../shared/SkeletonLoader';
 import { FileId, StirlingFile } from '../../types/fileContext';
 import { alert } from '../toast';
 import { downloadBlob } from '../../utils/downloadUtils';
@ -68,19 +67,6 @@ const FileEditor = ({
    }
  }, [toolMode]);
  const [showFilePickerModal, setShowFilePickerModal] = useState(false);
-  const [zipExtractionProgress, setZipExtractionProgress] = useState<{
-    isExtracting: boolean;
-    currentFile: string;
-    progress: number;
-    extractedCount: number;
-    totalFiles: number;
-  }>({
-    isExtracting: false,
-    currentFile: '',
-    progress: 0,
-    extractedCount: 0,
-    totalFiles: 0
-  });
  // Get selected file IDs from context (defensive programming)
  const contextSelectedIds = Array.isArray(selectedFileIds) ? selectedFileIds : [];

@ -92,106 +78,26 @@ const FileEditor = ({
  const localSelectedIds = contextSelectedIds;

  // Process uploaded files using context
+  // ZIP extraction is now handled in FileContext: direct uploads are always extracted (except HTML ZIPs)
  const handleFileUpload = useCallback(async (uploadedFiles: File[]) => {
    _setError(null);

    try {
-      const allExtractedFiles: File[] = [];
-      const errors: string[] = [];
-
-      for (const file of uploadedFiles) {
-        if (file.type === 'application/pdf') {
-          // Handle PDF files normally
-          allExtractedFiles.push(file);
-        } else if (file.type === 'application/zip' || file.type === 'application/x-zip-compressed' || file.name.toLowerCase().endsWith('.zip')) {
-          // Handle ZIP files - only expand if they contain PDFs
-          try {
-            // Validate ZIP file first
-            const validation = await zipFileService.validateZipFile(file);
-
-            if (validation.isValid && validation.containsPDFs) {
-              // ZIP contains PDFs - extract them
-              setZipExtractionProgress({
-                isExtracting: true,
-                currentFile: file.name,
-                progress: 0,
-                extractedCount: 0,
-                totalFiles: validation.fileCount
-              });
-
-              const extractionResult = await zipFileService.extractPdfFiles(file, (progress) => {
-                setZipExtractionProgress({
-                  isExtracting: true,
-                  currentFile: progress.currentFile,
-                  progress: progress.progress,
-                  extractedCount: progress.extractedCount,
-                  totalFiles: progress.totalFiles
-                });
-              });
-
-              // Reset extraction progress
-              setZipExtractionProgress({
-                isExtracting: false,
-                currentFile: '',
-                progress: 0,
-                extractedCount: 0,
-                totalFiles: 0
-              });
-
-              if (extractionResult.success) {
-                allExtractedFiles.push(...extractionResult.extractedFiles);
-
-                if (extractionResult.errors.length > 0) {
-                  errors.push(...extractionResult.errors);
-                }
-              } else {
-                errors.push(`Failed to extract ZIP file "${file.name}": ${extractionResult.errors.join(', ')}`);
-              }
-            } else {
-              // ZIP doesn't contain PDFs or is invalid - treat as regular file
-              allExtractedFiles.push(file);
-            }
-          } catch (zipError) {
-            errors.push(`Failed to process ZIP file "${file.name}": ${zipError instanceof Error ? zipError.message : 'Unknown error'}`);
-            setZipExtractionProgress({
-              isExtracting: false,
-              currentFile: '',
-              progress: 0,
-              extractedCount: 0,
-              totalFiles: 0
-            });
-          }
-        } else {
-          allExtractedFiles.push(file);
-        }
-      }
-
-      // Show any errors
-      if (errors.length > 0) {
-        showError(errors.join('\n'));
-      }
-
-      // Process all extracted files
-      if (allExtractedFiles.length > 0) {
-        // Add files to context and select them automatically
-        await addFiles(allExtractedFiles, { selectFiles: true });
-        showStatus(`Added ${allExtractedFiles.length} files`, 'success');
+      if (uploadedFiles.length > 0) {
+        // FileContext handles ZIP extraction for direct uploads (addRawFiles forces skipAutoUnzip: true):
+        // - ZIPs are always extracted, regardless of the autoUnzip setting
+        // - autoUnzipFileLimit is not applied to uploads
+        // - HTML ZIPs stay intact
+        // - Non-ZIP files pass through unchanged
+        await addFiles(uploadedFiles, { selectFiles: true });
+        showStatus(`Added ${uploadedFiles.length} file(s)`, 'success');
      }
    } catch (err) {
      const errorMessage = err instanceof Error ? err.message : 'Failed to process files';
      showError(errorMessage);
      console.error('File processing error:', err);
-
-      // Reset extraction progress on error
-      setZipExtractionProgress({
-        isExtracting: false,
-        currentFile: '',
-        progress: 0,
-        extractedCount: 0,
-        totalFiles: 0
-      });
    }
-  }, [addFiles]);
+  }, [addFiles, showStatus, showError]);

  const toggleFile = useCallback((fileId: FileId) => {
    const currentSelectedIds = contextSelectedIdsRef.current;
@ -394,7 +300,7 @@ const FileEditor = ({
        <Box p="md">


-        {activeStirlingFileStubs.length === 0 && !zipExtractionProgress.isExtracting ? (
+        {activeStirlingFileStubs.length === 0 ? (
          <Center h="60vh">
            <Stack align="center" gap="md">
              <Text size="lg" c="dimmed">📁</Text>
@ -402,43 +308,6 @@ const FileEditor = ({
              <Text size="sm" c="dimmed">Upload PDF files, ZIP archives, or load from storage to get started</Text>
            </Stack>
          </Center>
-        ) : activeStirlingFileStubs.length === 0 && zipExtractionProgress.isExtracting ? (
-          <Box>
-            <SkeletonLoader type="controls" />
-
-            {/* ZIP Extraction Progress */}
-            {zipExtractionProgress.isExtracting && (
-              <Box mb="md" p="sm" style={{ backgroundColor: 'var(--mantine-color-orange-0)', borderRadius: 8 }}>
-                <Group justify="space-between" mb="xs">
-                  <Text size="sm" fw={500}>Extracting ZIP archive...</Text>
-                  <Text size="sm" c="dimmed">{Math.round(zipExtractionProgress.progress)}%</Text>
-                </Group>
-                <Text size="xs" c="dimmed" mb="xs">
-                  {zipExtractionProgress.currentFile || 'Processing files...'}
-                </Text>
-                <Text size="xs" c="dimmed" mb="xs">
-                  {zipExtractionProgress.extractedCount} of {zipExtractionProgress.totalFiles} files extracted
-                </Text>
-                <div style={{
-                  width: '100%',
-                  height: '4px',
-                  backgroundColor: 'var(--mantine-color-gray-2)',
-                  borderRadius: '2px',
-                  overflow: 'hidden'
-                }}>
-                  <div style={{
-                    width: `${Math.round(zipExtractionProgress.progress)}%`,
-                    height: '100%',
-                    backgroundColor: 'var(--mantine-color-orange-6)',
-                    transition: 'width 0.3s ease'
-                  }} />
-                </div>
-              </Box>
-            )}
-
-
-            <SkeletonLoader type="fileGrid" count={6} />
-          </Box>
        ) : (
          <div
            style={{
--- a/frontend/src/contexts/FileContext.tsx
+++ b/frontend/src/contexts/FileContext.tsx
@ -79,8 +79,21 @@ function FileContextInner({
  };

  // File operations using unified addFiles helper with persistence
-  const addRawFiles = useCallback(async (files: File[], options?: { insertAfterPageId?: string; selectFiles?: boolean }): Promise<StirlingFile[]> => {
-    const stirlingFiles = await addFiles({ files, ...options }, stateRef, filesRef, dispatch, lifecycleManager, enablePersistence);
+  const addRawFiles = useCallback(async (files: File[], options?: { insertAfterPageId?: string; selectFiles?: boolean; skipAutoUnzip?: boolean }): Promise<StirlingFile[]> => {
+    const stirlingFiles = await addFiles(
+      {
+        files,
+        ...options,
+        // For direct file uploads: ALWAYS unzip (except HTML ZIPs)
+        // skipAutoUnzip bypasses preference checks - HTML detection still applies
+        skipAutoUnzip: true
+      },
+      stateRef,
+      filesRef,
+      dispatch,
+      lifecycleManager,
+      enablePersistence
+    );

    // Auto-select the newly added files if requested
    if (options?.selectFiles && stirlingFiles.length > 0) {
--- a/frontend/src/contexts/file/fileActions.ts
+++ b/frontend/src/contexts/file/fileActions.ts
@ -18,6 +18,7 @@ import { FileLifecycleManager } from './lifecycle';
 import { buildQuickKeySet } from './fileSelectors';
 import { StirlingFile } from '../../types/fileContext';
 import { fileStorage } from '../../services/fileStorage';
+import { zipFileService } from '../../services/zipFileService';
 const DEBUG = process.env.NODE_ENV === 'development';

 /**
@ -172,6 +173,11 @@ interface AddFileOptions {

  // Auto-selection after adding
  selectFiles?: boolean;
+
+  // Auto-unzip control
+  autoUnzip?: boolean;
+  autoUnzipFileLimit?: number;
+  skipAutoUnzip?: boolean; // When true: always unzip (except HTML). Used for file uploads. When false: respect autoUnzip/autoUnzipFileLimit preferences. Used for tool outputs.
 }

 /**
@ -198,7 +204,58 @@ export async function addFiles(
  const { files = [] } = options;
  if (DEBUG) console.log(`📄 addFiles(raw): Adding ${files.length} files with immediate thumbnail generation`);

+  // ZIP pre-processing: Extract ZIP files with configurable behavior
+  // - File uploads: skipAutoUnzip=true → always extract (except HTML)
+  // - Tool outputs: skipAutoUnzip=false → respect user preferences
+  const filesToProcess: File[] = [];
+  const autoUnzip = options.autoUnzip ?? true; // Default to true
+  const autoUnzipFileLimit = options.autoUnzipFileLimit ?? 4; // Default limit
+  const skipAutoUnzip = options.skipAutoUnzip ?? false;
+
  for (const file of files) {
+    // Check if file is a ZIP
+    if (zipFileService.isZipFile(file)) {
+      try {
+        if (DEBUG) console.log(`📄 addFiles: Detected ZIP file: ${file.name}`);
+
+        // Check if ZIP contains HTML files - if so, keep as ZIP
+        const containsHtml = await zipFileService.containsHtmlFiles(file);
+        if (containsHtml) {
+          if (DEBUG) console.log(`📄 addFiles: ZIP contains HTML, keeping as ZIP: ${file.name}`);
+          filesToProcess.push(file);
+          continue;
+        }
+
+        // Apply extraction with preferences
+        const extractedFiles = await zipFileService.extractWithPreferences(file, {
+          autoUnzip,
+          autoUnzipFileLimit,
+          skipAutoUnzip
+        });
+
+        if (extractedFiles.length === 1 && extractedFiles[0] === file) {
+          // ZIP was not extracted (over limit or autoUnzip disabled)
+          if (DEBUG) console.log(`📄 addFiles: ZIP not extracted (preferences): ${file.name}`);
+        } else {
+          // ZIP was extracted
+          if (DEBUG) console.log(`📄 addFiles: Extracted ${extractedFiles.length} files from ZIP: ${file.name}`);
+        }
+
+        filesToProcess.push(...extractedFiles);
+      } catch (error) {
+        console.error(`📄 addFiles: Failed to process ZIP file ${file.name}:`, error);
+        // On error, keep the ZIP file as-is
+        filesToProcess.push(file);
+      }
+    } else {
+      // Not a ZIP file, add as-is
+      filesToProcess.push(file);
+    }
+  }
+
+  if (DEBUG) console.log(`📄 addFiles: After ZIP processing, ${filesToProcess.length} files to add`);
+
+  for (const file of filesToProcess) {
    const quickKey = createQuickKey(file);

    // Soft deduplication: Check if file already exists by metadata
--- a/frontend/src/hooks/tools/extractImages/useExtractImagesOperation.ts
+++ b/frontend/src/hooks/tools/extractImages/useExtractImagesOperation.ts
@ -1,8 +1,9 @@
+import { useCallback } from 'react';
 import { useTranslation } from 'react-i18next';
 import { useToolOperation, ToolType } from '../shared/useToolOperation';
 import { createStandardErrorHandler } from '../../../utils/toolErrorHandler';
 import { ExtractImagesParameters, defaultParameters } from './useExtractImagesParameters';
-import JSZip from 'jszip';
+import { useToolResources } from '../shared/useToolResources';

 // Static configuration that can be used by both the hook and automation executor
 export const buildExtractImagesFormData = (parameters: ExtractImagesParameters, file: File): FormData => {
@ -13,39 +14,28 @@ export const buildExtractImagesFormData = (parameters: ExtractImagesParameters,
  return formData;
 };

-// Response handler for extract-images which returns a ZIP file
-const extractImagesResponseHandler = async (responseData: Blob, _originalFiles: File[]): Promise<File[]> => {
-  const zip = new JSZip();
-  const zipContent = await zip.loadAsync(responseData);
-  const extractedFiles: File[] = [];
-
-  for (const [filename, file] of Object.entries(zipContent.files)) {
-    if (!file.dir) {
-      const blob = await file.async('blob');
-      const extractedFile = new File([blob], filename, { type: blob.type });
-      extractedFiles.push(extractedFile);
-    }
-  }
-
-  return extractedFiles;
-};
-
-// Static configuration object
+// Static configuration object (without response handler - will be added in hook)
 export const extractImagesOperationConfig = {
  toolType: ToolType.singleFile,
  buildFormData: buildExtractImagesFormData,
  operationType: 'extractImages',
  endpoint: '/api/v1/misc/extract-images',
  defaultParameters,
-  // Extract-images returns a ZIP file containing multiple image files
-  responseHandler: extractImagesResponseHandler,
 } as const;

 export const useExtractImagesOperation = () => {
  const { t } = useTranslation();
+  const { extractZipFiles } = useToolResources();
+
+  // Response handler that respects auto-unzip preferences
+  const responseHandler = useCallback(async (blob: Blob, _originalFiles: File[]): Promise<File[]> => {
+    // Extract images returns a ZIP file - use preference-aware extraction
+    return await extractZipFiles(blob);
+  }, [extractZipFiles]);

  return useToolOperation<ExtractImagesParameters>({
    ...extractImagesOperationConfig,
+    responseHandler,
    getErrorMessage: createStandardErrorHandler(t('extractImages.error.failed', 'An error occurred while extracting images from the PDF.'))
  });
 };
--- a/frontend/src/hooks/tools/scannerImageSplit/useScannerImageSplitOperation.ts
+++ b/frontend/src/hooks/tools/scannerImageSplit/useScannerImageSplitOperation.ts
@ -27,14 +27,14 @@ export const scannerImageSplitOperationConfig = {

 export const useScannerImageSplitOperation = () => {
  const { t } = useTranslation();
-  const { extractAllZipFiles } = useToolResources();
+  const { extractZipFiles } = useToolResources();

  // Custom response handler that extracts ZIP files containing images
  // Can't add to exported config because it requires access to the hook so must be part of the hook
  const responseHandler = useCallback(async (blob: Blob, originalFiles: File[]): Promise<File[]> => {
    try {
      // Scanner image split returns ZIP files with multiple images
-      const extractedFiles = await extractAllZipFiles(blob);
+      const extractedFiles = await extractZipFiles(blob);

      // If extraction succeeded and returned files, use them
      if (extractedFiles.length > 0) {
@ -49,7 +49,7 @@ export const useScannerImageSplitOperation = () => {
    const baseFileName = inputFileName.replace(/\.[^.]+$/, '');
    const singleFile = new File([blob], `${baseFileName}.png`, { type: 'image/png' });
    return [singleFile];
-  }, [extractAllZipFiles]);
+  }, [extractZipFiles]);

  const config: ToolOperationConfig<ScannerImageSplitParameters> = {
    ...scannerImageSplitOperationConfig,
--- a/frontend/src/hooks/tools/shared/useToolOperation.ts
+++ b/frontend/src/hooks/tools/shared/useToolOperation.ts
@ -151,7 +151,7 @@ export const useToolOperation = <TParams>(
  const { state, actions } = useToolState();
  const { actions: fileActions } = useFileContext();
  const { processFiles, cancelOperation: cancelApiCalls } = useToolApiCalls<TParams>();
-  const { generateThumbnails, createDownloadInfo, cleanupBlobUrls, extractZipFiles, extractAllZipFiles } = useToolResources();
+  const { generateThumbnails, createDownloadInfo, cleanupBlobUrls, extractZipFiles } = useToolResources();

  // Track last operation for undo functionality
  const lastOperationRef = useRef<{
@ -259,11 +259,6 @@ export const useToolOperation = <TParams>(
            // Default: assume ZIP response for multi-file endpoints
            // Note: extractZipFiles will check preferences.autoUnzip setting
            processedFiles = await extractZipFiles(response.data);
-
-            if (processedFiles.length === 0) {
-              // Try the generic extraction as fallback
-              processedFiles = await extractAllZipFiles(response.data);
-            }
          }
          // Assume all inputs succeeded together unless server provided an error earlier
          successSourceIds = validFiles.map(f => (f as any).fileId) as any;
@ -446,7 +441,7 @@ export const useToolOperation = <TParams>(
      actions.setLoading(false);
      actions.setProgress(null);
    }
-  }, [t, config, actions, addFiles, consumeFiles, processFiles, generateThumbnails, createDownloadInfo, cleanupBlobUrls, extractZipFiles, extractAllZipFiles]);
+  }, [t, config, actions, addFiles, consumeFiles, processFiles, generateThumbnails, createDownloadInfo, cleanupBlobUrls, extractZipFiles]);

  const cancelOperation = useCallback(() => {
    cancelApiCalls();
--- a/frontend/src/hooks/tools/shared/useToolResources.ts
+++ b/frontend/src/hooks/tools/shared/useToolResources.ts
@ -27,11 +27,11 @@ export const useToolResources = () => {

  // Cleanup on unmount - use ref to avoid dependency on blobUrls state
  const blobUrlsRef = useRef<string[]>([]);
-  
+
  useEffect(() => {
    blobUrlsRef.current = blobUrls;
  }, [blobUrls]);
-  
+
  useEffect(() => {
    return () => {
      blobUrlsRef.current.forEach(url => {
@ -85,50 +85,17 @@ export const useToolResources = () => {

  const extractZipFiles = useCallback(async (zipBlob: Blob, skipAutoUnzip = false): Promise<File[]> => {
    try {
-      // Check if we should extract based on preferences
-      const shouldExtract = await zipFileService.shouldUnzip(
-        zipBlob,
-        preferences.autoUnzip,
-        preferences.autoUnzipFileLimit,
+      return await zipFileService.extractWithPreferences(zipBlob, {
+        autoUnzip: preferences.autoUnzip,
+        autoUnzipFileLimit: preferences.autoUnzipFileLimit,
        skipAutoUnzip
-      );
-
-      if (!shouldExtract) {
-        return [new File([zipBlob], 'result.zip', { type: 'application/zip' })];
-      }
-
-      const zipFile = new File([zipBlob], 'temp.zip', { type: 'application/zip' });
-      const extractionResult = await zipFileService.extractPdfFiles(zipFile);
-      return extractionResult.success ? extractionResult.extractedFiles : [];
+      });
    } catch (error) {
      console.error('useToolResources.extractZipFiles - Error:', error);
      return [];
    }
  }, [preferences.autoUnzip, preferences.autoUnzipFileLimit]);

-  const extractAllZipFiles = useCallback(async (zipBlob: Blob, skipAutoUnzip = false): Promise<File[]> => {
-    try {
-      // Check if we should extract based on preferences
-      const shouldExtract = await zipFileService.shouldUnzip(
-        zipBlob,
-        preferences.autoUnzip,
-        preferences.autoUnzipFileLimit,
-        skipAutoUnzip
-      );
-
-      if (!shouldExtract) {
-        return [new File([zipBlob], 'result.zip', { type: 'application/zip' })];
-      }
-
-      const zipFile = new File([zipBlob], 'temp.zip', { type: 'application/zip' });
-      const extractionResult = await zipFileService.extractAllFiles(zipFile);
-      return extractionResult.success ? extractionResult.extractedFiles : [];
-    } catch (error) {
-      console.error('useToolResources.extractAllZipFiles - Error:', error);
-      return [];
-    }
-  }, [preferences.autoUnzip, preferences.autoUnzipFileLimit]);
-
  const createDownloadInfo = useCallback(async (
    files: File[],
    operationType: string
@ -152,7 +119,6 @@ export const useToolResources = () => {
    generateThumbnailsWithMetadata,
    createDownloadInfo,
    extractZipFiles,
-    extractAllZipFiles,
    cleanupBlobUrls,
  };
 };
--- a/frontend/src/services/zipFileService.ts
+++ b/frontend/src/services/zipFileService.ts
@ -29,6 +29,7 @@ export interface ZipValidationResult {
  fileCount: number;
  totalSizeBytes: number;
  containsPDFs: boolean;
+  containsFiles: boolean;
  errors: string[];
 }

@ -42,7 +43,6 @@ export interface ZipExtractionProgress {
 export class ZipFileService {
  private readonly maxFileSize = 100 * 1024 * 1024; // 100MB per file
  private readonly maxTotalSize = 500 * 1024 * 1024; // 500MB total extraction limit
-  private readonly supportedExtensions = ['.pdf'];

  // ZIP file validation constants
  private static readonly VALID_ZIP_TYPES = [
@ -62,6 +62,7 @@ export class ZipFileService {
      fileCount: 0,
      totalSizeBytes: 0,
      containsPDFs: false,
+      containsFiles: false,
      errors: []
    };

@ -115,10 +116,13 @@ export class ZipFileService {
      result.fileCount = fileCount;
      result.totalSizeBytes = totalSize;
      result.containsPDFs = containsPDFs;
-      result.isValid = result.errors.length === 0 && containsPDFs;
+      result.containsFiles = fileCount > 0;

-      if (!containsPDFs) {
-        result.errors.push('ZIP file does not contain any PDF files');
+      // ZIP is valid if it has files and no size errors
+      result.isValid = result.errors.length === 0 && result.containsFiles;
+
+      if (!result.containsFiles) {
+        result.errors.push('ZIP file does not contain any files');
      }

      return result;
@ -278,6 +282,37 @@ export class ZipFileService {
    return filename.toLowerCase().endsWith('.pdf');
  }

+  /**
+   * Check if a filename indicates an HTML file
+   */
+  private isHtmlFile(filename: string): boolean {
+    const lowerName = filename.toLowerCase();
+    return lowerName.endsWith('.html') || lowerName.endsWith('.htm') || lowerName.endsWith('.xhtml');
+  }
+
+  /**
+   * Check if a ZIP file contains HTML files
+   * Used to determine if the ZIP should be kept intact (HTML) or extracted (other files)
+   */
+  async containsHtmlFiles(file: Blob | File): Promise<boolean> {
+    try {
+      const zip = new JSZip();
+      const zipContents = await zip.loadAsync(file);
+
+      // Check if any file is an HTML file
+      for (const [filename, zipEntry] of Object.entries(zipContents.files)) {
+        if (!zipEntry.dir && this.isHtmlFile(filename)) {
+          return true;
+        }
+      }
+
+      return false;
+    } catch (error) {
+      console.error('Error checking for HTML files:', error);
+      return false;
+    }
+  }
+
  /**
   * Validate that a file is actually a PDF by checking its header
   */
@ -366,6 +401,62 @@ export class ZipFileService {
    }
  }

+  /**
+   * Extract files from ZIP with HTML detection and preference checking
+   * This is the unified method that handles the common pattern of:
+   * 1. Check for HTML files → keep zipped if present
+   * 2. Check user preferences → respect autoUnzipFileLimit
+   * 3. Extract files if appropriate
+   *
+   * @param zipBlob - The ZIP blob to process
+   * @param options - Extraction options
+   * @returns Array of files (either extracted or the ZIP itself)
+   */
+  async extractWithPreferences(
+    zipBlob: Blob,
+    options: {
+      autoUnzip: boolean;
+      autoUnzipFileLimit: number;
+      skipAutoUnzip?: boolean;
+    }
+  ): Promise<File[]> {
+    try {
+      // Create File object if not already
+      const zipFile = zipBlob instanceof File
+        ? zipBlob
+        : new File([zipBlob], 'result.zip', { type: 'application/zip' });
+
+      // Check if ZIP contains HTML files - if so, keep as ZIP
+      const containsHtml = await this.containsHtmlFiles(zipFile);
+      if (containsHtml) {
+        return [zipFile];
+      }
+
+      // Check if we should extract based on preferences
+      const shouldExtract = await this.shouldUnzip(
+        zipBlob,
+        options.autoUnzip,
+        options.autoUnzipFileLimit,
+        options.skipAutoUnzip || false
+      );
+
+      if (!shouldExtract) {
+        return [zipFile];
+      }
+
+      // Extract all files
+      const extractionResult = await this.extractAllFiles(zipFile);
+      return extractionResult.success ? extractionResult.extractedFiles : [zipFile];
+    } catch (error) {
+      console.error('Error in extractWithPreferences:', error);
+      // On error, return ZIP as-is
+      const zipFile = zipBlob instanceof File
+        ? zipBlob
+        : new File([zipBlob], 'result.zip', { type: 'application/zip' });
+      return [zipFile];
+    }
+  }
+
  /**
   * Extract all files from a ZIP archive (not limited to PDFs)
   */
@ -486,9 +577,11 @@ export class ZipFileService {
  }

  /**
-   * Extract PDF files from ZIP and store them in IndexedDB with preserved history metadata
+   * Extract all files from ZIP and store them in IndexedDB with preserved history metadata
   * Used by both FileManager and FileEditor to avoid code duplication
   *
+   * Note: HTML files will NOT be extracted - the ZIP is kept intact when HTML is detected
+   *
   * @param zipFile - The ZIP file to extract from
   * @param zipStub - The StirlingFileStub for the ZIP (contains metadata to preserve)
   * @returns Object with success status, extracted stubs, and any errors
@ -504,8 +597,15 @@ export class ZipFileService {
    };

    try {
-      // Extract PDF files from ZIP
-      const extractionResult = await this.extractPdfFiles(zipFile);
+      // Check if ZIP contains HTML files - if so, don't extract
+      const hasHtml = await this.containsHtmlFiles(zipFile);
+      if (hasHtml) {
+        result.errors.push('ZIP contains HTML files and will not be auto-extracted. Download the ZIP to access the files.');
+        return result;
+      }
+
+      // Extract all files from ZIP (not just PDFs)
+      const extractionResult = await this.extractAllFiles(zipFile);

      if (!extractionResult.success || extractionResult.extractedFiles.length === 0) {
        result.errors = extractionResult.errors;
@ -515,7 +615,7 @@ export class ZipFileService {
      // Process each extracted file
      for (const extractedFile of extractionResult.extractedFiles) {
        try {
-          // Generate thumbnail
+          // Generate thumbnail (works for PDFs and images)
          const thumbnail = await generateThumbnailForFile(extractedFile);

          // Create StirlingFile
--- a/frontend/src/utils/automationFileProcessor.ts
+++ b/frontend/src/utils/automationFileProcessor.ts
@ -30,6 +30,7 @@ export class AutomationFileProcessor {

  /**
   * Extract files from a ZIP blob during automation execution, with fallback for non-ZIP files
+   * Extracts all file types (PDFs, images, etc.) except HTML files which stay zipped
   */
  static async extractAutomationZipFiles(blob: Blob): Promise<AutomationProcessingResult> {
    try {
@ -40,20 +41,26 @@ export class AutomationFileProcessor {
        'application/zip'
      );

-      const result = await zipFileService.extractPdfFiles(zipFile);
-
-      if (!result.success || result.extractedFiles.length === 0) {
-        // Fallback: treat as single PDF file
-        const fallbackFile = ResourceManager.createTimestampedFile(
-          blob,
-          AUTOMATION_CONSTANTS.RESULT_FILE_PREFIX,
-          '.pdf'
-        );
-
+      // Check if ZIP contains HTML files - if so, keep as ZIP
+      const containsHtml = await zipFileService.containsHtmlFiles(zipFile);
+      if (containsHtml) {
+        // HTML files should stay zipped - return ZIP as-is
        return {
          success: true,
-          files: [fallbackFile],
-          errors: [`ZIP extraction failed, treated as single file: ${result.errors?.join(', ') || 'Unknown error'}`]
+          files: [zipFile],
+          errors: []
+        };
+      }
+
+      // Extract all files (not just PDFs) - handles images from scanner-image-split, etc.
+      const result = await zipFileService.extractAllFiles(zipFile);
+
+      if (!result.success || result.extractedFiles.length === 0) {
+        // Fallback: keep as ZIP file (might be valid ZIP with extraction issues)
+        return {
+          success: true,
+          files: [zipFile],
+          errors: [`ZIP extraction failed, kept as ZIP: ${result.errors?.join(', ') || 'Unknown error'}`]
        };
      }

@ -63,18 +70,19 @@ export class AutomationFileProcessor {
        errors: []
      };
    } catch (error) {
-      console.warn('Failed to extract automation ZIP files, falling back to single file:', error);
-      // Fallback: treat as single PDF file
+      console.warn('Failed to extract automation ZIP files, keeping as ZIP:', error);
+      // Fallback: keep as ZIP file for next automation step to handle
      const fallbackFile = ResourceManager.createTimestampedFile(
        blob,
-        AUTOMATION_CONSTANTS.RESULT_FILE_PREFIX,
-        '.pdf'
+        AUTOMATION_CONSTANTS.RESPONSE_ZIP_PREFIX,
+        '.zip',
+        'application/zip'
      );

      return {
        success: true,
        files: [fallbackFile],
-        errors: [`ZIP extraction failed, treated as single file: ${error}`]
+        errors: [`ZIP extraction failed, kept as ZIP: ${error}`]
      };
    }
  }