diff --git a/.gitignore b/.gitignore index d8b52ec8e..f0e16d5e7 100644 --- a/.gitignore +++ b/.gitignore @@ -33,6 +33,7 @@ SwaggerDoc.json # Gradle .gradle +.gradle-home .lock # External tool builders diff --git a/.gradle-home/wrapper/dists/gradle-8.14-all/c2qonpi39x1mddn7hk5gh9iqj/gradle-8.14-all.zip.lck b/.gradle-home/wrapper/dists/gradle-8.14-all/c2qonpi39x1mddn7hk5gh9iqj/gradle-8.14-all.zip.lck deleted file mode 100644 index e69de29bb..000000000 diff --git a/.gradle-home/wrapper/dists/gradle-8.14-all/c2qonpi39x1mddn7hk5gh9iqj/gradle-8.14-all.zip.part b/.gradle-home/wrapper/dists/gradle-8.14-all/c2qonpi39x1mddn7hk5gh9iqj/gradle-8.14-all.zip.part deleted file mode 100644 index e69de29bb..000000000 diff --git a/app/common/src/main/java/stirling/software/SPDF/config/EndpointConfiguration.java b/app/common/src/main/java/stirling/software/SPDF/config/EndpointConfiguration.java index 98bbe16fc..5a28da4f3 100644 --- a/app/common/src/main/java/stirling/software/SPDF/config/EndpointConfiguration.java +++ b/app/common/src/main/java/stirling/software/SPDF/config/EndpointConfiguration.java @@ -296,6 +296,12 @@ public class EndpointConfiguration { addEndpointToGroup("Other", "replace-and-invert-color-pdf"); addEndpointToGroup("Other", "multi-tool"); + // Adding form-related endpoints to "Other" group + addEndpointToGroup("Other", "fields"); + addEndpointToGroup("Other", "modify-fields"); + addEndpointToGroup("Other", "delete-fields"); + addEndpointToGroup("Other", "fill"); + // Adding endpoints to "Advance" group addEndpointToGroup("Advance", "adjust-contrast"); addEndpointToGroup("Advance", "compress-pdf"); diff --git a/docker/compose/docker-compose.yml b/docker/compose/docker-compose.yml index 69ebc7996..713bfec79 100644 --- a/docker/compose/docker-compose.yml +++ b/docker/compose/docker-compose.yml @@ -29,12 +29,6 @@ services: METRICS_ENABLED: "true" SYSTEM_GOOGLEVISIBILITY: "true" SHOW_SURVEY: "true" - STIRLING_PDF_JSON_FONT_NORMALIZATION_ENABLED: "false" - 
STIRLING_PDF_JSON_CFF_CONVERTER_ENABLED: "true" - STIRLING_PDF_JSON_CFF_CONVERTER_METHOD: python - STIRLING_PDF_JSON_CFF_CONVERTER_PYTHON_COMMAND: /opt/venv/bin/python3 - STIRLING_PDF_JSON_CFF_CONVERTER_PYTHON_SCRIPT: /scripts/convert_cff_to_ttf.py - LOGGING_LEVEL_stirling.software.SPDF.service.PdfJsonConversionService: TRACE networks: - stirling-network diff --git a/docs/pdf-text-editor-backlog.md b/docs/pdf-text-editor-backlog.md deleted file mode 100644 index 6d5f15f32..000000000 --- a/docs/pdf-text-editor-backlog.md +++ /dev/null @@ -1,56 +0,0 @@ -# PDF Text Editor Backlog - -- **Type3 Font Support (Text Additions)** - - Parse Type3 charprocs to extract glyph outlines, build a synthetic TrueType/OpenType font (FontTools, Ghostscript `ps2ttf`, etc.), and store it in `webProgram` / `pdfProgram` for client use. - - Preserve the original Type3 resources for round-trip fidelity; use the synthesized font only for edited elements while reusing the original stream elsewhere. - - Extend conversion logic so fallback kicks in only when conversion fails, and track which elements rely on the synthetic font to avoid mixing source glyphs (`PdfJsonConversionService.java:998-1090`, `1840-2012`). - - Update the viewer/renderer to surface conversion errors and block editing when no faithful font can be produced. - -- **Lazy Fetch Endpoints** - - Provide separate endpoints to fetch: - 1. Raw COS dictionaries/font programs when the user opens advanced panels. - 2. Page-level raster/vector previews to avoid sending large `imageData` upfront. - - Reuse the existing job cache (`documentCache`) to serve these on demand and clean up after timeouts (`PdfJsonConversionService.java:3608-3687`). - -- **Editor UX Safeguards** - - Mark groups using fallback glyphs so the UI can warn about possible appearance shifts. Font family matching is now implemented (Liberation fonts), but weight matching is still TODO, so bold/italic text using fallbacks may appear lighter than original. 
- - Surface when Type3 conversion was downgraded (e.g., rasterized glyphs) and limit editing to operations that keep the PDF stable. - - Reference: `frontend/src/proprietary/components/tools/pdfTextEditor/PdfTextEditorView.tsx:1260-1287` - -- **Canonical Font Sharing** - - Emit fonts once per unique embedded program. Add a `canonicalFonts` array containing the full payload (program, ToUnicode, metadata) and a compact `fontAliases` mapping `{pageNumber, fontId, canonicalUid}` so text elements can still reference per-page IDs. - - Note: COS dictionaries are currently preserved for TrueType/Type0 fonts (needed for ToUnicode CMap). The canonical approach should maintain this preservation while deduplicating font programs. - - Update `buildFontMap` to resolve aliases when recreating PDFBox fonts, and adjust the front end to load programs via the canonical UID. - - Optional: expose a lazy endpoint for the original COS dictionary if the canonical record strips it, so export still reconstructs untouched fonts. 
- -- **Font Weight Matching for Fallback Fonts** ✓ COMPLETED (January 2025) - - Font family matching is now implemented: - - Liberation fonts (metric-compatible with Microsoft core): Arial/Helvetica→LiberationSans, Times→LiberationSerif, Courier→LiberationMono - - DejaVu fonts (widely used open source): DejaVu→DejaVuSans, DejaVuSerif, DejaVuMono - - Noto fonts (Google universal font): Noto→NotoSans - - Font weight/style matching is now implemented for multiple font families: - - Liberation Sans/Serif/Mono: Regular, Bold, Italic, BoldItalic (full support) - - Noto Sans: Regular, Bold, Italic, BoldItalic (full support) - - DejaVu Sans/Serif/Mono: Regular, Bold, Italic/Oblique, BoldItalic/BoldOblique (full support) - - All font variants registered in `BUILT_IN_FALLBACK_FONTS` map (`PdfJsonFallbackFontService.java:63-267`) - - Weight/style detection implemented in `resolveFallbackFontId()`: - - `detectBold()`: Detects "bold", "heavy", "black", or numeric weights 600-900 (e.g., "700wght") - - `detectItalic()`: Detects "italic" or "oblique" - - `applyWeightStyle()`: Applies appropriate suffix (handles both "italic" and "oblique" naming) - - All fonts consolidated from Type3 library into main fonts directory for unified fallback support - - Benefits: Comprehensive visual consistency when editing text in bold/italic fonts across many font families - -- **Font Text Color Support** - - Add support for reading and preserving text color information from PDF content streams - - Enable color editing in the editor UI - - Ensure proper round-trip conversion maintains color fidelity - -- **Space Character Handling** - - Improve handling of space characters as proper text elements - - Ensure spaces are correctly preserved during text extraction and reconstruction - - Fix any issues with space positioning and width calculations - -- **Textbox Selection Enhancement** - - Improve textbox selection behavior in the editor - - Enhance user experience for selecting and manipulating text 
boxes - - Address any selection precision or interaction issues diff --git a/docs/pdf_json_threading_analysis.md b/docs/pdf_json_threading_analysis.md deleted file mode 100644 index e6bc7387c..000000000 --- a/docs/pdf_json_threading_analysis.md +++ /dev/null @@ -1,1199 +0,0 @@ -# PDF JSON Editor - Threading, Concurrency, and Performance Analysis - -**Date:** 2025-01-09 -**Version:** 1.0 -**Status:** Comprehensive analysis combining automated review and manual verification - ---- - -## Executive Summary - -This analysis identifies **CRITICAL** security vulnerabilities, thread safety issues, and performance problems in the PDF JSON editor codebase. The service contains: - -- **2 CRITICAL issues** requiring immediate attention -- **2 HIGH severity resource leaks** causing memory exhaustion -- **Multiple performance bottlenecks** limiting scalability - -**Immediate Action Required:** -1. Fix pageFontResources PDFont key issue (Issue #1) -2. Replace unbounded thread spawning (Issue #2) -3. Add cache size limits (Issue #3) -4. 
Fix Type3 cache race condition (Issue #4) - ---- - -## CRITICAL ISSUES - -### Issue #1: pageFontResources Keyed by PDFont Instances - CRITICAL - -**Location:** `PdfJsonConversionService.java:5075-5081, 5158-5159` - -**Severity:** CRITICAL (Broken Functionality) - -**Type:** Object Identity Mismatch, Cache Miss - -**Verified:** ✅ TRUE - -**Description:** -```java -// Line 5075-5081: Initial metadata extraction (first PDF load) -try (PDDocument document = pdfDocumentFactory.load(pdfBytes, true)) { - Map> pageFontResources = new HashMap<>(); - for (PDPage page : document.getPages()) { - Map resourceMap = collectFontsForPage(...); - // resourceMap keys are PDFont instances from this document - pageFontResources.put(pageNumber, resourceMap); - } - // Cache it - documentCache.put(jobId, new CachedPdfDocument(..., pageFontResources, ...)); -} // PDDocument closed, PDFont instances now reference freed document - -// Line 5158-5159: Lazy page extraction (reloads PDF) -try (PDDocument document = pdfDocumentFactory.load(cached.getPdfBytes(), true)) { - // NEW PDFont instances created! - PDPage page = document.getPage(pageIndex); - - // Try to lookup fonts - Map cachedResourceMap = cached.getPageFontResources().get(pageNum); - // cachedResourceMap keys are OLD PDFont instances from closed document - - // Lookup using NEW PDFont instances - String fontId = cachedResourceMap.get(newFont); // Always NULL! ← BUG -} -``` - -**Why It Fails:** -1. **Object Identity:** `Map` uses PDFont object identity as key -2. **Different Instances:** Each PDF load creates new PDFont instances with different identities -3. **Lookup Fails:** `cachedResourceMap.get(newFont)` returns null because `newFont != oldFont` -4. 
**Defeats Caching Goal:** Every lazy page request rebuilds font metadata, defeating the cache - -**Impact:** -- Lazy page loading doesn't reuse cached font metadata -- CPU wasted rebuilding font info on every page request -- Cache only stores garbage (unusable keys) -- "Consistent font UID" feature completely broken - -**Evidence:** -```java -// No code actually uses the cached pageFontResources successfully -// Every extractSinglePage call rebuilds fonts from scratch -``` - -**Recommendation:** -```java -// Use resource names as keys instead of PDFont objects -Map> pageFontResources = new HashMap<>(); -// Key: font resource name (e.g., "F1"), Value: font UID - -// Or use font UID directly -Map> pageFontUids = new HashMap<>(); -``` - ---- - -### Issue #2: Unbounded Thread Creation - CRITICAL Resource Leak - -**Location:** `PdfJsonConversionService.java:5550-5562` - -**Severity:** CRITICAL (Resource Exhaustion) - -**Type:** Thread Leak, Memory Leak - -**Verified:** ✅ TRUE - -**Description:** -```java -private void scheduleDocumentCleanup(String jobId) { - new Thread( - () -> { - try { - Thread.sleep(TimeUnit.MINUTES.toMillis(30)); // Sleep 30 minutes! - clearCachedDocument(jobId); - log.debug("Auto-cleaned cached document for jobId: {}", jobId); - } catch (InterruptedException e) { - Thread.currentThread().interrupt(); - } - }) - .start(); // Unmanaged thread! -} -``` - -**Also in:** `PdfLazyLoadingService.java:256-269` (duplicate implementation) - -**Problems:** -1. **One Thread Per Upload:** Each PDF upload spawns a new thread -2. **No Thread Pool:** Unlimited thread creation (no cap) -3. **Long Sleep:** Threads sleep for 30 minutes holding resources -4. **No Cancellation:** Cannot stop cleanup threads if job completes early -5. **Non-Daemon:** Threads prevent JVM shutdown (daemon=false by default) -6. 
**No Monitoring:** No visibility into active cleanup threads - -**Impact Under Load:** -``` -100 concurrent uploads → 100 cleanup threads spawned -Each thread: ~1MB stack + overhead -Total: ~100MB+ wasted on sleeping threads - -1000 uploads/day → 1000+ threads accumulate -OS thread limit (ulimit -u) exceeded → OutOfMemoryError: unable to create new native thread -``` - -**Production Failure Scenario:** -``` -09:00 - Peak traffic: 500 PDFs uploaded -09:00 - 500 cleanup threads spawned (each sleeps 30 min) -09:15 - 400 more uploads → 400 more threads -09:30 - First batch wakes up, cleans, exits - But new threads keep getting created faster than old ones die -10:00 - Thread count > 2000 -10:15 - JVM hits OS thread limit -10:16 - Server crashes: "OutOfMemoryError: unable to create new native thread" -``` - -**Recommendation:** -```java -@Service -public class PdfJsonConversionService { - // Fixed-size thread pool for cleanup - private final ScheduledExecutorService cleanupScheduler = - Executors.newScheduledThreadPool( - 2, // Only 2 threads needed - new ThreadFactoryBuilder() - .setNameFormat("pdf-cache-cleanup-%d") - .setDaemon(true) - .build() - ); - - private void scheduleDocumentCleanup(String jobId) { - cleanupScheduler.schedule( - () -> clearCachedDocument(jobId), - 30, - TimeUnit.MINUTES - ); - } - - @PreDestroy - public void shutdown() { - cleanupScheduler.shutdown(); - try { - if (!cleanupScheduler.awaitTermination(10, TimeUnit.SECONDS)) { - cleanupScheduler.shutdownNow(); - } - } catch (InterruptedException e) { - cleanupScheduler.shutdownNow(); - Thread.currentThread().interrupt(); - } - } -} -``` - ---- - -## HIGH SEVERITY ISSUES - -### Issue #3: Unbounded Cache Growth - HIGH Memory Leak - -**Location:** `PdfJsonConversionService.java:147-148, 154` - -**Severity:** HIGH (Memory Exhaustion) - -**Type:** Memory Leak, Missing Eviction Policy - -**Verified:** ✅ TRUE - -**Description:** -```java -// Line 147: Type3 normalized fonts - cleared only in 
convertJsonToPdf (line 454) -private final Map type3NormalizedFontCache = new ConcurrentHashMap<>(); - -// Line 148: Type3 glyph coverage - NEVER CLEARED anywhere in codebase! -private final Map> type3GlyphCoverageCache = new ConcurrentHashMap<>(); - -// Line 154: Document cache - relies on buggy cleanup threads -private final Map documentCache = new ConcurrentHashMap<>(); -``` - -**Growth Patterns:** - -**1. type3NormalizedFontCache:** -- Written at line 3766: `type3NormalizedFontCache.put(fontModel.getUid(), font)` -- Cleared only at line 454: `type3NormalizedFontCache.clear()` (JSON→PDF conversion) -- **NOT cleared during PDF→JSON** conversion (most common operation) -- Each PDFont holds references to native resources (C++ objects via JNI) -- Grows unbounded during PDF→JSON operations - -**2. type3GlyphCoverageCache:** -- Written at line 1122: `type3GlyphCoverageCache.put(fontUid, coverageSet)` -- **NEVER CLEARED** in entire codebase (verified via grep) -- Accumulates Set for every Type3 font ever processed -- Each Set can contain thousands of integers (Unicode codepoints) -- Pure memory leak - -**3. 
documentCache:** -- Stores full PDF bytes in memory -- Each entry can be 1MB-100MB+ (document bytes + metadata) -- Relies on cleanup threads (which have issues from Issue #2) -- If cleanup fails (exception, server restart), entries stay forever -- No max size check - -**Impact:** -``` -Long-running server processes 10,000 Type3 fonts: -- type3GlyphCoverageCache: 10,000 entries × ~1KB each = 10MB -- type3NormalizedFontCache: 1,000 cached fonts × ~100KB each = 100MB -- documentCache: 50 active jobs × 10MB each = 500MB - -After 1 week: Caches grow to 1GB+ -After 1 month: OutOfMemoryError, server restart required -``` - -**Recommendation:** -```java -// Use Caffeine cache with eviction policies -private final Cache type3NormalizedFontCache = - Caffeine.newBuilder() - .maximumSize(1000) // Max 1000 fonts - .expireAfterAccess(1, TimeUnit.HOURS) // Expire after 1hr unused - .removalListener((key, value, cause) -> { - // Cleanup PDFont resources if needed - }) - .build(); - -private final Cache> type3GlyphCoverageCache = - Caffeine.newBuilder() - .maximumSize(5000) - .expireAfterWrite(1, TimeUnit.HOURS) - .build(); - -private final Cache documentCache = - Caffeine.newBuilder() - .maximumWeight(500_000_000) // 500MB max - .weigher((String key, CachedPdfDocument doc) -> - doc.getPdfBytes().length) - .expireAfterWrite(30, TimeUnit.MINUTES) - .removalListener((key, value, cause) -> { - log.info("Evicted document {} (cause: {})", key, cause); - }) - .build(); -``` - ---- - -### Issue #4: Type3 Cache Race Condition - HIGH - -**Location:** `PdfJsonConversionService.java:3759-3773` - -**Severity:** HIGH (Duplicate Work) - -**Type:** Check-Then-Act Race Condition - -**Verified:** ✅ TRUE - -**Description:** -```java -private void loadNormalizedType3Font( - PDDocument document, - PdfJsonFont fontModel, - List candidates, - String originalFormat) throws IOException { - if (fontModel.getUid() == null || candidates == null || candidates.isEmpty()) { - return; - } - if 
(type3NormalizedFontCache.containsKey(fontModel.getUid())) { // CHECK - return; - } - for (FontByteSource source : candidates) { - PDFont font = loadFontFromSource(...); // EXPENSIVE: 10-50ms - if (font != null) { - type3NormalizedFontCache.put(fontModel.getUid(), font); // ACT - log.info("Cached normalized font {} for Type3 {}", ...); - break; - } - } -} -``` - -**Race Condition:** -``` -Thread A: Check cache for "1:F1" → MISS (line 3759) -Thread B: Check cache for "1:F1" → MISS (line 3759) [both pass check!] -Thread A: Load font from bytes (10ms I/O + parsing) -Thread B: Load font from bytes (10ms I/O + parsing) ← DUPLICATE WORK -Thread A: Put font in cache (line 3766) -Thread B: Put font in cache (line 3766) [overwrites A's entry] -``` - -**Why ConcurrentHashMap Doesn't Help:** -- ConcurrentHashMap prevents **corruption** (map state stays consistent) -- ConcurrentHashMap does NOT prevent **duplicate work** (both threads compute) -- The check (`containsKey`) and act (`put`) are separate operations - -**Impact:** -- Wasted CPU cycles loading same font twice -- Temporary memory spike (two fonts in heap simultaneously) -- Font loading is expensive: Base64 decode + PDFBox parsing + font validation -- Under high concurrency, 10+ threads could all load the same font - -**Recommendation:** -```java -private void loadNormalizedType3Font(...) 
throws IOException { - if (fontModel.getUid() == null || candidates == null || candidates.isEmpty()) { - return; - } - - // Atomic compute-if-absent - type3NormalizedFontCache.computeIfAbsent(fontModel.getUid(), uid -> { - for (FontByteSource source : candidates) { - try { - PDFont font = loadFontFromSource(...); - if (font != null) { - log.info("Cached normalized font {} for Type3 {}", ...); - return font; - } - } catch (IOException e) { - log.warn("Failed to load font from {}: {}", source.originLabel(), e.getMessage()); - } - } - return null; - }); -} -``` - ---- - -### Issue #5: PDDocument Resource Lifecycle - NEEDS INVESTIGATION - -**Location:** `PdfJsonConversionService.java:3766, 5158` - -**Severity:** UNKNOWN (Requires Investigation) - -**Type:** Unclear Resource Ownership - -**Verified:** ⚠️ SPECULATIVE (No concrete evidence of failure) - -**Description:** -```java -// Line 3766: Cache PDFont created from a PDDocument -try (PDDocument document = ...) { - PDFont font = loadFontFromSource(document, fontModel, source, ...); - type3NormalizedFontCache.put(fontModel.getUid(), font); -} // PDDocument is closed here! - -// Later: cached PDFont is used with a DIFFERENT PDDocument -try (PDDocument newDocument = ...) { - PDFont cachedFont = type3NormalizedFontCache.get(fontUid); - // Is cachedFont safe to use after original document closed? - // Does it hold references to freed native resources? -} -``` - -**Theoretical Concerns:** -1. **Native Memory:** PDFBox uses JNI for some operations -2. **Resource Ties:** PDFont may hold references to the source PDDocument -3. **Freed Resources:** Using PDFont after document closes could access freed memory -4. 
**Unclear Contract:** PDFBox documentation doesn't explicitly address font lifecycle - -**Current Status:** -- ⚠️ **NO EVIDENCE OF ACTUAL FAILURES** - System appears to work in practice -- ⚠️ **NO CRASHES OBSERVED** - No segmentation faults or memory corruption reported -- ⚠️ **NO MEMORY LEAKS DETECTED** - No profiler data showing leaks -- ⚠️ **PURELY THEORETICAL CONCERN** - Based on API design, not observed behavior - -**Why This May Actually Be Safe:** -- PDFBox may create self-contained PDFont objects -- Font data may be copied rather than referenced -- PDFBox may be designed for this use case -- Current code has been running without apparent issues - -**Required Investigation:** -1. **PDFBox Source Code Review:** Check if PDFont copies or references document data -2. **Load Testing:** Create PDFont, close document, use font in new document -3. **Memory Profiling:** Monitor for native memory leaks over extended runs -4. **PDFBox Documentation/Forums:** Search for guidance on font lifecycle - -**Recommendation:** -- **Priority: MEDIUM** (needs investigation but not blocking) -- Add monitoring for potential issues -- Test font reuse after document closure explicitly -- If problems found, cache serialized bytes instead of PDFont objects - -```java -// Option 1: Cache font bytes instead of PDFont objects -private void cacheType3FontBytes(String fontUid, byte[] fontBytes) { - type3FontBytesCache.put(fontUid, fontBytes); -} - -// Option 2: Verify font is safe to use -private PDFont getCachedFont(String fontUid) { - PDFont cached = type3NormalizedFontCache.get(fontUid); - if (cached != null && !isFontValid(cached)) { - log.warn("Cached font {} is invalid, removing", fontUid); - type3NormalizedFontCache.remove(fontUid); - return null; - } - return cached; -} -``` - ---- - -## MEDIUM SEVERITY ISSUES - -### Issue #6: Full PDF Reload Per Page - MEDIUM Performance - -**Location:** `PdfJsonConversionService.java:5158-5159` - -**Severity:** MEDIUM (Performance) - -**Type:** 
Inefficient I/O - -**Verified:** ✅ TRUE - -**Description:** -```java -// extractSinglePage method -try (PDDocument document = pdfDocumentFactory.load(cached.getPdfBytes(), true)) { - // Full PDF loaded from bytes (10-100ms for large PDFs) - PDPage page = document.getPage(pageIndex); - // Extract just one page... -} -``` - -**Problem:** -Every page request loads the entire PDF from bytes: -- 100-page PDF = Load 10MB, extract 1 page -- 10 page requests = 10× full PDF loads (100MB I/O) -- No incremental parsing or streaming - -**Impact:** -``` -100MB PDF, 50 pages requested sequentially: -- Total I/O: 100MB × 50 = 5GB -- Time: 50× parse time (5-10 seconds total) -- Memory: 100MB peak per request - -Concurrent page requests for same PDF: -- 10 threads × 100MB = 1GB temporary memory spike -``` - -**Why This Exists:** -Lazy loading design trades memory (don't cache full extraction) for CPU (reload on demand). But the tradeoff is poor because: -- PDFBox parsing is expensive -- Repeated decompression of streams -- Could cache extracted page data instead - -**Recommendation:** -```java -// Option 1: Cache extracted page data -private static class CachedPage { - List textElements; - List imageElements; - // ... other page data -} - -Map> pageCache = ...; - -// Option 2: Keep PDF open with RandomAccessFile -private static class CachedPdfDocument { - private final RandomAccessReadBufferedFile randomAccess; - private final PDDocument document; // Keep open! 
-} - -// Option 3: Pre-split pages at upload time -// Store each page as separate lightweight JSON blob -``` - ---- - -### Issue #7: Large Base64 Operations - MEDIUM Performance - -**Location:** `PdfJsonConversionService.java:1062, 1428, 3570, 3584, 3612, 3630` - -**Severity:** MEDIUM (Performance Bottleneck) - -**Type:** Synchronous Blocking Operation - -**Verified:** ✅ TRUE - -**Description:** -```java -// Encode large font programs -String base64 = Base64.getEncoder().encodeToString(fontBytes); // 10MB → 13MB - -// Decode large font programs -byte[] bytes = Base64.getDecoder().decode(pdfProgram); // 13MB → 10MB -``` - -**Problem:** -- Large fonts (embedded TrueType, Type3) can be 5-10MB -- Base64 encoding inflates size by ~33% -- All encoding/decoding is synchronous on request threads -- CPU-intensive operation (20-50ms for 10MB) - -**Impact:** -``` -100 concurrent requests processing 10MB fonts: -- Each request: 30ms CPU time for Base64 -- All threads blocked on encoding simultaneously -- Thread pool saturation (if using fixed-size pool) -- Other requests starved waiting for threads - -Large PDF with 50 fonts: -- 50 × 30ms = 1.5 seconds just for Base64 operations -- User perceives slowness -``` - -**Recommendation:** -```java -// Option 1: Size limits -private static final int MAX_FONT_SIZE = 10 * 1024 * 1024; // 10MB - -if (fontBytes.length > MAX_FONT_SIZE) { - throw new IllegalArgumentException("Font too large: " + fontBytes.length); -} - -// Option 2: Streaming Base64 (for very large files) -OutputStream base64Out = Base64.getEncoder().wrap(outputStream); -inputStream.transferTo(base64Out); - -// Option 3: Async processing -CompletableFuture encodeFuture = CompletableFuture.supplyAsync( - () -> Base64.getEncoder().encodeToString(fontBytes), - fontEncodingExecutor -); -``` - ---- - -### Issue #8: File I/O on Request Threads - MEDIUM - -**Location:** `PdfJsonConversionService.java:276, 405, 5066` - -**Severity:** MEDIUM (Performance) - -**Type:** Blocking I/O 
- -**Verified:** ❌ PARTIALLY TRUE - -**Description:** -```java -// Line 276: Write upload to disk -file.transferTo(originalFile.getFile()); - -// Line 405: Read full file into memory -byte[] cachedPdfBytes = Files.readAllBytes(workingPath); - -// Line 5066: Get uploaded file bytes -byte[] pdfBytes = file.getBytes(); -``` - -**Clarification:** -- These are in DIFFERENT methods (not double-reads within one operation) -- Each method reads the file once -- Still synchronous blocking I/O - -**Impact:** -- Large uploads (100MB) block request thread for seconds -- No async or streaming support -- Thread pool saturation under high upload volume - -**Recommendation:** -```java -// Async file I/O -CompletableFuture uploadFuture = CompletableFuture.supplyAsync( - () -> { - Path tempPath = Files.createTempFile("pdf-upload", ".pdf"); - file.transferTo(tempPath.toFile()); - return tempPath; - }, - fileIoExecutor -); - -// Stream large files -try (InputStream in = file.getInputStream(); - OutputStream out = Files.newOutputStream(targetPath)) { - in.transferTo(out); -} -``` - ---- - -## LOW SEVERITY ISSUES - -### Issue #9: PdfLazyLoadingService Unused - LOW - -**Location:** `PdfLazyLoadingService.java` (entire file) - -**Severity:** LOW (Code Quality) - -**Type:** Dead Code - -**Verified:** ✅ TRUE - -**Description:** -- Complete service implementation exists -- Has its own `documentCache` and cleanup logic -- Duplicates functionality in `PdfJsonConversionService` -- Not wired to any controller -- Not imported by any other class - -**Impact:** -- Code maintenance burden -- Confusing for developers -- Potential for accidental use in future -- Cache divergence if both ever get used - -**Recommendation:** -```java -// Delete PdfLazyLoadingService.java entirely -// Or clearly mark as @Deprecated with explanation -``` - ---- - -### Issue #10: PdfJsonFontService Volatile Fields - LOW - -**Location:** `PdfJsonFontService.java:46-47` - -**Severity:** LOW (Actually Correct) - -**Type:** 
None (Good Practice) - -**Verified:** ✅ TRUE (No issue, correctly implemented) - -**Description:** -```java -private volatile boolean pythonCffConverterAvailable; -private volatile boolean fontForgeCffConverterAvailable; - -@PostConstruct -private void initialiseCffConverterAvailability() { - pythonCffConverterAvailable = isCommandAvailable(pythonCommand); - fontForgeCffConverterAvailable = isCommandAvailable(fontforgeCommand); -} -``` - -**Why This Is Correct:** -- `volatile` ensures visibility across threads -- Set once at startup -- Read many times (thread-safe) -- No synchronization needed - -**Recommendation:** None - this is good practice. - ---- - -## VERIFIED FALSE CLAIMS - -### Claim: file.getBytes() Called Twice - -**Status:** ❌ FALSE - -**Explanation:** The claim stated that `file.getBytes()` is called twice (lines 446, 5065). Investigation shows: -- Line 446: `convertJsonToPdf` method -- Line 5065: `extractDocumentMetadata` method -- These are DIFFERENT methods for DIFFERENT operations -- Each method calls `getBytes()` only once - -**Conclusion:** Not a double-read issue. - ---- - -### Claim: Image Base64 Encoding Per Call - -**Status:** ❌ FALSE - -**Explanation:** The claim stated images are Base64-encoded on every call. Investigation shows: -```java -// PdfJsonImageService.java:430-450 -private EncodedImage getOrEncodeImage(PDImage pdImage) { - COSBase key = xObject.getCOSObject(); - EncodedImage cached = imageCache.get(key); // Cache check! - if (cached != null) { - return cached; // Cache hit - } - EncodedImage encoded = encodeImage(pdImage); - imageCache.put(key, encoded); // Cache miss, encode and store - return encoded; -} -``` - -**Conclusion:** Images ARE cached. Only stencil and inline images bypass cache. 
- ---- - -## ARCHITECTURE ISSUES - -### Issue #11: Singleton Service Architecture - MEDIUM - -**Location:** All `@Service` and `@Component` classes - -**Severity:** MEDIUM (Maintainability) - -**Type:** Architectural Pattern - -**Description:** -All services use default singleton scope: -```java -@Service // Defaults to singleton -public class PdfJsonConversionService { - // Shared instance variables across all requests - private final Map type3NormalizedFontCache = ...; -} -``` - -**Implications:** -✅ **Good:** -- Most dependencies are stateless and injected -- Caches use ConcurrentHashMap (thread-safe) -- No mutable instance variables beyond caches - -⚠️ **Risks:** -- Singleton means shared state across all requests -- Requires careful synchronization -- Easy for future developers to introduce thread-unsafe code -- Difficult to test concurrent scenarios - -**Recommendation:** -- Document thread-safety requirements prominently -- Add unit tests for concurrent access -- Consider request-scoped services for mutable state -- Code review checklist for new instance variables - ---- - -## SUMMARY BY SEVERITY - -### CRITICAL (Fix Immediately) -1. **pageFontResources PDFont keys** - Broken feature -2. **Unbounded thread creation** - Resource exhaustion - -### HIGH (Fix Soon) -3. **Unbounded cache growth** - Memory leak -4. **Type3 cache race** - Duplicate work -5. **PDDocument lifecycle** - Needs investigation (speculative) - -### MEDIUM (Plan and Address) -6. **Full PDF reload per page** - Performance -7. **Large Base64 operations** - Performance -8. **File I/O blocking** - Performance -9. **PdfLazyLoadingService unused** - Dead code -10. **Singleton architecture** - Maintainability - -### LOW (Monitor) -11. **PdfJsonFontService volatile** - Correctly implemented (no action needed) - -### VERIFIED FALSE -12. file.getBytes() called twice -13. Image Base64 encoding per call - ---- - -## IMPLEMENTATION ROADMAP - -### Phase 1: Critical Fixes (1-2 weeks) - -**1. 
Fix pageFontResources PDFont keys (Issue #1)** -```java -// Priority: CRITICAL -// Time: 3-5 days -// Risk: Medium (requires careful testing) - -// Replace PDFont keys with String resource names -// Update cache lookup logic -``` - -**2. Fix Thread Leaks (Issue #2)** -```java -// Priority: CRITICAL -// Time: 1 day -// Risk: Low (well-understood solution) - -// Replace new Thread() with ScheduledExecutorService -// Add @PreDestroy cleanup -// Monitor thread counts -``` - -### Phase 2: Resource Management (1 week) - -**3. Add Cache Eviction (Issue #3)** -```java -// Priority: HIGH -// Time: 3 days -// Risk: Low (library-based solution) - -// Integrate Caffeine cache -// Set size limits, TTL -// Add eviction logging -// Monitor cache metrics -``` - -**4. Fix Type3 Cache Race (Issue #4)** -```java -// Priority: HIGH -// Time: 1-2 days -// Risk: Low (straightforward fix) - -// Use computeIfAbsent for atomic operations -``` - -### Phase 3: Performance Optimization (2-3 weeks) - -**5. Optimize Lazy Loading (Issue #6)** -```java -// Priority: MEDIUM -// Time: 1 week -// Risk: Medium (requires benchmarking) - -// Cache extracted page data -// Or: Keep PDDocument open with RandomAccessFile -// Or: Pre-split pages at upload -``` - -**6. Async I/O (Issues #7, #8)** -```java -// Priority: MEDIUM -// Time: 3-5 days -// Risk: Medium (requires async architecture changes) - -// Add dedicated I/O thread pool -// Async file operations -// Stream large files -``` - -### Phase 4: Code Quality (1 week) - -**7. Remove Dead Code (Issue #9)** -```java -// Priority: LOW -// Time: 1 day -// Risk: None - -// Delete PdfLazyLoadingService -// Clean up unused imports -``` - -**8. Documentation & Testing** -```java -// Priority: MEDIUM -// Time: 3-5 days - -// Add thread-safety documentation -// Concurrent integration tests -// Load testing scripts -``` - ---- - -## TESTING STRATEGY - -### 1. 
Concurrency Tests - -```java -@SpringBootTest -@TestInstance(TestInstance.Lifecycle.PER_CLASS) -class ConcurrencyTest { - - @Test - void testConcurrentCacheAccess() throws Exception { - ExecutorService executor = Executors.newFixedThreadPool(20); - CountDownLatch latch = new CountDownLatch(100); - List<Future<?>> futures = new ArrayList<>(); - - // 100 requests across 10 jobIds (10 requests per job) - for (int i = 0; i < 100; i++) { - String jobId = "job-" + (i % 10); - int pageNum = (i % 5) + 1; - - futures.add(executor.submit(() -> { - try { - service.extractSinglePage(jobId, pageNum); - } catch (Exception e) { - log.error("Concurrent access failed", e); - throw e; - } finally { - latch.countDown(); - } - })); - } - - // Wait for completion - assertTrue(latch.await(60, TimeUnit.SECONDS)); - - // Check for exceptions - for (Future<?> future : futures) { - future.get(); // Throws if any task failed - } - } - - @Test - void testCacheNotCorrupted() throws Exception { - // Upload document - String jobId = "test-job"; - service.extractDocumentMetadata(testPdf, jobId); - - // Concurrent page requests - ExecutorService executor = Executors.newFixedThreadPool(10); - List<Future<?>> futures = new ArrayList<>(); - - for (int i = 0; i < 50; i++) { - int page = (i % 10) + 1; - futures.add(executor.submit(() -> - service.extractSinglePage(jobId, page))); - } - - // All should succeed without ConcurrentModificationException - for (Future<?> future : futures) { - assertNotNull(future.get()); - } - } -} -``` - -### 2. 
Memory Leak Tests - -```java -@Test -void testCacheDoesNotGrowUnbounded() throws Exception { - long initialHeap = getHeapUsage(); - - // Process 10,000 small PDFs with Type3 fonts - for (int i = 0; i < 10000; i++) { - service.convertPdfToJson(createTestPdfWithType3Fonts()); - } - - // Force GC - System.gc(); - Thread.sleep(1000); - - long finalHeap = getHeapUsage(); - long growth = finalHeap - initialHeap; - - // Cache should not grow beyond reasonable limit - assertThat(growth).isLessThan(100_000_000); // 100MB max -} - -@Test -void testThreadsNotLeaking() { - int initialThreads = getActiveThreadCount(); - - // Upload 100 PDFs (spawns 100 cleanup threads) - for (int i = 0; i < 100; i++) { - service.extractDocumentMetadata(testPdf, "job-" + i); - } - - int peakThreads = getActiveThreadCount(); - - // Should not create 100+ threads - assertThat(peakThreads - initialThreads).isLessThan(10); -} - -private long getHeapUsage() { - Runtime runtime = Runtime.getRuntime(); - return runtime.totalMemory() - runtime.freeMemory(); -} - -private int getActiveThreadCount() { - return Thread.getAllStackTraces().size(); -} -``` - -### 3. Security Tests - -```java -@Test -void testJobIdIsolation() { - // User A uploads PDF - String jobIdA = service.extractDocumentMetadata(userAPdf, sessionA); - - // User B tries to access User A's jobId - assertThrows(AccessDeniedException.class, () -> { - service.extractSinglePage(jobIdA, 1, sessionB); - }); -} - -@Test -void testJobIdUnpredictable() { - Set<String> jobIds = new HashSet<>(); - - for (int i = 0; i < 1000; i++) { - String jobId = service.extractDocumentMetadata(testPdf, session); - jobIds.add(jobId); - } - - // All jobIds should be unique UUIDs - assertThat(jobIds).hasSize(1000); - - // Should not be sequential - List<String> sorted = new ArrayList<>(jobIds); - Collections.sort(sorted); - assertThat(sorted).isNotEqualTo(new ArrayList<>(jobIds)); -} -``` - -### 4. 
Performance Tests - -```java -@Test -void testLargeFilePerformance() { - // 100MB PDF - byte[] largePdf = createLargePdf(100 * 1024 * 1024); - - long start = System.currentTimeMillis(); - String json = service.convertPdfToJson(largePdf); - long duration = System.currentTimeMillis() - start; - - // Should complete in reasonable time - assertThat(duration).isLessThan(30_000); // 30 seconds -} - -@Test -void testConcurrentThroughput() throws Exception { - ExecutorService executor = Executors.newFixedThreadPool(50); - CountDownLatch latch = new CountDownLatch(500); - - long start = System.currentTimeMillis(); - - for (int i = 0; i < 500; i++) { - executor.submit(() -> { - try { - service.convertPdfToJson(testPdf); - } finally { - latch.countDown(); - } - }); - } - - latch.await(); - long duration = System.currentTimeMillis() - start; - - // 500 conversions should complete in reasonable time - double throughput = 500.0 / (duration / 1000.0); - assertThat(throughput).isGreaterThan(10); // At least 10 conversions/sec -} -``` - ---- - -## MONITORING & METRICS - -### Recommended Metrics (Micrometer) - -```java -@Service -public class PdfJsonConversionService { - - private final MeterRegistry meterRegistry; - - // Cache size gauges - @PostConstruct - void registerMetrics() { - Gauge.builder("pdf.cache.document.size", documentCache, Map::size) - .description("Number of cached documents") - .register(meterRegistry); - - Gauge.builder("pdf.cache.type3font.size", type3NormalizedFontCache, Map::size) - .description("Number of cached Type3 fonts") - .register(meterRegistry); - - Gauge.builder("pdf.cache.coverage.size", type3GlyphCoverageCache, Map::size) - .description("Number of cached glyph coverage sets") - .register(meterRegistry); - - Gauge.builder("pdf.threads.cleanup", this::getCleanupThreadCount) - .description("Active cleanup threads") - .register(meterRegistry); - } - - // Operation timers - public String convertPdfToJson(byte[] pdfBytes) { - Timer.Sample sample = 
Timer.start(meterRegistry); - try { - String result = doConvertPdfToJson(pdfBytes); - sample.stop(meterRegistry.timer("pdf.convert.toJson")); - return result; - } catch (Exception e) { - meterRegistry.counter("pdf.convert.errors", "operation", "toJson").increment(); - throw e; - } - } - - // Cache hit/miss counters - private PDFont getCachedType3Font(String uid) { - PDFont cached = type3NormalizedFontCache.get(uid); - if (cached != null) { - meterRegistry.counter("pdf.cache.type3font.hits").increment(); - } else { - meterRegistry.counter("pdf.cache.type3font.misses").increment(); - } - return cached; - } -} -``` - -### Alerts - -```yaml -alerts: - # Cache growth - - name: DocumentCacheTooLarge - condition: pdf_cache_document_size > 100 - severity: warning - - - name: Type3CacheTooLarge - condition: pdf_cache_type3font_size > 1000 - severity: warning - - # Thread leaks - - name: TooManyCleanupThreads - condition: pdf_threads_cleanup > 10 - severity: critical - - # Memory pressure - - name: HeapUsageHigh - condition: jvm_memory_used_bytes / jvm_memory_max_bytes > 0.8 - severity: warning - - # Performance - - name: SlowConversions - condition: pdf_convert_toJson{quantile="0.95"} > 10s - severity: warning - - # Error rate - - name: HighErrorRate - condition: rate(pdf_convert_errors[5m]) > 0.1 - severity: critical -``` - ---- - -## CONCLUSION - -The PDF JSON editor has **CRITICAL** issues that must be fixed before production deployment: - -### Must-Fix Issues (Blocks Production): -1. **pageFontResources broken** - Lazy page loading completely broken (PDFont instances as keys) -2. **Thread leaks** - Unbounded thread creation causes OutOfMemoryError - -### Should-Fix Issues (Prevents Scale): -3. **Unbounded cache growth** - Memory leaks require server restarts -4. **Type3 cache races** - Wasted CPU doing duplicate work -5. **PDDocument lifecycle** - Needs investigation (no evidence of actual problems yet) - -### Performance Improvements (Nice-to-Have): -6. 
Full PDF reload per page -7. Large Base64 operations -8. Synchronous file I/O - -### Code Quality Issues: -9. PdfLazyLoadingService dead code -10. Documentation of thread-safety requirements - -**Estimated Effort:** -- Critical fixes: 1-2 weeks (Issues #1, #2) -- High priority: 1 week (Issues #3, #4) -- Performance: 2-3 weeks (Issues #6-8) -- **Total:** 4-6 weeks for the critical, high priority, and performance work above (code quality items 9-10 are not included in this estimate) - -**Recommendation:** Fix critical issues (#1, #2) immediately, then address high priority issues before beta testing. diff --git a/docs/pdf_json_type3_fonts.md b/docs/pdf_json_type3_fonts.md deleted file mode 100644 index a35272a41..000000000 --- a/docs/pdf_json_type3_fonts.md +++ /dev/null @@ -1,443 +0,0 @@ -# PDF JSON Type3 Font System - -## Overview - -The PDF Text editor needs to handle **Type3 fonts** - custom vector fonts embedded in PDFs that don't follow standard font formats. These are common in PDFs generated by Matplotlib, LaTeX, scientific papers, and presentation tools. - -When converting a PDF to JSON for editing, Type3 fonts present two challenges: -1. **No Unicode mapping** - Character codes don't map to standard Unicode characters -2. **Custom glyphs** - Each font contains vector drawing instructions unique to that PDF - -This document explains how the system handles Type3 fonts during the full PDF → JSON → PDF workflow. - ---- - -## Architecture Flow - -### PDF → JSON Conversion Flow - -``` -┌─────────────┐ -│ Input PDF │ -└──────┬──────┘ - │ - ▼ -┌─────────────────────────────────┐ -│ PDFBox Parsing │ -│ - Extract text positions │ -│ - Identify fonts │ -└──────┬──────────────────────────┘ - │ - ▼ -┌─────────────────────────────────┐ -│ Font Detection │ -│ Is this a Type3 font? 
│ -└──────┬──────────────────────────┘ - │ - ├─── YES (Type3) ───────────────────────┐ - │ │ - ▼ ▼ -┌──────────────────────────┐ ┌──────────────────────────────┐ -│ Type3FontConversion │ │ Extract Type3 Metadata │ -│ Service │ │ - Glyph outlines (paths) │ -│ │ │ - Character codes │ -│ 1. 
Calculate signature │ │ - Font matrix │ -│ 2. Match against │ │ - Bounding boxes │ -│ library │ └──────────────────────────────┘ -└──────┬───────────────────┘ - │ - ▼ -┌──────────────────────────────────────┐ -│ Library Match? │ -└──────┬───────────────────────────────┘ - │ - ├─── FOUND ─────────────────────┐ - │ │ - │ ▼ - │ ┌─────────────────────────────┐ - │ │ Load Pre-built Font │ - │ │ - TTF/OTF from library │ - │ │ - Full Unicode mappings │ - │ │ - Web + PDF payloads │ - │ └──────────┬──────────────────┘ - │ │ - ├─── NOT FOUND ──────────────────┘ - │ - ▼ -┌──────────────────────────────────────┐ -│ Store Type3 Metadata in JSON │ -│ - type3Glyphs: [{charCode, unicode, │ -│ glyphName, outline}] │ -│ - Original char codes preserved │ -│ - Font marked as Type3 │ -└──────┬───────────────────────────────┘ - │ - ▼ -┌─────────────────────────────────────┐ -│ PdfJsonDocument Output │ -│ { │ -│ fonts: [{ │ -│ id: "F1", │ -│ baseName: "ABCD+DejaVuSans", │ -│ subtype: "Type3", │ -│ type3Glyphs: [...], │ -│ conversionCandidates: [{ │ -│ strategyId: "type3-library", │ -│ status: "SUCCESS", │ -│ pdfProgram: "base64...", │ -│ glyphCoverage: [65,66,67...] │ -│ }] │ -│ }], │ -│ textElements: [{ │ -│ text: "Hello", │ -│ fontId: "F1", │ -│ charCodes: [72,101,108,108,111]│ -│ }] │ -│ } │ -└─────────────────────────────────────┘ -``` - -### JSON → PDF Conversion Flow - -``` -┌─────────────────────────────────────┐ -│ Input JSON (edited by user) │ -└──────┬──────────────────────────────┘ - │ - ▼ -┌─────────────────────────────────────┐ -│ Load Fonts from JSON │ -│ - Check for conversionCandidates │ -│ - Check for type3Glyphs │ -└──────┬──────────────────────────────┘ - │ - ▼ -┌─────────────────────────────────────┐ -│ Has conversionCandidates? │ -└──────┬──────────────────────────────┘ - │ - ├─── YES (Library Match) ───────┐ - │ │ - │ ▼ - │ ┌─────────────────────────────┐ - │ │ Load from Candidate │ - │ │ 1. Decode base64 pdfProgram│ - │ │ 2. Create PDType0Font │ - │ │ 3. 
Embed in new PDF │ - │ └──────────┬──────────────────┘ - │ │ - ├─── NO (Use Type3 Metadata) ───┘ - │ - ▼ -┌──────────────────────────────────────┐ -│ Text Rendering Strategy │ -│ - Normalized Type3 fonts: │ -│ Use original text (font has │ -│ Unicode mappings) │ -│ │ -│ - Actual Type3 fonts: │ -│ Use charCodes array │ -│ │ -│ - Other fonts: │ -│ Standard encoding │ -└──────┬───────────────────────────────┘ - │ - ▼ -┌─────────────────────────────────────┐ -│ Generate PDF Content Streams │ -│ - Set font: /F1 12 Tf │ -│ - Position text: x y Td │ -│ - Show text: (encoded) Tj │ -└──────┬──────────────────────────────┘ - │ - ▼ -┌─────────────────────────────────────┐ -│ Output PDF │ -│ - Fonts embedded correctly │ -│ - Text renders with proper glyphs │ -│ - Preserves visual appearance │ -└─────────────────────────────────────┘ -``` - ---- - -## Key Components - -### 1. Type3 Font Signature Matching - -**Location:** `Type3FontSignatureCalculator.java` - -**Purpose:** Creates a unique fingerprint of a Type3 font based on its glyph shapes. - -**How it works:** -```java -// 1. Extract glyph outlines from Type3 font -List glyphs = extractor.extractGlyphs(document, font, fontId, pageNumber); - -// 2. Normalize and hash the shapes -String signature = calculator.calculateSignature(glyphs); -// Result: "sha256:2be58b6ef1e29a83b8634d70b9e32c37a15dea2e..." - -// 3. Look up in library -Optional match = library.findBySignature(signature); -``` - -**Signature includes:** -- Glyph outline paths (curves, lines) -- Glyph bounding boxes -- Advance widths -- Character code mappings - -### 2. 
Type3 Font Library - -**Location:** `app/core/src/main/resources/type3/library/` - -**Structure:** -``` -type3/library/ -├── index.json # Font metadata and signatures -├── catalogue.json # Quick lookup of common fonts -└── fonts/ - ├── dejavu/ - │ ├── DejaVuSans.ttf - │ ├── DejaVuSans-Bold.ttf - │ └── DejaVuSans-Oblique.ttf - ├── cm/ # Computer Modern (LaTeX) - │ ├── cmr10.ttf - │ ├── cmmi10.ttf - │ └── cmsy10.ttf - ├── stix/ # Scientific symbols - │ └── STIXSizeThreeSym-Regular.otf - └── scp/ # Monospace - └── SauceCodeProNerdFont-Regular.ttf -``` - -**index.json format:** -```json -[ - { - "id": "dejavu-sans-bold", - "label": "DejaVu Sans Bold", - "aliases": [ - "DejaVuSans-Bold", - "EVICAO+DejaVuSans-Bold", - "dejavusans-bold" - ], - "signatures": [ - "sha256:a1b2c3d4...", - "sha256:e5f6g7h8..." - ], - "pdfProgram": { - "resource": "type3/library/fonts/dejavu/DejaVuSans-Bold.ttf", - "format": "ttf" - }, - "glyphCoverage": [32, 33, 65, 66, 67, ...], - "source": "DejaVu Fonts 2.37" - } -] -``` - -### 3. 
Normalized vs Actual Type3 Fonts - -The system handles two types of Type3 fonts differently: - -**Normalized Type3 Fonts:** -- Original PDF has Type3 font -- Matched against library -- Replaced with standard TTF/OTF -- Font object is `PDType0Font` (not `PDType3Font`) -- Has proper Unicode mappings - -**Actual Type3 Fonts:** -- Original PDF has Type3 font -- No library match found -- Keeps Type3 glyph data in JSON -- Font object is `PDType3Font` -- Uses character codes instead of Unicode - -**Rendering logic (PdfJsonConversionService.java:2411-2463):** -```java -boolean isNormalizedType3 = !(run.font() instanceof PDType3Font) - && runFontModel != null - && runFontModel.getType3Glyphs() != null - && !runFontModel.getType3Glyphs().isEmpty(); - -if (isNormalizedType3) { - // Font has Unicode mappings, use text directly - contentStream.showText(run.text()); -} else { - // Use raw byte encoding (for Type3 or other fonts) - byte[] encoded = encodeTextWithFont(run.font(), fontModel, run.text(), charCodes); - contentStream.showText(new String(encoded, StandardCharsets.ISO_8859_1)); -} -``` - -### 4. Character Code Preservation - -**Why needed:** Type3 fonts often lack ToUnicode mappings. We preserve the original character codes so text can be reconstructed. - -**Storage in JSON:** -```json -{ - "text": "Hello", - "fontId": "F1", - "charCodes": [72, 101, 108, 108, 111] -} -``` - -**Extraction (PDF → JSON):** -```java -// TextCollectingStripper.java:4431-4443 -if (pdfont instanceof PDType3Font) { - int[] codes = position.getCharacterCodes(); - if (codes != null && codes.length > 0) { - element.setCharCodes(Arrays.stream(codes) - .boxed() - .collect(Collectors.toList())); - } -} -``` - -### 5. 
Font Embedding Strategies - -When converting JSON → PDF, fonts are embedded based on their type: - -| Font Type | Strategy | Implementation | -|-----------|----------|----------------| -| **Normalized Type3** | Load TTF/OTF from library, embed as PDType0Font | `conversionCandidates[0].pdfProgram` | -| **Standard fonts** | Use system fonts or embedded fonts from original | PDFBox standard loading | -| **CFF/Type1C fonts** | Wrap as OpenType-CFF for browser compatibility | Optional Python converter | -| **Actual Type3** | Keep original Type3 definition | Preserve from original PDF | - ---- - -## Configuration - -**settings.yml:** -```yaml -processing: - pdf-json: - fonts: - type3: - library: - enabled: true - index: classpath:/type3/library/index.json -``` - -**Environment variables:** -```bash -# Disable Type3 library matching -STIRLING_PDF_JSON_TYPE3_LIBRARY_ENABLED=false - -# Use custom library -STIRLING_PDF_JSON_TYPE3_LIBRARY_INDEX=file:/path/to/custom/index.json -``` - ---- - -## Debugging - -### View Type3 Font Information - -**Backend logs** show signature matching: -``` -[TYPE3] Strategy type3-library finished with status SUCCESS - (message: Matched DejaVu Sans Bold via alias:dejavusans-bold) - for font 1:F2 - -[TYPE3-RUNTIME] Loading library font F2 WITHOUT subsetting - (full glyph set) from candidate:type3-library:pdfProgram -``` - -### Check JSON Output - -Look for `type3Glyphs` in font definitions: -```json -{ - "id": "F1", - "baseName": "BMQQDV+DejaVuSans", - "subtype": "Type3", - "type3Glyphs": [ - { - "charCode": 65, - "glyphName": "A", - "unicode": 65, - "advanceWidth": 684, - "bbox": [0, 0, 684, 729], - "outline": "M 72 0 L ..." - } - ] -} -``` - -### Test Signature Calculation - -Use the CLI tool to analyze any PDF: -```bash -./gradlew :proprietary:type3SignatureTool \ - --args="--pdf sample.pdf --output analysis.json --pretty" -``` - -Output shows all Type3 fonts with their signatures and glyph coverage. 
- ---- - -## Performance Considerations - -### Memory Management - -- **Type3 glyph data** can be large (detailed vector paths) -- **Font subsetting** not used for library fonts (full glyph set loaded) -- **Caching:** Library fonts are loaded once and cached per conversion - -### File Size Impact - -- **JSON size:** Type3 glyph data adds ~5-50KB per font -- **PDF size:** Embedding TTF/OTF fonts adds ~50-500KB per font -- **Optimization:** Use library matching to avoid storing raw Type3 data - ---- - -## Limitations - -1. **Library coverage:** Only common Type3 fonts are in the library - - Matplotlib (DejaVu Sans family) - - LaTeX (Computer Modern) - - STIX Math symbols - -2. **Glyph accuracy:** Signature matching assumes exact glyph shapes - - Slight variations may not match - - Subset fonts may have different signatures - -3. **Unicode mapping:** Unmatched Type3 fonts lose Unicode text - - Character codes preserved but not searchable - - Copy/paste may not work correctly - -4. **No runtime synthesis:** Unlike earlier designs, no attempt to convert Type3 to TTF at runtime - - All conversions must be pre-built in library - - Unknown Type3 fonts keep their Type3 definition - ---- - -## Related Files - -### Backend (Java) -- `PdfJsonConversionService.java` - Main conversion logic -- `Type3FontConversionService.java` - Signature calculation and matching -- `Type3FontLibrary.java` - Library loading and lookup -- `Type3GlyphExtractor.java` - Extract glyph data from Type3 fonts -- `Type3FontSignatureCalculator.java` - Create font fingerprints -- `PdfJsonFontType3Glyph.java` - Model for Type3 glyph data - -### Frontend (TypeScript) -- `pdfTextEditorTypes.ts` - Type definitions for JSON structure -- `pdfTextEditorUtils.ts` - Font handling utilities - -### Resources -- `type3/library/index.json` - Font library metadata -- `type3/library/fonts/` - Actual font files (TTF/OTF) -- `settings.yml.template` - Configuration options - -### Documentation -- `pdf_text_edit_flow.md` - 
Overall text editing architecture -- `type3_fallback_plan.md` - Original design and planning diff --git a/docs/pdf_json_type3_library_management.md b/docs/pdf_json_type3_library_management.md deleted file mode 100644 index 56e13b055..000000000 --- a/docs/pdf_json_type3_library_management.md +++ /dev/null @@ -1,660 +0,0 @@ -# Type3 Font Library Management Guide - -## Overview - -This guide explains how to add new fonts to the Type3 font library, including: -1. Finding Type3 fonts in PDFs -2. Extracting and analyzing Type3 fonts -3. Adding pre-converted TTF/OTF fonts to the library -4. Importing fonts with example PDFs - ---- - -## Two Approaches to Adding Fonts - -### Approach 1: Import Existing TTF/OTF Files - -**When to use:** You already have a TTF/OTF file that matches a Type3 font you encounter. - -**Example:** You find a PDF with a Type3 font that's actually DejaVu Sans, and you have the official DejaVu Sans TTF file. - -### Approach 2: Extract from Type3 PDFs - -**When to use:** The Type3 font is unique to a PDF (no standard equivalent exists). - -**Example:** Custom corporate fonts, specialized scientific notation, or generated fonts. - ---- - -## Approach 1: Import Existing TTF/OTF Files - -This is the **recommended approach** when possible, as it gives you the full font with proper Unicode mappings. 
- -### Step 1: Find the Type3 Font Signature - -Use the signature tool to analyze a PDF containing the Type3 font: - -```bash -./gradlew :proprietary:type3SignatureTool \ - --args="--pdf examples/matplotlib_chart.pdf --output matplotlib_fonts.json --pretty" -``` - -**Output (`matplotlib_fonts.json`):** -```json -{ - "fonts": [ - { - "fontId": "F1", - "pageNumber": 1, - "baseName": "BMQQDV+DejaVuSans", - "subtype": "Type3", - "signature": "sha256:2be58b6ef1e29a83b8634d70b9e32c37a15dea2e608894439ef7224c35b77f5d", - "glyphCount": 50, - "glyphs": [ - { - "charCode": 65, - "glyphName": "A", - "unicode": 65, - "advanceWidth": 684, - "bbox": [0, 0, 684, 729] - } - ] - } - ] -} -``` - -**Key information:** -- `signature`: Unique fingerprint of this font -- `baseName`: Font name in the PDF (often subset like "BMQQDV+DejaVuSans") -- `glyphs`: Character codes and Unicode mappings - -### Step 2: Obtain the Matching TTF/OTF File - -**Sources:** -1. **System fonts:** Check if the font is already installed - - Windows: `C:\Windows\Fonts\` - - Linux: `/usr/share/fonts/` - - macOS: `/Library/Fonts/` - -2. **Official font websites:** - - [DejaVu Fonts](https://dejavu-fonts.github.io/) - - [Computer Modern Unicode](https://www.fontsquirrel.com/fonts/computer-modern) - - [STIX Fonts](https://www.stixfonts.org/) - - [Google Fonts](https://fonts.google.com/) - -3. 
**Font foundries:** If it's a commercial font, you'll need a license - -**Example - Getting DejaVu Sans:** -```bash -# Download from official source -wget https://github.com/dejavu-fonts/dejavu-fonts/releases/download/version_2_37/dejavu-fonts-ttf-2.37.tar.bz2 - -# Extract -tar xjf dejavu-fonts-ttf-2.37.tar.bz2 - -# Copy the needed font -cp dejavu-fonts-ttf-2.37/ttf/DejaVuSans.ttf app/core/src/main/resources/type3/library/fonts/dejavu/ -``` - -### Step 3: Add Entry to Library Index - -Edit `app/core/src/main/resources/type3/library/index.json`: - -```json -[ - { - "id": "dejavu-sans-regular", - "label": "DejaVu Sans", - "aliases": [ - "DejaVuSans", - "BMQQDV+DejaVuSans", - "DejaVuSansDisplay", - "dejavusans" - ], - "signatures": [ - "sha256:2be58b6ef1e29a83b8634d70b9e32c37a15dea2e608894439ef7224c35b77f5d" - ], - "pdfProgram": { - "resource": "type3/library/fonts/dejavu/DejaVuSans.ttf", - "format": "ttf" - }, - "webProgram": { - "resource": "type3/library/fonts/dejavu/DejaVuSans.ttf", - "format": "ttf" - }, - "glyphCoverage": [32, 33, 65, 66, 67, 68, 69, ...], - "source": "DejaVu Fonts 2.37" - } -] -``` - -**Field descriptions:** - -| Field | Required | Description | Example | -|-------|----------|-------------|---------| -| `id` | ✅ | Unique identifier (lowercase, hyphenated) | `"dejavu-sans-bold"` | -| `label` | ✅ | Human-readable name | `"DejaVu Sans Bold"` | -| `aliases` | ✅ | Font name variations to match | `["DejaVuSans-Bold", "EVICAO+DejaVuSans-Bold"]` | -| `signatures` | ✅ | SHA256 hashes from signature tool | `["sha256:a1b2c3..."]` | -| `pdfProgram` | ✅ | Font file for PDF embedding | See below | -| `webProgram` | ⚠️ | Font file for browser preview (can be same as pdfProgram) | See below | -| `glyphCoverage` | ❌ | Unicode code points covered (optional) | `[65, 66, 67]` | -| `source` | ❌ | Attribution/version info (optional) | `"DejaVu Fonts 2.37"` | - -**Program format:** -```json -"pdfProgram": { - "resource": "type3/library/fonts/dejavu/DejaVuSans.ttf", - 
"format": "ttf" -} -``` -- `resource`: Path relative to classpath root (or absolute file:// path) -- `format`: `"ttf"` or `"otf"` - -### Step 4: Add to Catalogue (Optional) - -Edit `app/core/src/main/resources/type3/catalogue.json`: - -```json -{ - "fonts": [ - { - "name": "DejaVuSans", - "source": "Matplotlib PDFs", - "variants": ["Regular", "Bold", "Oblique"], - "status": "complete" - } - ] -} -``` - -### Step 5: Test the Font - -1. **Rebuild the application:** - ```bash - ./gradlew clean build - ``` - -2. **Convert a PDF with the Type3 font:** - ```bash - curl -X POST http://localhost:8080/api/v1/convert/pdf/to-json \ - -F file=@test.pdf \ - -o output.json - ``` - -3. **Check the JSON output:** - ```bash - # Look for conversionCandidates - cat output.json | jq '.fonts[] | select(.baseName | contains("DejaVu"))' - ``` - -4. **Expected output:** - ```json - { - "id": "F1", - "baseName": "BMQQDV+DejaVuSans", - "subtype": "Type3", - "conversionCandidates": [ - { - "strategyId": "type3-library", - "status": "SUCCESS", - "message": "Matched DejaVu Sans via alias:dejavusans", - "pdfProgram": "T1RUTwAJAIAAAwAQQ0ZGIHHwJ9o...", - "pdfProgramFormat": "ttf" - } - ] - } - ``` - -5. **Test round-trip conversion:** - ```bash - # Convert back to PDF - curl -X POST http://localhost:8080/api/v1/convert/json/to-pdf \ - -F file=@output.json \ - -o roundtrip.pdf - - # Verify text renders correctly - pdftotext roundtrip.pdf - | head - ``` - ---- - -## Approach 2: Extract from Type3 PDFs - -When no standard TTF/OTF exists, you can convert the Type3 font itself. - -⚠️ **Note:** This produces a synthetic font that may not be perfect. Use Approach 1 whenever possible. 
- -### Step 1: Extract Type3 Metadata - -Same as Approach 1 - use the signature tool: - -```bash -./gradlew :proprietary:type3SignatureTool \ - --args="--pdf custom_font.pdf --output custom_font_analysis.json --pretty" -``` - -### Step 2: Convert Type3 to TTF Using Python Script - -Use the included conversion script: - -```bash -# Install fontTools if not already installed -pip install fonttools - -# Convert the Type3 font -python scripts/type3_to_cff.py \ - --input custom_font_analysis.json \ - --output-cff custom_font.otf \ - --output-ttf custom_font.ttf \ - --font-name "CustomFont" \ - --family-name "Custom Font Family" -``` - -**Script options:** -- `--input`: JSON file from signature tool -- `--output-cff`: OpenType-CFF output (best for PDF) -- `--output-ttf`: TrueType output (best for web) -- `--font-name`: PostScript name (no spaces) -- `--family-name`: Human-readable name - -### Step 3: Copy Font to Library - -```bash -# Create directory for custom fonts -mkdir -p app/core/src/main/resources/type3/library/fonts/custom/ - -# Copy the converted fonts -cp custom_font.otf app/core/src/main/resources/type3/library/fonts/custom/ -cp custom_font.ttf app/core/src/main/resources/type3/library/fonts/custom/ -``` - -### Step 4: Add to Library Index - -Same as Approach 1, but note the synthetic nature: - -```json -{ - "id": "custom-font-regular", - "label": "Custom Font", - "aliases": ["CustomFont", "ABCDEF+CustomFont"], - "signatures": ["sha256:extracted-from-json"], - "pdfProgram": { - "resource": "type3/library/fonts/custom/custom_font.otf", - "format": "otf" - }, - "webProgram": { - "resource": "type3/library/fonts/custom/custom_font.ttf", - "format": "ttf" - }, - "source": "Converted from custom_font.pdf", - "glyphCoverage": [65, 66, 67, 68, 69] -} -``` - -### Step 5: Quality Check - -Type3 → TTF conversion may have issues: - -1. **Check glyph rendering:** - - Open the TTF in a font viewer (FontForge, FontDrop) - - Verify all glyphs look correct - -2. 
**Check metrics:** - - Compare advance widths to original - - Verify bounding boxes - -3. **Test in PDF:** - - Create a test document using the font - - Verify it renders the same as the original PDF - ---- - -## Adding Fonts with Example PDFs - -### Scenario: You Want to Support Matplotlib PDFs - -Matplotlib uses DejaVu Sans fonts but embeds them as Type3 fonts with subset names. - -### Step 1: Collect Example PDFs - -```bash -# Create a samples directory -mkdir -p samples/matplotlib/ - -# Download or copy example PDFs -cp ~/Downloads/matplotlib_chart.pdf samples/matplotlib/01_chart.pdf -cp ~/Downloads/matplotlib_plot.pdf samples/matplotlib/02_plot.pdf -``` - -### Step 2: Batch Analyze Fonts - -```bash -# Analyze all samples -for pdf in samples/matplotlib/*.pdf; do - basename=$(basename "$pdf" .pdf) - ./gradlew :proprietary:type3SignatureTool \ - --args="--pdf $pdf --output analysis_${basename}.json --pretty" -done -``` - -### Step 3: Identify Common Fonts - -Use the summarize script to see what fonts appear: - -```bash -python scripts/summarize_type3_signatures.py \ - --signatures-dir . \ - --output summary.md -``` - -**Output (`summary.md`):** -```markdown -# Type3 Font Signature Inventory - -## Fonts by Signature - -### sha256:2be58b6ef1e... -- **Alias:** DejaVuSans, BMQQDV+DejaVuSans -- **Seen in:** 01_chart.pdf, 02_plot.pdf -- **Status:** ✅ In library (dejavu-sans-regular) - -### sha256:a1b2c3d4e5f6... -- **Alias:** DejaVuSans-Bold, EVICAO+DejaVuSans-Bold -- **Seen in:** 01_chart.pdf -- **Status:** ❌ Missing from library -``` - -### Step 4: Add Missing Fonts - -For each missing font: - -1. **Download the standard version:** - ```bash - # DejaVu Sans Bold - wget https://github.com/dejavu-fonts/dejavu-fonts/releases/download/version_2_37/dejavu-fonts-ttf-2.37.tar.bz2 - tar xjf dejavu-fonts-ttf-2.37.tar.bz2 - cp dejavu-fonts-ttf-2.37/ttf/DejaVuSans-Bold.ttf \ - app/core/src/main/resources/type3/library/fonts/dejavu/ - ``` - -2. 
**Add to index.json:** - ```json - { - "id": "dejavu-sans-bold", - "label": "DejaVu Sans Bold", - "aliases": [ - "DejaVuSans-Bold", - "EVICAO+DejaVuSans-Bold", - "dejavusans-bold" - ], - "signatures": [ - "sha256:a1b2c3d4e5f6..." - ], - "pdfProgram": { - "resource": "type3/library/fonts/dejavu/DejaVuSans-Bold.ttf", - "format": "ttf" - }, - "webProgram": { - "resource": "type3/library/fonts/dejavu/DejaVuSans-Bold.ttf", - "format": "ttf" - }, - "source": "DejaVu Fonts 2.37" - } - ``` - -### Step 5: Update Library Automatically - -Use the update script to sync signatures: - -```bash -python scripts/update_type3_library.py \ - --signatures-dir . \ - --apply -``` - -This script: -- Reads all `analysis_*.json` files -- Matches them to library entries by alias -- Updates signatures and glyph coverage -- Writes back to `index.json` - -### Step 6: Verify Coverage - -Test with all your example PDFs: - -```bash -for pdf in samples/matplotlib/*.pdf; do - echo "Testing $pdf..." - curl -X POST http://localhost:8080/api/v1/convert/pdf/to-json \ - -F file=@"$pdf" \ - -o test_output.json - - # Check for successful matches - matches=$(cat test_output.json | jq '[.fonts[].conversionCandidates[] | select(.status == "SUCCESS")] | length') - total=$(cat test_output.json | jq '.fonts | length') - echo " Matched $matches of $total fonts" -done -``` - ---- - -## Common Fonts to Add - -### Matplotlib (Python plotting library) - -**Fonts used:** DejaVu Sans family - -```bash -# Download DejaVu fonts -wget https://github.com/dejavu-fonts/dejavu-fonts/releases/download/version_2_37/dejavu-fonts-ttf-2.37.tar.bz2 -tar xjf dejavu-fonts-ttf-2.37.tar.bz2 - -# Copy needed variants -cp dejavu-fonts-ttf-2.37/ttf/DejaVuSans.ttf \ - app/core/src/main/resources/type3/library/fonts/dejavu/ -cp dejavu-fonts-ttf-2.37/ttf/DejaVuSans-Bold.ttf \ - app/core/src/main/resources/type3/library/fonts/dejavu/ -cp dejavu-fonts-ttf-2.37/ttf/DejaVuSans-Oblique.ttf \ - 
app/core/src/main/resources/type3/library/fonts/dejavu/ -cp dejavu-fonts-ttf-2.37/ttf/DejaVuSansMono.ttf \ - app/core/src/main/resources/type3/library/fonts/dejavu/ -``` - -### LaTeX Documents - -**Fonts used:** Computer Modern family - -```bash -# Download Computer Modern Unicode -wget https://downloads.sourceforge.net/project/cm-unicode/cm-unicode/0.7.0/cm-unicode-0.7.0-ttf.tar.xz -tar xJf cm-unicode-0.7.0-ttf.tar.xz - -# Copy common variants -cp cm-unicode-0.7.0/cmunrm.ttf \ - app/core/src/main/resources/type3/library/fonts/cm/cmr10.ttf -cp cm-unicode-0.7.0/cmunti.ttf \ - app/core/src/main/resources/type3/library/fonts/cm/cmmi10.ttf -cp cm-unicode-0.7.0/cmunsy.ttf \ - app/core/src/main/resources/type3/library/fonts/cm/cmsy10.ttf -``` - -### Scientific Documents - -**Fonts used:** STIX fonts - -```bash -# Download STIX -wget https://github.com/stipub/stixfonts/raw/master/fonts/static_otf/STIXTwoText-Regular.otf - -# Copy to library -cp STIXTwoText-Regular.otf \ - app/core/src/main/resources/type3/library/fonts/stix/ -``` - ---- - -## Troubleshooting - -### Signature Doesn't Match - -**Problem:** You added a font but PDFs still don't match. - -**Causes:** -1. **Different font version:** The Type3 font in the PDF uses a different version -2. **Subset differences:** Subset fonts may have different signatures -3. **Alias mismatch:** Font name in PDF doesn't match any alias - -**Solutions:** - -1. **Check the PDF's font name:** - ```bash - pdffonts sample.pdf - ``` - Output shows: `BMQQDV+DejaVuSans` - -2. **Add the subset name as an alias:** - ```json - "aliases": [ - "DejaVuSans", - "BMQQDV+DejaVuSans", // Add this - "dejavusans" - ] - ``` - -3. **Add multiple signatures:** - ```json - "signatures": [ - "sha256:original-signature", - "sha256:subset-signature" // Add from your PDF - ] - ``` - -### Font Renders Incorrectly - -**Problem:** Text appears but glyphs look wrong. - -**Causes:** -1. **Wrong font file:** You matched a different font -2. 
**Glyph coverage mismatch:** Font doesn't have all the glyphs -3. **Encoding issues:** Character codes don't match - -**Solutions:** - -1. **Verify it's the right font:** - - Open both the Type3 PDF and your TTF in a font viewer - - Compare glyph shapes visually - -2. **Check glyph coverage:** - ```bash - # Extract glyphs from Type3 - ./gradlew :proprietary:type3SignatureTool \ - --args="--pdf sample.pdf --output analysis.json" - - # Compare to font file - ttfdump -t cmap your_font.ttf | grep "glyphID" - ``` - -3. **Use a more complete font version:** - - Get the full font (not a subset) - - Ensure all Unicode ranges are covered - -### Missing Glyphs - -**Problem:** Some characters don't render. - -**Cause:** The TTF/OTF file doesn't have all the glyphs that the Type3 font had. - -**Solution:** You need a more complete font, or must use Approach 2 to convert the Type3 font itself. - ---- - -## Library Maintenance Scripts - -### `type3SignatureTool` - -**Purpose:** Analyze Type3 fonts in a PDF - -```bash -./gradlew :proprietary:type3SignatureTool \ - --args="--pdf INPUT.pdf --output OUTPUT.json [--pretty]" -``` - -### `summarize_type3_signatures.py` - -**Purpose:** Create an inventory of fonts across multiple PDFs - -```bash -python scripts/summarize_type3_signatures.py \ - --signatures-dir path/to/jsons/ \ - --output inventory.md -``` - -### `update_type3_library.py` - -**Purpose:** Sync signature JSON files with library index - -```bash -# Dry run (show what would change) -python scripts/update_type3_library.py \ - --signatures-dir analysis/ - -# Apply changes -python scripts/update_type3_library.py \ - --signatures-dir analysis/ \ - --apply -``` - -### `type3_to_cff.py` - -**Purpose:** Convert Type3 glyph JSON to TTF/OTF - -```bash -python scripts/type3_to_cff.py \ - --input type3_data.json \ - --output-cff font.otf \ - --output-ttf font.ttf \ - --font-name MyFont \ - --family-name "My Font Family" -``` - ---- - -## Best Practices - -1. 
**Always prefer official fonts** (Approach 1) over synthesized ones (Approach 2) -2. **Use multiple signatures** for fonts that appear in different PDFs with different subset names -3. **Test with real PDFs** before considering a font "done" -4. **Document the source** in the `source` field for attribution and versioning -5. **Keep example PDFs** for each font in case you need to debug later -6. **Version the library** - commit `index.json` changes with clear descriptions - ---- - -## File Size Considerations - -### Font File Sizes - -Typical sizes: -- **DejaVu Sans:** ~750KB per variant -- **Computer Modern:** ~200-400KB per variant -- **STIX Math:** ~500KB-1MB per variant - -### Library Size Management - -Current library: **~3MB** (10 fonts) - -To keep the library manageable: -1. **Only include common fonts** (used in >5% of PDFs you process) -2. **Don't include every variant** (e.g., skip "Ultra Light" if rarely used) -3. **Consider subsetting** for languages you don't support -4. **Use OTF/CFF over TTF** when possible (better compression for PDF) - ---- - -## Related Documentation - -- [PDF JSON Type3 Font System](pdf_json_type3_fonts.md) - Technical architecture -- [PDF Text Edit Flow](pdf_text_edit_flow.md) - Overall editing workflow -- [Type3 Fallback Plan](type3_fallback_plan.md) - Original design document diff --git a/docs/pdf_text_edit_flow.md b/docs/pdf_text_edit_flow.md deleted file mode 100644 index e9ccae243..000000000 --- a/docs/pdf_text_edit_flow.md +++ /dev/null @@ -1,50 +0,0 @@ -# PDF Text Edit Flow - -This high-level diagram shows every major component involved when a user edits text inside a PDF via the JSON editor. It highlights where fonts (especially Type3) are captured, matched against the library, and re-applied during export. 
- -```mermaid -flowchart TD - %% Upload & Extraction - A([Upload PDF]) --> B[PdfJsonConversionService] - B --> B1[Optional Ghostscript preflight] - B1 --> B2[Iterate pages & resources] - B2 --> B3[Extract text runs + fonts] - - %% Font handling (serial tree) - B3 --> C{Font subtype?} - C -->|Type 0 / TrueType / CID| C1[Copy embedded program bytes] - C -->|Type3| C2[Type3FontConversionService] - C1 --> C4[Attach font payload + metadata] - C2 --> C21{Library match?} - C21 --|Yes|--> C22[Inject canonical TTF/OTF from library] - C21 --|No|--> C23[Mark unsupported
& keep Type3 glyphs] - C2 --> C25[Record glyph charCodes + unicode mapping] - C22 --> C25 - C23 --> C25 - - %% JSON output - C4 --> D[Build PdfJsonDocument (pages, fonts, elements)] - C25 --> D - D --> E([Send JSON to UI]) - - %% Edit round-trip - E --> F[User edits text/elements] - F --> G[Patched JSON POSTed back] - G --> H{Regeneration pipeline} - H --> H1[Resolve fonts + candidates] - H1 --> H11[Prefer library/embedded payloads] - H1 --> H12[Fallback font service for missing glyphs] - H --> H2{Can rewrite token stream?} - H2 -->|Yes| H21[Rewrite existing operators] - H2 -->|No| H22[Full page regeneration] - H22 --> H23[Embed canonical fonts + Type3 glyph codes] - H21 --> I[Apply annotations/metadata] - H23 --> I - I --> J([Download edited PDF]) -``` - -**Key points** -- Type3 conversion happens entirely inside `Type3FontConversionService`. Matching entries pull canonical fonts from the library; when a signature is missing we simply keep the original Type3 glyph codes until a library entry is added. -- Raw Type3 char codes are preserved in `PdfJsonTextElement.charCodes` so edits can fall back to the original glyph sequence when users do not change the text. -- When the frontend submits changes, the backend preflights each text run, picks the proper font candidate (library > embedded > fallback), and rewrites the PDF with either token replacements or full page regeneration. -- Glyph coverage metadata from the Type3 library now informs which fonts can legitimately render new characters, so added text keeps using the original Type3 face whenever its coverage includes those code points. 
diff --git a/frontend/src/constants/convertConstants.ts b/frontend/src/constants/convertConstants.ts deleted file mode 100644 index a7e12c266..000000000 --- a/frontend/src/constants/convertConstants.ts +++ /dev/null @@ -1,158 +0,0 @@ - -export const COLOR_TYPES = { - COLOR: 'color', - GRAYSCALE: 'grayscale', - BLACK_WHITE: 'blackwhite' -} as const; - -export const OUTPUT_OPTIONS = { - SINGLE: 'single', - MULTIPLE: 'multiple' -} as const; - -export const FIT_OPTIONS = { - FIT_PAGE: 'fitDocumentToPage', - MAINTAIN_ASPECT: 'maintainAspectRatio', - FILL_PAGE: 'fillPage' -} as const; - - -export const CONVERSION_ENDPOINTS = { - 'office-pdf': '/api/v1/convert/file/pdf', - 'pdf-image': '/api/v1/convert/pdf/img', - 'image-pdf': '/api/v1/convert/img/pdf', - 'pdf-office-word': '/api/v1/convert/pdf/word', - 'pdf-office-presentation': '/api/v1/convert/pdf/presentation', - 'pdf-office-text': '/api/v1/convert/pdf/text', - 'pdf-csv': '/api/v1/convert/pdf/csv', - 'pdf-markdown': '/api/v1/convert/pdf/markdown', - 'pdf-html': '/api/v1/convert/pdf/html', - 'pdf-xml': '/api/v1/convert/pdf/xml', - 'pdf-pdfa': '/api/v1/convert/pdf/pdfa', - 'html-pdf': '/api/v1/convert/html/pdf', - 'markdown-pdf': '/api/v1/convert/markdown/pdf', - 'eml-pdf': '/api/v1/convert/eml/pdf', - 'pdf-json': '/api/v1/convert/pdf/json', - 'json-pdf': '/api/v1/convert/json/pdf' -} as const; - -export const ENDPOINT_NAMES = { - 'office-pdf': 'file-to-pdf', - 'pdf-image': 'pdf-to-img', - 'image-pdf': 'img-to-pdf', - 'pdf-office-word': 'pdf-to-word', - 'pdf-office-presentation': 'pdf-to-presentation', - 'pdf-office-text': 'pdf-to-text', - 'pdf-csv': 'pdf-to-csv', - 'pdf-markdown': 'pdf-to-markdown', - 'pdf-html': 'pdf-to-html', - 'pdf-xml': 'pdf-to-xml', - 'pdf-pdfa': 'pdf-to-pdfa', - 'html-pdf': 'html-to-pdf', - 'markdown-pdf': 'markdown-to-pdf', - 'eml-pdf': 'eml-to-pdf', - 'pdf-json': 'pdf-to-json', - 'json-pdf': 'json-to-pdf' -} as const; - - -// Grouped file extensions for dropdowns -export const 
FROM_FORMAT_OPTIONS = [ - { value: 'any', label: 'Any', group: 'Multiple Files' }, - { value: 'image', label: 'Images', group: 'Multiple Files' }, - { value: 'pdf', label: 'PDF', group: 'Document' }, - { value: 'docx', label: 'DOCX', group: 'Document' }, - { value: 'doc', label: 'DOC', group: 'Document' }, - { value: 'odt', label: 'ODT', group: 'Document' }, - { value: 'xlsx', label: 'XLSX', group: 'Spreadsheet' }, - { value: 'xls', label: 'XLS', group: 'Spreadsheet' }, - { value: 'ods', label: 'ODS', group: 'Spreadsheet' }, - { value: 'pptx', label: 'PPTX', group: 'Presentation' }, - { value: 'ppt', label: 'PPT', group: 'Presentation' }, - { value: 'odp', label: 'ODP', group: 'Presentation' }, - { value: 'jpg', label: 'JPG', group: 'Image' }, - { value: 'jpeg', label: 'JPEG', group: 'Image' }, - { value: 'png', label: 'PNG', group: 'Image' }, - { value: 'gif', label: 'GIF', group: 'Image' }, - { value: 'bmp', label: 'BMP', group: 'Image' }, - { value: 'tiff', label: 'TIFF', group: 'Image' }, - { value: 'webp', label: 'WEBP', group: 'Image' }, - { value: 'svg', label: 'SVG', group: 'Image' }, - { value: 'html', label: 'HTML', group: 'Web' }, - { value: 'zip', label: 'ZIP', group: 'Web' }, - { value: 'md', label: 'MD', group: 'Text' }, - { value: 'txt', label: 'TXT', group: 'Text' }, - { value: 'rtf', label: 'RTF', group: 'Text' }, - { value: 'eml', label: 'EML', group: 'Email' }, - { value: 'json', label: 'JSON', group: 'Data' }, -]; - -export const TO_FORMAT_OPTIONS = [ - { value: 'pdf', label: 'PDF', group: 'Document' }, - { value: 'pdfa', label: 'PDF/A', group: 'Document' }, - { value: 'docx', label: 'DOCX', group: 'Document' }, - { value: 'odt', label: 'ODT', group: 'Document' }, - { value: 'csv', label: 'CSV', group: 'Spreadsheet' }, - { value: 'pptx', label: 'PPTX', group: 'Presentation' }, - { value: 'odp', label: 'ODP', group: 'Presentation' }, - { value: 'txt', label: 'TXT', group: 'Text' }, - { value: 'rtf', label: 'RTF', group: 'Text' }, - { value: 'md', 
label: 'MD', group: 'Text' }, - { value: 'png', label: 'PNG', group: 'Image' }, - { value: 'jpg', label: 'JPG', group: 'Image' }, - { value: 'gif', label: 'GIF', group: 'Image' }, - { value: 'tiff', label: 'TIFF', group: 'Image' }, - { value: 'bmp', label: 'BMP', group: 'Image' }, - { value: 'webp', label: 'WEBP', group: 'Image' }, - { value: 'html', label: 'HTML', group: 'Web' }, - { value: 'xml', label: 'XML', group: 'Web' }, - { value: 'json', label: 'JSON', group: 'Data' }, -]; - -// Conversion matrix - what each source format can convert to -export const CONVERSION_MATRIX: Record<string, string[]> = { - 'any': ['pdf'], // Mixed files always convert to PDF - 'image': ['pdf'], // Multiple images always convert to PDF - 'pdf': ['png', 'jpg', 'gif', 'tiff', 'bmp', 'webp', 'docx', 'odt', 'pptx', 'odp', 'csv', 'txt', 'rtf', 'md', 'html', 'xml', 'pdfa', 'json'], - 'docx': ['pdf'], 'doc': ['pdf'], 'odt': ['pdf'], - 'xlsx': ['pdf'], 'xls': ['pdf'], 'ods': ['pdf'], - 'pptx': ['pdf'], 'ppt': ['pdf'], 'odp': ['pdf'], - 'jpg': ['pdf'], 'jpeg': ['pdf'], 'png': ['pdf'], 'gif': ['pdf'], 'bmp': ['pdf'], 'tiff': ['pdf'], 'webp': ['pdf'], 'svg': ['pdf'], - 'html': ['pdf'], - 'zip': ['pdf'], - 'md': ['pdf'], - 'txt': ['pdf'], 'rtf': ['pdf'], - 'eml': ['pdf'], - 'json': ['pdf'] -}; - -// Map extensions to endpoint keys -export const EXTENSION_TO_ENDPOINT: Record<string, Record<string, string>> = { - 'any': { 'pdf': 'file-to-pdf' }, // Mixed files use file-to-pdf endpoint - 'image': { 'pdf': 'img-to-pdf' }, // Multiple images use img-to-pdf endpoint - 'pdf': { - 'png': 'pdf-to-img', 'jpg': 'pdf-to-img', 'gif': 'pdf-to-img', 'tiff': 'pdf-to-img', 'bmp': 'pdf-to-img', 'webp': 'pdf-to-img', - 'docx': 'pdf-to-word', 'odt': 'pdf-to-word', - 'pptx': 'pdf-to-presentation', 'odp': 'pdf-to-presentation', - 'csv': 'pdf-to-csv', - 'txt': 'pdf-to-text', 'rtf': 'pdf-to-text', 'md': 'pdf-to-markdown', - 'html': 'pdf-to-html', 'xml': 'pdf-to-xml', - 'pdfa': 'pdf-to-pdfa', - 'json': 'pdf-to-json' - }, - 'docx': { 'pdf': 'file-to-pdf' }, 'doc': { 
'pdf': 'file-to-pdf' }, 'odt': { 'pdf': 'file-to-pdf' }, - 'xlsx': { 'pdf': 'file-to-pdf' }, 'xls': { 'pdf': 'file-to-pdf' }, 'ods': { 'pdf': 'file-to-pdf' }, - 'pptx': { 'pdf': 'file-to-pdf' }, 'ppt': { 'pdf': 'file-to-pdf' }, 'odp': { 'pdf': 'file-to-pdf' }, - 'jpg': { 'pdf': 'img-to-pdf' }, 'jpeg': { 'pdf': 'img-to-pdf' }, 'png': { 'pdf': 'img-to-pdf' }, - 'gif': { 'pdf': 'img-to-pdf' }, 'bmp': { 'pdf': 'img-to-pdf' }, 'tiff': { 'pdf': 'img-to-pdf' }, 'webp': { 'pdf': 'img-to-pdf' }, 'svg': { 'pdf': 'img-to-pdf' }, - 'html': { 'pdf': 'html-to-pdf' }, - 'zip': { 'pdf': 'html-to-pdf' }, - 'md': { 'pdf': 'markdown-to-pdf' }, - 'txt': { 'pdf': 'file-to-pdf' }, 'rtf': { 'pdf': 'file-to-pdf' }, - 'eml': { 'pdf': 'eml-to-pdf' }, - 'json': { 'pdf': 'json-to-pdf' } -}; - -export type ColorType = typeof COLOR_TYPES[keyof typeof COLOR_TYPES]; -export type OutputOption = typeof OUTPUT_OPTIONS[keyof typeof OUTPUT_OPTIONS]; -export type FitOption = typeof FIT_OPTIONS[keyof typeof FIT_OPTIONS]; diff --git a/frontend/src/constants/convertSupportedFornats.ts b/frontend/src/constants/convertSupportedFornats.ts deleted file mode 100644 index 5934cce3a..000000000 --- a/frontend/src/constants/convertSupportedFornats.ts +++ /dev/null @@ -1,21 +0,0 @@ -// Central list of formats supported by Convert operations -export const CONVERT_SUPPORTED_FORMATS = [ - // Microsoft Office - 'doc', 'docx', 'dot', 'dotx', 'csv', 'xls', 'xlsx', 'xlt', 'xltx', 'slk', 'dif', 'ppt', 'pptx', - // OpenDocument - 'odt', 'ott', 'ods', 'ots', 'odp', 'otp', 'odg', 'otg', - // Text formats - 'txt', 'text', 'xml', 'rtf', 'html', 'lwp', 'md', 'json', - // Images - 'bmp', 'gif', 'jpeg', 'jpg', 'png', 'tif', 'tiff', 'pbm', 'pgm', 'ppm', 'ras', 'xbm', 'xpm', 'svg', 'svm', 'wmf', 'webp', - // StarOffice - 'sda', 'sdc', 'sdd', 'sdw', 'stc', 'std', 'sti', 'stw', 'sxd', 'sxg', 'sxi', 'sxw', - // Email formats - 'eml', - // Archive formats - 'zip', - // Other - 'dbf', 'fods', 'vsd', 'vor', 'vor3', 'vor4', 'uop', 'pct', 
'ps', 'pdf', -]; - - diff --git a/frontend/src/core/constants/convertSupportedFornats.ts b/frontend/src/core/constants/convertSupportedFornats.ts index 86138c4e6..dc46b17e2 100644 --- a/frontend/src/core/constants/convertSupportedFornats.ts +++ b/frontend/src/core/constants/convertSupportedFornats.ts @@ -5,7 +5,7 @@ export const CONVERT_SUPPORTED_FORMATS = [ // OpenDocument 'odt', 'ott', 'ods', 'ots', 'odp', 'otp', 'odg', 'otg', // Text formats - 'txt', 'text', 'xml', 'rtf', 'html', 'lwp', 'md', + 'txt', 'text', 'xml', 'rtf', 'html', 'lwp', 'md', 'json', // Images 'bmp', 'gif', 'jpeg', 'jpg', 'png', 'tif', 'tiff', 'pbm', 'pgm', 'ppm', 'ras', 'xbm', 'xpm', 'svg', 'svm', 'wmf', 'webp', // StarOffice