clean fonts

This commit is contained in:
Anthony Stirling 2025-10-23 23:36:53 +01:00
parent af19a5af23
commit c7c5613c13
15 changed files with 1793 additions and 269 deletions

View File

@ -0,0 +1,61 @@
package stirling.software.SPDF.model.json;
import java.util.List;
import com.fasterxml.jackson.annotation.JsonInclude;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.NoArgsConstructor;
/**
 * Represents a PDF annotation (comments, highlights, stamps, etc.). Annotations often contain OCR
 * text layers or other metadata not visible in content streams.
 *
 * <p>This is a plain data holder: Lombok generates the accessors, the builder and both the no-arg
 * and all-args constructors. Because of {@code JsonInclude.Include.NON_NULL}, any field left
 * {@code null} is omitted from the serialized JSON. Note that the declaration order of the fields
 * defines the parameter order of the generated all-args constructor.
 */
@Data
@Builder
@NoArgsConstructor
@AllArgsConstructor
@JsonInclude(JsonInclude.Include.NON_NULL)
public class PdfJsonAnnotation {
/** Annotation subtype (Text, Highlight, Link, Stamp, Widget, etc.). */
private String subtype;
/** Human-readable text content of the annotation. */
private String contents;
/** Annotation rectangle as four coordinates in the order [x1, y1, x2, y2]. */
private List<Float> rect;
/**
 * Annotation appearance characteristics, stored as a string. NOTE(review): presumably the name of
 * the currently selected appearance state - confirm against the code that populates this model.
 */
private String appearanceState;
/** Color components (e.g., [r, g, b] for RGB). */
private List<Float> color;
/** Annotation flags (print, hidden, etc.) packed into a single integer. */
private Integer flags;
/** For link annotations: destination or action, in string form. */
private String destination;
/** For text annotations: icon name. */
private String iconName;
/** Subject/title of the annotation. */
private String subject;
/** Author of the annotation. */
private String author;
/** Creation date (ISO 8601 format), kept as a string rather than a date type. */
private String creationDate;
/** Modification date (ISO 8601 format), kept as a string rather than a date type. */
private String modificationDate;
/** Full annotation dictionary for lossless round-tripping. */
private PdfJsonCosValue rawData;
}

View File

@ -25,4 +25,7 @@ public class PdfJsonDocument {
@Builder.Default private List<PdfJsonFont> fonts = new ArrayList<>();
@Builder.Default private List<PdfJsonPage> pages = new ArrayList<>();
/** Form fields (AcroForm) at document level */
@Builder.Default private List<PdfJsonFormField> formFields = new ArrayList<>();
}

View File

@ -52,4 +52,22 @@ public class PdfJsonFont {
/** Font descriptor flags copied from the source document. */
private Integer fontDescriptorFlags;
/** Font ascent in glyph units (typically 1/1000). */
private Float ascent;
/** Font descent in glyph units (typically negative). */
private Float descent;
/** Capital height when available. */
private Float capHeight;
/** x-height when available. */
private Float xHeight;
/** Italic angle reported by the font descriptor. */
private Float italicAngle;
/** Units per em extracted from the font matrix. */
private Integer unitsPerEm;
}

View File

@ -0,0 +1,66 @@
package stirling.software.SPDF.model.json;
import java.util.List;
import com.fasterxml.jackson.annotation.JsonInclude;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.NoArgsConstructor;
/**
 * Represents a PDF form field (AcroForm).
 *
 * <p>This is a plain data holder: Lombok generates the accessors, the builder and both the no-arg
 * and all-args constructors. Because of {@code JsonInclude.Include.NON_NULL}, any field left
 * {@code null} is omitted from the serialized JSON. Note that the declaration order of the fields
 * defines the parameter order of the generated all-args constructor.
 */
@Data
@Builder
@NoArgsConstructor
@AllArgsConstructor
@JsonInclude(JsonInclude.Include.NON_NULL)
public class PdfJsonFormField {
/** Fully qualified field name (e.g., "form1.textfield1"). */
private String name;
/** Partial field name (last component of the fully qualified name). */
private String partialName;
/** Field type (Tx=text, Btn=button, Ch=choice, Sig=signature). */
private String fieldType;
/** Field value as string. */
private String value;
/** Default value, as string. */
private String defaultValue;
/** Field flags (readonly, required, multiline, etc.) packed into a single integer. */
private Integer flags;
/** Alternative field name (for accessibility). */
private String alternateFieldName;
/** Mapping name (for export). */
private String mappingName;
/** Page number where field appears (1-indexed). */
private Integer pageNumber;
/** Field rectangle on the page as four coordinates in the order [x1, y1, x2, y2]. */
private List<Float> rect;
/** For choice fields: list of options. */
private List<String> options;
/** For choice fields: selected indices. */
private List<Integer> selectedIndices;
/** For button fields: whether it's checked. */
private Boolean checked;
/**
 * Font information for text fields: the font name. NOTE(review): whether this is a resource alias
 * or a base font name is not visible from this class - confirm against the populating code.
 */
private String fontName;
/**
 * Font information for text fields: the font size. NOTE(review): unit is presumably points -
 * confirm against the populating code.
 */
private Float fontSize;
/** Full field dictionary for lossless round-tripping. */
private PdfJsonCosValue rawData;
}

View File

@ -24,6 +24,7 @@ public class PdfJsonPage {
@Builder.Default private List<PdfJsonTextElement> textElements = new ArrayList<>();
@Builder.Default private List<PdfJsonImageElement> imageElements = new ArrayList<>();
@Builder.Default private List<PdfJsonAnnotation> annotations = new ArrayList<>();
/** Serialized representation of the page resources dictionary. */
private PdfJsonCosValue resources;

View File

@ -168,6 +168,16 @@ system:
startupCleanup: true # Clean up old temp files on startup
cleanupSystemTemp: false # Whether to clean broader system temp directory
stirling:
pdf:
fallback-font: classpath:/static/fonts/NotoSans-Regular.ttf # Override to point at a custom fallback font
json:
font-normalization:
enabled: true # Run Ghostscript preflight to normalize fonts before PDF→JSON
cff-converter:
enabled: true # Attempt to transcode CFF/Type1C programs to OTF using FontForge when available
fontforge-command: fontforge # Override if FontForge is installed under a different name/path
ui:
appName: '' # application's visible name
homeDescription: '' # short description or tagline shown on the homepage

View File

@ -83,6 +83,8 @@ RUN echo "@main https://dl-cdn.alpinelinux.org/alpine/edge/main" | tee -a /etc/a
gcompat \
libc6-compat \
libreoffice \
ghostscript \
fontforge \
# pdftohtml
poppler-utils \
# OCR MY PDF (unpaper for deskew and other advanced features)
@ -119,4 +121,4 @@ EXPOSE 8080/tcp
# Set user and run command
ENTRYPOINT ["tini", "--", "/scripts/init.sh"]
CMD ["sh", "-c", "java -Dfile.encoding=UTF-8 -Djava.io.tmpdir=/tmp/stirling-pdf -jar /app.jar & /opt/venv/bin/unoserver --port 2003 --interface 127.0.0.1"]
CMD ["sh", "-c", "java -Dfile.encoding=UTF-8 -Djava.io.tmpdir=/tmp/stirling-pdf -jar /app.jar & /opt/venv/bin/unoserver --port 2003 --interface 127.0.0.1"]

View File

@ -73,6 +73,8 @@ RUN echo "@main https://dl-cdn.alpinelinux.org/alpine/edge/main" | tee -a /etc/a
gcompat \
libc6-compat \
libreoffice \
ghostscript \
fontforge \
# pdftohtml
poppler-utils \
# OCR MY PDF (unpaper for deskew and other advanced features)
@ -109,4 +111,4 @@ RUN echo "@main https://dl-cdn.alpinelinux.org/alpine/edge/main" | tee -a /etc/a
EXPOSE 8080/tcp
# Set user and run command
ENTRYPOINT ["tini", "--", "/scripts/init.sh"]
CMD ["sh", "-c", "java -Dfile.encoding=UTF-8 -Djava.io.tmpdir=/tmp/stirling-pdf -jar /app.jar & /opt/venv/bin/unoserver --port 2003 --interface 127.0.0.1"]
CMD ["sh", "-c", "java -Dfile.encoding=UTF-8 -Djava.io.tmpdir=/tmp/stirling-pdf -jar /app.jar & /opt/venv/bin/unoserver --port 2003 --interface 127.0.0.1"]

View File

@ -59,7 +59,9 @@ RUN echo "@testing https://dl-cdn.alpinelinux.org/alpine/edge/main" | tee -a /et
curl \
shadow \
su-exec \
openjdk21-jre && \
openjdk21-jre \
ghostscript \
fontforge && \
# User permissions
mkdir -p /configs /logs /customFiles /usr/share/fonts/opentype/noto /tmp/stirling-pdf /pipeline/watchedFolders /pipeline/finishedFolders && \
chmod +x /scripts/*.sh && \

View File

@ -24,7 +24,7 @@ http {
index index.html index.htm;
# Global settings for file uploads
client_max_body_size 100m;
client_max_body_size 0;
# Handle client-side routing - support subpaths
location / {
@ -48,12 +48,12 @@ http {
proxy_cache off;
# Timeout settings for large file uploads
proxy_connect_timeout 60s;
proxy_send_timeout 60s;
proxy_read_timeout 60s;
proxy_connect_timeout 600s;
proxy_send_timeout 600s;
proxy_read_timeout 600s;
# Request size limits for file uploads
client_max_body_size 100m;
client_max_body_size 0;
proxy_request_buffering off;
}

View File

@ -4031,6 +4031,7 @@
"fontSizeValue": "{{size}}pt",
"noTextOnPage": "No editable text was detected on this page.",
"emptyGroup": "[Empty Group]",
"imageLabel": "Placed image",
"empty": {
"title": "No document loaded",
"subtitle": "Load a PDF or JSON file to begin editing text content."

View File

@ -1,10 +1,12 @@
import React, { useCallback, useEffect, useLayoutEffect, useMemo, useRef, useState } from 'react';
import {
ActionIcon,
Alert,
Badge,
Box,
Button,
Card,
Collapse,
Divider,
FileButton,
Group,
@ -21,10 +23,13 @@ import PictureAsPdfIcon from '@mui/icons-material/PictureAsPdfOutlined';
import AutorenewIcon from '@mui/icons-material/Autorenew';
import WarningAmberIcon from '@mui/icons-material/WarningAmber';
import UploadIcon from '@mui/icons-material/Upload';
import ExpandMoreIcon from '@mui/icons-material/ExpandMore';
import ExpandLessIcon from '@mui/icons-material/ExpandLess';
import { Rnd } from 'react-rnd';
import {
PdfJsonEditorViewData,
PdfJsonFont,
PdfJsonPage,
} from '../../../tools/pdfJsonEditorTypes';
import { getImageBounds, pageDimensions } from '../../../tools/pdfJsonEditorUtils';
@ -32,6 +37,68 @@ import { getImageBounds, pageDimensions } from '../../../tools/pdfJsonEditorUtil
const MAX_RENDER_WIDTH = 820;
const MIN_BOX_SIZE = 18;
/**
 * Collapses a free-form font program format label into one of the canonical
 * identifiers used by the font loader: 'woff2', 'woff', 'otf' or 'ttf'.
 *
 * Matching is case-insensitive and substring based. A label mentioning 'cff'
 * is treated as 'otf'. A missing/empty label, or one that matches nothing,
 * yields 'ttf'.
 */
const normalizeFontFormat = (format?: string | null): string => {
  const label = format ? format.toLowerCase() : '';
  // Order matters: 'woff2' has to be probed before its own prefix 'woff'.
  const rules: Array<[string, string]> = [
    ['woff2', 'woff2'],
    ['woff', 'woff'],
    ['otf', 'otf'],
    ['cff', 'otf'],
  ];
  const hit = rules.find(([token]) => label.includes(token));
  return hit ? hit[1] : 'ttf';
};
/**
 * Returns the MIME type to attach to a font Blob for a canonical format id.
 * Anything other than 'woff2', 'woff' or 'otf' is served as 'font/ttf'.
 */
const getFontMimeType = (format: string): string => {
  const mimeByFormat = new Map<string, string>([
    ['woff2', 'font/woff2'],
    ['woff', 'font/woff'],
    ['otf', 'font/otf'],
  ]);
  return mimeByFormat.get(format) ?? 'font/ttf';
};
/**
 * Returns the keyword to place inside a CSS `format(...)` hint for a canonical
 * format id, or null when the id is not one of 'woff2', 'woff', 'otf', 'ttf'
 * (in which case the caller should emit no hint at all).
 */
const getFontFormatHint = (format: string): string | null => {
  const hintByFormat = new Map<string, string>([
    ['woff2', 'woff2'],
    ['woff', 'woff'],
    ['otf', 'opentype'],
    ['ttf', 'truetype'],
  ]);
  return hintByFormat.get(format) ?? null;
};
/**
 * Decodes a base64 string into its raw bytes.
 *
 * `window.atob` produces a "binary string" in which every character carries
 * one byte value, so each character code is copied straight into the output.
 */
const decodeBase64ToUint8Array = (value: string): Uint8Array => {
  const decoded = window.atob(value);
  const total = decoded.length;
  const out = new Uint8Array(total);
  let cursor = 0;
  while (cursor < total) {
    out[cursor] = decoded.charCodeAt(cursor);
    cursor += 1;
  }
  return out;
};
/**
 * Derives the CSS font-family name under which an embedded PDF font is
 * registered. The font's `uid` is preferred, then its `id`, then the literal
 * 'font'; every character outside [a-zA-Z0-9_-] is stripped and the result is
 * prefixed with 'pdf-font-'.
 */
const buildFontFamilyName = (font: PdfJsonFont): string => {
  const rawKey = (font.uid ?? font.id ?? 'font').toString();
  const safeKey = rawKey.replace(/[^a-zA-Z0-9_-]/g, '');
  return 'pdf-font-' + safeKey;
};
const getCaretOffset = (element: HTMLElement): number => {
const selection = window.getSelection();
if (!selection || selection.rangeCount === 0 || !element.contains(selection.focusNode)) {
@ -85,11 +152,13 @@ const toCssBounds = (
bounds: { left: number; right: number; top: number; bottom: number },
) => {
const width = Math.max(bounds.right - bounds.left, 1);
// Note: This codebase uses inverted naming where bounds.bottom > bounds.top
// bounds.bottom = visually upper edge (larger Y in PDF coords)
// bounds.top = visually lower edge (smaller Y in PDF coords)
const height = Math.max(bounds.bottom - bounds.top, 1);
// Add 20% buffer to width to account for padding and font rendering variations
const bufferedWidth = width * 1.2;
const scaledWidth = Math.max(bufferedWidth * scale, MIN_BOX_SIZE);
const scaledWidth = Math.max(width * scale, MIN_BOX_SIZE);
const scaledHeight = Math.max(height * scale, MIN_BOX_SIZE / 2);
// Convert PDF's visually upper edge (bounds.bottom) to CSS top
const top = Math.max(pageHeight - bounds.bottom, 0) * scale;
return {
@ -105,6 +174,8 @@ const PdfJsonEditorView = ({ data }: PdfJsonEditorViewProps) => {
const [activeGroupId, setActiveGroupId] = useState<string | null>(null);
const [editingGroupId, setEditingGroupId] = useState<string | null>(null);
const [activeImageId, setActiveImageId] = useState<string | null>(null);
const [fontFamilies, setFontFamilies] = useState<Map<string, string>>(new Map());
const [textGroupsExpanded, setTextGroupsExpanded] = useState(false);
const containerRef = useRef<HTMLDivElement | null>(null);
const editorRefs = useRef<Map<string, HTMLDivElement>>(new Map());
const caretOffsetsRef = useRef<Map<string, number>>(new Map());
@ -135,6 +206,10 @@ const PdfJsonEditorView = ({ data }: PdfJsonEditorViewProps) => {
if (!fontId || !pdfDocument?.fonts) {
return 'sans-serif';
}
const loadedFamily = fontFamilies.get(fontId);
if (loadedFamily) {
return `'${loadedFamily}', sans-serif`;
}
const font = pdfDocument.fonts.find((f) => f.id === fontId);
if (!font) {
return 'sans-serif';
@ -161,10 +236,134 @@ const PdfJsonEditorView = ({ data }: PdfJsonEditorViewProps) => {
return 'Arial, Helvetica, sans-serif';
};
// Computes the CSS line height (in px) for text set in `fontId` at `fontSizePx`.
// - A non-positive font size is returned unchanged.
// - Without usable metrics (unknown font, non-positive unitsPerEm, or an
//   ascent/descent span that is not positive) the result is 1.2 x font size.
// - Otherwise the ascent-to-descent span is scaled from glyph units to px,
//   and never allowed to drop below 1.05 x font size.
const getLineHeightPx = (fontId: string | null | undefined, fontSizePx: number): number => {
  if (fontSizePx <= 0) {
    return fontSizePx;
  }
  const fallbackHeight = fontSizePx * 1.2;
  const entry = fontId ? fontMetrics.get(fontId) : undefined;
  if (!entry || entry.unitsPerEm <= 0) {
    return fallbackHeight;
  }
  const glyphSpan = entry.ascent - entry.descent;
  if (glyphSpan <= 0) {
    return fallbackHeight;
  }
  const scaledHeight = (glyphSpan / entry.unitsPerEm) * fontSizePx;
  const minimumHeight = fontSizePx * 1.05;
  return Math.max(scaledHeight, minimumHeight);
};
// Resolves the CSS font-weight to use for text set in the font `fontId`.
//
// Returns 'bold' when either
//   1. the font descriptor has the ForceBold flag set, or
//   2. the font's standard-14 name / base name contains "bold" (case-insensitive);
// otherwise 'normal'. An unknown font id, or a document without fonts, also
// yields 'normal'.
//
// The name check must run even when the descriptor flags are missing or zero:
// fonts such as the standard 14 ("Helvetica-Bold") commonly carry no font
// descriptor at all, and previously fell through to 'normal' because the
// missing flags short-circuited the whole function.
const getFontWeight = (fontId: string | null | undefined): number | 'normal' | 'bold' => {
  if (!fontId || !pdfDocument?.fonts) {
    return 'normal';
  }
  const font = pdfDocument.fonts.find((f) => f.id === fontId);
  if (!font) {
    return 'normal';
  }

  // ForceBold is value 262144 (0x40000, i.e. 1 << 18) in the PDF font descriptor /Flags.
  const FORCE_BOLD_FLAG = 262144;
  const descriptorFlags = font.fontDescriptorFlags ?? 0;
  if ((descriptorFlags & FORCE_BOLD_FLAG) !== 0) {
    return 'bold';
  }

  // Fall back to the font name, which is the only signal for descriptor-less fonts.
  const fontName = font.standard14Name || font.baseName || '';
  if (fontName.toLowerCase().includes('bold')) {
    return 'bold';
  }

  return 'normal';
};
const pages = pdfDocument?.pages ?? [];
const currentPage = pages[selectedPage] ?? null;
const pageGroups = groupsByPage[selectedPage] ?? [];
const pageImages = imagesByPage[selectedPage] ?? [];
// Per-font vertical metrics keyed by font id, rebuilt whenever the document's
// font list changes. Fonts without an id are skipped. Missing values default to:
//   unitsPerEm -> 1000 (also used when the reported value is not positive)
//   ascent     -> unitsPerEm
//   descent    -> -20% of unitsPerEm
const fontMetrics = useMemo(() => {
  const byFontId = new Map<string, { unitsPerEm: number; ascent: number; descent: number }>();
  for (const font of pdfDocument?.fonts ?? []) {
    if (!font?.id) {
      continue;
    }
    const reportedUnits = font.unitsPerEm ?? 0;
    const unitsPerEm = reportedUnits > 0 ? reportedUnits : 1000;
    byFontId.set(font.id, {
      unitsPerEm,
      ascent: font.ascent ?? unitsPerEm,
      descent: font.descent ?? -(unitsPerEm * 0.2),
    });
  }
  return byFontId;
}, [pdfDocument?.fonts]);
// Registers every embedded font program of the current document with the
// browser (via the CSS Font Loading API) and publishes a fontId -> CSS family
// name map through `setFontFamilies`.
//
// Lifecycle:
// - If the FontFace API is unavailable, the map is cleared and nothing else happens.
// - Each font program is base64-decoded, wrapped in a Blob URL and loaded.
//   Fonts that fail to load are skipped (embedded PDF fonts often lack the
//   tables browsers require); getFontFamily() already falls back to web-safe
//   fonts for them.
// - On cleanup (unmount or font list change) all registered faces are removed
//   from `document.fonts` and their Blob URLs revoked. A load still in flight
//   notices `disposed`, releases its own URL and stops processing further fonts.
useEffect(() => {
  if (typeof FontFace === 'undefined') {
    setFontFamilies(new Map());
    return undefined;
  }

  let disposed = false;
  const active: { fontFace: FontFace; url?: string }[] = [];

  // Unregisters every face added so far and frees its Blob URL.
  const releaseActive = () => {
    active.forEach(({ fontFace, url }) => {
      document.fonts.delete(fontFace);
      if (url) {
        URL.revokeObjectURL(url);
      }
    });
    active.length = 0;
  };

  const registerFonts = async () => {
    const fonts = pdfDocument?.fonts ?? [];
    if (fonts.length === 0) {
      setFontFamilies(new Map());
      return;
    }

    const next = new Map<string, string>();
    for (const font of fonts) {
      if (disposed) {
        // The effect was torn down; loading the remaining fonts would be wasted work.
        break;
      }
      if (!font?.id || !font.program) {
        continue;
      }

      // Declared outside the try so the catch block can release it on failure.
      let url: string | undefined;
      try {
        const format = normalizeFontFormat(font.programFormat);
        const data = decodeBase64ToUint8Array(font.program);
        const blob = new Blob([data as BlobPart], { type: getFontMimeType(format) });
        url = URL.createObjectURL(blob);
        const formatHint = getFontFormatHint(format);
        const familyName = buildFontFamilyName(font);
        const source = formatHint ? `url(${url}) format('${formatHint}')` : `url(${url})`;
        const fontFace = new FontFace(familyName, source);
        await fontFace.load();

        if (disposed) {
          // Torn down while loading: the face was never added, only the URL needs freeing.
          URL.revokeObjectURL(url);
          break;
        }

        document.fonts.add(fontFace);
        active.push({ fontFace, url });
        next.set(font.id, familyName);
      } catch {
        // Font loading failures are expected and intentionally non-fatal, but the
        // Blob URL created for the failed attempt must not be leaked.
        if (url) {
          URL.revokeObjectURL(url);
        }
      }
    }

    if (disposed) {
      releaseActive();
      return;
    }
    setFontFamilies(next);
  };

  void registerFonts();

  return () => {
    disposed = true;
    releaseActive();
  };
}, [pdfDocument?.fonts]);
const visibleGroups = useMemo(
() =>
pageGroups.filter((group) => {
@ -419,25 +618,33 @@ const PdfJsonEditorView = ({ data }: PdfJsonEditorViewProps) => {
<ScrollArea h="100%" offsetScrollbars>
<Box
style={{
margin: '0 auto',
background: '#f3f4f6',
padding: '1.5rem',
borderRadius: '0.75rem',
display: 'flex',
justifyContent: 'center',
alignItems: 'flex-start',
width: '100%',
minHeight: '100%',
}}
onClick={handleBackgroundClick}
>
<Box
style={{
position: 'relative',
width: `${scaledWidth}px`,
height: `${scaledHeight}px`,
backgroundColor: '#ffffff',
boxShadow: '0 0 12px rgba(15, 23, 42, 0.12)',
borderRadius: '0.5rem',
overflow: 'hidden',
background: '#f3f4f6',
padding: '0.5rem',
borderRadius: '0.75rem',
}}
ref={containerRef}
onClick={handleBackgroundClick}
>
<Box
style={{
position: 'relative',
width: `${scaledWidth}px`,
height: `${scaledHeight}px`,
backgroundColor: '#ffffff',
boxShadow: '0 0 12px rgba(15, 23, 42, 0.12)',
borderRadius: '0.5rem',
overflow: 'hidden',
}}
ref={containerRef}
>
{orderedImages.map((image, imageIndex) => {
if (!image?.imageData) {
return null;
@ -466,7 +673,7 @@ const PdfJsonEditorView = ({ data }: PdfJsonEditorViewProps) => {
setEditingGroupId(null);
setActiveImageId(imageId);
}}
onDrag={(event, data) => {
onDrag={(_event, data) => {
emitImageTransform(
imageId,
data.x,
@ -475,7 +682,7 @@ const PdfJsonEditorView = ({ data }: PdfJsonEditorViewProps) => {
cssHeight,
);
}}
onDragStop={(event, data) => {
onDragStop={(_event, data) => {
emitImageTransform(
imageId,
data.x,
@ -489,7 +696,7 @@ const PdfJsonEditorView = ({ data }: PdfJsonEditorViewProps) => {
setActiveGroupId(null);
setEditingGroupId(null);
}}
onResize={(event, _direction, ref, _delta, position) => {
onResize={(_event, _direction, ref, _delta, position) => {
const nextWidth = parseFloat(ref.style.width);
const nextHeight = parseFloat(ref.style.height);
emitImageTransform(
@ -500,7 +707,7 @@ const PdfJsonEditorView = ({ data }: PdfJsonEditorViewProps) => {
nextHeight,
);
}}
onResizeStop={(event, _direction, ref, _delta, position) => {
onResizeStop={(_event, _direction, ref, _delta, position) => {
const nextWidth = parseFloat(ref.style.width);
const nextHeight = parseFloat(ref.style.height);
emitImageTransform(
@ -567,21 +774,48 @@ const PdfJsonEditorView = ({ data }: PdfJsonEditorViewProps) => {
const baseFontSize = group.fontMatrixSize ?? group.fontSize ?? 12;
const fontSizePx = Math.max(baseFontSize * scale, 6);
const fontFamily = getFontFamily(group.fontId);
const lineHeightPx = getLineHeightPx(group.fontId, fontSizePx);
const lineHeightRatio = fontSizePx > 0 ? Math.max(lineHeightPx / fontSizePx, 1.05) : 1.2;
const hasRotation = group.rotation != null && Math.abs(group.rotation) > 0.5;
const baselineLength = group.baselineLength ?? Math.max(group.bounds.right - group.bounds.left, 0);
const visualHeight = Math.max(bounds.height, fontSizePx * 1.2);
let containerLeft = bounds.left;
let containerTop = bounds.top;
let containerWidth = Math.max(bounds.width, fontSizePx);
let containerHeight = Math.max(bounds.height, lineHeightPx);
let transform: string | undefined;
let transformOrigin: React.CSSProperties['transformOrigin'];
if (hasRotation) {
const anchorX = group.anchor?.x ?? group.bounds.left;
const anchorY = group.anchor?.y ?? group.bounds.bottom;
containerLeft = anchorX * scale;
containerTop = Math.max(pageHeight - anchorY, 0) * scale;
containerWidth = Math.max(baselineLength * scale, MIN_BOX_SIZE);
containerHeight = Math.max(lineHeightPx, fontSizePx * lineHeightRatio);
transformOrigin = 'left bottom';
// Negate rotation because Y-axis is flipped from PDF to web coordinates
transform = `rotate(${-group.rotation}deg)`;
}
// Extract styling from group
const textColor = group.color || '#111827';
const fontWeight = group.fontWeight || getFontWeight(group.fontId);
const containerStyle: React.CSSProperties = {
position: 'absolute',
left: `${bounds.left}px`,
top: `${bounds.top}px`,
width: `${bounds.width}px`,
height: `${visualHeight}px`,
left: `${containerLeft}px`,
top: `${containerTop}px`,
width: `${containerWidth}px`,
height: `${containerHeight}px`,
display: 'flex',
alignItems: 'flex-start',
justifyContent: 'flex-start',
pointerEvents: 'auto',
cursor: 'text',
zIndex: 2_000_000,
transform,
transformOrigin,
};
if (isEditing) {
@ -628,17 +862,17 @@ const PdfJsonEditorView = ({ data }: PdfJsonEditorViewProps) => {
style={{
width: '100%',
height: '100%',
padding: '3px 4px',
padding: 0,
backgroundColor: 'rgba(255,255,255,0.95)',
color: '#111827',
color: textColor,
fontSize: `${fontSizePx}px`,
fontFamily,
lineHeight: 1.25,
fontWeight,
lineHeight: lineHeightRatio,
outline: 'none',
border: 'none',
display: 'block',
whiteSpace: 'pre-wrap',
overflowWrap: 'anywhere',
whiteSpace: 'nowrap',
cursor: 'text',
overflow: 'visible',
}}
@ -660,12 +894,13 @@ const PdfJsonEditorView = ({ data }: PdfJsonEditorViewProps) => {
style={{
width: '100%',
minHeight: '100%',
padding: '2px 4px',
whiteSpace: 'pre-wrap',
padding: 0,
whiteSpace: 'nowrap',
fontSize: `${fontSizePx}px`,
fontFamily,
lineHeight: 1.25,
color: '#111827',
fontWeight,
lineHeight: lineHeightRatio,
color: textColor,
display: 'block',
cursor: 'text',
overflow: 'visible',
@ -682,6 +917,7 @@ const PdfJsonEditorView = ({ data }: PdfJsonEditorViewProps) => {
);
})
)}
</Box>
</Box>
</Box>
</ScrollArea>
@ -689,48 +925,61 @@ const PdfJsonEditorView = ({ data }: PdfJsonEditorViewProps) => {
<Card padding="md" withBorder radius="md">
<Stack gap="xs">
<Text fw={500}>{t('pdfJsonEditor.groupList', 'Detected Text Groups')}</Text>
<Divider />
<ScrollArea h={180} offsetScrollbars>
<Stack gap="sm">
{visibleGroups.map((group) => {
const changed = group.text !== group.originalText;
return (
<Card
key={`list-${group.id}`}
padding="sm"
radius="md"
withBorder
shadow={changed ? 'sm' : 'none'}
onMouseEnter={() => setActiveGroupId(group.id)}
onMouseLeave={() => setActiveGroupId((current) => (current === group.id ? null : current))}
style={{ cursor: 'pointer' }}
onClick={() => {
setActiveGroupId(group.id);
setEditingGroupId(group.id);
}}
>
<Stack gap={4}>
<Group gap="xs">
{changed && <Badge color="yellow" size="xs">{t('pdfJsonEditor.badges.modified', 'Edited')}</Badge>}
{group.fontId && (
<Badge size="xs" variant="outline">{group.fontId}</Badge>
)}
{group.fontSize && (
<Badge size="xs" variant="light">
{t('pdfJsonEditor.fontSizeValue', '{{size}}pt', { size: group.fontSize.toFixed(1) })}
</Badge>
)}
</Group>
<Text size="sm" c="dimmed" lineClamp={2}>
{group.text || t('pdfJsonEditor.emptyGroup', '[Empty Group]')}
</Text>
</Stack>
</Card>
);
})}
<Group justify="space-between" align="center">
<Text fw={500}>{t('pdfJsonEditor.groupList', 'Detected Text Groups')}</Text>
<ActionIcon
variant="subtle"
onClick={() => setTextGroupsExpanded(!textGroupsExpanded)}
aria-label={textGroupsExpanded ? 'Collapse' : 'Expand'}
>
{textGroupsExpanded ? <ExpandLessIcon /> : <ExpandMoreIcon />}
</ActionIcon>
</Group>
<Collapse in={textGroupsExpanded}>
<Stack gap="xs">
<Divider />
<ScrollArea h={180} offsetScrollbars>
<Stack gap="sm">
{visibleGroups.map((group) => {
const changed = group.text !== group.originalText;
return (
<Card
key={`list-${group.id}`}
padding="sm"
radius="md"
withBorder
shadow={changed ? 'sm' : 'none'}
onMouseEnter={() => setActiveGroupId(group.id)}
onMouseLeave={() => setActiveGroupId((current) => (current === group.id ? null : current))}
style={{ cursor: 'pointer' }}
onClick={() => {
setActiveGroupId(group.id);
setEditingGroupId(group.id);
}}
>
<Stack gap={4}>
<Group gap="xs">
{changed && <Badge color="yellow" size="xs">{t('pdfJsonEditor.badges.modified', 'Edited')}</Badge>}
{group.fontId && (
<Badge size="xs" variant="outline">{group.fontId}</Badge>
)}
{group.fontSize && (
<Badge size="xs" variant="light">
{t('pdfJsonEditor.fontSizeValue', '{{size}}pt', { size: group.fontSize.toFixed(1) })}
</Badge>
)}
</Group>
<Text size="sm" c="dimmed" lineClamp={2}>
{group.text || t('pdfJsonEditor.emptyGroup', '[Empty Group]')}
</Text>
</Stack>
</Card>
);
})}
</Stack>
</ScrollArea>
</Stack>
</ScrollArea>
</Collapse>
</Stack>
</Card>
</Stack>

View File

@ -23,6 +23,12 @@ export interface PdfJsonFont {
toUnicode?: string | null;
standard14Name?: string | null;
fontDescriptorFlags?: number | null;
ascent?: number | null;
descent?: number | null;
capHeight?: number | null;
xHeight?: number | null;
italicAngle?: number | null;
unitsPerEm?: number | null;
}
export interface PdfJsonTextElement {
@ -117,6 +123,11 @@ export interface TextGroup {
fontId?: string | null;
fontSize?: number | null;
fontMatrixSize?: number | null;
color?: string | null;
fontWeight?: number | 'normal' | 'bold' | null;
rotation?: number | null;
anchor?: { x: number; y: number } | null;
baselineLength?: number | null;
elements: PdfJsonTextElement[];
originalElements: PdfJsonTextElement[];
text: string;

View File

@ -69,9 +69,15 @@ const getHeight = (element: PdfJsonTextElement): number => {
const getElementBounds = (element: PdfJsonTextElement): BoundingBox => {
const left = getX(element);
const width = getWidth(element);
const bottom = getBaseline(element);
const baseline = getBaseline(element);
const height = getHeight(element);
const top = bottom - height;
// In PDF coordinates, baseline is where text sits
// Typical typography: ~80% of height above baseline (ascenders), ~20% below (descenders)
// Using codebase's inverted naming: bottom (visual top) > top (visual bottom)
const ascent = height * 0.8;
const descent = height * 0.2;
const bottom = baseline + ascent; // Visual top of text
const top = baseline - descent; // Visual bottom (includes descenders)
return {
left,
right: left + width,
@ -181,6 +187,136 @@ const buildGroupText = (elements: PdfJsonTextElement[]): string => {
return result;
};
/**
 * Converts RGB components in the 0..1 range into a CSS `rgb(r, g, b)` string.
 * Each component is clamped to [0, 1] before scaling to 0..255. Fewer than
 * three components yields black.
 */
const rgbToCss = (components: number[]): string => {
  if (components.length < 3) {
    return 'rgb(0, 0, 0)';
  }
  const toByte = (unit: number): number => Math.round(Math.max(0, Math.min(1, unit)) * 255);
  const red = toByte(components[0]);
  const green = toByte(components[1]);
  const blue = toByte(components[2]);
  return `rgb(${red}, ${green}, ${blue})`;
};
/**
 * Converts CMYK components in the 0..1 range into a CSS `rgb(r, g, b)` string
 * using the simple formula channel = 255 * (1 - ink) * (1 - black).
 * Each component is clamped to [0, 1] first. Fewer than four components
 * yields black.
 */
const cmykToCss = (components: number[]): string => {
  if (components.length < 4) {
    return 'rgb(0, 0, 0)';
  }
  const clampUnit = (unit: number): number => Math.max(0, Math.min(1, unit));
  const cyan = clampUnit(components[0]);
  const magenta = clampUnit(components[1]);
  const yellow = clampUnit(components[2]);
  const black = clampUnit(components[3]);
  const red = Math.round(255 * (1 - cyan) * (1 - black));
  const green = Math.round(255 * (1 - magenta) * (1 - black));
  const blue = Math.round(255 * (1 - yellow) * (1 - black));
  return `rgb(${red}, ${green}, ${blue})`;
};
/**
 * Converts a single gray level in the 0..1 range into a CSS `rgb(v, v, v)`
 * string. The level is clamped to [0, 1]; an empty component list yields black.
 */
const grayToCss = (components: number[]): string => {
  if (components.length < 1) {
    return 'rgb(0, 0, 0)';
  }
  const level = Math.max(0, Math.min(1, components[0]));
  const shade = Math.round(level * 255);
  return `rgb(${shade}, ${shade}, ${shade})`;
};
/**
 * Derives a CSS color string from a text element's fill color, or null when
 * the element has no usable fill color.
 *
 * The color space name (case-insensitive, substring match) selects the
 * conversion: rgb/srgb, then cmyk, then gray/grey. When the name matches none
 * of these, the component count decides: three or more components are read as
 * RGB, exactly one as gray, anything else gives null.
 */
const extractColor = (element: PdfJsonTextElement): string | null => {
  const fill = element.fillColor;
  const components = fill?.components;
  if (!fill || !components || components.length === 0) {
    return null;
  }

  const spaceName = (fill.colorSpace ?? '').toLowerCase();
  const mentions = (...tokens: string[]): boolean =>
    tokens.some((token) => spaceName.includes(token));

  if (mentions('rgb', 'srgb')) {
    return rgbToCss(components);
  }
  if (mentions('cmyk')) {
    return cmykToCss(components);
  }
  if (mentions('gray', 'grey')) {
    return grayToCss(components);
  }

  // Unrecognised color space: guess from the number of components.
  if (components.length >= 3) {
    return rgbToCss(components);
  }
  return components.length === 1 ? grayToCss(components) : null;
};
const RAD_TO_DEG = 180 / Math.PI;
/**
 * Wraps an angle in degrees into the half-open interval (-180, 180].
 */
const normalizeAngle = (angle: number): number => {
  const wrapped = angle % 360;
  if (wrapped > 180) {
    return wrapped - 360;
  }
  return wrapped <= -180 ? wrapped + 360 : wrapped;
};
/**
 * Reads the rotation (in degrees, normalized via normalizeAngle) encoded in a
 * text element's 6-entry text matrix, using atan2 over its first two entries.
 *
 * Returns null when there is no 6-entry matrix, when both entries are
 * effectively zero (no direction can be derived), or when the angle is under
 * half a degree and the text is therefore treated as unrotated.
 */
const extractElementRotation = (element: PdfJsonTextElement): number | null => {
  const matrix = element.textMatrix;
  if (!matrix || matrix.length !== 6) {
    return null;
  }
  const [xAxisX, xAxisY] = matrix;
  const isDegenerate = Math.abs(xAxisX) < 1e-6 && Math.abs(xAxisY) < 1e-6;
  if (isDegenerate) {
    return null;
  }
  const degrees = Math.atan2(xAxisY, xAxisX) * RAD_TO_DEG;
  return Math.abs(degrees) < 0.5 ? null : normalizeAngle(degrees);
};
/**
 * Averages the rotations of a group's elements into one angle in degrees.
 *
 * Elements without a rotation are ignored. The remaining angles are averaged
 * as unit vectors (so that e.g. 179 and -179 average to 180 rather than 0).
 * Returns null when no element is rotated, when the vectors cancel out, or
 * when the averaged angle is under half a degree.
 */
const computeGroupRotation = (elements: PdfJsonTextElement[]): number | null => {
  let sumX = 0;
  let sumY = 0;
  let rotatedCount = 0;

  for (const element of elements) {
    const angle = extractElementRotation(element);
    if (angle === null) {
      continue;
    }
    const radians = (angle * Math.PI) / 180;
    sumX += Math.cos(radians);
    sumY += Math.sin(radians);
    rotatedCount += 1;
  }

  if (rotatedCount === 0) {
    return null;
  }
  if (Math.abs(sumX) < 1e-6 && Math.abs(sumY) < 1e-6) {
    return null;
  }

  const meanAngle = normalizeAngle(Math.atan2(sumY, sumX) * RAD_TO_DEG);
  return Math.abs(meanAngle) < 0.5 ? null : meanAngle;
};
/**
 * Returns the anchor point of a text element: entries 4 and 5 of its 6-entry
 * text matrix when present, otherwise the element's own x / y. Both
 * coordinates are passed through valueOr.
 */
const getAnchorPoint = (element: PdfJsonTextElement): { x: number; y: number } => {
  const matrix = element.textMatrix;
  const [rawX, rawY] =
    matrix && matrix.length === 6 ? [matrix[4], matrix[5]] : [element.x, element.y];
  return { x: valueOr(rawX), y: valueOr(rawY) };
};
/**
 * Sums the widths (as reported by getWidth) of all elements; an empty list
 * gives 0.
 */
const computeBaselineLength = (elements: PdfJsonTextElement[]): number => {
  let total = 0;
  for (const element of elements) {
    total += getWidth(element);
  }
  return total;
};
const createGroup = (
pageIndex: number,
idSuffix: number,
@ -189,13 +325,22 @@ const createGroup = (
const clones = elements.map(cloneTextElement);
const originalClones = clones.map(cloneTextElement);
const bounds = mergeBounds(elements.map(getElementBounds));
const firstElement = elements[0];
const rotation = computeGroupRotation(elements);
const anchor = rotation !== null ? getAnchorPoint(firstElement) : null;
const baselineLength = computeBaselineLength(elements);
return {
id: `${pageIndex}-${idSuffix}`,
pageIndex,
fontId: elements[0]?.fontId,
fontSize: elements[0]?.fontSize,
fontMatrixSize: elements[0]?.fontMatrixSize,
fontId: firstElement?.fontId,
fontSize: firstElement?.fontSize,
fontMatrixSize: firstElement?.fontMatrixSize,
color: firstElement ? extractColor(firstElement) : null,
fontWeight: null, // Will be determined from font descriptor
rotation,
anchor,
baselineLength,
elements: clones,
originalElements: originalClones,
text: buildGroupText(elements),
@ -253,7 +398,18 @@ export const groupPageTextElements = (page: PdfJsonPage | null | undefined, page
const splitThreshold = Math.max(SPACE_MIN_GAP, avgFontSize * GAP_FACTOR);
const sameFont = previous.fontId === element.fontId;
const shouldSplit = gap > splitThreshold * (sameFont ? 1.4 : 1.0);
let shouldSplit = gap > splitThreshold * (sameFont ? 1.4 : 1.0);
const previousRotation = extractElementRotation(previous);
const currentRotation = extractElementRotation(element);
if (
shouldSplit &&
previousRotation !== null &&
currentRotation !== null &&
Math.abs(normalizeAngle(previousRotation - currentRotation)) < 1
) {
shouldSplit = false;
}
if (shouldSplit) {
groups.push(createGroup(pageIndex, groupCounter, currentBucket));