feat(form-fill): add CSV and XLSX extraction for form fields, improve file ID handling (#5776)

This commit is contained in:
Balázs Szücs
2026-02-23 21:17:58 +01:00
committed by GitHub
parent 340224b40b
commit 549f796e47
9 changed files with 321 additions and 64 deletions

View File

@@ -258,7 +258,9 @@ public class JobExecutorService {
// GlobalExceptionHandler (either directly or wrapped)
Throwable cause = e.getCause();
if (e instanceof IllegalArgumentException
|| cause instanceof stirling.software.common.util.ExceptionUtils.BaseAppException
|| cause
instanceof
stirling.software.common.util.ExceptionUtils.BaseAppException
|| cause
instanceof
stirling.software.common.util.ExceptionUtils

View File

@@ -132,7 +132,7 @@ public class FormUtils {
continue;
}
String currentValue = safeValue(terminalField);
String currentValue = safeFieldValue(terminalField);
boolean required = field.isRequired();
int pageIndex = resolveFirstWidgetPageIndex(document, terminalField, annotationPageMap);
List<String> options = resolveOptions(terminalField);
@@ -203,7 +203,7 @@ public class FormUtils {
continue;
}
String currentValue = safeValue(terminalField);
String currentValue = safeFieldValue(terminalField);
boolean required = field.isRequired();
boolean readOnly = field.isReadOnly();
List<String> options = resolveOptions(terminalField);
@@ -1234,7 +1234,11 @@ public class FormUtils {
return states;
}
private String safeValue(PDTerminalField field) {
/**
 * Null-safe accessor for string values.
 *
 * @param value the string to normalise; may be {@code null}
 * @return {@code value} unchanged when it is non-null, otherwise the empty string
 */
public String safeValue(String value) {
    if (value == null) {
        return "";
    }
    return value;
}
private String safeFieldValue(PDTerminalField field) {
try {
// PDChoice.getValueAsString() returns a raw COS string representation
// that doesn't reliably reflect the selected value. Use getValue()

View File

@@ -75,6 +75,7 @@ dependencies {
}
implementation 'org.apache.pdfbox:jbig2-imageio:3.0.4'
implementation 'com.opencsv:opencsv:5.12.0' // https://mvnrepository.com/artifact/com.opencsv/opencsv
implementation 'org.apache.poi:poi-ooxml:5.5.1'
// Batik
implementation 'org.apache.xmlgraphics:batik-all:1.19'

View File

@@ -2,11 +2,14 @@ package stirling.software.SPDF.controller.api.form;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.StringWriter;
import java.nio.charset.StandardCharsets;
import java.util.List;
import java.util.Map;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.poi.ss.usermodel.*;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
import org.springframework.http.MediaType;
import org.springframework.http.ResponseEntity;
import org.springframework.web.bind.annotation.PostMapping;
@@ -16,7 +19,9 @@ import org.springframework.web.bind.annotation.RequestPart;
import org.springframework.web.bind.annotation.RestController;
import org.springframework.web.multipart.MultipartFile;
import com.fasterxml.jackson.core.type.TypeReference;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.opencsv.CSVWriter;
import io.github.pixee.security.Filenames;
import io.swagger.v3.oas.annotations.Operation;
@@ -139,6 +144,117 @@ public class FormFillController {
}
}
/**
 * Exports the form fields of the uploaded PDF as a two-column CSV file ("Field Name", "Value").
 *
 * <p>When {@code data} is supplied it is parsed as a JSON object of field name to value and
 * applied to the in-memory document before the fields are read, so the export reflects those
 * values rather than only what is stored in the uploaded file.
 *
 * <p>NOTE(review): values originate from the uploaded PDF / request body and are written
 * verbatim. Cells starting with {@code =}, {@code +}, {@code -} or {@code @} may be evaluated
 * as formulas by spreadsheet applications; consider whether sanitising is required.
 *
 * @param file the PDF whose form fields are exported
 * @param data optional JSON part containing field values to apply before extraction
 * @return the CSV bytes (UTF-8) wrapped as a {@code text/csv} download response
 * @throws IOException if the PDF or the JSON part cannot be read
 */
@PostMapping(value = "/extract-csv", consumes = MediaType.MULTIPART_FORM_DATA_VALUE)
@Operation(
        summary = "Extract form fields as CSV",
        description =
                "Returns a CSV file containing all form field names and their current values")
public ResponseEntity<byte[]> extractCsv(
        @Parameter(
                        description = "The input PDF file",
                        required = true,
                        content =
                                @Content(
                                        mediaType = MediaType.APPLICATION_PDF_VALUE,
                                        schema = @Schema(type = "string", format = "binary")))
                @RequestParam("file")
                MultipartFile file,
        @RequestParam(value = "data", required = false) MultipartFile data)
        throws IOException {
    requirePdf(file);
    List<FormUtils.FormFieldInfo> fields = loadFieldsWithOverrides(file, data);

    StringWriter sw = new StringWriter();
    try (CSVWriter csvWriter = new CSVWriter(sw)) {
        csvWriter.writeNext(new String[] {"Field Name", "Value"});
        for (FormUtils.FormFieldInfo field : fields) {
            // Normalise null values to "" so CSV and XLSX exports treat them the same way.
            csvWriter.writeNext(
                    new String[] {field.name(), FormUtils.safeValue(field.value())});
        }
    }

    byte[] csvBytes = sw.toString().getBytes(StandardCharsets.UTF_8);
    String baseName = buildBaseName(file, "extracted");
    return WebResponseUtils.bytesToWebResponse(
            csvBytes, baseName + ".csv", MediaType.parseMediaType("text/csv"));
}

/**
 * Exports the form fields of the uploaded PDF as an XLSX workbook with a single sheet named
 * "Form Fields" containing a header row followed by one row per field.
 *
 * <p>When {@code data} is supplied it is parsed as a JSON object of field name to value and
 * applied to the in-memory document before the fields are read.
 *
 * <p>NOTE(review): as with the CSV export, values are written verbatim; confirm whether
 * formula-like values need sanitising.
 *
 * @param file the PDF whose form fields are exported
 * @param data optional JSON part containing field values to apply before extraction
 * @return the workbook bytes wrapped as an XLSX download response
 * @throws IOException if the PDF or the JSON part cannot be read, or the workbook cannot be
 *     written
 */
@PostMapping(value = "/extract-xlsx", consumes = MediaType.MULTIPART_FORM_DATA_VALUE)
@Operation(
        summary = "Extract form fields as XLSX",
        description =
                "Returns an Excel (XLSX) file containing all form field names and their current values")
public ResponseEntity<byte[]> extractXlsx(
        @Parameter(
                        description = "The input PDF file",
                        required = true,
                        content =
                                @Content(
                                        mediaType = MediaType.APPLICATION_PDF_VALUE,
                                        schema = @Schema(type = "string", format = "binary")))
                @RequestParam("file")
                MultipartFile file,
        @RequestParam(value = "data", required = false) MultipartFile data)
        throws IOException {
    requirePdf(file);
    List<FormUtils.FormFieldInfo> fields = loadFieldsWithOverrides(file, data);

    try (Workbook workbook = new XSSFWorkbook();
            ByteArrayOutputStream baos = new ByteArrayOutputStream()) {
        Sheet sheet = workbook.createSheet("Form Fields");

        // Header row
        Row headerRow = sheet.createRow(0);
        headerRow.createCell(0).setCellValue("Field Name");
        headerRow.createCell(1).setCellValue("Value");

        // Data rows
        int rowNum = 1;
        for (FormUtils.FormFieldInfo field : fields) {
            Row row = sheet.createRow(rowNum++);
            row.createCell(0).setCellValue(field.name());
            row.createCell(1).setCellValue(FormUtils.safeValue(field.value()));
        }

        // Auto-size both columns to fit their content.
        // NOTE(review): POI measures text with Java2D fonts here — confirm fonts are available
        // in headless deployments.
        sheet.autoSizeColumn(0);
        sheet.autoSizeColumn(1);

        workbook.write(baos);
        String baseName = buildBaseName(file, "extracted");
        return WebResponseUtils.bytesToWebResponse(
                baos.toByteArray(),
                baseName + ".xlsx",
                MediaType.parseMediaType(
                        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"));
    }
}

/**
 * Loads the PDF, repairs missing widget page references, optionally applies the field values
 * supplied in {@code data}, and returns the extracted field descriptors.
 *
 * <p>The document is closed before this method returns; callers only receive the extracted
 * field information. NOTE(review): this assumes {@code FormFieldInfo} holds plain values and
 * no live references into the document — confirm.
 *
 * @param file the uploaded PDF
 * @param data optional JSON part (object of field name to value); ignored when null or empty
 * @return the fields reported by {@code FormUtils.extractFormFields}
 * @throws IOException if the PDF or the JSON part cannot be read
 */
private List<FormUtils.FormFieldInfo> loadFieldsWithOverrides(
        MultipartFile file, MultipartFile data) throws IOException {
    try (PDDocument document = pdfDocumentFactory.load(file, true)) {
        FormUtils.repairMissingWidgetPageReferences(document);
        if (data != null && !data.isEmpty()) {
            Map<String, String> values =
                    objectMapper.readValue(
                            data.getInputStream(), new TypeReference<Map<String, String>>() {});
            FormUtils.applyFieldValues(document, values, false);
        }
        return FormUtils.extractFormFields(document);
    }
}
@PostMapping(value = "/modify-fields", consumes = MediaType.MULTIPART_FORM_DATA_VALUE)
@Operation(
summary = "Modify existing form fields",

View File

@@ -16,7 +16,7 @@ import { useRedaction } from '@app/contexts/RedactionContext';
import type { RedactionPendingTrackerAPI } from '@app/components/viewer/RedactionPendingTracker';
import { createStirlingFilesAndStubs } from '@app/services/fileStubHelpers';
import NavigationWarningModal from '@app/components/shared/NavigationWarningModal';
import { isStirlingFile } from '@app/types/fileContext';
import { isStirlingFile, getFormFillFileId } from '@app/types/fileContext';
import { useViewerRightRailButtons } from '@app/components/viewer/useViewerRightRailButtons';
import { StampPlacementOverlay } from '@app/components/viewer/StampPlacementOverlay';
import { RulerOverlay, type PageMeasureScales, type PageScaleInfo, type ViewportScale } from '@app/components/viewer/RulerOverlay';
@@ -788,19 +788,7 @@ const EmbedPdfViewerContent = ({
// Generate a unique identifier for the current file to detect file changes
const currentFileId = React.useMemo(() => {
if (!currentFile) return null;
if (isStirlingFile(currentFile)) {
return `stirling-${currentFile.fileId}`;
}
// File is also a Blob, but has more specific properties
if (currentFile instanceof File) {
return `file-${currentFile.name}-${currentFile.size}-${currentFile.lastModified}`;
}
// Fallback for any other object (shouldn't happen in practice)
return `unknown-${(currentFile as any).size || 0}`;
return getFormFillFileId(currentFile);
}, [currentFile]);
useEffect(() => {

View File

@@ -79,15 +79,33 @@
}
.actionBar {
display: flex;
flex-direction: column;
gap: 0.5rem;
}
.primaryActions {
display: flex;
gap: 0.5rem;
align-items: center;
}
.actionBar > *:first-child {
.primaryActions > *:first-child {
flex: 1;
}
.secondaryActions {
display: flex;
gap: 0.375rem;
align-items: center;
}
.secondaryActions > button {
flex: 1;
padding-left: 0.25rem;
padding-right: 0.25rem;
}
.fieldList {
flex: 1;
overflow: hidden;

View File

@@ -26,7 +26,7 @@ import { useNavigation } from '@app/contexts/NavigationContext';
import { useViewer } from '@app/contexts/ViewerContext';
import { useFileState } from '@app/contexts/FileContext';
import { Skeleton } from '@mantine/core';
import { isStirlingFile } from '@app/types/fileContext';
import { isStirlingFile, getFormFillFileId } from '@app/types/fileContext';
import type { BaseToolProps } from '@app/types/tool';
import type { FormField } from '@app/tools/formFill/types';
import { FieldInput } from '@app/tools/formFill/FieldInput';
@@ -40,6 +40,7 @@ import FileCopyIcon from '@mui/icons-material/FileCopy';
import BuildCircleIcon from '@mui/icons-material/BuildCircle';
import DescriptionIcon from '@mui/icons-material/Description';
import FileDownloadIcon from '@mui/icons-material/FileDownload';
import { extractFormFieldsCsv, extractFormFieldsXlsx } from '@app/tools/formFill/formApi';
import styles from '@app/tools/formFill/FormFill.module.css';
// ---------------------------------------------------------------------------
@@ -149,6 +150,44 @@ const FormFill = (_props: BaseToolProps) => {
return activeFiles[0];
}, [activeFiles, selectedFileIds]);
// Requests a CSV export of the current file (with the in-memory field values) and
// hands the resulting blob to the browser as a download.
const handleExtractCsv = useCallback(async () => {
  if (!currentFile) {
    return;
  }

  setExtracting(true);
  try {
    const csvBlob = await extractFormFieldsCsv(currentFile, allValues);
    const objectUrl = URL.createObjectURL(csvBlob);

    // A detached anchor is enough to trigger the download.
    const link = document.createElement('a');
    link.href = objectUrl;
    link.download = `form-data-${Date.now()}.csv`;
    link.click();

    // Release the object URL shortly after the click has been dispatched.
    setTimeout(() => {
      URL.revokeObjectURL(objectUrl);
    }, 250);
  } catch (err) {
    console.error('[FormFill] CSV extraction failed:', err);
    setSaveError('Failed to extract CSV');
  } finally {
    setExtracting(false);
  }
}, [currentFile, allValues]);
// Requests an XLSX export of the current file (with the in-memory field values) and
// hands the resulting blob to the browser as a download.
const handleExtractXlsx = useCallback(async () => {
  if (!currentFile) {
    return;
  }

  setExtracting(true);
  try {
    const xlsxBlob = await extractFormFieldsXlsx(currentFile, allValues);
    const objectUrl = URL.createObjectURL(xlsxBlob);

    // A detached anchor is enough to trigger the download.
    const link = document.createElement('a');
    link.href = objectUrl;
    link.download = `form-data-${Date.now()}.xlsx`;
    link.click();

    // Release the object URL shortly after the click has been dispatched.
    setTimeout(() => {
      URL.revokeObjectURL(objectUrl);
    }, 250);
  } catch (err) {
    console.error('[FormFill] XLSX extraction failed:', err);
    setSaveError('Failed to extract XLSX');
  } finally {
    setExtracting(false);
  }
}, [currentFile, allValues]);
const isActive = selectedTool === 'formFill';
useEffect(() => {
@@ -228,10 +267,8 @@ const FormFill = (_props: BaseToolProps) => {
}, [formState.isDirty]);
const handleRefresh = useCallback(() => {
if (currentFile && isStirlingFile(currentFile)) {
fetchFields(currentFile, currentFile.fileId);
} else if (currentFile) {
fetchFields(currentFile);
if (currentFile) {
fetchFields(currentFile, getFormFillFileId(currentFile) ?? undefined);
}
}, [currentFile, fetchFields]);
@@ -406,38 +443,63 @@ const FormFill = (_props: BaseToolProps) => {
{/* Action buttons */}
<div className={styles.actionBar}>
<Button
leftSection={<SaveIcon sx={{ fontSize: 14 }} />}
size="xs"
onClick={handleSave}
loading={saving}
disabled={!formState.isDirty && !flattenChanged}
flex={1}
>
Save
</Button>
<Button
variant="light"
color="blue"
leftSection={<FileDownloadIcon sx={{ fontSize: 14 }} />}
loading={extracting}
onClick={handleExtractJson}
size="xs"
>
Extract JSON
</Button>
<Tooltip label="Re-scan fields" withArrow position="bottom">
<ActionIcon
variant="light"
size="md"
onClick={handleRefresh}
aria-label="Re-scan form fields"
<div className={styles.primaryActions}>
<Button
leftSection={<SaveIcon sx={{ fontSize: 14 }} />}
size="xs"
onClick={handleSave}
loading={saving}
disabled={!formState.isDirty && !flattenChanged}
>
<RefreshIcon sx={{ fontSize: 16 }} />
</ActionIcon>
</Tooltip>
Save
</Button>
<Tooltip label="Re-scan fields" withArrow position="bottom">
<ActionIcon
variant="light"
size="md"
onClick={handleRefresh}
aria-label="Re-scan form fields"
>
<RefreshIcon sx={{ fontSize: 16 }} />
</ActionIcon>
</Tooltip>
</div>
<div className={styles.secondaryActions}>
<Button
variant="light"
color="blue"
leftSection={<FileDownloadIcon sx={{ fontSize: 14 }} />}
loading={extracting}
onClick={handleExtractJson}
size="xs"
>
JSON
</Button>
<Button
variant="light"
color="blue"
leftSection={<FileDownloadIcon sx={{ fontSize: 14 }} />}
loading={extracting}
onClick={handleExtractCsv}
size="xs"
>
CSV
</Button>
<Button
variant="light"
color="blue"
leftSection={<FileDownloadIcon sx={{ fontSize: 14 }} />}
loading={extracting}
onClick={handleExtractXlsx}
size="xs"
>
XLSX
</Button>
</div>
</div>
{/* Error message */}

View File

@@ -44,3 +44,49 @@ export async function fillFormFields(
return response.data;
}
/**
 * Request a CSV export of a PDF's form fields.
 * Calls POST /api/v1/form/extract-csv
 *
 * @param file   The PDF to send as the `file` multipart part.
 * @param values Optional field values; when provided they are serialised to JSON
 *               and sent as the `data` multipart part.
 * @returns The response body as a Blob.
 */
export async function extractFormFieldsCsv(
  file: File | Blob,
  values?: Record<string, string>
): Promise<Blob> {
  const payload = new FormData();
  payload.append('file', file);

  if (values) {
    const jsonPart = new Blob([JSON.stringify(values)], { type: 'application/json' });
    payload.append('data', jsonPart);
  }

  const { data } = await apiClient.post('/api/v1/form/extract-csv', payload, {
    responseType: 'blob',
  });
  return data;
}
/**
 * Request an XLSX export of a PDF's form fields.
 * Calls POST /api/v1/form/extract-xlsx
 *
 * @param file   The PDF to send as the `file` multipart part.
 * @param values Optional field values; when provided they are serialised to JSON
 *               and sent as the `data` multipart part.
 * @returns The response body as a Blob.
 */
export async function extractFormFieldsXlsx(
  file: File | Blob,
  values?: Record<string, string>
): Promise<Blob> {
  const payload = new FormData();
  payload.append('file', file);

  if (values) {
    const jsonPart = new Blob([JSON.stringify(values)], { type: 'application/json' });
    payload.append('data', jsonPart);
  }

  const { data } = await apiClient.post('/api/v1/form/extract-xlsx', payload, {
    responseType: 'blob',
  });
  return data;
}

View File

@@ -65,7 +65,7 @@ export function createFileId(): FileId {
return window.crypto.randomUUID() as FileId;
}
// Fallback for environments without randomUUID
return 'xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx'.replace(/[xy]/g, function(c) {
return 'xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx'.replace(/[xy]/g, function (c) {
const r = Math.random() * 16 | 0;
const v = c == 'x' ? r : (r & 0x3 | 0x8);
return v.toString(16);
@@ -85,9 +85,29 @@ export interface StirlingFile extends File {
}
// Type guard to check if a File object has an embedded fileId
export function isStirlingFile(file: File): file is StirlingFile {
return 'fileId' in file && typeof (file as any).fileId === 'string' &&
'quickKey' in file && typeof (file as any).quickKey === 'string';
export function isStirlingFile(file: File | Blob): file is StirlingFile {
return file instanceof File && 'fileId' in file && typeof (file as any).fileId === 'string' &&
'quickKey' in file && typeof (file as any).quickKey === 'string';
}
/**
 * Derive the identifier used to key form-fill state for a file.
 *
 * - StirlingFile: `stirling-<fileId>`
 * - plain File:   `file-<name>-<size>-<lastModified>`
 * - anything else (e.g. a bare Blob): `blob-<size>`, with a missing size treated as 0
 *
 * Only the StirlingFile form is based on a dedicated id; the other two are built from
 * file metadata, so two different inputs with identical metadata map to the same key.
 *
 * @returns The identifier, or `null` when no file is given.
 */
export function getFormFillFileId(file: File | Blob | null | undefined): string | null {
  if (!file) {
    return null;
  }

  if (isStirlingFile(file)) {
    return `stirling-${file.fileId}`;
  }

  if (file instanceof File) {
    const { name, size, lastModified } = file;
    return `file-${name}-${size}-${lastModified}`;
  }

  // Fallback for Blobs or other objects
  const { size } = file as { size?: number };
  return `blob-${size || 0}`;
}
// Create a StirlingFile from a regular File object
@@ -141,11 +161,11 @@ export function extractFiles(files: StirlingFile[]): File[] {
// Check if an object is a File or StirlingFile (replaces instanceof File checks)
export function isFileObject(obj: any): obj is File | StirlingFile {
return obj &&
typeof obj.name === 'string' &&
typeof obj.size === 'number' &&
typeof obj.type === 'string' &&
typeof obj.lastModified === 'number' &&
typeof obj.arrayBuffer === 'function';
typeof obj.name === 'string' &&
typeof obj.size === 'number' &&
typeof obj.type === 'string' &&
typeof obj.lastModified === 'number' &&
typeof obj.arrayBuffer === 'function';
}