Add PDF JSON conversion endpoints

This commit is contained in:
Anthony Stirling 2025-10-22 15:59:21 +01:00
parent c9eee00d66
commit ece4641432
7 changed files with 657 additions and 0 deletions

View File

@ -0,0 +1,73 @@
package stirling.software.SPDF.controller.api.converters;
import org.springframework.http.MediaType;
import org.springframework.http.ResponseEntity;
import org.springframework.web.bind.annotation.ModelAttribute;
import org.springframework.web.multipart.MultipartFile;
import io.github.pixee.security.Filenames;
import io.swagger.v3.oas.annotations.Operation;
import lombok.RequiredArgsConstructor;
import stirling.software.SPDF.config.swagger.StandardPdfResponse;
import stirling.software.SPDF.service.PdfJsonConversionService;
import stirling.software.common.annotations.AutoJobPostMapping;
import stirling.software.common.annotations.api.ConvertApi;
import stirling.software.common.model.api.GeneralFile;
import stirling.software.common.model.api.PDFFile;
import stirling.software.common.util.ExceptionUtils;
import stirling.software.common.util.WebResponseUtils;
/**
 * Controller exposing the PDF-to-JSON and JSON-to-PDF conversion endpoints.
 *
 * <p>Both endpoints accept a single multipart upload, delegate the conversion to
 * {@link PdfJsonConversionService} and return the converted bytes as a download whose name is
 * derived from the uploaded file name.
 */
@ConvertApi
@RequiredArgsConstructor
public class ConvertPdfJsonController {

    /** Name stem used when the upload carries no usable file name. */
    private static final String DEFAULT_BASE_NAME = "document";

    private final PdfJsonConversionService pdfJsonConversionService;

    /**
     * Converts the uploaded PDF into its JSON representation.
     *
     * @param request form model carrying the uploaded PDF as {@code fileInput}
     * @return the JSON bytes, served as {@code <base name>.json}
     * @throws Exception if the upload is missing or the conversion fails
     */
    @AutoJobPostMapping(consumes = "multipart/form-data", value = "/pdf/json")
    @Operation(
            summary = "Convert PDF to JSON",
            description =
                    "Extracts PDF text, fonts, and metadata into an editable JSON structure that can be"
                            + " transformed back into a PDF. Input:PDF Output:JSON Type:SISO")
    public ResponseEntity<byte[]> convertPdfToJson(@ModelAttribute PDFFile request)
            throws Exception {
        MultipartFile inputFile = request.getFileInput();
        if (inputFile == null) {
            throw ExceptionUtils.createNullArgumentException("fileInput");
        }
        byte[] jsonBytes = pdfJsonConversionService.convertPdfToJson(inputFile);
        String docName = resolveBaseName(inputFile) + ".json";
        return WebResponseUtils.bytesToWebResponse(
                jsonBytes, docName, MediaType.APPLICATION_JSON);
    }

    /**
     * Rebuilds a PDF from an uploaded JSON document.
     *
     * @param request form model carrying the uploaded JSON as {@code fileInput}
     * @return the PDF bytes, served as {@code <base name>.pdf}
     * @throws Exception if the upload is missing or the conversion fails
     */
    @AutoJobPostMapping(consumes = "multipart/form-data", value = "/json/pdf")
    @StandardPdfResponse
    @Operation(
            summary = "Convert JSON to PDF",
            description =
                    "Rebuilds a PDF from the editable JSON structure generated by the PDF to JSON"
                            + " endpoint. Input:JSON Output:PDF Type:SISO")
    public ResponseEntity<byte[]> convertJsonToPdf(@ModelAttribute GeneralFile request)
            throws Exception {
        MultipartFile jsonFile = request.getFileInput();
        if (jsonFile == null) {
            throw ExceptionUtils.createNullArgumentException("fileInput");
        }
        byte[] pdfBytes = pdfJsonConversionService.convertJsonToPdf(jsonFile);
        String baseName = resolveBaseName(jsonFile);
        // "report.pdf.json" strips down to "report.pdf"; do not append a second ".pdf".
        String docName = baseName.endsWith(".pdf") ? baseName : baseName + ".pdf";
        return WebResponseUtils.bytesToWebResponse(pdfBytes, docName);
    }

    /**
     * Derives the download name stem from the uploaded file name by sanitising it and removing
     * the last extension.
     *
     * <p>Falls back to {@value #DEFAULT_BASE_NAME} when the upload has no name, or when nothing
     * is left after the extension is removed (for example an upload named {@code ".json"}).
     */
    private static String resolveBaseName(MultipartFile file) {
        String originalName = file.getOriginalFilename();
        if (originalName == null || originalName.isBlank()) {
            return DEFAULT_BASE_NAME;
        }
        String simpleName = Filenames.toSimpleFileName(originalName);
        if (simpleName == null) {
            return DEFAULT_BASE_NAME;
        }
        String baseName = simpleName.replaceFirst("[.][^.]+$", "");
        return baseName.isBlank() ? DEFAULT_BASE_NAME : baseName;
    }
}

View File

@ -0,0 +1,25 @@
package stirling.software.SPDF.model.json;
import java.util.ArrayList;
import java.util.List;
import com.fasterxml.jackson.annotation.JsonInclude;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.NoArgsConstructor;
/**
 * Root object of the editable JSON representation of a PDF: document metadata plus the fonts
 * and pages that make up the document. Properties whose value is {@code null} are omitted when
 * the object is serialized.
 */
@JsonInclude(JsonInclude.Include.NON_NULL)
@Data
@NoArgsConstructor
@AllArgsConstructor
@Builder
public class PdfJsonDocument {

    /** Document information entries; may be {@code null}. */
    private PdfJsonMetadata metadata;

    /** Fonts referenced by the text elements; defaults to an empty mutable list. */
    @Builder.Default
    private List<PdfJsonFont> fonts = new ArrayList<>();

    /** Pages of the document in order; defaults to an empty mutable list. */
    @Builder.Default
    private List<PdfJsonPage> pages = new ArrayList<>();
}

View File

@ -0,0 +1,25 @@
package stirling.software.SPDF.model.json;
import com.fasterxml.jackson.annotation.JsonInclude;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.NoArgsConstructor;
/**
 * JSON description of one font used by a document. Properties whose value is {@code null} are
 * omitted when the object is serialized.
 */
@JsonInclude(JsonInclude.Include.NON_NULL)
@Data
@NoArgsConstructor
@AllArgsConstructor
@Builder
public class PdfJsonFont {

    /** Identifier that text elements use to refer to this font. */
    private String id;

    /** Font name. */
    private String name;

    /** Font subtype. */
    private String subtype;

    /** Description of the font's encoding. */
    private String encoding;

    /** Whether the font is regarded as embedded in the source document. */
    private Boolean embedded;

    /** Name of the Standard 14 font that can stand in for this font, if any. */
    private String standard14Name;

    /** Flags taken from the font descriptor, if one is available. */
    private Integer fontDescriptorFlags;

    /** Base64-encoded font program bytes, if any were extracted. */
    private String base64Data;
}

View File

@ -0,0 +1,27 @@
package stirling.software.SPDF.model.json;
import com.fasterxml.jackson.annotation.JsonInclude;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.NoArgsConstructor;
/**
 * JSON view of a document's information entries. Properties whose value is {@code null} are
 * omitted when the object is serialized.
 */
@JsonInclude(JsonInclude.Include.NON_NULL)
@Data
@NoArgsConstructor
@AllArgsConstructor
@Builder
public class PdfJsonMetadata {

    /** Document title. */
    private String title;

    /** Document author. */
    private String author;

    /** Document subject. */
    private String subject;

    /** Document keywords. */
    private String keywords;

    /** Application that created the original content. */
    private String creator;

    /** Application that produced the PDF. */
    private String producer;

    /** Creation timestamp as text. */
    private String creationDate;

    /** Last-modification timestamp as text. */
    private String modificationDate;

    /** Value of the document's "trapped" entry. */
    private String trapped;

    /** Number of pages in the document. */
    private Integer numberOfPages;
}

View File

@ -0,0 +1,26 @@
package stirling.software.SPDF.model.json;
import java.util.ArrayList;
import java.util.List;
import com.fasterxml.jackson.annotation.JsonInclude;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.NoArgsConstructor;
/**
 * JSON description of one page: its number, dimensions, rotation and the text placed on it.
 * Properties whose value is {@code null} are omitted when the object is serialized.
 */
@JsonInclude(JsonInclude.Include.NON_NULL)
@Data
@NoArgsConstructor
@AllArgsConstructor
@Builder
public class PdfJsonPage {

    /** Page number within the document. */
    private Integer pageNumber;

    /** Page width. */
    private Float width;

    /** Page height. */
    private Float height;

    /** Page rotation. */
    private Integer rotation;

    /** Text elements on this page; defaults to an empty mutable list. */
    @Builder.Default
    private List<PdfJsonTextElement> textElements = new ArrayList<>();
}

View File

@ -0,0 +1,29 @@
package stirling.software.SPDF.model.json;
import java.util.ArrayList;
import java.util.List;
import com.fasterxml.jackson.annotation.JsonInclude;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.NoArgsConstructor;
/**
 * JSON description of one positioned piece of text on a page. Properties whose value is
 * {@code null} are omitted when the object is serialized.
 */
@JsonInclude(JsonInclude.Include.NON_NULL)
@Data
@NoArgsConstructor
@AllArgsConstructor
@Builder
public class PdfJsonTextElement {

    /** The text content. */
    private String text;

    /** Identifier of the font used to draw the text. */
    private String fontId;

    /** Font size. */
    private Float fontSize;

    /** Horizontal position of the text. */
    private Float x;

    /** Vertical position of the text. */
    private Float y;

    /** Width of the text. */
    private Float width;

    /** Height of the text. */
    private Float height;

    /** Text matrix components; defaults to an empty mutable list. */
    @Builder.Default
    private List<Float> textMatrix = new ArrayList<>();

    /** Numeric text rendering mode. */
    private Integer renderingMode;
}

View File

@ -0,0 +1,452 @@
package stirling.software.SPDF.service;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.time.Instant;
import java.time.format.DateTimeParseException;
import java.util.ArrayList;
import java.util.Base64;
import java.util.Calendar;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Objects;
import java.util.Optional;
import java.util.TimeZone;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDDocumentInformation;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDPageContentStream;
import org.apache.pdfbox.pdmodel.PDPageContentStream.AppendMode;
import org.apache.pdfbox.pdmodel.common.PDRectangle;
import org.apache.pdfbox.pdmodel.font.PDFont;
import org.apache.pdfbox.pdmodel.font.PDFontDescriptor;
import org.apache.pdfbox.pdmodel.font.PDType0Font;
import org.apache.pdfbox.pdmodel.font.PDType1Font;
import org.apache.pdfbox.pdmodel.font.Standard14Fonts;
import org.apache.pdfbox.pdmodel.graphics.state.RenderingMode;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.pdfbox.text.TextPosition;
import org.springframework.stereotype.Service;
import org.springframework.web.multipart.MultipartFile;
import com.fasterxml.jackson.databind.ObjectMapper;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
import stirling.software.SPDF.model.json.PdfJsonDocument;
import stirling.software.SPDF.model.json.PdfJsonFont;
import stirling.software.SPDF.model.json.PdfJsonMetadata;
import stirling.software.SPDF.model.json.PdfJsonPage;
import stirling.software.SPDF.model.json.PdfJsonTextElement;
import stirling.software.common.service.CustomPDFDocumentFactory;
import stirling.software.common.util.ExceptionUtils;
@Slf4j
@Service
@RequiredArgsConstructor
public class PdfJsonConversionService {
private final CustomPDFDocumentFactory pdfDocumentFactory;
private final ObjectMapper objectMapper;
/**
 * Converts an uploaded PDF into its pretty-printed JSON representation.
 *
 * <p>The document is run through a {@code TextCollectingStripper}, which fills the font and
 * per-page text maps as a side effect; the plain text it returns is discarded.
 *
 * @param file the uploaded PDF
 * @return the serialized {@link PdfJsonDocument}
 * @throws IOException if the PDF cannot be read or the JSON cannot be written
 */
public byte[] convertPdfToJson(MultipartFile file) throws IOException {
    if (file == null) {
        throw ExceptionUtils.createNullArgumentException("fileInput");
    }
    try (PDDocument pdf = pdfDocumentFactory.load(file.getInputStream(), true)) {
        Map<String, PdfJsonFont> fontsById = new LinkedHashMap<>();
        Map<Integer, List<PdfJsonTextElement>> elementsByPage = new LinkedHashMap<>();

        TextCollectingStripper collector =
                new TextCollectingStripper(fontsById, elementsByPage);
        collector.setSortByPosition(true);
        collector.getText(pdf);

        PdfJsonDocument model =
                PdfJsonDocument.builder()
                        .metadata(extractMetadata(pdf))
                        .fonts(new ArrayList<>(fontsById.values()))
                        .pages(extractPages(pdf, elementsByPage))
                        .build();
        return objectMapper.writerWithDefaultPrettyPrinter().writeValueAsBytes(model);
    }
}
/**
 * Rebuilds a PDF from an uploaded JSON document.
 *
 * <p>Metadata is applied first, then every page of the model is added with its size and
 * rotation (US Letter, 612 x 792, when a dimension is missing) and its text elements are
 * written. Null entries in the page list are skipped.
 *
 * @param file the uploaded JSON document
 * @return the bytes of the generated PDF
 * @throws IOException if the JSON cannot be parsed, holds no document object, or the PDF
 *     cannot be written
 */
public byte[] convertJsonToPdf(MultipartFile file) throws IOException {
    if (file == null) {
        throw ExceptionUtils.createNullArgumentException("fileInput");
    }
    byte[] jsonBytes = file.getBytes();
    PdfJsonDocument pdfJson = objectMapper.readValue(jsonBytes, PdfJsonDocument.class);
    if (pdfJson == null) {
        // A literal JSON "null" root deserializes to a null object rather than failing.
        throw new IOException("JSON input does not contain a document object");
    }
    try (PDDocument document = new PDDocument()) {
        applyMetadata(document, pdfJson.getMetadata());
        Map<String, PDFont> fontMap = buildFontMap(document, pdfJson.getFonts());
        PDFont defaultFont = new PDType1Font(Standard14Fonts.FontName.HELVETICA);
        List<PdfJsonPage> pages = pdfJson.getPages() != null ? pdfJson.getPages() : List.of();
        for (PdfJsonPage pageModel : pages) {
            if (pageModel == null) {
                continue;
            }
            PDRectangle pageSize =
                    new PDRectangle(
                            safeFloat(pageModel.getWidth(), 612f),
                            safeFloat(pageModel.getHeight(), 792f));
            PDPage page = new PDPage(pageSize);
            if (pageModel.getRotation() != null) {
                page.setRotation(pageModel.getRotation());
            }
            document.addPage(page);
            List<PdfJsonTextElement> elements = pageModel.getTextElements();
            if (elements == null || elements.isEmpty()) {
                continue;
            }
            writeTextElements(document, page, elements, fontMap, defaultFont);
        }
        try (ByteArrayOutputStream baos = new ByteArrayOutputStream()) {
            document.save(baos);
            return baos.toByteArray();
        }
    }
}

/**
 * Writes the given text elements onto a page inside a single text object.
 *
 * <p>Each element is drawn with its own font (or the default font when its font id is
 * unknown), size, rendering mode and text matrix. When the chosen font cannot encode the
 * text, the default font is tried; when that fails as well the element is skipped so one
 * unencodable string does not abort the whole conversion. Null elements are skipped.
 */
private void writeTextElements(
        PDDocument document,
        PDPage page,
        List<PdfJsonTextElement> elements,
        Map<String, PDFont> fontMap,
        PDFont defaultFont)
        throws IOException {
    try (PDPageContentStream contentStream =
            new PDPageContentStream(document, page, AppendMode.APPEND, true, true)) {
        contentStream.beginText();
        for (PdfJsonTextElement element : elements) {
            if (element == null) {
                continue;
            }
            String text = Objects.toString(element.getText(), "");
            PDFont font = fontMap.getOrDefault(element.getFontId(), defaultFont);
            float fontSize = safeFloat(element.getFontSize(), 12f);
            contentStream.setFont(font, fontSize);
            applyRenderingMode(contentStream, element.getRenderingMode());
            applyTextMatrix(contentStream, element);
            try {
                contentStream.showText(text);
            } catch (IllegalArgumentException ex) {
                log.debug(
                        "Falling back to default font for text element due to encoding issue: {}",
                        ex.getMessage());
                try {
                    contentStream.setFont(defaultFont, fontSize);
                    contentStream.showText(text);
                } catch (IllegalArgumentException fallbackEx) {
                    log.warn(
                            "Skipping text element that the default font cannot encode: {}",
                            fallbackEx.getMessage());
                }
            }
        }
        contentStream.endText();
    }
}
/**
 * Builds the page models for every page of the document.
 *
 * <p>Pages are numbered from 1 in document order; width and height come from the page's media
 * box. A page without an entry in {@code textByPage} gets a new empty element list.
 *
 * @param document the source document
 * @param textByPage collected text elements keyed by 1-based page number
 * @return one model per page, in document order
 */
private List<PdfJsonPage> extractPages(
        PDDocument document, Map<Integer, List<PdfJsonTextElement>> textByPage) {
    List<PdfJsonPage> result = new ArrayList<>();
    int pageNumber = 1;
    for (PDPage pdPage : document.getPages()) {
        PDRectangle box = pdPage.getMediaBox();
        result.add(
                PdfJsonPage.builder()
                        .pageNumber(pageNumber)
                        .width(box.getWidth())
                        .height(box.getHeight())
                        .rotation(pdPage.getRotation())
                        .textElements(textByPage.getOrDefault(pageNumber, new ArrayList<>()))
                        .build());
        pageNumber++;
    }
    return result;
}
/**
 * Copies the document information entries and the page count into a metadata model.
 *
 * <p>Dates are converted with {@code formatCalendar}. When the document exposes no
 * information object, only the page count is filled in.
 *
 * @param document the source document
 * @return the populated metadata model, never {@code null}
 */
private PdfJsonMetadata extractMetadata(PDDocument document) {
    PDDocumentInformation docInfo = document.getDocumentInformation();
    if (docInfo == null) {
        PdfJsonMetadata countOnly = new PdfJsonMetadata();
        countOnly.setNumberOfPages(document.getNumberOfPages());
        return countOnly;
    }
    return PdfJsonMetadata.builder()
            .title(docInfo.getTitle())
            .author(docInfo.getAuthor())
            .subject(docInfo.getSubject())
            .keywords(docInfo.getKeywords())
            .creator(docInfo.getCreator())
            .producer(docInfo.getProducer())
            .creationDate(formatCalendar(docInfo.getCreationDate()))
            .modificationDate(formatCalendar(docInfo.getModificationDate()))
            .trapped(docInfo.getTrapped())
            .numberOfPages(document.getNumberOfPages())
            .build();
}
/**
 * Writes the metadata model into the document's information dictionary.
 *
 * <p>Does nothing when {@code metadata} is {@code null}. Dates are only set when present and
 * parseable by {@code parseInstant}. A "trapped" value that PDFBox rejects is logged and
 * ignored instead of aborting the conversion, because the value comes from user-editable JSON.
 *
 * @param document the document being built
 * @param metadata the metadata to apply; may be {@code null}
 */
private void applyMetadata(PDDocument document, PdfJsonMetadata metadata) {
    if (metadata == null) {
        return;
    }
    PDDocumentInformation info = document.getDocumentInformation();
    info.setTitle(metadata.getTitle());
    info.setAuthor(metadata.getAuthor());
    info.setSubject(metadata.getSubject());
    info.setKeywords(metadata.getKeywords());
    info.setCreator(metadata.getCreator());
    info.setProducer(metadata.getProducer());
    if (metadata.getCreationDate() != null) {
        parseInstant(metadata.getCreationDate())
                .ifPresent(instant -> info.setCreationDate(toCalendar(instant)));
    }
    if (metadata.getModificationDate() != null) {
        parseInstant(metadata.getModificationDate())
                .ifPresent(instant -> info.setModificationDate(toCalendar(instant)));
    }
    try {
        info.setTrapped(metadata.getTrapped());
    } catch (IllegalArgumentException ex) {
        // setTrapped only accepts a fixed set of values and throws for anything else.
        log.warn(
                "Ignoring unsupported trapped value '{}': {}",
                metadata.getTrapped(),
                ex.getMessage());
    }
}
/**
 * Creates a PDFBox font for every font model and indexes the results by font id.
 *
 * <p>Models for which no font is created, or which carry no id, are left out of the map.
 *
 * @param document the document the fonts are created for
 * @param fonts the font models; may be {@code null}
 * @return a mutable map from font id to font, empty when {@code fonts} is {@code null}
 * @throws IOException if creating a font fails
 */
private Map<String, PDFont> buildFontMap(PDDocument document, List<PdfJsonFont> fonts)
        throws IOException {
    Map<String, PDFont> fontsById = new HashMap<>();
    if (fonts != null) {
        for (PdfJsonFont model : fonts) {
            PDFont pdFont = createFontFromModel(document, model);
            // The order of these checks matters: a null model yields a null font, so the id
            // is never read from it.
            if (pdFont == null || model.getId() == null) {
                continue;
            }
            fontsById.put(model.getId(), pdFont);
        }
    }
    return fontsById;
}
/**
 * Creates a PDFBox font for a font model.
 *
 * <p>Resolution order: the embedded font data (if present and loadable), then the Standard 14
 * font named by the model, then Helvetica.
 *
 * @param document the document the font is created for
 * @param fontModel the font model; may be {@code null}
 * @return the font, or {@code null} when {@code fontModel} is {@code null}
 * @throws IOException declared for callers; embedded-font load failures are handled
 *     internally by falling back
 */
private PDFont createFontFromModel(PDDocument document, PdfJsonFont fontModel)
        throws IOException {
    if (fontModel == null) {
        return null;
    }
    PDFont embeddedFont = loadEmbeddedFont(document, fontModel);
    if (embeddedFont != null) {
        return embeddedFont;
    }
    String standardName = fontModel.getStandard14Name();
    if (standardName != null) {
        Standard14Fonts.FontName fontName = null;
        try {
            fontName = Standard14Fonts.getMappedFontName(standardName);
        } catch (IllegalArgumentException ex) {
            // Treated the same as "no mapping" below.
        }
        // An unknown name is reported as null, which PDType1Font must not be given.
        if (fontName != null) {
            return new PDType1Font(fontName);
        }
        log.warn("Unknown Standard 14 font {}, using Helvetica", standardName);
    }
    return new PDType1Font(Standard14Fonts.FontName.HELVETICA);
}

/**
 * Tries to load the Base64-encoded font data of a model, first as a Type0 font and then as a
 * Type1 font.
 *
 * @return the loaded font, or {@code null} when the model has no data, the data is not valid
 *     Base64, or neither loader accepts it
 */
private PDFont loadEmbeddedFont(PDDocument document, PdfJsonFont fontModel) {
    String base64 = fontModel.getBase64Data();
    if (base64 == null || base64.isBlank()) {
        return null;
    }
    byte[] fontBytes;
    try {
        fontBytes = Base64.getDecoder().decode(base64);
    } catch (IllegalArgumentException ex) {
        log.warn(
                "Embedded font {} has invalid Base64 data, falling back to Standard 14 if available",
                fontModel.getName());
        return null;
    }
    try (InputStream fontStream = new ByteArrayInputStream(fontBytes)) {
        return PDType0Font.load(document, fontStream, true);
    } catch (IOException ex) {
        log.debug(
                "Unable to load font as Type0 ({}), trying Type1: {}",
                fontModel.getName(),
                ex.getMessage());
    }
    try (InputStream fontStream = new ByteArrayInputStream(fontBytes)) {
        return new PDType1Font(document, fontStream);
    } catch (IOException innerEx) {
        log.warn(
                "Failed to load embedded font {}, falling back to Standard 14 if available",
                fontModel.getName(),
                innerEx);
        return null;
    }
}
/**
 * Positions the next text of the content stream for the given element.
 *
 * <p>Uses the element's six-value text matrix when it is complete (six non-null, finite
 * values); otherwise falls back to a plain translation to the element's x/y position, with
 * missing coordinates treated as 0.
 *
 * @param contentStream the stream being written, inside a text object
 * @param element the element supplying the matrix or position
 * @throws IOException if writing to the content stream fails
 */
private void applyTextMatrix(PDPageContentStream contentStream, PdfJsonTextElement element)
        throws IOException {
    List<Float> matrix = element.getTextMatrix();
    if (isCompleteMatrix(matrix)) {
        // The content stream takes a Matrix object rather than six separate numbers.
        contentStream.setTextMatrix(
                new org.apache.pdfbox.util.Matrix(
                        matrix.get(0),
                        matrix.get(1),
                        matrix.get(2),
                        matrix.get(3),
                        matrix.get(4),
                        matrix.get(5)));
        return;
    }
    float x = safeFloat(element.getX(), 0f);
    float y = safeFloat(element.getY(), 0f);
    contentStream.setTextMatrix(org.apache.pdfbox.util.Matrix.getTranslateInstance(x, y));
}

/** Returns whether the list holds exactly six non-null, finite values. */
private boolean isCompleteMatrix(List<Float> matrix) {
    if (matrix == null || matrix.size() != 6) {
        return false;
    }
    for (Float value : matrix) {
        if (value == null || value.isNaN() || value.isInfinite()) {
            return false;
        }
    }
    return true;
}
/**
 * Sets the text rendering mode on the content stream.
 *
 * <p>A {@code null} mode leaves the stream untouched. A value that does not correspond to a
 * known rendering mode is logged and ignored; the lookup may signal that either with an
 * {@link IllegalArgumentException} or with an index-out-of-bounds error, so both are handled.
 *
 * @param contentStream the stream being written
 * @param renderingMode numeric rendering mode; may be {@code null}
 * @throws IOException if writing to the content stream fails
 */
private void applyRenderingMode(PDPageContentStream contentStream, Integer renderingMode)
        throws IOException {
    if (renderingMode == null) {
        return;
    }
    RenderingMode mode;
    try {
        mode = RenderingMode.fromInt(renderingMode);
    } catch (IllegalArgumentException | IndexOutOfBoundsException ex) {
        log.debug(
                "Ignoring unsupported rendering mode {}: {}", renderingMode, ex.getMessage());
        return;
    }
    contentStream.setRenderingMode(mode);
}
/**
 * Unboxes a float, substituting a default for unusable values.
 *
 * @param value the boxed value; may be {@code null}
 * @param defaultValue returned when {@code value} is {@code null}, NaN or infinite
 * @return {@code value} when it is a finite number, otherwise {@code defaultValue}
 */
private float safeFloat(Float value, float defaultValue) {
    boolean usable = value != null && !value.isNaN() && !value.isInfinite();
    return usable ? value : defaultValue;
}
/**
 * Formats a calendar as the string form of its {@link Instant}.
 *
 * @param calendar the calendar; may be {@code null}
 * @return the instant's string form, or {@code null} when {@code calendar} is {@code null}
 */
private String formatCalendar(Calendar calendar) {
    return calendar == null ? null : calendar.toInstant().toString();
}
/**
 * Parses text in the format accepted by {@link Instant#parse(CharSequence)}.
 *
 * @param value the text to parse; may be {@code null}
 * @return the parsed instant, or an empty optional when {@code value} is {@code null} or
 *     cannot be parsed (the latter is logged as a warning)
 */
private Optional<Instant> parseInstant(String value) {
    if (value == null) {
        return Optional.empty();
    }
    try {
        return Optional.of(Instant.parse(value));
    } catch (DateTimeParseException ex) {
        log.warn("Failed to parse instant '{}': {}", value, ex.getMessage());
        return Optional.empty();
    }
}
/**
 * Converts an instant into a UTC calendar holding the same epoch-millisecond time.
 *
 * @param instant the instant to convert
 * @return a new calendar in the UTC time zone
 */
private Calendar toCalendar(Instant instant) {
    final TimeZone utc = TimeZone.getTimeZone("UTC");
    final long epochMillis = instant.toEpochMilli();
    Calendar utcCalendar = Calendar.getInstance(utc);
    utcCalendar.setTimeInMillis(epochMillis);
    return utcCalendar;
}
private class TextCollectingStripper extends PDFTextStripper {
private final Map<String, PdfJsonFont> fonts;
private final Map<Integer, List<PdfJsonTextElement>> textByPage;
private int currentPage = 1;
TextCollectingStripper(
Map<String, PdfJsonFont> fonts, Map<Integer, List<PdfJsonTextElement>> textByPage)
throws IOException {
this.fonts = fonts;
this.textByPage = textByPage;
}
@Override
protected void startPage(PDPage page) throws IOException {
super.startPage(page);
currentPage = getCurrentPageNo();
}
@Override
protected void writeString(String text, List<TextPosition> textPositions)
throws IOException {
if (textPositions == null || textPositions.isEmpty()) {
return;
}
List<PdfJsonTextElement> pageElements =
textByPage.computeIfAbsent(currentPage, key -> new ArrayList<>());
for (TextPosition position : textPositions) {
PDFont font = position.getFont();
String fontId = registerFont(font);
PdfJsonTextElement element = new PdfJsonTextElement();
element.setText(position.getUnicode());
element.setFontId(fontId);
element.setFontSize(position.getFontSizeInPt());
element.setX(position.getXDirAdj());
element.setY(position.getYDirAdj());
element.setWidth(position.getWidthDirAdj());
element.setHeight(position.getHeightDir());
element.setRenderingMode(position.getRenderingMode().intValue());
element.setTextMatrix(extractMatrix(position));
pageElements.add(element);
}
}
private List<Float> extractMatrix(TextPosition position) {
float[] values = new float[6];
values[0] = position.getTextMatrix().getValue(0, 0);
values[1] = position.getTextMatrix().getValue(0, 1);
values[2] = position.getTextMatrix().getValue(1, 0);
values[3] = position.getTextMatrix().getValue(1, 1);
values[4] = position.getTextMatrix().getValue(2, 0);
values[5] = position.getTextMatrix().getValue(2, 1);
List<Float> matrix = new ArrayList<>(6);
for (float value : values) {
matrix.add(value);
}
return matrix;
}
private String registerFont(PDFont font) throws IOException {
String id = font.getName();
if (!fonts.containsKey(id)) {
PdfJsonFont fontModel = new PdfJsonFont();
fontModel.setId(id);
fontModel.setName(font.getName());
fontModel.setSubtype(font.getSubtype());
fontModel.setEncoding(font.getEncoding() != null ? font.getEncoding().getClass().getName() : null);
fontModel.setEmbedded(!font.isStandard14Font());
fontModel.setStandard14Name(resolveStandard14Name(font));
fontModel.setFontDescriptorFlags(
font.getFontDescriptor() != null
? font.getFontDescriptor().getFlags()
: null);
fontModel.setBase64Data(extractFontData(font));
fonts.put(id, fontModel);
}
return id;
}
private String resolveStandard14Name(PDFont font) {
if (font == null) {
return null;
}
if (font.isStandard14Font()) {
return font.getName();
}
try {
Standard14Fonts.FontName mapped =
Standard14Fonts.getMappedFontName(font.getName());
return mapped.getName();
} catch (IllegalArgumentException ex) {
return null;
}
}
private String extractFontData(PDFont font) throws IOException {
if (font == null || font.isStandard14Font()) {
return null;
}
PDFontDescriptor descriptor = font.getFontDescriptor();
if (descriptor == null) {
return null;
}
org.apache.pdfbox.pdmodel.common.PDStream fontStream = descriptor.getFontFile();
if (fontStream == null) {
fontStream = descriptor.getFontFile2();
}
if (fontStream == null) {
fontStream = descriptor.getFontFile3();
}
if (fontStream == null) {
return null;
}
try (InputStream inputStream = fontStream.createInputStream();
ByteArrayOutputStream baos = new ByteArrayOutputStream()) {
inputStream.transferTo(baos);
return Base64.getEncoder().encodeToString(baos.toByteArray());
}
}
}
}