diff --git a/app/core/src/main/java/stirling/software/SPDF/controller/api/converters/ConvertPdfJsonController.java b/app/core/src/main/java/stirling/software/SPDF/controller/api/converters/ConvertPdfJsonController.java
new file mode 100644
index 000000000..372d0e201
--- /dev/null
+++ b/app/core/src/main/java/stirling/software/SPDF/controller/api/converters/ConvertPdfJsonController.java
@@ -0,0 +1,100 @@
+package stirling.software.SPDF.controller.api.converters;
+
+import org.springframework.http.MediaType;
+import org.springframework.http.ResponseEntity;
+import org.springframework.web.bind.annotation.ModelAttribute;
+import org.springframework.web.multipart.MultipartFile;
+
+import io.github.pixee.security.Filenames;
+import io.swagger.v3.oas.annotations.Operation;
+
+import lombok.RequiredArgsConstructor;
+
+import stirling.software.SPDF.config.swagger.StandardPdfResponse;
+import stirling.software.SPDF.service.PdfJsonConversionService;
+import stirling.software.common.annotations.AutoJobPostMapping;
+import stirling.software.common.annotations.api.ConvertApi;
+import stirling.software.common.model.api.GeneralFile;
+import stirling.software.common.model.api.PDFFile;
+import stirling.software.common.util.ExceptionUtils;
+import stirling.software.common.util.WebResponseUtils;
+
+/** REST endpoints that convert between PDF files and their editable JSON representation. */
+@ConvertApi
+@RequiredArgsConstructor
+public class ConvertPdfJsonController {
+
+    /** Base name used when the upload carries no usable file name. */
+    private static final String DEFAULT_BASE_NAME = "document";
+
+    private final PdfJsonConversionService pdfJsonConversionService;
+
+    /**
+     * Converts the uploaded PDF into JSON and returns it as a {@code .json} download.
+     *
+     * @param request form model carrying the uploaded PDF
+     * @return the JSON bytes wrapped in a web response
+     * @throws Exception if the file part is missing or the conversion fails
+     */
+    @AutoJobPostMapping(consumes = "multipart/form-data", value = "/pdf/json")
+    @Operation(
+            summary = "Convert PDF to JSON",
+            description =
+                    "Extracts PDF text, fonts, and metadata into an editable JSON structure that can be"
+                            + " transformed back into a PDF. Input:PDF Output:JSON Type:SISO")
+    public ResponseEntity<byte[]> convertPdfToJson(@ModelAttribute PDFFile request)
+            throws Exception {
+        MultipartFile inputFile = request.getFileInput();
+        if (inputFile == null) {
+            throw ExceptionUtils.createNullArgumentException("fileInput");
+        }
+
+        byte[] jsonBytes = pdfJsonConversionService.convertPdfToJson(inputFile);
+        String docName = resolveBaseName(inputFile) + ".json";
+        return WebResponseUtils.bytesToWebResponse(
+                jsonBytes, docName, MediaType.APPLICATION_JSON);
+    }
+
+    /**
+     * Rebuilds a PDF from an uploaded JSON document and returns it as a {@code .pdf} download.
+     *
+     * @param request form model carrying the uploaded JSON file
+     * @return the PDF bytes wrapped in a web response
+     * @throws Exception if the file part is missing or the conversion fails
+     */
+    @AutoJobPostMapping(consumes = "multipart/form-data", value = "/json/pdf")
+    @StandardPdfResponse
+    @Operation(
+            summary = "Convert JSON to PDF",
+            description =
+                    "Rebuilds a PDF from the editable JSON structure generated by the PDF to JSON"
+                            + " endpoint. Input:JSON Output:PDF Type:SISO")
+    public ResponseEntity<byte[]> convertJsonToPdf(@ModelAttribute GeneralFile request)
+            throws Exception {
+        MultipartFile jsonFile = request.getFileInput();
+        if (jsonFile == null) {
+            throw ExceptionUtils.createNullArgumentException("fileInput");
+        }
+
+        byte[] pdfBytes = pdfJsonConversionService.convertJsonToPdf(jsonFile);
+        String baseName = resolveBaseName(jsonFile);
+        // A name such as "report.pdf.json" already yields "report.pdf" once ".json" is stripped.
+        String docName = baseName.endsWith(".pdf") ? baseName : baseName + ".pdf";
+        return WebResponseUtils.bytesToWebResponse(pdfBytes, docName);
+    }
+
+    /**
+     * Derives the download base name from the upload: the sanitized original file name without its
+     * last extension, or {@link #DEFAULT_BASE_NAME} when nothing usable remains.
+     */
+    private static String resolveBaseName(MultipartFile file) {
+        String originalName = file.getOriginalFilename();
+        if (originalName == null || originalName.isBlank()) {
+            return DEFAULT_BASE_NAME;
+        }
+        String baseName =
+                Filenames.toSimpleFileName(originalName).replaceFirst("[.][^.]+$", "");
+        // Names such as ".json" lose everything; avoid emitting a bare extension.
+        return baseName.isBlank() ? DEFAULT_BASE_NAME : baseName;
+    }
+}
diff --git a/app/core/src/main/java/stirling/software/SPDF/model/json/PdfJsonDocument.java b/app/core/src/main/java/stirling/software/SPDF/model/json/PdfJsonDocument.java
new file mode 100644
index 000000000..805f664ce
--- /dev/null
+++ b/app/core/src/main/java/stirling/software/SPDF/model/json/PdfJsonDocument.java
@@ -0,0 +1,26 @@
+package stirling.software.SPDF.model.json;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import com.fasterxml.jackson.annotation.JsonInclude;
+
+import lombok.AllArgsConstructor;
+import lombok.Builder;
+import lombok.Data;
+import lombok.NoArgsConstructor;
+
+/** Root of the JSON model: document metadata plus the fonts and pages of one PDF. */
+@Data
+@Builder
+@NoArgsConstructor
+@AllArgsConstructor
+@JsonInclude(JsonInclude.Include.NON_NULL)
+public class PdfJsonDocument {
+
+    private PdfJsonMetadata metadata;
+
+    @Builder.Default private List<PdfJsonFont> fonts = new ArrayList<>();
+
+    @Builder.Default private List<PdfJsonPage> pages = new ArrayList<>();
+}
diff --git a/app/core/src/main/java/stirling/software/SPDF/model/json/PdfJsonFont.java b/app/core/src/main/java/stirling/software/SPDF/model/json/PdfJsonFont.java
new file mode 100644
index 000000000..a0eba01f0
--- /dev/null
+++ b/app/core/src/main/java/stirling/software/SPDF/model/json/PdfJsonFont.java
@@ -0,0 +1,26 @@
+package stirling.software.SPDF.model.json;
+
+import com.fasterxml.jackson.annotation.JsonInclude;
+
+import lombok.AllArgsConstructor;
+import lombok.Builder;
+import lombok.Data;
+import lombok.NoArgsConstructor;
+
+/** Font description; {@code id} is the key that a text element's {@code fontId} refers to. */
+@Data
+@Builder
+@NoArgsConstructor
+@AllArgsConstructor
+@JsonInclude(JsonInclude.Include.NON_NULL)
+public class PdfJsonFont {
+
+    private String id;
+    private String name;
+    private String subtype;
+    private String encoding;
+    private Boolean embedded;
+    private String standard14Name;
+    private Integer fontDescriptorFlags;
+    private String base64Data;
+}
diff --git a/app/core/src/main/java/stirling/software/SPDF/model/json/PdfJsonMetadata.java b/app/core/src/main/java/stirling/software/SPDF/model/json/PdfJsonMetadata.java
new file mode 100644
index 000000000..8db869ca5
--- /dev/null
+++ b/app/core/src/main/java/stirling/software/SPDF/model/json/PdfJsonMetadata.java
@@ -0,0 +1,28 @@
+package stirling.software.SPDF.model.json;
+
+import com.fasterxml.jackson.annotation.JsonInclude;
+
+import lombok.AllArgsConstructor;
+import lombok.Builder;
+import lombok.Data;
+import lombok.NoArgsConstructor;
+
+/** Document information fields; the two dates are carried as ISO-8601 instant strings. */
+@Data
+@Builder
+@NoArgsConstructor
+@AllArgsConstructor
+@JsonInclude(JsonInclude.Include.NON_NULL)
+public class PdfJsonMetadata {
+
+    private String title;
+    private String author;
+    private String subject;
+    private String keywords;
+    private String creator;
+    private String producer;
+    private String creationDate;
+    private String modificationDate;
+    private String trapped;
+    private Integer numberOfPages;
+}
diff --git a/app/core/src/main/java/stirling/software/SPDF/model/json/PdfJsonPage.java b/app/core/src/main/java/stirling/software/SPDF/model/json/PdfJsonPage.java
new file mode 100644
index 000000000..8a02cc33e
--- /dev/null
+++ b/app/core/src/main/java/stirling/software/SPDF/model/json/PdfJsonPage.java
@@ -0,0 +1,27 @@
+package stirling.software.SPDF.model.json;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import com.fasterxml.jackson.annotation.JsonInclude;
+
+import lombok.AllArgsConstructor;
+import lombok.Builder;
+import lombok.Data;
+import lombok.NoArgsConstructor;
+
+/** One page: 1-based page number, media box size, rotation and the text drawn on the page. */
+@Data
+@Builder
+@NoArgsConstructor
+@AllArgsConstructor
+@JsonInclude(JsonInclude.Include.NON_NULL)
+public class PdfJsonPage {
+
+    private Integer pageNumber;
+    private Float width;
+    private Float height;
+    private Integer rotation;
+
+    @Builder.Default private List<PdfJsonTextElement> textElements = new ArrayList<>();
+}
diff --git a/app/core/src/main/java/stirling/software/SPDF/model/json/PdfJsonTextElement.java b/app/core/src/main/java/stirling/software/SPDF/model/json/PdfJsonTextElement.java
new file mode 100644
index 000000000..97be75234
--- /dev/null
+++ b/app/core/src/main/java/stirling/software/SPDF/model/json/PdfJsonTextElement.java
@@ -0,0 +1,31 @@
+package stirling.software.SPDF.model.json;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import com.fasterxml.jackson.annotation.JsonInclude;
+
+import lombok.AllArgsConstructor;
+import lombok.Builder;
+import lombok.Data;
+import lombok.NoArgsConstructor;
+
+/** A single positioned piece of text together with the font and placement used to draw it. */
+@Data
+@Builder
+@NoArgsConstructor
+@AllArgsConstructor
+@JsonInclude(JsonInclude.Include.NON_NULL)
+public class PdfJsonTextElement {
+
+    private String text;
+    private String fontId;
+    private Float fontSize;
+    private Float x;
+    private Float y;
+    private Float width;
+    private Float height;
+    /** Six affine components a, b, c, d, e, f; when present they take precedence over x/y. */
+    @Builder.Default private List<Float> textMatrix = new ArrayList<>();
+    private Integer renderingMode;
+}
diff --git a/app/core/src/main/java/stirling/software/SPDF/service/PdfJsonConversionService.java b/app/core/src/main/java/stirling/software/SPDF/service/PdfJsonConversionService.java
new file mode 100644
index 000000000..349d97f5d
--- /dev/null
+++ b/app/core/src/main/java/stirling/software/SPDF/service/PdfJsonConversionService.java
@@ -0,0 +1,587 @@
+package stirling.software.SPDF.service;
+
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.time.Instant;
+import java.time.format.DateTimeParseException;
+import java.util.ArrayList;
+import java.util.Base64;
+import java.util.Calendar;
+import java.util.HashMap;
+import java.util.IdentityHashMap;
+import java.util.LinkedHashMap;
+import java.util.List;
+import java.util.Locale;
+import java.util.Map;
+import java.util.Objects;
+import java.util.Optional;
+import java.util.TimeZone;
+
+import org.apache.pdfbox.pdmodel.PDDocument;
+import org.apache.pdfbox.pdmodel.PDDocumentInformation;
+import org.apache.pdfbox.pdmodel.PDPage;
+import org.apache.pdfbox.pdmodel.PDPageContentStream;
+import org.apache.pdfbox.pdmodel.PDPageContentStream.AppendMode;
+import org.apache.pdfbox.pdmodel.common.PDRectangle;
+import org.apache.pdfbox.pdmodel.common.PDStream;
+import org.apache.pdfbox.pdmodel.font.PDFont;
+import org.apache.pdfbox.pdmodel.font.PDFontDescriptor;
+import org.apache.pdfbox.pdmodel.font.PDSimpleFont;
+import org.apache.pdfbox.pdmodel.font.PDType0Font;
+import org.apache.pdfbox.pdmodel.font.PDType1Font;
+import org.apache.pdfbox.pdmodel.font.Standard14Fonts;
+import org.apache.pdfbox.pdmodel.graphics.state.RenderingMode;
+import org.apache.pdfbox.text.PDFTextStripper;
+import org.apache.pdfbox.text.TextPosition;
+import org.apache.pdfbox.util.Matrix;
+import org.springframework.stereotype.Service;
+import org.springframework.web.multipart.MultipartFile;
+
+import com.fasterxml.jackson.databind.ObjectMapper;
+
+import lombok.RequiredArgsConstructor;
+import lombok.extern.slf4j.Slf4j;
+
+import stirling.software.SPDF.model.json.PdfJsonDocument;
+import stirling.software.SPDF.model.json.PdfJsonFont;
+import stirling.software.SPDF.model.json.PdfJsonMetadata;
+import stirling.software.SPDF.model.json.PdfJsonPage;
+import stirling.software.SPDF.model.json.PdfJsonTextElement;
+import stirling.software.common.service.CustomPDFDocumentFactory;
+import stirling.software.common.util.ExceptionUtils;
+
+/**
+ * Converts PDFs into an editable JSON model (metadata, fonts and positioned text) and rebuilds
+ * PDFs from that model.
+ */
+@Slf4j
+@Service
+@RequiredArgsConstructor
+public class PdfJsonConversionService {
+
+    private static final float DEFAULT_PAGE_WIDTH = 612f;
+    private static final float DEFAULT_PAGE_HEIGHT = 792f;
+    private static final float DEFAULT_FONT_SIZE = 12f;
+
+    private final CustomPDFDocumentFactory pdfDocumentFactory;
+    private final ObjectMapper objectMapper;
+
+    /**
+     * Extracts metadata, fonts and positioned text from a PDF and serializes them as JSON.
+     *
+     * @param file uploaded PDF
+     * @return pretty-printed JSON bytes of a {@link PdfJsonDocument}
+     * @throws IOException if the PDF cannot be read or the model cannot be serialized
+     */
+    public byte[] convertPdfToJson(MultipartFile file) throws IOException {
+        if (file == null) {
+            throw ExceptionUtils.createNullArgumentException("fileInput");
+        }
+        try (PDDocument document = pdfDocumentFactory.load(file.getInputStream(), true)) {
+            Map<String, PdfJsonFont> fonts = new LinkedHashMap<>();
+            Map<Integer, List<PdfJsonTextElement>> textByPage = new LinkedHashMap<>();
+
+            TextCollectingStripper stripper = new TextCollectingStripper(fonts, textByPage);
+            stripper.setSortByPosition(true);
+            // The returned string is discarded; the stripper fills both maps as it runs.
+            stripper.getText(document);
+
+            PdfJsonDocument pdfJson = new PdfJsonDocument();
+            pdfJson.setMetadata(extractMetadata(document));
+            pdfJson.setFonts(new ArrayList<>(fonts.values()));
+            pdfJson.setPages(extractPages(document, textByPage));
+
+            return objectMapper.writerWithDefaultPrettyPrinter().writeValueAsBytes(pdfJson);
+        }
+    }
+
+    /**
+     * Rebuilds a PDF from the JSON produced by {@link #convertPdfToJson(MultipartFile)}.
+     *
+     * @param file uploaded JSON document
+     * @return bytes of the generated PDF
+     * @throws IOException if the JSON cannot be parsed or the PDF cannot be written
+     */
+    public byte[] convertJsonToPdf(MultipartFile file) throws IOException {
+        if (file == null) {
+            throw ExceptionUtils.createNullArgumentException("fileInput");
+        }
+        PdfJsonDocument pdfJson = objectMapper.readValue(file.getBytes(), PdfJsonDocument.class);
+        if (pdfJson == null) {
+            // Jackson maps a bare JSON null to a null object.
+            throw new IOException("JSON input does not describe a document");
+        }
+
+        try (PDDocument document = new PDDocument()) {
+            applyMetadata(document, pdfJson.getMetadata());
+
+            Map<String, PDFont> fontMap = buildFontMap(document, pdfJson.getFonts());
+            PDFont defaultFont = new PDType1Font(Standard14Fonts.FontName.HELVETICA);
+
+            List<PdfJsonPage> pages = pdfJson.getPages();
+            if (pages != null) {
+                for (PdfJsonPage pageModel : pages) {
+                    // Hand-edited JSON may contain null array entries.
+                    if (pageModel != null) {
+                        writePage(document, pageModel, fontMap, defaultFont);
+                    }
+                }
+            }
+
+            try (ByteArrayOutputStream baos = new ByteArrayOutputStream()) {
+                document.save(baos);
+                return baos.toByteArray();
+            }
+        }
+    }
+
+    /** Adds one page built from {@code pageModel} and draws its text elements. */
+    private void writePage(
+            PDDocument document,
+            PdfJsonPage pageModel,
+            Map<String, PDFont> fontMap,
+            PDFont defaultFont)
+            throws IOException {
+        PDRectangle pageSize =
+                new PDRectangle(
+                        safeFloat(pageModel.getWidth(), DEFAULT_PAGE_WIDTH),
+                        safeFloat(pageModel.getHeight(), DEFAULT_PAGE_HEIGHT));
+        PDPage page = new PDPage(pageSize);
+        if (pageModel.getRotation() != null) {
+            page.setRotation(pageModel.getRotation());
+        }
+        document.addPage(page);
+
+        List<PdfJsonTextElement> elements = pageModel.getTextElements();
+        if (elements == null || elements.isEmpty()) {
+            return;
+        }
+
+        try (PDPageContentStream contentStream =
+                new PDPageContentStream(document, page, AppendMode.APPEND, true, true)) {
+            contentStream.beginText();
+            for (PdfJsonTextElement element : elements) {
+                if (element != null) {
+                    writeTextElement(contentStream, element, fontMap, defaultFont);
+                }
+            }
+            contentStream.endText();
+        }
+    }
+
+    /**
+     * Draws one text element, retrying with {@code defaultFont} when the element's font cannot
+     * encode the text and skipping the element when the default font cannot encode it either.
+     */
+    private void writeTextElement(
+            PDPageContentStream contentStream,
+            PdfJsonTextElement element,
+            Map<String, PDFont> fontMap,
+            PDFont defaultFont)
+            throws IOException {
+        PDFont font = fontMap.getOrDefault(element.getFontId(), defaultFont);
+        float fontSize = safeFloat(element.getFontSize(), DEFAULT_FONT_SIZE);
+        String text = Objects.toString(element.getText(), "");
+
+        // NOTE(review): the matrix stored by the extractor may already include the font-size
+        // scale; confirm that combining it with setFont(size) does not scale glyphs twice.
+        contentStream.setFont(font, fontSize);
+        applyRenderingMode(contentStream, element.getRenderingMode());
+        applyTextMatrix(contentStream, element);
+        try {
+            contentStream.showText(text);
+            return;
+        } catch (IllegalArgumentException ex) {
+            log.debug(
+                    "Falling back to default font for text element due to encoding issue: {}",
+                    ex.getMessage());
+        }
+
+        contentStream.setFont(defaultFont, fontSize);
+        try {
+            contentStream.showText(text);
+        } catch (IllegalArgumentException ex) {
+            // One unencodable element must not abort the whole conversion.
+            log.warn(
+                    "Skipping text element that the default font cannot encode: {}",
+                    ex.getMessage());
+        }
+    }
+
+    /** Builds one {@link PdfJsonPage} per document page, attaching the text collected for it. */
+    private List<PdfJsonPage> extractPages(
+            PDDocument document, Map<Integer, List<PdfJsonTextElement>> textByPage) {
+        List<PdfJsonPage> pages = new ArrayList<>();
+        int pageNumber = 1;
+        for (PDPage page : document.getPages()) {
+            PDRectangle mediaBox = page.getMediaBox();
+            PdfJsonPage pageModel = new PdfJsonPage();
+            pageModel.setPageNumber(pageNumber);
+            pageModel.setWidth(mediaBox.getWidth());
+            pageModel.setHeight(mediaBox.getHeight());
+            pageModel.setRotation(page.getRotation());
+            pageModel.setTextElements(textByPage.getOrDefault(pageNumber, new ArrayList<>()));
+            pages.add(pageModel);
+            pageNumber++;
+        }
+        return pages;
+    }
+
+    /** Copies the document information dictionary and page count into the JSON metadata. */
+    private PdfJsonMetadata extractMetadata(PDDocument document) {
+        PdfJsonMetadata metadata = new PdfJsonMetadata();
+        PDDocumentInformation info = document.getDocumentInformation();
+        if (info != null) {
+            metadata.setTitle(info.getTitle());
+            metadata.setAuthor(info.getAuthor());
+            metadata.setSubject(info.getSubject());
+            metadata.setKeywords(info.getKeywords());
+            metadata.setCreator(info.getCreator());
+            metadata.setProducer(info.getProducer());
+            metadata.setCreationDate(formatCalendar(info.getCreationDate()));
+            metadata.setModificationDate(formatCalendar(info.getModificationDate()));
+            metadata.setTrapped(info.getTrapped());
+        }
+        metadata.setNumberOfPages(document.getNumberOfPages());
+        return metadata;
+    }
+
+    /** Writes the JSON metadata into the document information; a null model is a no-op. */
+    private void applyMetadata(PDDocument document, PdfJsonMetadata metadata) {
+        if (metadata == null) {
+            return;
+        }
+        PDDocumentInformation info = document.getDocumentInformation();
+        info.setTitle(metadata.getTitle());
+        info.setAuthor(metadata.getAuthor());
+        info.setSubject(metadata.getSubject());
+        info.setKeywords(metadata.getKeywords());
+        info.setCreator(metadata.getCreator());
+        info.setProducer(metadata.getProducer());
+        if (metadata.getCreationDate() != null) {
+            parseInstant(metadata.getCreationDate())
+                    .ifPresent(instant -> info.setCreationDate(toCalendar(instant)));
+        }
+        if (metadata.getModificationDate() != null) {
+            parseInstant(metadata.getModificationDate())
+                    .ifPresent(instant -> info.setModificationDate(toCalendar(instant)));
+        }
+        try {
+            info.setTrapped(metadata.getTrapped());
+        } catch (IllegalArgumentException ex) {
+            // PDFBox rejects anything other than True, False or Unknown.
+            log.warn("Ignoring invalid trapped value '{}'", metadata.getTrapped());
+        }
+    }
+
+    /** Creates a PDFBox font for every JSON font that has an id, keyed by that id. */
+    private Map<String, PDFont> buildFontMap(PDDocument document, List<PdfJsonFont> fonts)
+            throws IOException {
+        Map<String, PDFont> fontMap = new HashMap<>();
+        if (fonts == null) {
+            return fontMap;
+        }
+        for (PdfJsonFont fontModel : fonts) {
+            PDFont font = createFontFromModel(document, fontModel);
+            if (font != null && fontModel.getId() != null) {
+                fontMap.put(fontModel.getId(), font);
+            }
+        }
+        return fontMap;
+    }
+
+    /**
+     * Resolves a JSON font to a PDFBox font: the embedded font program when it loads, otherwise
+     * the named Standard 14 font, otherwise Helvetica.
+     *
+     * @return the font, or {@code null} when {@code fontModel} is null
+     */
+    private PDFont createFontFromModel(PDDocument document, PdfJsonFont fontModel)
+            throws IOException {
+        if (fontModel == null) {
+            return null;
+        }
+        PDFont embedded = loadEmbeddedFont(document, fontModel);
+        if (embedded != null) {
+            return embedded;
+        }
+        String standardName = fontModel.getStandard14Name();
+        if (standardName != null) {
+            // getMappedFontName returns null for unknown names instead of throwing.
+            Standard14Fonts.FontName fontName = Standard14Fonts.getMappedFontName(standardName);
+            if (fontName != null) {
+                return new PDType1Font(fontName);
+            }
+            log.warn("Unknown Standard 14 font {}, using Helvetica", standardName);
+        }
+        return new PDType1Font(Standard14Fonts.FontName.HELVETICA);
+    }
+
+    /**
+     * Loads the Base64 font program of {@code fontModel}, first as a Type 0 font and then as a
+     * Type 1 font.
+     *
+     * @return the loaded font, or {@code null} when there is no usable font data
+     */
+    private PDFont loadEmbeddedFont(PDDocument document, PdfJsonFont fontModel) {
+        String base64 = fontModel.getBase64Data();
+        if (base64 == null || base64.isBlank()) {
+            return null;
+        }
+        byte[] fontBytes;
+        try {
+            fontBytes = Base64.getDecoder().decode(base64);
+        } catch (IllegalArgumentException ex) {
+            log.warn(
+                    "Font {} carries invalid Base64 data: {}",
+                    fontModel.getName(),
+                    ex.getMessage());
+            return null;
+        }
+        try (InputStream fontStream = new ByteArrayInputStream(fontBytes)) {
+            return PDType0Font.load(document, fontStream, true);
+        } catch (IOException ex) {
+            log.debug(
+                    "Unable to load font as Type0 ({}), trying Type1: {}",
+                    fontModel.getName(),
+                    ex.getMessage());
+        }
+        try (InputStream fontStream = new ByteArrayInputStream(fontBytes)) {
+            return new PDType1Font(document, fontStream);
+        } catch (IOException ex) {
+            log.warn(
+                    "Failed to load embedded font {}, falling back to Standard 14 if available",
+                    fontModel.getName());
+            return null;
+        }
+    }
+
+    /**
+     * Positions the next text run: uses the element's six-value matrix when present, otherwise a
+     * plain translation to its x/y coordinates.
+     */
+    private void applyTextMatrix(PDPageContentStream contentStream, PdfJsonTextElement element)
+            throws IOException {
+        List<Float> matrix = element.getTextMatrix();
+        if (matrix != null && matrix.size() == 6) {
+            // safeFloat guards against null or non-finite entries in hand-edited JSON.
+            contentStream.setTextMatrix(
+                    new Matrix(
+                            safeFloat(matrix.get(0), 1f),
+                            safeFloat(matrix.get(1), 0f),
+                            safeFloat(matrix.get(2), 0f),
+                            safeFloat(matrix.get(3), 1f),
+                            safeFloat(matrix.get(4), 0f),
+                            safeFloat(matrix.get(5), 0f)));
+            return;
+        }
+        float x = safeFloat(element.getX(), 0f);
+        float y = safeFloat(element.getY(), 0f);
+        contentStream.setTextMatrix(new Matrix(1, 0, 0, 1, x, y));
+    }
+
+    /** Applies the rendering mode when it is a valid index; other values are ignored. */
+    private void applyRenderingMode(PDPageContentStream contentStream, Integer renderingMode)
+            throws IOException {
+        if (renderingMode == null) {
+            return;
+        }
+        // Range-check up front rather than relying on fromInt to reject out-of-range values.
+        if (renderingMode < 0 || renderingMode >= RenderingMode.values().length) {
+            log.debug("Ignoring unsupported rendering mode {}", renderingMode);
+            return;
+        }
+        contentStream.setRenderingMode(RenderingMode.fromInt(renderingMode));
+    }
+
+    /** Returns {@code value} when it is a finite number, otherwise {@code defaultValue}. */
+    private float safeFloat(Float value, float defaultValue) {
+        if (value == null || Float.isNaN(value) || Float.isInfinite(value)) {
+            return defaultValue;
+        }
+        return value;
+    }
+
+    /** Formats a calendar as an ISO-8601 instant, or returns null for a null calendar. */
+    private String formatCalendar(Calendar calendar) {
+        if (calendar == null) {
+            return null;
+        }
+        return calendar.toInstant().toString();
+    }
+
+    /** Parses an ISO-8601 instant; unparseable input is logged and yields an empty Optional. */
+    private Optional<Instant> parseInstant(String value) {
+        try {
+            return Optional.of(Instant.parse(value));
+        } catch (DateTimeParseException ex) {
+            log.warn("Failed to parse instant '{}': {}", value, ex.getMessage());
+            return Optional.empty();
+        }
+    }
+
+    /** Converts an instant to a UTC calendar, the date type the information setters take. */
+    private Calendar toCalendar(Instant instant) {
+        Calendar calendar = Calendar.getInstance(TimeZone.getTimeZone("UTC"));
+        calendar.setTimeInMillis(instant.toEpochMilli());
+        return calendar;
+    }
+
+    /**
+     * Text stripper that records every glyph as a {@link PdfJsonTextElement} and registers the
+     * fonts it encounters. Static because it needs nothing from the enclosing service instance.
+     */
+    private static class TextCollectingStripper extends PDFTextStripper {
+
+        private final Map<String, PdfJsonFont> fonts;
+        private final Map<Integer, List<PdfJsonTextElement>> textByPage;
+        // Identity-keyed so two glyphs are never conflated, whatever TextPosition#equals does.
+        private final Map<TextPosition, Integer> renderingModes = new IdentityHashMap<>();
+        private int currentPage = 1;
+
+        TextCollectingStripper(
+                Map<String, PdfJsonFont> fonts,
+                Map<Integer, List<PdfJsonTextElement>> textByPage)
+                throws IOException {
+            this.fonts = fonts;
+            this.textByPage = textByPage;
+        }
+
+        @Override
+        protected void startPage(PDPage page) throws IOException {
+            super.startPage(page);
+            currentPage = getCurrentPageNo();
+            // Modes captured for the previous page are no longer needed once a new page starts.
+            renderingModes.clear();
+        }
+
+        @Override
+        protected void processTextPosition(TextPosition text) {
+            // TextPosition has no rendering-mode accessor, so capture the mode here while the
+            // graphics state still belongs to this glyph.
+            RenderingMode mode = getGraphicsState().getTextState().getRenderingMode();
+            if (mode != null) {
+                renderingModes.put(text, mode.intValue());
+            }
+            super.processTextPosition(text);
+        }
+
+        @Override
+        protected void writeString(String text, List<TextPosition> textPositions)
+                throws IOException {
+            if (textPositions == null || textPositions.isEmpty()) {
+                return;
+            }
+            List<PdfJsonTextElement> pageElements =
+                    textByPage.computeIfAbsent(currentPage, key -> new ArrayList<>());
+
+            for (TextPosition position : textPositions) {
+                PdfJsonTextElement element = new PdfJsonTextElement();
+                element.setText(position.getUnicode());
+                element.setFontId(registerFont(position.getFont()));
+                element.setFontSize(position.getFontSizeInPt());
+                element.setX(position.getXDirAdj());
+                element.setY(position.getYDirAdj());
+                element.setWidth(position.getWidthDirAdj());
+                element.setHeight(position.getHeightDir());
+                element.setRenderingMode(renderingModes.get(position));
+                element.setTextMatrix(extractMatrix(position));
+                pageElements.add(element);
+            }
+        }
+
+        /** Returns the six affine components a, b, c, d, e, f of the glyph's text matrix. */
+        private List<Float> extractMatrix(TextPosition position) {
+            Matrix textMatrix = position.getTextMatrix();
+            List<Float> matrix = new ArrayList<>(6);
+            for (int row = 0; row < 3; row++) {
+                for (int column = 0; column < 2; column++) {
+                    matrix.add(textMatrix.getValue(row, column));
+                }
+            }
+            return matrix;
+        }
+
+        /**
+         * Registers {@code font} under its name on first sight and returns that id.
+         *
+         * @return the font id, or {@code null} when the glyph has no font
+         */
+        private String registerFont(PDFont font) throws IOException {
+            if (font == null) {
+                return null;
+            }
+            String id = font.getName();
+            if (!fonts.containsKey(id)) {
+                PDFontDescriptor descriptor = font.getFontDescriptor();
+                PdfJsonFont fontModel = new PdfJsonFont();
+                fontModel.setId(id);
+                fontModel.setName(id);
+                fontModel.setSubtype(font.getSubType());
+                fontModel.setEncoding(resolveEncodingName(font));
+                // isEmbedded reports whether a font program is present in the PDF.
+                fontModel.setEmbedded(font.isEmbedded());
+                fontModel.setStandard14Name(resolveStandard14Name(font));
+                fontModel.setFontDescriptorFlags(
+                        descriptor != null ? descriptor.getFlags() : null);
+                fontModel.setBase64Data(extractFontData(font));
+                fonts.put(id, fontModel);
+            }
+            return id;
+        }
+
+        /** Returns the class name of a simple font's encoding, or null when there is none. */
+        private String resolveEncodingName(PDFont font) {
+            // getEncoding() is declared on PDSimpleFont, not on PDFont.
+            if (!(font instanceof PDSimpleFont)) {
+                return null;
+            }
+            Object encoding = ((PDSimpleFont) font).getEncoding();
+            return encoding != null ? encoding.getClass().getName() : null;
+        }
+
+        /** Returns the Standard 14 name the font maps to, or null when it maps to none. */
+        private String resolveStandard14Name(PDFont font) {
+            if (font == null || font.getName() == null) {
+                return null;
+            }
+            if (font.isStandard14()) {
+                return font.getName();
+            }
+            // getMappedFontName returns null for unknown names instead of throwing.
+            Standard14Fonts.FontName mapped = Standard14Fonts.getMappedFontName(font.getName());
+            return mapped != null ? mapped.getName() : null;
+        }
+
+        /**
+         * Returns the embedded font program as Base64, trying FontFile, FontFile2 and FontFile3
+         * in that order; null for Standard 14 fonts and fonts without an embedded program.
+         */
+        private String extractFontData(PDFont font) throws IOException {
+            if (font == null || font.isStandard14()) {
+                return null;
+            }
+            PDFontDescriptor descriptor = font.getFontDescriptor();
+            if (descriptor == null) {
+                return null;
+            }
+            PDStream fontStream = descriptor.getFontFile();
+            if (fontStream == null) {
+                fontStream = descriptor.getFontFile2();
+            }
+            if (fontStream == null) {
+                fontStream = descriptor.getFontFile3();
+            }
+            if (fontStream == null) {
+                return null;
+            }
+            try (InputStream inputStream = fontStream.createInputStream();
+                    ByteArrayOutputStream baos = new ByteArrayOutputStream()) {
+                inputStream.transferTo(baos);
+                return Base64.getEncoder().encodeToString(baos.toByteArray());
+            }
+        }
+    }
+}