Working (only Split has been tested so far, but I'm tired)

This commit is contained in:
Felix Kaspar 2023-11-14 23:14:08 +01:00
parent d7feec32dd
commit 85d61fddf0
12 changed files with 164 additions and 136 deletions

10
package-lock.json generated
View File

@ -4470,6 +4470,15 @@
"integrity": "sha512-/pyBZWSLD2n0dcHE3hq8s8ZvcETHtEuF+3E7XVt0Ig2nvsVQXdghHVcEkIWjy9A0wKfTn97a/PSDYohKIlnP/w==",
"dev": true
},
"node_modules/@types/multer": {
"version": "1.4.10",
"resolved": "https://registry.npmjs.org/@types/multer/-/multer-1.4.10.tgz",
"integrity": "sha512-6l9mYMhUe8wbnz/67YIjc7ZJyQNZoKq7fRXVf7nMdgWgalD0KyzJ2ywI7hoATUSXSbTu9q2HBiEwzy0tNN1v2w==",
"dev": true,
"dependencies": {
"@types/express": "*"
}
},
"node_modules/@types/node": {
"version": "18.18.7",
"resolved": "https://registry.npmjs.org/@types/node/-/node-18.18.7.tgz",
@ -14543,6 +14552,7 @@
},
"devDependencies": {
"@types/express": "^4.17.21",
"@types/multer": "^1.4.10",
"ts-node-dev": "^2.0.0",
"typescript": "^5.2.2"
}

View File

@ -1,5 +1,5 @@
{
"watch": ["src"],
"watch": ["src", "../shared-operations/src"],
"ext": "ts,json",
"ignore": ["src/**/*.spec.ts"],
"exec": "node --trace-warnings --experimental-specifier-resolution=node --loader ts-node/esm ./src/index.ts"

View File

@ -25,6 +25,7 @@
},
"devDependencies": {
"@types/express": "^4.17.21",
"@types/multer": "^1.4.10",
"ts-node-dev": "^2.0.0",
"typescript": "^5.2.2"
},

View File

@ -7,6 +7,7 @@ const upload = multer();
import Operations from "../../utils/pdf-operations";
import { traverseOperations } from "@stirling-pdf/shared-operations/src/workflow/traverseOperations";
import { PdfFile, RepresentationType } from '@stirling-pdf/shared-operations/src/wrappers/PdfFile';
const activeWorkflows: any = {};
@ -24,82 +25,78 @@ router.post("/:workflowUuid?", [
// TODO: Validate input further (json may be invalid or not be in workflow format)
const workflow = JSON.parse(req.body.workflow);
// TODO: Replace with the static multer helper on PdfFile (PdfFile.fromMulterFiles)
const inputs = await Promise.all((req.files as Express.Multer.File[]).map(async file => {
console.log(file);
return {
originalFileName: file.originalname.replace(/\.[^/.]+$/, ""),
fileName: file.originalname.replace(/\.[^/.]+$/, ""),
buffer: new Uint8Array(await file.buffer)
}
return new PdfFile(file.originalname.replace(/\.[^/.]+$/, ""), new Uint8Array(await file.buffer), RepresentationType.Uint8Array, file.originalname.replace(/\.[^/.]+$/, ""));
}));
// TODO: Enable if traverse & organize migration is done.
// // Allow option to do it synchronously and just make a long request
// if(req.body.async === "false") {
// console.log("Don't do async");
// Allow option to do it synchronously and just make a long request
if(req.body.async === "false") {
console.log("Don't do async");
// const traverse = traverseOperations(workflow.operations, inputs, Operations);
const traverse = traverseOperations(workflow.operations, inputs, Operations);
// let pdfResults;
// let iteration;
// while (true) {
// iteration = await traverse.next();
// if (iteration.done) {
// pdfResults = iteration.value;
// console.log("Done");
// break;
// }
// console.log(iteration.value);
// }
let pdfResults;
let iteration;
while (true) {
iteration = await traverse.next();
if (iteration.done) {
pdfResults = iteration.value;
console.log("Done");
break;
}
console.log(iteration.value);
}
// console.log("Download");
// downloadHandler(res, pdfResults);
// }
// else {
// console.log("Start Aync Workflow");
// // TODO: UUID collision checks
// let workflowID = req.params.workflowUuid
// if(!workflowID)
// workflowID = generateWorkflowID();
console.log("Download");
await downloadHandler(res, pdfResults);
}
else {
console.log("Start Aync Workflow");
// TODO: UUID collision checks
let workflowID = req.params.workflowUuid
if(!workflowID)
workflowID = generateWorkflowID();
// activeWorkflows[workflowID] = {
// createdAt: Date.now(),
// finished: false,
// eventStream: null,
// result: null,
// // TODO: When auth is implemented: owner
// }
// const activeWorkflow = activeWorkflows[workflowID];
activeWorkflows[workflowID] = {
createdAt: Date.now(),
finished: false,
eventStream: null,
result: null,
// TODO: When auth is implemented: owner
}
const activeWorkflow = activeWorkflows[workflowID];
// res.status(200).json({
// "workflowID": workflowID,
// "data-recieved": {
// "fileCount": filesArr.length,
// "workflow": workflow
// }
// });
res.status(200).json({
"workflowID": workflowID,
"data-recieved": {
"fileCount": inputs.length,
"workflow": workflow
}
});
// const traverse = traverseOperations(workflow.operations, inputs, Operations);
const traverse = traverseOperations(workflow.operations, inputs, Operations);
// let pdfResults;
// let iteration;
// while (true) {
// iteration = await traverse.next();
// if (iteration.done) {
// pdfResults = iteration.value;
// if(activeWorkflow.eventStream) {
// activeWorkflow.eventStream.write(`data: processing done\n\n`);
// activeWorkflow.eventStream.end();
// }
// break;
// }
// if(activeWorkflow.eventStream)
// activeWorkflow.eventStream.write(`data: ${iteration.value}\n\n`);
// }
let pdfResults;
let iteration;
while (true) {
iteration = await traverse.next();
if (iteration.done) {
pdfResults = iteration.value;
if(activeWorkflow.eventStream) {
activeWorkflow.eventStream.write(`data: processing done\n\n`);
activeWorkflow.eventStream.end();
}
break;
}
if(activeWorkflow.eventStream)
activeWorkflow.eventStream.write(`data: ${iteration.value}\n\n`);
}
// activeWorkflow.result = pdfResults;
// activeWorkflow.finished = true;
// }
activeWorkflow.result = pdfResults;
activeWorkflow.finished = true;
}
}
]);
@ -146,7 +143,7 @@ router.get("/progress-stream/:workflowUuid", (req: Request, res: Response) => {
});
});
router.get("/result/:workflowUuid", (req: Request, res: Response) => {
router.get("/result/:workflowUuid", async (req: Request, res: Response) => {
if(!req.params.workflowUuid) {
res.status(400).json({"error": "No workflowUuid weres provided."});
return;
@ -167,7 +164,7 @@ router.get("/result/:workflowUuid", (req: Request, res: Response) => {
return
}
downloadHandler(res, workflow.result);
await downloadHandler(res, workflow.result);
// Delete workflow / results when done.
delete activeWorkflows[req.params.workflowUuid];
});
@ -190,7 +187,7 @@ function generateWorkflowID() {
return crypto.randomUUID();
}
function downloadHandler(res: Response, pdfResults: any) {
async function downloadHandler(res: Response, pdfResults: PdfFile[]) {
if(pdfResults.length == 0) {
res.status(500).json({"warning": "The workflow had no outputs."});
}
@ -211,7 +208,7 @@ function downloadHandler(res: Response, pdfResults: any) {
for (let i = 0; i < pdfResults.length; i++) {
// TODO: Implement other file types (mostly for image & text extraction)
// TODO: Check for name collisions
zip.append(Buffer.from(pdfResults[i].buffer), { name: pdfResults[i].fileName + ".pdf" });
zip.append(Buffer.from(await pdfResults[i].uint8Array), { name: pdfResults[i].filename + ".pdf" });
}
zip.finalize();
@ -219,10 +216,10 @@ function downloadHandler(res: Response, pdfResults: any) {
}
else {
const readStream = new stream.PassThrough();
readStream.end(pdfResults[0].buffer);
readStream.end(pdfResults[0].uint8Array);
// TODO: Implement other file types (mostly for image & text extraction)
res.set("Content-disposition", 'attachment; filename=' + pdfResults[0].fileName + ".pdf");
res.set("Content-disposition", 'attachment; filename=' + pdfResults[0].filename + ".pdf");
res.set("Content-Type", "application/pdf");
readStream.pipe(res);

View File

@ -3,7 +3,7 @@ import fs from 'fs';
import os from 'os';
import path from 'path';
import { exec, spawn } from 'child_process'
import { PdfFile, fromUint8Array } from '@stirling-pdf/shared-operations/src/wrappers/PdfFile'
import { PdfFile, RepresentationType } from '@stirling-pdf/shared-operations/src/wrappers/PdfFile'
export async function fileToPdf(byteArray: Uint8Array, filename: string): Promise<PdfFile> {
const parentDir = path.join(os.tmpdir(), "StirlingPDF");
@ -22,7 +22,7 @@ export async function fileToPdf(byteArray: Uint8Array, filename: string): Promis
fs.rmdirSync(tempDir);
return fromUint8Array(outputBytes, outputFileName);
return new PdfFile(outputFileName, outputBytes, RepresentationType.Uint8Array);
}
export function isLibreOfficeInstalled() {

View File

@ -6,7 +6,7 @@ import { Image } from 'image-js';
import { getImagesOnPage } from "./getImagesOnPage.js";
export async function detectEmptyPages(file: PdfFile, whiteThreashold: number): Promise<number[]> {
const pdfDoc = await file.pdfjsDocuemnt;
const pdfDoc = await file.pdfjsDocument;
const emptyPages: number[] = [];
for (let i = 1; i <= pdfDoc.numPages; i++) {

View File

@ -1,6 +1,6 @@
import { PDFDocument } from 'pdf-lib';
import { PdfFile } from '../wrappers/PdfFile';
import { PdfFile, RepresentationType } from '../wrappers/PdfFile';
export type MergeParamsType = {
files: PdfFile[];
@ -15,5 +15,5 @@ export async function mergePDFs(params: MergeParamsType): Promise<PdfFile> {
copiedPages.forEach((page) => mergedPdf.addPage(page));
}
return new PdfFile("mergedPDF", mergedPdf);
return new PdfFile("mergedPDF", mergedPdf, RepresentationType.PDFLibDocument);
};

View File

@ -16,6 +16,8 @@ export async function splitOn(params: SplitOnParamsType) {
const { file, type, whiteThreashold } = params;
let splitAtPages: number[] = [];
console.log("File: ", file);
switch (type) {
case "BAR_CODE":
@ -36,6 +38,8 @@ export async function splitOn(params: SplitOnParamsType) {
console.log("Split At Pages: ", splitAtPages);
console.log("File: ", file);
// Remove detected Pages & Split
const pdfDoc = await file.pdflibDocument;
const numberOfPages = pdfDoc.getPageCount();
@ -66,7 +70,9 @@ export async function splitOn(params: SplitOnParamsType) {
return subDocuments;
async function getPagesWithQRCode(file: PdfFile) {
const pdfDoc = await file.pdfjsDocuemnt;
console.log("FileInQRPrev: ", file);
const pdfDoc = await file.pdfjsDocument;
console.log("FileInQRAfter: ", file);
const pagesWithQR: number[] = [];
for (let i = 0; i < pdfDoc.numPages; i++) {
@ -74,7 +80,7 @@ export async function splitOn(params: SplitOnParamsType) {
const page = await pdfDoc.getPage(i + 1);
const images = await getImagesOnPage(page);
console.log("images:", images);
// console.log("images:", images);
for (const image of images) {
const data = await checkForQROnImage(image);
if(data == "https://github.com/Frooodle/Stirling-PDF") {

View File

@ -1,6 +1,6 @@
import { PDFDocument } from 'pdf-lib';
import { PdfFile, fromPdfLib } from '../wrappers/PdfFile.js';
import { PdfFile, RepresentationType } from '../wrappers/PdfFile.js';
import { detectEmptyPages } from "./common/detectEmptyPages.js";
@ -21,12 +21,11 @@ export async function sortPagesWithPreset(params: SortPagesWithPresetParamsType)
throw new Error("Operation not supported");
}
const byteFile = await file.convertToPdfLibFile();
if (!byteFile?.pdfLib) return byteFile;
const pdflibDocument = await file.pdflibDocument;
const pageCount = byteFile.pdfLib.getPageCount();
const pageCount = pdflibDocument.getPageCount();
const sortIndecies = sortFunction(pageCount);
return selectPages({file:byteFile, pagesToExtractArray:sortIndecies});
return selectPages({file: file, pagesToExtractArray: sortIndecies});
}
export type RearrangePagesParamsType = {
@ -37,11 +36,10 @@ export type RearrangePagesParamsType = {
export async function rearrangePages(params: RearrangePagesParamsType): Promise<PdfFile> {
const { file, fancyPageSelector } = params;
const byteFile = await file.convertToPdfLibFile();
if (!byteFile?.pdfLib) return byteFile;
const pdflibDocument = await file.pdflibDocument;
const pagesToExtractArray = parseFancyPageSelector(fancyPageSelector, byteFile.pdfLib.getPageCount());
const newDocument = selectPages({file:byteFile, pagesToExtractArray});
const pagesToExtractArray = parseFancyPageSelector(fancyPageSelector, pdflibDocument.getPageCount());
const newDocument = selectPages({file: file, pagesToExtractArray});
return newDocument;
};
@ -52,23 +50,22 @@ export type SelectPagesParamsType = {
export async function selectPages(params: SelectPagesParamsType): Promise<PdfFile> {
const { file, pagesToExtractArray } = params;
const byteFile = await file.convertToPdfLibFile();
if (!byteFile?.pdfLib) return byteFile;
const pdflibDocument = await file.pdflibDocument;
const subDocument = await PDFDocument.create();
// Check that the largest requested page index is smaller than the PDF's page count
if(Math.max(...pagesToExtractArray) >= byteFile.pdfLib.getPageCount()) {
throw new Error(`The PDF document only has ${byteFile.pdfLib.getPageCount()} pages and you tried to extract page ${Math.max(...pagesToExtractArray)}`);
if(Math.max(...pagesToExtractArray) >= pdflibDocument.getPageCount()) {
throw new Error(`The PDF document only has ${pdflibDocument.getPageCount()} pages and you tried to extract page ${Math.max(...pagesToExtractArray)}`);
}
const copiedPages = await subDocument.copyPages(byteFile.pdfLib, pagesToExtractArray);
const copiedPages = await subDocument.copyPages(pdflibDocument, pagesToExtractArray);
for (let i = 0; i < copiedPages.length; i++) {
subDocument.addPage(copiedPages[i]);
}
return fromPdfLib(subDocument, file.filename);
return new PdfFile(file.originalFilename, subDocument, RepresentationType.PDFLibDocument, file.filename);
}
export type RemovePagesParamsType = {
@ -78,11 +75,10 @@ export type RemovePagesParamsType = {
export async function removePages(params: RemovePagesParamsType): Promise<PdfFile> {
const { file, pagesToRemoveArray } = params;
const byteFile = await file.convertToPdfLibFile();
if (!byteFile?.pdfLib) return byteFile;
const pdflibDocument = await file.pdflibDocument;
const pagesToExtractArray = invertSelection(pagesToRemoveArray, byteFile.pdfLib.getPageIndices())
return selectPages({file:byteFile, pagesToExtractArray});
const pagesToExtractArray = invertSelection(pagesToRemoveArray, pdflibDocument.getPageIndices())
return selectPages({file: file, pagesToExtractArray});
}
export type RemoveBlankPagesParamsType = {

View File

@ -1,5 +1,5 @@
import { PdfFile, fromPdfLib } from '../wrappers/PdfFile';
import { PdfFile } from '../wrappers/PdfFile';
export type UpdateMetadataParams = {
file: PdfFile,
@ -17,7 +17,7 @@ export type UpdateMetadataParams = {
}
export async function updateMetadata(params: UpdateMetadataParams): Promise<PdfFile> {
const pdfDoc = await params.file.getAsPdfLib();
const pdfDoc = await params.file.pdflibDocument;
if (params.deleteAll) {
pdfDoc.setAuthor("");
@ -49,5 +49,5 @@ export async function updateMetadata(params: UpdateMetadataParams): Promise<PdfF
// TODO add trapped and custom metadata. May need another library
return fromPdfLib(pdfDoc, params.file.filename);
return params.file;
};

View File

@ -9,8 +9,10 @@ export async function * traverseOperations(operations: Action[], input: PdfFile[
yield* nextOperation(operations, input);
return results;
async function * nextOperation(actions: Action[], input: PdfFile[] | PdfFile): AsyncGenerator<string, void, void> {
if(Array.isArray(actions) && actions.length == 0) { // isEmpty
async function * nextOperation(actions: Action[] | undefined, input: PdfFile[] | PdfFile): AsyncGenerator<string, void, void> {
console.log("Next Operation");
if(actions === undefined || (Array.isArray(actions) && actions.length == 0)) { // isEmpty
console.log("Last Operation");
if(Array.isArray(input)) {
console.log("operation done: " + input[0].filename + (input.length > 1 ? "+" : ""));
results = results.concat(input);
@ -24,11 +26,12 @@ export async function * traverseOperations(operations: Action[], input: PdfFile[
}
for (let i = 0; i < actions.length; i++) {
yield* computeOperation(actions[i], structuredClone(input));
yield* computeOperation(actions[i], input); // TODO: structuredClone doesn't work in TS; need to find another solution to pass by value.
}
}
async function * computeOperation(action: Action, input: PdfFile|PdfFile[]): AsyncGenerator<string, void, void> {
yield "Starting: " + action.type;
switch (action.type) {
case "done": // Skip this, because it is a valid node.
@ -132,9 +135,7 @@ export async function * traverseOperations(operations: Action[], input: PdfFile[
const input = Array.isArray(inputs) ? inputs : [inputs]; // Convert single values to array, keep arrays as is.
const newInputs = await callback(input);
if (action.actions) {
yield* nextOperation(action.actions, newInputs);
}
yield* nextOperation(action.actions, newInputs);
}
/**
@ -149,15 +150,11 @@ export async function * traverseOperations(operations: Action[], input: PdfFile[
for (let i = 0; i < input.length; i++) {
output = output.concat(await callback(input[i]));
}
if (action.actions) {
yield* nextOperation(action.actions, output);
}
yield* nextOperation(action.actions, output);
}
else {
const nextInput = await callback(input);
if (action.actions) {
yield* nextOperation(action.actions, nextInput);
}
yield* nextOperation(action.actions, nextInput);
}
}
@ -167,15 +164,11 @@ export async function * traverseOperations(operations: Action[], input: PdfFile[
for (let i = 0; i < input.length; i++) {
nextInputs.concat(await callback(input[i]));
}
if (action.actions) {
yield* nextOperation(action.actions, nextInputs);
}
yield* nextOperation(action.actions, nextInputs);
}
else {
const nextInput = await callback(input);
if (action.actions) {
yield* nextOperation(action.actions, nextInput);
}
yield* nextOperation(action.actions, nextInput);
}
}
}

View File

@ -1,75 +1,100 @@
import * as PDFJS from 'pdfjs-dist';
import { PDFDocumentProxy as PDFJSDocument } from 'pdfjs-dist/types/src/display/api';
import type { PDFDocumentProxy as PDFJSDocument } from 'pdfjs-dist/types/src/display/api';
import { PDFDocument as PDFLibDocument } from 'pdf-lib';
import Joi from 'joi';
export enum RepresentationType {
Uint8Array,
PDFLibDocument,
PDFJSDocument
}
export class PdfFile {
private representation: Uint8Array | PDFLibDocument | PDFJSDocument;
private representationType: RepresentationType;
originalFilename: string;
filename: string;
get uint8Array() : Promise<Uint8Array> {
switch (this.representation.constructor) {
case Uint8Array:
switch (this.representationType) {
case RepresentationType.Uint8Array:
return new Promise((resolve, reject) => {
resolve(this.representation as Uint8Array);
});
case PDFLibDocument:
return (this.representation as PDFLibDocument).save();
case PDFJSDocument:
return (this.representation as PDFJSDocument).getData();
case RepresentationType.PDFLibDocument:
return new Promise(async (resolve, reject) => {
var uint8Array = await (this.representation as PDFLibDocument).save();
this.uint8Array = uint8Array;
resolve(uint8Array);
});
case RepresentationType.PDFJSDocument:
return new Promise(async (resolve, reject) => {
var uint8Array = await (this.representation as PDFJSDocument).getData();
this.uint8Array = uint8Array;
resolve(uint8Array);
});
default:
console.error("unhandeled PDF type: " + typeof this.representation as string);
throw Error("unhandeled PDF type");
}
}
set uint8Array(value: Uint8Array) {
this.representation = value;
this.representationType = RepresentationType.Uint8Array;
}
get pdflibDocument() : Promise<PDFLibDocument> {
switch (this.representation.constructor) {
case PDFLibDocument: // PDFLib
switch (this.representationType) {
case RepresentationType.PDFLibDocument:
return new Promise((resolve, reject) => {
resolve(this.representation as PDFLibDocument);
});
default:
return new Promise(async (resolve, reject) => {
resolve(PDFLibDocument.load(await this.uint8Array, {
var uint8Array = await this.uint8Array;
var pdfLibDoc = await PDFLibDocument.load(uint8Array, {
updateMetadata: false,
}));
});
this.pdflibDocument = pdfLibDoc;
resolve(pdfLibDoc);
});
}
}
set pdflibDocument(value: PDFLibDocument) {
this.representation = value;
this.representationType = RepresentationType.PDFLibDocument;
}
get pdfjsDocuemnt() : Promise<PDFJSDocument> {
switch (this.representation.constructor) {
case PDFJSDocument:
get pdfjsDocument() : Promise<PDFJSDocument> {
switch (this.representationType) {
case RepresentationType.PDFJSDocument:
return new Promise((resolve, reject) => {
resolve(this.representation as PDFJSDocument);
});
default:
return new Promise(async (resolve, reject) => {
resolve(await PDFJS.getDocument(await this.uint8Array).promise);
const pdfjsDoc = await PDFJS.getDocument(await this.uint8Array).promise;
this.pdfjsDocument = pdfjsDoc;
resolve(pdfjsDoc);
});
}
}
set pdfjsDocuemnt(value: PDFJSDocument) {
set pdfjsDocument(value: PDFJSDocument) {
this.representation = value;
this.representationType = RepresentationType.PDFJSDocument;
}
constructor(originalFilename: string, representation: Uint8Array | PDFLibDocument | PDFJSDocument, filename?: string) {
constructor(originalFilename: string, representation: Uint8Array | PDFLibDocument | PDFJSDocument, representationType: RepresentationType, filename?: string) {
this.originalFilename = originalFilename;
this.filename = filename ? filename : originalFilename;
this.representation = representation;
this.representationType = representationType;
}
static fromMulterFile(value: Express.Multer.File): PdfFile {
return new PdfFile(value.originalname, value.buffer as Uint8Array)
return new PdfFile(value.originalname, value.buffer as Uint8Array, RepresentationType.Uint8Array);
}
static fromMulterFiles(values: Express.Multer.File[]): PdfFile[] {
return values.map(v => PdfFile.fromMulterFile(v));