From ab7610f72ce52e43a09ad2f4ecc68f8670497bb6 Mon Sep 17 00:00:00 2001 From: Anthony Stirling <77850077+Frooodle@users.noreply.github.com.> Date: Tue, 26 Nov 2024 20:24:46 +0000 Subject: [PATCH] update docs --- HowToUseOCR.md | 7 ++----- LocalRunGuide.md | 1 - .../software/SPDF/config/ExternalAppDepConfig.java | 3 +++ 3 files changed, 5 insertions(+), 6 deletions(-) diff --git a/HowToUseOCR.md b/HowToUseOCR.md index afed2504..0a5cc94c 100644 --- a/HowToUseOCR.md +++ b/HowToUseOCR.md @@ -8,7 +8,7 @@ The paths have changed for the tessdata locations on new Docker images. Please u ## How does the OCR Work -Stirling-PDF uses [qpdf](https://github.com/qpdf/qpdf), which in turn uses Tesseract for its text recognition. All credit goes to them for this awesome work! +Stirling-PDF uses Tesseract for its text recognition. All credit goes to them for this awesome work! ## Language Packs @@ -52,8 +52,6 @@ Add the following to your existing Docker run command: ### Non-Docker Setup -If you are not using Docker, you need to install the OCR components, including the `qpdf` app. You can see the [qpdf install guide](https://qpdf.readthedocs.io/en/latest/installation.html). - For Debian-based systems, install languages with this command: ```bash @@ -83,8 +81,7 @@ rpm -qa | grep tesseract-langpack | sed 's/tesseract-langpack-//g' For Windows: -Ensure qpdf in installed with -``pip install qpdf`` +You must ensure Tesseract is installed. Additional languages must be downloaded manually: Download desired .traineddata files from tessdata or tessdata_fast diff --git a/LocalRunGuide.md b/LocalRunGuide.md index c3fba2df..124cff9b 100644 --- a/LocalRunGuide.md +++ b/LocalRunGuide.md @@ -146,7 +146,6 @@ The easiest method is to use the language packs provided by your repositories. S 1. Download the desired language pack(s) by selecting the `.traineddata` file(s) for the language(s) you need. 2. Place the `.traineddata` files in the Tesseract tessdata directory: `/usr/share/tessdata` -3. 
Please view [qpdf install guide](https://qpdf.readthedocs.io/en/latest/installation.html) for more info. **IMPORTANT:** DO NOT REMOVE EXISTING `eng.traineddata`, IT'S REQUIRED. diff --git a/src/main/java/stirling/software/SPDF/config/ExternalAppDepConfig.java b/src/main/java/stirling/software/SPDF/config/ExternalAppDepConfig.java index 7fff2bc0..8ed708ea 100644 --- a/src/main/java/stirling/software/SPDF/config/ExternalAppDepConfig.java +++ b/src/main/java/stirling/software/SPDF/config/ExternalAppDepConfig.java @@ -42,6 +42,8 @@ public class ExternalAppDepConfig { put("pdftohtml", List.of("Pdftohtml")); put("unoconv", List.of("Unoconv")); put("qpdf", List.of("qpdf")); + put("tesseract", List.of("tesseract")); + } }; @@ -96,6 +98,7 @@ public class ExternalAppDepConfig { public void checkDependencies() { // Check core dependencies + checkDependencyAndDisableGroup("tesseract"); checkDependencyAndDisableGroup("soffice"); checkDependencyAndDisableGroup("qpdf"); checkDependencyAndDisableGroup("weasyprint");