docker and ocr updates

This commit is contained in:
Anthony Stirling 2023-12-10 22:02:30 +00:00
parent 8b55ffff96
commit 59c7978330
28 changed files with 100 additions and 110 deletions

View File

@ -1,5 +1,5 @@
# Use the base image
FROM frooodle/stirling-pdf-base:beta4
FROM frooodle/stirling-pdf-base:testDontUseMe
ARG VERSION_TAG

View File

@ -1,33 +1,35 @@
# Main stage
FROM bellsoft/liberica-openjdk-debian:17 AS base
FROM ubuntu:latest AS base
# Install the tooling needed to register a PPA (add-apt-repository lives in
# software-properties-common; gnupg2 is needed to import the PPA signing key).
RUN apt-get update && apt-get install -y --no-install-recommends software-properties-common gnupg2
# Register the Tesseract 5 PPA and install tesseract-ocr from it.
# apt-get is used instead of apt because apt's command-line interface is not
# guaranteed stable for scripted use; the explicit update makes sure the PPA's
# package index is present in this layer before installing.
RUN add-apt-repository ppa:alex-p/tesseract-ocr5 && apt-get update && apt-get install -y --no-install-recommends tesseract-ocr
RUN apt-get update && \
apt-get install -y --no-install-recommends \
openjdk-17-jre \
libreoffice-core-nogui \
libreoffice-common \
libreoffice-writer-nogui \
libreoffice-calc-nogui \
libreoffice-impress-nogui \
python3-uno \
ghostscript \
python3-pip \
unoconv \
pngquant \
unpaper \
ocrmypdf && \
ocrmypdf \
unoconv && \
pip install --upgrade pip && \
pip install --no-cache-dir --user --upgrade ocrmypdf && \
pip install --no-cache-dir --upgrade pillow==10.0.1 reportlab==3.6.13 wheel==0.38.1 setuptools==65.5.1 pyjwt==2.4.0 cryptography==39.0.1 \
pip install --no-cache-dir --upgrade ocrmypdf && \
pip install --no-cache-dir \
pillow==10.0.1 \
lxml==4.6.5 \
reportlab==3.6.13 \
setuptools==65.5.1 \
pikepdf==4.4.1 \
wheel==0.38.1 \
cryptography==39.0.1 \
opencv-python-headless && \
rm -rf /var/lib/apt/lists/* && \
mkdir /usr/share/tesseract-ocr-original && \
cp -r /usr/share/tesseract-ocr/* /usr/share/tesseract-ocr-original && \
rm -rf /usr/share/tesseract-ocr
# Python packages stage
FROM base AS python-packages
# Install build tools and Python libraries
@ -43,32 +45,4 @@ RUN apt-get update && \
FROM base
COPY --from=python-packages /usr/local /usr/local
# Install wkhtmltopdf
RUN apt-get update && \
apt-get install -y \
fontconfig \
libfontconfig1 \
libfreetype6 \
libx11-6 \
libxext6 \
libxrender1 \
xfonts-75dpi \
wget \
xfonts-base
# Set a default value for TARGETARCH if it's not provided
ARG TARGETARCH=arm64
# Conditional statement to choose the correct wkhtmltopdf package based on architecture
RUN if [ "$TARGETARCH" = "amd64" ]; then \
wget https://github.com/wkhtmltopdf/packaging/releases/download/0.12.6.1-3/wkhtmltox_0.12.6.1-3.bullseye_amd64.deb; \
elif [ "$TARGETARCH" = "arm64" ]; then \
wget https://github.com/wkhtmltopdf/packaging/releases/download/0.12.6.1-3/wkhtmltox_0.12.6.1-3.bullseye_arm64.deb; \
else \
wget https://github.com/wkhtmltopdf/packaging/releases/download/0.12.6.1-3/wkhtmltox_0.12.6.1-3.bullseye_amd64.deb; \
fi && \
dpkg -i wkhtmltox_0.12.6.1-3.bullseye_$TARGETARCH.deb && \
rm wkhtmltox_0.12.6.1-3.bullseye_$TARGETARCH.deb && \
rm -rf /var/lib/apt/lists/*

View File

@ -2,6 +2,9 @@
This document provides instructions on how to add additional language packs for the OCR tab in Stirling-PDF, both inside and outside of Docker.
## My OCR used to work and now doesn't!
Please update the Tesseract version in your Docker volume path from `4.00` to `5`.
## How does the OCR Work
Stirling-PDF uses [OCRmyPDF](https://github.com/ocrmypdf/OCRmyPDF) which in turn uses tesseract for its text recognition.
All credit goes to them for this awesome work!
@ -18,7 +21,7 @@ Depending on your requirements, you can choose the appropriate language pack for
### Installing Language Packs
1. Download the desired language pack(s) by selecting the `.traineddata` file(s) for the language(s) you need.
2. Place the `.traineddata` files in the Tesseract tessdata directory: `/usr/share/tesseract-ocr/4.00/tessdata` (Debian) or `/usr/share/tesseract/tessdata` (Fedora)
2. Place the `.traineddata` files in the Tesseract tessdata directory: `/usr/share/tesseract-ocr/5/tessdata` (Debian) or `/usr/share/tesseract/tessdata` (Fedora)
# DO NOT REMOVE EXISTING ENG.TRAINEDDATA, IT'S REQUIRED.
@ -34,14 +37,14 @@ services:
your_service_name:
image: your_docker_image_name
volumes:
- /location/of/trainingData:/usr/share/tesseract-ocr/4.00/tessdata
- /location/of/trainingData:/usr/share/tesseract-ocr/5/tessdata
```
#### Docker run
Add the following to your existing docker run command
```bash
-v /location/of/trainingData:/usr/share/tesseract-ocr/4.00/tessdata
-v /location/of/trainingData:/usr/share/tesseract-ocr/5/tessdata
```
#### Non-Docker

View File

@ -139,7 +139,7 @@ Easiest is to use the langpacks provided by your repositories. Skip the other st
Manual:
1. Download the desired language pack(s) by selecting the `.traineddata` file(s) for the language(s) you need.
2. Place the `.traineddata` files in the Tesseract tessdata directory: `/usr/share/tesseract-ocr/4.00/tessdata`
2. Place the `.traineddata` files in the Tesseract tessdata directory: `/usr/share/tesseract-ocr/5/tessdata`
3.
Please view [OCRmyPDF install guide](https://ocrmypdf.readthedocs.io/en/latest/installation.html) for more info.
**IMPORTANT:** DO NOT REMOVE EXISTING `eng.traineddata`, IT'S REQUIRED.

View File

@ -113,7 +113,7 @@ Docker Run
```
docker run -d \
-p 8080:8080 \
-v /location/of/trainingData:/usr/share/tesseract-ocr/4.00/tessdata \
-v /location/of/trainingData:/usr/share/tesseract-ocr/5/tessdata \
-v /location/of/extraConfigs:/configs \
-e DOCKER_ENABLE_SECURITY=false \
--name stirling-pdf \
@ -133,7 +133,7 @@ services:
ports:
- '8080:8080'
volumes:
- /location/of/trainingData:/usr/share/tesseract-ocr/4.00/tessdata #Required for extra OCR languages
- /location/of/trainingData:/usr/share/tesseract-ocr/5/tessdata #Required for extra OCR languages
- /location/of/extraConfigs:/configs
# - /location/of/customFiles:/customFiles/
environment:

View File

@ -8,7 +8,7 @@ plugins {
}
group = 'stirling.software'
version = '0.15.2'
version = '0.16.0'
sourceCompatibility = '17'
repositories {

View File

@ -39,7 +39,7 @@ public class OCRController {
private static final Logger logger = LoggerFactory.getLogger(OCRController.class);
public List<String> getAvailableTesseractLanguages() {
String tessdataDir = "/usr/share/tesseract-ocr/4.00/tessdata";
String tessdataDir = "/usr/share/tesseract-ocr/5/tessdata";
File[] files = new File(tessdataDir).listFiles();
if (files == null) {
return Collections.emptyList();

View File

@ -78,7 +78,7 @@ public class OtherWebController {
}
public List<String> getAvailableTesseractLanguages() {
String tessdataDir = "/usr/share/tesseract-ocr/4.00/tessdata";
String tessdataDir = "/usr/share/tesseract-ocr/5/tessdata";
File[] files = new File(tessdataDir).listFiles();
if (files == null) {
return Collections.emptyList();

View File

@ -336,9 +336,6 @@ home.autoRedact.title=Auto Redact
home.autoRedact.desc=Auto Redacts(Blacks out) text in a PDF based on input text
showJS.tags=JS
##########################
### TODO: Translate ###
##########################
home.tableExtraxt.title=PDF to CSV
home.tableExtraxt.desc=Extracts Tables from a PDF converting it to CSV
tableExtraxt.tags=CSV,Table Extraction,extract,convert
@ -834,4 +831,8 @@ PDFToXML.submit=تحويل
#PDFToCSV
PDFToCSV.title=PDF ??? CSV
PDFToCSV.header=PDF ??? CSV
##########################
### TODO: Translate ###
##########################
PDFToCSV.prompt=Choose page to extract table
PDFToCSV.submit=??????

View File

@ -336,9 +336,6 @@ home.autoRedact.title=Автоматично редактиране
home.autoRedact.desc=Автоматично редактира (зачернява) текст в PDF въз основа на въведен текст
showJS.tags=Редактиране,Скриване,затъмняване,черен,маркер,скрит
##########################
### TODO: Translate ###
##########################
home.tableExtraxt.title=PDF to CSV
home.tableExtraxt.desc=Extracts Tables from a PDF converting it to CSV
tableExtraxt.tags=CSV,Table Extraction,extract,convert
@ -834,4 +831,8 @@ PDFToXML.submit=Преобразуване
#PDFToCSV
PDFToCSV.title=PDF ??? CSV
PDFToCSV.header=PDF ??? CSV
##########################
### TODO: Translate ###
##########################
PDFToCSV.prompt=Choose page to extract table
PDFToCSV.submit=????????

View File

@ -336,9 +336,6 @@ home.autoRedact.title=Auto Redact
home.autoRedact.desc=Auto Redacts(Blacks out) text in a PDF based on input text
showJS.tags=JS
##########################
### TODO: Translate ###
##########################
home.tableExtraxt.title=PDF to CSV
home.tableExtraxt.desc=Extracts Tables from a PDF converting it to CSV
tableExtraxt.tags=CSV,Table Extraction,extract,convert
@ -834,4 +831,8 @@ PDFToXML.submit=Converteix
#PDFToCSV
PDFToCSV.title=PDF a CSV
PDFToCSV.header=PDF a CSV
##########################
### TODO: Translate ###
##########################
PDFToCSV.prompt=Choose page to extract table
PDFToCSV.submit=Extracte

View File

@ -831,4 +831,8 @@ PDFToXML.submit=Konvertieren
#PDFToCSV
PDFToCSV.title=PDF zu CSV
PDFToCSV.header=PDF zu CSV
##########################
### TODO: Translate ###
##########################
PDFToCSV.prompt=Choose page to extract table
PDFToCSV.submit=Extrakt

View File

@ -336,9 +336,6 @@ home.autoRedact.title=\u0391\u03C5\u03C4\u03CC\u03BC\u03B1\u03C4\u03BF \u039C\u0
home.autoRedact.desc=\u0391\u03C5\u03C4\u03CC\u03BC\u03B1\u03C4\u03B7 \u03B5\u03C0\u03B5\u03BE\u03B5\u03C1\u03B3\u03B1\u03C3\u03AF\u03B1 (\u039C\u03B1\u03CD\u03C1\u03B9\u03C3\u03BC\u03B1) \u03BA\u03B5\u03AF\u03BC\u03B5\u03BD\u03BF\u03C5 \u03C3\u03B5 PDF \u03BC\u03B5 \u03B2\u03AC\u03C3\u03B7 \u03C4\u03BF \u03BA\u03B5\u03AF\u03BC\u03B5\u03BD\u03BF \u03B5\u03B9\u03C3\u03B1\u03B3\u03C9\u03B3\u03AE\u03C2
showJS.tags=Redact,Hide,black out,black,marker,hidden
##########################
### TODO: Translate ###
##########################
home.tableExtraxt.title=PDF to CSV
home.tableExtraxt.desc=Extracts Tables from a PDF converting it to CSV
tableExtraxt.tags=CSV,Table Extraction,extract,convert
@ -834,4 +831,8 @@ PDFToXML.submit=\u039C\u03B5\u03C4\u03B1\u03C4\u03C1\u03BF\u03C0\u03AE
#PDFToCSV
PDFToCSV.title=PDF ?? CSV
PDFToCSV.header=PDF ?? CSV
##########################
### TODO: Translate ###
##########################
PDFToCSV.prompt=Choose page to extract table
PDFToCSV.submit=?????????

View File

@ -831,4 +831,5 @@ PDFToXML.submit=Convert
#PDFToCSV
PDFToCSV.title=PDF to CSV
PDFToCSV.header=PDF to CSV
PDFToCSV.prompt=Choose page to extract table
PDFToCSV.submit=Extract

View File

@ -831,4 +831,8 @@ PDFToXML.submit=Convert
#PDFToCSV
PDFToCSV.title=PDF to CSV
PDFToCSV.header=PDF to CSV
##########################
### TODO: Translate ###
##########################
PDFToCSV.prompt=Choose page to extract table
PDFToCSV.submit=Extract

View File

@ -336,9 +336,6 @@ home.autoRedact.title=Auto Redactar
home.autoRedact.desc=Redactar automáticamente (ocultar) texto en un PDF según el texto introducido
showJS.tags=JS
##########################
### TODO: Translate ###
##########################
home.tableExtraxt.title=PDF to CSV
home.tableExtraxt.desc=Extracts Tables from a PDF converting it to CSV
tableExtraxt.tags=CSV,Table Extraction,extract,convert
@ -834,4 +831,8 @@ PDFToXML.submit=Convertir
#PDFToCSV
PDFToCSV.title=PDF a CSV
PDFToCSV.header=PDF a CSV
##########################
### TODO: Translate ###
##########################
PDFToCSV.prompt=Choose page to extract table
PDFToCSV.submit=Extracto

View File

@ -336,9 +336,6 @@ home.autoRedact.title=Auto Idatzi
home.autoRedact.desc=Auto Idatzi testua pdf fitxategian sarrerako testuan oinarritua
showJS.tags=JS
##########################
### TODO: Translate ###
##########################
home.tableExtraxt.title=PDF to CSV
home.tableExtraxt.desc=Extracts Tables from a PDF converting it to CSV
tableExtraxt.tags=CSV,Table Extraction,extract,convert
@ -834,4 +831,8 @@ PDFToXML.submit=Bihurtu
#PDFToCSV
PDFToCSV.title=PDF a CSV
PDFToCSV.header=PDF a CSV
##########################
### TODO: Translate ###
##########################
PDFToCSV.prompt=Choose page to extract table
PDFToCSV.submit=Extracto

View File

@ -336,9 +336,6 @@ home.autoRedact.title=Caviarder automatiquement
home.autoRedact.desc=Caviardez automatiquement les informations sensibles d\u2019un PDF.
showJS.tags=caviarder,redact,auto
##########################
### TODO: Translate ###
##########################
home.tableExtraxt.title=PDF to CSV
home.tableExtraxt.desc=Extracts Tables from a PDF converting it to CSV
tableExtraxt.tags=CSV,Table Extraction,extract,convert
@ -834,4 +831,8 @@ PDFToXML.submit=Convertir
#PDFToCSV
PDFToCSV.title=PDF en CSV
PDFToCSV.header=PDF en CSV
##########################
### TODO: Translate ###
##########################
PDFToCSV.prompt=Choose page to extract table
PDFToCSV.submit=Extrait

View File

@ -336,9 +336,6 @@ home.autoRedact.title=Redazione automatica
home.autoRedact.desc=Redige automaticamente (oscura) il testo in un PDF in base al testo immesso
showJS.tags=JS
##########################
### TODO: Translate ###
##########################
home.tableExtraxt.title=PDF to CSV
home.tableExtraxt.desc=Extracts Tables from a PDF converting it to CSV
tableExtraxt.tags=CSV,Table Extraction,extract,convert
@ -834,4 +831,8 @@ PDFToXML.submit=Converti
#PDFToCSV
PDFToCSV.title=Da PDF a CSV
PDFToCSV.header=Da PDF a CSV
##########################
### TODO: Translate ###
##########################
PDFToCSV.prompt=Choose page to extract table
PDFToCSV.submit=Estratto

View File

@ -336,9 +336,6 @@ home.autoRedact.title=自動塗りつぶし
home.autoRedact.desc=入力したテキストに基づいてPDF内のテキストを自動で塗りつぶし(黒塗り)します。
showJS.tags=JS
##########################
### TODO: Translate ###
##########################
home.tableExtraxt.title=PDF to CSV
home.tableExtraxt.desc=Extracts Tables from a PDF converting it to CSV
tableExtraxt.tags=CSV,Table Extraction,extract,convert
@ -834,4 +831,8 @@ PDFToXML.submit=変換
#PDFToCSV
PDFToCSV.title=PDF??CSV?
PDFToCSV.header=PDF??CSV?
##########################
### TODO: Translate ###
##########################
PDFToCSV.prompt=Choose page to extract table
PDFToCSV.submit=????

View File

@ -336,9 +336,6 @@ home.autoRedact.title=자동 검열
home.autoRedact.desc=PDF 문서에서 입력된 텍스트들을 자동으로 검열(모자이크)합니다.
showJS.tags=JS
##########################
### TODO: Translate ###
##########################
home.tableExtraxt.title=PDF to CSV
home.tableExtraxt.desc=Extracts Tables from a PDF converting it to CSV
tableExtraxt.tags=CSV,Table Extraction,extract,convert
@ -834,4 +831,8 @@ PDFToXML.submit=변환
#PDFToCSV
PDFToCSV.title=PDF? CSV?
PDFToCSV.header=PDF? CSV?
##########################
### TODO: Translate ###
##########################
PDFToCSV.prompt=Choose page to extract table
PDFToCSV.submit=??

View File

@ -336,9 +336,6 @@ home.autoRedact.title=Auto Redact
home.autoRedact.desc=Auto Redacts(Blacks out) text in a PDF based on input text
showJS.tags=JS
##########################
### TODO: Translate ###
##########################
home.tableExtraxt.title=PDF to CSV
home.tableExtraxt.desc=Extracts Tables from a PDF converting it to CSV
tableExtraxt.tags=CSV,Table Extraction,extract,convert
@ -834,4 +831,8 @@ PDFToXML.submit=Converteren
#PDFToCSV
PDFToCSV.title=PDF naar CSV
PDFToCSV.header=PDF naar CSV
##########################
### TODO: Translate ###
##########################
PDFToCSV.prompt=Choose page to extract table
PDFToCSV.submit=Extract

View File

@ -336,9 +336,6 @@ home.autoRedact.title=Auto Redact
home.autoRedact.desc=Auto Redacts(Blacks out) text in a PDF based on input text
showJS.tags=JS
##########################
### TODO: Translate ###
##########################
home.tableExtraxt.title=PDF to CSV
home.tableExtraxt.desc=Extracts Tables from a PDF converting it to CSV
tableExtraxt.tags=CSV,Table Extraction,extract,convert
@ -834,4 +831,8 @@ PDFToXML.submit=Konwertuj
#PDFToCSV
PDFToCSV.title=PDF na CSV
PDFToCSV.header=PDF na CSV
##########################
### TODO: Translate ###
##########################
PDFToCSV.prompt=Choose page to extract table
PDFToCSV.submit=Wyciąg

View File

@ -336,9 +336,6 @@ home.autoRedact.title=Auto Redact
home.autoRedact.desc=Auto Redacts(Blacks out) text in a PDF based on input text
showJS.tags=JS
##########################
### TODO: Translate ###
##########################
home.tableExtraxt.title=PDF to CSV
home.tableExtraxt.desc=Extracts Tables from a PDF converting it to CSV
tableExtraxt.tags=CSV,Table Extraction,extract,convert
@ -834,4 +831,8 @@ PDFToXML.submit=Конвертировать
#PDFToCSV
PDFToCSV.title=PDF ? CSV
PDFToCSV.header=PDF ? CSV
##########################
### TODO: Translate ###
##########################
PDFToCSV.prompt=Choose page to extract table
PDFToCSV.submit=???????

View File

@ -336,9 +336,6 @@ home.autoRedact.title=Auto Redact
home.autoRedact.desc=Auto Redacts(Blacks out) text in a PDF based on input text
showJS.tags=JS
##########################
### TODO: Translate ###
##########################
home.tableExtraxt.title=PDF to CSV
home.tableExtraxt.desc=Extracts Tables from a PDF converting it to CSV
tableExtraxt.tags=CSV,Table Extraction,extract,convert
@ -834,4 +831,8 @@ PDFToXML.submit=Konvertera
#PDFToCSV
PDFToCSV.title=PDF till CSV
PDFToCSV.header=PDF till CSV
##########################
### TODO: Translate ###
##########################
PDFToCSV.prompt=Choose page to extract table
PDFToCSV.submit=Navvit

View File

@ -126,15 +126,9 @@ adminUserSettings.submit=Kullanıcıyı Kaydet
# HOME-PAGE #
#############
home.desc=Yerel olarak barındırılan tüm PDF ihtiyaçlarınız için tek durak noktanız.
##########################
### TODO: Translate ###
##########################
home.searchBar=Search for features...
##########################
### TODO: Translate ###
##########################
home.viewPdf.title=View PDF
home.viewPdf.desc=View, annotate, add text or images
viewPdf.tags=view,read,annotate,text,image
@ -342,9 +336,6 @@ home.autoRedact.title=Otomatik Karartma
home.autoRedact.desc=Giriş metnine dayanarak bir PDF'teki metni Otomatik Karartır (Redakte)
showJS.tags=Karart,Gizle,karartma,siyah,markör,gizli
##########################
### TODO: Translate ###
##########################
home.tableExtraxt.title=PDF to CSV
home.tableExtraxt.desc=Extracts Tables from a PDF converting it to CSV
tableExtraxt.tags=CSV,Table Extraction,extract,convert
@ -651,9 +642,6 @@ multiTool.title=PDF Çoklu Araç
multiTool.header=PDF Çoklu Araç
#view pdf
##########################
### TODO: Translate ###
##########################
viewPdf.title=View PDF
viewPdf.header=View PDF
@ -841,9 +829,10 @@ PDFToXML.credit=Bu hizmet dosya dönüşümü için LibreOffice kullanır.
PDFToXML.submit=Dönüştür
#PDFToCSV
PDFToCSV.title=PDF to CSV
PDFToCSV.header=PDF to CSV
##########################
### TODO: Translate ###
##########################
PDFToCSV.title=PDF to CSV
PDFToCSV.header=PDF to CSV
PDFToCSV.prompt=Choose page to extract table
PDFToCSV.submit=Extract

View File

@ -336,9 +336,6 @@ home.autoRedact.title=自动删除
home.autoRedact.desc=根据输入文本自动删除覆盖PDF中的文本
showJS.tags=JavaScript
##########################
### TODO: Translate ###
##########################
home.tableExtraxt.title=PDF to CSV
home.tableExtraxt.desc=Extracts Tables from a PDF converting it to CSV
tableExtraxt.tags=CSV,Table Extraction,extract,convert
@ -834,4 +831,8 @@ PDFToXML.submit=转换
#PDFToCSV
PDFToCSV.title=PDF ? CSV
PDFToCSV.header=PDF ? CSV
##########################
### TODO: Translate ###
##########################
PDFToCSV.prompt=Choose page to extract table
PDFToCSV.submit=??

View File

@ -18,7 +18,7 @@
<div th:replace="~{fragments/common :: fileSelector(name='fileInput', multiple=false, accept='application/pdf')}"></div>
<button type="submit" class="btn btn-primary" th:text="#{PDFToCSV.submit}"></button>
</form>
<p id="instruction-text" style="margin: 0; display: none">Choose page to extract table</p>
<p id="instruction-text" style="margin: 0; display: none" th:text="#{PDFToCSV.prompt}"></p>
<div style="position: relative; display: inline-block;">
<div>