mirror of
https://github.com/Frooodle/Stirling-PDF.git
synced 2026-02-17 13:52:14 +01:00
feat(docker-runtime): unified Debian-based images, dynamic path resolution & enhanced UNO/LibreOffice handling (#4880)
# Description of Changes ### What was changed This PR introduces a major refinement to the Docker runtime, system path resolution, conversion tooling, and integration logic across the codebase. Key improvements include: - Migration of **Dockerfile**, **Dockerfile.fat** to a unified Debian-based environment. - Introduction of **RuntimePathConfig** enhancements to dynamically resolve: - `weasyprint`, `unoconvert`, `calibre`, `ocrmypdf`, `soffice` - Tesseract `tessdata` paths with Docker-aware defaults. - Support for **UNO server (unoserver/unoconvert)** as primary document converter with automatic fallback to `soffice`. - Isolation of Python environments for WeasyPrint and UNO tooling. - Updated controllers and services to correctly inject `RuntimePathConfig`. - Improved process execution logic in converters and OCR handling. - Major updates to `init.sh` and `init-without-ocr.sh`: - Unified environment initialization - Proper UID/GID remapping - Safer permissions handling - Automatic Tesseract path detection - Reliable startup of headless LibreOffice + Xvfb + UNO server - Full test suite updates: - Adaptation to new conversion paths - Mocking of UNO and LibreOffice commands - More robust Docker test logic - Updated example docker-compose files referencing GHCR test images. - Expanded configuration schema for new operations paths. ### Why the change was made These changes address long-standing issues around: - Inconsistent or missing binary paths between image variants. - Reduced reliability of document conversions (UNO vs. soffice). - Lack of uniform runtime initialization across Docker images. - Repetitive environment setup logic split across multiple scripts. - Fragile test scenarios tied to Alpine-based images. Switching to a unified Debian-based runtime significantly improves: - Compatibility with LibreOffice, Calibre, WebEngine and graphics stack. - UNO stability for document conversions. - Tesseract deterministic behavior. 
- Debuggability and reliability of CI/CD Docker-based tests. The improvements to `RuntimePathConfig` ensure all system binaries are fully configurable and correctly detected at runtime. --- ## Checklist ### General - [x] I have read the [Contribution Guidelines](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/CONTRIBUTING.md) - [x] I have read the [Stirling-PDF Developer Guide](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/devGuide/DeveloperGuide.md) (if applicable) - [ ] I have read the [How to add new languages to Stirling-PDF](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/devGuide/HowToAddNewLanguage.md) (if applicable) - [x] I have performed a self-review of my own code - [x] My changes generate no new warnings ### Documentation - [ ] I have updated relevant docs on [Stirling-PDF's doc repo](https://github.com/Stirling-Tools/Stirling-Tools.github.io/blob/main/docs/) (if functionality has heavily changed) - [ ] I have read the section [Add New Translation Tags](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/devGuide/HowToAddNewLanguage.md#add-new-translation-tags) (for new translation tags only) ### Translations (if applicable) - [ ] I ran [`scripts/counter_translation.py`](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/docs/counter_translation.md) ### UI Changes (if applicable) - [ ] Screenshots or videos demonstrating the UI changes are attached (e.g., as comments or direct attachments in the PR) ### Testing (if applicable) - [x] I have tested my changes locally. Refer to the [Testing Guide](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/devGuide/DeveloperGuide.md#6-testing) for more details.
This commit is contained in:
@@ -1,42 +1,188 @@
|
||||
#!/bin/bash
|
||||
# This script initializes Stirling PDF without OCR features.
|
||||
set -euo pipefail
|
||||
|
||||
export JAVA_TOOL_OPTIONS="${JAVA_BASE_OPTS} ${JAVA_CUSTOM_OPTS}"
|
||||
echo "running with JAVA_TOOL_OPTIONS ${JAVA_BASE_OPTS} ${JAVA_CUSTOM_OPTS}"
|
||||
# Print all arguments as a single line on stderr, keeping stdout clean for
# command output that callers may capture.
log() {
  local message="$*"
  printf '%s\n' "$message" 1>&2
}
|
||||
# Succeed (status 0) when "$1" resolves to something the shell can run,
# as reported by `command -v`; all lookup output is discarded.
command_exists() {
  command -v "$1" &>/dev/null
}
|
||||
|
||||
# Update the user and group IDs as per environment variables
|
||||
if [ ! -z "$PUID" ] && [ "$PUID" != "$(id -u stirlingpdfuser)" ]; then
|
||||
usermod -o -u "$PUID" stirlingpdfuser || true
|
||||
SU_EXEC_BIN=""
|
||||
if command_exists su-exec; then
|
||||
SU_EXEC_BIN="su-exec"
|
||||
elif command_exists gosu; then
|
||||
SU_EXEC_BIN="gosu"
|
||||
fi
|
||||
|
||||
CURRENT_USER="$(id -un)"
|
||||
CURRENT_UID="$(id -u)"
|
||||
SWITCH_USER_WARNING_EMITTED=false
|
||||
|
||||
if [ ! -z "$PGID" ] && [ "$PGID" != "$(getent group stirlingpdfgroup | cut -d: -f3)" ]; then
|
||||
groupmod -o -g "$PGID" stirlingpdfgroup || true
|
||||
fi
|
||||
umask "$UMASK" || true
|
||||
# Log, at most once per script run, that commands cannot be executed as the
# runtime user and will run as the current user instead.
# Globals:
#   SWITCH_USER_WARNING_EMITTED (read/written) - "false" until the warning
#                                                has been logged, then "true"
#   RUNTIME_USER (read) - intended account; the message falls back to
#                         "stirlingpdfuser" when it is unset or empty
#   CURRENT_USER (read) - account named in the message as the actual user
# Outputs: one WARNING line through log() on the first call only.
warn_switch_user_once() {
  if [ "$SWITCH_USER_WARNING_EMITTED" = false ]; then
    log "WARNING: Unable to switch to user ${RUNTIME_USER:-stirlingpdfuser}; running command as ${CURRENT_USER}."
    SWITCH_USER_WARNING_EMITTED=true
  fi
}
|
||||
|
||||
if [[ "$INSTALL_BOOK_AND_ADVANCED_HTML_OPS" == "true" && "$FAT_DOCKER" != "true" ]]; then
|
||||
echo "issue with calibre in current version, feature currently disabled on Stirling-PDF"
|
||||
#apk add --no-cache calibre@testing
|
||||
# Run a command as the runtime user when possible, otherwise as the current
# user.
# Globals:
#   CURRENT_USER (read) - name of the user the script is running as
#   CURRENT_UID  (read) - numeric id of that user
#   RUNTIME_USER (read) - account the command should run under
#   SU_EXEC_BIN  (read) - privilege-drop helper to invoke; empty when none
# Arguments: the command and its arguments, passed through unchanged.
# Returns:   the exit status of the executed command.
run_as_runtime_user() {
  if [ "$CURRENT_USER" = "$RUNTIME_USER" ]; then
    # Already the right user: run directly.
    "$@"
  elif [ "$CURRENT_UID" -eq 0 ] && [ -n "$SU_EXEC_BIN" ]; then
    # Running as root with a helper available: drop privileges first.
    "$SU_EXEC_BIN" "$RUNTIME_USER" "$@"
  else
    # Cannot switch users: warn (once) and run as the current user.
    warn_switch_user_once
    "$@"
  fi
}
|
||||
|
||||
# ---------- VERSION_TAG ----------
|
||||
# Load VERSION_TAG from file if not provided via environment.
|
||||
if [ -z "${VERSION_TAG:-}" ] && [ -f /etc/stirling_version ]; then
|
||||
VERSION_TAG="$(tr -d '\r\n' < /etc/stirling_version)"
|
||||
export VERSION_TAG
|
||||
fi
|
||||
|
||||
if [[ "$FAT_DOCKER" != "true" ]]; then
|
||||
/scripts/download-security-jar.sh
|
||||
fi
|
||||
# ---------- JAVA_OPTS ----------
|
||||
# Configure Java runtime options.
|
||||
export JAVA_TOOL_OPTIONS="${JAVA_BASE_OPTS:-} ${JAVA_CUSTOM_OPTS:-}"
|
||||
export JAVA_TOOL_OPTIONS="-Djava.awt.headless=true ${JAVA_TOOL_OPTIONS}"
|
||||
log "running with JAVA_TOOL_OPTIONS=${JAVA_TOOL_OPTIONS}"
|
||||
log "Running Stirling PDF with DISABLE_ADDITIONAL_FEATURES=${DISABLE_ADDITIONAL_FEATURES:-} and VERSION_TAG=${VERSION_TAG:-<unset>}"
|
||||
|
||||
if [[ -n "$LANGS" ]]; then
|
||||
/scripts/installFonts.sh $LANGS
|
||||
fi
|
||||
# ---------- UMASK ----------
|
||||
# Set default permissions mask.
|
||||
UMASK_VAL="${UMASK:-022}"
|
||||
umask "$UMASK_VAL" 2>/dev/null || umask 022
|
||||
|
||||
echo "Setting permissions and ownership for necessary directories..."
|
||||
# Ensure temp directory exists and has correct permissions
|
||||
mkdir -p /tmp/stirling-pdf || true
|
||||
# Attempt to change ownership of directories and files
|
||||
if chown -R stirlingpdfuser:stirlingpdfgroup $HOME /logs /scripts /usr/share/fonts/opentype/noto /configs /customFiles /pipeline /tmp/stirling-pdf /app.jar; then
|
||||
chmod -R 755 /logs /scripts /usr/share/fonts/opentype/noto /configs /customFiles /pipeline /tmp/stirling-pdf /app.jar || true
|
||||
# If chown succeeds, execute the command as stirlingpdfuser
|
||||
exec su-exec stirlingpdfuser "$@"
|
||||
# ---------- XDG_RUNTIME_DIR ----------
|
||||
# Create the runtime directory, respecting UID/GID settings.
|
||||
RUNTIME_USER="stirlingpdfuser"
|
||||
if id -u "$RUNTIME_USER" >/dev/null 2>&1; then
|
||||
RUID="$(id -u "$RUNTIME_USER")"
|
||||
RGRP="$(id -gn "$RUNTIME_USER")"
|
||||
else
|
||||
# If chown fails, execute the command without changing the user context
|
||||
echo "[WARN] Chown failed, running as host user"
|
||||
exec "$@"
|
||||
RUID="$(id -u)"
|
||||
RGRP="$(id -gn)"
|
||||
RUNTIME_USER="$(id -un)"
|
||||
fi
|
||||
CURRENT_USER="$(id -un)"
|
||||
CURRENT_UID="$(id -u)"
|
||||
|
||||
export XDG_RUNTIME_DIR="/tmp/xdg-${RUID}"
|
||||
mkdir -p "${XDG_RUNTIME_DIR}" || true
|
||||
if [ "$(id -u)" -eq 0 ]; then
|
||||
chown "${RUNTIME_USER}:${RGRP}" "${XDG_RUNTIME_DIR}" 2>/dev/null || true
|
||||
fi
|
||||
chmod 700 "${XDG_RUNTIME_DIR}" 2>/dev/null || true
|
||||
log "XDG_RUNTIME_DIR=${XDG_RUNTIME_DIR}"
|
||||
|
||||
# ---------- Optional ----------
|
||||
# Disable advanced HTML operations if required.
|
||||
if [[ "${INSTALL_BOOK_AND_ADVANCED_HTML_OPS:-false}" == "true" && "${FAT_DOCKER:-true}" != "true" ]]; then
|
||||
log "issue with calibre in current version, feature currently disabled on Stirling-PDF"
|
||||
fi
|
||||
|
||||
# Download security JAR in non-fat builds.
|
||||
if [[ "${FAT_DOCKER:-true}" != "true" && -x /scripts/download-security-jar.sh ]]; then
|
||||
/scripts/download-security-jar.sh || true
|
||||
fi
|
||||
|
||||
# ---------- UID/GID remap ----------
|
||||
# Remap user/group IDs to match container runtime settings.
|
||||
if [ "$(id -u)" -eq 0 ]; then
|
||||
if id -u stirlingpdfuser >/dev/null 2>&1; then
|
||||
if [ -n "${PUID:-}" ] && [ "$PUID" != "$(id -u stirlingpdfuser)" ]; then
|
||||
usermod -o -u "$PUID" stirlingpdfuser || true
|
||||
chown stirlingpdfuser:stirlingpdfgroup "${XDG_RUNTIME_DIR}" 2>/dev/null || true
|
||||
fi
|
||||
fi
|
||||
if getent group stirlingpdfgroup >/dev/null 2>&1; then
|
||||
if [ -n "${PGID:-}" ] && [ "$PGID" != "$(getent group stirlingpdfgroup | cut -d: -f3)" ]; then
|
||||
groupmod -o -g "$PGID" stirlingpdfgroup || true
|
||||
fi
|
||||
fi
|
||||
fi
|
||||
|
||||
# ---------- Permissions ----------
|
||||
# Ensure required directories exist and set correct permissions.
|
||||
log "Setting permissions..."
|
||||
mkdir -p /tmp/stirling-pdf /logs /configs /customFiles /pipeline || true
|
||||
CHOWN_PATHS=("$HOME" "/logs" "/scripts" "/configs" "/customFiles" "/pipeline" "/tmp/stirling-pdf" "/app.jar")
|
||||
[ -d /usr/share/fonts/truetype ] && CHOWN_PATHS+=("/usr/share/fonts/truetype")
|
||||
CHOWN_OK=true
|
||||
for p in "${CHOWN_PATHS[@]}"; do
|
||||
if [ -e "$p" ]; then
|
||||
chown -R "stirlingpdfuser:stirlingpdfgroup" "$p" 2>/dev/null || CHOWN_OK=false
|
||||
chmod -R 755 "$p" 2>/dev/null || true
|
||||
fi
|
||||
done
|
||||
|
||||
# ---------- Xvfb ----------
|
||||
# Start a virtual framebuffer for GUI-based LibreOffice interactions.
|
||||
if command_exists Xvfb; then
|
||||
log "Starting Xvfb on :99"
|
||||
Xvfb :99 -screen 0 1024x768x24 -ac +extension GLX +render -noreset > /dev/null 2>&1 &
|
||||
export DISPLAY=:99
|
||||
sleep 1
|
||||
else
|
||||
log "Xvfb not installed; skipping virtual display setup"
|
||||
fi
|
||||
|
||||
# ---------- unoserver ----------
|
||||
# Start LibreOffice UNO server for document conversions.
|
||||
UNOSERVER_BIN="$(command -v unoserver || true)"
|
||||
UNOCONVERT_BIN="$(command -v unoconvert || true)"
|
||||
UNOSERVER_PID=""
|
||||
|
||||
if [ -n "$UNOSERVER_BIN" ] && [ -n "$UNOCONVERT_BIN" ]; then
|
||||
LIBREOFFICE_PROFILE="${HOME:-/home/${RUNTIME_USER}}/.libreoffice_uno_${RUID}"
|
||||
run_as_runtime_user mkdir -p "$LIBREOFFICE_PROFILE"
|
||||
|
||||
log "Starting unoserver on 127.0.0.1:2003"
|
||||
run_as_runtime_user "$UNOSERVER_BIN" \
|
||||
--interface 127.0.0.1 \
|
||||
--port 2003 \
|
||||
--uno-port 2004 \
|
||||
&
|
||||
UNOSERVER_PID=$!
|
||||
log "unoserver PID: $UNOSERVER_PID (Profile: $LIBREOFFICE_PROFILE)"
|
||||
|
||||
# Wait until UNO server is ready.
|
||||
log "Waiting for unoserver..."
|
||||
for _ in {1..20}; do
|
||||
if run_as_runtime_user "$UNOCONVERT_BIN" --version >/dev/null 2>&1; then
|
||||
log "unoserver is ready!"
|
||||
break
|
||||
fi
|
||||
sleep 1
|
||||
done
|
||||
|
||||
if ! run_as_runtime_user "$UNOCONVERT_BIN" --version >/dev/null 2>&1; then
|
||||
log "ERROR: unoserver failed!"
|
||||
if [ -n "$UNOSERVER_PID" ]; then
|
||||
kill "$UNOSERVER_PID" 2>/dev/null || true
|
||||
wait "$UNOSERVER_PID" 2>/dev/null || true
|
||||
fi
|
||||
exit 1
|
||||
fi
|
||||
else
|
||||
log "unoserver/unoconvert not installed; skipping UNO setup"
|
||||
fi
|
||||
|
||||
# ---------- Java ----------
|
||||
# Start Stirling PDF Java application.
|
||||
log "Starting Stirling PDF"
|
||||
JAVA_CMD=(
|
||||
java
|
||||
-Dfile.encoding=UTF-8
|
||||
-Djava.io.tmpdir=/tmp/stirling-pdf
|
||||
-jar /app.jar
|
||||
)
|
||||
|
||||
if [ "$CURRENT_USER" = "$RUNTIME_USER" ]; then
|
||||
exec "${JAVA_CMD[@]}"
|
||||
elif [ "$CURRENT_UID" -eq 0 ] && [ -n "$SU_EXEC_BIN" ]; then
|
||||
exec "$SU_EXEC_BIN" "$RUNTIME_USER" "${JAVA_CMD[@]}"
|
||||
else
|
||||
warn_switch_user_once
|
||||
exec "${JAVA_CMD[@]}"
|
||||
fi
|
||||
|
||||
120
scripts/init.sh
120
scripts/init.sh
@@ -1,36 +1,110 @@
|
||||
#!/bin/bash
|
||||
# This script initializes environment variables and paths,
|
||||
# prepares Tesseract data directories, and then runs the main init script.
|
||||
|
||||
# Copy the original tesseract-ocr files to the volume directory without overwriting existing files
|
||||
echo "Copying original files without overwriting existing files"
|
||||
mkdir -p /usr/share/tessdata
|
||||
cp -rn /usr/share/tessdata-original/* /usr/share/tessdata
|
||||
set -euo pipefail
|
||||
|
||||
if [ -d /usr/share/tesseract-ocr/4.00/tessdata ]; then
|
||||
cp -r /usr/share/tesseract-ocr/4.00/tessdata/* /usr/share/tessdata || true;
|
||||
# Compute a colon-separated path list that includes a directory.
# Despite the name, a new entry is placed in FRONT of the existing list so
# that it takes precedence during lookup.
# Arguments:
#   $1 - directory to add; ignored unless it exists as a directory
#   $2 - current colon-separated list (may be empty)
# Outputs: the resulting list on stdout, without a trailing newline. The
#          list is returned unchanged when $1 is missing or already present.
append_env_path() {
  local target="$1" current="$2" separator=":"

  # Wrapping both sides in colons makes the match exact per entry.
  if [ ! -d "$target" ] || [[ ":${current}:" == *":${target}:"* ]]; then
    printf '%s' "$current"
  elif [ -n "$current" ]; then
    printf '%s' "${target}${separator}${current}"
  else
    printf '%s' "${target}"
  fi
}
|
||||
|
||||
# Locate the site-packages directory of a Python virtual environment.
# Arguments:
#   $1 - root directory of the virtual environment
# Outputs: "<venv>/lib/pythonX.Y/site-packages" on stdout (no trailing
#          newline) when <venv>/bin/python is executable, reports its
#          version, and that directory exists; otherwise nothing.
# Returns: always 0, so callers can use it in a command substitution
#          without aborting the script when a venv is absent or broken.
python_site_dir() {
  local venv_dir="$1"
  local python_bin="$venv_dir/bin/python"
  local py_tag site_packages

  [ -x "$python_bin" ] || return 0

  # Ask the interpreter itself for its "pythonMAJOR.MINOR" directory tag.
  py_tag="$("$python_bin" -c 'import sys; print(f"python{sys.version_info.major}.{sys.version_info.minor}")' 2>/dev/null)" \
    || return 0
  [ -n "$py_tag" ] || return 0

  site_packages="$venv_dir/lib/$py_tag/site-packages"
  if [ -d "$site_packages" ]; then
    printf '%s' "$site_packages"
  fi
}
|
||||
|
||||
# === LD_LIBRARY_PATH ===
|
||||
# Adjust the library path depending on CPU architecture.
|
||||
ARCH=$(uname -m)
|
||||
case "$ARCH" in
|
||||
x86_64)
|
||||
[ -d /usr/lib/x86_64-linux-gnu ] && export LD_LIBRARY_PATH="/usr/lib/x86_64-linux-gnu${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}"
|
||||
;;
|
||||
aarch64)
|
||||
[ -d /usr/lib/aarch64-linux-gnu ] && export LD_LIBRARY_PATH="/usr/lib/aarch64-linux-gnu${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}"
|
||||
;;
|
||||
esac
|
||||
|
||||
# Add LibreOffice program directory to library path if available.
|
||||
if [ -d /usr/lib/libreoffice/program ]; then
|
||||
export LD_LIBRARY_PATH="/usr/lib/libreoffice/program${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}"
|
||||
fi
|
||||
|
||||
# === Python PATH ===
|
||||
# Add virtual environments to PATH and PYTHONPATH.
|
||||
for dir in /opt/venv/bin /opt/unoserver-venv/bin; do
|
||||
PATH="$(append_env_path "$dir" "$PATH")"
|
||||
done
|
||||
export PATH
|
||||
|
||||
PYTHON_PATH_ENTRIES=()
|
||||
for venv in /opt/venv /opt/unoserver-venv; do
|
||||
if [ -d "$venv" ]; then
|
||||
site_dir="$(python_site_dir "$venv")"
|
||||
[ -n "${site_dir:-}" ] && PYTHON_PATH_ENTRIES+=("$site_dir")
|
||||
fi
|
||||
done
|
||||
if [ ${#PYTHON_PATH_ENTRIES[@]} -gt 0 ]; then
|
||||
PYTHONPATH="$(IFS=:; printf '%s' "${PYTHON_PATH_ENTRIES[*]}")${PYTHONPATH:+:$PYTHONPATH}"
|
||||
export PYTHONPATH
|
||||
fi
|
||||
|
||||
# === tessdata ===
|
||||
# Prepare Tesseract OCR data directory.
|
||||
REAL_TESSDATA="/usr/share/tesseract-ocr/5/tessdata"
|
||||
SEC_TESSDATA="/usr/share/tessdata"
|
||||
|
||||
# Write a warning line, prefixed with "[init][warn] ", to stderr.
# Arguments: message words; joined into one line.
log_warn() {
  # printf '%s\n' emits the text verbatim regardless of shell echo settings.
  printf '%s\n' "[init][warn] $*" >&2
}
|
||||
|
||||
if [ -d "$REAL_TESSDATA" ] && [ -w "$REAL_TESSDATA" ]; then
|
||||
log_warn "Skipping tessdata adjustments; directory writable: $REAL_TESSDATA"
|
||||
else
|
||||
log_warn "Skipping tessdata adjustments; directory missing or not writable: $REAL_TESSDATA"
|
||||
fi
|
||||
|
||||
if [ -d /usr/share/tesseract-ocr/5/tessdata ]; then
|
||||
cp -r /usr/share/tesseract-ocr/5/tessdata/* /usr/share/tessdata || true;
|
||||
REAL_TESSDATA="/usr/share/tesseract-ocr/5/tessdata"
|
||||
log_warn "Using /usr/share/tesseract-ocr/5/tessdata as TESSDATA_PREFIX"
|
||||
elif [ -d /usr/share/tessdata ]; then
|
||||
REAL_TESSDATA="/usr/share/tessdata"
|
||||
log_warn "Using /usr/share/tessdata as TESSDATA_PREFIX"
|
||||
elif [ -d /tessdata ]; then
|
||||
REAL_TESSDATA="/tessdata"
|
||||
log_warn "Using /tessdata as TESSDATA_PREFIX"
|
||||
else
|
||||
REAL_TESSDATA=""
|
||||
log_warn "No tessdata directory found"
|
||||
fi
|
||||
|
||||
# Check if TESSERACT_LANGS environment variable is set and is not empty
|
||||
if [[ -n "$TESSERACT_LANGS" ]]; then
|
||||
# Convert comma-separated values to a space-separated list
|
||||
SPACE_SEPARATED_LANGS=$(echo $TESSERACT_LANGS | tr ',' ' ')
|
||||
pattern='^[a-zA-Z]{2,4}(_[a-zA-Z]{2,4})?$'
|
||||
# Install each language pack
|
||||
for LANG in $SPACE_SEPARATED_LANGS; do
|
||||
if [[ $LANG =~ $pattern ]]; then
|
||||
apk add --no-cache "tesseract-ocr-data-$LANG"
|
||||
else
|
||||
echo "Skipping invalid language code"
|
||||
fi
|
||||
done
|
||||
if [ -n "$REAL_TESSDATA" ]; then
|
||||
export TESSDATA_PREFIX="$REAL_TESSDATA"
|
||||
fi
|
||||
|
||||
# Ensure temp directory exists with correct permissions before running main init
|
||||
mkdir -p /tmp/stirling-pdf || true
|
||||
# === Temp dir ===
|
||||
# Ensure the temporary directory exists and has proper permissions.
|
||||
mkdir -p /tmp/stirling-pdf
|
||||
chown -R stirlingpdfuser:stirlingpdfgroup /tmp/stirling-pdf || true
|
||||
chmod -R 755 /tmp/stirling-pdf || true
|
||||
|
||||
/scripts/init-without-ocr.sh "$@"
|
||||
# === Start application ===
|
||||
# Run the main init script that handles the full startup logic.
|
||||
exec /scripts/init-without-ocr.sh
|
||||
|
||||
Reference in New Issue
Block a user