Files
Stirling-PDF/scripts/aot-diagnostics.sh
Balázs Szücs 9ac260ee92 feat(aot): add aot-diagnostics.sh for AOT cache diagnostics and validation (#5848)
# Description of Changes



This pull request makes significant improvements to the Docker build
process for the embedded Stirling-PDF image, focusing on build
efficiency, runtime optimization, and maintainability. Key changes
include upgrading major tool versions, introducing optional stripping of
Calibre's WebEngine to reduce image size, consolidating ImageMagick
layers, and refining the Python environment build process. The runtime
image is now leaner, with clearer separation between build and runtime
dependencies, and improved caching for faster builds and pulls.

**Build and Dependency Management Improvements**
* Upgraded Calibre to version `9.4.0` and added support for the
`TARGETPLATFORM` build argument for multi-platform builds.
* Added an optional `CALIBRE_STRIP_WEBENGINE` build argument to strip
Chromium/WebEngine from Calibre, saving ~80 MB when PDF output via
Calibre is not needed.
* Consolidated ImageMagick outputs into a single staging directory
(`/magick-export`) to reduce Docker layers and improve caching
efficiency.
* Refactored Python virtual environment build: now built in a dedicated
stage with pre-built wheels and copied into the runtime image,
eliminating the need for build tools and pip installs at runtime.

**Runtime Image Optimization**
* Reduced installed system packages to only what is needed at runtime;
Python build tools and dev packages are no longer included.
* Cleaned up unnecessary runtime files, including removal of build-only
Python artifacts and system headers, for a smaller and more secure
image.

**Layer and Copy Optimization**
* Switched to `COPY --link` for all major external tool layers and
application files, enabling independent layer caching and parallel pulls
for faster builds.

**Runtime Configuration and Health**
* Improved runtime directory structure and permissions, added persistent
cache directories for Project Leyden AOT, and wrote the version tag to
`/etc/stirling_version` for easier script access.
* Updated the healthcheck to wait longer for startup and increased
timeout/retries for more robust readiness detection.

<!--
Please provide a summary of the changes, including:

- What was changed
- Why the change was made
- Any challenges encountered

Closes #(issue_number)
-->

---

## Checklist

### General

- [ ] I have read the [Contribution
Guidelines](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/CONTRIBUTING.md)
- [ ] I have read the [Stirling-PDF Developer
Guide](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/devGuide/DeveloperGuide.md)
(if applicable)
- [ ] I have read the [How to add new languages to
Stirling-PDF](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/devGuide/HowToAddNewLanguage.md)
(if applicable)
- [ ] I have performed a self-review of my own code
- [ ] My changes generate no new warnings

### Documentation

- [ ] I have updated relevant docs on [Stirling-PDF's doc
repo](https://github.com/Stirling-Tools/Stirling-Tools.github.io/blob/main/docs/)
(if functionality has heavily changed)
- [ ] I have read the section [Add New Translation
Tags](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/devGuide/HowToAddNewLanguage.md#add-new-translation-tags)
(for new translation tags only)

### Translations (if applicable)

- [ ] I ran
[`scripts/counter_translation.py`](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/docs/counter_translation.md)

### UI Changes (if applicable)

- [ ] Screenshots or videos demonstrating the UI changes are attached
(e.g., as comments or direct attachments in the PR)

### Testing (if applicable)

- [ ] I have tested my changes locally. Refer to the [Testing
Guide](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/devGuide/DeveloperGuide.md#6-testing)
for more details.

---------

Signed-off-by: Balázs Szücs <bszucs1209@gmail.com>
2026-03-03 19:06:46 +00:00

381 lines
15 KiB
Bash
Executable File

#!/bin/bash
# aot-diagnostics.sh - Project Leyden AOT cache diagnostic tool for Stirling-PDF
#
# Diagnoses AOT cache generation failures, especially on ARM64 (aarch64).
# Reports JVM feature support, memory limits, cache state, and fingerprint validity.
#
# Usage:
# aot-diagnostics.sh [--test] [--cache PATH | --cache=PATH] [-h | --help]
#
# --test Run a quick AOT RECORD smoke test (~10-30s). Shows exactly
# what error the JVM produces, useful for ARM debugging.
# --cache PATH Override the AOT cache path (default: /configs/cache/stirling.aot).
# The --cache=PATH form is accepted as well.
# -h, --help Print this usage header and exit.
#
# Symlink aliases set up by init-without-ocr.sh: aot-diag, aot-diagnostics
set -euo pipefail

# Defaults; parse_args overrides RUN_SMOKE_TEST / AOT_CACHE_PATH.
AOT_CACHE_DEFAULT="/configs/cache/stirling.aot"
RUN_SMOKE_TEST=false
AOT_CACHE_PATH=""

#######################################
# Parse command-line options.
# Globals:   RUN_SMOKE_TEST (written), AOT_CACHE_PATH (written),
#            AOT_CACHE_DEFAULT (read)
# Arguments: the script's positional parameters
# Outputs:   usage text on stdout for -h/--help; a note on stderr when
#            --cache is given without a value
# Returns:   0; exits 0 after printing help
#######################################
parse_args() {
  # A while/shift loop is required here: shifting inside `for arg in "$@"`
  # does not advance the iteration, so "--test --cache PATH" used to store
  # the literal string "--cache" as the path.
  while [ "$#" -gt 0 ]; do
    case "$1" in
      --test)
        RUN_SMOKE_TEST=true
        ;;
      --cache=*)
        AOT_CACHE_PATH="${1#--cache=}"
        ;;
      --cache)
        if [ "$#" -ge 2 ]; then
          AOT_CACHE_PATH="$2"
          shift
        else
          printf '%s\n' \
            "--cache requires a PATH argument; using default ${AOT_CACHE_DEFAULT}" >&2
        fi
        ;;
      -h | --help)
        # Print the leading comment header only: drop the shebang (line 1),
        # stop at the first non-comment line, strip the "# " prefix.
        sed -n -e '1d' -e '/^#/!q' -e 's/^# \{0,1\}//p' "$0"
        exit 0
        ;;
    esac
    # Unrecognised arguments are ignored, as before.
    shift
  done
}

parse_args "$@"

# Effective cache path and its sibling fingerprint file.
AOT_CACHE="${AOT_CACHE_PATH:-$AOT_CACHE_DEFAULT}"
AOT_FP="${AOT_CACHE}.fingerprint"
# ── Terminal colours ──────────────────────────────────────────────────────────
# Colour codes are kept as literal backslash sequences and default to empty
# strings; they are only filled in when stdout is attached to a terminal.
C_RED=''
C_GRN=''
C_YLW=''
C_CYN=''
C_BLD=''
C_RST=''
if [[ -t 1 ]]; then
  C_RED='\033[0;31m'
  C_GRN='\033[0;32m'
  C_YLW='\033[0;33m'
  C_CYN='\033[0;36m'
  C_BLD='\033[1m'
  C_RST='\033[0m'
fi
# Result counters, reported in the final summary.
PASS=0
WARN=0
FAIL=0

# Reporting helpers. Each prints one tagged line to stdout; pass/warn/fail also
# bump the matching counter. Colour variables are passed as %b arguments (which
# expands their backslash escapes) instead of being spliced into the format
# string, so a stray '%' in a variable can never be read as a format directive.
pass() {
  printf '%b[PASS]%b %s\n' "${C_GRN}" "${C_RST}" "$*"
  PASS=$((PASS + 1))
}

warn() {
  printf '%b[WARN]%b %s\n' "${C_YLW}" "${C_RST}" "$*"
  WARN=$((WARN + 1))
}

fail() {
  printf '%b[FAIL]%b %s\n' "${C_RED}" "${C_RST}" "$*"
  FAIL=$((FAIL + 1))
}

# Informational line; does not affect any counter.
info() {
  printf '%b[INFO]%b %s\n' "${C_CYN}" "${C_RST}" "$*"
}

# Section header, preceded by a blank line.
hdr() {
  printf '\n%b=== %s ===%b\n' "${C_BLD}" "$*" "${C_RST}"
}

# Returns 0 when "$1" resolves to a command, non-zero otherwise.
command_exists() {
  command -v "$1" >/dev/null 2>&1
}
# ── Section 1: Environment ────────────────────────────────────────────────────
# Reports host/container identity, the Stirling-PDF version, the OS, and any
# externally supplied JVM option variables.
hdr "Environment"
info "Date: $(date -u +"%Y-%m-%dT%H:%M:%SZ" 2>/dev/null || date)"
info "Hostname: $(hostname 2>/dev/null || echo unknown)"
info "Architecture: $(uname -m)"
info "Kernel: $(uname -r)"

# Version: the file takes precedence over the VERSION_TAG environment variable.
if [ -f /etc/stirling_version ]; then
  info "Version: $(tr -d '\r\n' < /etc/stirling_version)"
elif [ -n "${VERSION_TAG:-}" ]; then
  info "Version: ${VERSION_TAG}"
else
  warn "VERSION_TAG not set and /etc/stirling_version not found"
fi

if [ -f /etc/os-release ]; then
  # Sourced inside the command substitution's subshell so its variables do not
  # leak into this script.
  info "OS: $(. /etc/os-release; printf '%s\n' "${PRETTY_NAME:-${NAME:-unknown}}")"
fi

# Warn about external JVM option vars — these break AOT training if set
for _jvm_var in JAVA_TOOL_OPTIONS JDK_JAVA_OPTIONS _JAVA_OPTIONS; do
  # Indirect expansion instead of `eval echo`: the value is read verbatim, with
  # no word splitting, globbing, or echo option parsing (a value such as "-n"
  # used to come back empty).
  _jvm_val="${!_jvm_var:-}"
  if [ -n "$_jvm_val" ]; then
    warn "${_jvm_var}='${_jvm_val}'"
    warn " External JVM options are cleared during AOT training (fixed), but may"
    warn " affect the running app. Ensure they are compatible with -Xmx limits."
  fi
done
unset _jvm_var _jvm_val
# ── Section 2: JVM Detection ──────────────────────────────────────────────────
# Locates java and probes which AOT-relevant JVM flags this build accepts.
# Sets: JDK_VER, ARCH, AOT_SUPPORTED, COMPACT_HEADERS_FLAG, COMPRESSED_OOPS_FLAG.
hdr "JVM Detection"
if ! command_exists java; then
  fail "java not found in PATH. PATH=${PATH}"
  exit 1
fi

# Runs java with the external option variables blanked, so neither the
# "Picked up ..." banner nor an incompatible external option can influence
# the result.
java_clean() {
  JAVA_TOOL_OPTIONS= JDK_JAVA_OPTIONS= _JAVA_OPTIONS= java "$@"
}

# Returns 0 when the JVM starts successfully with the given flag(s).
jvm_accepts() {
  java_clean "$@" -version >/dev/null 2>&1
}

JDK_VER="$(java_clean -version 2>&1 | head -1)"
info "JDK: ${JDK_VER}"
info "java binary: $(command -v java)"
ARCH="$(uname -m)"

# --- AOTMode support (Project Leyden) ---
# The probes below previously inherited JAVA_TOOL_OPTIONS & co., unlike the
# version query above; a bad external option then made every probe fail and
# the report claimed the feature itself was unsupported.
AOT_SUPPORTED=false
if jvm_accepts -XX:AOTMode=off; then
  pass "AOTMode supported (-XX:AOTMode=off accepted)"
  AOT_SUPPORTED=true
else
  fail "AOTMode NOT supported on this JVM build (${ARCH})"
  fail " This JDK does not support Project Leyden (JEP 483/514/515)."
  fail " AOT cache generation will be skipped."
  if [[ "$ARCH" == "aarch64" ]]; then
    warn " ARM64: some vendor JDK 25 builds omit Leyden. Try eclipse-temurin:25-jre."
  fi
fi

# --- CompactObjectHeaders support (Project Lilliput) ---
# Left empty when unsupported so later consumers can simply omit the flag.
COMPACT_HEADERS_FLAG=""
if jvm_accepts -XX:+UseCompactObjectHeaders; then
  pass "UseCompactObjectHeaders supported (Project Lilliput active)"
  COMPACT_HEADERS_FLAG="-XX:+UseCompactObjectHeaders"
else
  warn "UseCompactObjectHeaders NOT supported on ${ARCH}"
  warn " AOT training will run without this flag. Runtime must also omit it."
  if [[ "$ARCH" == "aarch64" ]]; then
    warn " This is the most common cause of ARM AOT failures: the flag was"
    warn " hardcoded in training but unsupported at runtime (or vice-versa)."
  fi
fi

# --- CompressedOops ---
# Always holds a flag: the enabling form by default, the disabling form when
# the JVM rejects the former.
COMPRESSED_OOPS_FLAG="-XX:+UseCompressedOops"
if jvm_accepts -XX:+UseCompressedOops; then
  pass "UseCompressedOops accepted by JVM"
else
  warn "UseCompressedOops flag not accepted — will use -XX:-UseCompressedOops"
  COMPRESSED_OOPS_FLAG="-XX:-UseCompressedOops"
fi
# ── Section 3: Memory Limits ──────────────────────────────────────────────────
# Determines the memory available to the container (cgroup v2, then cgroup v1,
# then /proc/meminfo) and compares it with the per-architecture minimum.
# MEM_MB stays 0 while no finite limit has been found.
hdr "Memory Limits"
MEM_MB=0
if [ -f /sys/fs/cgroup/memory.max ]; then
  RAW="$(cat /sys/fs/cgroup/memory.max 2>/dev/null || echo '')"
  if [ "$RAW" = "max" ]; then
    info "cgroup v2 memory.max: unlimited"
  elif [ -n "$RAW" ] && [ "$RAW" -gt 0 ] 2>/dev/null; then
    MEM_MB=$((RAW / 1048576))
    info "cgroup v2 memory.max: ${MEM_MB}MB"
  fi
elif [ -f /sys/fs/cgroup/memory/memory.limit_in_bytes ]; then
  RAW="$(cat /sys/fs/cgroup/memory/memory.limit_in_bytes 2>/dev/null || echo '')"
  # A value of 19 or more digits is treated as "no limit".
  if [ "${#RAW}" -ge 19 ]; then
    info "cgroup v1 limit: unlimited (max uint64)"
  elif [ -n "$RAW" ] && [ "$RAW" -gt 0 ] 2>/dev/null; then
    MEM_MB=$((RAW / 1048576))
    info "cgroup v1 limit: ${MEM_MB}MB"
  fi
else
  info "No cgroup memory limit detected"
fi

# No finite cgroup limit: fall back to the MemTotal figure.
if [ "$MEM_MB" -eq 0 ] && [ -f /proc/meminfo ]; then
  MEM_MB="$(awk '/MemTotal/ {print int($2/1024)}' /proc/meminfo 2>/dev/null || echo 0)"
  # awk prints nothing when no line matches; normalise anything non-numeric to
  # 0 so the comparisons below cannot error out and fall through to a false
  # "Memory OK".
  case "$MEM_MB" in
    '' | *[!0-9]*) MEM_MB=0 ;;
  esac
  info "System MemTotal: ${MEM_MB}MB"
fi

# aarch64 uses a higher minimum than other architectures.
MIN_MEM=768
if [ "$ARCH" = "aarch64" ]; then
  MIN_MEM=1024
fi

if [ "$MEM_MB" -eq 0 ]; then
  warn "Could not determine container memory. AOT generation may be skipped."
elif [ "$MEM_MB" -le "$MIN_MEM" ]; then
  warn "Available memory (${MEM_MB}MB) is at or below AOT generation minimum (${MIN_MEM}MB on ${ARCH})."
  warn " AOT background generation will be skipped for this architecture."
  warn " Increase container memory above ${MIN_MEM}MB to enable AOT cache generation."
else
  pass "Memory OK: ${MEM_MB}MB available, minimum ${MIN_MEM}MB for ${ARCH}"
fi

if command_exists free; then
  # Guarded: under `set -euo pipefail` a failing `free` inside an unguarded
  # assignment would abort the whole report before the summary is printed.
  FREE_MB="$(free -m 2>/dev/null | awk '/^Mem:/ {print $7}')" || FREE_MB=""
  info "Available (free+cache): ${FREE_MB:-?}MB"
fi
# ── Section 4: AOT Cache State ────────────────────────────────────────────────
# Reports whether the cache file exists and has content. An empty cache file is
# removed together with its fingerprint file.
hdr "AOT Cache State"
info "Cache path: ${AOT_CACHE}"
info "Fingerprint path: ${AOT_FP}"

if [[ ! -f "${AOT_CACHE}" ]]; then
  warn "No cache file at ${AOT_CACHE}"
  info " Cache will be generated in background on next boot."
  if [[ ! -d "$(dirname "${AOT_CACHE}")" ]]; then
    warn " Parent directory $(dirname "${AOT_CACHE}") does not exist."
    warn " Ensure /configs is volume-mounted and writable."
  fi
else
  # Size and modification time are best-effort; '?' is the fallback when the
  # pipeline fails.
  CACHE_SIZE="$(du -h "${AOT_CACHE}" 2>/dev/null | cut -f1 || echo '?')"
  CACHE_MTIME="$(stat -c '%y' "${AOT_CACHE}" 2>/dev/null | cut -d. -f1 || echo '?')"
  info "Cache exists: ${CACHE_SIZE} (modified ${CACHE_MTIME})"
  if [[ ! -s "${AOT_CACHE}" ]]; then
    fail "Cache file is empty — will be regenerated on next boot"
    rm -f "${AOT_CACHE}" "${AOT_FP}" 2>/dev/null || true
  else
    pass "Cache file is non-empty"
  fi
fi
# --- Fingerprint validation ---
# Compares the fingerprint stored in ${AOT_FP} with one recomputed from the
# current JDK, architecture, JVM flags, application files and version tag, and
# reports the cache as stale when they differ.
# The statements here are deliberately left exactly as they are: the recomputed
# string has to match what init-without-ocr.sh produces (see the comment below),
# so even cosmetic changes to how FP is assembled are risky.
if [ -f "${AOT_FP}" ]; then
# Stored value with CR/LF removed; empty when the file cannot be read.
STORED_FP="$(tr -d '\r\n' < "${AOT_FP}" 2>/dev/null || echo '')"
info "Stored fingerprint: ${STORED_FP}"
# Recompute fingerprint using the same logic as init-without-ocr.sh
# FP is a ';'-terminated list of "key:value" components, appended in this order:
# jdk, arch, compact, oops, app (optional), ver.
FP=""
# jdk: first line of `java -version`, taken with the external JVM option
# variables blanked.
FP+="jdk:$(JAVA_TOOL_OPTIONS= JDK_JAVA_OPTIONS= _JAVA_OPTIONS= java -version 2>&1 | head -1);"
FP+="arch:${ARCH};"
# compact/oops: the flags chosen in the JVM Detection section; "none" stands in
# for an empty value.
FP+="compact:${COMPACT_HEADERS_FLAG:-none};"
FP+="oops:${COMPRESSED_OOPS_FLAG:-none};"
# app: the first of /app/app.jar, /app.jar, /app/lib that exists. For the jar
# files the value is stat's '%s-%Y' output; for /app/lib it is the first 16
# characters of the md5sum of an `ls -la` listing. Each falls back to "unknown"
# on failure. When none of the three exists, no app component is added at all.
if [ -f /app/app.jar ]; then
FP+="app:$(stat -c '%s-%Y' /app/app.jar 2>/dev/null || echo unknown);"
elif [ -f /app.jar ]; then
FP+="app:$(stat -c '%s-%Y' /app.jar 2>/dev/null || echo unknown);"
elif [ -d /app/lib ]; then
FP+="app:$(ls -la /app/lib/ 2>/dev/null | md5sum 2>/dev/null | cut -c1-16 || echo unknown);"
fi
# ver: taken from the VERSION_TAG environment variable only.
# NOTE(review): the Environment section also reads /etc/stirling_version; this
# component does not — confirm that matches init-without-ocr.sh.
FP+="ver:${VERSION_TAG:-unknown};"
# Hash the FP string with the first available tool: md5sum, then sha256sum
# (both truncated to 16 characters), otherwise the first field of cksum output.
if command_exists md5sum; then
EXPECTED_FP="$(printf '%s' "$FP" | md5sum | cut -c1-16)"
elif command_exists sha256sum; then
EXPECTED_FP="$(printf '%s' "$FP" | sha256sum | cut -c1-16)"
else
EXPECTED_FP="$(printf '%s' "$FP" | cksum | cut -d' ' -f1)"
fi
info "Expected fingerprint: ${EXPECTED_FP}"
if [ "$STORED_FP" = "$EXPECTED_FP" ]; then
pass "Fingerprint valid — cache matches current JDK/arch/app"
else
fail "Fingerprint mismatch — cache is stale"
info " The cache was built with a different JDK, arch, flags, or app version."
info " It will be automatically removed and regenerated on next boot."
# Print a diff of fingerprint components for easier debugging
# Only the expected (current) component string can be shown; the fingerprint
# file holds just the hash, so the stored component string is not recoverable.
# NOTE(review): nothing in this script's --test path writes the fingerprint
# file, so the "run with --test to regenerate" hint below may be misleading —
# confirm the intended wording.
printf " Stored FP string: (run with --test to regenerate)\n"
printf " Expected FP string: %s\n" "$FP"
fi
else
# No fingerprint file: only noteworthy when a cache file exists without one.
if [ -f "${AOT_CACHE}" ]; then
warn "Cache exists but no fingerprint file found"
warn " Cache will be treated as stale and regenerated on next boot."
else
info "No fingerprint file (expected — cache not yet generated)"
fi
fi
# ── Section 5: JAR Layout Detection ──────────────────────────────────────────
# Classifies the application layout first, then reports on it. JAR_LAYOUT ends
# up as one of: layered, single, exploded, unknown.
hdr "JAR Layout"

if [[ -f /app/app.jar && -d /app/lib ]]; then
  JAR_LAYOUT="layered"
elif [[ -f /app.jar ]]; then
  JAR_LAYOUT="single"
elif [[ -d /app/BOOT-INF ]]; then
  JAR_LAYOUT="exploded"
else
  JAR_LAYOUT="unknown"
fi

case "$JAR_LAYOUT" in
  layered)
    pass "Spring Boot 4 layered layout: /app/app.jar + /app/lib/"
    info " Classpath: -cp /app/app.jar:/app/lib/* stirling.software.SPDF.SPDFApplication"
    ;;
  single)
    pass "Single JAR layout: /app.jar"
    info " Invocation: -jar /app.jar"
    ;;
  exploded)
    pass "Exploded Spring Boot 3 layout: /app/BOOT-INF"
    info " Classpath: -cp /app org.springframework.boot.loader.launch.JarLauncher"
    ;;
  unknown)
    fail "No recognisable JAR layout found. Looked for:"
    fail " /app/app.jar + /app/lib/ (Spring Boot 4 layered)"
    fail " /app.jar (single fat JAR)"
    fail " /app/BOOT-INF/ (Spring Boot 3 exploded)"
    ;;
esac
# ── Section 6: Disk Space ─────────────────────────────────────────────────────
# Checks how full the volume holding the cache directory is.
hdr "Disk Space"
CACHE_DIR="$(dirname "${AOT_CACHE}")"
if [ -d "$CACHE_DIR" ]; then
  # -P requests the POSIX output format (one line per filesystem), so a long
  # device name cannot wrap the record and shift the columns.
  DF="$(df -hP "$CACHE_DIR" 2>/dev/null | tail -1 || echo '')"
  info "Volume ($CACHE_DIR): $DF"

  # Despite its name, AVAIL_PCT holds the *used* percentage (df's fifth
  # column) with the '%' sign removed. The trailing `|| true` keeps a failing
  # df from aborting the script under `set -euo pipefail`.
  AVAIL_PCT="$(df -P "$CACHE_DIR" 2>/dev/null | awk 'NR==2 {print $5}' | tr -d '%' || true)"

  case "$AVAIL_PCT" in
    '' | *[!0-9]*)
      # Previously an empty or non-numeric value fell through to the PASS
      # branch and reported sufficient space without having measured it.
      warn "Could not determine disk usage for ${CACHE_DIR}; free-space check skipped."
      ;;
    *)
      if [ "$AVAIL_PCT" -ge 95 ]; then
        fail "Disk almost full (${AVAIL_PCT}% used). AOT cache creation will fail."
      elif [ "$AVAIL_PCT" -ge 85 ]; then
        warn "Disk usage high (${AVAIL_PCT}% used). AOT cache is typically 50-150MB."
      else
        pass "Sufficient disk space available"
      fi
      ;;
  esac
else
  warn "Cache directory ${CACHE_DIR} does not exist."
  warn " /configs must be volume-mounted. AOT cache will not persist across restarts."
fi
# ── Section 7: Optional Smoke Test ───────────────────────────────────────────
# With --test, runs the JVM once in AOT record mode against the detected JAR
# layout and reports whether a non-empty configuration file was produced.

# Prints the last 30 lines of the given log file, each prefixed with a space.
# Best-effort: a missing or unreadable log prints nothing and never aborts the
# script.
print_smoke_log_tail() {
  local log_file="$1"
  local line
  [ -r "$log_file" ] || return 0
  tail -n 30 "$log_file" 2>/dev/null \
    | while IFS= read -r line || [ -n "$line" ]; do
      printf " %s\n" "$line"
    done || true
}

if [ "$RUN_SMOKE_TEST" = true ]; then
  hdr "AOT RECORD Smoke Test"
  if [ "$AOT_SUPPORTED" = false ]; then
    warn "Skipping smoke test — AOTMode not supported on this JVM"
  elif [ "$JAR_LAYOUT" = "unknown" ]; then
    warn "Skipping smoke test — could not determine JAR layout"
  elif ! SMOKE_DIR="$(mktemp -d "${TMPDIR:-/tmp}/aot-diag-smoke.XXXXXX" 2>/dev/null)"; then
    fail "Could not create a temporary directory for the smoke test"
  else
    # Private, unpredictable scratch directory instead of fixed names in /tmp;
    # the EXIT trap removes it even if the script is interrupted mid-run.
    trap 'rm -rf -- "$SMOKE_DIR"' EXIT
    SMOKE_CONF="${SMOKE_DIR}/smoke.aotconf"
    SMOKE_LOG="${SMOKE_DIR}/smoke.log"

    info "Running minimal AOT RECORD phase (this may take 10-30s on ARM)..."

    # Build the command as an array with every element quoted; the optional
    # compact-headers flag is appended only when it is non-empty.
    SMOKE_CMD=(java -Xmx256m)
    if [ -n "${COMPACT_HEADERS_FLAG:-}" ]; then
      SMOKE_CMD+=("$COMPACT_HEADERS_FLAG")
    fi
    SMOKE_CMD+=(
      "$COMPRESSED_OOPS_FLAG"
      -Xlog:aot=info
      -XX:AOTMode=record
      -XX:AOTConfiguration="$SMOKE_CONF"
      -Dspring.main.banner-mode=off
      -Dspring.context.exit=onRefresh
      -Dstirling.datasource.url="jdbc:h2:mem:aotsmoke;DB_CLOSE_DELAY=-1;MODE=PostgreSQL"
    )
    case "$JAR_LAYOUT" in
      layered) SMOKE_CMD+=(-cp "/app/app.jar:/app/lib/*" stirling.software.SPDF.SPDFApplication) ;;
      single) SMOKE_CMD+=(-jar /app.jar) ;;
      exploded) SMOKE_CMD+=(-cp /app org.springframework.boot.loader.launch.JarLauncher) ;;
    esac
    info "Command: ${SMOKE_CMD[*]}"

    # Run with the external JVM option variables blanked; cap the run at 120s
    # when `timeout` is available. The exit status is captured without
    # tripping `set -e`.
    SMOKE_EXIT=0
    if command_exists timeout; then
      JAVA_TOOL_OPTIONS= JDK_JAVA_OPTIONS= _JAVA_OPTIONS= \
        timeout 120s "${SMOKE_CMD[@]}" >"$SMOKE_LOG" 2>&1 || SMOKE_EXIT=$?
    else
      JAVA_TOOL_OPTIONS= JDK_JAVA_OPTIONS= _JAVA_OPTIONS= \
        "${SMOKE_CMD[@]}" >"$SMOKE_LOG" 2>&1 || SMOKE_EXIT=$?
    fi

    case "$SMOKE_EXIT" in
      0 | 1)
        # Exit 0 or 1 counts as success only if a non-empty configuration file
        # was written.
        if [ -s "$SMOKE_CONF" ]; then
          CONF_SIZE="$(du -h "$SMOKE_CONF" 2>/dev/null | cut -f1 || echo '?')"
          pass "RECORD phase succeeded (exit=${SMOKE_EXIT}, conf=${CONF_SIZE})"
          info " AOT cache generation should work on this system."
        else
          fail "RECORD phase exit=${SMOKE_EXIT} but no .aotconf produced"
          info " Last 30 lines of AOT output:"
          print_smoke_log_tail "$SMOKE_LOG"
        fi
        ;;
      124)
        fail "RECORD phase timed out after 120s"
        warn " On ARM under QEMU or slow storage this can happen."
        warn " Try running with more memory or on native ARM hardware."
        ;;
      137)
        fail "RECORD phase OOM-killed (exit 137)"
        warn " Increase container memory. Minimum for ARM AOT training: 1GB."
        ;;
      *)
        fail "RECORD phase failed (exit=${SMOKE_EXIT})"
        info " Last 30 lines of AOT output:"
        print_smoke_log_tail "$SMOKE_LOG"
        ;;
    esac

    rm -rf -- "$SMOKE_DIR"
    trap - EXIT
  fi
fi
# ── Summary ───────────────────────────────────────────────────────────────────
# Prints the totals and a closing verdict. The exit status is 1 when at least
# one check failed and 0 otherwise (warnings alone do not fail the run).
printf "\n${C_BLD}=== Summary: PASS=%d WARN=%d FAIL=%d ===${C_RST}\n" \
  "$PASS" "$WARN" "$FAIL"

SUMMARY_RC=0
if [[ "$FAIL" -gt 0 ]]; then
  printf "${C_RED}AOT cache has issues. See FAIL items above.${C_RST}\n"
  printf "To disable AOT: omit STIRLING_AOT_ENABLE (default is off) or set STIRLING_AOT_ENABLE=false\n"
  SUMMARY_RC=1
elif [[ "$WARN" -gt 0 ]]; then
  printf "${C_YLW}AOT cache may not function optimally. See WARN items above.${C_RST}\n"
else
  printf "${C_GRN}All AOT checks passed.${C_RST}\n"
fi
exit "$SUMMARY_RC"