🤖 format everything with pre-commit by stirlingbot (#5144)

Auto-generated by [create-pull-request][1] with **stirlingbot**

[1]: https://github.com/peter-evans/create-pull-request

Signed-off-by: stirlingbot[bot] <stirlingbot[bot]@users.noreply.github.com>
Co-authored-by: stirlingbot[bot] <195170888+stirlingbot[bot]@users.noreply.github.com>
Author: stirlingbot[bot]
Date: 2025-12-22 15:44:38 +00:00
Committed by: GitHub
Parent: e6d3f20c36
Commit: c990ab3216
26 changed files with 1239 additions and 822 deletions
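
For context, an automated formatting pass like this one reduces to "run the hooks, commit whatever they touched". A minimal sketch of that loop (not part of this commit; `repo_root` is a placeholder):

```python
import subprocess

def precommit_changed_files(repo_root: str = ".") -> bool:
    """Run all pre-commit hooks; return True if they reformatted anything."""
    # pre-commit exits non-zero when a hook modifies files, so don't raise here.
    subprocess.run(["pre-commit", "run", "--all-files"], cwd=repo_root, check=False)
    # A non-empty diff means the hooks changed something worth committing.
    diff = subprocess.run(
        ["git", "diff", "--name-only"], cwd=repo_root, capture_output=True, text=True
    )
    return bool(diff.stdout.strip())
```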

View File

@@ -1 +1 @@
-allow-ghsas: GHSA-wrw7-89jp-8q8g
\ No newline at end of file
+allow-ghsas: GHSA-wrw7-89jp-8q8g

View File

@@ -14,12 +14,10 @@ Usage:
# Sample for Windows:
# python .github/scripts/check_language_toml.py --reference-file frontend/public/locales/en-GB/translation.toml --branch "" --files frontend/public/locales/de-DE/translation.toml frontend/public/locales/fr-FR/translation.toml
import copy
import glob
import os
import argparse
import re
import json
import tomllib # Python 3.11+ (stdlib)
import tomli_w # For writing TOML files
@@ -38,7 +36,7 @@ def find_duplicate_keys(file_path, keys=None, prefix=""):
duplicates = []
# Load TOML file
with open(file_path, 'rb') as file:
with open(file_path, "rb") as file:
data = tomllib.load(file)
def process_dict(obj, current_prefix=""):
@@ -67,7 +65,7 @@ def parse_toml_file(file_path):
:param file_path: Path to the TOML file.
:return: Dictionary with flattened keys.
"""
with open(file_path, 'rb') as file:
with open(file_path, "rb") as file:
data = tomllib.load(file)
def flatten_dict(d, parent_key="", sep="."):
@@ -193,13 +191,13 @@ def check_for_differences(reference_file, file_list, branch, actor):
basename_current_file = os.path.basename(os.path.join(branch, file_normpath))
locale_dir = os.path.basename(os.path.dirname(file_normpath))
if (
basename_current_file == basename_reference_file
and locale_dir == "en-GB"
):
if basename_current_file == basename_reference_file and locale_dir == "en-GB":
continue
if not file_normpath.endswith(".toml") or basename_current_file != "translation.toml":
if (
not file_normpath.endswith(".toml")
or basename_current_file != "translation.toml"
):
continue
only_reference_file = False
@@ -288,7 +286,9 @@ def check_for_differences(reference_file, file_list, branch, actor):
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Find missing keys in TOML translation files")
parser = argparse.ArgumentParser(
description="Find missing keys in TOML translation files"
)
parser.add_argument(
"--actor",
required=False,
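
The hunks above are Black-style reformatting of `.github/scripts/check_language_toml.py`; the script's core logic is flattening the reference and target TOML files and diffing their key sets. A condensed sketch of that idea, with helper names of my own:

```python
import tomllib  # Python 3.11+, same parser the script uses

def flatten(d: dict, prefix: str = "") -> dict:
    """Flatten nested TOML tables into dot-separated keys."""
    out = {}
    for k, v in d.items():
        key = f"{prefix}.{k}" if prefix else k
        if isinstance(v, dict):
            out.update(flatten(v, key))
        else:
            out[key] = v
    return out

def missing_keys(reference_path: str, translation_path: str) -> list[str]:
    """Keys present in the en-GB reference but absent from a translation."""
    with open(reference_path, "rb") as f:
        ref = flatten(tomllib.load(f))
    with open(translation_path, "rb") as f:
        trans = flatten(tomllib.load(f))
    return sorted(set(ref) - set(trans))
```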

View File

@@ -54,4 +54,4 @@ services:
networks:
stirling-network:
-driver: bridge
\ No newline at end of file
+driver: bridge

View File

@@ -19,27 +19,27 @@ const debug = (message) => {
function scanForUsedIcons() {
const usedIcons = new Set();
const srcDir = path.join(__dirname, '..', 'src');
info('🔍 Scanning codebase for LocalIcon usage...');
if (!fs.existsSync(srcDir)) {
console.error('❌ Source directory not found:', srcDir);
process.exit(1);
}
// Recursively scan all .tsx and .ts files
function scanDirectory(dir) {
const files = fs.readdirSync(dir);
files.forEach(file => {
const filePath = path.join(dir, file);
const stat = fs.statSync(filePath);
if (stat.isDirectory()) {
scanDirectory(filePath);
} else if (file.endsWith('.tsx') || file.endsWith('.ts')) {
const content = fs.readFileSync(filePath, 'utf8');
// Match LocalIcon usage: <LocalIcon icon="icon-name" ...>
const localIconMatches = content.match(/<LocalIcon\s+[^>]*icon="([^"]+)"/g);
if (localIconMatches) {
@@ -51,7 +51,7 @@ function scanForUsedIcons() {
}
});
}
// Match old material-symbols-rounded spans: <span className="material-symbols-rounded">icon-name</span>
const spanMatches = content.match(/<span[^>]*className="[^"]*material-symbols-rounded[^"]*"[^>]*>([^<]+)<\/span>/g);
if (spanMatches) {
@@ -64,7 +64,7 @@ function scanForUsedIcons() {
}
});
}
// Match Icon component usage: <Icon icon="material-symbols:icon-name" ...>
const iconMatches = content.match(/<Icon\s+[^>]*icon="material-symbols:([^"]+)"/g);
if (iconMatches) {
@@ -79,12 +79,12 @@ function scanForUsedIcons() {
}
});
}
scanDirectory(srcDir);
const iconArray = Array.from(usedIcons).sort();
info(`📋 Found ${iconArray.length} unique icons across codebase`);
return iconArray;
}
@@ -102,7 +102,7 @@ async function main() {
const existingSet = JSON.parse(fs.readFileSync(outputPath, 'utf8'));
const existingIcons = Object.keys(existingSet.icons || {}).sort();
const currentIcons = [...usedIcons].sort();
if (JSON.stringify(existingIcons) === JSON.stringify(currentIcons)) {
needsRegeneration = false;
info(`✅ Icon set already up-to-date (${usedIcons.length} icons, ${Math.round(fs.statSync(outputPath).size / 1024)}KB)`);
@@ -122,7 +122,7 @@ async function main() {
// Dynamic import of ES module
const { getIcons } = await import('@iconify/utils');
// Extract only our used icons from the full set
const extractedIcons = getIcons(icons, usedIcons);
@@ -183,4 +183,4 @@ export default iconSet;
main().catch(error => {
console.error('❌ Script failed:', error);
process.exit(1);
-});
\ No newline at end of file
+});
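
The icon generator itself is Node.js; a rough Python equivalent of the `LocalIcon` scan it performs (the regex is copied from the hunk, everything else is assumed):

```python
import re
from pathlib import Path

# Pattern from the hunk above: <LocalIcon ... icon="icon-name" ...>
LOCAL_ICON = re.compile(r'<LocalIcon\s+[^>]*icon="([^"]+)"')

def scan_icons(src_dir: str) -> list[str]:
    """Collect unique icon names used in .ts/.tsx files under src_dir."""
    used: set[str] = set()
    for path in Path(src_dir).rglob("*"):
        if path.suffix in {".ts", ".tsx"}:
            used.update(LOCAL_ICON.findall(path.read_text(encoding="utf-8")))
    return sorted(used)
```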

View File

@@ -9,10 +9,10 @@ The script prints size and font statistics so we can confirm whether the
lightweight export (no COS dictionaries) is active and how large the font
payloads are.
"""
from __future__ import annotations
import argparse
import base64
import json
import math
from pathlib import Path
@@ -105,7 +105,11 @@ def analyze_fonts(fonts: Iterable[Dict[str, Any]]) -> FontBreakdown:
sample_cos_ids.append((font_id, uid))
metadata_bytes += approx_struct_size(
{k: v for k, v in font.items() if k not in {"program", "webProgram", "pdfProgram"}}
{
k: v
for k, v in font.items()
if k not in {"program", "webProgram", "pdfProgram"}
}
)
program = font.get("program")
@@ -259,18 +263,14 @@ def main() -> None:
f" Text payload characters (not counting JSON overhead): "
f"{page_stats.text_payload_chars:,}"
)
print(
f" Approx text structure bytes: {human_bytes(page_stats.text_struct_bytes)}"
)
print(f" Approx text structure bytes: {human_bytes(page_stats.text_struct_bytes)}")
print(
f" Approx image structure bytes: {human_bytes(page_stats.image_struct_bytes)}"
)
print(
f" Approx content stream bytes: {human_bytes(page_stats.content_stream_bytes)}"
)
print(
f" Approx annotations bytes: {human_bytes(page_stats.annotations_bytes)}"
)
print(f" Approx annotations bytes: {human_bytes(page_stats.annotations_bytes)}")
if __name__ == "__main__":
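
The size report above goes through a `human_bytes` helper that this hunk doesn't show; a plausible stand-in, assuming conventional binary units:

```python
import math

def human_bytes(size: int) -> str:
    """e.g. 1536 -> '1.5 KiB' (assumed behaviour; the real helper isn't shown)."""
    if size <= 0:
        return "0 B"
    units = ["B", "KiB", "MiB", "GiB", "TiB"]
    exp = min(int(math.log(size, 1024)), len(units) - 1)
    return f"{size / 1024 ** exp:.1f} {units[exp]}"
```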

View File

@@ -3,6 +3,7 @@
Wrap raw CFF/Type1C data (extracted from PDFs) as OpenType-CFF for web compatibility.
Builds proper Unicode cmap from PDF ToUnicode data.
"""
import sys
import re
from pathlib import Path
@@ -13,6 +14,7 @@ from fontTools.ttLib.tables._c_m_a_p import cmap_format_4, cmap_format_12
from fontTools.ttLib.tables._n_a_m_e import NameRecord
from fontTools.ttLib.tables.O_S_2f_2 import Panose
def parse_unicode_mapping(mapping_path):
"""
Parse Unicode mapping (either JSON with CharCode→CID→GID→Unicode or raw ToUnicode CMap).
@@ -21,23 +23,27 @@ def parse_unicode_mapping(mapping_path):
dict[int, int]: GID → Unicode codepoint
"""
try:
with open(mapping_path, 'rb') as f:
data = f.read().decode('utf-8', errors='ignore')
with open(mapping_path, "rb") as f:
data = f.read().decode("utf-8", errors="ignore")
# Try parsing as JSON first (CID font with complete mapping)
if data.strip().startswith('{'):
if data.strip().startswith("{"):
import json
try:
mapping_data = json.loads(data)
if mapping_data.get('isCID'):
if mapping_data.get("isCID"):
# Build GID → Unicode mapping from entries
gid_to_unicode = {}
for entry in mapping_data.get('entries', []):
gid = entry['gid']
unicode_val = entry['unicode']
for entry in mapping_data.get("entries", []):
gid = entry["gid"]
unicode_val = entry["unicode"]
if unicode_val > 0:
gid_to_unicode[gid] = unicode_val
print(f"Parsed JSON mapping: {len(gid_to_unicode)} GID→Unicode entries", file=sys.stderr)
print(
f"Parsed JSON mapping: {len(gid_to_unicode)} GID→Unicode entries",
file=sys.stderr,
)
return gid_to_unicode
except json.JSONDecodeError:
pass
@@ -47,7 +53,7 @@ def parse_unicode_mapping(mapping_path):
gid_to_unicode = {}
# Pattern for bfchar entries
bfchar_pattern = r'<([0-9A-Fa-f]+)>\s*<([0-9A-Fa-f]+)>'
bfchar_pattern = r"<([0-9A-Fa-f]+)>\s*<([0-9A-Fa-f]+)>"
for match in re.finditer(bfchar_pattern, data):
gid = int(match.group(1), 16) # For non-CID, char code == GID
unicode_val = int(match.group(2), 16)
@@ -55,7 +61,7 @@ def parse_unicode_mapping(mapping_path):
gid_to_unicode[gid] = unicode_val
# Pattern for bfrange entries
bfrange_pattern = r'<([0-9A-Fa-f]+)>\s*<([0-9A-Fa-f]+)>\s*<([0-9A-Fa-f]+)>'
bfrange_pattern = r"<([0-9A-Fa-f]+)>\s*<([0-9A-Fa-f]+)>\s*<([0-9A-Fa-f]+)>"
for match in re.finditer(bfrange_pattern, data):
start_gid = int(match.group(1), 16)
end_gid = int(match.group(2), 16)
@@ -72,6 +78,7 @@ def parse_unicode_mapping(mapping_path):
print(f"Warning: Failed to parse Unicode mapping: {e}", file=sys.stderr)
return {}
def wrap_cff_as_otf(input_path, output_path, tounicode_path=None):
"""
Wrap raw CFF data (from PDF font stream) as OpenType-CFF.
@@ -86,7 +93,7 @@ def wrap_cff_as_otf(input_path, output_path, tounicode_path=None):
"""
try:
# Read raw CFF data
with open(input_path, 'rb') as f:
with open(input_path, "rb") as f:
cff_data = f.read()
# Parse raw CFF data
@@ -106,29 +113,35 @@ def wrap_cff_as_otf(input_path, output_path, tounicode_path=None):
gid_to_unicode = parse_unicode_mapping(tounicode_path)
# Create a new OTF font
otf = TTFont(sfntVersion='OTTO') # 'OTTO' = CFF-flavored OpenType
otf = TTFont(sfntVersion="OTTO") # 'OTTO' = CFF-flavored OpenType
# Get glyph names
if hasattr(cff_font, 'charset') and cff_font.charset is not None:
glyph_order = ['.notdef'] + [name for name in cff_font.charset if name != '.notdef']
if hasattr(cff_font, "charset") and cff_font.charset is not None:
glyph_order = [".notdef"] + [
name for name in cff_font.charset if name != ".notdef"
]
else:
# Fallback to CharStrings keys
charstrings = cff_font.CharStrings
glyph_order = ['.notdef'] + [name for name in charstrings.keys() if name != '.notdef']
glyph_order = [".notdef"] + [
name for name in charstrings.keys() if name != ".notdef"
]
otf.setGlyphOrder(glyph_order)
# === Add CFF table (the actual font outlines) ===
cff_table = newTable('CFF ')
cff_table = newTable("CFF ")
cff_table.cff = cff_fontset
otf['CFF '] = cff_table
otf["CFF "] = cff_table
# === Calculate metrics from CFF ===
charstrings = cff_font.CharStrings
# Get defaults from CFF Private dict
private_dict = getattr(cff_font, 'Private', None)
default_width = getattr(private_dict, 'defaultWidthX', 500) if private_dict else 500
private_dict = getattr(cff_font, "Private", None)
default_width = (
getattr(private_dict, "defaultWidthX", 500) if private_dict else 500
)
# Calculate bounding box, widths, and LSBs
x_min = 0
@@ -152,7 +165,7 @@ def wrap_cff_as_otf(input_path, output_path, tounicode_path=None):
cs = charstrings[glyph_name]
# Get width from charstring
if hasattr(cs, 'width'):
if hasattr(cs, "width"):
width = int(cs.width)
# Calculate bounds for LSB and bbox
@@ -181,7 +194,7 @@ def wrap_cff_as_otf(input_path, output_path, tounicode_path=None):
except:
pass # Some glyphs may not have outlines
except Exception as e:
except Exception:
pass # Use defaults
widths[glyph_name] = width
@@ -196,7 +209,7 @@ def wrap_cff_as_otf(input_path, output_path, tounicode_path=None):
units_per_em = 1000 # Standard for Type1/CFF
# === Create head table ===
head = newTable('head')
head = newTable("head")
head.tableVersion = 1.0
head.fontRevision = 1.0
head.checkSumAdjustment = 0
@@ -214,10 +227,10 @@ def wrap_cff_as_otf(input_path, output_path, tounicode_path=None):
head.indexToLocFormat = 0
head.glyphDataFormat = 0
head.lowestRecPPEM = 8
otf['head'] = head
otf["head"] = head
# === Create hhea table with correct metrics ===
hhea = newTable('hhea')
hhea = newTable("hhea")
hhea.tableVersion = 0x00010000
hhea.ascent = max(y_max, 800)
hhea.descent = min(y_min, -200)
@@ -235,27 +248,30 @@ def wrap_cff_as_otf(input_path, output_path, tounicode_path=None):
hhea.reserved3 = 0
hhea.metricDataFormat = 0
hhea.numberOfHMetrics = len(glyph_order)
otf['hhea'] = hhea
otf["hhea"] = hhea
# === Create hmtx table with correct LSBs ===
hmtx = newTable('hmtx')
hmtx = newTable("hmtx")
hmtx.metrics = {}
for glyph_name in glyph_order:
hmtx.metrics[glyph_name] = (widths.get(glyph_name, default_width), lsbs.get(glyph_name, 0))
otf['hmtx'] = hmtx
hmtx.metrics[glyph_name] = (
widths.get(glyph_name, default_width),
lsbs.get(glyph_name, 0),
)
otf["hmtx"] = hmtx
# === Create maxp table (simpler for CFF) ===
maxp = newTable('maxp')
maxp = newTable("maxp")
maxp.tableVersion = 0x00005000 # CFF version (0.5)
maxp.numGlyphs = len(glyph_order)
otf['maxp'] = maxp
otf["maxp"] = maxp
# === Build Unicode cmap from GID→Unicode mapping ===
unicode_to_glyph = {}
if gid_to_unicode:
# Debug: Show first few glyph names to understand naming convention
sample_glyphs = glyph_order[:min(10, len(glyph_order))]
sample_glyphs = glyph_order[: min(10, len(glyph_order))]
print(f"Sample glyph names: {sample_glyphs}", file=sys.stderr)
# Debug: Show which GIDs we have mappings for
@@ -264,7 +280,9 @@ def wrap_cff_as_otf(input_path, output_path, tounicode_path=None):
# For CID fonts: glyph names are "cid00123" (5-digit zero-padded)
# For non-CID fonts: glyph names vary but GID == array index
is_cid_font = any(gn.startswith('cid') for gn in glyph_order[1:6]) # Check first few non-.notdef glyphs
is_cid_font = any(
gn.startswith("cid") for gn in glyph_order[1:6]
) # Check first few non-.notdef glyphs
for gid, unicode_val in gid_to_unicode.items():
if unicode_val > 0:
@@ -285,18 +303,21 @@ def wrap_cff_as_otf(input_path, output_path, tounicode_path=None):
glyph_name = glyph_order[gid]
unicode_to_glyph[unicode_val] = glyph_name
print(f"Mapped {len(unicode_to_glyph)} Unicode codepoints (isCID={is_cid_font if gid_to_unicode else 'unknown'})", file=sys.stderr)
print(
f"Mapped {len(unicode_to_glyph)} Unicode codepoints (isCID={is_cid_font if gid_to_unicode else 'unknown'})",
file=sys.stderr,
)
# Also try to map from glyph names (uni0041 → U+0041)
for glyph_name in glyph_order:
if glyph_name.startswith('uni') and len(glyph_name) == 7:
if glyph_name.startswith("uni") and len(glyph_name) == 7:
try:
unicode_val = int(glyph_name[3:], 16)
if unicode_val not in unicode_to_glyph:
unicode_to_glyph[unicode_val] = glyph_name
except:
pass
elif glyph_name.startswith('u') and len(glyph_name) >= 5:
elif glyph_name.startswith("u") and len(glyph_name) >= 5:
try:
unicode_val = int(glyph_name[1:], 16)
if unicode_val not in unicode_to_glyph:
@@ -305,14 +326,14 @@ def wrap_cff_as_otf(input_path, output_path, tounicode_path=None):
pass
# === Create cmap table ===
cmap = newTable('cmap')
cmap = newTable("cmap")
cmap.tableVersion = 0
cmap_tables = []
# Windows Unicode BMP (format 4) - required
cmap4_win = cmap_format_4(4)
cmap4_win.platformID = 3 # Windows
cmap4_win.platEncID = 1 # Unicode BMP
cmap4_win.platEncID = 1 # Unicode BMP
cmap4_win.language = 0
cmap4_win.cmap = {cp: gn for cp, gn in unicode_to_glyph.items() if cp <= 0xFFFF}
cmap_tables.append(cmap4_win)
@@ -329,23 +350,27 @@ def wrap_cff_as_otf(input_path, output_path, tounicode_path=None):
# Mac Unicode (format 4) - for compatibility
cmap4_mac = cmap_format_4(4)
cmap4_mac.platformID = 1 # Mac
cmap4_mac.platEncID = 0 # Roman
cmap4_mac.platEncID = 0 # Roman
cmap4_mac.language = 0
cmap4_mac.cmap = {cp: gn for cp, gn in unicode_to_glyph.items() if cp <= 0xFFFF}
cmap_tables.append(cmap4_mac)
cmap.tables = [t for t in cmap_tables if t.cmap] or [cmap4_win] # Ensure at least one
otf['cmap'] = cmap
cmap.tables = [t for t in cmap_tables if t.cmap] or [
cmap4_win
] # Ensure at least one
otf["cmap"] = cmap
print(f"Built cmap with {len(unicode_to_glyph)} Unicode mappings", file=sys.stderr)
print(
f"Built cmap with {len(unicode_to_glyph)} Unicode mappings", file=sys.stderr
)
# === Create OS/2 table with correct metrics ===
os2 = newTable('OS/2')
os2 = newTable("OS/2")
os2.version = 4
os2.xAvgCharWidth = int(sum(widths.values()) / len(widths)) if widths else 500
os2.usWeightClass = 400 # Normal
os2.usWidthClass = 5 # Medium
os2.fsType = 0 # Installable embedding
os2.usWidthClass = 5 # Medium
os2.fsType = 0 # Installable embedding
os2.ySubscriptXSize = 650
os2.ySubscriptYSize = 600
os2.ySubscriptXOffset = 0
@@ -375,7 +400,7 @@ def wrap_cff_as_otf(input_path, output_path, tounicode_path=None):
os2.ulUnicodeRange2 = 0
os2.ulUnicodeRange3 = 0
os2.ulUnicodeRange4 = 0
os2.achVendID = 'SPDF'
os2.achVendID = "SPDF"
os2.fsSelection = 0x0040 # REGULAR bit
# Set character index range from actual cmap
@@ -385,7 +410,7 @@ def wrap_cff_as_otf(input_path, output_path, tounicode_path=None):
os2.usLastCharIndex = codepoints[-1]
else:
os2.usFirstCharIndex = 0x20 # space
os2.usLastCharIndex = 0x7E # tilde
os2.usLastCharIndex = 0x7E # tilde
# Typo metrics match hhea
os2.sTypoAscender = hhea.ascent
@@ -403,10 +428,10 @@ def wrap_cff_as_otf(input_path, output_path, tounicode_path=None):
os2.usDefaultChar = 0
os2.usBreakChar = 32
os2.usMaxContext = 0
otf['OS/2'] = os2
otf["OS/2"] = os2
# === Create name table with Windows and Mac records ===
name = newTable('name')
name = newTable("name")
name.names = []
# Get font name from CFF if available
@@ -418,7 +443,7 @@ def wrap_cff_as_otf(input_path, output_path, tounicode_path=None):
3: f"Stirling-PDF: {font_name}", # Unique ID
4: font_name, # Full Name
5: "Version 1.0", # Version
6: font_name.replace(' ', '-'), # PostScript Name
6: font_name.replace(" ", "-"), # PostScript Name
}
# Add both Windows and Mac name records
@@ -441,10 +466,10 @@ def wrap_cff_as_otf(input_path, output_path, tounicode_path=None):
rec_mac.string = value
name.names.append(rec_mac)
otf['name'] = name
otf["name"] = name
# === Create post table (format 3.0 for smaller web fonts) ===
post = newTable('post')
post = newTable("post")
post.formatType = 3.0 # No glyph names (smaller, web-optimized)
post.italicAngle = 0
post.underlinePosition = -100
@@ -454,7 +479,7 @@ def wrap_cff_as_otf(input_path, output_path, tounicode_path=None):
post.maxMemType42 = 0
post.minMemType1 = 0
post.maxMemType1 = 0
otf['post'] = post
otf["post"] = post
# Save the OTF font
otf.save(output_path)
@@ -465,12 +490,17 @@ def wrap_cff_as_otf(input_path, output_path, tounicode_path=None):
except Exception as e:
print(f"ERROR: Conversion failed: {str(e)}", file=sys.stderr)
import traceback
traceback.print_exc(file=sys.stderr)
return False
def main():
if len(sys.argv) < 3:
print("Usage: convert_cff_to_ttf.py <input.cff> <output.otf> [tounicode.cmap]", file=sys.stderr)
print(
"Usage: convert_cff_to_ttf.py <input.cff> <output.otf> [tounicode.cmap]",
file=sys.stderr,
)
sys.exit(1)
input_path = Path(sys.argv[1])
@@ -485,8 +515,13 @@ def main():
print(f"Warning: ToUnicode file not found: {tounicode_path}", file=sys.stderr)
tounicode_path = None
success = wrap_cff_as_otf(str(input_path), str(output_path), str(tounicode_path) if tounicode_path else None)
success = wrap_cff_as_otf(
str(input_path),
str(output_path),
str(tounicode_path) if tounicode_path else None,
)
sys.exit(0 if success else 1)
if __name__ == '__main__':
if __name__ == "__main__":
main()
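
As a hypothetical smoke test for the converter above (paths are placeholders; the table list matches the tables the hunks build):

```python
from fontTools.ttLib import TTFont
from convert_cff_to_ttf import wrap_cff_as_otf  # module name per its usage string

# Wrap a raw CFF blob and confirm the required OpenType tables were written.
ok = wrap_cff_as_otf("font.cff", "font.otf", "font.cmap")
assert ok
otf = TTFont("font.otf")
assert otf.sfntVersion == "OTTO"  # CFF-flavoured OpenType
for tag in ("CFF ", "head", "hhea", "hmtx", "maxp", "cmap", "OS/2", "name", "post"):
    assert tag in otf  # every table the converter builds should be present
```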

View File

@@ -27,7 +27,7 @@ import os
import re
import sys
from pathlib import Path
from typing import Iterable, List, Optional, Set, Tuple
from typing import List, Optional, Set, Tuple
from urllib.parse import unquote, urlparse
import requests
@@ -121,10 +121,10 @@ def build_filename(url: str, output_dir: Path) -> Path:
def download_pdf(
url: str,
output_dir: Path,
timeout: int,
overwrite: bool,
url: str,
output_dir: Path,
timeout: int,
overwrite: bool,
) -> Tuple[str, Optional[Path], Optional[str]]:
try:
dest = build_filename(url, output_dir)
@@ -139,8 +139,12 @@ def download_pdf(
# Peek into the first bytes to be safe
peek = response.raw.read(5, decode_content=True)
if not peek.startswith(b"%PDF"):
return url, None, f"Skipping non-PDF content-type ({content_type or 'unknown'})"
content = peek + response.content[len(peek):]
return (
url,
None,
f"Skipping non-PDF content-type ({content_type or 'unknown'})",
)
content = peek + response.content[len(peek) :]
else:
content = response.content
@@ -157,7 +161,9 @@ def main() -> None:
output_dir = Path(args.output_dir).resolve()
output_dir.mkdir(parents=True, exist_ok=True)
print(f"Downloading {len(urls)} PDFs to {output_dir} using {args.workers} workers...")
print(
f"Downloading {len(urls)} PDFs to {output_dir} using {args.workers} workers..."
)
successes = 0
skipped = 0
@@ -184,7 +190,9 @@ def main() -> None:
print(f"[OK] {url} -> {path}")
print()
print(f"Completed. Success: {successes}, Skipped: {skipped}, Failures: {len(failures)}")
print(
f"Completed. Success: {successes}, Skipped: {skipped}, Failures: {len(failures)}"
)
if failures:
print("Failures:")
for url, error in failures:
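
The peek above guards against servers that mislabel `Content-Type`; the same check isolated as a self-contained helper (function name is mine):

```python
import requests

def looks_like_pdf(url: str, timeout: int = 30) -> bool:
    """Check the %PDF magic bytes instead of trusting Content-Type alone."""
    with requests.get(url, stream=True, timeout=timeout) as response:
        response.raise_for_status()
        # Read only the first five bytes of the decoded stream.
        peek = response.raw.read(5, decode_content=True)
        return peek.startswith(b"%PDF")
```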

View File

@@ -28,13 +28,15 @@ import shlex
import subprocess
import sys
from pathlib import Path
from typing import Dict, Iterable, List, Optional, Sequence, Tuple
from typing import Dict, List, Sequence, Tuple
REPO_ROOT = Path(__file__).resolve().parents[1]
def parse_args() -> argparse.Namespace:
parser = argparse.ArgumentParser(description="Bulk collect Type3 font signatures from PDFs.")
parser = argparse.ArgumentParser(
description="Bulk collect Type3 font signatures from PDFs."
)
parser.add_argument(
"--input",
nargs="+",
@@ -145,7 +147,7 @@ def run_signature_tool(
if pretty:
args += " --pretty"
# Use shell invocation so the quoted --args string is parsed correctly by Gradle.
cmd = f"{gradle_cmd} -q :proprietary:type3SignatureTool --args=\"{args}\""
cmd = f'{gradle_cmd} -q :proprietary:type3SignatureTool --args="{args}"'
completed = subprocess.run(
cmd,
shell=True,
@@ -207,11 +209,15 @@ def main() -> None:
try:
payload = load_signature_file(signature_path)
except Exception as exc:
print(f"[WARN] Failed to parse cached signature {signature_path}: {exc}")
print(
f"[WARN] Failed to parse cached signature {signature_path}: {exc}"
)
payload = None
else:
try:
run_signature_tool(args.gradle_cmd, pdf, signature_path, args.pretty, REPO_ROOT)
run_signature_tool(
args.gradle_cmd, pdf, signature_path, args.pretty, REPO_ROOT
)
except Exception as exc:
print(f"[ERROR] Harvest failed for {pdf}: {exc}", file=sys.stderr)
continue
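
The comment kept in the hunk explains the unusual `shell=True` call: Gradle needs the whole `--args` value as a single token. A stripped-down sketch of that invocation (the tool's flag names here are assumptions):

```python
import subprocess
from pathlib import Path

def run_signature_tool_sketch(gradle_cmd: str, pdf: Path, out: Path, repo_root: Path) -> None:
    # Gradle must receive the quoted --args string as one argument, so the
    # command is built as a single shell line rather than an argv list.
    tool_args = f"--input {pdf} --output {out}"
    cmd = f'{gradle_cmd} -q :proprietary:type3SignatureTool --args="{tool_args}"'
    subprocess.run(cmd, shell=True, check=True, cwd=repo_root)
```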

View File

@@ -1,5 +1,6 @@
#!/usr/bin/env python3
"""Build a Type3 font catalogue from sample PDFs."""
import argparse
import json
import subprocess

View File

@@ -18,7 +18,9 @@ from typing import Dict, List
def parse_args() -> argparse.Namespace:
parser = argparse.ArgumentParser(description="Summarize Type3 signature JSON dumps.")
parser = argparse.ArgumentParser(
description="Summarize Type3 signature JSON dumps."
)
parser.add_argument(
"--input",
default="docs/type3/signatures",
@@ -53,7 +55,9 @@ def load_signatures(directory: Path) -> Dict[str, List[dict]]:
return inventory
def write_markdown(inventory: Dict[str, List[dict]], output: Path, input_dir: Path) -> None:
def write_markdown(
inventory: Dict[str, List[dict]], output: Path, input_dir: Path
) -> None:
lines: List[str] = []
lines.append("# Type3 Signature Inventory")
lines.append("")
@@ -72,7 +76,9 @@ def write_markdown(inventory: Dict[str, List[dict]], output: Path, input_dir: Pa
for entry in entries:
signature = entry.get("signature") or ""
sample = Path(entry["source"]).name
glyph_count = entry.get("glyphCount") if entry.get("glyphCount") is not None else ""
glyph_count = (
entry.get("glyphCount") if entry.get("glyphCount") is not None else ""
)
coverage = entry.get("glyphCoverage") or []
preview = ", ".join(str(code) for code in coverage[:10])
lines.append(f"| `{signature}` | `{sample}` | {glyph_count} | {preview} |")

View File

@@ -7,10 +7,8 @@ TOML format only.
"""
import json
import os
import sys
from pathlib import Path
from typing import Dict, List, Set, Tuple, Any, Optional
from typing import Dict, List, Any
import argparse
import re
from datetime import datetime
@@ -27,7 +25,7 @@ class AITranslationHelper:
def _load_translation_file(self, file_path: Path) -> Dict:
"""Load TOML translation file."""
try:
with open(file_path, 'rb') as f:
with open(file_path, "rb") as f:
return tomllib.load(f)
except (FileNotFoundError, Exception) as e:
print(f"Error loading {file_path}: {e}")
@@ -35,27 +33,31 @@ class AITranslationHelper:
def _save_translation_file(self, data: Dict, file_path: Path) -> None:
"""Save TOML translation file."""
with open(file_path, 'wb') as f:
with open(file_path, "wb") as f:
tomli_w.dump(data, f)
def create_ai_batch_file(self, languages: List[str], output_file: Path,
max_entries_per_language: int = 50) -> None:
def create_ai_batch_file(
self,
languages: List[str],
output_file: Path,
max_entries_per_language: int = 50,
) -> None:
"""Create a batch file for AI translation with multiple languages."""
golden_truth = self._load_translation_file(self.golden_truth_file)
batch_data = {
'metadata': {
'created_at': datetime.now().isoformat(),
'source_language': 'en-GB',
'target_languages': languages,
'max_entries_per_language': max_entries_per_language,
'instructions': {
'format': 'Translate each entry maintaining JSON structure and placeholder variables like {n}, {total}, {filename}',
'context': 'This is for a PDF manipulation tool. Keep technical terms consistent.',
'placeholders': 'Preserve all placeholders: {n}, {total}, {filename}, etc.',
'style': 'Keep translations concise and user-friendly'
}
"metadata": {
"created_at": datetime.now().isoformat(),
"source_language": "en-GB",
"target_languages": languages,
"max_entries_per_language": max_entries_per_language,
"instructions": {
"format": "Translate each entry maintaining JSON structure and placeholder variables like {n}, {total}, {filename}",
"context": "This is for a PDF manipulation tool. Keep technical terms consistent.",
"placeholders": "Preserve all placeholders: {n}, {total}, {filename}, etc.",
"style": "Keep translations concise and user-friendly",
},
},
'translations': {}
"translations": {},
}
for lang in languages:
@@ -72,41 +74,57 @@ class AITranslationHelper:
untranslated = self._find_untranslated_entries(golden_truth, lang_data)
# Limit entries if specified
if max_entries_per_language and len(untranslated) > max_entries_per_language:
if (
max_entries_per_language
and len(untranslated) > max_entries_per_language
):
# Prioritize by key importance
untranslated = self._prioritize_translation_keys(untranslated, max_entries_per_language)
untranslated = self._prioritize_translation_keys(
untranslated, max_entries_per_language
)
batch_data['translations'][lang] = {}
batch_data["translations"][lang] = {}
for key, value in untranslated.items():
batch_data['translations'][lang][key] = {
'original': value,
'translated': '', # AI fills this
'context': self._get_key_context(key)
batch_data["translations"][lang][key] = {
"original": value,
"translated": "", # AI fills this
"context": self._get_key_context(key),
}
# Always save batch files as JSON for compatibility
with open(output_file, 'w', encoding='utf-8') as f:
with open(output_file, "w", encoding="utf-8") as f:
json.dump(batch_data, f, indent=2, ensure_ascii=False)
total_entries = sum(len(lang_data) for lang_data in batch_data['translations'].values())
total_entries = sum(
len(lang_data) for lang_data in batch_data["translations"].values()
)
print(f"Created AI batch file: {output_file}")
print(f"Total entries to translate: {total_entries}")
def _find_untranslated_entries(self, golden_truth: Dict, lang_data: Dict) -> Dict[str, str]:
def _find_untranslated_entries(
self, golden_truth: Dict, lang_data: Dict
) -> Dict[str, str]:
"""Find entries that need translation."""
golden_flat = self._flatten_dict(golden_truth)
lang_flat = self._flatten_dict(lang_data)
untranslated = {}
for key, value in golden_flat.items():
if (key not in lang_flat or
lang_flat[key] == value or
(isinstance(lang_flat[key], str) and lang_flat[key].startswith("[UNTRANSLATED]"))):
if (
key not in lang_flat
or lang_flat[key] == value
or (
isinstance(lang_flat[key], str)
and lang_flat[key].startswith("[UNTRANSLATED]")
)
):
if not self._is_expected_identical(key, value):
untranslated[key] = value
return untranslated
def _flatten_dict(self, d: Dict, parent_key: str = '', separator: str = '.') -> Dict[str, Any]:
def _flatten_dict(
self, d: Dict, parent_key: str = "", separator: str = "."
) -> Dict[str, Any]:
"""Flatten nested dictionary."""
items = []
for k, v in d.items():
@@ -119,25 +137,27 @@ class AITranslationHelper:
def _is_expected_identical(self, key: str, value: str) -> bool:
"""Check if key should be identical across languages."""
if str(value).strip() in ['ltr', 'rtl', 'True', 'False', 'true', 'false']:
if str(value).strip() in ["ltr", "rtl", "True", "False", "true", "false"]:
return True
return 'language.direction' in key.lower()
return "language.direction" in key.lower()
def _prioritize_translation_keys(self, untranslated: Dict[str, str], max_count: int) -> Dict[str, str]:
def _prioritize_translation_keys(
self, untranslated: Dict[str, str], max_count: int
) -> Dict[str, str]:
"""Prioritize which keys to translate first based on importance."""
# Define priority order (higher score = higher priority)
priority_patterns = [
('title', 10),
('header', 9),
('submit', 8),
('selectText', 7),
('prompt', 6),
('desc', 5),
('error', 8),
('warning', 7),
('save', 8),
('download', 8),
('upload', 7),
("title", 10),
("header", 9),
("submit", 8),
("selectText", 7),
("prompt", 6),
("desc", 5),
("error", 8),
("warning", 7),
("save", 8),
("download", 8),
("upload", 7),
]
scored_keys = []
@@ -154,89 +174,99 @@ class AITranslationHelper:
def _get_key_context(self, key: str) -> str:
"""Get contextual information for a translation key."""
parts = key.split('.')
parts = key.split(".")
contexts = {
'addPageNumbers': 'Feature for adding page numbers to PDFs',
'compress': 'PDF compression functionality',
'merge': 'PDF merging functionality',
'split': 'PDF splitting functionality',
'rotate': 'PDF rotation functionality',
'convert': 'File conversion functionality',
'security': 'PDF security and permissions',
'metadata': 'PDF metadata editing',
'watermark': 'Adding watermarks to PDFs',
'overlay': 'PDF overlay functionality',
'extract': 'Extracting content from PDFs'
"addPageNumbers": "Feature for adding page numbers to PDFs",
"compress": "PDF compression functionality",
"merge": "PDF merging functionality",
"split": "PDF splitting functionality",
"rotate": "PDF rotation functionality",
"convert": "File conversion functionality",
"security": "PDF security and permissions",
"metadata": "PDF metadata editing",
"watermark": "Adding watermarks to PDFs",
"overlay": "PDF overlay functionality",
"extract": "Extracting content from PDFs",
}
if len(parts) > 0:
main_section = parts[0]
context = contexts.get(main_section, f'Part of {main_section} functionality')
context = contexts.get(
main_section, f"Part of {main_section} functionality"
)
if len(parts) > 1:
context += f', specifically for {parts[-1]}'
context += f", specifically for {parts[-1]}"
return context
return 'General application text'
return "General application text"
def validate_ai_translations(self, batch_file: Path) -> Dict[str, List[str]]:
"""Validate AI translations for common issues."""
# Batch files are always JSON
with open(batch_file, 'r', encoding='utf-8') as f:
with open(batch_file, "r", encoding="utf-8") as f:
batch_data = json.load(f)
issues = {'errors': [], 'warnings': []}
issues = {"errors": [], "warnings": []}
for lang, translations in batch_data.get('translations', {}).items():
for lang, translations in batch_data.get("translations", {}).items():
for key, translation_data in translations.items():
original = translation_data.get('original', '')
translated = translation_data.get('translated', '')
original = translation_data.get("original", "")
translated = translation_data.get("translated", "")
if not translated:
issues['errors'].append(f"{lang}.{key}: Missing translation")
issues["errors"].append(f"{lang}.{key}: Missing translation")
continue
# Check for placeholder preservation
original_placeholders = re.findall(r'\{[^}]+\}', original)
translated_placeholders = re.findall(r'\{[^}]+\}', translated)
original_placeholders = re.findall(r"\{[^}]+\}", original)
translated_placeholders = re.findall(r"\{[^}]+\}", translated)
if set(original_placeholders) != set(translated_placeholders):
issues['warnings'].append(
issues["warnings"].append(
f"{lang}.{key}: Placeholder mismatch - Original: {original_placeholders}, "
f"Translated: {translated_placeholders}"
)
# Check if translation is identical to original (might be untranslated)
if translated == original and not self._is_expected_identical(key, original):
issues['warnings'].append(f"{lang}.{key}: Translation identical to original")
if translated == original and not self._is_expected_identical(
key, original
):
issues["warnings"].append(
f"{lang}.{key}: Translation identical to original"
)
# Check for common AI translation artifacts
artifacts = ['[TRANSLATE]', '[TODO]', 'UNTRANSLATED', '{{', '}}']
artifacts = ["[TRANSLATE]", "[TODO]", "UNTRANSLATED", "{{", "}}"]
for artifact in artifacts:
if artifact in translated:
issues['errors'].append(f"{lang}.{key}: Contains translation artifact: {artifact}")
issues["errors"].append(
f"{lang}.{key}: Contains translation artifact: {artifact}"
)
return issues
def apply_ai_batch_translations(self, batch_file: Path, validate: bool = True) -> Dict[str, Any]:
def apply_ai_batch_translations(
self, batch_file: Path, validate: bool = True
) -> Dict[str, Any]:
"""Apply translations from AI batch file to individual language files."""
# Batch files are always JSON
with open(batch_file, 'r', encoding='utf-8') as f:
with open(batch_file, "r", encoding="utf-8") as f:
batch_data = json.load(f)
results = {'applied': {}, 'errors': [], 'warnings': []}
results = {"applied": {}, "errors": [], "warnings": []}
if validate:
validation_issues = self.validate_ai_translations(batch_file)
if validation_issues['errors']:
if validation_issues["errors"]:
print("Validation errors found. Fix these before applying:")
for error in validation_issues['errors']:
for error in validation_issues["errors"]:
print(f" ERROR: {error}")
return results
if validation_issues['warnings']:
if validation_issues["warnings"]:
print("Validation warnings (review recommended):")
for warning in validation_issues['warnings'][:10]:
for warning in validation_issues["warnings"][:10]:
print(f" WARNING: {warning}")
for lang, translations in batch_data.get('translations', {}).items():
for lang, translations in batch_data.get("translations", {}).items():
lang_dir = self.locales_dir / lang
toml_file = lang_dir / "translation.toml"
@@ -249,42 +279,48 @@ class AITranslationHelper:
applied_count = 0
for key, translation_data in translations.items():
translated = translation_data.get('translated', '').strip()
if translated and translated != translation_data.get('original', ''):
translated = translation_data.get("translated", "").strip()
if translated and translated != translation_data.get("original", ""):
self._set_nested_value(lang_data, key, translated)
applied_count += 1
if applied_count > 0:
self._save_translation_file(lang_data, toml_file)
results['applied'][lang] = applied_count
results["applied"][lang] = applied_count
print(f"Applied {applied_count} translations to {lang}")
return results
def _set_nested_value(self, data: Dict, key_path: str, value: Any) -> None:
"""Set value in nested dict using dot notation."""
keys = key_path.split('.')
keys = key_path.split(".")
current = data
for key in keys[:-1]:
if key not in current:
current[key] = {}
elif not isinstance(current[key], dict):
# If the current value is not a dict, we can't nest into it
print(f"Warning: Converting non-dict value at '{key}' to dict to allow nesting")
print(
f"Warning: Converting non-dict value at '{key}' to dict to allow nesting"
)
current[key] = {}
current = current[key]
current[keys[-1]] = value
def export_for_external_translation(self, languages: List[str], output_format: str = 'csv') -> None:
def export_for_external_translation(
self, languages: List[str], output_format: str = "csv"
) -> None:
"""Export translations for external translation services."""
golden_truth = self._load_translation_file(self.golden_truth_file)
golden_flat = self._flatten_dict(golden_truth)
if output_format == 'csv':
output_file = Path(f'translations_export_{datetime.now().strftime("%Y%m%d")}.csv')
if output_format == "csv":
output_file = Path(
f"translations_export_{datetime.now().strftime('%Y%m%d')}.csv"
)
with open(output_file, 'w', newline='', encoding='utf-8') as csvfile:
fieldnames = ['key', 'context', 'en_GB'] + languages
with open(output_file, "w", newline="", encoding="utf-8") as csvfile:
fieldnames = ["key", "context", "en_GB"] + languages
writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
writer.writeheader()
@@ -293,9 +329,9 @@ class AITranslationHelper:
continue
row = {
'key': key,
'context': self._get_key_context(key),
'en_GB': en_value
"key": key,
"context": self._get_key_context(key),
"en_GB": en_value,
}
for lang in languages:
@@ -305,28 +341,30 @@ class AITranslationHelper:
if toml_file.exists():
lang_data = self._load_translation_file(toml_file)
lang_flat = self._flatten_dict(lang_data)
value = lang_flat.get(key, '')
if value.startswith('[UNTRANSLATED]'):
value = ''
value = lang_flat.get(key, "")
if value.startswith("[UNTRANSLATED]"):
value = ""
row[lang] = value
else:
row[lang] = ''
row[lang] = ""
writer.writerow(row)
print(f"Exported to {output_file}")
elif output_format == 'json':
output_file = Path(f'translations_export_{datetime.now().strftime("%Y%m%d")}.json')
export_data = {'languages': languages, 'translations': {}}
elif output_format == "json":
output_file = Path(
f"translations_export_{datetime.now().strftime('%Y%m%d')}.json"
)
export_data = {"languages": languages, "translations": {}}
for key, en_value in golden_flat.items():
if self._is_expected_identical(key, en_value):
continue
export_data['translations'][key] = {
'en_GB': en_value,
'context': self._get_key_context(key)
export_data["translations"][key] = {
"en_GB": en_value,
"context": self._get_key_context(key),
}
for lang in languages:
@@ -336,51 +374,64 @@ class AITranslationHelper:
if toml_file.exists():
lang_data = self._load_translation_file(toml_file)
lang_flat = self._flatten_dict(lang_data)
value = lang_flat.get(key, '')
if value.startswith('[UNTRANSLATED]'):
value = ''
export_data['translations'][key][lang] = value
value = lang_flat.get(key, "")
if value.startswith("[UNTRANSLATED]"):
value = ""
export_data["translations"][key][lang] = value
# Export files are always JSON
with open(output_file, 'w', encoding='utf-8') as f:
with open(output_file, "w", encoding="utf-8") as f:
json.dump(export_data, f, indent=2, ensure_ascii=False)
print(f"Exported to {output_file}")
def main():
parser = argparse.ArgumentParser(
description='AI Translation Helper',
epilog='Works with TOML translation files.'
description="AI Translation Helper", epilog="Works with TOML translation files."
)
parser.add_argument(
"--locales-dir",
default="frontend/public/locales",
help="Path to locales directory",
)
parser.add_argument('--locales-dir', default='frontend/public/locales',
help='Path to locales directory')
subparsers = parser.add_subparsers(dest='command', help='Available commands')
subparsers = parser.add_subparsers(dest="command", help="Available commands")
# Create batch command
batch_parser = subparsers.add_parser('create-batch', help='Create AI translation batch file')
batch_parser.add_argument('--languages', nargs='+', required=True,
help='Language codes to include')
batch_parser.add_argument('--output', required=True, help='Output batch file')
batch_parser.add_argument('--max-entries', type=int, default=100,
help='Max entries per language')
batch_parser = subparsers.add_parser(
"create-batch", help="Create AI translation batch file"
)
batch_parser.add_argument(
"--languages", nargs="+", required=True, help="Language codes to include"
)
batch_parser.add_argument("--output", required=True, help="Output batch file")
batch_parser.add_argument(
"--max-entries", type=int, default=100, help="Max entries per language"
)
# Validate command
validate_parser = subparsers.add_parser('validate', help='Validate AI translations')
validate_parser.add_argument('batch_file', help='Batch file to validate')
validate_parser = subparsers.add_parser("validate", help="Validate AI translations")
validate_parser.add_argument("batch_file", help="Batch file to validate")
# Apply command
apply_parser = subparsers.add_parser('apply-batch', help='Apply AI batch translations')
apply_parser.add_argument('batch_file', help='Batch file with translations')
apply_parser.add_argument('--skip-validation', action='store_true',
help='Skip validation before applying')
apply_parser = subparsers.add_parser(
"apply-batch", help="Apply AI batch translations"
)
apply_parser.add_argument("batch_file", help="Batch file with translations")
apply_parser.add_argument(
"--skip-validation", action="store_true", help="Skip validation before applying"
)
# Export command
export_parser = subparsers.add_parser('export', help='Export for external translation')
export_parser.add_argument('--languages', nargs='+', required=True,
help='Language codes to export')
export_parser.add_argument('--format', choices=['csv', 'json'], default='csv',
help='Export format')
export_parser = subparsers.add_parser(
"export", help="Export for external translation"
)
export_parser.add_argument(
"--languages", nargs="+", required=True, help="Language codes to export"
)
export_parser.add_argument(
"--format", choices=["csv", "json"], default="csv", help="Export format"
)
args = parser.parse_args()
@@ -390,40 +441,39 @@ def main():
helper = AITranslationHelper(args.locales_dir)
if args.command == 'create-batch':
if args.command == "create-batch":
output_file = Path(args.output)
helper.create_ai_batch_file(args.languages, output_file, args.max_entries)
elif args.command == 'validate':
elif args.command == "validate":
batch_file = Path(args.batch_file)
issues = helper.validate_ai_translations(batch_file)
if issues['errors']:
if issues["errors"]:
print("ERRORS:")
for error in issues['errors']:
for error in issues["errors"]:
print(f" - {error}")
if issues['warnings']:
if issues["warnings"]:
print("WARNINGS:")
for warning in issues['warnings']:
for warning in issues["warnings"]:
print(f" - {warning}")
if not issues['errors'] and not issues['warnings']:
if not issues["errors"] and not issues["warnings"]:
print("No validation issues found!")
elif args.command == 'apply-batch':
elif args.command == "apply-batch":
batch_file = Path(args.batch_file)
results = helper.apply_ai_batch_translations(
batch_file,
validate=not args.skip_validation
batch_file, validate=not args.skip_validation
)
total_applied = sum(results['applied'].values())
total_applied = sum(results["applied"].values())
print(f"Total translations applied: {total_applied}")
elif args.command == 'export':
elif args.command == "export":
helper.export_for_external_translation(args.languages, args.format)
if __name__ == "__main__":
-main()
\ No newline at end of file
+main()
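
For orientation: the batch files this helper reads and writes are plain JSON keyed language → key → `{original, translated, context}`, with untouched entries keeping an empty `translated`. A hypothetical inspection snippet:

```python
import json

with open("batch.json", encoding="utf-8") as f:  # placeholder path
    batch = json.load(f)

# Count how many entries the AI actually filled in per language.
for lang, entries in batch.get("translations", {}).items():
    done = sum(1 for e in entries.values() if e.get("translated"))
    print(f"{lang}: {done}/{len(entries)} translated")
```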

View File

@@ -19,9 +19,9 @@ import tomllib
def run_command(cmd, description=""):
"""Run a shell command and return success status."""
if description:
print(f"\n{'='*60}")
print(f"\n{'=' * 60}")
print(f"Step: {description}")
print(f"{'='*60}")
print(f"{'=' * 60}")
result = subprocess.run(cmd, shell=True, capture_output=True, text=True)
@@ -40,29 +40,35 @@ def find_translation_file(lang_dir):
return toml_file
return None
def load_translation_file(file_path):
"""Load TOML translation file."""
with open(file_path, 'rb') as f:
with open(file_path, "rb") as f:
return tomllib.load(f)
def extract_untranslated(language_code, batch_size=500, include_existing=False):
"""Extract untranslated entries and split into batches."""
mode = "all untranslated (including existing)" if include_existing else "new (missing)"
mode = (
"all untranslated (including existing)" if include_existing else "new (missing)"
)
print(f"\n🔍 Extracting {mode} entries for {language_code}...")
# Load files
golden_path = find_translation_file(Path('frontend/public/locales/en-GB'))
lang_path = find_translation_file(Path(f'frontend/public/locales/{language_code}'))
golden_path = find_translation_file(Path("frontend/public/locales/en-GB"))
lang_path = find_translation_file(Path(f"frontend/public/locales/{language_code}"))
if not golden_path:
print(f"Error: Golden truth file not found in frontend/public/locales/en-GB")
print("Error: Golden truth file not found in frontend/public/locales/en-GB")
return None
if not lang_path:
print(f"Error: Language file not found in frontend/public/locales/{language_code}")
print(
f"Error: Language file not found in frontend/public/locales/{language_code}"
)
return None
def flatten_dict(d, parent_key='', separator='.'):
def flatten_dict(d, parent_key="", separator="."):
items = []
for k, v in d.items():
new_key = f"{parent_key}{separator}{k}" if parent_key else k
@@ -76,7 +82,7 @@ def extract_untranslated(language_code, batch_size=500, include_existing=False):
lang_data = load_translation_file(lang_path)
if not golden or not lang_data:
print(f"Error: Failed to load translation files")
print("Error: Failed to load translation files")
return None
golden_flat = flatten_dict(golden)
@@ -87,9 +93,14 @@ def extract_untranslated(language_code, batch_size=500, include_existing=False):
for key, value in golden_flat.items():
if include_existing:
# Include missing keys, keys with English values, and [UNTRANSLATED] keys
if (key not in lang_flat or
lang_flat.get(key) == value or
(isinstance(lang_flat.get(key), str) and lang_flat.get(key).startswith("[UNTRANSLATED]"))):
if (
key not in lang_flat
or lang_flat.get(key) == value
or (
isinstance(lang_flat.get(key), str)
and lang_flat.get(key).startswith("[UNTRANSLATED]")
)
):
untranslated[key] = value
else:
# Only include missing keys (not in target file at all)
@@ -108,16 +119,16 @@ def extract_untranslated(language_code, batch_size=500, include_existing=False):
num_batches = (total + batch_size - 1) // batch_size
batch_files = []
lang_code_safe = language_code.replace('-', '_')
lang_code_safe = language_code.replace("-", "_")
for i in range(num_batches):
start = i * batch_size
end = min((i + 1) * batch_size, total)
batch = dict(entries[start:end])
filename = f'{lang_code_safe}_batch_{i+1}_of_{num_batches}.json'
with open(filename, 'w', encoding='utf-8') as f:
json.dump(batch, f, ensure_ascii=False, separators=(',', ':'))
filename = f"{lang_code_safe}_batch_{i + 1}_of_{num_batches}.json"
with open(filename, "w", encoding="utf-8") as f:
json.dump(batch, f, ensure_ascii=False, separators=(",", ":"))
batch_files.append(filename)
print(f" Created {filename} with {len(batch)} entries")
@@ -131,7 +142,7 @@ def translate_batches(batch_files, language_code, api_key, timeout=600):
return []
print(f"\n🤖 Translating {len(batch_files)} batches using GPT-5...")
print(f"Timeout: {timeout}s ({timeout//60} minutes) per batch")
print(f"Timeout: {timeout}s ({timeout // 60} minutes) per batch")
translated_files = []
@@ -142,7 +153,9 @@ def translate_batches(batch_files, language_code, api_key, timeout=600):
cmd = f'python3 scripts/translations/batch_translator.py "{batch_file}" --language {language_code} --api-key "{api_key}"'
# Run with timeout
result = subprocess.run(cmd, shell=True, capture_output=True, text=True, timeout=timeout)
result = subprocess.run(
cmd, shell=True, capture_output=True, text=True, timeout=timeout
)
if result.stdout:
print(result.stdout)
@@ -153,7 +166,7 @@ def translate_batches(batch_files, language_code, api_key, timeout=600):
print(f"✗ Failed to translate {batch_file}")
return None
translated_file = batch_file.replace('.json', '_translated.json')
translated_file = batch_file.replace(".json", "_translated.json")
translated_files.append(translated_file)
# Small delay between batches
@@ -177,14 +190,14 @@ def merge_translations(translated_files, language_code):
print(f"Error: Translated file not found: {filename}")
return None
with open(filename, 'r', encoding='utf-8') as f:
with open(filename, "r", encoding="utf-8") as f:
merged.update(json.load(f))
lang_code_safe = language_code.replace('-', '_')
merged_file = f'{lang_code_safe}_merged.json'
lang_code_safe = language_code.replace("-", "_")
merged_file = f"{lang_code_safe}_merged.json"
with open(merged_file, 'w', encoding='utf-8') as f:
json.dump(merged, f, ensure_ascii=False, separators=(',', ':'))
with open(merged_file, "w", encoding="utf-8") as f:
json.dump(merged, f, ensure_ascii=False, separators=(",", ":"))
print(f"✓ Merged {len(merged)} translations into {merged_file}")
return merged_file
@@ -194,13 +207,13 @@ def apply_translations(merged_file, language_code):
"""Apply merged translations to the language file."""
print(f"\n📝 Applying translations to {language_code}...")
cmd = f'python3 scripts/translations/translation_merger.py {language_code} apply-translations --translations-file {merged_file}'
cmd = f"python3 scripts/translations/translation_merger.py {language_code} apply-translations --translations-file {merged_file}"
if not run_command(cmd):
print(f"✗ Failed to apply translations")
print("✗ Failed to apply translations")
return False
print(f"✓ Translations applied successfully")
print("✓ Translations applied successfully")
return True
@@ -208,27 +221,25 @@ def beautify_translations(language_code):
"""Beautify translation file to match en-GB structure."""
print(f"\n✨ Beautifying {language_code} translation file...")
cmd = f'python3 scripts/translations/toml_beautifier.py --language {language_code}'
cmd = f"python3 scripts/translations/toml_beautifier.py --language {language_code}"
if not run_command(cmd):
print(f"✗ Failed to beautify translations")
print("✗ Failed to beautify translations")
return False
print(f"✓ Translation file beautified")
print("✓ Translation file beautified")
return True
def cleanup_temp_files(language_code):
"""Remove temporary batch files."""
print(f"\n🧹 Cleaning up temporary files...")
print("\n🧹 Cleaning up temporary files...")
lang_code_safe = language_code.replace('-', '_')
patterns = [
f'{lang_code_safe}_batch_*.json',
f'{lang_code_safe}_merged.json'
]
lang_code_safe = language_code.replace("-", "_")
patterns = [f"{lang_code_safe}_batch_*.json", f"{lang_code_safe}_merged.json"]
import glob
removed = 0
for pattern in patterns:
for file in glob.glob(pattern):
@@ -240,15 +251,15 @@ def cleanup_temp_files(language_code):
def verify_completion(language_code):
"""Check final completion percentage."""
print(f"\n📊 Verifying completion...")
print("\n📊 Verifying completion...")
cmd = f'python3 scripts/translations/translation_analyzer.py --language {language_code} --summary'
cmd = f"python3 scripts/translations/translation_analyzer.py --language {language_code} --summary"
run_command(cmd)
def main():
parser = argparse.ArgumentParser(
description='Automated translation pipeline for Stirling PDF',
description="Automated translation pipeline for Stirling PDF",
formatter_class=argparse.RawDescriptionHelpFormatter,
epilog="""
Note: This script works with TOML translation files.
@@ -266,36 +277,57 @@ Examples:
# Skip cleanup (keep temporary files for inspection)
python3 scripts/translations/auto_translate.py fr-FR --no-cleanup
"""
""",
)
parser.add_argument('language', help='Language code (e.g., es-ES, de-DE, zh-CN)')
parser.add_argument('--api-key', help='OpenAI API key (or set OPENAI_API_KEY env var)')
parser.add_argument('--batch-size', type=int, default=500, help='Entries per batch (default: 500)')
parser.add_argument('--no-cleanup', action='store_true', help='Keep temporary batch files')
parser.add_argument('--skip-verification', action='store_true', help='Skip final completion check')
parser.add_argument('--timeout', type=int, default=600, help='Timeout per batch in seconds (default: 600 = 10 minutes)')
parser.add_argument('--include-existing', action='store_true', help='Also retranslate existing keys that match English (default: only translate missing keys)')
parser.add_argument("language", help="Language code (e.g., es-ES, de-DE, zh-CN)")
parser.add_argument(
"--api-key", help="OpenAI API key (or set OPENAI_API_KEY env var)"
)
parser.add_argument(
"--batch-size", type=int, default=500, help="Entries per batch (default: 500)"
)
parser.add_argument(
"--no-cleanup", action="store_true", help="Keep temporary batch files"
)
parser.add_argument(
"--skip-verification", action="store_true", help="Skip final completion check"
)
parser.add_argument(
"--timeout",
type=int,
default=600,
help="Timeout per batch in seconds (default: 600 = 10 minutes)",
)
parser.add_argument(
"--include-existing",
action="store_true",
help="Also retranslate existing keys that match English (default: only translate missing keys)",
)
args = parser.parse_args()
# Verify API key
api_key = args.api_key or os.environ.get('OPENAI_API_KEY')
api_key = args.api_key or os.environ.get("OPENAI_API_KEY")
if not api_key:
print("Error: OpenAI API key required. Provide via --api-key or OPENAI_API_KEY environment variable")
print(
"Error: OpenAI API key required. Provide via --api-key or OPENAI_API_KEY environment variable"
)
sys.exit(1)
print("="*60)
print(f"Automated Translation Pipeline")
print("=" * 60)
print("Automated Translation Pipeline")
print(f"Language: {args.language}")
print(f"Batch Size: {args.batch_size} entries")
print("="*60)
print("=" * 60)
start_time = time.time()
try:
# Step 1: Extract and split
batch_files = extract_untranslated(args.language, args.batch_size, args.include_existing)
batch_files = extract_untranslated(
args.language, args.batch_size, args.include_existing
)
if batch_files is None:
sys.exit(1)
@@ -304,7 +336,9 @@ Examples:
sys.exit(0)
# Step 2: Translate all batches
translated_files = translate_batches(batch_files, args.language, api_key, args.timeout)
translated_files = translate_batches(
batch_files, args.language, api_key, args.timeout
)
if translated_files is None:
sys.exit(1)
@@ -330,10 +364,10 @@ Examples:
verify_completion(args.language)
elapsed = time.time() - start_time
print("\n" + "="*60)
print(f"✅ Translation pipeline completed successfully!")
print("\n" + "=" * 60)
print("✅ Translation pipeline completed successfully!")
print(f"Time elapsed: {elapsed:.1f} seconds")
print("="*60)
print("=" * 60)
except KeyboardInterrupt:
print("\n\n⚠ Translation interrupted by user")
@@ -341,6 +375,7 @@ Examples:
except Exception as e:
print(f"\n\n✗ Error: {e}")
import traceback
traceback.print_exc()
sys.exit(1)
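
The extract step splits untranslated entries into numbered JSON batches; the file naming is visible in the hunk above, the rest of this sketch is assumed:

```python
import json

def write_batches(entries: dict[str, str], language_code: str, batch_size: int = 500) -> list[str]:
    """Split untranslated entries into numbered JSON batch files."""
    items = list(entries.items())
    num_batches = (len(items) + batch_size - 1) // batch_size
    safe = language_code.replace("-", "_")
    files = []
    for i in range(num_batches):
        batch = dict(items[i * batch_size : (i + 1) * batch_size])
        name = f"{safe}_batch_{i + 1}_of_{num_batches}.json"
        with open(name, "w", encoding="utf-8") as f:
            # Compact separators keep the payload small for the API call.
            json.dump(batch, f, ensure_ascii=False, separators=(",", ":"))
        files.append(name)
    return files
```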

View File

@@ -79,10 +79,12 @@ CRITICAL RULES - MUST FOLLOW EXACTLY:
Return ONLY the translated JSON. No markdown, no explanations, just the JSON object."""
def translate_batch(self, batch_data: dict, target_language: str, language_code: str) -> dict:
def translate_batch(
self, batch_data: dict, target_language: str, language_code: str
) -> dict:
"""Translate a batch file using OpenAI API."""
# Convert batch to compact JSON for API
input_json = json.dumps(batch_data, ensure_ascii=False, separators=(',', ':'))
input_json = json.dumps(batch_data, ensure_ascii=False, separators=(",", ":"))
print(f"Translating {len(batch_data)} entries to {target_language}...")
print(f"Input size: {len(input_json)} characters")
@@ -94,12 +96,14 @@ Return ONLY the translated JSON. No markdown, no explanations, just the JSON obj
messages=[
{
"role": "system",
"content": self.get_translation_prompt(target_language, language_code)
"content": self.get_translation_prompt(
target_language, language_code
),
},
{
"role": "user",
"content": f"Translate this JSON:\n\n{input_json}"
}
"content": f"Translate this JSON:\n\n{input_json}",
},
],
)
@@ -107,13 +111,13 @@ Return ONLY the translated JSON. No markdown, no explanations, just the JSON obj
# Remove markdown code blocks if present
if translated_text.startswith("```"):
lines = translated_text.split('\n')
translated_text = '\n'.join(lines[1:-1])
lines = translated_text.split("\n")
translated_text = "\n".join(lines[1:-1])
# Parse the translated JSON
translated_data = json.loads(translated_text)
print(f"✓ Translation complete")
print("✓ Translation complete")
return translated_data
except json.JSONDecodeError as e:
@@ -139,7 +143,8 @@ Return ONLY the translated JSON. No markdown, no explanations, just the JSON obj
# Check placeholders in each value
import re
placeholder_pattern = r'\{[^}]+\}|\{\{[^}]+\}\}'
placeholder_pattern = r"\{[^}]+\}|\{\{[^}]+\}\}"
for key in original.keys():
if key not in translated:
@@ -153,7 +158,9 @@ Return ONLY the translated JSON. No markdown, no explanations, just the JSON obj
trans_placeholders = set(re.findall(placeholder_pattern, trans_value))
if orig_placeholders != trans_placeholders:
issues.append(f"Placeholder mismatch in '{key}': {orig_placeholders} vs {trans_placeholders}")
issues.append(
f"Placeholder mismatch in '{key}': {orig_placeholders} vs {trans_placeholders}"
)
if issues:
print("\n⚠ Validation warnings:")
@@ -170,37 +177,37 @@ Return ONLY the translated JSON. No markdown, no explanations, just the JSON obj
def get_language_info(language_code: str) -> tuple:
"""Get full language name from code."""
languages = {
'zh-CN': ('Simplified Chinese', 'zh-CN'),
'es-ES': ('Spanish', 'es-ES'),
'it-IT': ('Italian', 'it-IT'),
'de-DE': ('German', 'de-DE'),
'ar-AR': ('Arabic', 'ar-AR'),
'pt-BR': ('Brazilian Portuguese', 'pt-BR'),
'ru-RU': ('Russian', 'ru-RU'),
'fr-FR': ('French', 'fr-FR'),
'ja-JP': ('Japanese', 'ja-JP'),
'ko-KR': ('Korean', 'ko-KR'),
'nl-NL': ('Dutch', 'nl-NL'),
'pl-PL': ('Polish', 'pl-PL'),
'sv-SE': ('Swedish', 'sv-SE'),
'da-DK': ('Danish', 'da-DK'),
'no-NB': ('Norwegian', 'no-NB'),
'fi-FI': ('Finnish', 'fi-FI'),
'tr-TR': ('Turkish', 'tr-TR'),
'vi-VN': ('Vietnamese', 'vi-VN'),
'th-TH': ('Thai', 'th-TH'),
'id-ID': ('Indonesian', 'id-ID'),
'hi-IN': ('Hindi', 'hi-IN'),
'cs-CZ': ('Czech', 'cs-CZ'),
'hu-HU': ('Hungarian', 'hu-HU'),
'ro-RO': ('Romanian', 'ro-RO'),
'uk-UA': ('Ukrainian', 'uk-UA'),
'el-GR': ('Greek', 'el-GR'),
'bg-BG': ('Bulgarian', 'bg-BG'),
'hr-HR': ('Croatian', 'hr-HR'),
'sk-SK': ('Slovak', 'sk-SK'),
'sl-SI': ('Slovenian', 'sl-SI'),
'ca-CA': ('Catalan', 'ca-CA'),
"zh-CN": ("Simplified Chinese", "zh-CN"),
"es-ES": ("Spanish", "es-ES"),
"it-IT": ("Italian", "it-IT"),
"de-DE": ("German", "de-DE"),
"ar-AR": ("Arabic", "ar-AR"),
"pt-BR": ("Brazilian Portuguese", "pt-BR"),
"ru-RU": ("Russian", "ru-RU"),
"fr-FR": ("French", "fr-FR"),
"ja-JP": ("Japanese", "ja-JP"),
"ko-KR": ("Korean", "ko-KR"),
"nl-NL": ("Dutch", "nl-NL"),
"pl-PL": ("Polish", "pl-PL"),
"sv-SE": ("Swedish", "sv-SE"),
"da-DK": ("Danish", "da-DK"),
"no-NB": ("Norwegian", "no-NB"),
"fi-FI": ("Finnish", "fi-FI"),
"tr-TR": ("Turkish", "tr-TR"),
"vi-VN": ("Vietnamese", "vi-VN"),
"th-TH": ("Thai", "th-TH"),
"id-ID": ("Indonesian", "id-ID"),
"hi-IN": ("Hindi", "hi-IN"),
"cs-CZ": ("Czech", "cs-CZ"),
"hu-HU": ("Hungarian", "hu-HU"),
"ro-RO": ("Romanian", "ro-RO"),
"uk-UA": ("Ukrainian", "uk-UA"),
"el-GR": ("Greek", "el-GR"),
"bg-BG": ("Bulgarian", "bg-BG"),
"hr-HR": ("Croatian", "hr-HR"),
"sk-SK": ("Slovak", "sk-SK"),
"sl-SI": ("Slovenian", "sl-SI"),
"ca-CA": ("Catalan", "ca-CA"),
}
return languages.get(language_code, (language_code, language_code))
@@ -208,7 +215,7 @@ def get_language_info(language_code: str) -> tuple:
def main():
parser = argparse.ArgumentParser(
description='Translate JSON batch files using OpenAI API (output supports TOML and JSON)',
description="Translate JSON batch files using OpenAI API (output supports TOML and JSON)",
formatter_class=argparse.RawDescriptionHelpFormatter,
epilog="""
Note: This script works with JSON batch files. The translation files it updates can be TOML or JSON.
@@ -226,24 +233,51 @@ Examples:
# Use different model
python batch_translator.py file.json --api-key KEY --language es-ES --model gpt-4-turbo
"""
""",
)
parser.add_argument('input_files', nargs='+', help='Input batch JSON file(s) or pattern')
parser.add_argument('--api-key', help='OpenAI API key (or set OPENAI_API_KEY env var)')
parser.add_argument('--language', '-l', required=True, help='Target language code (e.g., zh-CN, es-ES)')
parser.add_argument('--model', default='gpt-5', help='OpenAI model to use (default: gpt-5, options: gpt-5-mini, gpt-5-nano)')
parser.add_argument('--output-suffix', default='_translated', help='Suffix for output files (default: _translated)')
parser.add_argument('--skip-validation', action='store_true', help='Skip validation checks')
parser.add_argument('--delay', type=float, default=1.0, help='Delay between API calls in seconds (default: 1.0)')
parser.add_argument(
"input_files", nargs="+", help="Input batch JSON file(s) or pattern"
)
parser.add_argument(
"--api-key", help="OpenAI API key (or set OPENAI_API_KEY env var)"
)
parser.add_argument(
"--language",
"-l",
required=True,
help="Target language code (e.g., zh-CN, es-ES)",
)
parser.add_argument(
"--model",
default="gpt-5",
help="OpenAI model to use (default: gpt-5, options: gpt-5-mini, gpt-5-nano)",
)
parser.add_argument(
"--output-suffix",
default="_translated",
help="Suffix for output files (default: _translated)",
)
parser.add_argument(
"--skip-validation", action="store_true", help="Skip validation checks"
)
parser.add_argument(
"--delay",
type=float,
default=1.0,
help="Delay between API calls in seconds (default: 1.0)",
)
args = parser.parse_args()
# Get API key from args or environment
import os
api_key = args.api_key or os.environ.get('OPENAI_API_KEY')
api_key = args.api_key or os.environ.get("OPENAI_API_KEY")
if not api_key:
print("Error: OpenAI API key required. Provide via --api-key or OPENAI_API_KEY environment variable")
print(
"Error: OpenAI API key required. Provide via --api-key or OPENAI_API_KEY environment variable"
)
sys.exit(1)
# Get language info
@@ -251,6 +285,7 @@ Examples:
# Expand file patterns
import glob
input_files = []
for pattern in args.input_files:
matched = glob.glob(pattern)
@@ -263,7 +298,7 @@ Examples:
print("Error: No input files found")
sys.exit(1)
print(f"Batch Translator")
print("Batch Translator")
print(f"Target Language: {language_name} ({language_code})")
print(f"Model: {args.model}")
print(f"Files to translate: {len(input_files)}")
@@ -281,11 +316,13 @@ Examples:
try:
# Load input file
with open(input_file, 'r', encoding='utf-8') as f:
with open(input_file, "r", encoding="utf-8") as f:
batch_data = json.load(f)
# Translate
translated_data = translator.translate_batch(batch_data, language_name, language_code)
translated_data = translator.translate_batch(
batch_data, language_name, language_code
)
# Validate
if not args.skip_validation:
@@ -295,8 +332,8 @@ Examples:
input_path = Path(input_file)
output_file = input_path.stem + args.output_suffix + input_path.suffix
with open(output_file, 'w', encoding='utf-8') as f:
json.dump(translated_data, f, ensure_ascii=False, separators=(',', ':'))
with open(output_file, "w", encoding="utf-8") as f:
json.dump(translated_data, f, ensure_ascii=False, separators=(",", ":"))
print(f"✓ Saved to: {output_file}")
successful += 1
@@ -312,7 +349,7 @@ Examples:
# Summary
print("\n" + "=" * 60)
print(f"Translation complete!")
print("Translation complete!")
print(f"Successful: {successful}/{len(input_files)}")
if failed > 0:
print(f"Failed: {failed}/{len(input_files)}")
@@ -321,5 +358,4 @@ Examples:
if __name__ == "__main__":
import os
main()

View File

@@ -54,16 +54,16 @@ def get_language_completion(locales_dir: Path, language: str) -> Optional[float]
return None
try:
with open(toml_file, 'rb') as f:
with open(toml_file, "rb") as f:
target_data = tomllib.load(f)
# Load en-GB reference
en_gb_file = locales_dir / 'en-GB' / 'translation.toml'
with open(en_gb_file, 'rb') as f:
en_gb_file = locales_dir / "en-GB" / "translation.toml"
with open(en_gb_file, "rb") as f:
en_gb_data = tomllib.load(f)
# Flatten and count
def flatten(d, parent=''):
def flatten(d, parent=""):
items = {}
for k, v in d.items():
key = f"{parent}.{k}" if parent else k
@@ -77,7 +77,11 @@ def get_language_completion(locales_dir: Path, language: str) -> Optional[float]
target_flat = flatten(target_data)
# Count translated (not equal to en-GB)
translated = sum(1 for k in en_gb_flat if k in target_flat and target_flat[k] != en_gb_flat[k])
translated = sum(
1
for k in en_gb_flat
if k in target_flat and target_flat[k] != en_gb_flat[k]
)
total = len(en_gb_flat)
return (translated / total * 100) if total > 0 else 0.0
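The completion metric above counts a key as translated only when it is present in the target and its value differs from the en-GB reference. Over two already-flattened dicts it reduces to:

def completion_percent(en_gb_flat: dict, target_flat: dict) -> float:
    """Share of reference keys whose target value exists and differs from en-GB."""
    translated = sum(
        1 for k in en_gb_flat if k in target_flat and target_flat[k] != en_gb_flat[k]
    )
    total = len(en_gb_flat)
    return (translated / total * 100) if total > 0 else 0.0

# completion_percent({"a": "Yes", "b": "No"}, {"a": "Ja", "b": "No"}) -> 50.0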
@@ -87,7 +91,14 @@ def get_language_completion(locales_dir: Path, language: str) -> Optional[float]
return None
def translate_language(language: str, api_key: str, batch_size: int, timeout: int, skip_verification: bool, include_existing: bool) -> Tuple[str, bool, str]:
def translate_language(
language: str,
api_key: str,
batch_size: int,
timeout: int,
skip_verification: bool,
include_existing: bool,
) -> Tuple[str, bool, str]:
"""
Translate a single language.
Returns: (language_code, success, message)
@@ -95,25 +106,29 @@ def translate_language(language: str, api_key: str, batch_size: int, timeout: in
safe_print(f"[{language}] Starting translation...")
cmd = [
'python3', 'scripts/translations/auto_translate.py',
"python3",
"scripts/translations/auto_translate.py",
language,
'--api-key', api_key,
'--batch-size', str(batch_size),
'--timeout', str(timeout)
"--api-key",
api_key,
"--batch-size",
str(batch_size),
"--timeout",
str(timeout),
]
if skip_verification:
cmd.append('--skip-verification')
cmd.append("--skip-verification")
if include_existing:
cmd.append('--include-existing')
cmd.append("--include-existing")
try:
result = subprocess.run(
cmd,
capture_output=True,
text=True,
timeout=timeout * 5 # Overall timeout = 5x per-batch timeout
timeout=timeout * 5, # Overall timeout = 5x per-batch timeout
)
if result.returncode == 0:
@@ -124,7 +139,9 @@ def translate_language(language: str, api_key: str, batch_size: int, timeout: in
safe_print(f"[{language}] ✓ Success")
return (language, True, "Success")
else:
error_msg = result.stderr.strip() or result.stdout.strip() or "Unknown error"
error_msg = (
result.stderr.strip() or result.stdout.strip() or "Unknown error"
)
safe_print(f"[{language}] ✗ Failed: {error_msg[:100]}")
return (language, False, error_msg[:200]) # Truncate long errors
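Each worker shells out to auto_translate.py with an overall wall-clock limit of five per-batch timeouts, prefers stderr for the failure message, and truncates long errors. A hedged sketch of that pattern in isolation:

import subprocess

def run_with_timeout(cmd: list[str], timeout_s: int) -> tuple[bool, str]:
    """Run a command, returning (success, short message)."""
    try:
        result = subprocess.run(cmd, capture_output=True, text=True, timeout=timeout_s)
    except subprocess.TimeoutExpired:
        return (False, f"Timeout after {timeout_s}s")
    if result.returncode == 0:
        return (True, "Success")
    msg = result.stderr.strip() or result.stdout.strip() or "Unknown error"
    return (False, msg[:200])  # truncate long errors, as above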
@@ -138,7 +155,7 @@ def translate_language(language: str, api_key: str, batch_size: int, timeout: in
def main():
parser = argparse.ArgumentParser(
description='Bulk auto-translate all languages using OpenAI API',
description="Bulk auto-translate all languages using OpenAI API",
formatter_class=argparse.RawDescriptionHelpFormatter,
epilog="""
Examples:
@@ -155,35 +172,70 @@ Examples:
python3 bulk_auto_translate.py --dry-run
Note: Requires OPENAI_API_KEY environment variable or --api-key argument.
"""
""",
)
parser.add_argument('--api-key', help='OpenAI API key (or set OPENAI_API_KEY env var)')
parser.add_argument('--parallel', type=int, default=1,
help='Number of parallel translation threads (default: 1)')
parser.add_argument('--batch-size', type=int, default=500,
help='Entries per batch for translation (default: 500)')
parser.add_argument('--timeout', type=int, default=600,
help='Timeout per batch in seconds (default: 600)')
parser.add_argument('--threshold', type=float, default=0.0,
help='Only translate languages below this completion %% (default: 0 = all)')
parser.add_argument('--languages', nargs='+',
help='Translate only specific languages (e.g., de-DE fr-FR)')
parser.add_argument('--locales-dir', default='frontend/public/locales',
help='Path to locales directory')
parser.add_argument('--skip-verification', action='store_true',
help='Skip final completion verification for each language')
parser.add_argument('--include-existing', action='store_true',
help='Also retranslate existing keys that match English (default: only translate missing keys)')
parser.add_argument('--dry-run', action='store_true',
help='Show what would be translated without actually translating')
parser.add_argument(
"--api-key", help="OpenAI API key (or set OPENAI_API_KEY env var)"
)
parser.add_argument(
"--parallel",
type=int,
default=1,
help="Number of parallel translation threads (default: 1)",
)
parser.add_argument(
"--batch-size",
type=int,
default=500,
help="Entries per batch for translation (default: 500)",
)
parser.add_argument(
"--timeout",
type=int,
default=600,
help="Timeout per batch in seconds (default: 600)",
)
parser.add_argument(
"--threshold",
type=float,
default=0.0,
help="Only translate languages below this completion %% (default: 0 = all)",
)
parser.add_argument(
"--languages",
nargs="+",
help="Translate only specific languages (e.g., de-DE fr-FR)",
)
parser.add_argument(
"--locales-dir",
default="frontend/public/locales",
help="Path to locales directory",
)
parser.add_argument(
"--skip-verification",
action="store_true",
help="Skip final completion verification for each language",
)
parser.add_argument(
"--include-existing",
action="store_true",
help="Also retranslate existing keys that match English (default: only translate missing keys)",
)
parser.add_argument(
"--dry-run",
action="store_true",
help="Show what would be translated without actually translating",
)
args = parser.parse_args()
# Verify API key (unless dry run)
api_key = args.api_key or os.environ.get('OPENAI_API_KEY')
api_key = args.api_key or os.environ.get("OPENAI_API_KEY")
if not args.dry_run and not api_key:
print("Error: OpenAI API key required. Provide via --api-key or OPENAI_API_KEY environment variable")
print(
"Error: OpenAI API key required. Provide via --api-key or OPENAI_API_KEY environment variable"
)
sys.exit(1)
locales_dir = Path(args.locales_dir)
@@ -221,16 +273,16 @@ Note: Requires OPENAI_API_KEY environment variable or --api-key argument.
print("\nNo languages below threshold!")
sys.exit(0)
print(f"\n{'='*60}")
print(f"Bulk Translation Configuration")
print(f"{'='*60}")
print(f"\n{'=' * 60}")
print("Bulk Translation Configuration")
print(f"{'=' * 60}")
print(f"Languages to translate: {len(languages)}")
print(f"Parallel threads: {args.parallel}")
print(f"Batch size: {args.batch_size}")
print(f"Timeout per batch: {args.timeout}s")
if args.threshold > 0:
print(f"Completion threshold: {args.threshold}%")
print(f"{'='*60}\n")
print(f"{'=' * 60}\n")
if args.dry_run:
print("DRY RUN - Languages that would be translated:")
@@ -244,11 +296,7 @@ Note: Requires OPENAI_API_KEY environment variable or --api-key argument.
start_time = time.time()
# Translate in parallel
results = {
'success': [],
'failed': [],
'already_complete': []
}
results = {"success": [], "failed": [], "already_complete": []}
with ThreadPoolExecutor(max_workers=args.parallel) as executor:
futures = {
@@ -259,7 +307,7 @@ Note: Requires OPENAI_API_KEY environment variable or --api-key argument.
args.batch_size,
args.timeout,
args.skip_verification,
args.include_existing
args.include_existing,
): lang
for lang in languages
}
@@ -269,43 +317,43 @@ Note: Requires OPENAI_API_KEY environment variable or --api-key argument.
if success:
if message == "Already complete":
results['already_complete'].append(language)
results["already_complete"].append(language)
else:
results['success'].append(language)
results["success"].append(language)
else:
results['failed'].append((language, message))
results["failed"].append((language, message))
elapsed = time.time() - start_time
# Print summary
print("\n" + "="*60)
print("\n" + "=" * 60)
print("Bulk Translation Summary")
print("="*60)
print("=" * 60)
print(f"Total languages: {len(languages)}")
print(f"Successful: {len(results['success'])}")
print(f"Already complete: {len(results['already_complete'])}")
print(f"Failed: {len(results['failed'])}")
print(f"Time elapsed: {elapsed:.1f} seconds ({elapsed/60:.1f} minutes)")
print("="*60)
print(f"Time elapsed: {elapsed:.1f} seconds ({elapsed / 60:.1f} minutes)")
print("=" * 60)
if results['success']:
if results["success"]:
print(f"\n✅ Successfully translated ({len(results['success'])}):")
for lang in sorted(results['success']):
for lang in sorted(results["success"]):
print(f" - {lang}")
if results['already_complete']:
if results["already_complete"]:
print(f"\n✓ Already complete ({len(results['already_complete'])}):")
for lang in sorted(results['already_complete']):
for lang in sorted(results["already_complete"]):
print(f" - {lang}")
if results['failed']:
if results["failed"]:
print(f"\n❌ Failed ({len(results['failed'])}):")
for lang, msg in sorted(results['failed']):
for lang, msg in sorted(results["failed"]):
print(f" - {lang}: {msg}")
sys.exit(1)
print("\n✅ Bulk translation completed successfully!")
if __name__ == '__main__':
if __name__ == "__main__":
main()
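The fan-out above maps each submitted future back to its language code and buckets results as they complete. The shape, reduced to a runnable minimum (the translate stub stands in for the real subprocess worker):

from concurrent.futures import ThreadPoolExecutor, as_completed

def translate(lang: str) -> tuple[str, bool, str]:
    return (lang, True, "Success")  # stand-in for the real per-language worker

languages = ["de-DE", "fr-FR", "it-IT"]
results = {"success": [], "failed": [], "already_complete": []}

with ThreadPoolExecutor(max_workers=2) as executor:
    futures = {executor.submit(translate, lang): lang for lang in languages}
    for future in as_completed(futures):
        language, success, message = future.result()
        (results["success"] if success else results["failed"]).append(language)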

View File

@@ -13,11 +13,18 @@ import tomllib # Python 3.11+ (stdlib)
class CompactTranslationExtractor:
def __init__(self, locales_dir: str = "frontend/public/locales", ignore_file: str = "scripts/ignore_translation.toml"):
def __init__(
self,
locales_dir: str = "frontend/public/locales",
ignore_file: str = "scripts/ignore_translation.toml",
):
self.locales_dir = Path(locales_dir)
self.golden_truth_file = self.locales_dir / "en-GB" / "translation.toml"
if not self.golden_truth_file.exists():
print(f"Error: en-GB translation file not found at {self.golden_truth_file}", file=sys.stderr)
print(
f"Error: en-GB translation file not found at {self.golden_truth_file}",
file=sys.stderr,
)
sys.exit(1)
self.golden_truth = self._load_translation_file(self.golden_truth_file)
self.ignore_file = Path(ignore_file)
@@ -26,7 +33,7 @@ class CompactTranslationExtractor:
def _load_translation_file(self, file_path: Path) -> dict:
"""Load TOML translation file."""
try:
with open(file_path, 'rb') as f:
with open(file_path, "rb") as f:
return tomllib.load(f)
except FileNotFoundError:
print(f"Error: File not found: {file_path}", file=sys.stderr)
@@ -41,14 +48,21 @@ class CompactTranslationExtractor:
return {}
try:
with open(self.ignore_file, 'rb') as f:
with open(self.ignore_file, "rb") as f:
ignore_data = tomllib.load(f)
return {lang: set(data.get('ignore', [])) for lang, data in ignore_data.items()}
return {
lang: set(data.get("ignore", [])) for lang, data in ignore_data.items()
}
except Exception as e:
print(f"Warning: Could not load ignore file {self.ignore_file}: {e}", file=sys.stderr)
print(
f"Warning: Could not load ignore file {self.ignore_file}: {e}",
file=sys.stderr,
)
return {}
def _flatten_dict(self, d: dict, parent_key: str = '', separator: str = '.') -> dict:
def _flatten_dict(
self, d: dict, parent_key: str = "", separator: str = "."
) -> dict:
"""Flatten nested dictionary into dot-notation keys."""
items = []
for k, v in d.items():
@@ -65,14 +79,17 @@ class CompactTranslationExtractor:
target_file = lang_dir / "translation.toml"
if not target_file.exists():
print(f"Error: Translation file not found for language: {language}", file=sys.stderr)
print(
f"Error: Translation file not found for language: {language}",
file=sys.stderr,
)
sys.exit(1)
target_data = self._load_translation_file(target_file)
golden_flat = self._flatten_dict(self.golden_truth)
target_flat = self._flatten_dict(target_data)
lang_code = language.replace('-', '_')
lang_code = language.replace("-", "_")
ignore_set = self.ignore_patterns.get(lang_code, set())
# Find missing translations
@@ -85,8 +102,13 @@ class CompactTranslationExtractor:
target_value = target_flat[key]
golden_value = golden_flat[key]
if (isinstance(target_value, str) and target_value.startswith("[UNTRANSLATED]")) or \
(golden_value == target_value and not self._is_expected_identical(key, golden_value)):
if (
isinstance(target_value, str)
and target_value.startswith("[UNTRANSLATED]")
) or (
golden_value == target_value
and not self._is_expected_identical(key, golden_value)
):
untranslated_keys.add(key)
# Combine and create compact output
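A key counts as untranslated above when its value still carries the [UNTRANSLATED] marker, or when it is byte-identical to the en-GB value without being expected to match. The predicate in isolation (simplified: the real code also consults per-key patterns such as language.direction):

EXPECTED_IDENTICAL = {"ltr", "rtl", "True", "False", "true", "false", "unknown"}

def is_untranslated(target_value, golden_value) -> bool:
    if isinstance(target_value, str) and target_value.startswith("[UNTRANSLATED]"):
        return True
    return golden_value == target_value and str(target_value).strip() not in EXPECTED_IDENTICAL

# is_untranslated("[UNTRANSLATED] OK", "OK") -> True
# is_untranslated("ltr", "ltr")              -> False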
@@ -101,8 +123,8 @@ class CompactTranslationExtractor:
def _is_expected_identical(self, key: str, value: str) -> bool:
"""Check if a key-value pair is expected to be identical across languages."""
identical_patterns = ['language.direction']
identical_values = {'ltr', 'rtl', 'True', 'False', 'true', 'false', 'unknown'}
identical_patterns = ["language.direction"]
identical_values = {"ltr", "rtl", "True", "False", "true", "false", "unknown"}
if value.strip() in identical_values:
return True
@@ -116,13 +138,23 @@ class CompactTranslationExtractor:
def main():
parser = argparse.ArgumentParser(
description='Extract untranslated entries in compact format for AI translation (TOML format only)'
description="Extract untranslated entries in compact format for AI translation (TOML format only)"
)
parser.add_argument('language', help='Language code (e.g., de-DE, fr-FR)')
parser.add_argument('--locales-dir', default='frontend/public/locales', help='Path to locales directory')
parser.add_argument('--ignore-file', default='scripts/ignore_translation.toml', help='Path to ignore patterns file')
parser.add_argument('--max-entries', type=int, help='Maximum number of entries to output')
parser.add_argument('--output', help='Output file (default: stdout)')
parser.add_argument("language", help="Language code (e.g., de-DE, fr-FR)")
parser.add_argument(
"--locales-dir",
default="frontend/public/locales",
help="Path to locales directory",
)
parser.add_argument(
"--ignore-file",
default="scripts/ignore_translation.toml",
help="Path to ignore patterns file",
)
parser.add_argument(
"--max-entries", type=int, help="Maximum number of entries to output"
)
parser.add_argument("--output", help="Output file (default: stdout)")
args = parser.parse_args()
@@ -131,19 +163,22 @@ def main():
if args.max_entries:
# Take first N entries
keys = list(untranslated.keys())[:args.max_entries]
keys = list(untranslated.keys())[: args.max_entries]
untranslated = {k: untranslated[k] for k in keys}
# Output compact JSON (no indentation, minimal whitespace)
output = json.dumps(untranslated, separators=(',', ':'), ensure_ascii=False)
output = json.dumps(untranslated, separators=(",", ":"), ensure_ascii=False)
if args.output:
with open(args.output, 'w', encoding='utf-8') as f:
with open(args.output, "w", encoding="utf-8") as f:
f.write(output)
print(f"Extracted {len(untranslated)} untranslated entries to {args.output}", file=sys.stderr)
print(
f"Extracted {len(untranslated)} untranslated entries to {args.output}",
file=sys.stderr,
)
else:
print(output)
if __name__ == "__main__":
main()
main()
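The dot-notation flattening used throughout these scripts turns nested TOML tables into flat keys like "menu.file.open". As a standalone helper:

def flatten_dict(d: dict, parent_key: str = "", separator: str = ".") -> dict:
    """Flatten nested dicts into dot-notation keys."""
    items = []
    for k, v in d.items():
        new_key = f"{parent_key}{separator}{k}" if parent_key else k
        if isinstance(v, dict):
            items.extend(flatten_dict(v, new_key, separator).items())
        else:
            items.append((new_key, v))
    return dict(items)

# flatten_dict({"menu": {"file": {"open": "Open"}}}) -> {"menu.file.open": "Open"}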

View File

@@ -4,7 +4,6 @@ TOML Beautifier and Structure Fixer for Stirling PDF Frontend
Restructures translation TOML files to match en-GB structure and key order exactly.
"""
import os
import sys
from pathlib import Path
from typing import Dict, Any, List
@@ -24,7 +23,7 @@ class TOMLBeautifier:
def _load_toml(self, file_path: Path) -> Dict:
"""Load TOML file with error handling."""
try:
with open(file_path, 'rb') as f:
with open(file_path, "rb") as f:
return tomllib.load(f)
except FileNotFoundError:
print(f"Error: File not found: {file_path}")
@@ -36,15 +35,18 @@ class TOMLBeautifier:
def _save_toml(self, data: Dict, file_path: Path, backup: bool = False) -> None:
"""Save TOML file with proper formatting."""
if backup and file_path.exists():
backup_path = file_path.with_suffix(f'.backup.restructured.toml')
backup_path = file_path.with_suffix(".backup.restructured.toml")
import shutil
shutil.copy2(file_path, backup_path)
print(f"Backup created: {backup_path}")
with open(file_path, 'wb') as f:
with open(file_path, "wb") as f:
tomli_w.dump(data, f)
def _flatten_dict(self, d: Dict, parent_key: str = '', separator: str = '.') -> Dict[str, Any]:
def _flatten_dict(
self, d: Dict, parent_key: str = "", separator: str = "."
) -> Dict[str, Any]:
"""Flatten nested dictionary into dot-notation keys."""
items = []
for k, v in d.items():
@@ -55,9 +57,12 @@ class TOMLBeautifier:
items.append((new_key, v))
return dict(items)
def _rebuild_structure(self, flat_dict: Dict[str, Any], reference_structure: Dict) -> Dict:
def _rebuild_structure(
self, flat_dict: Dict[str, Any], reference_structure: Dict
) -> Dict:
"""Rebuild nested structure based on reference structure and available translations."""
def build_recursive(ref_obj: Any, current_path: str = '') -> Any:
def build_recursive(ref_obj: Any, current_path: str = "") -> Any:
if isinstance(ref_obj, dict):
result = OrderedDict()
for key, value in ref_obj.items():
@@ -106,7 +111,9 @@ class TOMLBeautifier:
return restructured
def beautify_and_restructure(self, target_file: Path, backup: bool = False) -> Dict[str, Any]:
def beautify_and_restructure(
self, target_file: Path, backup: bool = False
) -> Dict[str, Any]:
"""Main function to beautify and restructure a translation file."""
lang_code = target_file.parent.name
print(f"Restructuring {lang_code} translation file...")
@@ -125,10 +132,12 @@ class TOMLBeautifier:
preserved_keys = len(flat_restructured)
result = {
'language': lang_code,
'total_reference_keys': total_keys,
'preserved_keys': preserved_keys,
'structure_match': self._compare_structures(self.golden_structure, restructured_data)
"language": lang_code,
"total_reference_keys": total_keys,
"preserved_keys": preserved_keys,
"structure_match": self._compare_structures(
self.golden_structure, restructured_data
),
}
print(f"Restructured {lang_code}: {preserved_keys}/{total_keys} keys preserved")
@@ -136,7 +145,8 @@ class TOMLBeautifier:
def _compare_structures(self, ref: Dict, target: Dict) -> Dict[str, bool]:
"""Compare structures between reference and target."""
def compare_recursive(r: Any, t: Any, path: str = '') -> List[str]:
def compare_recursive(r: Any, t: Any, path: str = "") -> List[str]:
issues = []
if isinstance(r, dict) and isinstance(t, dict):
@@ -147,7 +157,9 @@ class TOMLBeautifier:
missing_sections = ref_keys - target_keys
if missing_sections:
for section in missing_sections:
issues.append(f"Missing section: {path}.{section}" if path else section)
issues.append(
f"Missing section: {path}.{section}" if path else section
)
# Recurse into common sections
for key in ref_keys & target_keys:
@@ -159,16 +171,16 @@ class TOMLBeautifier:
issues = compare_recursive(ref, target)
return {
'structures_match': len(issues) == 0,
'issues': issues[:10], # Limit to first 10 issues
'total_issues': len(issues)
"structures_match": len(issues) == 0,
"issues": issues[:10], # Limit to first 10 issues
"total_issues": len(issues),
}
def validate_key_order(self, target_file: Path) -> Dict[str, Any]:
"""Validate that keys appear in the same order as en-GB."""
target_data = self._load_toml(target_file)
def get_key_order(obj: Dict, path: str = '') -> List[str]:
def get_key_order(obj: Dict, path: str = "") -> List[str]:
keys = []
for key in obj.keys():
new_path = f"{path}.{key}" if path else key
@@ -183,37 +195,51 @@ class TOMLBeautifier:
# Find common keys and check their relative order
common_keys = set(golden_order) & set(target_order)
golden_indices = {key: idx for idx, key in enumerate(golden_order) if key in common_keys}
target_indices = {key: idx for idx, key in enumerate(target_order) if key in common_keys}
golden_indices = {
key: idx for idx, key in enumerate(golden_order) if key in common_keys
}
target_indices = {
key: idx for idx, key in enumerate(target_order) if key in common_keys
}
order_preserved = all(
golden_indices[key1] < golden_indices[key2]
for key1 in common_keys for key2 in common_keys
if golden_indices[key1] < golden_indices[key2] and target_indices[key1] < target_indices[key2]
for key1 in common_keys
for key2 in common_keys
if golden_indices[key1] < golden_indices[key2]
and target_indices[key1] < target_indices[key2]
)
return {
'order_preserved': order_preserved,
'common_keys_count': len(common_keys),
'golden_keys_count': len(golden_order),
'target_keys_count': len(target_order)
"order_preserved": order_preserved,
"common_keys_count": len(common_keys),
"golden_keys_count": len(golden_order),
"target_keys_count": len(target_order),
}
def main():
parser = argparse.ArgumentParser(
description='Beautify and restructure translation TOML files',
epilog='Works with TOML format translation files.'
description="Beautify and restructure translation TOML files",
epilog="Works with TOML format translation files.",
)
parser.add_argument(
"--locales-dir",
default="frontend/public/locales",
help="Path to locales directory",
)
parser.add_argument("--language", help="Restructure specific language only")
parser.add_argument(
"--all-languages", action="store_true", help="Restructure all language files"
)
parser.add_argument(
"--backup", action="store_true", help="Create backup files before modifying"
)
parser.add_argument(
"--validate-only",
action="store_true",
help="Only validate structure, do not modify files",
)
parser.add_argument('--locales-dir', default='frontend/public/locales',
help='Path to locales directory')
parser.add_argument('--language', help='Restructure specific language only')
parser.add_argument('--all-languages', action='store_true',
help='Restructure all language files')
parser.add_argument('--backup', action='store_true',
help='Create backup files before modifying')
parser.add_argument('--validate-only', action='store_true',
help='Only validate structure, do not modify files')
args = parser.parse_args()
@@ -229,14 +255,22 @@ def main():
order_result = beautifier.validate_key_order(target_file)
print(f"Key order validation for {args.language}:")
print(f" Order preserved: {order_result['order_preserved']}")
print(f" Common keys: {order_result['common_keys_count']}/{order_result['golden_keys_count']}")
print(
f" Common keys: {order_result['common_keys_count']}/{order_result['golden_keys_count']}"
)
else:
result = beautifier.beautify_and_restructure(target_file, backup=args.backup)
result = beautifier.beautify_and_restructure(
target_file, backup=args.backup
)
print(f"\nResults for {result['language']}:")
print(f" Keys preserved: {result['preserved_keys']}/{result['total_reference_keys']}")
if result['structure_match']['total_issues'] > 0:
print(f" Structure issues: {result['structure_match']['total_issues']}")
for issue in result['structure_match']['issues']:
print(
f" Keys preserved: {result['preserved_keys']}/{result['total_reference_keys']}"
)
if result["structure_match"]["total_issues"] > 0:
print(
f" Structure issues: {result['structure_match']['total_issues']}"
)
for issue in result["structure_match"]["issues"]:
print(f" - {issue}")
elif args.all_languages:
@@ -247,18 +281,24 @@ def main():
if translation_file.exists():
if args.validate_only:
order_result = beautifier.validate_key_order(translation_file)
print(f"{lang_dir.name}: Order preserved = {order_result['order_preserved']}")
print(
f"{lang_dir.name}: Order preserved = {order_result['order_preserved']}"
)
else:
result = beautifier.beautify_and_restructure(translation_file, backup=args.backup)
result = beautifier.beautify_and_restructure(
translation_file, backup=args.backup
)
results.append(result)
if not args.validate_only and results:
print(f"\n{'='*60}")
print(f"\n{'=' * 60}")
print("RESTRUCTURING SUMMARY")
print(f"{'='*60}")
for result in sorted(results, key=lambda x: x['language']):
print(f"{result['language']}: {result['preserved_keys']}/{result['total_reference_keys']} keys "
f"({result['preserved_keys']/result['total_reference_keys']*100:.1f}%)")
print(f"{'=' * 60}")
for result in sorted(results, key=lambda x: x["language"]):
print(
f"{result['language']}: {result['preserved_keys']}/{result['total_reference_keys']} keys "
f"({result['preserved_keys'] / result['total_reference_keys'] * 100:.1f}%)"
)
else:
parser.print_help()

View File

@@ -15,7 +15,6 @@ Usage:
import sys
import argparse
import glob
from pathlib import Path
import tomllib
@@ -23,7 +22,7 @@ import tomllib
def get_line_context(file_path, line_num, context_lines=3):
"""Get lines around the error for context"""
try:
with open(file_path, 'r', encoding='utf-8') as f:
with open(file_path, "r", encoding="utf-8") as f:
lines = f.readlines()
start = max(0, line_num - context_lines - 1)
@@ -32,7 +31,7 @@ def get_line_context(file_path, line_num, context_lines=3):
context = []
for i in range(start, end):
marker = ">>> " if i == line_num - 1 else " "
context.append(f"{marker}{i+1:4d}: {lines[i].rstrip()}")
context.append(f"{marker}{i + 1:4d}: {lines[i].rstrip()}")
return "\n".join(context)
except Exception as e:
@@ -42,7 +41,7 @@ def get_line_context(file_path, line_num, context_lines=3):
def get_character_context(file_path, char_pos, context_chars=100):
"""Get characters around the error position"""
try:
with open(file_path, 'r', encoding='utf-8') as f:
with open(file_path, "r", encoding="utf-8") as f:
content = f.read()
start = max(0, char_pos - context_chars)
@@ -50,19 +49,19 @@ def get_character_context(file_path, char_pos, context_chars=100):
before = content[start:char_pos]
error_char = content[char_pos] if char_pos < len(content) else "EOF"
after = content[char_pos+1:end]
after = content[char_pos + 1 : end]
return {
'before': before,
'error_char': error_char,
'after': after,
'display': f"{before}[{error_char}]{after}"
"before": before,
"error_char": error_char,
"after": after,
"display": f"{before}[{error_char}]{after}",
}
except Exception as e:
except Exception:
return None
def count_keys(data, prefix=''):
def count_keys(data, prefix=""):
"""Recursively count all keys in nested TOML structure"""
count = 0
if isinstance(data, dict):
@@ -77,42 +76,43 @@ def count_keys(data, prefix=''):
def validate_toml_file(file_path):
"""Validate a single TOML file and return detailed error info"""
result = {
'file': str(file_path),
'valid': False,
'error': None,
'line': None,
'context': None,
'entry_count': 0
"file": str(file_path),
"valid": False,
"error": None,
"line": None,
"context": None,
"entry_count": 0,
}
try:
with open(file_path, 'rb') as f:
with open(file_path, "rb") as f:
data = tomllib.load(f)
result['valid'] = True
result['entry_count'] = count_keys(data)
result["valid"] = True
result["entry_count"] = count_keys(data)
except Exception as e:
error_msg = str(e)
result['error'] = error_msg
result["error"] = error_msg
# Try to extract line number from error message
import re
line_match = re.search(r'line (\d+)', error_msg, re.IGNORECASE)
line_match = re.search(r"line (\d+)", error_msg, re.IGNORECASE)
if line_match:
line_num = int(line_match.group(1))
result['line'] = line_num
result['context'] = get_line_context(file_path, line_num)
result["line"] = line_num
result["context"] = get_line_context(file_path, line_num)
except FileNotFoundError:
result['error'] = "File not found"
result["error"] = "File not found"
return result
def print_validation_result(result, brief=False, quiet=False):
"""Print validation result in human-readable format"""
if result['valid']:
if result["valid"]:
if not quiet:
print(f"{result['file']}")
if not brief:
@@ -121,30 +121,35 @@ def print_validation_result(result, brief=False, quiet=False):
print(f"{result['file']}")
print(f" Error: {result['error']}")
if result['line']:
if result["line"]:
print(f" Line: {result['line']}")
if result['context'] and not brief:
print(f"\n Context:")
if result["context"] and not brief:
print("\n Context:")
print(f" {result['context'].replace(chr(10), chr(10) + ' ')}")
if not brief:
print(f"\n Common fixes:")
print(f" - Check for missing quotes around keys or values")
print(f" - Ensure proper escaping of special characters")
print(f" - Verify table header syntax: [section.subsection]")
print(f" - Check for duplicate keys in the same table")
print("\n Common fixes:")
print(" - Check for missing quotes around keys or values")
print(" - Ensure proper escaping of special characters")
print(" - Verify table header syntax: [section.subsection]")
print(" - Check for duplicate keys in the same table")
def main():
parser = argparse.ArgumentParser(description='Validate TOML translation files')
parser.add_argument('files', nargs='*', help='TOML file(s) or pattern to validate')
parser.add_argument('--all-batches', metavar='LANG',
help='Validate all batch files for a language (e.g., ar_AR)')
parser.add_argument('--brief', action='store_true',
help='Show brief output without context')
parser.add_argument('--quiet', action='store_true',
help='Only show files with errors')
parser = argparse.ArgumentParser(description="Validate TOML translation files")
parser.add_argument("files", nargs="*", help="TOML file(s) or pattern to validate")
parser.add_argument(
"--all-batches",
metavar="LANG",
help="Validate all batch files for a language (e.g., ar_AR)",
)
parser.add_argument(
"--brief", action="store_true", help="Show brief output without context"
)
parser.add_argument(
"--quiet", action="store_true", help="Only show files with errors"
)
args = parser.parse_args()
@@ -181,11 +186,11 @@ def main():
# Summary
total = len(results)
valid = sum(1 for r in results if r['valid'])
valid = sum(1 for r in results if r["valid"])
invalid = total - valid
if not args.quiet:
print(f"\n{'='*60}")
print(f"\n{'=' * 60}")
print(f"Summary: {valid}/{total} files valid")
if invalid > 0:
print(f" {invalid} file(s) with errors")
@@ -194,5 +199,5 @@ def main():
sys.exit(0 if invalid == 0 else 1)
if __name__ == '__main__':
if __name__ == "__main__":
main()
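tomllib error messages usually embed a line number; the validator above regex-matches it out and prints a few surrounding lines with a >>> marker on the offending one. Condensed into one function (assuming Python 3.11+ for tomllib):

import re
import tomllib

def validate_toml(path: str) -> None:
    try:
        with open(path, "rb") as f:
            tomllib.load(f)
        print(f"✓ {path}")
    except tomllib.TOMLDecodeError as e:
        print(f"✗ {path}: {e}")
        m = re.search(r"line (\d+)", str(e), re.IGNORECASE)
        if m:
            line_num = int(m.group(1))
            with open(path, "r", encoding="utf-8") as f:
                lines = f.readlines()
            for i in range(max(0, line_num - 4), min(len(lines), line_num + 3)):
                marker = ">>> " if i == line_num - 1 else "    "
                print(f"{marker}{i + 1:4d}: {lines[i].rstrip()}")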

View File

@@ -5,16 +5,19 @@ Compares language files against en-GB golden truth file.
"""
import json
import os
import sys
from pathlib import Path
from typing import Dict, List, Set, Tuple
from typing import Dict, List, Set
import argparse
import tomllib
class TranslationAnalyzer:
def __init__(self, locales_dir: str = "frontend/public/locales", ignore_file: str = "scripts/ignore_translation.toml"):
def __init__(
self,
locales_dir: str = "frontend/public/locales",
ignore_file: str = "scripts/ignore_translation.toml",
):
self.locales_dir = Path(locales_dir)
self.golden_truth_file = self.locales_dir / "en-GB" / "translation.toml"
self.golden_truth = self._load_translation_file(self.golden_truth_file)
@@ -24,7 +27,7 @@ class TranslationAnalyzer:
def _load_translation_file(self, file_path: Path) -> Dict:
"""Load TOML translation file with error handling."""
try:
with open(file_path, 'rb') as f:
with open(file_path, "rb") as f:
return tomllib.load(f)
except FileNotFoundError:
print(f"Error: File not found: {file_path}")
@@ -39,17 +42,23 @@ class TranslationAnalyzer:
return {}
try:
with open(self.ignore_file, 'rb') as f:
with open(self.ignore_file, "rb") as f:
ignore_data = tomllib.load(f)
# Convert lists to sets for faster lookup
return {lang: set(patterns) for lang, data in ignore_data.items()
for patterns in [data.get('ignore', [])] if patterns}
return {
lang: set(patterns)
for lang, data in ignore_data.items()
for patterns in [data.get("ignore", [])]
if patterns
}
except Exception as e:
print(f"Warning: Could not load ignore file {self.ignore_file}: {e}")
return {}
def _flatten_dict(self, d: Dict, parent_key: str = '', separator: str = '.') -> Dict[str, str]:
def _flatten_dict(
self, d: Dict, parent_key: str = "", separator: str = "."
) -> Dict[str, str]:
"""Flatten nested dictionary into dot-notation keys."""
items = []
for k, v in d.items():
@@ -80,7 +89,7 @@ class TranslationAnalyzer:
missing = set(golden_flat.keys()) - set(target_flat.keys())
# Filter out ignored keys
lang_code = target_file.parent.name.replace('-', '_')
lang_code = target_file.parent.name.replace("-", "_")
ignore_set = self.ignore_patterns.get(lang_code, set())
return missing - ignore_set
@@ -91,7 +100,7 @@ class TranslationAnalyzer:
golden_flat = self._flatten_dict(self.golden_truth)
target_flat = self._flatten_dict(target_data)
lang_code = target_file.parent.name.replace('-', '_')
lang_code = target_file.parent.name.replace("-", "_")
ignore_set = self.ignore_patterns.get(lang_code, set())
untranslated = set()
@@ -101,8 +110,14 @@ class TranslationAnalyzer:
golden_value = golden_flat[key]
# Check if marked as [UNTRANSLATED] or identical to en-GB
if (isinstance(target_value, str) and target_value.startswith("[UNTRANSLATED]")) or \
(golden_value == target_value and key not in ignore_set and not self._is_expected_identical(key, golden_value)):
if (
isinstance(target_value, str)
and target_value.startswith("[UNTRANSLATED]")
) or (
golden_value == target_value
and key not in ignore_set
and not self._is_expected_identical(key, golden_value)
):
untranslated.add(key)
return untranslated
@@ -110,14 +125,10 @@ class TranslationAnalyzer:
def _is_expected_identical(self, key: str, value: str) -> bool:
"""Check if a key-value pair is expected to be identical across languages."""
# Keys that should be identical across languages
identical_patterns = [
'language.direction',
'true', 'false',
'unknown'
]
identical_patterns = ["language.direction", "true", "false", "unknown"]
# Values that are often identical (numbers, symbols, etc.)
if value.strip() in ['ltr', 'rtl', 'True', 'False']:
if value.strip() in ["ltr", "rtl", "True", "False"]:
return True
# Check for patterns
@@ -149,7 +160,7 @@ class TranslationAnalyzer:
target_flat = self._flatten_dict(target_data)
# Calculate completion rate excluding ignored keys
lang_code = target_file.parent.name.replace('-', '_')
lang_code = target_file.parent.name.replace("-", "_")
ignore_set = self.ignore_patterns.get(lang_code, set())
relevant_keys = set(golden_flat.keys()) - ignore_set
@@ -161,22 +172,26 @@ class TranslationAnalyzer:
if key in target_flat:
value = target_flat[key]
if not (isinstance(value, str) and value.startswith("[UNTRANSLATED]")):
if key not in untranslated: # Not identical to en-GB (unless expected)
if (
key not in untranslated
): # Not identical to en-GB (unless expected)
properly_translated += 1
completion_rate = (properly_translated / total_keys) * 100 if total_keys > 0 else 0
completion_rate = (
(properly_translated / total_keys) * 100 if total_keys > 0 else 0
)
return {
'language': lang_code,
'file': target_file,
'missing_count': len(missing),
'missing_keys': sorted(missing),
'untranslated_count': len(untranslated),
'untranslated_keys': sorted(untranslated),
'extra_count': len(extra),
'extra_keys': sorted(extra),
'total_keys': total_keys,
'completion_rate': completion_rate
"language": lang_code,
"file": target_file,
"missing_count": len(missing),
"missing_keys": sorted(missing),
"untranslated_count": len(untranslated),
"untranslated_keys": sorted(untranslated),
"extra_count": len(extra),
"extra_keys": sorted(extra),
"total_keys": total_keys,
"completion_rate": completion_rate,
}
def analyze_all_files(self) -> List[Dict]:
@@ -184,24 +199,38 @@ class TranslationAnalyzer:
results = []
for file_path in self.get_all_language_files():
results.append(self.analyze_file(file_path))
return sorted(results, key=lambda x: x['language'])
return sorted(results, key=lambda x: x["language"])
def main():
parser = argparse.ArgumentParser(description='Analyze translation files against en-GB golden truth')
parser.add_argument('--locales-dir', default='frontend/public/locales',
help='Path to locales directory')
parser.add_argument('--ignore-file', default='scripts/ignore_translation.toml',
help='Path to ignore patterns TOML file')
parser.add_argument('--language', help='Analyze specific language only')
parser.add_argument('--missing-only', action='store_true',
help='Show only missing translations')
parser.add_argument('--untranslated-only', action='store_true',
help='Show only untranslated entries')
parser.add_argument('--summary', action='store_true',
help='Show summary statistics only')
parser.add_argument('--format', choices=['text', 'json'], default='text',
help='Output format')
parser = argparse.ArgumentParser(
description="Analyze translation files against en-GB golden truth"
)
parser.add_argument(
"--locales-dir",
default="frontend/public/locales",
help="Path to locales directory",
)
parser.add_argument(
"--ignore-file",
default="scripts/ignore_translation.toml",
help="Path to ignore patterns TOML file",
)
parser.add_argument("--language", help="Analyze specific language only")
parser.add_argument(
"--missing-only", action="store_true", help="Show only missing translations"
)
parser.add_argument(
"--untranslated-only",
action="store_true",
help="Show only untranslated entries",
)
parser.add_argument(
"--summary", action="store_true", help="Show summary statistics only"
)
parser.add_argument(
"--format", choices=["text", "json"], default="text", help="Output format"
)
args = parser.parse_args()
@@ -220,14 +249,14 @@ def main():
else:
results = analyzer.analyze_all_files()
if args.format == 'json':
if args.format == "json":
print(json.dumps(results, indent=2, default=str))
return
# Text format output
for result in results:
lang = result['language']
print(f"\n{'='*60}")
lang = result["language"]
print(f"\n{'=' * 60}")
print(f"Language: {lang}")
print(f"File: {result['file']}")
print(f"Completion Rate: {result['completion_rate']:.1f}%")
@@ -236,42 +265,48 @@ def main():
if not args.summary:
if not args.untranslated_only:
print(f"\nMissing Translations ({result['missing_count']}):")
for key in result['missing_keys'][:10]: # Show first 10
for key in result["missing_keys"][:10]: # Show first 10
print(f" - {key}")
if len(result['missing_keys']) > 10:
if len(result["missing_keys"]) > 10:
print(f" ... and {len(result['missing_keys']) - 10} more")
if not args.missing_only:
print(f"\nUntranslated Entries ({result['untranslated_count']}):")
for key in result['untranslated_keys'][:10]: # Show first 10
for key in result["untranslated_keys"][:10]: # Show first 10
print(f" - {key}")
if len(result['untranslated_keys']) > 10:
if len(result["untranslated_keys"]) > 10:
print(f" ... and {len(result['untranslated_keys']) - 10} more")
if result['extra_count'] > 0:
if result["extra_count"] > 0:
print(f"\nExtra Keys Not in en-GB ({result['extra_count']}):")
for key in result['extra_keys'][:5]:
for key in result["extra_keys"][:5]:
print(f" - {key}")
if len(result['extra_keys']) > 5:
if len(result["extra_keys"]) > 5:
print(f" ... and {len(result['extra_keys']) - 5} more")
print(f"\n{'='*60}")
print(f"\n{'=' * 60}")
print("SUMMARY")
print(f"{'='*60}")
avg_completion = sum(r['completion_rate'] for r in results) / len(results) if results else 0
print(f"{'=' * 60}")
avg_completion = (
sum(r["completion_rate"] for r in results) / len(results) if results else 0
)
print(f"Average Completion Rate: {avg_completion:.1f}%")
print(f"Languages Analyzed: {len(results)}")
# Top languages by completion
sorted_by_completion = sorted(results, key=lambda x: x['completion_rate'], reverse=True)
print(f"\nTop 5 Most Complete Languages:")
sorted_by_completion = sorted(
results, key=lambda x: x["completion_rate"], reverse=True
)
print("\nTop 5 Most Complete Languages:")
for result in sorted_by_completion[:5]:
print(f" {result['language']}: {result['completion_rate']:.1f}%")
print(f"\nBottom 5 Languages Needing Attention:")
print("\nBottom 5 Languages Needing Attention:")
for result in sorted_by_completion[-5:]:
print(f" {result['language']}: {result['completion_rate']:.1f}% ({result['missing_count']} missing, {result['untranslated_count']} untranslated)")
print(
f" {result['language']}: {result['completion_rate']:.1f}% ({result['missing_count']} missing, {result['untranslated_count']} untranslated)"
)
if __name__ == "__main__":
main()
main()

View File

@@ -7,10 +7,9 @@ TOML format only.
"""
import json
import os
import sys
from pathlib import Path
from typing import Dict, List, Set, Tuple, Any
from typing import Dict, List, Set, Any
import argparse
import shutil
from datetime import datetime
@@ -20,7 +19,11 @@ import tomli_w
class TranslationMerger:
def __init__(self, locales_dir: str = "frontend/public/locales", ignore_file: str = "scripts/ignore_translation.toml"):
def __init__(
self,
locales_dir: str = "frontend/public/locales",
ignore_file: str = "scripts/ignore_translation.toml",
):
self.locales_dir = Path(locales_dir)
self.golden_truth_file = self.locales_dir / "en-GB" / "translation.toml"
self.golden_truth = self._load_translation_file(self.golden_truth_file)
@@ -30,7 +33,7 @@ class TranslationMerger:
def _load_translation_file(self, file_path: Path) -> Dict:
"""Load TOML translation file."""
try:
with open(file_path, 'rb') as f:
with open(file_path, "rb") as f:
return tomllib.load(f)
except FileNotFoundError:
print(f"Error: File not found: {file_path}")
@@ -39,14 +42,18 @@ class TranslationMerger:
print(f"Error: Invalid file {file_path}: {e}")
sys.exit(1)
def _save_translation_file(self, data: Dict, file_path: Path, backup: bool = False) -> None:
def _save_translation_file(
self, data: Dict, file_path: Path, backup: bool = False
) -> None:
"""Save TOML translation file with backup option."""
if backup and file_path.exists():
backup_path = file_path.with_suffix(f'.backup.{datetime.now().strftime("%Y%m%d_%H%M%S")}.toml')
backup_path = file_path.with_suffix(
f".backup.{datetime.now().strftime('%Y%m%d_%H%M%S')}.toml"
)
shutil.copy2(file_path, backup_path)
print(f"Backup created: {backup_path}")
with open(file_path, 'wb') as f:
with open(file_path, "wb") as f:
tomli_w.dump(data, f)
def _load_ignore_patterns(self) -> Dict[str, Set[str]]:
@@ -55,18 +62,20 @@ class TranslationMerger:
return {}
try:
with open(self.ignore_file, 'rb') as f:
with open(self.ignore_file, "rb") as f:
ignore_data = tomllib.load(f)
# Convert to sets for faster lookup
return {lang: set(data.get('ignore', [])) for lang, data in ignore_data.items()}
return {
lang: set(data.get("ignore", [])) for lang, data in ignore_data.items()
}
except Exception as e:
print(f"Warning: Could not load ignore file {self.ignore_file}: {e}")
return {}
def _get_nested_value(self, data: Dict, key_path: str) -> Any:
"""Get value from nested dict using dot notation."""
keys = key_path.split('.')
keys = key_path.split(".")
current = data
for key in keys:
if isinstance(current, dict) and key in current:
@@ -77,7 +86,7 @@ class TranslationMerger:
def _set_nested_value(self, data: Dict, key_path: str, value: Any) -> None:
"""Set value in nested dict using dot notation."""
keys = key_path.split('.')
keys = key_path.split(".")
current = data
for key in keys[:-1]:
if key not in current:
@@ -85,12 +94,16 @@ class TranslationMerger:
elif not isinstance(current[key], dict):
# If the current value is not a dict, we can't nest into it
# This handles cases where a key exists as a string but we need to make it a dict
print(f"Warning: Converting non-dict value at '{key}' to dict to allow nesting")
print(
f"Warning: Converting non-dict value at '{key}' to dict to allow nesting"
)
current[key] = {}
current = current[key]
current[keys[-1]] = value
def _flatten_dict(self, d: Dict, parent_key: str = '', separator: str = '.') -> Dict[str, Any]:
def _flatten_dict(
self, d: Dict, parent_key: str = "", separator: str = "."
) -> Dict[str, Any]:
"""Flatten nested dictionary into dot-notation keys."""
items = []
for k, v in d.items():
@@ -103,7 +116,7 @@ class TranslationMerger:
def get_missing_keys(self, target_file: Path) -> List[str]:
"""Get list of missing keys in target file."""
lang_code = target_file.parent.name.replace('-', '_')
lang_code = target_file.parent.name.replace("-", "_")
ignore_set = self.ignore_patterns.get(lang_code, set())
if not target_file.exists():
@@ -117,7 +130,9 @@ class TranslationMerger:
missing = set(golden_flat.keys()) - set(target_flat.keys())
return sorted(missing - ignore_set)
def add_missing_translations(self, target_file: Path, keys_to_add: List[str] = None) -> Dict:
def add_missing_translations(
self, target_file: Path, keys_to_add: List[str] = None
) -> Dict:
"""Add missing translations from en-GB to target file."""
if not target_file.exists():
target_data = {}
@@ -136,12 +151,14 @@ class TranslationMerger:
added_count += 1
return {
'added_count': added_count,
'missing_keys': missing_keys,
'data': target_data
"added_count": added_count,
"missing_keys": missing_keys,
"data": target_data,
}
def extract_untranslated_entries(self, target_file: Path, output_file: Path = None) -> Dict:
def extract_untranslated_entries(
self, target_file: Path, output_file: Path = None
) -> Dict:
"""Extract entries marked as untranslated or identical to en-GB for AI translation."""
if not target_file.exists():
print(f"Error: Target file does not exist: {target_file}")
@@ -160,20 +177,22 @@ class TranslationMerger:
# Check if marked as untranslated
if isinstance(value, str) and value.startswith("[UNTRANSLATED]"):
untranslated_entries[key] = {
'original': golden_value,
'current': value,
'reason': 'marked_untranslated'
"original": golden_value,
"current": value,
"reason": "marked_untranslated",
}
# Check if identical to golden (and should be translated)
elif value == golden_value and not self._is_expected_identical(key, value):
elif value == golden_value and not self._is_expected_identical(
key, value
):
untranslated_entries[key] = {
'original': golden_value,
'current': value,
'reason': 'identical_to_english'
"original": golden_value,
"current": value,
"reason": "identical_to_english",
}
if output_file:
with open(output_file, 'w', encoding='utf-8') as f:
with open(output_file, "w", encoding="utf-8") as f:
json.dump(untranslated_entries, f, indent=2, ensure_ascii=False)
return untranslated_entries
@@ -181,10 +200,10 @@ class TranslationMerger:
def _is_expected_identical(self, key: str, value: str) -> bool:
"""Check if a key-value pair is expected to be identical across languages."""
identical_patterns = [
'language.direction',
"language.direction",
]
if str(value).strip() in ['ltr', 'rtl', 'True', 'False', 'true', 'false']:
if str(value).strip() in ["ltr", "rtl", "True", "False", "true", "false"]:
return True
for pattern in identical_patterns:
@@ -193,12 +212,13 @@ class TranslationMerger:
return False
def apply_translations(self, target_file: Path, translations: Dict[str, str],
backup: bool = False) -> Dict:
def apply_translations(
self, target_file: Path, translations: Dict[str, str], backup: bool = False
) -> Dict:
"""Apply provided translations to target file."""
if not target_file.exists():
print(f"Error: Target file does not exist: {target_file}")
return {'success': False, 'error': 'File not found'}
return {"success": False, "error": "File not found"}
target_data = self._load_translation_file(target_file)
applied_count = 0
@@ -219,10 +239,10 @@ class TranslationMerger:
self._save_translation_file(target_data, target_file, backup)
return {
'success': True,
'applied_count': applied_count,
'errors': errors,
'data': target_data
"success": True,
"applied_count": applied_count,
"errors": errors,
"data": target_data,
}
def create_translation_template(self, target_file: Path, output_file: Path) -> None:
@@ -230,25 +250,25 @@ class TranslationMerger:
untranslated = self.extract_untranslated_entries(target_file)
template = {
'metadata': {
'source_language': 'en-GB',
'target_language': target_file.parent.name,
'total_entries': len(untranslated),
'created_at': datetime.now().isoformat(),
'instructions': 'Translate the "original" values to the target language. Keep the same keys.'
"metadata": {
"source_language": "en-GB",
"target_language": target_file.parent.name,
"total_entries": len(untranslated),
"created_at": datetime.now().isoformat(),
"instructions": 'Translate the "original" values to the target language. Keep the same keys.',
},
'translations': {}
"translations": {},
}
for key, entry in untranslated.items():
template['translations'][key] = {
'original': entry['original'],
'translated': '', # AI should fill this
'context': self._get_context_for_key(key),
'reason': entry['reason']
template["translations"][key] = {
"original": entry["original"],
"translated": "", # AI should fill this
"context": self._get_context_for_key(key),
"reason": entry["reason"],
}
with open(output_file, 'w', encoding='utf-8') as f:
with open(output_file, "w", encoding="utf-8") as f:
json.dump(template, f, indent=2, ensure_ascii=False)
print(f"Translation template created: {output_file}")
@@ -256,7 +276,7 @@ class TranslationMerger:
def _get_context_for_key(self, key: str) -> str:
"""Get context information for a translation key."""
parts = key.split('.')
parts = key.split(".")
if len(parts) >= 2:
return f"Section: {parts[0]}, Property: {parts[-1]}"
return f"Property: {parts[-1]}"
@@ -264,33 +284,55 @@ class TranslationMerger:
def main():
parser = argparse.ArgumentParser(
description='Merge and manage translation files',
epilog='Works with TOML translation files.'
description="Merge and manage translation files",
epilog="Works with TOML translation files.",
)
parser.add_argument('--locales-dir', default='frontend/public/locales',
help='Path to locales directory')
parser.add_argument('--ignore-file', default='scripts/ignore_translation.toml',
help='Path to ignore patterns TOML file')
parser.add_argument('language', help='Target language code (e.g., fr-FR)')
parser.add_argument(
"--locales-dir",
default="frontend/public/locales",
help="Path to locales directory",
)
parser.add_argument(
"--ignore-file",
default="scripts/ignore_translation.toml",
help="Path to ignore patterns TOML file",
)
parser.add_argument("language", help="Target language code (e.g., fr-FR)")
subparsers = parser.add_subparsers(dest='command', help='Available commands')
subparsers = parser.add_subparsers(dest="command", help="Available commands")
# Add missing command
add_parser = subparsers.add_parser('add-missing', help='Add missing translations from en-GB')
add_parser.add_argument('--backup', action='store_true', help='Create backup before modifying files')
add_parser = subparsers.add_parser(
"add-missing", help="Add missing translations from en-GB"
)
add_parser.add_argument(
"--backup", action="store_true", help="Create backup before modifying files"
)
# Extract untranslated command
extract_parser = subparsers.add_parser('extract-untranslated', help='Extract untranslated entries')
extract_parser.add_argument('--output', help='Output file path')
extract_parser = subparsers.add_parser(
"extract-untranslated", help="Extract untranslated entries"
)
extract_parser.add_argument("--output", help="Output file path")
# Create template command
template_parser = subparsers.add_parser('create-template', help='Create AI translation template')
template_parser.add_argument('--output', required=True, help='Output template file path')
template_parser = subparsers.add_parser(
"create-template", help="Create AI translation template"
)
template_parser.add_argument(
"--output", required=True, help="Output template file path"
)
# Apply translations command
apply_parser = subparsers.add_parser('apply-translations', help='Apply translations from JSON file')
apply_parser.add_argument('--translations-file', required=True, help='JSON file with translations')
apply_parser.add_argument('--backup', action='store_true', help='Create backup before modifying files')
apply_parser = subparsers.add_parser(
"apply-translations", help="Apply translations from JSON file"
)
apply_parser.add_argument(
"--translations-file", required=True, help="JSON file with translations"
)
apply_parser.add_argument(
"--backup", action="store_true", help="Create backup before modifying files"
)
args = parser.parse_args()
@@ -304,44 +346,53 @@ def main():
lang_dir = Path(args.locales_dir) / args.language
target_file = lang_dir / "translation.toml"
if args.command == 'add-missing':
if args.command == "add-missing":
print(f"Adding missing translations to {args.language}...")
result = merger.add_missing_translations(target_file)
merger._save_translation_file(result['data'], target_file, backup=args.backup)
merger._save_translation_file(result["data"], target_file, backup=args.backup)
print(f"Added {result['added_count']} missing translations")
elif args.command == 'extract-untranslated':
output_file = Path(args.output) if args.output else target_file.with_suffix('.untranslated.json')
elif args.command == "extract-untranslated":
output_file = (
Path(args.output)
if args.output
else target_file.with_suffix(".untranslated.json")
)
untranslated = merger.extract_untranslated_entries(target_file, output_file)
print(f"Extracted {len(untranslated)} untranslated entries to {output_file}")
elif args.command == 'create-template':
elif args.command == "create-template":
output_file = Path(args.output)
merger.create_translation_template(target_file, output_file)
elif args.command == 'apply-translations':
with open(args.translations_file, 'r', encoding='utf-8') as f:
elif args.command == "apply-translations":
with open(args.translations_file, "r", encoding="utf-8") as f:
translations_data = json.load(f)
# Extract translations from template format or simple dict
if 'translations' in translations_data:
translations = {k: v['translated'] for k, v in translations_data['translations'].items()
if v.get('translated')}
if "translations" in translations_data:
translations = {
k: v["translated"]
for k, v in translations_data["translations"].items()
if v.get("translated")
}
else:
translations = translations_data
result = merger.apply_translations(target_file, translations, backup=args.backup)
result = merger.apply_translations(
target_file, translations, backup=args.backup
)
if result['success']:
if result["success"]:
print(f"Applied {result['applied_count']} translations")
if result['errors']:
if result["errors"]:
print(f"Errors: {len(result['errors'])}")
for error in result['errors'][:5]:
for error in result["errors"][:5]:
print(f" - {error}")
else:
print(f"Failed: {result.get('error', 'Unknown error')}")
if __name__ == "__main__":
main()
main()
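
Taken together, the argparse wiring above gives the script a CLI along these lines. Note that the positional language argument is declared before the subparsers, so it comes first; the script filename is not shown in this diff, so merge_translations.py is a stand-in:

# Add en-GB keys that are missing from fr-FR, backing up first
python merge_translations.py fr-FR add-missing --backup
# Dump untranslated entries, then build an AI-ready template
python merge_translations.py fr-FR extract-untranslated --output fr.untranslated.json
python merge_translations.py fr-FR create-template --output fr.template.json
# Apply completed translations back into translation.toml
python merge_translations.py fr-FR apply-translations --translations-file fr.template.json --backup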

View File

@@ -16,12 +16,12 @@ Usage:
import json
import sys
from pathlib import Path
from typing import Dict, List, Set
from typing import Dict, Set
import argparse
import tomllib # Python 3.11+ (stdlib)
def get_all_keys(d: dict, parent_key: str = '', sep: str = '.') -> Set[str]:
def get_all_keys(d: dict, parent_key: str = "", sep: str = ".") -> Set[str]:
"""Get all keys from nested dict as dot-notation paths."""
keys = set()
for k, v in d.items():
@@ -35,7 +35,7 @@ def get_all_keys(d: dict, parent_key: str = '', sep: str = '.') -> Set[str]:
def validate_translation_file(file_path: Path) -> tuple[bool, str]:
"""Validate that a file contains valid TOML."""
try:
with open(file_path, 'rb') as f:
with open(file_path, "rb") as f:
tomllib.load(f)
return True, "Valid TOML"
except Exception as e:
@@ -43,95 +43,85 @@ def validate_translation_file(file_path: Path) -> tuple[bool, str]:
def validate_structure(
en_gb_keys: Set[str],
lang_keys: Set[str],
lang_code: str
en_gb_keys: Set[str], lang_keys: Set[str], lang_code: str
) -> Dict:
"""Compare structure between en-GB and target language."""
missing_keys = en_gb_keys - lang_keys
extra_keys = lang_keys - en_gb_keys
return {
'language': lang_code,
'missing_keys': sorted(missing_keys),
'extra_keys': sorted(extra_keys),
'total_keys': len(lang_keys),
'expected_keys': len(en_gb_keys),
'missing_count': len(missing_keys),
'extra_count': len(extra_keys)
"language": lang_code,
"missing_keys": sorted(missing_keys),
"extra_keys": sorted(extra_keys),
"total_keys": len(lang_keys),
"expected_keys": len(en_gb_keys),
"missing_count": len(missing_keys),
"extra_count": len(extra_keys),
}
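
A minimal sketch of the set arithmetic above, with invented keys:

# en-GB defines home.title and home.desc; the locale has home.title and home.extra
result = validate_structure(
    en_gb_keys={"home.title", "home.desc"},
    lang_keys={"home.title", "home.extra"},
    lang_code="xx-XX",
)
# result["missing_keys"] == ["home.desc"], result["extra_keys"] == ["home.extra"]
# missing_count and extra_count are therefore both 1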
def print_validation_result(result: Dict, verbose: bool = False):
"""Print validation results in readable format."""
lang = result['language']
lang = result["language"]
print(f"\n{'='*100}")
print(f"\n{'=' * 100}")
print(f"Language: {lang}")
print(f"{'='*100}")
print(f"{'=' * 100}")
print(f" Total keys: {result['total_keys']}")
print(f" Expected keys (en-GB): {result['expected_keys']}")
print(f" Missing keys: {result['missing_count']}")
print(f" Extra keys: {result['extra_count']}")
if result['missing_count'] == 0 and result['extra_count'] == 0:
print(f" ✅ Structure matches en-GB perfectly!")
if result["missing_count"] == 0 and result["extra_count"] == 0:
print(" ✅ Structure matches en-GB perfectly!")
else:
if result['missing_count'] > 0:
if result["missing_count"] > 0:
print(f"\n ⚠️ Missing {result['missing_count']} key(s):")
if verbose or result['missing_count'] <= 20:
for key in result['missing_keys'][:50]:
if verbose or result["missing_count"] <= 20:
for key in result["missing_keys"][:50]:
print(f" - {key}")
if result['missing_count'] > 50:
if result["missing_count"] > 50:
print(f" ... and {result['missing_count'] - 50} more")
else:
print(f" (use --verbose to see all)")
print(" (use --verbose to see all)")
if result['extra_count'] > 0:
if result["extra_count"] > 0:
print(f"\n ⚠️ Extra {result['extra_count']} key(s) not in en-GB:")
if verbose or result['extra_count'] <= 20:
for key in result['extra_keys'][:50]:
if verbose or result["extra_count"] <= 20:
for key in result["extra_keys"][:50]:
print(f" - {key}")
if result['extra_count'] > 50:
if result["extra_count"] > 50:
print(f" ... and {result['extra_count'] - 50} more")
else:
print(f" (use --verbose to see all)")
print(" (use --verbose to see all)")
print("-" * 100)
def load_translation_file(file_path: Path) -> dict:
"""Load TOML translation file."""
with open(file_path, 'rb') as f:
with open(file_path, "rb") as f:
return tomllib.load(f)
def main():
parser = argparse.ArgumentParser(
description='Validate translation TOML structure'
parser = argparse.ArgumentParser(description="Validate translation TOML structure")
parser.add_argument(
"--language",
help="Specific language code to validate (e.g., es-ES)",
default=None,
)
parser.add_argument(
'--language',
help='Specific language code to validate (e.g., es-ES)',
default=None
)
parser.add_argument(
'--verbose', '-v',
action='store_true',
help='Show all missing/extra keys'
)
parser.add_argument(
'--json',
action='store_true',
help='Output results as JSON'
"--verbose", "-v", action="store_true", help="Show all missing/extra keys"
)
parser.add_argument("--json", action="store_true", help="Output results as JSON")
args = parser.parse_args()
# Define paths
locales_dir = Path('frontend/public/locales')
en_gb_path = locales_dir / 'en-GB' / 'translation.toml'
file_ext = '.toml'
locales_dir = Path("frontend/public/locales")
en_gb_path = locales_dir / "en-GB" / "translation.toml"
file_ext = ".toml"
if not en_gb_path.exists():
print(f"❌ Error: en-GB translation file not found at {en_gb_path}")
@@ -155,8 +145,8 @@ def main():
# Validate all languages except en-GB
languages = []
for d in locales_dir.iterdir():
if d.is_dir() and d.name != 'en-GB':
if (d / 'translation.toml').exists():
if d.is_dir() and d.name != "en-GB":
if (d / "translation.toml").exists():
languages.append(d.name)
results = []
@@ -164,7 +154,7 @@ def main():
# Validate each language
for lang_code in sorted(languages):
lang_path = locales_dir / lang_code / 'translation.toml'
lang_path = locales_dir / lang_code / "translation.toml"
if not lang_path.exists():
print(f"⚠️ Warning: {lang_code}/translation.toml not found, skipping")
@@ -173,11 +163,9 @@ def main():
# First check if file is valid
is_valid, message = validate_translation_file(lang_path)
if not is_valid:
json_errors.append({
'language': lang_code,
'file': str(lang_path),
'error': message
})
json_errors.append(
{"language": lang_code, "file": str(lang_path), "error": message}
)
continue
# Load and compare structure
@@ -189,10 +177,7 @@ def main():
# Output results
if args.json:
output = {
'json_errors': json_errors,
'structure_validation': results
}
output = {"json_errors": json_errors, "structure_validation": results}
print(json.dumps(output, indent=2, ensure_ascii=False))
else:
# Print syntax errors first
@@ -210,11 +195,13 @@ def main():
print("\n📊 Structure Validation Summary:")
print(f" Languages validated: {len(results)}")
perfect = sum(1 for r in results if r['missing_count'] == 0 and r['extra_count'] == 0)
perfect = sum(
1 for r in results if r["missing_count"] == 0 and r["extra_count"] == 0
)
print(f" Perfect matches: {perfect}/{len(results)}")
total_missing = sum(r['missing_count'] for r in results)
total_extra = sum(r['extra_count'] for r in results)
total_missing = sum(r["missing_count"] for r in results)
total_extra = sum(r["extra_count"] for r in results)
print(f" Total missing keys: {total_missing}")
print(f" Total extra keys: {total_extra}")
@@ -226,10 +213,10 @@ def main():
# Exit with error code if issues found
has_issues = len(json_errors) > 0 or any(
r['missing_count'] > 0 or r['extra_count'] > 0 for r in results
r["missing_count"] > 0 or r["extra_count"] > 0 for r in results
)
sys.exit(1 if has_issues else 0)
if __name__ == '__main__':
if __name__ == "__main__":
main()
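
Based on the flags defined above, the validator is invoked along these lines (script name assumed, since the diff does not show it):

# Check every locale under frontend/public/locales against en-GB,
# or a single one with --language
python validate_structure.py
python validate_structure.py --language es-ES --verbose
python validate_structure.py --json > structure_report.json

Per the exit logic above, a non-zero exit code signals missing/extra keys or TOML syntax errors, which makes the script usable as a CI gate.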

View File

@@ -13,7 +13,7 @@ import json
import re
import sys
from pathlib import Path
from typing import Dict, List, Set, Tuple
from typing import Dict, List, Set
import argparse
import tomllib # Python 3.11+ (stdlib)
@@ -22,10 +22,10 @@ def find_placeholders(text: str) -> Set[str]:
"""Find all placeholders in text like {n}, {{var}}, {0}, etc."""
if not isinstance(text, str):
return set()
return set(re.findall(r'\{\{?[^}]+\}\}?', text))
return set(re.findall(r"\{\{?[^}]+\}\}?", text))
def flatten_dict(d: dict, parent_key: str = '', sep: str = '.') -> Dict[str, str]:
def flatten_dict(d: dict, parent_key: str = "", sep: str = ".") -> Dict[str, str]:
"""Flatten nested dict to dot-notation keys."""
items = []
for k, v in d.items():
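
For intuition, the regex above treats single- and double-brace placeholders alike, and flatten_dict joins nested keys with dots. With invented inputs:

# Both helpers are pure functions:
find_placeholders("Page {n} of {{total}}")  # -> {"{n}", "{{total}}"}
flatten_dict({"home": {"title": "Home"}})   # -> {"home.title": "Home"}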
@@ -38,9 +38,7 @@ def flatten_dict(d: dict, parent_key: str = '', sep: str = '.') -> Dict[str, str
def validate_language(
en_gb_flat: Dict[str, str],
lang_flat: Dict[str, str],
lang_code: str
en_gb_flat: Dict[str, str], lang_flat: Dict[str, str], lang_code: str
) -> List[Dict]:
"""Validate placeholders for a language against en-GB."""
issues = []
@@ -57,12 +55,12 @@ def validate_language(
extra = lang_placeholders - en_placeholders
issue = {
'language': lang_code,
'key': key,
'missing': missing,
'extra': extra,
'en_text': en_gb_flat[key],
'lang_text': lang_flat[key]
"language": lang_code,
"key": key,
"missing": missing,
"extra": extra,
"en_text": en_gb_flat[key],
"lang_text": lang_flat[key],
}
issues.append(issue)
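
To make the appended records concrete, here is an illustrative issue for one key (texts invented):

# en:   "Page {n} of {total}"
# lang: "Seite {n}"
# -> issue["missing"] == {"{total}"}, issue["extra"] == set()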
@@ -82,9 +80,9 @@ def print_issues(issues: List[Dict], verbose: bool = False):
print(f"\n{i}. Language: {issue['language']}")
print(f" Key: {issue['key']}")
if issue['missing']:
if issue["missing"]:
print(f" ⚠️ MISSING placeholders: {issue['missing']}")
if issue['extra']:
if issue["extra"]:
print(f" ⚠️ EXTRA placeholders: {issue['extra']}")
if verbose:
@@ -96,37 +94,34 @@ def print_issues(issues: List[Dict], verbose: bool = False):
def main():
parser = argparse.ArgumentParser(
description='Validate translation placeholder consistency'
description="Validate translation placeholder consistency"
)
parser.add_argument(
'--language',
help='Specific language code to validate (e.g., es-ES)',
default=None
"--language",
help="Specific language code to validate (e.g., es-ES)",
default=None,
)
parser.add_argument(
'--verbose', '-v',
action='store_true',
help='Show full text samples for each issue'
)
parser.add_argument(
'--json',
action='store_true',
help='Output results as JSON'
"--verbose",
"-v",
action="store_true",
help="Show full text samples for each issue",
)
parser.add_argument("--json", action="store_true", help="Output results as JSON")
args = parser.parse_args()
# Define paths
locales_dir = Path('frontend/public/locales')
en_gb_path = locales_dir / 'en-GB' / 'translation.toml'
file_ext = '.toml'
locales_dir = Path("frontend/public/locales")
en_gb_path = locales_dir / "en-GB" / "translation.toml"
file_ext = ".toml"
if not en_gb_path.exists():
print(f"❌ Error: en-GB translation file not found at {en_gb_path}")
sys.exit(1)
# Load en-GB (source of truth)
with open(en_gb_path, 'rb') as f:
with open(en_gb_path, "rb") as f:
en_gb = tomllib.load(f)
en_gb_flat = flatten_dict(en_gb)
@@ -138,22 +133,22 @@ def main():
# Validate all languages except en-GB
languages = []
for d in locales_dir.iterdir():
if d.is_dir() and d.name != 'en-GB':
if (d / 'translation.toml').exists():
if d.is_dir() and d.name != "en-GB":
if (d / "translation.toml").exists():
languages.append(d.name)
all_issues = []
# Validate each language
for lang_code in sorted(languages):
lang_path = locales_dir / lang_code / 'translation.toml'
lang_path = locales_dir / lang_code / "translation.toml"
if not lang_path.exists():
print(f"⚠️ Warning: {lang_code}/translation.toml not found, skipping")
continue
# Load language file
with open(lang_path, 'rb') as f:
with open(lang_path, "rb") as f:
lang_data = tomllib.load(f)
lang_flat = flatten_dict(lang_data)
@@ -168,19 +163,19 @@ def main():
# Group by language
by_language = {}
for issue in all_issues:
lang = issue['language']
lang = issue["language"]
if lang not in by_language:
by_language[lang] = []
by_language[lang].append(issue)
print(f"📊 Validation Summary:")
print("📊 Validation Summary:")
print(f" Total issues: {len(all_issues)}")
print(f" Languages with issues: {len(by_language)}\n")
for lang in sorted(by_language.keys()):
print(f"\n{'='*100}")
print(f"\n{'=' * 100}")
print(f"Language: {lang} ({len(by_language[lang])} issue(s))")
print(f"{'='*100}")
print(f"{'=' * 100}")
print_issues(by_language[lang], verbose=args.verbose)
else:
print("✅ All translations have correct placeholders!")
@@ -189,5 +184,5 @@ def main():
sys.exit(1 if all_issues else 0)
if __name__ == '__main__':
if __name__ == "__main__":
main()

View File

@@ -55,14 +55,33 @@ class GlyphBuildResult:
def parse_args() -> argparse.Namespace:
parser = argparse.ArgumentParser(description="Synthesize fonts from Type3 glyph JSON.")
parser.add_argument("--input", required=True, help="Path to glyph JSON emitted by the backend")
parser.add_argument("--otf-output", required=True, help="Destination path for the CFF/OTF font")
parser.add_argument("--ttf-output", help="Optional destination path for a TrueType font")
parser.add_argument("--family-name", default="Type3 Synth", help="Family name for the output")
parser.add_argument("--style-name", default="Regular", help="Style name for the output")
parser.add_argument("--units-per-em", type=int, default=1000, help="Units per EM value")
parser.add_argument("--cu2qu-error", type=float, default=1.0, help="Max error for cubic→quadratic conversion")
parser = argparse.ArgumentParser(
description="Synthesize fonts from Type3 glyph JSON."
)
parser.add_argument(
"--input", required=True, help="Path to glyph JSON emitted by the backend"
)
parser.add_argument(
"--otf-output", required=True, help="Destination path for the CFF/OTF font"
)
parser.add_argument(
"--ttf-output", help="Optional destination path for a TrueType font"
)
parser.add_argument(
"--family-name", default="Type3 Synth", help="Family name for the output"
)
parser.add_argument(
"--style-name", default="Regular", help="Style name for the output"
)
parser.add_argument(
"--units-per-em", type=int, default=1000, help="Units per EM value"
)
parser.add_argument(
"--cu2qu-error",
type=float,
default=1.0,
help="Max error for cubic→quadratic conversion",
)
return parser.parse_args()
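
The options above translate to an invocation roughly like the following; the script path is an assumption, as the diff does not name the file:

python synthesize_type3_font.py \
    --input glyphs.json \
    --otf-output Type3Synth.otf \
    --ttf-output Type3Synth.ttf \
    --family-name "Type3 Synth" --cu2qu-error 1.0

--ttf-output is optional; when given, cubic outlines are presumably converted to quadratics within the --cu2qu-error tolerance, per the help text.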
@@ -151,18 +170,22 @@ def iterate_glyphs(data: Dict[str, object]) -> List[GlyphSource]:
char_code_value = record.get("code")
if not isinstance(char_code_value, int):
char_code_value = record.get("charCodeRaw")
if not isinstance(char_code_value, int) or not (0 <= char_code_value <= 0x10FFFF):
if not isinstance(char_code_value, int) or not (
0 <= char_code_value <= 0x10FFFF
):
char_code_value = None
outline = record.get("outline")
if not isinstance(outline, list):
outline = []
sources.append(
GlyphSource(
name=name,
width=float(width),
unicode=unicode_value,
char_code=char_code_value,
outline=outline))
GlyphSource(
name=name,
width=float(width),
unicode=unicode_value,
char_code=char_code_value,
outline=outline,
)
)
return sources
@@ -199,7 +222,10 @@ def build_cff_charstring(
start_point = point
open_path = True
elif op == "L" and current is not None:
point = (float(command.get("x", current[0])), float(command.get("y", current[1])))
point = (
float(command.get("x", current[0])),
float(command.get("y", current[1])),
)
pen.lineTo(point)
update_bounds(point)
current = point
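
Judging from the accessors in iterate_glyphs (code, charCodeRaw, outline) and the move/line handling here, an input glyph record plausibly looks like the sketch below. This is inferred from the visible field reads, not a documented schema, and the "M" op name in particular is an assumption:

{
    "name": "g42",
    "width": 600.0,
    "unicode": 65,
    "code": 65,
    "outline": [
        {"op": "M", "x": 0, "y": 0},      # assumed moveTo: opens a path
        {"op": "L", "x": 120, "y": 480},  # lineTo: reads command.get("x"/"y")
    ],
}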

View File

@@ -17,14 +17,21 @@ from __future__ import annotations
import argparse
import json
import sys
from collections import defaultdict
from pathlib import Path
from typing import Dict, Iterable, List, Optional, Tuple
REPO_ROOT = Path(__file__).resolve().parents[1]
DEFAULT_SIGNATURES = REPO_ROOT / "docs" / "type3" / "signatures"
DEFAULT_INDEX = (
REPO_ROOT / "app" / "core" / "src" / "main" / "resources" / "type3" / "library" / "index.json"
REPO_ROOT
/ "app"
/ "core"
/ "src"
/ "main"
/ "resources"
/ "type3"
/ "library"
/ "index.json"
)
@@ -136,7 +143,12 @@ def update_library(
entry = alias_index[alias]
if entry is None:
unmatched.append((font.get("baseName") or font.get("alias_raw") or "unknown", sig_file))
unmatched.append(
(
font.get("baseName") or font.get("alias_raw") or "unknown",
sig_file,
)
)
continue
entry_modified = False
@@ -186,7 +198,9 @@ def update_library(
def parse_args() -> argparse.Namespace:
parser = argparse.ArgumentParser(description="Update Type3 library index using signature dumps.")
parser = argparse.ArgumentParser(
description="Update Type3 library index using signature dumps."
)
parser.add_argument(
"--signatures-dir",
type=Path,
@@ -209,7 +223,11 @@ def parse_args() -> argparse.Namespace:
def main() -> None:
args = parse_args()
signatures_dir = args.signatures_dir if args.signatures_dir.is_absolute() else (REPO_ROOT / args.signatures_dir)
signatures_dir = (
args.signatures_dir
if args.signatures_dir.is_absolute()
else (REPO_ROOT / args.signatures_dir)
)
index_path = args.index if args.index.is_absolute() else (REPO_ROOT / args.index)
if not signatures_dir.exists():

View File

@@ -60,4 +60,4 @@ networks:
volumes:
stirling-data:
stirling-config:
stirling-logs:
stirling-logs:

View File

@@ -56,4 +56,4 @@ networks:
volumes:
stirling-data:
stirling-config:
stirling-logs:
stirling-logs:

View File

@@ -56,4 +56,4 @@ networks:
volumes:
stirling-data:
stirling-config:
stirling-logs:
stirling-logs: