diff --git a/.github/config/dependency-review-config.yml b/.github/config/dependency-review-config.yml index 5df58cdb9..301a5d0b9 100644 --- a/.github/config/dependency-review-config.yml +++ b/.github/config/dependency-review-config.yml @@ -1 +1 @@ -allow-ghsas: GHSA-wrw7-89jp-8q8g \ No newline at end of file +allow-ghsas: GHSA-wrw7-89jp-8q8g diff --git a/.github/scripts/check_language_toml.py b/.github/scripts/check_language_toml.py index 494f90962..6860cc176 100644 --- a/.github/scripts/check_language_toml.py +++ b/.github/scripts/check_language_toml.py @@ -14,12 +14,10 @@ Usage: # Sample for Windows: # python .github/scripts/check_language_toml.py --reference-file frontend/public/locales/en-GB/translation.toml --branch "" --files frontend/public/locales/de-DE/translation.toml frontend/public/locales/fr-FR/translation.toml -import copy import glob import os import argparse import re -import json import tomllib # Python 3.11+ (stdlib) import tomli_w # For writing TOML files @@ -38,7 +36,7 @@ def find_duplicate_keys(file_path, keys=None, prefix=""): duplicates = [] # Load TOML file - with open(file_path, 'rb') as file: + with open(file_path, "rb") as file: data = tomllib.load(file) def process_dict(obj, current_prefix=""): @@ -67,7 +65,7 @@ def parse_toml_file(file_path): :param file_path: Path to the TOML file. :return: Dictionary with flattened keys. """ - with open(file_path, 'rb') as file: + with open(file_path, "rb") as file: data = tomllib.load(file) def flatten_dict(d, parent_key="", sep="."): @@ -193,13 +191,13 @@ def check_for_differences(reference_file, file_list, branch, actor): basename_current_file = os.path.basename(os.path.join(branch, file_normpath)) locale_dir = os.path.basename(os.path.dirname(file_normpath)) - if ( - basename_current_file == basename_reference_file - and locale_dir == "en-GB" - ): + if basename_current_file == basename_reference_file and locale_dir == "en-GB": continue - if not file_normpath.endswith(".toml") or basename_current_file != "translation.toml": + if ( + not file_normpath.endswith(".toml") + or basename_current_file != "translation.toml" + ): continue only_reference_file = False @@ -288,7 +286,9 @@ def check_for_differences(reference_file, file_list, branch, actor): if __name__ == "__main__": - parser = argparse.ArgumentParser(description="Find missing keys in TOML translation files") + parser = argparse.ArgumentParser( + description="Find missing keys in TOML translation files" + ) parser.add_argument( "--actor", required=False, diff --git a/docker/compose/docker-compose.ultra-lite.yml b/docker/compose/docker-compose.ultra-lite.yml index 0639b53ac..0b11bd75e 100644 --- a/docker/compose/docker-compose.ultra-lite.yml +++ b/docker/compose/docker-compose.ultra-lite.yml @@ -54,4 +54,4 @@ services: networks: stirling-network: - driver: bridge \ No newline at end of file + driver: bridge diff --git a/frontend/scripts/generate-icons.js b/frontend/scripts/generate-icons.js index d99414b66..8dc91e6cc 100644 --- a/frontend/scripts/generate-icons.js +++ b/frontend/scripts/generate-icons.js @@ -19,27 +19,27 @@ const debug = (message) => { function scanForUsedIcons() { const usedIcons = new Set(); const srcDir = path.join(__dirname, '..', 'src'); - + info('🔍 Scanning codebase for LocalIcon usage...'); - + if (!fs.existsSync(srcDir)) { console.error('❌ Source directory not found:', srcDir); process.exit(1); } - + // Recursively scan all .tsx and .ts files function scanDirectory(dir) { const files = fs.readdirSync(dir); - + files.forEach(file => { const filePath = path.join(dir, file); const stat = fs.statSync(filePath); - + if (stat.isDirectory()) { scanDirectory(filePath); } else if (file.endsWith('.tsx') || file.endsWith('.ts')) { const content = fs.readFileSync(filePath, 'utf8'); - + // Match LocalIcon usage: const localIconMatches = content.match(/]*icon="([^"]+)"/g); if (localIconMatches) { @@ -51,7 +51,7 @@ function scanForUsedIcons() { } }); } - + // Match old material-symbols-rounded spans: icon-name const spanMatches = content.match(/]*className="[^"]*material-symbols-rounded[^"]*"[^>]*>([^<]+)<\/span>/g); if (spanMatches) { @@ -64,7 +64,7 @@ function scanForUsedIcons() { } }); } - + // Match Icon component usage: const iconMatches = content.match(/]*icon="material-symbols:([^"]+)"/g); if (iconMatches) { @@ -79,12 +79,12 @@ function scanForUsedIcons() { } }); } - + scanDirectory(srcDir); - + const iconArray = Array.from(usedIcons).sort(); info(`📋 Found ${iconArray.length} unique icons across codebase`); - + return iconArray; } @@ -102,7 +102,7 @@ async function main() { const existingSet = JSON.parse(fs.readFileSync(outputPath, 'utf8')); const existingIcons = Object.keys(existingSet.icons || {}).sort(); const currentIcons = [...usedIcons].sort(); - + if (JSON.stringify(existingIcons) === JSON.stringify(currentIcons)) { needsRegeneration = false; info(`✅ Icon set already up-to-date (${usedIcons.length} icons, ${Math.round(fs.statSync(outputPath).size / 1024)}KB)`); @@ -122,7 +122,7 @@ async function main() { // Dynamic import of ES module const { getIcons } = await import('@iconify/utils'); - + // Extract only our used icons from the full set const extractedIcons = getIcons(icons, usedIcons); @@ -183,4 +183,4 @@ export default iconSet; main().catch(error => { console.error('❌ Script failed:', error); process.exit(1); -}); \ No newline at end of file +}); diff --git a/scripts/analyze_pdf_json.py b/scripts/analyze_pdf_json.py index 1a9ba9b21..50656947e 100644 --- a/scripts/analyze_pdf_json.py +++ b/scripts/analyze_pdf_json.py @@ -9,10 +9,10 @@ The script prints size and font statistics so we can confirm whether the lightweight export (no COS dictionaries) is active and how large the font payloads are. """ + from __future__ import annotations import argparse -import base64 import json import math from pathlib import Path @@ -105,7 +105,11 @@ def analyze_fonts(fonts: Iterable[Dict[str, Any]]) -> FontBreakdown: sample_cos_ids.append((font_id, uid)) metadata_bytes += approx_struct_size( - {k: v for k, v in font.items() if k not in {"program", "webProgram", "pdfProgram"}} + { + k: v + for k, v in font.items() + if k not in {"program", "webProgram", "pdfProgram"} + } ) program = font.get("program") @@ -259,18 +263,14 @@ def main() -> None: f" Text payload characters (not counting JSON overhead): " f"{page_stats.text_payload_chars:,}" ) - print( - f" Approx text structure bytes: {human_bytes(page_stats.text_struct_bytes)}" - ) + print(f" Approx text structure bytes: {human_bytes(page_stats.text_struct_bytes)}") print( f" Approx image structure bytes: {human_bytes(page_stats.image_struct_bytes)}" ) print( f" Approx content stream bytes: {human_bytes(page_stats.content_stream_bytes)}" ) - print( - f" Approx annotations bytes: {human_bytes(page_stats.annotations_bytes)}" - ) + print(f" Approx annotations bytes: {human_bytes(page_stats.annotations_bytes)}") if __name__ == "__main__": diff --git a/scripts/convert_cff_to_ttf.py b/scripts/convert_cff_to_ttf.py index 7a7f99270..1e0ff9aea 100644 --- a/scripts/convert_cff_to_ttf.py +++ b/scripts/convert_cff_to_ttf.py @@ -3,6 +3,7 @@ Wrap raw CFF/Type1C data (extracted from PDFs) as OpenType-CFF for web compatibility. Builds proper Unicode cmap from PDF ToUnicode data. """ + import sys import re from pathlib import Path @@ -13,6 +14,7 @@ from fontTools.ttLib.tables._c_m_a_p import cmap_format_4, cmap_format_12 from fontTools.ttLib.tables._n_a_m_e import NameRecord from fontTools.ttLib.tables.O_S_2f_2 import Panose + def parse_unicode_mapping(mapping_path): """ Parse Unicode mapping (either JSON with CharCode→CID→GID→Unicode or raw ToUnicode CMap). @@ -21,23 +23,27 @@ def parse_unicode_mapping(mapping_path): dict[int, int]: GID → Unicode codepoint """ try: - with open(mapping_path, 'rb') as f: - data = f.read().decode('utf-8', errors='ignore') + with open(mapping_path, "rb") as f: + data = f.read().decode("utf-8", errors="ignore") # Try parsing as JSON first (CID font with complete mapping) - if data.strip().startswith('{'): + if data.strip().startswith("{"): import json + try: mapping_data = json.loads(data) - if mapping_data.get('isCID'): + if mapping_data.get("isCID"): # Build GID → Unicode mapping from entries gid_to_unicode = {} - for entry in mapping_data.get('entries', []): - gid = entry['gid'] - unicode_val = entry['unicode'] + for entry in mapping_data.get("entries", []): + gid = entry["gid"] + unicode_val = entry["unicode"] if unicode_val > 0: gid_to_unicode[gid] = unicode_val - print(f"Parsed JSON mapping: {len(gid_to_unicode)} GID→Unicode entries", file=sys.stderr) + print( + f"Parsed JSON mapping: {len(gid_to_unicode)} GID→Unicode entries", + file=sys.stderr, + ) return gid_to_unicode except json.JSONDecodeError: pass @@ -47,7 +53,7 @@ def parse_unicode_mapping(mapping_path): gid_to_unicode = {} # Pattern for bfchar entries - bfchar_pattern = r'<([0-9A-Fa-f]+)>\s*<([0-9A-Fa-f]+)>' + bfchar_pattern = r"<([0-9A-Fa-f]+)>\s*<([0-9A-Fa-f]+)>" for match in re.finditer(bfchar_pattern, data): gid = int(match.group(1), 16) # For non-CID, char code == GID unicode_val = int(match.group(2), 16) @@ -55,7 +61,7 @@ def parse_unicode_mapping(mapping_path): gid_to_unicode[gid] = unicode_val # Pattern for bfrange entries - bfrange_pattern = r'<([0-9A-Fa-f]+)>\s*<([0-9A-Fa-f]+)>\s*<([0-9A-Fa-f]+)>' + bfrange_pattern = r"<([0-9A-Fa-f]+)>\s*<([0-9A-Fa-f]+)>\s*<([0-9A-Fa-f]+)>" for match in re.finditer(bfrange_pattern, data): start_gid = int(match.group(1), 16) end_gid = int(match.group(2), 16) @@ -72,6 +78,7 @@ def parse_unicode_mapping(mapping_path): print(f"Warning: Failed to parse Unicode mapping: {e}", file=sys.stderr) return {} + def wrap_cff_as_otf(input_path, output_path, tounicode_path=None): """ Wrap raw CFF data (from PDF font stream) as OpenType-CFF. @@ -86,7 +93,7 @@ def wrap_cff_as_otf(input_path, output_path, tounicode_path=None): """ try: # Read raw CFF data - with open(input_path, 'rb') as f: + with open(input_path, "rb") as f: cff_data = f.read() # Parse raw CFF data @@ -106,29 +113,35 @@ def wrap_cff_as_otf(input_path, output_path, tounicode_path=None): gid_to_unicode = parse_unicode_mapping(tounicode_path) # Create a new OTF font - otf = TTFont(sfntVersion='OTTO') # 'OTTO' = CFF-flavored OpenType + otf = TTFont(sfntVersion="OTTO") # 'OTTO' = CFF-flavored OpenType # Get glyph names - if hasattr(cff_font, 'charset') and cff_font.charset is not None: - glyph_order = ['.notdef'] + [name for name in cff_font.charset if name != '.notdef'] + if hasattr(cff_font, "charset") and cff_font.charset is not None: + glyph_order = [".notdef"] + [ + name for name in cff_font.charset if name != ".notdef" + ] else: # Fallback to CharStrings keys charstrings = cff_font.CharStrings - glyph_order = ['.notdef'] + [name for name in charstrings.keys() if name != '.notdef'] + glyph_order = [".notdef"] + [ + name for name in charstrings.keys() if name != ".notdef" + ] otf.setGlyphOrder(glyph_order) # === Add CFF table (the actual font outlines) === - cff_table = newTable('CFF ') + cff_table = newTable("CFF ") cff_table.cff = cff_fontset - otf['CFF '] = cff_table + otf["CFF "] = cff_table # === Calculate metrics from CFF === charstrings = cff_font.CharStrings # Get defaults from CFF Private dict - private_dict = getattr(cff_font, 'Private', None) - default_width = getattr(private_dict, 'defaultWidthX', 500) if private_dict else 500 + private_dict = getattr(cff_font, "Private", None) + default_width = ( + getattr(private_dict, "defaultWidthX", 500) if private_dict else 500 + ) # Calculate bounding box, widths, and LSBs x_min = 0 @@ -152,7 +165,7 @@ def wrap_cff_as_otf(input_path, output_path, tounicode_path=None): cs = charstrings[glyph_name] # Get width from charstring - if hasattr(cs, 'width'): + if hasattr(cs, "width"): width = int(cs.width) # Calculate bounds for LSB and bbox @@ -181,7 +194,7 @@ def wrap_cff_as_otf(input_path, output_path, tounicode_path=None): except: pass # Some glyphs may not have outlines - except Exception as e: + except Exception: pass # Use defaults widths[glyph_name] = width @@ -196,7 +209,7 @@ def wrap_cff_as_otf(input_path, output_path, tounicode_path=None): units_per_em = 1000 # Standard for Type1/CFF # === Create head table === - head = newTable('head') + head = newTable("head") head.tableVersion = 1.0 head.fontRevision = 1.0 head.checkSumAdjustment = 0 @@ -214,10 +227,10 @@ def wrap_cff_as_otf(input_path, output_path, tounicode_path=None): head.indexToLocFormat = 0 head.glyphDataFormat = 0 head.lowestRecPPEM = 8 - otf['head'] = head + otf["head"] = head # === Create hhea table with correct metrics === - hhea = newTable('hhea') + hhea = newTable("hhea") hhea.tableVersion = 0x00010000 hhea.ascent = max(y_max, 800) hhea.descent = min(y_min, -200) @@ -235,27 +248,30 @@ def wrap_cff_as_otf(input_path, output_path, tounicode_path=None): hhea.reserved3 = 0 hhea.metricDataFormat = 0 hhea.numberOfHMetrics = len(glyph_order) - otf['hhea'] = hhea + otf["hhea"] = hhea # === Create hmtx table with correct LSBs === - hmtx = newTable('hmtx') + hmtx = newTable("hmtx") hmtx.metrics = {} for glyph_name in glyph_order: - hmtx.metrics[glyph_name] = (widths.get(glyph_name, default_width), lsbs.get(glyph_name, 0)) - otf['hmtx'] = hmtx + hmtx.metrics[glyph_name] = ( + widths.get(glyph_name, default_width), + lsbs.get(glyph_name, 0), + ) + otf["hmtx"] = hmtx # === Create maxp table (simpler for CFF) === - maxp = newTable('maxp') + maxp = newTable("maxp") maxp.tableVersion = 0x00005000 # CFF version (0.5) maxp.numGlyphs = len(glyph_order) - otf['maxp'] = maxp + otf["maxp"] = maxp # === Build Unicode cmap from GID→Unicode mapping === unicode_to_glyph = {} if gid_to_unicode: # Debug: Show first few glyph names to understand naming convention - sample_glyphs = glyph_order[:min(10, len(glyph_order))] + sample_glyphs = glyph_order[: min(10, len(glyph_order))] print(f"Sample glyph names: {sample_glyphs}", file=sys.stderr) # Debug: Show which GIDs we have mappings for @@ -264,7 +280,9 @@ def wrap_cff_as_otf(input_path, output_path, tounicode_path=None): # For CID fonts: glyph names are "cid00123" (5-digit zero-padded) # For non-CID fonts: glyph names vary but GID == array index - is_cid_font = any(gn.startswith('cid') for gn in glyph_order[1:6]) # Check first few non-.notdef glyphs + is_cid_font = any( + gn.startswith("cid") for gn in glyph_order[1:6] + ) # Check first few non-.notdef glyphs for gid, unicode_val in gid_to_unicode.items(): if unicode_val > 0: @@ -285,18 +303,21 @@ def wrap_cff_as_otf(input_path, output_path, tounicode_path=None): glyph_name = glyph_order[gid] unicode_to_glyph[unicode_val] = glyph_name - print(f"Mapped {len(unicode_to_glyph)} Unicode codepoints (isCID={is_cid_font if gid_to_unicode else 'unknown'})", file=sys.stderr) + print( + f"Mapped {len(unicode_to_glyph)} Unicode codepoints (isCID={is_cid_font if gid_to_unicode else 'unknown'})", + file=sys.stderr, + ) # Also try to map from glyph names (uni0041 → U+0041) for glyph_name in glyph_order: - if glyph_name.startswith('uni') and len(glyph_name) == 7: + if glyph_name.startswith("uni") and len(glyph_name) == 7: try: unicode_val = int(glyph_name[3:], 16) if unicode_val not in unicode_to_glyph: unicode_to_glyph[unicode_val] = glyph_name except: pass - elif glyph_name.startswith('u') and len(glyph_name) >= 5: + elif glyph_name.startswith("u") and len(glyph_name) >= 5: try: unicode_val = int(glyph_name[1:], 16) if unicode_val not in unicode_to_glyph: @@ -305,14 +326,14 @@ def wrap_cff_as_otf(input_path, output_path, tounicode_path=None): pass # === Create cmap table === - cmap = newTable('cmap') + cmap = newTable("cmap") cmap.tableVersion = 0 cmap_tables = [] # Windows Unicode BMP (format 4) - required cmap4_win = cmap_format_4(4) cmap4_win.platformID = 3 # Windows - cmap4_win.platEncID = 1 # Unicode BMP + cmap4_win.platEncID = 1 # Unicode BMP cmap4_win.language = 0 cmap4_win.cmap = {cp: gn for cp, gn in unicode_to_glyph.items() if cp <= 0xFFFF} cmap_tables.append(cmap4_win) @@ -329,23 +350,27 @@ def wrap_cff_as_otf(input_path, output_path, tounicode_path=None): # Mac Unicode (format 4) - for compatibility cmap4_mac = cmap_format_4(4) cmap4_mac.platformID = 1 # Mac - cmap4_mac.platEncID = 0 # Roman + cmap4_mac.platEncID = 0 # Roman cmap4_mac.language = 0 cmap4_mac.cmap = {cp: gn for cp, gn in unicode_to_glyph.items() if cp <= 0xFFFF} cmap_tables.append(cmap4_mac) - cmap.tables = [t for t in cmap_tables if t.cmap] or [cmap4_win] # Ensure at least one - otf['cmap'] = cmap + cmap.tables = [t for t in cmap_tables if t.cmap] or [ + cmap4_win + ] # Ensure at least one + otf["cmap"] = cmap - print(f"Built cmap with {len(unicode_to_glyph)} Unicode mappings", file=sys.stderr) + print( + f"Built cmap with {len(unicode_to_glyph)} Unicode mappings", file=sys.stderr + ) # === Create OS/2 table with correct metrics === - os2 = newTable('OS/2') + os2 = newTable("OS/2") os2.version = 4 os2.xAvgCharWidth = int(sum(widths.values()) / len(widths)) if widths else 500 os2.usWeightClass = 400 # Normal - os2.usWidthClass = 5 # Medium - os2.fsType = 0 # Installable embedding + os2.usWidthClass = 5 # Medium + os2.fsType = 0 # Installable embedding os2.ySubscriptXSize = 650 os2.ySubscriptYSize = 600 os2.ySubscriptXOffset = 0 @@ -375,7 +400,7 @@ def wrap_cff_as_otf(input_path, output_path, tounicode_path=None): os2.ulUnicodeRange2 = 0 os2.ulUnicodeRange3 = 0 os2.ulUnicodeRange4 = 0 - os2.achVendID = 'SPDF' + os2.achVendID = "SPDF" os2.fsSelection = 0x0040 # REGULAR bit # Set character index range from actual cmap @@ -385,7 +410,7 @@ def wrap_cff_as_otf(input_path, output_path, tounicode_path=None): os2.usLastCharIndex = codepoints[-1] else: os2.usFirstCharIndex = 0x20 # space - os2.usLastCharIndex = 0x7E # tilde + os2.usLastCharIndex = 0x7E # tilde # Typo metrics match hhea os2.sTypoAscender = hhea.ascent @@ -403,10 +428,10 @@ def wrap_cff_as_otf(input_path, output_path, tounicode_path=None): os2.usDefaultChar = 0 os2.usBreakChar = 32 os2.usMaxContext = 0 - otf['OS/2'] = os2 + otf["OS/2"] = os2 # === Create name table with Windows and Mac records === - name = newTable('name') + name = newTable("name") name.names = [] # Get font name from CFF if available @@ -418,7 +443,7 @@ def wrap_cff_as_otf(input_path, output_path, tounicode_path=None): 3: f"Stirling-PDF: {font_name}", # Unique ID 4: font_name, # Full Name 5: "Version 1.0", # Version - 6: font_name.replace(' ', '-'), # PostScript Name + 6: font_name.replace(" ", "-"), # PostScript Name } # Add both Windows and Mac name records @@ -441,10 +466,10 @@ def wrap_cff_as_otf(input_path, output_path, tounicode_path=None): rec_mac.string = value name.names.append(rec_mac) - otf['name'] = name + otf["name"] = name # === Create post table (format 3.0 for smaller web fonts) === - post = newTable('post') + post = newTable("post") post.formatType = 3.0 # No glyph names (smaller, web-optimized) post.italicAngle = 0 post.underlinePosition = -100 @@ -454,7 +479,7 @@ def wrap_cff_as_otf(input_path, output_path, tounicode_path=None): post.maxMemType42 = 0 post.minMemType1 = 0 post.maxMemType1 = 0 - otf['post'] = post + otf["post"] = post # Save the OTF font otf.save(output_path) @@ -465,12 +490,17 @@ def wrap_cff_as_otf(input_path, output_path, tounicode_path=None): except Exception as e: print(f"ERROR: Conversion failed: {str(e)}", file=sys.stderr) import traceback + traceback.print_exc(file=sys.stderr) return False + def main(): if len(sys.argv) < 3: - print("Usage: convert_cff_to_ttf.py [tounicode.cmap]", file=sys.stderr) + print( + "Usage: convert_cff_to_ttf.py [tounicode.cmap]", + file=sys.stderr, + ) sys.exit(1) input_path = Path(sys.argv[1]) @@ -485,8 +515,13 @@ def main(): print(f"Warning: ToUnicode file not found: {tounicode_path}", file=sys.stderr) tounicode_path = None - success = wrap_cff_as_otf(str(input_path), str(output_path), str(tounicode_path) if tounicode_path else None) + success = wrap_cff_as_otf( + str(input_path), + str(output_path), + str(tounicode_path) if tounicode_path else None, + ) sys.exit(0 if success else 1) -if __name__ == '__main__': + +if __name__ == "__main__": main() diff --git a/scripts/download_pdf_samples.py b/scripts/download_pdf_samples.py index 6a882c56f..06055f677 100644 --- a/scripts/download_pdf_samples.py +++ b/scripts/download_pdf_samples.py @@ -27,7 +27,7 @@ import os import re import sys from pathlib import Path -from typing import Iterable, List, Optional, Set, Tuple +from typing import List, Optional, Set, Tuple from urllib.parse import unquote, urlparse import requests @@ -121,10 +121,10 @@ def build_filename(url: str, output_dir: Path) -> Path: def download_pdf( - url: str, - output_dir: Path, - timeout: int, - overwrite: bool, + url: str, + output_dir: Path, + timeout: int, + overwrite: bool, ) -> Tuple[str, Optional[Path], Optional[str]]: try: dest = build_filename(url, output_dir) @@ -139,8 +139,12 @@ def download_pdf( # Peek into the first bytes to be safe peek = response.raw.read(5, decode_content=True) if not peek.startswith(b"%PDF"): - return url, None, f"Skipping non-PDF content-type ({content_type or 'unknown'})" - content = peek + response.content[len(peek):] + return ( + url, + None, + f"Skipping non-PDF content-type ({content_type or 'unknown'})", + ) + content = peek + response.content[len(peek) :] else: content = response.content @@ -157,7 +161,9 @@ def main() -> None: output_dir = Path(args.output_dir).resolve() output_dir.mkdir(parents=True, exist_ok=True) - print(f"Downloading {len(urls)} PDFs to {output_dir} using {args.workers} workers...") + print( + f"Downloading {len(urls)} PDFs to {output_dir} using {args.workers} workers..." + ) successes = 0 skipped = 0 @@ -184,7 +190,9 @@ def main() -> None: print(f"[OK] {url} -> {path}") print() - print(f"Completed. Success: {successes}, Skipped: {skipped}, Failures: {len(failures)}") + print( + f"Completed. Success: {successes}, Skipped: {skipped}, Failures: {len(failures)}" + ) if failures: print("Failures:") for url, error in failures: diff --git a/scripts/harvest_type3_fonts.py b/scripts/harvest_type3_fonts.py index 5edb1b2a9..445d4f363 100644 --- a/scripts/harvest_type3_fonts.py +++ b/scripts/harvest_type3_fonts.py @@ -28,13 +28,15 @@ import shlex import subprocess import sys from pathlib import Path -from typing import Dict, Iterable, List, Optional, Sequence, Tuple +from typing import Dict, List, Sequence, Tuple REPO_ROOT = Path(__file__).resolve().parents[1] def parse_args() -> argparse.Namespace: - parser = argparse.ArgumentParser(description="Bulk collect Type3 font signatures from PDFs.") + parser = argparse.ArgumentParser( + description="Bulk collect Type3 font signatures from PDFs." + ) parser.add_argument( "--input", nargs="+", @@ -145,7 +147,7 @@ def run_signature_tool( if pretty: args += " --pretty" # Use shell invocation so the quoted --args string is parsed correctly by Gradle. - cmd = f"{gradle_cmd} -q :proprietary:type3SignatureTool --args=\"{args}\"" + cmd = f'{gradle_cmd} -q :proprietary:type3SignatureTool --args="{args}"' completed = subprocess.run( cmd, shell=True, @@ -207,11 +209,15 @@ def main() -> None: try: payload = load_signature_file(signature_path) except Exception as exc: - print(f"[WARN] Failed to parse cached signature {signature_path}: {exc}") + print( + f"[WARN] Failed to parse cached signature {signature_path}: {exc}" + ) payload = None else: try: - run_signature_tool(args.gradle_cmd, pdf, signature_path, args.pretty, REPO_ROOT) + run_signature_tool( + args.gradle_cmd, pdf, signature_path, args.pretty, REPO_ROOT + ) except Exception as exc: print(f"[ERROR] Harvest failed for {pdf}: {exc}", file=sys.stderr) continue diff --git a/scripts/index_type3_catalogue.py b/scripts/index_type3_catalogue.py index 49dce500e..69e7c33e4 100644 --- a/scripts/index_type3_catalogue.py +++ b/scripts/index_type3_catalogue.py @@ -1,5 +1,6 @@ #!/usr/bin/env python3 """Build a Type3 font catalogue from sample PDFs.""" + import argparse import json import subprocess diff --git a/scripts/summarize_type3_signatures.py b/scripts/summarize_type3_signatures.py index ae8706935..98057c679 100644 --- a/scripts/summarize_type3_signatures.py +++ b/scripts/summarize_type3_signatures.py @@ -18,7 +18,9 @@ from typing import Dict, List def parse_args() -> argparse.Namespace: - parser = argparse.ArgumentParser(description="Summarize Type3 signature JSON dumps.") + parser = argparse.ArgumentParser( + description="Summarize Type3 signature JSON dumps." + ) parser.add_argument( "--input", default="docs/type3/signatures", @@ -53,7 +55,9 @@ def load_signatures(directory: Path) -> Dict[str, List[dict]]: return inventory -def write_markdown(inventory: Dict[str, List[dict]], output: Path, input_dir: Path) -> None: +def write_markdown( + inventory: Dict[str, List[dict]], output: Path, input_dir: Path +) -> None: lines: List[str] = [] lines.append("# Type3 Signature Inventory") lines.append("") @@ -72,7 +76,9 @@ def write_markdown(inventory: Dict[str, List[dict]], output: Path, input_dir: Pa for entry in entries: signature = entry.get("signature") or "—" sample = Path(entry["source"]).name - glyph_count = entry.get("glyphCount") if entry.get("glyphCount") is not None else "—" + glyph_count = ( + entry.get("glyphCount") if entry.get("glyphCount") is not None else "—" + ) coverage = entry.get("glyphCoverage") or [] preview = ", ".join(str(code) for code in coverage[:10]) lines.append(f"| `{signature}` | `{sample}` | {glyph_count} | {preview} |") diff --git a/scripts/translations/ai_translation_helper.py b/scripts/translations/ai_translation_helper.py index bb6fff5e9..59aed55b0 100644 --- a/scripts/translations/ai_translation_helper.py +++ b/scripts/translations/ai_translation_helper.py @@ -7,10 +7,8 @@ TOML format only. """ import json -import os -import sys from pathlib import Path -from typing import Dict, List, Set, Tuple, Any, Optional +from typing import Dict, List, Any import argparse import re from datetime import datetime @@ -27,7 +25,7 @@ class AITranslationHelper: def _load_translation_file(self, file_path: Path) -> Dict: """Load TOML translation file.""" try: - with open(file_path, 'rb') as f: + with open(file_path, "rb") as f: return tomllib.load(f) except (FileNotFoundError, Exception) as e: print(f"Error loading {file_path}: {e}") @@ -35,27 +33,31 @@ class AITranslationHelper: def _save_translation_file(self, data: Dict, file_path: Path) -> None: """Save TOML translation file.""" - with open(file_path, 'wb') as f: + with open(file_path, "wb") as f: tomli_w.dump(data, f) - def create_ai_batch_file(self, languages: List[str], output_file: Path, - max_entries_per_language: int = 50) -> None: + def create_ai_batch_file( + self, + languages: List[str], + output_file: Path, + max_entries_per_language: int = 50, + ) -> None: """Create a batch file for AI translation with multiple languages.""" golden_truth = self._load_translation_file(self.golden_truth_file) batch_data = { - 'metadata': { - 'created_at': datetime.now().isoformat(), - 'source_language': 'en-GB', - 'target_languages': languages, - 'max_entries_per_language': max_entries_per_language, - 'instructions': { - 'format': 'Translate each entry maintaining JSON structure and placeholder variables like {n}, {total}, {filename}', - 'context': 'This is for a PDF manipulation tool. Keep technical terms consistent.', - 'placeholders': 'Preserve all placeholders: {n}, {total}, {filename}, etc.', - 'style': 'Keep translations concise and user-friendly' - } + "metadata": { + "created_at": datetime.now().isoformat(), + "source_language": "en-GB", + "target_languages": languages, + "max_entries_per_language": max_entries_per_language, + "instructions": { + "format": "Translate each entry maintaining JSON structure and placeholder variables like {n}, {total}, {filename}", + "context": "This is for a PDF manipulation tool. Keep technical terms consistent.", + "placeholders": "Preserve all placeholders: {n}, {total}, {filename}, etc.", + "style": "Keep translations concise and user-friendly", + }, }, - 'translations': {} + "translations": {}, } for lang in languages: @@ -72,41 +74,57 @@ class AITranslationHelper: untranslated = self._find_untranslated_entries(golden_truth, lang_data) # Limit entries if specified - if max_entries_per_language and len(untranslated) > max_entries_per_language: + if ( + max_entries_per_language + and len(untranslated) > max_entries_per_language + ): # Prioritize by key importance - untranslated = self._prioritize_translation_keys(untranslated, max_entries_per_language) + untranslated = self._prioritize_translation_keys( + untranslated, max_entries_per_language + ) - batch_data['translations'][lang] = {} + batch_data["translations"][lang] = {} for key, value in untranslated.items(): - batch_data['translations'][lang][key] = { - 'original': value, - 'translated': '', # AI fills this - 'context': self._get_key_context(key) + batch_data["translations"][lang][key] = { + "original": value, + "translated": "", # AI fills this + "context": self._get_key_context(key), } # Always save batch files as JSON for compatibility - with open(output_file, 'w', encoding='utf-8') as f: + with open(output_file, "w", encoding="utf-8") as f: json.dump(batch_data, f, indent=2, ensure_ascii=False) - total_entries = sum(len(lang_data) for lang_data in batch_data['translations'].values()) + total_entries = sum( + len(lang_data) for lang_data in batch_data["translations"].values() + ) print(f"Created AI batch file: {output_file}") print(f"Total entries to translate: {total_entries}") - def _find_untranslated_entries(self, golden_truth: Dict, lang_data: Dict) -> Dict[str, str]: + def _find_untranslated_entries( + self, golden_truth: Dict, lang_data: Dict + ) -> Dict[str, str]: """Find entries that need translation.""" golden_flat = self._flatten_dict(golden_truth) lang_flat = self._flatten_dict(lang_data) untranslated = {} for key, value in golden_flat.items(): - if (key not in lang_flat or - lang_flat[key] == value or - (isinstance(lang_flat[key], str) and lang_flat[key].startswith("[UNTRANSLATED]"))): + if ( + key not in lang_flat + or lang_flat[key] == value + or ( + isinstance(lang_flat[key], str) + and lang_flat[key].startswith("[UNTRANSLATED]") + ) + ): if not self._is_expected_identical(key, value): untranslated[key] = value return untranslated - def _flatten_dict(self, d: Dict, parent_key: str = '', separator: str = '.') -> Dict[str, Any]: + def _flatten_dict( + self, d: Dict, parent_key: str = "", separator: str = "." + ) -> Dict[str, Any]: """Flatten nested dictionary.""" items = [] for k, v in d.items(): @@ -119,25 +137,27 @@ class AITranslationHelper: def _is_expected_identical(self, key: str, value: str) -> bool: """Check if key should be identical across languages.""" - if str(value).strip() in ['ltr', 'rtl', 'True', 'False', 'true', 'false']: + if str(value).strip() in ["ltr", "rtl", "True", "False", "true", "false"]: return True - return 'language.direction' in key.lower() + return "language.direction" in key.lower() - def _prioritize_translation_keys(self, untranslated: Dict[str, str], max_count: int) -> Dict[str, str]: + def _prioritize_translation_keys( + self, untranslated: Dict[str, str], max_count: int + ) -> Dict[str, str]: """Prioritize which keys to translate first based on importance.""" # Define priority order (higher score = higher priority) priority_patterns = [ - ('title', 10), - ('header', 9), - ('submit', 8), - ('selectText', 7), - ('prompt', 6), - ('desc', 5), - ('error', 8), - ('warning', 7), - ('save', 8), - ('download', 8), - ('upload', 7), + ("title", 10), + ("header", 9), + ("submit", 8), + ("selectText", 7), + ("prompt", 6), + ("desc", 5), + ("error", 8), + ("warning", 7), + ("save", 8), + ("download", 8), + ("upload", 7), ] scored_keys = [] @@ -154,89 +174,99 @@ class AITranslationHelper: def _get_key_context(self, key: str) -> str: """Get contextual information for a translation key.""" - parts = key.split('.') + parts = key.split(".") contexts = { - 'addPageNumbers': 'Feature for adding page numbers to PDFs', - 'compress': 'PDF compression functionality', - 'merge': 'PDF merging functionality', - 'split': 'PDF splitting functionality', - 'rotate': 'PDF rotation functionality', - 'convert': 'File conversion functionality', - 'security': 'PDF security and permissions', - 'metadata': 'PDF metadata editing', - 'watermark': 'Adding watermarks to PDFs', - 'overlay': 'PDF overlay functionality', - 'extract': 'Extracting content from PDFs' + "addPageNumbers": "Feature for adding page numbers to PDFs", + "compress": "PDF compression functionality", + "merge": "PDF merging functionality", + "split": "PDF splitting functionality", + "rotate": "PDF rotation functionality", + "convert": "File conversion functionality", + "security": "PDF security and permissions", + "metadata": "PDF metadata editing", + "watermark": "Adding watermarks to PDFs", + "overlay": "PDF overlay functionality", + "extract": "Extracting content from PDFs", } if len(parts) > 0: main_section = parts[0] - context = contexts.get(main_section, f'Part of {main_section} functionality') + context = contexts.get( + main_section, f"Part of {main_section} functionality" + ) if len(parts) > 1: - context += f', specifically for {parts[-1]}' + context += f", specifically for {parts[-1]}" return context - return 'General application text' + return "General application text" def validate_ai_translations(self, batch_file: Path) -> Dict[str, List[str]]: """Validate AI translations for common issues.""" # Batch files are always JSON - with open(batch_file, 'r', encoding='utf-8') as f: + with open(batch_file, "r", encoding="utf-8") as f: batch_data = json.load(f) - issues = {'errors': [], 'warnings': []} + issues = {"errors": [], "warnings": []} - for lang, translations in batch_data.get('translations', {}).items(): + for lang, translations in batch_data.get("translations", {}).items(): for key, translation_data in translations.items(): - original = translation_data.get('original', '') - translated = translation_data.get('translated', '') + original = translation_data.get("original", "") + translated = translation_data.get("translated", "") if not translated: - issues['errors'].append(f"{lang}.{key}: Missing translation") + issues["errors"].append(f"{lang}.{key}: Missing translation") continue # Check for placeholder preservation - original_placeholders = re.findall(r'\{[^}]+\}', original) - translated_placeholders = re.findall(r'\{[^}]+\}', translated) + original_placeholders = re.findall(r"\{[^}]+\}", original) + translated_placeholders = re.findall(r"\{[^}]+\}", translated) if set(original_placeholders) != set(translated_placeholders): - issues['warnings'].append( + issues["warnings"].append( f"{lang}.{key}: Placeholder mismatch - Original: {original_placeholders}, " f"Translated: {translated_placeholders}" ) # Check if translation is identical to original (might be untranslated) - if translated == original and not self._is_expected_identical(key, original): - issues['warnings'].append(f"{lang}.{key}: Translation identical to original") + if translated == original and not self._is_expected_identical( + key, original + ): + issues["warnings"].append( + f"{lang}.{key}: Translation identical to original" + ) # Check for common AI translation artifacts - artifacts = ['[TRANSLATE]', '[TODO]', 'UNTRANSLATED', '{{', '}}'] + artifacts = ["[TRANSLATE]", "[TODO]", "UNTRANSLATED", "{{", "}}"] for artifact in artifacts: if artifact in translated: - issues['errors'].append(f"{lang}.{key}: Contains translation artifact: {artifact}") + issues["errors"].append( + f"{lang}.{key}: Contains translation artifact: {artifact}" + ) return issues - def apply_ai_batch_translations(self, batch_file: Path, validate: bool = True) -> Dict[str, Any]: + def apply_ai_batch_translations( + self, batch_file: Path, validate: bool = True + ) -> Dict[str, Any]: """Apply translations from AI batch file to individual language files.""" # Batch files are always JSON - with open(batch_file, 'r', encoding='utf-8') as f: + with open(batch_file, "r", encoding="utf-8") as f: batch_data = json.load(f) - results = {'applied': {}, 'errors': [], 'warnings': []} + results = {"applied": {}, "errors": [], "warnings": []} if validate: validation_issues = self.validate_ai_translations(batch_file) - if validation_issues['errors']: + if validation_issues["errors"]: print("Validation errors found. Fix these before applying:") - for error in validation_issues['errors']: + for error in validation_issues["errors"]: print(f" ERROR: {error}") return results - if validation_issues['warnings']: + if validation_issues["warnings"]: print("Validation warnings (review recommended):") - for warning in validation_issues['warnings'][:10]: + for warning in validation_issues["warnings"][:10]: print(f" WARNING: {warning}") - for lang, translations in batch_data.get('translations', {}).items(): + for lang, translations in batch_data.get("translations", {}).items(): lang_dir = self.locales_dir / lang toml_file = lang_dir / "translation.toml" @@ -249,42 +279,48 @@ class AITranslationHelper: applied_count = 0 for key, translation_data in translations.items(): - translated = translation_data.get('translated', '').strip() - if translated and translated != translation_data.get('original', ''): + translated = translation_data.get("translated", "").strip() + if translated and translated != translation_data.get("original", ""): self._set_nested_value(lang_data, key, translated) applied_count += 1 if applied_count > 0: self._save_translation_file(lang_data, toml_file) - results['applied'][lang] = applied_count + results["applied"][lang] = applied_count print(f"Applied {applied_count} translations to {lang}") return results def _set_nested_value(self, data: Dict, key_path: str, value: Any) -> None: """Set value in nested dict using dot notation.""" - keys = key_path.split('.') + keys = key_path.split(".") current = data for key in keys[:-1]: if key not in current: current[key] = {} elif not isinstance(current[key], dict): # If the current value is not a dict, we can't nest into it - print(f"Warning: Converting non-dict value at '{key}' to dict to allow nesting") + print( + f"Warning: Converting non-dict value at '{key}' to dict to allow nesting" + ) current[key] = {} current = current[key] current[keys[-1]] = value - def export_for_external_translation(self, languages: List[str], output_format: str = 'csv') -> None: + def export_for_external_translation( + self, languages: List[str], output_format: str = "csv" + ) -> None: """Export translations for external translation services.""" golden_truth = self._load_translation_file(self.golden_truth_file) golden_flat = self._flatten_dict(golden_truth) - if output_format == 'csv': - output_file = Path(f'translations_export_{datetime.now().strftime("%Y%m%d")}.csv') + if output_format == "csv": + output_file = Path( + f"translations_export_{datetime.now().strftime('%Y%m%d')}.csv" + ) - with open(output_file, 'w', newline='', encoding='utf-8') as csvfile: - fieldnames = ['key', 'context', 'en_GB'] + languages + with open(output_file, "w", newline="", encoding="utf-8") as csvfile: + fieldnames = ["key", "context", "en_GB"] + languages writer = csv.DictWriter(csvfile, fieldnames=fieldnames) writer.writeheader() @@ -293,9 +329,9 @@ class AITranslationHelper: continue row = { - 'key': key, - 'context': self._get_key_context(key), - 'en_GB': en_value + "key": key, + "context": self._get_key_context(key), + "en_GB": en_value, } for lang in languages: @@ -305,28 +341,30 @@ class AITranslationHelper: if toml_file.exists(): lang_data = self._load_translation_file(toml_file) lang_flat = self._flatten_dict(lang_data) - value = lang_flat.get(key, '') - if value.startswith('[UNTRANSLATED]'): - value = '' + value = lang_flat.get(key, "") + if value.startswith("[UNTRANSLATED]"): + value = "" row[lang] = value else: - row[lang] = '' + row[lang] = "" writer.writerow(row) print(f"Exported to {output_file}") - elif output_format == 'json': - output_file = Path(f'translations_export_{datetime.now().strftime("%Y%m%d")}.json') - export_data = {'languages': languages, 'translations': {}} + elif output_format == "json": + output_file = Path( + f"translations_export_{datetime.now().strftime('%Y%m%d')}.json" + ) + export_data = {"languages": languages, "translations": {}} for key, en_value in golden_flat.items(): if self._is_expected_identical(key, en_value): continue - export_data['translations'][key] = { - 'en_GB': en_value, - 'context': self._get_key_context(key) + export_data["translations"][key] = { + "en_GB": en_value, + "context": self._get_key_context(key), } for lang in languages: @@ -336,51 +374,64 @@ class AITranslationHelper: if toml_file.exists(): lang_data = self._load_translation_file(toml_file) lang_flat = self._flatten_dict(lang_data) - value = lang_flat.get(key, '') - if value.startswith('[UNTRANSLATED]'): - value = '' - export_data['translations'][key][lang] = value + value = lang_flat.get(key, "") + if value.startswith("[UNTRANSLATED]"): + value = "" + export_data["translations"][key][lang] = value # Export files are always JSON - with open(output_file, 'w', encoding='utf-8') as f: + with open(output_file, "w", encoding="utf-8") as f: json.dump(export_data, f, indent=2, ensure_ascii=False) print(f"Exported to {output_file}") def main(): parser = argparse.ArgumentParser( - description='AI Translation Helper', - epilog='Works with TOML translation files.' + description="AI Translation Helper", epilog="Works with TOML translation files." + ) + parser.add_argument( + "--locales-dir", + default="frontend/public/locales", + help="Path to locales directory", ) - parser.add_argument('--locales-dir', default='frontend/public/locales', - help='Path to locales directory') - subparsers = parser.add_subparsers(dest='command', help='Available commands') + subparsers = parser.add_subparsers(dest="command", help="Available commands") # Create batch command - batch_parser = subparsers.add_parser('create-batch', help='Create AI translation batch file') - batch_parser.add_argument('--languages', nargs='+', required=True, - help='Language codes to include') - batch_parser.add_argument('--output', required=True, help='Output batch file') - batch_parser.add_argument('--max-entries', type=int, default=100, - help='Max entries per language') + batch_parser = subparsers.add_parser( + "create-batch", help="Create AI translation batch file" + ) + batch_parser.add_argument( + "--languages", nargs="+", required=True, help="Language codes to include" + ) + batch_parser.add_argument("--output", required=True, help="Output batch file") + batch_parser.add_argument( + "--max-entries", type=int, default=100, help="Max entries per language" + ) # Validate command - validate_parser = subparsers.add_parser('validate', help='Validate AI translations') - validate_parser.add_argument('batch_file', help='Batch file to validate') + validate_parser = subparsers.add_parser("validate", help="Validate AI translations") + validate_parser.add_argument("batch_file", help="Batch file to validate") # Apply command - apply_parser = subparsers.add_parser('apply-batch', help='Apply AI batch translations') - apply_parser.add_argument('batch_file', help='Batch file with translations') - apply_parser.add_argument('--skip-validation', action='store_true', - help='Skip validation before applying') + apply_parser = subparsers.add_parser( + "apply-batch", help="Apply AI batch translations" + ) + apply_parser.add_argument("batch_file", help="Batch file with translations") + apply_parser.add_argument( + "--skip-validation", action="store_true", help="Skip validation before applying" + ) # Export command - export_parser = subparsers.add_parser('export', help='Export for external translation') - export_parser.add_argument('--languages', nargs='+', required=True, - help='Language codes to export') - export_parser.add_argument('--format', choices=['csv', 'json'], default='csv', - help='Export format') + export_parser = subparsers.add_parser( + "export", help="Export for external translation" + ) + export_parser.add_argument( + "--languages", nargs="+", required=True, help="Language codes to export" + ) + export_parser.add_argument( + "--format", choices=["csv", "json"], default="csv", help="Export format" + ) args = parser.parse_args() @@ -390,40 +441,39 @@ def main(): helper = AITranslationHelper(args.locales_dir) - if args.command == 'create-batch': + if args.command == "create-batch": output_file = Path(args.output) helper.create_ai_batch_file(args.languages, output_file, args.max_entries) - elif args.command == 'validate': + elif args.command == "validate": batch_file = Path(args.batch_file) issues = helper.validate_ai_translations(batch_file) - if issues['errors']: + if issues["errors"]: print("ERRORS:") - for error in issues['errors']: + for error in issues["errors"]: print(f" - {error}") - if issues['warnings']: + if issues["warnings"]: print("WARNINGS:") - for warning in issues['warnings']: + for warning in issues["warnings"]: print(f" - {warning}") - if not issues['errors'] and not issues['warnings']: + if not issues["errors"] and not issues["warnings"]: print("No validation issues found!") - elif args.command == 'apply-batch': + elif args.command == "apply-batch": batch_file = Path(args.batch_file) results = helper.apply_ai_batch_translations( - batch_file, - validate=not args.skip_validation + batch_file, validate=not args.skip_validation ) - total_applied = sum(results['applied'].values()) + total_applied = sum(results["applied"].values()) print(f"Total translations applied: {total_applied}") - elif args.command == 'export': + elif args.command == "export": helper.export_for_external_translation(args.languages, args.format) if __name__ == "__main__": - main() \ No newline at end of file + main() diff --git a/scripts/translations/auto_translate.py b/scripts/translations/auto_translate.py index 3e406433f..05328a5fd 100644 --- a/scripts/translations/auto_translate.py +++ b/scripts/translations/auto_translate.py @@ -19,9 +19,9 @@ import tomllib def run_command(cmd, description=""): """Run a shell command and return success status.""" if description: - print(f"\n{'='*60}") + print(f"\n{'=' * 60}") print(f"Step: {description}") - print(f"{'='*60}") + print(f"{'=' * 60}") result = subprocess.run(cmd, shell=True, capture_output=True, text=True) @@ -40,29 +40,35 @@ def find_translation_file(lang_dir): return toml_file return None + def load_translation_file(file_path): """Load TOML translation file.""" - with open(file_path, 'rb') as f: + with open(file_path, "rb") as f: return tomllib.load(f) + def extract_untranslated(language_code, batch_size=500, include_existing=False): """Extract untranslated entries and split into batches.""" - mode = "all untranslated (including existing)" if include_existing else "new (missing)" + mode = ( + "all untranslated (including existing)" if include_existing else "new (missing)" + ) print(f"\n🔍 Extracting {mode} entries for {language_code}...") # Load files - golden_path = find_translation_file(Path('frontend/public/locales/en-GB')) - lang_path = find_translation_file(Path(f'frontend/public/locales/{language_code}')) + golden_path = find_translation_file(Path("frontend/public/locales/en-GB")) + lang_path = find_translation_file(Path(f"frontend/public/locales/{language_code}")) if not golden_path: - print(f"Error: Golden truth file not found in frontend/public/locales/en-GB") + print("Error: Golden truth file not found in frontend/public/locales/en-GB") return None if not lang_path: - print(f"Error: Language file not found in frontend/public/locales/{language_code}") + print( + f"Error: Language file not found in frontend/public/locales/{language_code}" + ) return None - def flatten_dict(d, parent_key='', separator='.'): + def flatten_dict(d, parent_key="", separator="."): items = [] for k, v in d.items(): new_key = f"{parent_key}{separator}{k}" if parent_key else k @@ -76,7 +82,7 @@ def extract_untranslated(language_code, batch_size=500, include_existing=False): lang_data = load_translation_file(lang_path) if not golden or not lang_data: - print(f"Error: Failed to load translation files") + print("Error: Failed to load translation files") return None golden_flat = flatten_dict(golden) @@ -87,9 +93,14 @@ def extract_untranslated(language_code, batch_size=500, include_existing=False): for key, value in golden_flat.items(): if include_existing: # Include missing keys, keys with English values, and [UNTRANSLATED] keys - if (key not in lang_flat or - lang_flat.get(key) == value or - (isinstance(lang_flat.get(key), str) and lang_flat.get(key).startswith("[UNTRANSLATED]"))): + if ( + key not in lang_flat + or lang_flat.get(key) == value + or ( + isinstance(lang_flat.get(key), str) + and lang_flat.get(key).startswith("[UNTRANSLATED]") + ) + ): untranslated[key] = value else: # Only include missing keys (not in target file at all) @@ -108,16 +119,16 @@ def extract_untranslated(language_code, batch_size=500, include_existing=False): num_batches = (total + batch_size - 1) // batch_size batch_files = [] - lang_code_safe = language_code.replace('-', '_') + lang_code_safe = language_code.replace("-", "_") for i in range(num_batches): start = i * batch_size end = min((i + 1) * batch_size, total) batch = dict(entries[start:end]) - filename = f'{lang_code_safe}_batch_{i+1}_of_{num_batches}.json' - with open(filename, 'w', encoding='utf-8') as f: - json.dump(batch, f, ensure_ascii=False, separators=(',', ':')) + filename = f"{lang_code_safe}_batch_{i + 1}_of_{num_batches}.json" + with open(filename, "w", encoding="utf-8") as f: + json.dump(batch, f, ensure_ascii=False, separators=(",", ":")) batch_files.append(filename) print(f" Created {filename} with {len(batch)} entries") @@ -131,7 +142,7 @@ def translate_batches(batch_files, language_code, api_key, timeout=600): return [] print(f"\n🤖 Translating {len(batch_files)} batches using GPT-5...") - print(f"Timeout: {timeout}s ({timeout//60} minutes) per batch") + print(f"Timeout: {timeout}s ({timeout // 60} minutes) per batch") translated_files = [] @@ -142,7 +153,9 @@ def translate_batches(batch_files, language_code, api_key, timeout=600): cmd = f'python3 scripts/translations/batch_translator.py "{batch_file}" --language {language_code} --api-key "{api_key}"' # Run with timeout - result = subprocess.run(cmd, shell=True, capture_output=True, text=True, timeout=timeout) + result = subprocess.run( + cmd, shell=True, capture_output=True, text=True, timeout=timeout + ) if result.stdout: print(result.stdout) @@ -153,7 +166,7 @@ def translate_batches(batch_files, language_code, api_key, timeout=600): print(f"✗ Failed to translate {batch_file}") return None - translated_file = batch_file.replace('.json', '_translated.json') + translated_file = batch_file.replace(".json", "_translated.json") translated_files.append(translated_file) # Small delay between batches @@ -177,14 +190,14 @@ def merge_translations(translated_files, language_code): print(f"Error: Translated file not found: {filename}") return None - with open(filename, 'r', encoding='utf-8') as f: + with open(filename, "r", encoding="utf-8") as f: merged.update(json.load(f)) - lang_code_safe = language_code.replace('-', '_') - merged_file = f'{lang_code_safe}_merged.json' + lang_code_safe = language_code.replace("-", "_") + merged_file = f"{lang_code_safe}_merged.json" - with open(merged_file, 'w', encoding='utf-8') as f: - json.dump(merged, f, ensure_ascii=False, separators=(',', ':')) + with open(merged_file, "w", encoding="utf-8") as f: + json.dump(merged, f, ensure_ascii=False, separators=(",", ":")) print(f"✓ Merged {len(merged)} translations into {merged_file}") return merged_file @@ -194,13 +207,13 @@ def apply_translations(merged_file, language_code): """Apply merged translations to the language file.""" print(f"\n📝 Applying translations to {language_code}...") - cmd = f'python3 scripts/translations/translation_merger.py {language_code} apply-translations --translations-file {merged_file}' + cmd = f"python3 scripts/translations/translation_merger.py {language_code} apply-translations --translations-file {merged_file}" if not run_command(cmd): - print(f"✗ Failed to apply translations") + print("✗ Failed to apply translations") return False - print(f"✓ Translations applied successfully") + print("✓ Translations applied successfully") return True @@ -208,27 +221,25 @@ def beautify_translations(language_code): """Beautify translation file to match en-GB structure.""" print(f"\n✨ Beautifying {language_code} translation file...") - cmd = f'python3 scripts/translations/toml_beautifier.py --language {language_code}' + cmd = f"python3 scripts/translations/toml_beautifier.py --language {language_code}" if not run_command(cmd): - print(f"✗ Failed to beautify translations") + print("✗ Failed to beautify translations") return False - print(f"✓ Translation file beautified") + print("✓ Translation file beautified") return True def cleanup_temp_files(language_code): """Remove temporary batch files.""" - print(f"\n🧹 Cleaning up temporary files...") + print("\n🧹 Cleaning up temporary files...") - lang_code_safe = language_code.replace('-', '_') - patterns = [ - f'{lang_code_safe}_batch_*.json', - f'{lang_code_safe}_merged.json' - ] + lang_code_safe = language_code.replace("-", "_") + patterns = [f"{lang_code_safe}_batch_*.json", f"{lang_code_safe}_merged.json"] import glob + removed = 0 for pattern in patterns: for file in glob.glob(pattern): @@ -240,15 +251,15 @@ def cleanup_temp_files(language_code): def verify_completion(language_code): """Check final completion percentage.""" - print(f"\n📊 Verifying completion...") + print("\n📊 Verifying completion...") - cmd = f'python3 scripts/translations/translation_analyzer.py --language {language_code} --summary' + cmd = f"python3 scripts/translations/translation_analyzer.py --language {language_code} --summary" run_command(cmd) def main(): parser = argparse.ArgumentParser( - description='Automated translation pipeline for Stirling PDF', + description="Automated translation pipeline for Stirling PDF", formatter_class=argparse.RawDescriptionHelpFormatter, epilog=""" Note: This script works with TOML translation files. @@ -266,36 +277,57 @@ Examples: # Skip cleanup (keep temporary files for inspection) python3 scripts/translations/auto_translate.py fr-FR --no-cleanup - """ + """, ) - parser.add_argument('language', help='Language code (e.g., es-ES, de-DE, zh-CN)') - parser.add_argument('--api-key', help='OpenAI API key (or set OPENAI_API_KEY env var)') - parser.add_argument('--batch-size', type=int, default=500, help='Entries per batch (default: 500)') - parser.add_argument('--no-cleanup', action='store_true', help='Keep temporary batch files') - parser.add_argument('--skip-verification', action='store_true', help='Skip final completion check') - parser.add_argument('--timeout', type=int, default=600, help='Timeout per batch in seconds (default: 600 = 10 minutes)') - parser.add_argument('--include-existing', action='store_true', help='Also retranslate existing keys that match English (default: only translate missing keys)') + parser.add_argument("language", help="Language code (e.g., es-ES, de-DE, zh-CN)") + parser.add_argument( + "--api-key", help="OpenAI API key (or set OPENAI_API_KEY env var)" + ) + parser.add_argument( + "--batch-size", type=int, default=500, help="Entries per batch (default: 500)" + ) + parser.add_argument( + "--no-cleanup", action="store_true", help="Keep temporary batch files" + ) + parser.add_argument( + "--skip-verification", action="store_true", help="Skip final completion check" + ) + parser.add_argument( + "--timeout", + type=int, + default=600, + help="Timeout per batch in seconds (default: 600 = 10 minutes)", + ) + parser.add_argument( + "--include-existing", + action="store_true", + help="Also retranslate existing keys that match English (default: only translate missing keys)", + ) args = parser.parse_args() # Verify API key - api_key = args.api_key or os.environ.get('OPENAI_API_KEY') + api_key = args.api_key or os.environ.get("OPENAI_API_KEY") if not api_key: - print("Error: OpenAI API key required. Provide via --api-key or OPENAI_API_KEY environment variable") + print( + "Error: OpenAI API key required. Provide via --api-key or OPENAI_API_KEY environment variable" + ) sys.exit(1) - print("="*60) - print(f"Automated Translation Pipeline") + print("=" * 60) + print("Automated Translation Pipeline") print(f"Language: {args.language}") print(f"Batch Size: {args.batch_size} entries") - print("="*60) + print("=" * 60) start_time = time.time() try: # Step 1: Extract and split - batch_files = extract_untranslated(args.language, args.batch_size, args.include_existing) + batch_files = extract_untranslated( + args.language, args.batch_size, args.include_existing + ) if batch_files is None: sys.exit(1) @@ -304,7 +336,9 @@ Examples: sys.exit(0) # Step 2: Translate all batches - translated_files = translate_batches(batch_files, args.language, api_key, args.timeout) + translated_files = translate_batches( + batch_files, args.language, api_key, args.timeout + ) if translated_files is None: sys.exit(1) @@ -330,10 +364,10 @@ Examples: verify_completion(args.language) elapsed = time.time() - start_time - print("\n" + "="*60) - print(f"✅ Translation pipeline completed successfully!") + print("\n" + "=" * 60) + print("✅ Translation pipeline completed successfully!") print(f"Time elapsed: {elapsed:.1f} seconds") - print("="*60) + print("=" * 60) except KeyboardInterrupt: print("\n\n⚠ Translation interrupted by user") @@ -341,6 +375,7 @@ Examples: except Exception as e: print(f"\n\n✗ Error: {e}") import traceback + traceback.print_exc() sys.exit(1) diff --git a/scripts/translations/batch_translator.py b/scripts/translations/batch_translator.py index 0085a51de..a77c05328 100644 --- a/scripts/translations/batch_translator.py +++ b/scripts/translations/batch_translator.py @@ -79,10 +79,12 @@ CRITICAL RULES - MUST FOLLOW EXACTLY: Return ONLY the translated JSON. No markdown, no explanations, just the JSON object.""" - def translate_batch(self, batch_data: dict, target_language: str, language_code: str) -> dict: + def translate_batch( + self, batch_data: dict, target_language: str, language_code: str + ) -> dict: """Translate a batch file using OpenAI API.""" # Convert batch to compact JSON for API - input_json = json.dumps(batch_data, ensure_ascii=False, separators=(',', ':')) + input_json = json.dumps(batch_data, ensure_ascii=False, separators=(",", ":")) print(f"Translating {len(batch_data)} entries to {target_language}...") print(f"Input size: {len(input_json)} characters") @@ -94,12 +96,14 @@ Return ONLY the translated JSON. No markdown, no explanations, just the JSON obj messages=[ { "role": "system", - "content": self.get_translation_prompt(target_language, language_code) + "content": self.get_translation_prompt( + target_language, language_code + ), }, { "role": "user", - "content": f"Translate this JSON:\n\n{input_json}" - } + "content": f"Translate this JSON:\n\n{input_json}", + }, ], ) @@ -107,13 +111,13 @@ Return ONLY the translated JSON. No markdown, no explanations, just the JSON obj # Remove markdown code blocks if present if translated_text.startswith("```"): - lines = translated_text.split('\n') - translated_text = '\n'.join(lines[1:-1]) + lines = translated_text.split("\n") + translated_text = "\n".join(lines[1:-1]) # Parse the translated JSON translated_data = json.loads(translated_text) - print(f"✓ Translation complete") + print("✓ Translation complete") return translated_data except json.JSONDecodeError as e: @@ -139,7 +143,8 @@ Return ONLY the translated JSON. No markdown, no explanations, just the JSON obj # Check placeholders in each value import re - placeholder_pattern = r'\{[^}]+\}|\{\{[^}]+\}\}' + + placeholder_pattern = r"\{[^}]+\}|\{\{[^}]+\}\}" for key in original.keys(): if key not in translated: @@ -153,7 +158,9 @@ Return ONLY the translated JSON. No markdown, no explanations, just the JSON obj trans_placeholders = set(re.findall(placeholder_pattern, trans_value)) if orig_placeholders != trans_placeholders: - issues.append(f"Placeholder mismatch in '{key}': {orig_placeholders} vs {trans_placeholders}") + issues.append( + f"Placeholder mismatch in '{key}': {orig_placeholders} vs {trans_placeholders}" + ) if issues: print("\n⚠ Validation warnings:") @@ -170,37 +177,37 @@ Return ONLY the translated JSON. No markdown, no explanations, just the JSON obj def get_language_info(language_code: str) -> tuple: """Get full language name from code.""" languages = { - 'zh-CN': ('Simplified Chinese', 'zh-CN'), - 'es-ES': ('Spanish', 'es-ES'), - 'it-IT': ('Italian', 'it-IT'), - 'de-DE': ('German', 'de-DE'), - 'ar-AR': ('Arabic', 'ar-AR'), - 'pt-BR': ('Brazilian Portuguese', 'pt-BR'), - 'ru-RU': ('Russian', 'ru-RU'), - 'fr-FR': ('French', 'fr-FR'), - 'ja-JP': ('Japanese', 'ja-JP'), - 'ko-KR': ('Korean', 'ko-KR'), - 'nl-NL': ('Dutch', 'nl-NL'), - 'pl-PL': ('Polish', 'pl-PL'), - 'sv-SE': ('Swedish', 'sv-SE'), - 'da-DK': ('Danish', 'da-DK'), - 'no-NB': ('Norwegian', 'no-NB'), - 'fi-FI': ('Finnish', 'fi-FI'), - 'tr-TR': ('Turkish', 'tr-TR'), - 'vi-VN': ('Vietnamese', 'vi-VN'), - 'th-TH': ('Thai', 'th-TH'), - 'id-ID': ('Indonesian', 'id-ID'), - 'hi-IN': ('Hindi', 'hi-IN'), - 'cs-CZ': ('Czech', 'cs-CZ'), - 'hu-HU': ('Hungarian', 'hu-HU'), - 'ro-RO': ('Romanian', 'ro-RO'), - 'uk-UA': ('Ukrainian', 'uk-UA'), - 'el-GR': ('Greek', 'el-GR'), - 'bg-BG': ('Bulgarian', 'bg-BG'), - 'hr-HR': ('Croatian', 'hr-HR'), - 'sk-SK': ('Slovak', 'sk-SK'), - 'sl-SI': ('Slovenian', 'sl-SI'), - 'ca-CA': ('Catalan', 'ca-CA'), + "zh-CN": ("Simplified Chinese", "zh-CN"), + "es-ES": ("Spanish", "es-ES"), + "it-IT": ("Italian", "it-IT"), + "de-DE": ("German", "de-DE"), + "ar-AR": ("Arabic", "ar-AR"), + "pt-BR": ("Brazilian Portuguese", "pt-BR"), + "ru-RU": ("Russian", "ru-RU"), + "fr-FR": ("French", "fr-FR"), + "ja-JP": ("Japanese", "ja-JP"), + "ko-KR": ("Korean", "ko-KR"), + "nl-NL": ("Dutch", "nl-NL"), + "pl-PL": ("Polish", "pl-PL"), + "sv-SE": ("Swedish", "sv-SE"), + "da-DK": ("Danish", "da-DK"), + "no-NB": ("Norwegian", "no-NB"), + "fi-FI": ("Finnish", "fi-FI"), + "tr-TR": ("Turkish", "tr-TR"), + "vi-VN": ("Vietnamese", "vi-VN"), + "th-TH": ("Thai", "th-TH"), + "id-ID": ("Indonesian", "id-ID"), + "hi-IN": ("Hindi", "hi-IN"), + "cs-CZ": ("Czech", "cs-CZ"), + "hu-HU": ("Hungarian", "hu-HU"), + "ro-RO": ("Romanian", "ro-RO"), + "uk-UA": ("Ukrainian", "uk-UA"), + "el-GR": ("Greek", "el-GR"), + "bg-BG": ("Bulgarian", "bg-BG"), + "hr-HR": ("Croatian", "hr-HR"), + "sk-SK": ("Slovak", "sk-SK"), + "sl-SI": ("Slovenian", "sl-SI"), + "ca-CA": ("Catalan", "ca-CA"), } return languages.get(language_code, (language_code, language_code)) @@ -208,7 +215,7 @@ def get_language_info(language_code: str) -> tuple: def main(): parser = argparse.ArgumentParser( - description='Translate JSON batch files using OpenAI API (output supports TOML and JSON)', + description="Translate JSON batch files using OpenAI API (output supports TOML and JSON)", formatter_class=argparse.RawDescriptionHelpFormatter, epilog=""" Note: This script works with JSON batch files. The translation files it updates can be TOML or JSON. @@ -226,24 +233,51 @@ Examples: # Use different model python batch_translator.py file.json --api-key KEY --language es-ES --model gpt-4-turbo - """ + """, ) - parser.add_argument('input_files', nargs='+', help='Input batch JSON file(s) or pattern') - parser.add_argument('--api-key', help='OpenAI API key (or set OPENAI_API_KEY env var)') - parser.add_argument('--language', '-l', required=True, help='Target language code (e.g., zh-CN, es-ES)') - parser.add_argument('--model', default='gpt-5', help='OpenAI model to use (default: gpt-5, options: gpt-5-mini, gpt-5-nano)') - parser.add_argument('--output-suffix', default='_translated', help='Suffix for output files (default: _translated)') - parser.add_argument('--skip-validation', action='store_true', help='Skip validation checks') - parser.add_argument('--delay', type=float, default=1.0, help='Delay between API calls in seconds (default: 1.0)') + parser.add_argument( + "input_files", nargs="+", help="Input batch JSON file(s) or pattern" + ) + parser.add_argument( + "--api-key", help="OpenAI API key (or set OPENAI_API_KEY env var)" + ) + parser.add_argument( + "--language", + "-l", + required=True, + help="Target language code (e.g., zh-CN, es-ES)", + ) + parser.add_argument( + "--model", + default="gpt-5", + help="OpenAI model to use (default: gpt-5, options: gpt-5-mini, gpt-5-nano)", + ) + parser.add_argument( + "--output-suffix", + default="_translated", + help="Suffix for output files (default: _translated)", + ) + parser.add_argument( + "--skip-validation", action="store_true", help="Skip validation checks" + ) + parser.add_argument( + "--delay", + type=float, + default=1.0, + help="Delay between API calls in seconds (default: 1.0)", + ) args = parser.parse_args() # Get API key from args or environment import os - api_key = args.api_key or os.environ.get('OPENAI_API_KEY') + + api_key = args.api_key or os.environ.get("OPENAI_API_KEY") if not api_key: - print("Error: OpenAI API key required. Provide via --api-key or OPENAI_API_KEY environment variable") + print( + "Error: OpenAI API key required. Provide via --api-key or OPENAI_API_KEY environment variable" + ) sys.exit(1) # Get language info @@ -251,6 +285,7 @@ Examples: # Expand file patterns import glob + input_files = [] for pattern in args.input_files: matched = glob.glob(pattern) @@ -263,7 +298,7 @@ Examples: print("Error: No input files found") sys.exit(1) - print(f"Batch Translator") + print("Batch Translator") print(f"Target Language: {language_name} ({language_code})") print(f"Model: {args.model}") print(f"Files to translate: {len(input_files)}") @@ -281,11 +316,13 @@ Examples: try: # Load input file - with open(input_file, 'r', encoding='utf-8') as f: + with open(input_file, "r", encoding="utf-8") as f: batch_data = json.load(f) # Translate - translated_data = translator.translate_batch(batch_data, language_name, language_code) + translated_data = translator.translate_batch( + batch_data, language_name, language_code + ) # Validate if not args.skip_validation: @@ -295,8 +332,8 @@ Examples: input_path = Path(input_file) output_file = input_path.stem + args.output_suffix + input_path.suffix - with open(output_file, 'w', encoding='utf-8') as f: - json.dump(translated_data, f, ensure_ascii=False, separators=(',', ':')) + with open(output_file, "w", encoding="utf-8") as f: + json.dump(translated_data, f, ensure_ascii=False, separators=(",", ":")) print(f"✓ Saved to: {output_file}") successful += 1 @@ -312,7 +349,7 @@ Examples: # Summary print("\n" + "=" * 60) - print(f"Translation complete!") + print("Translation complete!") print(f"Successful: {successful}/{len(input_files)}") if failed > 0: print(f"Failed: {failed}/{len(input_files)}") @@ -321,5 +358,4 @@ Examples: if __name__ == "__main__": - import os main() diff --git a/scripts/translations/bulk_auto_translate.py b/scripts/translations/bulk_auto_translate.py index ec854a2d7..39dad0067 100644 --- a/scripts/translations/bulk_auto_translate.py +++ b/scripts/translations/bulk_auto_translate.py @@ -54,16 +54,16 @@ def get_language_completion(locales_dir: Path, language: str) -> Optional[float] return None try: - with open(toml_file, 'rb') as f: + with open(toml_file, "rb") as f: target_data = tomllib.load(f) # Load en-GB reference - en_gb_file = locales_dir / 'en-GB' / 'translation.toml' - with open(en_gb_file, 'rb') as f: + en_gb_file = locales_dir / "en-GB" / "translation.toml" + with open(en_gb_file, "rb") as f: en_gb_data = tomllib.load(f) # Flatten and count - def flatten(d, parent=''): + def flatten(d, parent=""): items = {} for k, v in d.items(): key = f"{parent}.{k}" if parent else k @@ -77,7 +77,11 @@ def get_language_completion(locales_dir: Path, language: str) -> Optional[float] target_flat = flatten(target_data) # Count translated (not equal to en-GB) - translated = sum(1 for k in en_gb_flat if k in target_flat and target_flat[k] != en_gb_flat[k]) + translated = sum( + 1 + for k in en_gb_flat + if k in target_flat and target_flat[k] != en_gb_flat[k] + ) total = len(en_gb_flat) return (translated / total * 100) if total > 0 else 0.0 @@ -87,7 +91,14 @@ def get_language_completion(locales_dir: Path, language: str) -> Optional[float] return None -def translate_language(language: str, api_key: str, batch_size: int, timeout: int, skip_verification: bool, include_existing: bool) -> Tuple[str, bool, str]: +def translate_language( + language: str, + api_key: str, + batch_size: int, + timeout: int, + skip_verification: bool, + include_existing: bool, +) -> Tuple[str, bool, str]: """ Translate a single language. Returns: (language_code, success, message) @@ -95,25 +106,29 @@ def translate_language(language: str, api_key: str, batch_size: int, timeout: in safe_print(f"[{language}] Starting translation...") cmd = [ - 'python3', 'scripts/translations/auto_translate.py', + "python3", + "scripts/translations/auto_translate.py", language, - '--api-key', api_key, - '--batch-size', str(batch_size), - '--timeout', str(timeout) + "--api-key", + api_key, + "--batch-size", + str(batch_size), + "--timeout", + str(timeout), ] if skip_verification: - cmd.append('--skip-verification') + cmd.append("--skip-verification") if include_existing: - cmd.append('--include-existing') + cmd.append("--include-existing") try: result = subprocess.run( cmd, capture_output=True, text=True, - timeout=timeout * 5 # Overall timeout = 5x per-batch timeout + timeout=timeout * 5, # Overall timeout = 5x per-batch timeout ) if result.returncode == 0: @@ -124,7 +139,9 @@ def translate_language(language: str, api_key: str, batch_size: int, timeout: in safe_print(f"[{language}] ✓ Success") return (language, True, "Success") else: - error_msg = result.stderr.strip() or result.stdout.strip() or "Unknown error" + error_msg = ( + result.stderr.strip() or result.stdout.strip() or "Unknown error" + ) safe_print(f"[{language}] ✗ Failed: {error_msg[:100]}") return (language, False, error_msg[:200]) # Truncate long errors @@ -138,7 +155,7 @@ def translate_language(language: str, api_key: str, batch_size: int, timeout: in def main(): parser = argparse.ArgumentParser( - description='Bulk auto-translate all languages using OpenAI API', + description="Bulk auto-translate all languages using OpenAI API", formatter_class=argparse.RawDescriptionHelpFormatter, epilog=""" Examples: @@ -155,35 +172,70 @@ Examples: python3 bulk_auto_translate.py --dry-run Note: Requires OPENAI_API_KEY environment variable or --api-key argument. -""" +""", ) - parser.add_argument('--api-key', help='OpenAI API key (or set OPENAI_API_KEY env var)') - parser.add_argument('--parallel', type=int, default=1, - help='Number of parallel translation threads (default: 1)') - parser.add_argument('--batch-size', type=int, default=500, - help='Entries per batch for translation (default: 500)') - parser.add_argument('--timeout', type=int, default=600, - help='Timeout per batch in seconds (default: 600)') - parser.add_argument('--threshold', type=float, default=0.0, - help='Only translate languages below this completion %% (default: 0 = all)') - parser.add_argument('--languages', nargs='+', - help='Translate only specific languages (e.g., de-DE fr-FR)') - parser.add_argument('--locales-dir', default='frontend/public/locales', - help='Path to locales directory') - parser.add_argument('--skip-verification', action='store_true', - help='Skip final completion verification for each language') - parser.add_argument('--include-existing', action='store_true', - help='Also retranslate existing keys that match English (default: only translate missing keys)') - parser.add_argument('--dry-run', action='store_true', - help='Show what would be translated without actually translating') + parser.add_argument( + "--api-key", help="OpenAI API key (or set OPENAI_API_KEY env var)" + ) + parser.add_argument( + "--parallel", + type=int, + default=1, + help="Number of parallel translation threads (default: 1)", + ) + parser.add_argument( + "--batch-size", + type=int, + default=500, + help="Entries per batch for translation (default: 500)", + ) + parser.add_argument( + "--timeout", + type=int, + default=600, + help="Timeout per batch in seconds (default: 600)", + ) + parser.add_argument( + "--threshold", + type=float, + default=0.0, + help="Only translate languages below this completion %% (default: 0 = all)", + ) + parser.add_argument( + "--languages", + nargs="+", + help="Translate only specific languages (e.g., de-DE fr-FR)", + ) + parser.add_argument( + "--locales-dir", + default="frontend/public/locales", + help="Path to locales directory", + ) + parser.add_argument( + "--skip-verification", + action="store_true", + help="Skip final completion verification for each language", + ) + parser.add_argument( + "--include-existing", + action="store_true", + help="Also retranslate existing keys that match English (default: only translate missing keys)", + ) + parser.add_argument( + "--dry-run", + action="store_true", + help="Show what would be translated without actually translating", + ) args = parser.parse_args() # Verify API key (unless dry run) - api_key = args.api_key or os.environ.get('OPENAI_API_KEY') + api_key = args.api_key or os.environ.get("OPENAI_API_KEY") if not args.dry_run and not api_key: - print("Error: OpenAI API key required. Provide via --api-key or OPENAI_API_KEY environment variable") + print( + "Error: OpenAI API key required. Provide via --api-key or OPENAI_API_KEY environment variable" + ) sys.exit(1) locales_dir = Path(args.locales_dir) @@ -221,16 +273,16 @@ Note: Requires OPENAI_API_KEY environment variable or --api-key argument. print("\nNo languages below threshold!") sys.exit(0) - print(f"\n{'='*60}") - print(f"Bulk Translation Configuration") - print(f"{'='*60}") + print(f"\n{'=' * 60}") + print("Bulk Translation Configuration") + print(f"{'=' * 60}") print(f"Languages to translate: {len(languages)}") print(f"Parallel threads: {args.parallel}") print(f"Batch size: {args.batch_size}") print(f"Timeout per batch: {args.timeout}s") if args.threshold > 0: print(f"Completion threshold: {args.threshold}%") - print(f"{'='*60}\n") + print(f"{'=' * 60}\n") if args.dry_run: print("DRY RUN - Languages that would be translated:") @@ -244,11 +296,7 @@ Note: Requires OPENAI_API_KEY environment variable or --api-key argument. start_time = time.time() # Translate in parallel - results = { - 'success': [], - 'failed': [], - 'already_complete': [] - } + results = {"success": [], "failed": [], "already_complete": []} with ThreadPoolExecutor(max_workers=args.parallel) as executor: futures = { @@ -259,7 +307,7 @@ Note: Requires OPENAI_API_KEY environment variable or --api-key argument. args.batch_size, args.timeout, args.skip_verification, - args.include_existing + args.include_existing, ): lang for lang in languages } @@ -269,43 +317,43 @@ Note: Requires OPENAI_API_KEY environment variable or --api-key argument. if success: if message == "Already complete": - results['already_complete'].append(language) + results["already_complete"].append(language) else: - results['success'].append(language) + results["success"].append(language) else: - results['failed'].append((language, message)) + results["failed"].append((language, message)) elapsed = time.time() - start_time # Print summary - print("\n" + "="*60) + print("\n" + "=" * 60) print("Bulk Translation Summary") - print("="*60) + print("=" * 60) print(f"Total languages: {len(languages)}") print(f"Successful: {len(results['success'])}") print(f"Already complete: {len(results['already_complete'])}") print(f"Failed: {len(results['failed'])}") - print(f"Time elapsed: {elapsed:.1f} seconds ({elapsed/60:.1f} minutes)") - print("="*60) + print(f"Time elapsed: {elapsed:.1f} seconds ({elapsed / 60:.1f} minutes)") + print("=" * 60) - if results['success']: + if results["success"]: print(f"\n✅ Successfully translated ({len(results['success'])}):") - for lang in sorted(results['success']): + for lang in sorted(results["success"]): print(f" - {lang}") - if results['already_complete']: + if results["already_complete"]: print(f"\n✓ Already complete ({len(results['already_complete'])}):") - for lang in sorted(results['already_complete']): + for lang in sorted(results["already_complete"]): print(f" - {lang}") - if results['failed']: + if results["failed"]: print(f"\n❌ Failed ({len(results['failed'])}):") - for lang, msg in sorted(results['failed']): + for lang, msg in sorted(results["failed"]): print(f" - {lang}: {msg}") sys.exit(1) print("\n✅ Bulk translation completed successfully!") -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/scripts/translations/compact_translator.py b/scripts/translations/compact_translator.py index efe22f9f8..921d5c152 100644 --- a/scripts/translations/compact_translator.py +++ b/scripts/translations/compact_translator.py @@ -13,11 +13,18 @@ import tomllib # Python 3.11+ (stdlib) class CompactTranslationExtractor: - def __init__(self, locales_dir: str = "frontend/public/locales", ignore_file: str = "scripts/ignore_translation.toml"): + def __init__( + self, + locales_dir: str = "frontend/public/locales", + ignore_file: str = "scripts/ignore_translation.toml", + ): self.locales_dir = Path(locales_dir) self.golden_truth_file = self.locales_dir / "en-GB" / "translation.toml" if not self.golden_truth_file.exists(): - print(f"Error: en-GB translation file not found at {self.golden_truth_file}", file=sys.stderr) + print( + f"Error: en-GB translation file not found at {self.golden_truth_file}", + file=sys.stderr, + ) sys.exit(1) self.golden_truth = self._load_translation_file(self.golden_truth_file) self.ignore_file = Path(ignore_file) @@ -26,7 +33,7 @@ class CompactTranslationExtractor: def _load_translation_file(self, file_path: Path) -> dict: """Load TOML translation file.""" try: - with open(file_path, 'rb') as f: + with open(file_path, "rb") as f: return tomllib.load(f) except FileNotFoundError: print(f"Error: File not found: {file_path}", file=sys.stderr) @@ -41,14 +48,21 @@ class CompactTranslationExtractor: return {} try: - with open(self.ignore_file, 'rb') as f: + with open(self.ignore_file, "rb") as f: ignore_data = tomllib.load(f) - return {lang: set(data.get('ignore', [])) for lang, data in ignore_data.items()} + return { + lang: set(data.get("ignore", [])) for lang, data in ignore_data.items() + } except Exception as e: - print(f"Warning: Could not load ignore file {self.ignore_file}: {e}", file=sys.stderr) + print( + f"Warning: Could not load ignore file {self.ignore_file}: {e}", + file=sys.stderr, + ) return {} - def _flatten_dict(self, d: dict, parent_key: str = '', separator: str = '.') -> dict: + def _flatten_dict( + self, d: dict, parent_key: str = "", separator: str = "." + ) -> dict: """Flatten nested dictionary into dot-notation keys.""" items = [] for k, v in d.items(): @@ -65,14 +79,17 @@ class CompactTranslationExtractor: target_file = lang_dir / "translation.toml" if not target_file.exists(): - print(f"Error: Translation file not found for language: {language}", file=sys.stderr) + print( + f"Error: Translation file not found for language: {language}", + file=sys.stderr, + ) sys.exit(1) target_data = self._load_translation_file(target_file) golden_flat = self._flatten_dict(self.golden_truth) target_flat = self._flatten_dict(target_data) - lang_code = language.replace('-', '_') + lang_code = language.replace("-", "_") ignore_set = self.ignore_patterns.get(lang_code, set()) # Find missing translations @@ -85,8 +102,13 @@ class CompactTranslationExtractor: target_value = target_flat[key] golden_value = golden_flat[key] - if (isinstance(target_value, str) and target_value.startswith("[UNTRANSLATED]")) or \ - (golden_value == target_value and not self._is_expected_identical(key, golden_value)): + if ( + isinstance(target_value, str) + and target_value.startswith("[UNTRANSLATED]") + ) or ( + golden_value == target_value + and not self._is_expected_identical(key, golden_value) + ): untranslated_keys.add(key) # Combine and create compact output @@ -101,8 +123,8 @@ class CompactTranslationExtractor: def _is_expected_identical(self, key: str, value: str) -> bool: """Check if a key-value pair is expected to be identical across languages.""" - identical_patterns = ['language.direction'] - identical_values = {'ltr', 'rtl', 'True', 'False', 'true', 'false', 'unknown'} + identical_patterns = ["language.direction"] + identical_values = {"ltr", "rtl", "True", "False", "true", "false", "unknown"} if value.strip() in identical_values: return True @@ -116,13 +138,23 @@ class CompactTranslationExtractor: def main(): parser = argparse.ArgumentParser( - description='Extract untranslated entries in compact format for AI translation (TOML format only)' + description="Extract untranslated entries in compact format for AI translation (TOML format only)" ) - parser.add_argument('language', help='Language code (e.g., de-DE, fr-FR)') - parser.add_argument('--locales-dir', default='frontend/public/locales', help='Path to locales directory') - parser.add_argument('--ignore-file', default='scripts/ignore_translation.toml', help='Path to ignore patterns file') - parser.add_argument('--max-entries', type=int, help='Maximum number of entries to output') - parser.add_argument('--output', help='Output file (default: stdout)') + parser.add_argument("language", help="Language code (e.g., de-DE, fr-FR)") + parser.add_argument( + "--locales-dir", + default="frontend/public/locales", + help="Path to locales directory", + ) + parser.add_argument( + "--ignore-file", + default="scripts/ignore_translation.toml", + help="Path to ignore patterns file", + ) + parser.add_argument( + "--max-entries", type=int, help="Maximum number of entries to output" + ) + parser.add_argument("--output", help="Output file (default: stdout)") args = parser.parse_args() @@ -131,19 +163,22 @@ def main(): if args.max_entries: # Take first N entries - keys = list(untranslated.keys())[:args.max_entries] + keys = list(untranslated.keys())[: args.max_entries] untranslated = {k: untranslated[k] for k in keys} # Output compact JSON (no indentation, minimal whitespace) - output = json.dumps(untranslated, separators=(',', ':'), ensure_ascii=False) + output = json.dumps(untranslated, separators=(",", ":"), ensure_ascii=False) if args.output: - with open(args.output, 'w', encoding='utf-8') as f: + with open(args.output, "w", encoding="utf-8") as f: f.write(output) - print(f"Extracted {len(untranslated)} untranslated entries to {args.output}", file=sys.stderr) + print( + f"Extracted {len(untranslated)} untranslated entries to {args.output}", + file=sys.stderr, + ) else: print(output) if __name__ == "__main__": - main() \ No newline at end of file + main() diff --git a/scripts/translations/toml_beautifier.py b/scripts/translations/toml_beautifier.py index a0e1f95cb..bedf59d15 100644 --- a/scripts/translations/toml_beautifier.py +++ b/scripts/translations/toml_beautifier.py @@ -4,7 +4,6 @@ TOML Beautifier and Structure Fixer for Stirling PDF Frontend Restructures translation TOML files to match en-GB structure and key order exactly. """ -import os import sys from pathlib import Path from typing import Dict, Any, List @@ -24,7 +23,7 @@ class TOMLBeautifier: def _load_toml(self, file_path: Path) -> Dict: """Load TOML file with error handling.""" try: - with open(file_path, 'rb') as f: + with open(file_path, "rb") as f: return tomllib.load(f) except FileNotFoundError: print(f"Error: File not found: {file_path}") @@ -36,15 +35,18 @@ class TOMLBeautifier: def _save_toml(self, data: Dict, file_path: Path, backup: bool = False) -> None: """Save TOML file with proper formatting.""" if backup and file_path.exists(): - backup_path = file_path.with_suffix(f'.backup.restructured.toml') + backup_path = file_path.with_suffix(".backup.restructured.toml") import shutil + shutil.copy2(file_path, backup_path) print(f"Backup created: {backup_path}") - with open(file_path, 'wb') as f: + with open(file_path, "wb") as f: tomli_w.dump(data, f) - def _flatten_dict(self, d: Dict, parent_key: str = '', separator: str = '.') -> Dict[str, Any]: + def _flatten_dict( + self, d: Dict, parent_key: str = "", separator: str = "." + ) -> Dict[str, Any]: """Flatten nested dictionary into dot-notation keys.""" items = [] for k, v in d.items(): @@ -55,9 +57,12 @@ class TOMLBeautifier: items.append((new_key, v)) return dict(items) - def _rebuild_structure(self, flat_dict: Dict[str, Any], reference_structure: Dict) -> Dict: + def _rebuild_structure( + self, flat_dict: Dict[str, Any], reference_structure: Dict + ) -> Dict: """Rebuild nested structure based on reference structure and available translations.""" - def build_recursive(ref_obj: Any, current_path: str = '') -> Any: + + def build_recursive(ref_obj: Any, current_path: str = "") -> Any: if isinstance(ref_obj, dict): result = OrderedDict() for key, value in ref_obj.items(): @@ -106,7 +111,9 @@ class TOMLBeautifier: return restructured - def beautify_and_restructure(self, target_file: Path, backup: bool = False) -> Dict[str, Any]: + def beautify_and_restructure( + self, target_file: Path, backup: bool = False + ) -> Dict[str, Any]: """Main function to beautify and restructure a translation file.""" lang_code = target_file.parent.name print(f"Restructuring {lang_code} translation file...") @@ -125,10 +132,12 @@ class TOMLBeautifier: preserved_keys = len(flat_restructured) result = { - 'language': lang_code, - 'total_reference_keys': total_keys, - 'preserved_keys': preserved_keys, - 'structure_match': self._compare_structures(self.golden_structure, restructured_data) + "language": lang_code, + "total_reference_keys": total_keys, + "preserved_keys": preserved_keys, + "structure_match": self._compare_structures( + self.golden_structure, restructured_data + ), } print(f"Restructured {lang_code}: {preserved_keys}/{total_keys} keys preserved") @@ -136,7 +145,8 @@ class TOMLBeautifier: def _compare_structures(self, ref: Dict, target: Dict) -> Dict[str, bool]: """Compare structures between reference and target.""" - def compare_recursive(r: Any, t: Any, path: str = '') -> List[str]: + + def compare_recursive(r: Any, t: Any, path: str = "") -> List[str]: issues = [] if isinstance(r, dict) and isinstance(t, dict): @@ -147,7 +157,9 @@ class TOMLBeautifier: missing_sections = ref_keys - target_keys if missing_sections: for section in missing_sections: - issues.append(f"Missing section: {path}.{section}" if path else section) + issues.append( + f"Missing section: {path}.{section}" if path else section + ) # Recurse into common sections for key in ref_keys & target_keys: @@ -159,16 +171,16 @@ class TOMLBeautifier: issues = compare_recursive(ref, target) return { - 'structures_match': len(issues) == 0, - 'issues': issues[:10], # Limit to first 10 issues - 'total_issues': len(issues) + "structures_match": len(issues) == 0, + "issues": issues[:10], # Limit to first 10 issues + "total_issues": len(issues), } def validate_key_order(self, target_file: Path) -> Dict[str, Any]: """Validate that keys appear in the same order as en-GB.""" target_data = self._load_toml(target_file) - def get_key_order(obj: Dict, path: str = '') -> List[str]: + def get_key_order(obj: Dict, path: str = "") -> List[str]: keys = [] for key in obj.keys(): new_path = f"{path}.{key}" if path else key @@ -183,37 +195,51 @@ class TOMLBeautifier: # Find common keys and check their relative order common_keys = set(golden_order) & set(target_order) - golden_indices = {key: idx for idx, key in enumerate(golden_order) if key in common_keys} - target_indices = {key: idx for idx, key in enumerate(target_order) if key in common_keys} + golden_indices = { + key: idx for idx, key in enumerate(golden_order) if key in common_keys + } + target_indices = { + key: idx for idx, key in enumerate(target_order) if key in common_keys + } order_preserved = all( golden_indices[key1] < golden_indices[key2] - for key1 in common_keys for key2 in common_keys - if golden_indices[key1] < golden_indices[key2] and target_indices[key1] < target_indices[key2] + for key1 in common_keys + for key2 in common_keys + if golden_indices[key1] < golden_indices[key2] + and target_indices[key1] < target_indices[key2] ) return { - 'order_preserved': order_preserved, - 'common_keys_count': len(common_keys), - 'golden_keys_count': len(golden_order), - 'target_keys_count': len(target_order) + "order_preserved": order_preserved, + "common_keys_count": len(common_keys), + "golden_keys_count": len(golden_order), + "target_keys_count": len(target_order), } def main(): parser = argparse.ArgumentParser( - description='Beautify and restructure translation TOML files', - epilog='Works with TOML format translation files.' + description="Beautify and restructure translation TOML files", + epilog="Works with TOML format translation files.", + ) + parser.add_argument( + "--locales-dir", + default="frontend/public/locales", + help="Path to locales directory", + ) + parser.add_argument("--language", help="Restructure specific language only") + parser.add_argument( + "--all-languages", action="store_true", help="Restructure all language files" + ) + parser.add_argument( + "--backup", action="store_true", help="Create backup files before modifying" + ) + parser.add_argument( + "--validate-only", + action="store_true", + help="Only validate structure, do not modify files", ) - parser.add_argument('--locales-dir', default='frontend/public/locales', - help='Path to locales directory') - parser.add_argument('--language', help='Restructure specific language only') - parser.add_argument('--all-languages', action='store_true', - help='Restructure all language files') - parser.add_argument('--backup', action='store_true', - help='Create backup files before modifying') - parser.add_argument('--validate-only', action='store_true', - help='Only validate structure, do not modify files') args = parser.parse_args() @@ -229,14 +255,22 @@ def main(): order_result = beautifier.validate_key_order(target_file) print(f"Key order validation for {args.language}:") print(f" Order preserved: {order_result['order_preserved']}") - print(f" Common keys: {order_result['common_keys_count']}/{order_result['golden_keys_count']}") + print( + f" Common keys: {order_result['common_keys_count']}/{order_result['golden_keys_count']}" + ) else: - result = beautifier.beautify_and_restructure(target_file, backup=args.backup) + result = beautifier.beautify_and_restructure( + target_file, backup=args.backup + ) print(f"\nResults for {result['language']}:") - print(f" Keys preserved: {result['preserved_keys']}/{result['total_reference_keys']}") - if result['structure_match']['total_issues'] > 0: - print(f" Structure issues: {result['structure_match']['total_issues']}") - for issue in result['structure_match']['issues']: + print( + f" Keys preserved: {result['preserved_keys']}/{result['total_reference_keys']}" + ) + if result["structure_match"]["total_issues"] > 0: + print( + f" Structure issues: {result['structure_match']['total_issues']}" + ) + for issue in result["structure_match"]["issues"]: print(f" - {issue}") elif args.all_languages: @@ -247,18 +281,24 @@ def main(): if translation_file.exists(): if args.validate_only: order_result = beautifier.validate_key_order(translation_file) - print(f"{lang_dir.name}: Order preserved = {order_result['order_preserved']}") + print( + f"{lang_dir.name}: Order preserved = {order_result['order_preserved']}" + ) else: - result = beautifier.beautify_and_restructure(translation_file, backup=args.backup) + result = beautifier.beautify_and_restructure( + translation_file, backup=args.backup + ) results.append(result) if not args.validate_only and results: - print(f"\n{'='*60}") + print(f"\n{'=' * 60}") print("RESTRUCTURING SUMMARY") - print(f"{'='*60}") - for result in sorted(results, key=lambda x: x['language']): - print(f"{result['language']}: {result['preserved_keys']}/{result['total_reference_keys']} keys " - f"({result['preserved_keys']/result['total_reference_keys']*100:.1f}%)") + print(f"{'=' * 60}") + for result in sorted(results, key=lambda x: x["language"]): + print( + f"{result['language']}: {result['preserved_keys']}/{result['total_reference_keys']} keys " + f"({result['preserved_keys'] / result['total_reference_keys'] * 100:.1f}%)" + ) else: parser.print_help() diff --git a/scripts/translations/toml_validator.py b/scripts/translations/toml_validator.py index e7018050e..94adca8dc 100644 --- a/scripts/translations/toml_validator.py +++ b/scripts/translations/toml_validator.py @@ -15,7 +15,6 @@ Usage: import sys import argparse import glob -from pathlib import Path import tomllib @@ -23,7 +22,7 @@ import tomllib def get_line_context(file_path, line_num, context_lines=3): """Get lines around the error for context""" try: - with open(file_path, 'r', encoding='utf-8') as f: + with open(file_path, "r", encoding="utf-8") as f: lines = f.readlines() start = max(0, line_num - context_lines - 1) @@ -32,7 +31,7 @@ def get_line_context(file_path, line_num, context_lines=3): context = [] for i in range(start, end): marker = ">>> " if i == line_num - 1 else " " - context.append(f"{marker}{i+1:4d}: {lines[i].rstrip()}") + context.append(f"{marker}{i + 1:4d}: {lines[i].rstrip()}") return "\n".join(context) except Exception as e: @@ -42,7 +41,7 @@ def get_line_context(file_path, line_num, context_lines=3): def get_character_context(file_path, char_pos, context_chars=100): """Get characters around the error position""" try: - with open(file_path, 'r', encoding='utf-8') as f: + with open(file_path, "r", encoding="utf-8") as f: content = f.read() start = max(0, char_pos - context_chars) @@ -50,19 +49,19 @@ def get_character_context(file_path, char_pos, context_chars=100): before = content[start:char_pos] error_char = content[char_pos] if char_pos < len(content) else "EOF" - after = content[char_pos+1:end] + after = content[char_pos + 1 : end] return { - 'before': before, - 'error_char': error_char, - 'after': after, - 'display': f"{before}[{error_char}]{after}" + "before": before, + "error_char": error_char, + "after": after, + "display": f"{before}[{error_char}]{after}", } - except Exception as e: + except Exception: return None -def count_keys(data, prefix=''): +def count_keys(data, prefix=""): """Recursively count all keys in nested TOML structure""" count = 0 if isinstance(data, dict): @@ -77,42 +76,43 @@ def count_keys(data, prefix=''): def validate_toml_file(file_path): """Validate a single TOML file and return detailed error info""" result = { - 'file': str(file_path), - 'valid': False, - 'error': None, - 'line': None, - 'context': None, - 'entry_count': 0 + "file": str(file_path), + "valid": False, + "error": None, + "line": None, + "context": None, + "entry_count": 0, } try: - with open(file_path, 'rb') as f: + with open(file_path, "rb") as f: data = tomllib.load(f) - result['valid'] = True - result['entry_count'] = count_keys(data) + result["valid"] = True + result["entry_count"] = count_keys(data) except Exception as e: error_msg = str(e) - result['error'] = error_msg + result["error"] = error_msg # Try to extract line number from error message import re - line_match = re.search(r'line (\d+)', error_msg, re.IGNORECASE) + + line_match = re.search(r"line (\d+)", error_msg, re.IGNORECASE) if line_match: line_num = int(line_match.group(1)) - result['line'] = line_num - result['context'] = get_line_context(file_path, line_num) + result["line"] = line_num + result["context"] = get_line_context(file_path, line_num) except FileNotFoundError: - result['error'] = "File not found" + result["error"] = "File not found" return result def print_validation_result(result, brief=False, quiet=False): """Print validation result in human-readable format""" - if result['valid']: + if result["valid"]: if not quiet: print(f"✓ {result['file']}") if not brief: @@ -121,30 +121,35 @@ def print_validation_result(result, brief=False, quiet=False): print(f"✗ {result['file']}") print(f" Error: {result['error']}") - if result['line']: + if result["line"]: print(f" Line: {result['line']}") - if result['context'] and not brief: - print(f"\n Context:") + if result["context"] and not brief: + print("\n Context:") print(f" {result['context'].replace(chr(10), chr(10) + ' ')}") if not brief: - print(f"\n Common fixes:") - print(f" - Check for missing quotes around keys or values") - print(f" - Ensure proper escaping of special characters") - print(f" - Verify table header syntax: [section.subsection]") - print(f" - Check for duplicate keys in the same table") + print("\n Common fixes:") + print(" - Check for missing quotes around keys or values") + print(" - Ensure proper escaping of special characters") + print(" - Verify table header syntax: [section.subsection]") + print(" - Check for duplicate keys in the same table") def main(): - parser = argparse.ArgumentParser(description='Validate TOML translation files') - parser.add_argument('files', nargs='*', help='TOML file(s) or pattern to validate') - parser.add_argument('--all-batches', metavar='LANG', - help='Validate all batch files for a language (e.g., ar_AR)') - parser.add_argument('--brief', action='store_true', - help='Show brief output without context') - parser.add_argument('--quiet', action='store_true', - help='Only show files with errors') + parser = argparse.ArgumentParser(description="Validate TOML translation files") + parser.add_argument("files", nargs="*", help="TOML file(s) or pattern to validate") + parser.add_argument( + "--all-batches", + metavar="LANG", + help="Validate all batch files for a language (e.g., ar_AR)", + ) + parser.add_argument( + "--brief", action="store_true", help="Show brief output without context" + ) + parser.add_argument( + "--quiet", action="store_true", help="Only show files with errors" + ) args = parser.parse_args() @@ -181,11 +186,11 @@ def main(): # Summary total = len(results) - valid = sum(1 for r in results if r['valid']) + valid = sum(1 for r in results if r["valid"]) invalid = total - valid if not args.quiet: - print(f"\n{'='*60}") + print(f"\n{'=' * 60}") print(f"Summary: {valid}/{total} files valid") if invalid > 0: print(f" {invalid} file(s) with errors") @@ -194,5 +199,5 @@ def main(): sys.exit(0 if invalid == 0 else 1) -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/scripts/translations/translation_analyzer.py b/scripts/translations/translation_analyzer.py index 35b2b3555..4924cd2bd 100644 --- a/scripts/translations/translation_analyzer.py +++ b/scripts/translations/translation_analyzer.py @@ -5,16 +5,19 @@ Compares language files against en-GB golden truth file. """ import json -import os import sys from pathlib import Path -from typing import Dict, List, Set, Tuple +from typing import Dict, List, Set import argparse import tomllib class TranslationAnalyzer: - def __init__(self, locales_dir: str = "frontend/public/locales", ignore_file: str = "scripts/ignore_translation.toml"): + def __init__( + self, + locales_dir: str = "frontend/public/locales", + ignore_file: str = "scripts/ignore_translation.toml", + ): self.locales_dir = Path(locales_dir) self.golden_truth_file = self.locales_dir / "en-GB" / "translation.toml" self.golden_truth = self._load_translation_file(self.golden_truth_file) @@ -24,7 +27,7 @@ class TranslationAnalyzer: def _load_translation_file(self, file_path: Path) -> Dict: """Load TOML translation file with error handling.""" try: - with open(file_path, 'rb') as f: + with open(file_path, "rb") as f: return tomllib.load(f) except FileNotFoundError: print(f"Error: File not found: {file_path}") @@ -39,17 +42,23 @@ class TranslationAnalyzer: return {} try: - with open(self.ignore_file, 'rb') as f: + with open(self.ignore_file, "rb") as f: ignore_data = tomllib.load(f) # Convert lists to sets for faster lookup - return {lang: set(patterns) for lang, data in ignore_data.items() - for patterns in [data.get('ignore', [])] if patterns} + return { + lang: set(patterns) + for lang, data in ignore_data.items() + for patterns in [data.get("ignore", [])] + if patterns + } except Exception as e: print(f"Warning: Could not load ignore file {self.ignore_file}: {e}") return {} - def _flatten_dict(self, d: Dict, parent_key: str = '', separator: str = '.') -> Dict[str, str]: + def _flatten_dict( + self, d: Dict, parent_key: str = "", separator: str = "." + ) -> Dict[str, str]: """Flatten nested dictionary into dot-notation keys.""" items = [] for k, v in d.items(): @@ -80,7 +89,7 @@ class TranslationAnalyzer: missing = set(golden_flat.keys()) - set(target_flat.keys()) # Filter out ignored keys - lang_code = target_file.parent.name.replace('-', '_') + lang_code = target_file.parent.name.replace("-", "_") ignore_set = self.ignore_patterns.get(lang_code, set()) return missing - ignore_set @@ -91,7 +100,7 @@ class TranslationAnalyzer: golden_flat = self._flatten_dict(self.golden_truth) target_flat = self._flatten_dict(target_data) - lang_code = target_file.parent.name.replace('-', '_') + lang_code = target_file.parent.name.replace("-", "_") ignore_set = self.ignore_patterns.get(lang_code, set()) untranslated = set() @@ -101,8 +110,14 @@ class TranslationAnalyzer: golden_value = golden_flat[key] # Check if marked as [UNTRANSLATED] or identical to en-GB - if (isinstance(target_value, str) and target_value.startswith("[UNTRANSLATED]")) or \ - (golden_value == target_value and key not in ignore_set and not self._is_expected_identical(key, golden_value)): + if ( + isinstance(target_value, str) + and target_value.startswith("[UNTRANSLATED]") + ) or ( + golden_value == target_value + and key not in ignore_set + and not self._is_expected_identical(key, golden_value) + ): untranslated.add(key) return untranslated @@ -110,14 +125,10 @@ class TranslationAnalyzer: def _is_expected_identical(self, key: str, value: str) -> bool: """Check if a key-value pair is expected to be identical across languages.""" # Keys that should be identical across languages - identical_patterns = [ - 'language.direction', - 'true', 'false', - 'unknown' - ] + identical_patterns = ["language.direction", "true", "false", "unknown"] # Values that are often identical (numbers, symbols, etc.) - if value.strip() in ['ltr', 'rtl', 'True', 'False']: + if value.strip() in ["ltr", "rtl", "True", "False"]: return True # Check for patterns @@ -149,7 +160,7 @@ class TranslationAnalyzer: target_flat = self._flatten_dict(target_data) # Calculate completion rate excluding ignored keys - lang_code = target_file.parent.name.replace('-', '_') + lang_code = target_file.parent.name.replace("-", "_") ignore_set = self.ignore_patterns.get(lang_code, set()) relevant_keys = set(golden_flat.keys()) - ignore_set @@ -161,22 +172,26 @@ class TranslationAnalyzer: if key in target_flat: value = target_flat[key] if not (isinstance(value, str) and value.startswith("[UNTRANSLATED]")): - if key not in untranslated: # Not identical to en-GB (unless expected) + if ( + key not in untranslated + ): # Not identical to en-GB (unless expected) properly_translated += 1 - completion_rate = (properly_translated / total_keys) * 100 if total_keys > 0 else 0 + completion_rate = ( + (properly_translated / total_keys) * 100 if total_keys > 0 else 0 + ) return { - 'language': lang_code, - 'file': target_file, - 'missing_count': len(missing), - 'missing_keys': sorted(missing), - 'untranslated_count': len(untranslated), - 'untranslated_keys': sorted(untranslated), - 'extra_count': len(extra), - 'extra_keys': sorted(extra), - 'total_keys': total_keys, - 'completion_rate': completion_rate + "language": lang_code, + "file": target_file, + "missing_count": len(missing), + "missing_keys": sorted(missing), + "untranslated_count": len(untranslated), + "untranslated_keys": sorted(untranslated), + "extra_count": len(extra), + "extra_keys": sorted(extra), + "total_keys": total_keys, + "completion_rate": completion_rate, } def analyze_all_files(self) -> List[Dict]: @@ -184,24 +199,38 @@ class TranslationAnalyzer: results = [] for file_path in self.get_all_language_files(): results.append(self.analyze_file(file_path)) - return sorted(results, key=lambda x: x['language']) + return sorted(results, key=lambda x: x["language"]) def main(): - parser = argparse.ArgumentParser(description='Analyze translation files against en-GB golden truth') - parser.add_argument('--locales-dir', default='frontend/public/locales', - help='Path to locales directory') - parser.add_argument('--ignore-file', default='scripts/ignore_translation.toml', - help='Path to ignore patterns TOML file') - parser.add_argument('--language', help='Analyze specific language only') - parser.add_argument('--missing-only', action='store_true', - help='Show only missing translations') - parser.add_argument('--untranslated-only', action='store_true', - help='Show only untranslated entries') - parser.add_argument('--summary', action='store_true', - help='Show summary statistics only') - parser.add_argument('--format', choices=['text', 'json'], default='text', - help='Output format') + parser = argparse.ArgumentParser( + description="Analyze translation files against en-GB golden truth" + ) + parser.add_argument( + "--locales-dir", + default="frontend/public/locales", + help="Path to locales directory", + ) + parser.add_argument( + "--ignore-file", + default="scripts/ignore_translation.toml", + help="Path to ignore patterns TOML file", + ) + parser.add_argument("--language", help="Analyze specific language only") + parser.add_argument( + "--missing-only", action="store_true", help="Show only missing translations" + ) + parser.add_argument( + "--untranslated-only", + action="store_true", + help="Show only untranslated entries", + ) + parser.add_argument( + "--summary", action="store_true", help="Show summary statistics only" + ) + parser.add_argument( + "--format", choices=["text", "json"], default="text", help="Output format" + ) args = parser.parse_args() @@ -220,14 +249,14 @@ def main(): else: results = analyzer.analyze_all_files() - if args.format == 'json': + if args.format == "json": print(json.dumps(results, indent=2, default=str)) return # Text format output for result in results: - lang = result['language'] - print(f"\n{'='*60}") + lang = result["language"] + print(f"\n{'=' * 60}") print(f"Language: {lang}") print(f"File: {result['file']}") print(f"Completion Rate: {result['completion_rate']:.1f}%") @@ -236,42 +265,48 @@ def main(): if not args.summary: if not args.untranslated_only: print(f"\nMissing Translations ({result['missing_count']}):") - for key in result['missing_keys'][:10]: # Show first 10 + for key in result["missing_keys"][:10]: # Show first 10 print(f" - {key}") - if len(result['missing_keys']) > 10: + if len(result["missing_keys"]) > 10: print(f" ... and {len(result['missing_keys']) - 10} more") if not args.missing_only: print(f"\nUntranslated Entries ({result['untranslated_count']}):") - for key in result['untranslated_keys'][:10]: # Show first 10 + for key in result["untranslated_keys"][:10]: # Show first 10 print(f" - {key}") - if len(result['untranslated_keys']) > 10: + if len(result["untranslated_keys"]) > 10: print(f" ... and {len(result['untranslated_keys']) - 10} more") - if result['extra_count'] > 0: + if result["extra_count"] > 0: print(f"\nExtra Keys Not in en-GB ({result['extra_count']}):") - for key in result['extra_keys'][:5]: + for key in result["extra_keys"][:5]: print(f" - {key}") - if len(result['extra_keys']) > 5: + if len(result["extra_keys"]) > 5: print(f" ... and {len(result['extra_keys']) - 5} more") - print(f"\n{'='*60}") + print(f"\n{'=' * 60}") print("SUMMARY") - print(f"{'='*60}") - avg_completion = sum(r['completion_rate'] for r in results) / len(results) if results else 0 + print(f"{'=' * 60}") + avg_completion = ( + sum(r["completion_rate"] for r in results) / len(results) if results else 0 + ) print(f"Average Completion Rate: {avg_completion:.1f}%") print(f"Languages Analyzed: {len(results)}") # Top languages by completion - sorted_by_completion = sorted(results, key=lambda x: x['completion_rate'], reverse=True) - print(f"\nTop 5 Most Complete Languages:") + sorted_by_completion = sorted( + results, key=lambda x: x["completion_rate"], reverse=True + ) + print("\nTop 5 Most Complete Languages:") for result in sorted_by_completion[:5]: print(f" {result['language']}: {result['completion_rate']:.1f}%") - print(f"\nBottom 5 Languages Needing Attention:") + print("\nBottom 5 Languages Needing Attention:") for result in sorted_by_completion[-5:]: - print(f" {result['language']}: {result['completion_rate']:.1f}% ({result['missing_count']} missing, {result['untranslated_count']} untranslated)") + print( + f" {result['language']}: {result['completion_rate']:.1f}% ({result['missing_count']} missing, {result['untranslated_count']} untranslated)" + ) if __name__ == "__main__": - main() \ No newline at end of file + main() diff --git a/scripts/translations/translation_merger.py b/scripts/translations/translation_merger.py index 328b11bfd..1dbfea262 100644 --- a/scripts/translations/translation_merger.py +++ b/scripts/translations/translation_merger.py @@ -7,10 +7,9 @@ TOML format only. """ import json -import os import sys from pathlib import Path -from typing import Dict, List, Set, Tuple, Any +from typing import Dict, List, Set, Any import argparse import shutil from datetime import datetime @@ -20,7 +19,11 @@ import tomli_w class TranslationMerger: - def __init__(self, locales_dir: str = "frontend/public/locales", ignore_file: str = "scripts/ignore_translation.toml"): + def __init__( + self, + locales_dir: str = "frontend/public/locales", + ignore_file: str = "scripts/ignore_translation.toml", + ): self.locales_dir = Path(locales_dir) self.golden_truth_file = self.locales_dir / "en-GB" / "translation.toml" self.golden_truth = self._load_translation_file(self.golden_truth_file) @@ -30,7 +33,7 @@ class TranslationMerger: def _load_translation_file(self, file_path: Path) -> Dict: """Load TOML translation file.""" try: - with open(file_path, 'rb') as f: + with open(file_path, "rb") as f: return tomllib.load(f) except FileNotFoundError: print(f"Error: File not found: {file_path}") @@ -39,14 +42,18 @@ class TranslationMerger: print(f"Error: Invalid file {file_path}: {e}") sys.exit(1) - def _save_translation_file(self, data: Dict, file_path: Path, backup: bool = False) -> None: + def _save_translation_file( + self, data: Dict, file_path: Path, backup: bool = False + ) -> None: """Save TOML translation file with backup option.""" if backup and file_path.exists(): - backup_path = file_path.with_suffix(f'.backup.{datetime.now().strftime("%Y%m%d_%H%M%S")}.toml') + backup_path = file_path.with_suffix( + f".backup.{datetime.now().strftime('%Y%m%d_%H%M%S')}.toml" + ) shutil.copy2(file_path, backup_path) print(f"Backup created: {backup_path}") - with open(file_path, 'wb') as f: + with open(file_path, "wb") as f: tomli_w.dump(data, f) def _load_ignore_patterns(self) -> Dict[str, Set[str]]: @@ -55,18 +62,20 @@ class TranslationMerger: return {} try: - with open(self.ignore_file, 'rb') as f: + with open(self.ignore_file, "rb") as f: ignore_data = tomllib.load(f) # Convert to sets for faster lookup - return {lang: set(data.get('ignore', [])) for lang, data in ignore_data.items()} + return { + lang: set(data.get("ignore", [])) for lang, data in ignore_data.items() + } except Exception as e: print(f"Warning: Could not load ignore file {self.ignore_file}: {e}") return {} def _get_nested_value(self, data: Dict, key_path: str) -> Any: """Get value from nested dict using dot notation.""" - keys = key_path.split('.') + keys = key_path.split(".") current = data for key in keys: if isinstance(current, dict) and key in current: @@ -77,7 +86,7 @@ class TranslationMerger: def _set_nested_value(self, data: Dict, key_path: str, value: Any) -> None: """Set value in nested dict using dot notation.""" - keys = key_path.split('.') + keys = key_path.split(".") current = data for key in keys[:-1]: if key not in current: @@ -85,12 +94,16 @@ class TranslationMerger: elif not isinstance(current[key], dict): # If the current value is not a dict, we can't nest into it # This handles cases where a key exists as a string but we need to make it a dict - print(f"Warning: Converting non-dict value at '{key}' to dict to allow nesting") + print( + f"Warning: Converting non-dict value at '{key}' to dict to allow nesting" + ) current[key] = {} current = current[key] current[keys[-1]] = value - def _flatten_dict(self, d: Dict, parent_key: str = '', separator: str = '.') -> Dict[str, Any]: + def _flatten_dict( + self, d: Dict, parent_key: str = "", separator: str = "." + ) -> Dict[str, Any]: """Flatten nested dictionary into dot-notation keys.""" items = [] for k, v in d.items(): @@ -103,7 +116,7 @@ class TranslationMerger: def get_missing_keys(self, target_file: Path) -> List[str]: """Get list of missing keys in target file.""" - lang_code = target_file.parent.name.replace('-', '_') + lang_code = target_file.parent.name.replace("-", "_") ignore_set = self.ignore_patterns.get(lang_code, set()) if not target_file.exists(): @@ -117,7 +130,9 @@ class TranslationMerger: missing = set(golden_flat.keys()) - set(target_flat.keys()) return sorted(missing - ignore_set) - def add_missing_translations(self, target_file: Path, keys_to_add: List[str] = None) -> Dict: + def add_missing_translations( + self, target_file: Path, keys_to_add: List[str] = None + ) -> Dict: """Add missing translations from en-GB to target file.""" if not target_file.exists(): target_data = {} @@ -136,12 +151,14 @@ class TranslationMerger: added_count += 1 return { - 'added_count': added_count, - 'missing_keys': missing_keys, - 'data': target_data + "added_count": added_count, + "missing_keys": missing_keys, + "data": target_data, } - def extract_untranslated_entries(self, target_file: Path, output_file: Path = None) -> Dict: + def extract_untranslated_entries( + self, target_file: Path, output_file: Path = None + ) -> Dict: """Extract entries marked as untranslated or identical to en-GB for AI translation.""" if not target_file.exists(): print(f"Error: Target file does not exist: {target_file}") @@ -160,20 +177,22 @@ class TranslationMerger: # Check if marked as untranslated if isinstance(value, str) and value.startswith("[UNTRANSLATED]"): untranslated_entries[key] = { - 'original': golden_value, - 'current': value, - 'reason': 'marked_untranslated' + "original": golden_value, + "current": value, + "reason": "marked_untranslated", } # Check if identical to golden (and should be translated) - elif value == golden_value and not self._is_expected_identical(key, value): + elif value == golden_value and not self._is_expected_identical( + key, value + ): untranslated_entries[key] = { - 'original': golden_value, - 'current': value, - 'reason': 'identical_to_english' + "original": golden_value, + "current": value, + "reason": "identical_to_english", } if output_file: - with open(output_file, 'w', encoding='utf-8') as f: + with open(output_file, "w", encoding="utf-8") as f: json.dump(untranslated_entries, f, indent=2, ensure_ascii=False) return untranslated_entries @@ -181,10 +200,10 @@ class TranslationMerger: def _is_expected_identical(self, key: str, value: str) -> bool: """Check if a key-value pair is expected to be identical across languages.""" identical_patterns = [ - 'language.direction', + "language.direction", ] - if str(value).strip() in ['ltr', 'rtl', 'True', 'False', 'true', 'false']: + if str(value).strip() in ["ltr", "rtl", "True", "False", "true", "false"]: return True for pattern in identical_patterns: @@ -193,12 +212,13 @@ class TranslationMerger: return False - def apply_translations(self, target_file: Path, translations: Dict[str, str], - backup: bool = False) -> Dict: + def apply_translations( + self, target_file: Path, translations: Dict[str, str], backup: bool = False + ) -> Dict: """Apply provided translations to target file.""" if not target_file.exists(): print(f"Error: Target file does not exist: {target_file}") - return {'success': False, 'error': 'File not found'} + return {"success": False, "error": "File not found"} target_data = self._load_translation_file(target_file) applied_count = 0 @@ -219,10 +239,10 @@ class TranslationMerger: self._save_translation_file(target_data, target_file, backup) return { - 'success': True, - 'applied_count': applied_count, - 'errors': errors, - 'data': target_data + "success": True, + "applied_count": applied_count, + "errors": errors, + "data": target_data, } def create_translation_template(self, target_file: Path, output_file: Path) -> None: @@ -230,25 +250,25 @@ class TranslationMerger: untranslated = self.extract_untranslated_entries(target_file) template = { - 'metadata': { - 'source_language': 'en-GB', - 'target_language': target_file.parent.name, - 'total_entries': len(untranslated), - 'created_at': datetime.now().isoformat(), - 'instructions': 'Translate the "original" values to the target language. Keep the same keys.' + "metadata": { + "source_language": "en-GB", + "target_language": target_file.parent.name, + "total_entries": len(untranslated), + "created_at": datetime.now().isoformat(), + "instructions": 'Translate the "original" values to the target language. Keep the same keys.', }, - 'translations': {} + "translations": {}, } for key, entry in untranslated.items(): - template['translations'][key] = { - 'original': entry['original'], - 'translated': '', # AI should fill this - 'context': self._get_context_for_key(key), - 'reason': entry['reason'] + template["translations"][key] = { + "original": entry["original"], + "translated": "", # AI should fill this + "context": self._get_context_for_key(key), + "reason": entry["reason"], } - with open(output_file, 'w', encoding='utf-8') as f: + with open(output_file, "w", encoding="utf-8") as f: json.dump(template, f, indent=2, ensure_ascii=False) print(f"Translation template created: {output_file}") @@ -256,7 +276,7 @@ class TranslationMerger: def _get_context_for_key(self, key: str) -> str: """Get context information for a translation key.""" - parts = key.split('.') + parts = key.split(".") if len(parts) >= 2: return f"Section: {parts[0]}, Property: {parts[-1]}" return f"Property: {parts[-1]}" @@ -264,33 +284,55 @@ class TranslationMerger: def main(): parser = argparse.ArgumentParser( - description='Merge and manage translation files', - epilog='Works with TOML translation files.' + description="Merge and manage translation files", + epilog="Works with TOML translation files.", ) - parser.add_argument('--locales-dir', default='frontend/public/locales', - help='Path to locales directory') - parser.add_argument('--ignore-file', default='scripts/ignore_translation.toml', - help='Path to ignore patterns TOML file') - parser.add_argument('language', help='Target language code (e.g., fr-FR)') + parser.add_argument( + "--locales-dir", + default="frontend/public/locales", + help="Path to locales directory", + ) + parser.add_argument( + "--ignore-file", + default="scripts/ignore_translation.toml", + help="Path to ignore patterns TOML file", + ) + parser.add_argument("language", help="Target language code (e.g., fr-FR)") - subparsers = parser.add_subparsers(dest='command', help='Available commands') + subparsers = parser.add_subparsers(dest="command", help="Available commands") # Add missing command - add_parser = subparsers.add_parser('add-missing', help='Add missing translations from en-GB') - add_parser.add_argument('--backup', action='store_true', help='Create backup before modifying files') + add_parser = subparsers.add_parser( + "add-missing", help="Add missing translations from en-GB" + ) + add_parser.add_argument( + "--backup", action="store_true", help="Create backup before modifying files" + ) # Extract untranslated command - extract_parser = subparsers.add_parser('extract-untranslated', help='Extract untranslated entries') - extract_parser.add_argument('--output', help='Output file path') + extract_parser = subparsers.add_parser( + "extract-untranslated", help="Extract untranslated entries" + ) + extract_parser.add_argument("--output", help="Output file path") # Create template command - template_parser = subparsers.add_parser('create-template', help='Create AI translation template') - template_parser.add_argument('--output', required=True, help='Output template file path') + template_parser = subparsers.add_parser( + "create-template", help="Create AI translation template" + ) + template_parser.add_argument( + "--output", required=True, help="Output template file path" + ) # Apply translations command - apply_parser = subparsers.add_parser('apply-translations', help='Apply translations from JSON file') - apply_parser.add_argument('--translations-file', required=True, help='JSON file with translations') - apply_parser.add_argument('--backup', action='store_true', help='Create backup before modifying files') + apply_parser = subparsers.add_parser( + "apply-translations", help="Apply translations from JSON file" + ) + apply_parser.add_argument( + "--translations-file", required=True, help="JSON file with translations" + ) + apply_parser.add_argument( + "--backup", action="store_true", help="Create backup before modifying files" + ) args = parser.parse_args() @@ -304,44 +346,53 @@ def main(): lang_dir = Path(args.locales_dir) / args.language target_file = lang_dir / "translation.toml" - if args.command == 'add-missing': + if args.command == "add-missing": print(f"Adding missing translations to {args.language}...") result = merger.add_missing_translations(target_file) - merger._save_translation_file(result['data'], target_file, backup=args.backup) + merger._save_translation_file(result["data"], target_file, backup=args.backup) print(f"Added {result['added_count']} missing translations") - elif args.command == 'extract-untranslated': - output_file = Path(args.output) if args.output else target_file.with_suffix('.untranslated.json') + elif args.command == "extract-untranslated": + output_file = ( + Path(args.output) + if args.output + else target_file.with_suffix(".untranslated.json") + ) untranslated = merger.extract_untranslated_entries(target_file, output_file) print(f"Extracted {len(untranslated)} untranslated entries to {output_file}") - elif args.command == 'create-template': + elif args.command == "create-template": output_file = Path(args.output) merger.create_translation_template(target_file, output_file) - elif args.command == 'apply-translations': - with open(args.translations_file, 'r', encoding='utf-8') as f: + elif args.command == "apply-translations": + with open(args.translations_file, "r", encoding="utf-8") as f: translations_data = json.load(f) # Extract translations from template format or simple dict - if 'translations' in translations_data: - translations = {k: v['translated'] for k, v in translations_data['translations'].items() - if v.get('translated')} + if "translations" in translations_data: + translations = { + k: v["translated"] + for k, v in translations_data["translations"].items() + if v.get("translated") + } else: translations = translations_data - result = merger.apply_translations(target_file, translations, backup=args.backup) + result = merger.apply_translations( + target_file, translations, backup=args.backup + ) - if result['success']: + if result["success"]: print(f"Applied {result['applied_count']} translations") - if result['errors']: + if result["errors"]: print(f"Errors: {len(result['errors'])}") - for error in result['errors'][:5]: + for error in result["errors"][:5]: print(f" - {error}") else: print(f"Failed: {result.get('error', 'Unknown error')}") if __name__ == "__main__": - main() \ No newline at end of file + main() diff --git a/scripts/translations/validate_json_structure.py b/scripts/translations/validate_json_structure.py index 102bc154c..d204f14b6 100644 --- a/scripts/translations/validate_json_structure.py +++ b/scripts/translations/validate_json_structure.py @@ -16,12 +16,12 @@ Usage: import json import sys from pathlib import Path -from typing import Dict, List, Set +from typing import Dict, Set import argparse import tomllib # Python 3.11+ (stdlib) -def get_all_keys(d: dict, parent_key: str = '', sep: str = '.') -> Set[str]: +def get_all_keys(d: dict, parent_key: str = "", sep: str = ".") -> Set[str]: """Get all keys from nested dict as dot-notation paths.""" keys = set() for k, v in d.items(): @@ -35,7 +35,7 @@ def get_all_keys(d: dict, parent_key: str = '', sep: str = '.') -> Set[str]: def validate_translation_file(file_path: Path) -> tuple[bool, str]: """Validate that a file contains valid TOML.""" try: - with open(file_path, 'rb') as f: + with open(file_path, "rb") as f: tomllib.load(f) return True, "Valid TOML" except Exception as e: @@ -43,95 +43,85 @@ def validate_translation_file(file_path: Path) -> tuple[bool, str]: def validate_structure( - en_gb_keys: Set[str], - lang_keys: Set[str], - lang_code: str + en_gb_keys: Set[str], lang_keys: Set[str], lang_code: str ) -> Dict: """Compare structure between en-GB and target language.""" missing_keys = en_gb_keys - lang_keys extra_keys = lang_keys - en_gb_keys return { - 'language': lang_code, - 'missing_keys': sorted(missing_keys), - 'extra_keys': sorted(extra_keys), - 'total_keys': len(lang_keys), - 'expected_keys': len(en_gb_keys), - 'missing_count': len(missing_keys), - 'extra_count': len(extra_keys) + "language": lang_code, + "missing_keys": sorted(missing_keys), + "extra_keys": sorted(extra_keys), + "total_keys": len(lang_keys), + "expected_keys": len(en_gb_keys), + "missing_count": len(missing_keys), + "extra_count": len(extra_keys), } def print_validation_result(result: Dict, verbose: bool = False): """Print validation results in readable format.""" - lang = result['language'] + lang = result["language"] - print(f"\n{'='*100}") + print(f"\n{'=' * 100}") print(f"Language: {lang}") - print(f"{'='*100}") + print(f"{'=' * 100}") print(f" Total keys: {result['total_keys']}") print(f" Expected keys (en-GB): {result['expected_keys']}") print(f" Missing keys: {result['missing_count']}") print(f" Extra keys: {result['extra_count']}") - if result['missing_count'] == 0 and result['extra_count'] == 0: - print(f" ✅ Structure matches en-GB perfectly!") + if result["missing_count"] == 0 and result["extra_count"] == 0: + print(" ✅ Structure matches en-GB perfectly!") else: - if result['missing_count'] > 0: + if result["missing_count"] > 0: print(f"\n ⚠️ Missing {result['missing_count']} key(s):") - if verbose or result['missing_count'] <= 20: - for key in result['missing_keys'][:50]: + if verbose or result["missing_count"] <= 20: + for key in result["missing_keys"][:50]: print(f" - {key}") - if result['missing_count'] > 50: + if result["missing_count"] > 50: print(f" ... and {result['missing_count'] - 50} more") else: - print(f" (use --verbose to see all)") + print(" (use --verbose to see all)") - if result['extra_count'] > 0: + if result["extra_count"] > 0: print(f"\n ⚠️ Extra {result['extra_count']} key(s) not in en-GB:") - if verbose or result['extra_count'] <= 20: - for key in result['extra_keys'][:50]: + if verbose or result["extra_count"] <= 20: + for key in result["extra_keys"][:50]: print(f" - {key}") - if result['extra_count'] > 50: + if result["extra_count"] > 50: print(f" ... and {result['extra_count'] - 50} more") else: - print(f" (use --verbose to see all)") + print(" (use --verbose to see all)") print("-" * 100) def load_translation_file(file_path: Path) -> dict: """Load TOML translation file.""" - with open(file_path, 'rb') as f: + with open(file_path, "rb") as f: return tomllib.load(f) def main(): - parser = argparse.ArgumentParser( - description='Validate translation TOML structure' + parser = argparse.ArgumentParser(description="Validate translation TOML structure") + parser.add_argument( + "--language", + help="Specific language code to validate (e.g., es-ES)", + default=None, ) parser.add_argument( - '--language', - help='Specific language code to validate (e.g., es-ES)', - default=None - ) - parser.add_argument( - '--verbose', '-v', - action='store_true', - help='Show all missing/extra keys' - ) - parser.add_argument( - '--json', - action='store_true', - help='Output results as JSON' + "--verbose", "-v", action="store_true", help="Show all missing/extra keys" ) + parser.add_argument("--json", action="store_true", help="Output results as JSON") args = parser.parse_args() # Define paths - locales_dir = Path('frontend/public/locales') - en_gb_path = locales_dir / 'en-GB' / 'translation.toml' - file_ext = '.toml' + locales_dir = Path("frontend/public/locales") + en_gb_path = locales_dir / "en-GB" / "translation.toml" + file_ext = ".toml" if not en_gb_path.exists(): print(f"❌ Error: en-GB translation file not found at {en_gb_path}") @@ -155,8 +145,8 @@ def main(): # Validate all languages except en-GB languages = [] for d in locales_dir.iterdir(): - if d.is_dir() and d.name != 'en-GB': - if (d / 'translation.toml').exists(): + if d.is_dir() and d.name != "en-GB": + if (d / "translation.toml").exists(): languages.append(d.name) results = [] @@ -164,7 +154,7 @@ def main(): # Validate each language for lang_code in sorted(languages): - lang_path = locales_dir / lang_code / 'translation.toml' + lang_path = locales_dir / lang_code / "translation.toml" if not lang_path.exists(): print(f"⚠️ Warning: {lang_code}/translation.toml not found, skipping") @@ -173,11 +163,9 @@ def main(): # First check if file is valid is_valid, message = validate_translation_file(lang_path) if not is_valid: - json_errors.append({ - 'language': lang_code, - 'file': str(lang_path), - 'error': message - }) + json_errors.append( + {"language": lang_code, "file": str(lang_path), "error": message} + ) continue # Load and compare structure @@ -189,10 +177,7 @@ def main(): # Output results if args.json: - output = { - 'json_errors': json_errors, - 'structure_validation': results - } + output = {"json_errors": json_errors, "structure_validation": results} print(json.dumps(output, indent=2, ensure_ascii=False)) else: # Print syntax errors first @@ -210,11 +195,13 @@ def main(): print("\n📊 Structure Validation Summary:") print(f" Languages validated: {len(results)}") - perfect = sum(1 for r in results if r['missing_count'] == 0 and r['extra_count'] == 0) + perfect = sum( + 1 for r in results if r["missing_count"] == 0 and r["extra_count"] == 0 + ) print(f" Perfect matches: {perfect}/{len(results)}") - total_missing = sum(r['missing_count'] for r in results) - total_extra = sum(r['extra_count'] for r in results) + total_missing = sum(r["missing_count"] for r in results) + total_extra = sum(r["extra_count"] for r in results) print(f" Total missing keys: {total_missing}") print(f" Total extra keys: {total_extra}") @@ -226,10 +213,10 @@ def main(): # Exit with error code if issues found has_issues = len(json_errors) > 0 or any( - r['missing_count'] > 0 or r['extra_count'] > 0 for r in results + r["missing_count"] > 0 or r["extra_count"] > 0 for r in results ) sys.exit(1 if has_issues else 0) -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/scripts/translations/validate_placeholders.py b/scripts/translations/validate_placeholders.py index 5ce18d288..1d59ee2f4 100644 --- a/scripts/translations/validate_placeholders.py +++ b/scripts/translations/validate_placeholders.py @@ -13,7 +13,7 @@ import json import re import sys from pathlib import Path -from typing import Dict, List, Set, Tuple +from typing import Dict, List, Set import argparse import tomllib # Python 3.11+ (stdlib) @@ -22,10 +22,10 @@ def find_placeholders(text: str) -> Set[str]: """Find all placeholders in text like {n}, {{var}}, {0}, etc.""" if not isinstance(text, str): return set() - return set(re.findall(r'\{\{?[^}]+\}\}?', text)) + return set(re.findall(r"\{\{?[^}]+\}\}?", text)) -def flatten_dict(d: dict, parent_key: str = '', sep: str = '.') -> Dict[str, str]: +def flatten_dict(d: dict, parent_key: str = "", sep: str = ".") -> Dict[str, str]: """Flatten nested dict to dot-notation keys.""" items = [] for k, v in d.items(): @@ -38,9 +38,7 @@ def flatten_dict(d: dict, parent_key: str = '', sep: str = '.') -> Dict[str, str def validate_language( - en_gb_flat: Dict[str, str], - lang_flat: Dict[str, str], - lang_code: str + en_gb_flat: Dict[str, str], lang_flat: Dict[str, str], lang_code: str ) -> List[Dict]: """Validate placeholders for a language against en-GB.""" issues = [] @@ -57,12 +55,12 @@ def validate_language( extra = lang_placeholders - en_placeholders issue = { - 'language': lang_code, - 'key': key, - 'missing': missing, - 'extra': extra, - 'en_text': en_gb_flat[key], - 'lang_text': lang_flat[key] + "language": lang_code, + "key": key, + "missing": missing, + "extra": extra, + "en_text": en_gb_flat[key], + "lang_text": lang_flat[key], } issues.append(issue) @@ -82,9 +80,9 @@ def print_issues(issues: List[Dict], verbose: bool = False): print(f"\n{i}. Language: {issue['language']}") print(f" Key: {issue['key']}") - if issue['missing']: + if issue["missing"]: print(f" ⚠️ MISSING placeholders: {issue['missing']}") - if issue['extra']: + if issue["extra"]: print(f" ⚠️ EXTRA placeholders: {issue['extra']}") if verbose: @@ -96,37 +94,34 @@ def print_issues(issues: List[Dict], verbose: bool = False): def main(): parser = argparse.ArgumentParser( - description='Validate translation placeholder consistency' + description="Validate translation placeholder consistency" ) parser.add_argument( - '--language', - help='Specific language code to validate (e.g., es-ES)', - default=None + "--language", + help="Specific language code to validate (e.g., es-ES)", + default=None, ) parser.add_argument( - '--verbose', '-v', - action='store_true', - help='Show full text samples for each issue' - ) - parser.add_argument( - '--json', - action='store_true', - help='Output results as JSON' + "--verbose", + "-v", + action="store_true", + help="Show full text samples for each issue", ) + parser.add_argument("--json", action="store_true", help="Output results as JSON") args = parser.parse_args() # Define paths - locales_dir = Path('frontend/public/locales') - en_gb_path = locales_dir / 'en-GB' / 'translation.toml' - file_ext = '.toml' + locales_dir = Path("frontend/public/locales") + en_gb_path = locales_dir / "en-GB" / "translation.toml" + file_ext = ".toml" if not en_gb_path.exists(): print(f"❌ Error: en-GB translation file not found at {en_gb_path}") sys.exit(1) # Load en-GB (source of truth) - with open(en_gb_path, 'rb') as f: + with open(en_gb_path, "rb") as f: en_gb = tomllib.load(f) en_gb_flat = flatten_dict(en_gb) @@ -138,22 +133,22 @@ def main(): # Validate all languages except en-GB languages = [] for d in locales_dir.iterdir(): - if d.is_dir() and d.name != 'en-GB': - if (d / 'translation.toml').exists(): + if d.is_dir() and d.name != "en-GB": + if (d / "translation.toml").exists(): languages.append(d.name) all_issues = [] # Validate each language for lang_code in sorted(languages): - lang_path = locales_dir / lang_code / 'translation.toml' + lang_path = locales_dir / lang_code / "translation.toml" if not lang_path.exists(): print(f"⚠️ Warning: {lang_code}/translation.toml not found, skipping") continue # Load language file - with open(lang_path, 'rb') as f: + with open(lang_path, "rb") as f: lang_data = tomllib.load(f) lang_flat = flatten_dict(lang_data) @@ -168,19 +163,19 @@ def main(): # Group by language by_language = {} for issue in all_issues: - lang = issue['language'] + lang = issue["language"] if lang not in by_language: by_language[lang] = [] by_language[lang].append(issue) - print(f"📊 Validation Summary:") + print("📊 Validation Summary:") print(f" Total issues: {len(all_issues)}") print(f" Languages with issues: {len(by_language)}\n") for lang in sorted(by_language.keys()): - print(f"\n{'='*100}") + print(f"\n{'=' * 100}") print(f"Language: {lang} ({len(by_language[lang])} issue(s))") - print(f"{'='*100}") + print(f"{'=' * 100}") print_issues(by_language[lang], verbose=args.verbose) else: print("✅ All translations have correct placeholders!") @@ -189,5 +184,5 @@ def main(): sys.exit(1 if all_issues else 0) -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/scripts/type3_to_cff.py b/scripts/type3_to_cff.py index 0aaf13218..b939620fb 100644 --- a/scripts/type3_to_cff.py +++ b/scripts/type3_to_cff.py @@ -55,14 +55,33 @@ class GlyphBuildResult: def parse_args() -> argparse.Namespace: - parser = argparse.ArgumentParser(description="Synthesize fonts from Type3 glyph JSON.") - parser.add_argument("--input", required=True, help="Path to glyph JSON emitted by the backend") - parser.add_argument("--otf-output", required=True, help="Destination path for the CFF/OTF font") - parser.add_argument("--ttf-output", help="Optional destination path for a TrueType font") - parser.add_argument("--family-name", default="Type3 Synth", help="Family name for the output") - parser.add_argument("--style-name", default="Regular", help="Style name for the output") - parser.add_argument("--units-per-em", type=int, default=1000, help="Units per EM value") - parser.add_argument("--cu2qu-error", type=float, default=1.0, help="Max error for cubic→quadratic conversion") + parser = argparse.ArgumentParser( + description="Synthesize fonts from Type3 glyph JSON." + ) + parser.add_argument( + "--input", required=True, help="Path to glyph JSON emitted by the backend" + ) + parser.add_argument( + "--otf-output", required=True, help="Destination path for the CFF/OTF font" + ) + parser.add_argument( + "--ttf-output", help="Optional destination path for a TrueType font" + ) + parser.add_argument( + "--family-name", default="Type3 Synth", help="Family name for the output" + ) + parser.add_argument( + "--style-name", default="Regular", help="Style name for the output" + ) + parser.add_argument( + "--units-per-em", type=int, default=1000, help="Units per EM value" + ) + parser.add_argument( + "--cu2qu-error", + type=float, + default=1.0, + help="Max error for cubic→quadratic conversion", + ) return parser.parse_args() @@ -151,18 +170,22 @@ def iterate_glyphs(data: Dict[str, object]) -> List[GlyphSource]: char_code_value = record.get("code") if not isinstance(char_code_value, int): char_code_value = record.get("charCodeRaw") - if not isinstance(char_code_value, int) or not (0 <= char_code_value <= 0x10FFFF): + if not isinstance(char_code_value, int) or not ( + 0 <= char_code_value <= 0x10FFFF + ): char_code_value = None outline = record.get("outline") if not isinstance(outline, list): outline = [] sources.append( - GlyphSource( - name=name, - width=float(width), - unicode=unicode_value, - char_code=char_code_value, - outline=outline)) + GlyphSource( + name=name, + width=float(width), + unicode=unicode_value, + char_code=char_code_value, + outline=outline, + ) + ) return sources @@ -199,7 +222,10 @@ def build_cff_charstring( start_point = point open_path = True elif op == "L" and current is not None: - point = (float(command.get("x", current[0])), float(command.get("y", current[1]))) + point = ( + float(command.get("x", current[0])), + float(command.get("y", current[1])), + ) pen.lineTo(point) update_bounds(point) current = point diff --git a/scripts/update_type3_library.py b/scripts/update_type3_library.py index fe4068e0f..dc1cf0505 100644 --- a/scripts/update_type3_library.py +++ b/scripts/update_type3_library.py @@ -17,14 +17,21 @@ from __future__ import annotations import argparse import json import sys -from collections import defaultdict from pathlib import Path from typing import Dict, Iterable, List, Optional, Tuple REPO_ROOT = Path(__file__).resolve().parents[1] DEFAULT_SIGNATURES = REPO_ROOT / "docs" / "type3" / "signatures" DEFAULT_INDEX = ( - REPO_ROOT / "app" / "core" / "src" / "main" / "resources" / "type3" / "library" / "index.json" + REPO_ROOT + / "app" + / "core" + / "src" + / "main" + / "resources" + / "type3" + / "library" + / "index.json" ) @@ -136,7 +143,12 @@ def update_library( entry = alias_index[alias] if entry is None: - unmatched.append((font.get("baseName") or font.get("alias_raw") or "unknown", sig_file)) + unmatched.append( + ( + font.get("baseName") or font.get("alias_raw") or "unknown", + sig_file, + ) + ) continue entry_modified = False @@ -186,7 +198,9 @@ def update_library( def parse_args() -> argparse.Namespace: - parser = argparse.ArgumentParser(description="Update Type3 library index using signature dumps.") + parser = argparse.ArgumentParser( + description="Update Type3 library index using signature dumps." + ) parser.add_argument( "--signatures-dir", type=Path, @@ -209,7 +223,11 @@ def parse_args() -> argparse.Namespace: def main() -> None: args = parse_args() - signatures_dir = args.signatures_dir if args.signatures_dir.is_absolute() else (REPO_ROOT / args.signatures_dir) + signatures_dir = ( + args.signatures_dir + if args.signatures_dir.is_absolute() + else (REPO_ROOT / args.signatures_dir) + ) index_path = args.index if args.index.is_absolute() else (REPO_ROOT / args.index) if not signatures_dir.exists(): diff --git a/testing/compose/docker-compose-security-with-login.yml b/testing/compose/docker-compose-security-with-login.yml index feb91b080..af3dc1f68 100644 --- a/testing/compose/docker-compose-security-with-login.yml +++ b/testing/compose/docker-compose-security-with-login.yml @@ -60,4 +60,4 @@ networks: volumes: stirling-data: stirling-config: - stirling-logs: \ No newline at end of file + stirling-logs: diff --git a/testing/compose/docker-compose-security.yml b/testing/compose/docker-compose-security.yml index 14aedb697..9ea37c96c 100644 --- a/testing/compose/docker-compose-security.yml +++ b/testing/compose/docker-compose-security.yml @@ -56,4 +56,4 @@ networks: volumes: stirling-data: stirling-config: - stirling-logs: \ No newline at end of file + stirling-logs: diff --git a/testing/compose/docker-compose-ultra-lite.yml b/testing/compose/docker-compose-ultra-lite.yml index bddea4668..473310166 100644 --- a/testing/compose/docker-compose-ultra-lite.yml +++ b/testing/compose/docker-compose-ultra-lite.yml @@ -56,4 +56,4 @@ networks: volumes: stirling-data: stirling-config: - stirling-logs: \ No newline at end of file + stirling-logs: