Mirror of https://github.com/Frooodle/Stirling-PDF.git (synced 2026-04-16 23:08:38 +02:00)
🤖 format everything with pre-commit by stirlingbot (#5144)
Auto-generated by [create-pull-request][1] with **stirlingbot**

[1]: https://github.com/peter-evans/create-pull-request

Signed-off-by: stirlingbot[bot] <stirlingbot[bot]@users.noreply.github.com>
Co-authored-by: stirlingbot[bot] <195170888+stirlingbot[bot]@users.noreply.github.com>
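The diff below is consistent with a Black-style Python formatter running as a pre-commit hook (double-quoted strings, 88-column wrapping, trailing commas). A minimal sketch of that effect, assuming Black is the configured hook; the repository's actual .pre-commit-config.yaml is not part of this diff:

import black

# One of the lines changed below, before formatting (single quotes).
src = "with open(file_path, 'rb') as file:\n    data = tomllib.load(file)\n"

# Black normalizes string quotes and rewraps lines beyond the length limit.
print(black.format_str(src, mode=black.Mode(line_length=88)), end="")
# with open(file_path, "rb") as file:
#     data = tomllib.load(file)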
.github/config/dependency-review-config.yml (vendored, 2 changes)

@@ -1 +1 @@
-allow-ghsas: GHSA-wrw7-89jp-8q8g
+allow-ghsas: GHSA-wrw7-89jp-8q8g
.github/scripts/check_language_toml.py (vendored, 20 changes)

@@ -14,12 +14,10 @@ Usage:
 # Sample for Windows:
 # python .github/scripts/check_language_toml.py --reference-file frontend/public/locales/en-GB/translation.toml --branch "" --files frontend/public/locales/de-DE/translation.toml frontend/public/locales/fr-FR/translation.toml

 import copy
 import glob
 import os
 import argparse
 import re
 import json
 import tomllib  # Python 3.11+ (stdlib)
 import tomli_w  # For writing TOML files

@@ -38,7 +36,7 @@ def find_duplicate_keys(file_path, keys=None, prefix=""):
     duplicates = []

     # Load TOML file
-    with open(file_path, 'rb') as file:
+    with open(file_path, "rb") as file:
         data = tomllib.load(file)

     def process_dict(obj, current_prefix=""):
@@ -67,7 +65,7 @@ def parse_toml_file(file_path):
     :param file_path: Path to the TOML file.
     :return: Dictionary with flattened keys.
     """
-    with open(file_path, 'rb') as file:
+    with open(file_path, "rb") as file:
         data = tomllib.load(file)

     def flatten_dict(d, parent_key="", sep="."):
@@ -193,13 +191,13 @@ def check_for_differences(reference_file, file_list, branch, actor):
         basename_current_file = os.path.basename(os.path.join(branch, file_normpath))
         locale_dir = os.path.basename(os.path.dirname(file_normpath))

-        if (
-            basename_current_file == basename_reference_file
-            and locale_dir == "en-GB"
-        ):
+        if basename_current_file == basename_reference_file and locale_dir == "en-GB":
             continue

-        if not file_normpath.endswith(".toml") or basename_current_file != "translation.toml":
+        if (
+            not file_normpath.endswith(".toml")
+            or basename_current_file != "translation.toml"
+        ):
             continue

         only_reference_file = False
@@ -288,7 +286,9 @@ def check_for_differences(reference_file, file_list, branch, actor):


 if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description="Find missing keys in TOML translation files")
+    parser = argparse.ArgumentParser(
+        description="Find missing keys in TOML translation files"
+    )
     parser.add_argument(
         "--actor",
         required=False,
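The checker above compares translation files through a flatten_dict(d, parent_key="", sep=".") helper. A self-contained sketch of that flattening idea, reusing the signature from the diff; the helper's full body is not shown in these hunks, so treat this as an illustration rather than the exact implementation:

def flatten_dict(d, parent_key="", sep="."):
    # Collapse nested TOML tables into dotted keys so two translation
    # files can be compared key-by-key regardless of nesting depth.
    items = {}
    for k, v in d.items():
        new_key = f"{parent_key}{sep}{k}" if parent_key else k
        if isinstance(v, dict):
            items.update(flatten_dict(v, new_key, sep))
        else:
            items[new_key] = v
    return items

print(flatten_dict({"compress": {"title": "Compress", "submit": "Go"}}))
# {'compress.title': 'Compress', 'compress.submit': 'Go'}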
@@ -54,4 +54,4 @@ services:

 networks:
   stirling-network:
-    driver: bridge
+    driver: bridge
@@ -19,27 +19,27 @@ const debug = (message) => {
 function scanForUsedIcons() {
   const usedIcons = new Set();
   const srcDir = path.join(__dirname, '..', 'src');

   info('🔍 Scanning codebase for LocalIcon usage...');

   if (!fs.existsSync(srcDir)) {
     console.error('❌ Source directory not found:', srcDir);
     process.exit(1);
   }

   // Recursively scan all .tsx and .ts files
   function scanDirectory(dir) {
     const files = fs.readdirSync(dir);

     files.forEach(file => {
       const filePath = path.join(dir, file);
       const stat = fs.statSync(filePath);

       if (stat.isDirectory()) {
         scanDirectory(filePath);
       } else if (file.endsWith('.tsx') || file.endsWith('.ts')) {
         const content = fs.readFileSync(filePath, 'utf8');

         // Match LocalIcon usage: <LocalIcon icon="icon-name" ...>
         const localIconMatches = content.match(/<LocalIcon\s+[^>]*icon="([^"]+)"/g);
         if (localIconMatches) {
@@ -51,7 +51,7 @@ function scanForUsedIcons() {
         }
       });
     }

     // Match old material-symbols-rounded spans: <span className="material-symbols-rounded">icon-name</span>
     const spanMatches = content.match(/<span[^>]*className="[^"]*material-symbols-rounded[^"]*"[^>]*>([^<]+)<\/span>/g);
     if (spanMatches) {
@@ -64,7 +64,7 @@ function scanForUsedIcons() {
         }
       });
     }

     // Match Icon component usage: <Icon icon="material-symbols:icon-name" ...>
     const iconMatches = content.match(/<Icon\s+[^>]*icon="material-symbols:([^"]+)"/g);
     if (iconMatches) {
@@ -79,12 +79,12 @@ function scanForUsedIcons() {
         }
       });
     }

   scanDirectory(srcDir);

   const iconArray = Array.from(usedIcons).sort();
   info(`📋 Found ${iconArray.length} unique icons across codebase`);

   return iconArray;
 }
@@ -102,7 +102,7 @@ async function main() {
     const existingSet = JSON.parse(fs.readFileSync(outputPath, 'utf8'));
     const existingIcons = Object.keys(existingSet.icons || {}).sort();
     const currentIcons = [...usedIcons].sort();

     if (JSON.stringify(existingIcons) === JSON.stringify(currentIcons)) {
       needsRegeneration = false;
       info(`✅ Icon set already up-to-date (${usedIcons.length} icons, ${Math.round(fs.statSync(outputPath).size / 1024)}KB)`);
@@ -122,7 +122,7 @@ async function main() {

     // Dynamic import of ES module
     const { getIcons } = await import('@iconify/utils');


     // Extract only our used icons from the full set
     const extractedIcons = getIcons(icons, usedIcons);
@@ -183,4 +183,4 @@ export default iconSet;
 main().catch(error => {
   console.error('❌ Script failed:', error);
   process.exit(1);
-});
+});
@@ -9,10 +9,10 @@ The script prints size and font statistics so we can confirm whether the
 lightweight export (no COS dictionaries) is active and how large the font
 payloads are.
 """

 from __future__ import annotations

 import argparse
 import base64
 import json
 import math
 from pathlib import Path
@@ -105,7 +105,11 @@ def analyze_fonts(fonts: Iterable[Dict[str, Any]]) -> FontBreakdown:
         sample_cos_ids.append((font_id, uid))

         metadata_bytes += approx_struct_size(
-            {k: v for k, v in font.items() if k not in {"program", "webProgram", "pdfProgram"}}
+            {
+                k: v
+                for k, v in font.items()
+                if k not in {"program", "webProgram", "pdfProgram"}
+            }
         )

         program = font.get("program")
@@ -259,18 +263,14 @@ def main() -> None:
         f" Text payload characters (not counting JSON overhead): "
         f"{page_stats.text_payload_chars:,}"
     )
-    print(
-        f" Approx text structure bytes: {human_bytes(page_stats.text_struct_bytes)}"
-    )
+    print(f" Approx text structure bytes: {human_bytes(page_stats.text_struct_bytes)}")
     print(
         f" Approx image structure bytes: {human_bytes(page_stats.image_struct_bytes)}"
     )
     print(
         f" Approx content stream bytes: {human_bytes(page_stats.content_stream_bytes)}"
     )
-    print(
-        f" Approx annotations bytes: {human_bytes(page_stats.annotations_bytes)}"
-    )
+    print(f" Approx annotations bytes: {human_bytes(page_stats.annotations_bytes)}")


 if __name__ == "__main__":
@@ -3,6 +3,7 @@
 Wrap raw CFF/Type1C data (extracted from PDFs) as OpenType-CFF for web compatibility.
 Builds proper Unicode cmap from PDF ToUnicode data.
 """

 import sys
 import re
 from pathlib import Path
@@ -13,6 +14,7 @@ from fontTools.ttLib.tables._c_m_a_p import cmap_format_4, cmap_format_12
 from fontTools.ttLib.tables._n_a_m_e import NameRecord
 from fontTools.ttLib.tables.O_S_2f_2 import Panose


 def parse_unicode_mapping(mapping_path):
     """
     Parse Unicode mapping (either JSON with CharCode→CID→GID→Unicode or raw ToUnicode CMap).
@@ -21,23 +23,27 @@ def parse_unicode_mapping(mapping_path):
         dict[int, int]: GID → Unicode codepoint
     """
     try:
-        with open(mapping_path, 'rb') as f:
-            data = f.read().decode('utf-8', errors='ignore')
+        with open(mapping_path, "rb") as f:
+            data = f.read().decode("utf-8", errors="ignore")

         # Try parsing as JSON first (CID font with complete mapping)
-        if data.strip().startswith('{'):
+        if data.strip().startswith("{"):
             import json
+
             try:
                 mapping_data = json.loads(data)
-                if mapping_data.get('isCID'):
+                if mapping_data.get("isCID"):
                     # Build GID → Unicode mapping from entries
                     gid_to_unicode = {}
-                    for entry in mapping_data.get('entries', []):
-                        gid = entry['gid']
-                        unicode_val = entry['unicode']
+                    for entry in mapping_data.get("entries", []):
+                        gid = entry["gid"]
+                        unicode_val = entry["unicode"]
                         if unicode_val > 0:
                             gid_to_unicode[gid] = unicode_val
-                    print(f"Parsed JSON mapping: {len(gid_to_unicode)} GID→Unicode entries", file=sys.stderr)
+                    print(
+                        f"Parsed JSON mapping: {len(gid_to_unicode)} GID→Unicode entries",
+                        file=sys.stderr,
+                    )
                     return gid_to_unicode
             except json.JSONDecodeError:
                 pass
@@ -47,7 +53,7 @@ def parse_unicode_mapping(mapping_path):
         gid_to_unicode = {}

         # Pattern for bfchar entries
-        bfchar_pattern = r'<([0-9A-Fa-f]+)>\s*<([0-9A-Fa-f]+)>'
+        bfchar_pattern = r"<([0-9A-Fa-f]+)>\s*<([0-9A-Fa-f]+)>"
         for match in re.finditer(bfchar_pattern, data):
             gid = int(match.group(1), 16)  # For non-CID, char code == GID
             unicode_val = int(match.group(2), 16)
@@ -55,7 +61,7 @@ def parse_unicode_mapping(mapping_path):
             gid_to_unicode[gid] = unicode_val

         # Pattern for bfrange entries
-        bfrange_pattern = r'<([0-9A-Fa-f]+)>\s*<([0-9A-Fa-f]+)>\s*<([0-9A-Fa-f]+)>'
+        bfrange_pattern = r"<([0-9A-Fa-f]+)>\s*<([0-9A-Fa-f]+)>\s*<([0-9A-Fa-f]+)>"
         for match in re.finditer(bfrange_pattern, data):
             start_gid = int(match.group(1), 16)
             end_gid = int(match.group(2), 16)
@@ -72,6 +78,7 @@ def parse_unicode_mapping(mapping_path):
         print(f"Warning: Failed to parse Unicode mapping: {e}", file=sys.stderr)
         return {}


 def wrap_cff_as_otf(input_path, output_path, tounicode_path=None):
     """
     Wrap raw CFF data (from PDF font stream) as OpenType-CFF.
@@ -86,7 +93,7 @@ def wrap_cff_as_otf(input_path, output_path, tounicode_path=None):
     """
     try:
         # Read raw CFF data
-        with open(input_path, 'rb') as f:
+        with open(input_path, "rb") as f:
             cff_data = f.read()

         # Parse raw CFF data
@@ -106,29 +113,35 @@ def wrap_cff_as_otf(input_path, output_path, tounicode_path=None):
             gid_to_unicode = parse_unicode_mapping(tounicode_path)

         # Create a new OTF font
-        otf = TTFont(sfntVersion='OTTO')  # 'OTTO' = CFF-flavored OpenType
+        otf = TTFont(sfntVersion="OTTO")  # 'OTTO' = CFF-flavored OpenType

         # Get glyph names
-        if hasattr(cff_font, 'charset') and cff_font.charset is not None:
-            glyph_order = ['.notdef'] + [name for name in cff_font.charset if name != '.notdef']
+        if hasattr(cff_font, "charset") and cff_font.charset is not None:
+            glyph_order = [".notdef"] + [
+                name for name in cff_font.charset if name != ".notdef"
+            ]
         else:
             # Fallback to CharStrings keys
             charstrings = cff_font.CharStrings
-            glyph_order = ['.notdef'] + [name for name in charstrings.keys() if name != '.notdef']
+            glyph_order = [".notdef"] + [
+                name for name in charstrings.keys() if name != ".notdef"
+            ]

         otf.setGlyphOrder(glyph_order)

         # === Add CFF table (the actual font outlines) ===
-        cff_table = newTable('CFF ')
+        cff_table = newTable("CFF ")
         cff_table.cff = cff_fontset
-        otf['CFF '] = cff_table
+        otf["CFF "] = cff_table

         # === Calculate metrics from CFF ===
         charstrings = cff_font.CharStrings

         # Get defaults from CFF Private dict
-        private_dict = getattr(cff_font, 'Private', None)
-        default_width = getattr(private_dict, 'defaultWidthX', 500) if private_dict else 500
+        private_dict = getattr(cff_font, "Private", None)
+        default_width = (
+            getattr(private_dict, "defaultWidthX", 500) if private_dict else 500
+        )

         # Calculate bounding box, widths, and LSBs
         x_min = 0
@@ -152,7 +165,7 @@ def wrap_cff_as_otf(input_path, output_path, tounicode_path=None):
             cs = charstrings[glyph_name]

             # Get width from charstring
-            if hasattr(cs, 'width'):
+            if hasattr(cs, "width"):
                 width = int(cs.width)

             # Calculate bounds for LSB and bbox
@@ -181,7 +194,7 @@ def wrap_cff_as_otf(input_path, output_path, tounicode_path=None):
                     except:
                         pass  # Some glyphs may not have outlines

-            except Exception as e:
+            except Exception:
                 pass  # Use defaults

             widths[glyph_name] = width
@@ -196,7 +209,7 @@ def wrap_cff_as_otf(input_path, output_path, tounicode_path=None):
         units_per_em = 1000  # Standard for Type1/CFF

         # === Create head table ===
-        head = newTable('head')
+        head = newTable("head")
         head.tableVersion = 1.0
         head.fontRevision = 1.0
         head.checkSumAdjustment = 0
@@ -214,10 +227,10 @@ def wrap_cff_as_otf(input_path, output_path, tounicode_path=None):
         head.indexToLocFormat = 0
         head.glyphDataFormat = 0
         head.lowestRecPPEM = 8
-        otf['head'] = head
+        otf["head"] = head

         # === Create hhea table with correct metrics ===
-        hhea = newTable('hhea')
+        hhea = newTable("hhea")
         hhea.tableVersion = 0x00010000
         hhea.ascent = max(y_max, 800)
         hhea.descent = min(y_min, -200)
@@ -235,27 +248,30 @@ def wrap_cff_as_otf(input_path, output_path, tounicode_path=None):
         hhea.reserved3 = 0
         hhea.metricDataFormat = 0
         hhea.numberOfHMetrics = len(glyph_order)
-        otf['hhea'] = hhea
+        otf["hhea"] = hhea

         # === Create hmtx table with correct LSBs ===
-        hmtx = newTable('hmtx')
+        hmtx = newTable("hmtx")
         hmtx.metrics = {}
         for glyph_name in glyph_order:
-            hmtx.metrics[glyph_name] = (widths.get(glyph_name, default_width), lsbs.get(glyph_name, 0))
-        otf['hmtx'] = hmtx
+            hmtx.metrics[glyph_name] = (
+                widths.get(glyph_name, default_width),
+                lsbs.get(glyph_name, 0),
+            )
+        otf["hmtx"] = hmtx

         # === Create maxp table (simpler for CFF) ===
-        maxp = newTable('maxp')
+        maxp = newTable("maxp")
         maxp.tableVersion = 0x00005000  # CFF version (0.5)
         maxp.numGlyphs = len(glyph_order)
-        otf['maxp'] = maxp
+        otf["maxp"] = maxp

         # === Build Unicode cmap from GID→Unicode mapping ===
         unicode_to_glyph = {}

         if gid_to_unicode:
             # Debug: Show first few glyph names to understand naming convention
-            sample_glyphs = glyph_order[:min(10, len(glyph_order))]
+            sample_glyphs = glyph_order[: min(10, len(glyph_order))]
             print(f"Sample glyph names: {sample_glyphs}", file=sys.stderr)

             # Debug: Show which GIDs we have mappings for
@@ -264,7 +280,9 @@ def wrap_cff_as_otf(input_path, output_path, tounicode_path=None):

             # For CID fonts: glyph names are "cid00123" (5-digit zero-padded)
             # For non-CID fonts: glyph names vary but GID == array index
-            is_cid_font = any(gn.startswith('cid') for gn in glyph_order[1:6])  # Check first few non-.notdef glyphs
+            is_cid_font = any(
+                gn.startswith("cid") for gn in glyph_order[1:6]
+            )  # Check first few non-.notdef glyphs

             for gid, unicode_val in gid_to_unicode.items():
                 if unicode_val > 0:
@@ -285,18 +303,21 @@ def wrap_cff_as_otf(input_path, output_path, tounicode_path=None):
                     glyph_name = glyph_order[gid]
                     unicode_to_glyph[unicode_val] = glyph_name

-            print(f"Mapped {len(unicode_to_glyph)} Unicode codepoints (isCID={is_cid_font if gid_to_unicode else 'unknown'})", file=sys.stderr)
+            print(
+                f"Mapped {len(unicode_to_glyph)} Unicode codepoints (isCID={is_cid_font if gid_to_unicode else 'unknown'})",
+                file=sys.stderr,
+            )

         # Also try to map from glyph names (uni0041 → U+0041)
         for glyph_name in glyph_order:
-            if glyph_name.startswith('uni') and len(glyph_name) == 7:
+            if glyph_name.startswith("uni") and len(glyph_name) == 7:
                 try:
                     unicode_val = int(glyph_name[3:], 16)
                     if unicode_val not in unicode_to_glyph:
                         unicode_to_glyph[unicode_val] = glyph_name
                 except:
                     pass
-            elif glyph_name.startswith('u') and len(glyph_name) >= 5:
+            elif glyph_name.startswith("u") and len(glyph_name) >= 5:
                 try:
                     unicode_val = int(glyph_name[1:], 16)
                     if unicode_val not in unicode_to_glyph:
@@ -305,14 +326,14 @@ def wrap_cff_as_otf(input_path, output_path, tounicode_path=None):
                     pass

         # === Create cmap table ===
-        cmap = newTable('cmap')
+        cmap = newTable("cmap")
         cmap.tableVersion = 0
         cmap_tables = []

         # Windows Unicode BMP (format 4) - required
         cmap4_win = cmap_format_4(4)
         cmap4_win.platformID = 3  # Windows
-        cmap4_win.platEncID = 1 # Unicode BMP
+        cmap4_win.platEncID = 1  # Unicode BMP
         cmap4_win.language = 0
         cmap4_win.cmap = {cp: gn for cp, gn in unicode_to_glyph.items() if cp <= 0xFFFF}
         cmap_tables.append(cmap4_win)
@@ -329,23 +350,27 @@ def wrap_cff_as_otf(input_path, output_path, tounicode_path=None):
         # Mac Unicode (format 4) - for compatibility
         cmap4_mac = cmap_format_4(4)
         cmap4_mac.platformID = 1  # Mac
-        cmap4_mac.platEncID = 0 # Roman
+        cmap4_mac.platEncID = 0  # Roman
         cmap4_mac.language = 0
         cmap4_mac.cmap = {cp: gn for cp, gn in unicode_to_glyph.items() if cp <= 0xFFFF}
         cmap_tables.append(cmap4_mac)

-        cmap.tables = [t for t in cmap_tables if t.cmap] or [cmap4_win]  # Ensure at least one
-        otf['cmap'] = cmap
+        cmap.tables = [t for t in cmap_tables if t.cmap] or [
+            cmap4_win
+        ]  # Ensure at least one
+        otf["cmap"] = cmap

-        print(f"Built cmap with {len(unicode_to_glyph)} Unicode mappings", file=sys.stderr)
+        print(
+            f"Built cmap with {len(unicode_to_glyph)} Unicode mappings", file=sys.stderr
+        )

         # === Create OS/2 table with correct metrics ===
-        os2 = newTable('OS/2')
+        os2 = newTable("OS/2")
         os2.version = 4
         os2.xAvgCharWidth = int(sum(widths.values()) / len(widths)) if widths else 500
         os2.usWeightClass = 400  # Normal
-        os2.usWidthClass = 5 # Medium
-        os2.fsType = 0 # Installable embedding
+        os2.usWidthClass = 5  # Medium
+        os2.fsType = 0  # Installable embedding
         os2.ySubscriptXSize = 650
         os2.ySubscriptYSize = 600
         os2.ySubscriptXOffset = 0
@@ -375,7 +400,7 @@ def wrap_cff_as_otf(input_path, output_path, tounicode_path=None):
         os2.ulUnicodeRange2 = 0
         os2.ulUnicodeRange3 = 0
         os2.ulUnicodeRange4 = 0
-        os2.achVendID = 'SPDF'
+        os2.achVendID = "SPDF"
         os2.fsSelection = 0x0040  # REGULAR bit

         # Set character index range from actual cmap
@@ -385,7 +410,7 @@ def wrap_cff_as_otf(input_path, output_path, tounicode_path=None):
             os2.usLastCharIndex = codepoints[-1]
         else:
             os2.usFirstCharIndex = 0x20  # space
-            os2.usLastCharIndex = 0x7E # tilde
+            os2.usLastCharIndex = 0x7E  # tilde

         # Typo metrics match hhea
         os2.sTypoAscender = hhea.ascent
@@ -403,10 +428,10 @@ def wrap_cff_as_otf(input_path, output_path, tounicode_path=None):
         os2.usDefaultChar = 0
         os2.usBreakChar = 32
         os2.usMaxContext = 0
-        otf['OS/2'] = os2
+        otf["OS/2"] = os2

         # === Create name table with Windows and Mac records ===
-        name = newTable('name')
+        name = newTable("name")
         name.names = []

         # Get font name from CFF if available
@@ -418,7 +443,7 @@ def wrap_cff_as_otf(input_path, output_path, tounicode_path=None):
             3: f"Stirling-PDF: {font_name}",  # Unique ID
             4: font_name,  # Full Name
             5: "Version 1.0",  # Version
-            6: font_name.replace(' ', '-'),  # PostScript Name
+            6: font_name.replace(" ", "-"),  # PostScript Name
         }

         # Add both Windows and Mac name records
@@ -441,10 +466,10 @@ def wrap_cff_as_otf(input_path, output_path, tounicode_path=None):
             rec_mac.string = value
             name.names.append(rec_mac)

-        otf['name'] = name
+        otf["name"] = name

         # === Create post table (format 3.0 for smaller web fonts) ===
-        post = newTable('post')
+        post = newTable("post")
         post.formatType = 3.0  # No glyph names (smaller, web-optimized)
         post.italicAngle = 0
         post.underlinePosition = -100
@@ -454,7 +479,7 @@ def wrap_cff_as_otf(input_path, output_path, tounicode_path=None):
         post.maxMemType42 = 0
         post.minMemType1 = 0
         post.maxMemType1 = 0
-        otf['post'] = post
+        otf["post"] = post

         # Save the OTF font
         otf.save(output_path)
@@ -465,12 +490,17 @@ def wrap_cff_as_otf(input_path, output_path, tounicode_path=None):
     except Exception as e:
         print(f"ERROR: Conversion failed: {str(e)}", file=sys.stderr)
         import traceback
+
         traceback.print_exc(file=sys.stderr)
         return False


 def main():
     if len(sys.argv) < 3:
-        print("Usage: convert_cff_to_ttf.py <input.cff> <output.otf> [tounicode.cmap]", file=sys.stderr)
+        print(
+            "Usage: convert_cff_to_ttf.py <input.cff> <output.otf> [tounicode.cmap]",
+            file=sys.stderr,
+        )
         sys.exit(1)

     input_path = Path(sys.argv[1])
@@ -485,8 +515,13 @@ def main():
         print(f"Warning: ToUnicode file not found: {tounicode_path}", file=sys.stderr)
         tounicode_path = None

-    success = wrap_cff_as_otf(str(input_path), str(output_path), str(tounicode_path) if tounicode_path else None)
+    success = wrap_cff_as_otf(
+        str(input_path),
+        str(output_path),
+        str(tounicode_path) if tounicode_path else None,
+    )
     sys.exit(0 if success else 1)

-if __name__ == '__main__':
+
+if __name__ == "__main__":
     main()
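The bfchar pattern reformatted above is what recovers GID-to-Unicode pairs from a raw ToUnicode CMap. A runnable sketch using the exact regex from the diff; the CMap fragment here is invented for illustration:

import re

cmap_text = "beginbfchar <0003> <0041> <0004> <0042> endbfchar"  # hypothetical fragment
bfchar_pattern = r"<([0-9A-Fa-f]+)>\s*<([0-9A-Fa-f]+)>"

gid_to_unicode = {}
for match in re.finditer(bfchar_pattern, cmap_text):
    gid = int(match.group(1), 16)          # for non-CID fonts, char code == GID
    unicode_val = int(match.group(2), 16)  # target Unicode codepoint
    if unicode_val > 0:
        gid_to_unicode[gid] = unicode_val

print(gid_to_unicode)  # {3: 65, 4: 66} -> glyphs 3 and 4 map to 'A' and 'B'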
@@ -27,7 +27,7 @@ import os
 import re
 import sys
 from pathlib import Path
-from typing import Iterable, List, Optional, Set, Tuple
+from typing import List, Optional, Set, Tuple
 from urllib.parse import unquote, urlparse

 import requests
@@ -121,10 +121,10 @@ def build_filename(url: str, output_dir: Path) -> Path:


 def download_pdf(
-    url: str,
-    output_dir: Path,
-    timeout: int,
-    overwrite: bool,
+    url: str,
+    output_dir: Path,
+    timeout: int,
+    overwrite: bool,
 ) -> Tuple[str, Optional[Path], Optional[str]]:
     try:
         dest = build_filename(url, output_dir)
@@ -139,8 +139,12 @@ def download_pdf(
             # Peek into the first bytes to be safe
             peek = response.raw.read(5, decode_content=True)
             if not peek.startswith(b"%PDF"):
-                return url, None, f"Skipping non-PDF content-type ({content_type or 'unknown'})"
-            content = peek + response.content[len(peek):]
+                return (
+                    url,
+                    None,
+                    f"Skipping non-PDF content-type ({content_type or 'unknown'})",
+                )
+            content = peek + response.content[len(peek) :]
         else:
             content = response.content

@@ -157,7 +161,9 @@ def main() -> None:
     output_dir = Path(args.output_dir).resolve()
     output_dir.mkdir(parents=True, exist_ok=True)

-    print(f"Downloading {len(urls)} PDFs to {output_dir} using {args.workers} workers...")
+    print(
+        f"Downloading {len(urls)} PDFs to {output_dir} using {args.workers} workers..."
+    )

     successes = 0
     skipped = 0
@@ -184,7 +190,9 @@ def main() -> None:
         print(f"[OK] {url} -> {path}")

     print()
-    print(f"Completed. Success: {successes}, Skipped: {skipped}, Failures: {len(failures)}")
+    print(
+        f"Completed. Success: {successes}, Skipped: {skipped}, Failures: {len(failures)}"
+    )
     if failures:
         print("Failures:")
         for url, error in failures:
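The guard above skips responses whose body does not start with the PDF magic number, regardless of the advertised content type. That check in isolation, as a minimal sketch:

def looks_like_pdf(first_bytes: bytes) -> bool:
    # Every well-formed PDF file begins with the literal header "%PDF".
    return first_bytes.startswith(b"%PDF")

print(looks_like_pdf(b"%PDF-1.7\n%..."))   # True
print(looks_like_pdf(b"<html>not a pdf"))  # False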
@@ -28,13 +28,15 @@ import shlex
 import subprocess
 import sys
 from pathlib import Path
-from typing import Dict, Iterable, List, Optional, Sequence, Tuple
+from typing import Dict, List, Sequence, Tuple

 REPO_ROOT = Path(__file__).resolve().parents[1]


 def parse_args() -> argparse.Namespace:
-    parser = argparse.ArgumentParser(description="Bulk collect Type3 font signatures from PDFs.")
+    parser = argparse.ArgumentParser(
+        description="Bulk collect Type3 font signatures from PDFs."
+    )
     parser.add_argument(
         "--input",
         nargs="+",
@@ -145,7 +147,7 @@ def run_signature_tool(
     if pretty:
         args += " --pretty"
     # Use shell invocation so the quoted --args string is parsed correctly by Gradle.
-    cmd = f"{gradle_cmd} -q :proprietary:type3SignatureTool --args=\"{args}\""
+    cmd = f'{gradle_cmd} -q :proprietary:type3SignatureTool --args="{args}"'
     completed = subprocess.run(
         cmd,
         shell=True,
@@ -207,11 +209,15 @@ def main() -> None:
             try:
                 payload = load_signature_file(signature_path)
             except Exception as exc:
-                print(f"[WARN] Failed to parse cached signature {signature_path}: {exc}")
+                print(
+                    f"[WARN] Failed to parse cached signature {signature_path}: {exc}"
+                )
                 payload = None
         else:
             try:
-                run_signature_tool(args.gradle_cmd, pdf, signature_path, args.pretty, REPO_ROOT)
+                run_signature_tool(
+                    args.gradle_cmd, pdf, signature_path, args.pretty, REPO_ROOT
+                )
             except Exception as exc:
                 print(f"[ERROR] Harvest failed for {pdf}: {exc}", file=sys.stderr)
                 continue
@@ -1,5 +1,6 @@
 #!/usr/bin/env python3
 """Build a Type3 font catalogue from sample PDFs."""

 import argparse
 import json
 import subprocess
@@ -18,7 +18,9 @@ from typing import Dict, List


 def parse_args() -> argparse.Namespace:
-    parser = argparse.ArgumentParser(description="Summarize Type3 signature JSON dumps.")
+    parser = argparse.ArgumentParser(
+        description="Summarize Type3 signature JSON dumps."
+    )
     parser.add_argument(
         "--input",
         default="docs/type3/signatures",
@@ -53,7 +55,9 @@ def load_signatures(directory: Path) -> Dict[str, List[dict]]:
     return inventory


-def write_markdown(inventory: Dict[str, List[dict]], output: Path, input_dir: Path) -> None:
+def write_markdown(
+    inventory: Dict[str, List[dict]], output: Path, input_dir: Path
+) -> None:
     lines: List[str] = []
     lines.append("# Type3 Signature Inventory")
     lines.append("")
@@ -72,7 +76,9 @@ def write_markdown(inventory: Dict[str, List[dict]], output: Path, input_dir: Pa
         for entry in entries:
             signature = entry.get("signature") or "—"
             sample = Path(entry["source"]).name
-            glyph_count = entry.get("glyphCount") if entry.get("glyphCount") is not None else "—"
+            glyph_count = (
+                entry.get("glyphCount") if entry.get("glyphCount") is not None else "—"
+            )
             coverage = entry.get("glyphCoverage") or []
             preview = ", ".join(str(code) for code in coverage[:10])
             lines.append(f"| `{signature}` | `{sample}` | {glyph_count} | {preview} |")
@@ -7,10 +7,8 @@ TOML format only.
 """

 import json
 import os
 import sys
 from pathlib import Path
-from typing import Dict, List, Set, Tuple, Any, Optional
+from typing import Dict, List, Any
 import argparse
 import re
 from datetime import datetime
@@ -27,7 +25,7 @@ class AITranslationHelper:
     def _load_translation_file(self, file_path: Path) -> Dict:
         """Load TOML translation file."""
         try:
-            with open(file_path, 'rb') as f:
+            with open(file_path, "rb") as f:
                 return tomllib.load(f)
         except (FileNotFoundError, Exception) as e:
             print(f"Error loading {file_path}: {e}")
@@ -35,27 +33,31 @@ class AITranslationHelper:

     def _save_translation_file(self, data: Dict, file_path: Path) -> None:
         """Save TOML translation file."""
-        with open(file_path, 'wb') as f:
+        with open(file_path, "wb") as f:
             tomli_w.dump(data, f)

-    def create_ai_batch_file(self, languages: List[str], output_file: Path,
-                             max_entries_per_language: int = 50) -> None:
+    def create_ai_batch_file(
+        self,
+        languages: List[str],
+        output_file: Path,
+        max_entries_per_language: int = 50,
+    ) -> None:
         """Create a batch file for AI translation with multiple languages."""
         golden_truth = self._load_translation_file(self.golden_truth_file)
         batch_data = {
-            'metadata': {
-                'created_at': datetime.now().isoformat(),
-                'source_language': 'en-GB',
-                'target_languages': languages,
-                'max_entries_per_language': max_entries_per_language,
-                'instructions': {
-                    'format': 'Translate each entry maintaining JSON structure and placeholder variables like {n}, {total}, {filename}',
-                    'context': 'This is for a PDF manipulation tool. Keep technical terms consistent.',
-                    'placeholders': 'Preserve all placeholders: {n}, {total}, {filename}, etc.',
-                    'style': 'Keep translations concise and user-friendly'
-                }
+            "metadata": {
+                "created_at": datetime.now().isoformat(),
+                "source_language": "en-GB",
+                "target_languages": languages,
+                "max_entries_per_language": max_entries_per_language,
+                "instructions": {
+                    "format": "Translate each entry maintaining JSON structure and placeholder variables like {n}, {total}, {filename}",
+                    "context": "This is for a PDF manipulation tool. Keep technical terms consistent.",
+                    "placeholders": "Preserve all placeholders: {n}, {total}, {filename}, etc.",
+                    "style": "Keep translations concise and user-friendly",
+                },
             },
-            'translations': {}
+            "translations": {},
         }

         for lang in languages:
@@ -72,41 +74,57 @@ class AITranslationHelper:
             untranslated = self._find_untranslated_entries(golden_truth, lang_data)

             # Limit entries if specified
-            if max_entries_per_language and len(untranslated) > max_entries_per_language:
+            if (
+                max_entries_per_language
+                and len(untranslated) > max_entries_per_language
+            ):
                 # Prioritize by key importance
-                untranslated = self._prioritize_translation_keys(untranslated, max_entries_per_language)
+                untranslated = self._prioritize_translation_keys(
+                    untranslated, max_entries_per_language
+                )

-            batch_data['translations'][lang] = {}
+            batch_data["translations"][lang] = {}
             for key, value in untranslated.items():
-                batch_data['translations'][lang][key] = {
-                    'original': value,
-                    'translated': '',  # AI fills this
-                    'context': self._get_key_context(key)
+                batch_data["translations"][lang][key] = {
+                    "original": value,
+                    "translated": "",  # AI fills this
+                    "context": self._get_key_context(key),
                 }

         # Always save batch files as JSON for compatibility
-        with open(output_file, 'w', encoding='utf-8') as f:
+        with open(output_file, "w", encoding="utf-8") as f:
             json.dump(batch_data, f, indent=2, ensure_ascii=False)
-        total_entries = sum(len(lang_data) for lang_data in batch_data['translations'].values())
+        total_entries = sum(
+            len(lang_data) for lang_data in batch_data["translations"].values()
+        )
         print(f"Created AI batch file: {output_file}")
         print(f"Total entries to translate: {total_entries}")

-    def _find_untranslated_entries(self, golden_truth: Dict, lang_data: Dict) -> Dict[str, str]:
+    def _find_untranslated_entries(
+        self, golden_truth: Dict, lang_data: Dict
+    ) -> Dict[str, str]:
         """Find entries that need translation."""
         golden_flat = self._flatten_dict(golden_truth)
         lang_flat = self._flatten_dict(lang_data)

         untranslated = {}
         for key, value in golden_flat.items():
-            if (key not in lang_flat or
-                lang_flat[key] == value or
-                (isinstance(lang_flat[key], str) and lang_flat[key].startswith("[UNTRANSLATED]"))):
+            if (
+                key not in lang_flat
+                or lang_flat[key] == value
+                or (
+                    isinstance(lang_flat[key], str)
+                    and lang_flat[key].startswith("[UNTRANSLATED]")
+                )
+            ):
                 if not self._is_expected_identical(key, value):
                     untranslated[key] = value

         return untranslated

-    def _flatten_dict(self, d: Dict, parent_key: str = '', separator: str = '.') -> Dict[str, Any]:
+    def _flatten_dict(
+        self, d: Dict, parent_key: str = "", separator: str = "."
+    ) -> Dict[str, Any]:
         """Flatten nested dictionary."""
         items = []
         for k, v in d.items():
@@ -119,25 +137,27 @@ class AITranslationHelper:

     def _is_expected_identical(self, key: str, value: str) -> bool:
         """Check if key should be identical across languages."""
-        if str(value).strip() in ['ltr', 'rtl', 'True', 'False', 'true', 'false']:
+        if str(value).strip() in ["ltr", "rtl", "True", "False", "true", "false"]:
             return True
-        return 'language.direction' in key.lower()
+        return "language.direction" in key.lower()

-    def _prioritize_translation_keys(self, untranslated: Dict[str, str], max_count: int) -> Dict[str, str]:
+    def _prioritize_translation_keys(
+        self, untranslated: Dict[str, str], max_count: int
+    ) -> Dict[str, str]:
         """Prioritize which keys to translate first based on importance."""
         # Define priority order (higher score = higher priority)
         priority_patterns = [
-            ('title', 10),
-            ('header', 9),
-            ('submit', 8),
-            ('selectText', 7),
-            ('prompt', 6),
-            ('desc', 5),
-            ('error', 8),
-            ('warning', 7),
-            ('save', 8),
-            ('download', 8),
-            ('upload', 7),
+            ("title", 10),
+            ("header", 9),
+            ("submit", 8),
+            ("selectText", 7),
+            ("prompt", 6),
+            ("desc", 5),
+            ("error", 8),
+            ("warning", 7),
+            ("save", 8),
+            ("download", 8),
+            ("upload", 7),
         ]

         scored_keys = []
@@ -154,89 +174,99 @@ class AITranslationHelper:

     def _get_key_context(self, key: str) -> str:
         """Get contextual information for a translation key."""
-        parts = key.split('.')
+        parts = key.split(".")
         contexts = {
-            'addPageNumbers': 'Feature for adding page numbers to PDFs',
-            'compress': 'PDF compression functionality',
-            'merge': 'PDF merging functionality',
-            'split': 'PDF splitting functionality',
-            'rotate': 'PDF rotation functionality',
-            'convert': 'File conversion functionality',
-            'security': 'PDF security and permissions',
-            'metadata': 'PDF metadata editing',
-            'watermark': 'Adding watermarks to PDFs',
-            'overlay': 'PDF overlay functionality',
-            'extract': 'Extracting content from PDFs'
+            "addPageNumbers": "Feature for adding page numbers to PDFs",
+            "compress": "PDF compression functionality",
+            "merge": "PDF merging functionality",
+            "split": "PDF splitting functionality",
+            "rotate": "PDF rotation functionality",
+            "convert": "File conversion functionality",
+            "security": "PDF security and permissions",
+            "metadata": "PDF metadata editing",
+            "watermark": "Adding watermarks to PDFs",
+            "overlay": "PDF overlay functionality",
+            "extract": "Extracting content from PDFs",
         }

         if len(parts) > 0:
             main_section = parts[0]
-            context = contexts.get(main_section, f'Part of {main_section} functionality')
+            context = contexts.get(
+                main_section, f"Part of {main_section} functionality"
+            )
             if len(parts) > 1:
-                context += f', specifically for {parts[-1]}'
+                context += f", specifically for {parts[-1]}"
             return context

-        return 'General application text'
+        return "General application text"

     def validate_ai_translations(self, batch_file: Path) -> Dict[str, List[str]]:
         """Validate AI translations for common issues."""
         # Batch files are always JSON
-        with open(batch_file, 'r', encoding='utf-8') as f:
+        with open(batch_file, "r", encoding="utf-8") as f:
             batch_data = json.load(f)
-        issues = {'errors': [], 'warnings': []}
+        issues = {"errors": [], "warnings": []}

-        for lang, translations in batch_data.get('translations', {}).items():
+        for lang, translations in batch_data.get("translations", {}).items():
             for key, translation_data in translations.items():
-                original = translation_data.get('original', '')
-                translated = translation_data.get('translated', '')
+                original = translation_data.get("original", "")
+                translated = translation_data.get("translated", "")

                 if not translated:
-                    issues['errors'].append(f"{lang}.{key}: Missing translation")
+                    issues["errors"].append(f"{lang}.{key}: Missing translation")
                     continue

                 # Check for placeholder preservation
-                original_placeholders = re.findall(r'\{[^}]+\}', original)
-                translated_placeholders = re.findall(r'\{[^}]+\}', translated)
+                original_placeholders = re.findall(r"\{[^}]+\}", original)
+                translated_placeholders = re.findall(r"\{[^}]+\}", translated)

                 if set(original_placeholders) != set(translated_placeholders):
-                    issues['warnings'].append(
+                    issues["warnings"].append(
                         f"{lang}.{key}: Placeholder mismatch - Original: {original_placeholders}, "
                         f"Translated: {translated_placeholders}"
                     )

                 # Check if translation is identical to original (might be untranslated)
-                if translated == original and not self._is_expected_identical(key, original):
-                    issues['warnings'].append(f"{lang}.{key}: Translation identical to original")
+                if translated == original and not self._is_expected_identical(
+                    key, original
+                ):
+                    issues["warnings"].append(
+                        f"{lang}.{key}: Translation identical to original"
+                    )

                 # Check for common AI translation artifacts
-                artifacts = ['[TRANSLATE]', '[TODO]', 'UNTRANSLATED', '{{', '}}']
+                artifacts = ["[TRANSLATE]", "[TODO]", "UNTRANSLATED", "{{", "}}"]
                 for artifact in artifacts:
                     if artifact in translated:
-                        issues['errors'].append(f"{lang}.{key}: Contains translation artifact: {artifact}")
+                        issues["errors"].append(
+                            f"{lang}.{key}: Contains translation artifact: {artifact}"
+                        )

         return issues

-    def apply_ai_batch_translations(self, batch_file: Path, validate: bool = True) -> Dict[str, Any]:
+    def apply_ai_batch_translations(
+        self, batch_file: Path, validate: bool = True
+    ) -> Dict[str, Any]:
         """Apply translations from AI batch file to individual language files."""
         # Batch files are always JSON
-        with open(batch_file, 'r', encoding='utf-8') as f:
+        with open(batch_file, "r", encoding="utf-8") as f:
             batch_data = json.load(f)
-        results = {'applied': {}, 'errors': [], 'warnings': []}
+        results = {"applied": {}, "errors": [], "warnings": []}

         if validate:
             validation_issues = self.validate_ai_translations(batch_file)
-            if validation_issues['errors']:
+            if validation_issues["errors"]:
                 print("Validation errors found. Fix these before applying:")
-                for error in validation_issues['errors']:
+                for error in validation_issues["errors"]:
                     print(f"  ERROR: {error}")
                 return results

-            if validation_issues['warnings']:
+            if validation_issues["warnings"]:
                 print("Validation warnings (review recommended):")
-                for warning in validation_issues['warnings'][:10]:
+                for warning in validation_issues["warnings"][:10]:
                     print(f"  WARNING: {warning}")

-        for lang, translations in batch_data.get('translations', {}).items():
+        for lang, translations in batch_data.get("translations", {}).items():
             lang_dir = self.locales_dir / lang
             toml_file = lang_dir / "translation.toml"

@@ -249,42 +279,48 @@ class AITranslationHelper:

         applied_count = 0
         for key, translation_data in translations.items():
-            translated = translation_data.get('translated', '').strip()
-            if translated and translated != translation_data.get('original', ''):
+            translated = translation_data.get("translated", "").strip()
+            if translated and translated != translation_data.get("original", ""):
                 self._set_nested_value(lang_data, key, translated)
                 applied_count += 1

         if applied_count > 0:
             self._save_translation_file(lang_data, toml_file)
-            results['applied'][lang] = applied_count
+            results["applied"][lang] = applied_count
             print(f"Applied {applied_count} translations to {lang}")

         return results

     def _set_nested_value(self, data: Dict, key_path: str, value: Any) -> None:
         """Set value in nested dict using dot notation."""
-        keys = key_path.split('.')
+        keys = key_path.split(".")
         current = data
         for key in keys[:-1]:
             if key not in current:
                 current[key] = {}
             elif not isinstance(current[key], dict):
                 # If the current value is not a dict, we can't nest into it
-                print(f"Warning: Converting non-dict value at '{key}' to dict to allow nesting")
+                print(
+                    f"Warning: Converting non-dict value at '{key}' to dict to allow nesting"
+                )
                 current[key] = {}
             current = current[key]
         current[keys[-1]] = value

-    def export_for_external_translation(self, languages: List[str], output_format: str = 'csv') -> None:
+    def export_for_external_translation(
+        self, languages: List[str], output_format: str = "csv"
+    ) -> None:
         """Export translations for external translation services."""
         golden_truth = self._load_translation_file(self.golden_truth_file)
         golden_flat = self._flatten_dict(golden_truth)

-        if output_format == 'csv':
-            output_file = Path(f'translations_export_{datetime.now().strftime("%Y%m%d")}.csv')
+        if output_format == "csv":
+            output_file = Path(
+                f"translations_export_{datetime.now().strftime('%Y%m%d')}.csv"
+            )

-            with open(output_file, 'w', newline='', encoding='utf-8') as csvfile:
-                fieldnames = ['key', 'context', 'en_GB'] + languages
+            with open(output_file, "w", newline="", encoding="utf-8") as csvfile:
+                fieldnames = ["key", "context", "en_GB"] + languages
                 writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
                 writer.writeheader()

@@ -293,9 +329,9 @@ class AITranslationHelper:
                     continue

                 row = {
-                    'key': key,
-                    'context': self._get_key_context(key),
-                    'en_GB': en_value
+                    "key": key,
+                    "context": self._get_key_context(key),
+                    "en_GB": en_value,
                 }

                 for lang in languages:
@@ -305,28 +341,30 @@ class AITranslationHelper:
                     if toml_file.exists():
                         lang_data = self._load_translation_file(toml_file)
                         lang_flat = self._flatten_dict(lang_data)
-                        value = lang_flat.get(key, '')
-                        if value.startswith('[UNTRANSLATED]'):
-                            value = ''
+                        value = lang_flat.get(key, "")
+                        if value.startswith("[UNTRANSLATED]"):
+                            value = ""
                         row[lang] = value
                     else:
-                        row[lang] = ''
+                        row[lang] = ""

                 writer.writerow(row)

             print(f"Exported to {output_file}")

-        elif output_format == 'json':
-            output_file = Path(f'translations_export_{datetime.now().strftime("%Y%m%d")}.json')
-            export_data = {'languages': languages, 'translations': {}}
+        elif output_format == "json":
+            output_file = Path(
+                f"translations_export_{datetime.now().strftime('%Y%m%d')}.json"
+            )
+            export_data = {"languages": languages, "translations": {}}

             for key, en_value in golden_flat.items():
                 if self._is_expected_identical(key, en_value):
                     continue

-                export_data['translations'][key] = {
-                    'en_GB': en_value,
-                    'context': self._get_key_context(key)
+                export_data["translations"][key] = {
+                    "en_GB": en_value,
+                    "context": self._get_key_context(key),
                 }

                 for lang in languages:
@@ -336,51 +374,64 @@ class AITranslationHelper:
                     if toml_file.exists():
                         lang_data = self._load_translation_file(toml_file)
                         lang_flat = self._flatten_dict(lang_data)
-                        value = lang_flat.get(key, '')
-                        if value.startswith('[UNTRANSLATED]'):
-                            value = ''
-                        export_data['translations'][key][lang] = value
+                        value = lang_flat.get(key, "")
+                        if value.startswith("[UNTRANSLATED]"):
+                            value = ""
+                        export_data["translations"][key][lang] = value

             # Export files are always JSON
-            with open(output_file, 'w', encoding='utf-8') as f:
+            with open(output_file, "w", encoding="utf-8") as f:
                 json.dump(export_data, f, indent=2, ensure_ascii=False)
             print(f"Exported to {output_file}")


 def main():
     parser = argparse.ArgumentParser(
-        description='AI Translation Helper',
-        epilog='Works with TOML translation files.'
+        description="AI Translation Helper", epilog="Works with TOML translation files."
     )
-    parser.add_argument('--locales-dir', default='frontend/public/locales',
-                        help='Path to locales directory')
+    parser.add_argument(
+        "--locales-dir",
+        default="frontend/public/locales",
+        help="Path to locales directory",
+    )

-    subparsers = parser.add_subparsers(dest='command', help='Available commands')
+    subparsers = parser.add_subparsers(dest="command", help="Available commands")

     # Create batch command
-    batch_parser = subparsers.add_parser('create-batch', help='Create AI translation batch file')
-    batch_parser.add_argument('--languages', nargs='+', required=True,
-                              help='Language codes to include')
-    batch_parser.add_argument('--output', required=True, help='Output batch file')
-    batch_parser.add_argument('--max-entries', type=int, default=100,
-                              help='Max entries per language')
+    batch_parser = subparsers.add_parser(
+        "create-batch", help="Create AI translation batch file"
+    )
+    batch_parser.add_argument(
+        "--languages", nargs="+", required=True, help="Language codes to include"
+    )
+    batch_parser.add_argument("--output", required=True, help="Output batch file")
+    batch_parser.add_argument(
+        "--max-entries", type=int, default=100, help="Max entries per language"
+    )

     # Validate command
-    validate_parser = subparsers.add_parser('validate', help='Validate AI translations')
-    validate_parser.add_argument('batch_file', help='Batch file to validate')
+    validate_parser = subparsers.add_parser("validate", help="Validate AI translations")
+    validate_parser.add_argument("batch_file", help="Batch file to validate")

     # Apply command
-    apply_parser = subparsers.add_parser('apply-batch', help='Apply AI batch translations')
-    apply_parser.add_argument('batch_file', help='Batch file with translations')
-    apply_parser.add_argument('--skip-validation', action='store_true',
-                              help='Skip validation before applying')
+    apply_parser = subparsers.add_parser(
+        "apply-batch", help="Apply AI batch translations"
+    )
+    apply_parser.add_argument("batch_file", help="Batch file with translations")
+    apply_parser.add_argument(
+        "--skip-validation", action="store_true", help="Skip validation before applying"
+    )

     # Export command
-    export_parser = subparsers.add_parser('export', help='Export for external translation')
-    export_parser.add_argument('--languages', nargs='+', required=True,
-                               help='Language codes to export')
-    export_parser.add_argument('--format', choices=['csv', 'json'], default='csv',
-                               help='Export format')
+    export_parser = subparsers.add_parser(
+        "export", help="Export for external translation"
+    )
+    export_parser.add_argument(
+        "--languages", nargs="+", required=True, help="Language codes to export"
+    )
+    export_parser.add_argument(
+        "--format", choices=["csv", "json"], default="csv", help="Export format"
+    )

     args = parser.parse_args()

@@ -390,40 +441,39 @@ def main():

     helper = AITranslationHelper(args.locales_dir)

-    if args.command == 'create-batch':
+    if args.command == "create-batch":
         output_file = Path(args.output)
         helper.create_ai_batch_file(args.languages, output_file, args.max_entries)

-    elif args.command == 'validate':
+    elif args.command == "validate":
         batch_file = Path(args.batch_file)
         issues = helper.validate_ai_translations(batch_file)

-        if issues['errors']:
+        if issues["errors"]:
             print("ERRORS:")
-            for error in issues['errors']:
+            for error in issues["errors"]:
                 print(f"  - {error}")

-        if issues['warnings']:
+        if issues["warnings"]:
             print("WARNINGS:")
-            for warning in issues['warnings']:
+            for warning in issues["warnings"]:
                 print(f"  - {warning}")

-        if not issues['errors'] and not issues['warnings']:
+        if not issues["errors"] and not issues["warnings"]:
             print("No validation issues found!")

-    elif args.command == 'apply-batch':
+    elif args.command == "apply-batch":
         batch_file = Path(args.batch_file)
         results = helper.apply_ai_batch_translations(
-            batch_file,
-            validate=not args.skip_validation
+            batch_file, validate=not args.skip_validation
         )
-        total_applied = sum(results['applied'].values())
+        total_applied = sum(results["applied"].values())
         print(f"Total translations applied: {total_applied}")

-    elif args.command == 'export':
+    elif args.command == "export":
         helper.export_for_external_translation(args.languages, args.format)


 if __name__ == "__main__":
-    main()
+    main()
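The placeholder check reformatted above is the heart of the validator: a translation is flagged when it drops or alters any {n}-style variable. A standalone sketch using the same regex as the diff; the sample strings are hypothetical:

import re

PLACEHOLDER = r"\{[^}]+\}"

original = "Page {n} of {total}"
translated = "Seite {n} von {total}"  # hypothetical German translation

if set(re.findall(PLACEHOLDER, original)) != set(re.findall(PLACEHOLDER, translated)):
    print("Placeholder mismatch")
else:
    print("Placeholders preserved")  # runs for this pair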
@@ -19,9 +19,9 @@ import tomllib
 def run_command(cmd, description=""):
     """Run a shell command and return success status."""
     if description:
-        print(f"\n{'='*60}")
+        print(f"\n{'=' * 60}")
         print(f"Step: {description}")
-        print(f"{'='*60}")
+        print(f"{'=' * 60}")

     result = subprocess.run(cmd, shell=True, capture_output=True, text=True)

@@ -40,29 +40,35 @@ def find_translation_file(lang_dir):
         return toml_file
     return None


 def load_translation_file(file_path):
     """Load TOML translation file."""
-    with open(file_path, 'rb') as f:
+    with open(file_path, "rb") as f:
         return tomllib.load(f)


 def extract_untranslated(language_code, batch_size=500, include_existing=False):
     """Extract untranslated entries and split into batches."""
-    mode = "all untranslated (including existing)" if include_existing else "new (missing)"
+    mode = (
+        "all untranslated (including existing)" if include_existing else "new (missing)"
+    )
     print(f"\n🔍 Extracting {mode} entries for {language_code}...")

     # Load files
-    golden_path = find_translation_file(Path('frontend/public/locales/en-GB'))
-    lang_path = find_translation_file(Path(f'frontend/public/locales/{language_code}'))
+    golden_path = find_translation_file(Path("frontend/public/locales/en-GB"))
+    lang_path = find_translation_file(Path(f"frontend/public/locales/{language_code}"))

     if not golden_path:
-        print(f"Error: Golden truth file not found in frontend/public/locales/en-GB")
+        print("Error: Golden truth file not found in frontend/public/locales/en-GB")
         return None

     if not lang_path:
-        print(f"Error: Language file not found in frontend/public/locales/{language_code}")
+        print(
+            f"Error: Language file not found in frontend/public/locales/{language_code}"
+        )
         return None

-    def flatten_dict(d, parent_key='', separator='.'):
+    def flatten_dict(d, parent_key="", separator="."):
         items = []
         for k, v in d.items():
             new_key = f"{parent_key}{separator}{k}" if parent_key else k
@@ -76,7 +82,7 @@ def extract_untranslated(language_code, batch_size=500, include_existing=False):
     lang_data = load_translation_file(lang_path)

     if not golden or not lang_data:
-        print(f"Error: Failed to load translation files")
+        print("Error: Failed to load translation files")
         return None

     golden_flat = flatten_dict(golden)
@@ -87,9 +93,14 @@ def extract_untranslated(language_code, batch_size=500, include_existing=False):
     for key, value in golden_flat.items():
         if include_existing:
             # Include missing keys, keys with English values, and [UNTRANSLATED] keys
-            if (key not in lang_flat or
-                lang_flat.get(key) == value or
-                (isinstance(lang_flat.get(key), str) and lang_flat.get(key).startswith("[UNTRANSLATED]"))):
+            if (
+                key not in lang_flat
+                or lang_flat.get(key) == value
+                or (
+                    isinstance(lang_flat.get(key), str)
+                    and lang_flat.get(key).startswith("[UNTRANSLATED]")
+                )
+            ):
                 untranslated[key] = value
         else:
             # Only include missing keys (not in target file at all)
@@ -108,16 +119,16 @@ def extract_untranslated(language_code, batch_size=500, include_existing=False):
     num_batches = (total + batch_size - 1) // batch_size

     batch_files = []
-    lang_code_safe = language_code.replace('-', '_')
+    lang_code_safe = language_code.replace("-", "_")

     for i in range(num_batches):
         start = i * batch_size
         end = min((i + 1) * batch_size, total)
         batch = dict(entries[start:end])

-        filename = f'{lang_code_safe}_batch_{i+1}_of_{num_batches}.json'
-        with open(filename, 'w', encoding='utf-8') as f:
-            json.dump(batch, f, ensure_ascii=False, separators=(',', ':'))
+        filename = f"{lang_code_safe}_batch_{i + 1}_of_{num_batches}.json"
+        with open(filename, "w", encoding="utf-8") as f:
+            json.dump(batch, f, ensure_ascii=False, separators=(",", ":"))

         batch_files.append(filename)
         print(f"  Created {filename} with {len(batch)} entries")
@@ -131,7 +142,7 @@ def translate_batches(batch_files, language_code, api_key, timeout=600):
         return []

     print(f"\n🤖 Translating {len(batch_files)} batches using GPT-5...")
-    print(f"Timeout: {timeout}s ({timeout//60} minutes) per batch")
+    print(f"Timeout: {timeout}s ({timeout // 60} minutes) per batch")

     translated_files = []

@@ -142,7 +153,9 @@
         cmd = f'python3 scripts/translations/batch_translator.py "{batch_file}" --language {language_code} --api-key "{api_key}"'

         # Run with timeout
-        result = subprocess.run(cmd, shell=True, capture_output=True, text=True, timeout=timeout)
+        result = subprocess.run(
+            cmd, shell=True, capture_output=True, text=True, timeout=timeout
+        )

         if result.stdout:
             print(result.stdout)
@@ -153,7 +166,7 @@
             print(f"✗ Failed to translate {batch_file}")
             return None

-        translated_file = batch_file.replace('.json', '_translated.json')
+        translated_file = batch_file.replace(".json", "_translated.json")
         translated_files.append(translated_file)

         # Small delay between batches
@@ -177,14 +190,14 @@ def merge_translations(translated_files, language_code):
             print(f"Error: Translated file not found: {filename}")
             return None

-        with open(filename, 'r', encoding='utf-8') as f:
+        with open(filename, "r", encoding="utf-8") as f:
             merged.update(json.load(f))

-    lang_code_safe = language_code.replace('-', '_')
-    merged_file = f'{lang_code_safe}_merged.json'
+    lang_code_safe = language_code.replace("-", "_")
+    merged_file = f"{lang_code_safe}_merged.json"

-    with open(merged_file, 'w', encoding='utf-8') as f:
-        json.dump(merged, f, ensure_ascii=False, separators=(',', ':'))
+    with open(merged_file, "w", encoding="utf-8") as f:
+        json.dump(merged, f, ensure_ascii=False, separators=(",", ":"))

     print(f"✓ Merged {len(merged)} translations into {merged_file}")
     return merged_file
@@ -194,13 +207,13 @@ def apply_translations(merged_file, language_code):
     """Apply merged translations to the language file."""
     print(f"\n📝 Applying translations to {language_code}...")

-    cmd = f'python3 scripts/translations/translation_merger.py {language_code} apply-translations --translations-file {merged_file}'
+    cmd = f"python3 scripts/translations/translation_merger.py {language_code} apply-translations --translations-file {merged_file}"

     if not run_command(cmd):
-        print(f"✗ Failed to apply translations")
+        print("✗ Failed to apply translations")
         return False

-    print(f"✓ Translations applied successfully")
+    print("✓ Translations applied successfully")
     return True


@@ -208,27 +221,25 @@ def beautify_translations(language_code):
     """Beautify translation file to match en-GB structure."""
     print(f"\n✨ Beautifying {language_code} translation file...")

-    cmd = f'python3 scripts/translations/toml_beautifier.py --language {language_code}'
+    cmd = f"python3 scripts/translations/toml_beautifier.py --language {language_code}"

     if not run_command(cmd):
-        print(f"✗ Failed to beautify translations")
+        print("✗ Failed to beautify translations")
         return False

-    print(f"✓ Translation file beautified")
+    print("✓ Translation file beautified")
|
||||
return True
|
||||
|
||||
|
||||
def cleanup_temp_files(language_code):
|
||||
"""Remove temporary batch files."""
|
||||
print(f"\n🧹 Cleaning up temporary files...")
|
||||
print("\n🧹 Cleaning up temporary files...")
|
||||
|
||||
lang_code_safe = language_code.replace('-', '_')
|
||||
patterns = [
|
||||
f'{lang_code_safe}_batch_*.json',
|
||||
f'{lang_code_safe}_merged.json'
|
||||
]
|
||||
lang_code_safe = language_code.replace("-", "_")
|
||||
patterns = [f"{lang_code_safe}_batch_*.json", f"{lang_code_safe}_merged.json"]
|
||||
|
||||
import glob
|
||||
|
||||
removed = 0
|
||||
for pattern in patterns:
|
||||
for file in glob.glob(pattern):
|
||||
@@ -240,15 +251,15 @@ def cleanup_temp_files(language_code):
|
||||
|
||||
def verify_completion(language_code):
|
||||
"""Check final completion percentage."""
|
||||
print(f"\n📊 Verifying completion...")
|
||||
print("\n📊 Verifying completion...")
|
||||
|
||||
cmd = f'python3 scripts/translations/translation_analyzer.py --language {language_code} --summary'
|
||||
cmd = f"python3 scripts/translations/translation_analyzer.py --language {language_code} --summary"
|
||||
run_command(cmd)
|
||||
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(
|
||||
description='Automated translation pipeline for Stirling PDF',
|
||||
description="Automated translation pipeline for Stirling PDF",
|
||||
formatter_class=argparse.RawDescriptionHelpFormatter,
|
||||
epilog="""
|
||||
Note: This script works with TOML translation files.
|
||||
@@ -266,36 +277,57 @@ Examples:
|
||||
|
||||
# Skip cleanup (keep temporary files for inspection)
|
||||
python3 scripts/translations/auto_translate.py fr-FR --no-cleanup
|
||||
"""
|
||||
""",
|
||||
)
|
||||
|
||||
parser.add_argument('language', help='Language code (e.g., es-ES, de-DE, zh-CN)')
|
||||
parser.add_argument('--api-key', help='OpenAI API key (or set OPENAI_API_KEY env var)')
|
||||
parser.add_argument('--batch-size', type=int, default=500, help='Entries per batch (default: 500)')
|
||||
parser.add_argument('--no-cleanup', action='store_true', help='Keep temporary batch files')
|
||||
parser.add_argument('--skip-verification', action='store_true', help='Skip final completion check')
|
||||
parser.add_argument('--timeout', type=int, default=600, help='Timeout per batch in seconds (default: 600 = 10 minutes)')
|
||||
parser.add_argument('--include-existing', action='store_true', help='Also retranslate existing keys that match English (default: only translate missing keys)')
|
||||
parser.add_argument("language", help="Language code (e.g., es-ES, de-DE, zh-CN)")
|
||||
parser.add_argument(
|
||||
"--api-key", help="OpenAI API key (or set OPENAI_API_KEY env var)"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--batch-size", type=int, default=500, help="Entries per batch (default: 500)"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--no-cleanup", action="store_true", help="Keep temporary batch files"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--skip-verification", action="store_true", help="Skip final completion check"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--timeout",
|
||||
type=int,
|
||||
default=600,
|
||||
help="Timeout per batch in seconds (default: 600 = 10 minutes)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--include-existing",
|
||||
action="store_true",
|
||||
help="Also retranslate existing keys that match English (default: only translate missing keys)",
|
||||
)
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
# Verify API key
|
||||
api_key = args.api_key or os.environ.get('OPENAI_API_KEY')
|
||||
api_key = args.api_key or os.environ.get("OPENAI_API_KEY")
|
||||
if not api_key:
|
||||
print("Error: OpenAI API key required. Provide via --api-key or OPENAI_API_KEY environment variable")
|
||||
print(
|
||||
"Error: OpenAI API key required. Provide via --api-key or OPENAI_API_KEY environment variable"
|
||||
)
|
||||
sys.exit(1)
|
||||
|
||||
print("="*60)
|
||||
print(f"Automated Translation Pipeline")
|
||||
print("=" * 60)
|
||||
print("Automated Translation Pipeline")
|
||||
print(f"Language: {args.language}")
|
||||
print(f"Batch Size: {args.batch_size} entries")
|
||||
print("="*60)
|
||||
print("=" * 60)
|
||||
|
||||
start_time = time.time()
|
||||
|
||||
try:
|
||||
# Step 1: Extract and split
|
||||
batch_files = extract_untranslated(args.language, args.batch_size, args.include_existing)
|
||||
batch_files = extract_untranslated(
|
||||
args.language, args.batch_size, args.include_existing
|
||||
)
|
||||
if batch_files is None:
|
||||
sys.exit(1)
|
||||
|
||||
@@ -304,7 +336,9 @@ Examples:
|
||||
sys.exit(0)
|
||||
|
||||
# Step 2: Translate all batches
|
||||
translated_files = translate_batches(batch_files, args.language, api_key, args.timeout)
|
||||
translated_files = translate_batches(
|
||||
batch_files, args.language, api_key, args.timeout
|
||||
)
|
||||
if translated_files is None:
|
||||
sys.exit(1)
|
||||
|
||||
@@ -330,10 +364,10 @@ Examples:
|
||||
verify_completion(args.language)
|
||||
|
||||
elapsed = time.time() - start_time
|
||||
print("\n" + "="*60)
|
||||
print(f"✅ Translation pipeline completed successfully!")
|
||||
print("\n" + "=" * 60)
|
||||
print("✅ Translation pipeline completed successfully!")
|
||||
print(f"Time elapsed: {elapsed:.1f} seconds")
|
||||
print("="*60)
|
||||
print("=" * 60)
|
||||
|
||||
except KeyboardInterrupt:
|
||||
print("\n\n⚠ Translation interrupted by user")
|
||||
@@ -341,6 +375,7 @@ Examples:
|
||||
except Exception as e:
|
||||
print(f"\n\n✗ Error: {e}")
|
||||
import traceback
|
||||
|
||||
traceback.print_exc()
|
||||
sys.exit(1)
|
||||
|
||||
|
||||
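The extract step reformatted above batches a flattened key/value dict into fixed-size JSON files via ceiling division. A minimal standalone sketch of that technique, assuming nothing beyond the standard library; the `demo_` file prefix and default batch size are illustrative, not the script's exact values:

import json

def split_into_batches(entries: dict, batch_size: int = 500) -> list:
    """Write entries to numbered, compact JSON batch files; return the file names."""
    items = list(entries.items())
    # Ceiling division: 1..batch_size entries -> 1 batch, batch_size + 1 -> 2, etc.
    num_batches = (len(items) + batch_size - 1) // batch_size
    filenames = []
    for i in range(num_batches):
        batch = dict(items[i * batch_size : (i + 1) * batch_size])
        filename = f"demo_batch_{i + 1}_of_{num_batches}.json"
        with open(filename, "w", encoding="utf-8") as f:
            # Compact separators keep the prompt payload small for the API call.
            json.dump(batch, f, ensure_ascii=False, separators=(",", ":"))
        filenames.append(filename)
    return filenames
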
@@ -79,10 +79,12 @@ CRITICAL RULES - MUST FOLLOW EXACTLY:

Return ONLY the translated JSON. No markdown, no explanations, just the JSON object."""

    def translate_batch(self, batch_data: dict, target_language: str, language_code: str) -> dict:
    def translate_batch(
        self, batch_data: dict, target_language: str, language_code: str
    ) -> dict:
        """Translate a batch file using OpenAI API."""
        # Convert batch to compact JSON for API
        input_json = json.dumps(batch_data, ensure_ascii=False, separators=(',', ':'))
        input_json = json.dumps(batch_data, ensure_ascii=False, separators=(",", ":"))

        print(f"Translating {len(batch_data)} entries to {target_language}...")
        print(f"Input size: {len(input_json)} characters")
@@ -94,12 +96,14 @@ Return ONLY the translated JSON. No markdown, no explanations, just the JSON obj
            messages=[
                {
                    "role": "system",
                    "content": self.get_translation_prompt(target_language, language_code)
                    "content": self.get_translation_prompt(
                        target_language, language_code
                    ),
                },
                {
                    "role": "user",
                    "content": f"Translate this JSON:\n\n{input_json}"
                }
                    "content": f"Translate this JSON:\n\n{input_json}",
                },
            ],
        )

@@ -107,13 +111,13 @@ Return ONLY the translated JSON. No markdown, no explanations, just the JSON obj

            # Remove markdown code blocks if present
            if translated_text.startswith("```"):
                lines = translated_text.split('\n')
                translated_text = '\n'.join(lines[1:-1])
                lines = translated_text.split("\n")
                translated_text = "\n".join(lines[1:-1])

            # Parse the translated JSON
            translated_data = json.loads(translated_text)

            print(f"✓ Translation complete")
            print("✓ Translation complete")
            return translated_data

        except json.JSONDecodeError as e:
@@ -139,7 +143,8 @@ Return ONLY the translated JSON. No markdown, no explanations, just the JSON obj

        # Check placeholders in each value
        import re
        placeholder_pattern = r'\{[^}]+\}|\{\{[^}]+\}\}'

        placeholder_pattern = r"\{[^}]+\}|\{\{[^}]+\}\}"

        for key in original.keys():
            if key not in translated:
@@ -153,7 +158,9 @@ Return ONLY the translated JSON. No markdown, no explanations, just the JSON obj
            trans_placeholders = set(re.findall(placeholder_pattern, trans_value))

            if orig_placeholders != trans_placeholders:
                issues.append(f"Placeholder mismatch in '{key}': {orig_placeholders} vs {trans_placeholders}")
                issues.append(
                    f"Placeholder mismatch in '{key}': {orig_placeholders} vs {trans_placeholders}"
                )

        if issues:
            print("\n⚠ Validation warnings:")
@@ -170,37 +177,37 @@ Return ONLY the translated JSON. No markdown, no explanations, just the JSON obj
def get_language_info(language_code: str) -> tuple:
    """Get full language name from code."""
    languages = {
        'zh-CN': ('Simplified Chinese', 'zh-CN'),
        'es-ES': ('Spanish', 'es-ES'),
        'it-IT': ('Italian', 'it-IT'),
        'de-DE': ('German', 'de-DE'),
        'ar-AR': ('Arabic', 'ar-AR'),
        'pt-BR': ('Brazilian Portuguese', 'pt-BR'),
        'ru-RU': ('Russian', 'ru-RU'),
        'fr-FR': ('French', 'fr-FR'),
        'ja-JP': ('Japanese', 'ja-JP'),
        'ko-KR': ('Korean', 'ko-KR'),
        'nl-NL': ('Dutch', 'nl-NL'),
        'pl-PL': ('Polish', 'pl-PL'),
        'sv-SE': ('Swedish', 'sv-SE'),
        'da-DK': ('Danish', 'da-DK'),
        'no-NB': ('Norwegian', 'no-NB'),
        'fi-FI': ('Finnish', 'fi-FI'),
        'tr-TR': ('Turkish', 'tr-TR'),
        'vi-VN': ('Vietnamese', 'vi-VN'),
        'th-TH': ('Thai', 'th-TH'),
        'id-ID': ('Indonesian', 'id-ID'),
        'hi-IN': ('Hindi', 'hi-IN'),
        'cs-CZ': ('Czech', 'cs-CZ'),
        'hu-HU': ('Hungarian', 'hu-HU'),
        'ro-RO': ('Romanian', 'ro-RO'),
        'uk-UA': ('Ukrainian', 'uk-UA'),
        'el-GR': ('Greek', 'el-GR'),
        'bg-BG': ('Bulgarian', 'bg-BG'),
        'hr-HR': ('Croatian', 'hr-HR'),
        'sk-SK': ('Slovak', 'sk-SK'),
        'sl-SI': ('Slovenian', 'sl-SI'),
        'ca-CA': ('Catalan', 'ca-CA'),
        "zh-CN": ("Simplified Chinese", "zh-CN"),
        "es-ES": ("Spanish", "es-ES"),
        "it-IT": ("Italian", "it-IT"),
        "de-DE": ("German", "de-DE"),
        "ar-AR": ("Arabic", "ar-AR"),
        "pt-BR": ("Brazilian Portuguese", "pt-BR"),
        "ru-RU": ("Russian", "ru-RU"),
        "fr-FR": ("French", "fr-FR"),
        "ja-JP": ("Japanese", "ja-JP"),
        "ko-KR": ("Korean", "ko-KR"),
        "nl-NL": ("Dutch", "nl-NL"),
        "pl-PL": ("Polish", "pl-PL"),
        "sv-SE": ("Swedish", "sv-SE"),
        "da-DK": ("Danish", "da-DK"),
        "no-NB": ("Norwegian", "no-NB"),
        "fi-FI": ("Finnish", "fi-FI"),
        "tr-TR": ("Turkish", "tr-TR"),
        "vi-VN": ("Vietnamese", "vi-VN"),
        "th-TH": ("Thai", "th-TH"),
        "id-ID": ("Indonesian", "id-ID"),
        "hi-IN": ("Hindi", "hi-IN"),
        "cs-CZ": ("Czech", "cs-CZ"),
        "hu-HU": ("Hungarian", "hu-HU"),
        "ro-RO": ("Romanian", "ro-RO"),
        "uk-UA": ("Ukrainian", "uk-UA"),
        "el-GR": ("Greek", "el-GR"),
        "bg-BG": ("Bulgarian", "bg-BG"),
        "hr-HR": ("Croatian", "hr-HR"),
        "sk-SK": ("Slovak", "sk-SK"),
        "sl-SI": ("Slovenian", "sl-SI"),
        "ca-CA": ("Catalan", "ca-CA"),
    }

    return languages.get(language_code, (language_code, language_code))
@@ -208,7 +215,7 @@ def get_language_info(language_code: str) -> tuple:

def main():
    parser = argparse.ArgumentParser(
        description='Translate JSON batch files using OpenAI API (output supports TOML and JSON)',
        description="Translate JSON batch files using OpenAI API (output supports TOML and JSON)",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Note: This script works with JSON batch files. The translation files it updates can be TOML or JSON.
@@ -226,24 +233,51 @@ Examples:

  # Use different model
  python batch_translator.py file.json --api-key KEY --language es-ES --model gpt-4-turbo
"""
""",
    )

    parser.add_argument('input_files', nargs='+', help='Input batch JSON file(s) or pattern')
    parser.add_argument('--api-key', help='OpenAI API key (or set OPENAI_API_KEY env var)')
    parser.add_argument('--language', '-l', required=True, help='Target language code (e.g., zh-CN, es-ES)')
    parser.add_argument('--model', default='gpt-5', help='OpenAI model to use (default: gpt-5, options: gpt-5-mini, gpt-5-nano)')
    parser.add_argument('--output-suffix', default='_translated', help='Suffix for output files (default: _translated)')
    parser.add_argument('--skip-validation', action='store_true', help='Skip validation checks')
    parser.add_argument('--delay', type=float, default=1.0, help='Delay between API calls in seconds (default: 1.0)')
    parser.add_argument(
        "input_files", nargs="+", help="Input batch JSON file(s) or pattern"
    )
    parser.add_argument(
        "--api-key", help="OpenAI API key (or set OPENAI_API_KEY env var)"
    )
    parser.add_argument(
        "--language",
        "-l",
        required=True,
        help="Target language code (e.g., zh-CN, es-ES)",
    )
    parser.add_argument(
        "--model",
        default="gpt-5",
        help="OpenAI model to use (default: gpt-5, options: gpt-5-mini, gpt-5-nano)",
    )
    parser.add_argument(
        "--output-suffix",
        default="_translated",
        help="Suffix for output files (default: _translated)",
    )
    parser.add_argument(
        "--skip-validation", action="store_true", help="Skip validation checks"
    )
    parser.add_argument(
        "--delay",
        type=float,
        default=1.0,
        help="Delay between API calls in seconds (default: 1.0)",
    )

    args = parser.parse_args()

    # Get API key from args or environment
    import os
    api_key = args.api_key or os.environ.get('OPENAI_API_KEY')

    api_key = args.api_key or os.environ.get("OPENAI_API_KEY")
    if not api_key:
        print("Error: OpenAI API key required. Provide via --api-key or OPENAI_API_KEY environment variable")
        print(
            "Error: OpenAI API key required. Provide via --api-key or OPENAI_API_KEY environment variable"
        )
        sys.exit(1)

    # Get language info
@@ -251,6 +285,7 @@ Examples:

    # Expand file patterns
    import glob

    input_files = []
    for pattern in args.input_files:
        matched = glob.glob(pattern)
@@ -263,7 +298,7 @@ Examples:
        print("Error: No input files found")
        sys.exit(1)

    print(f"Batch Translator")
    print("Batch Translator")
    print(f"Target Language: {language_name} ({language_code})")
    print(f"Model: {args.model}")
    print(f"Files to translate: {len(input_files)}")
@@ -281,11 +316,13 @@ Examples:

        try:
            # Load input file
            with open(input_file, 'r', encoding='utf-8') as f:
            with open(input_file, "r", encoding="utf-8") as f:
                batch_data = json.load(f)

            # Translate
            translated_data = translator.translate_batch(batch_data, language_name, language_code)
            translated_data = translator.translate_batch(
                batch_data, language_name, language_code
            )

            # Validate
            if not args.skip_validation:
@@ -295,8 +332,8 @@ Examples:
            input_path = Path(input_file)
            output_file = input_path.stem + args.output_suffix + input_path.suffix

            with open(output_file, 'w', encoding='utf-8') as f:
                json.dump(translated_data, f, ensure_ascii=False, separators=(',', ':'))
            with open(output_file, "w", encoding="utf-8") as f:
                json.dump(translated_data, f, ensure_ascii=False, separators=(",", ":"))

            print(f"✓ Saved to: {output_file}")
            successful += 1
@@ -312,7 +349,7 @@ Examples:

    # Summary
    print("\n" + "=" * 60)
    print(f"Translation complete!")
    print("Translation complete!")
    print(f"Successful: {successful}/{len(input_files)}")
    if failed > 0:
        print(f"Failed: {failed}/{len(input_files)}")
@@ -321,5 +358,4 @@ Examples:


if __name__ == "__main__":
    import os
    main()

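The placeholder check reformatted above guards against translations dropping or renaming {var} / {{var}} tokens. A self-contained sketch of that comparison, assuming only the same regex idea; the pattern here lists the double-brace alternative first so {{x}} matches as one token, and the function name is illustrative:

import re

PLACEHOLDER_PATTERN = r"\{\{[^}]+\}\}|\{[^}]+\}"

def placeholders_match(original: str, translated: str) -> bool:
    """True if both strings carry the same set of placeholders."""
    return set(re.findall(PLACEHOLDER_PATTERN, original)) == set(
        re.findall(PLACEHOLDER_PATTERN, translated)
    )

# placeholders_match("Page {n} of {total}", "Seite {n} von {total}")  -> True
# placeholders_match("Page {n} of {total}", "Seite {n}")              -> False
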
@@ -54,16 +54,16 @@ def get_language_completion(locales_dir: Path, language: str) -> Optional[float]
        return None

    try:
        with open(toml_file, 'rb') as f:
        with open(toml_file, "rb") as f:
            target_data = tomllib.load(f)

        # Load en-GB reference
        en_gb_file = locales_dir / 'en-GB' / 'translation.toml'
        with open(en_gb_file, 'rb') as f:
        en_gb_file = locales_dir / "en-GB" / "translation.toml"
        with open(en_gb_file, "rb") as f:
            en_gb_data = tomllib.load(f)

        # Flatten and count
        def flatten(d, parent=''):
        def flatten(d, parent=""):
            items = {}
            for k, v in d.items():
                key = f"{parent}.{k}" if parent else k
@@ -77,7 +77,11 @@ def get_language_completion(locales_dir: Path, language: str) -> Optional[float]
        target_flat = flatten(target_data)

        # Count translated (not equal to en-GB)
        translated = sum(1 for k in en_gb_flat if k in target_flat and target_flat[k] != en_gb_flat[k])
        translated = sum(
            1
            for k in en_gb_flat
            if k in target_flat and target_flat[k] != en_gb_flat[k]
        )
        total = len(en_gb_flat)

        return (translated / total * 100) if total > 0 else 0.0
@@ -87,7 +91,14 @@ def get_language_completion(locales_dir: Path, language: str) -> Optional[float]
        return None


def translate_language(language: str, api_key: str, batch_size: int, timeout: int, skip_verification: bool, include_existing: bool) -> Tuple[str, bool, str]:
def translate_language(
    language: str,
    api_key: str,
    batch_size: int,
    timeout: int,
    skip_verification: bool,
    include_existing: bool,
) -> Tuple[str, bool, str]:
    """
    Translate a single language.
    Returns: (language_code, success, message)
@@ -95,25 +106,29 @@ def translate_language(language: str, api_key: str, batch_size: int, timeout: in
    safe_print(f"[{language}] Starting translation...")

    cmd = [
        'python3', 'scripts/translations/auto_translate.py',
        "python3",
        "scripts/translations/auto_translate.py",
        language,
        '--api-key', api_key,
        '--batch-size', str(batch_size),
        '--timeout', str(timeout)
        "--api-key",
        api_key,
        "--batch-size",
        str(batch_size),
        "--timeout",
        str(timeout),
    ]

    if skip_verification:
        cmd.append('--skip-verification')
        cmd.append("--skip-verification")

    if include_existing:
        cmd.append('--include-existing')
        cmd.append("--include-existing")

    try:
        result = subprocess.run(
            cmd,
            capture_output=True,
            text=True,
            timeout=timeout * 5  # Overall timeout = 5x per-batch timeout
            timeout=timeout * 5,  # Overall timeout = 5x per-batch timeout
        )

        if result.returncode == 0:
@@ -124,7 +139,9 @@ def translate_language(language: str, api_key: str, batch_size: int, timeout: in
            safe_print(f"[{language}] ✓ Success")
            return (language, True, "Success")
        else:
            error_msg = result.stderr.strip() or result.stdout.strip() or "Unknown error"
            error_msg = (
                result.stderr.strip() or result.stdout.strip() or "Unknown error"
            )
            safe_print(f"[{language}] ✗ Failed: {error_msg[:100]}")
            return (language, False, error_msg[:200])  # Truncate long errors

@@ -138,7 +155,7 @@ def translate_language(language: str, api_key: str, batch_size: int, timeout: in

def main():
    parser = argparse.ArgumentParser(
        description='Bulk auto-translate all languages using OpenAI API',
        description="Bulk auto-translate all languages using OpenAI API",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
@@ -155,35 +172,70 @@ Examples:
  python3 bulk_auto_translate.py --dry-run

Note: Requires OPENAI_API_KEY environment variable or --api-key argument.
"""
""",
    )

    parser.add_argument('--api-key', help='OpenAI API key (or set OPENAI_API_KEY env var)')
    parser.add_argument('--parallel', type=int, default=1,
                        help='Number of parallel translation threads (default: 1)')
    parser.add_argument('--batch-size', type=int, default=500,
                        help='Entries per batch for translation (default: 500)')
    parser.add_argument('--timeout', type=int, default=600,
                        help='Timeout per batch in seconds (default: 600)')
    parser.add_argument('--threshold', type=float, default=0.0,
                        help='Only translate languages below this completion %% (default: 0 = all)')
    parser.add_argument('--languages', nargs='+',
                        help='Translate only specific languages (e.g., de-DE fr-FR)')
    parser.add_argument('--locales-dir', default='frontend/public/locales',
                        help='Path to locales directory')
    parser.add_argument('--skip-verification', action='store_true',
                        help='Skip final completion verification for each language')
    parser.add_argument('--include-existing', action='store_true',
                        help='Also retranslate existing keys that match English (default: only translate missing keys)')
    parser.add_argument('--dry-run', action='store_true',
                        help='Show what would be translated without actually translating')
    parser.add_argument(
        "--api-key", help="OpenAI API key (or set OPENAI_API_KEY env var)"
    )
    parser.add_argument(
        "--parallel",
        type=int,
        default=1,
        help="Number of parallel translation threads (default: 1)",
    )
    parser.add_argument(
        "--batch-size",
        type=int,
        default=500,
        help="Entries per batch for translation (default: 500)",
    )
    parser.add_argument(
        "--timeout",
        type=int,
        default=600,
        help="Timeout per batch in seconds (default: 600)",
    )
    parser.add_argument(
        "--threshold",
        type=float,
        default=0.0,
        help="Only translate languages below this completion %% (default: 0 = all)",
    )
    parser.add_argument(
        "--languages",
        nargs="+",
        help="Translate only specific languages (e.g., de-DE fr-FR)",
    )
    parser.add_argument(
        "--locales-dir",
        default="frontend/public/locales",
        help="Path to locales directory",
    )
    parser.add_argument(
        "--skip-verification",
        action="store_true",
        help="Skip final completion verification for each language",
    )
    parser.add_argument(
        "--include-existing",
        action="store_true",
        help="Also retranslate existing keys that match English (default: only translate missing keys)",
    )
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Show what would be translated without actually translating",
    )

    args = parser.parse_args()

    # Verify API key (unless dry run)
    api_key = args.api_key or os.environ.get('OPENAI_API_KEY')
    api_key = args.api_key or os.environ.get("OPENAI_API_KEY")
    if not args.dry_run and not api_key:
        print("Error: OpenAI API key required. Provide via --api-key or OPENAI_API_KEY environment variable")
        print(
            "Error: OpenAI API key required. Provide via --api-key or OPENAI_API_KEY environment variable"
        )
        sys.exit(1)

    locales_dir = Path(args.locales_dir)
@@ -221,16 +273,16 @@ Note: Requires OPENAI_API_KEY environment variable or --api-key argument.
        print("\nNo languages below threshold!")
        sys.exit(0)

    print(f"\n{'='*60}")
    print(f"Bulk Translation Configuration")
    print(f"{'='*60}")
    print(f"\n{'=' * 60}")
    print("Bulk Translation Configuration")
    print(f"{'=' * 60}")
    print(f"Languages to translate: {len(languages)}")
    print(f"Parallel threads: {args.parallel}")
    print(f"Batch size: {args.batch_size}")
    print(f"Timeout per batch: {args.timeout}s")
    if args.threshold > 0:
        print(f"Completion threshold: {args.threshold}%")
    print(f"{'='*60}\n")
    print(f"{'=' * 60}\n")

    if args.dry_run:
        print("DRY RUN - Languages that would be translated:")
@@ -244,11 +296,7 @@ Note: Requires OPENAI_API_KEY environment variable or --api-key argument.
    start_time = time.time()

    # Translate in parallel
    results = {
        'success': [],
        'failed': [],
        'already_complete': []
    }
    results = {"success": [], "failed": [], "already_complete": []}

    with ThreadPoolExecutor(max_workers=args.parallel) as executor:
        futures = {
@@ -259,7 +307,7 @@ Note: Requires OPENAI_API_KEY environment variable or --api-key argument.
                args.batch_size,
                args.timeout,
                args.skip_verification,
                args.include_existing
                args.include_existing,
            ): lang
            for lang in languages
        }
@@ -269,43 +317,43 @@ Note: Requires OPENAI_API_KEY environment variable or --api-key argument.

            if success:
                if message == "Already complete":
                    results['already_complete'].append(language)
                    results["already_complete"].append(language)
                else:
                    results['success'].append(language)
                    results["success"].append(language)
            else:
                results['failed'].append((language, message))
                results["failed"].append((language, message))

    elapsed = time.time() - start_time

    # Print summary
    print("\n" + "="*60)
    print("\n" + "=" * 60)
    print("Bulk Translation Summary")
    print("="*60)
    print("=" * 60)
    print(f"Total languages: {len(languages)}")
    print(f"Successful: {len(results['success'])}")
    print(f"Already complete: {len(results['already_complete'])}")
    print(f"Failed: {len(results['failed'])}")
    print(f"Time elapsed: {elapsed:.1f} seconds ({elapsed/60:.1f} minutes)")
    print("="*60)
    print(f"Time elapsed: {elapsed:.1f} seconds ({elapsed / 60:.1f} minutes)")
    print("=" * 60)

    if results['success']:
    if results["success"]:
        print(f"\n✅ Successfully translated ({len(results['success'])}):")
        for lang in sorted(results['success']):
        for lang in sorted(results["success"]):
            print(f"  - {lang}")

    if results['already_complete']:
    if results["already_complete"]:
        print(f"\n✓ Already complete ({len(results['already_complete'])}):")
        for lang in sorted(results['already_complete']):
        for lang in sorted(results["already_complete"]):
            print(f"  - {lang}")

    if results['failed']:
    if results["failed"]:
        print(f"\n❌ Failed ({len(results['failed'])}):")
        for lang, msg in sorted(results['failed']):
        for lang, msg in sorted(results["failed"]):
            print(f"  - {lang}: {msg}")
        sys.exit(1)

    print("\n✅ Bulk translation completed successfully!")


if __name__ == '__main__':
if __name__ == "__main__":
    main()

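The bulk script above fans translate_language out over a thread pool and gathers results as they finish. A minimal sketch of that fan-out pattern, assuming only the standard library; the worker and its return shape here are stand-ins for the real subprocess call:

from concurrent.futures import ThreadPoolExecutor, as_completed

def run_all(languages, worker, max_workers=1):
    """Run worker(lang) for every language and collect results as they complete."""
    results = []
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Map each future back to its language so failures can be attributed.
        futures = {executor.submit(worker, lang): lang for lang in languages}
        for future in as_completed(futures):
            results.append(future.result())
    return results

# Example: run_all(["de-DE", "fr-FR"], lambda lang: (lang, True, "Success"), max_workers=2)
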
@@ -13,11 +13,18 @@ import tomllib  # Python 3.11+ (stdlib)


class CompactTranslationExtractor:
    def __init__(self, locales_dir: str = "frontend/public/locales", ignore_file: str = "scripts/ignore_translation.toml"):
    def __init__(
        self,
        locales_dir: str = "frontend/public/locales",
        ignore_file: str = "scripts/ignore_translation.toml",
    ):
        self.locales_dir = Path(locales_dir)
        self.golden_truth_file = self.locales_dir / "en-GB" / "translation.toml"
        if not self.golden_truth_file.exists():
            print(f"Error: en-GB translation file not found at {self.golden_truth_file}", file=sys.stderr)
            print(
                f"Error: en-GB translation file not found at {self.golden_truth_file}",
                file=sys.stderr,
            )
            sys.exit(1)
        self.golden_truth = self._load_translation_file(self.golden_truth_file)
        self.ignore_file = Path(ignore_file)
@@ -26,7 +33,7 @@ class CompactTranslationExtractor:
    def _load_translation_file(self, file_path: Path) -> dict:
        """Load TOML translation file."""
        try:
            with open(file_path, 'rb') as f:
            with open(file_path, "rb") as f:
                return tomllib.load(f)
        except FileNotFoundError:
            print(f"Error: File not found: {file_path}", file=sys.stderr)
@@ -41,14 +48,21 @@ class CompactTranslationExtractor:
            return {}

        try:
            with open(self.ignore_file, 'rb') as f:
            with open(self.ignore_file, "rb") as f:
                ignore_data = tomllib.load(f)
            return {lang: set(data.get('ignore', [])) for lang, data in ignore_data.items()}
            return {
                lang: set(data.get("ignore", [])) for lang, data in ignore_data.items()
            }
        except Exception as e:
            print(f"Warning: Could not load ignore file {self.ignore_file}: {e}", file=sys.stderr)
            print(
                f"Warning: Could not load ignore file {self.ignore_file}: {e}",
                file=sys.stderr,
            )
            return {}

    def _flatten_dict(self, d: dict, parent_key: str = '', separator: str = '.') -> dict:
    def _flatten_dict(
        self, d: dict, parent_key: str = "", separator: str = "."
    ) -> dict:
        """Flatten nested dictionary into dot-notation keys."""
        items = []
        for k, v in d.items():
@@ -65,14 +79,17 @@ class CompactTranslationExtractor:
        target_file = lang_dir / "translation.toml"

        if not target_file.exists():
            print(f"Error: Translation file not found for language: {language}", file=sys.stderr)
            print(
                f"Error: Translation file not found for language: {language}",
                file=sys.stderr,
            )
            sys.exit(1)

        target_data = self._load_translation_file(target_file)
        golden_flat = self._flatten_dict(self.golden_truth)
        target_flat = self._flatten_dict(target_data)

        lang_code = language.replace('-', '_')
        lang_code = language.replace("-", "_")
        ignore_set = self.ignore_patterns.get(lang_code, set())

        # Find missing translations
@@ -85,8 +102,13 @@ class CompactTranslationExtractor:
            target_value = target_flat[key]
            golden_value = golden_flat[key]

            if (isinstance(target_value, str) and target_value.startswith("[UNTRANSLATED]")) or \
               (golden_value == target_value and not self._is_expected_identical(key, golden_value)):
            if (
                isinstance(target_value, str)
                and target_value.startswith("[UNTRANSLATED]")
            ) or (
                golden_value == target_value
                and not self._is_expected_identical(key, golden_value)
            ):
                untranslated_keys.add(key)

        # Combine and create compact output
@@ -101,8 +123,8 @@ class CompactTranslationExtractor:

    def _is_expected_identical(self, key: str, value: str) -> bool:
        """Check if a key-value pair is expected to be identical across languages."""
        identical_patterns = ['language.direction']
        identical_values = {'ltr', 'rtl', 'True', 'False', 'true', 'false', 'unknown'}
        identical_patterns = ["language.direction"]
        identical_values = {"ltr", "rtl", "True", "False", "true", "false", "unknown"}

        if value.strip() in identical_values:
            return True
@@ -116,13 +138,23 @@ class CompactTranslationExtractor:

def main():
    parser = argparse.ArgumentParser(
        description='Extract untranslated entries in compact format for AI translation (TOML format only)'
        description="Extract untranslated entries in compact format for AI translation (TOML format only)"
    )
    parser.add_argument('language', help='Language code (e.g., de-DE, fr-FR)')
    parser.add_argument('--locales-dir', default='frontend/public/locales', help='Path to locales directory')
    parser.add_argument('--ignore-file', default='scripts/ignore_translation.toml', help='Path to ignore patterns file')
    parser.add_argument('--max-entries', type=int, help='Maximum number of entries to output')
    parser.add_argument('--output', help='Output file (default: stdout)')
    parser.add_argument("language", help="Language code (e.g., de-DE, fr-FR)")
    parser.add_argument(
        "--locales-dir",
        default="frontend/public/locales",
        help="Path to locales directory",
    )
    parser.add_argument(
        "--ignore-file",
        default="scripts/ignore_translation.toml",
        help="Path to ignore patterns file",
    )
    parser.add_argument(
        "--max-entries", type=int, help="Maximum number of entries to output"
    )
    parser.add_argument("--output", help="Output file (default: stdout)")

    args = parser.parse_args()

@@ -131,19 +163,22 @@ def main():

    if args.max_entries:
        # Take first N entries
        keys = list(untranslated.keys())[:args.max_entries]
        keys = list(untranslated.keys())[: args.max_entries]
        untranslated = {k: untranslated[k] for k in keys}

    # Output compact JSON (no indentation, minimal whitespace)
    output = json.dumps(untranslated, separators=(',', ':'), ensure_ascii=False)
    output = json.dumps(untranslated, separators=(",", ":"), ensure_ascii=False)

    if args.output:
        with open(args.output, 'w', encoding='utf-8') as f:
        with open(args.output, "w", encoding="utf-8") as f:
            f.write(output)
        print(f"Extracted {len(untranslated)} untranslated entries to {args.output}", file=sys.stderr)
        print(
            f"Extracted {len(untranslated)} untranslated entries to {args.output}",
            file=sys.stderr,
        )
    else:
        print(output)


if __name__ == "__main__":
    main()
    main()

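Nearly every script in this commit flattens nested translation tables into dot-notation keys before comparing them. A minimal standalone sketch of that recursive flattening, matching the shape the diffs show; the sample data is illustrative:

def flatten_dict(d, parent_key="", separator="."):
    """Flatten nested dicts into dot-notation keys, e.g. {"a": {"b": 1}} -> {"a.b": 1}."""
    items = []
    for k, v in d.items():
        new_key = f"{parent_key}{separator}{k}" if parent_key else k
        if isinstance(v, dict):
            # Recurse into sub-tables and splice their flattened items in.
            items.extend(flatten_dict(v, new_key, separator).items())
        else:
            items.append((new_key, v))
    return dict(items)

# flatten_dict({"menu": {"file": {"open": "Open"}}}) -> {"menu.file.open": "Open"}
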
@@ -4,7 +4,6 @@ TOML Beautifier and Structure Fixer for Stirling PDF Frontend
Restructures translation TOML files to match en-GB structure and key order exactly.
"""

import os
import sys
from pathlib import Path
from typing import Dict, Any, List
@@ -24,7 +23,7 @@ class TOMLBeautifier:
    def _load_toml(self, file_path: Path) -> Dict:
        """Load TOML file with error handling."""
        try:
            with open(file_path, 'rb') as f:
            with open(file_path, "rb") as f:
                return tomllib.load(f)
        except FileNotFoundError:
            print(f"Error: File not found: {file_path}")
@@ -36,15 +35,18 @@ class TOMLBeautifier:
    def _save_toml(self, data: Dict, file_path: Path, backup: bool = False) -> None:
        """Save TOML file with proper formatting."""
        if backup and file_path.exists():
            backup_path = file_path.with_suffix(f'.backup.restructured.toml')
            backup_path = file_path.with_suffix(".backup.restructured.toml")
            import shutil

            shutil.copy2(file_path, backup_path)
            print(f"Backup created: {backup_path}")

        with open(file_path, 'wb') as f:
        with open(file_path, "wb") as f:
            tomli_w.dump(data, f)

    def _flatten_dict(self, d: Dict, parent_key: str = '', separator: str = '.') -> Dict[str, Any]:
    def _flatten_dict(
        self, d: Dict, parent_key: str = "", separator: str = "."
    ) -> Dict[str, Any]:
        """Flatten nested dictionary into dot-notation keys."""
        items = []
        for k, v in d.items():
@@ -55,9 +57,12 @@ class TOMLBeautifier:
                items.append((new_key, v))
        return dict(items)

    def _rebuild_structure(self, flat_dict: Dict[str, Any], reference_structure: Dict) -> Dict:
    def _rebuild_structure(
        self, flat_dict: Dict[str, Any], reference_structure: Dict
    ) -> Dict:
        """Rebuild nested structure based on reference structure and available translations."""
        def build_recursive(ref_obj: Any, current_path: str = '') -> Any:

        def build_recursive(ref_obj: Any, current_path: str = "") -> Any:
            if isinstance(ref_obj, dict):
                result = OrderedDict()
                for key, value in ref_obj.items():
@@ -106,7 +111,9 @@ class TOMLBeautifier:

        return restructured

    def beautify_and_restructure(self, target_file: Path, backup: bool = False) -> Dict[str, Any]:
    def beautify_and_restructure(
        self, target_file: Path, backup: bool = False
    ) -> Dict[str, Any]:
        """Main function to beautify and restructure a translation file."""
        lang_code = target_file.parent.name
        print(f"Restructuring {lang_code} translation file...")
@@ -125,10 +132,12 @@ class TOMLBeautifier:
        preserved_keys = len(flat_restructured)

        result = {
            'language': lang_code,
            'total_reference_keys': total_keys,
            'preserved_keys': preserved_keys,
            'structure_match': self._compare_structures(self.golden_structure, restructured_data)
            "language": lang_code,
            "total_reference_keys": total_keys,
            "preserved_keys": preserved_keys,
            "structure_match": self._compare_structures(
                self.golden_structure, restructured_data
            ),
        }

        print(f"Restructured {lang_code}: {preserved_keys}/{total_keys} keys preserved")
@@ -136,7 +145,8 @@ class TOMLBeautifier:

    def _compare_structures(self, ref: Dict, target: Dict) -> Dict[str, bool]:
        """Compare structures between reference and target."""
        def compare_recursive(r: Any, t: Any, path: str = '') -> List[str]:

        def compare_recursive(r: Any, t: Any, path: str = "") -> List[str]:
            issues = []

            if isinstance(r, dict) and isinstance(t, dict):
@@ -147,7 +157,9 @@ class TOMLBeautifier:
                missing_sections = ref_keys - target_keys
                if missing_sections:
                    for section in missing_sections:
                        issues.append(f"Missing section: {path}.{section}" if path else section)
                        issues.append(
                            f"Missing section: {path}.{section}" if path else section
                        )

                # Recurse into common sections
                for key in ref_keys & target_keys:
@@ -159,16 +171,16 @@ class TOMLBeautifier:
        issues = compare_recursive(ref, target)

        return {
            'structures_match': len(issues) == 0,
            'issues': issues[:10],  # Limit to first 10 issues
            'total_issues': len(issues)
            "structures_match": len(issues) == 0,
            "issues": issues[:10],  # Limit to first 10 issues
            "total_issues": len(issues),
        }

    def validate_key_order(self, target_file: Path) -> Dict[str, Any]:
        """Validate that keys appear in the same order as en-GB."""
        target_data = self._load_toml(target_file)

        def get_key_order(obj: Dict, path: str = '') -> List[str]:
        def get_key_order(obj: Dict, path: str = "") -> List[str]:
            keys = []
            for key in obj.keys():
                new_path = f"{path}.{key}" if path else key
@@ -183,37 +195,51 @@ class TOMLBeautifier:
        # Find common keys and check their relative order
        common_keys = set(golden_order) & set(target_order)

        golden_indices = {key: idx for idx, key in enumerate(golden_order) if key in common_keys}
        target_indices = {key: idx for idx, key in enumerate(target_order) if key in common_keys}
        golden_indices = {
            key: idx for idx, key in enumerate(golden_order) if key in common_keys
        }
        target_indices = {
            key: idx for idx, key in enumerate(target_order) if key in common_keys
        }

        order_preserved = all(
            golden_indices[key1] < golden_indices[key2]
            for key1 in common_keys for key2 in common_keys
            if golden_indices[key1] < golden_indices[key2] and target_indices[key1] < target_indices[key2]
            for key1 in common_keys
            for key2 in common_keys
            if golden_indices[key1] < golden_indices[key2]
            and target_indices[key1] < target_indices[key2]
        )

        return {
            'order_preserved': order_preserved,
            'common_keys_count': len(common_keys),
            'golden_keys_count': len(golden_order),
            'target_keys_count': len(target_order)
            "order_preserved": order_preserved,
            "common_keys_count": len(common_keys),
            "golden_keys_count": len(golden_order),
            "target_keys_count": len(target_order),
        }


def main():
    parser = argparse.ArgumentParser(
        description='Beautify and restructure translation TOML files',
        epilog='Works with TOML format translation files.'
        description="Beautify and restructure translation TOML files",
        epilog="Works with TOML format translation files.",
    )
    parser.add_argument(
        "--locales-dir",
        default="frontend/public/locales",
        help="Path to locales directory",
    )
    parser.add_argument("--language", help="Restructure specific language only")
    parser.add_argument(
        "--all-languages", action="store_true", help="Restructure all language files"
    )
    parser.add_argument(
        "--backup", action="store_true", help="Create backup files before modifying"
    )
    parser.add_argument(
        "--validate-only",
        action="store_true",
        help="Only validate structure, do not modify files",
    )
    parser.add_argument('--locales-dir', default='frontend/public/locales',
                        help='Path to locales directory')
    parser.add_argument('--language', help='Restructure specific language only')
    parser.add_argument('--all-languages', action='store_true',
                        help='Restructure all language files')
    parser.add_argument('--backup', action='store_true',
                        help='Create backup files before modifying')
    parser.add_argument('--validate-only', action='store_true',
                        help='Only validate structure, do not modify files')

    args = parser.parse_args()

@@ -229,14 +255,22 @@ def main():
            order_result = beautifier.validate_key_order(target_file)
            print(f"Key order validation for {args.language}:")
            print(f"  Order preserved: {order_result['order_preserved']}")
            print(f"  Common keys: {order_result['common_keys_count']}/{order_result['golden_keys_count']}")
            print(
                f"  Common keys: {order_result['common_keys_count']}/{order_result['golden_keys_count']}"
            )
        else:
            result = beautifier.beautify_and_restructure(target_file, backup=args.backup)
            result = beautifier.beautify_and_restructure(
                target_file, backup=args.backup
            )
            print(f"\nResults for {result['language']}:")
            print(f"  Keys preserved: {result['preserved_keys']}/{result['total_reference_keys']}")
            if result['structure_match']['total_issues'] > 0:
                print(f"  Structure issues: {result['structure_match']['total_issues']}")
                for issue in result['structure_match']['issues']:
            print(
                f"  Keys preserved: {result['preserved_keys']}/{result['total_reference_keys']}"
            )
            if result["structure_match"]["total_issues"] > 0:
                print(
                    f"  Structure issues: {result['structure_match']['total_issues']}"
                )
                for issue in result["structure_match"]["issues"]:
                    print(f"    - {issue}")

    elif args.all_languages:
@@ -247,18 +281,24 @@ def main():
            if translation_file.exists():
                if args.validate_only:
                    order_result = beautifier.validate_key_order(translation_file)
                    print(f"{lang_dir.name}: Order preserved = {order_result['order_preserved']}")
                    print(
                        f"{lang_dir.name}: Order preserved = {order_result['order_preserved']}"
                    )
                else:
                    result = beautifier.beautify_and_restructure(translation_file, backup=args.backup)
                    result = beautifier.beautify_and_restructure(
                        translation_file, backup=args.backup
                    )
                    results.append(result)

        if not args.validate_only and results:
            print(f"\n{'='*60}")
            print(f"\n{'=' * 60}")
            print("RESTRUCTURING SUMMARY")
            print(f"{'='*60}")
            for result in sorted(results, key=lambda x: x['language']):
                print(f"{result['language']}: {result['preserved_keys']}/{result['total_reference_keys']} keys "
                      f"({result['preserved_keys']/result['total_reference_keys']*100:.1f}%)")
            print(f"{'=' * 60}")
            for result in sorted(results, key=lambda x: x["language"]):
                print(
                    f"{result['language']}: {result['preserved_keys']}/{result['total_reference_keys']} keys "
                    f"({result['preserved_keys'] / result['total_reference_keys'] * 100:.1f}%)"
                )

    else:
        parser.print_help()

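The key-order validation reformatted above compares the positions of keys shared between the en-GB reference and a target file. One compact way to express that check, sketched here as a simplification rather than the script's exact index-pair logic; the function name is illustrative:

def relative_order_preserved(reference, target):
    """True when the keys shared by both lists appear in the same relative order."""
    common = set(reference) & set(target)
    ref_seq = [k for k in reference if k in common]
    tgt_seq = [k for k in target if k in common]
    return ref_seq == tgt_seq

# relative_order_preserved(["a", "b", "c"], ["a", "c"])  -> True
# relative_order_preserved(["a", "b", "c"], ["c", "a"])  -> False
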
@@ -15,7 +15,6 @@ Usage:
import sys
import argparse
import glob
from pathlib import Path

import tomllib

@@ -23,7 +22,7 @@ import tomllib
def get_line_context(file_path, line_num, context_lines=3):
    """Get lines around the error for context"""
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
        with open(file_path, "r", encoding="utf-8") as f:
            lines = f.readlines()

        start = max(0, line_num - context_lines - 1)
@@ -32,7 +31,7 @@ def get_line_context(file_path, line_num, context_lines=3):
        context = []
        for i in range(start, end):
            marker = ">>> " if i == line_num - 1 else "    "
            context.append(f"{marker}{i+1:4d}: {lines[i].rstrip()}")
            context.append(f"{marker}{i + 1:4d}: {lines[i].rstrip()}")

        return "\n".join(context)
    except Exception as e:
@@ -42,7 +41,7 @@ def get_line_context(file_path, line_num, context_lines=3):
def get_character_context(file_path, char_pos, context_chars=100):
    """Get characters around the error position"""
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
        with open(file_path, "r", encoding="utf-8") as f:
            content = f.read()

        start = max(0, char_pos - context_chars)
@@ -50,19 +49,19 @@ def get_character_context(file_path, char_pos, context_chars=100):

        before = content[start:char_pos]
        error_char = content[char_pos] if char_pos < len(content) else "EOF"
        after = content[char_pos+1:end]
        after = content[char_pos + 1 : end]

        return {
            'before': before,
            'error_char': error_char,
            'after': after,
            'display': f"{before}[{error_char}]{after}"
            "before": before,
            "error_char": error_char,
            "after": after,
            "display": f"{before}[{error_char}]{after}",
        }
    except Exception as e:
    except Exception:
        return None


def count_keys(data, prefix=''):
def count_keys(data, prefix=""):
    """Recursively count all keys in nested TOML structure"""
    count = 0
    if isinstance(data, dict):
@@ -77,42 +76,43 @@ def count_keys(data, prefix=''):
def validate_toml_file(file_path):
    """Validate a single TOML file and return detailed error info"""
    result = {
        'file': str(file_path),
        'valid': False,
        'error': None,
        'line': None,
        'context': None,
        'entry_count': 0
        "file": str(file_path),
        "valid": False,
        "error": None,
        "line": None,
        "context": None,
        "entry_count": 0,
    }

    try:
        with open(file_path, 'rb') as f:
        with open(file_path, "rb") as f:
            data = tomllib.load(f)

        result['valid'] = True
        result['entry_count'] = count_keys(data)
        result["valid"] = True
        result["entry_count"] = count_keys(data)

    except Exception as e:
        error_msg = str(e)
        result['error'] = error_msg
        result["error"] = error_msg

        # Try to extract line number from error message
        import re
        line_match = re.search(r'line (\d+)', error_msg, re.IGNORECASE)

        line_match = re.search(r"line (\d+)", error_msg, re.IGNORECASE)
        if line_match:
            line_num = int(line_match.group(1))
            result['line'] = line_num
            result['context'] = get_line_context(file_path, line_num)
            result["line"] = line_num
            result["context"] = get_line_context(file_path, line_num)

    except FileNotFoundError:
        result['error'] = "File not found"
        result["error"] = "File not found"

    return result


def print_validation_result(result, brief=False, quiet=False):
    """Print validation result in human-readable format"""
    if result['valid']:
    if result["valid"]:
        if not quiet:
            print(f"✓ {result['file']}")
            if not brief:
@@ -121,30 +121,35 @@ def print_validation_result(result, brief=False, quiet=False):
        print(f"✗ {result['file']}")
        print(f"  Error: {result['error']}")

        if result['line']:
        if result["line"]:
            print(f"  Line: {result['line']}")

        if result['context'] and not brief:
            print(f"\n  Context:")
        if result["context"] and not brief:
            print("\n  Context:")
            print(f"  {result['context'].replace(chr(10), chr(10) + '  ')}")

        if not brief:
            print(f"\n  Common fixes:")
            print(f"  - Check for missing quotes around keys or values")
            print(f"  - Ensure proper escaping of special characters")
            print(f"  - Verify table header syntax: [section.subsection]")
            print(f"  - Check for duplicate keys in the same table")
            print("\n  Common fixes:")
            print("  - Check for missing quotes around keys or values")
            print("  - Ensure proper escaping of special characters")
            print("  - Verify table header syntax: [section.subsection]")
            print("  - Check for duplicate keys in the same table")


def main():
    parser = argparse.ArgumentParser(description='Validate TOML translation files')
    parser.add_argument('files', nargs='*', help='TOML file(s) or pattern to validate')
    parser.add_argument('--all-batches', metavar='LANG',
                        help='Validate all batch files for a language (e.g., ar_AR)')
    parser.add_argument('--brief', action='store_true',
                        help='Show brief output without context')
    parser.add_argument('--quiet', action='store_true',
                        help='Only show files with errors')
    parser = argparse.ArgumentParser(description="Validate TOML translation files")
    parser.add_argument("files", nargs="*", help="TOML file(s) or pattern to validate")
    parser.add_argument(
        "--all-batches",
        metavar="LANG",
        help="Validate all batch files for a language (e.g., ar_AR)",
    )
    parser.add_argument(
        "--brief", action="store_true", help="Show brief output without context"
    )
    parser.add_argument(
        "--quiet", action="store_true", help="Only show files with errors"
    )

    args = parser.parse_args()

@@ -181,11 +186,11 @@ def main():

    # Summary
    total = len(results)
    valid = sum(1 for r in results if r['valid'])
    valid = sum(1 for r in results if r["valid"])
    invalid = total - valid

    if not args.quiet:
        print(f"\n{'='*60}")
        print(f"\n{'=' * 60}")
        print(f"Summary: {valid}/{total} files valid")
        if invalid > 0:
            print(f" {invalid} file(s) with errors")
@@ -194,5 +199,5 @@ def main():
    sys.exit(0 if invalid == 0 else 1)


if __name__ == '__main__':
if __name__ == "__main__":
    main()

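The validator above wraps tomllib parsing in error handling and reports context. A minimal sketch of the core validity check under that approach, assuming Python 3.11+ for the stdlib tomllib; the function name and return shape are illustrative:

import tomllib

def toml_is_valid(path):
    """Parse a TOML file; return (True, None) on success or (False, message) on failure."""
    try:
        with open(path, "rb") as f:
            tomllib.load(f)
        return True, None
    except (tomllib.TOMLDecodeError, OSError) as e:
        # TOMLDecodeError messages usually include the offending line number.
        return False, str(e)

# ok, err = toml_is_valid("frontend/public/locales/en-GB/translation.toml")
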
@@ -5,16 +5,19 @@ Compares language files against en-GB golden truth file.
"""

import json
import os
import sys
from pathlib import Path
from typing import Dict, List, Set, Tuple
from typing import Dict, List, Set
import argparse
import tomllib


class TranslationAnalyzer:
def __init__(self, locales_dir: str = "frontend/public/locales", ignore_file: str = "scripts/ignore_translation.toml"):
def __init__(
self,
locales_dir: str = "frontend/public/locales",
ignore_file: str = "scripts/ignore_translation.toml",
):
self.locales_dir = Path(locales_dir)
self.golden_truth_file = self.locales_dir / "en-GB" / "translation.toml"
self.golden_truth = self._load_translation_file(self.golden_truth_file)
@@ -24,7 +27,7 @@ class TranslationAnalyzer:
def _load_translation_file(self, file_path: Path) -> Dict:
"""Load TOML translation file with error handling."""
try:
with open(file_path, 'rb') as f:
with open(file_path, "rb") as f:
return tomllib.load(f)
except FileNotFoundError:
print(f"Error: File not found: {file_path}")
@@ -39,17 +42,23 @@ class TranslationAnalyzer:
return {}

try:
with open(self.ignore_file, 'rb') as f:
with open(self.ignore_file, "rb") as f:
ignore_data = tomllib.load(f)

# Convert lists to sets for faster lookup
return {lang: set(patterns) for lang, data in ignore_data.items()
for patterns in [data.get('ignore', [])] if patterns}
return {
lang: set(patterns)
for lang, data in ignore_data.items()
for patterns in [data.get("ignore", [])]
if patterns
}
except Exception as e:
print(f"Warning: Could not load ignore file {self.ignore_file}: {e}")
return {}

def _flatten_dict(self, d: Dict, parent_key: str = '', separator: str = '.') -> Dict[str, str]:
def _flatten_dict(
self, d: Dict, parent_key: str = "", separator: str = "."
) -> Dict[str, str]:
"""Flatten nested dictionary into dot-notation keys."""
items = []
for k, v in d.items():
@@ -80,7 +89,7 @@ class TranslationAnalyzer:
missing = set(golden_flat.keys()) - set(target_flat.keys())

# Filter out ignored keys
lang_code = target_file.parent.name.replace('-', '_')
lang_code = target_file.parent.name.replace("-", "_")
ignore_set = self.ignore_patterns.get(lang_code, set())
return missing - ignore_set

@@ -91,7 +100,7 @@ class TranslationAnalyzer:
golden_flat = self._flatten_dict(self.golden_truth)
target_flat = self._flatten_dict(target_data)

lang_code = target_file.parent.name.replace('-', '_')
lang_code = target_file.parent.name.replace("-", "_")
ignore_set = self.ignore_patterns.get(lang_code, set())

untranslated = set()
@@ -101,8 +110,14 @@ class TranslationAnalyzer:
golden_value = golden_flat[key]

# Check if marked as [UNTRANSLATED] or identical to en-GB
if (isinstance(target_value, str) and target_value.startswith("[UNTRANSLATED]")) or \
(golden_value == target_value and key not in ignore_set and not self._is_expected_identical(key, golden_value)):
if (
isinstance(target_value, str)
and target_value.startswith("[UNTRANSLATED]")
) or (
golden_value == target_value
and key not in ignore_set
and not self._is_expected_identical(key, golden_value)
):
untranslated.add(key)

return untranslated
@@ -110,14 +125,10 @@ class TranslationAnalyzer:
def _is_expected_identical(self, key: str, value: str) -> bool:
"""Check if a key-value pair is expected to be identical across languages."""
# Keys that should be identical across languages
identical_patterns = [
'language.direction',
'true', 'false',
'unknown'
]
identical_patterns = ["language.direction", "true", "false", "unknown"]

# Values that are often identical (numbers, symbols, etc.)
if value.strip() in ['ltr', 'rtl', 'True', 'False']:
if value.strip() in ["ltr", "rtl", "True", "False"]:
return True

# Check for patterns
@@ -149,7 +160,7 @@ class TranslationAnalyzer:
target_flat = self._flatten_dict(target_data)

# Calculate completion rate excluding ignored keys
lang_code = target_file.parent.name.replace('-', '_')
lang_code = target_file.parent.name.replace("-", "_")
ignore_set = self.ignore_patterns.get(lang_code, set())

relevant_keys = set(golden_flat.keys()) - ignore_set
@@ -161,22 +172,26 @@ class TranslationAnalyzer:
if key in target_flat:
value = target_flat[key]
if not (isinstance(value, str) and value.startswith("[UNTRANSLATED]")):
if key not in untranslated: # Not identical to en-GB (unless expected)
if (
key not in untranslated
): # Not identical to en-GB (unless expected)
properly_translated += 1

completion_rate = (properly_translated / total_keys) * 100 if total_keys > 0 else 0
completion_rate = (
(properly_translated / total_keys) * 100 if total_keys > 0 else 0
)

return {
'language': lang_code,
'file': target_file,
'missing_count': len(missing),
'missing_keys': sorted(missing),
'untranslated_count': len(untranslated),
'untranslated_keys': sorted(untranslated),
'extra_count': len(extra),
'extra_keys': sorted(extra),
'total_keys': total_keys,
'completion_rate': completion_rate
"language": lang_code,
"file": target_file,
"missing_count": len(missing),
"missing_keys": sorted(missing),
"untranslated_count": len(untranslated),
"untranslated_keys": sorted(untranslated),
"extra_count": len(extra),
"extra_keys": sorted(extra),
"total_keys": total_keys,
"completion_rate": completion_rate,
}

def analyze_all_files(self) -> List[Dict]:
@@ -184,24 +199,38 @@ class TranslationAnalyzer:
results = []
for file_path in self.get_all_language_files():
results.append(self.analyze_file(file_path))
return sorted(results, key=lambda x: x['language'])
return sorted(results, key=lambda x: x["language"])


def main():
parser = argparse.ArgumentParser(description='Analyze translation files against en-GB golden truth')
parser.add_argument('--locales-dir', default='frontend/public/locales',
help='Path to locales directory')
parser.add_argument('--ignore-file', default='scripts/ignore_translation.toml',
help='Path to ignore patterns TOML file')
parser.add_argument('--language', help='Analyze specific language only')
parser.add_argument('--missing-only', action='store_true',
help='Show only missing translations')
parser.add_argument('--untranslated-only', action='store_true',
help='Show only untranslated entries')
parser.add_argument('--summary', action='store_true',
help='Show summary statistics only')
parser.add_argument('--format', choices=['text', 'json'], default='text',
help='Output format')
parser = argparse.ArgumentParser(
description="Analyze translation files against en-GB golden truth"
)
parser.add_argument(
"--locales-dir",
default="frontend/public/locales",
help="Path to locales directory",
)
parser.add_argument(
"--ignore-file",
default="scripts/ignore_translation.toml",
help="Path to ignore patterns TOML file",
)
parser.add_argument("--language", help="Analyze specific language only")
parser.add_argument(
"--missing-only", action="store_true", help="Show only missing translations"
)
parser.add_argument(
"--untranslated-only",
action="store_true",
help="Show only untranslated entries",
)
parser.add_argument(
"--summary", action="store_true", help="Show summary statistics only"
)
parser.add_argument(
"--format", choices=["text", "json"], default="text", help="Output format"
)

args = parser.parse_args()

@@ -220,14 +249,14 @@ def main():
else:
results = analyzer.analyze_all_files()

if args.format == 'json':
if args.format == "json":
print(json.dumps(results, indent=2, default=str))
return

# Text format output
for result in results:
lang = result['language']
print(f"\n{'='*60}")
lang = result["language"]
print(f"\n{'=' * 60}")
print(f"Language: {lang}")
print(f"File: {result['file']}")
print(f"Completion Rate: {result['completion_rate']:.1f}%")
@@ -236,42 +265,48 @@ def main():
if not args.summary:
if not args.untranslated_only:
print(f"\nMissing Translations ({result['missing_count']}):")
for key in result['missing_keys'][:10]: # Show first 10
for key in result["missing_keys"][:10]: # Show first 10
print(f" - {key}")
if len(result['missing_keys']) > 10:
if len(result["missing_keys"]) > 10:
print(f" ... and {len(result['missing_keys']) - 10} more")

if not args.missing_only:
print(f"\nUntranslated Entries ({result['untranslated_count']}):")
for key in result['untranslated_keys'][:10]: # Show first 10
for key in result["untranslated_keys"][:10]: # Show first 10
print(f" - {key}")
if len(result['untranslated_keys']) > 10:
if len(result["untranslated_keys"]) > 10:
print(f" ... and {len(result['untranslated_keys']) - 10} more")

if result['extra_count'] > 0:
if result["extra_count"] > 0:
print(f"\nExtra Keys Not in en-GB ({result['extra_count']}):")
for key in result['extra_keys'][:5]:
for key in result["extra_keys"][:5]:
print(f" - {key}")
if len(result['extra_keys']) > 5:
if len(result["extra_keys"]) > 5:
print(f" ... and {len(result['extra_keys']) - 5} more")

print(f"\n{'='*60}")
print(f"\n{'=' * 60}")
print("SUMMARY")
print(f"{'='*60}")
avg_completion = sum(r['completion_rate'] for r in results) / len(results) if results else 0
print(f"{'=' * 60}")
avg_completion = (
sum(r["completion_rate"] for r in results) / len(results) if results else 0
)
print(f"Average Completion Rate: {avg_completion:.1f}%")
print(f"Languages Analyzed: {len(results)}")

# Top languages by completion
sorted_by_completion = sorted(results, key=lambda x: x['completion_rate'], reverse=True)
print(f"\nTop 5 Most Complete Languages:")
sorted_by_completion = sorted(
results, key=lambda x: x["completion_rate"], reverse=True
)
print("\nTop 5 Most Complete Languages:")
for result in sorted_by_completion[:5]:
print(f" {result['language']}: {result['completion_rate']:.1f}%")

print(f"\nBottom 5 Languages Needing Attention:")
print("\nBottom 5 Languages Needing Attention:")
for result in sorted_by_completion[-5:]:
print(f" {result['language']}: {result['completion_rate']:.1f}% ({result['missing_count']} missing, {result['untranslated_count']} untranslated)")
print(
f" {result['language']}: {result['completion_rate']:.1f}% ({result['missing_count']} missing, {result['untranslated_count']} untranslated)"
)


if __name__ == "__main__":
main()
main()

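The analyzer above leans on a recursive dot-notation flattener; a self-contained sketch of that shared idiom follows (illustrative, matching the _flatten_dict hunks rather than quoting them):

def flatten_dict(d: dict, parent_key: str = "", sep: str = ".") -> dict:
    # Nested TOML tables become "section.subsection.key" paths.
    items = []
    for k, v in d.items():
        new_key = f"{parent_key}{sep}{k}" if parent_key else k
        if isinstance(v, dict):
            items.extend(flatten_dict(v, new_key, sep).items())
        else:
            items.append((new_key, v))
    return dict(items)

# flatten_dict({"home": {"title": "Hi"}}) -> {"home.title": "Hi"}
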
@@ -7,10 +7,9 @@ TOML format only.
"""

import json
import os
import sys
from pathlib import Path
from typing import Dict, List, Set, Tuple, Any
from typing import Dict, List, Set, Any
import argparse
import shutil
from datetime import datetime
@@ -20,7 +19,11 @@ import tomli_w


class TranslationMerger:
def __init__(self, locales_dir: str = "frontend/public/locales", ignore_file: str = "scripts/ignore_translation.toml"):
def __init__(
self,
locales_dir: str = "frontend/public/locales",
ignore_file: str = "scripts/ignore_translation.toml",
):
self.locales_dir = Path(locales_dir)
self.golden_truth_file = self.locales_dir / "en-GB" / "translation.toml"
self.golden_truth = self._load_translation_file(self.golden_truth_file)
@@ -30,7 +33,7 @@ class TranslationMerger:
def _load_translation_file(self, file_path: Path) -> Dict:
"""Load TOML translation file."""
try:
with open(file_path, 'rb') as f:
with open(file_path, "rb") as f:
return tomllib.load(f)
except FileNotFoundError:
print(f"Error: File not found: {file_path}")
@@ -39,14 +42,18 @@ class TranslationMerger:
print(f"Error: Invalid file {file_path}: {e}")
sys.exit(1)

def _save_translation_file(self, data: Dict, file_path: Path, backup: bool = False) -> None:
def _save_translation_file(
self, data: Dict, file_path: Path, backup: bool = False
) -> None:
"""Save TOML translation file with backup option."""
if backup and file_path.exists():
backup_path = file_path.with_suffix(f'.backup.{datetime.now().strftime("%Y%m%d_%H%M%S")}.toml')
backup_path = file_path.with_suffix(
f".backup.{datetime.now().strftime('%Y%m%d_%H%M%S')}.toml"
)
shutil.copy2(file_path, backup_path)
print(f"Backup created: {backup_path}")

with open(file_path, 'wb') as f:
with open(file_path, "wb") as f:
tomli_w.dump(data, f)

def _load_ignore_patterns(self) -> Dict[str, Set[str]]:
@@ -55,18 +62,20 @@ class TranslationMerger:
return {}

try:
with open(self.ignore_file, 'rb') as f:
with open(self.ignore_file, "rb") as f:
ignore_data = tomllib.load(f)

# Convert to sets for faster lookup
return {lang: set(data.get('ignore', [])) for lang, data in ignore_data.items()}
return {
lang: set(data.get("ignore", [])) for lang, data in ignore_data.items()
}
except Exception as e:
print(f"Warning: Could not load ignore file {self.ignore_file}: {e}")
return {}

def _get_nested_value(self, data: Dict, key_path: str) -> Any:
"""Get value from nested dict using dot notation."""
keys = key_path.split('.')
keys = key_path.split(".")
current = data
for key in keys:
if isinstance(current, dict) and key in current:
@@ -77,7 +86,7 @@ class TranslationMerger:

def _set_nested_value(self, data: Dict, key_path: str, value: Any) -> None:
"""Set value in nested dict using dot notation."""
keys = key_path.split('.')
keys = key_path.split(".")
current = data
for key in keys[:-1]:
if key not in current:
@@ -85,12 +94,16 @@ class TranslationMerger:
elif not isinstance(current[key], dict):
# If the current value is not a dict, we can't nest into it
# This handles cases where a key exists as a string but we need to make it a dict
print(f"Warning: Converting non-dict value at '{key}' to dict to allow nesting")
print(
f"Warning: Converting non-dict value at '{key}' to dict to allow nesting"
)
current[key] = {}
current = current[key]
current[keys[-1]] = value

def _flatten_dict(self, d: Dict, parent_key: str = '', separator: str = '.') -> Dict[str, Any]:
def _flatten_dict(
self, d: Dict, parent_key: str = "", separator: str = "."
) -> Dict[str, Any]:
"""Flatten nested dictionary into dot-notation keys."""
items = []
for k, v in d.items():
@@ -103,7 +116,7 @@ class TranslationMerger:

def get_missing_keys(self, target_file: Path) -> List[str]:
"""Get list of missing keys in target file."""
lang_code = target_file.parent.name.replace('-', '_')
lang_code = target_file.parent.name.replace("-", "_")
ignore_set = self.ignore_patterns.get(lang_code, set())

if not target_file.exists():
@@ -117,7 +130,9 @@ class TranslationMerger:
missing = set(golden_flat.keys()) - set(target_flat.keys())
return sorted(missing - ignore_set)

def add_missing_translations(self, target_file: Path, keys_to_add: List[str] = None) -> Dict:
def add_missing_translations(
self, target_file: Path, keys_to_add: List[str] = None
) -> Dict:
"""Add missing translations from en-GB to target file."""
if not target_file.exists():
target_data = {}
@@ -136,12 +151,14 @@ class TranslationMerger:
added_count += 1

return {
'added_count': added_count,
'missing_keys': missing_keys,
'data': target_data
"added_count": added_count,
"missing_keys": missing_keys,
"data": target_data,
}

def extract_untranslated_entries(self, target_file: Path, output_file: Path = None) -> Dict:
def extract_untranslated_entries(
self, target_file: Path, output_file: Path = None
) -> Dict:
"""Extract entries marked as untranslated or identical to en-GB for AI translation."""
if not target_file.exists():
print(f"Error: Target file does not exist: {target_file}")
@@ -160,20 +177,22 @@ class TranslationMerger:
# Check if marked as untranslated
if isinstance(value, str) and value.startswith("[UNTRANSLATED]"):
untranslated_entries[key] = {
'original': golden_value,
'current': value,
'reason': 'marked_untranslated'
"original": golden_value,
"current": value,
"reason": "marked_untranslated",
}
# Check if identical to golden (and should be translated)
elif value == golden_value and not self._is_expected_identical(key, value):
elif value == golden_value and not self._is_expected_identical(
key, value
):
untranslated_entries[key] = {
'original': golden_value,
'current': value,
'reason': 'identical_to_english'
"original": golden_value,
"current": value,
"reason": "identical_to_english",
}

if output_file:
with open(output_file, 'w', encoding='utf-8') as f:
with open(output_file, "w", encoding="utf-8") as f:
json.dump(untranslated_entries, f, indent=2, ensure_ascii=False)

return untranslated_entries
@@ -181,10 +200,10 @@ class TranslationMerger:
def _is_expected_identical(self, key: str, value: str) -> bool:
"""Check if a key-value pair is expected to be identical across languages."""
identical_patterns = [
'language.direction',
"language.direction",
]

if str(value).strip() in ['ltr', 'rtl', 'True', 'False', 'true', 'false']:
if str(value).strip() in ["ltr", "rtl", "True", "False", "true", "false"]:
return True

for pattern in identical_patterns:
@@ -193,12 +212,13 @@ class TranslationMerger:

return False

def apply_translations(self, target_file: Path, translations: Dict[str, str],
backup: bool = False) -> Dict:
def apply_translations(
self, target_file: Path, translations: Dict[str, str], backup: bool = False
) -> Dict:
"""Apply provided translations to target file."""
if not target_file.exists():
print(f"Error: Target file does not exist: {target_file}")
return {'success': False, 'error': 'File not found'}
return {"success": False, "error": "File not found"}

target_data = self._load_translation_file(target_file)
applied_count = 0
@@ -219,10 +239,10 @@ class TranslationMerger:
self._save_translation_file(target_data, target_file, backup)

return {
'success': True,
'applied_count': applied_count,
'errors': errors,
'data': target_data
"success": True,
"applied_count": applied_count,
"errors": errors,
"data": target_data,
}

def create_translation_template(self, target_file: Path, output_file: Path) -> None:
@@ -230,25 +250,25 @@ class TranslationMerger:
untranslated = self.extract_untranslated_entries(target_file)

template = {
'metadata': {
'source_language': 'en-GB',
'target_language': target_file.parent.name,
'total_entries': len(untranslated),
'created_at': datetime.now().isoformat(),
'instructions': 'Translate the "original" values to the target language. Keep the same keys.'
"metadata": {
"source_language": "en-GB",
"target_language": target_file.parent.name,
"total_entries": len(untranslated),
"created_at": datetime.now().isoformat(),
"instructions": 'Translate the "original" values to the target language. Keep the same keys.',
},
'translations': {}
"translations": {},
}

for key, entry in untranslated.items():
template['translations'][key] = {
'original': entry['original'],
'translated': '', # AI should fill this
'context': self._get_context_for_key(key),
'reason': entry['reason']
template["translations"][key] = {
"original": entry["original"],
"translated": "", # AI should fill this
"context": self._get_context_for_key(key),
"reason": entry["reason"],
}

with open(output_file, 'w', encoding='utf-8') as f:
with open(output_file, "w", encoding="utf-8") as f:
json.dump(template, f, indent=2, ensure_ascii=False)

print(f"Translation template created: {output_file}")
@@ -256,7 +276,7 @@ class TranslationMerger:

def _get_context_for_key(self, key: str) -> str:
"""Get context information for a translation key."""
parts = key.split('.')
parts = key.split(".")
if len(parts) >= 2:
return f"Section: {parts[0]}, Property: {parts[-1]}"
return f"Property: {parts[-1]}"
@@ -264,33 +284,55 @@ class TranslationMerger:

def main():
parser = argparse.ArgumentParser(
description='Merge and manage translation files',
epilog='Works with TOML translation files.'
description="Merge and manage translation files",
epilog="Works with TOML translation files.",
)
parser.add_argument('--locales-dir', default='frontend/public/locales',
help='Path to locales directory')
parser.add_argument('--ignore-file', default='scripts/ignore_translation.toml',
help='Path to ignore patterns TOML file')
parser.add_argument('language', help='Target language code (e.g., fr-FR)')
parser.add_argument(
"--locales-dir",
default="frontend/public/locales",
help="Path to locales directory",
)
parser.add_argument(
"--ignore-file",
default="scripts/ignore_translation.toml",
help="Path to ignore patterns TOML file",
)
parser.add_argument("language", help="Target language code (e.g., fr-FR)")

subparsers = parser.add_subparsers(dest='command', help='Available commands')
subparsers = parser.add_subparsers(dest="command", help="Available commands")

# Add missing command
add_parser = subparsers.add_parser('add-missing', help='Add missing translations from en-GB')
add_parser.add_argument('--backup', action='store_true', help='Create backup before modifying files')
add_parser = subparsers.add_parser(
"add-missing", help="Add missing translations from en-GB"
)
add_parser.add_argument(
"--backup", action="store_true", help="Create backup before modifying files"
)

# Extract untranslated command
extract_parser = subparsers.add_parser('extract-untranslated', help='Extract untranslated entries')
extract_parser.add_argument('--output', help='Output file path')
extract_parser = subparsers.add_parser(
"extract-untranslated", help="Extract untranslated entries"
)
extract_parser.add_argument("--output", help="Output file path")

# Create template command
template_parser = subparsers.add_parser('create-template', help='Create AI translation template')
template_parser.add_argument('--output', required=True, help='Output template file path')
template_parser = subparsers.add_parser(
"create-template", help="Create AI translation template"
)
template_parser.add_argument(
"--output", required=True, help="Output template file path"
)

# Apply translations command
apply_parser = subparsers.add_parser('apply-translations', help='Apply translations from JSON file')
apply_parser.add_argument('--translations-file', required=True, help='JSON file with translations')
apply_parser.add_argument('--backup', action='store_true', help='Create backup before modifying files')
apply_parser = subparsers.add_parser(
"apply-translations", help="Apply translations from JSON file"
)
apply_parser.add_argument(
"--translations-file", required=True, help="JSON file with translations"
)
apply_parser.add_argument(
"--backup", action="store_true", help="Create backup before modifying files"
)

args = parser.parse_args()

@@ -304,44 +346,53 @@ def main():
lang_dir = Path(args.locales_dir) / args.language
target_file = lang_dir / "translation.toml"

if args.command == 'add-missing':
if args.command == "add-missing":
print(f"Adding missing translations to {args.language}...")
result = merger.add_missing_translations(target_file)

merger._save_translation_file(result['data'], target_file, backup=args.backup)
merger._save_translation_file(result["data"], target_file, backup=args.backup)
print(f"Added {result['added_count']} missing translations")

elif args.command == 'extract-untranslated':
output_file = Path(args.output) if args.output else target_file.with_suffix('.untranslated.json')
elif args.command == "extract-untranslated":
output_file = (
Path(args.output)
if args.output
else target_file.with_suffix(".untranslated.json")
)
untranslated = merger.extract_untranslated_entries(target_file, output_file)
print(f"Extracted {len(untranslated)} untranslated entries to {output_file}")

elif args.command == 'create-template':
elif args.command == "create-template":
output_file = Path(args.output)
merger.create_translation_template(target_file, output_file)

elif args.command == 'apply-translations':
with open(args.translations_file, 'r', encoding='utf-8') as f:
elif args.command == "apply-translations":
with open(args.translations_file, "r", encoding="utf-8") as f:
translations_data = json.load(f)

# Extract translations from template format or simple dict
if 'translations' in translations_data:
translations = {k: v['translated'] for k, v in translations_data['translations'].items()
if v.get('translated')}
if "translations" in translations_data:
translations = {
k: v["translated"]
for k, v in translations_data["translations"].items()
if v.get("translated")
}
else:
translations = translations_data

result = merger.apply_translations(target_file, translations, backup=args.backup)
result = merger.apply_translations(
target_file, translations, backup=args.backup
)

if result['success']:
if result["success"]:
print(f"Applied {result['applied_count']} translations")
if result['errors']:
if result["errors"]:
print(f"Errors: {len(result['errors'])}")
for error in result['errors'][:5]:
for error in result["errors"][:5]:
print(f" - {error}")
else:
print(f"Failed: {result.get('error', 'Unknown error')}")


if __name__ == "__main__":
main()
main()

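The merger writes through a dot-notation setter; a minimal sketch of that idiom (illustrative; the shipped _set_nested_value also warns before converting a non-dict value, as the hunk above shows):

def set_nested_value(data: dict, key_path: str, value) -> None:
    # Walk or create intermediate tables, then assign the leaf.
    keys = key_path.split(".")
    current = data
    for key in keys[:-1]:
        if not isinstance(current.get(key), dict):
            current[key] = {}  # the real script prints a warning first
        current = current[key]
    current[keys[-1]] = value

# set_nested_value({}, "home.title", "Hi") -> {"home": {"title": "Hi"}}
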
@@ -16,12 +16,12 @@ Usage:
import json
import sys
from pathlib import Path
from typing import Dict, List, Set
from typing import Dict, Set
import argparse
import tomllib # Python 3.11+ (stdlib)


def get_all_keys(d: dict, parent_key: str = '', sep: str = '.') -> Set[str]:
def get_all_keys(d: dict, parent_key: str = "", sep: str = ".") -> Set[str]:
"""Get all keys from nested dict as dot-notation paths."""
keys = set()
for k, v in d.items():
@@ -35,7 +35,7 @@ def get_all_keys(d: dict, parent_key: str = '', sep: str = '.') -> Set[str]:
def validate_translation_file(file_path: Path) -> tuple[bool, str]:
"""Validate that a file contains valid TOML."""
try:
with open(file_path, 'rb') as f:
with open(file_path, "rb") as f:
tomllib.load(f)
return True, "Valid TOML"
except Exception as e:
@@ -43,95 +43,85 @@ def validate_translation_file(file_path: Path) -> tuple[bool, str]:


def validate_structure(
en_gb_keys: Set[str],
lang_keys: Set[str],
lang_code: str
en_gb_keys: Set[str], lang_keys: Set[str], lang_code: str
) -> Dict:
"""Compare structure between en-GB and target language."""
missing_keys = en_gb_keys - lang_keys
extra_keys = lang_keys - en_gb_keys

return {
'language': lang_code,
'missing_keys': sorted(missing_keys),
'extra_keys': sorted(extra_keys),
'total_keys': len(lang_keys),
'expected_keys': len(en_gb_keys),
'missing_count': len(missing_keys),
'extra_count': len(extra_keys)
"language": lang_code,
"missing_keys": sorted(missing_keys),
"extra_keys": sorted(extra_keys),
"total_keys": len(lang_keys),
"expected_keys": len(en_gb_keys),
"missing_count": len(missing_keys),
"extra_count": len(extra_keys),
}


def print_validation_result(result: Dict, verbose: bool = False):
"""Print validation results in readable format."""
lang = result['language']
lang = result["language"]

print(f"\n{'='*100}")
print(f"\n{'=' * 100}")
print(f"Language: {lang}")
print(f"{'='*100}")
print(f"{'=' * 100}")
print(f" Total keys: {result['total_keys']}")
print(f" Expected keys (en-GB): {result['expected_keys']}")
print(f" Missing keys: {result['missing_count']}")
print(f" Extra keys: {result['extra_count']}")

if result['missing_count'] == 0 and result['extra_count'] == 0:
print(f" ✅ Structure matches en-GB perfectly!")
if result["missing_count"] == 0 and result["extra_count"] == 0:
print(" ✅ Structure matches en-GB perfectly!")
else:
if result['missing_count'] > 0:
if result["missing_count"] > 0:
print(f"\n ⚠️ Missing {result['missing_count']} key(s):")
if verbose or result['missing_count'] <= 20:
for key in result['missing_keys'][:50]:
if verbose or result["missing_count"] <= 20:
for key in result["missing_keys"][:50]:
print(f" - {key}")
if result['missing_count'] > 50:
if result["missing_count"] > 50:
print(f" ... and {result['missing_count'] - 50} more")
else:
print(f" (use --verbose to see all)")
print(" (use --verbose to see all)")

if result['extra_count'] > 0:
if result["extra_count"] > 0:
print(f"\n ⚠️ Extra {result['extra_count']} key(s) not in en-GB:")
if verbose or result['extra_count'] <= 20:
for key in result['extra_keys'][:50]:
if verbose or result["extra_count"] <= 20:
for key in result["extra_keys"][:50]:
print(f" - {key}")
if result['extra_count'] > 50:
if result["extra_count"] > 50:
print(f" ... and {result['extra_count'] - 50} more")
else:
print(f" (use --verbose to see all)")
print(" (use --verbose to see all)")

print("-" * 100)


def load_translation_file(file_path: Path) -> dict:
"""Load TOML translation file."""
with open(file_path, 'rb') as f:
with open(file_path, "rb") as f:
return tomllib.load(f)


def main():
parser = argparse.ArgumentParser(
description='Validate translation TOML structure'
parser = argparse.ArgumentParser(description="Validate translation TOML structure")
parser.add_argument(
"--language",
help="Specific language code to validate (e.g., es-ES)",
default=None,
)
parser.add_argument(
'--language',
help='Specific language code to validate (e.g., es-ES)',
default=None
)
parser.add_argument(
'--verbose', '-v',
action='store_true',
help='Show all missing/extra keys'
)
parser.add_argument(
'--json',
action='store_true',
help='Output results as JSON'
"--verbose", "-v", action="store_true", help="Show all missing/extra keys"
)
parser.add_argument("--json", action="store_true", help="Output results as JSON")

args = parser.parse_args()

# Define paths
locales_dir = Path('frontend/public/locales')
en_gb_path = locales_dir / 'en-GB' / 'translation.toml'
file_ext = '.toml'
locales_dir = Path("frontend/public/locales")
en_gb_path = locales_dir / "en-GB" / "translation.toml"
file_ext = ".toml"

if not en_gb_path.exists():
print(f"❌ Error: en-GB translation file not found at {en_gb_path}")
@@ -155,8 +145,8 @@ def main():
# Validate all languages except en-GB
languages = []
for d in locales_dir.iterdir():
if d.is_dir() and d.name != 'en-GB':
if (d / 'translation.toml').exists():
if d.is_dir() and d.name != "en-GB":
if (d / "translation.toml").exists():
languages.append(d.name)

results = []
@@ -164,7 +154,7 @@ def main():

# Validate each language
for lang_code in sorted(languages):
lang_path = locales_dir / lang_code / 'translation.toml'
lang_path = locales_dir / lang_code / "translation.toml"

if not lang_path.exists():
print(f"⚠️ Warning: {lang_code}/translation.toml not found, skipping")
@@ -173,11 +163,9 @@ def main():
# First check if file is valid
is_valid, message = validate_translation_file(lang_path)
if not is_valid:
json_errors.append({
'language': lang_code,
'file': str(lang_path),
'error': message
})
json_errors.append(
{"language": lang_code, "file": str(lang_path), "error": message}
)
continue

# Load and compare structure
@@ -189,10 +177,7 @@ def main():

# Output results
if args.json:
output = {
'json_errors': json_errors,
'structure_validation': results
}
output = {"json_errors": json_errors, "structure_validation": results}
print(json.dumps(output, indent=2, ensure_ascii=False))
else:
# Print syntax errors first
@@ -210,11 +195,13 @@ def main():
print("\n📊 Structure Validation Summary:")
print(f" Languages validated: {len(results)}")

perfect = sum(1 for r in results if r['missing_count'] == 0 and r['extra_count'] == 0)
perfect = sum(
1 for r in results if r["missing_count"] == 0 and r["extra_count"] == 0
)
print(f" Perfect matches: {perfect}/{len(results)}")

total_missing = sum(r['missing_count'] for r in results)
total_extra = sum(r['extra_count'] for r in results)
total_missing = sum(r["missing_count"] for r in results)
total_extra = sum(r["extra_count"] for r in results)
print(f" Total missing keys: {total_missing}")
print(f" Total extra keys: {total_extra}")

@@ -226,10 +213,10 @@ def main():

# Exit with error code if issues found
has_issues = len(json_errors) > 0 or any(
r['missing_count'] > 0 or r['extra_count'] > 0 for r in results
r["missing_count"] > 0 or r["extra_count"] > 0 for r in results
)
sys.exit(1 if has_issues else 0)


if __name__ == '__main__':
if __name__ == "__main__":
main()

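The structural check above reduces to plain set differences over the flattened key sets; a tiny worked example (illustrative values):

en_gb_keys = {"home.title", "home.desc", "pdf.merge"}
lang_keys = {"home.title", "pdf.merge", "pdf.rotate"}

missing = en_gb_keys - lang_keys  # {"home.desc"}: still needs translating
extra = lang_keys - en_gb_keys    # {"pdf.rotate"}: stale key not in en-GB
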
@@ -13,7 +13,7 @@ import json
import re
import sys
from pathlib import Path
from typing import Dict, List, Set, Tuple
from typing import Dict, List, Set
import argparse
import tomllib # Python 3.11+ (stdlib)

@@ -22,10 +22,10 @@ def find_placeholders(text: str) -> Set[str]:
"""Find all placeholders in text like {n}, {{var}}, {0}, etc."""
if not isinstance(text, str):
return set()
return set(re.findall(r'\{\{?[^}]+\}\}?', text))
return set(re.findall(r"\{\{?[^}]+\}\}?", text))


def flatten_dict(d: dict, parent_key: str = '', sep: str = '.') -> Dict[str, str]:
def flatten_dict(d: dict, parent_key: str = "", sep: str = ".") -> Dict[str, str]:
"""Flatten nested dict to dot-notation keys."""
items = []
for k, v in d.items():
@@ -38,9 +38,7 @@ def flatten_dict(d: dict, parent_key: str = '', sep: str = '.') -> Dict[str, str


def validate_language(
en_gb_flat: Dict[str, str],
lang_flat: Dict[str, str],
lang_code: str
en_gb_flat: Dict[str, str], lang_flat: Dict[str, str], lang_code: str
) -> List[Dict]:
"""Validate placeholders for a language against en-GB."""
issues = []
@@ -57,12 +55,12 @@ def validate_language(
extra = lang_placeholders - en_placeholders

issue = {
'language': lang_code,
'key': key,
'missing': missing,
'extra': extra,
'en_text': en_gb_flat[key],
'lang_text': lang_flat[key]
"language": lang_code,
"key": key,
"missing": missing,
"extra": extra,
"en_text": en_gb_flat[key],
"lang_text": lang_flat[key],
}
issues.append(issue)

@@ -82,9 +80,9 @@ def print_issues(issues: List[Dict], verbose: bool = False):
print(f"\n{i}. Language: {issue['language']}")
print(f" Key: {issue['key']}")

if issue['missing']:
if issue["missing"]:
print(f" ⚠️ MISSING placeholders: {issue['missing']}")
if issue['extra']:
if issue["extra"]:
print(f" ⚠️ EXTRA placeholders: {issue['extra']}")

if verbose:
@@ -96,37 +94,34 @@ def print_issues(issues: List[Dict], verbose: bool = False):

def main():
parser = argparse.ArgumentParser(
description='Validate translation placeholder consistency'
description="Validate translation placeholder consistency"
)
parser.add_argument(
'--language',
help='Specific language code to validate (e.g., es-ES)',
default=None
"--language",
help="Specific language code to validate (e.g., es-ES)",
default=None,
)
parser.add_argument(
'--verbose', '-v',
action='store_true',
help='Show full text samples for each issue'
)
parser.add_argument(
'--json',
action='store_true',
help='Output results as JSON'
"--verbose",
"-v",
action="store_true",
help="Show full text samples for each issue",
)
parser.add_argument("--json", action="store_true", help="Output results as JSON")

args = parser.parse_args()

# Define paths
locales_dir = Path('frontend/public/locales')
en_gb_path = locales_dir / 'en-GB' / 'translation.toml'
file_ext = '.toml'
locales_dir = Path("frontend/public/locales")
en_gb_path = locales_dir / "en-GB" / "translation.toml"
file_ext = ".toml"

if not en_gb_path.exists():
print(f"❌ Error: en-GB translation file not found at {en_gb_path}")
sys.exit(1)

# Load en-GB (source of truth)
with open(en_gb_path, 'rb') as f:
with open(en_gb_path, "rb") as f:
en_gb = tomllib.load(f)

en_gb_flat = flatten_dict(en_gb)
@@ -138,22 +133,22 @@ def main():
# Validate all languages except en-GB
languages = []
for d in locales_dir.iterdir():
if d.is_dir() and d.name != 'en-GB':
if (d / 'translation.toml').exists():
if d.is_dir() and d.name != "en-GB":
if (d / "translation.toml").exists():
languages.append(d.name)

all_issues = []

# Validate each language
for lang_code in sorted(languages):
lang_path = locales_dir / lang_code / 'translation.toml'
lang_path = locales_dir / lang_code / "translation.toml"

if not lang_path.exists():
print(f"⚠️ Warning: {lang_code}/translation.toml not found, skipping")
continue

# Load language file
with open(lang_path, 'rb') as f:
with open(lang_path, "rb") as f:
lang_data = tomllib.load(f)

lang_flat = flatten_dict(lang_data)
@@ -168,19 +163,19 @@ def main():
# Group by language
by_language = {}
for issue in all_issues:
lang = issue['language']
lang = issue["language"]
if lang not in by_language:
by_language[lang] = []
by_language[lang].append(issue)

print(f"📊 Validation Summary:")
print("📊 Validation Summary:")
print(f" Total issues: {len(all_issues)}")
print(f" Languages with issues: {len(by_language)}\n")

for lang in sorted(by_language.keys()):
print(f"\n{'='*100}")
print(f"\n{'=' * 100}")
print(f"Language: {lang} ({len(by_language[lang])} issue(s))")
print(f"{'='*100}")
print(f"{'=' * 100}")
print_issues(by_language[lang], verbose=args.verbose)
else:
print("✅ All translations have correct placeholders!")
@@ -189,5 +184,5 @@ def main():
sys.exit(1 if all_issues else 0)


if __name__ == '__main__':
if __name__ == "__main__":
main()

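The placeholder check hinges on the regex shown above; a short sketch of what it matches (illustrative input):

import re

def find_placeholders(text: str) -> set:
    # Accepts single-brace {n} and double-brace {{var}} forms alike.
    return set(re.findall(r"\{\{?[^}]+\}\}?", text))

find_placeholders("Page {n} of {{total}}")  # {"{n}", "{{total}}"}
# A translation that drops {n} or renames {{total}} is reported as an issue.
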
@@ -55,14 +55,33 @@ class GlyphBuildResult:


def parse_args() -> argparse.Namespace:
parser = argparse.ArgumentParser(description="Synthesize fonts from Type3 glyph JSON.")
parser.add_argument("--input", required=True, help="Path to glyph JSON emitted by the backend")
parser.add_argument("--otf-output", required=True, help="Destination path for the CFF/OTF font")
parser.add_argument("--ttf-output", help="Optional destination path for a TrueType font")
parser.add_argument("--family-name", default="Type3 Synth", help="Family name for the output")
parser.add_argument("--style-name", default="Regular", help="Style name for the output")
parser.add_argument("--units-per-em", type=int, default=1000, help="Units per EM value")
parser.add_argument("--cu2qu-error", type=float, default=1.0, help="Max error for cubic→quadratic conversion")
parser = argparse.ArgumentParser(
description="Synthesize fonts from Type3 glyph JSON."
)
parser.add_argument(
"--input", required=True, help="Path to glyph JSON emitted by the backend"
)
parser.add_argument(
"--otf-output", required=True, help="Destination path for the CFF/OTF font"
)
parser.add_argument(
"--ttf-output", help="Optional destination path for a TrueType font"
)
parser.add_argument(
"--family-name", default="Type3 Synth", help="Family name for the output"
)
parser.add_argument(
"--style-name", default="Regular", help="Style name for the output"
)
parser.add_argument(
"--units-per-em", type=int, default=1000, help="Units per EM value"
)
parser.add_argument(
"--cu2qu-error",
type=float,
default=1.0,
help="Max error for cubic→quadratic conversion",
)
return parser.parse_args()


@@ -151,18 +170,22 @@ def iterate_glyphs(data: Dict[str, object]) -> List[GlyphSource]:
char_code_value = record.get("code")
if not isinstance(char_code_value, int):
char_code_value = record.get("charCodeRaw")
if not isinstance(char_code_value, int) or not (0 <= char_code_value <= 0x10FFFF):
if not isinstance(char_code_value, int) or not (
0 <= char_code_value <= 0x10FFFF
):
char_code_value = None
outline = record.get("outline")
if not isinstance(outline, list):
outline = []
sources.append(
GlyphSource(
name=name,
width=float(width),
unicode=unicode_value,
char_code=char_code_value,
outline=outline))
GlyphSource(
name=name,
width=float(width),
unicode=unicode_value,
char_code=char_code_value,
outline=outline,
)
)
return sources


@@ -199,7 +222,10 @@ def build_cff_charstring(
start_point = point
open_path = True
elif op == "L" and current is not None:
point = (float(command.get("x", current[0])), float(command.get("y", current[1])))
point = (
float(command.get("x", current[0])),
float(command.get("y", current[1])),
)
pen.lineTo(point)
update_bounds(point)
current = point

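For orientation, a minimal sketch of the moveTo/lineTo outline walk this hunk reflows (illustrative; the "op" key name is an assumption from context, and the shipped builder feeds the same points into a fontTools pen and handles curve operators too):

def outline_bounds(commands: list) -> tuple | None:
    # Collect "M"/"L" points and track the glyph bounding box.
    xs, ys = [], []
    for command in commands:
        if command.get("op") in ("M", "L"):  # "op" key assumed here
            xs.append(float(command.get("x", 0.0)))
            ys.append(float(command.get("y", 0.0)))
    if not xs:
        return None
    return min(xs), min(ys), max(xs), max(ys)
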
@@ -17,14 +17,21 @@ from __future__ import annotations
import argparse
import json
import sys
from collections import defaultdict
from pathlib import Path
from typing import Dict, Iterable, List, Optional, Tuple

REPO_ROOT = Path(__file__).resolve().parents[1]
DEFAULT_SIGNATURES = REPO_ROOT / "docs" / "type3" / "signatures"
DEFAULT_INDEX = (
REPO_ROOT / "app" / "core" / "src" / "main" / "resources" / "type3" / "library" / "index.json"
REPO_ROOT
/ "app"
/ "core"
/ "src"
/ "main"
/ "resources"
/ "type3"
/ "library"
/ "index.json"
)


@@ -136,7 +143,12 @@ def update_library(
entry = alias_index[alias]

if entry is None:
unmatched.append((font.get("baseName") or font.get("alias_raw") or "unknown", sig_file))
unmatched.append(
(
font.get("baseName") or font.get("alias_raw") or "unknown",
sig_file,
)
)
continue

entry_modified = False
@@ -186,7 +198,9 @@ def update_library(


def parse_args() -> argparse.Namespace:
parser = argparse.ArgumentParser(description="Update Type3 library index using signature dumps.")
parser = argparse.ArgumentParser(
description="Update Type3 library index using signature dumps."
)
parser.add_argument(
"--signatures-dir",
type=Path,
@@ -209,7 +223,11 @@ def parse_args() -> argparse.Namespace:

def main() -> None:
args = parse_args()
signatures_dir = args.signatures_dir if args.signatures_dir.is_absolute() else (REPO_ROOT / args.signatures_dir)
signatures_dir = (
args.signatures_dir
if args.signatures_dir.is_absolute()
else (REPO_ROOT / args.signatures_dir)
)
index_path = args.index if args.index.is_absolute() else (REPO_ROOT / args.index)

if not signatures_dir.exists():

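The main() hunk above anchors user-supplied paths; a minimal sketch of the idiom (illustrative helper name):

from pathlib import Path

REPO_ROOT = Path(__file__).resolve().parents[1]

def anchor(p: Path) -> Path:
    # Resolve relative paths against the repo root so the script
    # behaves the same from any working directory.
    return p if p.is_absolute() else REPO_ROOT / p
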
@@ -60,4 +60,4 @@ networks:
volumes:
stirling-data:
stirling-config:
stirling-logs:
stirling-logs:

@@ -56,4 +56,4 @@ networks:
volumes:
stirling-data:
stirling-config:
stirling-logs:
stirling-logs:

@@ -56,4 +56,4 @@ networks:
volumes:
stirling-data:
stirling-config:
stirling-logs:
stirling-logs: