Stirling-PDF/scripts/translations/compact_translator.py
2025-09-25 10:19:45 +01:00

177 lines
6.8 KiB
Python

#!/usr/bin/env python3
"""
Compact Translation Extractor for Character-Limited AI Translation
Outputs untranslated entries in minimal JSON format with whitespace stripped.
"""
import json
import sys
from pathlib import Path
import argparse
try:
import tomllib # Python 3.11+
except ImportError:
try:
import toml as tomllib_fallback
tomllib = None
except ImportError:
tomllib = None
tomllib_fallback = None
class CompactTranslationExtractor:
    """Collect the entries of a locale that still need to be translated.

    The ``en-GB/translation.json`` file inside ``locales_dir`` is used as the
    reference ("golden truth"). A target locale entry is reported when it is
    missing, is prefixed with ``[UNTRANSLATED]``, or is identical to the
    reference text without being expected to be identical. Keys listed in the
    optional TOML ignore file are never reported.
    """

    # Values that legitimately read the same in every language.
    _IDENTICAL_VALUES = frozenset({'ltr', 'rtl', 'True', 'False', 'true', 'false', 'unknown'})
    # Key fragments whose values are expected to match the reference.
    _IDENTICAL_KEY_PATTERNS = ('language.direction',)
    # TOML literal (') and basic (") string delimiters.
    _QUOTE_CHARS = ("'", '"')

    def __init__(self, locales_dir: str = "frontend/public/locales", ignore_file: str = "scripts/ignore_translation.toml"):
        """Load the reference translation and the per-language ignore patterns.

        Args:
            locales_dir: Directory holding one sub-directory per locale.
            ignore_file: TOML file with one table per language code, each
                carrying an ``ignore`` array of flattened keys.

        Exits the process with status 1 if the reference file is missing or
        is not valid JSON (see ``_load_json``).
        """
        self.locales_dir = Path(locales_dir)
        self.golden_truth_file = self.locales_dir / "en-GB" / "translation.json"
        self.golden_truth = self._load_json(self.golden_truth_file)
        self.ignore_file = Path(ignore_file)
        self.ignore_patterns = self._load_ignore_patterns()

    def _load_json(self, file_path: Path) -> dict:
        """Return the parsed JSON content of ``file_path``.

        Prints an error to stderr and exits with status 1 when the file does
        not exist or cannot be decoded as JSON.
        """
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                return json.load(f)
        except FileNotFoundError:
            print(f"Error: File not found: {file_path}", file=sys.stderr)
            sys.exit(1)
        except json.JSONDecodeError as e:
            print(f"Error: Invalid JSON in {file_path}: {e}", file=sys.stderr)
            sys.exit(1)

    def _load_ignore_patterns(self) -> dict:
        """Return ``{language_code: set_of_ignored_keys}`` from the ignore file.

        Uses ``tomllib`` when available, then the ``toml`` package, then the
        built-in minimal parser. A missing file yields an empty mapping; any
        load error is reported as a warning on stderr and also yields an
        empty mapping (ignore patterns are best-effort).
        """
        if not self.ignore_file.exists():
            return {}
        try:
            if tomllib:
                with open(self.ignore_file, 'rb') as f:
                    ignore_data = tomllib.load(f)
            elif tomllib_fallback:
                ignore_data = tomllib_fallback.load(self.ignore_file)
            else:
                ignore_data = self._parse_simple_toml()
            # Only tables can carry an ``ignore`` array; skipping top-level
            # scalars keeps one stray key from discarding every pattern.
            return {
                lang: set(data.get('ignore', []))
                for lang, data in ignore_data.items()
                if isinstance(data, dict)
            }
        except Exception as e:
            print(f"Warning: Could not load ignore file {self.ignore_file}: {e}", file=sys.stderr)
            return {}

    @staticmethod
    def _quoted_items(text: str) -> list:
        """Return the non-empty quoted strings found in ``text``, in order.

        Both single- and double-quoted strings are recognised. Scanning stops
        at a ``#`` that is outside quotes (start of a comment). An
        unterminated quote yields the remaining text minus trailing commas.
        Escape sequences are not interpreted.
        """
        items = []
        pos = 0
        length = len(text)
        while pos < length:
            ch = text[pos]
            if ch == '#':
                break
            if ch not in CompactTranslationExtractor._QUOTE_CHARS:
                pos += 1
                continue
            end = text.find(ch, pos + 1)
            if end == -1:
                # Malformed line: keep whatever follows the opening quote.
                item = text[pos + 1:].strip().rstrip(',').strip()
                pos = length
            else:
                item = text[pos + 1:end]
                pos = end + 1
            if item:
                items.append(item)
        return items

    def _parse_simple_toml(self) -> dict:
        """Parse the ignore file without a TOML library (fallback).

        Understands only what the ignore file needs: ``[section]`` headers
        (optionally followed by a comment), array items written one per line,
        and ``ignore = [...]`` written on a single line. Items may use either
        quote style. Everything else is skipped.

        Returns:
            ``{section: {'ignore': [key, ...]}}`` in file order.
        """
        ignore_data = {}
        current_section = None
        with open(self.ignore_file, 'r', encoding='utf-8') as f:
            for raw_line in f:
                line = raw_line.strip()
                if not line or line.startswith('#'):
                    continue
                if line.startswith('['):
                    # Drop a trailing comment so "[de_DE]  # German" still
                    # counts as a section header.
                    header = line.split('#', 1)[0].strip()
                    if header.endswith(']'):
                        current_section = header[1:-1].strip()
                        ignore_data[current_section] = {'ignore': []}
                        continue
                if current_section is None:
                    continue
                if line[0] in self._QUOTE_CHARS:
                    # Array item on its own line.
                    payload = line
                else:
                    # Only the ``ignore`` key contributes inline items.
                    key, sep, payload = line.partition('=')
                    if not sep or key.strip() != 'ignore':
                        continue
                ignore_data[current_section]['ignore'].extend(self._quoted_items(payload))
        return ignore_data

    def _flatten_dict(self, d: dict, parent_key: str = '', separator: str = '.') -> dict:
        """Flatten a nested dict into ``{'a.b.c': str(value)}`` form.

        Nested dicts are descended into; every other value (including lists
        and ``None``) is converted with ``str``.
        """
        flat = {}
        for k, v in d.items():
            new_key = f"{parent_key}{separator}{k}" if parent_key else k
            if isinstance(v, dict):
                flat.update(self._flatten_dict(v, new_key, separator))
            else:
                flat[new_key] = str(v)
        return flat

    def get_untranslated_entries(self, language: str) -> dict:
        """Return ``{flattened_key: reference_text}`` for entries needing work.

        Args:
            language: Locale directory name, e.g. ``de-DE``. Hyphens are
                replaced by underscores to look up the ignore set.

        Returns:
            A dict ordered by key. An entry is included when its key is not
            ignored and the target either lacks it, marks it with the
            ``[UNTRANSLATED]`` prefix, or repeats the reference text where
            that is not expected.

        Exits the process with status 1 if the locale's translation file is
        missing or is not valid JSON.
        """
        target_file = self.locales_dir / language / "translation.json"
        if not target_file.exists():
            print(f"Error: Translation file not found for language: {language}", file=sys.stderr)
            sys.exit(1)

        target_flat = self._flatten_dict(self._load_json(target_file))
        golden_flat = self._flatten_dict(self.golden_truth)
        ignore_set = self.ignore_patterns.get(language.replace('-', '_'), set())

        compact_entries = {}
        # Only keys present in the reference can be reported, so walking the
        # reference in sorted order covers both missing and untranslated keys.
        for key in sorted(golden_flat):
            if key in ignore_set:
                continue
            golden_value = golden_flat[key]
            if key in target_flat:
                target_value = target_flat[key]
                if not target_value.startswith("[UNTRANSLATED]") and (
                    target_value != golden_value
                    or self._is_expected_identical(key, golden_value)
                ):
                    continue
            compact_entries[key] = golden_value
        return compact_entries

    def _is_expected_identical(self, key: str, value: str) -> bool:
        """Tell whether ``key``/``value`` may match the reference verbatim.

        True when the stripped value is one of the language-neutral tokens
        (``ltr``, ``true``, ``unknown``, ...) or when the lower-cased key
        contains ``language.direction``.
        """
        if value.strip() in self._IDENTICAL_VALUES:
            return True
        lowered_key = key.lower()
        return any(pattern in lowered_key for pattern in self._IDENTICAL_KEY_PATTERNS)
def main():
    """Command-line entry point.

    Parses the arguments, extracts the untranslated entries for the requested
    language, optionally truncates them to ``--max-entries``, and emits them
    as compact JSON either to stdout or to the ``--output`` file (in which
    case a summary line goes to stderr).
    """
    cli = argparse.ArgumentParser(description='Extract untranslated entries in compact format for AI translation')
    cli.add_argument('language', help='Language code (e.g., de-DE, fr-FR)')
    cli.add_argument('--locales-dir', default='frontend/public/locales', help='Path to locales directory')
    cli.add_argument('--ignore-file', default='scripts/ignore_translation.toml', help='Path to ignore patterns file')
    cli.add_argument('--max-entries', type=int, help='Maximum number of entries to output')
    cli.add_argument('--output', help='Output file (default: stdout)')
    args = cli.parse_args()

    entries = CompactTranslationExtractor(args.locales_dir, args.ignore_file).get_untranslated_entries(args.language)

    if args.max_entries:
        # Keep only the leading slice of entries, in their existing order.
        entries = dict(list(entries.items())[:args.max_entries])

    # Tight separators and no indentation keep the payload as short as possible.
    payload = json.dumps(entries, separators=(',', ':'), ensure_ascii=False)

    if not args.output:
        print(payload)
        return

    with open(args.output, 'w', encoding='utf-8') as out:
        out.write(payload)
    print(f"Extracted {len(entries)} untranslated entries to {args.output}", file=sys.stderr)


if __name__ == "__main__":
    main()