mirror of
https://github.com/Frooodle/Stirling-PDF.git
synced 2025-12-18 20:04:17 +01:00
185 lines
6.3 KiB
Python
185 lines
6.3 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Compact Translation Extractor for Character-Limited AI Translation
|
|
Outputs untranslated entries in minimal JSON format with whitespace stripped.
|
|
TOML format only.
|
|
"""
|
|
|
|
import json
|
|
import sys
|
|
from pathlib import Path
|
|
import argparse
|
|
import tomllib # Python 3.11+ (stdlib)
|
|
|
|
|
|
class CompactTranslationExtractor:
|
|
def __init__(
|
|
self,
|
|
locales_dir: str = "frontend/public/locales",
|
|
ignore_file: str = "scripts/ignore_translation.toml",
|
|
):
|
|
self.locales_dir = Path(locales_dir)
|
|
self.golden_truth_file = self.locales_dir / "en-GB" / "translation.toml"
|
|
if not self.golden_truth_file.exists():
|
|
print(
|
|
f"Error: en-GB translation file not found at {self.golden_truth_file}",
|
|
file=sys.stderr,
|
|
)
|
|
sys.exit(1)
|
|
self.golden_truth = self._load_translation_file(self.golden_truth_file)
|
|
self.ignore_file = Path(ignore_file)
|
|
self.ignore_patterns = self._load_ignore_patterns()
|
|
|
|
def _load_translation_file(self, file_path: Path) -> dict:
|
|
"""Load TOML translation file."""
|
|
try:
|
|
with open(file_path, "rb") as f:
|
|
return tomllib.load(f)
|
|
except FileNotFoundError:
|
|
print(f"Error: File not found: {file_path}", file=sys.stderr)
|
|
sys.exit(1)
|
|
except Exception as e:
|
|
print(f"Error: Invalid TOML file {file_path}: {e}", file=sys.stderr)
|
|
sys.exit(1)
|
|
|
|
def _load_ignore_patterns(self) -> dict:
|
|
"""Load ignore patterns from TOML file."""
|
|
if not self.ignore_file.exists():
|
|
return {}
|
|
|
|
try:
|
|
with open(self.ignore_file, "rb") as f:
|
|
ignore_data = tomllib.load(f)
|
|
return {
|
|
lang: set(data.get("ignore", [])) for lang, data in ignore_data.items()
|
|
}
|
|
except Exception as e:
|
|
print(
|
|
f"Warning: Could not load ignore file {self.ignore_file}: {e}",
|
|
file=sys.stderr,
|
|
)
|
|
return {}
|
|
|
|
def _flatten_dict(
|
|
self, d: dict, parent_key: str = "", separator: str = "."
|
|
) -> dict:
|
|
"""Flatten nested dictionary into dot-notation keys."""
|
|
items = []
|
|
for k, v in d.items():
|
|
new_key = f"{parent_key}{separator}{k}" if parent_key else k
|
|
if isinstance(v, dict):
|
|
items.extend(self._flatten_dict(v, new_key, separator).items())
|
|
else:
|
|
items.append((new_key, str(v)))
|
|
return dict(items)
|
|
|
|
def get_untranslated_entries(self, language: str) -> dict:
|
|
"""Get all untranslated entries for a language in compact format."""
|
|
lang_dir = self.locales_dir / language
|
|
target_file = lang_dir / "translation.toml"
|
|
|
|
if not target_file.exists():
|
|
print(
|
|
f"Error: Translation file not found for language: {language}",
|
|
file=sys.stderr,
|
|
)
|
|
sys.exit(1)
|
|
|
|
target_data = self._load_translation_file(target_file)
|
|
golden_flat = self._flatten_dict(self.golden_truth)
|
|
target_flat = self._flatten_dict(target_data)
|
|
|
|
lang_code = language.replace("-", "_")
|
|
ignore_set = self.ignore_patterns.get(lang_code, set())
|
|
|
|
# Find missing translations
|
|
missing_keys = set(golden_flat.keys()) - set(target_flat.keys()) - ignore_set
|
|
|
|
# Find untranslated entries (identical to en-GB or marked [UNTRANSLATED])
|
|
untranslated_keys = set()
|
|
for key in target_flat:
|
|
if key in golden_flat and key not in ignore_set:
|
|
target_value = target_flat[key]
|
|
golden_value = golden_flat[key]
|
|
|
|
if (
|
|
isinstance(target_value, str)
|
|
and target_value.startswith("[UNTRANSLATED]")
|
|
) or (
|
|
golden_value == target_value
|
|
and not self._is_expected_identical(key, golden_value)
|
|
):
|
|
untranslated_keys.add(key)
|
|
|
|
# Combine and create compact output
|
|
all_untranslated = missing_keys | untranslated_keys
|
|
|
|
compact_entries = {}
|
|
for key in sorted(all_untranslated):
|
|
if key in golden_flat:
|
|
compact_entries[key] = golden_flat[key]
|
|
|
|
return compact_entries
|
|
|
|
def _is_expected_identical(self, key: str, value: str) -> bool:
|
|
"""Check if a key-value pair is expected to be identical across languages."""
|
|
identical_patterns = ["language.direction"]
|
|
identical_values = {"ltr", "rtl", "True", "False", "true", "false", "unknown"}
|
|
|
|
if value.strip() in identical_values:
|
|
return True
|
|
|
|
for pattern in identical_patterns:
|
|
if pattern in key.lower():
|
|
return True
|
|
|
|
return False
|
|
|
|
|
|
def main():
    """Command-line entry point.

    Parses the arguments, extracts the untranslated entries for the requested
    language, optionally truncates them to ``--max-entries``, and emits them
    as compact JSON either to ``--output`` or to stdout. When writing to a
    file, a short summary is printed to stderr.
    """
    parser = argparse.ArgumentParser(
        description="Extract untranslated entries in compact format for AI translation (TOML format only)"
    )
    parser.add_argument("language", help="Language code (e.g., de-DE, fr-FR)")
    parser.add_argument(
        "--locales-dir",
        default="frontend/public/locales",
        help="Path to locales directory",
    )
    parser.add_argument(
        "--ignore-file",
        default="scripts/ignore_translation.toml",
        help="Path to ignore patterns file",
    )
    parser.add_argument(
        "--max-entries", type=int, help="Maximum number of entries to output"
    )
    parser.add_argument("--output", help="Output file (default: stdout)")

    args = parser.parse_args()

    # A negative limit would turn the slice below into "drop the last N
    # entries" instead of "keep at most N", so reject it up front.
    if args.max_entries is not None and args.max_entries < 0:
        parser.error("--max-entries must be a non-negative integer")

    extractor = CompactTranslationExtractor(args.locales_dir, args.ignore_file)
    untranslated = extractor.get_untranslated_entries(args.language)

    # Omitting the option or passing 0 both mean "no limit".
    if args.max_entries:
        # Keep the first N entries in their existing order.
        untranslated = dict(list(untranslated.items())[: args.max_entries])

    # Output compact JSON (no indentation, minimal whitespace)
    output = json.dumps(untranslated, separators=(",", ":"), ensure_ascii=False)

    if args.output:
        with open(args.output, "w", encoding="utf-8") as f:
            f.write(output)
        print(
            f"Extracted {len(untranslated)} untranslated entries to {args.output}",
            file=sys.stderr,
        )
    else:
        print(output)
|
|
|
|
|
|
# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
|