#!/usr/bin/env python3
"""
Automated Translation Pipeline
Extracts, translates, merges, and beautifies translations for a language.
TOML format only.
"""

import json
import sys
import argparse
import os
import subprocess
from pathlib import Path
import time

import tomllib
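
# Pipeline overview: the steps below mirror main() and the helper scripts this
# file shells out to (all paths are taken from the commands used further down):
#   1. extract_untranslated()  : diff frontend/public/locales/en-GB/translation.toml
#                                against the target locale and write JSON batch files
#   2. translate_batches()     : run scripts/translations/batch_translator.py per batch
#   3. merge_translations()    : combine the *_translated.json batches into one JSON file
#   4. apply_translations()    : scripts/translations/translation_merger.py applies them
#   5. beautify_translations() : scripts/translations/toml_beautifier.py matches en-GB layout
#   6. cleanup_temp_files() and verify_completion() (translation_analyzer.py) finish up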


def run_command(cmd, description=""):
    """Run a shell command and return success status."""
    if description:
        print(f"\n{'=' * 60}")
        print(f"Step: {description}")
        print(f"{'=' * 60}")

    result = subprocess.run(cmd, shell=True, capture_output=True, text=True)

    if result.stdout:
        print(result.stdout)
    if result.stderr:
        print(result.stderr, file=sys.stderr)

    return result.returncode == 0


def find_translation_file(lang_dir):
    """Find translation file in language directory."""
    toml_file = lang_dir / "translation.toml"
    if toml_file.exists():
        return toml_file
    return None


def load_translation_file(file_path):
    """Load TOML translation file."""
    with open(file_path, "rb") as f:
        return tomllib.load(f)
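
# Note (illustrative): tomllib returns TOML tables as nested dicts, e.g. a table
#
#   [home]
#   title = "Home"
#
# loads as {"home": {"title": "Home"}}. The key and value here are made up for
# illustration; extract_untranslated() flattens such nesting into dotted keys
# like "home.title" before comparing against en-GB.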


def extract_untranslated(language_code, batch_size=500, include_existing=False):
    """Extract untranslated entries and split into batches."""
    mode = (
        "all untranslated (including existing)" if include_existing else "new (missing)"
    )
    print(f"\n🔍 Extracting {mode} entries for {language_code}...")

    # Load files
    golden_path = find_translation_file(Path("frontend/public/locales/en-GB"))
    lang_path = find_translation_file(Path(f"frontend/public/locales/{language_code}"))

    if not golden_path:
        print("Error: Golden truth file not found in frontend/public/locales/en-GB")
        return None

    if not lang_path:
        print(
            f"Error: Language file not found in frontend/public/locales/{language_code}"
        )
        return None

    def flatten_dict(d, parent_key="", separator="."):
        items = []
        for k, v in d.items():
            new_key = f"{parent_key}{separator}{k}" if parent_key else k
            if isinstance(v, dict):
                items.extend(flatten_dict(v, new_key, separator).items())
            else:
                items.append((new_key, str(v)))
        return dict(items)

    golden = load_translation_file(golden_path)
    lang_data = load_translation_file(lang_path)

    if not golden or not lang_data:
        print("Error: Failed to load translation files")
        return None

    golden_flat = flatten_dict(golden)
    lang_flat = flatten_dict(lang_data)

    # Find untranslated
    untranslated = {}
    for key, value in golden_flat.items():
        if include_existing:
            # Include missing keys, keys with English values, and [UNTRANSLATED] keys
            if (
                key not in lang_flat
                or lang_flat.get(key) == value
                or (
                    isinstance(lang_flat.get(key), str)
                    and lang_flat.get(key).startswith("[UNTRANSLATED]")
                )
            ):
                untranslated[key] = value
        else:
            # Only include missing keys (not in target file at all)
            if key not in lang_flat:
                untranslated[key] = value

    total = len(untranslated)
    print(f"Found {total} {mode} entries")

    if total == 0:
        print("✓ Language is already complete!")
        return []

    # Split into batches
    entries = list(untranslated.items())
    num_batches = (total + batch_size - 1) // batch_size

    batch_files = []
    lang_code_safe = language_code.replace("-", "_")

    for i in range(num_batches):
        start = i * batch_size
        end = min((i + 1) * batch_size, total)
        batch = dict(entries[start:end])

        filename = f"{lang_code_safe}_batch_{i + 1}_of_{num_batches}.json"
        with open(filename, "w", encoding="utf-8") as f:
            json.dump(batch, f, ensure_ascii=False, separators=(",", ":"))

        batch_files.append(filename)
        print(f" Created {filename} with {len(batch)} entries")

    return batch_files
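
# Batch sizing above is ceiling division. As a worked example (hypothetical counts):
# 1150 missing entries for "es-ES" with the default batch_size of 500 give
# (1150 + 499) // 500 = 3 batches, written as es_ES_batch_1_of_3.json,
# es_ES_batch_2_of_3.json and es_ES_batch_3_of_3.json with 500, 500 and 150 entries.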


def translate_batches(batch_files, language_code, api_key, timeout=600):
    """Translate all batch files using GPT-5."""
    if not batch_files:
        return []

    print(f"\n🤖 Translating {len(batch_files)} batches using GPT-5...")
    print(f"Timeout: {timeout}s ({timeout // 60} minutes) per batch")

    translated_files = []

    for i, batch_file in enumerate(batch_files, 1):
        print(f"\n[{i}/{len(batch_files)}] Translating {batch_file}...")

        # Always pass API key since it's required
        cmd = f'python3 scripts/translations/batch_translator.py "{batch_file}" --language {language_code} --api-key "{api_key}"'

        # Run with timeout; treat a timeout like any other batch failure
        try:
            result = subprocess.run(
                cmd, shell=True, capture_output=True, text=True, timeout=timeout
            )
        except subprocess.TimeoutExpired:
            print(f"✗ Timed out after {timeout}s while translating {batch_file}")
            return None

        if result.stdout:
            print(result.stdout)
        if result.stderr:
            print(result.stderr, file=sys.stderr)

        if result.returncode != 0:
            print(f"✗ Failed to translate {batch_file}")
            return None

        translated_file = batch_file.replace(".json", "_translated.json")
        translated_files.append(translated_file)

        # Small delay between batches
        if i < len(batch_files):
            time.sleep(1)

    print(f"\n✓ All {len(batch_files)} batches translated successfully")
    return translated_files
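
# The per-batch output name is an expectation about batch_translator.py rather than
# something checked here: each <name>.json batch is assumed to produce
# <name>_translated.json next to it, and merge_translations() below aborts with an
# error if any of those files is missing.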


def merge_translations(translated_files, language_code):
    """Merge all translated batch files."""
    if not translated_files:
        return None

    print(f"\n🔗 Merging {len(translated_files)} translated batches...")

    merged = {}
    for filename in translated_files:
        if not Path(filename).exists():
            print(f"Error: Translated file not found: {filename}")
            return None

        with open(filename, "r", encoding="utf-8") as f:
            merged.update(json.load(f))

    lang_code_safe = language_code.replace("-", "_")
    merged_file = f"{lang_code_safe}_merged.json"

    with open(merged_file, "w", encoding="utf-8") as f:
        json.dump(merged, f, ensure_ascii=False, separators=(",", ":"))

    print(f"✓ Merged {len(merged)} translations into {merged_file}")
    return merged_file
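
# Because the batches are disjoint slices of the same dict, duplicate keys should
# not occur; if they did, dict.update() above means the last batch read would win.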


def apply_translations(merged_file, language_code):
    """Apply merged translations to the language file."""
    print(f"\n📝 Applying translations to {language_code}...")

    cmd = f"python3 scripts/translations/translation_merger.py {language_code} apply-translations --translations-file {merged_file}"

    if not run_command(cmd):
        print("✗ Failed to apply translations")
        return False

    print("✓ Translations applied successfully")
    return True


def beautify_translations(language_code):
    """Beautify translation file to match en-GB structure."""
    print(f"\n✨ Beautifying {language_code} translation file...")

    cmd = f"python3 scripts/translations/toml_beautifier.py --language {language_code}"

    if not run_command(cmd):
        print("✗ Failed to beautify translations")
        return False

    print("✓ Translation file beautified")
    return True


def cleanup_temp_files(language_code):
    """Remove temporary batch files."""
    print("\n🧹 Cleaning up temporary files...")

    lang_code_safe = language_code.replace("-", "_")
    patterns = [f"{lang_code_safe}_batch_*.json", f"{lang_code_safe}_merged.json"]

    import glob

    removed = 0
    for pattern in patterns:
        for file in glob.glob(pattern):
            Path(file).unlink()
            removed += 1

    print(f"✓ Removed {removed} temporary files")


def verify_completion(language_code):
    """Check final completion percentage."""
    print("\n📊 Verifying completion...")

    cmd = f"python3 scripts/translations/translation_analyzer.py --language {language_code} --summary"
    run_command(cmd)


def main():
    parser = argparse.ArgumentParser(
        description="Automated translation pipeline for Stirling PDF",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Note: This script works with TOML translation files.

Examples:
# Translate Spanish with API key in environment
export OPENAI_API_KEY=your_key_here
python3 scripts/translations/auto_translate.py es-ES

# Translate German with inline API key
python3 scripts/translations/auto_translate.py de-DE --api-key YOUR_KEY

# Translate Italian with custom batch size
python3 scripts/translations/auto_translate.py it-IT --batch-size 600

# Skip cleanup (keep temporary files for inspection)
python3 scripts/translations/auto_translate.py fr-FR --no-cleanup
""",
    )

    parser.add_argument("language", help="Language code (e.g., es-ES, de-DE, zh-CN)")
    parser.add_argument(
        "--api-key", help="OpenAI API key (or set OPENAI_API_KEY env var)"
    )
    parser.add_argument(
        "--batch-size", type=int, default=500, help="Entries per batch (default: 500)"
    )
    parser.add_argument(
        "--no-cleanup", action="store_true", help="Keep temporary batch files"
    )
    parser.add_argument(
        "--skip-verification", action="store_true", help="Skip final completion check"
    )
    parser.add_argument(
        "--timeout",
        type=int,
        default=600,
        help="Timeout per batch in seconds (default: 600 = 10 minutes)",
    )
    parser.add_argument(
        "--include-existing",
        action="store_true",
        help="Also retranslate existing keys that match English (default: only translate missing keys)",
    )

    args = parser.parse_args()

    # Verify API key
    api_key = args.api_key or os.environ.get("OPENAI_API_KEY")
    if not api_key:
        print(
            "Error: OpenAI API key required. Provide via --api-key or OPENAI_API_KEY environment variable"
        )
        sys.exit(1)

    print("=" * 60)
    print("Automated Translation Pipeline")
    print(f"Language: {args.language}")
    print(f"Batch Size: {args.batch_size} entries")
    print("=" * 60)

    start_time = time.time()

    try:
        # Step 1: Extract and split
        batch_files = extract_untranslated(
            args.language, args.batch_size, args.include_existing
        )
        if batch_files is None:
            sys.exit(1)

        if len(batch_files) == 0:
            print("\n✓ Nothing to translate!")
            sys.exit(0)

        # Step 2: Translate all batches
        translated_files = translate_batches(
            batch_files, args.language, api_key, args.timeout
        )
        if translated_files is None:
            sys.exit(1)

        # Step 3: Merge translations
        merged_file = merge_translations(translated_files, args.language)
        if merged_file is None:
            sys.exit(1)

        # Step 4: Apply translations
        if not apply_translations(merged_file, args.language):
            sys.exit(1)

        # Step 5: Beautify
        if not beautify_translations(args.language):
            sys.exit(1)

        # Step 6: Cleanup
        if not args.no_cleanup:
            cleanup_temp_files(args.language)

        # Step 7: Verify
        if not args.skip_verification:
            verify_completion(args.language)

        elapsed = time.time() - start_time
        print("\n" + "=" * 60)
        print("✅ Translation pipeline completed successfully!")
        print(f"Time elapsed: {elapsed:.1f} seconds")
        print("=" * 60)

    except KeyboardInterrupt:
        print("\n\n⚠ Translation interrupted by user")
        sys.exit(1)
    except Exception as e:
        print(f"\n\n✗ Error: {e}")
        import traceback

        traceback.print_exc()
        sys.exit(1)


if __name__ == "__main__":
    main()