mirror of
https://github.com/Frooodle/Stirling-PDF.git
synced 2025-12-18 20:04:17 +01:00
360 lines
11 KiB
Python
360 lines
11 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Bulk Auto-Translate All Languages
|
|
Automatically translates all languages in parallel using the OpenAI API.
|
|
Supports concurrent translation with configurable thread pool.
|
|
"""
|
|
|
|
import argparse
|
|
import os
|
|
import sys
|
|
import time
|
|
from pathlib import Path
|
|
from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
import subprocess
|
|
from typing import List, Tuple, Optional
|
|
import threading
|
|
|
|
import tomllib
|
|
|
|
|
|
# Lock shared by every caller of safe_print so output from concurrent
# translation workers is written one call at a time.
print_lock = threading.Lock()


def safe_print(*args, **kwargs):
    """Forward all arguments to ``print`` while holding ``print_lock``."""
    print_lock.acquire()
    try:
        print(*args, **kwargs)
    finally:
        # Always release, even if print() raises (e.g. closed stream).
        print_lock.release()
|
|
|
|
|
|
def get_all_languages(locales_dir: Path) -> List[str]:
    """Return the language codes available under *locales_dir*.

    A language is reported only when it is a sub-directory that contains a
    ``translation.toml`` file. The ``en-GB`` directory is always skipped.

    Args:
        locales_dir: Directory holding one sub-directory per language.

    Returns:
        Sub-directory names in sorted order, or an empty list (after printing
        an error) when *locales_dir* is not an existing directory.
    """
    # is_dir() instead of exists(): a path that exists but is a regular file
    # would pass an exists() check and then make iterdir() raise
    # NotADirectoryError.
    if not locales_dir.is_dir():
        print(f"Error: Locales directory not found: {locales_dir}")
        return []

    return [
        entry.name
        for entry in sorted(locales_dir.iterdir())
        if entry.is_dir()
        and entry.name != "en-GB"
        and (entry / "translation.toml").exists()
    ]
|
|
|
|
|
|
def get_language_completion(locales_dir: Path, language: str) -> Optional[float]:
|
|
"""Get completion percentage for a language."""
|
|
lang_dir = locales_dir / language
|
|
toml_file = lang_dir / "translation.toml"
|
|
|
|
if not toml_file.exists():
|
|
return None
|
|
|
|
try:
|
|
with open(toml_file, "rb") as f:
|
|
target_data = tomllib.load(f)
|
|
|
|
# Load en-GB reference
|
|
en_gb_file = locales_dir / "en-GB" / "translation.toml"
|
|
with open(en_gb_file, "rb") as f:
|
|
en_gb_data = tomllib.load(f)
|
|
|
|
# Flatten and count
|
|
def flatten(d, parent=""):
|
|
items = {}
|
|
for k, v in d.items():
|
|
key = f"{parent}.{k}" if parent else k
|
|
if isinstance(v, dict):
|
|
items.update(flatten(v, key))
|
|
else:
|
|
items[key] = v
|
|
return items
|
|
|
|
en_gb_flat = flatten(en_gb_data)
|
|
target_flat = flatten(target_data)
|
|
|
|
# Count translated (not equal to en-GB)
|
|
translated = sum(
|
|
1
|
|
for k in en_gb_flat
|
|
if k in target_flat and target_flat[k] != en_gb_flat[k]
|
|
)
|
|
total = len(en_gb_flat)
|
|
|
|
return (translated / total * 100) if total > 0 else 0.0
|
|
|
|
except Exception as e:
|
|
print(f"Warning: Could not calculate completion for {language}: {e}")
|
|
return None
|
|
|
|
|
|
def translate_language(
    language: str,
    api_key: str,
    batch_size: int,
    timeout: int,
    skip_verification: bool,
    include_existing: bool,
) -> Tuple[str, bool, str]:
    """Translate a single language by running ``auto_translate.py`` as a subprocess.

    Args:
        language: Language code passed as the positional argument.
        api_key: Forwarded via ``--api-key``.
        batch_size: Forwarded via ``--batch-size``.
        timeout: Forwarded via ``--timeout``; the whole subprocess is allowed
            five times this many seconds.
        skip_verification: Append ``--skip-verification`` when true.
        include_existing: Append ``--include-existing`` when true.

    Returns:
        ``(language_code, success, message)``. On success the message is
        ``"Already complete"`` when the child's stdout contains
        ``"Nothing to translate!"``, otherwise ``"Success"``. On failure it is
        the child's stderr/stdout (truncated to 200 characters),
        ``"Timeout exceeded"``, or the text of the exception raised while
        launching the child. This function does not raise ``Exception``.
    """
    safe_print(f"[{language}] Starting translation...")

    cmd = [
        # Run the child with the interpreter executing this script instead of
        # whatever "python3" resolves to on PATH (which may be missing, e.g.
        # on Windows, or belong to a different environment). Fall back to
        # "python3" if the interpreter path cannot be determined.
        sys.executable or "python3",
        # Relative path: resolved against the current working directory.
        "scripts/translations/auto_translate.py",
        language,
        # NOTE(review): the key is passed on the command line, where other
        # local users may see it in process listings - confirm acceptable.
        "--api-key",
        api_key,
        "--batch-size",
        str(batch_size),
        "--timeout",
        str(timeout),
    ]

    if skip_verification:
        cmd.append("--skip-verification")

    if include_existing:
        cmd.append("--include-existing")

    try:
        result = subprocess.run(
            cmd,
            capture_output=True,
            text=True,
            timeout=timeout * 5,  # Overall timeout = 5x per-batch timeout
        )

        if result.returncode == 0:
            # The child reports an already-complete language on stdout.
            if "Nothing to translate!" in result.stdout:
                safe_print(f"[{language}] ✓ Already complete")
                return (language, True, "Already complete")
            safe_print(f"[{language}] ✓ Success")
            return (language, True, "Success")

        # Non-zero exit: prefer stderr, then stdout, as the failure reason.
        error_msg = result.stderr.strip() or result.stdout.strip() or "Unknown error"
        safe_print(f"[{language}] ✗ Failed: {error_msg[:100]}")
        return (language, False, error_msg[:200])  # Truncate long errors

    except subprocess.TimeoutExpired:
        safe_print(f"[{language}] ✗ Timeout exceeded")
        return (language, False, "Timeout exceeded")
    except Exception as e:
        # Worker boundary: report any launch failure as a result tuple so one
        # language cannot abort the whole bulk run.
        safe_print(f"[{language}] ✗ Error: {str(e)}")
        return (language, False, str(e))
|
|
|
|
|
|
def _positive_int(value: str) -> int:
    """Argparse ``type`` callback: parse *value* as an integer that is at least 1."""
    try:
        number = int(value)
    except ValueError:
        raise argparse.ArgumentTypeError(f"invalid int value: {value!r}") from None
    if number < 1:
        raise argparse.ArgumentTypeError(f"must be a positive integer, got {number}")
    return number


def main():
    """Command-line entry point for bulk translation.

    Parses arguments, selects the languages to process (explicit list or all
    detected languages, optionally filtered by completion threshold), runs
    ``translate_language`` for each one in a thread pool, and prints a summary.

    Exits with status 1 when the API key is missing (outside dry-run) or any
    language failed, status 0 when there is nothing to do or after a dry run,
    and status 2 (argparse usage error) for invalid arguments such as a
    non-positive ``--parallel`` or ``--timeout``.
    """
    parser = argparse.ArgumentParser(
        description="Bulk auto-translate all languages using OpenAI API",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
# Translate all languages with 10 parallel threads
python3 bulk_auto_translate.py --parallel 10

# Translate only incomplete languages (< 95%)
python3 bulk_auto_translate.py --parallel 5 --threshold 95

# Translate specific languages only
python3 bulk_auto_translate.py --languages de-DE fr-FR es-ES --parallel 3

# Dry run to see what would be translated
python3 bulk_auto_translate.py --dry-run

Note: Requires OPENAI_API_KEY environment variable or --api-key argument.
""",
    )

    parser.add_argument(
        "--api-key", help="OpenAI API key (or set OPENAI_API_KEY env var)"
    )
    parser.add_argument(
        "--parallel",
        # Rejected up front: ThreadPoolExecutor raises ValueError for
        # max_workers <= 0, which would otherwise surface as a traceback.
        type=_positive_int,
        default=1,
        help="Number of parallel translation threads (default: 1)",
    )
    parser.add_argument(
        "--batch-size",
        type=int,
        default=500,
        help="Entries per batch for translation (default: 500)",
    )
    parser.add_argument(
        "--timeout",
        # A non-positive value would make every subprocess time out at once.
        type=_positive_int,
        default=600,
        help="Timeout per batch in seconds (default: 600)",
    )
    parser.add_argument(
        "--threshold",
        type=float,
        default=0.0,
        help="Only translate languages below this completion %% (default: 0 = all)",
    )
    parser.add_argument(
        "--languages",
        nargs="+",
        help="Translate only specific languages (e.g., de-DE fr-FR)",
    )
    parser.add_argument(
        "--locales-dir",
        default="frontend/public/locales",
        help="Path to locales directory",
    )
    parser.add_argument(
        "--skip-verification",
        action="store_true",
        help="Skip final completion verification for each language",
    )
    parser.add_argument(
        "--include-existing",
        action="store_true",
        help="Also retranslate existing keys that match English (default: only translate missing keys)",
    )
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Show what would be translated without actually translating",
    )

    args = parser.parse_args()

    # Verify API key (unless dry run)
    api_key = args.api_key or os.environ.get("OPENAI_API_KEY")
    if not args.dry_run and not api_key:
        print(
            "Error: OpenAI API key required. Provide via --api-key or OPENAI_API_KEY environment variable"
        )
        sys.exit(1)

    locales_dir = Path(args.locales_dir)

    # Get languages to translate
    if args.languages:
        languages = args.languages
        print(f"Translating specified languages: {', '.join(languages)}")
    else:
        languages = get_all_languages(locales_dir)
        print(f"Found {len(languages)} languages (excluding en-GB)")

    if not languages:
        print("No languages to translate!")
        sys.exit(0)

    # Filter by completion threshold
    if args.threshold > 0:
        print(f"\nFiltering languages below {args.threshold}% completion...")
        filtered = []
        for lang in languages:
            completion = get_language_completion(locales_dir, lang)
            if completion is None:
                filtered.append(lang)  # Include if can't determine
                print(f" {lang}: Unknown completion - will translate")
            elif completion < args.threshold:
                filtered.append(lang)
                print(f" {lang}: {completion:.1f}% - will translate")
            else:
                print(f" {lang}: {completion:.1f}% - skipping (above threshold)")

        languages = filtered

        if not languages:
            print("\nNo languages below threshold!")
            sys.exit(0)

    print(f"\n{'=' * 60}")
    print("Bulk Translation Configuration")
    print(f"{'=' * 60}")
    print(f"Languages to translate: {len(languages)}")
    print(f"Parallel threads: {args.parallel}")
    print(f"Batch size: {args.batch_size}")
    print(f"Timeout per batch: {args.timeout}s")
    if args.threshold > 0:
        print(f"Completion threshold: {args.threshold}%")
    print(f"{'=' * 60}\n")

    if args.dry_run:
        print("DRY RUN - Languages that would be translated:")
        for lang in languages:
            completion = get_language_completion(locales_dir, lang)
            comp_str = f"{completion:.1f}%" if completion is not None else "Unknown"
            print(f" - {lang} ({comp_str})")
        print(f"\nTotal: {len(languages)} languages")
        sys.exit(0)

    start_time = time.time()

    # Translate in parallel
    results = {"success": [], "failed": [], "already_complete": []}

    with ThreadPoolExecutor(max_workers=args.parallel) as executor:
        futures = {
            executor.submit(
                translate_language,
                lang,
                api_key,
                args.batch_size,
                args.timeout,
                args.skip_verification,
                args.include_existing,
            ): lang
            for lang in languages
        }

        # Collect results in completion order, not submission order.
        for future in as_completed(futures):
            language, success, message = future.result()

            if not success:
                results["failed"].append((language, message))
            elif message == "Already complete":
                results["already_complete"].append(language)
            else:
                results["success"].append(language)

    elapsed = time.time() - start_time

    # Print summary
    print("\n" + "=" * 60)
    print("Bulk Translation Summary")
    print("=" * 60)
    print(f"Total languages: {len(languages)}")
    print(f"Successful: {len(results['success'])}")
    print(f"Already complete: {len(results['already_complete'])}")
    print(f"Failed: {len(results['failed'])}")
    print(f"Time elapsed: {elapsed:.1f} seconds ({elapsed / 60:.1f} minutes)")
    print("=" * 60)

    if results["success"]:
        print(f"\n✅ Successfully translated ({len(results['success'])}):")
        for lang in sorted(results["success"]):
            print(f" - {lang}")

    if results["already_complete"]:
        print(f"\n✓ Already complete ({len(results['already_complete'])}):")
        for lang in sorted(results["already_complete"]):
            print(f" - {lang}")

    if results["failed"]:
        print(f"\n❌ Failed ({len(results['failed'])}):")
        for lang, msg in sorted(results["failed"]):
            print(f" - {lang}: {msg}")
        # Non-zero exit so callers and CI can detect partial failure.
        sys.exit(1)

    print("\n✅ Bulk translation completed successfully!")
|
|
|
|
|
# Run the CLI only when this file is executed as a script, so importing the
# module (e.g. to reuse its helpers) has no side effects.
if __name__ == "__main__":
    main()
|