mirror of
https://github.com/Frooodle/Stirling-PDF.git
synced 2025-12-18 20:04:17 +01:00
# Description of Changes <!-- Please provide a summary of the changes, including: - What was changed - Why the change was made - Any challenges encountered Closes #(issue_number) --> --- ## Checklist ### General - [ ] I have read the [Contribution Guidelines](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/CONTRIBUTING.md) - [ ] I have read the [Stirling-PDF Developer Guide](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/devGuide/DeveloperGuide.md) (if applicable) - [ ] I have read the [How to add new languages to Stirling-PDF](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/devGuide/HowToAddNewLanguage.md) (if applicable) - [ ] I have performed a self-review of my own code - [ ] My changes generate no new warnings ### Documentation - [ ] I have updated relevant docs on [Stirling-PDF's doc repo](https://github.com/Stirling-Tools/Stirling-Tools.github.io/blob/main/docs/) (if functionality has heavily changed) - [ ] I have read the section [Add New Translation Tags](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/devGuide/HowToAddNewLanguage.md#add-new-translation-tags) (for new translation tags only) ### UI Changes (if applicable) - [ ] Screenshots or videos demonstrating the UI changes are attached (e.g., as comments or direct attachments in the PR) ### Testing (if applicable) - [ ] I have tested my changes locally. Refer to the [Testing Guide](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/devGuide/DeveloperGuide.md#6-testing) for more details.
312 lines
11 KiB
Python
312 lines
11 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Bulk Auto-Translate All Languages
|
|
Automatically translates all languages in parallel using OpenAI API.
|
|
Supports concurrent translation with configurable thread pool.
|
|
"""
|
|
|
|
import argparse
|
|
import os
|
|
import sys
|
|
import time
|
|
from pathlib import Path
|
|
from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
import subprocess
|
|
from typing import List, Tuple, Optional
|
|
import threading
|
|
|
|
import tomllib
|
|
|
|
|
|
# Lock shared by every safe_print() call so that lines written from
# different threads are emitted one at a time.
print_lock = threading.Lock()


def safe_print(*args, **kwargs):
    """Print while holding ``print_lock``.

    Accepts exactly the same positional and keyword arguments as the
    built-in ``print``. ``flush`` defaults to ``True`` here so progress
    messages show up immediately even when the output stream is buffered
    (for example when stdout is redirected to a file or a pipe); pass
    ``flush=False`` explicitly to opt out.
    """
    kwargs.setdefault('flush', True)
    with print_lock:
        print(*args, **kwargs)
|
|
|
|
|
|
def get_all_languages(locales_dir: Path) -> List[str]:
    """Return the language codes found under ``locales_dir``, sorted by name.

    A sub-directory counts as a language when it contains a
    ``translation.toml`` file. The ``en-GB`` directory is always skipped.

    Args:
        locales_dir: Directory holding one sub-directory per language.

    Returns:
        The matching directory names in sorted order, or an empty list
        (after printing an error) when ``locales_dir`` is missing or is
        not a directory.
    """
    # is_dir() rather than exists(): a regular file at this path would
    # otherwise make iterdir() raise NotADirectoryError.
    if not locales_dir.is_dir():
        print(f"Error: Locales directory not found: {locales_dir}")
        return []

    return [
        entry.name
        for entry in sorted(locales_dir.iterdir())
        if entry.is_dir()
        and entry.name != "en-GB"
        and (entry / "translation.toml").exists()
    ]
|
|
|
|
|
|
def get_language_completion(locales_dir: Path, language: str) -> Optional[float]:
|
|
"""Get completion percentage for a language."""
|
|
lang_dir = locales_dir / language
|
|
toml_file = lang_dir / "translation.toml"
|
|
|
|
if not toml_file.exists():
|
|
return None
|
|
|
|
try:
|
|
with open(toml_file, 'rb') as f:
|
|
target_data = tomllib.load(f)
|
|
|
|
# Load en-GB reference
|
|
en_gb_file = locales_dir / 'en-GB' / 'translation.toml'
|
|
with open(en_gb_file, 'rb') as f:
|
|
en_gb_data = tomllib.load(f)
|
|
|
|
# Flatten and count
|
|
def flatten(d, parent=''):
|
|
items = {}
|
|
for k, v in d.items():
|
|
key = f"{parent}.{k}" if parent else k
|
|
if isinstance(v, dict):
|
|
items.update(flatten(v, key))
|
|
else:
|
|
items[key] = v
|
|
return items
|
|
|
|
en_gb_flat = flatten(en_gb_data)
|
|
target_flat = flatten(target_data)
|
|
|
|
# Count translated (not equal to en-GB)
|
|
translated = sum(1 for k in en_gb_flat if k in target_flat and target_flat[k] != en_gb_flat[k])
|
|
total = len(en_gb_flat)
|
|
|
|
return (translated / total * 100) if total > 0 else 0.0
|
|
|
|
except Exception as e:
|
|
print(f"Warning: Could not calculate completion for {language}: {e}")
|
|
return None
|
|
|
|
|
|
def translate_language(language: str, api_key: str, batch_size: int, timeout: int, skip_verification: bool, include_existing: bool) -> Tuple[str, bool, str]:
    """Translate a single language by running ``auto_translate.py`` as a child process.

    Args:
        language: Language code passed to the child script as its positional argument.
        api_key: Value forwarded via ``--api-key``.
        batch_size: Value forwarded via ``--batch-size``.
        timeout: Per-batch timeout in seconds, forwarded via ``--timeout``.
            The child process as a whole is given five times this value.
        skip_verification: When true, ``--skip-verification`` is appended.
        include_existing: When true, ``--include-existing`` is appended.

    Returns:
        ``(language_code, success, message)``. On success the message is
        ``"Already complete"`` (the child printed "Nothing to translate!")
        or ``"Success"``. On failure it is the child's stderr/stdout text
        truncated to 200 characters, ``"Timeout exceeded"``, or the text
        of the exception that prevented the child from running.
    """
    safe_print(f"[{language}] Starting translation...")

    cmd = [
        # Use the interpreter that is running this script so the child sees
        # the same Python version and installed packages; a bare 'python3'
        # may resolve to a different interpreter or not exist at all.
        sys.executable or 'python3',
        # NOTE(review): relative path, so this presumably has to be run from
        # the repository root - confirm.
        'scripts/translations/auto_translate.py',
        language,
        # NOTE(review): the key is passed on the command line and is therefore
        # visible in the process list; consider an environment variable if
        # the child script supports one.
        '--api-key', api_key,
        '--batch-size', str(batch_size),
        '--timeout', str(timeout),
    ]

    if skip_verification:
        cmd.append('--skip-verification')

    if include_existing:
        cmd.append('--include-existing')

    try:
        result = subprocess.run(
            cmd,
            capture_output=True,
            text=True,
            timeout=timeout * 5,  # Overall timeout = 5x per-batch timeout
        )
    except subprocess.TimeoutExpired:
        safe_print(f"[{language}] ✗ Timeout exceeded")
        return (language, False, "Timeout exceeded")
    except Exception as e:
        # Broad on purpose: this runs in a worker thread and must always
        # report a result tuple instead of raising.
        safe_print(f"[{language}] ✗ Error: {str(e)}")
        return (language, False, str(e))

    if result.returncode != 0:
        error_msg = result.stderr.strip() or result.stdout.strip() or "Unknown error"
        safe_print(f"[{language}] ✗ Failed: {error_msg[:100]}")
        return (language, False, error_msg[:200])  # Truncate long errors

    # The child prints this marker when there was nothing left to translate.
    if "Nothing to translate!" in result.stdout:
        safe_print(f"[{language}] ✓ Already complete")
        return (language, True, "Already complete")

    safe_print(f"[{language}] ✓ Success")
    return (language, True, "Success")
|
|
|
|
|
|
def main():
    """Command-line entry point: select languages and translate them in parallel.

    Steps:
      1. Parse arguments and resolve the API key (``--api-key`` or the
         ``OPENAI_API_KEY`` environment variable; not required for ``--dry-run``).
      2. Build the language list from ``--languages`` or from the locales directory.
      3. Optionally keep only languages below ``--threshold`` percent complete
         (languages whose completion cannot be determined are kept).
      4. For ``--dry-run``, print the selection and exit.
      5. Run ``translate_language`` for each language on a thread pool and
         print a summary.

    Exit status: 0 on success, dry run, or when there is nothing to do;
    1 when the API key is missing or any language failed; 2 (from argparse)
    when ``--parallel`` is smaller than 1.
    """
    parser = argparse.ArgumentParser(
        description='Bulk auto-translate all languages using OpenAI API',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Translate all languages with 10 parallel threads
  python3 bulk_auto_translate.py --parallel 10

  # Translate only incomplete languages (< 95%)
  python3 bulk_auto_translate.py --parallel 5 --threshold 95

  # Translate specific languages only
  python3 bulk_auto_translate.py --languages de-DE fr-FR es-ES --parallel 3

  # Dry run to see what would be translated
  python3 bulk_auto_translate.py --dry-run

Note: Requires OPENAI_API_KEY environment variable or --api-key argument.
"""
    )

    parser.add_argument('--api-key', help='OpenAI API key (or set OPENAI_API_KEY env var)')
    parser.add_argument('--parallel', type=int, default=1,
                        help='Number of parallel translation threads (default: 1)')
    parser.add_argument('--batch-size', type=int, default=500,
                        help='Entries per batch for translation (default: 500)')
    parser.add_argument('--timeout', type=int, default=600,
                        help='Timeout per batch in seconds (default: 600)')
    parser.add_argument('--threshold', type=float, default=0.0,
                        help='Only translate languages below this completion %% (default: 0 = all)')
    parser.add_argument('--languages', nargs='+',
                        help='Translate only specific languages (e.g., de-DE fr-FR)')
    parser.add_argument('--locales-dir', default='frontend/public/locales',
                        help='Path to locales directory')
    parser.add_argument('--skip-verification', action='store_true',
                        help='Skip final completion verification for each language')
    parser.add_argument('--include-existing', action='store_true',
                        help='Also retranslate existing keys that match English (default: only translate missing keys)')
    parser.add_argument('--dry-run', action='store_true',
                        help='Show what would be translated without actually translating')

    args = parser.parse_args()

    # ThreadPoolExecutor raises ValueError for max_workers <= 0; reject the
    # value up front with a normal usage error instead of a late traceback.
    if args.parallel < 1:
        parser.error('--parallel must be at least 1')

    # Verify API key (unless dry run)
    api_key = args.api_key or os.environ.get('OPENAI_API_KEY')
    if not args.dry_run and not api_key:
        print("Error: OpenAI API key required. Provide via --api-key or OPENAI_API_KEY environment variable")
        sys.exit(1)

    locales_dir = Path(args.locales_dir)

    # Get languages to translate
    if args.languages:
        languages = args.languages
        print(f"Translating specified languages: {', '.join(languages)}")
    else:
        languages = get_all_languages(locales_dir)
        print(f"Found {len(languages)} languages (excluding en-GB)")

    if not languages:
        print("No languages to translate!")
        sys.exit(0)

    # Completion values computed while filtering; reused by the dry-run
    # listing so files are not parsed (and warnings not printed) twice.
    completions = {}

    # Filter by completion threshold
    if args.threshold > 0:
        print(f"\nFiltering languages below {args.threshold}% completion...")
        filtered = []
        for lang in languages:
            completion = get_language_completion(locales_dir, lang)
            completions[lang] = completion
            if completion is None:
                filtered.append(lang)  # Include if can't determine
                print(f" {lang}: Unknown completion - will translate")
            elif completion < args.threshold:
                filtered.append(lang)
                print(f" {lang}: {completion:.1f}% - will translate")
            else:
                print(f" {lang}: {completion:.1f}% - skipping (above threshold)")

        languages = filtered

        if not languages:
            print("\nNo languages below threshold!")
            sys.exit(0)

    print(f"\n{'='*60}")
    print("Bulk Translation Configuration")
    print(f"{'='*60}")
    print(f"Languages to translate: {len(languages)}")
    print(f"Parallel threads: {args.parallel}")
    print(f"Batch size: {args.batch_size}")
    print(f"Timeout per batch: {args.timeout}s")
    if args.threshold > 0:
        print(f"Completion threshold: {args.threshold}%")
    print(f"{'='*60}\n")

    if args.dry_run:
        print("DRY RUN - Languages that would be translated:")
        for lang in languages:
            # None is a legitimate cached value ("unknown"), so test
            # membership rather than using dict.get().
            if lang in completions:
                completion = completions[lang]
            else:
                completion = get_language_completion(locales_dir, lang)
            comp_str = f"{completion:.1f}%" if completion is not None else "Unknown"
            print(f" - {lang} ({comp_str})")
        print(f"\nTotal: {len(languages)} languages")
        sys.exit(0)

    start_time = time.time()

    # Translate in parallel
    results = {
        'success': [],
        'failed': [],
        'already_complete': [],
    }

    with ThreadPoolExecutor(max_workers=args.parallel) as executor:
        futures = {
            executor.submit(
                translate_language,
                lang,
                api_key,
                args.batch_size,
                args.timeout,
                args.skip_verification,
                args.include_existing,
            ): lang
            for lang in languages
        }

        # Collect results as workers finish, in completion order.
        for future in as_completed(futures):
            language, success, message = future.result()

            if not success:
                results['failed'].append((language, message))
            elif message == "Already complete":
                results['already_complete'].append(language)
            else:
                results['success'].append(language)

    elapsed = time.time() - start_time

    # Print summary
    print("\n" + "="*60)
    print("Bulk Translation Summary")
    print("="*60)
    print(f"Total languages: {len(languages)}")
    print(f"Successful: {len(results['success'])}")
    print(f"Already complete: {len(results['already_complete'])}")
    print(f"Failed: {len(results['failed'])}")
    print(f"Time elapsed: {elapsed:.1f} seconds ({elapsed/60:.1f} minutes)")
    print("="*60)

    if results['success']:
        print(f"\n✅ Successfully translated ({len(results['success'])}):")
        for lang in sorted(results['success']):
            print(f" - {lang}")

    if results['already_complete']:
        print(f"\n✓ Already complete ({len(results['already_complete'])}):")
        for lang in sorted(results['already_complete']):
            print(f" - {lang}")

    if results['failed']:
        print(f"\n❌ Failed ({len(results['failed'])}):")
        for lang, msg in sorted(results['failed']):
            print(f" - {lang}: {msg}")
        # Any failure makes the whole run exit non-zero.
        sys.exit(1)

    print("\n✅ Bulk translation completed successfully!")
|
|
|
|
|
|
# Run the CLI only when this file is executed as a script, not when it is
# imported as a module.
if __name__ == '__main__':
    main()
|