mirror of
https://github.com/Frooodle/Stirling-PDF.git
synced 2025-09-26 17:52:59 +02:00
# Description of Changes <!-- Please provide a summary of the changes, including: - What was changed - Why the change was made - Any challenges encountered Closes #(issue_number) --> --- ## Checklist ### General - [ ] I have read the [Contribution Guidelines](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/CONTRIBUTING.md) - [ ] I have read the [Stirling-PDF Developer Guide](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/devGuide/DeveloperGuide.md) (if applicable) - [ ] I have read the [How to add new languages to Stirling-PDF](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/devGuide/HowToAddNewLanguage.md) (if applicable) - [ ] I have performed a self-review of my own code - [ ] My changes generate no new warnings ### Documentation - [ ] I have updated relevant docs on [Stirling-PDF's doc repo](https://github.com/Stirling-Tools/Stirling-Tools.github.io/blob/main/docs/) (if functionality has heavily changed) - [ ] I have read the section [Add New Translation Tags](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/devGuide/HowToAddNewLanguage.md#add-new-translation-tags) (for new translation tags only) ### UI Changes (if applicable) - [ ] Screenshots or videos demonstrating the UI changes are attached (e.g., as comments or direct attachments in the PR) ### Testing (if applicable) - [ ] I have tested my changes locally. Refer to the [Testing Guide](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/devGuide/DeveloperGuide.md#6-testing) for more details.
371 lines
15 KiB
Python
371 lines
15 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Translation Merger for Stirling PDF Frontend
|
|
Merges missing translations from en-GB into target language files.
|
|
Useful for AI-assisted translation workflows.
|
|
"""
|
|
|
|
import argparse
import json
import os
import shutil
import sys
from datetime import datetime
from pathlib import Path
from typing import Any, Dict, List, Optional, Set, Tuple
|
|
|
|
# Optional TOML support: prefer the standard-library parser (Python 3.11+) and
# fall back to the third-party ``toml`` package when it is installed.
# Both names are bound on every path, so code that checks either of them
# against ``None`` can never hit a NameError.
try:
    import tomllib  # Python 3.11+
    tomllib_fallback = None
except ImportError:
    tomllib = None
    try:
        import toml as tomllib_fallback
    except ImportError:
        tomllib_fallback = None
|
|
|
|
|
|
class TranslationMerger:
    """Merge missing en-GB translations into a target locale file.

    The ``en-GB/translation.json`` file under ``locales_dir`` is loaded once
    and used as the reference ("golden truth"). Nested JSON objects are
    addressed with dot-separated key paths such as ``"home.title"``.
    """

    # Prefix placed in front of copied English text that still needs translating.
    UNTRANSLATED_MARKER = "[UNTRANSLATED]"

    def __init__(self, locales_dir: str = "frontend/public/locales", ignore_file: str = "scripts/ignore_translation.toml"):
        """Load the en-GB reference file and the per-language ignore patterns.

        Args:
            locales_dir: Directory holding one sub-directory per language.
            ignore_file: Path to the ignore-pattern file; it may be absent.
        """
        self.locales_dir = Path(locales_dir)
        self.golden_truth_file = self.locales_dir / "en-GB" / "translation.json"
        self.golden_truth = self._load_json(self.golden_truth_file)
        self.ignore_file = Path(ignore_file)
        self.ignore_patterns = self._load_ignore_patterns()

    def _load_json(self, file_path: Path) -> Dict:
        """Load a JSON file, exiting the process on a missing or invalid file.

        Args:
            file_path: File to read as UTF-8 JSON.

        Returns:
            The decoded JSON document.

        Raises:
            SystemExit: With status 1 when the file does not exist or does
                not contain valid JSON (an error message is printed first).
        """
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                return json.load(f)
        except FileNotFoundError:
            print(f"Error: File not found: {file_path}")
            sys.exit(1)
        except json.JSONDecodeError as e:
            print(f"Error: Invalid JSON in {file_path}: {e}")
            sys.exit(1)

    def _save_json(self, data: Dict, file_path: Path, backup: bool = True) -> None:
        """Write ``data`` to ``file_path`` as indented UTF-8 JSON.

        Args:
            data: JSON-serialisable document to write.
            file_path: Destination file; missing parent directories are created.
            backup: When true and the destination already exists, copy it to a
                timestamped ``.backup.<timestamp>.json`` sibling first.
        """
        if backup and file_path.exists():
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            backup_path = file_path.with_suffix(f'.backup.{timestamp}.json')
            shutil.copy2(file_path, backup_path)
            print(f"Backup created: {backup_path}")

        # A brand-new language has no locale directory yet; without this the
        # open() below fails with FileNotFoundError.
        file_path.parent.mkdir(parents=True, exist_ok=True)
        with open(file_path, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=2, ensure_ascii=False)

    def _load_ignore_patterns(self) -> Dict[str, Set[str]]:
        """Read the ignore file into a ``{section: {key, ...}}`` mapping.

        This is a deliberately small line-based reader, not a full TOML
        parser: a ``[section]`` line starts a new section, and every following
        line that begins with a quoted string contributes that string to the
        current section. All other lines (comments, ``ignore = [``, ``]``)
        are skipped.

        Returns:
            The parsed mapping, or an empty dict when the file is missing or
            cannot be read (a warning is printed in the latter case).
        """
        if not self.ignore_file.exists():
            return {}

        ignore_data: Dict[str, Set[str]] = {}
        current_section = None
        try:
            with open(self.ignore_file, 'r', encoding='utf-8') as f:
                for raw_line in f:
                    line = raw_line.strip()
                    if not line or line.startswith('#'):
                        continue

                    if line.startswith('[') and line.endswith(']'):
                        current_section = line[1:-1]
                        ignore_data[current_section] = set()
                    elif current_section and line[0] in ("'", '"'):
                        # Take the text between the opening quote and its
                        # match so both quote styles work and a trailing
                        # comma or inline comment is not absorbed.
                        quote = line[0]
                        closing = line.find(quote, 1)
                        if closing == -1:
                            item = line[1:].rstrip(',').strip()
                        else:
                            item = line[1:closing]
                        if item:
                            ignore_data[current_section].add(item)
        except (OSError, UnicodeDecodeError) as e:
            print(f"Warning: Could not load ignore file {self.ignore_file}: {e}")
            return {}

        return ignore_data

    def _get_nested_value(self, data: Dict, key_path: str) -> Any:
        """Return the value at a dot-separated ``key_path``, or ``None``.

        ``None`` is returned as soon as a path component is missing or the
        value being descended into is not a dict.
        """
        current = data
        for key in key_path.split('.'):
            if not isinstance(current, dict) or key not in current:
                return None
            current = current[key]
        return current

    def _set_nested_value(self, data: Dict, key_path: str, value: Any) -> None:
        """Store ``value`` at a dot-separated ``key_path``, mutating ``data``.

        Intermediate dicts are created as needed. An existing non-dict value
        that sits where a parent dict is required is replaced by an empty
        dict, after printing a warning.
        """
        *parents, leaf = key_path.split('.')
        current = data
        for key in parents:
            if key not in current:
                current[key] = {}
            elif not isinstance(current[key], dict):
                # A scalar occupies a slot that must become a container.
                print(f"Warning: Converting non-dict value at '{key}' to dict to allow nesting")
                current[key] = {}
            current = current[key]
        current[leaf] = value

    def _flatten_dict(self, d: Dict, parent_key: str = '', separator: str = '.') -> Dict[str, Any]:
        """Flatten a nested dict into ``{"a.b.c": value}`` form.

        Only non-dict values produce entries, so empty nested dicts
        disappear from the result.
        """
        flat: Dict[str, Any] = {}
        for k, v in d.items():
            new_key = f"{parent_key}{separator}{k}" if parent_key else k
            if isinstance(v, dict):
                flat.update(self._flatten_dict(v, new_key, separator))
            else:
                flat[new_key] = v
        return flat

    def get_missing_keys(self, target_file: Path) -> List[str]:
        """Return the sorted en-GB keys that are absent from ``target_file``.

        Keys listed in the ignore patterns for the language are excluded. The
        language is taken from the file's parent directory name with ``-``
        replaced by ``_``. A target file that does not exist is treated as
        having no keys.
        """
        lang_code = target_file.parent.name.replace('-', '_')
        ignore_set = self.ignore_patterns.get(lang_code, set())

        golden_keys = set(self._flatten_dict(self.golden_truth))
        if target_file.exists():
            present = set(self._flatten_dict(self._load_json(target_file)))
        else:
            present = set()

        return sorted(golden_keys - present - ignore_set)

    def add_missing_translations(self, target_file: Path, keys_to_add: Optional[List[str]] = None,
                                 mark_untranslated: bool = True) -> Dict:
        """Copy en-GB values for missing keys into the target data.

        Nothing is written to disk; the caller saves the returned ``data``.

        Args:
            target_file: Locale file to extend; it may not exist yet.
            keys_to_add: Explicit keys to copy. ``None`` means every key
                reported by ``get_missing_keys``; an empty list means none.
            mark_untranslated: Prefix copied string values with
                ``[UNTRANSLATED]``.

        Returns:
            A dict with ``added_count``, ``missing_keys`` and the updated
            ``data``. Requested keys that do not exist in en-GB are skipped.
        """
        target_data = self._load_json(target_file) if target_file.exists() else {}
        golden_flat = self._flatten_dict(self.golden_truth)

        # Test against None rather than truthiness: an explicit empty list
        # must add nothing instead of falling back to every missing key.
        if keys_to_add is None:
            missing_keys = self.get_missing_keys(target_file)
        else:
            missing_keys = keys_to_add

        added_count = 0
        for key in missing_keys:
            if key not in golden_flat:
                continue
            value = golden_flat[key]
            if mark_untranslated and isinstance(value, str):
                # Mark as untranslated for AI to translate later
                value = f"[UNTRANSLATED] {value}"
            self._set_nested_value(target_data, key, value)
            added_count += 1

        return {
            'added_count': added_count,
            'missing_keys': missing_keys,
            'data': target_data
        }

    def extract_untranslated_entries(self, target_file: Path, output_file: Optional[Path] = None) -> Dict:
        """Collect entries that still need translating.

        An entry qualifies when its value starts with ``[UNTRANSLATED]``, or
        when it equals the en-GB value and is not expected to be identical
        across languages. Keys that do not exist in en-GB are ignored.

        Args:
            target_file: Locale file to inspect.
            output_file: Optional path to which the result is written as JSON.

        Returns:
            ``{key: {'original', 'current', 'reason'}}``, or an empty dict
            (after printing an error) when ``target_file`` does not exist.
        """
        if not target_file.exists():
            print(f"Error: Target file does not exist: {target_file}")
            return {}

        golden_flat = self._flatten_dict(self.golden_truth)
        target_flat = self._flatten_dict(self._load_json(target_file))

        untranslated_entries = {}
        for key, value in target_flat.items():
            if key not in golden_flat:
                continue
            golden_value = golden_flat[key]

            if isinstance(value, str) and value.startswith(self.UNTRANSLATED_MARKER):
                reason = 'marked_untranslated'
            elif value == golden_value and not self._is_expected_identical(key, value):
                reason = 'identical_to_english'
            else:
                continue

            untranslated_entries[key] = {
                'original': golden_value,
                'current': value,
                'reason': reason
            }

        if output_file:
            with open(output_file, 'w', encoding='utf-8') as f:
                json.dump(untranslated_entries, f, indent=2, ensure_ascii=False)

        return untranslated_entries

    def _is_expected_identical(self, key: str, value: str) -> bool:
        """Return True when ``key``/``value`` may legitimately match en-GB.

        This covers text-direction and boolean-like values, and any key whose
        lower-cased path contains ``language.direction``.
        """
        identical_patterns = [
            'language.direction',
        ]

        if str(value).strip() in ['ltr', 'rtl', 'True', 'False', 'true', 'false']:
            return True

        lowered_key = key.lower()
        return any(pattern in lowered_key for pattern in identical_patterns)

    def apply_translations(self, target_file: Path, translations: Dict[str, str],
                           backup: bool = True) -> Dict:
        """Write the given translations into ``target_file``.

        Args:
            target_file: Existing locale file to update.
            translations: Mapping of dot-separated key path to translated text.
            backup: Passed to ``_save_json`` to keep a timestamped backup.

        Returns:
            ``{'success': False, 'error': ...}`` when the file is missing;
            otherwise ``success`` (True), ``applied_count``, ``errors`` (one
            message per key that could not be set) and the updated ``data``.
            The file is only rewritten when at least one key was applied.
        """
        if not target_file.exists():
            print(f"Error: Target file does not exist: {target_file}")
            return {'success': False, 'error': 'File not found'}

        target_data = self._load_json(target_file)
        applied_count = 0
        errors = []
        marker = self.UNTRANSLATED_MARKER

        for key, translation in translations.items():
            try:
                # Drop only the leading marker; the same text appearing later
                # in the string is part of the translation and must survive.
                if translation.startswith(marker):
                    translation = translation[len(marker):].strip()

                self._set_nested_value(target_data, key, translation)
                applied_count += 1
            except Exception as e:
                # Best effort: record the failure and continue with other keys.
                errors.append(f"Error setting {key}: {e}")

        if applied_count > 0:
            self._save_json(target_data, target_file, backup)

        return {
            'success': True,
            'applied_count': applied_count,
            'errors': errors,
            'data': target_data
        }

    def create_translation_template(self, target_file: Path, output_file: Path) -> None:
        """Write a JSON template listing every entry that needs translating.

        The template holds a ``metadata`` section and a ``translations``
        mapping whose entries carry the en-GB ``original``, an empty
        ``translated`` slot, a short ``context`` string and the ``reason``
        the entry was selected.
        """
        untranslated = self.extract_untranslated_entries(target_file)

        template = {
            'metadata': {
                'source_language': 'en-GB',
                'target_language': target_file.parent.name,
                'total_entries': len(untranslated),
                'created_at': datetime.now().isoformat(),
                'instructions': 'Translate the "original" values to the target language. Keep the same keys.'
            },
            'translations': {
                key: {
                    'original': entry['original'],
                    'translated': '',  # AI should fill this
                    'context': self._get_context_for_key(key),
                    'reason': entry['reason']
                }
                for key, entry in untranslated.items()
            }
        }

        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(template, f, indent=2, ensure_ascii=False)

        print(f"Translation template created: {output_file}")
        print(f"Contains {len(untranslated)} entries to translate")

    def _get_context_for_key(self, key: str) -> str:
        """Describe a key by its first and last dot-separated components."""
        parts = key.split('.')
        if len(parts) >= 2:
            return f"Section: {parts[0]}, Property: {parts[-1]}"
        return f"Property: {parts[-1]}"
|
|
|
|
|
|
def main():
    """Command-line entry point for managing a language's translation file.

    Usage: ``<script> [--locales-dir DIR] [--ignore-file FILE] LANGUAGE COMMAND``
    where COMMAND is one of ``add-missing``, ``extract-untranslated``,
    ``create-template`` or ``apply-translations``. Help is printed when no
    command is given. The process exits with status 1 when a required input
    file is missing or unreadable, or when applying translations fails.
    """
    parser = argparse.ArgumentParser(description='Merge and manage translation files')
    parser.add_argument('--locales-dir', default='frontend/public/locales',
                        help='Path to locales directory')
    parser.add_argument('--ignore-file', default='scripts/ignore_translation.toml',
                        help='Path to ignore patterns TOML file')
    parser.add_argument('language', help='Target language code (e.g., fr-FR)')

    subparsers = parser.add_subparsers(dest='command', help='Available commands')

    # Add missing command
    add_parser = subparsers.add_parser('add-missing', help='Add missing translations from en-GB')
    add_parser.add_argument('--no-backup', action='store_true', help='Skip backup creation')
    add_parser.add_argument('--mark-untranslated', action='store_true', default=True,
                            help='Mark added translations as [UNTRANSLATED]')
    # The flag above defaults to True and so could never be switched off;
    # this companion option writes False to the same destination.
    add_parser.add_argument('--no-mark-untranslated', dest='mark_untranslated', action='store_false',
                            help='Do not mark added translations as [UNTRANSLATED]')

    # Extract untranslated command
    extract_parser = subparsers.add_parser('extract-untranslated', help='Extract untranslated entries')
    extract_parser.add_argument('--output', help='Output file path')

    # Create template command
    template_parser = subparsers.add_parser('create-template', help='Create AI translation template')
    template_parser.add_argument('--output', required=True, help='Output template file path')

    # Apply translations command
    apply_parser = subparsers.add_parser('apply-translations', help='Apply translations from JSON file')
    apply_parser.add_argument('--translations-file', required=True, help='JSON file with translations')
    apply_parser.add_argument('--no-backup', action='store_true', help='Skip backup creation')

    args = parser.parse_args()

    if not args.command:
        parser.print_help()
        return

    merger = TranslationMerger(args.locales_dir, args.ignore_file)
    target_file = Path(args.locales_dir) / args.language / "translation.json"

    # These commands only read the target file. Stop early when it is missing
    # instead of reporting "0 entries" or writing an empty template.
    if args.command in ('extract-untranslated', 'create-template') and not target_file.exists():
        print(f"Error: Target file does not exist: {target_file}")
        sys.exit(1)

    if args.command == 'add-missing':
        print(f"Adding missing translations to {args.language}...")
        result = merger.add_missing_translations(
            target_file,
            mark_untranslated=args.mark_untranslated
        )

        merger._save_json(result['data'], target_file, backup=not args.no_backup)
        print(f"Added {result['added_count']} missing translations")

    elif args.command == 'extract-untranslated':
        output_file = Path(args.output) if args.output else target_file.with_suffix('.untranslated.json')
        untranslated = merger.extract_untranslated_entries(target_file, output_file)
        print(f"Extracted {len(untranslated)} untranslated entries to {output_file}")

    elif args.command == 'create-template':
        output_file = Path(args.output)
        merger.create_translation_template(target_file, output_file)

    elif args.command == 'apply-translations':
        # Report an unreadable translations file with a message and exit
        # status 1 rather than an uncaught traceback.
        try:
            with open(args.translations_file, 'r', encoding='utf-8') as f:
                translations_data = json.load(f)
        except FileNotFoundError:
            print(f"Error: File not found: {args.translations_file}")
            sys.exit(1)
        except json.JSONDecodeError as e:
            print(f"Error: Invalid JSON in {args.translations_file}: {e}")
            sys.exit(1)

        # Extract translations from template format or simple dict
        if 'translations' in translations_data:
            # Template format: keep only entries whose 'translated' slot is filled.
            translations = {k: v['translated'] for k, v in translations_data['translations'].items()
                            if v.get('translated')}
        else:
            translations = translations_data

        result = merger.apply_translations(target_file, translations, backup=not args.no_backup)

        if result['success']:
            print(f"Applied {result['applied_count']} translations")
            if result['errors']:
                print(f"Errors: {len(result['errors'])}")
                # Only the first five messages are shown.
                for error in result['errors'][:5]:
                    print(f" - {error}")
        else:
            print(f"Failed: {result.get('error', 'Unknown error')}")
            # Let calling scripts detect the failure.
            sys.exit(1)
|
|
|
|
|
|
# Run the command-line interface only when executed as a script, not on import.
if __name__ == "__main__":
    main()