#!/usr/bin/env python3
"""
AI Translation Helper for Stirling PDF Frontend

Provides utilities for AI-assisted translation workflows including
batch processing, quality checks, and integration helpers.
"""

import json
import os
import sys
from pathlib import Path
from typing import Dict, List, Set, Tuple, Any, Optional
import argparse
import re
from datetime import datetime
import csv
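
# Illustrative CLI usage (a sketch only; the file name ai_translation_helper.py is an
# assumption, but the subcommands and flags match those registered in main() below):
#
#   python ai_translation_helper.py create-batch --languages de-DE fr-FR --output batch.json
#   python ai_translation_helper.py validate batch.json
#   python ai_translation_helper.py apply-batch batch.json
#   python ai_translation_helper.py export --languages de-DE fr-FR --format csv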


class AITranslationHelper:
    def __init__(self, locales_dir: str = "frontend/public/locales"):
        self.locales_dir = Path(locales_dir)
        self.golden_truth_file = self.locales_dir / "en-GB" / "translation.json"

    def _load_json(self, file_path: Path) -> Dict:
        """Load JSON file with error handling."""
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                return json.load(f)
        except (FileNotFoundError, json.JSONDecodeError) as e:
            print(f"Error loading {file_path}: {e}")
            return {}

    def _save_json(self, data: Dict, file_path: Path) -> None:
        """Save JSON file."""
        with open(file_path, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=2, ensure_ascii=False)

    def create_ai_batch_file(self, languages: List[str], output_file: Path,
                             max_entries_per_language: int = 50) -> None:
        """Create a batch file for AI translation with multiple languages."""
        golden_truth = self._load_json(self.golden_truth_file)
        batch_data = {
            'metadata': {
                'created_at': datetime.now().isoformat(),
                'source_language': 'en-GB',
                'target_languages': languages,
                'max_entries_per_language': max_entries_per_language,
                'instructions': {
                    'format': 'Translate each entry maintaining JSON structure and placeholder variables like {n}, {total}, {filename}',
                    'context': 'This is for a PDF manipulation tool. Keep technical terms consistent.',
                    'placeholders': 'Preserve all placeholders: {n}, {total}, {filename}, etc.',
                    'style': 'Keep translations concise and user-friendly'
                }
            },
            'translations': {}
        }

        for lang in languages:
            lang_file = self.locales_dir / lang / "translation.json"
            if not lang_file.exists():
                # Create empty translation structure
                lang_data = {}
            else:
                lang_data = self._load_json(lang_file)

            # Find untranslated entries
            untranslated = self._find_untranslated_entries(golden_truth, lang_data)

            # Limit entries if specified
            if max_entries_per_language and len(untranslated) > max_entries_per_language:
                # Prioritize by key importance
                untranslated = self._prioritize_translation_keys(untranslated, max_entries_per_language)

            batch_data['translations'][lang] = {}
            for key, value in untranslated.items():
                batch_data['translations'][lang][key] = {
                    'original': value,
                    'translated': '',  # AI fills this
                    'context': self._get_key_context(key)
                }

        self._save_json(batch_data, output_file)
        total_entries = sum(len(lang_data) for lang_data in batch_data['translations'].values())
        print(f"Created AI batch file: {output_file}")
        print(f"Total entries to translate: {total_entries}")
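
    # Sketch of the batch file written above, for reference. The key "compress.title"
    # and its English text are made-up examples; only the shape follows the code:
    #
    #   {
    #     "metadata": {"source_language": "en-GB", "target_languages": ["de-DE"], ...},
    #     "translations": {
    #       "de-DE": {
    #         "compress.title": {"original": "Compress", "translated": "", "context": "..."}
    #       }
    #     }
    #   }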

    def _find_untranslated_entries(self, golden_truth: Dict, lang_data: Dict) -> Dict[str, str]:
        """Find entries that need translation."""
        golden_flat = self._flatten_dict(golden_truth)
        lang_flat = self._flatten_dict(lang_data)

        untranslated = {}
        for key, value in golden_flat.items():
            if (key not in lang_flat or
                    lang_flat[key] == value or
                    (isinstance(lang_flat[key], str) and lang_flat[key].startswith("[UNTRANSLATED]"))):
                if not self._is_expected_identical(key, value):
                    untranslated[key] = value

        return untranslated

    def _flatten_dict(self, d: Dict, parent_key: str = '', separator: str = '.') -> Dict[str, Any]:
        """Flatten nested dictionary."""
        items = []
        for k, v in d.items():
            new_key = f"{parent_key}{separator}{k}" if parent_key else k
            if isinstance(v, dict):
                items.extend(self._flatten_dict(v, new_key, separator).items())
            else:
                items.append((new_key, v))
        return dict(items)
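
    # For illustration, _flatten_dict({"a": {"b": "x"}}) returns {"a.b": "x"};
    # the keys and values here are made-up examples, only the shape matters.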

    def _is_expected_identical(self, key: str, value: str) -> bool:
        """Check if key should be identical across languages."""
        if str(value).strip() in ['ltr', 'rtl', 'True', 'False', 'true', 'false']:
            return True
        return 'language.direction' in key.lower()

    def _prioritize_translation_keys(self, untranslated: Dict[str, str], max_count: int) -> Dict[str, str]:
        """Prioritize which keys to translate first based on importance."""
        # Define priority order (higher score = higher priority)
        priority_patterns = [
            ('title', 10),
            ('header', 9),
            ('submit', 8),
            ('selectText', 7),
            ('prompt', 6),
            ('desc', 5),
            ('error', 8),
            ('warning', 7),
            ('save', 8),
            ('download', 8),
            ('upload', 7),
        ]

        scored_keys = []
        for key, value in untranslated.items():
            score = 1  # base score
            for pattern, pattern_score in priority_patterns:
                if pattern.lower() in key.lower():
                    score = max(score, pattern_score)
            scored_keys.append((key, value, score))

        # Sort by score (descending) and return top entries
        scored_keys.sort(key=lambda x: x[2], reverse=True)
        return {key: value for key, value, _ in scored_keys[:max_count]}

    def _get_key_context(self, key: str) -> str:
        """Get contextual information for a translation key."""
        parts = key.split('.')
        contexts = {
            'addPageNumbers': 'Feature for adding page numbers to PDFs',
            'compress': 'PDF compression functionality',
            'merge': 'PDF merging functionality',
            'split': 'PDF splitting functionality',
            'rotate': 'PDF rotation functionality',
            'convert': 'File conversion functionality',
            'security': 'PDF security and permissions',
            'metadata': 'PDF metadata editing',
            'watermark': 'Adding watermarks to PDFs',
            'overlay': 'PDF overlay functionality',
            'extract': 'Extracting content from PDFs'
        }

        if len(parts) > 0:
            main_section = parts[0]
            context = contexts.get(main_section, f'Part of {main_section} functionality')
            if len(parts) > 1:
                context += f', specifically for {parts[-1]}'
            return context

        return 'General application text'

    def validate_ai_translations(self, batch_file: Path) -> Dict[str, List[str]]:
        """Validate AI translations for common issues."""
        batch_data = self._load_json(batch_file)
        issues = {'errors': [], 'warnings': []}

        for lang, translations in batch_data.get('translations', {}).items():
            for key, translation_data in translations.items():
                original = translation_data.get('original', '')
                translated = translation_data.get('translated', '')

                if not translated:
                    issues['errors'].append(f"{lang}.{key}: Missing translation")
                    continue

                # Check for placeholder preservation
                original_placeholders = re.findall(r'\{[^}]+\}', original)
                translated_placeholders = re.findall(r'\{[^}]+\}', translated)

                if set(original_placeholders) != set(translated_placeholders):
                    issues['warnings'].append(
                        f"{lang}.{key}: Placeholder mismatch - Original: {original_placeholders}, "
                        f"Translated: {translated_placeholders}"
                    )

                # Check if translation is identical to original (might be untranslated)
                if translated == original and not self._is_expected_identical(key, original):
                    issues['warnings'].append(f"{lang}.{key}: Translation identical to original")

                # Check for common AI translation artifacts
                artifacts = ['[TRANSLATE]', '[TODO]', 'UNTRANSLATED', '{{', '}}']
                for artifact in artifacts:
                    if artifact in translated:
                        issues['errors'].append(f"{lang}.{key}: Contains translation artifact: {artifact}")

        return issues

    def apply_ai_batch_translations(self, batch_file: Path, validate: bool = True) -> Dict[str, Any]:
        """Apply translations from AI batch file to individual language files."""
        batch_data = self._load_json(batch_file)
        results = {'applied': {}, 'errors': [], 'warnings': []}

        if validate:
            validation_issues = self.validate_ai_translations(batch_file)
            if validation_issues['errors']:
                print("Validation errors found. Fix these before applying:")
                for error in validation_issues['errors']:
                    print(f" ERROR: {error}")
                return results

            if validation_issues['warnings']:
                print("Validation warnings (review recommended):")
                for warning in validation_issues['warnings'][:10]:
                    print(f" WARNING: {warning}")

        for lang, translations in batch_data.get('translations', {}).items():
            lang_file = self.locales_dir / lang / "translation.json"

            # Load existing data or create new
            if lang_file.exists():
                lang_data = self._load_json(lang_file)
            else:
                lang_data = {}
                lang_file.parent.mkdir(parents=True, exist_ok=True)

            applied_count = 0
            for key, translation_data in translations.items():
                translated = translation_data.get('translated', '').strip()
                if translated and translated != translation_data.get('original', ''):
                    self._set_nested_value(lang_data, key, translated)
                    applied_count += 1

            if applied_count > 0:
                self._save_json(lang_data, lang_file)
                results['applied'][lang] = applied_count
                print(f"Applied {applied_count} translations to {lang}")

        return results
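
    # Shape of the dict returned above, for reference (the language code and
    # count are illustrative):
    #   {'applied': {'de-DE': 12}, 'errors': [], 'warnings': []}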

    def _set_nested_value(self, data: Dict, key_path: str, value: Any) -> None:
        """Set value in nested dict using dot notation."""
        keys = key_path.split('.')
        current = data
        for key in keys[:-1]:
            if key not in current:
                current[key] = {}
            elif not isinstance(current[key], dict):
                # If the current value is not a dict, we can't nest into it
                print(f"Warning: Converting non-dict value at '{key}' to dict to allow nesting")
                current[key] = {}
            current = current[key]
        current[keys[-1]] = value
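
    # For illustration, _set_nested_value(data, "compress.title", "...") yields
    # {"compress": {"title": "..."}} inside `data`; the key is a made-up example.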

    def export_for_external_translation(self, languages: List[str], output_format: str = 'csv') -> None:
        """Export translations for external translation services."""
        golden_truth = self._load_json(self.golden_truth_file)
        golden_flat = self._flatten_dict(golden_truth)

        if output_format == 'csv':
            output_file = Path(f'translations_export_{datetime.now().strftime("%Y%m%d")}.csv')

            with open(output_file, 'w', newline='', encoding='utf-8') as csvfile:
                fieldnames = ['key', 'context', 'en_GB'] + languages
                writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
                writer.writeheader()

                for key, en_value in golden_flat.items():
                    if self._is_expected_identical(key, en_value):
                        continue

                    row = {
                        'key': key,
                        'context': self._get_key_context(key),
                        'en_GB': en_value
                    }

                    for lang in languages:
                        lang_file = self.locales_dir / lang / "translation.json"
                        if lang_file.exists():
                            lang_data = self._load_json(lang_file)
                            lang_flat = self._flatten_dict(lang_data)
                            value = lang_flat.get(key, '')
                            # Guard against non-string values before calling startswith
                            if isinstance(value, str) and value.startswith('[UNTRANSLATED]'):
                                value = ''
                            row[lang] = value
                        else:
                            row[lang] = ''

                    writer.writerow(row)

            print(f"Exported to {output_file}")

        elif output_format == 'json':
            output_file = Path(f'translations_export_{datetime.now().strftime("%Y%m%d")}.json')
            export_data = {'languages': languages, 'translations': {}}

            for key, en_value in golden_flat.items():
                if self._is_expected_identical(key, en_value):
                    continue

                export_data['translations'][key] = {
                    'en_GB': en_value,
                    'context': self._get_key_context(key)
                }

                for lang in languages:
                    lang_file = self.locales_dir / lang / "translation.json"
                    if lang_file.exists():
                        lang_data = self._load_json(lang_file)
                        lang_flat = self._flatten_dict(lang_data)
                        value = lang_flat.get(key, '')
                        # Guard against non-string values before calling startswith
                        if isinstance(value, str) and value.startswith('[UNTRANSLATED]'):
                            value = ''
                        export_data['translations'][key][lang] = value

            self._save_json(export_data, output_file)
            print(f"Exported to {output_file}")
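

# Minimal programmatic sketch (illustrative only; the CLI in main() below wires
# the same calls to subcommands):
#
#   helper = AITranslationHelper("frontend/public/locales")
#   helper.create_ai_batch_file(["de-DE"], Path("batch.json"), max_entries_per_language=50)
#   issues = helper.validate_ai_translations(Path("batch.json"))
#   helper.apply_ai_batch_translations(Path("batch.json"))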


def main():
    parser = argparse.ArgumentParser(description='AI Translation Helper')
    parser.add_argument('--locales-dir', default='frontend/public/locales',
                        help='Path to locales directory')

    subparsers = parser.add_subparsers(dest='command', help='Available commands')

    # Create batch command
    batch_parser = subparsers.add_parser('create-batch', help='Create AI translation batch file')
    batch_parser.add_argument('--languages', nargs='+', required=True,
                              help='Language codes to include')
    batch_parser.add_argument('--output', required=True, help='Output batch file')
    batch_parser.add_argument('--max-entries', type=int, default=100,
                              help='Max entries per language')

    # Validate command
    validate_parser = subparsers.add_parser('validate', help='Validate AI translations')
    validate_parser.add_argument('batch_file', help='Batch file to validate')

    # Apply command
    apply_parser = subparsers.add_parser('apply-batch', help='Apply AI batch translations')
    apply_parser.add_argument('batch_file', help='Batch file with translations')
    apply_parser.add_argument('--skip-validation', action='store_true',
                              help='Skip validation before applying')

    # Export command
    export_parser = subparsers.add_parser('export', help='Export for external translation')
    export_parser.add_argument('--languages', nargs='+', required=True,
                               help='Language codes to export')
    export_parser.add_argument('--format', choices=['csv', 'json'], default='csv',
                               help='Export format')

    args = parser.parse_args()

    if not args.command:
        parser.print_help()
        return

    helper = AITranslationHelper(args.locales_dir)

    if args.command == 'create-batch':
        output_file = Path(args.output)
        helper.create_ai_batch_file(args.languages, output_file, args.max_entries)

    elif args.command == 'validate':
        batch_file = Path(args.batch_file)
        issues = helper.validate_ai_translations(batch_file)

        if issues['errors']:
            print("ERRORS:")
            for error in issues['errors']:
                print(f" - {error}")

        if issues['warnings']:
            print("WARNINGS:")
            for warning in issues['warnings']:
                print(f" - {warning}")

        if not issues['errors'] and not issues['warnings']:
            print("No validation issues found!")

    elif args.command == 'apply-batch':
        batch_file = Path(args.batch_file)
        results = helper.apply_ai_batch_translations(
            batch_file,
            validate=not args.skip_validation
        )

        total_applied = sum(results['applied'].values())
        print(f"Total translations applied: {total_applied}")

    elif args.command == 'export':
        helper.export_for_external_translation(args.languages, args.format)


if __name__ == "__main__":
    main()