#!/usr/bin/env python3
"""
AI Translation Helper for Stirling PDF Frontend

Provides utilities for AI-assisted translation workflows, including
batch processing, quality checks, and integration helpers.

Locale translation files are TOML only; AI batch files are JSON, and
exports can be CSV or JSON.
"""

import argparse
import csv
import json
import re
from datetime import datetime
from pathlib import Path
from typing import Any, Dict, List

import tomllib  # stdlib since Python 3.11
import tomli_w  # third-party TOML writer (pip install tomli-w)

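# Example CLI usage (the filename and language codes here are illustrative;
# the subcommands and flags match the argparse setup in main() below):
#   python ai_translation_helper.py create-batch --languages de-DE fr-FR --output batch.json
#   python ai_translation_helper.py validate batch.json
#   python ai_translation_helper.py apply-batch batch.json
#   python ai_translation_helper.py export --languages de-DE fr-FR --format csv

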
class AITranslationHelper:
    def __init__(self, locales_dir: str = "frontend/public/locales"):
        self.locales_dir = Path(locales_dir)
        self.golden_truth_file = self.locales_dir / "en-GB" / "translation.toml"

    def _load_translation_file(self, file_path: Path) -> Dict:
        """Load a TOML translation file."""
        try:
            with open(file_path, "rb") as f:
                return tomllib.load(f)
        except (OSError, tomllib.TOMLDecodeError) as e:
            # Missing, unreadable, or malformed files are treated as empty
            print(f"Error loading {file_path}: {e}")
            return {}

    def _save_translation_file(self, data: Dict, file_path: Path) -> None:
        """Save a TOML translation file."""
        with open(file_path, "wb") as f:
            tomli_w.dump(data, f)

    def create_ai_batch_file(
        self,
        languages: List[str],
        output_file: Path,
        max_entries_per_language: int = 50,
    ) -> None:
        """Create a batch file for AI translation with multiple languages."""
        golden_truth = self._load_translation_file(self.golden_truth_file)
        batch_data = {
            "metadata": {
                "created_at": datetime.now().isoformat(),
                "source_language": "en-GB",
                "target_languages": languages,
                "max_entries_per_language": max_entries_per_language,
                "instructions": {
                    "format": "Translate each entry maintaining JSON structure and placeholder variables like {n}, {total}, {filename}",
                    "context": "This is for a PDF manipulation tool. Keep technical terms consistent.",
                    "placeholders": "Preserve all placeholders: {n}, {total}, {filename}, etc.",
                    "style": "Keep translations concise and user-friendly",
                },
            },
            "translations": {},
        }

        for lang in languages:
            lang_dir = self.locales_dir / lang
            toml_file = lang_dir / "translation.toml"

            if toml_file.exists():
                lang_data = self._load_translation_file(toml_file)
            else:
                # No translation file found; start from an empty structure
                lang_data = {}

            # Find untranslated entries
            untranslated = self._find_untranslated_entries(golden_truth, lang_data)

            # Limit entries if specified
            if (
                max_entries_per_language
                and len(untranslated) > max_entries_per_language
            ):
                # Prioritize by key importance
                untranslated = self._prioritize_translation_keys(
                    untranslated, max_entries_per_language
                )

            batch_data["translations"][lang] = {}
            for key, value in untranslated.items():
                batch_data["translations"][lang][key] = {
                    "original": value,
                    "translated": "",  # AI fills this
                    "context": self._get_key_context(key),
                }

        # Always save batch files as JSON for compatibility
        with open(output_file, "w", encoding="utf-8") as f:
            json.dump(batch_data, f, indent=2, ensure_ascii=False)
        total_entries = sum(
            len(entries) for entries in batch_data["translations"].values()
        )
        print(f"Created AI batch file: {output_file}")
        print(f"Total entries to translate: {total_entries}")

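    # The resulting batch file pairs each key with its English source text and
    # an empty "translated" slot for the AI to fill, e.g. (illustrative entry):
    #   "translations": {
    #     "de-DE": {
    #       "merge.title": {
    #         "original": "Merge",
    #         "translated": "",
    #         "context": "PDF merging functionality"
    #       }
    #     }
    #   }
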
    def _find_untranslated_entries(
        self, golden_truth: Dict, lang_data: Dict
    ) -> Dict[str, str]:
        """Find entries that need translation."""
        golden_flat = self._flatten_dict(golden_truth)
        lang_flat = self._flatten_dict(lang_data)

        untranslated = {}
        for key, value in golden_flat.items():
            # A key needs translation if it is missing, still identical to the
            # English source, or explicitly marked with an [UNTRANSLATED] prefix.
            if (
                key not in lang_flat
                or lang_flat[key] == value
                or (
                    isinstance(lang_flat[key], str)
                    and lang_flat[key].startswith("[UNTRANSLATED]")
                )
            ):
                if not self._is_expected_identical(key, value):
                    untranslated[key] = value

        return untranslated

    def _flatten_dict(
        self, d: Dict, parent_key: str = "", separator: str = "."
    ) -> Dict[str, Any]:
        """Flatten a nested dictionary."""
        items = []
        for k, v in d.items():
            new_key = f"{parent_key}{separator}{k}" if parent_key else k
            if isinstance(v, dict):
                items.extend(self._flatten_dict(v, new_key, separator).items())
            else:
                items.append((new_key, v))
        return dict(items)

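    # For example (illustrative data), _flatten_dict({"merge": {"title": "Merge PDFs"}})
    # returns {"merge.title": "Merge PDFs"}; real keys come from the TOML files.
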
    def _is_expected_identical(self, key: str, value: str) -> bool:
        """Check if a key is expected to be identical across languages."""
        if str(value).strip() in ["ltr", "rtl", "True", "False", "true", "false"]:
            return True
        return "language.direction" in key.lower()

    def _prioritize_translation_keys(
        self, untranslated: Dict[str, str], max_count: int
    ) -> Dict[str, str]:
        """Prioritize which keys to translate first based on importance."""
        # Define priority order (higher score = higher priority)
        priority_patterns = [
            ("title", 10),
            ("header", 9),
            ("submit", 8),
            ("selectText", 7),
            ("prompt", 6),
            ("desc", 5),
            ("error", 8),
            ("warning", 7),
            ("save", 8),
            ("download", 8),
            ("upload", 7),
        ]

        scored_keys = []
        for key, value in untranslated.items():
            score = 1  # base score
            for pattern, pattern_score in priority_patterns:
                if pattern.lower() in key.lower():
                    score = max(score, pattern_score)
            scored_keys.append((key, value, score))

        # Sort by score (descending) and return the top entries
        scored_keys.sort(key=lambda x: x[2], reverse=True)
        return {key: value for key, value, _ in scored_keys[:max_count]}

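    # A key takes the highest matching pattern score: e.g. a hypothetical key
    # "compress.error.title" matches both "error" (8) and "title" (10) and so
    # scores 10, while a key matching no pattern keeps the base score of 1.
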
    def _get_key_context(self, key: str) -> str:
        """Get contextual information for a translation key."""
        parts = key.split(".")
        contexts = {
            "addPageNumbers": "Feature for adding page numbers to PDFs",
            "compress": "PDF compression functionality",
            "merge": "PDF merging functionality",
            "split": "PDF splitting functionality",
            "rotate": "PDF rotation functionality",
            "convert": "File conversion functionality",
            "security": "PDF security and permissions",
            "metadata": "PDF metadata editing",
            "watermark": "Adding watermarks to PDFs",
            "overlay": "PDF overlay functionality",
            "extract": "Extracting content from PDFs",
        }

        if len(parts) > 0:
            main_section = parts[0]
            context = contexts.get(
                main_section, f"Part of {main_section} functionality"
            )
            if len(parts) > 1:
                context += f", specifically for {parts[-1]}"
            return context

        return "General application text"

    def validate_ai_translations(self, batch_file: Path) -> Dict[str, List[str]]:
        """Validate AI translations for common issues."""
        # Batch files are always JSON
        with open(batch_file, "r", encoding="utf-8") as f:
            batch_data = json.load(f)
        issues = {"errors": [], "warnings": []}

        for lang, translations in batch_data.get("translations", {}).items():
            for key, translation_data in translations.items():
                original = translation_data.get("original", "")
                translated = translation_data.get("translated", "")

                if not translated:
                    issues["errors"].append(f"{lang}.{key}: Missing translation")
                    continue

                # Check for placeholder preservation
                original_placeholders = re.findall(r"\{[^}]+\}", original)
                translated_placeholders = re.findall(r"\{[^}]+\}", translated)

                if set(original_placeholders) != set(translated_placeholders):
                    issues["warnings"].append(
                        f"{lang}.{key}: Placeholder mismatch - Original: {original_placeholders}, "
                        f"Translated: {translated_placeholders}"
                    )

                # Check if translation is identical to original (might be untranslated)
                if translated == original and not self._is_expected_identical(
                    key, original
                ):
                    issues["warnings"].append(
                        f"{lang}.{key}: Translation identical to original"
                    )

                # Check for common AI translation artifacts
                artifacts = ["[TRANSLATE]", "[TODO]", "UNTRANSLATED", "{{", "}}"]
                for artifact in artifacts:
                    if artifact in translated:
                        issues["errors"].append(
                            f"{lang}.{key}: Contains translation artifact: {artifact}"
                        )

        return issues

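    # For example, an original "Page {n} of {total}" yields the placeholders
    # ["{n}", "{total}"]; a translation that drops or renames either one
    # triggers the placeholder-mismatch warning above.
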
    def apply_ai_batch_translations(
        self, batch_file: Path, validate: bool = True
    ) -> Dict[str, Any]:
        """Apply translations from an AI batch file to individual language files."""
        # Batch files are always JSON
        with open(batch_file, "r", encoding="utf-8") as f:
            batch_data = json.load(f)
        results = {"applied": {}, "errors": [], "warnings": []}

        if validate:
            validation_issues = self.validate_ai_translations(batch_file)
            if validation_issues["errors"]:
                print("Validation errors found. Fix these before applying:")
                for error in validation_issues["errors"]:
                    print(f"  ERROR: {error}")
                return results

            if validation_issues["warnings"]:
                print("Validation warnings (review recommended):")
                for warning in validation_issues["warnings"][:10]:
                    print(f"  WARNING: {warning}")
                if len(validation_issues["warnings"]) > 10:
                    print(f"  ... and {len(validation_issues['warnings']) - 10} more")

        for lang, translations in batch_data.get("translations", {}).items():
            lang_dir = self.locales_dir / lang
            toml_file = lang_dir / "translation.toml"

            if toml_file.exists():
                lang_data = self._load_translation_file(toml_file)
            else:
                # No translation file found; create a new TOML file
                lang_data = {}
                lang_dir.mkdir(parents=True, exist_ok=True)

            applied_count = 0
            for key, translation_data in translations.items():
                translated = translation_data.get("translated", "").strip()
                # Only apply non-empty translations that differ from the source
                if translated and translated != translation_data.get("original", ""):
                    self._set_nested_value(lang_data, key, translated)
                    applied_count += 1

            if applied_count > 0:
                self._save_translation_file(lang_data, toml_file)
                results["applied"][lang] = applied_count
                print(f"Applied {applied_count} translations to {lang}")

        return results

    def _set_nested_value(self, data: Dict, key_path: str, value: Any) -> None:
        """Set a value in a nested dict using dot notation."""
        keys = key_path.split(".")
        current = data
        for key in keys[:-1]:
            if key not in current:
                current[key] = {}
            elif not isinstance(current[key], dict):
                # If the current value is not a dict, we can't nest into it
                print(
                    f"Warning: Converting non-dict value at '{key}' to dict to allow nesting"
                )
                current[key] = {}
            current = current[key]
        current[keys[-1]] = value

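    # _set_nested_value is the inverse of _flatten_dict: for example
    # (illustrative key and value), _set_nested_value(d, "merge.title", "Zusammenfügen")
    # produces d == {"merge": {"title": "Zusammenfügen"}} when d starts empty.
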
    def export_for_external_translation(
        self, languages: List[str], output_format: str = "csv"
    ) -> None:
        """Export translations for external translation services."""
        golden_truth = self._load_translation_file(self.golden_truth_file)
        golden_flat = self._flatten_dict(golden_truth)

        # Load and flatten each language file once, rather than once per key
        lang_flats: Dict[str, Dict[str, Any]] = {}
        for lang in languages:
            toml_file = self.locales_dir / lang / "translation.toml"
            if toml_file.exists():
                lang_flats[lang] = self._flatten_dict(
                    self._load_translation_file(toml_file)
                )
            else:
                lang_flats[lang] = {}

        def lookup(lang: str, key: str) -> Any:
            """Return the existing translation, treating [UNTRANSLATED] stubs as empty."""
            value = lang_flats[lang].get(key, "")
            if isinstance(value, str) and value.startswith("[UNTRANSLATED]"):
                return ""
            return value

        if output_format == "csv":
            output_file = Path(
                f"translations_export_{datetime.now().strftime('%Y%m%d')}.csv"
            )

            with open(output_file, "w", newline="", encoding="utf-8") as csvfile:
                fieldnames = ["key", "context", "en_GB"] + languages
                writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
                writer.writeheader()

                for key, en_value in golden_flat.items():
                    if self._is_expected_identical(key, en_value):
                        continue

                    row = {
                        "key": key,
                        "context": self._get_key_context(key),
                        "en_GB": en_value,
                    }
                    for lang in languages:
                        row[lang] = lookup(lang, key)
                    writer.writerow(row)

            print(f"Exported to {output_file}")

        elif output_format == "json":
            output_file = Path(
                f"translations_export_{datetime.now().strftime('%Y%m%d')}.json"
            )
            export_data = {"languages": languages, "translations": {}}

            for key, en_value in golden_flat.items():
                if self._is_expected_identical(key, en_value):
                    continue

                export_data["translations"][key] = {
                    "en_GB": en_value,
                    "context": self._get_key_context(key),
                }
                # Missing files yield empty strings, matching the CSV branch
                for lang in languages:
                    export_data["translations"][key][lang] = lookup(lang, key)

            with open(output_file, "w", encoding="utf-8") as f:
                json.dump(export_data, f, indent=2, ensure_ascii=False)
            print(f"Exported to {output_file}")

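    # With the default CSV format and, say, languages ["de-DE", "fr-FR"]
    # (illustrative codes), the export's header row would be:
    #   key,context,en_GB,de-DE,fr-FR

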
def main():
    parser = argparse.ArgumentParser(
        description="AI Translation Helper", epilog="Works with TOML translation files."
    )
    parser.add_argument(
        "--locales-dir",
        default="frontend/public/locales",
        help="Path to locales directory",
    )

    subparsers = parser.add_subparsers(dest="command", help="Available commands")

    # Create batch command
    batch_parser = subparsers.add_parser(
        "create-batch", help="Create AI translation batch file"
    )
    batch_parser.add_argument(
        "--languages", nargs="+", required=True, help="Language codes to include"
    )
    batch_parser.add_argument("--output", required=True, help="Output batch file")
    batch_parser.add_argument(
        "--max-entries", type=int, default=100, help="Max entries per language"
    )

    # Validate command
    validate_parser = subparsers.add_parser("validate", help="Validate AI translations")
    validate_parser.add_argument("batch_file", help="Batch file to validate")

    # Apply command
    apply_parser = subparsers.add_parser(
        "apply-batch", help="Apply AI batch translations"
    )
    apply_parser.add_argument("batch_file", help="Batch file with translations")
    apply_parser.add_argument(
        "--skip-validation", action="store_true", help="Skip validation before applying"
    )

    # Export command
    export_parser = subparsers.add_parser(
        "export", help="Export for external translation"
    )
    export_parser.add_argument(
        "--languages", nargs="+", required=True, help="Language codes to export"
    )
    export_parser.add_argument(
        "--format", choices=["csv", "json"], default="csv", help="Export format"
    )

    args = parser.parse_args()

    if not args.command:
        parser.print_help()
        return

    helper = AITranslationHelper(args.locales_dir)

    if args.command == "create-batch":
        output_file = Path(args.output)
        helper.create_ai_batch_file(args.languages, output_file, args.max_entries)

    elif args.command == "validate":
        batch_file = Path(args.batch_file)
        issues = helper.validate_ai_translations(batch_file)

        if issues["errors"]:
            print("ERRORS:")
            for error in issues["errors"]:
                print(f"  - {error}")

        if issues["warnings"]:
            print("WARNINGS:")
            for warning in issues["warnings"]:
                print(f"  - {warning}")

        if not issues["errors"] and not issues["warnings"]:
            print("No validation issues found!")

    elif args.command == "apply-batch":
        batch_file = Path(args.batch_file)
        results = helper.apply_ai_batch_translations(
            batch_file, validate=not args.skip_validation
        )

        total_applied = sum(results["applied"].values())
        print(f"Total translations applied: {total_applied}")

    elif args.command == "export":
        helper.export_for_external_translation(args.languages, args.format)


if __name__ == "__main__":
    main()