mirror of
https://github.com/Frooodle/Stirling-PDF.git
synced 2025-12-18 20:04:17 +01:00
362 lines
12 KiB
Python
362 lines
12 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Batch Translation Script using OpenAI API
|
|
Automatically translates JSON batch files to target language while preserving:
|
|
- Placeholders: {n}, {total}, {filename}, {{variable}}
|
|
- HTML tags: <strong>, </strong>, etc.
|
|
- Technical terms: PDF, API, OAuth2, SAML2, JWT, etc.
|
|
|
|
Note: Works with JSON batch files. Translation files can be TOML or JSON format.
|
|
"""
|
|
|
|
import json
|
|
import sys
|
|
import argparse
|
|
from pathlib import Path
|
|
import time
|
|
|
|
try:
|
|
from openai import OpenAI
|
|
except ImportError:
|
|
print("Error: openai package not installed. Install with: pip install openai")
|
|
sys.exit(1)
|
|
|
|
|
|
class BatchTranslator:
|
|
def __init__(self, api_key: str, model: str = "gpt-5"):
|
|
"""Initialize translator with OpenAI API key."""
|
|
self.client = OpenAI(api_key=api_key)
|
|
self.model = model
|
|
|
|
def get_translation_prompt(self, language_name: str, language_code: str) -> str:
|
|
"""Generate the system prompt for translation."""
|
|
return f"""You are a professional translator for Stirling PDF, an open-source PDF manipulation tool.
|
|
|
|
Translate the following JSON from English to {language_name} ({language_code}) for the Stirling PDF user interface.
|
|
|
|
CRITICAL RULES - MUST FOLLOW EXACTLY:
|
|
|
|
1. PRESERVE ALL PLACEHOLDERS EXACTLY AS-IS:
|
|
- Single braces: {{{{n}}}}, {{{{total}}}}, {{{{filename}}}}, {{{{count}}}}, {{{{date}}}}, {{{{planName}}}}, {{{{toolName}}}}, {{{{variable}}}}
|
|
- Double braces: {{{{{{{{variable}}}}}}}}
|
|
- Never translate, modify, or remove these - they are template variables
|
|
|
|
2. KEEP ALL HTML TAGS INTACT:
|
|
- <strong>, </strong>, <br>, <code>, </code>, etc.
|
|
- Do not translate tag names, only text between tags
|
|
|
|
3. DO NOT TRANSLATE TECHNICAL TERMS:
|
|
- File formats: PDF, JSON, CSV, XML, HTML, ZIP, DOCX, XLSX, PNG, JPG
|
|
- Protocols: API, OAuth2, SAML2, JWT, SMTP, HTTP, HTTPS, SSL, TLS
|
|
- Technologies: Git, GitHub, Google, PostHog, Scarf, LibreOffice, Ghostscript, Tesseract, OCR
|
|
- Technical keywords: URL, URI, DPI, RGB, CMYK, QR
|
|
- "Stirling PDF" - always keep as-is
|
|
|
|
4. MAINTAIN CONSISTENT TERMINOLOGY:
|
|
- Use the SAME translation for repeated terms throughout
|
|
- Do not introduce new terminology or synonyms
|
|
- Keep UI action words consistent (e.g., "upload", "download", "compress")
|
|
|
|
5. PRESERVE SPECIAL KEYWORDS IN CONTEXT:
|
|
- Mathematical expressions: "2n", "2n-1", "3n" (in page selection)
|
|
- Special keywords: "all", "odd", "even" (in page contexts)
|
|
- Code examples and technical patterns
|
|
|
|
6. JSON STRUCTURE:
|
|
- Translate ONLY the values (text after :), NEVER the keys
|
|
- Return ONLY valid JSON with exact same structure
|
|
- Maintain all quotes, commas, and braces
|
|
|
|
7. TONE & STYLE:
|
|
- Use appropriate formal/informal tone for {language_name} UI
|
|
- Keep translations concise and user-friendly
|
|
- Maintain the professional but accessible tone of the original
|
|
|
|
8. DO NOT ADD OR REMOVE TEXT:
|
|
- Do not add explanations, comments, or extra text
|
|
- Do not remove any part of the original meaning
|
|
- Keep the same level of detail
|
|
|
|
Return ONLY the translated JSON. No markdown, no explanations, just the JSON object."""
|
|
|
|
def translate_batch(
|
|
self, batch_data: dict, target_language: str, language_code: str
|
|
) -> dict:
|
|
"""Translate a batch file using OpenAI API."""
|
|
# Convert batch to compact JSON for API
|
|
input_json = json.dumps(batch_data, ensure_ascii=False, separators=(",", ":"))
|
|
|
|
print(f"Translating {len(batch_data)} entries to {target_language}...")
|
|
print(f"Input size: {len(input_json)} characters")
|
|
|
|
try:
|
|
# GPT-5 only supports temperature=1, so we don't include it
|
|
response = self.client.chat.completions.create(
|
|
model=self.model,
|
|
messages=[
|
|
{
|
|
"role": "system",
|
|
"content": self.get_translation_prompt(
|
|
target_language, language_code
|
|
),
|
|
},
|
|
{
|
|
"role": "user",
|
|
"content": f"Translate this JSON:\n\n{input_json}",
|
|
},
|
|
],
|
|
)
|
|
|
|
translated_text = response.choices[0].message.content.strip()
|
|
|
|
# Remove markdown code blocks if present
|
|
if translated_text.startswith("```"):
|
|
lines = translated_text.split("\n")
|
|
translated_text = "\n".join(lines[1:-1])
|
|
|
|
# Parse the translated JSON
|
|
translated_data = json.loads(translated_text)
|
|
|
|
print("✓ Translation complete")
|
|
return translated_data
|
|
|
|
except json.JSONDecodeError as e:
|
|
print(f"Error: AI returned invalid JSON: {e}")
|
|
print(f"Response: {translated_text[:500]}...")
|
|
raise
|
|
except Exception as e:
|
|
print(f"Error during translation: {e}")
|
|
raise
|
|
|
|
def validate_translation(self, original: dict, translated: dict) -> bool:
|
|
"""Validate that translation preserved all placeholders and structure."""
|
|
issues = []
|
|
|
|
# Check that all keys are present
|
|
if set(original.keys()) != set(translated.keys()):
|
|
missing = set(original.keys()) - set(translated.keys())
|
|
extra = set(translated.keys()) - set(original.keys())
|
|
if missing:
|
|
issues.append(f"Missing keys: {missing}")
|
|
if extra:
|
|
issues.append(f"Extra keys: {extra}")
|
|
|
|
# Check placeholders in each value
|
|
import re
|
|
|
|
placeholder_pattern = r"\{[^}]+\}|\{\{[^}]+\}\}"
|
|
|
|
for key in original.keys():
|
|
if key not in translated:
|
|
continue
|
|
|
|
orig_value = str(original[key])
|
|
trans_value = str(translated[key])
|
|
|
|
# Find all placeholders in original
|
|
orig_placeholders = set(re.findall(placeholder_pattern, orig_value))
|
|
trans_placeholders = set(re.findall(placeholder_pattern, trans_value))
|
|
|
|
if orig_placeholders != trans_placeholders:
|
|
issues.append(
|
|
f"Placeholder mismatch in '{key}': {orig_placeholders} vs {trans_placeholders}"
|
|
)
|
|
|
|
if issues:
|
|
print("\n⚠ Validation warnings:")
|
|
for issue in issues[:10]: # Show first 10 issues
|
|
print(f" - {issue}")
|
|
if len(issues) > 10:
|
|
print(f" ... and {len(issues) - 10} more issues")
|
|
return False
|
|
|
|
print("✓ Validation passed")
|
|
return True
|
|
|
|
|
|
def get_language_info(language_code: str) -> tuple:
|
|
"""Get full language name from code."""
|
|
languages = {
|
|
"zh-CN": ("Simplified Chinese", "zh-CN"),
|
|
"es-ES": ("Spanish", "es-ES"),
|
|
"it-IT": ("Italian", "it-IT"),
|
|
"de-DE": ("German", "de-DE"),
|
|
"ar-AR": ("Arabic", "ar-AR"),
|
|
"pt-BR": ("Brazilian Portuguese", "pt-BR"),
|
|
"ru-RU": ("Russian", "ru-RU"),
|
|
"fr-FR": ("French", "fr-FR"),
|
|
"ja-JP": ("Japanese", "ja-JP"),
|
|
"ko-KR": ("Korean", "ko-KR"),
|
|
"nl-NL": ("Dutch", "nl-NL"),
|
|
"pl-PL": ("Polish", "pl-PL"),
|
|
"sv-SE": ("Swedish", "sv-SE"),
|
|
"da-DK": ("Danish", "da-DK"),
|
|
"no-NB": ("Norwegian", "no-NB"),
|
|
"fi-FI": ("Finnish", "fi-FI"),
|
|
"tr-TR": ("Turkish", "tr-TR"),
|
|
"vi-VN": ("Vietnamese", "vi-VN"),
|
|
"th-TH": ("Thai", "th-TH"),
|
|
"id-ID": ("Indonesian", "id-ID"),
|
|
"hi-IN": ("Hindi", "hi-IN"),
|
|
"cs-CZ": ("Czech", "cs-CZ"),
|
|
"hu-HU": ("Hungarian", "hu-HU"),
|
|
"ro-RO": ("Romanian", "ro-RO"),
|
|
"uk-UA": ("Ukrainian", "uk-UA"),
|
|
"el-GR": ("Greek", "el-GR"),
|
|
"bg-BG": ("Bulgarian", "bg-BG"),
|
|
"hr-HR": ("Croatian", "hr-HR"),
|
|
"sk-SK": ("Slovak", "sk-SK"),
|
|
"sl-SI": ("Slovenian", "sl-SI"),
|
|
"ca-CA": ("Catalan", "ca-CA"),
|
|
}
|
|
|
|
return languages.get(language_code, (language_code, language_code))
|
|
|
|
|
|
def main():
|
|
parser = argparse.ArgumentParser(
|
|
description="Translate JSON batch files using OpenAI API (output supports TOML and JSON)",
|
|
formatter_class=argparse.RawDescriptionHelpFormatter,
|
|
epilog="""
|
|
Note: This script works with JSON batch files. The translation files it updates can be TOML or JSON.
|
|
|
|
Examples:
|
|
# Translate single batch file
|
|
python batch_translator.py zh_CN_batch_1_of_4.json --api-key YOUR_KEY --language zh-CN
|
|
|
|
# Translate all batches for a language (with pattern)
|
|
python batch_translator.py "zh_CN_batch_*_of_*.json" --api-key YOUR_KEY --language zh-CN
|
|
|
|
# Use environment variable for API key
|
|
export OPENAI_API_KEY=your_key_here
|
|
python batch_translator.py zh_CN_batch_1_of_4.json --language zh-CN
|
|
|
|
# Use different model
|
|
python batch_translator.py file.json --api-key KEY --language es-ES --model gpt-4-turbo
|
|
""",
|
|
)
|
|
|
|
parser.add_argument(
|
|
"input_files", nargs="+", help="Input batch JSON file(s) or pattern"
|
|
)
|
|
parser.add_argument(
|
|
"--api-key", help="OpenAI API key (or set OPENAI_API_KEY env var)"
|
|
)
|
|
parser.add_argument(
|
|
"--language",
|
|
"-l",
|
|
required=True,
|
|
help="Target language code (e.g., zh-CN, es-ES)",
|
|
)
|
|
parser.add_argument(
|
|
"--model",
|
|
default="gpt-5",
|
|
help="OpenAI model to use (default: gpt-5, options: gpt-5-mini, gpt-5-nano)",
|
|
)
|
|
parser.add_argument(
|
|
"--output-suffix",
|
|
default="_translated",
|
|
help="Suffix for output files (default: _translated)",
|
|
)
|
|
parser.add_argument(
|
|
"--skip-validation", action="store_true", help="Skip validation checks"
|
|
)
|
|
parser.add_argument(
|
|
"--delay",
|
|
type=float,
|
|
default=1.0,
|
|
help="Delay between API calls in seconds (default: 1.0)",
|
|
)
|
|
|
|
args = parser.parse_args()
|
|
|
|
# Get API key from args or environment
|
|
import os
|
|
|
|
api_key = args.api_key or os.environ.get("OPENAI_API_KEY")
|
|
if not api_key:
|
|
print(
|
|
"Error: OpenAI API key required. Provide via --api-key or OPENAI_API_KEY environment variable"
|
|
)
|
|
sys.exit(1)
|
|
|
|
# Get language info
|
|
language_name, language_code = get_language_info(args.language)
|
|
|
|
# Expand file patterns
|
|
import glob
|
|
|
|
input_files = []
|
|
for pattern in args.input_files:
|
|
matched = glob.glob(pattern)
|
|
if matched:
|
|
input_files.extend(matched)
|
|
else:
|
|
input_files.append(pattern) # Use as literal filename
|
|
|
|
if not input_files:
|
|
print("Error: No input files found")
|
|
sys.exit(1)
|
|
|
|
print("Batch Translator")
|
|
print(f"Target Language: {language_name} ({language_code})")
|
|
print(f"Model: {args.model}")
|
|
print(f"Files to translate: {len(input_files)}")
|
|
print("=" * 60)
|
|
|
|
# Initialize translator
|
|
translator = BatchTranslator(api_key, args.model)
|
|
|
|
# Process each file
|
|
successful = 0
|
|
failed = 0
|
|
|
|
for i, input_file in enumerate(input_files, 1):
|
|
print(f"\n[{i}/{len(input_files)}] Processing: {input_file}")
|
|
|
|
try:
|
|
# Load input file
|
|
with open(input_file, "r", encoding="utf-8") as f:
|
|
batch_data = json.load(f)
|
|
|
|
# Translate
|
|
translated_data = translator.translate_batch(
|
|
batch_data, language_name, language_code
|
|
)
|
|
|
|
# Validate
|
|
if not args.skip_validation:
|
|
translator.validate_translation(batch_data, translated_data)
|
|
|
|
# Save output
|
|
input_path = Path(input_file)
|
|
output_file = input_path.stem + args.output_suffix + input_path.suffix
|
|
|
|
with open(output_file, "w", encoding="utf-8") as f:
|
|
json.dump(translated_data, f, ensure_ascii=False, separators=(",", ":"))
|
|
|
|
print(f"✓ Saved to: {output_file}")
|
|
successful += 1
|
|
|
|
# Delay between API calls to avoid rate limits
|
|
if i < len(input_files):
|
|
time.sleep(args.delay)
|
|
|
|
except Exception as e:
|
|
print(f"✗ Failed: {e}")
|
|
failed += 1
|
|
continue
|
|
|
|
# Summary
|
|
print("\n" + "=" * 60)
|
|
print("Translation complete!")
|
|
print(f"Successful: {successful}/{len(input_files)}")
|
|
if failed > 0:
|
|
print(f"Failed: {failed}/{len(input_files)}")
|
|
|
|
sys.exit(0 if failed == 0 else 1)
|
|
|
|
|
|
if __name__ == "__main__":
|
|
main()
|