diff --git a/.gitignore b/.gitignore index 37df23f58..b339d7ff6 100644 --- a/.gitignore +++ b/.gitignore @@ -203,3 +203,10 @@ id_ed25519.pub # node_modules node_modules/ + +# Translation temp files +*_compact.json +*compact*.json +test_batch.json +*.backup.*.json +frontend/public/locales/*/translation.backup*.json diff --git a/frontend/index.html b/frontend/index.html index 31f1b3008..b563bdcd8 100644 --- a/frontend/index.html +++ b/frontend/index.html @@ -2,6 +2,7 @@
+Opening Swagger UI in a new tab...
 If it didn't open automatically,{" "}
-
+
 click here
diff --git a/frontend/src/utils/urlRouting.ts b/frontend/src/utils/urlRouting.ts
index 3ca35e9a7..d14ca923a 100644
--- a/frontend/src/utils/urlRouting.ts
+++ b/frontend/src/utils/urlRouting.ts
@@ -8,12 +8,17 @@ import { getDefaultWorkbench } from '../types/workbench';
 import { ToolRegistry, getToolWorkbench, getToolUrlPath } from '../data/toolsTaxonomy';
 import { firePixel } from './scarfTracking';
 import { URL_TO_TOOL_MAP } from './urlMapping';
+import { BASE_PATH, withBasePath } from '../constants/app';
 
 /**
  * Parse the current URL to extract tool routing information
  */
 export function parseToolRoute(registry: ToolRegistry): ToolRoute {
-  const path = window.location.pathname;
+  const fullPath = window.location.pathname;
+  // Remove base path to get app-relative path
+  const path = BASE_PATH && fullPath.startsWith(BASE_PATH)
+    ? fullPath.slice(BASE_PATH.length) || '/'
+    : fullPath;
   const searchParams = new URLSearchParams(window.location.search);
 
   // First, check URL mapping for multiple URL aliases
@@ -83,7 +88,8 @@ export function updateToolRoute(toolId: ToolId, registry: ToolRegistry, replace:
     return;
   }
 
-  const newPath = getToolUrlPath(toolId, tool);
+  const toolPath = getToolUrlPath(toolId, tool);
+  const newPath = withBasePath(toolPath);
   const searchParams = new URLSearchParams(window.location.search);
 
   // Remove tool query parameter since we're using path-based routing
@@ -99,7 +105,7 @@ export function clearToolRoute(replace: boolean = false): void {
   const searchParams = new URLSearchParams(window.location.search);
   searchParams.delete('tool');
 
-  updateUrl('/', searchParams, replace);
+  updateUrl(withBasePath('/'), searchParams, replace);
 }
 
 /**
@@ -117,11 +123,12 @@ export function generateShareableUrl(toolId: ToolId | null, registry: ToolRegist
   const baseUrl = window.location.origin;
 
   if (!toolId || !registry[toolId]) {
-    return baseUrl;
+    return `${baseUrl}${BASE_PATH || ''}`;
   }
 
   const tool = registry[toolId];
-  const path = getToolUrlPath(toolId, tool);
-  return `${baseUrl}${path}`;
+  const toolPath = getToolUrlPath(toolId, tool);
+  const fullPath = withBasePath(toolPath);
+  return `${baseUrl}${fullPath}`;
 }
diff --git a/frontend/vite.config.ts b/frontend/vite.config.ts
index 1db9de625..59ebfd663 100644
--- a/frontend/vite.config.ts
+++ b/frontend/vite.config.ts
@@ -12,5 +12,5 @@ export default defineConfig({
       },
     },
   },
-  base: "./",
+  base: process.env.RUN_SUBPATH ? `/${process.env.RUN_SUBPATH}` : './',
 });
diff --git a/scripts/translations/README.md b/scripts/translations/README.md
new file mode 100644
index 000000000..2688e8537
--- /dev/null
+++ b/scripts/translations/README.md
@@ -0,0 +1,403 @@
+# Translation Management Scripts
+
+This directory contains Python scripts for managing frontend translations in Stirling PDF. These tools help analyze, merge, and manage translations against the en-GB golden truth file.
+
+## Scripts Overview
+
+### 1. `translation_analyzer.py`
+Analyzes translation files to find missing translations and untranslated entries, and provides completion statistics.
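+
+All of these scripts share the same core mechanic: they flatten the nested locale JSON into dot-notation keys and compare the result against the flattened en-GB file. A minimal sketch of that comparison (simplified from the `_flatten_dict` helpers the scripts share; the `fr-FR` path is just an example):
+
+```python
+import json
+from pathlib import Path
+
+def flatten(d: dict, prefix: str = "") -> dict:
+    """Flatten nested JSON into dot-notation keys, e.g. 'addPageNumbers.title'."""
+    flat = {}
+    for key, value in d.items():
+        dotted = f"{prefix}.{key}" if prefix else key
+        if isinstance(value, dict):
+            flat.update(flatten(value, dotted))
+        else:
+            flat[dotted] = str(value)
+    return flat
+
+locales = Path("frontend/public/locales")
+golden = flatten(json.loads((locales / "en-GB" / "translation.json").read_text(encoding="utf-8")))
+target = flatten(json.loads((locales / "fr-FR" / "translation.json").read_text(encoding="utf-8")))
+
+missing = set(golden) - set(target)  # keys absent from the locale file
+identical = {k for k in golden.keys() & target.keys() if golden[k] == target[k]}  # likely untranslated
+print(f"{len(missing)} missing, {len(identical)} possibly untranslated")
+```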
+ +**Usage:** +```bash +# Analyze all languages +python scripts/translations/translation_analyzer.py + +# Analyze specific language +python scripts/translations/translation_analyzer.py --language fr-FR + +# Show only missing translations +python scripts/translations/translation_analyzer.py --missing-only + +# Show only untranslated entries +python scripts/translations/translation_analyzer.py --untranslated-only + +# Show summary only +python scripts/translations/translation_analyzer.py --summary + +# JSON output format +python scripts/translations/translation_analyzer.py --format json +``` + +**Features:** +- Finds missing translation keys +- Identifies untranslated entries (identical to en-GB and [UNTRANSLATED] markers) +- Shows accurate completion percentages using ignore patterns +- Identifies extra keys not in en-GB +- Supports JSON and text output formats +- Uses `scripts/ignore_translation.toml` for language-specific exclusions + +### 2. `translation_merger.py` +Merges missing translations from en-GB into target language files and manages translation workflows. + +**Usage:** +```bash +# Add missing translations from en-GB to French +python scripts/translations/translation_merger.py fr-FR add-missing + +# Add without marking as [UNTRANSLATED] +python scripts/translations/translation_merger.py fr-FR add-missing --no-mark-untranslated + +# Extract untranslated entries to a file +python scripts/translations/translation_merger.py fr-FR extract-untranslated --output fr_untranslated.json + +# Create a template for AI translation +python scripts/translations/translation_merger.py fr-FR create-template --output fr_template.json + +# Apply translations from a file +python scripts/translations/translation_merger.py fr-FR apply-translations --translations-file fr_translated.json +``` + +**Features:** +- Adds missing keys from en-GB with optional [UNTRANSLATED] markers +- Extracts untranslated entries for external translation +- Creates structured templates for AI translation +- Applies translated content back to language files +- Automatic backup creation + +### 3. `ai_translation_helper.py` +Specialized tool for AI-assisted translation workflows with batch processing and validation. + +**Usage:** +```bash +# Create batch file for AI translation (multiple languages) +python scripts/translations/ai_translation_helper.py create-batch --languages fr-FR de-DE es-ES --output batch.json --max-entries 50 + +# Validate AI translations +python scripts/translations/ai_translation_helper.py validate batch.json + +# Apply validated AI translations +python scripts/translations/ai_translation_helper.py apply-batch batch.json + +# Export for external translation services +python scripts/translations/ai_translation_helper.py export --languages fr-FR de-DE --format csv +``` + +**Features:** +- Creates batch files for AI translation of multiple languages +- Prioritizes important translation keys +- Validates translations for placeholders and artifacts +- Applies batch translations with validation +- Exports to CSV/JSON for external translation services + +### 4. `compact_translator.py` +Extracts untranslated entries in minimal JSON format for character-limited AI services. 
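+
+The character savings come from serializing with no indentation and no spaces after the separators. A small illustration of the output format, using the same `json.dumps` arguments the script passes (the two entries are made-up examples):
+
+```python
+import json
+
+entries = {"addPageNumbers.title": "Add Page Numbers", "compress.title": "Compress"}
+
+# Compact form handed to the AI: no indent, no spaces around ',' or ':'
+compact = json.dumps(entries, separators=(",", ":"), ensure_ascii=False)
+print(compact)  # {"addPageNumbers.title":"Add Page Numbers","compress.title":"Compress"}
+```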
+ +**Usage:** +```bash +# Extract all untranslated entries +python scripts/translations/compact_translator.py it-IT --output to_translate.json +``` + +**Features:** +- Produces minimal JSON output with no extra whitespace +- Automatic ignore patterns for cleaner output +- Batch size control for manageable chunks +- 50-80% fewer characters than other extraction methods + +### 5. `json_beautifier.py` +Restructures and beautifies translation JSON files to match en-GB structure exactly. + +**Usage:** +```bash +# Restructure single language to match en-GB structure +python scripts/translations/json_beautifier.py --language de-DE + +# Restructure all languages +python scripts/translations/json_beautifier.py --all-languages + +# Validate structure without modifying files +python scripts/translations/json_beautifier.py --language de-DE --validate-only + +# Skip backup creation +python scripts/translations/json_beautifier.py --language de-DE --no-backup +``` + +**Features:** +- Restructures JSON to match en-GB nested structure exactly +- Preserves key ordering for line-by-line comparison +- Creates automatic backups before modification +- Validates structure and key ordering +- Handles flattened dot-notation keys (e.g., "key.subkey") properly + +## Translation Workflows + +### Method 1: Compact Translation Workflow (RECOMMENDED for AI) + +**Best for character-limited AI services like Claude or ChatGPT** + +#### Step 1: Check Current Status +```bash +python scripts/translations/translation_analyzer.py --language it-IT --summary +``` + +#### Step 2: Extract Untranslated Entries +```bash +python scripts/translations/compact_translator.py it-IT --output to_translate.json +``` + +**Output format**: Compact JSON with minimal whitespace +```json +{"key1":"English text","key2":"Another text","key3":"More text"} +``` + +#### Step 3: AI Translation +1. Copy the compact JSON output +2. Give it to your AI with instructions: + ``` + Translate this JSON to Italian. Keep the same structure, translate only the values. + Preserve placeholders like {n}, {total}, {filename}, {{variable}}. + ``` +3. 
Save the AI's response as `translated.json` + +#### Step 4: Apply Translations +```bash +python scripts/translations/translation_merger.py it-IT apply-translations --translations-file translated.json +``` + +#### Step 5: Verify Results +```bash +python scripts/translations/translation_analyzer.py --language it-IT --summary +``` + +### Method 2: Batch Translation Workflow + +**For complete language translation from scratch or major updates** + +#### Step 1: Analyze Current State +```bash +python scripts/translations/translation_analyzer.py --language de-DE --summary +``` + +#### Step 2: Create Translation Batches +```bash +# Create batches of 100 entries each for systematic translation +python scripts/translations/ai_translation_helper.py create-batch --languages de-DE --output de_batch_1.json --max-entries 100 +``` + +#### Step 3: Translate Batch with AI +Edit the batch file and fill in ALL `translated` fields: +- Preserve all placeholders like `{n}`, `{total}`, `{filename}`, `{{toolName}}` +- Keep technical terms consistent +- Maintain JSON structure exactly +- Consider context provided for each entry + +#### Step 4: Apply Translations +```bash +# Skip validation if using legitimate placeholders ({{variable}}) +python scripts/translations/ai_translation_helper.py apply-batch de_batch_1.json --skip-validation +``` + +#### Step 5: Check Progress and Continue +```bash +python scripts/translations/translation_analyzer.py --language de-DE --summary +``` +Repeat steps 2-5 until 100% complete. + +### Method 3: Quick Translation Workflow (Legacy) + +**For small updates or existing translations** + +#### Step 1: Add Missing Translations +```bash +python scripts/translations/translation_merger.py fr-FR add-missing --mark-untranslated +``` + +#### Step 2: Create AI Template +```bash +python scripts/translations/translation_merger.py fr-FR create-template --output fr_template.json +``` + +#### Step 3: Apply Translations +```bash +python scripts/translations/translation_merger.py fr-FR apply-translations --translations-file fr_translated.json +``` + +## Translation File Structure + +Translation files are located in `frontend/public/locales/{language}/translation.json` with nested JSON structure: + +```json +{ + "addPageNumbers": { + "title": "Add Page Numbers", + "selectText": { + "1": "Select PDF file:", + "2": "Margin Size" + } + } +} +``` + +Keys use dot notation internally (e.g., `addPageNumbers.selectText.1`). + +## Key Features + +### Placeholder Preservation +All scripts preserve placeholders like `{n}`, `{total}`, `{filename}` in translations: +``` +"customNumberDesc": "Defaults to {n}, also accepts 'Page {n} of {total}'" +``` + +### Automatic Backups +Scripts create timestamped backups before modifying files: +``` +translation.backup.20241201_143022.json +``` + +### Context-Aware Translation +Scripts provide context information to help with accurate translations: +```json +{ + "addPageNumbers.title": { + "original": "Add Page Numbers", + "context": "Feature for adding page numbers to PDFs" + } +} +``` + +### Priority-Based Translation +Important keys (title, submit, error messages) are prioritized when limiting translation batch sizes. + +### Ignore Patterns System +The `scripts/ignore_translation.toml` file defines keys that should be ignored for each language, improving completion accuracy. 
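+
+A minimal sketch of how the scripts consume this file (mirroring `_load_ignore_patterns` in the analyzer; `tomllib` needs Python 3.11+, and the scripts fall back to the third-party `toml` package or a simple built-in parser on older interpreters):
+
+```python
+import tomllib  # Python 3.11+
+from pathlib import Path
+
+with Path("scripts/ignore_translation.toml").open("rb") as f:
+    ignore_data = tomllib.load(f)
+
+# Locale codes are looked up with underscores, e.g. de-DE -> de_DE
+ignore_set = set(ignore_data.get("de_DE", {}).get("ignore", []))
+
+# Toy flattened en-GB key set; the real scripts build this from translation.json
+golden_keys = {"language.direction", "pipeline.title", "compress.title"}
+relevant = golden_keys - ignore_set  # only these keys count toward completion
+print(relevant)
+```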
+ +**Common ignore patterns:** +- `language.direction`: Text direction (ltr/rtl) - universal +- `lang.*`: Language code entries not relevant to specific locales +- `pipeline.title`, `home.devApi.title`: Technical terms kept in English +- Specific technical IDs, version numbers, and system identifiers + +**Format:** +```toml +[de_DE] +ignore = [ + 'language.direction', + 'pipeline.title', + 'lang.afr', + 'lang.ceb', + # ... more patterns +] +``` + +## Best Practices & Lessons Learned + +### Critical Rules for Translation + +1. **NEVER skip entries**: Translate ALL entries in each batch to avoid [UNTRANSLATED] pollution +2. **Use appropriate batch sizes**: 100 entries for systematic translation, unlimited for compact method +3. **Skip validation for placeholders**: Use `--skip-validation` when batch contains `{{variable}}` patterns +4. **Check progress between batches**: Use `--summary` flag to track completion percentage +5. **Preserve all placeholders**: Keep `{n}`, `{total}`, `{filename}`, `{{toolName}}` exactly as-is + +### Workflow Comparison + +| Method | Best For | Character Usage | Complexity | Speed | +|--------|----------|----------------|------------|-------| +| Compact | AI services | Minimal (50-80% less) | Simple | Fastest | +| Batch | Systematic translation | Moderate | Medium | Medium | +| Quick | Small updates | High | Low | Slow | + +### Common Issues and Solutions + +#### [UNTRANSLATED] Pollution +**Problem**: Hundreds of [UNTRANSLATED] markers from incomplete translation attempts +**Solution**: +- Only translate complete batches of manageable size +- Use analyzer that counts [UNTRANSLATED] as missing translations +- Restore from backup if pollution occurs + +#### Validation False Positives +**Problem**: Validator flags legitimate `{{variable}}` placeholders as artifacts +**Solution**: Use `--skip-validation` flag when applying batches with template variables + +#### JSON Structure Mismatches +**Problem**: Flattened dot-notation keys instead of proper nested objects +**Solution**: Use `json_beautifier.py` to restructure files to match en-GB exactly + +## Real-World Examples + +### Complete Italian Translation (Compact Method) +```bash +# Check status +python scripts/translations/translation_analyzer.py --language it-IT --summary +# Result: 46.8% complete, 1147 missing + +# Extract all entries for translation +python scripts/translations/compact_translator.py it-IT --output batch1.json + +# [Translate batch1.json with AI, save as batch1_translated.json] + +# Apply translations +python scripts/translations/translation_merger.py it-IT apply-translations --translations-file batch1_translated.json +# Result: Applied 1147 translations + +# Check progress +python scripts/translations/translation_analyzer.py --language it-IT --summary +# Result: 100% complete, 0 missing +``` + +### German Translation (Batch Method) +Starting from 46.3% completion, reaching 60.3% with batch method: + +```bash +# Initial analysis +python scripts/translations/translation_analyzer.py --language de-DE --summary +# Result: 46.3% complete, 1142 missing entries + +# Batch 1 (100 entries) +python scripts/translations/ai_translation_helper.py create-batch --languages de-DE --output de_batch_1.json --max-entries 100 +# [Translate all 100 entries in batch file] +python scripts/translations/ai_translation_helper.py apply-batch de_batch_1.json --skip-validation +# Progress: 46.6% → 51.2% + +# Continue with more batches until 100% complete +``` + +## Error Handling + +- **Missing Files**: Scripts create new 
files when language directories don't exist
+- **Invalid JSON**: Clear error messages with line numbers
+- **Placeholder Mismatches**: Validation warnings for missing or extra placeholders
+- **[UNTRANSLATED] Entries**: Counted as missing translations to prevent pollution
+- **Backup Failures**: Graceful handling with user notification
+
+## Integration with Development
+
+These scripts integrate with the existing translation system:
+- Works with the current `frontend/public/locales/` structure
+- Compatible with the i18n system used in the React frontend
+- Respects the JSON format expected by the translation loader
+- Maintains the nested structure required by the UI components
+
+## Language-Specific Notes
+
+### German Translation Notes
+- Technical terms: keep established terms unchanged (PDF → PDF, API → API)
+- UI actions: "hochladen" (upload), "herunterladen" (download), "speichern" (save)
+- Error messages: Consistent pattern "Ein Fehler ist beim [action] aufgetreten"
+- Formal address: Use "Sie" form for user-facing text
+
+### Italian Translation Notes
+- Keep technical terms in English when commonly used (PDF, API, URL)
+- Use formal address ("Lei" form) for user-facing text
+- Error messages: "Si è verificato un errore durante [action]"
+- UI actions: "carica" (upload), "scarica" (download), "salva" (save)
+
+## Common Use Cases
+
+1. **Complete Language Translation**: Use the compact workflow for the fastest AI-assisted translation
+2. **New Language Addition**: Start with the compact workflow for comprehensive coverage
+3. **Updating Existing Language**: Use the analyzer to find gaps, then the compact or batch method
+4. **Quality Assurance**: Use the analyzer with `--summary` for completion metrics and issue detection
+5. **External Translation Services**: Use the export functionality to generate CSV files for translators
+6. **Structure Maintenance**: Use json_beautifier to keep files aligned with the en-GB structure
\ No newline at end of file
diff --git a/scripts/translations/ai_translation_helper.py b/scripts/translations/ai_translation_helper.py
new file mode 100644
index 000000000..c879c229b
--- /dev/null
+++ b/scripts/translations/ai_translation_helper.py
@@ -0,0 +1,408 @@
+#!/usr/bin/env python3
+"""
+AI Translation Helper for Stirling PDF Frontend
+Provides utilities for AI-assisted translation workflows including
+batch processing, quality checks, and integration helpers.
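+
+Example (assumed typical invocation; subcommands and flags are defined in main() below):
+    python scripts/translations/ai_translation_helper.py create-batch \
+        --languages fr-FR de-DE --output batch.json --max-entries 50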
+""" + +import json +import os +import sys +from pathlib import Path +from typing import Dict, List, Set, Tuple, Any, Optional +import argparse +import re +from datetime import datetime +import csv + + +class AITranslationHelper: + def __init__(self, locales_dir: str = "frontend/public/locales"): + self.locales_dir = Path(locales_dir) + self.golden_truth_file = self.locales_dir / "en-GB" / "translation.json" + + def _load_json(self, file_path: Path) -> Dict: + """Load JSON file with error handling.""" + try: + with open(file_path, 'r', encoding='utf-8') as f: + return json.load(f) + except (FileNotFoundError, json.JSONDecodeError) as e: + print(f"Error loading {file_path}: {e}") + return {} + + def _save_json(self, data: Dict, file_path: Path) -> None: + """Save JSON file.""" + with open(file_path, 'w', encoding='utf-8') as f: + json.dump(data, f, indent=2, ensure_ascii=False) + + def create_ai_batch_file(self, languages: List[str], output_file: Path, + max_entries_per_language: int = 50) -> None: + """Create a batch file for AI translation with multiple languages.""" + golden_truth = self._load_json(self.golden_truth_file) + batch_data = { + 'metadata': { + 'created_at': datetime.now().isoformat(), + 'source_language': 'en-GB', + 'target_languages': languages, + 'max_entries_per_language': max_entries_per_language, + 'instructions': { + 'format': 'Translate each entry maintaining JSON structure and placeholder variables like {n}, {total}, {filename}', + 'context': 'This is for a PDF manipulation tool. Keep technical terms consistent.', + 'placeholders': 'Preserve all placeholders: {n}, {total}, {filename}, etc.', + 'style': 'Keep translations concise and user-friendly' + } + }, + 'translations': {} + } + + for lang in languages: + lang_file = self.locales_dir / lang / "translation.json" + if not lang_file.exists(): + # Create empty translation structure + lang_data = {} + else: + lang_data = self._load_json(lang_file) + + # Find untranslated entries + untranslated = self._find_untranslated_entries(golden_truth, lang_data) + + # Limit entries if specified + if max_entries_per_language and len(untranslated) > max_entries_per_language: + # Prioritize by key importance + untranslated = self._prioritize_translation_keys(untranslated, max_entries_per_language) + + batch_data['translations'][lang] = {} + for key, value in untranslated.items(): + batch_data['translations'][lang][key] = { + 'original': value, + 'translated': '', # AI fills this + 'context': self._get_key_context(key) + } + + self._save_json(batch_data, output_file) + total_entries = sum(len(lang_data) for lang_data in batch_data['translations'].values()) + print(f"Created AI batch file: {output_file}") + print(f"Total entries to translate: {total_entries}") + + def _find_untranslated_entries(self, golden_truth: Dict, lang_data: Dict) -> Dict[str, str]: + """Find entries that need translation.""" + golden_flat = self._flatten_dict(golden_truth) + lang_flat = self._flatten_dict(lang_data) + + untranslated = {} + for key, value in golden_flat.items(): + if (key not in lang_flat or + lang_flat[key] == value or + (isinstance(lang_flat[key], str) and lang_flat[key].startswith("[UNTRANSLATED]"))): + if not self._is_expected_identical(key, value): + untranslated[key] = value + + return untranslated + + def _flatten_dict(self, d: Dict, parent_key: str = '', separator: str = '.') -> Dict[str, Any]: + """Flatten nested dictionary.""" + items = [] + for k, v in d.items(): + new_key = f"{parent_key}{separator}{k}" if parent_key else k + if 
isinstance(v, dict): + items.extend(self._flatten_dict(v, new_key, separator).items()) + else: + items.append((new_key, v)) + return dict(items) + + def _is_expected_identical(self, key: str, value: str) -> bool: + """Check if key should be identical across languages.""" + if str(value).strip() in ['ltr', 'rtl', 'True', 'False', 'true', 'false']: + return True + return 'language.direction' in key.lower() + + def _prioritize_translation_keys(self, untranslated: Dict[str, str], max_count: int) -> Dict[str, str]: + """Prioritize which keys to translate first based on importance.""" + # Define priority order (higher score = higher priority) + priority_patterns = [ + ('title', 10), + ('header', 9), + ('submit', 8), + ('selectText', 7), + ('prompt', 6), + ('desc', 5), + ('error', 8), + ('warning', 7), + ('save', 8), + ('download', 8), + ('upload', 7), + ] + + scored_keys = [] + for key, value in untranslated.items(): + score = 1 # base score + for pattern, pattern_score in priority_patterns: + if pattern.lower() in key.lower(): + score = max(score, pattern_score) + scored_keys.append((key, value, score)) + + # Sort by score (descending) and return top entries + scored_keys.sort(key=lambda x: x[2], reverse=True) + return {key: value for key, value, _ in scored_keys[:max_count]} + + def _get_key_context(self, key: str) -> str: + """Get contextual information for a translation key.""" + parts = key.split('.') + contexts = { + 'addPageNumbers': 'Feature for adding page numbers to PDFs', + 'compress': 'PDF compression functionality', + 'merge': 'PDF merging functionality', + 'split': 'PDF splitting functionality', + 'rotate': 'PDF rotation functionality', + 'convert': 'File conversion functionality', + 'security': 'PDF security and permissions', + 'metadata': 'PDF metadata editing', + 'watermark': 'Adding watermarks to PDFs', + 'overlay': 'PDF overlay functionality', + 'extract': 'Extracting content from PDFs' + } + + if len(parts) > 0: + main_section = parts[0] + context = contexts.get(main_section, f'Part of {main_section} functionality') + if len(parts) > 1: + context += f', specifically for {parts[-1]}' + return context + + return 'General application text' + + def validate_ai_translations(self, batch_file: Path) -> Dict[str, List[str]]: + """Validate AI translations for common issues.""" + batch_data = self._load_json(batch_file) + issues = {'errors': [], 'warnings': []} + + for lang, translations in batch_data.get('translations', {}).items(): + for key, translation_data in translations.items(): + original = translation_data.get('original', '') + translated = translation_data.get('translated', '') + + if not translated: + issues['errors'].append(f"{lang}.{key}: Missing translation") + continue + + # Check for placeholder preservation + original_placeholders = re.findall(r'\{[^}]+\}', original) + translated_placeholders = re.findall(r'\{[^}]+\}', translated) + + if set(original_placeholders) != set(translated_placeholders): + issues['warnings'].append( + f"{lang}.{key}: Placeholder mismatch - Original: {original_placeholders}, " + f"Translated: {translated_placeholders}" + ) + + # Check if translation is identical to original (might be untranslated) + if translated == original and not self._is_expected_identical(key, original): + issues['warnings'].append(f"{lang}.{key}: Translation identical to original") + + # Check for common AI translation artifacts + artifacts = ['[TRANSLATE]', '[TODO]', 'UNTRANSLATED', '{{', '}}'] + for artifact in artifacts: + if artifact in translated: + 
issues['errors'].append(f"{lang}.{key}: Contains translation artifact: {artifact}") + + return issues + + def apply_ai_batch_translations(self, batch_file: Path, validate: bool = True) -> Dict[str, Any]: + """Apply translations from AI batch file to individual language files.""" + batch_data = self._load_json(batch_file) + results = {'applied': {}, 'errors': [], 'warnings': []} + + if validate: + validation_issues = self.validate_ai_translations(batch_file) + if validation_issues['errors']: + print("Validation errors found. Fix these before applying:") + for error in validation_issues['errors']: + print(f" ERROR: {error}") + return results + + if validation_issues['warnings']: + print("Validation warnings (review recommended):") + for warning in validation_issues['warnings'][:10]: + print(f" WARNING: {warning}") + + for lang, translations in batch_data.get('translations', {}).items(): + lang_file = self.locales_dir / lang / "translation.json" + + # Load existing data or create new + if lang_file.exists(): + lang_data = self._load_json(lang_file) + else: + lang_data = {} + lang_file.parent.mkdir(parents=True, exist_ok=True) + + applied_count = 0 + for key, translation_data in translations.items(): + translated = translation_data.get('translated', '').strip() + if translated and translated != translation_data.get('original', ''): + self._set_nested_value(lang_data, key, translated) + applied_count += 1 + + if applied_count > 0: + self._save_json(lang_data, lang_file) + results['applied'][lang] = applied_count + print(f"Applied {applied_count} translations to {lang}") + + return results + + def _set_nested_value(self, data: Dict, key_path: str, value: Any) -> None: + """Set value in nested dict using dot notation.""" + keys = key_path.split('.') + current = data + for key in keys[:-1]: + if key not in current: + current[key] = {} + elif not isinstance(current[key], dict): + # If the current value is not a dict, we can't nest into it + print(f"Warning: Converting non-dict value at '{key}' to dict to allow nesting") + current[key] = {} + current = current[key] + current[keys[-1]] = value + + def export_for_external_translation(self, languages: List[str], output_format: str = 'csv') -> None: + """Export translations for external translation services.""" + golden_truth = self._load_json(self.golden_truth_file) + golden_flat = self._flatten_dict(golden_truth) + + if output_format == 'csv': + output_file = Path(f'translations_export_{datetime.now().strftime("%Y%m%d")}.csv') + + with open(output_file, 'w', newline='', encoding='utf-8') as csvfile: + fieldnames = ['key', 'context', 'en_GB'] + languages + writer = csv.DictWriter(csvfile, fieldnames=fieldnames) + writer.writeheader() + + for key, en_value in golden_flat.items(): + if self._is_expected_identical(key, en_value): + continue + + row = { + 'key': key, + 'context': self._get_key_context(key), + 'en_GB': en_value + } + + for lang in languages: + lang_file = self.locales_dir / lang / "translation.json" + if lang_file.exists(): + lang_data = self._load_json(lang_file) + lang_flat = self._flatten_dict(lang_data) + value = lang_flat.get(key, '') + if value.startswith('[UNTRANSLATED]'): + value = '' + row[lang] = value + else: + row[lang] = '' + + writer.writerow(row) + + print(f"Exported to {output_file}") + + elif output_format == 'json': + output_file = Path(f'translations_export_{datetime.now().strftime("%Y%m%d")}.json') + export_data = {'languages': languages, 'translations': {}} + + for key, en_value in golden_flat.items(): + if 
self._is_expected_identical(key, en_value): + continue + + export_data['translations'][key] = { + 'en_GB': en_value, + 'context': self._get_key_context(key) + } + + for lang in languages: + lang_file = self.locales_dir / lang / "translation.json" + if lang_file.exists(): + lang_data = self._load_json(lang_file) + lang_flat = self._flatten_dict(lang_data) + value = lang_flat.get(key, '') + if value.startswith('[UNTRANSLATED]'): + value = '' + export_data['translations'][key][lang] = value + + self._save_json(export_data, output_file) + print(f"Exported to {output_file}") + + +def main(): + parser = argparse.ArgumentParser(description='AI Translation Helper') + parser.add_argument('--locales-dir', default='frontend/public/locales', + help='Path to locales directory') + + subparsers = parser.add_subparsers(dest='command', help='Available commands') + + # Create batch command + batch_parser = subparsers.add_parser('create-batch', help='Create AI translation batch file') + batch_parser.add_argument('--languages', nargs='+', required=True, + help='Language codes to include') + batch_parser.add_argument('--output', required=True, help='Output batch file') + batch_parser.add_argument('--max-entries', type=int, default=100, + help='Max entries per language') + + # Validate command + validate_parser = subparsers.add_parser('validate', help='Validate AI translations') + validate_parser.add_argument('batch_file', help='Batch file to validate') + + # Apply command + apply_parser = subparsers.add_parser('apply-batch', help='Apply AI batch translations') + apply_parser.add_argument('batch_file', help='Batch file with translations') + apply_parser.add_argument('--skip-validation', action='store_true', + help='Skip validation before applying') + + # Export command + export_parser = subparsers.add_parser('export', help='Export for external translation') + export_parser.add_argument('--languages', nargs='+', required=True, + help='Language codes to export') + export_parser.add_argument('--format', choices=['csv', 'json'], default='csv', + help='Export format') + + args = parser.parse_args() + + if not args.command: + parser.print_help() + return + + helper = AITranslationHelper(args.locales_dir) + + if args.command == 'create-batch': + output_file = Path(args.output) + helper.create_ai_batch_file(args.languages, output_file, args.max_entries) + + elif args.command == 'validate': + batch_file = Path(args.batch_file) + issues = helper.validate_ai_translations(batch_file) + + if issues['errors']: + print("ERRORS:") + for error in issues['errors']: + print(f" - {error}") + + if issues['warnings']: + print("WARNINGS:") + for warning in issues['warnings']: + print(f" - {warning}") + + if not issues['errors'] and not issues['warnings']: + print("No validation issues found!") + + elif args.command == 'apply-batch': + batch_file = Path(args.batch_file) + results = helper.apply_ai_batch_translations( + batch_file, + validate=not args.skip_validation + ) + + total_applied = sum(results['applied'].values()) + print(f"Total translations applied: {total_applied}") + + elif args.command == 'export': + helper.export_for_external_translation(args.languages, args.format) + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/scripts/translations/compact_translator.py b/scripts/translations/compact_translator.py new file mode 100644 index 000000000..59efcbe9c --- /dev/null +++ b/scripts/translations/compact_translator.py @@ -0,0 +1,177 @@ +#!/usr/bin/env python3 +""" +Compact Translation Extractor 
for Character-Limited AI Translation +Outputs untranslated entries in minimal JSON format with whitespace stripped. +""" + +import json +import sys +from pathlib import Path +import argparse +try: + import tomllib # Python 3.11+ +except ImportError: + try: + import toml as tomllib_fallback + tomllib = None + except ImportError: + tomllib = None + tomllib_fallback = None + + +class CompactTranslationExtractor: + def __init__(self, locales_dir: str = "frontend/public/locales", ignore_file: str = "scripts/ignore_translation.toml"): + self.locales_dir = Path(locales_dir) + self.golden_truth_file = self.locales_dir / "en-GB" / "translation.json" + self.golden_truth = self._load_json(self.golden_truth_file) + self.ignore_file = Path(ignore_file) + self.ignore_patterns = self._load_ignore_patterns() + + def _load_json(self, file_path: Path) -> dict: + """Load JSON file with error handling.""" + try: + with open(file_path, 'r', encoding='utf-8') as f: + return json.load(f) + except FileNotFoundError: + print(f"Error: File not found: {file_path}", file=sys.stderr) + sys.exit(1) + except json.JSONDecodeError as e: + print(f"Error: Invalid JSON in {file_path}: {e}", file=sys.stderr) + sys.exit(1) + + def _load_ignore_patterns(self) -> dict: + """Load ignore patterns from TOML file.""" + if not self.ignore_file.exists(): + return {} + + try: + if tomllib: + with open(self.ignore_file, 'rb') as f: + ignore_data = tomllib.load(f) + elif tomllib_fallback: + ignore_data = tomllib_fallback.load(self.ignore_file) + else: + ignore_data = self._parse_simple_toml() + + return {lang: set(data.get('ignore', [])) for lang, data in ignore_data.items()} + except Exception as e: + print(f"Warning: Could not load ignore file {self.ignore_file}: {e}", file=sys.stderr) + return {} + + def _parse_simple_toml(self) -> dict: + """Simple TOML parser for ignore patterns (fallback).""" + ignore_data = {} + current_section = None + + with open(self.ignore_file, 'r', encoding='utf-8') as f: + for line in f: + line = line.strip() + if not line or line.startswith('#'): + continue + + if line.startswith('[') and line.endswith(']'): + current_section = line[1:-1] + ignore_data[current_section] = {'ignore': []} + elif line.strip().startswith("'") and current_section: + item = line.strip().strip("',") + if item: + ignore_data[current_section]['ignore'].append(item) + + return ignore_data + + def _flatten_dict(self, d: dict, parent_key: str = '', separator: str = '.') -> dict: + """Flatten nested dictionary into dot-notation keys.""" + items = [] + for k, v in d.items(): + new_key = f"{parent_key}{separator}{k}" if parent_key else k + if isinstance(v, dict): + items.extend(self._flatten_dict(v, new_key, separator).items()) + else: + items.append((new_key, str(v))) + return dict(items) + + def get_untranslated_entries(self, language: str) -> dict: + """Get all untranslated entries for a language in compact format.""" + target_file = self.locales_dir / language / "translation.json" + + if not target_file.exists(): + print(f"Error: Translation file not found for language: {language}", file=sys.stderr) + sys.exit(1) + + target_data = self._load_json(target_file) + golden_flat = self._flatten_dict(self.golden_truth) + target_flat = self._flatten_dict(target_data) + + lang_code = language.replace('-', '_') + ignore_set = self.ignore_patterns.get(lang_code, set()) + + # Find missing translations + missing_keys = set(golden_flat.keys()) - set(target_flat.keys()) - ignore_set + + # Find untranslated entries (identical to en-GB or marked 
[UNTRANSLATED]) + untranslated_keys = set() + for key in target_flat: + if key in golden_flat and key not in ignore_set: + target_value = target_flat[key] + golden_value = golden_flat[key] + + if (isinstance(target_value, str) and target_value.startswith("[UNTRANSLATED]")) or \ + (golden_value == target_value and not self._is_expected_identical(key, golden_value)): + untranslated_keys.add(key) + + # Combine and create compact output + all_untranslated = missing_keys | untranslated_keys + + compact_entries = {} + for key in sorted(all_untranslated): + if key in golden_flat: + compact_entries[key] = golden_flat[key] + + return compact_entries + + def _is_expected_identical(self, key: str, value: str) -> bool: + """Check if a key-value pair is expected to be identical across languages.""" + identical_patterns = ['language.direction'] + identical_values = {'ltr', 'rtl', 'True', 'False', 'true', 'false', 'unknown'} + + if value.strip() in identical_values: + return True + + for pattern in identical_patterns: + if pattern in key.lower(): + return True + + return False + + +def main(): + parser = argparse.ArgumentParser(description='Extract untranslated entries in compact format for AI translation') + parser.add_argument('language', help='Language code (e.g., de-DE, fr-FR)') + parser.add_argument('--locales-dir', default='frontend/public/locales', help='Path to locales directory') + parser.add_argument('--ignore-file', default='scripts/ignore_translation.toml', help='Path to ignore patterns file') + parser.add_argument('--max-entries', type=int, help='Maximum number of entries to output') + parser.add_argument('--output', help='Output file (default: stdout)') + + args = parser.parse_args() + + extractor = CompactTranslationExtractor(args.locales_dir, args.ignore_file) + untranslated = extractor.get_untranslated_entries(args.language) + + if args.max_entries: + # Take first N entries + keys = list(untranslated.keys())[:args.max_entries] + untranslated = {k: untranslated[k] for k in keys} + + # Output compact JSON (no indentation, minimal whitespace) + output = json.dumps(untranslated, separators=(',', ':'), ensure_ascii=False) + + if args.output: + with open(args.output, 'w', encoding='utf-8') as f: + f.write(output) + print(f"Extracted {len(untranslated)} untranslated entries to {args.output}", file=sys.stderr) + else: + print(output) + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/scripts/translations/json_beautifier.py b/scripts/translations/json_beautifier.py new file mode 100644 index 000000000..41c65afda --- /dev/null +++ b/scripts/translations/json_beautifier.py @@ -0,0 +1,262 @@ +#!/usr/bin/env python3 +""" +JSON Beautifier and Structure Fixer for Stirling PDF Frontend +Restructures translation JSON files to match en-GB structure and key order exactly. 
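+
+Example (assumed typical invocation; see main() for the full CLI):
+    python scripts/translations/json_beautifier.py --language de-DE --validate-only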
+""" + +import json +import os +import sys +from pathlib import Path +from typing import Dict, Any, List +import argparse +from collections import OrderedDict + + +class JSONBeautifier: + def __init__(self, locales_dir: str = "frontend/public/locales"): + self.locales_dir = Path(locales_dir) + self.golden_truth_file = self.locales_dir / "en-GB" / "translation.json" + self.golden_structure = self._load_json(self.golden_truth_file) + + def _load_json(self, file_path: Path) -> Dict: + """Load JSON file with error handling.""" + try: + with open(file_path, 'r', encoding='utf-8') as f: + return json.load(f, object_pairs_hook=OrderedDict) + except FileNotFoundError: + print(f"Error: File not found: {file_path}") + sys.exit(1) + except json.JSONDecodeError as e: + print(f"Error: Invalid JSON in {file_path}: {e}") + sys.exit(1) + + def _save_json(self, data: Dict, file_path: Path, backup: bool = True) -> None: + """Save JSON file with proper formatting.""" + if backup and file_path.exists(): + backup_path = file_path.with_suffix(f'.backup.restructured.json') + file_path.rename(backup_path) + print(f"Backup created: {backup_path}") + + with open(file_path, 'w', encoding='utf-8') as f: + json.dump(data, f, indent=2, ensure_ascii=False, separators=(',', ': ')) + + def _flatten_dict(self, d: Dict, parent_key: str = '', separator: str = '.') -> Dict[str, Any]: + """Flatten nested dictionary into dot-notation keys.""" + items = [] + for k, v in d.items(): + new_key = f"{parent_key}{separator}{k}" if parent_key else k + if isinstance(v, dict): + items.extend(self._flatten_dict(v, new_key, separator).items()) + else: + items.append((new_key, v)) + return dict(items) + + def _rebuild_structure(self, flat_dict: Dict[str, Any], reference_structure: Dict) -> Dict: + """Rebuild nested structure based on reference structure and available translations.""" + def build_recursive(ref_obj: Any, current_path: str = '') -> Any: + if isinstance(ref_obj, dict): + result = OrderedDict() + for key, value in ref_obj.items(): + new_path = f"{current_path}.{key}" if current_path else key + + if new_path in flat_dict: + # Direct translation exists + if isinstance(value, dict): + # If reference is dict but we have a string, use the string + if isinstance(flat_dict[new_path], str): + result[key] = flat_dict[new_path] + else: + # Recurse into nested structure + result[key] = build_recursive(value, new_path) + else: + result[key] = flat_dict[new_path] + else: + # No direct translation, recurse to check for nested keys + if isinstance(value, dict): + nested_result = build_recursive(value, new_path) + if nested_result: # Only add if we found some translations + result[key] = nested_result + # If no translation found and it's a leaf, skip it + + return result if result else None + else: + # Leaf node - return the translation if it exists + return flat_dict.get(current_path, None) + + return build_recursive(reference_structure) or OrderedDict() + + def restructure_translation_file(self, target_file: Path) -> Dict[str, Any]: + """Restructure a translation file to match en-GB structure exactly.""" + if not target_file.exists(): + print(f"Error: Target file does not exist: {target_file}") + return {} + + # Load the target file + target_data = self._load_json(target_file) + + # Flatten the target translations + flat_target = self._flatten_dict(target_data) + + # Rebuild structure based on golden truth + restructured = self._rebuild_structure(flat_target, self.golden_structure) + + return restructured + + def 
beautify_and_restructure(self, target_file: Path, backup: bool = True) -> Dict[str, Any]:
+        """Main function to beautify and restructure a translation file."""
+        lang_code = target_file.parent.name
+        print(f"Restructuring {lang_code} translation file...")
+
+        # Get the restructured data
+        restructured_data = self.restructure_translation_file(target_file)
+
+        # Save the restructured file
+        self._save_json(restructured_data, target_file, backup)
+
+        # Analyze the results
+        flat_golden = self._flatten_dict(self.golden_structure)
+        flat_restructured = self._flatten_dict(restructured_data)
+
+        total_keys = len(flat_golden)
+        preserved_keys = len(flat_restructured)
+
+        result = {
+            'language': lang_code,
+            'total_reference_keys': total_keys,
+            'preserved_keys': preserved_keys,
+            'structure_match': self._compare_structures(self.golden_structure, restructured_data)
+        }
+
+        print(f"Restructured {lang_code}: {preserved_keys}/{total_keys} keys preserved")
+        return result
+
+    def _compare_structures(self, ref: Dict, target: Dict) -> Dict[str, Any]:
+        """Compare structures between reference and target."""
+        def compare_recursive(r: Any, t: Any, path: str = '') -> List[str]:
+            issues = []
+
+            if isinstance(r, dict) and isinstance(t, dict):
+                # Check for sections present in the reference but missing from the target
+                ref_keys = set(r.keys())
+                target_keys = set(t.keys())
+
+                for section in ref_keys - target_keys:
+                    full_path = f"{path}.{section}" if path else section
+                    issues.append(f"Missing section: {full_path}")
+
+                # Recurse into common sections
+                for key in ref_keys & target_keys:
+                    new_path = f"{path}.{key}" if path else key
+                    issues.extend(compare_recursive(r[key], t[key], new_path))
+
+            return issues
+
+        issues = compare_recursive(ref, target)
+
+        return {
+            'structures_match': len(issues) == 0,
+            'issues': issues[:10],  # Limit to first 10 issues
+            'total_issues': len(issues)
+        }
+
+    def validate_key_order(self, target_file: Path) -> Dict[str, Any]:
+        """Validate that keys appear in the same order as en-GB."""
+        target_data = self._load_json(target_file)
+
+        def get_key_order(obj: Dict, path: str = '') -> List[str]:
+            keys = []
+            for key in obj.keys():
+                new_path = f"{path}.{key}" if path else key
+                keys.append(new_path)
+                if isinstance(obj[key], dict):
+                    keys.extend(get_key_order(obj[key], new_path))
+            return keys
+
+        golden_order = get_key_order(self.golden_structure)
+        target_order = get_key_order(target_data)
+
+        # Find common keys and check their relative order
+        common_keys = set(golden_order) & set(target_order)
+
+        golden_indices = {key: idx for idx, key in enumerate(golden_order) if key in common_keys}
+        target_indices = {key: idx for idx, key in enumerate(target_order) if key in common_keys}
+
+        # Order is preserved when every pair ordered in en-GB keeps that order in the target
+        order_preserved = all(
+            target_indices[key1] < target_indices[key2]
+            for key1 in common_keys for key2 in common_keys
+            if golden_indices[key1] < golden_indices[key2]
+        )
+
+        return {
+            'order_preserved': order_preserved,
+            'common_keys_count': len(common_keys),
+            'golden_keys_count': len(golden_order),
+            'target_keys_count': len(target_order)
+        }
+
+
+def main():
+    parser = argparse.ArgumentParser(description='Beautify and restructure translation JSON files')
+    parser.add_argument('--locales-dir', default='frontend/public/locales',
+                        help='Path to locales directory')
+    parser.add_argument('--language', help='Restructure specific language only')
+    parser.add_argument('--all-languages', action='store_true',
+                        help='Restructure all language files')
parser.add_argument('--no-backup', action='store_true', + help='Skip backup creation') + parser.add_argument('--validate-only', action='store_true', + help='Only validate structure, do not modify files') + + args = parser.parse_args() + + beautifier = JSONBeautifier(args.locales_dir) + + if args.language: + target_file = Path(args.locales_dir) / args.language / "translation.json" + if not target_file.exists(): + print(f"Error: Translation file not found for language: {args.language}") + sys.exit(1) + + if args.validate_only: + order_result = beautifier.validate_key_order(target_file) + print(f"Key order validation for {args.language}:") + print(f" Order preserved: {order_result['order_preserved']}") + print(f" Common keys: {order_result['common_keys_count']}/{order_result['golden_keys_count']}") + else: + result = beautifier.beautify_and_restructure(target_file, backup=not args.no_backup) + print(f"\nResults for {result['language']}:") + print(f" Keys preserved: {result['preserved_keys']}/{result['total_reference_keys']}") + if result['structure_match']['total_issues'] > 0: + print(f" Structure issues: {result['structure_match']['total_issues']}") + for issue in result['structure_match']['issues']: + print(f" - {issue}") + + elif args.all_languages: + results = [] + for lang_dir in Path(args.locales_dir).iterdir(): + if lang_dir.is_dir() and lang_dir.name != "en-GB": + translation_file = lang_dir / "translation.json" + if translation_file.exists(): + if args.validate_only: + order_result = beautifier.validate_key_order(translation_file) + print(f"{lang_dir.name}: Order preserved = {order_result['order_preserved']}") + else: + result = beautifier.beautify_and_restructure(translation_file, backup=not args.no_backup) + results.append(result) + + if not args.validate_only and results: + print(f"\n{'='*60}") + print("RESTRUCTURING SUMMARY") + print(f"{'='*60}") + for result in sorted(results, key=lambda x: x['language']): + print(f"{result['language']}: {result['preserved_keys']}/{result['total_reference_keys']} keys " + f"({result['preserved_keys']/result['total_reference_keys']*100:.1f}%)") + + else: + parser.print_help() + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/scripts/translations/translation_analyzer.py b/scripts/translations/translation_analyzer.py new file mode 100644 index 000000000..9c8315b53 --- /dev/null +++ b/scripts/translations/translation_analyzer.py @@ -0,0 +1,314 @@ +#!/usr/bin/env python3 +""" +Translation Analyzer for Stirling PDF Frontend +Compares language files against en-GB golden truth file. 
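+
+Example (assumed typical invocation; see main() for the full CLI):
+    python scripts/translations/translation_analyzer.py --language fr-FR --summary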
+""" + +import json +import os +import sys +from pathlib import Path +from typing import Dict, List, Set, Tuple +import argparse +try: + import tomllib # Python 3.11+ +except ImportError: + try: + import toml as tomllib_fallback + tomllib = None + except ImportError: + tomllib = None + tomllib_fallback = None + + +class TranslationAnalyzer: + def __init__(self, locales_dir: str = "frontend/public/locales", ignore_file: str = "scripts/ignore_translation.toml"): + self.locales_dir = Path(locales_dir) + self.golden_truth_file = self.locales_dir / "en-GB" / "translation.json" + self.golden_truth = self._load_json(self.golden_truth_file) + self.ignore_file = Path(ignore_file) + self.ignore_patterns = self._load_ignore_patterns() + + def _load_json(self, file_path: Path) -> Dict: + """Load JSON file with error handling.""" + try: + with open(file_path, 'r', encoding='utf-8') as f: + return json.load(f) + except FileNotFoundError: + print(f"Error: File not found: {file_path}") + sys.exit(1) + except json.JSONDecodeError as e: + print(f"Error: Invalid JSON in {file_path}: {e}") + sys.exit(1) + + def _load_ignore_patterns(self) -> Dict[str, Set[str]]: + """Load ignore patterns from TOML file.""" + if not self.ignore_file.exists(): + return {} + + try: + if tomllib: + # Use Python 3.11+ built-in + with open(self.ignore_file, 'rb') as f: + ignore_data = tomllib.load(f) + elif tomllib_fallback: + # Use toml library fallback + ignore_data = tomllib_fallback.load(self.ignore_file) + else: + # Simple parser as fallback + ignore_data = self._parse_simple_toml() + + # Convert lists to sets for faster lookup + return {lang: set(patterns) for lang, data in ignore_data.items() + for patterns in [data.get('ignore', [])] if patterns} + except Exception as e: + print(f"Warning: Could not load ignore file {self.ignore_file}: {e}") + return {} + + def _parse_simple_toml(self) -> Dict: + """Simple TOML parser for ignore patterns (fallback).""" + ignore_data = {} + current_section = None + + with open(self.ignore_file, 'r', encoding='utf-8') as f: + for line in f: + line = line.strip() + if not line or line.startswith('#'): + continue + + if line.startswith('[') and line.endswith(']'): + current_section = line[1:-1] + ignore_data[current_section] = {'ignore': []} + elif line.startswith('ignore = [') and current_section: + # Handle ignore array + continue + elif line.strip().startswith("'") and current_section: + # Extract quoted items + item = line.strip().strip("',") + if item: + ignore_data[current_section]['ignore'].append(item) + + return ignore_data + + def _flatten_dict(self, d: Dict, parent_key: str = '', separator: str = '.') -> Dict[str, str]: + """Flatten nested dictionary into dot-notation keys.""" + items = [] + for k, v in d.items(): + new_key = f"{parent_key}{separator}{k}" if parent_key else k + if isinstance(v, dict): + items.extend(self._flatten_dict(v, new_key, separator).items()) + else: + items.append((new_key, str(v))) + return dict(items) + + def get_all_language_files(self) -> List[Path]: + """Get all translation.json files except en-GB.""" + files = [] + for lang_dir in self.locales_dir.iterdir(): + if lang_dir.is_dir() and lang_dir.name != "en-GB": + translation_file = lang_dir / "translation.json" + if translation_file.exists(): + files.append(translation_file) + return sorted(files) + + def find_missing_translations(self, target_file: Path) -> Set[str]: + """Find keys that exist in en-GB but missing in target file.""" + target_data = self._load_json(target_file) + + golden_flat = 
self._flatten_dict(self.golden_truth) + target_flat = self._flatten_dict(target_data) + + missing = set(golden_flat.keys()) - set(target_flat.keys()) + + # Filter out ignored keys + lang_code = target_file.parent.name.replace('-', '_') + ignore_set = self.ignore_patterns.get(lang_code, set()) + return missing - ignore_set + + def find_untranslated_entries(self, target_file: Path) -> Set[str]: + """Find entries that appear to be untranslated (identical to en-GB).""" + target_data = self._load_json(target_file) + + golden_flat = self._flatten_dict(self.golden_truth) + target_flat = self._flatten_dict(target_data) + + lang_code = target_file.parent.name.replace('-', '_') + ignore_set = self.ignore_patterns.get(lang_code, set()) + + untranslated = set() + for key in target_flat: + if key in golden_flat: + target_value = target_flat[key] + golden_value = golden_flat[key] + + # Check if marked as [UNTRANSLATED] or identical to en-GB + if (isinstance(target_value, str) and target_value.startswith("[UNTRANSLATED]")) or \ + (golden_value == target_value and key not in ignore_set and not self._is_expected_identical(key, golden_value)): + untranslated.add(key) + + return untranslated + + def _is_expected_identical(self, key: str, value: str) -> bool: + """Check if a key-value pair is expected to be identical across languages.""" + # Keys that should be identical across languages + identical_patterns = [ + 'language.direction', + 'true', 'false', + 'unknown' + ] + + # Values that are often identical (numbers, symbols, etc.) + if value.strip() in ['ltr', 'rtl', 'True', 'False']: + return True + + # Check for patterns + for pattern in identical_patterns: + if pattern in key.lower(): + return True + + return False + + def find_extra_translations(self, target_file: Path) -> Set[str]: + """Find keys that exist in target file but not in en-GB.""" + target_data = self._load_json(target_file) + + golden_flat = self._flatten_dict(self.golden_truth) + target_flat = self._flatten_dict(target_data) + + return set(target_flat.keys()) - set(golden_flat.keys()) + + def analyze_file(self, target_file: Path) -> Dict: + """Complete analysis of a single translation file.""" + lang_code = target_file.parent.name + + missing = self.find_missing_translations(target_file) + untranslated = self.find_untranslated_entries(target_file) + extra = self.find_extra_translations(target_file) + + target_data = self._load_json(target_file) + golden_flat = self._flatten_dict(self.golden_truth) + target_flat = self._flatten_dict(target_data) + + # Calculate completion rate excluding ignored keys + lang_code = target_file.parent.name.replace('-', '_') + ignore_set = self.ignore_patterns.get(lang_code, set()) + + relevant_keys = set(golden_flat.keys()) - ignore_set + total_keys = len(relevant_keys) + + # Count keys that exist and are properly translated (not [UNTRANSLATED]) + properly_translated = 0 + for key in relevant_keys: + if key in target_flat: + value = target_flat[key] + if not (isinstance(value, str) and value.startswith("[UNTRANSLATED]")): + if key not in untranslated: # Not identical to en-GB (unless expected) + properly_translated += 1 + + completion_rate = (properly_translated / total_keys) * 100 if total_keys > 0 else 0 + + return { + 'language': lang_code, + 'file': target_file, + 'missing_count': len(missing), + 'missing_keys': sorted(missing), + 'untranslated_count': len(untranslated), + 'untranslated_keys': sorted(untranslated), + 'extra_count': len(extra), + 'extra_keys': sorted(extra), + 'total_keys': total_keys, + 
'completion_rate': completion_rate + } + + def analyze_all_files(self) -> List[Dict]: + """Analyze all translation files.""" + results = [] + for file_path in self.get_all_language_files(): + results.append(self.analyze_file(file_path)) + return sorted(results, key=lambda x: x['language']) + + +def main(): + parser = argparse.ArgumentParser(description='Analyze translation files against en-GB golden truth') + parser.add_argument('--locales-dir', default='frontend/public/locales', + help='Path to locales directory') + parser.add_argument('--ignore-file', default='scripts/ignore_translation.toml', + help='Path to ignore patterns TOML file') + parser.add_argument('--language', help='Analyze specific language only') + parser.add_argument('--missing-only', action='store_true', + help='Show only missing translations') + parser.add_argument('--untranslated-only', action='store_true', + help='Show only untranslated entries') + parser.add_argument('--summary', action='store_true', + help='Show summary statistics only') + parser.add_argument('--format', choices=['text', 'json'], default='text', + help='Output format') + + args = parser.parse_args() + + analyzer = TranslationAnalyzer(args.locales_dir, args.ignore_file) + + if args.language: + target_file = Path(args.locales_dir) / args.language / "translation.json" + if not target_file.exists(): + print(f"Error: Translation file not found for language: {args.language}") + sys.exit(1) + results = [analyzer.analyze_file(target_file)] + else: + results = analyzer.analyze_all_files() + + if args.format == 'json': + print(json.dumps(results, indent=2, default=str)) + return + + # Text format output + for result in results: + lang = result['language'] + print(f"\n{'='*60}") + print(f"Language: {lang}") + print(f"File: {result['file']}") + print(f"Completion Rate: {result['completion_rate']:.1f}%") + print(f"Total Keys in en-GB: {result['total_keys']}") + + if not args.summary: + if not args.untranslated_only: + print(f"\nMissing Translations ({result['missing_count']}):") + for key in result['missing_keys'][:10]: # Show first 10 + print(f" - {key}") + if len(result['missing_keys']) > 10: + print(f" ... and {len(result['missing_keys']) - 10} more") + + if not args.missing_only: + print(f"\nUntranslated Entries ({result['untranslated_count']}):") + for key in result['untranslated_keys'][:10]: # Show first 10 + print(f" - {key}") + if len(result['untranslated_keys']) > 10: + print(f" ... and {len(result['untranslated_keys']) - 10} more") + + if result['extra_count'] > 0: + print(f"\nExtra Keys Not in en-GB ({result['extra_count']}):") + for key in result['extra_keys'][:5]: + print(f" - {key}") + if len(result['extra_keys']) > 5: + print(f" ... 
+
+    print(f"\n{'='*60}")
+    print("SUMMARY")
+    print(f"{'='*60}")
+    avg_completion = sum(r['completion_rate'] for r in results) / len(results) if results else 0
+    print(f"Average Completion Rate: {avg_completion:.1f}%")
+    print(f"Languages Analyzed: {len(results)}")
+
+    # Top languages by completion
+    sorted_by_completion = sorted(results, key=lambda x: x['completion_rate'], reverse=True)
+    print("\nTop 5 Most Complete Languages:")
+    for result in sorted_by_completion[:5]:
+        print(f"  {result['language']}: {result['completion_rate']:.1f}%")
+
+    print("\nBottom 5 Languages Needing Attention:")
+    for result in sorted_by_completion[-5:]:
+        print(f"  {result['language']}: {result['completion_rate']:.1f}% ({result['missing_count']} missing, {result['untranslated_count']} untranslated)")
+
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
diff --git a/scripts/translations/translation_merger.py b/scripts/translations/translation_merger.py
new file mode 100644
index 000000000..84884d946
--- /dev/null
+++ b/scripts/translations/translation_merger.py
@@ -0,0 +1,371 @@
+#!/usr/bin/env python3
+"""
+Translation Merger for Stirling PDF Frontend
+Merges missing translations from en-GB into target language files.
+Useful for AI-assisted translation workflows.
+"""
+
+import json
+import os
+import sys
+from pathlib import Path
+from typing import Dict, List, Optional, Set, Tuple, Any
+import argparse
+import shutil
+from datetime import datetime
+
+try:
+    import tomllib  # Python 3.11+
+except ImportError:
+    try:
+        import toml as tomllib_fallback
+        tomllib = None
+    except ImportError:
+        tomllib = None
+        tomllib_fallback = None
+
+
+class TranslationMerger:
+    def __init__(self, locales_dir: str = "frontend/public/locales", ignore_file: str = "scripts/ignore_translation.toml"):
+        self.locales_dir = Path(locales_dir)
+        self.golden_truth_file = self.locales_dir / "en-GB" / "translation.json"
+        self.golden_truth = self._load_json(self.golden_truth_file)
+        self.ignore_file = Path(ignore_file)
+        self.ignore_patterns = self._load_ignore_patterns()
+
+    def _load_json(self, file_path: Path) -> Dict:
+        """Load JSON file with error handling."""
+        try:
+            with open(file_path, 'r', encoding='utf-8') as f:
+                return json.load(f)
+        except FileNotFoundError:
+            print(f"Error: File not found: {file_path}")
+            sys.exit(1)
+        except json.JSONDecodeError as e:
+            print(f"Error: Invalid JSON in {file_path}: {e}")
+            sys.exit(1)
+
+    def _save_json(self, data: Dict, file_path: Path, backup: bool = True) -> None:
+        """Save JSON file with backup option."""
+        if backup and file_path.exists():
+            backup_path = file_path.with_suffix(f'.backup.{datetime.now().strftime("%Y%m%d_%H%M%S")}.json')
+            shutil.copy2(file_path, backup_path)
+            print(f"Backup created: {backup_path}")
+
+        with open(file_path, 'w', encoding='utf-8') as f:
+            json.dump(data, f, indent=2, ensure_ascii=False)
+
+    def _load_ignore_patterns(self) -> Dict[str, Set[str]]:
+        """Load ignore patterns from TOML file."""
+        if not self.ignore_file.exists():
+            return {}
+
+        try:
+            # Simple parser for ignore patterns
+            ignore_data = {}
+            current_section = None
+
+            with open(self.ignore_file, 'r', encoding='utf-8') as f:
+                for line in f:
+                    line = line.strip()
+                    if not line or line.startswith('#'):
+                        continue
+
+                    if line.startswith('[') and line.endswith(']'):
+                        current_section = line[1:-1]
+                        ignore_data[current_section] = set()
+                    elif line.strip().startswith("'") and current_section:
+                        # Extract quoted items
+                        item = line.strip().strip("',")
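+                        # e.g. a line like  'language.direction',  yields
+                        # item == 'language.direction' for the current [section]
+                        # (sections are language codes with underscores)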
+                        if item:
+                            ignore_data[current_section].add(item)
+
+            return ignore_data
+        except Exception as e:
+            print(f"Warning: Could not load ignore file {self.ignore_file}: {e}")
+            return {}
+
+    def _get_nested_value(self, data: Dict, key_path: str) -> Any:
+        """Get value from nested dict using dot notation."""
+        keys = key_path.split('.')
+        current = data
+        for key in keys:
+            if isinstance(current, dict) and key in current:
+                current = current[key]
+            else:
+                return None
+        return current
+
+    def _set_nested_value(self, data: Dict, key_path: str, value: Any) -> None:
+        """Set value in nested dict using dot notation."""
+        keys = key_path.split('.')
+        current = data
+        for key in keys[:-1]:
+            if key not in current:
+                current[key] = {}
+            elif not isinstance(current[key], dict):
+                # If the current value is not a dict, we can't nest into it
+                # This handles cases where a key exists as a string but we need to make it a dict
+                print(f"Warning: Converting non-dict value at '{key}' to dict to allow nesting")
+                current[key] = {}
+            current = current[key]
+        current[keys[-1]] = value
+
+    def _flatten_dict(self, d: Dict, parent_key: str = '', separator: str = '.') -> Dict[str, Any]:
+        """Flatten nested dictionary into dot-notation keys."""
+        items = []
+        for k, v in d.items():
+            new_key = f"{parent_key}{separator}{k}" if parent_key else k
+            if isinstance(v, dict):
+                items.extend(self._flatten_dict(v, new_key, separator).items())
+            else:
+                items.append((new_key, v))
+        return dict(items)
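+    # For example, {"home": {"title": "Home"}} flattens to {"home.title": "Home"};
+    # these dot-notation keys are what the ignore file entries and reports refer to.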
+    def get_missing_keys(self, target_file: Path) -> List[str]:
+        """Get list of missing keys in target file."""
+        lang_code = target_file.parent.name.replace('-', '_')
+        ignore_set = self.ignore_patterns.get(lang_code, set())
+
+        if not target_file.exists():
+            golden_keys = set(self._flatten_dict(self.golden_truth).keys())
+            return sorted(golden_keys - ignore_set)
+
+        target_data = self._load_json(target_file)
+        golden_flat = self._flatten_dict(self.golden_truth)
+        target_flat = self._flatten_dict(target_data)
+
+        missing = set(golden_flat.keys()) - set(target_flat.keys())
+        return sorted(missing - ignore_set)
+
+    def add_missing_translations(self, target_file: Path, keys_to_add: Optional[List[str]] = None,
+                                 mark_untranslated: bool = True) -> Dict:
+        """Add missing translations from en-GB to target file."""
+        if not target_file.exists():
+            target_data = {}
+        else:
+            target_data = self._load_json(target_file)
+
+        golden_flat = self._flatten_dict(self.golden_truth)
+        missing_keys = keys_to_add or self.get_missing_keys(target_file)
+
+        added_count = 0
+        for key in missing_keys:
+            if key in golden_flat:
+                value = golden_flat[key]
+                if mark_untranslated and isinstance(value, str):
+                    # Mark as untranslated for AI to translate later
+                    value = f"[UNTRANSLATED] {value}"
+
+                self._set_nested_value(target_data, key, value)
+                added_count += 1
+
+        return {
+            'added_count': added_count,
+            'missing_keys': missing_keys,
+            'data': target_data
+        }
+
+    def extract_untranslated_entries(self, target_file: Path, output_file: Optional[Path] = None) -> Dict:
+        """Extract entries marked as untranslated or identical to en-GB for AI translation."""
+        if not target_file.exists():
+            print(f"Error: Target file does not exist: {target_file}")
+            return {}
+
+        target_data = self._load_json(target_file)
+        golden_flat = self._flatten_dict(self.golden_truth)
+        target_flat = self._flatten_dict(target_data)
+
+        untranslated_entries = {}
+
+        for key, value in target_flat.items():
+            if key in golden_flat:
+                golden_value = golden_flat[key]
+
+                # Check if marked as untranslated
+                if isinstance(value, str) and value.startswith("[UNTRANSLATED]"):
+                    untranslated_entries[key] = {
+                        'original': golden_value,
+                        'current': value,
+                        'reason': 'marked_untranslated'
+                    }
+                # Check if identical to golden (and should be translated)
+                elif value == golden_value and not self._is_expected_identical(key, value):
+                    untranslated_entries[key] = {
+                        'original': golden_value,
+                        'current': value,
+                        'reason': 'identical_to_english'
+                    }
+
+        if output_file:
+            with open(output_file, 'w', encoding='utf-8') as f:
+                json.dump(untranslated_entries, f, indent=2, ensure_ascii=False)
+
+        return untranslated_entries
+
+    def _is_expected_identical(self, key: str, value: str) -> bool:
+        """Check if a key-value pair is expected to be identical across languages."""
+        identical_patterns = [
+            'language.direction',
+        ]
+
+        if str(value).strip() in ['ltr', 'rtl', 'True', 'False', 'true', 'false']:
+            return True
+
+        for pattern in identical_patterns:
+            if pattern in key.lower():
+                return True
+
+        return False
+
+    def apply_translations(self, target_file: Path, translations: Dict[str, str],
+                           backup: bool = True) -> Dict:
+        """Apply provided translations to target file."""
+        if not target_file.exists():
+            print(f"Error: Target file does not exist: {target_file}")
+            return {'success': False, 'error': 'File not found'}
+
+        target_data = self._load_json(target_file)
+        applied_count = 0
+        errors = []
+
+        for key, translation in translations.items():
+            try:
+                # Remove [UNTRANSLATED] marker if present
+                if translation.startswith("[UNTRANSLATED]"):
+                    translation = translation.replace("[UNTRANSLATED]", "").strip()
+
+                self._set_nested_value(target_data, key, translation)
+                applied_count += 1
+            except Exception as e:
+                errors.append(f"Error setting {key}: {e}")
+
+        if applied_count > 0:
+            self._save_json(target_data, target_file, backup)
+
+        return {
+            'success': True,
+            'applied_count': applied_count,
+            'errors': errors,
+            'data': target_data
+        }
+
+    def create_translation_template(self, target_file: Path, output_file: Path) -> None:
+        """Create a template file for AI translation with context."""
+        untranslated = self.extract_untranslated_entries(target_file)
+
+        template = {
+            'metadata': {
+                'source_language': 'en-GB',
+                'target_language': target_file.parent.name,
+                'total_entries': len(untranslated),
+                'created_at': datetime.now().isoformat(),
+                'instructions': 'Translate the "original" values to the target language. Keep the same keys.'
+            },
+            'translations': {}
+        }
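+        # Each translations entry written below has the shape:
+        #   {"original": "<en-GB text>", "translated": "", "context": "...",
+        #    "reason": "marked_untranslated" or "identical_to_english"}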
+        for key, entry in untranslated.items():
+            template['translations'][key] = {
+                'original': entry['original'],
+                'translated': '',  # AI should fill this
+                'context': self._get_context_for_key(key),
+                'reason': entry['reason']
+            }
+
+        with open(output_file, 'w', encoding='utf-8') as f:
+            json.dump(template, f, indent=2, ensure_ascii=False)
+
+        print(f"Translation template created: {output_file}")
+        print(f"Contains {len(untranslated)} entries to translate")
+
+    def _get_context_for_key(self, key: str) -> str:
+        """Get context information for a translation key."""
+        parts = key.split('.')
+        if len(parts) >= 2:
+            return f"Section: {parts[0]}, Property: {parts[-1]}"
+        return f"Property: {parts[-1]}"
+
+
+def main():
+    parser = argparse.ArgumentParser(description='Merge and manage translation files')
+    parser.add_argument('--locales-dir', default='frontend/public/locales',
+                        help='Path to locales directory')
+    parser.add_argument('--ignore-file', default='scripts/ignore_translation.toml',
+                        help='Path to ignore patterns TOML file')
+    parser.add_argument('language', help='Target language code (e.g., fr-FR)')
+
+    subparsers = parser.add_subparsers(dest='command', help='Available commands')
+
+    # Add missing command
+    add_parser = subparsers.add_parser('add-missing', help='Add missing translations from en-GB')
+    add_parser.add_argument('--no-backup', action='store_true', help='Skip backup creation')
+    # store_true combined with default=True could never be switched off,
+    # so expose the opt-out flag instead (default remains True)
+    add_parser.add_argument('--no-mark-untranslated', dest='mark_untranslated', action='store_false',
+                            help='Do not mark added translations as [UNTRANSLATED]')
+
+    # Extract untranslated command
+    extract_parser = subparsers.add_parser('extract-untranslated', help='Extract untranslated entries')
+    extract_parser.add_argument('--output', help='Output file path')
+
+    # Create template command
+    template_parser = subparsers.add_parser('create-template', help='Create AI translation template')
+    template_parser.add_argument('--output', required=True, help='Output template file path')
+
+    # Apply translations command
+    apply_parser = subparsers.add_parser('apply-translations', help='Apply translations from JSON file')
+    apply_parser.add_argument('--translations-file', required=True, help='JSON file with translations')
+    apply_parser.add_argument('--no-backup', action='store_true', help='Skip backup creation')
+
+    args = parser.parse_args()
+
+    if not args.command:
+        parser.print_help()
+        return
+
+    merger = TranslationMerger(args.locales_dir, args.ignore_file)
+    target_file = Path(args.locales_dir) / args.language / "translation.json"
+
+    if args.command == 'add-missing':
+        print(f"Adding missing translations to {args.language}...")
+        result = merger.add_missing_translations(
+            target_file,
+            mark_untranslated=args.mark_untranslated
+        )
+
+        merger._save_json(result['data'], target_file, backup=not args.no_backup)
+        print(f"Added {result['added_count']} missing translations")
+
+    elif args.command == 'extract-untranslated':
+        output_file = Path(args.output) if args.output else target_file.with_suffix('.untranslated.json')
+        untranslated = merger.extract_untranslated_entries(target_file, output_file)
+        print(f"Extracted {len(untranslated)} untranslated entries to {output_file}")
+
+    elif args.command == 'create-template':
+        output_file = Path(args.output)
+        merger.create_translation_template(target_file, output_file)
+
+    elif args.command == 'apply-translations':
+        with open(args.translations_file, 'r', encoding='utf-8') as f:
+            translations_data = json.load(f)
+
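+        # Accepts either the template produced by create-template (entries under
+        # a top-level "translations" key) or a flat {key: translation} mapping.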
+        # Extract translations from template format or simple dict
+        if 'translations' in translations_data:
+            translations = {k: v['translated'] for k, v in translations_data['translations'].items()
+                            if v.get('translated')}
+        else:
+            translations = translations_data
+
+        result = merger.apply_translations(target_file, translations, backup=not args.no_backup)
+
+        if result['success']:
+            print(f"Applied {result['applied_count']} translations")
+            if result['errors']:
+                print(f"Errors: {len(result['errors'])}")
+                for error in result['errors'][:5]:
+                    print(f"  - {error}")
+        else:
+            print(f"Failed: {result.get('error', 'Unknown error')}")
+
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file