Stirling-PDF/scripts/translations/ai_translation_helper.py
#!/usr/bin/env python3
"""
AI Translation Helper for Stirling PDF Frontend
Provides utilities for AI-assisted translation workflows including
batch processing, quality checks, and integration helpers.
"""
import argparse
import csv
import json
import re
from datetime import datetime
from pathlib import Path
from typing import Any, Dict, List

class AITranslationHelper:
    def __init__(self, locales_dir: str = "frontend/public/locales"):
        self.locales_dir = Path(locales_dir)
        self.golden_truth_file = self.locales_dir / "en-GB" / "translation.json"

    def _load_json(self, file_path: Path) -> Dict:
        """Load JSON file with error handling."""
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                return json.load(f)
        except (FileNotFoundError, json.JSONDecodeError) as e:
            print(f"Error loading {file_path}: {e}")
            return {}

    def _save_json(self, data: Dict, file_path: Path) -> None:
        """Save JSON file."""
        with open(file_path, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=2, ensure_ascii=False)

    def create_ai_batch_file(self, languages: List[str], output_file: Path,
                             max_entries_per_language: int = 50) -> None:
        """Create a batch file for AI translation with multiple languages."""
        golden_truth = self._load_json(self.golden_truth_file)
        batch_data = {
            'metadata': {
                'created_at': datetime.now().isoformat(),
                'source_language': 'en-GB',
                'target_languages': languages,
                'max_entries_per_language': max_entries_per_language,
                'instructions': {
                    'format': 'Translate each entry maintaining JSON structure and placeholder variables like {n}, {total}, {filename}',
                    'context': 'This is for a PDF manipulation tool. Keep technical terms consistent.',
                    'placeholders': 'Preserve all placeholders: {n}, {total}, {filename}, etc.',
                    'style': 'Keep translations concise and user-friendly'
                }
            },
            'translations': {}
        }

        for lang in languages:
            lang_file = self.locales_dir / lang / "translation.json"
            if not lang_file.exists():
                # Create empty translation structure
                lang_data = {}
            else:
                lang_data = self._load_json(lang_file)

            # Find untranslated entries
            untranslated = self._find_untranslated_entries(golden_truth, lang_data)

            # Limit entries if specified, prioritizing by key importance
            if max_entries_per_language and len(untranslated) > max_entries_per_language:
                untranslated = self._prioritize_translation_keys(untranslated, max_entries_per_language)

            batch_data['translations'][lang] = {}
            for key, value in untranslated.items():
                batch_data['translations'][lang][key] = {
                    'original': value,
                    'translated': '',  # AI fills this
                    'context': self._get_key_context(key)
                }

        self._save_json(batch_data, output_file)
        total_entries = sum(len(entries) for entries in batch_data['translations'].values())
        print(f"Created AI batch file: {output_file}")
        print(f"Total entries to translate: {total_entries}")

    def _find_untranslated_entries(self, golden_truth: Dict, lang_data: Dict) -> Dict[str, str]:
        """Find entries that need translation."""
        golden_flat = self._flatten_dict(golden_truth)
        lang_flat = self._flatten_dict(lang_data)
        untranslated = {}
        # An entry needs translation if it is missing, identical to the English
        # source, or explicitly marked [UNTRANSLATED]
        for key, value in golden_flat.items():
            if (key not in lang_flat or
                    lang_flat[key] == value or
                    (isinstance(lang_flat[key], str) and lang_flat[key].startswith("[UNTRANSLATED]"))):
                if not self._is_expected_identical(key, value):
                    untranslated[key] = value
        return untranslated

    def _flatten_dict(self, d: Dict, parent_key: str = '', separator: str = '.') -> Dict[str, Any]:
        """Flatten nested dictionary."""
        items = []
        for k, v in d.items():
            new_key = f"{parent_key}{separator}{k}" if parent_key else k
            if isinstance(v, dict):
                items.extend(self._flatten_dict(v, new_key, separator).items())
            else:
                items.append((new_key, v))
        return dict(items)
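
    # A quick sanity check of the flattening behaviour (illustrative values):
    #   _flatten_dict({'compress': {'title': 'Compress'}})
    #   -> {'compress.title': 'Compress'}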

    def _is_expected_identical(self, key: str, value: str) -> bool:
        """Check whether a key/value pair is expected to be identical across languages."""
        if str(value).strip() in ['ltr', 'rtl', 'True', 'False', 'true', 'false']:
            return True
        return 'language.direction' in key.lower()

    def _prioritize_translation_keys(self, untranslated: Dict[str, str], max_count: int) -> Dict[str, str]:
        """Prioritize which keys to translate first based on importance."""
        # Define priority order (higher score = higher priority)
        priority_patterns = [
            ('title', 10),
            ('header', 9),
            ('submit', 8),
            ('selectText', 7),
            ('prompt', 6),
            ('desc', 5),
            ('error', 8),
            ('warning', 7),
            ('save', 8),
            ('download', 8),
            ('upload', 7),
        ]
        scored_keys = []
        for key, value in untranslated.items():
            score = 1  # base score
            for pattern, pattern_score in priority_patterns:
                if pattern.lower() in key.lower():
                    score = max(score, pattern_score)
            scored_keys.append((key, value, score))
        # Sort by score (descending) and return top entries
        scored_keys.sort(key=lambda x: x[2], reverse=True)
        return {key: value for key, value, _ in scored_keys[:max_count]}
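
    # For example, a hypothetical key 'compress.error.title' matches both
    # 'error' (8) and 'title' (10), so it scores 10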

    def _get_key_context(self, key: str) -> str:
        """Get contextual information for a translation key."""
        contexts = {
            'addPageNumbers': 'Feature for adding page numbers to PDFs',
            'compress': 'PDF compression functionality',
            'merge': 'PDF merging functionality',
            'split': 'PDF splitting functionality',
            'rotate': 'PDF rotation functionality',
            'convert': 'File conversion functionality',
            'security': 'PDF security and permissions',
            'metadata': 'PDF metadata editing',
            'watermark': 'Adding watermarks to PDFs',
            'overlay': 'PDF overlay functionality',
            'extract': 'Extracting content from PDFs'
        }
        # str.split always returns at least one element, so parts[0] is safe
        parts = key.split('.')
        main_section = parts[0]
        context = contexts.get(main_section, f'Part of {main_section} functionality')
        if len(parts) > 1:
            context += f', specifically for {parts[-1]}'
        return context

    def validate_ai_translations(self, batch_file: Path) -> Dict[str, List[str]]:
        """Validate AI translations for common issues."""
        batch_data = self._load_json(batch_file)
        issues = {'errors': [], 'warnings': []}
        for lang, translations in batch_data.get('translations', {}).items():
            for key, translation_data in translations.items():
                original = translation_data.get('original', '')
                translated = translation_data.get('translated', '')
                if not translated:
                    issues['errors'].append(f"{lang}.{key}: Missing translation")
                    continue

                # Check for placeholder preservation
                original_placeholders = re.findall(r'\{[^}]+\}', original)
                translated_placeholders = re.findall(r'\{[^}]+\}', translated)
                if set(original_placeholders) != set(translated_placeholders):
                    issues['warnings'].append(
                        f"{lang}.{key}: Placeholder mismatch - Original: {original_placeholders}, "
                        f"Translated: {translated_placeholders}"
                    )

                # Check if translation is identical to original (might be untranslated)
                if translated == original and not self._is_expected_identical(key, original):
                    issues['warnings'].append(f"{lang}.{key}: Translation identical to original")

                # Check for common AI translation artifacts
                artifacts = ['[TRANSLATE]', '[TODO]', 'UNTRANSLATED', '{{', '}}']
                for artifact in artifacts:
                    if artifact in translated:
                        issues['errors'].append(f"{lang}.{key}: Contains translation artifact: {artifact}")
        return issues
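
    # Example of what the placeholder check flags (illustrative strings):
    #   original:   "Page {n} of {total}"
    #   translated: "Seite {n} von {gesamt}"
    # -> warning, because {total} was dropped and {gesamt} was introduced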

    def apply_ai_batch_translations(self, batch_file: Path, validate: bool = True) -> Dict[str, Any]:
        """Apply translations from AI batch file to individual language files."""
        batch_data = self._load_json(batch_file)
        results = {'applied': {}, 'errors': [], 'warnings': []}

        if validate:
            validation_issues = self.validate_ai_translations(batch_file)
            if validation_issues['errors']:
                print("Validation errors found. Fix these before applying:")
                for error in validation_issues['errors']:
                    print(f"  ERROR: {error}")
                return results
            if validation_issues['warnings']:
                print("Validation warnings (review recommended):")
                for warning in validation_issues['warnings'][:10]:
                    print(f"  WARNING: {warning}")

        for lang, translations in batch_data.get('translations', {}).items():
            lang_file = self.locales_dir / lang / "translation.json"
            # Load existing data or create new
            if lang_file.exists():
                lang_data = self._load_json(lang_file)
            else:
                lang_data = {}
                lang_file.parent.mkdir(parents=True, exist_ok=True)

            applied_count = 0
            for key, translation_data in translations.items():
                translated = translation_data.get('translated', '').strip()
                if translated and translated != translation_data.get('original', ''):
                    self._set_nested_value(lang_data, key, translated)
                    applied_count += 1

            if applied_count > 0:
                self._save_json(lang_data, lang_file)
                results['applied'][lang] = applied_count
                print(f"Applied {applied_count} translations to {lang}")
        return results

    def _set_nested_value(self, data: Dict, key_path: str, value: Any) -> None:
        """Set value in nested dict using dot notation."""
        keys = key_path.split('.')
        current = data
        for key in keys[:-1]:
            if key not in current:
                current[key] = {}
            elif not isinstance(current[key], dict):
                # If the current value is not a dict, we can't nest into it
                print(f"Warning: Converting non-dict value at '{key}' to dict to allow nesting")
                current[key] = {}
            current = current[key]
        current[keys[-1]] = value
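
    # For example (illustrative values):
    #   data = {}
    #   helper._set_nested_value(data, 'compress.title', 'Komprimieren')
    #   data == {'compress': {'title': 'Komprimieren'}}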

    def export_for_external_translation(self, languages: List[str], output_format: str = 'csv') -> None:
        """Export translations for external translation services."""
        golden_truth = self._load_json(self.golden_truth_file)
        golden_flat = self._flatten_dict(golden_truth)

        # Load and flatten each target language once, instead of once per key
        lang_flats = {}
        for lang in languages:
            lang_file = self.locales_dir / lang / "translation.json"
            lang_flats[lang] = self._flatten_dict(self._load_json(lang_file)) if lang_file.exists() else {}

        if output_format == 'csv':
            output_file = Path(f'translations_export_{datetime.now().strftime("%Y%m%d")}.csv')
            with open(output_file, 'w', newline='', encoding='utf-8') as csvfile:
                fieldnames = ['key', 'context', 'en_GB'] + languages
                writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
                writer.writeheader()
                for key, en_value in golden_flat.items():
                    if self._is_expected_identical(key, en_value):
                        continue
                    row = {
                        'key': key,
                        'context': self._get_key_context(key),
                        'en_GB': en_value
                    }
                    for lang in languages:
                        value = lang_flats[lang].get(key, '')
                        # Values can be non-strings; those and untranslated
                        # markers export as empty cells
                        if not isinstance(value, str) or value.startswith('[UNTRANSLATED]'):
                            value = ''
                        row[lang] = value
                    writer.writerow(row)
            print(f"Exported to {output_file}")
        elif output_format == 'json':
            output_file = Path(f'translations_export_{datetime.now().strftime("%Y%m%d")}.json')
            export_data = {'languages': languages, 'translations': {}}
            for key, en_value in golden_flat.items():
                if self._is_expected_identical(key, en_value):
                    continue
                export_data['translations'][key] = {
                    'en_GB': en_value,
                    'context': self._get_key_context(key)
                }
                for lang in languages:
                    value = lang_flats[lang].get(key, '')
                    if not isinstance(value, str) or value.startswith('[UNTRANSLATED]'):
                        value = ''
                    export_data['translations'][key][lang] = value
            self._save_json(export_data, output_file)
            print(f"Exported to {output_file}")


def main():
    parser = argparse.ArgumentParser(description='AI Translation Helper')
    parser.add_argument('--locales-dir', default='frontend/public/locales',
                        help='Path to locales directory')
    subparsers = parser.add_subparsers(dest='command', help='Available commands')

    # Create batch command
    batch_parser = subparsers.add_parser('create-batch', help='Create AI translation batch file')
    batch_parser.add_argument('--languages', nargs='+', required=True,
                              help='Language codes to include')
    batch_parser.add_argument('--output', required=True, help='Output batch file')
    batch_parser.add_argument('--max-entries', type=int, default=100,
                              help='Max entries per language')

    # Validate command
    validate_parser = subparsers.add_parser('validate', help='Validate AI translations')
    validate_parser.add_argument('batch_file', help='Batch file to validate')

    # Apply command
    apply_parser = subparsers.add_parser('apply-batch', help='Apply AI batch translations')
    apply_parser.add_argument('batch_file', help='Batch file with translations')
    apply_parser.add_argument('--skip-validation', action='store_true',
                              help='Skip validation before applying')

    # Export command
    export_parser = subparsers.add_parser('export', help='Export for external translation')
    export_parser.add_argument('--languages', nargs='+', required=True,
                               help='Language codes to export')
    export_parser.add_argument('--format', choices=['csv', 'json'], default='csv',
                               help='Export format')

    args = parser.parse_args()
    if not args.command:
        parser.print_help()
        return

    helper = AITranslationHelper(args.locales_dir)
    if args.command == 'create-batch':
        output_file = Path(args.output)
        helper.create_ai_batch_file(args.languages, output_file, args.max_entries)
    elif args.command == 'validate':
        batch_file = Path(args.batch_file)
        issues = helper.validate_ai_translations(batch_file)
        if issues['errors']:
            print("ERRORS:")
            for error in issues['errors']:
                print(f"  - {error}")
        if issues['warnings']:
            print("WARNINGS:")
            for warning in issues['warnings']:
                print(f"  - {warning}")
        if not issues['errors'] and not issues['warnings']:
            print("No validation issues found!")
    elif args.command == 'apply-batch':
        batch_file = Path(args.batch_file)
        results = helper.apply_ai_batch_translations(
            batch_file,
            validate=not args.skip_validation
        )
        total_applied = sum(results['applied'].values())
        print(f"Total translations applied: {total_applied}")
    elif args.command == 'export':
        helper.export_for_external_translation(args.languages, args.format)


if __name__ == "__main__":
    main()