# Stirling-PDF/scripts/translations/validate_json_structure.py

#!/usr/bin/env python3
"""
Validate JSON structure and formatting of translation files.

Checks for:
- Valid JSON syntax
- Consistent key structure with en-GB
- Missing keys
- Extra keys not in en-GB
- Malformed entries

Usage:
    python scripts/translations/validate_json_structure.py [--language LANG] [--verbose] [--json]
"""

import json
import sys
from pathlib import Path
from typing import Dict, List, Set
import argparse


def get_all_keys(d: dict, parent_key: str = '', sep: str = '.') -> Set[str]:
    """Collect every key path of a nested dict as a ``sep``-joined string.

    Both intermediate (dict-valued) and leaf keys are included. Only values
    that are ``dict`` instances are descended into; any other value (lists
    included) ends the path.

    Args:
        d: The mapping to walk.
        parent_key: Prefix prepended to every top-level key; an empty prefix
            means the key is used as-is.
        sep: Separator placed between path segments.

    Returns:
        The set of all key paths found.
    """
    collected: Set[str] = set()
    # Explicit work stack of (prefix, mapping) pairs instead of recursion.
    pending = [(parent_key, d)]
    while pending:
        prefix, mapping = pending.pop()
        for name, value in mapping.items():
            path = f"{prefix}{sep}{name}" if prefix else name
            collected.add(path)
            if isinstance(value, dict):
                pending.append((path, value))
    return collected


def validate_json_file(file_path: Path) -> tuple[bool, str]:
    """Check whether ``file_path`` can be read as UTF-8 and parsed as JSON.

    Args:
        file_path: Location of the file to check.

    Returns:
        ``(True, "Valid JSON")`` on success; otherwise ``(False, message)``
        where the message gives the parse position for JSON syntax errors, or
        the exception text for any other failure while reading.
    """
    try:
        with open(file_path, mode='r', encoding='utf-8') as handle:
            json.loads(handle.read())
    except json.JSONDecodeError as err:
        return False, f"Invalid JSON at line {err.lineno}, column {err.colno}: {err.msg}"
    except Exception as err:
        # Anything else (missing file, permissions, bad encoding, ...) is
        # reported to the caller rather than raised.
        return False, f"Error reading file: {err}"
    else:
        return True, "Valid JSON"


def validate_structure(
    en_gb_keys: Set[str],
    lang_keys: Set[str],
    lang_code: str
) -> Dict:
    """Summarise how a language's key set differs from the en-GB key set.

    Args:
        en_gb_keys: Key paths present in the en-GB reference.
        lang_keys: Key paths present in the language being checked.
        lang_code: Language code, copied into the result.

    Returns:
        A dict with the language code, the sorted missing and extra key
        lists, the size of both key sets and the number of missing and
        extra keys.
    """
    absent = sorted(en_gb_keys.difference(lang_keys))
    surplus = sorted(lang_keys.difference(en_gb_keys))

    # Insertion order is kept as-is because it fixes the field order when the
    # summary is serialised.
    summary: Dict = {}
    summary['language'] = lang_code
    summary['missing_keys'] = absent
    summary['extra_keys'] = surplus
    summary['total_keys'] = len(lang_keys)
    summary['expected_keys'] = len(en_gb_keys)
    summary['missing_count'] = len(absent)
    summary['extra_count'] = len(surplus)
    return summary


def _print_key_list(header: str, keys: List[str], count: int, verbose: bool) -> None:
    """Print ``header`` followed by the keys, or a hint when the list is long.

    The keys are listed in full when ``verbose`` is set or when ``count`` is
    at most 20; otherwise only a pointer to ``--verbose`` is printed.
    """
    print(header)
    if verbose or count <= 20:
        # Verbose output is never truncated: the CLI help and the hint below
        # both promise that --verbose shows every key.
        for key in keys:
            print(f"     - {key}")
    else:
        print("     (use --verbose to see all)")


def print_validation_result(result: Dict, verbose: bool = False):
    """Print one language's validation result in a readable format.

    Args:
        result: A summary dict with the keys ``language``, ``total_keys``,
            ``expected_keys``, ``missing_count``, ``extra_count``,
            ``missing_keys`` and ``extra_keys``.
        verbose: When true, list every missing/extra key regardless of how
            many there are. When false, keys are listed only if there are at
            most 20 of them; longer lists are replaced by a hint.
    """
    lang = result['language']

    print(f"\n{'='*100}")
    print(f"Language: {lang}")
    print(f"{'='*100}")
    print(f"  Total keys: {result['total_keys']}")
    print(f"  Expected keys (en-GB): {result['expected_keys']}")
    print(f"  Missing keys: {result['missing_count']}")
    print(f"  Extra keys: {result['extra_count']}")

    missing_count = result['missing_count']
    extra_count = result['extra_count']

    if missing_count == 0 and extra_count == 0:
        print("  ✅ Structure matches en-GB perfectly!")
    else:
        if missing_count > 0:
            _print_key_list(
                f"\n  ⚠️  Missing {missing_count} key(s):",
                result['missing_keys'],
                missing_count,
                verbose,
            )

        if extra_count > 0:
            _print_key_list(
                f"\n  ⚠️  Extra {extra_count} key(s) not in en-GB:",
                result['extra_keys'],
                extra_count,
                verbose,
            )

    print("-" * 100)


def main():
    """Run the translation structure validation from the command line.

    The en-GB file at ``frontend/public/locales/en-GB/translation.json``
    (resolved against the current working directory) is the reference. Each
    selected language's ``translation.json`` is checked for valid JSON and
    then compared key-by-key with the reference.

    Command-line options:
        --language LANG  Validate only this language code; without it every
                         locale directory other than en-GB that contains a
                         ``translation.json`` is validated.
        --verbose, -v    List all missing/extra keys.
        --json           Print the results as a JSON document instead of the
                         human-readable report.

    This function always ends by calling ``sys.exit``. The status is 1 when
    the en-GB file is missing or unusable, when any language file fails to
    parse, or when any language has missing or extra keys; otherwise it is 0.
    """
    parser = argparse.ArgumentParser(
        description='Validate translation JSON structure'
    )
    parser.add_argument(
        '--language',
        help='Specific language code to validate (e.g., es-ES)',
        default=None
    )
    parser.add_argument(
        '--verbose', '-v',
        action='store_true',
        help='Show all missing/extra keys'
    )
    parser.add_argument(
        '--json',
        action='store_true',
        help='Output results as JSON'
    )

    args = parser.parse_args()

    # Define paths
    locales_dir = Path('frontend/public/locales')
    en_gb_path = locales_dir / 'en-GB' / 'translation.json'

    if not en_gb_path.exists():
        print(f"❌ Error: en-GB translation file not found at {en_gb_path}")
        sys.exit(1)

    # Validate en-GB itself
    is_valid, message = validate_json_file(en_gb_path)
    if not is_valid:
        print(f"❌ Error in en-GB file: {message}")
        sys.exit(1)

    # Load en-GB structure
    with open(en_gb_path, 'r', encoding='utf-8') as f:
        en_gb = json.load(f)

    # Key extraction walks a mapping; any other top-level value (list, string,
    # number, ...) cannot serve as the reference structure.
    if not isinstance(en_gb, dict):
        print("❌ Error in en-GB file: Top-level JSON value must be an object")
        sys.exit(1)

    en_gb_keys = get_all_keys(en_gb)

    # Get list of languages to validate
    if args.language:
        languages = [args.language]
    else:
        languages = [
            d.name for d in locales_dir.iterdir()
            if d.is_dir() and d.name != 'en-GB' and (d / 'translation.json').exists()
        ]

    results = []
    json_errors = []

    # Validate each language
    for lang_code in sorted(languages):
        lang_path = locales_dir / lang_code / 'translation.json'

        if not lang_path.exists():
            # Sent to stderr so that --json output on stdout stays parseable.
            print(
                f"⚠️  Warning: {lang_code}/translation.json not found, skipping",
                file=sys.stderr,
            )
            continue

        # First check if JSON is valid
        is_valid, message = validate_json_file(lang_path)
        if not is_valid:
            json_errors.append({
                'language': lang_code,
                'file': str(lang_path),
                'error': message
            })
            continue

        # Load and compare structure
        with open(lang_path, 'r', encoding='utf-8') as f:
            lang_data = json.load(f)

        if not isinstance(lang_data, dict):
            # Syntactically valid JSON whose top level is not an object has
            # no key structure to compare; report it instead of crashing.
            json_errors.append({
                'language': lang_code,
                'file': str(lang_path),
                'error': 'Top-level JSON value must be an object'
            })
            continue

        lang_keys = get_all_keys(lang_data)
        result = validate_structure(en_gb_keys, lang_keys, lang_code)
        results.append(result)

    # Computed unconditionally: it is needed below even when no language was
    # validated (previously it was only bound inside the `if results:` branch).
    perfect = sum(
        1 for r in results
        if r['missing_count'] == 0 and r['extra_count'] == 0
    )

    # Output results
    if args.json:
        output = {
            'json_errors': json_errors,
            'structure_validation': results
        }
        print(json.dumps(output, indent=2, ensure_ascii=False))
    else:
        # Print JSON errors first
        if json_errors:
            print("\n❌ JSON Syntax Errors:")
            print("=" * 100)
            for error in json_errors:
                print(f"\nLanguage: {error['language']}")
                print(f"File: {error['file']}")
                print(f"Error: {error['error']}")
            print("\n")

        # Print structure validation results
        if results:
            print("\n📊 Structure Validation Summary:")
            print(f"   Languages validated: {len(results)}")
            print(f"   Perfect matches: {perfect}/{len(results)}")

            total_missing = sum(r['missing_count'] for r in results)
            total_extra = sum(r['extra_count'] for r in results)
            print(f"   Total missing keys: {total_missing}")
            print(f"   Total extra keys: {total_extra}")

            for result in results:
                print_validation_result(result, verbose=args.verbose)

        if not results and not json_errors:
            # Nothing was checked, so do not claim that everything is perfect.
            print("\n⚠️  No translation files were validated.")
        elif not json_errors and perfect == len(results):
            print("\n✅ All translations have perfect structure!")

    # Exit with error code if issues found
    has_issues = bool(json_errors) or perfect != len(results)
    sys.exit(1 if has_issues else 0)


# Run the validator only when executed as a script, not when imported.
if __name__ == "__main__":
    main()