Stirling-PDF/scripts/translations/validate_json_structure.py

#!/usr/bin/env python3
"""
Validate TOML structure and formatting of translation files.

Checks for:
- Valid TOML syntax
- Consistent key structure with en-GB
- Missing keys
- Extra keys not in en-GB
- Malformed entries

Usage:
    python scripts/translations/validate_json_structure.py [--language LANG] [--verbose] [--json]
"""

import json
import sys
from pathlib import Path
from typing import Dict, Set
import argparse
import tomllib  # Python 3.11+ (stdlib)


def get_all_keys(d: dict, parent_key: str = "", sep: str = ".") -> Set[str]:
    """Collect every key of a nested dict as a ``sep``-joined path.

    Both intermediate (table) keys and leaf keys are included. Only values
    that are ``dict`` instances are descended into; lists and other
    containers are treated as leaves. ``parent_key``, when non-empty, is
    prepended to every path.
    """
    collected: Set[str] = set()
    # Explicit work stack of (mapping, path prefix) pairs instead of recursion.
    pending = [(d, parent_key)]
    while pending:
        node, prefix = pending.pop()
        for name, value in node.items():
            path = name if not prefix else f"{prefix}{sep}{name}"
            collected.add(path)
            if isinstance(value, dict):
                pending.append((value, path))
    return collected


def validate_translation_file(file_path: Path) -> tuple[bool, str]:
    """Validate that a file contains valid TOML."""
    try:
        with open(file_path, "rb") as f:
            tomllib.load(f)
        return True, "Valid TOML"
    except Exception as e:
        return False, f"Error reading file: {str(e)}"


def validate_structure(
    en_gb_keys: Set[str], lang_keys: Set[str], lang_code: str
) -> Dict:
    """Diff a language's key set against the en-GB reference key set.

    Returns a report dict holding the language code, the sorted lists of
    keys missing from / extra in the language, and the matching counts.
    """
    absent = sorted(en_gb_keys - lang_keys)
    surplus = sorted(lang_keys - en_gb_keys)

    # Keyword order fixes the key order of the report (visible in JSON output).
    return dict(
        language=lang_code,
        missing_keys=absent,
        extra_keys=surplus,
        total_keys=len(lang_keys),
        expected_keys=len(en_gb_keys),
        missing_count=len(absent),
        extra_count=len(surplus),
    )


def _print_key_list(keys, count: int, verbose: bool, limit: int = 20) -> None:
    """Print one bullet per key, or a hint when the list is long and not verbose."""
    if verbose or count <= limit:
        for key in keys:
            print(f"     - {key}")
    else:
        print("     (use --verbose to see all)")


def print_validation_result(result: Dict, verbose: bool = False):
    """Print one language's structure-validation report to stdout.

    Args:
        result: Report dict with the keys ``language``, ``total_keys``,
            ``expected_keys``, ``missing_count``, ``extra_count``,
            ``missing_keys`` and ``extra_keys`` (the shape produced by
            ``validate_structure``).
        verbose: When true, list every missing/extra key. When false, the
            keys are listed only if there are at most 20 of them; otherwise
            a hint to use ``--verbose`` is printed instead.
    """
    lang = result["language"]
    missing_count = result["missing_count"]
    extra_count = result["extra_count"]

    print(f"\n{'=' * 100}")
    print(f"Language: {lang}")
    print(f"{'=' * 100}")
    print(f"  Total keys: {result['total_keys']}")
    print(f"  Expected keys (en-GB): {result['expected_keys']}")
    print(f"  Missing keys: {missing_count}")
    print(f"  Extra keys: {extra_count}")

    if missing_count == 0 and extra_count == 0:
        print("  ✅ Structure matches en-GB perfectly!")
    else:
        # Verbose mode lists every key: the non-verbose hint promises
        # "--verbose to see all", so the listing must not be truncated.
        if missing_count > 0:
            print(f"\n  ⚠️  Missing {missing_count} key(s):")
            _print_key_list(result["missing_keys"], missing_count, verbose)

        if extra_count > 0:
            print(f"\n  ⚠️  Extra {extra_count} key(s) not in en-GB:")
            _print_key_list(result["extra_keys"], extra_count, verbose)

    print("-" * 100)


def load_translation_file(file_path: Path) -> dict:
    """Load TOML translation file."""
    with open(file_path, "rb") as f:
        return tomllib.load(f)


def main():
    """Validate translation TOML files against the en-GB reference.

    Parses the command line, loads
    ``frontend/public/locales/en-GB/translation.toml`` (resolved relative to
    the current working directory), then checks either the language given
    with ``--language`` or every other locale directory that contains a
    ``translation.toml``. Results are printed as a human-readable report, or
    as a JSON document when ``--json`` is given.

    Exits with status 1 when the en-GB file is missing or invalid, when any
    translation file fails to parse, or when any language has missing or
    extra keys; otherwise exits with status 0.
    """
    parser = argparse.ArgumentParser(description="Validate translation TOML structure")
    parser.add_argument(
        "--language",
        help="Specific language code to validate (e.g., es-ES)",
        default=None,
    )
    parser.add_argument(
        "--verbose", "-v", action="store_true", help="Show all missing/extra keys"
    )
    parser.add_argument("--json", action="store_true", help="Output results as JSON")

    args = parser.parse_args()

    # Define paths
    locales_dir = Path("frontend/public/locales")
    en_gb_path = locales_dir / "en-GB" / "translation.toml"

    if not en_gb_path.exists():
        print(f"❌ Error: en-GB translation file not found at {en_gb_path}")
        sys.exit(1)

    # Validate en-GB itself
    is_valid, message = validate_translation_file(en_gb_path)
    if not is_valid:
        print(f"❌ Error in en-GB file: {message}")
        sys.exit(1)

    # Load en-GB structure
    en_gb = load_translation_file(en_gb_path)
    en_gb_keys = get_all_keys(en_gb)

    # Get list of languages to validate
    if args.language:
        languages = [args.language]
    else:
        # Validate all languages except en-GB
        languages = [
            d.name
            for d in locales_dir.iterdir()
            if d.is_dir()
            and d.name != "en-GB"
            and (d / "translation.toml").exists()
        ]

    results = []
    # Named for the "json_errors" key of the --json output; holds parse errors.
    json_errors = []

    # In --json mode stdout must contain only the JSON document, so
    # diagnostics are sent to stderr there.
    warning_stream = sys.stderr if args.json else sys.stdout

    # Validate each language
    for lang_code in sorted(languages):
        lang_path = locales_dir / lang_code / "translation.toml"

        if not lang_path.exists():
            print(
                f"⚠️  Warning: {lang_code}/translation.toml not found, skipping",
                file=warning_stream,
            )
            continue

        # First check if file is valid
        is_valid, message = validate_translation_file(lang_path)
        if not is_valid:
            json_errors.append(
                {"language": lang_code, "file": str(lang_path), "error": message}
            )
            continue

        # Load and compare structure
        lang_data = load_translation_file(lang_path)

        lang_keys = get_all_keys(lang_data)
        result = validate_structure(en_gb_keys, lang_keys, lang_code)
        results.append(result)

    # Output results
    if args.json:
        output = {"json_errors": json_errors, "structure_validation": results}
        print(json.dumps(output, indent=2, ensure_ascii=False))
    else:
        # Print syntax errors first
        if json_errors:
            print("\n❌ Syntax Errors:")
            print("=" * 100)
            for error in json_errors:
                print(f"\nLanguage: {error['language']}")
                print(f"File: {error['file']}")
                print(f"Error: {error['error']}")
            print("\n")

        # Bound unconditionally: the success check below reads it even when
        # no language was validated.
        perfect = sum(
            1 for r in results if r["missing_count"] == 0 and r["extra_count"] == 0
        )

        # Print structure validation results
        if results:
            print("\n📊 Structure Validation Summary:")
            print(f"   Languages validated: {len(results)}")
            print(f"   Perfect matches: {perfect}/{len(results)}")

            total_missing = sum(r["missing_count"] for r in results)
            total_extra = sum(r["extra_count"] for r in results)
            print(f"   Total missing keys: {total_missing}")
            print(f"   Total extra keys: {total_extra}")

            for result in results:
                print_validation_result(result, verbose=args.verbose)

        # Only claim success when at least one language was actually checked.
        if results and not json_errors and perfect == len(results):
            print("\n✅ All translations have perfect structure!")

    # Exit with error code if issues found
    has_issues = len(json_errors) > 0 or any(
        r["missing_count"] > 0 or r["extra_count"] > 0 for r in results
    )
    sys.exit(1 if has_issues else 0)


# Run the validator only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()