Stirling-PDF/scripts/translations/toml_beautifier.py

#!/usr/bin/env python3
"""
TOML Beautifier and Structure Fixer for Stirling PDF Frontend
Restructures translation TOML files to match en-GB structure and key order exactly.
"""

import sys
from pathlib import Path
from typing import Dict, Any, List
import argparse
from collections import OrderedDict

import tomllib
import tomli_w


class TOMLBeautifier:
    def __init__(self, locales_dir: str = "frontend/public/locales"):
        self.locales_dir = Path(locales_dir)
        self.golden_truth_file = self.locales_dir / "en-GB" / "translation.toml"
        self.golden_structure = self._load_toml(self.golden_truth_file)

    def _load_toml(self, file_path: Path) -> Dict:
        """Load TOML file with error handling."""
        try:
            with open(file_path, "rb") as f:
                return tomllib.load(f)
        except FileNotFoundError:
            print(f"Error: File not found: {file_path}")
            sys.exit(1)
        except Exception as e:
            print(f"Error: Invalid TOML in {file_path}: {e}")
            sys.exit(1)

    def _save_toml(self, data: Dict, file_path: Path, backup: bool = False) -> None:
        """Save TOML file with proper formatting."""
        if backup and file_path.exists():
            backup_path = file_path.with_suffix(".backup.restructured.toml")
            import shutil

            shutil.copy2(file_path, backup_path)
            print(f"Backup created: {backup_path}")

        with open(file_path, "wb") as f:
            tomli_w.dump(data, f)

    def _flatten_dict(
        self, d: Dict, parent_key: str = "", separator: str = "."
    ) -> Dict[str, Any]:
        """Flatten nested dictionary into dot-notation keys."""
        items = []
        for k, v in d.items():
            new_key = f"{parent_key}{separator}{k}" if parent_key else k
            if isinstance(v, dict):
                items.extend(self._flatten_dict(v, new_key, separator).items())
            else:
                items.append((new_key, v))
        return dict(items)

    def _rebuild_structure(
        self, flat_dict: Dict[str, Any], reference_structure: Dict
    ) -> Dict:
        """Rebuild nested structure based on reference structure and available translations."""

        def build_recursive(ref_obj: Any, current_path: str = "") -> Any:
            if isinstance(ref_obj, dict):
                result = OrderedDict()
                for key, value in ref_obj.items():
                    new_path = f"{current_path}.{key}" if current_path else key

                    if new_path in flat_dict:
                        # Direct translation exists
                        if isinstance(value, dict):
                            # If reference is dict but we have a string, use the string
                            if isinstance(flat_dict[new_path], str):
                                result[key] = flat_dict[new_path]
                            else:
                                # Recurse into nested structure
                                result[key] = build_recursive(value, new_path)
                        else:
                            result[key] = flat_dict[new_path]
                    else:
                        # No direct translation, recurse to check for nested keys
                        if isinstance(value, dict):
                            nested_result = build_recursive(value, new_path)
                            if nested_result:  # Only add if we found some translations
                                result[key] = nested_result
                        # If no translation found and it's a leaf, skip it

                return result if result else None
            else:
                # Leaf node - return the translation if it exists
                return flat_dict.get(current_path, None)

        return build_recursive(reference_structure) or OrderedDict()

    def restructure_translation_file(self, target_file: Path) -> Dict[str, Any]:
        """Restructure a translation file to match en-GB structure exactly."""
        if not target_file.exists():
            print(f"Error: Target file does not exist: {target_file}")
            return {}

        # Load the target file
        target_data = self._load_toml(target_file)

        # Flatten the target translations
        flat_target = self._flatten_dict(target_data)

        # Rebuild structure based on golden truth
        restructured = self._rebuild_structure(flat_target, self.golden_structure)

        return restructured

    def beautify_and_restructure(
        self, target_file: Path, backup: bool = False
    ) -> Dict[str, Any]:
        """Main function to beautify and restructure a translation file."""
        lang_code = target_file.parent.name
        print(f"Restructuring {lang_code} translation file...")

        # Get the restructured data
        restructured_data = self.restructure_translation_file(target_file)

        # Save the restructured file
        self._save_toml(restructured_data, target_file, backup)

        # Analyze the results
        flat_golden = self._flatten_dict(self.golden_structure)
        flat_restructured = self._flatten_dict(restructured_data)

        total_keys = len(flat_golden)
        preserved_keys = len(flat_restructured)

        result = {
            "language": lang_code,
            "total_reference_keys": total_keys,
            "preserved_keys": preserved_keys,
            "structure_match": self._compare_structures(
                self.golden_structure, restructured_data
            ),
        }

        print(f"Restructured {lang_code}: {preserved_keys}/{total_keys} keys preserved")
        return result

    def _compare_structures(self, ref: Dict, target: Dict) -> Dict[str, bool]:
        """Compare structures between reference and target."""

        def compare_recursive(r: Any, t: Any, path: str = "") -> List[str]:
            issues = []

            if isinstance(r, dict) and isinstance(t, dict):
                # Check for missing top-level sections
                ref_keys = set(r.keys())
                target_keys = set(t.keys())

                missing_sections = ref_keys - target_keys
                if missing_sections:
                    for section in missing_sections:
                        issues.append(
                            f"Missing section: {path}.{section}" if path else section
                        )

                # Recurse into common sections
                for key in ref_keys & target_keys:
                    new_path = f"{path}.{key}" if path else key
                    issues.extend(compare_recursive(r[key], t[key], new_path))

            return issues

        issues = compare_recursive(ref, target)

        return {
            "structures_match": len(issues) == 0,
            "issues": issues[:10],  # Limit to first 10 issues
            "total_issues": len(issues),
        }

    def validate_key_order(self, target_file: Path) -> Dict[str, Any]:
        """Validate that keys appear in the same order as en-GB."""
        target_data = self._load_toml(target_file)

        def get_key_order(obj: Dict, path: str = "") -> List[str]:
            keys = []
            for key in obj.keys():
                new_path = f"{path}.{key}" if path else key
                keys.append(new_path)
                if isinstance(obj[key], dict):
                    keys.extend(get_key_order(obj[key], new_path))
            return keys

        golden_order = get_key_order(self.golden_structure)
        target_order = get_key_order(target_data)

        # Find common keys and check their relative order
        common_keys = set(golden_order) & set(target_order)

        golden_indices = {
            key: idx for idx, key in enumerate(golden_order) if key in common_keys
        }
        target_indices = {
            key: idx for idx, key in enumerate(target_order) if key in common_keys
        }

        order_preserved = all(
            golden_indices[key1] < golden_indices[key2]
            for key1 in common_keys
            for key2 in common_keys
            if golden_indices[key1] < golden_indices[key2]
            and target_indices[key1] < target_indices[key2]
        )

        return {
            "order_preserved": order_preserved,
            "common_keys_count": len(common_keys),
            "golden_keys_count": len(golden_order),
            "target_keys_count": len(target_order),
        }


def main():
    parser = argparse.ArgumentParser(
        description="Beautify and restructure translation TOML files",
        epilog="Works with TOML format translation files.",
    )
    parser.add_argument(
        "--locales-dir",
        default="frontend/public/locales",
        help="Path to locales directory",
    )
    parser.add_argument("--language", help="Restructure specific language only")
    parser.add_argument(
        "--all-languages", action="store_true", help="Restructure all language files"
    )
    parser.add_argument(
        "--backup", action="store_true", help="Create backup files before modifying"
    )
    parser.add_argument(
        "--validate-only",
        action="store_true",
        help="Only validate structure, do not modify files",
    )

    args = parser.parse_args()

    beautifier = TOMLBeautifier(args.locales_dir)

    if args.language:
        target_file = Path(args.locales_dir) / args.language / "translation.toml"
        if not target_file.exists():
            print(f"Error: Translation file not found for language: {args.language}")
            sys.exit(1)

        if args.validate_only:
            order_result = beautifier.validate_key_order(target_file)
            print(f"Key order validation for {args.language}:")
            print(f"  Order preserved: {order_result['order_preserved']}")
            print(
                f"  Common keys: {order_result['common_keys_count']}/{order_result['golden_keys_count']}"
            )
        else:
            result = beautifier.beautify_and_restructure(
                target_file, backup=args.backup
            )
            print(f"\nResults for {result['language']}:")
            print(
                f"  Keys preserved: {result['preserved_keys']}/{result['total_reference_keys']}"
            )
            if result["structure_match"]["total_issues"] > 0:
                print(
                    f"  Structure issues: {result['structure_match']['total_issues']}"
                )
                for issue in result["structure_match"]["issues"]:
                    print(f"    - {issue}")

    elif args.all_languages:
        results = []
        for lang_dir in Path(args.locales_dir).iterdir():
            if lang_dir.is_dir() and lang_dir.name != "en-GB":
                translation_file = lang_dir / "translation.toml"
                if translation_file.exists():
                    if args.validate_only:
                        order_result = beautifier.validate_key_order(translation_file)
                        print(
                            f"{lang_dir.name}: Order preserved = {order_result['order_preserved']}"
                        )
                    else:
                        result = beautifier.beautify_and_restructure(
                            translation_file, backup=args.backup
                        )
                        results.append(result)

        if not args.validate_only and results:
            print(f"\n{'=' * 60}")
            print("RESTRUCTURING SUMMARY")
            print(f"{'=' * 60}")
            for result in sorted(results, key=lambda x: x["language"]):
                print(
                    f"{result['language']}: {result['preserved_keys']}/{result['total_reference_keys']} keys "
                    f"({result['preserved_keys'] / result['total_reference_keys'] * 100:.1f}%)"
                )

    else:
        parser.print_help()


if __name__ == "__main__":
    main()