#!/usr/bin/env python3
"""
Translation Analyzer for Stirling PDF Frontend

Compares language files against the en-GB golden-truth file.
"""
import argparse
import json
import sys
from pathlib import Path
from typing import Dict, List, Set

try:
    import tomllib  # Python 3.11+
except ImportError:
    try:
        import toml as tomllib_fallback
        tomllib = None
    except ImportError:
        tomllib = None
        tomllib_fallback = None


class TranslationAnalyzer:
    def __init__(self, locales_dir: str = "frontend/public/locales",
                 ignore_file: str = "scripts/ignore_translation.toml"):
        self.locales_dir = Path(locales_dir)
        self.golden_truth_file = self.locales_dir / "en-GB" / "translation.json"
        self.golden_truth = self._load_json(self.golden_truth_file)
        self.ignore_file = Path(ignore_file)
        self.ignore_patterns = self._load_ignore_patterns()

    def _load_json(self, file_path: Path) -> Dict:
        """Load a JSON file with error handling."""
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                return json.load(f)
        except FileNotFoundError:
            print(f"Error: File not found: {file_path}")
            sys.exit(1)
        except json.JSONDecodeError as e:
            print(f"Error: Invalid JSON in {file_path}: {e}")
            sys.exit(1)

    def _load_ignore_patterns(self) -> Dict[str, Set[str]]:
        """Load ignore patterns from the TOML file."""
        if not self.ignore_file.exists():
            return {}
        try:
            if tomllib:
                # Python 3.11+ built-in parser (requires binary mode)
                with open(self.ignore_file, 'rb') as f:
                    ignore_data = tomllib.load(f)
            elif tomllib_fallback:
                # Third-party `toml` library fallback (accepts a path)
                ignore_data = tomllib_fallback.load(self.ignore_file)
            else:
                # Minimal hand-rolled parser as a last resort
                ignore_data = self._parse_simple_toml()
            # Convert lists to sets for faster membership checks
            return {lang: set(data.get('ignore', []))
                    for lang, data in ignore_data.items()
                    if data.get('ignore')}
        except Exception as e:
            print(f"Warning: Could not load ignore file {self.ignore_file}: {e}")
            return {}

    def _parse_simple_toml(self) -> Dict:
        """Minimal TOML parser for ignore patterns (fallback only).

        Handles just the subset this script needs: [section] headers and
        quoted items of a multi-line `ignore = [...]` array.
        """
        ignore_data = {}
        current_section = None
        with open(self.ignore_file, 'r', encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                if not line or line.startswith('#'):
                    continue
                if line.startswith('[') and line.endswith(']'):
                    current_section = line[1:-1]
                    ignore_data[current_section] = {'ignore': []}
                elif line.startswith('ignore = [') and current_section:
                    # Opening of the ignore array; items follow on later lines
                    continue
                elif line.startswith(("'", '"')) and current_section:
                    # Extract a quoted item, tolerating trailing commas/brackets
                    item = line.rstrip(',]').strip().strip('\'"')
                    if item:
                        ignore_data[current_section]['ignore'].append(item)
        return ignore_data
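
    # For reference, the fallback parser above expects the ignore file to
    # look roughly like this (section names use underscores, e.g. de_DE;
    # the keys shown are illustrative, not taken from the real file):
    #
    #   [de_DE]
    #   ignore = [
    #       'language.direction',
    #       'some.key.to.ignore',
    #   ]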

    def _flatten_dict(self, d: Dict, parent_key: str = '', separator: str = '.') -> Dict[str, str]:
        """Flatten a nested dictionary into dot-notation keys."""
        items = []
        for k, v in d.items():
            new_key = f"{parent_key}{separator}{k}" if parent_key else k
            if isinstance(v, dict):
                items.extend(self._flatten_dict(v, new_key, separator).items())
            else:
                items.append((new_key, str(v)))
        return dict(items)

    def get_all_language_files(self) -> List[Path]:
        """Get all translation.json files except en-GB."""
        files = []
        for lang_dir in self.locales_dir.iterdir():
            if lang_dir.is_dir() and lang_dir.name != "en-GB":
                translation_file = lang_dir / "translation.json"
                if translation_file.exists():
                    files.append(translation_file)
        return sorted(files)

    def find_missing_translations(self, target_file: Path) -> Set[str]:
        """Find keys that exist in en-GB but are missing in the target file."""
        target_data = self._load_json(target_file)
        golden_flat = self._flatten_dict(self.golden_truth)
        target_flat = self._flatten_dict(target_data)
        missing = set(golden_flat.keys()) - set(target_flat.keys())
        # Filter out ignored keys (ignore-file sections use underscores)
        ignore_key = target_file.parent.name.replace('-', '_')
        ignore_set = self.ignore_patterns.get(ignore_key, set())
        return missing - ignore_set

    def find_untranslated_entries(self, target_file: Path) -> Set[str]:
        """Find entries that appear to be untranslated (identical to en-GB)."""
        target_data = self._load_json(target_file)
        golden_flat = self._flatten_dict(self.golden_truth)
        target_flat = self._flatten_dict(target_data)
        ignore_key = target_file.parent.name.replace('-', '_')
        ignore_set = self.ignore_patterns.get(ignore_key, set())
        untranslated = set()
        for key in target_flat:
            if key in golden_flat:
                # Values are already stringified by _flatten_dict
                target_value = target_flat[key]
                golden_value = golden_flat[key]
                # Flag entries marked [UNTRANSLATED] or identical to en-GB
                if target_value.startswith("[UNTRANSLATED]") or \
                        (golden_value == target_value
                         and key not in ignore_set
                         and not self._is_expected_identical(key, golden_value)):
                    untranslated.add(key)
        return untranslated

    def _is_expected_identical(self, key: str, value: str) -> bool:
        """Check whether a key-value pair is expected to be identical across languages."""
        # Key fragments that should be identical across languages
        identical_patterns = ['language.direction', 'true', 'false', 'unknown']
        # Values that are often identical (direction markers, booleans, etc.)
        if value.strip() in ['ltr', 'rtl', 'True', 'False']:
            return True
        # Substring match against the known key fragments
        for pattern in identical_patterns:
            if pattern in key.lower():
                return True
        return False

    def find_extra_translations(self, target_file: Path) -> Set[str]:
        """Find keys that exist in the target file but not in en-GB."""
        target_data = self._load_json(target_file)
        golden_flat = self._flatten_dict(self.golden_truth)
        target_flat = self._flatten_dict(target_data)
        return set(target_flat.keys()) - set(golden_flat.keys())

    def analyze_file(self, target_file: Path) -> Dict:
        """Complete analysis of a single translation file."""
        lang_code = target_file.parent.name
        missing = self.find_missing_translations(target_file)
        untranslated = self.find_untranslated_entries(target_file)
        extra = self.find_extra_translations(target_file)
        target_data = self._load_json(target_file)
        golden_flat = self._flatten_dict(self.golden_truth)
        target_flat = self._flatten_dict(target_data)
        # Completion rate excludes ignored keys; the ignore-file lookup key
        # uses underscores, so keep it separate from the display code
        ignore_key = lang_code.replace('-', '_')
        ignore_set = self.ignore_patterns.get(ignore_key, set())
        relevant_keys = set(golden_flat.keys()) - ignore_set
        total_keys = len(relevant_keys)
        # Count keys that exist and are properly translated; [UNTRANSLATED]
        # entries are already captured in the untranslated set
        properly_translated = 0
        for key in relevant_keys:
            if key in target_flat and key not in untranslated:
                properly_translated += 1
        completion_rate = (properly_translated / total_keys) * 100 if total_keys > 0 else 0
        return {
            'language': lang_code,
            'file': target_file,
            'missing_count': len(missing),
            'missing_keys': sorted(missing),
            'untranslated_count': len(untranslated),
            'untranslated_keys': sorted(untranslated),
            'extra_count': len(extra),
            'extra_keys': sorted(extra),
            'total_keys': total_keys,
            'completion_rate': completion_rate,
        }

    def analyze_all_files(self) -> List[Dict]:
        """Analyze all translation files."""
        results = [self.analyze_file(f) for f in self.get_all_language_files()]
        return sorted(results, key=lambda x: x['language'])
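
# Programmatic usage sketch (the de-DE path below is illustrative; any
# locale directory under frontend/public/locales works the same way):
#
#   analyzer = TranslationAnalyzer()
#   report = analyzer.analyze_file(Path("frontend/public/locales/de-DE/translation.json"))
#   print(f"{report['language']}: {report['completion_rate']:.1f}% complete")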

def main():
    parser = argparse.ArgumentParser(
        description='Analyze translation files against the en-GB golden truth')
    parser.add_argument('--locales-dir', default='frontend/public/locales',
                        help='Path to locales directory')
    parser.add_argument('--ignore-file', default='scripts/ignore_translation.toml',
                        help='Path to ignore patterns TOML file')
    parser.add_argument('--language', help='Analyze a specific language only')
    parser.add_argument('--missing-only', action='store_true',
                        help='Show only missing translations')
    parser.add_argument('--untranslated-only', action='store_true',
                        help='Show only untranslated entries')
    parser.add_argument('--summary', action='store_true',
                        help='Show summary statistics only')
    parser.add_argument('--format', choices=['text', 'json'], default='text',
                        help='Output format')
    args = parser.parse_args()

    analyzer = TranslationAnalyzer(args.locales_dir, args.ignore_file)

    if args.language:
        target_file = Path(args.locales_dir) / args.language / "translation.json"
        if not target_file.exists():
            print(f"Error: Translation file not found for language: {args.language}")
            sys.exit(1)
        results = [analyzer.analyze_file(target_file)]
    else:
        results = analyzer.analyze_all_files()

    if args.format == 'json':
        # default=str renders Path objects as plain strings
        print(json.dumps(results, indent=2, default=str))
        return

    # Text format output
    for result in results:
        print(f"\n{'=' * 60}")
        print(f"Language: {result['language']}")
        print(f"File: {result['file']}")
        print(f"Completion Rate: {result['completion_rate']:.1f}%")
        print(f"Total Keys in en-GB: {result['total_keys']}")

        if not args.summary:
            if not args.untranslated_only:
                print(f"\nMissing Translations ({result['missing_count']}):")
                for key in result['missing_keys'][:10]:  # Show first 10
                    print(f"  - {key}")
                if len(result['missing_keys']) > 10:
                    print(f"  ... and {len(result['missing_keys']) - 10} more")

            if not args.missing_only:
                print(f"\nUntranslated Entries ({result['untranslated_count']}):")
                for key in result['untranslated_keys'][:10]:  # Show first 10
                    print(f"  - {key}")
                if len(result['untranslated_keys']) > 10:
                    print(f"  ... and {len(result['untranslated_keys']) - 10} more")

            if result['extra_count'] > 0:
                print(f"\nExtra Keys Not in en-GB ({result['extra_count']}):")
                for key in result['extra_keys'][:5]:
                    print(f"  - {key}")
                if len(result['extra_keys']) > 5:
                    print(f"  ... and {len(result['extra_keys']) - 5} more")

    print(f"\n{'=' * 60}")
    print("SUMMARY")
    print(f"{'=' * 60}")
    avg_completion = sum(r['completion_rate'] for r in results) / len(results) if results else 0
    print(f"Average Completion Rate: {avg_completion:.1f}%")
    print(f"Languages Analyzed: {len(results)}")

    # Rank languages by completion rate
    sorted_by_completion = sorted(results, key=lambda x: x['completion_rate'], reverse=True)
    print("\nTop 5 Most Complete Languages:")
    for result in sorted_by_completion[:5]:
        print(f"  {result['language']}: {result['completion_rate']:.1f}%")
    print("\nBottom 5 Languages Needing Attention:")
    for result in sorted_by_completion[-5:]:
        print(f"  {result['language']}: {result['completion_rate']:.1f}% "
              f"({result['missing_count']} missing, {result['untranslated_count']} untranslated)")


if __name__ == "__main__":
    main()
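
# Example invocations (the script path is assumed; adjust to wherever this
# file lives in the repo, and substitute any existing locale for de-DE):
#
#   python scripts/translation_analyzer.py --summary
#   python scripts/translation_analyzer.py --language de-DE --missing-only
#   python scripts/translation_analyzer.py --format json > translation_report.json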