Stirling-PDF/scripts/translations/translation_analyzer.py
2025-09-25 10:19:45 +01:00

314 lines
12 KiB
Python

#!/usr/bin/env python3
"""
Translation Analyzer for Stirling PDF Frontend
Compares language files against en-GB golden truth file.
"""
import json
import os
import sys
from pathlib import Path
from typing import Dict, List, Set, Tuple
import argparse
# Select a TOML reader for the ignore file.
#   * ``tomllib``          - standard-library parser (Python 3.11+), else None.
#   * ``tomllib_fallback`` - third-party ``toml`` package, else None.
# Both names are always bound (at most one of them to a module), so later
# code can test either one for truthiness without risking a NameError.
try:
    import tomllib  # Python 3.11+
    tomllib_fallback = None
except ImportError:
    tomllib = None
    try:
        import toml as tomllib_fallback
    except ImportError:
        tomllib_fallback = None
class TranslationAnalyzer:
    """Compare locale ``translation.json`` files against the en-GB reference.

    The reference ("golden truth") file is ``<locales_dir>/en-GB/translation.json``.
    Every other ``<locales_dir>/<lang>/translation.json`` can be checked for
    missing keys, untranslated values and extra keys. Keys listed for a
    language in the optional TOML ignore file are excluded from the missing /
    identical-value checks and from the completion rate.

    Diagnostics (errors and warnings) are written to stderr so that stdout
    stays clean for machine-readable output.
    """

    # Prefix that marks a value as a not-yet-translated placeholder.
    _UNTRANSLATED_MARKER = "[UNTRANSLATED]"
    # Substrings of a (lower-cased) key that mark it as language independent.
    _IDENTICAL_KEY_PATTERNS = ('language.direction', 'true', 'false', 'unknown')
    # Values that are legitimately the same in every language.
    _IDENTICAL_VALUES = ('ltr', 'rtl', 'True', 'False')

    def __init__(self, locales_dir: str = "frontend/public/locales", ignore_file: str = "scripts/ignore_translation.toml"):
        """Load the en-GB reference file and the per-language ignore patterns.

        Args:
            locales_dir: Directory containing one sub-directory per language.
            ignore_file: TOML file with per-language ``ignore`` key lists.

        Exits the process with status 1 if the reference file is missing or
        is not valid JSON (see ``_load_json``).
        """
        self.locales_dir = Path(locales_dir)
        self.golden_truth_file = self.locales_dir / "en-GB" / "translation.json"
        self.golden_truth = self._load_json(self.golden_truth_file)
        self.ignore_file = Path(ignore_file)
        self.ignore_patterns = self._load_ignore_patterns()

    def _load_json(self, file_path: Path) -> Dict:
        """Load a UTF-8 JSON file, exiting with status 1 on failure.

        Args:
            file_path: Path of the JSON file to read.

        Returns:
            The decoded JSON document.

        Raises:
            SystemExit: With code 1 if the file does not exist or contains
                invalid JSON. The reason is reported on stderr.
        """
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                return json.load(f)
        except FileNotFoundError:
            print(f"Error: File not found: {file_path}", file=sys.stderr)
            sys.exit(1)
        except json.JSONDecodeError as e:
            print(f"Error: Invalid JSON in {file_path}: {e}", file=sys.stderr)
            sys.exit(1)

    def _load_ignore_patterns(self) -> Dict[str, Set[str]]:
        """Load the per-language ignore patterns from the TOML ignore file.

        Returns:
            Mapping of TOML section name to the set of keys listed under its
            ``ignore`` entry. Sections without a non-empty ``ignore`` list are
            omitted. An empty mapping is returned when the file does not
            exist or cannot be read/parsed (a warning is printed to stderr).
        """
        if not self.ignore_file.exists():
            return {}
        # Deliberately best-effort: a broken ignore file must not abort the
        # analysis, so any failure degrades to "no ignore patterns".
        try:
            if tomllib:
                # Python 3.11+ built-in parser (requires a binary file handle).
                with open(self.ignore_file, 'rb') as f:
                    ignore_data = tomllib.load(f)
            elif tomllib_fallback:
                # Third-party ``toml`` package.
                ignore_data = tomllib_fallback.load(self.ignore_file)
            else:
                # No TOML library available: use the minimal built-in parser.
                ignore_data = self._parse_simple_toml()

            patterns_by_lang: Dict[str, Set[str]] = {}
            for lang, data in ignore_data.items():
                if not isinstance(data, dict):
                    # Top-level scalar/array: not a language section.
                    continue
                patterns = data.get('ignore', [])
                if isinstance(patterns, str):
                    # A lone string would otherwise be split into characters.
                    patterns = [patterns]
                if patterns:
                    # Sets give O(1) membership tests during analysis.
                    patterns_by_lang[lang] = set(patterns)
            return patterns_by_lang
        except Exception as e:
            print(f"Warning: Could not load ignore file {self.ignore_file}: {e}", file=sys.stderr)
            return {}

    @staticmethod
    def _extract_quoted_items(text: str) -> List[str]:
        """Return the non-empty single- or double-quoted strings in ``text``.

        Scanning stops at an unquoted ``#`` (start of a comment) or at an
        unterminated quote. Escape sequences are not interpreted.
        """
        items = []
        pos = 0
        while pos < len(text):
            char = text[pos]
            if char == '#':
                break
            if char in ('"', "'"):
                end = text.find(char, pos + 1)
                if end == -1:
                    break
                item = text[pos + 1:end]
                if item:
                    items.append(item)
                pos = end + 1
            else:
                pos += 1
        return items

    def _parse_simple_toml(self) -> Dict:
        """Minimal TOML reader for the ignore file (used without a TOML library).

        Understands only what the ignore file needs: ``[section]`` headers and
        an ``ignore`` array of quoted strings, written either inline
        (``ignore = ['a', "b"]``) or with one or more quoted items per line.
        Blank lines, ``#`` comments and anything else are skipped.

        Returns:
            ``{section: {'ignore': [items...]}}`` for every section header seen.
        """
        ignore_data: Dict[str, Dict[str, List[str]]] = {}
        current_section = None
        with open(self.ignore_file, 'r', encoding='utf-8') as f:
            for raw_line in f:
                line = raw_line.strip()
                if not line or line.startswith('#'):
                    continue
                if line.startswith('[') and line.endswith(']'):
                    current_section = line[1:-1].strip()
                    ignore_data[current_section] = {'ignore': []}
                    continue
                if not current_section:
                    # Items before any section header have nowhere to go.
                    continue
                key, has_equals, remainder = line.partition('=')
                if has_equals and key.strip() == 'ignore':
                    # Items may follow the opening bracket on the same line.
                    items_text = remainder
                elif line[0] in ('"', "'"):
                    # Continuation line of a multi-line array.
                    items_text = line
                else:
                    continue
                ignore_data[current_section]['ignore'].extend(
                    self._extract_quoted_items(items_text)
                )
        return ignore_data

    def _flatten_dict(self, d: Dict, parent_key: str = '', separator: str = '.') -> Dict[str, str]:
        """Flatten a nested dictionary into ``separator``-joined keys.

        Args:
            d: Possibly nested dictionary.
            parent_key: Key prefix accumulated by the recursion.
            separator: String placed between nesting levels.

        Returns:
            Flat mapping of joined key to ``str(value)``; every non-dict leaf
            (including numbers and lists) is converted to a string.
        """
        flat: Dict[str, str] = {}
        for k, v in d.items():
            new_key = f"{parent_key}{separator}{k}" if parent_key else k
            if isinstance(v, dict):
                flat.update(self._flatten_dict(v, new_key, separator))
            else:
                flat[new_key] = str(v)
        return flat

    def get_all_language_files(self) -> List[Path]:
        """Return the sorted ``translation.json`` paths of all languages except en-GB."""
        candidates = (
            lang_dir / "translation.json"
            for lang_dir in self.locales_dir.iterdir()
            if lang_dir.is_dir() and lang_dir.name != "en-GB"
        )
        return sorted(path for path in candidates if path.exists())

    def _ignore_set_for(self, target_file: Path) -> Set[str]:
        """Return the ignored keys for the language that owns ``target_file``.

        The language directory name is mapped to the ignore-file section name
        by replacing ``-`` with ``_`` (``de-DE`` -> ``de_DE``).
        """
        lang_code = target_file.parent.name.replace('-', '_')
        return self.ignore_patterns.get(lang_code, set())

    def _load_flat_pair(self, target_file: Path) -> Tuple[Dict[str, str], Dict[str, str]]:
        """Return ``(golden_flat, target_flat)`` for ``target_file``."""
        target_flat = self._flatten_dict(self._load_json(target_file))
        golden_flat = self._flatten_dict(self.golden_truth)
        return golden_flat, target_flat

    def _untranslated_keys(self, golden_flat: Dict[str, str], target_flat: Dict[str, str],
                           ignore_set: Set[str]) -> Set[str]:
        """Return keys whose target value looks untranslated.

        A key present in both mappings qualifies when its value starts with
        the ``[UNTRANSLATED]`` marker (regardless of the ignore set), or when
        it equals the reference value while the key is neither ignored nor
        expected to be identical across languages.
        """
        untranslated = set()
        for key, target_value in target_flat.items():
            if key not in golden_flat:
                continue
            # Flattened values are always strings, so no type check is needed.
            if target_value.startswith(self._UNTRANSLATED_MARKER):
                untranslated.add(key)
            elif (target_value == golden_flat[key]
                  and key not in ignore_set
                  and not self._is_expected_identical(key, golden_flat[key])):
                untranslated.add(key)
        return untranslated

    def find_missing_translations(self, target_file: Path) -> Set[str]:
        """Return keys present in en-GB but absent from ``target_file``.

        Keys in the language's ignore set are excluded.
        """
        golden_flat, target_flat = self._load_flat_pair(target_file)
        return set(golden_flat) - set(target_flat) - self._ignore_set_for(target_file)

    def find_untranslated_entries(self, target_file: Path) -> Set[str]:
        """Return keys of ``target_file`` that appear untranslated.

        See ``_untranslated_keys`` for the exact criteria.
        """
        golden_flat, target_flat = self._load_flat_pair(target_file)
        return self._untranslated_keys(golden_flat, target_flat, self._ignore_set_for(target_file))

    def _is_expected_identical(self, key: str, value: str) -> bool:
        """Tell whether a key/value pair may legitimately equal the en-GB value.

        True when the stripped value is one of ``ltr``/``rtl``/``True``/``False``
        or when the lower-cased key contains one of the language-independent
        patterns. The key test is a plain substring match, so e.g. a key
        containing "untrue" also matches "true".
        """
        if value.strip() in self._IDENTICAL_VALUES:
            return True
        lowered_key = key.lower()
        return any(pattern in lowered_key for pattern in self._IDENTICAL_KEY_PATTERNS)

    def find_extra_translations(self, target_file: Path) -> Set[str]:
        """Return keys present in ``target_file`` but not in en-GB."""
        golden_flat, target_flat = self._load_flat_pair(target_file)
        return set(target_flat) - set(golden_flat)

    def analyze_file(self, target_file: Path) -> Dict:
        """Run the complete analysis for one translation file.

        The file is read and flattened once and shared by all checks.

        Args:
            target_file: Path to ``<locales_dir>/<lang>/translation.json``.

        Returns:
            Dict with ``language`` (directory name with ``-`` replaced by
            ``_``), ``file``, the sorted ``missing_keys`` /
            ``untranslated_keys`` / ``extra_keys`` with their ``*_count``,
            ``total_keys`` (reference keys minus ignored ones) and
            ``completion_rate`` in percent.
        """
        lang_code = target_file.parent.name.replace('-', '_')
        ignore_set = self.ignore_patterns.get(lang_code, set())
        golden_flat, target_flat = self._load_flat_pair(target_file)

        golden_keys = set(golden_flat)
        target_keys = set(target_flat)
        missing = golden_keys - target_keys - ignore_set
        untranslated = self._untranslated_keys(golden_flat, target_flat, ignore_set)
        extra = target_keys - golden_keys

        # Completion rate ignores keys the language is allowed to skip.
        relevant_keys = golden_keys - ignore_set
        total_keys = len(relevant_keys)
        # A key counts as done when it exists in the target and was not
        # flagged as untranslated (marker entries are flagged there too).
        properly_translated = len((relevant_keys & target_keys) - untranslated)
        completion_rate = (properly_translated / total_keys) * 100 if total_keys > 0 else 0

        return {
            'language': lang_code,
            'file': target_file,
            'missing_count': len(missing),
            'missing_keys': sorted(missing),
            'untranslated_count': len(untranslated),
            'untranslated_keys': sorted(untranslated),
            'extra_count': len(extra),
            'extra_keys': sorted(extra),
            'total_keys': total_keys,
            'completion_rate': completion_rate
        }

    def analyze_all_files(self) -> List[Dict]:
        """Analyze every non-en-GB translation file, sorted by language code."""
        results = [self.analyze_file(file_path) for file_path in self.get_all_language_files()]
        return sorted(results, key=lambda x: x['language'])
def _print_key_sample(heading, keys, limit):
    """Print ``heading`` with the key count, then at most ``limit`` keys."""
    print(f"\n{heading} ({len(keys)}):")
    for key in keys[:limit]:
        print(f" - {key}")
    if len(keys) > limit:
        print(f" ... and {len(keys) - limit} more")


def main():
    """Command-line entry point: analyze translation files and print a report.

    Parses the command-line options, analyzes either the single language
    given with ``--language`` or every language found in the locales
    directory, and prints the results to stdout as JSON (``--format json``)
    or as a text report followed by an overall summary.

    Raises:
        SystemExit: With code 1 when ``--language`` names a language that has
            no ``translation.json``; the reason is written to stderr so it
            cannot end up inside redirected report output.
    """
    parser = argparse.ArgumentParser(description='Analyze translation files against en-GB golden truth')
    parser.add_argument('--locales-dir', default='frontend/public/locales',
                        help='Path to locales directory')
    parser.add_argument('--ignore-file', default='scripts/ignore_translation.toml',
                        help='Path to ignore patterns TOML file')
    parser.add_argument('--language', help='Analyze specific language only')
    parser.add_argument('--missing-only', action='store_true',
                        help='Show only missing translations')
    parser.add_argument('--untranslated-only', action='store_true',
                        help='Show only untranslated entries')
    parser.add_argument('--summary', action='store_true',
                        help='Show summary statistics only')
    parser.add_argument('--format', choices=['text', 'json'], default='text',
                        help='Output format')
    args = parser.parse_args()

    analyzer = TranslationAnalyzer(args.locales_dir, args.ignore_file)

    if args.language:
        target_file = Path(args.locales_dir) / args.language / "translation.json"
        if not target_file.exists():
            print(f"Error: Translation file not found for language: {args.language}", file=sys.stderr)
            sys.exit(1)
        results = [analyzer.analyze_file(target_file)]
    else:
        results = analyzer.analyze_all_files()

    if args.format == 'json':
        # default=str serializes the Path stored under the 'file' key.
        print(json.dumps(results, indent=2, default=str))
        return

    # Text format output
    separator = '=' * 60
    for result in results:
        print(f"\n{separator}")
        print(f"Language: {result['language']}")
        print(f"File: {result['file']}")
        print(f"Completion Rate: {result['completion_rate']:.1f}%")
        print(f"Total Keys in en-GB: {result['total_keys']}")

        # NOTE(review): the nesting below was reconstructed from statement
        # order; extra keys are assumed to be listed whenever --summary is
        # off, independent of --missing-only/--untranslated-only - confirm.
        if not args.summary:
            if not args.untranslated_only:
                _print_key_sample("Missing Translations", result['missing_keys'], 10)
            if not args.missing_only:
                _print_key_sample("Untranslated Entries", result['untranslated_keys'], 10)
            if result['extra_count'] > 0:
                _print_key_sample("Extra Keys Not in en-GB", result['extra_keys'], 5)

    print(f"\n{separator}")
    print("SUMMARY")
    print(separator)
    avg_completion = sum(r['completion_rate'] for r in results) / len(results) if results else 0
    print(f"Average Completion Rate: {avg_completion:.1f}%")
    print(f"Languages Analyzed: {len(results)}")

    # Rank languages from most to least complete.
    sorted_by_completion = sorted(results, key=lambda x: x['completion_rate'], reverse=True)
    print("\nTop 5 Most Complete Languages:")
    for result in sorted_by_completion[:5]:
        print(f" {result['language']}: {result['completion_rate']:.1f}%")
    print("\nBottom 5 Languages Needing Attention:")
    for result in sorted_by_completion[-5:]:
        print(f" {result['language']}: {result['completion_rate']:.1f}% ({result['missing_count']} missing, {result['untranslated_count']} untranslated)")
# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()