Stirling-PDF/scripts/translations/validate_placeholders.py
Ludy 8555fe3fb5
chore(ci): refine pre-commit workflows, add TOML sorting (#5648)
# Description of Changes

This pull request introduces several improvements to pre-commit
configuration and automation, enhances error handling in scripts, and
updates dependencies and exclusions for code quality tools. The main
changes are grouped below:

**Pre-commit and CI workflow improvements:**

* The pre-commit workflow in `.github/workflows/pre_commit.yml` now runs
specific hooks (`ruff`, `ruff-format`, `codespell`, `gitleaks`,
`end-of-file-fixer`, `trailing-whitespace`) individually instead of
running all hooks at once, providing more granular feedback.
* The sync files workflow in `.github/workflows/sync_files_v2.yml` now
installs pre-commit dependencies and runs the `toml-sort-fix` hook to
ensure TOML files are consistently sorted.
* Added the `toml-sort-fix` hook from the `toml-sort` repository to
`.pre-commit-config.yaml` for sorting TOML files in the locales
directory.

**Pre-commit configuration and dependency updates:**

* Updated the `ruff-pre-commit` repository version from `v0.14.8` to
`v0.14.14` in `.pre-commit-config.yaml`.
* Updated the `codespell` hook to expand the ignore words list and to
exclude the `frontend/public/vendor` directory.

**Script improvements and error handling:**

* Replaced bare `except:` clauses with `except Exception:` in
`scripts/convert_cff_to_ttf.py` for safer error handling.
[[1]](diffhunk://#diff-8c68a22370903bb52267848deaf7298604704c59292650d9dfc1d1975fa8bc53L194-R194)
[[2]](diffhunk://#diff-8c68a22370903bb52267848deaf7298604704c59292650d9dfc1d1975fa8bc53L318-R325)
* Minor code cleanup in translation validation scripts by removing
unused variables.
[[1]](diffhunk://#diff-2399f964d817f2e61b818c3f6543ebce9e230778b35ab62bc8578cb7cc9da99eL124)
[[2]](diffhunk://#diff-3b83f838d72dce860ff1f7b24a033f02134aaac3d7abdf061d72c1c21943f896L117)
* Removed unused `progress` variable assignment in
`scripts/counter_translation_v3.py` for clarity.

---

## Checklist

### General

- [ ] I have read the [Contribution
Guidelines](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/CONTRIBUTING.md)
- [ ] I have read the [Stirling-PDF Developer
Guide](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/devGuide/DeveloperGuide.md)
(if applicable)
- [ ] I have read the [How to add new languages to
Stirling-PDF](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/devGuide/HowToAddNewLanguage.md)
(if applicable)
- [ ] I have performed a self-review of my own code
- [ ] My changes generate no new warnings

### Documentation

- [ ] I have updated relevant docs on [Stirling-PDF's doc
repo](https://github.com/Stirling-Tools/Stirling-Tools.github.io/blob/main/docs/)
(if functionality has heavily changed)
- [ ] I have read the section [Add New Translation
Tags](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/devGuide/HowToAddNewLanguage.md#add-new-translation-tags)
(for new translation tags only)

### Translations (if applicable)

- [ ] I ran
[`scripts/counter_translation.py`](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/docs/counter_translation.md)

### UI Changes (if applicable)

- [ ] Screenshots or videos demonstrating the UI changes are attached
(e.g., as comments or direct attachments in the PR)

### Testing (if applicable)

- [ ] I have tested my changes locally. Refer to the [Testing
Guide](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/devGuide/DeveloperGuide.md#6-testing)
for more details.
2026-02-06 11:09:04 +00:00

188 lines
5.6 KiB
Python

#!/usr/bin/env python3
"""
Validate that translation files have the same placeholders as en-GB (source of truth).

Usage:
    python scripts/translations/validate_placeholders.py [--language LANG] [--verbose] [--json]

    --language: Validate a specific language (e.g., es-ES, de-DE)
    --verbose, -v: Show text samples for each issue
    --json: Output results as JSON

This script only reports mismatches; it never modifies translation files.
"""
import json
import re
import sys
from pathlib import Path
from typing import Dict, List, Set
import argparse
import tomllib # Python 3.11+ (stdlib)
def find_placeholders(text: str) -> Set[str]:
    """Collect every distinct placeholder token that occurs in ``text``.

    A placeholder is a brace-delimited token such as ``{n}``, ``{0}`` or
    ``{{var}}``. Anything that is not a string is treated as having no
    placeholders, so an empty set is returned for it.
    """
    if isinstance(text, str):
        # The pattern has no capture groups, so group(0) is the whole token.
        return {match.group(0) for match in re.finditer(r"\{\{?[^}]+\}\}?", text)}
    return set()
def flatten_dict(d: dict, parent_key: str = "", sep: str = ".") -> Dict[str, str]:
    """Collapse a nested mapping into a single level keyed by joined paths.

    Each leaf value is stored under the keys leading to it, joined with
    ``sep`` (``{"a": {"b": 1}}`` becomes ``{"a.b": 1}``). ``parent_key`` is
    the path prefix accumulated so far and is empty at the top level. Only
    ``dict`` values are descended into; every other value is kept as-is.
    """
    flat: Dict[str, str] = {}
    for name, value in d.items():
        path = name if not parent_key else f"{parent_key}{sep}{name}"
        if not isinstance(value, dict):
            flat[path] = value
            continue
        # Nested table: merge its flattened entries under the current path.
        flat.update(flatten_dict(value, path, sep=sep))
    return flat
def validate_language(
    en_gb_flat: Dict[str, str], lang_flat: Dict[str, str], lang_code: str
) -> List[Dict]:
    """Compare a translation's placeholders with the en-GB reference.

    Only keys present in both flattened mappings are checked; keys that the
    translation does not contain are skipped. One record is returned per key
    whose placeholder sets differ, in en-GB key order. Each record holds the
    language code, the key, the ``missing`` and ``extra`` placeholder sets,
    and both texts.
    """
    mismatches: List[Dict] = []
    for key, source_text in en_gb_flat.items():
        if key not in lang_flat:
            continue
        translated_text = lang_flat[key]
        expected = find_placeholders(source_text)
        actual = find_placeholders(translated_text)
        if expected == actual:
            continue
        mismatches.append(
            {
                "language": lang_code,
                "key": key,
                "missing": expected - actual,
                "extra": actual - expected,
                "en_text": source_text,
                "lang_text": translated_text,
            }
        )
    return mismatches
def _format_placeholders(placeholders) -> str:
    """Render placeholders like a set literal, but in sorted (stable) order."""
    return "{" + ", ".join(repr(p) for p in sorted(placeholders)) + "}"


def print_issues(issues: List[Dict], verbose: bool = False):
    """Print validation issues in a readable format.

    Args:
        issues: Issue records with the keys ``language``, ``key``,
            ``missing``, ``extra``, ``en_text`` and ``lang_text``.
        verbose: When true, also print the first 150 characters of the
            en-GB text and of the translated text for each issue.

    Prints a success line and returns early when ``issues`` is empty.
    """
    if not issues:
        print("✅ No placeholder validation issues found!")
        return
    print(f"❌ Found {len(issues)} placeholder validation issue(s):\n")
    print("=" * 100)
    for i, issue in enumerate(issues, 1):
        print(f"\n{i}. Language: {issue['language']}")
        print(f" Key: {issue['key']}")
        # Sets iterate in hash order, which varies between runs; sorting
        # keeps the report reproducible.
        if issue["missing"]:
            print(f" ⚠️ MISSING placeholders: {_format_placeholders(issue['missing'])}")
        if issue["extra"]:
            print(f" ⚠️ EXTRA placeholders: {_format_placeholders(issue['extra'])}")
        if verbose:
            # Values are not guaranteed to be strings (e.g. an integer in the
            # TOML file), so convert before slicing to avoid a TypeError.
            print(f" EN-GB: {str(issue['en_text'])[:150]}")
            print(f" {issue['language']}: {str(issue['lang_text'])[:150]}")
        print("-" * 100)
def main():
    """Validate translation placeholders from the command line.

    Loads ``frontend/public/locales/en-GB/translation.toml`` as the reference
    and compares either the language given with ``--language`` or every other
    locale directory that contains a ``translation.toml``. The locale path is
    relative, so the script must be run from the repository root.

    Results are printed as a per-language report, or as a JSON document with
    ``--json``. The process exits with status 1 when the en-GB file is missing
    or any issue was found, and with status 0 otherwise.
    """
    parser = argparse.ArgumentParser(
        description="Validate translation placeholder consistency"
    )
    parser.add_argument(
        "--language",
        help="Specific language code to validate (e.g., es-ES)",
        default=None,
    )
    parser.add_argument(
        "--verbose",
        "-v",
        action="store_true",
        help="Show full text samples for each issue",
    )
    parser.add_argument("--json", action="store_true", help="Output results as JSON")
    args = parser.parse_args()

    # In JSON mode stdout must contain nothing but the JSON document, so
    # warnings and errors are diverted to stderr there.
    diagnostics = sys.stderr if args.json else sys.stdout

    # Define paths
    locales_dir = Path("frontend/public/locales")
    en_gb_path = locales_dir / "en-GB" / "translation.toml"
    if not en_gb_path.exists():
        print(
            f"❌ Error: en-GB translation file not found at {en_gb_path}",
            file=diagnostics,
        )
        sys.exit(1)

    # Load en-GB (source of truth)
    with open(en_gb_path, "rb") as f:
        en_gb = tomllib.load(f)
    en_gb_flat = flatten_dict(en_gb)

    # Get list of languages to validate
    if args.language:
        languages = [args.language]
    else:
        # Validate all languages except en-GB
        languages = [
            d.name
            for d in locales_dir.iterdir()
            if d.is_dir() and d.name != "en-GB" and (d / "translation.toml").exists()
        ]

    all_issues = []
    # Validate each language
    for lang_code in sorted(languages):
        lang_path = locales_dir / lang_code / "translation.toml"
        if not lang_path.exists():
            print(
                f"⚠️ Warning: {lang_code}/translation.toml not found, skipping",
                file=diagnostics,
            )
            continue
        # Load language file
        with open(lang_path, "rb") as f:
            lang_data = tomllib.load(f)
        lang_flat = flatten_dict(lang_data)
        all_issues.extend(validate_language(en_gb_flat, lang_flat, lang_code))

    # Output results
    if args.json:
        # The "missing"/"extra" values are sets, which json cannot encode;
        # emit them as sorted lists so the output is valid and stable.
        serializable = [
            {
                **issue,
                "missing": sorted(issue["missing"]),
                "extra": sorted(issue["extra"]),
            }
            for issue in all_issues
        ]
        print(json.dumps(serializable, indent=2, ensure_ascii=False))
    elif all_issues:
        # Group by language
        by_language = {}
        for issue in all_issues:
            by_language.setdefault(issue["language"], []).append(issue)
        print("📊 Validation Summary:")
        print(f" Total issues: {len(all_issues)}")
        print(f" Languages with issues: {len(by_language)}\n")
        for lang in sorted(by_language):
            print(f"\n{'=' * 100}")
            print(f"Language: {lang} ({len(by_language[lang])} issue(s))")
            print(f"{'=' * 100}")
            print_issues(by_language[lang], verbose=args.verbose)
    else:
        print("✅ All translations have correct placeholders!")

    # Exit with error code if issues found
    sys.exit(1 if all_issues else 0)
# Run the validation only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()