Replace check_language_json.py with sync_translations.py

Removed .github/scripts/check_language_json.py and updated the workflow to use .github/scripts/sync_translations.py for translation checks and syncing. Updated the usage documentation in sync_translations.py. Refactored scripts/counter_translation_v2.py to use sync_translations.py for translation progress calculation, simplifying logic and removing TOML-based ignore handling.
This commit is contained in:
Ludy87 2025-10-27 08:50:51 +01:00
parent 8fc3f3e8cb
commit 427c52e0cc
No known key found for this signature in database
GPG Key ID: 92696155E0220F94
4 changed files with 90 additions and 520 deletions

View File

@ -1,345 +0,0 @@
"""
Author: Ludy87
Description: This script processes JSON translation files for localization checks. It compares translation files in a branch with
a reference file to ensure consistency. The script performs two main checks:
1. Verifies that the number of translation keys in the translation files matches the reference file.
2. Ensures that all keys in the translation files are present in the reference file and vice versa.
The script also provides functionality to update the translation files to match the reference file by adding missing keys and
adjusting the format.
Usage:
python check_language_json.py --reference-file <path_to_reference_file> --branch <branch_name> [--actor <actor_name>] [--files <list_of_changed_files>]
"""
# Sample for Windows:
# python .github/scripts/check_language_json.py --reference-file frontend/public/locales/en-GB/translation.json --branch "" --files frontend/public/locales/de-DE/translation.json frontend/public/locales/fr-FR/translation.json
import copy
import glob
import os
import argparse
import re
import json
def find_duplicate_keys(file_path, keys=None, prefix=""):
    """
    Identifies duplicate keys in a JSON file (including nested keys).

    NOTE: ``json.load`` silently keeps only the last value when a key
    appears twice in the same object, so in-file duplicates must be caught
    while parsing (via ``object_pairs_hook``) — inspecting the already
    parsed dict can never find them.

    :param file_path: Path to the JSON file.
    :param keys: Dictionary tracking already-seen flattened keys; pass the
        same dict across calls to detect collisions across several files.
    :param prefix: Prefix for nested keys.
    :return: List of tuples (key, first_occurrence, duplicate).
    """
    if keys is None:
        keys = {}
    duplicates = []

    def _pairs_hook(pairs):
        # Invoked for every JSON object during parsing, so repeated keys
        # are still visible here (they are collapsed in the final dict).
        obj = {}
        for key, value in pairs:
            if key in obj:
                duplicates.append((key, key, key))
            obj[key] = value
        return obj

    with open(file_path, "r", encoding="utf-8") as file:
        data = json.load(file, object_pairs_hook=_pairs_hook)

    def process_dict(obj, current_prefix=""):
        # Flatten nested keys and record collisions against `keys`, which
        # may carry state from previously processed files.
        for key, value in obj.items():
            full_key = f"{current_prefix}.{key}" if current_prefix else key
            if isinstance(value, dict):
                process_dict(value, full_key)
            elif full_key in keys:
                duplicates.append((full_key, keys[full_key], full_key))
            else:
                keys[full_key] = full_key

    process_dict(data, prefix)
    return duplicates
# Maximum accepted size for a translation JSON file (500 KB).
# Larger files are rejected before parsing as a potential
# denial-of-service risk (see check_for_differences).
MAX_FILE_SIZE = 500 * 1024
def parse_json_file(file_path):
    """
    Load a JSON translation file and flatten it to dot-notation keys.

    :param file_path: Path to the JSON file.
    :return: Dict mapping "a.b.c" style keys to their leaf values,
        in depth-first document order.
    """
    with open(file_path, "r", encoding="utf-8") as handle:
        tree = json.load(handle)

    flat = {}

    def _walk(node, trail):
        for key, value in node.items():
            dotted = f"{trail}.{key}" if trail else key
            if isinstance(value, dict):
                _walk(value, dotted)
            else:
                flat[dotted] = value

    _walk(tree, "")
    return flat
def unflatten_dict(d, sep="."):
    """
    Rebuild a nested dictionary from dot-notation flat keys.

    :param d: Flattened dictionary (e.g. {"a.b": 1}).
    :param sep: Separator used inside the flat keys.
    :return: Nested dictionary (e.g. {"a": {"b": 1}}).
    """
    nested = {}
    for dotted, value in d.items():
        *branches, leaf = dotted.split(sep)
        node = nested
        for branch in branches:
            node = node.setdefault(branch, {})
        node[leaf] = value
    return nested
def write_json_file(file_path, updated_properties):
    """
    Persist a flat property mapping to disk as nested, pretty-printed JSON.

    :param file_path: Destination JSON file.
    :param updated_properties: Flat dict with dot-notation keys.
    """
    tree = unflatten_dict(updated_properties)
    with open(file_path, "w", encoding="utf-8", newline="\n") as handle:
        json.dump(tree, handle, ensure_ascii=False, indent=2)
        # Keep a trailing newline so the files stay POSIX-friendly.
        handle.write("\n")
def update_missing_keys(reference_file, file_list, branch=""):
    """
    Bring each translation file up to date with the reference file.

    Existing translations are kept; keys that only exist in the reference
    are copied over with the reference value as a placeholder. Keys absent
    from the reference are dropped, and key order follows the reference.

    Fixes over the previous version, which silently skipped every file in
    the current repo layout:
    - The old basename guard compared against the reference *basename*
      ("translation.json"), which every target shares, so all files were
      skipped. Now the full path is compared instead.
    - The old directory guard required the file's parent to end in
      "locales", but files live at locales/<locale>/translation.json;
      the grandparent directory is checked now.

    :param reference_file: Path to the reference JSON file.
    :param file_list: Translation files to update.
    :param branch: Root directory the file paths are relative to.
    """
    reference_properties = parse_json_file(reference_file)
    reference_abs = os.path.abspath(reference_file)
    for file_path in file_list:
        target = os.path.join(branch, file_path)
        # Never rewrite the reference file itself.
        if os.path.abspath(target) == reference_abs:
            continue
        if not file_path.endswith(".json"):
            continue
        # Expect .../locales/<locale>/translation.json
        locales_root = os.path.dirname(os.path.dirname(file_path))
        if not locales_root.endswith("locales"):
            continue
        current = parse_json_file(target)
        # Reference dictates the key set and order; existing translations win.
        merged = {
            key: current.get(key, fallback)
            for key, fallback in reference_properties.items()
        }
        write_json_file(target, merged)
def check_for_missing_keys(reference_file, file_list, branch):
    """Delegate to update_missing_keys with the same arguments (no return value)."""
    update_missing_keys(reference_file, file_list, branch)
def read_json_keys(file_path):
    """
    Return the flattened key/value dict for a JSON file, or {} when the
    path is not a regular file.

    (The previous `isfile(...) and exists(...)` check was redundant:
    `os.path.isfile` already implies existence.)

    :param file_path: Path to the JSON file.
    :return: Flat dict of dot-notation keys, or {} if unreadable.
    """
    if os.path.isfile(file_path):
        return parse_json_file(file_path)
    return {}
def check_for_differences(reference_file, file_list, branch, actor):
    """
    Compare translation files against the reference and print a Markdown
    report intended for a PR comment.

    Three checks per file: (1) key count matches the reference,
    (2) key sets are identical, (3) no duplicate keys inside the file.

    :param reference_file: Path to the reference JSON file.
    :param file_list: Files to check (a single space-separated string is
        also accepted).
    :param branch: Root directory the file paths are relative to.
    :param actor: GitHub actor to mention in the report.
    :raises ValueError: If a file lies outside the locales directory or
        exceeds MAX_FILE_SIZE.
    """
    reference_branch = branch
    basename_reference_file = os.path.basename(reference_file)
    report = []
    report.append(f"#### 🔄 Reference Branch: `{reference_branch}`")
    reference_keys = read_json_keys(reference_file)
    has_differences = False
    only_reference_file = True
    file_arr = file_list
    if len(file_list) == 1:
        # A single entry may actually be a space-separated list of paths.
        file_arr = file_list[0].split()
    base_dir = os.path.abspath(
        os.path.join(os.getcwd(), "frontend", "public", "locales")
    )
    for file_path in file_arr:
        file_normpath = os.path.normpath(file_path)
        absolute_path = os.path.abspath(file_normpath)
        # Verify that file is within the expected directory
        if not absolute_path.startswith(base_dir):
            raise ValueError(f"Unsafe file found: {file_normpath}")
        # Verify file size before processing
        if os.path.getsize(os.path.join(branch, file_normpath)) > MAX_FILE_SIZE:
            raise ValueError(
                f"The file {file_normpath} is too large and could pose a security risk."
            )
        basename_current_file = os.path.basename(os.path.join(branch, file_normpath))
        locale_dir = os.path.basename(os.path.dirname(file_normpath))
        # The reference locale itself needs no comparison.
        if (
            basename_current_file == basename_reference_file
            and locale_dir == "en-GB"
        ):
            continue
        if not file_normpath.endswith(".json") or basename_current_file != "translation.json":
            continue
        only_reference_file = False
        report.append(f"#### 📃 **File Check:** `{locale_dir}/{basename_current_file}`")
        current_keys = read_json_keys(os.path.join(branch, file_path))
        # --- Check 1: key counts ---
        reference_key_count = len(reference_keys)
        current_key_count = len(current_keys)
        if reference_key_count != current_key_count:
            report.append("")
            report.append("1. **Test Status:** ❌ **_Failed_**")
            report.append(" - **Issue:**")
            has_differences = True
            if reference_key_count > current_key_count:
                report.append(
                    f" - **_Mismatched key count_**: {reference_key_count} (reference) vs {current_key_count} (current). Translation keys are missing."
                )
            elif reference_key_count < current_key_count:
                report.append(
                    f" - **_Too many keys_**: {reference_key_count} (reference) vs {current_key_count} (current). Please verify if there are additional keys that need to be removed."
                )
        else:
            report.append("1. **Test Status:** ✅ **_Passed_**")
        # --- Check 2: key sets ---
        # Keys present locally but absent from the reference are "extra";
        # keys in the reference but absent locally are "missing". (The
        # previous version had these variable names swapped, though the
        # printed text was already correct.)
        current_keys_set = set(current_keys.keys())
        reference_keys_set = set(reference_keys.keys())
        extra_keys_list = list(current_keys_set.difference(reference_keys_set))
        missing_keys_list = list(reference_keys_set.difference(current_keys_set))
        if extra_keys_list or missing_keys_list:
            has_differences = True
            extra_keys_str = "`, `".join(extra_keys_list)
            missing_keys_str = "`, `".join(missing_keys_list)
            report.append("2. **Test Status:** ❌ **_Failed_**")
            report.append(" - **Issue:**")
            if extra_keys_list:
                report.append(
                    f" - **_Extra keys in `{locale_dir}/{basename_current_file}`_**: `{extra_keys_str}` that are not present in **_`{basename_reference_file}`_**."
                )
            if missing_keys_list:
                report.append(
                    f" - **_Missing keys in `{locale_dir}/{basename_current_file}`_**: `{missing_keys_str}` that are not present in **_`{basename_reference_file}`_**."
                )
        else:
            report.append("2. **Test Status:** ✅ **_Passed_**")
        # --- Check 3: duplicate keys ---
        # Parse once and reuse the result (the file used to be parsed twice).
        duplicates = find_duplicate_keys(os.path.join(branch, file_normpath))
        if duplicates:
            has_differences = True
            output = "\n".join(
                [
                    f" - `{key}`: first at {first}, duplicate at `{duplicate}`"
                    for key, first, duplicate in duplicates
                ]
            )
            report.append("3. **Test Status:** ❌ **_Failed_**")
            report.append(" - **Issue:**")
            report.append(" - duplicate entries were found:")
            report.append(output)
        else:
            report.append("3. **Test Status:** ✅ **_Passed_**")
        report.append("")
        report.append("---")
        report.append("")
    if has_differences:
        report.append("## ❌ Overall Check Status: **_Failed_**")
        report.append("")
        report.append(
            f"@{actor} please check your translation if it conforms to the standard. Follow the format of [en-GB/translation.json](https://github.com/Stirling-Tools/Stirling-PDF/blob/V2/frontend/public/locales/en-GB/translation.json)"
        )
    else:
        report.append("## ✅ Overall Check Status: **_Success_**")
        report.append("")
        report.append(
            f"Thanks @{actor} for your help in keeping the translations up to date."
        )
    # Stay silent when only the reference file was supplied.
    if not only_reference_file:
        print("\n".join(report))
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Find missing keys")
    parser.add_argument(
        "--actor",
        required=False,
        help="Actor from PR.",
    )
    parser.add_argument(
        "--reference-file",
        required=True,
        help="Path to the reference file.",
    )
    parser.add_argument(
        "--branch",
        type=str,
        required=True,
        help="Branch name.",
    )
    parser.add_argument(
        "--check-file",
        type=str,
        required=False,
        help="List of changed files, separated by spaces.",
    )
    parser.add_argument(
        "--files",
        nargs="+",
        required=False,
        help="List of changed files, separated by spaces.",
    )
    args = parser.parse_args()

    # Sanitize --actor input to avoid injection attacks.
    # BUGFIX: the old pattern r"[^a-zA-Z0-9_\\-]" had an escaped backslash
    # inside a raw string, so literal backslashes survived "sanitization".
    if args.actor:
        args.actor = re.sub(r"[^a-zA-Z0-9_-]", "", args.actor)
    # Sanitize --branch input to avoid injection attacks (same fix).
    if args.branch:
        args.branch = re.sub(r"[^a-zA-Z0-9-]", "", args.branch)

    file_list = args.files
    if file_list is None:
        if args.check_file:
            file_list = [args.check_file]
        else:
            # No explicit files: sync every locale against the reference.
            file_list = glob.glob(
                os.path.join(
                    os.getcwd(),
                    "frontend",
                    "public",
                    "locales",
                    "*",
                    "translation.json",
                )
            )
        update_missing_keys(args.reference_file, file_list)
    else:
        check_for_differences(args.reference_file, file_list, args.branch, args.actor)

View File

@ -12,7 +12,7 @@ It does two things:
Also prints a CI-friendly report (intended for PR comments).
Usage:
python sync_translations.py --reference-file <path_to_reference_json> [--branch <branch_root>] [--actor <actor_name>] [--files <list_of_target_jsons>] [--check] [--prune] [--dry-run]
python sync_translations.py --reference-file <path_to_reference_json> [--branch <branch_root>] [--actor <actor_name>] [--files <list_of_target_jsons>] [--check] [--prune] [--procent-translations] [--dry-run]
"""
from __future__ import annotations

View File

@ -54,7 +54,7 @@ jobs:
- name: Sync translation JSON files
run: |
python .github/scripts/check_language_json.py --reference-file "frontend/public/locales/en-GB/translation.json" --branch V2
python .github/scripts/sync_translations.py --reference-file "frontend/public/locales/en-GB/translation.json" --branch V2
- name: Commit translation files
run: |

View File

@ -1,204 +1,119 @@
"""A script to update language progress status in README.md based on
JSON translation file comparison.
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
This script compares the default translation JSON file with others in the locales directory to
determine language progress.
It then updates README.md based on provided progress list.
"""
A tiny helper that updates README.md translation progress by asking
.sync_translations.py for the per-locale percentage (via --procent-translations).
Author: Ludy87
"""
Example:
To use this script, simply run it from command line:
$ python counter_translation_v2.py
""" # noqa: D205
from __future__ import annotations
import glob
import os
import re
import json
import tomlkit
import tomlkit.toml_file
import subprocess
from pathlib import Path
from typing import List, Tuple
def convert_to_multiline(data: tomlkit.TOMLDocument) -> tomlkit.TOMLDocument:
"""Converts 'ignore' and 'missing' arrays to multiline arrays and sorts the first-level keys of the TOML document.
Enhances readability and consistency in the TOML file by ensuring arrays contain unique and sorted entries.
Parameters:
data (tomlkit.TOMLDocument): The original TOML document containing the data.
Returns:
tomlkit.TOMLDocument: A new TOML document with sorted keys and properly formatted arrays.
""" # noqa: D205
sorted_data = tomlkit.document()
for key in sorted(data.keys()):
value = data[key]
if isinstance(value, dict):
new_table = tomlkit.table()
for subkey in ("ignore", "missing"):
if subkey in value:
# Convert the list to a set to remove duplicates, sort it, and convert to multiline for readability
unique_sorted_array = sorted(set(value[subkey]))
array = tomlkit.array()
array.multiline(True)
for item in unique_sorted_array:
array.append(item)
new_table[subkey] = array
sorted_data[key] = new_table
else:
# Add other types of data unchanged
sorted_data[key] = value
return sorted_data
REPO_ROOT = Path(os.getcwd())
LOCALES_DIR = REPO_ROOT / "frontend" / "public" / "locales"
REF_FILE = LOCALES_DIR / "en-GB" / "translation.json"
SYNC_SCRIPT = REPO_ROOT / ".github" / "scripts" / "sync_translations.py"
README = REPO_ROOT / "README.md"
def write_readme(progress_list: list[tuple[str, int]]) -> None:
"""Updates the progress status in the README.md file based
on the provided progress list.
Parameters:
progress_list (list[tuple[str, int]]): A list of tuples containing
language and progress percentage.
Returns:
None
""" # noqa: D205
with open("README.md", encoding="utf-8") as file:
content = file.readlines()
for i, line in enumerate(content[2:], start=2):
for progress in progress_list:
language, value = progress
if language in line:
if match := re.search(r"\!\[(\d+(\.\d+)?)%\]\(.*\)", line):
content[i] = line.replace(
match.group(0),
f"![{value}%](https://geps.dev/progress/{value})",
)
with open("README.md", "w", encoding="utf-8", newline="\n") as file:
file.writelines(content)
def find_locale_files() -> List[Path]:
return sorted(
Path(p) for p in glob.glob(str(LOCALES_DIR / "*" / "translation.json"))
)
def parse_json_file(file_path):
def percent_done_for_file(file_path: Path) -> int:
"""
Parses a JSON translation file and returns a flat dictionary of all keys.
:param file_path: Path to the JSON file.
:return: Dictionary with flattened keys and values.
Calls sync_translations.py --procent-translations for a single locale file.
Returns an int 0..100.
"""
with open(file_path, "r", encoding="utf-8") as file:
data = json.load(file)
# en-GB / en-US are always 100% by definition
norm = str(file_path).replace("\\", "/")
if norm.endswith("en-GB/translation.json") or norm.endswith(
"en-US/translation.json"
):
return 100
def flatten_dict(d, parent_key="", sep="."):
items = {}
for k, v in d.items():
new_key = f"{parent_key}{sep}{k}" if parent_key else k
if isinstance(v, dict):
items.update(flatten_dict(v, new_key, sep=sep))
else:
items[new_key] = v
return items
return flatten_dict(data)
cmd = [
"python",
str(SYNC_SCRIPT),
"--reference-file",
str(REF_FILE),
"--files",
str(file_path),
"--check",
"--procent-translations",
]
res = subprocess.run(cmd, capture_output=True, text=True, check=True)
out = res.stdout.strip()
return int(float(out))
def compare_files(
default_file_path, file_paths, ignore_translation_file
) -> list[tuple[str, int]]:
"""Compares the default JSON translation file with other
translation files in the locales directory.
def update_readme(progress_list: List[Tuple[str, int]]) -> None:
"""
Update README badges. Expects lines like:
... [xx%](https://geps.dev/progress/xx)
and replaces xx with the new percent.
"""
if not README.exists():
print("README.md not found — skipping write.")
return
Parameters:
default_file_path (str): The path to the default translation JSON file.
file_paths (list): List of paths to translation JSON files.
ignore_translation_file (str): Path to the TOML file with ignore rules.
content = README.read_text(encoding="utf-8").splitlines(keepends=True)
Returns:
list[tuple[str, int]]: A list of tuples containing
language and progress percentage.
""" # noqa: D205
default_keys = parse_json_file(default_file_path)
num_keys = len(default_keys)
# we start at line 2 like your original (skip title, etc.)
for i in range(2, len(content)):
line = content[i]
for lang, value in progress_list:
if lang in line:
content[i] = re.sub(
r"!\[(\d+(?:\.\d+)?)%\]\(https://geps\.dev/progress/\d+\)",
f"![{value}%](https://geps.dev/progress/{value})",
line,
)
break
result_list = []
sort_ignore_translation: tomlkit.TOMLDocument
README.write_text("".join(content), encoding="utf-8", newline="\n")
# read toml
with open(ignore_translation_file, encoding="utf-8") as f:
sort_ignore_translation = tomlkit.parse(f.read())
for file_path in file_paths:
# Extract language code from directory name
locale_dir = os.path.basename(os.path.dirname(file_path))
def main() -> None:
files = find_locale_files()
if not files:
print("No translation.json files found.")
return
# Convert locale format from hyphen to underscore for TOML compatibility
# e.g., en-GB -> en_GB, sr-LATN-RS -> sr_LATN_RS
language = locale_dir.replace("-", "_")
results: List[Tuple[str, int]] = []
for f in files:
# language label from folder, e.g. de-DE, sr-LATN-RS
lang = f.parent.name.replace(
"-", "_"
) # keep hyphenated form to match README lines
pct = percent_done_for_file(f)
results.append((lang, pct))
fails = 0
if language in ["en_GB", "en_US"]:
result_list.append(("en_GB", 100))
result_list.append(("en_US", 100))
continue
# ensure en-GB/en-US are included & set to 100
have = {lang for lang, _ in results}
for hard in ("en-GB", "en-US"):
if hard not in have:
results.append((hard, 100))
if language not in sort_ignore_translation:
sort_ignore_translation[language] = tomlkit.table()
# optional: sort by percent desc (nice to have)
results.sort(key=lambda x: x[1], reverse=True)
if (
"ignore" not in sort_ignore_translation[language]
or len(sort_ignore_translation[language].get("ignore", [])) < 1
):
sort_ignore_translation[language]["ignore"] = tomlkit.array(
["language.direction"]
)
update_readme(results)
current_keys = parse_json_file(file_path)
# Compare keys
for default_key, default_value in default_keys.items():
if default_key not in current_keys:
# Key is missing entirely
if default_key not in sort_ignore_translation[language]["ignore"]:
print(f"{language}: Key '{default_key}' is missing.")
fails += 1
elif (
default_value == current_keys[default_key]
and default_key not in sort_ignore_translation[language]["ignore"]
):
# Key exists but value is untranslated (same as reference)
print(f"{language}: Key '{default_key}' is missing the translation.")
fails += 1
elif default_value != current_keys[default_key]:
# Key is translated, remove from ignore list if present
if default_key in sort_ignore_translation[language]["ignore"]:
sort_ignore_translation[language]["ignore"].remove(default_key)
print(f"{language}: {fails} out of {num_keys} keys are not translated.")
result_list.append(
(
language,
int((num_keys - fails) * 100 / num_keys),
)
)
ignore_translation = convert_to_multiline(sort_ignore_translation)
with open(ignore_translation_file, "w", encoding="utf-8", newline="\n") as file:
file.write(tomlkit.dumps(ignore_translation))
unique_data = list(set(result_list))
unique_data.sort(key=lambda x: x[1], reverse=True)
return unique_data
# also print a compact summary to stdout (useful in CI logs)
# for lang, pct in results:
# print(f"{lang}: {pct}%")
if __name__ == "__main__":
directory = os.path.join(os.getcwd(), "frontend", "public", "locales")
translation_file_paths = glob.glob(os.path.join(directory, "*", "translation.json"))
reference_file = os.path.join(directory, "en-GB", "translation.json")
scripts_directory = os.path.join(os.getcwd(), "scripts")
translation_state_file = os.path.join(scripts_directory, "ignore_translation.toml")
write_readme(
compare_files(reference_file, translation_file_paths, translation_state_file)
)
main()