Replace check_language_json.py with sync_translations.py

Removed .github/scripts/check_language_json.py and updated the workflow to use .github/scripts/sync_translations.py for translation checks and syncing. Updated the usage documentation in sync_translations.py. Refactored scripts/counter_translation_v2.py to use sync_translations.py for translation progress calculation, simplifying logic and removing TOML-based ignore handling.
This commit is contained in:
Ludy87 2025-10-27 08:50:51 +01:00
parent 8fc3f3e8cb
commit 427c52e0cc
No known key found for this signature in database
GPG Key ID: 92696155E0220F94
4 changed files with 90 additions and 520 deletions

View File

@ -1,345 +0,0 @@
"""
Author: Ludy87
Description: This script processes JSON translation files for localization checks. It compares translation files in a branch with
a reference file to ensure consistency. The script performs two main checks:
1. Verifies that the number of translation keys in the translation files matches the reference file.
2. Ensures that all keys in the translation files are present in the reference file and vice versa.
The script also provides functionality to update the translation files to match the reference file by adding missing keys and
adjusting the format.
Usage:
python check_language_json.py --reference-file <path_to_reference_file> --branch <branch_name> [--actor <actor_name>] [--files <list_of_changed_files>]
"""
# Sample for Windows:
# python .github/scripts/check_language_json.py --reference-file frontend/public/locales/en-GB/translation.json --branch "" --files frontend/public/locales/de-DE/translation.json frontend/public/locales/fr-FR/translation.json
import copy
import glob
import os
import argparse
import re
import json
def find_duplicate_keys(file_path, keys=None, prefix=""):
    """
    Identifies duplicate keys in a JSON file (including nested keys).

    NOTE: ``json.load`` silently keeps only the last value when a key
    appears twice in the same object, so in-file duplicates must be caught
    while parsing (via ``object_pairs_hook``) — inspecting the already
    parsed dict can never find them.

    :param file_path: Path to the JSON file.
    :param keys: Dictionary tracking already-seen flattened keys; pass the
        same dict across calls to detect collisions across several files.
    :param prefix: Prefix for nested keys.
    :return: List of tuples (key, first_occurrence, duplicate).
    """
    if keys is None:
        keys = {}
    duplicates = []

    def _pairs_hook(pairs):
        # Invoked for every JSON object during parsing, so repeated keys
        # are still visible here (they are collapsed in the final dict).
        obj = {}
        for key, value in pairs:
            if key in obj:
                duplicates.append((key, key, key))
            obj[key] = value
        return obj

    with open(file_path, "r", encoding="utf-8") as file:
        data = json.load(file, object_pairs_hook=_pairs_hook)

    def process_dict(obj, current_prefix=""):
        # Flatten nested keys and record collisions against `keys`, which
        # may carry state from previously processed files.
        for key, value in obj.items():
            full_key = f"{current_prefix}.{key}" if current_prefix else key
            if isinstance(value, dict):
                process_dict(value, full_key)
            elif full_key in keys:
                duplicates.append((full_key, keys[full_key], full_key))
            else:
                keys[full_key] = full_key

    process_dict(data, prefix)
    return duplicates
# Maximum accepted size for a translation JSON file (500 KB).
# Larger files are rejected before parsing as a potential
# denial-of-service risk (see check_for_differences).
MAX_FILE_SIZE = 500 * 1024
def parse_json_file(file_path):
    """
    Load a JSON translation file and flatten it to dot-notation keys.

    :param file_path: Path to the JSON file.
    :return: Dict mapping "a.b.c" style keys to their leaf values,
        in depth-first document order.
    """
    with open(file_path, "r", encoding="utf-8") as handle:
        tree = json.load(handle)

    flat = {}

    def _walk(node, trail):
        for key, value in node.items():
            dotted = f"{trail}.{key}" if trail else key
            if isinstance(value, dict):
                _walk(value, dotted)
            else:
                flat[dotted] = value

    _walk(tree, "")
    return flat
def unflatten_dict(d, sep="."):
    """
    Rebuild a nested dictionary from dot-notation flat keys.

    :param d: Flattened dictionary (e.g. {"a.b": 1}).
    :param sep: Separator used inside the flat keys.
    :return: Nested dictionary (e.g. {"a": {"b": 1}}).
    """
    nested = {}
    for dotted, value in d.items():
        *branches, leaf = dotted.split(sep)
        node = nested
        for branch in branches:
            node = node.setdefault(branch, {})
        node[leaf] = value
    return nested
def write_json_file(file_path, updated_properties):
    """
    Persist a flat property mapping to disk as nested, pretty-printed JSON.

    :param file_path: Destination JSON file.
    :param updated_properties: Flat dict with dot-notation keys.
    """
    tree = unflatten_dict(updated_properties)
    with open(file_path, "w", encoding="utf-8", newline="\n") as handle:
        json.dump(tree, handle, ensure_ascii=False, indent=2)
        # Keep a trailing newline so the files stay POSIX-friendly.
        handle.write("\n")
def update_missing_keys(reference_file, file_list, branch=""):
    """
    Bring each translation file up to date with the reference file.

    Existing translations are kept; keys that only exist in the reference
    are copied over with the reference value as a placeholder. Keys absent
    from the reference are dropped, and key order follows the reference.

    Fixes over the previous version, which silently skipped every file in
    the current repo layout:
    - The old basename guard compared against the reference *basename*
      ("translation.json"), which every target shares, so all files were
      skipped. Now the full path is compared instead.
    - The old directory guard required the file's parent to end in
      "locales", but files live at locales/<locale>/translation.json;
      the grandparent directory is checked now.

    :param reference_file: Path to the reference JSON file.
    :param file_list: Translation files to update.
    :param branch: Root directory the file paths are relative to.
    """
    reference_properties = parse_json_file(reference_file)
    reference_abs = os.path.abspath(reference_file)
    for file_path in file_list:
        target = os.path.join(branch, file_path)
        # Never rewrite the reference file itself.
        if os.path.abspath(target) == reference_abs:
            continue
        if not file_path.endswith(".json"):
            continue
        # Expect .../locales/<locale>/translation.json
        locales_root = os.path.dirname(os.path.dirname(file_path))
        if not locales_root.endswith("locales"):
            continue
        current = parse_json_file(target)
        # Reference dictates the key set and order; existing translations win.
        merged = {
            key: current.get(key, fallback)
            for key, fallback in reference_properties.items()
        }
        write_json_file(target, merged)
def check_for_missing_keys(reference_file, file_list, branch):
    """Delegate to update_missing_keys with the same arguments (no return value)."""
    update_missing_keys(reference_file, file_list, branch)
def read_json_keys(file_path):
    """
    Return the flattened key/value dict for a JSON file, or {} when the
    path is not a regular file.

    (The previous `isfile(...) and exists(...)` check was redundant:
    `os.path.isfile` already implies existence.)

    :param file_path: Path to the JSON file.
    :return: Flat dict of dot-notation keys, or {} if unreadable.
    """
    if os.path.isfile(file_path):
        return parse_json_file(file_path)
    return {}
def check_for_differences(reference_file, file_list, branch, actor):
    """
    Compare translation files against the reference and print a Markdown
    report intended for a PR comment.

    Three checks per file: (1) key count matches the reference,
    (2) key sets are identical, (3) no duplicate keys inside the file.

    :param reference_file: Path to the reference JSON file.
    :param file_list: Files to check (a single space-separated string is
        also accepted).
    :param branch: Root directory the file paths are relative to.
    :param actor: GitHub actor to mention in the report.
    :raises ValueError: If a file lies outside the locales directory or
        exceeds MAX_FILE_SIZE.
    """
    reference_branch = branch
    basename_reference_file = os.path.basename(reference_file)
    report = []
    report.append(f"#### 🔄 Reference Branch: `{reference_branch}`")
    reference_keys = read_json_keys(reference_file)
    has_differences = False
    only_reference_file = True
    file_arr = file_list
    if len(file_list) == 1:
        # A single entry may actually be a space-separated list of paths.
        file_arr = file_list[0].split()
    base_dir = os.path.abspath(
        os.path.join(os.getcwd(), "frontend", "public", "locales")
    )
    for file_path in file_arr:
        file_normpath = os.path.normpath(file_path)
        absolute_path = os.path.abspath(file_normpath)
        # Verify that file is within the expected directory
        if not absolute_path.startswith(base_dir):
            raise ValueError(f"Unsafe file found: {file_normpath}")
        # Verify file size before processing
        if os.path.getsize(os.path.join(branch, file_normpath)) > MAX_FILE_SIZE:
            raise ValueError(
                f"The file {file_normpath} is too large and could pose a security risk."
            )
        basename_current_file = os.path.basename(os.path.join(branch, file_normpath))
        locale_dir = os.path.basename(os.path.dirname(file_normpath))
        # The reference locale itself needs no comparison.
        if (
            basename_current_file == basename_reference_file
            and locale_dir == "en-GB"
        ):
            continue
        if not file_normpath.endswith(".json") or basename_current_file != "translation.json":
            continue
        only_reference_file = False
        report.append(f"#### 📃 **File Check:** `{locale_dir}/{basename_current_file}`")
        current_keys = read_json_keys(os.path.join(branch, file_path))
        # --- Check 1: key counts ---
        reference_key_count = len(reference_keys)
        current_key_count = len(current_keys)
        if reference_key_count != current_key_count:
            report.append("")
            report.append("1. **Test Status:** ❌ **_Failed_**")
            report.append(" - **Issue:**")
            has_differences = True
            if reference_key_count > current_key_count:
                report.append(
                    f" - **_Mismatched key count_**: {reference_key_count} (reference) vs {current_key_count} (current). Translation keys are missing."
                )
            elif reference_key_count < current_key_count:
                report.append(
                    f" - **_Too many keys_**: {reference_key_count} (reference) vs {current_key_count} (current). Please verify if there are additional keys that need to be removed."
                )
        else:
            report.append("1. **Test Status:** ✅ **_Passed_**")
        # --- Check 2: key sets ---
        # Keys present locally but absent from the reference are "extra";
        # keys in the reference but absent locally are "missing". (The
        # previous version had these variable names swapped, though the
        # printed text was already correct.)
        current_keys_set = set(current_keys.keys())
        reference_keys_set = set(reference_keys.keys())
        extra_keys_list = list(current_keys_set.difference(reference_keys_set))
        missing_keys_list = list(reference_keys_set.difference(current_keys_set))
        if extra_keys_list or missing_keys_list:
            has_differences = True
            extra_keys_str = "`, `".join(extra_keys_list)
            missing_keys_str = "`, `".join(missing_keys_list)
            report.append("2. **Test Status:** ❌ **_Failed_**")
            report.append(" - **Issue:**")
            if extra_keys_list:
                report.append(
                    f" - **_Extra keys in `{locale_dir}/{basename_current_file}`_**: `{extra_keys_str}` that are not present in **_`{basename_reference_file}`_**."
                )
            if missing_keys_list:
                report.append(
                    f" - **_Missing keys in `{locale_dir}/{basename_current_file}`_**: `{missing_keys_str}` that are not present in **_`{basename_reference_file}`_**."
                )
        else:
            report.append("2. **Test Status:** ✅ **_Passed_**")
        # --- Check 3: duplicate keys ---
        # Parse once and reuse the result (the file used to be parsed twice).
        duplicates = find_duplicate_keys(os.path.join(branch, file_normpath))
        if duplicates:
            has_differences = True
            output = "\n".join(
                [
                    f" - `{key}`: first at {first}, duplicate at `{duplicate}`"
                    for key, first, duplicate in duplicates
                ]
            )
            report.append("3. **Test Status:** ❌ **_Failed_**")
            report.append(" - **Issue:**")
            report.append(" - duplicate entries were found:")
            report.append(output)
        else:
            report.append("3. **Test Status:** ✅ **_Passed_**")
        report.append("")
        report.append("---")
        report.append("")
    if has_differences:
        report.append("## ❌ Overall Check Status: **_Failed_**")
        report.append("")
        report.append(
            f"@{actor} please check your translation if it conforms to the standard. Follow the format of [en-GB/translation.json](https://github.com/Stirling-Tools/Stirling-PDF/blob/V2/frontend/public/locales/en-GB/translation.json)"
        )
    else:
        report.append("## ✅ Overall Check Status: **_Success_**")
        report.append("")
        report.append(
            f"Thanks @{actor} for your help in keeping the translations up to date."
        )
    # Stay silent when only the reference file was supplied.
    if not only_reference_file:
        print("\n".join(report))
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Find missing keys")
    parser.add_argument(
        "--actor",
        required=False,
        help="Actor from PR.",
    )
    parser.add_argument(
        "--reference-file",
        required=True,
        help="Path to the reference file.",
    )
    parser.add_argument(
        "--branch",
        type=str,
        required=True,
        help="Branch name.",
    )
    parser.add_argument(
        "--check-file",
        type=str,
        required=False,
        help="List of changed files, separated by spaces.",
    )
    parser.add_argument(
        "--files",
        nargs="+",
        required=False,
        help="List of changed files, separated by spaces.",
    )
    args = parser.parse_args()

    # Sanitize --actor input to avoid injection attacks.
    # BUGFIX: the old pattern r"[^a-zA-Z0-9_\\-]" had an escaped backslash
    # inside a raw string, so literal backslashes survived "sanitization".
    if args.actor:
        args.actor = re.sub(r"[^a-zA-Z0-9_-]", "", args.actor)
    # Sanitize --branch input to avoid injection attacks (same fix).
    if args.branch:
        args.branch = re.sub(r"[^a-zA-Z0-9-]", "", args.branch)

    file_list = args.files
    if file_list is None:
        if args.check_file:
            file_list = [args.check_file]
        else:
            # No explicit files: sync every locale against the reference.
            file_list = glob.glob(
                os.path.join(
                    os.getcwd(),
                    "frontend",
                    "public",
                    "locales",
                    "*",
                    "translation.json",
                )
            )
        update_missing_keys(args.reference_file, file_list)
    else:
        check_for_differences(args.reference_file, file_list, args.branch, args.actor)

View File

@ -12,7 +12,7 @@ It does two things:
Also prints a CI-friendly report (intended for PR comments).
Usage:
python sync_translations.py --reference-file <path_to_reference_json> [--branch <branch_root>] [--actor <actor_name>] [--files <list_of_target_jsons>] [--check] [--prune] [--dry-run]
python sync_translations.py --reference-file <path_to_reference_json> [--branch <branch_root>] [--actor <actor_name>] [--files <list_of_target_jsons>] [--check] [--prune] [--procent-translations] [--dry-run]
"""
from __future__ import annotations

View File

@ -54,7 +54,7 @@ jobs:
- name: Sync translation JSON files
run: |
python .github/scripts/check_language_json.py --reference-file "frontend/public/locales/en-GB/translation.json" --branch V2
python .github/scripts/sync_translations.py --reference-file "frontend/public/locales/en-GB/translation.json" --branch V2
- name: Commit translation files
run: |

View File

@ -1,204 +1,119 @@
"""A script to update language progress status in README.md based on
JSON translation file comparison.
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
This script compares the default translation JSON file with others in the locales directory to
determine language progress.
It then updates README.md based on provided progress list.
"""
A tiny helper that updates README.md translation progress by asking
.sync_translations.py for the per-locale percentage (via --procent-translations).
Author: Ludy87
"""
Example:
To use this script, simply run it from command line:
$ python counter_translation_v2.py
""" # noqa: D205
from __future__ import annotations
import glob
import os
import re
import json
import tomlkit
import tomlkit.toml_file
import subprocess
from pathlib import Path
from typing import List, Tuple
def convert_to_multiline(data: tomlkit.TOMLDocument) -> tomlkit.TOMLDocument:
"""Converts 'ignore' and 'missing' arrays to multiline arrays and sorts the first-level keys of the TOML document.
Enhances readability and consistency in the TOML file by ensuring arrays contain unique and sorted entries.
Parameters:
data (tomlkit.TOMLDocument): The original TOML document containing the data.
Returns:
tomlkit.TOMLDocument: A new TOML document with sorted keys and properly formatted arrays.
""" # noqa: D205
sorted_data = tomlkit.document()
for key in sorted(data.keys()):
value = data[key]
if isinstance(value, dict):
new_table = tomlkit.table()
for subkey in ("ignore", "missing"):
if subkey in value:
# Convert the list to a set to remove duplicates, sort it, and convert to multiline for readability
unique_sorted_array = sorted(set(value[subkey]))
array = tomlkit.array()
array.multiline(True)
for item in unique_sorted_array:
array.append(item)
new_table[subkey] = array
sorted_data[key] = new_table
else:
# Add other types of data unchanged
sorted_data[key] = value
return sorted_data
REPO_ROOT = Path(os.getcwd())
LOCALES_DIR = REPO_ROOT / "frontend" / "public" / "locales"
REF_FILE = LOCALES_DIR / "en-GB" / "translation.json"
SYNC_SCRIPT = REPO_ROOT / ".github" / "scripts" / "sync_translations.py"
README = REPO_ROOT / "README.md"
def write_readme(progress_list: list[tuple[str, int]]) -> None:
"""Updates the progress status in the README.md file based
on the provided progress list.
Parameters:
progress_list (list[tuple[str, int]]): A list of tuples containing
language and progress percentage.
Returns:
None
""" # noqa: D205
with open("README.md", encoding="utf-8") as file:
content = file.readlines()
for i, line in enumerate(content[2:], start=2):
for progress in progress_list:
language, value = progress
if language in line:
if match := re.search(r"\!\[(\d+(\.\d+)?)%\]\(.*\)", line):
content[i] = line.replace(
match.group(0),
f"![{value}%](https://geps.dev/progress/{value})",
)
with open("README.md", "w", encoding="utf-8", newline="\n") as file:
file.writelines(content)
def find_locale_files() -> List[Path]:
return sorted(
Path(p) for p in glob.glob(str(LOCALES_DIR / "*" / "translation.json"))
)
def parse_json_file(file_path):
def percent_done_for_file(file_path: Path) -> int:
"""
Parses a JSON translation file and returns a flat dictionary of all keys.
:param file_path: Path to the JSON file.
:return: Dictionary with flattened keys and values.
Calls sync_translations.py --procent-translations for a single locale file.
Returns an int 0..100.
"""
with open(file_path, "r", encoding="utf-8") as file:
data = json.load(file)
# en-GB / en-US are always 100% by definition
norm = str(file_path).replace("\\", "/")
if norm.endswith("en-GB/translation.json") or norm.endswith(
"en-US/translation.json"
):
return 100
def flatten_dict(d, parent_key="", sep="."):
items = {}
for k, v in d.items():
new_key = f"{parent_key}{sep}{k}" if parent_key else k
if isinstance(v, dict):
items.update(flatten_dict(v, new_key, sep=sep))
else:
items[new_key] = v
return items
return flatten_dict(data)
cmd = [
"python",
str(SYNC_SCRIPT),
"--reference-file",
str(REF_FILE),
"--files",
str(file_path),
"--check",
"--procent-translations",
]
res = subprocess.run(cmd, capture_output=True, text=True, check=True)
out = res.stdout.strip()
return int(float(out))
def compare_files(
default_file_path, file_paths, ignore_translation_file
) -> list[tuple[str, int]]:
"""Compares the default JSON translation file with other
translation files in the locales directory.
def update_readme(progress_list: List[Tuple[str, int]]) -> None:
"""
Update README badges. Expects lines like:
... [xx%](https://geps.dev/progress/xx)
and replaces xx with the new percent.
"""
if not README.exists():
print("README.md not found — skipping write.")
return
Parameters:
default_file_path (str): The path to the default translation JSON file.
file_paths (list): List of paths to translation JSON files.
ignore_translation_file (str): Path to the TOML file with ignore rules.
content = README.read_text(encoding="utf-8").splitlines(keepends=True)
Returns:
list[tuple[str, int]]: A list of tuples containing
language and progress percentage.
""" # noqa: D205
default_keys = parse_json_file(default_file_path)
num_keys = len(default_keys)
# we start at line 2 like your original (skip title, etc.)
for i in range(2, len(content)):
line = content[i]
for lang, value in progress_list:
if lang in line:
content[i] = re.sub(
r"!\[(\d+(?:\.\d+)?)%\]\(https://geps\.dev/progress/\d+\)",
f"![{value}%](https://geps.dev/progress/{value})",
line,
)
break
result_list = []
sort_ignore_translation: tomlkit.TOMLDocument
README.write_text("".join(content), encoding="utf-8", newline="\n")
# read toml
with open(ignore_translation_file, encoding="utf-8") as f:
sort_ignore_translation = tomlkit.parse(f.read())
for file_path in file_paths:
# Extract language code from directory name
locale_dir = os.path.basename(os.path.dirname(file_path))
def main() -> None:
files = find_locale_files()
if not files:
print("No translation.json files found.")
return
# Convert locale format from hyphen to underscore for TOML compatibility
# e.g., en-GB -> en_GB, sr-LATN-RS -> sr_LATN_RS
language = locale_dir.replace("-", "_")
results: List[Tuple[str, int]] = []
for f in files:
# language label from folder, e.g. de-DE, sr-LATN-RS
lang = f.parent.name.replace(
"-", "_"
) # keep hyphenated form to match README lines
pct = percent_done_for_file(f)
results.append((lang, pct))
fails = 0
if language in ["en_GB", "en_US"]:
result_list.append(("en_GB", 100))
result_list.append(("en_US", 100))
continue
# ensure en-GB/en-US are included & set to 100
have = {lang for lang, _ in results}
for hard in ("en-GB", "en-US"):
if hard not in have:
results.append((hard, 100))
if language not in sort_ignore_translation:
sort_ignore_translation[language] = tomlkit.table()
# optional: sort by percent desc (nice to have)
results.sort(key=lambda x: x[1], reverse=True)
if (
"ignore" not in sort_ignore_translation[language]
or len(sort_ignore_translation[language].get("ignore", [])) < 1
):
sort_ignore_translation[language]["ignore"] = tomlkit.array(
["language.direction"]
)
update_readme(results)
current_keys = parse_json_file(file_path)
# Compare keys
for default_key, default_value in default_keys.items():
if default_key not in current_keys:
# Key is missing entirely
if default_key not in sort_ignore_translation[language]["ignore"]:
print(f"{language}: Key '{default_key}' is missing.")
fails += 1
elif (
default_value == current_keys[default_key]
and default_key not in sort_ignore_translation[language]["ignore"]
):
# Key exists but value is untranslated (same as reference)
print(f"{language}: Key '{default_key}' is missing the translation.")
fails += 1
elif default_value != current_keys[default_key]:
# Key is translated, remove from ignore list if present
if default_key in sort_ignore_translation[language]["ignore"]:
sort_ignore_translation[language]["ignore"].remove(default_key)
print(f"{language}: {fails} out of {num_keys} keys are not translated.")
result_list.append(
(
language,
int((num_keys - fails) * 100 / num_keys),
)
)
ignore_translation = convert_to_multiline(sort_ignore_translation)
with open(ignore_translation_file, "w", encoding="utf-8", newline="\n") as file:
file.write(tomlkit.dumps(ignore_translation))
unique_data = list(set(result_list))
unique_data.sort(key=lambda x: x[1], reverse=True)
return unique_data
# also print a compact summary to stdout (useful in CI logs)
# for lang, pct in results:
# print(f"{lang}: {pct}%")
if __name__ == "__main__":
directory = os.path.join(os.getcwd(), "frontend", "public", "locales")
translation_file_paths = glob.glob(os.path.join(directory, "*", "translation.json"))
reference_file = os.path.join(directory, "en-GB", "translation.json")
scripts_directory = os.path.join(os.getcwd(), "scripts")
translation_state_file = os.path.join(scripts_directory, "ignore_translation.toml")
write_readme(
compare_files(reference_file, translation_file_paths, translation_state_file)
)
main()