Merge remote-tracking branch 'origin/V2' into mainToV2
scripts/convert_properties_to_json.py (new file, 113 lines)
@@ -0,0 +1,113 @@
#!/usr/bin/env python3
"""
Convert Java .properties files to JSON for react-i18next
Preserves hierarchical structure and handles special cases
"""

import os
import json
import re
from pathlib import Path


def properties_to_dict(file_path):
    """Convert .properties file to nested dictionary"""
    result = {}

    with open(file_path, 'r', encoding='utf-8') as f:
        for line_num, line in enumerate(f, 1):
            line = line.strip()

            # Skip empty lines and comments
            if not line or line.startswith('#'):
                continue

            # Handle key=value pairs
            if '=' in line:
                key, value = line.split('=', 1)
                key = key.strip()
                value = value.strip()

                # Handle multiline values (ending with \)
                while value.endswith('\\'):
                    next_line = next(f, '').strip()
                    value = value[:-1] + next_line

                # Create nested structure from dot notation
                set_nested_value(result, key, value)

    return result


def set_nested_value(dictionary, key_path, value):
    """Set value in nested dictionary using dot notation"""
    keys = key_path.split('.')
    current = dictionary

    for key in keys[:-1]:
        if key not in current:
            current[key] = {}
        elif not isinstance(current[key], dict):
            # Convert existing string value to nested object
            old_value = current[key]
            current[key] = {"_value": old_value}
        current = current[key]

    final_key = keys[-1]
    if final_key in current and isinstance(current[final_key], dict):
        # If the final key already exists as an object, store the value under "_value"
        current[final_key]["_value"] = value
    else:
        current[final_key] = value


def convert_all_properties():
    """Convert all messages_*.properties files to JSON"""

    # Get project root
    script_dir = Path(__file__).parent
    project_root = script_dir.parent
    resources_dir = project_root / 'src' / 'main' / 'resources'
    output_dir = project_root / 'frontend' / 'public' / 'locales'

    # Create output directory
    output_dir.mkdir(parents=True, exist_ok=True)

    # Find all .properties files
    properties_files = list(resources_dir.glob('messages*.properties'))

    converted_count = 0

    for props_file in properties_files:
        # Extract locale from filename
        filename = props_file.name
        if filename == 'messages.properties':
            locale = 'en'  # Default locale
        else:
            # Extract locale from messages_en_US.properties format
            locale_match = re.match(r'messages_(.+)\.properties', filename)
            if locale_match:
                locale = locale_match.group(1)
                # Convert Java locale format to standard (en_US -> en-US)
                locale = locale.replace('_', '-')
            else:
                continue

        print(f"Converting {filename} -> {locale}.json")

        # Convert to dictionary
        data = properties_to_dict(props_file)

        # Create locale directory
        locale_dir = output_dir / locale
        locale_dir.mkdir(exist_ok=True)

        # Write translation.json (react-i18next default namespace)
        output_file = locale_dir / 'translation.json'
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=2, ensure_ascii=False)

        converted_count += 1

    print(f"\nConverted {converted_count} language files to {output_dir}")
    print("Languages available:", [d.name for d in output_dir.iterdir() if d.is_dir()])


if __name__ == '__main__':
    convert_all_properties()
scripts/counter_translation_v2.py (new file, 204 lines)
@@ -0,0 +1,204 @@
"""A script to update language progress status in README.md based on
JSON translation file comparison.

This script compares the default translation JSON file with others in the locales directory to
determine language progress.
It then updates README.md based on the provided progress list.

Author: Ludy87

Example:
    To use this script, simply run it from command line:
        $ python counter_translation_v2.py
"""  # noqa: D205

import glob
import os
import re
import json

import tomlkit
import tomlkit.toml_file


def convert_to_multiline(data: tomlkit.TOMLDocument) -> tomlkit.TOMLDocument:
    """Converts 'ignore' and 'missing' arrays to multiline arrays and sorts the first-level keys of the TOML document.
    Enhances readability and consistency in the TOML file by ensuring arrays contain unique and sorted entries.

    Parameters:
        data (tomlkit.TOMLDocument): The original TOML document containing the data.

    Returns:
        tomlkit.TOMLDocument: A new TOML document with sorted keys and properly formatted arrays.
    """  # noqa: D205
    sorted_data = tomlkit.document()
    for key in sorted(data.keys()):
        value = data[key]
        if isinstance(value, dict):
            new_table = tomlkit.table()
            for subkey in ("ignore", "missing"):
                if subkey in value:
                    # Convert the list to a set to remove duplicates, sort it, and convert to multiline for readability
                    unique_sorted_array = sorted(set(value[subkey]))
                    array = tomlkit.array()
                    array.multiline(True)
                    for item in unique_sorted_array:
                        array.append(item)
                    new_table[subkey] = array
            sorted_data[key] = new_table
        else:
            # Add other types of data unchanged
            sorted_data[key] = value
    return sorted_data


def write_readme(progress_list: list[tuple[str, int]]) -> None:
    """Updates the progress status in the README.md file based
    on the provided progress list.

    Parameters:
        progress_list (list[tuple[str, int]]): A list of tuples containing
            language and progress percentage.

    Returns:
        None
    """  # noqa: D205
    with open("README.md", encoding="utf-8") as file:
        content = file.readlines()

    for i, line in enumerate(content[2:], start=2):
        for progress in progress_list:
            language, value = progress
            if language in line:
                if match := re.search(r"\!\[(\d+(\.\d+)?)%\]\(.*\)", line):
                    content[i] = line.replace(
                        match.group(0),
                        # Badge markdown reconstructed; upstream README uses geps.dev progress badges
                        f"![{value}%](https://geps.dev/progress/{value})",
                    )

    with open("README.md", "w", encoding="utf-8", newline="\n") as file:
        file.writelines(content)


def parse_json_file(file_path):
    """
    Parses a JSON translation file and returns a flat dictionary of all keys.
    :param file_path: Path to the JSON file.
    :return: Dictionary with flattened keys and values.
    """
    with open(file_path, "r", encoding="utf-8") as file:
        data = json.load(file)

    def flatten_dict(d, parent_key="", sep="."):
        items = {}
        for k, v in d.items():
            new_key = f"{parent_key}{sep}{k}" if parent_key else k
            if isinstance(v, dict):
                items.update(flatten_dict(v, new_key, sep=sep))
            else:
                items[new_key] = v
        return items

    return flatten_dict(data)


def compare_files(
    default_file_path, file_paths, ignore_translation_file
) -> list[tuple[str, int]]:
    """Compares the default JSON translation file with other
    translation files in the locales directory.

    Parameters:
        default_file_path (str): The path to the default translation JSON file.
        file_paths (list): List of paths to translation JSON files.
        ignore_translation_file (str): Path to the TOML file with ignore rules.

    Returns:
        list[tuple[str, int]]: A list of tuples containing
            language and progress percentage.
    """  # noqa: D205
    default_keys = parse_json_file(default_file_path)
    num_keys = len(default_keys)

    result_list = []
    sort_ignore_translation: tomlkit.TOMLDocument

    # read toml
    with open(ignore_translation_file, encoding="utf-8") as f:
        sort_ignore_translation = tomlkit.parse(f.read())

    for file_path in file_paths:
        # Extract language code from directory name
        locale_dir = os.path.basename(os.path.dirname(file_path))

        # Convert locale format from hyphen to underscore for TOML compatibility
        # e.g., en-GB -> en_GB, sr-LATN-RS -> sr_LATN_RS
        language = locale_dir.replace("-", "_")

        fails = 0
        if language in ["en_GB", "en_US"]:
            result_list.append(("en_GB", 100))
            result_list.append(("en_US", 100))
            continue

        if language not in sort_ignore_translation:
            sort_ignore_translation[language] = tomlkit.table()

        if (
            "ignore" not in sort_ignore_translation[language]
            or len(sort_ignore_translation[language].get("ignore", [])) < 1
        ):
            sort_ignore_translation[language]["ignore"] = tomlkit.array(
                ["language.direction"]
            )

        current_keys = parse_json_file(file_path)

        # Compare keys
        for default_key, default_value in default_keys.items():
            if default_key not in current_keys:
                # Key is missing entirely
                if default_key not in sort_ignore_translation[language]["ignore"]:
                    print(f"{language}: Key '{default_key}' is missing.")
                    fails += 1
            elif (
                default_value == current_keys[default_key]
                and default_key not in sort_ignore_translation[language]["ignore"]
            ):
                # Key exists but value is untranslated (same as reference)
                print(f"{language}: Key '{default_key}' is missing the translation.")
                fails += 1
            elif default_value != current_keys[default_key]:
                # Key is translated, remove from ignore list if present
                if default_key in sort_ignore_translation[language]["ignore"]:
                    sort_ignore_translation[language]["ignore"].remove(default_key)

        print(f"{language}: {fails} out of {num_keys} keys are not translated.")
        result_list.append(
            (
                language,
                int((num_keys - fails) * 100 / num_keys),
            )
        )

    ignore_translation = convert_to_multiline(sort_ignore_translation)
    with open(ignore_translation_file, "w", encoding="utf-8", newline="\n") as file:
        file.write(tomlkit.dumps(ignore_translation))

    unique_data = list(set(result_list))
    unique_data.sort(key=lambda x: x[1], reverse=True)

    return unique_data


if __name__ == "__main__":
    directory = os.path.join(os.getcwd(), "frontend", "public", "locales")
    translation_file_paths = glob.glob(os.path.join(directory, "*", "translation.json"))
    reference_file = os.path.join(directory, "en-GB", "translation.json")

    scripts_directory = os.path.join(os.getcwd(), "scripts")
    translation_state_file = os.path.join(scripts_directory, "ignore_translation.toml")

    write_readme(
        compare_files(reference_file, translation_file_paths, translation_state_file)
    )
scripts/ignore_translation.toml (changed)
@@ -3,7 +3,6 @@ ignore = [
    'lang.div',
    'lang.dzo',
    'lang.que',
    'language.direction',
]

[az_AZ]
@@ -193,8 +192,6 @@ ignore = [
    'AddStampRequest.alphabet',
    'AddStampRequest.position',
    'PDFToBook.selectText.1',
    'PDFToText.tags',
    'addPageNumbers.selectText.3',
    'adminUserSettings.team',
    'alphabet',
    'audit.dashboard.modal.id',
@@ -204,7 +201,6 @@ ignore = [
    'audit.dashboard.table.details',
    'audit.dashboard.table.id',
    'certSign.name',
    'cookieBanner.popUp.acceptAllBtn',
    'endpointStatistics.top10',
    'endpointStatistics.top20',
    'fileChooser.dragAndDrop',
@@ -313,9 +309,7 @@ ignore = [
]

[fa_IR]
ignore = [
    'language.direction',
]
ignore = []

[fr_FR]
ignore = [
@@ -323,7 +317,6 @@ ignore = [
    'AddStampRequest.position',
    'AddStampRequest.rotation',
    'PDFToBook.selectText.1',
    'addPageNumbers.selectText.3',
    'adminUserSettings.actions',
    'alphabet',
    'compare.document.1',
@@ -526,6 +519,11 @@ ignore = [
    'language.direction',
]

[ml_ML]
ignore = [
    'language.direction',
]

[nl_NL]
ignore = [
    'compare.document.1',
@@ -770,7 +768,6 @@ ignore = [
[sk_SK]
ignore = [
    'adminUserSettings.admin',
    'home.multiTool.title',
    'info',
    'lang.ceb',
    'lang.chr',
@@ -974,11 +971,15 @@ ignore = [
    'lang.yid',
    'lang.yor',
    'language.direction',
    'pipeline.title',
    'pipelineOptions.pipelineHeader',
    'showJS.tags',
]

[zh_BO]
ignore = [
    'language.direction',
]

[zh_CN]
ignore = [
    'language.direction',
@@ -19,9 +19,8 @@ if [[ "$INSTALL_BOOK_AND_ADVANCED_HTML_OPS" == "true" && "$FAT_DOCKER" != "true"
    #apk add --no-cache calibre@testing
fi

if [[ "$FAT_DOCKER" != "true" ]]; then
    /scripts/download-security-jar.sh
fi
# Security jar is now built into the application jar during Docker build
# No need to download it separately

if [[ -n "$LANGS" ]]; then
    /scripts/installFonts.sh $LANGS
scripts/translations/README.md (new file, 579 lines)
@@ -0,0 +1,579 @@
# Translation Management Scripts

This directory contains Python scripts for managing frontend translations in Stirling PDF. These tools help analyze, merge, validate, and manage translations against the en-GB golden truth file.

## Scripts Overview

### 0. Validation Scripts (Run First!)

#### `json_validator.py`
Validates JSON syntax in translation files with detailed error reporting.

**Usage:**
```bash
# Validate single file
python scripts/translations/json_validator.py ar_AR_batch_1_of_3.json

# Validate all batches for a language
python scripts/translations/json_validator.py --all-batches ar_AR

# Validate pattern with wildcards
python scripts/translations/json_validator.py "ar_AR_batch_*.json"

# Brief output (no context)
python scripts/translations/json_validator.py --all-batches ar_AR --brief

# Only show files with errors
python scripts/translations/json_validator.py --all-batches ar_AR --quiet
```

**Features:**
- Validates JSON syntax with detailed error messages
- Shows exact line, column, and character position of errors (see the sketch below)
- Displays context around errors for easy fixing
- Suggests common fixes based on error type
- Detects unescaped quotes and backslashes
- Reports entry counts for valid files
- Exit code 1 if any files invalid (good for CI/CD)

**Common Issues Detected:**
- Unescaped quotes inside strings: `"text with "quotes""` → `"text with \"quotes\""`
- Invalid backslash escapes: `\d{4}` → `\\d{4}`
- Missing commas between entries
- Trailing commas before closing braces
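The position details come straight from Python's built-in `json.JSONDecodeError`. A minimal sketch of this style of check (illustrative only, not the validator's actual code, which also prints context and suggested fixes):

```python
import json
import sys


def check_json(path: str) -> bool:
    """Report JSON syntax errors with line, column, and character position."""
    try:
        with open(path, encoding="utf-8") as f:
            data = json.load(f)
    except json.JSONDecodeError as e:
        # JSONDecodeError carries lineno, colno, and the absolute char offset (pos)
        print(f"{path}: {e.msg} at line {e.lineno}, column {e.colno} (char {e.pos})")
        return False
    print(f"{path}: valid JSON, {len(data)} top-level entries")
    return True


if __name__ == "__main__":
    sys.exit(0 if check_json(sys.argv[1]) else 1)
```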
#### `validate_placeholders.py`
Validates that translation files have correct placeholders matching en-GB (source of truth).

**Usage:**
```bash
# Validate all languages
python scripts/translations/validate_placeholders.py

# Validate specific language
python scripts/translations/validate_placeholders.py --language es-ES

# Show detailed text samples
python scripts/translations/validate_placeholders.py --verbose

# Output as JSON
python scripts/translations/validate_placeholders.py --json
```

**Features:**
- Detects missing placeholders (e.g., {n}, {total}, {filename}); see the sketch below
- Detects extra placeholders not in en-GB
- Shows exact keys and text where issues occur
- Exit code 1 if issues found (good for CI/CD)
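Placeholder checking boils down to extracting the `{...}` tokens from the en-GB string and the translated string, then diffing the two sets; `ai_translation_helper.py` uses the same `re.findall(r'\{[^}]+\}', ...)` pattern internally. A minimal sketch (the function name is illustrative):

```python
import re


def placeholder_issues(original: str, translated: str) -> tuple[set, set]:
    """Return (missing, extra) placeholders in a translated string."""
    source = set(re.findall(r"\{[^}]+\}", original))
    target = set(re.findall(r"\{[^}]+\}", translated))
    return source - target, target - source


missing, extra = placeholder_issues("Page {n} of {total}", "Seite {n} von {gesamt}")
print(missing)  # {'{total}'}
print(extra)    # {'{gesamt}'}
```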
#### `validate_json_structure.py`
Validates JSON structure and key consistency with en-GB.

**Usage:**
```bash
# Validate all languages
python scripts/translations/validate_json_structure.py

# Validate specific language
python scripts/translations/validate_json_structure.py --language de-DE

# Show all missing/extra keys
python scripts/translations/validate_json_structure.py --verbose

# Output as JSON
python scripts/translations/validate_json_structure.py --json
```

**Features:**
- Validates JSON syntax
- Detects missing keys (not translated yet)
- Detects extra keys (not in en-GB, should be removed)
- Reports key counts and structure differences (see the sketch below)
- Exit code 1 if issues found (good for CI/CD)
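Key comparison happens on the flattened dot-notation keys: flatten both files, then take the set difference in each direction. A sketch of the idea (illustrative, not the script's actual code):

```python
def flatten_dict(d: dict, parent: str = "") -> dict:
    """Flatten nested JSON into dot-notation keys."""
    items = {}
    for k, v in d.items():
        key = f"{parent}.{k}" if parent else k
        if isinstance(v, dict):
            items.update(flatten_dict(v, key))
        else:
            items[key] = v
    return items


def key_differences(golden: dict, target: dict) -> tuple[set, set]:
    """Return (missing, extra) keys in a target language versus en-GB."""
    golden_keys = set(flatten_dict(golden))
    target_keys = set(flatten_dict(target))
    return golden_keys - target_keys, target_keys - golden_keys
```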
### 1. `translation_analyzer.py`
Analyzes translation files to find missing translations and untranslated entries, and provides completion statistics.

**Usage:**
```bash
# Analyze all languages
python scripts/translations/translation_analyzer.py

# Analyze specific language
python scripts/translations/translation_analyzer.py --language fr-FR

# Show only missing translations
python scripts/translations/translation_analyzer.py --missing-only

# Show only untranslated entries
python scripts/translations/translation_analyzer.py --untranslated-only

# Show summary only
python scripts/translations/translation_analyzer.py --summary

# JSON output format
python scripts/translations/translation_analyzer.py --format json
```

**Features:**
- Finds missing translation keys
- Identifies untranslated entries (identical to en-GB or marked [UNTRANSLATED])
- Shows accurate completion percentages using ignore patterns
- Identifies extra keys not in en-GB
- Supports JSON and text output formats
- Uses `scripts/ignore_translation.toml` for language-specific exclusions

### 2. `translation_merger.py`
Merges missing translations from en-GB into target language files and manages translation workflows.

**Usage:**
```bash
# Add missing translations from en-GB to French
python scripts/translations/translation_merger.py fr-FR add-missing

# Add without marking as [UNTRANSLATED]
python scripts/translations/translation_merger.py fr-FR add-missing --no-mark-untranslated

# Extract untranslated entries to a file
python scripts/translations/translation_merger.py fr-FR extract-untranslated --output fr_untranslated.json

# Create a template for AI translation
python scripts/translations/translation_merger.py fr-FR create-template --output fr_template.json

# Apply translations from a file
python scripts/translations/translation_merger.py fr-FR apply-translations --translations-file fr_translated.json
```

**Features:**
- Adds missing keys from en-GB with optional [UNTRANSLATED] markers (sketched below)
- Extracts untranslated entries for external translation
- Creates structured templates for AI translation
- Applies translated content back to language files
- Automatic backup creation
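The add-missing idea, sketched on flattened dicts (illustrative; the real script operates on the nested files and writes a timestamped backup first, and the exact marker format is an assumption here):

```python
def add_missing(golden_flat: dict, target_flat: dict, mark: bool = True) -> dict:
    """Copy en-GB values for keys the target lacks, optionally marked as untranslated."""
    merged = dict(target_flat)
    for key, value in golden_flat.items():
        if key not in merged:
            # Marker format assumed; the analyzer only checks startswith("[UNTRANSLATED]")
            merged[key] = f"[UNTRANSLATED] {value}" if mark else value
    return merged
```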
### 3. `ai_translation_helper.py`
Specialized tool for AI-assisted translation workflows with batch processing and validation.

**Usage:**
```bash
# Create batch file for AI translation (multiple languages)
python scripts/translations/ai_translation_helper.py create-batch --languages fr-FR de-DE es-ES --output batch.json --max-entries 50

# Validate AI translations
python scripts/translations/ai_translation_helper.py validate batch.json

# Apply validated AI translations
python scripts/translations/ai_translation_helper.py apply-batch batch.json

# Export for external translation services
python scripts/translations/ai_translation_helper.py export --languages fr-FR de-DE --format csv
```

**Features:**
- Creates batch files for AI translation of multiple languages
- Prioritizes important translation keys
- Validates translations for placeholders and artifacts
- Applies batch translations with validation
- Exports to CSV/JSON for external translation services

### 4. `compact_translator.py`
Extracts untranslated entries in minimal JSON format for character-limited AI services.

**Usage:**
```bash
# Extract all untranslated entries
python scripts/translations/compact_translator.py it-IT --output to_translate.json
```

**Features:**
- Produces minimal JSON output with no extra whitespace (see the sketch below)
- Automatic ignore patterns for cleaner output
- Batch size control for manageable chunks
- 50-80% fewer characters than other extraction methods
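The character savings come from emitting JSON with all optional whitespace dropped, which in Python is just a `separators` argument; roughly:

```python
import json

entries = {"key1": "English text", "key2": "Another text"}

compact = json.dumps(entries, separators=(",", ":"), ensure_ascii=False)
pretty = json.dumps(entries, indent=2, ensure_ascii=False)

print(compact)  # {"key1":"English text","key2":"Another text"}
print(len(compact) < len(pretty))  # True; the savings grow with entry count
```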
### 5. `json_beautifier.py`
Restructures and beautifies translation JSON files to match en-GB structure exactly.

**Usage:**
```bash
# Restructure single language to match en-GB structure
python scripts/translations/json_beautifier.py --language de-DE

# Restructure all languages
python scripts/translations/json_beautifier.py --all-languages

# Validate structure without modifying files
python scripts/translations/json_beautifier.py --language de-DE --validate-only

# Skip backup creation
python scripts/translations/json_beautifier.py --language de-DE --no-backup
```

**Features:**
- Restructures JSON to match en-GB nested structure exactly (sketched below)
- Preserves key ordering for line-by-line comparison
- Creates automatic backups before modification
- Validates structure and key ordering
- Handles flattened dot-notation keys (e.g., "key.subkey") properly
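Restructuring amounts to walking the en-GB tree and pulling each value out of the target file, so nesting and key order follow the reference (Python dicts preserve insertion order). A sketch under those assumptions; the real script additionally unflattens stray dot-notation keys and writes a backup:

```python
def restructure(golden: dict, target: dict) -> dict:
    """Rebuild `target` with en-GB's nesting and key order."""
    result = {}
    for key, value in golden.items():
        sub = target.get(key)
        if isinstance(value, dict):
            # Recurse with the matching subtree (or an empty one if missing/mistyped)
            result[key] = restructure(value, sub if isinstance(sub, dict) else {})
        elif key in target:
            result[key] = sub
    return result
```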
## Translation Workflows

### Method 1: Compact Translation Workflow (RECOMMENDED for AI)

**Best for character-limited AI services like Claude or ChatGPT**

#### Step 1: Check Current Status
```bash
python scripts/translations/translation_analyzer.py --language it-IT --summary
```

#### Step 2: Extract Untranslated Entries
```bash
# For small files (< 1200 entries)
python scripts/translations/compact_translator.py it-IT --output to_translate.json

# For large files, split into batches
python scripts/translations/compact_translator.py it-IT --output it_IT_batch --batch-size 400
# Creates: it_IT_batch_1_of_N.json, it_IT_batch_2_of_N.json, etc.
```

#### Step 2.5: Validate JSON (if using batches)
```bash
# After AI translates the batches, validate them before merging
python scripts/translations/json_validator.py --all-batches it_IT

# Fix any errors reported (common issues: unescaped quotes, backslashes)
```

**Output format**: Compact JSON with minimal whitespace
```json
{"key1":"English text","key2":"Another text","key3":"More text"}
```

#### Step 3: AI Translation
1. Copy the compact JSON output
2. Give it to your AI with instructions:
   ```
   Translate this JSON to Italian. Keep the same structure, translate only the values.
   Preserve placeholders like {n}, {total}, {filename}, {{variable}}.
   ```
3. Save the AI's response as `translated.json`

#### Step 4: Apply Translations
```bash
python scripts/translations/translation_merger.py it-IT apply-translations --translations-file translated.json
```

#### Step 5: Verify Results
```bash
python scripts/translations/translation_analyzer.py --language it-IT --summary
```

### Method 2: Batch Translation Workflow

**For complete language translation from scratch or major updates**

#### Step 1: Analyze Current State
```bash
python scripts/translations/translation_analyzer.py --language de-DE --summary
```

#### Step 2: Create Translation Batches
```bash
# Create batches of 100 entries each for systematic translation
python scripts/translations/ai_translation_helper.py create-batch --languages de-DE --output de_batch_1.json --max-entries 100
```

#### Step 3: Translate Batch with AI
Edit the batch file and fill in ALL `translated` fields:
- Preserve all placeholders like `{n}`, `{total}`, `{filename}`, `{{toolName}}`
- Keep technical terms consistent
- Maintain JSON structure exactly
- Consider context provided for each entry

#### Step 4: Apply Translations
```bash
# Skip validation if using legitimate placeholders ({{variable}})
python scripts/translations/ai_translation_helper.py apply-batch de_batch_1.json --skip-validation
```

#### Step 5: Check Progress and Continue
```bash
python scripts/translations/translation_analyzer.py --language de-DE --summary
```
Repeat steps 2-5 until 100% complete.

### Method 3: Quick Translation Workflow (Legacy)

**For small updates or existing translations**

#### Step 1: Add Missing Translations
```bash
python scripts/translations/translation_merger.py fr-FR add-missing --mark-untranslated
```

#### Step 2: Create AI Template
```bash
python scripts/translations/translation_merger.py fr-FR create-template --output fr_template.json
```

#### Step 3: Apply Translations
```bash
python scripts/translations/translation_merger.py fr-FR apply-translations --translations-file fr_translated.json
```

## Translation File Structure

Translation files are located in `frontend/public/locales/{language}/translation.json` with nested JSON structure:

```json
{
  "addPageNumbers": {
    "title": "Add Page Numbers",
    "selectText": {
      "1": "Select PDF file:",
      "2": "Margin Size"
    }
  }
}
```

Keys use dot notation internally (e.g., `addPageNumbers.selectText.1`).
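A dot-notation key is resolved by walking the nested objects one segment at a time; a tiny illustration:

```python
from functools import reduce

translation = {
    "addPageNumbers": {
        "title": "Add Page Numbers",
        "selectText": {"1": "Select PDF file:", "2": "Margin Size"},
    }
}


def get_by_dot_path(data: dict, path: str):
    """Resolve a dot-notation key like 'addPageNumbers.selectText.1'."""
    return reduce(lambda node, part: node[part], path.split("."), data)


print(get_by_dot_path(translation, "addPageNumbers.selectText.1"))  # Select PDF file:
```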

## Key Features

### Placeholder Preservation
All scripts preserve placeholders like `{n}`, `{total}`, `{filename}` in translations:
```
"customNumberDesc": "Defaults to {n}, also accepts 'Page {n} of {total}'"
```

### Automatic Backups
Scripts create timestamped backups before modifying files:
```
translation.backup.20241201_143022.json
```
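The backup name encodes a `%Y%m%d_%H%M%S` timestamp; creating one is a short helper along these lines (a sketch, not necessarily the scripts' exact code):

```python
import shutil
from datetime import datetime
from pathlib import Path


def backup(translation_file: Path) -> Path:
    """Copy translation.json to a timestamped sibling before modifying it."""
    stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    backup_path = translation_file.with_name(f"translation.backup.{stamp}.json")
    shutil.copy2(translation_file, backup_path)
    return backup_path
```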
### Context-Aware Translation
Scripts provide context information to help with accurate translations:
```json
{
  "addPageNumbers.title": {
    "original": "Add Page Numbers",
    "context": "Feature for adding page numbers to PDFs"
  }
}
```

### Priority-Based Translation
Important keys (title, submit, error messages) are prioritized when limiting translation batch sizes.

### Ignore Patterns System
The `scripts/ignore_translation.toml` file defines keys that should be ignored for each language, improving completion accuracy.

**Common ignore patterns:**
- `language.direction`: Text direction (ltr/rtl) - universal
- `lang.*`: Language code entries not relevant to specific locales
- `pipeline.title`, `home.devApi.title`: Technical terms kept in English
- Specific technical IDs, version numbers, and system identifiers

**Format:**
```toml
[de_DE]
ignore = [
    'language.direction',
    'pipeline.title',
    'lang.afr',
    'lang.ceb',
    # ... more patterns
]
```
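Reading the file is standard TOML loading; `compact_translator.py` does exactly this with `tomllib`, falling back to the `toml` package or a simple line parser on older Pythons. Note the underscore/hyphen difference between section names and locale directories:

```python
import tomllib  # Python 3.11+

with open("scripts/ignore_translation.toml", "rb") as f:
    ignore_data = tomllib.load(f)

# Sections use underscores (de_DE) while locale directories use hyphens (de-DE)
ignored = set(ignore_data.get("de_DE", {}).get("ignore", []))
print("language.direction" in ignored)
```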
## Best Practices & Lessons Learned

### Critical Rules for Translation

1. **NEVER skip entries**: Translate ALL entries in each batch to avoid [UNTRANSLATED] pollution
2. **Use appropriate batch sizes**: 100 entries for systematic translation, unlimited for compact method
3. **Skip validation for placeholders**: Use `--skip-validation` when batch contains `{{variable}}` patterns
4. **Check progress between batches**: Use `--summary` flag to track completion percentage
5. **Preserve all placeholders**: Keep `{n}`, `{total}`, `{filename}`, `{{toolName}}` exactly as-is

### Workflow Comparison

| Method  | Best For               | Character Usage       | Complexity | Speed   |
|---------|------------------------|-----------------------|------------|---------|
| Compact | AI services            | Minimal (50-80% less) | Simple     | Fastest |
| Batch   | Systematic translation | Moderate              | Medium     | Medium  |
| Quick   | Small updates          | High                  | Low        | Slow    |

### Common Issues and Solutions

#### JSON Syntax Errors in AI Translations
**Problem**: AI-translated batch files have JSON syntax errors
**Symptoms**:
- `JSONDecodeError: Expecting ',' delimiter`
- `JSONDecodeError: Invalid \escape`

**Solution**:
```bash
# 1. Validate all batches to find errors
python scripts/translations/json_validator.py --all-batches ar_AR

# 2. Check detailed error with context
python scripts/translations/json_validator.py ar_AR_batch_2_of_3.json

# 3. Fix the reported issues:
#    - Unescaped quotes: "text with "quotes"" → "text with \"quotes\""
#    - Backslashes in regex: "\d{4}" → "\\d{4}"
#    - Missing commas between entries

# 4. Validate again until all pass
python scripts/translations/json_validator.py --all-batches ar_AR
```

**Common fixes:**
- Arabic/RTL text with embedded quotes: Always escape with backslash
- Regex patterns: Double all backslashes (`\d` → `\\d`)
- Check for missing/extra commas at the line reported in the error

#### [UNTRANSLATED] Pollution
**Problem**: Hundreds of [UNTRANSLATED] markers from incomplete translation attempts
**Solution**:
- Only translate complete batches of manageable size
- Use the analyzer, which counts [UNTRANSLATED] entries as missing translations
- Restore from backup if pollution occurs

#### Validation False Positives
**Problem**: Validator flags legitimate `{{variable}}` placeholders as artifacts
**Solution**: Use `--skip-validation` flag when applying batches with template variables

#### JSON Structure Mismatches
**Problem**: Flattened dot-notation keys instead of proper nested objects
**Solution**: Use `json_beautifier.py` to restructure files to match en-GB exactly

## Real-World Examples

### Complete Arabic Translation with Validation (Batch Method)
```bash
# Check status
python scripts/translations/translation_analyzer.py --language ar-AR --summary
# Result: 50% complete, 1088 missing

# Extract in batches due to AI token limits
python scripts/translations/compact_translator.py ar-AR --output ar_AR_batch --batch-size 400
# Created: ar_AR_batch_1_of_3.json (400 entries)
#          ar_AR_batch_2_of_3.json (400 entries)
#          ar_AR_batch_3_of_3.json (288 entries)

# [Send each batch to AI for translation]

# Validate translated batches before merging
python scripts/translations/json_validator.py --all-batches ar_AR
# Found errors in batch 1 and 2:
# - Line 263: Unescaped quotes in "انقر "إضافة ملفات""
# - Line 132: Unescaped quotes in "أو "and""
# - Line 213: Invalid escape "\d{4}"

# Fix errors manually or with sed, then validate again
python scripts/translations/json_validator.py --all-batches ar_AR
# All valid!

# Merge all batches
python3 << 'EOF'
import json
merged = {}
for i in range(1, 4):
    with open(f'ar_AR_batch_{i}_of_3.json', 'r', encoding='utf-8') as f:
        merged.update(json.load(f))
with open('ar_AR_merged.json', 'w', encoding='utf-8') as f:
    json.dump(merged, f, ensure_ascii=False, indent=2)
EOF

# Apply merged translations
python scripts/translations/translation_merger.py ar-AR apply-translations --translations-file ar_AR_merged.json
# Result: Applied 1088 translations

# Beautify to match en-GB structure
python scripts/translations/json_beautifier.py --language ar-AR

# Check final progress
python scripts/translations/translation_analyzer.py --language ar-AR --summary
# Result: 98.7% complete, 9 missing, 20 untranslated
```

### Complete Italian Translation (Compact Method)
```bash
# Check status
python scripts/translations/translation_analyzer.py --language it-IT --summary
# Result: 46.8% complete, 1147 missing

# Extract all entries for translation
python scripts/translations/compact_translator.py it-IT --output batch1.json

# [Translate batch1.json with AI, save as batch1_translated.json]

# Apply translations
python scripts/translations/translation_merger.py it-IT apply-translations --translations-file batch1_translated.json
# Result: Applied 1147 translations

# Check progress
python scripts/translations/translation_analyzer.py --language it-IT --summary
# Result: 100% complete, 0 missing
```

### German Translation (Batch Method)
Starting from 46.3% completion, reaching 60.3% with the batch method:

```bash
# Initial analysis
python scripts/translations/translation_analyzer.py --language de-DE --summary
# Result: 46.3% complete, 1142 missing entries

# Batch 1 (100 entries)
python scripts/translations/ai_translation_helper.py create-batch --languages de-DE --output de_batch_1.json --max-entries 100
# [Translate all 100 entries in batch file]
python scripts/translations/ai_translation_helper.py apply-batch de_batch_1.json --skip-validation
# Progress: 46.6% → 51.2%

# Continue with more batches until 100% complete
```

## Error Handling

- **Missing Files**: Scripts create new files when language directories don't exist
- **Invalid JSON**: Clear error messages with line numbers
- **Placeholder Mismatches**: Validation warnings for missing or extra placeholders
- **[UNTRANSLATED] Entries**: Counted as missing translations to prevent pollution
- **Backup Failures**: Graceful handling with user notification

## Integration with Development

These scripts integrate with the existing translation system:
- Works with the current `frontend/public/locales/` structure
- Compatible with the i18n system used in the React frontend
- Respects the JSON format expected by the translation loader
- Maintains the nested structure required by the UI components

## Language-Specific Notes

### German Translation Notes
- Technical terms: Use German equivalents (PDF → PDF, API → API)
- UI actions: "hochladen" (upload), "herunterladen" (download), "speichern" (save)
- Error messages: Consistent pattern "Ein Fehler ist beim [action] aufgetreten"
- Formal address: Use "Sie" form for user-facing text

### Italian Translation Notes
- Keep technical terms in English when commonly used (PDF, API, URL)
- Use formal address ("Lei" form) for user-facing text
- Error messages: "Si è verificato un errore durante [action]"
- UI actions: "carica" (upload), "scarica" (download), "salva" (save)

## Common Use Cases

1. **Complete Language Translation**: Use the Compact Workflow for the fastest AI-assisted translation
2. **New Language Addition**: Start with the compact workflow for comprehensive coverage
3. **Updating Existing Language**: Use the analyzer to find gaps, then the compact or batch method
4. **Quality Assurance**: Use the analyzer with `--summary` for completion metrics and issue detection
5. **External Translation Services**: Use the export functionality to generate CSV files for translators
6. **Structure Maintenance**: Use `json_beautifier` to keep files aligned with the en-GB structure
scripts/translations/ai_translation_helper.py (new file, 408 lines)
@@ -0,0 +1,408 @@
#!/usr/bin/env python3
"""
AI Translation Helper for Stirling PDF Frontend
Provides utilities for AI-assisted translation workflows including
batch processing, quality checks, and integration helpers.
"""

import json
import os
import sys
from pathlib import Path
from typing import Dict, List, Set, Tuple, Any, Optional
import argparse
import re
from datetime import datetime
import csv


class AITranslationHelper:
    def __init__(self, locales_dir: str = "frontend/public/locales"):
        self.locales_dir = Path(locales_dir)
        self.golden_truth_file = self.locales_dir / "en-GB" / "translation.json"

    def _load_json(self, file_path: Path) -> Dict:
        """Load JSON file with error handling."""
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                return json.load(f)
        except (FileNotFoundError, json.JSONDecodeError) as e:
            print(f"Error loading {file_path}: {e}")
            return {}

    def _save_json(self, data: Dict, file_path: Path) -> None:
        """Save JSON file."""
        with open(file_path, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=2, ensure_ascii=False)

    def create_ai_batch_file(self, languages: List[str], output_file: Path,
                             max_entries_per_language: int = 50) -> None:
        """Create a batch file for AI translation with multiple languages."""
        golden_truth = self._load_json(self.golden_truth_file)
        batch_data = {
            'metadata': {
                'created_at': datetime.now().isoformat(),
                'source_language': 'en-GB',
                'target_languages': languages,
                'max_entries_per_language': max_entries_per_language,
                'instructions': {
                    'format': 'Translate each entry maintaining JSON structure and placeholder variables like {n}, {total}, {filename}',
                    'context': 'This is for a PDF manipulation tool. Keep technical terms consistent.',
                    'placeholders': 'Preserve all placeholders: {n}, {total}, {filename}, etc.',
                    'style': 'Keep translations concise and user-friendly'
                }
            },
            'translations': {}
        }

        for lang in languages:
            lang_file = self.locales_dir / lang / "translation.json"
            if not lang_file.exists():
                # Create empty translation structure
                lang_data = {}
            else:
                lang_data = self._load_json(lang_file)

            # Find untranslated entries
            untranslated = self._find_untranslated_entries(golden_truth, lang_data)

            # Limit entries if specified
            if max_entries_per_language and len(untranslated) > max_entries_per_language:
                # Prioritize by key importance
                untranslated = self._prioritize_translation_keys(untranslated, max_entries_per_language)

            batch_data['translations'][lang] = {}
            for key, value in untranslated.items():
                batch_data['translations'][lang][key] = {
                    'original': value,
                    'translated': '',  # AI fills this
                    'context': self._get_key_context(key)
                }

        self._save_json(batch_data, output_file)
        total_entries = sum(len(lang_data) for lang_data in batch_data['translations'].values())
        print(f"Created AI batch file: {output_file}")
        print(f"Total entries to translate: {total_entries}")

    def _find_untranslated_entries(self, golden_truth: Dict, lang_data: Dict) -> Dict[str, str]:
        """Find entries that need translation."""
        golden_flat = self._flatten_dict(golden_truth)
        lang_flat = self._flatten_dict(lang_data)

        untranslated = {}
        for key, value in golden_flat.items():
            if (key not in lang_flat or
                    lang_flat[key] == value or
                    (isinstance(lang_flat[key], str) and lang_flat[key].startswith("[UNTRANSLATED]"))):
                if not self._is_expected_identical(key, value):
                    untranslated[key] = value

        return untranslated

    def _flatten_dict(self, d: Dict, parent_key: str = '', separator: str = '.') -> Dict[str, Any]:
        """Flatten nested dictionary."""
        items = []
        for k, v in d.items():
            new_key = f"{parent_key}{separator}{k}" if parent_key else k
            if isinstance(v, dict):
                items.extend(self._flatten_dict(v, new_key, separator).items())
            else:
                items.append((new_key, v))
        return dict(items)

    def _is_expected_identical(self, key: str, value: str) -> bool:
        """Check if key should be identical across languages."""
        if str(value).strip() in ['ltr', 'rtl', 'True', 'False', 'true', 'false']:
            return True
        return 'language.direction' in key.lower()

    def _prioritize_translation_keys(self, untranslated: Dict[str, str], max_count: int) -> Dict[str, str]:
        """Prioritize which keys to translate first based on importance."""
        # Define priority order (higher score = higher priority)
        priority_patterns = [
            ('title', 10),
            ('header', 9),
            ('submit', 8),
            ('selectText', 7),
            ('prompt', 6),
            ('desc', 5),
            ('error', 8),
            ('warning', 7),
            ('save', 8),
            ('download', 8),
            ('upload', 7),
        ]

        scored_keys = []
        for key, value in untranslated.items():
            score = 1  # base score
            for pattern, pattern_score in priority_patterns:
                if pattern.lower() in key.lower():
                    score = max(score, pattern_score)
            scored_keys.append((key, value, score))

        # Sort by score (descending) and return top entries
        scored_keys.sort(key=lambda x: x[2], reverse=True)
        return {key: value for key, value, _ in scored_keys[:max_count]}

    def _get_key_context(self, key: str) -> str:
        """Get contextual information for a translation key."""
        parts = key.split('.')
        contexts = {
            'addPageNumbers': 'Feature for adding page numbers to PDFs',
            'compress': 'PDF compression functionality',
            'merge': 'PDF merging functionality',
            'split': 'PDF splitting functionality',
            'rotate': 'PDF rotation functionality',
            'convert': 'File conversion functionality',
            'security': 'PDF security and permissions',
            'metadata': 'PDF metadata editing',
            'watermark': 'Adding watermarks to PDFs',
            'overlay': 'PDF overlay functionality',
            'extract': 'Extracting content from PDFs'
        }

        if len(parts) > 0:
            main_section = parts[0]
            context = contexts.get(main_section, f'Part of {main_section} functionality')
            if len(parts) > 1:
                context += f', specifically for {parts[-1]}'
            return context

        return 'General application text'

    def validate_ai_translations(self, batch_file: Path) -> Dict[str, List[str]]:
        """Validate AI translations for common issues."""
        batch_data = self._load_json(batch_file)
        issues = {'errors': [], 'warnings': []}

        for lang, translations in batch_data.get('translations', {}).items():
            for key, translation_data in translations.items():
                original = translation_data.get('original', '')
                translated = translation_data.get('translated', '')

                if not translated:
                    issues['errors'].append(f"{lang}.{key}: Missing translation")
                    continue

                # Check for placeholder preservation
                original_placeholders = re.findall(r'\{[^}]+\}', original)
                translated_placeholders = re.findall(r'\{[^}]+\}', translated)

                if set(original_placeholders) != set(translated_placeholders):
                    issues['warnings'].append(
                        f"{lang}.{key}: Placeholder mismatch - Original: {original_placeholders}, "
                        f"Translated: {translated_placeholders}"
                    )

                # Check if translation is identical to original (might be untranslated)
                if translated == original and not self._is_expected_identical(key, original):
                    issues['warnings'].append(f"{lang}.{key}: Translation identical to original")

                # Check for common AI translation artifacts
                artifacts = ['[TRANSLATE]', '[TODO]', 'UNTRANSLATED', '{{', '}}']
                for artifact in artifacts:
                    if artifact in translated:
                        issues['errors'].append(f"{lang}.{key}: Contains translation artifact: {artifact}")

        return issues

    def apply_ai_batch_translations(self, batch_file: Path, validate: bool = True) -> Dict[str, Any]:
        """Apply translations from AI batch file to individual language files."""
        batch_data = self._load_json(batch_file)
        results = {'applied': {}, 'errors': [], 'warnings': []}

        if validate:
            validation_issues = self.validate_ai_translations(batch_file)
            if validation_issues['errors']:
                print("Validation errors found. Fix these before applying:")
                for error in validation_issues['errors']:
                    print(f"  ERROR: {error}")
                return results

            if validation_issues['warnings']:
                print("Validation warnings (review recommended):")
                for warning in validation_issues['warnings'][:10]:
                    print(f"  WARNING: {warning}")

        for lang, translations in batch_data.get('translations', {}).items():
            lang_file = self.locales_dir / lang / "translation.json"

            # Load existing data or create new
            if lang_file.exists():
                lang_data = self._load_json(lang_file)
            else:
                lang_data = {}
                lang_file.parent.mkdir(parents=True, exist_ok=True)

            applied_count = 0
            for key, translation_data in translations.items():
                translated = translation_data.get('translated', '').strip()
                if translated and translated != translation_data.get('original', ''):
                    self._set_nested_value(lang_data, key, translated)
                    applied_count += 1

            if applied_count > 0:
                self._save_json(lang_data, lang_file)
                results['applied'][lang] = applied_count
                print(f"Applied {applied_count} translations to {lang}")

        return results

    def _set_nested_value(self, data: Dict, key_path: str, value: Any) -> None:
        """Set value in nested dict using dot notation."""
        keys = key_path.split('.')
        current = data
        for key in keys[:-1]:
            if key not in current:
                current[key] = {}
            elif not isinstance(current[key], dict):
                # If the current value is not a dict, we can't nest into it
                print(f"Warning: Converting non-dict value at '{key}' to dict to allow nesting")
                current[key] = {}
            current = current[key]
        current[keys[-1]] = value

    def export_for_external_translation(self, languages: List[str], output_format: str = 'csv') -> None:
        """Export translations for external translation services."""
        golden_truth = self._load_json(self.golden_truth_file)
        golden_flat = self._flatten_dict(golden_truth)

        if output_format == 'csv':
            output_file = Path(f'translations_export_{datetime.now().strftime("%Y%m%d")}.csv')

            with open(output_file, 'w', newline='', encoding='utf-8') as csvfile:
                fieldnames = ['key', 'context', 'en_GB'] + languages
                writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
                writer.writeheader()

                for key, en_value in golden_flat.items():
                    if self._is_expected_identical(key, en_value):
                        continue

                    row = {
                        'key': key,
                        'context': self._get_key_context(key),
                        'en_GB': en_value
                    }

                    for lang in languages:
                        lang_file = self.locales_dir / lang / "translation.json"
                        if lang_file.exists():
                            lang_data = self._load_json(lang_file)
                            lang_flat = self._flatten_dict(lang_data)
                            value = lang_flat.get(key, '')
                            if value.startswith('[UNTRANSLATED]'):
                                value = ''
                            row[lang] = value
                        else:
                            row[lang] = ''

                    writer.writerow(row)

            print(f"Exported to {output_file}")

        elif output_format == 'json':
            output_file = Path(f'translations_export_{datetime.now().strftime("%Y%m%d")}.json')
            export_data = {'languages': languages, 'translations': {}}

            for key, en_value in golden_flat.items():
                if self._is_expected_identical(key, en_value):
                    continue

                export_data['translations'][key] = {
                    'en_GB': en_value,
                    'context': self._get_key_context(key)
                }

                for lang in languages:
                    lang_file = self.locales_dir / lang / "translation.json"
                    if lang_file.exists():
                        lang_data = self._load_json(lang_file)
                        lang_flat = self._flatten_dict(lang_data)
                        value = lang_flat.get(key, '')
                        if value.startswith('[UNTRANSLATED]'):
                            value = ''
                        export_data['translations'][key][lang] = value

            self._save_json(export_data, output_file)
            print(f"Exported to {output_file}")


def main():
    parser = argparse.ArgumentParser(description='AI Translation Helper')
    parser.add_argument('--locales-dir', default='frontend/public/locales',
                        help='Path to locales directory')

    subparsers = parser.add_subparsers(dest='command', help='Available commands')

    # Create batch command
    batch_parser = subparsers.add_parser('create-batch', help='Create AI translation batch file')
    batch_parser.add_argument('--languages', nargs='+', required=True,
                              help='Language codes to include')
    batch_parser.add_argument('--output', required=True, help='Output batch file')
    batch_parser.add_argument('--max-entries', type=int, default=100,
                              help='Max entries per language')

    # Validate command
    validate_parser = subparsers.add_parser('validate', help='Validate AI translations')
    validate_parser.add_argument('batch_file', help='Batch file to validate')

    # Apply command
    apply_parser = subparsers.add_parser('apply-batch', help='Apply AI batch translations')
    apply_parser.add_argument('batch_file', help='Batch file with translations')
    apply_parser.add_argument('--skip-validation', action='store_true',
                              help='Skip validation before applying')

    # Export command
    export_parser = subparsers.add_parser('export', help='Export for external translation')
    export_parser.add_argument('--languages', nargs='+', required=True,
                               help='Language codes to export')
    export_parser.add_argument('--format', choices=['csv', 'json'], default='csv',
                               help='Export format')

    args = parser.parse_args()

    if not args.command:
        parser.print_help()
        return

    helper = AITranslationHelper(args.locales_dir)

    if args.command == 'create-batch':
        output_file = Path(args.output)
        helper.create_ai_batch_file(args.languages, output_file, args.max_entries)

    elif args.command == 'validate':
        batch_file = Path(args.batch_file)
        issues = helper.validate_ai_translations(batch_file)

        if issues['errors']:
            print("ERRORS:")
            for error in issues['errors']:
                print(f"  - {error}")

        if issues['warnings']:
            print("WARNINGS:")
            for warning in issues['warnings']:
                print(f"  - {warning}")

        if not issues['errors'] and not issues['warnings']:
            print("No validation issues found!")

    elif args.command == 'apply-batch':
        batch_file = Path(args.batch_file)
        results = helper.apply_ai_batch_translations(
            batch_file,
            validate=not args.skip_validation
        )

        total_applied = sum(results['applied'].values())
        print(f"Total translations applied: {total_applied}")

    elif args.command == 'export':
        helper.export_for_external_translation(args.languages, args.format)


if __name__ == "__main__":
    main()
scripts/translations/compact_translator.py (new file, 177 lines)
@@ -0,0 +1,177 @@
|
||||
#!/usr/bin/env python3
"""
Compact Translation Extractor for Character-Limited AI Translation
Outputs untranslated entries in minimal JSON format with whitespace stripped.
"""

import json
import sys
from pathlib import Path
import argparse

try:
    import tomllib  # Python 3.11+
except ImportError:
    try:
        import toml as tomllib_fallback
        tomllib = None
    except ImportError:
        tomllib = None
        tomllib_fallback = None


class CompactTranslationExtractor:
    def __init__(self, locales_dir: str = "frontend/public/locales", ignore_file: str = "scripts/ignore_translation.toml"):
        self.locales_dir = Path(locales_dir)
        self.golden_truth_file = self.locales_dir / "en-GB" / "translation.json"
        self.golden_truth = self._load_json(self.golden_truth_file)
        self.ignore_file = Path(ignore_file)
        self.ignore_patterns = self._load_ignore_patterns()

    def _load_json(self, file_path: Path) -> dict:
        """Load JSON file with error handling."""
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                return json.load(f)
        except FileNotFoundError:
            print(f"Error: File not found: {file_path}", file=sys.stderr)
            sys.exit(1)
        except json.JSONDecodeError as e:
            print(f"Error: Invalid JSON in {file_path}: {e}", file=sys.stderr)
            sys.exit(1)

    def _load_ignore_patterns(self) -> dict:
        """Load ignore patterns from TOML file."""
        if not self.ignore_file.exists():
            return {}

        try:
            if tomllib:
                with open(self.ignore_file, 'rb') as f:
                    ignore_data = tomllib.load(f)
            elif tomllib_fallback:
                ignore_data = tomllib_fallback.load(self.ignore_file)
            else:
                ignore_data = self._parse_simple_toml()

            return {lang: set(data.get('ignore', [])) for lang, data in ignore_data.items()}
        except Exception as e:
            print(f"Warning: Could not load ignore file {self.ignore_file}: {e}", file=sys.stderr)
            return {}

    def _parse_simple_toml(self) -> dict:
        """Simple TOML parser for ignore patterns (fallback)."""
        ignore_data = {}
        current_section = None

        with open(self.ignore_file, 'r', encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                if not line or line.startswith('#'):
                    continue

                if line.startswith('[') and line.endswith(']'):
                    current_section = line[1:-1]
                    ignore_data[current_section] = {'ignore': []}
                elif line.startswith("'") and current_section:
                    item = line.strip("',")
                    if item:
                        ignore_data[current_section]['ignore'].append(item)

        return ignore_data

    def _flatten_dict(self, d: dict, parent_key: str = '', separator: str = '.') -> dict:
        """Flatten nested dictionary into dot-notation keys."""
        items = []
        for k, v in d.items():
            new_key = f"{parent_key}{separator}{k}" if parent_key else k
            if isinstance(v, dict):
                items.extend(self._flatten_dict(v, new_key, separator).items())
            else:
                items.append((new_key, str(v)))
        return dict(items)

    def get_untranslated_entries(self, language: str) -> dict:
        """Get all untranslated entries for a language in compact format."""
        target_file = self.locales_dir / language / "translation.json"

        if not target_file.exists():
            print(f"Error: Translation file not found for language: {language}", file=sys.stderr)
            sys.exit(1)

        target_data = self._load_json(target_file)
        golden_flat = self._flatten_dict(self.golden_truth)
        target_flat = self._flatten_dict(target_data)

        lang_code = language.replace('-', '_')
        ignore_set = self.ignore_patterns.get(lang_code, set())

        # Find missing translations
        missing_keys = set(golden_flat.keys()) - set(target_flat.keys()) - ignore_set

        # Find untranslated entries (identical to en-GB or marked [UNTRANSLATED])
        untranslated_keys = set()
        for key in target_flat:
            if key in golden_flat and key not in ignore_set:
                target_value = target_flat[key]
                golden_value = golden_flat[key]

                if (isinstance(target_value, str) and target_value.startswith("[UNTRANSLATED]")) or \
                   (golden_value == target_value and not self._is_expected_identical(key, golden_value)):
                    untranslated_keys.add(key)

        # Combine and create compact output
        all_untranslated = missing_keys | untranslated_keys

        compact_entries = {}
        for key in sorted(all_untranslated):
            if key in golden_flat:
                compact_entries[key] = golden_flat[key]

        return compact_entries

    def _is_expected_identical(self, key: str, value: str) -> bool:
        """Check if a key-value pair is expected to be identical across languages."""
        identical_patterns = ['language.direction']
        identical_values = {'ltr', 'rtl', 'True', 'False', 'true', 'false', 'unknown'}

        if value.strip() in identical_values:
            return True

        for pattern in identical_patterns:
            if pattern in key.lower():
                return True

        return False


def main():
    parser = argparse.ArgumentParser(description='Extract untranslated entries in compact format for AI translation')
    parser.add_argument('language', help='Language code (e.g., de-DE, fr-FR)')
    parser.add_argument('--locales-dir', default='frontend/public/locales', help='Path to locales directory')
    parser.add_argument('--ignore-file', default='scripts/ignore_translation.toml', help='Path to ignore patterns file')
    parser.add_argument('--max-entries', type=int, help='Maximum number of entries to output')
    parser.add_argument('--output', help='Output file (default: stdout)')

    args = parser.parse_args()

    extractor = CompactTranslationExtractor(args.locales_dir, args.ignore_file)
    untranslated = extractor.get_untranslated_entries(args.language)

    if args.max_entries:
        # Take first N entries
        keys = list(untranslated.keys())[:args.max_entries]
        untranslated = {k: untranslated[k] for k in keys}

    # Output compact JSON (no indentation, minimal whitespace)
    output = json.dumps(untranslated, separators=(',', ':'), ensure_ascii=False)

    if args.output:
        with open(args.output, 'w', encoding='utf-8') as f:
            f.write(output)
        print(f"Extracted {len(untranslated)} untranslated entries to {args.output}", file=sys.stderr)
    else:
        print(output)


if __name__ == "__main__":
    main()
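A usage sketch (run from the repository root; the language code and output file name are illustrative):

    python3 scripts/translations/compact_translator.py de-DE --max-entries 50 --output de_DE_batch_1.json

The output is a single-line JSON object of en-GB key/value pairs, compact enough to paste into a character-limited AI prompt.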
262
scripts/translations/json_beautifier.py
Normal file
@@ -0,0 +1,262 @@
#!/usr/bin/env python3
"""
JSON Beautifier and Structure Fixer for Stirling PDF Frontend
Restructures translation JSON files to match en-GB structure and key order exactly.
"""

import json
import sys
from pathlib import Path
from typing import Dict, Any, List
import argparse
from collections import OrderedDict


class JSONBeautifier:
    def __init__(self, locales_dir: str = "frontend/public/locales"):
        self.locales_dir = Path(locales_dir)
        self.golden_truth_file = self.locales_dir / "en-GB" / "translation.json"
        self.golden_structure = self._load_json(self.golden_truth_file)

    def _load_json(self, file_path: Path) -> Dict:
        """Load JSON file with error handling."""
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                return json.load(f, object_pairs_hook=OrderedDict)
        except FileNotFoundError:
            print(f"Error: File not found: {file_path}")
            sys.exit(1)
        except json.JSONDecodeError as e:
            print(f"Error: Invalid JSON in {file_path}: {e}")
            sys.exit(1)

    def _save_json(self, data: Dict, file_path: Path, backup: bool = True) -> None:
        """Save JSON file with proper formatting."""
        if backup and file_path.exists():
            backup_path = file_path.with_suffix('.backup.restructured.json')
            file_path.rename(backup_path)
            print(f"Backup created: {backup_path}")

        with open(file_path, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=2, ensure_ascii=False, separators=(',', ': '))

    def _flatten_dict(self, d: Dict, parent_key: str = '', separator: str = '.') -> Dict[str, Any]:
        """Flatten nested dictionary into dot-notation keys."""
        items = []
        for k, v in d.items():
            new_key = f"{parent_key}{separator}{k}" if parent_key else k
            if isinstance(v, dict):
                items.extend(self._flatten_dict(v, new_key, separator).items())
            else:
                items.append((new_key, v))
        return dict(items)

    def _rebuild_structure(self, flat_dict: Dict[str, Any], reference_structure: Dict) -> Dict:
        """Rebuild nested structure based on reference structure and available translations."""
        def build_recursive(ref_obj: Any, current_path: str = '') -> Any:
            if isinstance(ref_obj, dict):
                result = OrderedDict()
                for key, value in ref_obj.items():
                    new_path = f"{current_path}.{key}" if current_path else key

                    if new_path in flat_dict:
                        # Direct translation exists
                        if isinstance(value, dict):
                            # If reference is dict but we have a string, use the string
                            if isinstance(flat_dict[new_path], str):
                                result[key] = flat_dict[new_path]
                            else:
                                # Recurse into nested structure
                                result[key] = build_recursive(value, new_path)
                        else:
                            result[key] = flat_dict[new_path]
                    else:
                        # No direct translation, recurse to check for nested keys
                        if isinstance(value, dict):
                            nested_result = build_recursive(value, new_path)
                            if nested_result:  # Only add if we found some translations
                                result[key] = nested_result
                        # If no translation found and it's a leaf, skip it

                return result if result else None
            else:
                # Leaf node - return the translation if it exists
                return flat_dict.get(current_path, None)

        return build_recursive(reference_structure) or OrderedDict()

    def restructure_translation_file(self, target_file: Path) -> Dict[str, Any]:
        """Restructure a translation file to match en-GB structure exactly."""
        if not target_file.exists():
            print(f"Error: Target file does not exist: {target_file}")
            return {}

        # Load the target file
        target_data = self._load_json(target_file)

        # Flatten the target translations
        flat_target = self._flatten_dict(target_data)

        # Rebuild structure based on golden truth
        restructured = self._rebuild_structure(flat_target, self.golden_structure)

        return restructured

    def beautify_and_restructure(self, target_file: Path, backup: bool = True) -> Dict[str, Any]:
        """Main function to beautify and restructure a translation file."""
        lang_code = target_file.parent.name
        print(f"Restructuring {lang_code} translation file...")

        # Get the restructured data
        restructured_data = self.restructure_translation_file(target_file)

        # Save the restructured file
        self._save_json(restructured_data, target_file, backup)

        # Analyze the results
        flat_golden = self._flatten_dict(self.golden_structure)
        flat_restructured = self._flatten_dict(restructured_data)

        total_keys = len(flat_golden)
        preserved_keys = len(flat_restructured)

        result = {
            'language': lang_code,
            'total_reference_keys': total_keys,
            'preserved_keys': preserved_keys,
            'structure_match': self._compare_structures(self.golden_structure, restructured_data)
        }

        print(f"Restructured {lang_code}: {preserved_keys}/{total_keys} keys preserved")
        return result

    def _compare_structures(self, ref: Dict, target: Dict) -> Dict[str, bool]:
        """Compare structures between reference and target."""
        def compare_recursive(r: Any, t: Any, path: str = '') -> List[str]:
            issues = []

            if isinstance(r, dict) and isinstance(t, dict):
                # Check for missing top-level sections
                ref_keys = set(r.keys())
                target_keys = set(t.keys())

                missing_sections = ref_keys - target_keys
                if missing_sections:
                    for section in missing_sections:
                        issues.append(f"Missing section: {path}.{section}" if path else section)

                # Recurse into common sections
                for key in ref_keys & target_keys:
                    new_path = f"{path}.{key}" if path else key
                    issues.extend(compare_recursive(r[key], t[key], new_path))

            return issues

        issues = compare_recursive(ref, target)

        return {
            'structures_match': len(issues) == 0,
            'issues': issues[:10],  # Limit to first 10 issues
            'total_issues': len(issues)
        }

    def validate_key_order(self, target_file: Path) -> Dict[str, Any]:
        """Validate that keys appear in the same order as en-GB."""
        target_data = self._load_json(target_file)

        def get_key_order(obj: Dict, path: str = '') -> List[str]:
            keys = []
            for key in obj.keys():
                new_path = f"{path}.{key}" if path else key
                keys.append(new_path)
                if isinstance(obj[key], dict):
                    keys.extend(get_key_order(obj[key], new_path))
            return keys

        golden_order = get_key_order(self.golden_structure)
        target_order = get_key_order(target_data)

        # Find common keys and check their relative order
        common_keys = set(golden_order) & set(target_order)

        golden_indices = {key: idx for idx, key in enumerate(golden_order) if key in common_keys}
        target_indices = {key: idx for idx, key in enumerate(target_order) if key in common_keys}

        # Order is preserved when every pair ordered in en-GB keeps that order in the target.
        # (Putting target_indices in the filter as well would make this trivially true.)
        order_preserved = all(
            target_indices[key1] < target_indices[key2]
            for key1 in common_keys for key2 in common_keys
            if golden_indices[key1] < golden_indices[key2]
        )

        return {
            'order_preserved': order_preserved,
            'common_keys_count': len(common_keys),
            'golden_keys_count': len(golden_order),
            'target_keys_count': len(target_order)
        }


def main():
    parser = argparse.ArgumentParser(description='Beautify and restructure translation JSON files')
    parser.add_argument('--locales-dir', default='frontend/public/locales',
                        help='Path to locales directory')
    parser.add_argument('--language', help='Restructure specific language only')
    parser.add_argument('--all-languages', action='store_true',
                        help='Restructure all language files')
    parser.add_argument('--no-backup', action='store_true',
                        help='Skip backup creation')
    parser.add_argument('--validate-only', action='store_true',
                        help='Only validate structure, do not modify files')

    args = parser.parse_args()

    beautifier = JSONBeautifier(args.locales_dir)

    if args.language:
        target_file = Path(args.locales_dir) / args.language / "translation.json"
        if not target_file.exists():
            print(f"Error: Translation file not found for language: {args.language}")
            sys.exit(1)

        if args.validate_only:
            order_result = beautifier.validate_key_order(target_file)
            print(f"Key order validation for {args.language}:")
            print(f"  Order preserved: {order_result['order_preserved']}")
            print(f"  Common keys: {order_result['common_keys_count']}/{order_result['golden_keys_count']}")
        else:
            result = beautifier.beautify_and_restructure(target_file, backup=not args.no_backup)
            print(f"\nResults for {result['language']}:")
            print(f"  Keys preserved: {result['preserved_keys']}/{result['total_reference_keys']}")
            if result['structure_match']['total_issues'] > 0:
                print(f"  Structure issues: {result['structure_match']['total_issues']}")
                for issue in result['structure_match']['issues']:
                    print(f"    - {issue}")

    elif args.all_languages:
        results = []
        for lang_dir in Path(args.locales_dir).iterdir():
            if lang_dir.is_dir() and lang_dir.name != "en-GB":
                translation_file = lang_dir / "translation.json"
                if translation_file.exists():
                    if args.validate_only:
                        order_result = beautifier.validate_key_order(translation_file)
                        print(f"{lang_dir.name}: Order preserved = {order_result['order_preserved']}")
                    else:
                        result = beautifier.beautify_and_restructure(translation_file, backup=not args.no_backup)
                        results.append(result)

        if not args.validate_only and results:
            print(f"\n{'='*60}")
            print("RESTRUCTURING SUMMARY")
            print(f"{'='*60}")
            for result in sorted(results, key=lambda x: x['language']):
                print(f"{result['language']}: {result['preserved_keys']}/{result['total_reference_keys']} keys "
                      f"({result['preserved_keys']/result['total_reference_keys']*100:.1f}%)")

    else:
        parser.print_help()


if __name__ == "__main__":
    main()
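A usage sketch (language code illustrative; --validate-only reports key order without touching files):

    python3 scripts/translations/json_beautifier.py --language de-DE --validate-only
    python3 scripts/translations/json_beautifier.py --all-languages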
259
scripts/translations/json_validator.py
Normal file
@@ -0,0 +1,259 @@
#!/usr/bin/env python3
"""
JSON Validator for Translation Files

Validates JSON syntax in translation files and reports detailed error information.
Useful for validating batch translation files before merging.

Usage:
    python3 json_validator.py <file_or_pattern>
    python3 json_validator.py ar_AR_batch_*.json
    python3 json_validator.py ar_AR_batch_1_of_3.json
    python3 json_validator.py --all-batches ar_AR
"""

import json
import sys
import argparse
import glob
from pathlib import Path


def get_line_context(file_path, line_num, context_lines=3):
    """Get lines around the error for context"""
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            lines = f.readlines()

        start = max(0, line_num - context_lines - 1)
        end = min(len(lines), line_num + context_lines)

        context = []
        for i in range(start, end):
            marker = ">>> " if i == line_num - 1 else "    "
            context.append(f"{marker}{i+1:4d}: {lines[i].rstrip()}")

        return "\n".join(context)
    except Exception as e:
        return f"Could not read context: {e}"


def get_character_context(file_path, char_pos, context_chars=100):
    """Get characters around the error position"""
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()

        start = max(0, char_pos - context_chars)
        end = min(len(content), char_pos + context_chars)

        before = content[start:char_pos]
        error_char = content[char_pos] if char_pos < len(content) else "EOF"
        after = content[char_pos+1:end]

        return {
            'before': before,
            'error_char': error_char,
            'after': after,
            'display': f"{before}[{error_char}]{after}"
        }
    except Exception:
        return None


def validate_json_file(file_path):
    """Validate a single JSON file and return detailed error info"""
    result = {
        'file': str(file_path),
        'valid': False,
        'error': None,
        'line': None,
        'column': None,
        'position': None,
        'context': None,
        'char_context': None,
        'entry_count': 0
    }

    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)

        result['valid'] = True
        result['entry_count'] = len(data) if isinstance(data, dict) else 0

    except json.JSONDecodeError as e:
        result['error'] = e.msg
        result['line'] = e.lineno
        result['column'] = e.colno
        result['position'] = e.pos
        result['context'] = get_line_context(file_path, e.lineno)
        result['char_context'] = get_character_context(file_path, e.pos)

    except FileNotFoundError:
        result['error'] = "File not found"

    except Exception as e:
        result['error'] = str(e)

    return result


def print_validation_result(result, verbose=True):
    """Print validation result in a formatted way"""
    file_name = Path(result['file']).name

    if result['valid']:
        print(f"✓ {file_name}: Valid JSON ({result['entry_count']} entries)")
    else:
        print(f"✗ {file_name}: Invalid JSON")
        print(f"  Error: {result['error']}")

        if result['line']:
            print(f"  Location: Line {result['line']}, Column {result['column']} (character {result['position']})")

        if verbose and result['context']:
            print("\n  Context:")
            for line in result['context'].split('\n'):
                print(f"  {line}")

        if verbose and result['char_context']:
            print("\n  Character context:")
            print(f"  ...{result['char_context']['display'][-150:]}...")
            print(f"  Error character: {repr(result['char_context']['error_char'])}")

        print()


def get_common_fixes(error_msg):
    """Suggest common fixes based on error message"""
    fixes = []

    if "Expecting ',' delimiter" in error_msg:
        fixes.append("Missing comma between JSON entries")
        fixes.append("Check for unescaped quotes inside string values")

    if "Invalid \\escape" in error_msg or "Invalid escape" in error_msg:
        fixes.append("Unescaped backslash in string (use \\\\ for literal backslash)")
        fixes.append("Common in regex patterns: \\d should be \\\\d")

    if "Expecting property name" in error_msg:
        fixes.append("Missing or extra comma")
        fixes.append("Trailing comma before closing brace")

    if "Expecting value" in error_msg:
        fixes.append("Missing value after colon")
        fixes.append("Extra comma")

    return fixes


def main():
    parser = argparse.ArgumentParser(
        description='Validate JSON syntax in translation files',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  Validate single file:
    python3 json_validator.py ar_AR_batch_1_of_3.json

  Validate all batches for a language:
    python3 json_validator.py --all-batches ar_AR

  Validate pattern:
    python3 json_validator.py "ar_AR_batch_*.json"

  Validate multiple files:
    python3 json_validator.py file1.json file2.json file3.json
"""
    )

    parser.add_argument(
        'files',
        nargs='*',
        help='JSON file(s) to validate (supports wildcards)'
    )

    parser.add_argument(
        '--all-batches',
        metavar='LANGUAGE',
        help='Validate all batch files for a language (e.g., ar_AR)'
    )

    parser.add_argument(
        '--quiet',
        action='store_true',
        help='Only show files with errors'
    )

    parser.add_argument(
        '--brief',
        action='store_true',
        help='Brief output without context'
    )

    args = parser.parse_args()

    # Determine which files to validate
    files_to_validate = []

    if args.all_batches:
        pattern = f"{args.all_batches}_batch_*.json"
        files_to_validate = glob.glob(pattern)
        if not files_to_validate:
            print(f"No batch files found matching: {pattern}")
            return 1
    elif args.files:
        for file_pattern in args.files:
            if '*' in file_pattern or '?' in file_pattern:
                files_to_validate.extend(glob.glob(file_pattern))
            else:
                files_to_validate.append(file_pattern)
    else:
        parser.print_help()
        return 1

    if not files_to_validate:
        print("No files to validate")
        return 1

    # Sort files for consistent output
    files_to_validate.sort()

    print(f"Validating {len(files_to_validate)} file(s)...\n")

    # Validate each file
    results = []
    for file_path in files_to_validate:
        result = validate_json_file(file_path)
        results.append(result)

        if not args.quiet or not result['valid']:
            print_validation_result(result, verbose=not args.brief)

    # Summary
    valid_count = sum(1 for r in results if r['valid'])
    invalid_count = len(results) - valid_count

    print("=" * 60)
    print(f"Summary: {valid_count} valid, {invalid_count} invalid")

    # Show common fixes for errors
    if invalid_count > 0:
        all_errors = [r['error'] for r in results if r['error']]
        unique_error_types = set(all_errors)

        print("\nCommon fixes:")
        fixes_shown = set()
        for error in unique_error_types:
            fixes = get_common_fixes(error)
            for fix in fixes:
                if fix not in fixes_shown:
                    print(f"  • {fix}")
                    fixes_shown.add(fix)

    return 0 if invalid_count == 0 else 1


if __name__ == '__main__':
    sys.exit(main())
314
scripts/translations/translation_analyzer.py
Normal file
@@ -0,0 +1,314 @@
#!/usr/bin/env python3
"""
Translation Analyzer for Stirling PDF Frontend
Compares language files against en-GB golden truth file.
"""

import json
import sys
from pathlib import Path
from typing import Dict, List, Set
import argparse

try:
    import tomllib  # Python 3.11+
except ImportError:
    try:
        import toml as tomllib_fallback
        tomllib = None
    except ImportError:
        tomllib = None
        tomllib_fallback = None


class TranslationAnalyzer:
    def __init__(self, locales_dir: str = "frontend/public/locales", ignore_file: str = "scripts/ignore_translation.toml"):
        self.locales_dir = Path(locales_dir)
        self.golden_truth_file = self.locales_dir / "en-GB" / "translation.json"
        self.golden_truth = self._load_json(self.golden_truth_file)
        self.ignore_file = Path(ignore_file)
        self.ignore_patterns = self._load_ignore_patterns()

    def _load_json(self, file_path: Path) -> Dict:
        """Load JSON file with error handling."""
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                return json.load(f)
        except FileNotFoundError:
            print(f"Error: File not found: {file_path}")
            sys.exit(1)
        except json.JSONDecodeError as e:
            print(f"Error: Invalid JSON in {file_path}: {e}")
            sys.exit(1)

    def _load_ignore_patterns(self) -> Dict[str, Set[str]]:
        """Load ignore patterns from TOML file."""
        if not self.ignore_file.exists():
            return {}

        try:
            if tomllib:
                # Use Python 3.11+ built-in
                with open(self.ignore_file, 'rb') as f:
                    ignore_data = tomllib.load(f)
            elif tomllib_fallback:
                # Use toml library fallback
                ignore_data = tomllib_fallback.load(self.ignore_file)
            else:
                # Simple parser as fallback
                ignore_data = self._parse_simple_toml()

            # Convert lists to sets for faster lookup
            return {lang: set(data.get('ignore', []))
                    for lang, data in ignore_data.items() if data.get('ignore')}
        except Exception as e:
            print(f"Warning: Could not load ignore file {self.ignore_file}: {e}")
            return {}

    def _parse_simple_toml(self) -> Dict:
        """Simple TOML parser for ignore patterns (fallback)."""
        ignore_data = {}
        current_section = None

        with open(self.ignore_file, 'r', encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                if not line or line.startswith('#'):
                    continue

                if line.startswith('[') and line.endswith(']'):
                    current_section = line[1:-1]
                    ignore_data[current_section] = {'ignore': []}
                elif line.startswith('ignore = [') and current_section:
                    # Opening of the ignore array; items follow on later lines
                    continue
                elif line.startswith("'") and current_section:
                    # Extract quoted items
                    item = line.strip("',")
                    if item:
                        ignore_data[current_section]['ignore'].append(item)

        return ignore_data

    def _flatten_dict(self, d: Dict, parent_key: str = '', separator: str = '.') -> Dict[str, str]:
        """Flatten nested dictionary into dot-notation keys."""
        items = []
        for k, v in d.items():
            new_key = f"{parent_key}{separator}{k}" if parent_key else k
            if isinstance(v, dict):
                items.extend(self._flatten_dict(v, new_key, separator).items())
            else:
                items.append((new_key, str(v)))
        return dict(items)

    def get_all_language_files(self) -> List[Path]:
        """Get all translation.json files except en-GB."""
        files = []
        for lang_dir in self.locales_dir.iterdir():
            if lang_dir.is_dir() and lang_dir.name != "en-GB":
                translation_file = lang_dir / "translation.json"
                if translation_file.exists():
                    files.append(translation_file)
        return sorted(files)

    def find_missing_translations(self, target_file: Path) -> Set[str]:
        """Find keys that exist in en-GB but missing in target file."""
        target_data = self._load_json(target_file)

        golden_flat = self._flatten_dict(self.golden_truth)
        target_flat = self._flatten_dict(target_data)

        missing = set(golden_flat.keys()) - set(target_flat.keys())

        # Filter out ignored keys
        lang_code = target_file.parent.name.replace('-', '_')
        ignore_set = self.ignore_patterns.get(lang_code, set())
        return missing - ignore_set

    def find_untranslated_entries(self, target_file: Path) -> Set[str]:
        """Find entries that appear to be untranslated (identical to en-GB)."""
        target_data = self._load_json(target_file)

        golden_flat = self._flatten_dict(self.golden_truth)
        target_flat = self._flatten_dict(target_data)

        lang_code = target_file.parent.name.replace('-', '_')
        ignore_set = self.ignore_patterns.get(lang_code, set())

        untranslated = set()
        for key in target_flat:
            if key in golden_flat:
                target_value = target_flat[key]
                golden_value = golden_flat[key]

                # Check if marked as [UNTRANSLATED] or identical to en-GB
                if (isinstance(target_value, str) and target_value.startswith("[UNTRANSLATED]")) or \
                   (golden_value == target_value and key not in ignore_set and not self._is_expected_identical(key, golden_value)):
                    untranslated.add(key)

        return untranslated

    def _is_expected_identical(self, key: str, value: str) -> bool:
        """Check if a key-value pair is expected to be identical across languages."""
        # Keys that should be identical across languages
        identical_patterns = [
            'language.direction',
            'true', 'false',
            'unknown'
        ]

        # Values that are often identical (numbers, symbols, etc.)
        if value.strip() in ['ltr', 'rtl', 'True', 'False']:
            return True

        # Check for patterns
        for pattern in identical_patterns:
            if pattern in key.lower():
                return True

        return False

    def find_extra_translations(self, target_file: Path) -> Set[str]:
        """Find keys that exist in target file but not in en-GB."""
        target_data = self._load_json(target_file)

        golden_flat = self._flatten_dict(self.golden_truth)
        target_flat = self._flatten_dict(target_data)

        return set(target_flat.keys()) - set(golden_flat.keys())

    def analyze_file(self, target_file: Path) -> Dict:
        """Complete analysis of a single translation file."""
        lang_code = target_file.parent.name

        missing = self.find_missing_translations(target_file)
        untranslated = self.find_untranslated_entries(target_file)
        extra = self.find_extra_translations(target_file)

        target_data = self._load_json(target_file)
        golden_flat = self._flatten_dict(self.golden_truth)
        target_flat = self._flatten_dict(target_data)

        # Calculate completion rate excluding ignored keys.
        # Ignore-file sections use underscores (e.g. de_DE), so look up with
        # a separate key instead of overwriting the reported language code.
        ignore_key = lang_code.replace('-', '_')
        ignore_set = self.ignore_patterns.get(ignore_key, set())

        relevant_keys = set(golden_flat.keys()) - ignore_set
        total_keys = len(relevant_keys)

        # Count keys that exist and are properly translated (not [UNTRANSLATED])
        properly_translated = 0
        for key in relevant_keys:
            if key in target_flat:
                value = target_flat[key]
                if not (isinstance(value, str) and value.startswith("[UNTRANSLATED]")):
                    if key not in untranslated:  # Not identical to en-GB (unless expected)
                        properly_translated += 1

        completion_rate = (properly_translated / total_keys) * 100 if total_keys > 0 else 0

        return {
            'language': lang_code,
            'file': target_file,
            'missing_count': len(missing),
            'missing_keys': sorted(missing),
            'untranslated_count': len(untranslated),
            'untranslated_keys': sorted(untranslated),
            'extra_count': len(extra),
            'extra_keys': sorted(extra),
            'total_keys': total_keys,
            'completion_rate': completion_rate
        }

    def analyze_all_files(self) -> List[Dict]:
        """Analyze all translation files."""
        results = []
        for file_path in self.get_all_language_files():
            results.append(self.analyze_file(file_path))
        return sorted(results, key=lambda x: x['language'])


def main():
    parser = argparse.ArgumentParser(description='Analyze translation files against en-GB golden truth')
    parser.add_argument('--locales-dir', default='frontend/public/locales',
                        help='Path to locales directory')
    parser.add_argument('--ignore-file', default='scripts/ignore_translation.toml',
                        help='Path to ignore patterns TOML file')
    parser.add_argument('--language', help='Analyze specific language only')
    parser.add_argument('--missing-only', action='store_true',
                        help='Show only missing translations')
    parser.add_argument('--untranslated-only', action='store_true',
                        help='Show only untranslated entries')
    parser.add_argument('--summary', action='store_true',
                        help='Show summary statistics only')
    parser.add_argument('--format', choices=['text', 'json'], default='text',
                        help='Output format')

    args = parser.parse_args()

    analyzer = TranslationAnalyzer(args.locales_dir, args.ignore_file)

    if args.language:
        target_file = Path(args.locales_dir) / args.language / "translation.json"
        if not target_file.exists():
            print(f"Error: Translation file not found for language: {args.language}")
            sys.exit(1)
        results = [analyzer.analyze_file(target_file)]
    else:
        results = analyzer.analyze_all_files()

    if args.format == 'json':
        print(json.dumps(results, indent=2, default=str))
        return

    # Text format output
    for result in results:
        lang = result['language']
        print(f"\n{'='*60}")
        print(f"Language: {lang}")
        print(f"File: {result['file']}")
        print(f"Completion Rate: {result['completion_rate']:.1f}%")
        print(f"Total Keys in en-GB: {result['total_keys']}")

        if not args.summary:
            if not args.untranslated_only:
                print(f"\nMissing Translations ({result['missing_count']}):")
                for key in result['missing_keys'][:10]:  # Show first 10
                    print(f"  - {key}")
                if len(result['missing_keys']) > 10:
                    print(f"  ... and {len(result['missing_keys']) - 10} more")

            if not args.missing_only:
                print(f"\nUntranslated Entries ({result['untranslated_count']}):")
                for key in result['untranslated_keys'][:10]:  # Show first 10
                    print(f"  - {key}")
                if len(result['untranslated_keys']) > 10:
                    print(f"  ... and {len(result['untranslated_keys']) - 10} more")

            if result['extra_count'] > 0:
                print(f"\nExtra Keys Not in en-GB ({result['extra_count']}):")
                for key in result['extra_keys'][:5]:
                    print(f"  - {key}")
                if len(result['extra_keys']) > 5:
                    print(f"  ... and {len(result['extra_keys']) - 5} more")

    print(f"\n{'='*60}")
    print("SUMMARY")
    print(f"{'='*60}")
    avg_completion = sum(r['completion_rate'] for r in results) / len(results) if results else 0
    print(f"Average Completion Rate: {avg_completion:.1f}%")
    print(f"Languages Analyzed: {len(results)}")

    # Top languages by completion
    sorted_by_completion = sorted(results, key=lambda x: x['completion_rate'], reverse=True)
    print("\nTop 5 Most Complete Languages:")
    for result in sorted_by_completion[:5]:
        print(f"  {result['language']}: {result['completion_rate']:.1f}%")

    print("\nBottom 5 Languages Needing Attention:")
    for result in sorted_by_completion[-5:]:
        print(f"  {result['language']}: {result['completion_rate']:.1f}% ({result['missing_count']} missing, {result['untranslated_count']} untranslated)")


if __name__ == "__main__":
    main()
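A usage sketch (language code and report file name are illustrative):

    python3 scripts/translations/translation_analyzer.py --language de-DE --summary
    python3 scripts/translations/translation_analyzer.py --format json > translation_report.json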
371
scripts/translations/translation_merger.py
Normal file
@@ -0,0 +1,371 @@
#!/usr/bin/env python3
"""
Translation Merger for Stirling PDF Frontend
Merges missing translations from en-GB into target language files.
Useful for AI-assisted translation workflows.
"""

import json
import sys
from pathlib import Path
from typing import Dict, List, Set, Any
import argparse
import shutil
from datetime import datetime

try:
    import tomllib  # Python 3.11+
except ImportError:
    try:
        import toml as tomllib_fallback
        tomllib = None
    except ImportError:
        tomllib = None
        tomllib_fallback = None


class TranslationMerger:
    def __init__(self, locales_dir: str = "frontend/public/locales", ignore_file: str = "scripts/ignore_translation.toml"):
        self.locales_dir = Path(locales_dir)
        self.golden_truth_file = self.locales_dir / "en-GB" / "translation.json"
        self.golden_truth = self._load_json(self.golden_truth_file)
        self.ignore_file = Path(ignore_file)
        self.ignore_patterns = self._load_ignore_patterns()

    def _load_json(self, file_path: Path) -> Dict:
        """Load JSON file with error handling."""
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                return json.load(f)
        except FileNotFoundError:
            print(f"Error: File not found: {file_path}")
            sys.exit(1)
        except json.JSONDecodeError as e:
            print(f"Error: Invalid JSON in {file_path}: {e}")
            sys.exit(1)

    def _save_json(self, data: Dict, file_path: Path, backup: bool = True) -> None:
        """Save JSON file with backup option."""
        if backup and file_path.exists():
            backup_path = file_path.with_suffix(f'.backup.{datetime.now().strftime("%Y%m%d_%H%M%S")}.json')
            shutil.copy2(file_path, backup_path)
            print(f"Backup created: {backup_path}")

        with open(file_path, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=2, ensure_ascii=False)

    def _load_ignore_patterns(self) -> Dict[str, Set[str]]:
        """Load ignore patterns from TOML file."""
        if not self.ignore_file.exists():
            return {}

        try:
            # Simple parser for ignore patterns
            ignore_data = {}
            current_section = None

            with open(self.ignore_file, 'r', encoding='utf-8') as f:
                for line in f:
                    line = line.strip()
                    if not line or line.startswith('#'):
                        continue

                    if line.startswith('[') and line.endswith(']'):
                        current_section = line[1:-1]
                        ignore_data[current_section] = set()
                    elif line.startswith("'") and current_section:
                        # Extract quoted items
                        item = line.strip("',")
                        if item:
                            ignore_data[current_section].add(item)

            return ignore_data
        except Exception as e:
            print(f"Warning: Could not load ignore file {self.ignore_file}: {e}")
            return {}

    def _get_nested_value(self, data: Dict, key_path: str) -> Any:
        """Get value from nested dict using dot notation."""
        keys = key_path.split('.')
        current = data
        for key in keys:
            if isinstance(current, dict) and key in current:
                current = current[key]
            else:
                return None
        return current

    def _set_nested_value(self, data: Dict, key_path: str, value: Any) -> None:
        """Set value in nested dict using dot notation."""
        keys = key_path.split('.')
        current = data
        for key in keys[:-1]:
            if key not in current:
                current[key] = {}
            elif not isinstance(current[key], dict):
                # If the current value is not a dict, we can't nest into it.
                # This handles cases where a key exists as a string but we need to make it a dict.
                print(f"Warning: Converting non-dict value at '{key}' to dict to allow nesting")
                current[key] = {}
            current = current[key]
        current[keys[-1]] = value

    def _flatten_dict(self, d: Dict, parent_key: str = '', separator: str = '.') -> Dict[str, Any]:
        """Flatten nested dictionary into dot-notation keys."""
        items = []
        for k, v in d.items():
            new_key = f"{parent_key}{separator}{k}" if parent_key else k
            if isinstance(v, dict):
                items.extend(self._flatten_dict(v, new_key, separator).items())
            else:
                items.append((new_key, v))
        return dict(items)

    def get_missing_keys(self, target_file: Path) -> List[str]:
        """Get list of missing keys in target file."""
        lang_code = target_file.parent.name.replace('-', '_')
        ignore_set = self.ignore_patterns.get(lang_code, set())

        if not target_file.exists():
            golden_keys = set(self._flatten_dict(self.golden_truth).keys())
            return sorted(golden_keys - ignore_set)

        target_data = self._load_json(target_file)
        golden_flat = self._flatten_dict(self.golden_truth)
        target_flat = self._flatten_dict(target_data)

        missing = set(golden_flat.keys()) - set(target_flat.keys())
        return sorted(missing - ignore_set)

    def add_missing_translations(self, target_file: Path, keys_to_add: List[str] = None,
                                 mark_untranslated: bool = True) -> Dict:
        """Add missing translations from en-GB to target file."""
        if not target_file.exists():
            target_data = {}
        else:
            target_data = self._load_json(target_file)

        golden_flat = self._flatten_dict(self.golden_truth)
        missing_keys = keys_to_add or self.get_missing_keys(target_file)

        added_count = 0
        for key in missing_keys:
            if key in golden_flat:
                value = golden_flat[key]
                if mark_untranslated and isinstance(value, str):
                    # Mark as untranslated for AI to translate later
                    value = f"[UNTRANSLATED] {value}"

                self._set_nested_value(target_data, key, value)
                added_count += 1

        return {
            'added_count': added_count,
            'missing_keys': missing_keys,
            'data': target_data
        }

    def extract_untranslated_entries(self, target_file: Path, output_file: Path = None) -> Dict:
        """Extract entries marked as untranslated or identical to en-GB for AI translation."""
        if not target_file.exists():
            print(f"Error: Target file does not exist: {target_file}")
            return {}

        target_data = self._load_json(target_file)
        golden_flat = self._flatten_dict(self.golden_truth)
        target_flat = self._flatten_dict(target_data)

        untranslated_entries = {}

        for key, value in target_flat.items():
            if key in golden_flat:
                golden_value = golden_flat[key]

                # Check if marked as untranslated
                if isinstance(value, str) and value.startswith("[UNTRANSLATED]"):
                    untranslated_entries[key] = {
                        'original': golden_value,
                        'current': value,
                        'reason': 'marked_untranslated'
                    }
                # Check if identical to golden (and should be translated)
                elif value == golden_value and not self._is_expected_identical(key, value):
                    untranslated_entries[key] = {
                        'original': golden_value,
                        'current': value,
                        'reason': 'identical_to_english'
                    }

        if output_file:
            with open(output_file, 'w', encoding='utf-8') as f:
                json.dump(untranslated_entries, f, indent=2, ensure_ascii=False)

        return untranslated_entries

    def _is_expected_identical(self, key: str, value: str) -> bool:
        """Check if a key-value pair is expected to be identical across languages."""
        identical_patterns = [
            'language.direction',
        ]

        if str(value).strip() in ['ltr', 'rtl', 'True', 'False', 'true', 'false']:
            return True

        for pattern in identical_patterns:
            if pattern in key.lower():
                return True

        return False

    def apply_translations(self, target_file: Path, translations: Dict[str, str],
                           backup: bool = True) -> Dict:
        """Apply provided translations to target file."""
        if not target_file.exists():
            print(f"Error: Target file does not exist: {target_file}")
            return {'success': False, 'error': 'File not found'}

        target_data = self._load_json(target_file)
        applied_count = 0
        errors = []

        for key, translation in translations.items():
            try:
                # Remove [UNTRANSLATED] marker if present
                if translation.startswith("[UNTRANSLATED]"):
                    translation = translation.replace("[UNTRANSLATED]", "").strip()

                self._set_nested_value(target_data, key, translation)
                applied_count += 1
            except Exception as e:
                errors.append(f"Error setting {key}: {e}")

        if applied_count > 0:
            self._save_json(target_data, target_file, backup)

        return {
            'success': True,
            'applied_count': applied_count,
            'errors': errors,
            'data': target_data
        }

    def create_translation_template(self, target_file: Path, output_file: Path) -> None:
        """Create a template file for AI translation with context."""
        untranslated = self.extract_untranslated_entries(target_file)

        template = {
            'metadata': {
                'source_language': 'en-GB',
                'target_language': target_file.parent.name,
                'total_entries': len(untranslated),
                'created_at': datetime.now().isoformat(),
                'instructions': 'Translate the "original" values to the target language. Keep the same keys.'
            },
            'translations': {}
        }

        for key, entry in untranslated.items():
            template['translations'][key] = {
                'original': entry['original'],
                'translated': '',  # AI should fill this
                'context': self._get_context_for_key(key),
                'reason': entry['reason']
            }

        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(template, f, indent=2, ensure_ascii=False)

        print(f"Translation template created: {output_file}")
        print(f"Contains {len(untranslated)} entries to translate")

    def _get_context_for_key(self, key: str) -> str:
        """Get context information for a translation key."""
        parts = key.split('.')
        if len(parts) >= 2:
            return f"Section: {parts[0]}, Property: {parts[-1]}"
        return f"Property: {parts[-1]}"


def main():
    parser = argparse.ArgumentParser(description='Merge and manage translation files')
    parser.add_argument('--locales-dir', default='frontend/public/locales',
                        help='Path to locales directory')
    parser.add_argument('--ignore-file', default='scripts/ignore_translation.toml',
                        help='Path to ignore patterns TOML file')
    parser.add_argument('language', help='Target language code (e.g., fr-FR)')

    subparsers = parser.add_subparsers(dest='command', help='Available commands')

    # Add missing command
    add_parser = subparsers.add_parser('add-missing', help='Add missing translations from en-GB')
    add_parser.add_argument('--no-backup', action='store_true', help='Skip backup creation')
    add_parser.add_argument('--mark-untranslated', action='store_true', default=True,
                            help='Mark added translations as [UNTRANSLATED]')

    # Extract untranslated command
    extract_parser = subparsers.add_parser('extract-untranslated', help='Extract untranslated entries')
    extract_parser.add_argument('--output', help='Output file path')

    # Create template command
    template_parser = subparsers.add_parser('create-template', help='Create AI translation template')
    template_parser.add_argument('--output', required=True, help='Output template file path')

    # Apply translations command
    apply_parser = subparsers.add_parser('apply-translations', help='Apply translations from JSON file')
    apply_parser.add_argument('--translations-file', required=True, help='JSON file with translations')
    apply_parser.add_argument('--no-backup', action='store_true', help='Skip backup creation')

    args = parser.parse_args()

    if not args.command:
        parser.print_help()
        return

    merger = TranslationMerger(args.locales_dir, args.ignore_file)
    target_file = Path(args.locales_dir) / args.language / "translation.json"

    if args.command == 'add-missing':
        print(f"Adding missing translations to {args.language}...")
        result = merger.add_missing_translations(
            target_file,
            mark_untranslated=args.mark_untranslated
        )

        merger._save_json(result['data'], target_file, backup=not args.no_backup)
        print(f"Added {result['added_count']} missing translations")

    elif args.command == 'extract-untranslated':
        output_file = Path(args.output) if args.output else target_file.with_suffix('.untranslated.json')
        untranslated = merger.extract_untranslated_entries(target_file, output_file)
        print(f"Extracted {len(untranslated)} untranslated entries to {output_file}")

    elif args.command == 'create-template':
        output_file = Path(args.output)
        merger.create_translation_template(target_file, output_file)

    elif args.command == 'apply-translations':
        with open(args.translations_file, 'r', encoding='utf-8') as f:
            translations_data = json.load(f)

        # Extract translations from template format or simple dict
        if 'translations' in translations_data:
            translations = {k: v['translated'] for k, v in translations_data['translations'].items()
                            if v.get('translated')}
        else:
            translations = translations_data

        result = merger.apply_translations(target_file, translations, backup=not args.no_backup)

        if result['success']:
            print(f"Applied {result['applied_count']} translations")
            if result['errors']:
                print(f"Errors: {len(result['errors'])}")
                for error in result['errors'][:5]:
                    print(f"  - {error}")
        else:
            print(f"Failed: {result.get('error', 'Unknown error')}")


if __name__ == "__main__":
    main()
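A workflow sketch tying the subcommands together (file names illustrative; the translation step itself happens outside this script):

    python3 scripts/translations/translation_merger.py fr-FR add-missing
    python3 scripts/translations/translation_merger.py fr-FR create-template --output fr_FR_template.json
    python3 scripts/translations/translation_merger.py fr-FR apply-translations --translations-file fr_FR_template.json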
229
scripts/translations/validate_json_structure.py
Normal file
@@ -0,0 +1,229 @@
#!/usr/bin/env python3
|
||||
"""
|
||||
Validate JSON structure and formatting of translation files.
|
||||
|
||||
Checks for:
|
||||
- Valid JSON syntax
|
||||
- Consistent key structure with en-GB
|
||||
- Missing keys
|
||||
- Extra keys not in en-GB
|
||||
- Malformed entries
|
||||
|
||||
Usage:
|
||||
python scripts/translations/validate_json_structure.py [--language LANG]
|
||||
"""
|
||||
|
||||
import json
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from typing import Dict, List, Set
|
||||
import argparse
|
||||
|
||||
|
||||
def get_all_keys(d: dict, parent_key: str = '', sep: str = '.') -> Set[str]:
|
||||
"""Get all keys from nested dict as dot-notation paths."""
|
||||
keys = set()
|
||||
for k, v in d.items():
|
||||
new_key = f"{parent_key}{sep}{k}" if parent_key else k
|
||||
keys.add(new_key)
|
||||
if isinstance(v, dict):
|
||||
keys.update(get_all_keys(v, new_key, sep=sep))
|
||||
return keys
|
||||
|
||||
|
||||
def validate_json_file(file_path: Path) -> tuple[bool, str]:
|
||||
"""Validate that a file contains valid JSON."""
|
||||
try:
|
||||
with open(file_path, 'r', encoding='utf-8') as f:
|
||||
json.load(f)
|
||||
return True, "Valid JSON"
|
||||
except json.JSONDecodeError as e:
|
||||
return False, f"Invalid JSON at line {e.lineno}, column {e.colno}: {e.msg}"
|
||||
except Exception as e:
|
||||
return False, f"Error reading file: {str(e)}"
|
||||
|
||||
|
||||
def validate_structure(
|
||||
en_gb_keys: Set[str],
|
||||
lang_keys: Set[str],
|
||||
lang_code: str
|
||||
) -> Dict:
|
||||
"""Compare structure between en-GB and target language."""
|
||||
missing_keys = en_gb_keys - lang_keys
|
||||
extra_keys = lang_keys - en_gb_keys
|
||||
|
||||
return {
|
||||
'language': lang_code,
|
||||
'missing_keys': sorted(missing_keys),
|
||||
'extra_keys': sorted(extra_keys),
|
||||
'total_keys': len(lang_keys),
|
||||
'expected_keys': len(en_gb_keys),
|
||||
'missing_count': len(missing_keys),
|
||||
'extra_count': len(extra_keys)
|
||||
}
|
||||
|
||||
|
||||
def print_validation_result(result: Dict, verbose: bool = False):
|
||||
"""Print validation results in readable format."""
|
||||
lang = result['language']
|
||||
|
||||
print(f"\n{'='*100}")
|
||||
print(f"Language: {lang}")
|
||||
print(f"{'='*100}")
|
||||
print(f" Total keys: {result['total_keys']}")
|
||||
print(f" Expected keys (en-GB): {result['expected_keys']}")
|
||||
print(f" Missing keys: {result['missing_count']}")
|
||||
print(f" Extra keys: {result['extra_count']}")
|
||||
|
||||
if result['missing_count'] == 0 and result['extra_count'] == 0:
|
||||
print(f" ✅ Structure matches en-GB perfectly!")
|
||||
else:
|
||||
if result['missing_count'] > 0:
|
||||
print(f"\n ⚠️ Missing {result['missing_count']} key(s):")
|
||||
if verbose or result['missing_count'] <= 20:
|
||||
for key in result['missing_keys'][:50]:
|
||||
print(f" - {key}")
|
||||
if result['missing_count'] > 50:
|
||||
print(f" ... and {result['missing_count'] - 50} more")
|
||||
else:
|
||||
print(f" (use --verbose to see all)")
|
||||
|
||||
if result['extra_count'] > 0:
|
||||
print(f"\n ⚠️ Extra {result['extra_count']} key(s) not in en-GB:")
|
||||
if verbose or result['extra_count'] <= 20:
|
||||
for key in result['extra_keys'][:50]:
|
||||
print(f" - {key}")
|
||||
if result['extra_count'] > 50:
|
||||
print(f" ... and {result['extra_count'] - 50} more")
|
||||
else:
|
||||
print(f" (use --verbose to see all)")
|
||||
|
||||
print("-" * 100)
|
||||
|
||||
|
||||
def main():
    parser = argparse.ArgumentParser(
        description='Validate translation JSON structure'
    )
    parser.add_argument(
        '--language',
        help='Specific language code to validate (e.g., es-ES)',
        default=None
    )
    parser.add_argument(
        '--verbose', '-v',
        action='store_true',
        help='Show all missing/extra keys'
    )
    parser.add_argument(
        '--json',
        action='store_true',
        help='Output results as JSON'
    )

    args = parser.parse_args()

    # Define paths
    locales_dir = Path('frontend/public/locales')
    en_gb_path = locales_dir / 'en-GB' / 'translation.json'

    if not en_gb_path.exists():
        print(f"❌ Error: en-GB translation file not found at {en_gb_path}")
        sys.exit(1)

    # Validate en-GB itself
    is_valid, message = validate_json_file(en_gb_path)
    if not is_valid:
        print(f"❌ Error in en-GB file: {message}")
        sys.exit(1)

    # Load en-GB structure
    with open(en_gb_path, 'r', encoding='utf-8') as f:
        en_gb = json.load(f)

    en_gb_keys = get_all_keys(en_gb)

    # Get list of languages to validate
    if args.language:
        languages = [args.language]
    else:
        languages = [
            d.name for d in locales_dir.iterdir()
            if d.is_dir() and d.name != 'en-GB' and (d / 'translation.json').exists()
        ]

    results = []
    json_errors = []

    # Validate each language
    for lang_code in sorted(languages):
        lang_path = locales_dir / lang_code / 'translation.json'

        if not lang_path.exists():
            print(f"⚠️  Warning: {lang_code}/translation.json not found, skipping")
            continue

        # First check if JSON is valid
        is_valid, message = validate_json_file(lang_path)
        if not is_valid:
            json_errors.append({
                'language': lang_code,
                'file': str(lang_path),
                'error': message
            })
            continue

        # Load and compare structure
        with open(lang_path, 'r', encoding='utf-8') as f:
            lang_data = json.load(f)

        lang_keys = get_all_keys(lang_data)
        result = validate_structure(en_gb_keys, lang_keys, lang_code)
        results.append(result)

    # Output results
    if args.json:
        output = {
            'json_errors': json_errors,
            'structure_validation': results
        }
        print(json.dumps(output, indent=2, ensure_ascii=False))
    else:
        # Print JSON errors first
        if json_errors:
            print("\n❌ JSON Syntax Errors:")
            print("=" * 100)
            for error in json_errors:
                print(f"\nLanguage: {error['language']}")
                print(f"File: {error['file']}")
                print(f"Error: {error['error']}")
            print("\n")

        # Print structure validation results
        if results:
            print("\n📊 Structure Validation Summary:")
            print(f"  Languages validated: {len(results)}")

            perfect = sum(1 for r in results if r['missing_count'] == 0 and r['extra_count'] == 0)
            print(f"  Perfect matches: {perfect}/{len(results)}")

            total_missing = sum(r['missing_count'] for r in results)
            total_extra = sum(r['extra_count'] for r in results)
            print(f"  Total missing keys: {total_missing}")
            print(f"  Total extra keys: {total_extra}")

            for result in results:
                print_validation_result(result, verbose=args.verbose)

            if not json_errors and perfect == len(results):
                print("\n✅ All translations have perfect structure!")

    # Exit with error code if issues found
    has_issues = len(json_errors) > 0 or any(
        r['missing_count'] > 0 or r['extra_count'] > 0 for r in results
    )
    sys.exit(1 if has_issues else 0)


if __name__ == '__main__':
    main()
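

# Example invocations (a sketch: flags mirror the argparse definitions above,
# the script path is assumed, and the hard-coded 'frontend/public/locales'
# path means these must run from the repository root):
#
#   python scripts/translations/validate_structure.py
#   python scripts/translations/validate_structure.py --language es-ES --verbose
#   python scripts/translations/validate_structure.py --json > structure_report.json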
189
scripts/translations/validate_placeholders.py
Normal file
@@ -0,0 +1,189 @@
#!/usr/bin/env python3
"""
Validate that translation files have the same placeholders as en-GB (source of truth).

Usage:
    python scripts/translations/validate_placeholders.py [--language LANG] [--verbose] [--json]

    --language: Validate a specific language (e.g., es-ES, de-DE)
    --verbose, -v: Show full text samples for each issue
    --json: Output results as JSON
"""

import json
import re
import sys
from pathlib import Path
from typing import Dict, List, Set
import argparse


def find_placeholders(text: str) -> Set[str]:
    """Find all placeholders in text like {n}, {{var}}, {0}, etc."""
    if not isinstance(text, str):
        return set()
    return set(re.findall(r'\{\{?[^}]+\}\}?', text))


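# For example (behaviour follows the regex above; non-strings yield an empty set):
#
#   find_placeholders("Page {n} of {total}")  ->  {'{n}', '{total}'}
#   find_placeholders("Hello {{username}}!")  ->  {'{{username}}'}
#   find_placeholders(42)                     ->  set()

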
def flatten_dict(d: dict, parent_key: str = '', sep: str = '.') -> Dict[str, str]:
    """Flatten nested dict to dot-notation keys."""
    items = []
    for k, v in d.items():
        new_key = f"{parent_key}{sep}{k}" if parent_key else k
        if isinstance(v, dict):
            items.extend(flatten_dict(v, new_key, sep=sep).items())
        else:
            items.append((new_key, v))
    return dict(items)


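# For example:
#
#   flatten_dict({'home': {'title': 'Home', 'nav': {'back': 'Back'}}})
#   ->  {'home.title': 'Home', 'home.nav.back': 'Back'}

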
def validate_language(
    en_gb_flat: Dict[str, str],
    lang_flat: Dict[str, str],
    lang_code: str
) -> List[Dict]:
    """Validate placeholders for a language against en-GB."""
    issues = []

    for key in en_gb_flat:
        if key not in lang_flat:
            continue

        en_placeholders = find_placeholders(en_gb_flat[key])
        lang_placeholders = find_placeholders(lang_flat[key])

        if en_placeholders != lang_placeholders:
            missing = en_placeholders - lang_placeholders
            extra = lang_placeholders - en_placeholders

            # Store sorted lists rather than raw sets so the issues stay
            # JSON-serializable for the --json output path in main()
            issue = {
                'language': lang_code,
                'key': key,
                'missing': sorted(missing),
                'extra': sorted(extra),
                'en_text': en_gb_flat[key],
                'lang_text': lang_flat[key]
            }
            issues.append(issue)

    return issues


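# A sample issue entry as built above (all values illustrative):
#
#   {
#       'language': 'de-DE',
#       'key': 'merge.pageCount',
#       'missing': ['{n}'],
#       'extra': ['{{n}}'],
#       'en_text': 'Merged {n} pages',
#       'lang_text': '{{n}} Seiten zusammengeführt'
#   }

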
def print_issues(issues: List[Dict], verbose: bool = False):
    """Print validation issues in a readable format."""
    if not issues:
        print("✅ No placeholder validation issues found!")
        return

    print(f"❌ Found {len(issues)} placeholder validation issue(s):\n")
    print("=" * 100)

    for i, issue in enumerate(issues, 1):
        print(f"\n{i}. Language: {issue['language']}")
        print(f"   Key: {issue['key']}")

        if issue['missing']:
            print(f"   ⚠️  MISSING placeholders: {issue['missing']}")
        if issue['extra']:
            print(f"   ⚠️  EXTRA placeholders: {issue['extra']}")

        if verbose:
            print(f"   EN-GB: {issue['en_text'][:150]}")
            print(f"   {issue['language']}: {issue['lang_text'][:150]}")

    print("-" * 100)


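# Output sketch for a single issue (format follows the prints above; the
# language, key, and placeholder values are illustrative, and the 100-char
# separator rules are truncated here):
#
#   ❌ Found 1 placeholder validation issue(s):
#   ================ ... ================
#
#   1. Language: de-DE
#      Key: merge.pageCount
#      ⚠️  MISSING placeholders: ['{n}']
#   ---------------- ... ----------------

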
def main():
    parser = argparse.ArgumentParser(
        description='Validate translation placeholder consistency'
    )
    parser.add_argument(
        '--language',
        help='Specific language code to validate (e.g., es-ES)',
        default=None
    )
    parser.add_argument(
        '--verbose', '-v',
        action='store_true',
        help='Show full text samples for each issue'
    )
    parser.add_argument(
        '--json',
        action='store_true',
        help='Output results as JSON'
    )

    args = parser.parse_args()

    # Define paths
    locales_dir = Path('frontend/public/locales')
    en_gb_path = locales_dir / 'en-GB' / 'translation.json'

    if not en_gb_path.exists():
        print(f"❌ Error: en-GB translation file not found at {en_gb_path}")
        sys.exit(1)

    # Load en-GB (source of truth)
    with open(en_gb_path, 'r', encoding='utf-8') as f:
        en_gb = json.load(f)

    en_gb_flat = flatten_dict(en_gb)

    # Get list of languages to validate
    if args.language:
        languages = [args.language]
    else:
        # Validate all languages except en-GB
        languages = [
            d.name for d in locales_dir.iterdir()
            if d.is_dir() and d.name != 'en-GB' and (d / 'translation.json').exists()
        ]

    all_issues = []

    # Validate each language
    for lang_code in sorted(languages):
        lang_path = locales_dir / lang_code / 'translation.json'

        if not lang_path.exists():
            print(f"⚠️  Warning: {lang_code}/translation.json not found, skipping")
            continue

        with open(lang_path, 'r', encoding='utf-8') as f:
            lang_data = json.load(f)

        lang_flat = flatten_dict(lang_data)
        issues = validate_language(en_gb_flat, lang_flat, lang_code)
        all_issues.extend(issues)

    # Output results
    if args.json:
        print(json.dumps(all_issues, indent=2, ensure_ascii=False))
    else:
        if all_issues:
            # Group by language
            by_language = {}
            for issue in all_issues:
                lang = issue['language']
                if lang not in by_language:
                    by_language[lang] = []
                by_language[lang].append(issue)

            print("📊 Validation Summary:")
            print(f"  Total issues: {len(all_issues)}")
            print(f"  Languages with issues: {len(by_language)}\n")

            for lang in sorted(by_language.keys()):
                print(f"\n{'='*100}")
                print(f"Language: {lang} ({len(by_language[lang])} issue(s))")
                print(f"{'='*100}")
                print_issues(by_language[lang], verbose=args.verbose)
        else:
            print("✅ All translations have correct placeholders!")

    # Exit with error code if issues found
    sys.exit(1 if all_issues else 0)


if __name__ == '__main__':
    main()
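

# Example invocations (run from the repository root so the relative
# 'frontend/public/locales' path resolves; the exit code is non-zero when any
# placeholder mismatch is found, which makes this suitable for CI):
#
#   python scripts/translations/validate_placeholders.py
#   python scripts/translations/validate_placeholders.py --language de-DE --verbose
#   python scripts/translations/validate_placeholders.py --json > placeholder_report.json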