Merge remote-tracking branch 'origin/V2' into mainToV2

This commit is contained in:
Anthony Stirling
2025-10-12 20:45:25 +01:00
979 changed files with 188275 additions and 2719 deletions

View File

@@ -0,0 +1,113 @@
#!/usr/bin/env python3
"""
Convert Java .properties files to JSON for react-i18next
Preserves hierarchical structure and handles special cases
"""
import os
import json
import re
from pathlib import Path
def properties_to_dict(file_path):
    """Convert a Java .properties file to a nested dictionary.

    Each ``key=value`` line is split on the first ``=``; the dotted key is
    expanded into nested dictionaries via ``set_nested_value``. Lines without
    an ``=`` are ignored. Values are stored as raw text (escape sequences are
    not decoded).

    Parameters:
        file_path: Path of the UTF-8 encoded .properties file to read.

    Returns:
        dict: Nested dictionary built from the dotted keys.
    """
    result = {}
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            # Skip empty lines and comments; the .properties format allows
            # both '#' and '!' as comment markers.
            if not line or line.startswith(('#', '!')):
                continue
            # Only key=value pairs are handled
            if '=' not in line:
                continue
            key, value = line.split('=', 1)
            key = key.strip()
            value = value.strip()
            # A trailing backslash continues the value on the next line, but
            # only when the run of trailing backslashes is odd: an even run is
            # a sequence of escaped literal backslashes, not a continuation.
            while (len(value) - len(value.rstrip('\\'))) % 2 == 1:
                # next() shares the file iterator with the for-loop, so the
                # continuation line is consumed here; '' at end of file.
                next_line = next(f, '').strip()
                value = value[:-1] + next_line
            # Create nested structure from dot notation
            set_nested_value(result, key, value)
    return result
def set_nested_value(dictionary, key_path, value):
    """Store ``value`` in ``dictionary`` at the location named by a dotted key.

    Intermediate dictionaries are created as needed. When a path segment (or
    the final key) is already occupied by the other kind of entry, the plain
    value is kept next to the children under the reserved key ``"_value"``.

    Parameters:
        dictionary (dict): Dictionary that is modified in place.
        key_path (str): Dot-separated key, e.g. ``"home.title"``.
        value: Value to store.
    """
    *parents, leaf = key_path.split('.')
    node = dictionary
    for part in parents:
        if part in node:
            # A plain value already lives here: wrap it so children can be added
            if not isinstance(node[part], dict):
                node[part] = {"_value": node[part]}
        else:
            node[part] = {}
        node = node[part]

    existing = node.get(leaf)
    if isinstance(existing, dict):
        # The leaf already has children; keep them and add the value alongside
        existing["_value"] = value
    else:
        node[leaf] = value
def convert_all_properties():
    """Convert every messages*.properties resource file to a translation.json.

    Input files are read from ``src/main/resources`` and output is written to
    ``frontend/public/locales/<locale>/translation.json``, both relative to
    the parent of the directory containing this script. ``messages.properties``
    maps to the locale ``en``; ``messages_xx_YY.properties`` maps to ``xx-YY``.
    Progress is printed to stdout.
    """
    repo_root = Path(__file__).parent.parent
    source_dir = repo_root / 'src' / 'main' / 'resources'
    output_dir = repo_root / 'frontend' / 'public' / 'locales'
    output_dir.mkdir(parents=True, exist_ok=True)

    converted_count = 0
    for props_file in list(source_dir.glob('messages*.properties')):
        filename = props_file.name

        # Work out the locale encoded in the file name
        if filename == 'messages.properties':
            locale = 'en'  # Default locale
        else:
            matched = re.match(r'messages_(.+)\.properties', filename)
            if matched is None:
                # Matches the glob but not the messages_<locale> naming scheme
                continue
            # Java locale format -> hyphenated form (en_US -> en-US)
            locale = matched.group(1).replace('_', '-')

        print(f"Converting {filename} -> {locale}.json")
        data = properties_to_dict(props_file)

        # One directory per locale; translation.json is the default
        # react-i18next namespace file
        target_dir = output_dir / locale
        target_dir.mkdir(exist_ok=True)
        with open(target_dir / 'translation.json', 'w', encoding='utf-8') as handle:
            json.dump(data, handle, indent=2, ensure_ascii=False)

        converted_count += 1

    print(f"\nConverted {converted_count} language files to {output_dir}")
    print("Languages available:", [d.name for d in output_dir.iterdir() if d.is_dir()])
# Run the conversion only when the file is executed as a script, not on import.
if __name__ == '__main__':
    convert_all_properties()

View File

@@ -0,0 +1,204 @@
"""A script to update language progress status in README.md based on
JSON translation file comparison.
This script compares the default translation JSON file with others in the locales directory to
determine language progress.
It then updates README.md based on provided progress list.
Author: Ludy87
Example:
To use this script, run it from the repository root (README.md and all other paths are resolved relative to the current working directory):
$ python counter_translation_v2.py
""" # noqa: D205
import glob
import os
import re
import json
import tomlkit
import tomlkit.toml_file
def convert_to_multiline(data: tomlkit.TOMLDocument) -> tomlkit.TOMLDocument:
    """Return a copy of the TOML document with sorted keys and tidy arrays.

    Top-level keys are emitted in sorted order. For every table-like value,
    only the 'ignore' and 'missing' arrays are carried over; each one is
    de-duplicated, sorted and rendered as a multiline array. Non-table values
    are copied unchanged.

    Parameters:
        data (tomlkit.TOMLDocument): The original TOML document.

    Returns:
        tomlkit.TOMLDocument: A newly built document; ``data`` is not modified.
    """
    document = tomlkit.document()
    for section in sorted(data.keys()):
        entry = data[section]
        if not isinstance(entry, dict):
            # Add other types of data unchanged
            document[section] = entry
            continue

        table = tomlkit.table()
        for field in ("ignore", "missing"):
            if field not in entry:
                continue
            # set() drops duplicates, sorted() gives a stable order, and the
            # multiline flag puts one item per line for readability
            multiline_array = tomlkit.array()
            multiline_array.multiline(True)
            for item in sorted(set(entry[field])):
                multiline_array.append(item)
            table[field] = multiline_array
        document[section] = table
    return document
def write_readme(progress_list: list[tuple[str, int]]) -> None:
    """Rewrite the progress badges in README.md.

    Every line from the third one onward is checked against each language
    code; when the code occurs in the line and the line contains a
    ``![NN%](...)`` badge, the badge is replaced by one showing the new
    percentage. The file is read from and written back to ``README.md`` in the
    current working directory.

    Parameters:
        progress_list (list[tuple[str, int]]): Pairs of language code and
            progress percentage.

    Returns:
        None
    """
    with open("README.md", encoding="utf-8") as file:
        readme_lines = file.readlines()

    # The first two lines are never touched
    for index, original in enumerate(readme_lines[2:], start=2):
        for language, value in progress_list:
            if language not in original:
                continue
            badge = re.search(r"\!\[(\d+(\.\d+)?)%\]\(.*\)", original)
            if badge is None:
                continue
            # The replacement is always computed from the line as it was read
            readme_lines[index] = original.replace(
                badge.group(0),
                f"![{value}%](https://geps.dev/progress/{value})",
            )

    with open("README.md", "w", encoding="utf-8", newline="\n") as file:
        file.writelines(readme_lines)
def parse_json_file(file_path):
    """
    Load a JSON translation file and flatten it to dot-separated keys.

    Nested objects are walked depth-first in document order; every non-dict
    value ends up under the dotted path of the keys leading to it.

    :param file_path: Path to the JSON file.
    :return: Dictionary with flattened keys and values.
    """
    with open(file_path, "r", encoding="utf-8") as file:
        data = json.load(file)

    def walk(node, prefix):
        # Yield (dotted_key, value) pairs for every leaf below ``node``
        for name, child in node.items():
            path = f"{prefix}.{name}" if prefix else name
            if isinstance(child, dict):
                yield from walk(child, path)
            else:
                yield path, child

    return dict(walk(data, ""))
def compare_files(
    default_file_path, file_paths, ignore_translation_file
) -> list[tuple[str, int]]:
    """Compares the default JSON translation file with other
    translation files in the locales directory.

    For every translation file the flattened keys are compared with the
    reference keys. A key counts as a failure when it is missing, or when its
    value equals the reference value, unless the key is on that language's
    'ignore' list. Keys that are translated are removed from the ignore list.
    The ignore TOML file is rewritten (normalised by convert_to_multiline)
    before returning. Diagnostics are printed to stdout.

    Parameters:
        default_file_path (str): The path to the default translation JSON file.
        file_paths (list): List of paths to translation JSON files.
        ignore_translation_file (str): Path to the TOML file with ignore rules.

    Returns:
        list[tuple[str, int]]: A de-duplicated list of tuples containing
        language and progress percentage (truncated to an integer), sorted by
        percentage in descending order.

    Raises:
        ZeroDivisionError: If the reference file yields no keys and at least
            one non-English translation file is processed.
    """  # noqa: D205
    default_keys = parse_json_file(default_file_path)
    num_keys = len(default_keys)

    result_list = []
    sort_ignore_translation: tomlkit.TOMLDocument

    # read toml
    with open(ignore_translation_file, encoding="utf-8") as f:
        sort_ignore_translation = tomlkit.parse(f.read())

    for file_path in file_paths:
        # Extract language code from directory name
        locale_dir = os.path.basename(os.path.dirname(file_path))

        # Convert locale format from hyphen to underscore for TOML compatibility
        # e.g., en-GB -> en_GB, sr-LATN-RS -> sr_LATN_RS
        language = locale_dir.replace("-", "_")

        fails = 0
        # Both English variants are reported as complete without comparison;
        # the duplicates this can produce are removed by set() further down.
        if language in ["en_GB", "en_US"]:
            result_list.append(("en_GB", 100))
            result_list.append(("en_US", 100))
            continue

        # Make sure the language has a table with a non-empty 'ignore' array
        if language not in sort_ignore_translation:
            sort_ignore_translation[language] = tomlkit.table()

        if (
            "ignore" not in sort_ignore_translation[language]
            or len(sort_ignore_translation[language].get("ignore", [])) < 1
        ):
            # NOTE(review): tomlkit.array() is documented as taking a TOML
            # string; a Python list is passed here - confirm the resulting
            # array is the intended ['language.direction'].
            sort_ignore_translation[language]["ignore"] = tomlkit.array(
                ["language.direction"]
            )

        current_keys = parse_json_file(file_path)

        # Compare keys
        for default_key, default_value in default_keys.items():
            if default_key not in current_keys:
                # Key is missing entirely
                if default_key not in sort_ignore_translation[language]["ignore"]:
                    print(f"{language}: Key '{default_key}' is missing.")
                    fails += 1
            elif (
                default_value == current_keys[default_key]
                and default_key not in sort_ignore_translation[language]["ignore"]
            ):
                # Key exists but value is untranslated (same as reference)
                print(f"{language}: Key '{default_key}' is missing the translation.")
                fails += 1
            elif default_value != current_keys[default_key]:
                # Key is translated, remove from ignore list if present
                if default_key in sort_ignore_translation[language]["ignore"]:
                    sort_ignore_translation[language]["ignore"].remove(default_key)

        print(f"{language}: {fails} out of {num_keys} keys are not translated.")
        # int() truncates, so the percentage is rounded down
        result_list.append(
            (
                language,
                int((num_keys - fails) * 100 / num_keys),
            )
        )

    # Persist the (possibly modified) ignore lists in normalised form
    ignore_translation = convert_to_multiline(sort_ignore_translation)
    with open(ignore_translation_file, "w", encoding="utf-8", newline="\n") as file:
        file.write(tomlkit.dumps(ignore_translation))

    # Drop duplicate entries, then order by percentage, highest first
    unique_data = list(set(result_list))
    unique_data.sort(key=lambda x: x[1], reverse=True)

    return unique_data
if __name__ == "__main__":
    # Every path below is built from the current working directory, so the
    # script has to be started from the directory that contains frontend/,
    # scripts/ and README.md.
    directory = os.path.join(os.getcwd(), "frontend", "public", "locales")
    translation_file_paths = glob.glob(os.path.join(directory, "*", "translation.json"))
    # en-GB is the reference every other locale is compared against
    reference_file = os.path.join(directory, "en-GB", "translation.json")

    scripts_directory = os.path.join(os.getcwd(), "scripts")
    translation_state_file = os.path.join(scripts_directory, "ignore_translation.toml")
    # Compute per-language progress, then write it into README.md
    write_readme(
        compare_files(reference_file, translation_file_paths, translation_state_file)
    )

View File

@@ -3,7 +3,6 @@ ignore = [
'lang.div',
'lang.dzo',
'lang.que',
'language.direction',
]
[az_AZ]
@@ -193,8 +192,6 @@ ignore = [
'AddStampRequest.alphabet',
'AddStampRequest.position',
'PDFToBook.selectText.1',
'PDFToText.tags',
'addPageNumbers.selectText.3',
'adminUserSettings.team',
'alphabet',
'audit.dashboard.modal.id',
@@ -204,7 +201,6 @@ ignore = [
'audit.dashboard.table.details',
'audit.dashboard.table.id',
'certSign.name',
'cookieBanner.popUp.acceptAllBtn',
'endpointStatistics.top10',
'endpointStatistics.top20',
'fileChooser.dragAndDrop',
@@ -313,9 +309,7 @@ ignore = [
]
[fa_IR]
ignore = [
'language.direction',
]
ignore = []
[fr_FR]
ignore = [
@@ -323,7 +317,6 @@ ignore = [
'AddStampRequest.position',
'AddStampRequest.rotation',
'PDFToBook.selectText.1',
'addPageNumbers.selectText.3',
'adminUserSettings.actions',
'alphabet',
'compare.document.1',
@@ -526,6 +519,11 @@ ignore = [
'language.direction',
]
[ml_ML]
ignore = [
'language.direction',
]
[nl_NL]
ignore = [
'compare.document.1',
@@ -770,7 +768,6 @@ ignore = [
[sk_SK]
ignore = [
'adminUserSettings.admin',
'home.multiTool.title',
'info',
'lang.ceb',
'lang.chr',
@@ -974,11 +971,15 @@ ignore = [
'lang.yid',
'lang.yor',
'language.direction',
'pipeline.title',
'pipelineOptions.pipelineHeader',
'showJS.tags',
]
[zh_BO]
ignore = [
'language.direction',
]
[zh_CN]
ignore = [
'language.direction',

View File

@@ -19,9 +19,8 @@ if [[ "$INSTALL_BOOK_AND_ADVANCED_HTML_OPS" == "true" && "$FAT_DOCKER" != "true"
#apk add --no-cache calibre@testing
fi
if [[ "$FAT_DOCKER" != "true" ]]; then
/scripts/download-security-jar.sh
fi
# Security jar is now built into the application jar during Docker build
# No need to download it separately
if [[ -n "$LANGS" ]]; then
/scripts/installFonts.sh $LANGS

View File

@@ -0,0 +1,579 @@
# Translation Management Scripts
This directory contains Python scripts for managing frontend translations in Stirling PDF. These tools help analyze, merge, validate, and manage translations against the en-GB golden truth file.
## Scripts Overview
### 0. Validation Scripts (Run First!)
#### `json_validator.py`
Validates JSON syntax in translation files with detailed error reporting.
**Usage:**
```bash
# Validate single file
python scripts/translations/json_validator.py ar_AR_batch_1_of_3.json
# Validate all batches for a language
python scripts/translations/json_validator.py --all-batches ar_AR
# Validate pattern with wildcards
python scripts/translations/json_validator.py "ar_AR_batch_*.json"
# Brief output (no context)
python scripts/translations/json_validator.py --all-batches ar_AR --brief
# Only show files with errors
python scripts/translations/json_validator.py --all-batches ar_AR --quiet
```
**Features:**
- Validates JSON syntax with detailed error messages
- Shows exact line, column, and character position of errors
- Displays context around errors for easy fixing
- Suggests common fixes based on error type
- Detects unescaped quotes and backslashes
- Reports entry counts for valid files
- Exit code 1 if any files invalid (good for CI/CD)
**Common Issues Detected:**
- Unescaped quotes inside strings: `"text with "quotes""` → `"text with \"quotes\""`
- Invalid backslash escapes: `\d{4}` → `\\d{4}`
- Missing commas between entries
- Trailing commas before closing braces
#### `validate_placeholders.py`
Validates that translation files have correct placeholders matching en-GB (source of truth).
**Usage:**
```bash
# Validate all languages
python scripts/translations/validate_placeholders.py
# Validate specific language
python scripts/translations/validate_placeholders.py --language es-ES
# Show detailed text samples
python scripts/translations/validate_placeholders.py --verbose
# Output as JSON
python scripts/translations/validate_placeholders.py --json
```
**Features:**
- Detects missing placeholders (e.g., {n}, {total}, {filename})
- Detects extra placeholders not in en-GB
- Shows exact keys and text where issues occur
- Exit code 1 if issues found (good for CI/CD)
#### `validate_json_structure.py`
Validates JSON structure and key consistency with en-GB.
**Usage:**
```bash
# Validate all languages
python scripts/translations/validate_json_structure.py
# Validate specific language
python scripts/translations/validate_json_structure.py --language de-DE
# Show all missing/extra keys
python scripts/translations/validate_json_structure.py --verbose
# Output as JSON
python scripts/translations/validate_json_structure.py --json
```
**Features:**
- Validates JSON syntax
- Detects missing keys (not translated yet)
- Detects extra keys (not in en-GB, should be removed)
- Reports key counts and structure differences
- Exit code 1 if issues found (good for CI/CD)
### 1. `translation_analyzer.py`
Analyzes translation files to find missing translations, untranslated entries, and provides completion statistics.
**Usage:**
```bash
# Analyze all languages
python scripts/translations/translation_analyzer.py
# Analyze specific language
python scripts/translations/translation_analyzer.py --language fr-FR
# Show only missing translations
python scripts/translations/translation_analyzer.py --missing-only
# Show only untranslated entries
python scripts/translations/translation_analyzer.py --untranslated-only
# Show summary only
python scripts/translations/translation_analyzer.py --summary
# JSON output format
python scripts/translations/translation_analyzer.py --format json
```
**Features:**
- Finds missing translation keys
- Identifies untranslated entries (identical to en-GB and [UNTRANSLATED] markers)
- Shows accurate completion percentages using ignore patterns
- Identifies extra keys not in en-GB
- Supports JSON and text output formats
- Uses `scripts/ignore_translation.toml` for language-specific exclusions
### 2. `translation_merger.py`
Merges missing translations from en-GB into target language files and manages translation workflows.
**Usage:**
```bash
# Add missing translations from en-GB to French
python scripts/translations/translation_merger.py fr-FR add-missing
# Add without marking as [UNTRANSLATED]
python scripts/translations/translation_merger.py fr-FR add-missing --no-mark-untranslated
# Extract untranslated entries to a file
python scripts/translations/translation_merger.py fr-FR extract-untranslated --output fr_untranslated.json
# Create a template for AI translation
python scripts/translations/translation_merger.py fr-FR create-template --output fr_template.json
# Apply translations from a file
python scripts/translations/translation_merger.py fr-FR apply-translations --translations-file fr_translated.json
```
**Features:**
- Adds missing keys from en-GB with optional [UNTRANSLATED] markers
- Extracts untranslated entries for external translation
- Creates structured templates for AI translation
- Applies translated content back to language files
- Automatic backup creation
### 3. `ai_translation_helper.py`
Specialized tool for AI-assisted translation workflows with batch processing and validation.
**Usage:**
```bash
# Create batch file for AI translation (multiple languages)
python scripts/translations/ai_translation_helper.py create-batch --languages fr-FR de-DE es-ES --output batch.json --max-entries 50
# Validate AI translations
python scripts/translations/ai_translation_helper.py validate batch.json
# Apply validated AI translations
python scripts/translations/ai_translation_helper.py apply-batch batch.json
# Export for external translation services
python scripts/translations/ai_translation_helper.py export --languages fr-FR de-DE --format csv
```
**Features:**
- Creates batch files for AI translation of multiple languages
- Prioritizes important translation keys
- Validates translations for placeholders and artifacts
- Applies batch translations with validation
- Exports to CSV/JSON for external translation services
### 4. `compact_translator.py`
Extracts untranslated entries in minimal JSON format for character-limited AI services.
**Usage:**
```bash
# Extract all untranslated entries
python scripts/translations/compact_translator.py it-IT --output to_translate.json
```
**Features:**
- Produces minimal JSON output with no extra whitespace
- Automatic ignore patterns for cleaner output
- Batch size control for manageable chunks
- 50-80% fewer characters than other extraction methods
### 5. `json_beautifier.py`
Restructures and beautifies translation JSON files to match en-GB structure exactly.
**Usage:**
```bash
# Restructure single language to match en-GB structure
python scripts/translations/json_beautifier.py --language de-DE
# Restructure all languages
python scripts/translations/json_beautifier.py --all-languages
# Validate structure without modifying files
python scripts/translations/json_beautifier.py --language de-DE --validate-only
# Skip backup creation
python scripts/translations/json_beautifier.py --language de-DE --no-backup
```
**Features:**
- Restructures JSON to match en-GB nested structure exactly
- Preserves key ordering for line-by-line comparison
- Creates automatic backups before modification
- Validates structure and key ordering
- Handles flattened dot-notation keys (e.g., "key.subkey") properly
## Translation Workflows
### Method 1: Compact Translation Workflow (RECOMMENDED for AI)
**Best for character-limited AI services like Claude or ChatGPT**
#### Step 1: Check Current Status
```bash
python scripts/translations/translation_analyzer.py --language it-IT --summary
```
#### Step 2: Extract Untranslated Entries
```bash
# For small files (< 1200 entries)
python scripts/translations/compact_translator.py it-IT --output to_translate.json
# For large files, split into batches
python scripts/translations/compact_translator.py it-IT --output it_IT_batch --batch-size 400
# Creates: it_IT_batch_1_of_N.json, it_IT_batch_2_of_N.json, etc.
```
#### Step 2.5: Validate JSON (if using batches)
```bash
# After AI translates the batches, validate them before merging
python scripts/translations/json_validator.py --all-batches it_IT
# Fix any errors reported (common issues: unescaped quotes, backslashes)
```
**Output format**: Compact JSON with minimal whitespace
```json
{"key1":"English text","key2":"Another text","key3":"More text"}
```
#### Step 3: AI Translation
1. Copy the compact JSON output
2. Give it to your AI with instructions:
```
Translate this JSON to Italian. Keep the same structure, translate only the values.
Preserve placeholders like {n}, {total}, {filename}, {{variable}}.
```
3. Save the AI's response as `translated.json`
#### Step 4: Apply Translations
```bash
python scripts/translations/translation_merger.py it-IT apply-translations --translations-file translated.json
```
#### Step 5: Verify Results
```bash
python scripts/translations/translation_analyzer.py --language it-IT --summary
```
### Method 2: Batch Translation Workflow
**For complete language translation from scratch or major updates**
#### Step 1: Analyze Current State
```bash
python scripts/translations/translation_analyzer.py --language de-DE --summary
```
#### Step 2: Create Translation Batches
```bash
# Create batches of 100 entries each for systematic translation
python scripts/translations/ai_translation_helper.py create-batch --languages de-DE --output de_batch_1.json --max-entries 100
```
#### Step 3: Translate Batch with AI
Edit the batch file and fill in ALL `translated` fields:
- Preserve all placeholders like `{n}`, `{total}`, `{filename}`, `{{toolName}}`
- Keep technical terms consistent
- Maintain JSON structure exactly
- Consider context provided for each entry
#### Step 4: Apply Translations
```bash
# Skip validation if using legitimate placeholders ({{variable}})
python scripts/translations/ai_translation_helper.py apply-batch de_batch_1.json --skip-validation
```
#### Step 5: Check Progress and Continue
```bash
python scripts/translations/translation_analyzer.py --language de-DE --summary
```
Repeat steps 2-5 until 100% complete.
### Method 3: Quick Translation Workflow (Legacy)
**For small updates or existing translations**
#### Step 1: Add Missing Translations
```bash
python scripts/translations/translation_merger.py fr-FR add-missing --mark-untranslated
```
#### Step 2: Create AI Template
```bash
python scripts/translations/translation_merger.py fr-FR create-template --output fr_template.json
```
#### Step 3: Apply Translations
```bash
python scripts/translations/translation_merger.py fr-FR apply-translations --translations-file fr_translated.json
```
## Translation File Structure
Translation files are located in `frontend/public/locales/{language}/translation.json` with nested JSON structure:
```json
{
"addPageNumbers": {
"title": "Add Page Numbers",
"selectText": {
"1": "Select PDF file:",
"2": "Margin Size"
}
}
}
```
Keys use dot notation internally (e.g., `addPageNumbers.selectText.1`).
## Key Features
### Placeholder Preservation
All scripts preserve placeholders like `{n}`, `{total}`, `{filename}` in translations:
```
"customNumberDesc": "Defaults to {n}, also accepts 'Page {n} of {total}'"
```
### Automatic Backups
Scripts create timestamped backups before modifying files:
```
translation.backup.20241201_143022.json
```
### Context-Aware Translation
Scripts provide context information to help with accurate translations:
```json
{
"addPageNumbers.title": {
"original": "Add Page Numbers",
"context": "Feature for adding page numbers to PDFs"
}
}
```
### Priority-Based Translation
Important keys (title, submit, error messages) are prioritized when limiting translation batch sizes.
### Ignore Patterns System
The `scripts/ignore_translation.toml` file defines keys that should be ignored for each language, improving completion accuracy.
**Common ignore patterns:**
- `language.direction`: Text direction (ltr/rtl) - universal
- `lang.*`: Language code entries not relevant to specific locales
- `pipeline.title`, `home.devApi.title`: Technical terms kept in English
- Specific technical IDs, version numbers, and system identifiers
**Format:**
```toml
[de_DE]
ignore = [
'language.direction',
'pipeline.title',
'lang.afr',
'lang.ceb',
# ... more patterns
]
```
## Best Practices & Lessons Learned
### Critical Rules for Translation
1. **NEVER skip entries**: Translate ALL entries in each batch to avoid [UNTRANSLATED] pollution
2. **Use appropriate batch sizes**: 100 entries for systematic translation, unlimited for compact method
3. **Skip validation for placeholders**: Use `--skip-validation` when batch contains `{{variable}}` patterns
4. **Check progress between batches**: Use `--summary` flag to track completion percentage
5. **Preserve all placeholders**: Keep `{n}`, `{total}`, `{filename}`, `{{toolName}}` exactly as-is
### Workflow Comparison
| Method | Best For | Character Usage | Complexity | Speed |
|--------|----------|----------------|------------|-------|
| Compact | AI services | Minimal (50-80% less) | Simple | Fastest |
| Batch | Systematic translation | Moderate | Medium | Medium |
| Quick | Small updates | High | Low | Slow |
### Common Issues and Solutions
#### JSON Syntax Errors in AI Translations
**Problem**: AI-translated batch files have JSON syntax errors
**Symptoms**:
- `JSONDecodeError: Expecting ',' delimiter`
- `JSONDecodeError: Invalid \escape`
**Solution**:
```bash
# 1. Validate all batches to find errors
python scripts/translations/json_validator.py --all-batches ar_AR
# 2. Check detailed error with context
python scripts/translations/json_validator.py ar_AR_batch_2_of_3.json
# 3. Fix the reported issues:
# - Unescaped quotes: "text with "quotes"" → "text with \"quotes\""
# - Backslashes in regex: "\d{4}" → "\\d{4}"
# - Missing commas between entries
# 4. Validate again until all pass
python scripts/translations/json_validator.py --all-batches ar_AR
```
**Common fixes:**
- Arabic/RTL text with embedded quotes: Always escape with backslash
- Regex patterns: Double all backslashes (`\d` → `\\d`)
- Check for missing/extra commas at line reported in error
#### [UNTRANSLATED] Pollution
**Problem**: Hundreds of [UNTRANSLATED] markers from incomplete translation attempts
**Solution**:
- Only translate complete batches of manageable size
- Use the analyzer, which counts [UNTRANSLATED] markers as missing translations
- Restore from backup if pollution occurs
#### Validation False Positives
**Problem**: Validator flags legitimate `{{variable}}` placeholders as artifacts
**Solution**: Use `--skip-validation` flag when applying batches with template variables
#### JSON Structure Mismatches
**Problem**: Flattened dot-notation keys instead of proper nested objects
**Solution**: Use `json_beautifier.py` to restructure files to match en-GB exactly
## Real-World Examples
### Complete Arabic Translation with Validation (Batch Method)
```bash
# Check status
python scripts/translations/translation_analyzer.py --language ar-AR --summary
# Result: 50% complete, 1088 missing
# Extract in batches due to AI token limits
python scripts/translations/compact_translator.py ar-AR --output ar_AR_batch --batch-size 400
# Created: ar_AR_batch_1_of_3.json (400 entries)
# ar_AR_batch_2_of_3.json (400 entries)
# ar_AR_batch_3_of_3.json (288 entries)
# [Send each batch to AI for translation]
# Validate translated batches before merging
python scripts/translations/json_validator.py --all-batches ar_AR
# Found errors in batch 1 and 2:
# - Line 263: Unescaped quotes in "انقر "إضافة ملفات""
# - Line 132: Unescaped quotes in "أو "and""
# - Line 213: Invalid escape "\d{4}"
# Fix errors manually or with sed, then validate again
python scripts/translations/json_validator.py --all-batches ar_AR
# All valid!
# Merge all batches
python3 << 'EOF'
import json
merged = {}
for i in range(1, 4):
with open(f'ar_AR_batch_{i}_of_3.json', 'r', encoding='utf-8') as f:
merged.update(json.load(f))
with open('ar_AR_merged.json', 'w', encoding='utf-8') as f:
json.dump(merged, f, ensure_ascii=False, indent=2)
EOF
# Apply merged translations
python scripts/translations/translation_merger.py ar-AR apply-translations --translations-file ar_AR_merged.json
# Result: Applied 1088 translations
# Beautify to match en-GB structure
python scripts/translations/json_beautifier.py --language ar-AR
# Check final progress
python scripts/translations/translation_analyzer.py --language ar-AR --summary
# Result: 98.7% complete, 9 missing, 20 untranslated
```
### Complete Italian Translation (Compact Method)
```bash
# Check status
python scripts/translations/translation_analyzer.py --language it-IT --summary
# Result: 46.8% complete, 1147 missing
# Extract all entries for translation
python scripts/translations/compact_translator.py it-IT --output batch1.json
# [Translate batch1.json with AI, save as batch1_translated.json]
# Apply translations
python scripts/translations/translation_merger.py it-IT apply-translations --translations-file batch1_translated.json
# Result: Applied 1147 translations
# Check progress
python scripts/translations/translation_analyzer.py --language it-IT --summary
# Result: 100% complete, 0 missing
```
### German Translation (Batch Method)
Starting from 46.3% completion, reaching 60.3% with batch method:
```bash
# Initial analysis
python scripts/translations/translation_analyzer.py --language de-DE --summary
# Result: 46.3% complete, 1142 missing entries
# Batch 1 (100 entries)
python scripts/translations/ai_translation_helper.py create-batch --languages de-DE --output de_batch_1.json --max-entries 100
# [Translate all 100 entries in batch file]
python scripts/translations/ai_translation_helper.py apply-batch de_batch_1.json --skip-validation
# Progress: 46.6% → 51.2%
# Continue with more batches until 100% complete
```
## Error Handling
- **Missing Files**: Scripts create new files when language directories don't exist
- **Invalid JSON**: Clear error messages with line numbers
- **Placeholder Mismatches**: Validation warnings for missing or extra placeholders
- **[UNTRANSLATED] Entries**: Counted as missing translations to prevent pollution
- **Backup Failures**: Graceful handling with user notification
## Integration with Development
These scripts integrate with the existing translation system:
- Works with the current `frontend/public/locales/` structure
- Compatible with the i18n system used in the React frontend
- Respects the JSON format expected by the translation loader
- Maintains the nested structure required by the UI components
## Language-Specific Notes
### German Translation Notes
- Technical terms: Use German equivalents where they exist; keep established abbreviations unchanged (PDF → PDF, API → API)
- UI actions: "hochladen" (upload), "herunterladen" (download), "speichern" (save)
- Error messages: Consistent pattern "Ein Fehler ist beim [action] aufgetreten"
- Formal address: Use "Sie" form for user-facing text
### Italian Translation Notes
- Keep technical terms in English when commonly used (PDF, API, URL)
- Use formal address ("Lei" form) for user-facing text
- Error messages: "Si è verificato un errore durante [action]"
- UI actions: "carica" (upload), "scarica" (download), "salva" (save)
## Common Use Cases
1. **Complete Language Translation**: Use Compact Workflow for fastest AI-assisted translation
2. **New Language Addition**: Start with compact workflow for comprehensive coverage
3. **Updating Existing Language**: Use analyzer to find gaps, then compact or batch method
4. **Quality Assurance**: Use analyzer with `--summary` for completion metrics and issue detection
5. **External Translation Services**: Use export functionality to generate CSV files for translators
6. **Structure Maintenance**: Use json_beautifier to keep files aligned with en-GB structure

View File

@@ -0,0 +1,408 @@
#!/usr/bin/env python3
"""
AI Translation Helper for Stirling PDF Frontend
Provides utilities for AI-assisted translation workflows including
batch processing, quality checks, and integration helpers.
"""
import json
import os
import sys
from pathlib import Path
from typing import Dict, List, Set, Tuple, Any, Optional
import argparse
import re
from datetime import datetime
import csv
class AITranslationHelper:
    """Helpers for AI-assisted translation of the frontend locale files.

    The ``en-GB/translation.json`` file under ``locales_dir`` is used as the
    reference ("golden truth"); every other language is compared against it.
    """

    # (substring, score) pairs used to rank keys; a key gets the highest score
    # of any substring it contains (case-insensitive), or 1 if none match.
    _PRIORITY_PATTERNS = [
        ('title', 10),
        ('header', 9),
        ('submit', 8),
        ('selectText', 7),
        ('prompt', 6),
        ('desc', 5),
        ('error', 8),
        ('warning', 7),
        ('save', 8),
        ('download', 8),
        ('upload', 7),
    ]

    # Human-readable context for known top-level sections of the key space.
    _SECTION_CONTEXTS = {
        'addPageNumbers': 'Feature for adding page numbers to PDFs',
        'compress': 'PDF compression functionality',
        'merge': 'PDF merging functionality',
        'split': 'PDF splitting functionality',
        'rotate': 'PDF rotation functionality',
        'convert': 'File conversion functionality',
        'security': 'PDF security and permissions',
        'metadata': 'PDF metadata editing',
        'watermark': 'Adding watermarks to PDFs',
        'overlay': 'PDF overlay functionality',
        'extract': 'Extracting content from PDFs'
    }

    def __init__(self, locales_dir: str = "frontend/public/locales"):
        """Store the locales directory and the path of the en-GB reference file."""
        self.locales_dir = Path(locales_dir)
        self.golden_truth_file = self.locales_dir / "en-GB" / "translation.json"

    def _load_json(self, file_path: Path) -> Dict:
        """Load a JSON file; print the problem and return ``{}`` if it is missing or invalid."""
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                return json.load(f)
        except (FileNotFoundError, json.JSONDecodeError) as e:
            print(f"Error loading {file_path}: {e}")
            return {}

    def _save_json(self, data: Dict, file_path: Path) -> None:
        """Write ``data`` to ``file_path`` as indented UTF-8 JSON (non-ASCII kept as-is)."""
        with open(file_path, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=2, ensure_ascii=False)

    def _load_language_flat(self, lang: str) -> Optional[Dict[str, Any]]:
        """Return the flattened translations for ``lang``, or ``None`` if it has no file."""
        lang_file = self.locales_dir / lang / "translation.json"
        if not lang_file.exists():
            return None
        return self._flatten_dict(self._load_json(lang_file))

    @staticmethod
    def _export_value(lang_flat: Dict[str, Any], key: str) -> Any:
        """Return the value to export for ``key``; missing or ``[UNTRANSLATED]`` entries become ''."""
        value = lang_flat.get(key, '')
        # Only strings can carry the marker; numbers/lists/bools pass through untouched.
        if isinstance(value, str) and value.startswith('[UNTRANSLATED]'):
            return ''
        return value

    def create_ai_batch_file(self, languages: List[str], output_file: Path,
                             max_entries_per_language: int = 50) -> None:
        """Write a batch file listing the untranslated entries of each language.

        For every language the entries that are missing, identical to en-GB or
        marked ``[UNTRANSLATED]`` are collected; if there are more than
        ``max_entries_per_language`` (and that limit is truthy) only the
        highest-priority ones are kept. The result is saved to ``output_file``.
        """
        golden_truth = self._load_json(self.golden_truth_file)
        batch_data = {
            'metadata': {
                'created_at': datetime.now().isoformat(),
                'source_language': 'en-GB',
                'target_languages': languages,
                'max_entries_per_language': max_entries_per_language,
                'instructions': {
                    'format': 'Translate each entry maintaining JSON structure and placeholder variables like {n}, {total}, {filename}',
                    'context': 'This is for a PDF manipulation tool. Keep technical terms consistent.',
                    'placeholders': 'Preserve all placeholders: {n}, {total}, {filename}, etc.',
                    'style': 'Keep translations concise and user-friendly'
                }
            },
            'translations': {}
        }
        for lang in languages:
            lang_file = self.locales_dir / lang / "translation.json"
            # A language without a file yet is treated as completely untranslated.
            lang_data = self._load_json(lang_file) if lang_file.exists() else {}
            untranslated = self._find_untranslated_entries(golden_truth, lang_data)
            if max_entries_per_language and len(untranslated) > max_entries_per_language:
                untranslated = self._prioritize_translation_keys(untranslated, max_entries_per_language)
            batch_data['translations'][lang] = {
                key: {
                    'original': value,
                    'translated': '',  # AI fills this
                    'context': self._get_key_context(key)
                }
                for key, value in untranslated.items()
            }
        self._save_json(batch_data, output_file)
        total_entries = sum(len(lang_data) for lang_data in batch_data['translations'].values())
        print(f"Created AI batch file: {output_file}")
        print(f"Total entries to translate: {total_entries}")

    def _find_untranslated_entries(self, golden_truth: Dict, lang_data: Dict) -> Dict[str, str]:
        """Return the flattened en-GB entries that still need translating in ``lang_data``.

        An entry counts as untranslated when it is missing, equal to the en-GB
        value, or a string starting with ``[UNTRANSLATED]`` -- unless the value
        is expected to be identical in every language.
        """
        golden_flat = self._flatten_dict(golden_truth)
        lang_flat = self._flatten_dict(lang_data)
        untranslated = {}
        for key, value in golden_flat.items():
            if (key not in lang_flat or
                    lang_flat[key] == value or
                    (isinstance(lang_flat[key], str) and lang_flat[key].startswith("[UNTRANSLATED]"))):
                if not self._is_expected_identical(key, value):
                    untranslated[key] = value
        return untranslated

    def _flatten_dict(self, d: Dict, parent_key: str = '', separator: str = '.') -> Dict[str, Any]:
        """Flatten a nested dict into ``{"a.b.c": value}`` form; non-dict values are kept as-is."""
        flat = {}
        for k, v in d.items():
            new_key = f"{parent_key}{separator}{k}" if parent_key else k
            if isinstance(v, dict):
                flat.update(self._flatten_dict(v, new_key, separator))
            else:
                flat[new_key] = v
        return flat

    def _is_expected_identical(self, key: str, value: str) -> bool:
        """Return True for values that legitimately match en-GB (text direction, boolean words)."""
        if str(value).strip() in ['ltr', 'rtl', 'True', 'False', 'true', 'false']:
            return True
        return 'language.direction' in key.lower()

    def _prioritize_translation_keys(self, untranslated: Dict[str, str], max_count: int) -> Dict[str, str]:
        """Return the ``max_count`` highest-priority entries of ``untranslated``.

        The sort is stable, so entries with equal scores keep their original order.
        """
        scored_keys = []
        for key, value in untranslated.items():
            lowered = key.lower()
            score = 1  # base score
            for pattern, pattern_score in self._PRIORITY_PATTERNS:
                if pattern.lower() in lowered:
                    score = max(score, pattern_score)
            scored_keys.append((key, value, score))
        scored_keys.sort(key=lambda x: x[2], reverse=True)
        return {key: value for key, value, _ in scored_keys[:max_count]}

    def _get_key_context(self, key: str) -> str:
        """Describe where a dotted translation key is used, for the translator's benefit."""
        # str.split never returns an empty list, so the generic fallback has to
        # be selected by testing the key itself.
        if not key:
            return 'General application text'
        parts = key.split('.')
        main_section = parts[0]
        context = self._SECTION_CONTEXTS.get(main_section, f'Part of {main_section} functionality')
        if len(parts) > 1:
            context += f', specifically for {parts[-1]}'
        return context

    def validate_ai_translations(self, batch_file: Path) -> Dict[str, List[str]]:
        """Check a filled-in batch file and return ``{'errors': [...], 'warnings': [...]}``.

        Errors: missing translation, or a leftover marker such as ``[TODO]``.
        Warnings: placeholder sets differ, or the translation equals the original.
        """
        batch_data = self._load_json(batch_file)
        issues = {'errors': [], 'warnings': []}
        artifacts = ['[TRANSLATE]', '[TODO]', 'UNTRANSLATED', '{{', '}}']
        for lang, translations in batch_data.get('translations', {}).items():
            for key, translation_data in translations.items():
                original = translation_data.get('original', '')
                translated = translation_data.get('translated', '')
                if not translated:
                    issues['errors'].append(f"{lang}.{key}: Missing translation")
                    continue
                # Check for placeholder preservation
                original_placeholders = re.findall(r'\{[^}]+\}', original)
                translated_placeholders = re.findall(r'\{[^}]+\}', translated)
                if set(original_placeholders) != set(translated_placeholders):
                    issues['warnings'].append(
                        f"{lang}.{key}: Placeholder mismatch - Original: {original_placeholders}, "
                        f"Translated: {translated_placeholders}"
                    )
                # Check if translation is identical to original (might be untranslated)
                if translated == original and not self._is_expected_identical(key, original):
                    issues['warnings'].append(f"{lang}.{key}: Translation identical to original")
                for artifact in artifacts:
                    # A marker already present in the source text (for example
                    # "{{count}}"-style interpolation) is content, not an artifact.
                    if artifact in translated and artifact not in original:
                        issues['errors'].append(f"{lang}.{key}: Contains translation artifact: {artifact}")
        return issues

    def apply_ai_batch_translations(self, batch_file: Path, validate: bool = True) -> Dict[str, Any]:
        """Merge the translations of a batch file into the per-language files.

        With ``validate`` set, validation errors abort the run before anything
        is written. Returns ``{'applied': {lang: count}, 'errors': [...],
        'warnings': [...]}``; the two lists carry the validation findings.
        """
        batch_data = self._load_json(batch_file)
        results = {'applied': {}, 'errors': [], 'warnings': []}
        if validate:
            validation_issues = self.validate_ai_translations(batch_file)
            # Report the findings to the caller as well as on stdout.
            results['errors'] = list(validation_issues['errors'])
            results['warnings'] = list(validation_issues['warnings'])
            if validation_issues['errors']:
                print("Validation errors found. Fix these before applying:")
                for error in validation_issues['errors']:
                    print(f" ERROR: {error}")
                return results
            if validation_issues['warnings']:
                print("Validation warnings (review recommended):")
                for warning in validation_issues['warnings'][:10]:
                    print(f" WARNING: {warning}")
        for lang, translations in batch_data.get('translations', {}).items():
            lang_file = self.locales_dir / lang / "translation.json"
            # Load existing data or create new
            if lang_file.exists():
                lang_data = self._load_json(lang_file)
            else:
                lang_data = {}
                lang_file.parent.mkdir(parents=True, exist_ok=True)
            applied_count = 0
            for key, translation_data in translations.items():
                translated = translation_data.get('translated', '').strip()
                # Skip blanks and entries the AI left identical to the source.
                if translated and translated != translation_data.get('original', ''):
                    self._set_nested_value(lang_data, key, translated)
                    applied_count += 1
            if applied_count > 0:
                self._save_json(lang_data, lang_file)
                results['applied'][lang] = applied_count
                print(f"Applied {applied_count} translations to {lang}")
        return results

    def _set_nested_value(self, data: Dict, key_path: str, value: Any) -> None:
        """Set ``value`` at the dotted ``key_path`` inside ``data``, creating dicts as needed.

        A non-dict value found on the way is replaced by an empty dict (with a
        printed warning) so that nesting can continue.
        """
        keys = key_path.split('.')
        current = data
        for key in keys[:-1]:
            if key not in current:
                current[key] = {}
            elif not isinstance(current[key], dict):
                print(f"Warning: Converting non-dict value at '{key}' to dict to allow nesting")
                current[key] = {}
            current = current[key]
        current[keys[-1]] = value

    def export_for_external_translation(self, languages: List[str], output_format: str = 'csv') -> None:
        """Export en-GB text plus existing translations to a dated CSV or JSON file.

        The file is written to the current working directory as
        ``translations_export_<YYYYMMDD>.<ext>``. Entries expected to be
        identical across languages are left out. In the CSV a language without
        a translation file gets empty cells; in the JSON it gets no entry.
        Any other ``output_format`` does nothing.
        """
        golden_flat = self._flatten_dict(self._load_json(self.golden_truth_file))
        # Read and flatten each language file once instead of once per key.
        lang_flats = {lang: self._load_language_flat(lang) for lang in languages}
        if output_format == 'csv':
            output_file = Path(f'translations_export_{datetime.now().strftime("%Y%m%d")}.csv')
            with open(output_file, 'w', newline='', encoding='utf-8') as csvfile:
                fieldnames = ['key', 'context', 'en_GB'] + languages
                writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
                writer.writeheader()
                for key, en_value in golden_flat.items():
                    if self._is_expected_identical(key, en_value):
                        continue
                    row = {
                        'key': key,
                        'context': self._get_key_context(key),
                        'en_GB': en_value
                    }
                    for lang in languages:
                        lang_flat = lang_flats[lang]
                        row[lang] = '' if lang_flat is None else self._export_value(lang_flat, key)
                    writer.writerow(row)
            print(f"Exported to {output_file}")
        elif output_format == 'json':
            output_file = Path(f'translations_export_{datetime.now().strftime("%Y%m%d")}.json')
            export_data = {'languages': languages, 'translations': {}}
            for key, en_value in golden_flat.items():
                if self._is_expected_identical(key, en_value):
                    continue
                entry = {
                    'en_GB': en_value,
                    'context': self._get_key_context(key)
                }
                for lang in languages:
                    lang_flat = lang_flats[lang]
                    if lang_flat is not None:
                        entry[lang] = self._export_value(lang_flat, key)
                export_data['translations'][key] = entry
            self._save_json(export_data, output_file)
            print(f"Exported to {output_file}")
def main():
    """Parse the command line and run the requested AI translation sub-command."""
    cli = argparse.ArgumentParser(description='AI Translation Helper')
    cli.add_argument('--locales-dir', default='frontend/public/locales',
                     help='Path to locales directory')
    commands = cli.add_subparsers(dest='command', help='Available commands')

    # create-batch: collect untranslated entries into one file
    create_cmd = commands.add_parser('create-batch', help='Create AI translation batch file')
    create_cmd.add_argument('--languages', nargs='+', required=True,
                            help='Language codes to include')
    create_cmd.add_argument('--output', required=True, help='Output batch file')
    create_cmd.add_argument('--max-entries', type=int, default=100,
                            help='Max entries per language')

    # validate: check a filled-in batch file
    validate_cmd = commands.add_parser('validate', help='Validate AI translations')
    validate_cmd.add_argument('batch_file', help='Batch file to validate')

    # apply-batch: merge a batch file into the language files
    apply_cmd = commands.add_parser('apply-batch', help='Apply AI batch translations')
    apply_cmd.add_argument('batch_file', help='Batch file with translations')
    apply_cmd.add_argument('--skip-validation', action='store_true',
                           help='Skip validation before applying')

    # export: dump translations for external services
    export_cmd = commands.add_parser('export', help='Export for external translation')
    export_cmd.add_argument('--languages', nargs='+', required=True,
                            help='Language codes to export')
    export_cmd.add_argument('--format', choices=['csv', 'json'], default='csv',
                            help='Export format')

    args = cli.parse_args()
    chosen = args.command
    if not chosen:
        cli.print_help()
        return

    helper = AITranslationHelper(args.locales_dir)

    if chosen == 'create-batch':
        helper.create_ai_batch_file(args.languages, Path(args.output), args.max_entries)
        return

    if chosen == 'validate':
        issues = helper.validate_ai_translations(Path(args.batch_file))
        found_errors = issues['errors']
        found_warnings = issues['warnings']
        if found_errors:
            print("ERRORS:")
            for entry in found_errors:
                print(f" - {entry}")
        if found_warnings:
            print("WARNINGS:")
            for entry in found_warnings:
                print(f" - {entry}")
        if not found_errors and not found_warnings:
            print("No validation issues found!")
        return

    if chosen == 'apply-batch':
        outcome = helper.apply_ai_batch_translations(
            Path(args.batch_file),
            validate=not args.skip_validation
        )
        print(f"Total translations applied: {sum(outcome['applied'].values())}")
        return

    if chosen == 'export':
        helper.export_for_external_translation(args.languages, args.format)


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,177 @@
#!/usr/bin/env python3
"""
Compact Translation Extractor for Character-Limited AI Translation
Outputs untranslated entries in minimal JSON format with whitespace stripped.
"""
import json
import sys
from pathlib import Path
import argparse
try:
import tomllib # Python 3.11+
except ImportError:
try:
import toml as tomllib_fallback
tomllib = None
except ImportError:
tomllib = None
tomllib_fallback = None
class CompactTranslationExtractor:
    """Extract the untranslated entries of one language as a flat ``{key: en-GB text}`` dict."""

    def __init__(self, locales_dir: str = "frontend/public/locales", ignore_file: str = "scripts/ignore_translation.toml"):
        """Load the en-GB reference file and the per-language ignore lists.

        Exits the process (via ``_load_json``) if the reference file is
        missing or is not valid JSON.
        """
        self.locales_dir = Path(locales_dir)
        self.golden_truth_file = self.locales_dir / "en-GB" / "translation.json"
        self.golden_truth = self._load_json(self.golden_truth_file)
        self.ignore_file = Path(ignore_file)
        self.ignore_patterns = self._load_ignore_patterns()

    def _load_json(self, file_path: Path) -> dict:
        """Load a JSON file; report the problem on stderr and exit with status 1 on failure."""
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                return json.load(f)
        except FileNotFoundError:
            print(f"Error: File not found: {file_path}", file=sys.stderr)
            sys.exit(1)
        except json.JSONDecodeError as e:
            print(f"Error: Invalid JSON in {file_path}: {e}", file=sys.stderr)
            sys.exit(1)

    def _load_ignore_patterns(self) -> dict:
        """Return ``{section: set(keys)}`` read from the ignore file.

        Uses ``tomllib`` when available, then the third-party ``toml`` module,
        then the built-in minimal parser. A missing file or any load failure
        yields ``{}`` (the failure is reported on stderr).
        """
        if not self.ignore_file.exists():
            return {}
        try:
            if tomllib:
                with open(self.ignore_file, 'rb') as f:
                    ignore_data = tomllib.load(f)
            elif tomllib_fallback:
                ignore_data = tomllib_fallback.load(self.ignore_file)
            else:
                ignore_data = self._parse_simple_toml()
            # Skip top-level values that are not tables so that one stray
            # scalar does not throw away every section's ignore list.
            return {
                lang: set(data.get('ignore', []))
                for lang, data in ignore_data.items()
                if isinstance(data, dict)
            }
        except Exception as e:
            print(f"Warning: Could not load ignore file {self.ignore_file}: {e}", file=sys.stderr)
            return {}

    def _parse_simple_toml(self) -> dict:
        """Minimal fallback parser: ``[section]`` headers followed by one quoted key per line.

        Both single- and double-quoted entries are accepted, so the fallback
        agrees with a real TOML parser on either quoting style. Anything else
        (comments, ``ignore = [``, ``]``) is skipped.
        """
        ignore_data = {}
        current_section = None
        with open(self.ignore_file, 'r', encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                if not line or line.startswith('#'):
                    continue
                if line.startswith('[') and line.endswith(']'):
                    current_section = line[1:-1]
                    ignore_data[current_section] = {'ignore': []}
                elif line.startswith(("'", '"')) and current_section:
                    item = line.strip("'\",")
                    if item:
                        ignore_data[current_section]['ignore'].append(item)
        return ignore_data

    def _flatten_dict(self, d: dict, parent_key: str = '', separator: str = '.') -> dict:
        """Flatten a nested dict into dot-notation keys; leaf values are converted with ``str``."""
        flat = {}
        for k, v in d.items():
            new_key = f"{parent_key}{separator}{k}" if parent_key else k
            if isinstance(v, dict):
                flat.update(self._flatten_dict(v, new_key, separator))
            else:
                flat[new_key] = str(v)
        return flat

    def get_untranslated_entries(self, language: str) -> dict:
        """Return ``{key: en-GB text}`` for every entry of ``language`` that needs translating.

        That is: keys missing from the language file, values starting with
        ``[UNTRANSLATED]``, and values equal to en-GB that are not expected to
        be identical. Ignored keys are looked up under the language code with
        ``-`` replaced by ``_``. Exits with status 1 if the language file does
        not exist. Keys are returned in sorted order.
        """
        target_file = self.locales_dir / language / "translation.json"
        if not target_file.exists():
            print(f"Error: Translation file not found for language: {language}", file=sys.stderr)
            sys.exit(1)
        target_data = self._load_json(target_file)
        golden_flat = self._flatten_dict(self.golden_truth)
        target_flat = self._flatten_dict(target_data)
        lang_code = language.replace('-', '_')
        ignore_set = self.ignore_patterns.get(lang_code, set())
        # Find missing translations
        missing_keys = set(golden_flat.keys()) - set(target_flat.keys()) - ignore_set
        # Find untranslated entries (identical to en-GB or marked [UNTRANSLATED])
        untranslated_keys = set()
        for key in target_flat:
            if key in golden_flat and key not in ignore_set:
                target_value = target_flat[key]
                golden_value = golden_flat[key]
                if (isinstance(target_value, str) and target_value.startswith("[UNTRANSLATED]")) or \
                        (golden_value == target_value and not self._is_expected_identical(key, golden_value)):
                    untranslated_keys.add(key)
        # Combine and create compact output
        all_untranslated = missing_keys | untranslated_keys
        return {key: golden_flat[key] for key in sorted(all_untranslated) if key in golden_flat}

    def _is_expected_identical(self, key: str, value: str) -> bool:
        """Return True for values that legitimately match en-GB in every language."""
        identical_patterns = ['language.direction']
        identical_values = {'ltr', 'rtl', 'True', 'False', 'true', 'false', 'unknown'}
        if value.strip() in identical_values:
            return True
        lowered = key.lower()
        return any(pattern in lowered for pattern in identical_patterns)
def main():
    """Parse the command line and emit the untranslated entries as compact JSON."""
    cli = argparse.ArgumentParser(description='Extract untranslated entries in compact format for AI translation')
    cli.add_argument('language', help='Language code (e.g., de-DE, fr-FR)')
    cli.add_argument('--locales-dir', default='frontend/public/locales', help='Path to locales directory')
    cli.add_argument('--ignore-file', default='scripts/ignore_translation.toml', help='Path to ignore patterns file')
    cli.add_argument('--max-entries', type=int, help='Maximum number of entries to output')
    cli.add_argument('--output', help='Output file (default: stdout)')
    args = cli.parse_args()

    extractor = CompactTranslationExtractor(args.locales_dir, args.ignore_file)
    entries = extractor.get_untranslated_entries(args.language)

    limit = args.max_entries
    if limit:
        # Keep only the first N entries, in the order they were returned.
        entries = dict(list(entries.items())[:limit])

    # No indentation and no spaces after separators keeps the payload small.
    payload = json.dumps(entries, separators=(',', ':'), ensure_ascii=False)

    if not args.output:
        print(payload)
        return

    with open(args.output, 'w', encoding='utf-8') as sink:
        sink.write(payload)
    print(f"Extracted {len(entries)} untranslated entries to {args.output}", file=sys.stderr)


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,262 @@
#!/usr/bin/env python3
"""
JSON Beautifier and Structure Fixer for Stirling PDF Frontend
Restructures translation JSON files to match en-GB structure and key order exactly.
"""
import json
import os
import sys
from pathlib import Path
from typing import Dict, Any, List
import argparse
from collections import OrderedDict
class JSONBeautifier:
    """Rewrite translation files so their nesting and key order follow en-GB."""

    def __init__(self, locales_dir: str = "frontend/public/locales"):
        """Load the en-GB reference structure (exits via ``_load_json`` if it cannot be read)."""
        self.locales_dir = Path(locales_dir)
        self.golden_truth_file = self.locales_dir / "en-GB" / "translation.json"
        self.golden_structure = self._load_json(self.golden_truth_file)

    def _load_json(self, file_path: Path) -> Dict:
        """Load a JSON file preserving key order; print the problem and exit(1) on failure."""
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                return json.load(f, object_pairs_hook=OrderedDict)
        except FileNotFoundError:
            print(f"Error: File not found: {file_path}")
            sys.exit(1)
        except json.JSONDecodeError as e:
            print(f"Error: Invalid JSON in {file_path}: {e}")
            sys.exit(1)

    def _save_json(self, data: Dict, file_path: Path, backup: bool = True) -> None:
        """Write ``data`` to ``file_path``, first moving any existing file to a backup.

        The backup sits next to the file with the suffix
        ``.backup.restructured.json`` and replaces an earlier backup if any.
        """
        if backup and file_path.exists():
            backup_path = file_path.with_suffix('.backup.restructured.json')
            # replace() overwrites an earlier backup on every platform, whereas
            # rename() raises on Windows when the destination already exists.
            file_path.replace(backup_path)
            print(f"Backup created: {backup_path}")
        with open(file_path, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=2, ensure_ascii=False, separators=(',', ': '))

    def _flatten_dict(self, d: Dict, parent_key: str = '', separator: str = '.') -> Dict[str, Any]:
        """Flatten a nested dict into dot-notation keys; non-dict values are kept as-is."""
        flat = {}
        for k, v in d.items():
            new_key = f"{parent_key}{separator}{k}" if parent_key else k
            if isinstance(v, dict):
                flat.update(self._flatten_dict(v, new_key, separator))
            else:
                flat[new_key] = v
        return flat

    def _rebuild_structure(self, flat_dict: Dict[str, Any], reference_structure: Dict) -> Dict:
        """Rebuild a nested dict shaped like ``reference_structure`` from flat translations.

        Only keys present in the reference are kept, in reference order;
        reference keys with no translation are omitted.
        """
        def build_recursive(ref_obj: Any, current_path: str = '') -> Any:
            if not isinstance(ref_obj, dict):
                # Leaf node - return the translation if it exists
                return flat_dict.get(current_path, None)
            result = OrderedDict()
            for key, value in ref_obj.items():
                new_path = f"{current_path}.{key}" if current_path else key
                if new_path in flat_dict:
                    if isinstance(value, dict) and not isinstance(flat_dict[new_path], str):
                        # Reference is a section and the flat value is not a
                        # plain string: rebuild the section from its children.
                        result[key] = build_recursive(value, new_path)
                    else:
                        result[key] = flat_dict[new_path]
                elif isinstance(value, dict):
                    # No direct translation: look for translated children.
                    nested_result = build_recursive(value, new_path)
                    if nested_result:  # Only add if we found some translations
                        result[key] = nested_result
                # An untranslated leaf is simply left out.
            return result if result else None

        return build_recursive(reference_structure) or OrderedDict()

    def restructure_translation_file(self, target_file: Path) -> Dict[str, Any]:
        """Return the content of ``target_file`` rebuilt in en-GB shape (``{}`` if it does not exist)."""
        if not target_file.exists():
            print(f"Error: Target file does not exist: {target_file}")
            return {}
        target_data = self._load_json(target_file)
        flat_target = self._flatten_dict(target_data)
        return self._rebuild_structure(flat_target, self.golden_structure)

    def beautify_and_restructure(self, target_file: Path, backup: bool = True) -> Dict[str, Any]:
        """Restructure ``target_file`` in place and return a summary of the result.

        The summary holds the language code (the parent directory name), the
        number of reference keys, the number of keys kept and the outcome of
        the structure comparison.
        """
        lang_code = target_file.parent.name
        print(f"Restructuring {lang_code} translation file...")
        restructured_data = self.restructure_translation_file(target_file)
        self._save_json(restructured_data, target_file, backup)
        total_keys = len(self._flatten_dict(self.golden_structure))
        preserved_keys = len(self._flatten_dict(restructured_data))
        result = {
            'language': lang_code,
            'total_reference_keys': total_keys,
            'preserved_keys': preserved_keys,
            'structure_match': self._compare_structures(self.golden_structure, restructured_data)
        }
        print(f"Restructured {lang_code}: {preserved_keys}/{total_keys} keys preserved")
        return result

    def _compare_structures(self, ref: Dict, target: Dict) -> Dict[str, Any]:
        """Report reference keys that are missing from ``target``.

        Returns ``{'structures_match': bool, 'issues': first ten messages,
        'total_issues': int}``. Messages follow reference key order, so the
        output is deterministic.
        """
        def compare_recursive(r: Any, t: Any, path: str = '') -> List[str]:
            issues = []
            if isinstance(r, dict) and isinstance(t, dict):
                # Missing keys at this level first, then descend.
                for section in r:
                    if section not in t:
                        full_path = f"{path}.{section}" if path else section
                        issues.append(f"Missing section: {full_path}")
                for key in r:
                    if key in t:
                        new_path = f"{path}.{key}" if path else key
                        issues.extend(compare_recursive(r[key], t[key], new_path))
            return issues

        issues = compare_recursive(ref, target)
        return {
            'structures_match': len(issues) == 0,
            'issues': issues[:10],  # Limit to first 10 issues
            'total_issues': len(issues)
        }

    def validate_key_order(self, target_file: Path) -> Dict[str, Any]:
        """Check that keys shared with en-GB appear in the same relative order.

        Key paths are collected depth-first; the order is preserved exactly
        when the shared paths form the same sequence in both files.
        """
        target_data = self._load_json(target_file)

        def get_key_order(obj: Dict, path: str = '') -> List[str]:
            keys = []
            for key in obj.keys():
                new_path = f"{path}.{key}" if path else key
                keys.append(new_path)
                if isinstance(obj[key], dict):
                    keys.extend(get_key_order(obj[key], new_path))
            return keys

        golden_order = get_key_order(self.golden_structure)
        target_order = get_key_order(target_data)
        common_keys = set(golden_order) & set(target_order)
        # Paths are unique within a file, so comparing the two filtered
        # sequences is equivalent to checking every pair of shared keys.
        golden_common = [key for key in golden_order if key in common_keys]
        target_common = [key for key in target_order if key in common_keys]
        return {
            'order_preserved': golden_common == target_common,
            'common_keys_count': len(common_keys),
            'golden_keys_count': len(golden_order),
            'target_keys_count': len(target_order)
        }
def main():
    """Parse the command line and restructure or validate one or all language files."""
    cli = argparse.ArgumentParser(description='Beautify and restructure translation JSON files')
    cli.add_argument('--locales-dir', default='frontend/public/locales',
                     help='Path to locales directory')
    cli.add_argument('--language', help='Restructure specific language only')
    cli.add_argument('--all-languages', action='store_true',
                     help='Restructure all language files')
    cli.add_argument('--no-backup', action='store_true',
                     help='Skip backup creation')
    cli.add_argument('--validate-only', action='store_true',
                     help='Only validate structure, do not modify files')
    args = cli.parse_args()

    beautifier = JSONBeautifier(args.locales_dir)

    if args.language:
        # Single-language mode.
        target_file = Path(args.locales_dir) / args.language / "translation.json"
        if not target_file.exists():
            print(f"Error: Translation file not found for language: {args.language}")
            sys.exit(1)

        if args.validate_only:
            order_result = beautifier.validate_key_order(target_file)
            print(f"Key order validation for {args.language}:")
            print(f" Order preserved: {order_result['order_preserved']}")
            print(f" Common keys: {order_result['common_keys_count']}/{order_result['golden_keys_count']}")
            return

        result = beautifier.beautify_and_restructure(target_file, backup=not args.no_backup)
        match_info = result['structure_match']
        print(f"\nResults for {result['language']}:")
        print(f" Keys preserved: {result['preserved_keys']}/{result['total_reference_keys']}")
        if match_info['total_issues'] > 0:
            print(f" Structure issues: {match_info['total_issues']}")
            for issue in match_info['issues']:
                print(f" - {issue}")
        return

    if not args.all_languages:
        cli.print_help()
        return

    # All-languages mode: every sub-directory except the en-GB reference.
    results = []
    for lang_dir in Path(args.locales_dir).iterdir():
        if not lang_dir.is_dir() or lang_dir.name == "en-GB":
            continue
        translation_file = lang_dir / "translation.json"
        if not translation_file.exists():
            continue
        if args.validate_only:
            order_result = beautifier.validate_key_order(translation_file)
            print(f"{lang_dir.name}: Order preserved = {order_result['order_preserved']}")
        else:
            results.append(
                beautifier.beautify_and_restructure(translation_file, backup=not args.no_backup)
            )

    if not args.validate_only and results:
        banner = '=' * 60
        print(f"\n{banner}")
        print("RESTRUCTURING SUMMARY")
        print(f"{banner}")
        for entry in sorted(results, key=lambda x: x['language']):
            kept = entry['preserved_keys']
            total = entry['total_reference_keys']
            percent = kept / total * 100
            print(f"{entry['language']}: {kept}/{total} keys "
                  f"({percent:.1f}%)")


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,259 @@
#!/usr/bin/env python3
"""
JSON Validator for Translation Files
Validates JSON syntax in translation files and reports detailed error information.
Useful for validating batch translation files before merging.
Usage:
python3 json_validator.py <file_or_pattern>
python3 json_validator.py ar_AR_batch_*.json
python3 json_validator.py ar_AR_batch_1_of_3.json
python3 json_validator.py --all-batches ar_AR
"""
import json
import sys
import argparse
import glob
from pathlib import Path
def get_line_context(file_path, line_num, context_lines=3):
    """Return the lines surrounding ``line_num`` (1-based), one per row.

    Each row shows a marker, the right-aligned line number and the line text;
    the row for ``line_num`` itself carries the ``>>>`` marker. If the file
    cannot be read, a "Could not read context" message is returned instead.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as handle:
            all_lines = handle.readlines()
        first = max(0, line_num - context_lines - 1)
        last = min(len(all_lines), line_num + context_lines)
        target_index = line_num - 1
        rows = []
        for idx in range(first, last):
            if idx == target_index:
                prefix = ">>> "
            else:
                prefix = " "
            text = all_lines[idx].rstrip()
            rows.append(f"{prefix}{idx+1:4d}: {text}")
        return "\n".join(rows)
    except Exception as e:
        return f"Could not read context: {e}"
def get_character_context(file_path, char_pos, context_chars=100):
    """Return the text around character offset ``char_pos`` of a file.

    The result is a dict with the text before the offset, the character at
    the offset (``"EOF"`` when the offset is at or past the end), the text
    after it, and a ``display`` string with the character in square brackets.
    Returns ``None`` if the file cannot be read.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as handle:
            text = handle.read()
        window_start = max(0, char_pos - context_chars)
        window_end = min(len(text), char_pos + context_chars)
        leading = text[window_start:char_pos]
        if char_pos < len(text):
            culprit = text[char_pos]
        else:
            culprit = "EOF"
        trailing = text[char_pos+1:window_end]
        return dict(
            before=leading,
            error_char=culprit,
            after=trailing,
            display=f"{leading}[{culprit}]{trailing}",
        )
    except Exception:
        return None
def validate_json_file(file_path):
    """Parse one JSON file and return a report dict describing the outcome.

    On success ``valid`` is True and ``entry_count`` is the number of
    top-level keys (0 when the document is not an object). On a decode error
    the message, line, column, character position and surrounding context are
    filled in. Any other failure is reported through ``error`` only.
    """
    report = {'file': str(file_path), 'valid': False}
    for field in ('error', 'line', 'column', 'position', 'context', 'char_context'):
        report[field] = None
    report['entry_count'] = 0

    try:
        with open(file_path, 'r', encoding='utf-8') as handle:
            parsed = json.load(handle)
    except json.JSONDecodeError as exc:
        report['error'] = exc.msg
        report['line'] = exc.lineno
        report['column'] = exc.colno
        report['position'] = exc.pos
        report['context'] = get_line_context(file_path, exc.lineno)
        report['char_context'] = get_character_context(file_path, exc.pos)
    except FileNotFoundError:
        report['error'] = "File not found"
    except Exception as exc:
        report['error'] = str(exc)
    else:
        report['valid'] = True
        report['entry_count'] = len(parsed) if isinstance(parsed, dict) else 0
    return report
def print_validation_result(result, verbose=True):
    """Print one validation report produced by ``validate_json_file``.

    A valid file gets a single line. An invalid file gets the error, its
    location when known and, with ``verbose``, the line and character context,
    followed by a blank line.
    """
    file_name = Path(result['file']).name

    if result['valid']:
        print(f"{file_name}: Valid JSON ({result['entry_count']} entries)")
        return

    print(f"{file_name}: Invalid JSON")
    print(f" Error: {result['error']}")

    if result['line']:
        print(f" Location: Line {result['line']}, Column {result['column']} (character {result['position']})")

    line_context = result['context']
    if verbose and line_context:
        print(f"\n Context:")
        for row in line_context.split('\n'):
            print(f" {row}")

    char_context = result['char_context']
    if verbose and char_context:
        print(f"\n Character context:")
        # Only the tail of the display string is shown to keep the line short.
        print(f" ...{char_context['display'][-150:]}...")
        print(f" Error character: {repr(char_context['error_char'])}")

    print()
def get_common_fixes(error_msg):
    """Return hints for a JSON decoder error message.

    Each rule pairs one or more substrings with the hints to give when any of
    them occurs in ``error_msg``; hints are returned in rule order.
    """
    rules = (
        (
            ("Expecting ',' delimiter",),
            (
                "Missing comma between JSON entries",
                "Check for unescaped quotes inside string values",
            ),
        ),
        (
            ("Invalid \\escape", "Invalid escape"),
            (
                "Unescaped backslash in string (use \\\\ for literal backslash)",
                "Common in regex patterns: \\d should be \\\\d",
            ),
        ),
        (
            ("Expecting property name",),
            (
                "Missing or extra comma",
                "Trailing comma before closing brace",
            ),
        ),
        (
            ("Expecting value",),
            (
                "Missing value after colon",
                "Extra comma",
            ),
        ),
    )
    hints = []
    for needles, suggestions in rules:
        if any(needle in error_msg for needle in needles):
            hints.extend(suggestions)
    return hints
def main():
    """Validate the JSON files named on the command line.

    Returns 0 when every file is valid, otherwise 1 (also when no files were
    given or matched).
    """
    cli = argparse.ArgumentParser(
        description='Validate JSON syntax in translation files',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
Validate single file:
python3 json_validator.py ar_AR_batch_1_of_3.json
Validate all batches for a language:
python3 json_validator.py --all-batches ar_AR
Validate pattern:
python3 json_validator.py "ar_AR_batch_*.json"
Validate multiple files:
python3 json_validator.py file1.json file2.json file3.json
"""
    )
    cli.add_argument(
        'files',
        nargs='*',
        help='JSON file(s) to validate (supports wildcards)'
    )
    cli.add_argument(
        '--all-batches',
        metavar='LANGUAGE',
        help='Validate all batch files for a language (e.g., ar_AR)'
    )
    cli.add_argument(
        '--quiet',
        action='store_true',
        help='Only show files with errors'
    )
    cli.add_argument(
        '--brief',
        action='store_true',
        help='Brief output without context'
    )
    args = cli.parse_args()

    # Work out which files to check; --all-batches takes precedence over
    # positional file arguments.
    if args.all_batches:
        pattern = f"{args.all_batches}_batch_*.json"
        files_to_validate = glob.glob(pattern)
        if not files_to_validate:
            print(f"No batch files found matching: {pattern}")
            return 1
    elif args.files:
        files_to_validate = []
        for spec in args.files:
            has_wildcard = '*' in spec or '?' in spec
            if has_wildcard:
                files_to_validate.extend(glob.glob(spec))
            else:
                files_to_validate.append(spec)
    else:
        cli.print_help()
        return 1

    if not files_to_validate:
        print("No files to validate")
        return 1

    # Sorted so the report order does not depend on glob order.
    files_to_validate.sort()
    print(f"Validating {len(files_to_validate)} file(s)...\n")

    show_valid = not args.quiet
    results = []
    for candidate in files_to_validate:
        report = validate_json_file(candidate)
        results.append(report)
        if show_valid or not report['valid']:
            print_validation_result(report, verbose=not args.brief)

    valid_count = sum(1 for r in results if r['valid'])
    invalid_count = len(results) - valid_count
    print("=" * 60)
    print(f"Summary: {valid_count} valid, {invalid_count} invalid")

    if invalid_count > 0:
        all_errors = [r['error'] for r in results if r['error']]
        unique_error_types = set(all_errors)
        print("\nCommon fixes:")
        fixes_shown = set()
        for error in unique_error_types:
            for fix in get_common_fixes(error):
                if fix in fixes_shown:
                    continue
                print(f"{fix}")
                fixes_shown.add(fix)

    return 0 if invalid_count == 0 else 1


if __name__ == '__main__':
    sys.exit(main())

View File

@@ -0,0 +1,314 @@
#!/usr/bin/env python3
"""
Translation Analyzer for Stirling PDF Frontend
Compares language files against en-GB golden truth file.
"""
import json
import os
import sys
from pathlib import Path
from typing import Dict, List, Set, Tuple
import argparse
try:
import tomllib # Python 3.11+
except ImportError:
try:
import toml as tomllib_fallback
tomllib = None
except ImportError:
tomllib = None
tomllib_fallback = None
class TranslationAnalyzer:
    """Compare locale translation files against the en-GB golden truth file.

    Every locale is expected at ``<locales_dir>/<lang>/translation.json``.
    Nested JSON objects are compared through flattened dot-notation keys
    (``section.sub.key``).  Keys listed for a language in the ignore file are
    excluded from the missing-key report and from the completion rate.
    """

    def __init__(self, locales_dir: str = "frontend/public/locales", ignore_file: str = "scripts/ignore_translation.toml"):
        """Load the en-GB golden file and the per-language ignore patterns.

        The process exits (see ``_load_json``) when the golden file is missing
        or is not valid JSON.
        """
        self.locales_dir = Path(locales_dir)
        self.golden_truth_file = self.locales_dir / "en-GB" / "translation.json"
        self.golden_truth = self._load_json(self.golden_truth_file)
        self.ignore_file = Path(ignore_file)
        self.ignore_patterns = self._load_ignore_patterns()

    def _load_json(self, file_path: Path) -> Dict:
        """Return the parsed JSON content of *file_path*.

        Prints an error and exits with status 1 when the file does not exist
        or cannot be decoded as JSON.
        """
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                return json.load(f)
        except FileNotFoundError:
            print(f"Error: File not found: {file_path}")
            sys.exit(1)
        except json.JSONDecodeError as e:
            print(f"Error: Invalid JSON in {file_path}: {e}")
            sys.exit(1)

    def _load_ignore_patterns(self) -> Dict[str, Set[str]]:
        """Load ignore patterns from the TOML file as ``{lang: {key, ...}}``.

        Returns an empty mapping when the file does not exist.  Any failure
        while reading or interpreting the file is reported as a warning and
        also yields an empty mapping (best effort: analysis still runs).
        """
        if not self.ignore_file.exists():
            return {}
        try:
            if tomllib:
                # Python 3.11+ built-in parser (requires a binary file handle)
                with open(self.ignore_file, 'rb') as f:
                    ignore_data = tomllib.load(f)
            elif tomllib_fallback:
                # Third-party ``toml`` package
                ignore_data = tomllib_fallback.load(self.ignore_file)
            else:
                # No TOML library available: minimal built-in parser
                ignore_data = self._parse_simple_toml()

            patterns_by_lang = {}
            for lang, data in ignore_data.items():
                # Only tables can carry an ``ignore`` list; a stray top-level
                # scalar must not invalidate the whole file.
                if not isinstance(data, dict):
                    continue
                patterns = data.get('ignore', [])
                if patterns:
                    # Sets give constant-time membership checks later on
                    patterns_by_lang[lang] = set(patterns)
            return patterns_by_lang
        except Exception as e:
            print(f"Warning: Could not load ignore file {self.ignore_file}: {e}")
            return {}

    def _parse_simple_toml(self) -> Dict:
        """Parse the ignore file without a TOML library (fallback).

        Only the subset needed for ignore lists is understood: ``[section]``
        headers and an ``ignore`` array whose items are single- or
        double-quoted strings, written inline (``ignore = ['a', 'b']``) or
        one or more per line.  Returns ``{section: {'ignore': [items]}}``.
        """
        ignore_data = {}
        current_section = None
        with open(self.ignore_file, 'r', encoding='utf-8') as f:
            for raw_line in f:
                line = raw_line.strip()
                if not line or line.startswith('#'):
                    continue
                if line.startswith('[') and line.endswith(']'):
                    current_section = line[1:-1]
                    ignore_data[current_section] = {'ignore': []}
                    continue
                if not current_section:
                    continue

                name, equals, rest = line.partition('=')
                if equals and name.strip() == 'ignore':
                    # ``ignore = [`` possibly followed by inline items
                    payload = rest
                elif line[0] in ('"', "'"):
                    # Continuation line of a multi-line array
                    payload = line
                else:
                    continue

                # Treat brackets as separators so inline arrays split cleanly
                for chunk in payload.replace('[', ',').replace(']', ',').split(','):
                    chunk = chunk.strip()
                    if chunk[:1] not in ('"', "'"):
                        continue
                    item = chunk.strip('\'"')
                    if item:
                        ignore_data[current_section]['ignore'].append(item)
        return ignore_data

    def _flatten_dict(self, d: Dict, parent_key: str = '', separator: str = '.') -> Dict[str, str]:
        """Flatten nested dictionary into dot-notation keys.

        Leaf values are converted with ``str()`` so they can be compared as
        text regardless of their JSON type.
        """
        flat = {}
        for k, v in d.items():
            new_key = f"{parent_key}{separator}{k}" if parent_key else k
            if isinstance(v, dict):
                flat.update(self._flatten_dict(v, new_key, separator))
            else:
                flat[new_key] = str(v)
        return flat

    def _ignore_set_for(self, target_file: Path) -> Set[str]:
        """Return the ignored keys for the language directory of *target_file*.

        Directory names use ``-`` (``fr-FR``) while ignore sections are looked
        up with ``_`` (``fr_FR``).
        """
        lang_code = target_file.parent.name.replace('-', '_')
        return self.ignore_patterns.get(lang_code, set())

    def _load_flat_pair(self, target_file: Path) -> Tuple[Dict[str, str], Dict[str, str]]:
        """Return ``(golden_flat, target_flat)`` for *target_file*."""
        target_data = self._load_json(target_file)
        return self._flatten_dict(self.golden_truth), self._flatten_dict(target_data)

    def _collect_untranslated(self, golden_flat: Dict[str, str], target_flat: Dict[str, str],
                              ignore_set: Set[str]) -> Set[str]:
        """Return keys marked ``[UNTRANSLATED]`` or left identical to en-GB.

        Identical values are not reported when the key is ignored for the
        language or when ``_is_expected_identical`` accepts the pair.
        """
        untranslated = set()
        for key, target_value in target_flat.items():
            if key not in golden_flat:
                continue
            golden_value = golden_flat[key]
            if isinstance(target_value, str) and target_value.startswith("[UNTRANSLATED]"):
                untranslated.add(key)
            elif (golden_value == target_value and key not in ignore_set
                    and not self._is_expected_identical(key, golden_value)):
                untranslated.add(key)
        return untranslated

    def get_all_language_files(self) -> List[Path]:
        """Get all translation.json files except en-GB, sorted by path."""
        files = []
        for lang_dir in self.locales_dir.iterdir():
            if lang_dir.is_dir() and lang_dir.name != "en-GB":
                translation_file = lang_dir / "translation.json"
                if translation_file.exists():
                    files.append(translation_file)
        return sorted(files)

    def find_missing_translations(self, target_file: Path) -> Set[str]:
        """Find keys that exist in en-GB but are missing in the target file.

        Keys ignored for the target language are not reported.
        """
        golden_flat, target_flat = self._load_flat_pair(target_file)
        missing = set(golden_flat) - set(target_flat)
        return missing - self._ignore_set_for(target_file)

    def find_untranslated_entries(self, target_file: Path) -> Set[str]:
        """Find entries that appear to be untranslated (identical to en-GB)."""
        golden_flat, target_flat = self._load_flat_pair(target_file)
        return self._collect_untranslated(golden_flat, target_flat, self._ignore_set_for(target_file))

    def _is_expected_identical(self, key: str, value: str) -> bool:
        """Check if a key-value pair is expected to be identical across languages."""
        # Key fragments whose values should be identical across languages
        identical_patterns = [
            'language.direction',
            'true', 'false',
            'unknown'
        ]
        # Values that are often identical (text direction, booleans)
        if value.strip() in ['ltr', 'rtl', 'True', 'False']:
            return True
        # Substring match against the lower-cased key
        lowered_key = key.lower()
        return any(pattern in lowered_key for pattern in identical_patterns)

    def find_extra_translations(self, target_file: Path) -> Set[str]:
        """Find keys that exist in the target file but not in en-GB."""
        golden_flat, target_flat = self._load_flat_pair(target_file)
        return set(target_flat) - set(golden_flat)

    def analyze_file(self, target_file: Path) -> Dict:
        """Complete analysis of a single translation file.

        The target file is read and flattened once; missing, untranslated and
        extra keys are all derived from that single snapshot.  The returned
        ``language`` uses the underscore form of the directory name (for
        example ``fr_FR``).
        """
        golden_flat, target_flat = self._load_flat_pair(target_file)
        ignore_set = self._ignore_set_for(target_file)
        golden_keys = set(golden_flat)
        target_keys = set(target_flat)

        missing = golden_keys - target_keys - ignore_set
        untranslated = self._collect_untranslated(golden_flat, target_flat, ignore_set)
        extra = target_keys - golden_keys

        # Completion rate is measured over en-GB keys that are not ignored
        relevant_keys = golden_keys - ignore_set
        total_keys = len(relevant_keys)
        # A key counts as translated when it is present and was not flagged
        # as untranslated ([UNTRANSLATED] marker or unexpected en-GB copy).
        properly_translated = sum(
            1 for key in relevant_keys
            if key in target_flat and key not in untranslated
        )
        completion_rate = (properly_translated / total_keys) * 100 if total_keys > 0 else 0

        return {
            'language': target_file.parent.name.replace('-', '_'),
            'file': target_file,
            'missing_count': len(missing),
            'missing_keys': sorted(missing),
            'untranslated_count': len(untranslated),
            'untranslated_keys': sorted(untranslated),
            'extra_count': len(extra),
            'extra_keys': sorted(extra),
            'total_keys': total_keys,
            'completion_rate': completion_rate
        }

    def analyze_all_files(self) -> List[Dict]:
        """Analyze all translation files, sorted by language code."""
        results = [self.analyze_file(file_path) for file_path in self.get_all_language_files()]
        return sorted(results, key=lambda x: x['language'])
def main():
    """Command-line entry point for the translation analyzer.

    Analyzes either one language (``--language``) or every locale found next
    to en-GB, then prints the results as JSON or as a text report followed by
    an overall summary.
    """
    parser = argparse.ArgumentParser(description='Analyze translation files against en-GB golden truth')
    parser.add_argument('--locales-dir', default='frontend/public/locales',
                        help='Path to locales directory')
    parser.add_argument('--ignore-file', default='scripts/ignore_translation.toml',
                        help='Path to ignore patterns TOML file')
    parser.add_argument('--language', help='Analyze specific language only')
    parser.add_argument('--missing-only', action='store_true',
                        help='Show only missing translations')
    parser.add_argument('--untranslated-only', action='store_true',
                        help='Show only untranslated entries')
    parser.add_argument('--summary', action='store_true',
                        help='Show summary statistics only')
    parser.add_argument('--format', choices=['text', 'json'], default='text',
                        help='Output format')
    args = parser.parse_args()

    analyzer = TranslationAnalyzer(args.locales_dir, args.ignore_file)

    if not args.language:
        results = analyzer.analyze_all_files()
    else:
        target_file = Path(args.locales_dir) / args.language / "translation.json"
        if not target_file.exists():
            print(f"Error: Translation file not found for language: {args.language}")
            sys.exit(1)
        results = [analyzer.analyze_file(target_file)]

    if args.format == 'json':
        # default=str serialises the Path stored under 'file'
        print(json.dumps(results, indent=2, default=str))
        return

    def show_keys(heading, keys, limit):
        """Print *heading*, the first *limit* keys, and an overflow note."""
        print(heading)
        for key in keys[:limit]:
            print(f" - {key}")
        overflow = len(keys) - limit
        if overflow > 0:
            print(f" ... and {overflow} more")

    banner = '=' * 60

    # Text format output: one section per language
    for entry in results:
        print(f"\n{banner}")
        print(f"Language: {entry['language']}")
        print(f"File: {entry['file']}")
        print(f"Completion Rate: {entry['completion_rate']:.1f}%")
        print(f"Total Keys in en-GB: {entry['total_keys']}")

        if args.summary:
            continue
        if not args.untranslated_only:
            show_keys(f"\nMissing Translations ({entry['missing_count']}):",
                      entry['missing_keys'], 10)
        if not args.missing_only:
            show_keys(f"\nUntranslated Entries ({entry['untranslated_count']}):",
                      entry['untranslated_keys'], 10)
        if entry['extra_count'] > 0:
            show_keys(f"\nExtra Keys Not in en-GB ({entry['extra_count']}):",
                      entry['extra_keys'], 5)

    # Overall summary across every analyzed language
    print(f"\n{banner}")
    print("SUMMARY")
    print(banner)
    if results:
        avg_completion = sum(r['completion_rate'] for r in results) / len(results)
    else:
        avg_completion = 0
    print(f"Average Completion Rate: {avg_completion:.1f}%")
    print(f"Languages Analyzed: {len(results)}")

    # Rank languages from most to least complete
    ranked = sorted(results, key=lambda r: r['completion_rate'], reverse=True)
    print("\nTop 5 Most Complete Languages:")
    for entry in ranked[:5]:
        print(f" {entry['language']}: {entry['completion_rate']:.1f}%")
    print("\nBottom 5 Languages Needing Attention:")
    for entry in ranked[-5:]:
        print(f" {entry['language']}: {entry['completion_rate']:.1f}% ({entry['missing_count']} missing, {entry['untranslated_count']} untranslated)")


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,371 @@
#!/usr/bin/env python3
"""
Translation Merger for Stirling PDF Frontend
Merges missing translations from en-GB into target language files.
Useful for AI-assisted translation workflows.
"""
import json
import os
import sys
from pathlib import Path
from typing import Dict, List, Set, Tuple, Any
import argparse
import shutil
from datetime import datetime
try:
import tomllib # Python 3.11+
except ImportError:
try:
import toml as tomllib_fallback
tomllib = None
except ImportError:
tomllib = None
tomllib_fallback = None
class TranslationMerger:
    """Merge missing en-GB entries into locale files and apply translations.

    Every locale is expected at ``<locales_dir>/<lang>/translation.json``.
    Keys are addressed in dot notation (``section.sub.key``).  Entries copied
    from en-GB can be prefixed with ``[UNTRANSLATED]`` so they are easy to
    find and translate later.
    """

    def __init__(self, locales_dir: str = "frontend/public/locales", ignore_file: str = "scripts/ignore_translation.toml"):
        """Load the en-GB golden file and the per-language ignore patterns.

        The process exits (see ``_load_json``) when the golden file is missing
        or is not valid JSON.
        """
        self.locales_dir = Path(locales_dir)
        self.golden_truth_file = self.locales_dir / "en-GB" / "translation.json"
        self.golden_truth = self._load_json(self.golden_truth_file)
        self.ignore_file = Path(ignore_file)
        self.ignore_patterns = self._load_ignore_patterns()

    def _load_json(self, file_path: Path) -> Dict:
        """Return the parsed JSON content of *file_path*.

        Prints an error and exits with status 1 when the file does not exist
        or cannot be decoded as JSON.
        """
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                return json.load(f)
        except FileNotFoundError:
            print(f"Error: File not found: {file_path}")
            sys.exit(1)
        except json.JSONDecodeError as e:
            print(f"Error: Invalid JSON in {file_path}: {e}")
            sys.exit(1)

    def _save_json(self, data: Dict, file_path: Path, backup: bool = True) -> None:
        """Write *data* to *file_path* as indented UTF-8 JSON.

        When *backup* is true and the file already exists, a timestamped copy
        (``<name>.backup.<YYYYmmdd_HHMMSS>.json``) is created first.  The
        parent directory is created when missing so a brand-new language can
        be written.
        """
        if backup and file_path.exists():
            backup_path = file_path.with_suffix(f'.backup.{datetime.now().strftime("%Y%m%d_%H%M%S")}.json')
            shutil.copy2(file_path, backup_path)
            print(f"Backup created: {backup_path}")
        # A new language has no directory yet; open() would fail without it
        file_path.parent.mkdir(parents=True, exist_ok=True)
        with open(file_path, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=2, ensure_ascii=False)

    def _load_ignore_patterns(self) -> Dict[str, Set[str]]:
        """Load ignore patterns from the TOML file as ``{section: {key, ...}}``.

        Uses a minimal line-based parser: ``[section]`` headers plus single-
        or double-quoted items, written inline after ``ignore =`` or on their
        own lines.  Returns an empty mapping when the file does not exist or
        cannot be read (a warning is printed in the latter case).
        """
        if not self.ignore_file.exists():
            return {}
        try:
            ignore_data = {}
            current_section = None
            with open(self.ignore_file, 'r', encoding='utf-8') as f:
                for raw_line in f:
                    line = raw_line.strip()
                    if not line or line.startswith('#'):
                        continue
                    if line.startswith('[') and line.endswith(']'):
                        current_section = line[1:-1]
                        ignore_data[current_section] = set()
                        continue
                    if not current_section:
                        continue

                    name, equals, rest = line.partition('=')
                    if equals and name.strip() == 'ignore':
                        # ``ignore = [`` possibly followed by inline items
                        payload = rest
                    elif line[0] in ('"', "'"):
                        # Item line(s) of a multi-line array
                        payload = line
                    else:
                        continue

                    # Treat brackets as separators so inline arrays split cleanly
                    for chunk in payload.replace('[', ',').replace(']', ',').split(','):
                        chunk = chunk.strip()
                        if chunk[:1] not in ('"', "'"):
                            continue
                        item = chunk.strip('\'"')
                        if item:
                            ignore_data[current_section].add(item)
            return ignore_data
        except Exception as e:
            print(f"Warning: Could not load ignore file {self.ignore_file}: {e}")
            return {}

    def _get_nested_value(self, data: Dict, key_path: str) -> Any:
        """Get value from nested dict using dot notation (``None`` if absent)."""
        current = data
        for key in key_path.split('.'):
            if isinstance(current, dict) and key in current:
                current = current[key]
            else:
                return None
        return current

    def _set_nested_value(self, data: Dict, key_path: str, value: Any) -> None:
        """Set value in nested dict using dot notation.

        Intermediate dictionaries are created as needed.  An intermediate
        value that is not a dictionary is replaced by an empty one (with a
        warning), discarding the previous value.
        """
        keys = key_path.split('.')
        current = data
        for key in keys[:-1]:
            if key not in current:
                current[key] = {}
            elif not isinstance(current[key], dict):
                # A leaf sits where a container is needed; nesting is only
                # possible after replacing it.
                print(f"Warning: Converting non-dict value at '{key}' to dict to allow nesting")
                current[key] = {}
            current = current[key]
        current[keys[-1]] = value

    def _flatten_dict(self, d: Dict, parent_key: str = '', separator: str = '.') -> Dict[str, Any]:
        """Flatten nested dictionary into dot-notation keys (values untouched)."""
        flat = {}
        for k, v in d.items():
            new_key = f"{parent_key}{separator}{k}" if parent_key else k
            if isinstance(v, dict):
                flat.update(self._flatten_dict(v, new_key, separator))
            else:
                flat[new_key] = v
        return flat

    def get_missing_keys(self, target_file: Path) -> List[str]:
        """Get sorted list of en-GB keys missing from the target file.

        Keys ignored for the target language are left out.  A target file
        that does not exist is treated as empty.
        """
        lang_code = target_file.parent.name.replace('-', '_')
        ignore_set = self.ignore_patterns.get(lang_code, set())
        golden_keys = set(self._flatten_dict(self.golden_truth))
        if not target_file.exists():
            return sorted(golden_keys - ignore_set)
        target_keys = set(self._flatten_dict(self._load_json(target_file)))
        return sorted(golden_keys - target_keys - ignore_set)

    def add_missing_translations(self, target_file: Path, keys_to_add: List[str] = None,
                                 mark_untranslated: bool = True) -> Dict:
        """Add missing translations from en-GB to the target data.

        *keys_to_add* restricts the operation to the given keys; ``None``
        means "every key reported by ``get_missing_keys``".  An explicit empty
        list adds nothing.  String values are prefixed with ``[UNTRANSLATED]``
        when *mark_untranslated* is true.  Nothing is written to disk; the
        merged data is returned under ``'data'``.
        """
        if not target_file.exists():
            target_data = {}
        else:
            target_data = self._load_json(target_file)
        golden_flat = self._flatten_dict(self.golden_truth)
        # ``is None`` (not truthiness) so that [] really means "no keys"
        if keys_to_add is None:
            missing_keys = self.get_missing_keys(target_file)
        else:
            missing_keys = keys_to_add
        added_count = 0
        for key in missing_keys:
            if key not in golden_flat:
                continue
            value = golden_flat[key]
            if mark_untranslated and isinstance(value, str):
                # Mark as untranslated for AI to translate later
                value = f"[UNTRANSLATED] {value}"
            self._set_nested_value(target_data, key, value)
            added_count += 1
        return {
            'added_count': added_count,
            'missing_keys': missing_keys,
            'data': target_data
        }

    def extract_untranslated_entries(self, target_file: Path, output_file: Path = None) -> Dict:
        """Extract entries marked as untranslated or identical to en-GB.

        Returns ``{key: {'original', 'current', 'reason'}}`` and, when
        *output_file* is given, also writes that mapping there as JSON.
        Returns an empty dict (after printing an error) when the target file
        does not exist.
        """
        if not target_file.exists():
            print(f"Error: Target file does not exist: {target_file}")
            return {}
        target_data = self._load_json(target_file)
        golden_flat = self._flatten_dict(self.golden_truth)
        target_flat = self._flatten_dict(target_data)
        untranslated_entries = {}
        for key, value in target_flat.items():
            if key not in golden_flat:
                continue
            golden_value = golden_flat[key]
            if isinstance(value, str) and value.startswith("[UNTRANSLATED]"):
                reason = 'marked_untranslated'
            elif value == golden_value and not self._is_expected_identical(key, value):
                # Identical to golden although it should be translated
                reason = 'identical_to_english'
            else:
                continue
            untranslated_entries[key] = {
                'original': golden_value,
                'current': value,
                'reason': reason
            }
        if output_file:
            with open(output_file, 'w', encoding='utf-8') as f:
                json.dump(untranslated_entries, f, indent=2, ensure_ascii=False)
        return untranslated_entries

    def _is_expected_identical(self, key: str, value: str) -> bool:
        """Check if a key-value pair is expected to be identical across languages."""
        identical_patterns = [
            'language.direction',
        ]
        if str(value).strip() in ['ltr', 'rtl', 'True', 'False', 'true', 'false']:
            return True
        lowered_key = key.lower()
        return any(pattern in lowered_key for pattern in identical_patterns)

    def apply_translations(self, target_file: Path, translations: Dict[str, str],
                           backup: bool = True) -> Dict:
        """Apply provided translations to the target file.

        A leading ``[UNTRANSLATED]`` marker is stripped from each translation.
        Per-key failures are collected under ``'errors'`` instead of aborting.
        The file is saved only when at least one translation was applied.
        Returns ``{'success': False, 'error': ...}`` when the target file does
        not exist.
        """
        if not target_file.exists():
            print(f"Error: Target file does not exist: {target_file}")
            return {'success': False, 'error': 'File not found'}
        target_data = self._load_json(target_file)
        marker = "[UNTRANSLATED]"
        applied_count = 0
        errors = []
        for key, translation in translations.items():
            try:
                if translation.startswith(marker):
                    # Drop only the leading marker; any later occurrence is
                    # part of the translated text itself.
                    translation = translation[len(marker):].strip()
                self._set_nested_value(target_data, key, translation)
                applied_count += 1
            except Exception as e:
                errors.append(f"Error setting {key}: {e}")
        if applied_count > 0:
            self._save_json(target_data, target_file, backup)
        return {
            'success': True,
            'applied_count': applied_count,
            'errors': errors,
            'data': target_data
        }

    def create_translation_template(self, target_file: Path, output_file: Path) -> None:
        """Create a template file for AI translation with context.

        The template holds a ``metadata`` block and one ``translations`` entry
        per untranslated key, each with an empty ``translated`` field.
        """
        untranslated = self.extract_untranslated_entries(target_file)
        template = {
            'metadata': {
                'source_language': 'en-GB',
                'target_language': target_file.parent.name,
                'total_entries': len(untranslated),
                'created_at': datetime.now().isoformat(),
                'instructions': 'Translate the "original" values to the target language. Keep the same keys.'
            },
            'translations': {}
        }
        for key, entry in untranslated.items():
            template['translations'][key] = {
                'original': entry['original'],
                'translated': '',  # AI should fill this
                'context': self._get_context_for_key(key),
                'reason': entry['reason']
            }
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(template, f, indent=2, ensure_ascii=False)
        print(f"Translation template created: {output_file}")
        print(f"Contains {len(untranslated)} entries to translate")

    def _get_context_for_key(self, key: str) -> str:
        """Get context information for a translation key."""
        parts = key.split('.')
        if len(parts) >= 2:
            return f"Section: {parts[0]}, Property: {parts[-1]}"
        return f"Property: {parts[-1]}"
def main():
    """Command-line entry point for the translation merger.

    Takes a positional language code followed by one sub-command:
    ``add-missing``, ``extract-untranslated``, ``create-template`` or
    ``apply-translations``.  Prints the help text when no sub-command is
    given.
    """
    parser = argparse.ArgumentParser(description='Merge and manage translation files')
    parser.add_argument('--locales-dir', default='frontend/public/locales',
                        help='Path to locales directory')
    parser.add_argument('--ignore-file', default='scripts/ignore_translation.toml',
                        help='Path to ignore patterns TOML file')
    parser.add_argument('language', help='Target language code (e.g., fr-FR)')
    subparsers = parser.add_subparsers(dest='command', help='Available commands')

    # Add missing command
    add_parser = subparsers.add_parser('add-missing', help='Add missing translations from en-GB')
    add_parser.add_argument('--no-backup', action='store_true', help='Skip backup creation')
    add_parser.add_argument('--mark-untranslated', dest='mark_untranslated', action='store_true', default=True,
                            help='Mark added translations as [UNTRANSLATED]')
    # store_true with default=True could never be switched off; this flag
    # shares the same destination and provides the opt-out.
    add_parser.add_argument('--no-mark-untranslated', dest='mark_untranslated', action='store_false',
                            help='Add en-GB text as-is, without the [UNTRANSLATED] marker')

    # Extract untranslated command
    extract_parser = subparsers.add_parser('extract-untranslated', help='Extract untranslated entries')
    extract_parser.add_argument('--output', help='Output file path')

    # Create template command
    template_parser = subparsers.add_parser('create-template', help='Create AI translation template')
    template_parser.add_argument('--output', required=True, help='Output template file path')

    # Apply translations command
    apply_parser = subparsers.add_parser('apply-translations', help='Apply translations from JSON file')
    apply_parser.add_argument('--translations-file', required=True, help='JSON file with translations')
    apply_parser.add_argument('--no-backup', action='store_true', help='Skip backup creation')

    args = parser.parse_args()
    if not args.command:
        parser.print_help()
        return

    merger = TranslationMerger(args.locales_dir, args.ignore_file)
    target_file = Path(args.locales_dir) / args.language / "translation.json"

    if args.command == 'add-missing':
        print(f"Adding missing translations to {args.language}...")
        result = merger.add_missing_translations(
            target_file,
            mark_untranslated=args.mark_untranslated
        )
        merger._save_json(result['data'], target_file, backup=not args.no_backup)
        print(f"Added {result['added_count']} missing translations")

    elif args.command == 'extract-untranslated':
        output_file = Path(args.output) if args.output else target_file.with_suffix('.untranslated.json')
        untranslated = merger.extract_untranslated_entries(target_file, output_file)
        print(f"Extracted {len(untranslated)} untranslated entries to {output_file}")

    elif args.command == 'create-template':
        output_file = Path(args.output)
        merger.create_translation_template(target_file, output_file)

    elif args.command == 'apply-translations':
        with open(args.translations_file, 'r', encoding='utf-8') as f:
            translations_data = json.load(f)
        # Accept either the template format ({'translations': {key: {...}}})
        # or a plain {key: text} mapping; empty template entries are skipped.
        if 'translations' in translations_data:
            translations = {k: v['translated'] for k, v in translations_data['translations'].items()
                            if v.get('translated')}
        else:
            translations = translations_data
        result = merger.apply_translations(target_file, translations, backup=not args.no_backup)
        if result['success']:
            print(f"Applied {result['applied_count']} translations")
            if result['errors']:
                print(f"Errors: {len(result['errors'])}")
                for error in result['errors'][:5]:
                    print(f" - {error}")
        else:
            print(f"Failed: {result.get('error', 'Unknown error')}")


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,229 @@
#!/usr/bin/env python3
"""
Validate JSON structure and formatting of translation files.
Checks for:
- Valid JSON syntax
- Consistent key structure with en-GB
- Missing keys
- Extra keys not in en-GB
- Malformed entries
Usage:
python scripts/translations/validate_json_structure.py [--language LANG]
"""
import json
import sys
from pathlib import Path
from typing import Dict, List, Set
import argparse
def get_all_keys(d: dict, parent_key: str = '', sep: str = '.') -> Set[str]:
    """Collect every key path of a nested dict in dot notation.

    Both container keys and leaf keys are included, so ``{'a': {'b': 1}}``
    yields ``{'a', 'a.b'}``.
    """
    collected: Set[str] = set()
    for name, child in d.items():
        path = f"{parent_key}{sep}{name}" if parent_key else name
        collected.add(path)
        if isinstance(child, dict):
            # Descend with the current path as the new prefix
            collected |= get_all_keys(child, path, sep=sep)
    return collected
def validate_json_file(file_path: Path) -> tuple[bool, str]:
    """Check that *file_path* can be read and parsed as JSON.

    Returns ``(True, "Valid JSON")`` on success, otherwise ``(False, reason)``
    where *reason* gives the parse position or the read error.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as handle:
            json.load(handle)
    except json.JSONDecodeError as err:
        return False, f"Invalid JSON at line {err.lineno}, column {err.colno}: {err.msg}"
    except Exception as err:
        return False, f"Error reading file: {str(err)}"
    else:
        return True, "Valid JSON"
def validate_structure(
    en_gb_keys: Set[str],
    lang_keys: Set[str],
    lang_code: str
) -> Dict:
    """Summarise how a language's key set differs from the en-GB key set.

    The result lists keys missing from the language and keys it has beyond
    en-GB (both sorted), along with the corresponding counts.
    """
    only_in_en_gb = sorted(en_gb_keys - lang_keys)
    only_in_lang = sorted(lang_keys - en_gb_keys)
    summary = {'language': lang_code}
    summary['missing_keys'] = only_in_en_gb
    summary['extra_keys'] = only_in_lang
    summary['total_keys'] = len(lang_keys)
    summary['expected_keys'] = len(en_gb_keys)
    summary['missing_count'] = len(only_in_en_gb)
    summary['extra_count'] = len(only_in_lang)
    return summary
def print_validation_result(result: Dict, verbose: bool = False):
    """Print one language's structure-validation result in readable form.

    *result* is a mapping as produced by ``validate_structure``.  Key lists
    are printed in full when *verbose* is true or when they hold at most 20
    entries; longer lists are replaced by a hint to use ``--verbose``.
    """
    def print_key_section(heading: str, keys: List[str], count: int) -> None:
        """Print a heading followed by the keys or the --verbose hint."""
        print(heading)
        if verbose or count <= 20:
            # --verbose promises *all* keys, so no cap is applied here
            for key in keys:
                print(f" - {key}")
        else:
            print(" (use --verbose to see all)")

    lang = result['language']
    print(f"\n{'='*100}")
    print(f"Language: {lang}")
    print(f"{'='*100}")
    print(f" Total keys: {result['total_keys']}")
    print(f" Expected keys (en-GB): {result['expected_keys']}")
    print(f" Missing keys: {result['missing_count']}")
    print(f" Extra keys: {result['extra_count']}")

    if result['missing_count'] == 0 and result['extra_count'] == 0:
        print(" ✅ Structure matches en-GB perfectly!")
    else:
        if result['missing_count'] > 0:
            print_key_section(
                f"\n ⚠️ Missing {result['missing_count']} key(s):",
                result['missing_keys'],
                result['missing_count'],
            )
        if result['extra_count'] > 0:
            print_key_section(
                f"\n ⚠️ Extra {result['extra_count']} key(s) not in en-GB:",
                result['extra_keys'],
                result['extra_count'],
            )
    print("-" * 100)
def main():
    """Command-line entry point for the JSON structure validator.

    Compares the key structure of each locale's ``translation.json`` with
    en-GB and exits with status 1 when any file has invalid JSON, missing
    keys or extra keys (status 0 otherwise).
    """
    parser = argparse.ArgumentParser(
        description='Validate translation JSON structure'
    )
    parser.add_argument(
        '--language',
        default=None,
        help='Specific language code to validate (e.g., es-ES)'
    )
    parser.add_argument(
        '--verbose', '-v',
        action='store_true',
        help='Show all missing/extra keys'
    )
    parser.add_argument(
        '--json',
        action='store_true',
        help='Output results as JSON'
    )
    args = parser.parse_args()

    # Paths are relative to the current working directory
    locales_dir = Path('frontend/public/locales')
    en_gb_path = locales_dir / 'en-GB' / 'translation.json'
    if not en_gb_path.exists():
        print(f"❌ Error: en-GB translation file not found at {en_gb_path}")
        sys.exit(1)

    # The reference file itself must be valid before anything is compared
    golden_ok, golden_message = validate_json_file(en_gb_path)
    if not golden_ok:
        print(f"❌ Error in en-GB file: {golden_message}")
        sys.exit(1)

    with open(en_gb_path, 'r', encoding='utf-8') as f:
        en_gb_keys = get_all_keys(json.load(f))

    # One explicit language, or every locale directory that has a file
    if args.language:
        languages = [args.language]
    else:
        languages = [
            d.name for d in locales_dir.iterdir()
            if d.is_dir() and d.name != 'en-GB' and (d / 'translation.json').exists()
        ]

    results = []
    json_errors = []
    for lang_code in sorted(languages):
        lang_path = locales_dir / lang_code / 'translation.json'
        if not lang_path.exists():
            print(f"⚠️ Warning: {lang_code}/translation.json not found, skipping")
            continue

        # Syntax errors are reported separately from structure differences
        lang_ok, lang_message = validate_json_file(lang_path)
        if not lang_ok:
            json_errors.append({
                'language': lang_code,
                'file': str(lang_path),
                'error': lang_message
            })
            continue

        with open(lang_path, 'r', encoding='utf-8') as f:
            lang_keys = get_all_keys(json.load(f))
        results.append(validate_structure(en_gb_keys, lang_keys, lang_code))

    if args.json:
        report = {
            'json_errors': json_errors,
            'structure_validation': results
        }
        print(json.dumps(report, indent=2, ensure_ascii=False))
    else:
        # Syntax errors first
        if json_errors:
            print("\n❌ JSON Syntax Errors:")
            print("=" * 100)
            for failure in json_errors:
                print(f"\nLanguage: {failure['language']}")
                print(f"File: {failure['file']}")
                print(f"Error: {failure['error']}")
            print("\n")

        # Then the structure comparison
        if results:
            print("\n📊 Structure Validation Summary:")
            print(f" Languages validated: {len(results)}")
            perfect = sum(1 for r in results if r['missing_count'] == 0 and r['extra_count'] == 0)
            print(f" Perfect matches: {perfect}/{len(results)}")
            total_missing = sum(r['missing_count'] for r in results)
            total_extra = sum(r['extra_count'] for r in results)
            print(f" Total missing keys: {total_missing}")
            print(f" Total extra keys: {total_extra}")
            for entry in results:
                print_validation_result(entry, verbose=args.verbose)
            if not json_errors and perfect == len(results):
                print("\n✅ All translations have perfect structure!")

    # Non-zero exit status when any issue was found
    has_issues = bool(json_errors) or any(
        r['missing_count'] > 0 or r['extra_count'] > 0 for r in results
    )
    sys.exit(1 if has_issues else 0)


if __name__ == '__main__':
    main()

View File

@@ -0,0 +1,189 @@
#!/usr/bin/env python3
"""
Validate that translation files have the same placeholders as en-GB (source of truth).
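Usage:
    python scripts/translations/validate_placeholders.py [--language LANG] [--verbose] [--json]
    --language: Validate specific language (e.g., es-ES, de-DE)
    --verbose:  Show full text samples for each issue
    --json:     Output results as JSON
    Note: automatic fixing (--fix) is not implemented; the argument parser accepts only the options above.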
"""
import json
import re
import sys
from pathlib import Path
from typing import Dict, List, Set, Tuple
import argparse
def find_placeholders(text: str) -> Set[str]:
    """Return the distinct placeholders in *text*, e.g. ``{n}`` or ``{{var}}``.

    Anything that is not a string yields an empty set.
    """
    if isinstance(text, str):
        # One or two opening braces, non-brace content, one or two closing
        return {match.group(0) for match in re.finditer(r'\{\{?[^}]+\}\}?', text)}
    return set()
def flatten_dict(d: dict, parent_key: str = '', sep: str = '.') -> Dict[str, str]:
    """Flatten a nested dict into a single level with dot-notation keys.

    Leaf values are kept unchanged; nested dicts contribute only their
    leaves, so an empty nested dict produces no entry.
    """
    flat = {}
    for name, value in d.items():
        path = f"{parent_key}{sep}{name}" if parent_key else name
        if isinstance(value, dict):
            flat.update(flatten_dict(value, path, sep=sep))
        else:
            flat[path] = value
    return flat
def validate_language(
    en_gb_flat: Dict[str, str],
    lang_flat: Dict[str, str],
    lang_code: str
) -> List[Dict]:
    """Validate placeholders for a language against en-GB.

    Both arguments are flattened translation mappings.  Keys absent from
    *lang_flat* are skipped.  For every shared key whose placeholder set
    differs, an issue dict is returned with the placeholders that are
    ``missing`` from, or ``extra`` in, the translation.  Both are sorted
    lists so the result is deterministic and can be passed to
    ``json.dumps`` (sets are not JSON serializable).
    """
    issues = []
    for key, en_text in en_gb_flat.items():
        if key not in lang_flat:
            continue
        lang_text = lang_flat[key]
        en_placeholders = find_placeholders(en_text)
        lang_placeholders = find_placeholders(lang_text)
        if en_placeholders == lang_placeholders:
            continue
        issues.append({
            'language': lang_code,
            'key': key,
            'missing': sorted(en_placeholders - lang_placeholders),
            'extra': sorted(lang_placeholders - en_placeholders),
            'en_text': en_text,
            'lang_text': lang_text
        })
    return issues
def print_issues(issues: List[Dict], verbose: bool = False):
    """Print placeholder validation issues as a numbered, readable list.

    With *verbose*, the first 150 characters of the en-GB text and of the
    translated text are shown for each issue.
    """
    if not issues:
        print("✅ No placeholder validation issues found!")
        return

    print(f"❌ Found {len(issues)} placeholder validation issue(s):\n")
    print("=" * 100)
    divider = "-" * 100
    for number, entry in enumerate(issues, start=1):
        language = entry['language']
        print(f"\n{number}. Language: {language}")
        print(f" Key: {entry['key']}")
        missing = entry['missing']
        if missing:
            print(f" ⚠️ MISSING placeholders: {missing}")
        extra = entry['extra']
        if extra:
            print(f" ⚠️ EXTRA placeholders: {extra}")
        if verbose:
            # Text samples are truncated to keep the report compact
            print(f" EN-GB: {entry['en_text'][:150]}")
            print(f" {language}: {entry['lang_text'][:150]}")
        print(divider)
def main():
    """Command-line entry point for the placeholder validator.

    Loads ``frontend/public/locales/en-GB/translation.json`` (resolved
    relative to the current working directory) as the reference, compares
    every requested language against it and reports placeholder mismatches.

    Command-line options:
        --language CODE  Validate only this language; by default every
                         locale directory except en-GB that contains a
                         ``translation.json`` is validated.
        --verbose, -v    Include text samples in the human-readable report.
        --json           Print the issues as a JSON array instead.

    Diagnostics (missing reference file, skipped language) are written to
    stderr so that stdout stays parseable when ``--json`` is used.

    Raises:
        SystemExit: Always. Exit status is 0 when no issues were found and
            1 when issues were found or the en-GB file is missing.
    """
    parser = argparse.ArgumentParser(
        description='Validate translation placeholder consistency'
    )
    parser.add_argument(
        '--language',
        help='Specific language code to validate (e.g., es-ES)',
        default=None
    )
    parser.add_argument(
        '--verbose', '-v',
        action='store_true',
        help='Show full text samples for each issue'
    )
    parser.add_argument(
        '--json',
        action='store_true',
        help='Output results as JSON'
    )
    args = parser.parse_args()

    # Define paths
    locales_dir = Path('frontend/public/locales')
    en_gb_path = locales_dir / 'en-GB' / 'translation.json'
    if not en_gb_path.exists():
        print(
            f"❌ Error: en-GB translation file not found at {en_gb_path}",
            file=sys.stderr,
        )
        sys.exit(1)

    # Load en-GB (source of truth)
    with open(en_gb_path, 'r', encoding='utf-8') as f:
        en_gb = json.load(f)
    en_gb_flat = flatten_dict(en_gb)

    # Get list of languages to validate
    if args.language:
        languages = [args.language]
    else:
        # Validate all languages except en-GB
        languages = [
            d.name for d in locales_dir.iterdir()
            if d.is_dir() and d.name != 'en-GB' and (d / 'translation.json').exists()
        ]

    all_issues = []

    # Validate each language
    for lang_code in sorted(languages):
        lang_path = locales_dir / lang_code / 'translation.json'
        if not lang_path.exists():
            # Reachable when --language names a locale that has no file.
            print(
                f"⚠️ Warning: {lang_code}/translation.json not found, skipping",
                file=sys.stderr,
            )
            continue
        with open(lang_path, 'r', encoding='utf-8') as f:
            lang_data = json.load(f)
        lang_flat = flatten_dict(lang_data)
        all_issues.extend(validate_language(en_gb_flat, lang_flat, lang_code))

    # Output results
    if args.json:
        # The 'missing'/'extra' fields are sets, which the json module cannot
        # serialize; emit them as sorted lists so the output is valid JSON
        # and stable between runs.
        serializable = [
            {
                **issue,
                'missing': sorted(issue['missing']),
                'extra': sorted(issue['extra']),
            }
            for issue in all_issues
        ]
        print(json.dumps(serializable, indent=2, ensure_ascii=False))
    elif all_issues:
        # Group by language
        by_language = {}
        for issue in all_issues:
            by_language.setdefault(issue['language'], []).append(issue)

        print("📊 Validation Summary:")
        print(f" Total issues: {len(all_issues)}")
        print(f" Languages with issues: {len(by_language)}\n")

        for lang in sorted(by_language):
            print(f"\n{'='*100}")
            print(f"Language: {lang} ({len(by_language[lang])} issue(s))")
            print(f"{'='*100}")
            print_issues(by_language[lang], verbose=args.verbose)
    else:
        print("✅ All translations have correct placeholders!")

    # Exit with error code if issues found
    sys.exit(1 if all_issues else 0)
# Run the validator only when this file is executed as a script, so that
# importing the module has no side effects.
if __name__ == '__main__':
    main()