Language updates plus --include-existing flag (#5212)

# Description of Changes

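<!--
Please provide a summary of the changes, including:

- What was changed
- Why the change was made
- Any challenges encountered

Closes #(issue_number)
-->

Summary (derived from the diff below):

- `translation_merger.py add-missing` now copies the en-GB text directly into the target file; the `[UNTRANSLATED]` prefix, the `mark_untranslated` parameter and the `--mark-untranslated` option are removed.
- `extract_untranslated()` gains an `include_existing` parameter, exposed as a new `--include-existing` CLI flag. By default only keys missing from the target file are extracted; with the flag, keys whose value equals the English text or still starts with a legacy `[UNTRANSLATED]` marker are extracted as well.
- `translate_language()` in the multi-language runner accepts `include_existing` and forwards `--include-existing` to the per-language script.
- The translation scripts README is updated accordingly, and the "[UNTRANSLATED] Pollution" troubleshooting section is removed.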

---

## Checklist

### General

- [ ] I have read the [Contribution
Guidelines](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/CONTRIBUTING.md)
- [ ] I have read the [Stirling-PDF Developer
Guide](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/devGuide/DeveloperGuide.md)
(if applicable)
- [ ] I have read the [How to add new languages to
Stirling-PDF](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/devGuide/HowToAddNewLanguage.md)
(if applicable)
- [ ] I have performed a self-review of my own code
- [ ] My changes generate no new warnings

### Documentation

- [ ] I have updated relevant docs on [Stirling-PDF's doc
repo](https://github.com/Stirling-Tools/Stirling-Tools.github.io/blob/main/docs/)
(if functionality has heavily changed)
- [ ] I have read the section [Add New Translation
Tags](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/devGuide/HowToAddNewLanguage.md#add-new-translation-tags)
(for new translation tags only)

### UI Changes (if applicable)

- [ ] Screenshots or videos demonstrating the UI changes are attached
(e.g., as comments or direct attachments in the PR)

### Testing (if applicable)

- [ ] I have tested my changes locally. Refer to the [Testing
Guide](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/devGuide/DeveloperGuide.md#6-testing)
for more details.
This commit is contained in:
Anthony Stirling
2025-12-10 11:41:11 +00:00
committed by GitHub
parent 7b26b184d1
commit 787d0d21c9
43 changed files with 4804 additions and 51 deletions

View File

@@ -174,9 +174,6 @@ Merges missing translations from en-GB into target language files and manages tr
 # Add missing translations from en-GB to French
 python scripts/translations/translation_merger.py fr-FR add-missing
-# Add without marking as [UNTRANSLATED]
-python scripts/translations/translation_merger.py fr-FR add-missing --no-mark-untranslated
 # Extract untranslated entries to a file
 python scripts/translations/translation_merger.py fr-FR extract-untranslated --output fr_untranslated.json
@@ -188,7 +185,7 @@ python scripts/translations/translation_merger.py fr-FR apply-translations --tra
 ```
 **Features:**
-- Adds missing keys from en-GB with optional [UNTRANSLATED] markers
+- Adds missing keys from en-GB (copies English text directly)
 - Extracts untranslated entries for external translation
 - Creates structured templates for AI translation
 - Applies translated content back to language files
@@ -442,7 +439,7 @@ Repeat steps 2-5 until 100% complete.
 #### Step 1: Add Missing Translations
 ```bash
-python scripts/translations/translation_merger.py fr-FR add-missing --mark-untranslated
+python scripts/translations/translation_merger.py fr-FR add-missing
 ```
 #### Step 2: Create AI Template
@@ -523,7 +520,7 @@ ignore = [
 ### Critical Rules for Translation
-1. **NEVER skip entries**: Translate ALL entries in each batch to avoid [UNTRANSLATED] pollution
+1. **NEVER skip entries**: Translate ALL entries in each batch to ensure completeness
 2. **Use appropriate batch sizes**: 100 entries for systematic translation, unlimited for compact method
 3. **Skip validation for placeholders**: Use `--skip-validation` when batch contains `{{variable}}` patterns
 4. **Check progress between batches**: Use `--summary` flag to track completion percentage
@@ -567,13 +564,6 @@ python scripts/translations/json_validator.py --all-batches ar_AR
 - Regex patterns: Double all backslashes (`\d` → `\\d`)
 - Check for missing/extra commas at line reported in error
-#### [UNTRANSLATED] Pollution
-**Problem**: Hundreds of [UNTRANSLATED] markers from incomplete translation attempts
-**Solution**:
-- Only translate complete batches of manageable size
-- Use analyzer that counts [UNTRANSLATED] as missing translations
-- Restore from backup if pollution occurs
 #### Validation False Positives
 **Problem**: Validator flags legitimate `{{variable}}` placeholders as artifacts
 **Solution**: Use `--skip-validation` flag when applying batches with template variables
@@ -674,7 +664,7 @@ python scripts/translations/ai_translation_helper.py apply-batch de_batch_1.json
 - **Missing Files**: Scripts create new files when language directories don't exist
 - **Invalid JSON**: Clear error messages with line numbers
 - **Placeholder Mismatches**: Validation warnings for missing or extra placeholders
-- **[UNTRANSLATED] Entries**: Counted as missing translations to prevent pollution
+- **Legacy [UNTRANSLATED] Markers**: Detected and stripped for backwards compatibility
 - **Backup Failures**: Graceful handling with user notification
 ## Integration with Development

View File

@@ -45,9 +45,10 @@ def load_translation_file(file_path):
 with open(file_path, 'rb') as f:
 return tomllib.load(f)
-def extract_untranslated(language_code, batch_size=500):
+def extract_untranslated(language_code, batch_size=500, include_existing=False):
 """Extract untranslated entries and split into batches."""
-print(f"\n🔍 Extracting untranslated entries for {language_code}...")
+mode = "all untranslated (including existing)" if include_existing else "new (missing)"
+print(f"\n🔍 Extracting {mode} entries for {language_code}...")
 # Load files
 golden_path = find_translation_file(Path('frontend/public/locales/en-GB'))
@@ -84,13 +85,19 @@ def extract_untranslated(language_code, batch_size=500):
 # Find untranslated
 untranslated = {}
 for key, value in golden_flat.items():
-if (key not in lang_flat or
-lang_flat.get(key) == value or
-(isinstance(lang_flat.get(key), str) and lang_flat.get(key).startswith("[UNTRANSLATED]"))):
-untranslated[key] = value
+if include_existing:
+# Include missing keys, keys with English values, and [UNTRANSLATED] keys
+if (key not in lang_flat or
+lang_flat.get(key) == value or
+(isinstance(lang_flat.get(key), str) and lang_flat.get(key).startswith("[UNTRANSLATED]"))):
+untranslated[key] = value
+else:
+# Only include missing keys (not in target file at all)
+if key not in lang_flat:
+untranslated[key] = value
 total = len(untranslated)
-print(f"Found {total} untranslated entries")
+print(f"Found {total} {mode} entries")
 if total == 0:
 print("✓ Language is already complete!")
@@ -268,6 +275,7 @@ Examples:
 parser.add_argument('--no-cleanup', action='store_true', help='Keep temporary batch files')
 parser.add_argument('--skip-verification', action='store_true', help='Skip final completion check')
 parser.add_argument('--timeout', type=int, default=600, help='Timeout per batch in seconds (default: 600 = 10 minutes)')
+parser.add_argument('--include-existing', action='store_true', help='Also retranslate existing keys that match English (default: only translate missing keys)')
 args = parser.parse_args()
@@ -287,7 +295,7 @@ Examples:
 try:
 # Step 1: Extract and split
-batch_files = extract_untranslated(args.language, args.batch_size)
+batch_files = extract_untranslated(args.language, args.batch_size, args.include_existing)
 if batch_files is None:
 sys.exit(1)

View File

@@ -87,7 +87,7 @@ def get_language_completion(locales_dir: Path, language: str) -> Optional[float]
 return None
-def translate_language(language: str, api_key: str, batch_size: int, timeout: int, skip_verification: bool) -> Tuple[str, bool, str]:
+def translate_language(language: str, api_key: str, batch_size: int, timeout: int, skip_verification: bool, include_existing: bool) -> Tuple[str, bool, str]:
 """
 Translate a single language.
 Returns: (language_code, success, message)
@@ -105,6 +105,9 @@ def translate_language(language: str, api_key: str, batch_size: int, timeout: in
 if skip_verification:
 cmd.append('--skip-verification')
+if include_existing:
+cmd.append('--include-existing')
 try:
 result = subprocess.run(
 cmd,
@@ -170,6 +173,8 @@ Note: Requires OPENAI_API_KEY environment variable or --api-key argument.
 help='Path to locales directory')
 parser.add_argument('--skip-verification', action='store_true',
 help='Skip final completion verification for each language')
+parser.add_argument('--include-existing', action='store_true',
+help='Also retranslate existing keys that match English (default: only translate missing keys)')
 parser.add_argument('--dry-run', action='store_true',
 help='Show what would be translated without actually translating')
@@ -253,7 +258,8 @@ Note: Requires OPENAI_API_KEY environment variable or --api-key argument.
 api_key,
 args.batch_size,
 args.timeout,
-args.skip_verification
+args.skip_verification,
+args.include_existing
 ): lang
 for lang in languages
 }

View File

@@ -117,8 +117,7 @@ class TranslationMerger:
 missing = set(golden_flat.keys()) - set(target_flat.keys())
 return sorted(missing - ignore_set)
-def add_missing_translations(self, target_file: Path, keys_to_add: List[str] = None,
-mark_untranslated: bool = True) -> Dict:
+def add_missing_translations(self, target_file: Path, keys_to_add: List[str] = None) -> Dict:
 """Add missing translations from en-GB to target file."""
 if not target_file.exists():
 target_data = {}
@@ -132,10 +131,7 @@ class TranslationMerger:
 for key in missing_keys:
 if key in golden_flat:
 value = golden_flat[key]
-if mark_untranslated and isinstance(value, str):
-# Mark as untranslated for AI to translate later
-value = f"[UNTRANSLATED] {value}"
+# Add the English value directly without [UNTRANSLATED] marker
 self._set_nested_value(target_data, key, value)
 added_count += 1
@@ -282,8 +278,6 @@ def main():
 # Add missing command
 add_parser = subparsers.add_parser('add-missing', help='Add missing translations from en-GB')
 add_parser.add_argument('--backup', action='store_true', help='Create backup before modifying files')
-add_parser.add_argument('--mark-untranslated', action='store_true', default=True,
-help='Mark added translations as [UNTRANSLATED]')
 # Extract untranslated command
 extract_parser = subparsers.add_parser('extract-untranslated', help='Extract untranslated entries')
@@ -312,10 +306,7 @@ def main():
 if args.command == 'add-missing':
 print(f"Adding missing translations to {args.language}...")
-result = merger.add_missing_translations(
-target_file,
-mark_untranslated=args.mark_untranslated
-)
+result = merger.add_missing_translations(target_file)
 merger._save_translation_file(result['data'], target_file, backup=args.backup)
 print(f"Added {result['added_count']} missing translations")