mirror of
https://github.com/Frooodle/Stirling-PDF.git
synced 2026-04-22 23:08:53 +02:00
garbage commit font remappings
This commit is contained in:
492
scripts/convert_cff_to_ttf.py
Normal file
492
scripts/convert_cff_to_ttf.py
Normal file
@@ -0,0 +1,492 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Wrap raw CFF/Type1C data (extracted from PDFs) as OpenType-CFF for web compatibility.
|
||||
Builds proper Unicode cmap from PDF ToUnicode data.
|
||||
"""
|
||||
import sys
|
||||
import re
|
||||
from pathlib import Path
|
||||
from io import BytesIO
|
||||
from fontTools.ttLib import TTFont, newTable
|
||||
from fontTools.cffLib import CFFFontSet
|
||||
from fontTools.ttLib.tables._c_m_a_p import cmap_format_4, cmap_format_12
|
||||
from fontTools.ttLib.tables._n_a_m_e import NameRecord
|
||||
from fontTools.ttLib.tables.O_S_2f_2 import Panose
|
||||
|
||||
def parse_unicode_mapping(mapping_path):
    """
    Parse Unicode mapping (either JSON with CharCode→CID→GID→Unicode or raw ToUnicode CMap).

    Args:
        mapping_path: Path to a JSON mapping file (CID fonts) or a raw
            ToUnicode CMap stream extracted from the PDF.

    Returns:
        dict[int, int]: GID → Unicode codepoint (empty dict on any parse failure)
    """

    def _dst_to_codepoint(hex_str):
        # bfchar/bfrange destinations are UTF-16BE hex strings. A value longer
        # than 4 hex digits is either a surrogate pair (a single codepoint
        # above U+FFFF) or a multi-character ligature expansion (e.g. "fi");
        # decode it and take the first codepoint instead of treating the whole
        # hex run as one huge integer.
        if len(hex_str) <= 4:
            return int(hex_str, 16)
        try:
            text = bytes.fromhex(hex_str).decode('utf-16-be')
            return ord(text[0]) if text else 0
        except (ValueError, UnicodeDecodeError):
            return int(hex_str, 16)

    try:
        with open(mapping_path, 'rb') as f:
            data = f.read().decode('utf-8', errors='ignore')

        # Try parsing as JSON first (CID font with complete mapping)
        if data.strip().startswith('{'):
            import json
            try:
                mapping_data = json.loads(data)
                if mapping_data.get('isCID'):
                    # Build GID → Unicode mapping from entries
                    gid_to_unicode = {}
                    for entry in mapping_data.get('entries', []):
                        gid = entry['gid']
                        unicode_val = entry['unicode']
                        if unicode_val > 0:
                            gid_to_unicode[gid] = unicode_val
                    print(f"Parsed JSON mapping: {len(gid_to_unicode)} GID→Unicode entries", file=sys.stderr)
                    return gid_to_unicode
            except json.JSONDecodeError:
                pass

        # Fall back to parsing raw ToUnicode CMap (non-CID fonts).
        # For non-CID fonts, CID/GID is the same as array index.
        gid_to_unicode = {}

        # Scope the regexes to the begin/end operator sections so that hex
        # pairs from codespacerange (and pairs straddling bfrange triplets)
        # are not misread as bfchar entries.
        bfchar_sections = re.findall(r'beginbfchar(.*?)endbfchar', data, re.DOTALL)
        bfrange_sections = re.findall(r'beginbfrange(.*?)endbfrange', data, re.DOTALL)
        if not bfchar_sections and not bfrange_sections:
            # Backward-compatible fallback for marker-less input: scan everything.
            bfchar_sections = [data]
            bfrange_sections = [data]

        # Pattern for bfchar entries: <srcCode> <dstUnicode>
        bfchar_pattern = r'<([0-9A-Fa-f]+)>\s*<([0-9A-Fa-f]+)>'
        for section in bfchar_sections:
            for match in re.finditer(bfchar_pattern, section):
                gid = int(match.group(1), 16)  # For non-CID, char code == GID
                unicode_val = _dst_to_codepoint(match.group(2))
                if unicode_val > 0:
                    gid_to_unicode[gid] = unicode_val

        # Pattern for bfrange entries: <srcStart> <srcEnd> <dstStart>
        bfrange_pattern = r'<([0-9A-Fa-f]+)>\s*<([0-9A-Fa-f]+)>\s*<([0-9A-Fa-f]+)>'
        for section in bfrange_sections:
            for match in re.finditer(bfrange_pattern, section):
                start_gid = int(match.group(1), 16)
                end_gid = int(match.group(2), 16)
                start_unicode = _dst_to_codepoint(match.group(3))
                for i, gid in enumerate(range(start_gid, end_gid + 1)):
                    unicode_val = start_unicode + i
                    if unicode_val > 0:
                        gid_to_unicode[gid] = unicode_val

        print(f"Parsed ToUnicode CMap: {len(gid_to_unicode)} mappings", file=sys.stderr)
        return gid_to_unicode

    except Exception as e:
        print(f"Warning: Failed to parse Unicode mapping: {e}", file=sys.stderr)
        return {}
|
||||
|
||||
def wrap_cff_as_otf(input_path, output_path, tounicode_path=None):
    """
    Wrap raw CFF data (from PDF font stream) as OpenType-CFF.

    Builds every required OpenType table (head, hhea, hmtx, maxp, cmap,
    OS/2, name, post) around the raw CFF outlines so the result is a
    complete, web-usable OTF file.

    Args:
        input_path: Path to input CFF data file
        output_path: Path to output OTF font
        tounicode_path: Optional path to ToUnicode CMap file

    Returns:
        True if successful, False otherwise
    """
    try:
        # Read raw CFF data
        with open(input_path, 'rb') as f:
            cff_data = f.read()

        # Parse raw CFF data
        cff_fontset = CFFFontSet()
        cff_fontset.decompile(BytesIO(cff_data), None)

        # Get the first (and usually only) font in the CFF set
        if len(cff_fontset.fontNames) == 0:
            print("ERROR: No fonts found in CFF data", file=sys.stderr)
            return False

        cff_font = cff_fontset[cff_fontset.fontNames[0]]

        # Parse Unicode mapping (JSON or raw ToUnicode CMap) if provided
        gid_to_unicode = {}
        if tounicode_path:
            gid_to_unicode = parse_unicode_mapping(tounicode_path)

        # Create a new OTF font
        otf = TTFont(sfntVersion='OTTO') # 'OTTO' = CFF-flavored OpenType

        # Get glyph names; '.notdef' is forced to be glyph 0 in either branch
        if hasattr(cff_font, 'charset') and cff_font.charset is not None:
            glyph_order = ['.notdef'] + [name for name in cff_font.charset if name != '.notdef']
        else:
            # Fallback to CharStrings keys
            charstrings = cff_font.CharStrings
            glyph_order = ['.notdef'] + [name for name in charstrings.keys() if name != '.notdef']

        otf.setGlyphOrder(glyph_order)

        # === Add CFF table (the actual font outlines) ===
        cff_table = newTable('CFF ')
        cff_table.cff = cff_fontset
        otf['CFF '] = cff_table

        # === Calculate metrics from CFF ===
        charstrings = cff_font.CharStrings

        # Get defaults from CFF Private dict
        private_dict = getattr(cff_font, 'Private', None)
        default_width = getattr(private_dict, 'defaultWidthX', 500) if private_dict else 500

        # Calculate bounding box, widths, and LSBs.
        # Seeded with conservative non-zero defaults so a font whose glyphs
        # have no computable bounds still gets a plausible bbox.
        x_min = 0
        y_min = -200
        x_max = 1000
        y_max = 800
        max_advance = 0
        min_lsb = 0
        min_rsb = 0
        max_extent = 0

        widths = {}   # glyph name -> advance width
        lsbs = {}     # glyph name -> left side bearing

        for glyph_name in glyph_order:
            lsb = 0
            width = int(default_width)

            if glyph_name in charstrings:
                try:
                    cs = charstrings[glyph_name]

                    # Get width from charstring
                    if hasattr(cs, 'width'):
                        width = int(cs.width)

                    # Calculate bounds for LSB and bbox
                    try:
                        bounds = cs.calcBounds(None)
                        if bounds:
                            glyph_xmin = int(bounds[0])
                            glyph_ymin = int(bounds[1])
                            glyph_xmax = int(bounds[2])
                            glyph_ymax = int(bounds[3])

                            lsb = glyph_xmin
                            rsb = width - glyph_xmax
                            # NOTE(review): OpenType defines xMaxExtent as
                            # lsb + (xMax - xMin), which equals glyph_xmax
                            # when lsb == xMin; this adds xMin twice — confirm.
                            extent = lsb + glyph_xmax

                            # Update global bounds
                            x_min = min(x_min, glyph_xmin)
                            y_min = min(y_min, glyph_ymin)
                            x_max = max(x_max, glyph_xmax)
                            y_max = max(y_max, glyph_ymax)

                            # Update hhea metrics
                            min_lsb = min(min_lsb, lsb)
                            min_rsb = min(min_rsb, rsb)
                            max_extent = max(max_extent, extent)
                    except:
                        pass # Some glyphs may not have outlines

                except Exception as e:
                    pass # Use defaults

            widths[glyph_name] = width
            lsbs[glyph_name] = lsb
            max_advance = max(max_advance, width)

        if max_advance == 0:
            max_advance = 1000
        if max_extent == 0:
            max_extent = x_max

        units_per_em = 1000 # Standard for Type1/CFF

        # === Create head table ===
        head = newTable('head')
        head.tableVersion = 1.0
        head.fontRevision = 1.0
        head.checkSumAdjustment = 0
        head.magicNumber = 0x5F0F3CF5
        head.flags = 0x000B # Baseline at y=0, LSB at x=0, integer PPEM
        head.unitsPerEm = units_per_em
        # Fixed timestamps (seconds since 1904-01-01) — keeps output
        # byte-deterministic across runs; presumably intentional.
        head.created = 3600000000
        head.modified = 3600000000
        head.xMin = x_min
        head.yMin = y_min
        head.xMax = x_max
        head.yMax = y_max
        head.macStyle = 0
        head.fontDirectionHint = 2
        head.indexToLocFormat = 0
        head.glyphDataFormat = 0
        head.lowestRecPPEM = 8
        otf['head'] = head

        # === Create hhea table with correct metrics ===
        hhea = newTable('hhea')
        hhea.tableVersion = 0x00010000
        # Ascent/descent at least cover the default 800/-200 band even if
        # the measured bbox is smaller.
        hhea.ascent = max(y_max, 800)
        hhea.descent = min(y_min, -200)
        hhea.lineGap = 0
        hhea.advanceWidthMax = max_advance
        hhea.minLeftSideBearing = min_lsb
        hhea.minRightSideBearing = min_rsb
        hhea.xMaxExtent = max_extent
        hhea.caretSlopeRise = 1
        hhea.caretSlopeRun = 0
        hhea.caretOffset = 0
        hhea.reserved0 = 0
        hhea.reserved1 = 0
        hhea.reserved2 = 0
        hhea.reserved3 = 0
        hhea.metricDataFormat = 0
        # Full hmtx: one longHorMetric per glyph (no trailing run compression)
        hhea.numberOfHMetrics = len(glyph_order)
        otf['hhea'] = hhea

        # === Create hmtx table with correct LSBs ===
        hmtx = newTable('hmtx')
        hmtx.metrics = {}
        for glyph_name in glyph_order:
            hmtx.metrics[glyph_name] = (widths.get(glyph_name, default_width), lsbs.get(glyph_name, 0))
        otf['hmtx'] = hmtx

        # === Create maxp table (simpler for CFF) ===
        maxp = newTable('maxp')
        maxp.tableVersion = 0x00005000 # CFF version (0.5)
        maxp.numGlyphs = len(glyph_order)
        otf['maxp'] = maxp

        # === Build Unicode cmap from GID→Unicode mapping ===
        unicode_to_glyph = {}

        if gid_to_unicode:
            # Debug: Show first few glyph names to understand naming convention
            sample_glyphs = glyph_order[:min(10, len(glyph_order))]
            print(f"Sample glyph names: {sample_glyphs}", file=sys.stderr)

            # Debug: Show which GIDs we have mappings for
            sample_gids = sorted(gid_to_unicode.keys())[:10]
            print(f"Sample GIDs from mapping: {sample_gids}", file=sys.stderr)

            # For CID fonts: glyph names are "cid00123" (5-digit zero-padded)
            # For non-CID fonts: glyph names vary but GID == array index
            is_cid_font = any(gn.startswith('cid') for gn in glyph_order[1:6]) # Check first few non-.notdef glyphs

            for gid, unicode_val in gid_to_unicode.items():
                if unicode_val > 0:
                    if is_cid_font:
                        # Build glyph name as cidNNNNN (5 digits, zero-padded)
                        glyph_name = f"cid{gid:05d}"
                        # Verify this glyph exists in glyph_order
                        # (linear scan per GID — O(n*m); acceptable for typical
                        # subset fonts, worth a set() if this ever shows up hot)
                        if glyph_name in glyph_order:
                            unicode_to_glyph[unicode_val] = glyph_name
                        else:
                            # Try without padding (some fonts use "cid123" not "cid00123")
                            glyph_name_alt = f"cid{gid}"
                            if glyph_name_alt in glyph_order:
                                unicode_to_glyph[unicode_val] = glyph_name_alt
                    else:
                        # Non-CID font: GID is array index
                        if 0 <= gid < len(glyph_order):
                            glyph_name = glyph_order[gid]
                            unicode_to_glyph[unicode_val] = glyph_name

        # 'is_cid_font' is only bound when gid_to_unicode was non-empty; the
        # conditional below short-circuits to 'unknown' in the empty case.
        print(f"Mapped {len(unicode_to_glyph)} Unicode codepoints (isCID={is_cid_font if gid_to_unicode else 'unknown'})", file=sys.stderr)

        # Also try to map from glyph names (uni0041 → U+0041)
        # ToUnicode-derived entries win: names only fill gaps.
        for glyph_name in glyph_order:
            if glyph_name.startswith('uni') and len(glyph_name) == 7:
                try:
                    unicode_val = int(glyph_name[3:], 16)
                    if unicode_val not in unicode_to_glyph:
                        unicode_to_glyph[unicode_val] = glyph_name
                except:
                    pass
            elif glyph_name.startswith('u') and len(glyph_name) >= 5:
                try:
                    unicode_val = int(glyph_name[1:], 16)
                    if unicode_val not in unicode_to_glyph:
                        unicode_to_glyph[unicode_val] = glyph_name
                except:
                    pass

        # === Create cmap table ===
        cmap = newTable('cmap')
        cmap.tableVersion = 0
        cmap_tables = []

        # Windows Unicode BMP (format 4) - required
        cmap4_win = cmap_format_4(4)
        cmap4_win.platformID = 3 # Windows
        cmap4_win.platEncID = 1 # Unicode BMP
        cmap4_win.language = 0
        cmap4_win.cmap = {cp: gn for cp, gn in unicode_to_glyph.items() if cp <= 0xFFFF}
        cmap_tables.append(cmap4_win)

        # Windows Unicode UCS-4 (format 12) - for >BMP
        if any(cp > 0xFFFF for cp in unicode_to_glyph):
            cmap12_win = cmap_format_12(12)
            cmap12_win.platformID = 3 # Windows
            cmap12_win.platEncID = 10 # Unicode UCS-4
            cmap12_win.language = 0
            cmap12_win.cmap = dict(unicode_to_glyph)
            cmap_tables.append(cmap12_win)

        # Mac Unicode (format 4) - for compatibility
        # NOTE(review): platform 1 / encoding 0 is nominally Mac Roman but the
        # subtable is keyed by Unicode codepoints here — confirm consumers
        # tolerate this (browsers generally ignore the Mac subtable).
        cmap4_mac = cmap_format_4(4)
        cmap4_mac.platformID = 1 # Mac
        cmap4_mac.platEncID = 0 # Roman
        cmap4_mac.language = 0
        cmap4_mac.cmap = {cp: gn for cp, gn in unicode_to_glyph.items() if cp <= 0xFFFF}
        cmap_tables.append(cmap4_mac)

        cmap.tables = [t for t in cmap_tables if t.cmap] or [cmap4_win] # Ensure at least one
        otf['cmap'] = cmap

        print(f"Built cmap with {len(unicode_to_glyph)} Unicode mappings", file=sys.stderr)

        # === Create OS/2 table with correct metrics ===
        os2 = newTable('OS/2')
        os2.version = 4
        os2.xAvgCharWidth = int(sum(widths.values()) / len(widths)) if widths else 500
        os2.usWeightClass = 400 # Normal
        os2.usWidthClass = 5 # Medium
        os2.fsType = 0 # Installable embedding
        os2.ySubscriptXSize = 650
        os2.ySubscriptYSize = 600
        os2.ySubscriptXOffset = 0
        os2.ySubscriptYOffset = 75
        os2.ySuperscriptXSize = 650
        os2.ySuperscriptYSize = 600
        os2.ySuperscriptXOffset = 0
        os2.ySuperscriptYOffset = 350
        os2.yStrikeoutSize = 50
        os2.yStrikeoutPosition = 300
        os2.sFamilyClass = 0

        # PANOSE - use proper object structure
        # (all zeros = "any"; real classification is unknown for extracted fonts)
        os2.panose = Panose()
        os2.panose.bFamilyType = 0
        os2.panose.bSerifStyle = 0
        os2.panose.bWeight = 0
        os2.panose.bProportion = 0
        os2.panose.bContrast = 0
        os2.panose.bStrokeVariation = 0
        os2.panose.bArmStyle = 0
        os2.panose.bLetterForm = 0
        os2.panose.bMidline = 0
        os2.panose.bXHeight = 0

        os2.ulUnicodeRange1 = 0
        os2.ulUnicodeRange2 = 0
        os2.ulUnicodeRange3 = 0
        os2.ulUnicodeRange4 = 0
        os2.achVendID = 'SPDF'
        os2.fsSelection = 0x0040 # REGULAR bit

        # Set character index range from actual cmap
        if unicode_to_glyph:
            codepoints = sorted(unicode_to_glyph.keys())
            os2.usFirstCharIndex = codepoints[0]
            os2.usLastCharIndex = codepoints[-1]
        else:
            os2.usFirstCharIndex = 0x20 # space
            os2.usLastCharIndex = 0x7E # tilde

        # Typo metrics match hhea
        os2.sTypoAscender = hhea.ascent
        os2.sTypoDescender = hhea.descent
        os2.sTypoLineGap = hhea.lineGap

        # Windows metrics (positive values, cover bbox)
        os2.usWinAscent = max(0, y_max)
        os2.usWinDescent = max(0, -y_min)

        os2.ulCodePageRange1 = 0x00000001 # Latin 1
        os2.ulCodePageRange2 = 0
        # sxHeight/sCapHeight are fixed guesses, not measured from outlines
        os2.sxHeight = 500
        os2.sCapHeight = 700
        os2.usDefaultChar = 0
        os2.usBreakChar = 32
        os2.usMaxContext = 0
        otf['OS/2'] = os2

        # === Create name table with Windows and Mac records ===
        name = newTable('name')
        name.names = []

        # Get font name from CFF if available
        font_name = cff_fontset.fontNames[0] if cff_fontset.fontNames else "Converted"

        name_strings = {
            1: font_name, # Font Family
            2: "Regular", # Subfamily
            3: f"Stirling-PDF: {font_name}", # Unique ID
            4: font_name, # Full Name
            5: "Version 1.0", # Version
            6: font_name.replace(' ', '-'), # PostScript Name
        }

        # Add both Windows and Mac name records
        for name_id, value in name_strings.items():
            # Windows (platform 3, encoding 1, language 0x0409 = en-US)
            rec_win = NameRecord()
            rec_win.nameID = name_id
            rec_win.platformID = 3
            rec_win.platEncID = 1
            rec_win.langID = 0x0409
            rec_win.string = value
            name.names.append(rec_win)

            # Mac (platform 1, encoding 0, language 0)
            rec_mac = NameRecord()
            rec_mac.nameID = name_id
            rec_mac.platformID = 1
            rec_mac.platEncID = 0
            rec_mac.langID = 0
            rec_mac.string = value
            name.names.append(rec_mac)

        otf['name'] = name

        # === Create post table (format 3.0 for smaller web fonts) ===
        post = newTable('post')
        post.formatType = 3.0 # No glyph names (smaller, web-optimized)
        post.italicAngle = 0
        post.underlinePosition = -100
        post.underlineThickness = 50
        post.isFixedPitch = 0
        post.minMemType42 = 0
        post.maxMemType42 = 0
        post.minMemType1 = 0
        post.maxMemType1 = 0
        otf['post'] = post

        # Save the OTF font
        otf.save(output_path)
        otf.close()

        return True

    except Exception as e:
        print(f"ERROR: Conversion failed: {str(e)}", file=sys.stderr)
        import traceback
        traceback.print_exc(file=sys.stderr)
        return False
|
||||
|
||||
def main():
    """CLI entry point: validate arguments, then run the CFF→OTF wrapper.

    Exits 0 on success, 1 on bad arguments, a missing input file, or a
    failed conversion.
    """
    argv = sys.argv
    if len(argv) < 3:
        print("Usage: convert_cff_to_ttf.py <input.cff> <output.otf> [tounicode.cmap]", file=sys.stderr)
        sys.exit(1)

    input_path, output_path = Path(argv[1]), Path(argv[2])
    tounicode_path = None
    if len(argv) > 3:
        tounicode_path = Path(argv[3])

    if not input_path.exists():
        print(f"ERROR: Input file not found: {input_path}", file=sys.stderr)
        sys.exit(1)

    # A missing ToUnicode file is non-fatal: proceed without a cmap source.
    if tounicode_path is not None and not tounicode_path.exists():
        print(f"Warning: ToUnicode file not found: {tounicode_path}", file=sys.stderr)
        tounicode_path = None

    tounicode_arg = str(tounicode_path) if tounicode_path else None
    ok = wrap_cff_as_otf(str(input_path), str(output_path), tounicode_arg)
    sys.exit(0 if ok else 1)
|
||||
|
||||
# Entry-point guard: run only when executed as a script, not on import.
if __name__ == '__main__':
    main()
|
||||
Reference in New Issue
Block a user