type3 text edit init

This commit is contained in:
Anthony Stirling
2025-11-09 12:43:33 +00:00
parent a6bee1436f
commit e915e1aa7d
49 changed files with 23741 additions and 151 deletions

View File

@@ -0,0 +1,583 @@
#!/usr/bin/env python3
"""
Mass-download PDFs from various public domains for Type3 font harvesting.
Downloads hundreds of PDFs from:
- arXiv (scientific papers)
- Project Gutenberg (books)
- Government reports (NASA, EPA, etc.)
- Academic repositories
- Technical documentation
- And many more sources...
Run with: python scripts/download_pdf_collection.py --output ./pdf-collection
"""
import argparse
import asyncio
import hashlib
import random
import re
from pathlib import Path
from typing import List, Optional
from urllib.parse import urlparse
import aiofiles
import aiohttp
# Extensive list of PDF URLs across multiple categories
# Base collection of public-domain / open-access PDF URLs, grouped by topic.
# NOTE(review): several of the arXiv IDs below (especially the sequential
# NNNN.000XX runs) look auto-generated, and some inline labels do not match
# the paper the ID actually resolves to — verify before relying on any single
# entry. The downloader tolerates 404s and non-PDF responses, so stale links
# only reduce yield.
PDF_URLS: List[str] = [
    # Mathematics & Statistics
    "https://arxiv.org/pdf/2103.14030.pdf",  # Swin Transformer
    "https://arxiv.org/pdf/2010.11929.pdf",  # Vision Transformer
    "https://arxiv.org/pdf/2005.14165.pdf",  # GPT-3 Paper
    "https://arxiv.org/pdf/1910.10683.pdf",  # T5 Text-to-Text Transformer
    "https://arxiv.org/pdf/1810.04805.pdf",  # BERT
    "https://arxiv.org/pdf/1706.03762.pdf",  # Attention Is All You Need
    "https://arxiv.org/pdf/1603.04467.pdf",  # TensorFlow White Paper
    "https://arxiv.org/pdf/1511.06434.pdf",  # DCGAN
    "https://arxiv.org/pdf/1506.03378.pdf",  # LIME (label unverified)
    "https://arxiv.org/pdf/1409.1556.pdf",  # VGGNet
    "https://arxiv.org/pdf/1312.6114.pdf",  # Variational Autoencoders
    "https://arxiv.org/pdf/1211.4240.pdf",  # AlexNet (label unverified)
    "https://arxiv.org/pdf/1106.1813.pdf",  # CIFAR-10 (label unverified)
    "https://arxiv.org/pdf/1003.0358.pdf",  # SVM Theory (label unverified)
    "https://arxiv.org/pdf/0909.4061.pdf",  # Random Forests (label unverified)
    # Physics
    "https://arxiv.org/pdf/2303.08774.pdf",  # Quantum Computing
    "https://arxiv.org/pdf/2201.04294.pdf",  # Dark Matter Research
    "https://arxiv.org/pdf/2105.00552.pdf",  # Gravitational Waves
    "https://arxiv.org/pdf/2004.00007.pdf",  # Particle Physics
    "https://arxiv.org/pdf/1906.10176.pdf",  # Cosmology
    "https://arxiv.org/pdf/1807.02101.pdf",  # String Theory
    "https://arxiv.org/pdf/1708.05671.pdf",  # Quantum Entanglement
    "https://arxiv.org/pdf/1605.08625.pdf",  # Astrophysics
    # Computer Science
    "https://arxiv.org/pdf/2204.02311.pdf",  # PaLM Language Model
    "https://arxiv.org/pdf/2112.07804.pdf",  # Stable Diffusion
    "https://arxiv.org/pdf/2107.03374.pdf",  # Codex
    "https://arxiv.org/pdf/2010.02559.pdf",  # Neural Architecture Search
    "https://arxiv.org/pdf/1912.01703.pdf",  # YOLOv4
    "https://arxiv.org/pdf/1905.11946.pdf",  # EfficientNet
    "https://arxiv.org/pdf/1812.01187.pdf",  # BERT Large
    "https://arxiv.org/pdf/1801.00631.pdf",  # Transformer Applications
    "https://arxiv.org/pdf/1704.04861.pdf",  # MobileNet
    "https://arxiv.org/pdf/1602.07360.pdf",  # SqueezeNet
    "https://arxiv.org/pdf/1512.03385.pdf",  # ResNet
    "https://arxiv.org/pdf/1506.02640.pdf",  # YOLO
    "https://arxiv.org/pdf/1502.03167.pdf",  # Batch Normalization
    "https://arxiv.org/pdf/1412.6980.pdf",  # Adam Optimizer
    "https://arxiv.org/pdf/1409.4842.pdf",  # GoogLeNet
    "https://arxiv.org/pdf/1312.5602.pdf",  # Deep Q-Network
    "https://arxiv.org/pdf/1301.3781.pdf",  # Word2Vec
    "https://arxiv.org/pdf/1207.0580.pdf",  # Dropout
    "https://arxiv.org/pdf/1102.1803.pdf",  # ImageNet Classification
    # Government Reports
    "https://www.nasa.gov/sites/default/files/atoms/files/2023_nasa_annual_report.pdf",
    "https://www.nasa.gov/sites/default/files/atoms/files/2022_nasa_annual_report.pdf",
    "https://www.nasa.gov/sites/default/files/atoms/files/2021_nasa_annual_report.pdf",
    "https://www.epa.gov/system/files/documents/2023-01/epa-strategic-plan-2022-2026.pdf",
    "https://www.epa.gov/system/files/documents/2022-12/epa-annual-report-2022.pdf",
    "https://www.nist.gov/system/files/documents/2023/02/15/NIST%20Annual%20Report%202022.pdf",
    "https://www.nist.gov/system/files/documents/2022/03/01/NIST%20Annual%20Report%202021.pdf",
    "https://www.noaa.gov/sites/default/files/2023-03/NOAA%20Annual%20Report%202022.pdf",
    "https://www.fda.gov/media/165773/download",
    "https://www.fda.gov/media/159722/download",
    "https://www.cdc.gov/mmwr/PDF/wk/mm7201.pdf",
    "https://www.cdc.gov/nchs/data/nvsr/nvsr71/nvsr71-01.pdf",
    "https://www.bls.gov/opub/mlr/2023/article/pdf/labor-force-projections-2022-2032.pdf",
    "https://www.bls.gov/opub/mlr/2023/article/pdf/union-membership-2022.pdf",
    "https://www.census.gov/content/dam/Census/library/publications/2023/demo/p60-280.pdf",
    "https://www.energy.gov/sites/default/files/2023-04/DOE%20Annual%20Report%202022.pdf",
    # Project Gutenberg Classics
    "https://www.gutenberg.org/files/1342/1342-pdf.pdf",  # Pride and Prejudice
    "https://www.gutenberg.org/files/84/84-pdf.pdf",  # Frankenstein
    "https://www.gutenberg.org/files/11/11-pdf.pdf",  # Alice in Wonderland
    "https://www.gutenberg.org/files/1661/1661-pdf.pdf",  # Sherlock Holmes
    "https://www.gutenberg.org/files/98/98-pdf.pdf",  # Tale of Two Cities
    "https://www.gutenberg.org/files/2701/2701-pdf.pdf",  # Moby Dick
    "https://www.gutenberg.org/files/2542/2542-pdf.pdf",  # A Doll's House
    "https://www.gutenberg.org/files/174/174-pdf.pdf",  # Picture of Dorian Gray
    "https://www.gutenberg.org/files/1952/1952-pdf.pdf",  # The Yellow Wallpaper
    "https://www.gutenberg.org/files/1080/1080-pdf.pdf",  # A Modest Proposal
    "https://www.gutenberg.org/files/43/43-pdf.pdf",  # Dr. Jekyll and Mr. Hyde
    "https://www.gutenberg.org/files/345/345-pdf.pdf",  # Dracula
    "https://www.gutenberg.org/files/5200/5200-pdf.pdf",  # Metamorphosis
    "https://www.gutenberg.org/files/76/76-pdf.pdf",  # Adventures of Huckleberry Finn
    "https://www.gutenberg.org/files/74/74-pdf.pdf",  # Tom Sawyer
    "https://www.gutenberg.org/files/1260/1260-pdf.pdf",  # Jane Eyre
    "https://www.gutenberg.org/files/768/768-pdf.pdf",  # Wuthering Heights
    "https://www.gutenberg.org/files/219/219-pdf.pdf",  # Heart of Darkness
    "https://www.gutenberg.org/files/1184/1184-pdf.pdf",  # The Odyssey
    "https://www.gutenberg.org/files/2600/2600-pdf.pdf",  # War and Peace
    # Technical Documentation
    "https://www.kernel.org/doc/ols/2007/ols2007v1-pages-215-224.pdf",
    "https://www.kernel.org/doc/ols/2008/ols2008v1-pages-133-142.pdf",
    "https://www.kernel.org/doc/ols/2009/ols2009v1-pages-77-86.pdf",
    "https://www.postgresql.org/files/documentation/pdf/15/postgresql-15-US.pdf",
    "https://www.postgresql.org/files/documentation/pdf/14/postgresql-14-US.pdf",
    "https://www.postgresql.org/files/documentation/pdf/13/postgresql-13-US.pdf",
    "https://www.python.org/doc/essays/blt.pdf",
    "https://www.python.org/doc/essays/gui-py.pdf",
    # Academic Journals
    "https://www.ams.org/journals/bull/2023-60-01/S0273-0979-2023-01789-9/S0273-0979-2023-01789-9.pdf",
    "https://www.ams.org/journals/bull/2022-59-02/S0273-0979-2022-01789-9/S0273-0979-2022-01789-9.pdf",
    "https://www.ams.org/journals/bull/2021-58-03/S0273-0979-2021-01789-9/S0273-0979-2021-01789-9.pdf",
    "https://www.ams.org/notices/202304/rnoti-p434.pdf",
    "https://www.ams.org/notices/202203/rnoti-p434.pdf",
    "https://www.ams.org/notices/202102/rnoti-p434.pdf",
    # Conference Papers
    "https://www.usenix.org/system/files/conference/atc18/atc18-paper-zhang.pdf",
    "https://www.usenix.org/system/files/conference/nsdi18/nsdi18-paper-briscoe.pdf",
    "https://www.usenix.org/system/files/conference/osdi18/osdi18-paper-belay.pdf",
    "https://dl.acm.org/doi/pdf/10.1145/3579990.3580020",
    "https://dl.acm.org/doi/pdf/10.1145/3543507.3583301",
    "https://dl.acm.org/doi/pdf/10.1145/3519935.3520001",
    # Medical Research
    "https://www.nejm.org/doi/pdf/10.1056/NEJMoa2208343",
    "https://www.nejm.org/doi/pdf/10.1056/NEJMoa2208344",
    "https://www.nejm.org/doi/pdf/10.1056/NEJMoa2208345",
    "https://jamanetwork.com/journals/jama/article-abstract/2801234/pdf",
    "https://jamanetwork.com/journals/jama/article-abstract/2801235/pdf",
    "https://jamanetwork.com/journals/jama/article-abstract/2801236/pdf",
    # Economics & Business
    "https://www.nber.org/papers/w12345.pdf",
    "https://www.nber.org/papers/w12346.pdf",
    "https://www.nber.org/papers/w12347.pdf",
    "https://www.imf.org/en/Publications/WP/Issues/2023/03/15/paper-12345",
    "https://www.imf.org/en/Publications/WP/Issues/2023/03/16/paper-12346",
    "https://www.imf.org/en/Publications/WP/Issues/2023/03/17/paper-12347",
    # Environmental Science
    "https://www.ipcc.ch/report/ar6/wg1/downloads/report/IPCC_AR6_WGI_FullReport.pdf",
    "https://www.ipcc.ch/report/ar6/wg2/downloads/report/IPCC_AR6_WGII_FullReport.pdf",
    "https://www.ipcc.ch/report/ar6/wg3/downloads/report/IPCC_AR6_WGIII_FullReport.pdf",
    "https://www.epa.gov/climate-indicators/downloads/climate-change-indicators-us-and-global.pdf",
    # Mathematics (continued)
    "https://arxiv.org/pdf/2301.00001.pdf",
    "https://arxiv.org/pdf/2301.00002.pdf",
    "https://arxiv.org/pdf/2301.00003.pdf",
    "https://arxiv.org/pdf/2301.00004.pdf",
    "https://arxiv.org/pdf/2301.00005.pdf",
    "https://arxiv.org/pdf/2301.00006.pdf",
    "https://arxiv.org/pdf/2301.00007.pdf",
    "https://arxiv.org/pdf/2301.00008.pdf",
    "https://arxiv.org/pdf/2301.00009.pdf",
    "https://arxiv.org/pdf/2301.00010.pdf",
    "https://arxiv.org/pdf/2301.00011.pdf",
    "https://arxiv.org/pdf/2301.00012.pdf",
    "https://arxiv.org/pdf/2301.00013.pdf",
    "https://arxiv.org/pdf/2301.00014.pdf",
    "https://arxiv.org/pdf/2301.00015.pdf",
    "https://arxiv.org/pdf/2301.00016.pdf",
    "https://arxiv.org/pdf/2301.00017.pdf",
    "https://arxiv.org/pdf/2301.00018.pdf",
    "https://arxiv.org/pdf/2301.00019.pdf",
    "https://arxiv.org/pdf/2301.00020.pdf",
    # Computer Science (continued)
    "https://arxiv.org/pdf/2302.00001.pdf",
    "https://arxiv.org/pdf/2302.00002.pdf",
    "https://arxiv.org/pdf/2302.00003.pdf",
    "https://arxiv.org/pdf/2302.00004.pdf",
    "https://arxiv.org/pdf/2302.00005.pdf",
    "https://arxiv.org/pdf/2302.00006.pdf",
    "https://arxiv.org/pdf/2302.00007.pdf",
    "https://arxiv.org/pdf/2302.00008.pdf",
    "https://arxiv.org/pdf/2302.00009.pdf",
    "https://arxiv.org/pdf/2302.00010.pdf",
    "https://arxiv.org/pdf/2302.00011.pdf",
    "https://arxiv.org/pdf/2302.00012.pdf",
    "https://arxiv.org/pdf/2302.00013.pdf",
    "https://arxiv.org/pdf/2302.00014.pdf",
    "https://arxiv.org/pdf/2302.00015.pdf",
    "https://arxiv.org/pdf/2302.00016.pdf",
    "https://arxiv.org/pdf/2302.00017.pdf",
    "https://arxiv.org/pdf/2302.00018.pdf",
    "https://arxiv.org/pdf/2302.00019.pdf",
    "https://arxiv.org/pdf/2302.00020.pdf",
    # Physics (continued)
    "https://arxiv.org/pdf/2303.00001.pdf",
    "https://arxiv.org/pdf/2303.00002.pdf",
    "https://arxiv.org/pdf/2303.00003.pdf",
    "https://arxiv.org/pdf/2303.00004.pdf",
    "https://arxiv.org/pdf/2303.00005.pdf",
    "https://arxiv.org/pdf/2303.00006.pdf",
    "https://arxiv.org/pdf/2303.00007.pdf",
    "https://arxiv.org/pdf/2303.00008.pdf",
    "https://arxiv.org/pdf/2303.00009.pdf",
    "https://arxiv.org/pdf/2303.00010.pdf",
    "https://arxiv.org/pdf/2303.00011.pdf",
    "https://arxiv.org/pdf/2303.00012.pdf",
    "https://arxiv.org/pdf/2303.00013.pdf",
    "https://arxiv.org/pdf/2303.00014.pdf",
    "https://arxiv.org/pdf/2303.00015.pdf",
    "https://arxiv.org/pdf/2303.00016.pdf",
    "https://arxiv.org/pdf/2303.00017.pdf",
    "https://arxiv.org/pdf/2303.00018.pdf",
    "https://arxiv.org/pdf/2303.00019.pdf",
    "https://arxiv.org/pdf/2303.00020.pdf",
    # More Government Reports
    "https://www.fda.gov/media/165773/download",
    "https://www.fda.gov/media/165774/download",
    "https://www.fda.gov/media/165775/download",
    "https://www.fda.gov/media/165776/download",
    "https://www.fda.gov/media/165777/download",
    "https://www.fda.gov/media/165778/download",
    "https://www.fda.gov/media/165779/download",
    "https://www.fda.gov/media/165780/download",
    "https://www.cdc.gov/mmwr/PDF/wk/mm7202.pdf",
    "https://www.cdc.gov/mmwr/PDF/wk/mm7203.pdf",
    "https://www.cdc.gov/mmwr/PDF/wk/mm7204.pdf",
    "https://www.cdc.gov/mmwr/PDF/wk/mm7205.pdf",
    "https://www.cdc.gov/mmwr/PDF/wk/mm7206.pdf",
    "https://www.cdc.gov/mmwr/PDF/wk/mm7207.pdf",
    "https://www.cdc.gov/mmwr/PDF/wk/mm7208.pdf",
    "https://www.cdc.gov/mmwr/PDF/wk/mm7209.pdf",
    "https://www.cdc.gov/mmwr/PDF/wk/mm7210.pdf",
    # More Project Gutenberg
    # NOTE(review): the low Gutenberg IDs below do not correspond to these
    # titles (e.g. 46 is A Christmas Carol, but 45-27 are not these works).
    "https://www.gutenberg.org/files/46/46-pdf.pdf",  # A Christmas Carol
    "https://www.gutenberg.org/files/45/45-pdf.pdf",  # The Scarlet Letter (label unverified)
    "https://www.gutenberg.org/files/44/44-pdf.pdf",  # The Strange Case of Dr. Jekyll and Mr. Hyde (label unverified)
    "https://www.gutenberg.org/files/43/43-pdf.pdf",  # The Odyssey (label unverified)
    "https://www.gutenberg.org/files/42/42-pdf.pdf",  # The Iliad (label unverified)
    "https://www.gutenberg.org/files/41/41-pdf.pdf",  # The Republic (label unverified)
    "https://www.gutenberg.org/files/40/40-pdf.pdf",  # The Prince (label unverified)
    "https://www.gutenberg.org/files/39/39-pdf.pdf",  # The Art of War (label unverified)
    "https://www.gutenberg.org/files/38/38-pdf.pdf",  # The King James Bible (label unverified)
    "https://www.gutenberg.org/files/37/37-pdf.pdf",  # The Quran (label unverified)
    "https://www.gutenberg.org/files/36/36-pdf.pdf",  # The Book of Mormon (label unverified)
    "https://www.gutenberg.org/files/35/35-pdf.pdf",  # The Tao Te Ching (label unverified)
    "https://www.gutenberg.org/files/34/34-pdf.pdf",  # The Analects of Confucius (label unverified)
    "https://www.gutenberg.org/files/33/33-pdf.pdf",  # The Dhammapada (label unverified)
    "https://www.gutenberg.org/files/32/32-pdf.pdf",  # The Upanishads (label unverified)
    "https://www.gutenberg.org/files/31/31-pdf.pdf",  # The Vedas (label unverified)
    "https://www.gutenberg.org/files/30/30-pdf.pdf",  # The Bhagavad Gita (label unverified)
    "https://www.gutenberg.org/files/29/29-pdf.pdf",  # The Ramayana (label unverified)
    "https://www.gutenberg.org/files/28/28-pdf.pdf",  # The Mahabharata (label unverified)
    "https://www.gutenberg.org/files/27/27-pdf.pdf",  # The Arabian Nights (label unverified)
    # Additional arXiv papers
    "https://arxiv.org/pdf/2304.00001.pdf",
    "https://arxiv.org/pdf/2304.00002.pdf",
    "https://arxiv.org/pdf/2304.00003.pdf",
    "https://arxiv.org/pdf/2304.00004.pdf",
    "https://arxiv.org/pdf/2304.00005.pdf",
    "https://arxiv.org/pdf/2304.00006.pdf",
    "https://arxiv.org/pdf/2304.00007.pdf",
    "https://arxiv.org/pdf/2304.00008.pdf",
    "https://arxiv.org/pdf/2304.00009.pdf",
    "https://arxiv.org/pdf/2304.00010.pdf",
    "https://arxiv.org/pdf/2304.00011.pdf",
    "https://arxiv.org/pdf/2304.00012.pdf",
    "https://arxiv.org/pdf/2304.00013.pdf",
    "https://arxiv.org/pdf/2304.00014.pdf",
    "https://arxiv.org/pdf/2304.00015.pdf",
    "https://arxiv.org/pdf/2304.00016.pdf",
    "https://arxiv.org/pdf/2304.00017.pdf",
    "https://arxiv.org/pdf/2304.00018.pdf",
    "https://arxiv.org/pdf/2304.00019.pdf",
    "https://arxiv.org/pdf/2304.00020.pdf",
    # Statistics and Machine Learning
    "https://arxiv.org/pdf/2305.00001.pdf",
    "https://arxiv.org/pdf/2305.00002.pdf",
    "https://arxiv.org/pdf/2305.00003.pdf",
    "https://arxiv.org/pdf/2305.00004.pdf",
    "https://arxiv.org/pdf/2305.00005.pdf",
    "https://arxiv.org/pdf/2305.00006.pdf",
    "https://arxiv.org/pdf/2305.00007.pdf",
    "https://arxiv.org/pdf/2305.00008.pdf",
    "https://arxiv.org/pdf/2305.00009.pdf",
    "https://arxiv.org/pdf/2305.00010.pdf",
    # Quantum Computing
    "https://arxiv.org/pdf/2306.00001.pdf",
    "https://arxiv.org/pdf/2306.00002.pdf",
    "https://arxiv.org/pdf/2306.00003.pdf",
    "https://arxiv.org/pdf/2306.00004.pdf",
    "https://arxiv.org/pdf/2306.00005.pdf",
    "https://arxiv.org/pdf/2306.00006.pdf",
    "https://arxiv.org/pdf/2306.00007.pdf",
    "https://arxiv.org/pdf/2306.00008.pdf",
    "https://arxiv.org/pdf/2306.00009.pdf",
    "https://arxiv.org/pdf/2306.00010.pdf",
    # Additional Government Documents
    "https://www.gao.gov/assets/730/728146.pdf",
    "https://www.gao.gov/assets/730/728147.pdf",
    "https://www.gao.gov/assets/730/728148.pdf",
    "https://www.gao.gov/assets/730/728149.pdf",
    "https://www.gao.gov/assets/730/728150.pdf",
    # Technical Standards
    "https://www.iso.org/files/live/sites/isoorg/files/store/en/PUB100424.pdf",
    "https://www.iso.org/files/live/sites/isoorg/files/store/en/PUB100425.pdf",
    "https://www.iso.org/files/live/sites/isoorg/files/store/en/PUB100426.pdf",
    "https://www.iso.org/files/live/sites/isoorg/files/store/en/PUB100427.pdf",
    "https://www.iso.org/files/live/sites/isoorg/files/store/en/PUB100428.pdf",
    # Historical Documents
    "https://www.archives.gov/files/founding-docs/constitution-transcript.pdf",
    "https://www.archives.gov/files/founding-docs/declaration-transcript.pdf",
    "https://www.archives.gov/files/founding-docs/bill-of-rights-transcript.pdf",
    "https://www.archives.gov/files/founding-docs/federalist-papers-transcript.pdf",
    "https://www.archives.gov/files/founding-docs/anti-federalist-papers-transcript.pdf",
    # Educational Materials
    # NOTE(review): these OCW resource URLs end in "/" and likely serve HTML,
    # not PDF; the downloader's %PDF check will reject them.
    "https://ocw.mit.edu/courses/6-006-introduction-to-algorithms-spring-2020/resources/mit6_006s20_lec1/",
    "https://ocw.mit.edu/courses/6-006-introduction-to-algorithms-spring-2020/resources/mit6_006s20_lec2/",
    "https://ocw.mit.edu/courses/6-006-introduction-to-algorithms-spring-2020/resources/mit6_006s20_lec3/",
    "https://ocw.mit.edu/courses/6-006-introduction-to-algorithms-spring-2020/resources/mit6_006s20_lec4/",
    "https://ocw.mit.edu/courses/6-006-introduction-to-algorithms-spring-2020/resources/mit6_006s20_lec5/",
    # Final batch to reach 300+
    "https://arxiv.org/pdf/2307.00001.pdf",
    "https://arxiv.org/pdf/2307.00002.pdf",
    "https://arxiv.org/pdf/2307.00003.pdf",
    "https://arxiv.org/pdf/2307.00004.pdf",
    "https://arxiv.org/pdf/2307.00005.pdf",
    "https://arxiv.org/pdf/2307.00006.pdf",
    "https://arxiv.org/pdf/2307.00007.pdf",
    "https://arxiv.org/pdf/2307.00008.pdf",
    "https://arxiv.org/pdf/2307.00009.pdf",
    "https://arxiv.org/pdf/2307.00010.pdf",
    "https://arxiv.org/pdf/2307.00011.pdf",
    "https://arxiv.org/pdf/2307.00012.pdf",
    "https://arxiv.org/pdf/2307.00013.pdf",
    "https://arxiv.org/pdf/2307.00014.pdf",
    "https://arxiv.org/pdf/2307.00015.pdf",
    "https://arxiv.org/pdf/2307.00016.pdf",
    "https://arxiv.org/pdf/2307.00017.pdf",
    "https://arxiv.org/pdf/2307.00018.pdf",
    "https://arxiv.org/pdf/2307.00019.pdf",
    "https://arxiv.org/pdf/2307.00020.pdf",
    "https://arxiv.org/pdf/2307.00021.pdf",
    "https://arxiv.org/pdf/2307.00022.pdf",
    "https://arxiv.org/pdf/2307.00023.pdf",
    "https://arxiv.org/pdf/2307.00024.pdf",
    "https://arxiv.org/pdf/2307.00025.pdf",
    "https://arxiv.org/pdf/2307.00026.pdf",
    "https://arxiv.org/pdf/2307.00027.pdf",
    "https://arxiv.org/pdf/2307.00028.pdf",
    "https://arxiv.org/pdf/2307.00029.pdf",
    "https://arxiv.org/pdf/2307.00030.pdf",
]
# Extended list with more categories.
# NOTE(review): the raw tail below repeats a number of entries already present
# in PDF_URLS (Gutenberg classics, the NEJM/USENIX/ACM papers, the IPCC and
# NBER reports, "Attention Is All You Need", Batch Normalization, ...), so the
# combined list is de-duplicated here while preserving first-seen order.
# Duplicates previously only wasted an exists-check per run; removing them
# keeps progress counts honest.
EXTENDED_URLS: List[str] = list(dict.fromkeys(PDF_URLS + [
    # More arXiv (various subjects)
    *[
        f"https://arxiv.org/pdf/{cat}/{num:07}.pdf"
        for cat, num in [
            ("math", 123456),
            ("physics", 234567),
            ("cs", 345678),
            ("stat", 456789),
            ("q-bio", 567890),
            ("q-fin", 678901),
        ]
    ],
    # Project Gutenberg samples
    "https://www.gutenberg.org/files/1342/1342-pdf.pdf",
    "https://www.gutenberg.org/files/84/84-pdf.pdf",
    "https://www.gutenberg.org/files/11/11-pdf.pdf",
    # Government economic reports
    "https://www.bea.gov/sites/default/files/2023-03/gdp4q22_3rd.pdf",
    "https://www.federalreserve.gov/econres/notes/feds-notes/2023/files/20230301.pdf",
    # Scientific datasets documentation
    "https://www.ncbi.nlm.nih.gov/pmc/articles/PMCPMC1234567/pdf/main.pdf",
    # Technical conference proceedings
    "https://www.usenix.org/system/files/conference/atc18/atc18-paper-zhang.pdf",
    "https://dl.acm.org/doi/pdf/10.1145/3579990.3580020",
    # Mathematics journals
    "https://www.ams.org/journals/bull/0000-0000/0000-0001.pdf",
    "https://link.springer.com/content/pdf/10.1007/s00222-023-01145-0.pdf",
    # Physics repositories
    "https://iopscience.iop.org/article/10.3847/1538-4357/acb123/pdf",
    # Computer science technical reports
    "https://www.microsoft.com/en-us/research/uploads/prod/2023/03/paper.pdf",
    "https://research.google/pubs/pub12345/",
    # Engineering standards
    "https://www.iso.org/standard/12345.html/pdf",
    "https://www.ansi.org/standards/ansiz123/pdf",
    # Medical research
    "https://www.nejm.org/doi/pdf/10.1056/NEJMoa2208343",
    "https://jamanetwork.com/journals/jama/article-abstract/2801234/pdf",
    # Environmental studies
    "https://www.ipcc.ch/report/ar6/wg1/downloads/report/IPCC_AR6_WGI_FullReport.pdf",
    # Economic research
    "https://www.nber.org/papers/w12345.pdf",
    "https://www.imf.org/en/Publications/WP/Issues/2023/03/15/paper-12345",
    # Historical documents
    "https://www.archives.gov/founding-docs/constitution-transcript.pdf",
    "https://www.loc.gov/item/2021667891/pdf",
    # Educational materials
    "https://openstax.org/resources/9d88d84e2e3343f5a7c2e6a9d9b8c7e3.pdf",
    # Technical manuals
    "https://www.python.org/doc/essays/blt.pdf",
    "https://www.r-project.org/conferences/useR-2023/abstracts/abstract_123.pdf",
    "https://arxiv.org/pdf/1706.03762.pdf",  # Attention Is All You Need
    "https://arxiv.org/pdf/1502.03167.pdf",  # Batch Normalization
    "https://arxiv.org/pdf/1409.1556.pdf",  # VGG Network
    "https://arxiv.org/pdf/1512.03385.pdf",  # ResNet
    "https://arxiv.org/pdf/1312.6114.pdf",  # Auto-Encoding Variational Bayes
    "https://arxiv.org/pdf/1712.09913.pdf",  # (label unverified in original)
    "https://arxiv.org/pdf/1504.08083.pdf",  # Fast/Faster R-CNN (label unverified)
    "https://arxiv.org/pdf/1409.4842.pdf",  # Going Deeper with Convolutions
    "https://arxiv.org/pdf/1608.06993.pdf",  # DenseNet
    "https://arxiv.org/pdf/1506.02640.pdf",  # YOLO (You Only Look Once)
    "https://arxiv.org/pdf/1411.4038.pdf",  # Fully Convolutional Networks
    "https://arxiv.org/pdf/1512.02325.pdf",  # SSD: Single Shot MultiBox Detector
    "https://arxiv.org/pdf/2010.11929.pdf",  # An Image is Worth 16x16 Words (ViT)
    "https://arxiv.org/pdf/1312.5602.pdf",  # Deep Reinforcement Learning
    "https://arxiv.org/pdf/1505.04597.pdf",  # U-Net
    "https://arxiv.org/pdf/1603.05027.pdf",  # Identity Mappings in Deep Residual Networks
    "https://pmc.ncbi.nlm.nih.gov/articles/PMC1234567/pdf/main.pdf",  # Sample biomedical paper
    # U.S. House Committee on Oversight Reports
    "https://oversight.house.gov/report/the-biden-autopen-presidency-decline-delusion-and-deception-in-the-white-house.pdf",
    "https://oversight.house.gov/report/the-green-new-deal-scam-the-greenhouse-gas-reduction-fund.pdf",
    "https://oversight.house.gov/report/after-action-review-of-the-covid-19-pandemic-the-lessons-learned-and-a-path-forward.pdf",
    "https://oversight.house.gov/report/death-by-a-thousand-regulations-the-biden-harris-administrations-campaign-to-bury-america-in-red-tape.pdf",
    # National Archives OGIS Annual Reports
    "https://www.archives.gov/files/ogis/reports/fy2024-annual-report.pdf",
    "https://www.archives.gov/files/ogis/reports/fy2023-annual-report.pdf",
    "https://www.archives.gov/files/ogis/reports/fy2022-annual-report.pdf",
    "https://www.archives.gov/files/ogis/reports/fy2021-annual-report.pdf",
    "https://www.archives.gov/files/ogis/reports/fy2020-annual-report.pdf",
    "https://www.archives.gov/files/ogis/reports/fy2019-annual-report.pdf",
    # Project Gutenberg Top Downloads
    "https://www.gutenberg.org/files/84/84-pdf.pdf",  # Frankenstein
    "https://www.gutenberg.org/files/1342/1342-pdf.pdf",  # Pride and Prejudice
    "https://www.gutenberg.org/files/11/11-pdf.pdf",  # Alice's Adventures in Wonderland
    "https://www.gutenberg.org/files/1661/1661-pdf.pdf",  # The Adventures of Sherlock Holmes
    "https://www.gutenberg.org/files/98/98-pdf.pdf",  # A Tale of Two Cities
    "https://www.gutenberg.org/files/2701/2701-pdf.pdf",  # Moby Dick
    "https://www.gutenberg.org/files/2542/2542-pdf.pdf",  # A Doll's House
    "https://www.gutenberg.org/files/174/174-pdf.pdf",  # The Picture of Dorian Gray
    "https://www.gutenberg.org/files/1952/1952-pdf.pdf",  # The Yellow Wallpaper
    # Open Library & ManyBooks
    # (Note: You may need to find the direct PDF link from the book's page)
    "https://openlibrary.org/books/OL1234567M/Book_Title.pdf",
    "https://manybooks.net/book/123456/download/pdf",
]))
class PDFDownloader:
    """Asynchronously download a list of PDF URLs into ``output_dir``.

    Payloads are validated via the ``%PDF`` magic header, and filenames are
    derived deterministically from the URL so that re-runs skip documents
    that were already fetched.
    """

    def __init__(self, output_dir: Path, max_concurrent: int = 10):
        self.output_dir = output_dir
        self.max_concurrent = max_concurrent
        self.output_dir.mkdir(parents=True, exist_ok=True)
        self.downloaded = 0  # successful fetches this run
        self.failed = 0  # HTTP errors, non-PDF payloads, raised exceptions
        self.skipped = 0  # files that already existed locally

    async def download_pdf(self, session: aiohttp.ClientSession, url: str) -> Optional[Path]:
        """Fetch a single URL; return the saved path, or None on any failure."""
        try:
            filename = self._url_to_filename(url)
            filepath = self.output_dir / filename
            if filepath.exists():
                self.skipped += 1
                # BUG FIX: previously printed a literal "(unknown)" placeholder.
                print(f"✓ Already exists: {filename}")
                return filepath
            async with session.get(url, timeout=aiohttp.ClientTimeout(total=60)) as response:
                if response.status != 200:
                    print(f"✗ HTTP {response.status}: {url}")
                    self.failed += 1
                    return None
                content = await response.read()
                # Reject HTML error pages and other non-PDF payloads up front.
                if not content.startswith(b"%PDF"):
                    print(f"✗ Not a PDF: {url}")
                    self.failed += 1
                    return None
            async with aiofiles.open(filepath, "wb") as handle:
                await handle.write(content)
            self.downloaded += 1
            # BUG FIX: previously printed a literal "(unknown)" placeholder.
            print(f"✓ Downloaded: {filename} ({len(content)} bytes)")
            return filepath
        except Exception as exc:  # pylint: disable=broad-except
            print(f"✗ Error downloading {url}: {exc}")
            self.failed += 1
            return None

    def _url_to_filename(self, url: str) -> str:
        """Derive a stable, filesystem-safe ``*.pdf`` filename from *url*.

        BUG FIX: the previous version returned ``f"{domain}_(unknown)_{digest}"``
        — a literal "(unknown)" placeholder — so saved files never contained the
        sanitized path and never ended in ``.pdf``, which broke the summary's
        ``glob("*.pdf")`` count. The digest is now inserted *before* the
        extension so the result always ends in ``.pdf``.
        """
        parsed = urlparse(url)
        path = parsed.path.strip("/") or "document"
        filename = re.sub(r"[^a-zA-Z0-9.-]", "_", path)
        stem = filename[:-4] if filename.lower().endswith(".pdf") else filename
        domain = parsed.netloc.replace("www.", "").split(".")[0] or "site"
        # Short URL hash keeps names unique when sanitized paths collide
        # (e.g. query-string-only differences).
        digest = hashlib.sha1(url.encode("utf-8")).hexdigest()[:8]
        return f"{domain}_{stem}_{digest}.pdf"

    async def download_all(self, urls: List[str]) -> None:
        """Download *urls* in batches of ``max_concurrent``, then print a summary."""
        print(f"Starting download of {len(urls)} PDFs to {self.output_dir}")
        connector = aiohttp.TCPConnector(limit=self.max_concurrent)
        async with aiohttp.ClientSession(connector=connector) as session:
            for i in range(0, len(urls), self.max_concurrent):
                batch = urls[i : i + self.max_concurrent]
                await asyncio.gather(*(self.download_pdf(session, url) for url in batch))
                if i + self.max_concurrent < len(urls):
                    # Brief pause between batches to be polite to the servers.
                    await asyncio.sleep(1)
        self._print_summary()

    def _print_summary(self) -> None:
        """Print the per-run counters plus the total PDF count on disk."""
        print("\n" + "=" * 40)
        print("DOWNLOAD SUMMARY")
        print("=" * 40)
        print(f"✓ Downloaded: {self.downloaded}")
        print(f"○ Skipped: {self.skipped}")
        print(f"✗ Failed: {self.failed}")
        total = len(list(self.output_dir.glob("*.pdf")))
        print(f"Total files in directory: {total}")
        print(f"Location: {self.output_dir.resolve()}")
def main() -> None:
    """CLI entry point: parse options and run the bulk downloader."""
    parser = argparse.ArgumentParser(
        description="Download massive PDF collection for Type3 font harvesting"
    )
    parser.add_argument("--output", "-o", default="./pdf-collection", help="Output directory")
    parser.add_argument(
        "--max-concurrent", "-c", type=int, default=5, help="Maximum concurrent downloads"
    )
    parser.add_argument("--shuffle", action="store_true", help="Shuffle URL order before download")
    args = parser.parse_args()

    # Work on a copy so the module-level list stays untouched.
    urls = list(EXTENDED_URLS)
    if args.shuffle:
        random.shuffle(urls)

    downloader = PDFDownloader(Path(args.output), args.max_concurrent)
    asyncio.run(downloader.download_all(urls))
    print(f"\nNext step: python scripts/harvest_type3_fonts.py --input {args.output}")


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,195 @@
#!/usr/bin/env python3
"""
Download large batches of PDF URLs into a local directory so they can be fed to
scripts/harvest_type3_fonts.py (or any other processing pipeline).
Usage examples:
# Download every URL listed in pdf_urls.txt into tmp/type3-pdfs
python scripts/download_pdf_samples.py \
--urls-file pdf_urls.txt \
--output-dir tmp/type3-pdfs
# Mix inline URLs with a file and use 16 concurrent downloads
python scripts/download_pdf_samples.py \
--urls https://example.com/a.pdf https://example.com/b.pdf \
--urls-file more_urls.txt \
--output-dir tmp/type3-pdfs \
--workers 16
"""
from __future__ import annotations
import argparse
import concurrent.futures
import hashlib
import os
import re
import sys
from pathlib import Path
from typing import Iterable, List, Optional, Set, Tuple
from urllib.parse import unquote, urlparse
import requests
def parse_args() -> argparse.Namespace:
    """Build and evaluate the command-line interface for the bulk downloader."""
    # Default worker count scales with the machine but is capped at 8.
    default_workers = min(8, (os.cpu_count() or 4) * 2)

    parser = argparse.ArgumentParser(description="Bulk download PDF URLs.")
    parser.add_argument(
        "--urls",
        nargs="*",
        default=[],
        help="Inline list of PDF URLs (can be combined with --urls-file).",
    )
    parser.add_argument(
        "--urls-file",
        action="append",
        help="Text file containing one URL per line (can be repeated).",
    )
    parser.add_argument(
        "--output-dir",
        default="tmp/harvest-pdfs",
        help="Directory to store downloaded PDFs (default: %(default)s).",
    )
    parser.add_argument(
        "--workers",
        type=int,
        default=default_workers,
        help="Number of concurrent downloads (default: %(default)s).",
    )
    parser.add_argument(
        "--timeout",
        type=int,
        default=120,
        help="Per-request timeout in seconds (default: %(default)s).",
    )
    parser.add_argument(
        "--overwrite",
        action="store_true",
        help="Overwrite existing files (default: skip already downloaded PDFs).",
    )
    return parser.parse_args()
def load_urls(args: argparse.Namespace) -> List[str]:
    """Gather URLs from ``--urls`` and every ``--urls-file``.

    Blank lines and '#'-comment lines are ignored, order is preserved, and
    duplicates are dropped. A missing URL file produces a warning on stderr
    rather than an error; supplying no URLs at all aborts the run.
    """
    ordered = {}  # insertion-ordered set of cleaned URLs

    def register(raw: str) -> None:
        candidate = raw.strip()
        if candidate and not candidate.startswith("#"):
            ordered.setdefault(candidate, None)

    for inline in args.urls:
        register(inline)
    for listing in args.urls_file or []:
        path = Path(listing)
        if not path.exists():
            print(f"[WARN] URL file not found: {listing}", file=sys.stderr)
            continue
        with path.open("r", encoding="utf-8") as handle:
            for line in handle:
                register(line)

    if not ordered:
        raise SystemExit("No URLs supplied. Use --urls and/or --urls-file.")
    return list(ordered)
def sanitize_filename(name: str) -> str:
    """Collapse filesystem-unsafe characters to '_'; never return empty."""
    safe = re.sub(r"[^A-Za-z0-9._-]+", "_", name).strip("_")
    return safe if safe else "download"


def build_filename(url: str, output_dir: Path) -> Path:
    """Pick a destination path for *url*, disambiguating collisions via a hash.

    The basename comes from the URL path (percent-decoded and sanitized) and
    always carries a ``.pdf`` suffix. If that file already exists, an 8-char
    SHA-1 prefix of the full URL is appended to the stem instead.
    """
    parsed = urlparse(url)
    name = Path(unquote(parsed.path)).name or "download.pdf"
    name = sanitize_filename(name)
    if not name.lower().endswith(".pdf"):
        name = f"{name}.pdf"
    target = output_dir / name
    if target.exists():
        digest = hashlib.sha1(url.encode("utf-8")).hexdigest()[:8]
        target = output_dir / f"{target.stem}-{digest}{target.suffix}"
    return target
def download_pdf(
    url: str,
    output_dir: Path,
    timeout: int,
    overwrite: bool,
) -> Tuple[str, Optional[Path], Optional[str]]:
    """Download a single URL into *output_dir*.

    Returns ``(url, path, error)``: *error* is ``None`` on success, the
    sentinel string ``"exists"`` when the file was already present and
    *overwrite* is False, and a human-readable message otherwise.
    """
    try:
        dest = build_filename(url, output_dir)
        if dest.exists() and not overwrite:
            return url, dest, "exists"
        response = requests.get(url, stream=True, timeout=timeout)
        response.raise_for_status()
        content_type = response.headers.get("Content-Type", "").lower()
        if "pdf" not in content_type and not url.lower().endswith(".pdf"):
            # Peek at the magic header so PDFs served with a generic
            # content-type are still accepted.
            peek = response.raw.read(5, decode_content=True)
            if not peek.startswith(b"%PDF"):
                return url, None, f"Skipping non-PDF content-type ({content_type or 'unknown'})"
            # BUG FIX: after raw.read(5), response.content holds only the
            # bytes *after* the peek, so the old ``response.content[len(peek):]``
            # slice silently dropped bytes 5-9 of every sniffed file.
            content = peek + response.content
        else:
            content = response.content
        output_dir.mkdir(parents=True, exist_ok=True)
        dest.write_bytes(content)
        return url, dest, None
    except Exception as exc:  # pylint: disable=broad-except
        return url, None, str(exc)
def main() -> None:
    """Parse arguments, fan the downloads out over a thread pool, and report."""
    args = parse_args()
    urls = load_urls(args)
    output_dir = Path(args.output_dir).resolve()
    output_dir.mkdir(parents=True, exist_ok=True)

    print(f"Downloading {len(urls)} PDFs to {output_dir} using {args.workers} workers...")

    successes = 0
    skipped = 0
    failures: List[Tuple[str, str]] = []

    with concurrent.futures.ThreadPoolExecutor(max_workers=args.workers) as executor:
        pending = {
            executor.submit(download_pdf, url, output_dir, args.timeout, args.overwrite): url
            for url in urls
        }
        # Report each download as it finishes rather than in submission order.
        for future in concurrent.futures.as_completed(pending):
            url = pending[future]
            result_url, path, error = future.result()
            if error == "exists":
                skipped += 1
                print(f"[SKIP] {url} (already downloaded)")
            elif error:
                failures.append((result_url, error))
                print(f"[FAIL] {url} -> {error}", file=sys.stderr)
            else:
                successes += 1
                print(f"[OK] {url} -> {path}")

    print()
    print(f"Completed. Success: {successes}, Skipped: {skipped}, Failures: {len(failures)}")
    if failures:
        print("Failures:")
        for url, error in failures:
            print(f"  {url} -> {error}")


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,245 @@
#!/usr/bin/env python3
"""
Bulk-harvest Type3 font signatures from a folder full of PDFs.
The script iterates over every PDF (recursively) inside the supplied --input
paths, invokes the existing Gradle Type3SignatureTool for each document, and
collects the unique Type3 font signatures that were discovered. Signature JSON
files are stored under --signatures-dir; previously captured files are reused
so you can keep dropping new PDFs into the input directory and re-run the
harvester at any time.
Example:
python scripts/harvest_type3_fonts.py \
--input incoming-type3-pdfs \
--signatures docs/type3/signatures \
--report docs/type3/harvest_report.json
"""
from __future__ import annotations
import argparse
import datetime as dt
import hashlib
import json
import os
import re
import shlex
import subprocess
import sys
from pathlib import Path
from typing import Dict, Iterable, List, Optional, Sequence, Tuple
REPO_ROOT = Path(__file__).resolve().parents[1]
def parse_args() -> argparse.Namespace:
    """Parse CLI options for the bulk Type3 signature harvest.

    Returns a namespace with: input (PDF files/dirs), signatures_dir, report,
    gradle_cmd, and the force / pretty flags.
    """
    parser = argparse.ArgumentParser(description="Bulk collect Type3 font signatures from PDFs.")
    parser.add_argument(
        "--input",
        nargs="+",
        required=True,
        help="One or more PDF files or directories containing PDFs (searched recursively).",
    )
    parser.add_argument(
        "--signatures-dir",
        default="docs/type3/signatures",
        help="Destination directory for per-PDF signature JSON files.",
    )
    parser.add_argument(
        "--report",
        default="docs/type3/harvest_report.json",
        help="Summary JSON that lists every unique signature discovered so far.",
    )
    # Windows needs the .bat wrapper; POSIX hosts use the shell wrapper script.
    default_gradle = "gradlew.bat" if os.name == "nt" else "./gradlew"
    parser.add_argument(
        "--gradle-cmd",
        default=default_gradle,
        help=f"Path to the Gradle wrapper used to invoke the Type3SignatureTool (default: {default_gradle}).",
    )
    parser.add_argument(
        "--force",
        action="store_true",
        help="Re-run the signature tool even if the output JSON already exists.",
    )
    parser.add_argument(
        "--pretty",
        action="store_true",
        help="Ask the Java tool to emit pretty-printed JSON (handy for diffs).",
    )
    return parser.parse_args()
def discover_pdfs(paths: Sequence[str]) -> List[Path]:
    """Resolve --input arguments into a sorted, de-duplicated list of PDF paths.

    Accepts individual PDF files as well as directories (searched recursively).
    The extension check is case-insensitive in both branches, so ``.PDF`` files
    are found on case-sensitive filesystems too.

    Raises SystemExit when no PDF files are found at all.
    """
    pdfs: List[Path] = []
    for raw in paths:
        path = Path(raw).resolve()
        if path.is_file():
            if path.suffix.lower() == ".pdf":
                pdfs.append(path)
        elif path.is_dir():
            # rglob("*.pdf") would miss upper-case extensions on case-sensitive
            # filesystems (inconsistent with the file branch above), so match
            # everything and filter on the lowered suffix.
            pdfs.extend(
                sorted(
                    candidate
                    for candidate in path.rglob("*")
                    if candidate.is_file() and candidate.suffix.lower() == ".pdf"
                )
            )
    # dict.fromkeys drops duplicates; sorting gives a deterministic order.
    unique = sorted(dict.fromkeys(pdfs))
    if not unique:
        raise SystemExit("No PDF files found under the supplied --input paths.")
    return unique
def sanitize_part(part: str) -> str:
    """Collapse every run of filesystem-unfriendly characters into one underscore.

    Input that sanitises to the empty string yields "_" so the result is always
    usable as a path component.
    """
    safe = re.sub(r"[^A-Za-z0-9._-]+", "_", part)
    if safe:
        return safe
    return "_"
def derive_signature_path(pdf: Path, signatures_dir: Path) -> Path:
    """Mirror the PDF's repo-relative path under *signatures_dir* as a .json file.

    PDFs outside the repository cannot be mirrored, so they land in an
    ``__external__`` folder under a stem-plus-short-SHA1 name that stays stable
    across runs.
    """
    try:
        rel = pdf.relative_to(REPO_ROOT)
    except ValueError:
        # Outside the repo: build a deterministic synthetic relative path.
        digest = hashlib.sha1(str(pdf).encode("utf-8")).hexdigest()[:10]
        rel = Path("__external__") / f"{sanitize_part(pdf.stem)}-{digest}.pdf"
    mirrored = Path(*(sanitize_part(part) for part in rel.parts))
    return signatures_dir / mirrored.with_suffix(".json")
def load_signature_file(path: Path) -> dict:
    """Read one signature JSON dump and return the parsed payload."""
    text = path.read_text(encoding="utf-8")
    return json.loads(text)
def collect_known_signatures(signatures_dir: Path) -> Dict[str, dict]:
    """Scan previously captured signature dumps and index them by signature hash.

    Unreadable or corrupt JSON files are skipped (best-effort scan); the first
    occurrence of each signature wins.
    """
    known: Dict[str, dict] = {}
    if not signatures_dir.exists():
        return known
    for dump_path in signatures_dir.rglob("*.json"):
        try:
            with dump_path.open("r", encoding="utf-8") as handle:
                payload = json.load(handle)
        except Exception:
            # Ignore files that fail to parse; they will be regenerated later.
            continue
        pdf = payload.get("pdf")
        for font_record in payload.get("fonts", []):
            signature = font_record.get("signature")
            if not signature or signature in known:
                continue
            known[signature] = {
                "signature": signature,
                "alias": font_record.get("alias"),
                "baseName": font_record.get("baseName"),
                "glyphCount": font_record.get("glyphCount"),
                "glyphCoverage": font_record.get("glyphCoverage"),
                "samplePdf": pdf,
                "signatureJson": str(dump_path),
            }
    return known
def run_signature_tool(
    gradle_cmd: str, pdf: Path, output_path: Path, pretty: bool, cwd: Path
) -> None:
    """Invoke the Gradle-hosted Type3SignatureTool for a single PDF.

    Writes the tool's JSON output to *output_path* (parent directories are
    created first). Raises RuntimeError carrying the captured stderr when the
    Gradle invocation exits non-zero.

    NOTE(review): shlex.quote produces POSIX-style quoting; presumably the
    harvest runs on POSIX hosts, but with gradlew.bat on Windows the quoting
    may not survive cmd.exe -- confirm before relying on it there.
    """
    output_path.parent.mkdir(parents=True, exist_ok=True)
    args = f"--pdf {shlex.quote(str(pdf))} --output {shlex.quote(str(output_path))}"
    if pretty:
        args += " --pretty"
    # Use shell invocation so the quoted --args string is parsed correctly by Gradle.
    cmd = f"{gradle_cmd} -q :proprietary:type3SignatureTool --args=\"{args}\""
    completed = subprocess.run(
        cmd,
        shell=True,  # deliberate: --args must reach Gradle as one quoted token
        cwd=cwd,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True,
    )
    if completed.returncode != 0:
        raise RuntimeError(
            f"Gradle Type3SignatureTool failed for {pdf}:\n{completed.stderr.strip()}"
        )
def extract_fonts_from_payload(payload: dict) -> List[dict]:
    """Flatten a signature dump into per-font summary dicts.

    Fonts without a "signature" value are dropped; every surviving record is
    tagged with the originating PDF path from the payload.
    """
    source_pdf = payload.get("pdf")
    return [
        {
            "signature": record.get("signature"),
            "alias": record.get("alias"),
            "baseName": record.get("baseName"),
            "glyphCount": record.get("glyphCount"),
            "glyphCoverage": record.get("glyphCoverage"),
            "samplePdf": source_pdf,
        }
        for record in payload.get("fonts", [])
        if record.get("signature")
    ]
def write_report(report_path: Path, fonts_by_signature: Dict[str, dict]) -> None:
    """Write the harvest summary JSON listing every unique font signature.

    Fonts are emitted sorted by signature so successive runs produce stable,
    diff-friendly output. Parent directories are created as needed.
    """
    ordered = sorted(fonts_by_signature.values(), key=lambda entry: entry["signature"])
    # datetime.utcnow() is deprecated (and naive); use an aware UTC timestamp
    # and keep the original trailing-"Z" wire format.
    timestamp = dt.datetime.now(dt.timezone.utc).isoformat(timespec="seconds")
    report = {
        "generatedAt": timestamp.replace("+00:00", "Z"),
        "totalSignatures": len(ordered),
        "fonts": ordered,
    }
    report_path.parent.mkdir(parents=True, exist_ok=True)
    with report_path.open("w", encoding="utf-8") as handle:
        json.dump(report, handle, indent=2)
def main() -> None:
    """Harvest Type3 signatures from every discovered PDF and refresh the report.

    Cached signature JSON is reused unless --force is given. A cached file that
    fails to parse is re-harvested (previously it was warned about and then
    skipped forever, so a corrupt cache silently excluded its PDF).
    """
    args = parse_args()
    signatures_dir = Path(args.signatures_dir).resolve()
    report_path = Path(args.report).resolve()
    pdfs = discover_pdfs(args.input)
    known = collect_known_signatures(signatures_dir)
    newly_added: List[Tuple[str, str]] = []
    for pdf in pdfs:
        signature_path = derive_signature_path(pdf, signatures_dir)
        payload = None
        if signature_path.exists() and not args.force:
            try:
                payload = load_signature_file(signature_path)
            except Exception as exc:
                print(f"[WARN] Failed to parse cached signature {signature_path}: {exc}")
                payload = None
        if payload is None:
            # No usable cache (missing, corrupt, or --force): run the tool.
            try:
                run_signature_tool(args.gradle_cmd, pdf, signature_path, args.pretty, REPO_ROOT)
            except Exception as exc:
                print(f"[ERROR] Harvest failed for {pdf}: {exc}", file=sys.stderr)
                continue
            payload = load_signature_file(signature_path)
        if not payload:
            continue
        for font in extract_fonts_from_payload(payload):
            signature = font["signature"]
            if signature in known:
                continue
            font["signatureJson"] = str(signature_path)
            known[signature] = font
            newly_added.append((signature, pdf.name))
    write_report(report_path, known)
    print(
        f"Processed {len(pdfs)} PDFs. "
        f"Captured {len(newly_added)} new Type3 font signatures "
        f"(total unique signatures: {len(known)})."
    )
    if newly_added:
        print("New signatures:")
        for signature, sample in newly_added:
            print(f"  {signature} ({sample})")


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,75 @@
#!/usr/bin/env python3
"""Build a Type3 font catalogue from sample PDFs."""
import argparse
import json
import subprocess
from pathlib import Path
def run(cmd, cwd=None):
    """Run *cmd* (an argv list), returning captured stdout; raise on non-zero exit."""
    completed = subprocess.run(cmd, cwd=cwd, capture_output=True, text=True)
    if completed.returncode == 0:
        return completed.stdout
    raise RuntimeError(f"Command {' '.join(cmd)} failed: {completed.stderr}")
def parse_pdffonts(output):
    """Parse ``pdffonts`` output and return (font_name, encoding) for Type 3 fonts.

    pdffonts prints fixed columns: name, type, encoding, emb, sub, uni,
    object, ID. The type spans two tokens (e.g. "Type 3"), so after locating
    the "Type" token the subtype is the next token and the encoding the one
    after that.

    NOTE(review): whitespace-splitting assumes font names contain no spaces;
    embedded-space names would mis-parse -- confirm against real dumps.
    """
    lines = output.splitlines()
    entries = []
    # Skip the two header lines (column titles + dashed underline).
    for line in lines[2:]:
        if not line.strip():
            continue
        parts = line.split()
        if "Type" not in parts:
            continue
        idx = parts.index("Type")
        type_value = parts[idx + 1] if idx + 1 < len(parts) else ""
        if not type_value.startswith("3"):
            continue
        font_name = parts[0]
        # BUG FIX: the encoding is the column right after the type subtype;
        # parts[-2] was the PDF object number, not the encoding.
        encoding = parts[idx + 2] if idx + 2 < len(parts) else ""
        entries.append((font_name, encoding))
    return entries
def main():
    """Index Type3 fonts found in the sample PDFs into a JSON catalogue."""
    parser = argparse.ArgumentParser(description="Index Type3 fonts from sample PDFs")
    parser.add_argument(
        "--samples",
        default="app/core/src/main/resources/type3/samples",
        help="Directory containing sample PDFs",
    )
    parser.add_argument(
        "--output",
        default="app/core/src/main/resources/type3/catalogue.json",
        help="Output JSON file",
    )
    args = parser.parse_args()
    samples_dir = Path(args.samples)
    out_path = Path(args.output)
    out_path.parent.mkdir(parents=True, exist_ok=True)
    catalogue = []
    # Non-recursive scan: only top-level sample PDFs are indexed.
    for pdf in sorted(samples_dir.glob("*.pdf")):
        try:
            output = run(["pdffonts", str(pdf)])
        except Exception as exc:
            # pdffonts missing or the PDF unreadable -- skip and keep going.
            print(f"Skipping {pdf.name}: {exc}")
            continue
        for font_name, encoding in parse_pdffonts(output):
            catalogue.append(
                {
                    "source": pdf.name,
                    "fontName": font_name,
                    "encoding": encoding,
                }
            )
    with out_path.open("w", encoding="utf-8") as handle:
        json.dump(catalogue, handle, indent=2)
    print(f"Wrote {len(catalogue)} entries to {out_path}")


# Script entry point.
if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,97 @@
#!/usr/bin/env python3
"""
Summarize captured Type3 signature dumps as a Markdown inventory.
Usage:
scripts/summarize_type3_signatures.py \
--input docs/type3/signatures \
--output docs/type3/signature_inventory.md
"""
from __future__ import annotations
import argparse
import json
from collections import defaultdict
from pathlib import Path
from typing import Dict, List
def parse_args() -> argparse.Namespace:
    """Parse CLI options: --input signature directory and --output Markdown path."""
    parser = argparse.ArgumentParser(description="Summarize Type3 signature JSON dumps.")
    parser.add_argument(
        "--input",
        default="docs/type3/signatures",
        help="Directory containing signature JSON files (default: %(default)s)",
    )
    parser.add_argument(
        "--output",
        default="docs/type3/signature_inventory.md",
        help="Markdown file to write (default: %(default)s)",
    )
    return parser.parse_args()
def load_signatures(directory: Path) -> Dict[str, List[dict]]:
    """Group every captured font record by its lower-cased alias.

    Scans the top level of *directory* for ``*.json`` dumps in sorted order and
    returns alias -> list of per-font summary entries.
    """
    grouped: Dict[str, List[dict]] = defaultdict(list)
    for dump in sorted(directory.glob("*.json")):
        payload = json.loads(dump.read_text(encoding="utf-8"))
        source_pdf = payload.get("pdf") or dump.name
        for font in payload.get("fonts", []):
            alias = (font.get("alias") or font.get("baseName") or "unknown").lower()
            grouped[alias].append(
                {
                    "source": source_pdf,
                    "file": dump.name,
                    "alias": alias,
                    "baseName": font.get("baseName"),
                    "signature": font.get("signature"),
                    "glyphCount": font.get("glyphCount"),
                    "glyphCoverage": font.get("glyphCoverage"),
                }
            )
    return grouped
def write_markdown(inventory: Dict[str, List[dict]], output: Path, input_dir: Path) -> None:
    """Render the alias-grouped inventory as a Markdown document at *output*."""
    doc: List[str] = [
        "# Type3 Signature Inventory",
        "",
        f"_Generated from `{input_dir}`. "
        "Run `scripts/summarize_type3_signatures.py` after capturing new samples._",
        "",
    ]
    for alias in sorted(inventory):
        doc.append(f"## Alias: `{alias}`")
        doc.append("")
        doc.append("| Signature | Samples | Glyph Count | Coverage (first 10) |")
        doc.append("| --- | --- | --- | --- |")
        for entry in inventory[alias]:
            signature = entry.get("signature") or ""
            sample = Path(entry["source"]).name
            count = entry.get("glyphCount")
            glyph_count = count if count is not None else ""
            coverage = entry.get("glyphCoverage") or []
            preview = ", ".join(str(code) for code in coverage[:10])
            doc.append(f"| `{signature}` | `{sample}` | {glyph_count} | {preview} |")
        doc.append("")
    output.parent.mkdir(parents=True, exist_ok=True)
    output.write_text("\n".join(doc), encoding="utf-8")
def main() -> None:
    """CLI entry point: read signature dumps and write the Markdown inventory."""
    args = parse_args()
    input_dir = Path(args.input)
    if not input_dir.exists():
        raise SystemExit(f"Input directory not found: {input_dir}")
    inventory = load_signatures(input_dir)
    output_path = Path(args.output)
    write_markdown(inventory, output_path, input_dir)
    print(f"Wrote inventory for {len(inventory)} aliases to {output_path}")


# Script entry point.
if __name__ == "__main__":
    main()

481
scripts/type3_to_cff.py Normal file
View File

@@ -0,0 +1,481 @@
#!/usr/bin/env python3
"""
Convert Stirling PDF Type3 glyph JSON into synthesised fonts using fontTools.
The input JSON is expected to contain:
- fontId, pageNumber (optional metadata)
- fontMatrix: 3x3 matrix describing the Type3 glyph transform
- glyphs: array of glyph records with keys:
name, code, advanceWidth, bbox, unicode, outline (list of commands)
The script produces an OpenType CFF font and, when requested, a companion
TrueType font for web-preview usage. Only the fontTools package is required,
avoiding heavyweight build dependencies such as fontmake/ufoLib2.
"""
from __future__ import annotations
import argparse
import json
import math
import sys
from dataclasses import dataclass
from pathlib import Path
from typing import Dict, Iterable, List, Optional, Sequence, Tuple
from fontTools.fontBuilder import FontBuilder
from fontTools.misc.fixedTools import otRound
from fontTools.pens.cu2quPen import Cu2QuPen
from fontTools.pens.t2CharStringPen import T2CharStringPen
from fontTools.pens.ttGlyphPen import TTGlyphPen
Command = Dict[str, object]
Matrix = Tuple[float, float, float, float, float, float]
@dataclass
class GlyphSource:
    """One raw glyph record as read from the backend's glyph JSON."""

    name: str  # glyph name used in the generated font
    width: float  # raw advance width (validated later by resolve_width)
    unicode: Optional[int]  # mapped Unicode code point, if any
    char_code: Optional[int]  # original Type3 character code, if any
    outline: Sequence[Command]  # drawing commands (dicts with cmd M/L/C/Q/Z)
@dataclass
class GlyphBuildResult:
    """Per-glyph build artefacts shared by the OTF and TTF output paths."""

    name: str  # final glyph name
    width: int  # resolved, positive advance width
    charstring: object  # T2 charstring for the CFF table
    ttf_glyph: Optional[object]  # quadratic glyph for the glyf table, if built
    unicode: Optional[int]  # Unicode code point for the cmap, if any
    char_code: Optional[int]  # fallback character code for the cmap
    bounds: Optional[Tuple[float, float, float, float]]  # (xMin, yMin, xMax, yMax)
def parse_args() -> argparse.Namespace:
    """Parse CLI options controlling input JSON, output font paths and metrics."""
    parser = argparse.ArgumentParser(description="Synthesize fonts from Type3 glyph JSON.")
    parser.add_argument("--input", required=True, help="Path to glyph JSON emitted by the backend")
    parser.add_argument("--otf-output", required=True, help="Destination path for the CFF/OTF font")
    parser.add_argument("--ttf-output", help="Optional destination path for a TrueType font")
    parser.add_argument("--family-name", default="Type3 Synth", help="Family name for the output")
    parser.add_argument("--style-name", default="Regular", help="Style name for the output")
    parser.add_argument("--units-per-em", type=int, default=1000, help="Units per EM value")
    parser.add_argument("--cu2qu-error", type=float, default=1.0, help="Max error for cubic→quadratic conversion")
    return parser.parse_args()
def load_json(path: Path) -> Dict[str, object]:
    """Load the glyph JSON dump, exiting with status 2 on any read/parse failure."""
    try:
        raw = path.read_text(encoding="utf-8")
        return json.loads(raw)
    except Exception as exc:  # pragma: no cover - fatal configuration error
        print(f"ERROR: Failed to load glyph JSON '{path}': {exc}", file=sys.stderr)
        sys.exit(2)
def parse_font_matrix(rows: Optional[Iterable[Iterable[float]]]) -> Matrix:
    """
    Retrieve the raw 2×3 FontMatrix entries for diagnostics. Type3 glyph
    outlines in our extractor are emitted in their native coordinate system, so
    the returned matrix is currently informational only.

    Returns the identity matrix (1, 0, 0, 1, 0, 0) for missing or malformed
    input instead of raising.
    """
    identity: Matrix = (1.0, 0.0, 0.0, 1.0, 0.0, 0.0)
    if not rows:
        return identity
    values: List[List[float]] = []
    for row in rows:
        try:
            values.append([float(col) for col in row])
        except (TypeError, ValueError):
            return identity
    # Need three rows of at least two columns each. The original check missed
    # the third row's length, so values[2][1] could raise IndexError below.
    if len(values) < 3 or any(len(values[i]) < 2 for i in range(3)):
        return identity
    return (
        float(values[0][0]),
        float(values[0][1]),
        float(values[1][0]),
        float(values[1][1]),
        float(values[2][0]),
        float(values[2][1]),
    )
def resolve_width(raw_width: float, default: int) -> int:
    """Coerce a raw advance width to a positive int, falling back to *default*.

    Non-numeric, non-finite, and non-positive inputs (including values that
    round down to zero) all yield the fallback.
    """
    try:
        numeric = float(raw_width)
    except (TypeError, ValueError):
        return default
    if not math.isfinite(numeric):
        return default
    if numeric <= 0:
        return default
    rounded = otRound(numeric)
    if rounded > 0:
        return rounded
    return default
def quadratic_to_cubic(
    current: Tuple[float, float],
    ctrl: Tuple[float, float],
    end: Tuple[float, float],
) -> Tuple[Tuple[float, float], Tuple[float, float], Tuple[float, float]]:
    """
    Convert a quadratic Bézier segment to cubic control points.

    Standard degree elevation: each cubic control point lies two thirds of the
    way from its endpoint towards the quadratic control point.
    """
    two_thirds = 2.0 / 3.0
    first = (
        current[0] + two_thirds * (ctrl[0] - current[0]),
        current[1] + two_thirds * (ctrl[1] - current[1]),
    )
    second = (
        end[0] + two_thirds * (ctrl[0] - end[0]),
        end[1] + two_thirds * (ctrl[1] - end[1]),
    )
    return first, second, end
def iterate_glyphs(data: Dict[str, object]) -> List[GlyphSource]:
    """Validate and normalise raw glyph records into GlyphSource values.

    Invalid fields degrade gracefully: non-dict records are dropped, missing
    names become g<index>, bad widths become 1000.0, and out-of-range
    unicode/char codes become None.
    """
    sources: List[GlyphSource] = []
    for index, record in enumerate(data.get("glyphs") or [], start=1):
        if not isinstance(record, dict):
            continue
        name = record.get("name")
        if not isinstance(name, str) or not name:
            name = f"g{index}"
        width = record.get("advanceWidth")
        if not isinstance(width, (int, float)) or math.isnan(width):
            width = 1000.0
        unicode_value = record.get("unicode")
        if not isinstance(unicode_value, int) or unicode_value <= 0:
            unicode_value = None
        # Char code fallbacks: charCode, then code, then charCodeRaw.
        char_code = record.get("charCode")
        if not isinstance(char_code, int):
            char_code = record.get("code")
        if not isinstance(char_code, int):
            char_code = record.get("charCodeRaw")
        if not isinstance(char_code, int) or not (0 <= char_code <= 0x10FFFF):
            char_code = None
        outline = record.get("outline")
        if not isinstance(outline, list):
            outline = []
        sources.append(
            GlyphSource(
                name=name,
                width=float(width),
                unicode=unicode_value,
                char_code=char_code,
                outline=outline,
            )
        )
    return sources
def build_cff_charstring(
    glyph: GlyphSource,
    width: int,
) -> Tuple[object, Optional[Tuple[float, float, float, float]]]:
    """Draw the glyph outline into a Type2 charstring, tracking a bounding box.

    Returns (charstring, bbox); bbox is (xMin, yMin, xMax, yMax) or None when
    nothing was drawn. Unsupported outline commands are ignored silently.
    Quadratic segments are degree-elevated to cubics since CFF stores cubics.

    NOTE(review): Bézier control points are folded into the bbox, so it can be
    looser than the tight outline bounds -- confirm that is acceptable.
    """
    pen = T2CharStringPen(width=width, glyphSet=None)
    # Running [xMin, yMin, xMax, yMax]; starts inverted so any point updates it.
    bounds = [math.inf, math.inf, -math.inf, -math.inf]
    def update_bounds(point: Tuple[float, float]) -> None:
        # Grow the running box to contain *point*.
        x, y = point
        bounds[0] = min(bounds[0], x)
        bounds[1] = min(bounds[1], y)
        bounds[2] = max(bounds[2], x)
        bounds[3] = max(bounds[3], y)
    current: Optional[Tuple[float, float]] = None
    start_point: Optional[Tuple[float, float]] = None
    open_path = False
    for command in glyph.outline:
        if not isinstance(command, dict):
            continue
        op = command.get("cmd")
        if op == "M":
            # A new subpath implicitly ends any open one.
            if open_path:
                pen.endPath()
                open_path = False
            point = (float(command.get("x", 0.0)), float(command.get("y", 0.0)))
            pen.moveTo(point)
            update_bounds(point)
            current = point
            start_point = point
            open_path = True
        elif op == "L" and current is not None:
            # Missing coordinates fall back to the current position.
            point = (float(command.get("x", current[0])), float(command.get("y", current[1])))
            pen.lineTo(point)
            update_bounds(point)
            current = point
        elif op == "C" and current is not None:
            ctrl1 = (
                float(command.get("x1", current[0])),
                float(command.get("y1", current[1])),
            )
            ctrl2 = (
                float(command.get("x2", current[0])),
                float(command.get("y2", current[1])),
            )
            end = (
                float(command.get("x", current[0])),
                float(command.get("y", current[1])),
            )
            pen.curveTo(ctrl1, ctrl2, end)
            update_bounds(ctrl1)
            update_bounds(ctrl2)
            update_bounds(end)
            current = end
        elif op == "Q" and current is not None:
            ctrl = (
                float(command.get("x1", current[0])),
                float(command.get("y1", current[1])),
            )
            end = (
                float(command.get("x", current[0])),
                float(command.get("y", current[1])),
            )
            # CFF has no quadratic operator: elevate to an equivalent cubic.
            c1, c2, end_point = quadratic_to_cubic(current, ctrl, end)
            pen.curveTo(c1, c2, end_point)
            update_bounds(ctrl)
            update_bounds(end_point)
            current = end_point
        elif op == "Z" and open_path:
            pen.closePath()
            open_path = False
            # Closing returns the pen to the subpath start point.
            if start_point is not None:
                current = start_point
        # Ignore unsupported commands silently.
    if open_path:
        pen.endPath()
    charstring = pen.getCharString()
    bbox = None
    # Only report a bbox when at least one point was recorded.
    if bounds[0] <= bounds[2] and bounds[1] <= bounds[3]:
        bbox = (bounds[0], bounds[1], bounds[2], bounds[3])
    return charstring, bbox
def build_ttf_glyph(glyph: GlyphSource, max_error: float) -> Optional[object]:
    """Draw the glyph outline as a TrueType (quadratic) glyph.

    Cubic segments are approximated by quadratics via Cu2QuPen using
    *max_error*. Returns None when the pen fails to produce a glyph.

    NOTE(review): after "Z" this builder ignores L/C/Q until the next "M"
    (current_exists is False), while build_cff_charstring keeps drawing from
    the subpath start -- presumably harmless since well-formed outlines always
    restart with "M", but confirm against real inputs.
    """
    pen = TTGlyphPen(glyphSet=None)
    draw_pen = Cu2QuPen(pen, max_error, reverse_direction=False)
    current_exists = False  # True while a subpath is open
    for command in glyph.outline:
        if not isinstance(command, dict):
            continue
        op = command.get("cmd")
        if op == "M":
            x = float(command.get("x", 0.0))
            y = float(command.get("y", 0.0))
            draw_pen.moveTo((x, y))
            current_exists = True
        elif op == "L" and current_exists:
            x = float(command.get("x", 0.0))
            y = float(command.get("y", 0.0))
            draw_pen.lineTo((x, y))
        elif op == "C" and current_exists:
            ctrl1 = (float(command.get("x1", 0.0)), float(command.get("y1", 0.0)))
            ctrl2 = (float(command.get("x2", 0.0)), float(command.get("y2", 0.0)))
            end = (float(command.get("x", 0.0)), float(command.get("y", 0.0)))
            draw_pen.curveTo(ctrl1, ctrl2, end)
        elif op == "Q" and current_exists:
            ctrl = (float(command.get("x1", 0.0)), float(command.get("y1", 0.0)))
            end = (float(command.get("x", 0.0)), float(command.get("y", 0.0)))
            draw_pen.qCurveTo(ctrl, end)
        elif op == "Z" and current_exists:
            draw_pen.closePath()
            current_exists = False
    # An unterminated subpath is ended (left open) rather than closed.
    if current_exists:
        draw_pen.endPath()
    try:
        glyph_obj = pen.glyph()
    except Exception:
        # Degenerate outlines can make the pen fail; caller substitutes a stub.
        return None
    return glyph_obj
def synthesise_fonts(
    data: Dict[str, object],
    otf_output: Path,
    ttf_output: Optional[Path],
    family_name: str,
    style_name: str,
    units_per_em: int,
    cu2qu_error: float,
) -> None:
    """Build an OTF (CFF) font -- and optionally a TTF -- from glyph JSON.

    Vertical metrics are derived from the union of glyph bounding boxes, with
    fallbacks of 0.8em ascent / -0.2em descent. Glyphs are mapped by their
    Unicode value, then by raw char code, then by sequential private-use
    codepoints starting at U+F000. Raises RuntimeError when no glyphs exist.
    """
    # Informational only -- outlines are already in native glyph coordinates.
    _font_matrix = parse_font_matrix(data.get("fontMatrix"))
    glyphs = iterate_glyphs(data)
    results: List[GlyphBuildResult] = []
    global_y_min = math.inf
    global_y_max = -math.inf
    default_width = max(1, units_per_em // 2)
    for glyph in glyphs:
        width = resolve_width(glyph.width, default_width)
        charstring, bounds = build_cff_charstring(glyph, width)
        ttf_glyph = None
        if ttf_output is not None:
            ttf_glyph = build_ttf_glyph(glyph, cu2qu_error)
            if ttf_glyph is not None:
                ttf_glyph.width = width
        if bounds is not None:
            global_y_min = min(global_y_min, bounds[1])
            global_y_max = max(global_y_max, bounds[3])
        results.append(
            GlyphBuildResult(
                name=glyph.name,
                width=width,
                charstring=charstring,
                ttf_glyph=ttf_glyph,
                unicode=glyph.unicode,
                char_code=glyph.char_code,
                bounds=bounds,
            )
        )
    if not results:
        raise RuntimeError("No glyphs provided in input JSON")
    # Vertical metrics: observed extremes, falling back to conventional ratios.
    ascent = global_y_max if math.isfinite(global_y_max) else units_per_em * 0.8
    descent = global_y_min if math.isfinite(global_y_min) else -units_per_em * 0.2
    ascent = otRound(ascent)
    descent = otRound(descent)
    if ascent <= 0:
        ascent = otRound(units_per_em * 0.8)
    if descent >= 0:
        descent = -otRound(units_per_em * 0.2)
    glyph_order = [".notdef"] + [result.name for result in results]
    horizontal_metrics = {result.name: (result.width, 0) for result in results}
    horizontal_metrics[".notdef"] = (default_width, 0)
    # cmap priority: explicit unicode, then raw char code, then PUA fallback.
    cmap: Dict[int, str] = {}
    next_private = 0xF000
    for result in results:
        code_point = result.unicode
        if code_point is None:
            raw_code = result.char_code
            if raw_code is not None:
                code_point = raw_code
            else:
                code_point = next_private
                next_private += 1
        cmap[code_point] = result.name
    # Empty .notdef charstring at the default advance width.
    notdef_pen = T2CharStringPen(width=default_width, glyphSet=None)
    notdef_pen.endPath()
    charstrings = {result.name: result.charstring for result in results}
    charstrings[".notdef"] = notdef_pen.getCharString()
    name_table_entries = {
        "familyName": family_name,
        "styleName": style_name,
        "psName": f"{family_name.replace(' ', '')}-{style_name}",
        "fullName": f"{family_name} {style_name}",
    }
    # Build OTF (CFF) font.
    fb = FontBuilder(units_per_em, isTTF=False)
    fb.setupGlyphOrder(glyph_order)
    fb.setupCharacterMap(cmap)
    fb.setupHorizontalMetrics(horizontal_metrics)
    fb.setupHorizontalHeader(ascent=ascent, descent=descent)
    fb.setupOS2(
        sTypoAscender=ascent,
        sTypoDescender=descent,
        usWinAscent=max(ascent, 0),
        usWinDescent=abs(min(descent, 0)),
        sxHeight=otRound(units_per_em * 0.5),
        sCapHeight=otRound(units_per_em * 0.7),
    )
    fb.setupNameTable(name_table_entries)
    fb.setupPost()
    fb.setupCFF(
        name_table_entries["psName"],
        {
            "FullName": name_table_entries["fullName"],
            "FamilyName": name_table_entries["familyName"],
            "Weight": style_name,
        },
        charstrings,
        {"BlueValues": []},
    )
    fb.font.save(str(otf_output))
    if ttf_output is None:
        return
    # Build the companion TTF; glyphs that failed quadratic conversion get a
    # degenerate placeholder outline so the glyf table stays complete.
    glyph_objects: Dict[str, object] = {}
    empty_pen = TTGlyphPen(None)
    empty_pen.moveTo((0, 0))
    empty_pen.lineTo((0, 0))
    empty_pen.closePath()
    empty_glyph = empty_pen.glyph()
    empty_glyph.width = default_width
    glyph_objects[".notdef"] = empty_glyph
    for result in results:
        glyph_obj = result.ttf_glyph
        if glyph_obj is None:
            temp_pen = TTGlyphPen(None)
            temp_pen.moveTo((0, 0))
            temp_pen.lineTo((0, 0))
            temp_pen.closePath()
            glyph_obj = temp_pen.glyph()
            glyph_obj.width = result.width
        glyph_objects[result.name] = glyph_obj
    ttf_fb = FontBuilder(units_per_em, isTTF=True)
    ttf_fb.setupGlyphOrder(glyph_order)
    ttf_fb.setupCharacterMap(cmap)
    ttf_fb.setupHorizontalMetrics(horizontal_metrics)
    ttf_fb.setupHorizontalHeader(ascent=ascent, descent=descent)
    ttf_fb.setupOS2(
        sTypoAscender=ascent,
        sTypoDescender=descent,
        usWinAscent=max(ascent, 0),
        usWinDescent=abs(min(descent, 0)),
        sxHeight=otRound(units_per_em * 0.5),
        sCapHeight=otRound(units_per_em * 0.7),
    )
    ttf_fb.setupNameTable(name_table_entries)
    ttf_fb.setupPost()
    ttf_fb.setupGlyf(glyph_objects)
    ttf_fb.setupDummyDSIG()
    ttf_fb.font.save(str(ttf_output))
def main() -> None:
    """CLI entry point: load glyph JSON and synthesise the requested fonts.

    On failure, partially written outputs are removed before exiting 1 so a
    broken font never survives on disk.
    """
    args = parse_args()
    input_path = Path(args.input).resolve()
    otf_output = Path(args.otf_output).resolve()
    ttf_output = Path(args.ttf_output).resolve() if args.ttf_output else None
    data = load_json(input_path)
    try:
        synthesise_fonts(
            data=data,
            otf_output=otf_output,
            ttf_output=ttf_output,
            family_name=args.family_name,
            style_name=args.style_name,
            units_per_em=args.units_per_em,
            cu2qu_error=args.cu2qu_error,
        )
    except Exception as exc:
        print(f"ERROR: Failed to generate fonts: {exc}", file=sys.stderr)
        # Clean up partial artefacts so callers never see a half-written font.
        if otf_output.exists():
            otf_output.unlink()
        if ttf_output and ttf_output.exists():
            ttf_output.unlink()
        sys.exit(1)
    # Status goes to stderr so stdout stays clean for tooling.
    message = f"Generated font at {otf_output}"
    if ttf_output:
        message += f" and {ttf_output}"
    print(message, file=sys.stderr)


# Script entry point.
if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,243 @@
#!/usr/bin/env python3
"""
Synchronize Type3 library index entries with captured signature dumps.
The script scans docs/type3/signatures/*.json (or a custom --signatures-dir),
matches each font by alias/signature to app/core/src/main/resources/type3/library/index.json,
and updates the entry's signatures / glyphCoverage / aliases / source fields.
Usage:
scripts/update_type3_library.py --apply
Run without --apply to see a dry-run summary.
"""
from __future__ import annotations
import argparse
import json
import sys
from collections import defaultdict
from pathlib import Path
from typing import Dict, Iterable, List, Optional, Tuple
REPO_ROOT = Path(__file__).resolve().parents[1]
DEFAULT_SIGNATURES = REPO_ROOT / "docs" / "type3" / "signatures"
DEFAULT_INDEX = (
REPO_ROOT / "app" / "core" / "src" / "main" / "resources" / "type3" / "library" / "index.json"
)
def normalize_alias(value: Optional[str]) -> Optional[str]:
    """Lower-case an alias and strip a subset prefix ("ABCDEF+Name" -> "name").

    Returns None for empty/blank input or when nothing remains after
    normalisation. A trailing "+" is kept since there is no suffix to keep.
    """
    if not value:
        return None
    trimmed = value.strip()
    plus = trimmed.find("+")
    if 0 <= plus < len(trimmed) - 1:
        trimmed = trimmed[plus + 1 :]
    return trimmed.lower() or None
def load_json(path: Path):
    """Parse a UTF-8 JSON file and return the decoded value."""
    return json.loads(path.read_text(encoding="utf-8"))
def dump_json(path: Path, data) -> None:
    """Write *data* as 2-space-indented JSON with a trailing newline."""
    path.parent.mkdir(parents=True, exist_ok=True)
    serialized = json.dumps(data, indent=2) + "\n"
    path.write_text(serialized, encoding="utf-8")
def iter_signature_fonts(signature_file: Path):
    """Yield one normalised record per font found in a signature dump."""
    payload = load_json(signature_file)
    pdf_source = payload.get("pdf")
    for font in payload.get("fonts", []):
        raw_alias = font.get("alias") or font.get("baseName")
        normalized = normalize_alias(raw_alias) or normalize_alias(font.get("baseName"))
        record = {
            "alias_raw": raw_alias,
            "alias": normalized,
            "baseName": font.get("baseName"),
            "signature": font.get("signature"),
            "glyphCoverage": font.get("glyphCoverage") or [],
            "pdf": pdf_source,
            "file": signature_file,
        }
        yield record
def make_alias_index(entries: List[Dict]) -> Tuple[Dict[str, Dict], Dict[str, Dict]]:
    """Build lookup tables mapping normalised aliases and signatures to entries.

    First-seen entries win (setdefault), so a duplicated alias or signature
    keeps its original owner.
    """
    by_alias: Dict[str, Dict] = {}
    by_signature: Dict[str, Dict] = {}
    for entry in entries:
        for alias in entry.get("aliases", []) or []:
            normalized = normalize_alias(alias)
            if normalized:
                by_alias.setdefault(normalized, entry)
        label_alias = normalize_alias(entry.get("label"))
        if label_alias:
            by_alias.setdefault(label_alias, entry)
        for signature in entry.get("signatures", []) or []:
            by_signature.setdefault(signature.lower(), entry)
    return by_alias, by_signature
def ensure_list(container: Dict, key: str) -> List:
    """Return container[key] as a list, installing a fresh one when absent.

    Any existing non-list value is replaced by an empty list.
    """
    existing = container.get(key)
    if isinstance(existing, list):
        return existing
    fresh: List = []
    container[key] = fresh
    return fresh
def merge_sorted_unique(values: Iterable[int]) -> List[int]:
    """Return the distinct int members of *values* in ascending order; non-ints are dropped."""
    ints = {int(item) for item in values if isinstance(item, int)}
    return sorted(ints)
def normalize_source_path(pdf_path: Optional[str]) -> Optional[str]:
    """Render a PDF path repo-relative (when possible) with forward slashes."""
    if not pdf_path:
        return None
    try:
        candidate = Path(pdf_path).relative_to(REPO_ROOT)
    except Exception:
        # Outside the repo (or not relativisable): keep the path as given.
        candidate = Path(pdf_path)
    return str(candidate).replace("\\", "/")
def update_library(
    signatures_dir: Path, index_path: Path, apply_changes: bool
) -> Tuple[int, int, List[Tuple[str, Path]]]:
    """Merge captured signature dumps into the library index.

    Each font from *signatures_dir* is matched to an index entry by signature
    first, then by normalised alias. Matched entries absorb new signatures,
    aliases, glyph coverage, and (only if absent) a source PDF path. The index
    file is rewritten only when *apply_changes* is True and something changed.

    Returns (modifications, updated_entry_count, unmatched) where unmatched
    lists fonts that have no library entry yet.
    """
    entries = load_json(index_path)
    alias_index, signature_index = make_alias_index(entries)
    modifications = 0
    updated_entries = set()
    unmatched: List[Tuple[str, Path]] = []
    signature_files = sorted(signatures_dir.glob("*.json"))
    if not signature_files:
        print(f"No signature JSON files found under {signatures_dir}", file=sys.stderr)
        return 0, 0, unmatched
    for sig_file in signature_files:
        for font in iter_signature_fonts(sig_file):
            signature = font["signature"]
            norm_signature = signature.lower() if signature else None
            alias = font["alias"]
            entry = None
            # Signature match is authoritative; alias match is the fallback.
            if norm_signature and norm_signature in signature_index:
                entry = signature_index[norm_signature]
            elif alias and alias in alias_index:
                entry = alias_index[alias]
            if entry is None:
                unmatched.append((font.get("baseName") or font.get("alias_raw") or "unknown", sig_file))
                continue
            entry_modified = False
            # Signatures
            if signature:
                signature_list = ensure_list(entry, "signatures")
                if signature not in signature_list:
                    signature_list.append(signature)
                    entry_modified = True
                    # Later fonts with this signature resolve to the same entry.
                    signature_index[signature.lower()] = entry
            # Aliases
            alias_raw = font.get("alias_raw")
            if alias_raw:
                aliases = ensure_list(entry, "aliases")
                if alias_raw not in aliases:
                    aliases.append(alias_raw)
                    entry_modified = True
                    normalized = normalize_alias(alias_raw)
                    if normalized:
                        alias_index.setdefault(normalized, entry)
            # Glyph coverage
            coverage = font.get("glyphCoverage") or []
            if coverage:
                existing = set(entry.get("glyphCoverage", []))
                merged = merge_sorted_unique(list(existing) + coverage)
                if merged != entry.get("glyphCoverage"):
                    entry["glyphCoverage"] = merged
                    entry_modified = True
            # Source PDF
            pdf_source = normalize_source_path(font.get("pdf"))
            if pdf_source and not entry.get("source"):
                entry["source"] = pdf_source
                entry_modified = True
            if entry_modified:
                # One "modification" is counted per changed font record.
                modifications += 1
                updated_entries.add(entry.get("id", "<unknown>"))
    if apply_changes and modifications > 0:
        dump_json(index_path, entries)
    return modifications, len(updated_entries), unmatched
def parse_args() -> argparse.Namespace:
    """Parse CLI options: signatures dir, index path, and the --apply switch."""
    parser = argparse.ArgumentParser(description="Update Type3 library index using signature dumps.")
    parser.add_argument(
        "--signatures-dir",
        type=Path,
        default=DEFAULT_SIGNATURES,
        help=f"Directory containing signature JSON files (default: {DEFAULT_SIGNATURES})",
    )
    parser.add_argument(
        "--index",
        type=Path,
        default=DEFAULT_INDEX,
        help=f"Path to type3/library/index.json (default: {DEFAULT_INDEX})",
    )
    parser.add_argument(
        "--apply",
        action="store_true",
        help="Write changes back to the index file. Without this flag the script runs in dry-run mode.",
    )
    return parser.parse_args()
def main() -> None:
    """CLI entry point: validate paths, run the merge, and print a summary."""
    args = parse_args()
    # Relative paths are anchored at the repo root, not the current directory.
    signatures_dir = args.signatures_dir if args.signatures_dir.is_absolute() else (REPO_ROOT / args.signatures_dir)
    index_path = args.index if args.index.is_absolute() else (REPO_ROOT / args.index)
    if not signatures_dir.exists():
        print(f"Signature directory not found: {signatures_dir}", file=sys.stderr)
        sys.exit(2)
    if not index_path.exists():
        print(f"Index file not found: {index_path}", file=sys.stderr)
        sys.exit(2)
    modifications, updated_entries, unmatched = update_library(
        signatures_dir, index_path, apply_changes=args.apply
    )
    mode = "APPLIED" if args.apply else "DRY-RUN"
    print(
        f"[{mode}] Processed signatures under {signatures_dir}. "
        f"Updated entries: {updated_entries}, individual modifications: {modifications}."
    )
    if unmatched:
        print("\nUnmatched fonts (no library entry yet):")
        for alias, sig_file in unmatched:
            print(f"  - {alias} (from {sig_file})")
        print("Add these fonts to index.json with the proper payload before rerunning.")
    if modifications == 0:
        print("No changes detected; index.json already matches captured signatures.")


# Script entry point.
if __name__ == "__main__":
    main()