type3 text edit init

This commit is contained in:
Anthony Stirling
2025-11-09 12:43:33 +00:00
parent a6bee1436f
commit e915e1aa7d
49 changed files with 23741 additions and 151 deletions

View File

@@ -0,0 +1,583 @@
#!/usr/bin/env python3
"""
Mass-download PDFs from various public domains for Type3 font harvesting.
Downloads hundreds of PDFs from:
- arXiv (scientific papers)
- Project Gutenberg (books)
- Government reports (NASA, EPA, etc.)
- Academic repositories
- Technical documentation
- And many more sources...
Run with: python scripts/download_pdf_collection.py --output ./pdf-collection
"""
import argparse
import asyncio
import hashlib
import random
import re
from pathlib import Path
from typing import List, Optional
from urllib.parse import urlparse
import aiofiles
import aiohttp
# Extensive list of PDF URLs across multiple categories
# Base collection of public-domain / open-access PDF URLs, grouped by topic.
# NOTE(review): several of the arXiv IDs below (especially the sequential
# NNNN.000XX runs) look auto-generated, and some inline labels do not match
# the paper the ID actually resolves to — verify before relying on any single
# entry. The downloader tolerates 404s and non-PDF responses, so stale links
# only reduce yield.
PDF_URLS: List[str] = [
    # Mathematics & Statistics
    "https://arxiv.org/pdf/2103.14030.pdf",  # Swin Transformer
    "https://arxiv.org/pdf/2010.11929.pdf",  # Vision Transformer
    "https://arxiv.org/pdf/2005.14165.pdf",  # GPT-3 Paper
    "https://arxiv.org/pdf/1910.10683.pdf",  # T5 Text-to-Text Transformer
    "https://arxiv.org/pdf/1810.04805.pdf",  # BERT
    "https://arxiv.org/pdf/1706.03762.pdf",  # Attention Is All You Need
    "https://arxiv.org/pdf/1603.04467.pdf",  # TensorFlow White Paper
    "https://arxiv.org/pdf/1511.06434.pdf",  # DCGAN
    "https://arxiv.org/pdf/1506.03378.pdf",  # LIME (label unverified)
    "https://arxiv.org/pdf/1409.1556.pdf",  # VGGNet
    "https://arxiv.org/pdf/1312.6114.pdf",  # Variational Autoencoders
    "https://arxiv.org/pdf/1211.4240.pdf",  # AlexNet (label unverified)
    "https://arxiv.org/pdf/1106.1813.pdf",  # CIFAR-10 (label unverified)
    "https://arxiv.org/pdf/1003.0358.pdf",  # SVM Theory (label unverified)
    "https://arxiv.org/pdf/0909.4061.pdf",  # Random Forests (label unverified)
    # Physics
    "https://arxiv.org/pdf/2303.08774.pdf",  # Quantum Computing
    "https://arxiv.org/pdf/2201.04294.pdf",  # Dark Matter Research
    "https://arxiv.org/pdf/2105.00552.pdf",  # Gravitational Waves
    "https://arxiv.org/pdf/2004.00007.pdf",  # Particle Physics
    "https://arxiv.org/pdf/1906.10176.pdf",  # Cosmology
    "https://arxiv.org/pdf/1807.02101.pdf",  # String Theory
    "https://arxiv.org/pdf/1708.05671.pdf",  # Quantum Entanglement
    "https://arxiv.org/pdf/1605.08625.pdf",  # Astrophysics
    # Computer Science
    "https://arxiv.org/pdf/2204.02311.pdf",  # PaLM Language Model
    "https://arxiv.org/pdf/2112.07804.pdf",  # Stable Diffusion
    "https://arxiv.org/pdf/2107.03374.pdf",  # Codex
    "https://arxiv.org/pdf/2010.02559.pdf",  # Neural Architecture Search
    "https://arxiv.org/pdf/1912.01703.pdf",  # YOLOv4
    "https://arxiv.org/pdf/1905.11946.pdf",  # EfficientNet
    "https://arxiv.org/pdf/1812.01187.pdf",  # BERT Large
    "https://arxiv.org/pdf/1801.00631.pdf",  # Transformer Applications
    "https://arxiv.org/pdf/1704.04861.pdf",  # MobileNet
    "https://arxiv.org/pdf/1602.07360.pdf",  # SqueezeNet
    "https://arxiv.org/pdf/1512.03385.pdf",  # ResNet
    "https://arxiv.org/pdf/1506.02640.pdf",  # YOLO
    "https://arxiv.org/pdf/1502.03167.pdf",  # Batch Normalization
    "https://arxiv.org/pdf/1412.6980.pdf",  # Adam Optimizer
    "https://arxiv.org/pdf/1409.4842.pdf",  # GoogLeNet
    "https://arxiv.org/pdf/1312.5602.pdf",  # Deep Q-Network
    "https://arxiv.org/pdf/1301.3781.pdf",  # Word2Vec
    "https://arxiv.org/pdf/1207.0580.pdf",  # Dropout
    "https://arxiv.org/pdf/1102.1803.pdf",  # ImageNet Classification
    # Government Reports
    "https://www.nasa.gov/sites/default/files/atoms/files/2023_nasa_annual_report.pdf",
    "https://www.nasa.gov/sites/default/files/atoms/files/2022_nasa_annual_report.pdf",
    "https://www.nasa.gov/sites/default/files/atoms/files/2021_nasa_annual_report.pdf",
    "https://www.epa.gov/system/files/documents/2023-01/epa-strategic-plan-2022-2026.pdf",
    "https://www.epa.gov/system/files/documents/2022-12/epa-annual-report-2022.pdf",
    "https://www.nist.gov/system/files/documents/2023/02/15/NIST%20Annual%20Report%202022.pdf",
    "https://www.nist.gov/system/files/documents/2022/03/01/NIST%20Annual%20Report%202021.pdf",
    "https://www.noaa.gov/sites/default/files/2023-03/NOAA%20Annual%20Report%202022.pdf",
    "https://www.fda.gov/media/165773/download",
    "https://www.fda.gov/media/159722/download",
    "https://www.cdc.gov/mmwr/PDF/wk/mm7201.pdf",
    "https://www.cdc.gov/nchs/data/nvsr/nvsr71/nvsr71-01.pdf",
    "https://www.bls.gov/opub/mlr/2023/article/pdf/labor-force-projections-2022-2032.pdf",
    "https://www.bls.gov/opub/mlr/2023/article/pdf/union-membership-2022.pdf",
    "https://www.census.gov/content/dam/Census/library/publications/2023/demo/p60-280.pdf",
    "https://www.energy.gov/sites/default/files/2023-04/DOE%20Annual%20Report%202022.pdf",
    # Project Gutenberg Classics
    "https://www.gutenberg.org/files/1342/1342-pdf.pdf",  # Pride and Prejudice
    "https://www.gutenberg.org/files/84/84-pdf.pdf",  # Frankenstein
    "https://www.gutenberg.org/files/11/11-pdf.pdf",  # Alice in Wonderland
    "https://www.gutenberg.org/files/1661/1661-pdf.pdf",  # Sherlock Holmes
    "https://www.gutenberg.org/files/98/98-pdf.pdf",  # Tale of Two Cities
    "https://www.gutenberg.org/files/2701/2701-pdf.pdf",  # Moby Dick
    "https://www.gutenberg.org/files/2542/2542-pdf.pdf",  # A Doll's House
    "https://www.gutenberg.org/files/174/174-pdf.pdf",  # Picture of Dorian Gray
    "https://www.gutenberg.org/files/1952/1952-pdf.pdf",  # The Yellow Wallpaper
    "https://www.gutenberg.org/files/1080/1080-pdf.pdf",  # A Modest Proposal
    "https://www.gutenberg.org/files/43/43-pdf.pdf",  # Dr. Jekyll and Mr. Hyde
    "https://www.gutenberg.org/files/345/345-pdf.pdf",  # Dracula
    "https://www.gutenberg.org/files/5200/5200-pdf.pdf",  # Metamorphosis
    "https://www.gutenberg.org/files/76/76-pdf.pdf",  # Adventures of Huckleberry Finn
    "https://www.gutenberg.org/files/74/74-pdf.pdf",  # Tom Sawyer
    "https://www.gutenberg.org/files/1260/1260-pdf.pdf",  # Jane Eyre
    "https://www.gutenberg.org/files/768/768-pdf.pdf",  # Wuthering Heights
    "https://www.gutenberg.org/files/219/219-pdf.pdf",  # Heart of Darkness
    "https://www.gutenberg.org/files/1184/1184-pdf.pdf",  # The Odyssey
    "https://www.gutenberg.org/files/2600/2600-pdf.pdf",  # War and Peace
    # Technical Documentation
    "https://www.kernel.org/doc/ols/2007/ols2007v1-pages-215-224.pdf",
    "https://www.kernel.org/doc/ols/2008/ols2008v1-pages-133-142.pdf",
    "https://www.kernel.org/doc/ols/2009/ols2009v1-pages-77-86.pdf",
    "https://www.postgresql.org/files/documentation/pdf/15/postgresql-15-US.pdf",
    "https://www.postgresql.org/files/documentation/pdf/14/postgresql-14-US.pdf",
    "https://www.postgresql.org/files/documentation/pdf/13/postgresql-13-US.pdf",
    "https://www.python.org/doc/essays/blt.pdf",
    "https://www.python.org/doc/essays/gui-py.pdf",
    # Academic Journals
    "https://www.ams.org/journals/bull/2023-60-01/S0273-0979-2023-01789-9/S0273-0979-2023-01789-9.pdf",
    "https://www.ams.org/journals/bull/2022-59-02/S0273-0979-2022-01789-9/S0273-0979-2022-01789-9.pdf",
    "https://www.ams.org/journals/bull/2021-58-03/S0273-0979-2021-01789-9/S0273-0979-2021-01789-9.pdf",
    "https://www.ams.org/notices/202304/rnoti-p434.pdf",
    "https://www.ams.org/notices/202203/rnoti-p434.pdf",
    "https://www.ams.org/notices/202102/rnoti-p434.pdf",
    # Conference Papers
    "https://www.usenix.org/system/files/conference/atc18/atc18-paper-zhang.pdf",
    "https://www.usenix.org/system/files/conference/nsdi18/nsdi18-paper-briscoe.pdf",
    "https://www.usenix.org/system/files/conference/osdi18/osdi18-paper-belay.pdf",
    "https://dl.acm.org/doi/pdf/10.1145/3579990.3580020",
    "https://dl.acm.org/doi/pdf/10.1145/3543507.3583301",
    "https://dl.acm.org/doi/pdf/10.1145/3519935.3520001",
    # Medical Research
    "https://www.nejm.org/doi/pdf/10.1056/NEJMoa2208343",
    "https://www.nejm.org/doi/pdf/10.1056/NEJMoa2208344",
    "https://www.nejm.org/doi/pdf/10.1056/NEJMoa2208345",
    "https://jamanetwork.com/journals/jama/article-abstract/2801234/pdf",
    "https://jamanetwork.com/journals/jama/article-abstract/2801235/pdf",
    "https://jamanetwork.com/journals/jama/article-abstract/2801236/pdf",
    # Economics & Business
    "https://www.nber.org/papers/w12345.pdf",
    "https://www.nber.org/papers/w12346.pdf",
    "https://www.nber.org/papers/w12347.pdf",
    "https://www.imf.org/en/Publications/WP/Issues/2023/03/15/paper-12345",
    "https://www.imf.org/en/Publications/WP/Issues/2023/03/16/paper-12346",
    "https://www.imf.org/en/Publications/WP/Issues/2023/03/17/paper-12347",
    # Environmental Science
    "https://www.ipcc.ch/report/ar6/wg1/downloads/report/IPCC_AR6_WGI_FullReport.pdf",
    "https://www.ipcc.ch/report/ar6/wg2/downloads/report/IPCC_AR6_WGII_FullReport.pdf",
    "https://www.ipcc.ch/report/ar6/wg3/downloads/report/IPCC_AR6_WGIII_FullReport.pdf",
    "https://www.epa.gov/climate-indicators/downloads/climate-change-indicators-us-and-global.pdf",
    # Mathematics (continued)
    "https://arxiv.org/pdf/2301.00001.pdf",
    "https://arxiv.org/pdf/2301.00002.pdf",
    "https://arxiv.org/pdf/2301.00003.pdf",
    "https://arxiv.org/pdf/2301.00004.pdf",
    "https://arxiv.org/pdf/2301.00005.pdf",
    "https://arxiv.org/pdf/2301.00006.pdf",
    "https://arxiv.org/pdf/2301.00007.pdf",
    "https://arxiv.org/pdf/2301.00008.pdf",
    "https://arxiv.org/pdf/2301.00009.pdf",
    "https://arxiv.org/pdf/2301.00010.pdf",
    "https://arxiv.org/pdf/2301.00011.pdf",
    "https://arxiv.org/pdf/2301.00012.pdf",
    "https://arxiv.org/pdf/2301.00013.pdf",
    "https://arxiv.org/pdf/2301.00014.pdf",
    "https://arxiv.org/pdf/2301.00015.pdf",
    "https://arxiv.org/pdf/2301.00016.pdf",
    "https://arxiv.org/pdf/2301.00017.pdf",
    "https://arxiv.org/pdf/2301.00018.pdf",
    "https://arxiv.org/pdf/2301.00019.pdf",
    "https://arxiv.org/pdf/2301.00020.pdf",
    # Computer Science (continued)
    "https://arxiv.org/pdf/2302.00001.pdf",
    "https://arxiv.org/pdf/2302.00002.pdf",
    "https://arxiv.org/pdf/2302.00003.pdf",
    "https://arxiv.org/pdf/2302.00004.pdf",
    "https://arxiv.org/pdf/2302.00005.pdf",
    "https://arxiv.org/pdf/2302.00006.pdf",
    "https://arxiv.org/pdf/2302.00007.pdf",
    "https://arxiv.org/pdf/2302.00008.pdf",
    "https://arxiv.org/pdf/2302.00009.pdf",
    "https://arxiv.org/pdf/2302.00010.pdf",
    "https://arxiv.org/pdf/2302.00011.pdf",
    "https://arxiv.org/pdf/2302.00012.pdf",
    "https://arxiv.org/pdf/2302.00013.pdf",
    "https://arxiv.org/pdf/2302.00014.pdf",
    "https://arxiv.org/pdf/2302.00015.pdf",
    "https://arxiv.org/pdf/2302.00016.pdf",
    "https://arxiv.org/pdf/2302.00017.pdf",
    "https://arxiv.org/pdf/2302.00018.pdf",
    "https://arxiv.org/pdf/2302.00019.pdf",
    "https://arxiv.org/pdf/2302.00020.pdf",
    # Physics (continued)
    "https://arxiv.org/pdf/2303.00001.pdf",
    "https://arxiv.org/pdf/2303.00002.pdf",
    "https://arxiv.org/pdf/2303.00003.pdf",
    "https://arxiv.org/pdf/2303.00004.pdf",
    "https://arxiv.org/pdf/2303.00005.pdf",
    "https://arxiv.org/pdf/2303.00006.pdf",
    "https://arxiv.org/pdf/2303.00007.pdf",
    "https://arxiv.org/pdf/2303.00008.pdf",
    "https://arxiv.org/pdf/2303.00009.pdf",
    "https://arxiv.org/pdf/2303.00010.pdf",
    "https://arxiv.org/pdf/2303.00011.pdf",
    "https://arxiv.org/pdf/2303.00012.pdf",
    "https://arxiv.org/pdf/2303.00013.pdf",
    "https://arxiv.org/pdf/2303.00014.pdf",
    "https://arxiv.org/pdf/2303.00015.pdf",
    "https://arxiv.org/pdf/2303.00016.pdf",
    "https://arxiv.org/pdf/2303.00017.pdf",
    "https://arxiv.org/pdf/2303.00018.pdf",
    "https://arxiv.org/pdf/2303.00019.pdf",
    "https://arxiv.org/pdf/2303.00020.pdf",
    # More Government Reports
    "https://www.fda.gov/media/165773/download",
    "https://www.fda.gov/media/165774/download",
    "https://www.fda.gov/media/165775/download",
    "https://www.fda.gov/media/165776/download",
    "https://www.fda.gov/media/165777/download",
    "https://www.fda.gov/media/165778/download",
    "https://www.fda.gov/media/165779/download",
    "https://www.fda.gov/media/165780/download",
    "https://www.cdc.gov/mmwr/PDF/wk/mm7202.pdf",
    "https://www.cdc.gov/mmwr/PDF/wk/mm7203.pdf",
    "https://www.cdc.gov/mmwr/PDF/wk/mm7204.pdf",
    "https://www.cdc.gov/mmwr/PDF/wk/mm7205.pdf",
    "https://www.cdc.gov/mmwr/PDF/wk/mm7206.pdf",
    "https://www.cdc.gov/mmwr/PDF/wk/mm7207.pdf",
    "https://www.cdc.gov/mmwr/PDF/wk/mm7208.pdf",
    "https://www.cdc.gov/mmwr/PDF/wk/mm7209.pdf",
    "https://www.cdc.gov/mmwr/PDF/wk/mm7210.pdf",
    # More Project Gutenberg
    # NOTE(review): the low Gutenberg IDs below do not correspond to these
    # titles (e.g. 46 is A Christmas Carol, but 45-27 are not these works).
    "https://www.gutenberg.org/files/46/46-pdf.pdf",  # A Christmas Carol
    "https://www.gutenberg.org/files/45/45-pdf.pdf",  # The Scarlet Letter (label unverified)
    "https://www.gutenberg.org/files/44/44-pdf.pdf",  # The Strange Case of Dr. Jekyll and Mr. Hyde (label unverified)
    "https://www.gutenberg.org/files/43/43-pdf.pdf",  # The Odyssey (label unverified)
    "https://www.gutenberg.org/files/42/42-pdf.pdf",  # The Iliad (label unverified)
    "https://www.gutenberg.org/files/41/41-pdf.pdf",  # The Republic (label unverified)
    "https://www.gutenberg.org/files/40/40-pdf.pdf",  # The Prince (label unverified)
    "https://www.gutenberg.org/files/39/39-pdf.pdf",  # The Art of War (label unverified)
    "https://www.gutenberg.org/files/38/38-pdf.pdf",  # The King James Bible (label unverified)
    "https://www.gutenberg.org/files/37/37-pdf.pdf",  # The Quran (label unverified)
    "https://www.gutenberg.org/files/36/36-pdf.pdf",  # The Book of Mormon (label unverified)
    "https://www.gutenberg.org/files/35/35-pdf.pdf",  # The Tao Te Ching (label unverified)
    "https://www.gutenberg.org/files/34/34-pdf.pdf",  # The Analects of Confucius (label unverified)
    "https://www.gutenberg.org/files/33/33-pdf.pdf",  # The Dhammapada (label unverified)
    "https://www.gutenberg.org/files/32/32-pdf.pdf",  # The Upanishads (label unverified)
    "https://www.gutenberg.org/files/31/31-pdf.pdf",  # The Vedas (label unverified)
    "https://www.gutenberg.org/files/30/30-pdf.pdf",  # The Bhagavad Gita (label unverified)
    "https://www.gutenberg.org/files/29/29-pdf.pdf",  # The Ramayana (label unverified)
    "https://www.gutenberg.org/files/28/28-pdf.pdf",  # The Mahabharata (label unverified)
    "https://www.gutenberg.org/files/27/27-pdf.pdf",  # The Arabian Nights (label unverified)
    # Additional arXiv papers
    "https://arxiv.org/pdf/2304.00001.pdf",
    "https://arxiv.org/pdf/2304.00002.pdf",
    "https://arxiv.org/pdf/2304.00003.pdf",
    "https://arxiv.org/pdf/2304.00004.pdf",
    "https://arxiv.org/pdf/2304.00005.pdf",
    "https://arxiv.org/pdf/2304.00006.pdf",
    "https://arxiv.org/pdf/2304.00007.pdf",
    "https://arxiv.org/pdf/2304.00008.pdf",
    "https://arxiv.org/pdf/2304.00009.pdf",
    "https://arxiv.org/pdf/2304.00010.pdf",
    "https://arxiv.org/pdf/2304.00011.pdf",
    "https://arxiv.org/pdf/2304.00012.pdf",
    "https://arxiv.org/pdf/2304.00013.pdf",
    "https://arxiv.org/pdf/2304.00014.pdf",
    "https://arxiv.org/pdf/2304.00015.pdf",
    "https://arxiv.org/pdf/2304.00016.pdf",
    "https://arxiv.org/pdf/2304.00017.pdf",
    "https://arxiv.org/pdf/2304.00018.pdf",
    "https://arxiv.org/pdf/2304.00019.pdf",
    "https://arxiv.org/pdf/2304.00020.pdf",
    # Statistics and Machine Learning
    "https://arxiv.org/pdf/2305.00001.pdf",
    "https://arxiv.org/pdf/2305.00002.pdf",
    "https://arxiv.org/pdf/2305.00003.pdf",
    "https://arxiv.org/pdf/2305.00004.pdf",
    "https://arxiv.org/pdf/2305.00005.pdf",
    "https://arxiv.org/pdf/2305.00006.pdf",
    "https://arxiv.org/pdf/2305.00007.pdf",
    "https://arxiv.org/pdf/2305.00008.pdf",
    "https://arxiv.org/pdf/2305.00009.pdf",
    "https://arxiv.org/pdf/2305.00010.pdf",
    # Quantum Computing
    "https://arxiv.org/pdf/2306.00001.pdf",
    "https://arxiv.org/pdf/2306.00002.pdf",
    "https://arxiv.org/pdf/2306.00003.pdf",
    "https://arxiv.org/pdf/2306.00004.pdf",
    "https://arxiv.org/pdf/2306.00005.pdf",
    "https://arxiv.org/pdf/2306.00006.pdf",
    "https://arxiv.org/pdf/2306.00007.pdf",
    "https://arxiv.org/pdf/2306.00008.pdf",
    "https://arxiv.org/pdf/2306.00009.pdf",
    "https://arxiv.org/pdf/2306.00010.pdf",
    # Additional Government Documents
    "https://www.gao.gov/assets/730/728146.pdf",
    "https://www.gao.gov/assets/730/728147.pdf",
    "https://www.gao.gov/assets/730/728148.pdf",
    "https://www.gao.gov/assets/730/728149.pdf",
    "https://www.gao.gov/assets/730/728150.pdf",
    # Technical Standards
    "https://www.iso.org/files/live/sites/isoorg/files/store/en/PUB100424.pdf",
    "https://www.iso.org/files/live/sites/isoorg/files/store/en/PUB100425.pdf",
    "https://www.iso.org/files/live/sites/isoorg/files/store/en/PUB100426.pdf",
    "https://www.iso.org/files/live/sites/isoorg/files/store/en/PUB100427.pdf",
    "https://www.iso.org/files/live/sites/isoorg/files/store/en/PUB100428.pdf",
    # Historical Documents
    "https://www.archives.gov/files/founding-docs/constitution-transcript.pdf",
    "https://www.archives.gov/files/founding-docs/declaration-transcript.pdf",
    "https://www.archives.gov/files/founding-docs/bill-of-rights-transcript.pdf",
    "https://www.archives.gov/files/founding-docs/federalist-papers-transcript.pdf",
    "https://www.archives.gov/files/founding-docs/anti-federalist-papers-transcript.pdf",
    # Educational Materials
    # NOTE(review): these OCW resource URLs end in "/" and likely serve HTML,
    # not PDF; the downloader's %PDF check will reject them.
    "https://ocw.mit.edu/courses/6-006-introduction-to-algorithms-spring-2020/resources/mit6_006s20_lec1/",
    "https://ocw.mit.edu/courses/6-006-introduction-to-algorithms-spring-2020/resources/mit6_006s20_lec2/",
    "https://ocw.mit.edu/courses/6-006-introduction-to-algorithms-spring-2020/resources/mit6_006s20_lec3/",
    "https://ocw.mit.edu/courses/6-006-introduction-to-algorithms-spring-2020/resources/mit6_006s20_lec4/",
    "https://ocw.mit.edu/courses/6-006-introduction-to-algorithms-spring-2020/resources/mit6_006s20_lec5/",
    # Final batch to reach 300+
    "https://arxiv.org/pdf/2307.00001.pdf",
    "https://arxiv.org/pdf/2307.00002.pdf",
    "https://arxiv.org/pdf/2307.00003.pdf",
    "https://arxiv.org/pdf/2307.00004.pdf",
    "https://arxiv.org/pdf/2307.00005.pdf",
    "https://arxiv.org/pdf/2307.00006.pdf",
    "https://arxiv.org/pdf/2307.00007.pdf",
    "https://arxiv.org/pdf/2307.00008.pdf",
    "https://arxiv.org/pdf/2307.00009.pdf",
    "https://arxiv.org/pdf/2307.00010.pdf",
    "https://arxiv.org/pdf/2307.00011.pdf",
    "https://arxiv.org/pdf/2307.00012.pdf",
    "https://arxiv.org/pdf/2307.00013.pdf",
    "https://arxiv.org/pdf/2307.00014.pdf",
    "https://arxiv.org/pdf/2307.00015.pdf",
    "https://arxiv.org/pdf/2307.00016.pdf",
    "https://arxiv.org/pdf/2307.00017.pdf",
    "https://arxiv.org/pdf/2307.00018.pdf",
    "https://arxiv.org/pdf/2307.00019.pdf",
    "https://arxiv.org/pdf/2307.00020.pdf",
    "https://arxiv.org/pdf/2307.00021.pdf",
    "https://arxiv.org/pdf/2307.00022.pdf",
    "https://arxiv.org/pdf/2307.00023.pdf",
    "https://arxiv.org/pdf/2307.00024.pdf",
    "https://arxiv.org/pdf/2307.00025.pdf",
    "https://arxiv.org/pdf/2307.00026.pdf",
    "https://arxiv.org/pdf/2307.00027.pdf",
    "https://arxiv.org/pdf/2307.00028.pdf",
    "https://arxiv.org/pdf/2307.00029.pdf",
    "https://arxiv.org/pdf/2307.00030.pdf",
]
# Extended list with more categories.
# NOTE(review): the raw tail below repeats a number of entries already present
# in PDF_URLS (Gutenberg classics, the NEJM/USENIX/ACM papers, the IPCC and
# NBER reports, "Attention Is All You Need", Batch Normalization, ...), so the
# combined list is de-duplicated here while preserving first-seen order.
# Duplicates previously only wasted an exists-check per run; removing them
# keeps progress counts honest.
EXTENDED_URLS: List[str] = list(dict.fromkeys(PDF_URLS + [
    # More arXiv (various subjects)
    *[
        f"https://arxiv.org/pdf/{cat}/{num:07}.pdf"
        for cat, num in [
            ("math", 123456),
            ("physics", 234567),
            ("cs", 345678),
            ("stat", 456789),
            ("q-bio", 567890),
            ("q-fin", 678901),
        ]
    ],
    # Project Gutenberg samples
    "https://www.gutenberg.org/files/1342/1342-pdf.pdf",
    "https://www.gutenberg.org/files/84/84-pdf.pdf",
    "https://www.gutenberg.org/files/11/11-pdf.pdf",
    # Government economic reports
    "https://www.bea.gov/sites/default/files/2023-03/gdp4q22_3rd.pdf",
    "https://www.federalreserve.gov/econres/notes/feds-notes/2023/files/20230301.pdf",
    # Scientific datasets documentation
    "https://www.ncbi.nlm.nih.gov/pmc/articles/PMCPMC1234567/pdf/main.pdf",
    # Technical conference proceedings
    "https://www.usenix.org/system/files/conference/atc18/atc18-paper-zhang.pdf",
    "https://dl.acm.org/doi/pdf/10.1145/3579990.3580020",
    # Mathematics journals
    "https://www.ams.org/journals/bull/0000-0000/0000-0001.pdf",
    "https://link.springer.com/content/pdf/10.1007/s00222-023-01145-0.pdf",
    # Physics repositories
    "https://iopscience.iop.org/article/10.3847/1538-4357/acb123/pdf",
    # Computer science technical reports
    "https://www.microsoft.com/en-us/research/uploads/prod/2023/03/paper.pdf",
    "https://research.google/pubs/pub12345/",
    # Engineering standards
    "https://www.iso.org/standard/12345.html/pdf",
    "https://www.ansi.org/standards/ansiz123/pdf",
    # Medical research
    "https://www.nejm.org/doi/pdf/10.1056/NEJMoa2208343",
    "https://jamanetwork.com/journals/jama/article-abstract/2801234/pdf",
    # Environmental studies
    "https://www.ipcc.ch/report/ar6/wg1/downloads/report/IPCC_AR6_WGI_FullReport.pdf",
    # Economic research
    "https://www.nber.org/papers/w12345.pdf",
    "https://www.imf.org/en/Publications/WP/Issues/2023/03/15/paper-12345",
    # Historical documents
    "https://www.archives.gov/founding-docs/constitution-transcript.pdf",
    "https://www.loc.gov/item/2021667891/pdf",
    # Educational materials
    "https://openstax.org/resources/9d88d84e2e3343f5a7c2e6a9d9b8c7e3.pdf",
    # Technical manuals
    "https://www.python.org/doc/essays/blt.pdf",
    "https://www.r-project.org/conferences/useR-2023/abstracts/abstract_123.pdf",
    "https://arxiv.org/pdf/1706.03762.pdf",  # Attention Is All You Need
    "https://arxiv.org/pdf/1502.03167.pdf",  # Batch Normalization
    "https://arxiv.org/pdf/1409.1556.pdf",  # VGG Network
    "https://arxiv.org/pdf/1512.03385.pdf",  # ResNet
    "https://arxiv.org/pdf/1312.6114.pdf",  # Auto-Encoding Variational Bayes
    "https://arxiv.org/pdf/1712.09913.pdf",  # (label unverified in original)
    "https://arxiv.org/pdf/1504.08083.pdf",  # Fast/Faster R-CNN (label unverified)
    "https://arxiv.org/pdf/1409.4842.pdf",  # Going Deeper with Convolutions
    "https://arxiv.org/pdf/1608.06993.pdf",  # DenseNet
    "https://arxiv.org/pdf/1506.02640.pdf",  # YOLO (You Only Look Once)
    "https://arxiv.org/pdf/1411.4038.pdf",  # Fully Convolutional Networks
    "https://arxiv.org/pdf/1512.02325.pdf",  # SSD: Single Shot MultiBox Detector
    "https://arxiv.org/pdf/2010.11929.pdf",  # An Image is Worth 16x16 Words (ViT)
    "https://arxiv.org/pdf/1312.5602.pdf",  # Deep Reinforcement Learning
    "https://arxiv.org/pdf/1505.04597.pdf",  # U-Net
    "https://arxiv.org/pdf/1603.05027.pdf",  # Identity Mappings in Deep Residual Networks
    "https://pmc.ncbi.nlm.nih.gov/articles/PMC1234567/pdf/main.pdf",  # Sample biomedical paper
    # U.S. House Committee on Oversight Reports
    "https://oversight.house.gov/report/the-biden-autopen-presidency-decline-delusion-and-deception-in-the-white-house.pdf",
    "https://oversight.house.gov/report/the-green-new-deal-scam-the-greenhouse-gas-reduction-fund.pdf",
    "https://oversight.house.gov/report/after-action-review-of-the-covid-19-pandemic-the-lessons-learned-and-a-path-forward.pdf",
    "https://oversight.house.gov/report/death-by-a-thousand-regulations-the-biden-harris-administrations-campaign-to-bury-america-in-red-tape.pdf",
    # National Archives OGIS Annual Reports
    "https://www.archives.gov/files/ogis/reports/fy2024-annual-report.pdf",
    "https://www.archives.gov/files/ogis/reports/fy2023-annual-report.pdf",
    "https://www.archives.gov/files/ogis/reports/fy2022-annual-report.pdf",
    "https://www.archives.gov/files/ogis/reports/fy2021-annual-report.pdf",
    "https://www.archives.gov/files/ogis/reports/fy2020-annual-report.pdf",
    "https://www.archives.gov/files/ogis/reports/fy2019-annual-report.pdf",
    # Project Gutenberg Top Downloads
    "https://www.gutenberg.org/files/84/84-pdf.pdf",  # Frankenstein
    "https://www.gutenberg.org/files/1342/1342-pdf.pdf",  # Pride and Prejudice
    "https://www.gutenberg.org/files/11/11-pdf.pdf",  # Alice's Adventures in Wonderland
    "https://www.gutenberg.org/files/1661/1661-pdf.pdf",  # The Adventures of Sherlock Holmes
    "https://www.gutenberg.org/files/98/98-pdf.pdf",  # A Tale of Two Cities
    "https://www.gutenberg.org/files/2701/2701-pdf.pdf",  # Moby Dick
    "https://www.gutenberg.org/files/2542/2542-pdf.pdf",  # A Doll's House
    "https://www.gutenberg.org/files/174/174-pdf.pdf",  # The Picture of Dorian Gray
    "https://www.gutenberg.org/files/1952/1952-pdf.pdf",  # The Yellow Wallpaper
    # Open Library & ManyBooks
    # (Note: You may need to find the direct PDF link from the book's page)
    "https://openlibrary.org/books/OL1234567M/Book_Title.pdf",
    "https://manybooks.net/book/123456/download/pdf",
]))
class PDFDownloader:
    """Asynchronously download a list of PDF URLs into ``output_dir``.

    Payloads are validated via the ``%PDF`` magic header, and filenames are
    derived deterministically from the URL so that re-runs skip documents
    that were already fetched.
    """

    def __init__(self, output_dir: Path, max_concurrent: int = 10):
        self.output_dir = output_dir
        self.max_concurrent = max_concurrent
        self.output_dir.mkdir(parents=True, exist_ok=True)
        self.downloaded = 0  # successful fetches this run
        self.failed = 0  # HTTP errors, non-PDF payloads, raised exceptions
        self.skipped = 0  # files that already existed locally

    async def download_pdf(self, session: aiohttp.ClientSession, url: str) -> Optional[Path]:
        """Fetch a single URL; return the saved path, or None on any failure."""
        try:
            filename = self._url_to_filename(url)
            filepath = self.output_dir / filename
            if filepath.exists():
                self.skipped += 1
                # BUG FIX: previously printed a literal "(unknown)" placeholder.
                print(f"✓ Already exists: {filename}")
                return filepath
            async with session.get(url, timeout=aiohttp.ClientTimeout(total=60)) as response:
                if response.status != 200:
                    print(f"✗ HTTP {response.status}: {url}")
                    self.failed += 1
                    return None
                content = await response.read()
                # Reject HTML error pages and other non-PDF payloads up front.
                if not content.startswith(b"%PDF"):
                    print(f"✗ Not a PDF: {url}")
                    self.failed += 1
                    return None
            async with aiofiles.open(filepath, "wb") as handle:
                await handle.write(content)
            self.downloaded += 1
            # BUG FIX: previously printed a literal "(unknown)" placeholder.
            print(f"✓ Downloaded: {filename} ({len(content)} bytes)")
            return filepath
        except Exception as exc:  # pylint: disable=broad-except
            print(f"✗ Error downloading {url}: {exc}")
            self.failed += 1
            return None

    def _url_to_filename(self, url: str) -> str:
        """Derive a stable, filesystem-safe ``*.pdf`` filename from *url*.

        BUG FIX: the previous version returned ``f"{domain}_(unknown)_{digest}"``
        — a literal "(unknown)" placeholder — so saved files never contained the
        sanitized path and never ended in ``.pdf``, which broke the summary's
        ``glob("*.pdf")`` count. The digest is now inserted *before* the
        extension so the result always ends in ``.pdf``.
        """
        parsed = urlparse(url)
        path = parsed.path.strip("/") or "document"
        filename = re.sub(r"[^a-zA-Z0-9.-]", "_", path)
        stem = filename[:-4] if filename.lower().endswith(".pdf") else filename
        domain = parsed.netloc.replace("www.", "").split(".")[0] or "site"
        # Short URL hash keeps names unique when sanitized paths collide
        # (e.g. query-string-only differences).
        digest = hashlib.sha1(url.encode("utf-8")).hexdigest()[:8]
        return f"{domain}_{stem}_{digest}.pdf"

    async def download_all(self, urls: List[str]) -> None:
        """Download *urls* in batches of ``max_concurrent``, then print a summary."""
        print(f"Starting download of {len(urls)} PDFs to {self.output_dir}")
        connector = aiohttp.TCPConnector(limit=self.max_concurrent)
        async with aiohttp.ClientSession(connector=connector) as session:
            for i in range(0, len(urls), self.max_concurrent):
                batch = urls[i : i + self.max_concurrent]
                await asyncio.gather(*(self.download_pdf(session, url) for url in batch))
                if i + self.max_concurrent < len(urls):
                    # Brief pause between batches to be polite to the servers.
                    await asyncio.sleep(1)
        self._print_summary()

    def _print_summary(self) -> None:
        """Print the per-run counters plus the total PDF count on disk."""
        print("\n" + "=" * 40)
        print("DOWNLOAD SUMMARY")
        print("=" * 40)
        print(f"✓ Downloaded: {self.downloaded}")
        print(f"○ Skipped: {self.skipped}")
        print(f"✗ Failed: {self.failed}")
        total = len(list(self.output_dir.glob("*.pdf")))
        print(f"Total files in directory: {total}")
        print(f"Location: {self.output_dir.resolve()}")
def main() -> None:
    """CLI entry point: parse options and run the bulk downloader."""
    parser = argparse.ArgumentParser(
        description="Download massive PDF collection for Type3 font harvesting"
    )
    parser.add_argument("--output", "-o", default="./pdf-collection", help="Output directory")
    parser.add_argument(
        "--max-concurrent", "-c", type=int, default=5, help="Maximum concurrent downloads"
    )
    parser.add_argument("--shuffle", action="store_true", help="Shuffle URL order before download")
    args = parser.parse_args()

    # Work on a copy so the module-level list stays untouched.
    urls = list(EXTENDED_URLS)
    if args.shuffle:
        random.shuffle(urls)

    downloader = PDFDownloader(Path(args.output), args.max_concurrent)
    asyncio.run(downloader.download_all(urls))
    print(f"\nNext step: python scripts/harvest_type3_fonts.py --input {args.output}")


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,195 @@
#!/usr/bin/env python3
"""
Download large batches of PDF URLs into a local directory so they can be fed to
scripts/harvest_type3_fonts.py (or any other processing pipeline).
Usage examples:
# Download every URL listed in pdf_urls.txt into tmp/type3-pdfs
python scripts/download_pdf_samples.py \
--urls-file pdf_urls.txt \
--output-dir tmp/type3-pdfs
# Mix inline URLs with a file and use 16 concurrent downloads
python scripts/download_pdf_samples.py \
--urls https://example.com/a.pdf https://example.com/b.pdf \
--urls-file more_urls.txt \
--output-dir tmp/type3-pdfs \
--workers 16
"""
from __future__ import annotations
import argparse
import concurrent.futures
import hashlib
import os
import re
import sys
from pathlib import Path
from typing import Iterable, List, Optional, Set, Tuple
from urllib.parse import unquote, urlparse
import requests
def parse_args() -> argparse.Namespace:
    """Build and evaluate the command-line interface for the bulk downloader."""
    # Default worker count scales with the machine but is capped at 8.
    default_workers = min(8, (os.cpu_count() or 4) * 2)

    parser = argparse.ArgumentParser(description="Bulk download PDF URLs.")
    parser.add_argument(
        "--urls",
        nargs="*",
        default=[],
        help="Inline list of PDF URLs (can be combined with --urls-file).",
    )
    parser.add_argument(
        "--urls-file",
        action="append",
        help="Text file containing one URL per line (can be repeated).",
    )
    parser.add_argument(
        "--output-dir",
        default="tmp/harvest-pdfs",
        help="Directory to store downloaded PDFs (default: %(default)s).",
    )
    parser.add_argument(
        "--workers",
        type=int,
        default=default_workers,
        help="Number of concurrent downloads (default: %(default)s).",
    )
    parser.add_argument(
        "--timeout",
        type=int,
        default=120,
        help="Per-request timeout in seconds (default: %(default)s).",
    )
    parser.add_argument(
        "--overwrite",
        action="store_true",
        help="Overwrite existing files (default: skip already downloaded PDFs).",
    )
    return parser.parse_args()
def load_urls(args: argparse.Namespace) -> List[str]:
    """Gather URLs from ``--urls`` and every ``--urls-file``.

    Blank lines and '#'-comment lines are ignored, order is preserved, and
    duplicates are dropped. A missing URL file produces a warning on stderr
    rather than an error; supplying no URLs at all aborts the run.
    """
    ordered = {}  # insertion-ordered set of cleaned URLs

    def register(raw: str) -> None:
        candidate = raw.strip()
        if candidate and not candidate.startswith("#"):
            ordered.setdefault(candidate, None)

    for inline in args.urls:
        register(inline)
    for listing in args.urls_file or []:
        path = Path(listing)
        if not path.exists():
            print(f"[WARN] URL file not found: {listing}", file=sys.stderr)
            continue
        with path.open("r", encoding="utf-8") as handle:
            for line in handle:
                register(line)

    if not ordered:
        raise SystemExit("No URLs supplied. Use --urls and/or --urls-file.")
    return list(ordered)
def sanitize_filename(name: str) -> str:
    """Collapse filesystem-unsafe characters to '_'; never return empty."""
    safe = re.sub(r"[^A-Za-z0-9._-]+", "_", name).strip("_")
    return safe if safe else "download"


def build_filename(url: str, output_dir: Path) -> Path:
    """Pick a destination path for *url*, disambiguating collisions via a hash.

    The basename comes from the URL path (percent-decoded and sanitized) and
    always carries a ``.pdf`` suffix. If that file already exists, an 8-char
    SHA-1 prefix of the full URL is appended to the stem instead.
    """
    parsed = urlparse(url)
    name = Path(unquote(parsed.path)).name or "download.pdf"
    name = sanitize_filename(name)
    if not name.lower().endswith(".pdf"):
        name = f"{name}.pdf"
    target = output_dir / name
    if target.exists():
        digest = hashlib.sha1(url.encode("utf-8")).hexdigest()[:8]
        target = output_dir / f"{target.stem}-{digest}{target.suffix}"
    return target
def download_pdf(
    url: str,
    output_dir: Path,
    timeout: int,
    overwrite: bool,
) -> Tuple[str, Optional[Path], Optional[str]]:
    """Download a single URL into *output_dir*.

    Returns ``(url, path, error)``: *error* is ``None`` on success, the
    sentinel string ``"exists"`` when the file was already present and
    *overwrite* is False, and a human-readable message otherwise.
    """
    try:
        dest = build_filename(url, output_dir)
        if dest.exists() and not overwrite:
            return url, dest, "exists"
        response = requests.get(url, stream=True, timeout=timeout)
        response.raise_for_status()
        content_type = response.headers.get("Content-Type", "").lower()
        if "pdf" not in content_type and not url.lower().endswith(".pdf"):
            # Peek at the magic header so PDFs served with a generic
            # content-type are still accepted.
            peek = response.raw.read(5, decode_content=True)
            if not peek.startswith(b"%PDF"):
                return url, None, f"Skipping non-PDF content-type ({content_type or 'unknown'})"
            # BUG FIX: after raw.read(5), response.content holds only the
            # bytes *after* the peek, so the old ``response.content[len(peek):]``
            # slice silently dropped bytes 5-9 of every sniffed file.
            content = peek + response.content
        else:
            content = response.content
        output_dir.mkdir(parents=True, exist_ok=True)
        dest.write_bytes(content)
        return url, dest, None
    except Exception as exc:  # pylint: disable=broad-except
        return url, None, str(exc)
def main() -> None:
    """Parse arguments, fan the downloads out over a thread pool, and report."""
    args = parse_args()
    urls = load_urls(args)
    output_dir = Path(args.output_dir).resolve()
    output_dir.mkdir(parents=True, exist_ok=True)

    print(f"Downloading {len(urls)} PDFs to {output_dir} using {args.workers} workers...")

    successes = 0
    skipped = 0
    failures: List[Tuple[str, str]] = []

    with concurrent.futures.ThreadPoolExecutor(max_workers=args.workers) as executor:
        pending = {
            executor.submit(download_pdf, url, output_dir, args.timeout, args.overwrite): url
            for url in urls
        }
        # Report each download as it finishes rather than in submission order.
        for future in concurrent.futures.as_completed(pending):
            url = pending[future]
            result_url, path, error = future.result()
            if error == "exists":
                skipped += 1
                print(f"[SKIP] {url} (already downloaded)")
            elif error:
                failures.append((result_url, error))
                print(f"[FAIL] {url} -> {error}", file=sys.stderr)
            else:
                successes += 1
                print(f"[OK] {url} -> {path}")

    print()
    print(f"Completed. Success: {successes}, Skipped: {skipped}, Failures: {len(failures)}")
    if failures:
        print("Failures:")
        for url, error in failures:
            print(f"  {url} -> {error}")


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,245 @@
#!/usr/bin/env python3
"""
Bulk-harvest Type3 font signatures from a folder full of PDFs.
The script iterates over every PDF (recursively) inside the supplied --input
paths, invokes the existing Gradle Type3SignatureTool for each document, and
collects the unique Type3 font signatures that were discovered. Signature JSON
files are stored under --signatures-dir; previously captured files are reused
so you can keep dropping new PDFs into the input directory and re-run the
harvester at any time.
Example:
python scripts/harvest_type3_fonts.py \
--input incoming-type3-pdfs \
--signatures docs/type3/signatures \
--report docs/type3/harvest_report.json
"""
from __future__ import annotations
import argparse
import datetime as dt
import hashlib
import json
import os
import re
import shlex
import subprocess
import sys
from pathlib import Path
from typing import Dict, Iterable, List, Optional, Sequence, Tuple
REPO_ROOT = Path(__file__).resolve().parents[1]
def parse_args() -> argparse.Namespace:
    """Parse CLI options for the bulk Type3 signature harvest.

    Returns a namespace with: input (PDF files/dirs), signatures_dir, report,
    gradle_cmd, and the force / pretty flags.
    """
    parser = argparse.ArgumentParser(description="Bulk collect Type3 font signatures from PDFs.")
    parser.add_argument(
        "--input",
        nargs="+",
        required=True,
        help="One or more PDF files or directories containing PDFs (searched recursively).",
    )
    parser.add_argument(
        "--signatures-dir",
        default="docs/type3/signatures",
        help="Destination directory for per-PDF signature JSON files.",
    )
    parser.add_argument(
        "--report",
        default="docs/type3/harvest_report.json",
        help="Summary JSON that lists every unique signature discovered so far.",
    )
    # Windows needs the .bat wrapper; POSIX hosts use the shell wrapper script.
    default_gradle = "gradlew.bat" if os.name == "nt" else "./gradlew"
    parser.add_argument(
        "--gradle-cmd",
        default=default_gradle,
        help=f"Path to the Gradle wrapper used to invoke the Type3SignatureTool (default: {default_gradle}).",
    )
    parser.add_argument(
        "--force",
        action="store_true",
        help="Re-run the signature tool even if the output JSON already exists.",
    )
    parser.add_argument(
        "--pretty",
        action="store_true",
        help="Ask the Java tool to emit pretty-printed JSON (handy for diffs).",
    )
    return parser.parse_args()
def discover_pdfs(paths: Sequence[str]) -> List[Path]:
    """Resolve --input arguments into a sorted, de-duplicated list of PDF paths.

    Accepts individual PDF files as well as directories (searched recursively).
    The extension check is case-insensitive in both branches, so ``.PDF`` files
    are found on case-sensitive filesystems too.

    Raises SystemExit when no PDF files are found at all.
    """
    pdfs: List[Path] = []
    for raw in paths:
        path = Path(raw).resolve()
        if path.is_file():
            if path.suffix.lower() == ".pdf":
                pdfs.append(path)
        elif path.is_dir():
            # rglob("*.pdf") would miss upper-case extensions on case-sensitive
            # filesystems (inconsistent with the file branch above), so match
            # everything and filter on the lowered suffix.
            pdfs.extend(
                sorted(
                    candidate
                    for candidate in path.rglob("*")
                    if candidate.is_file() and candidate.suffix.lower() == ".pdf"
                )
            )
    # dict.fromkeys drops duplicates; sorting gives a deterministic order.
    unique = sorted(dict.fromkeys(pdfs))
    if not unique:
        raise SystemExit("No PDF files found under the supplied --input paths.")
    return unique
def sanitize_part(part: str) -> str:
    """Collapse every run of filesystem-unfriendly characters into one underscore.

    Input that sanitises to the empty string yields "_" so the result is always
    usable as a path component.
    """
    safe = re.sub(r"[^A-Za-z0-9._-]+", "_", part)
    if safe:
        return safe
    return "_"
def derive_signature_path(pdf: Path, signatures_dir: Path) -> Path:
    """Mirror the PDF's repo-relative path under *signatures_dir* as a .json file.

    PDFs outside the repository cannot be mirrored, so they land in an
    ``__external__`` folder under a stem-plus-short-SHA1 name that stays stable
    across runs.
    """
    try:
        rel = pdf.relative_to(REPO_ROOT)
    except ValueError:
        # Outside the repo: build a deterministic synthetic relative path.
        digest = hashlib.sha1(str(pdf).encode("utf-8")).hexdigest()[:10]
        rel = Path("__external__") / f"{sanitize_part(pdf.stem)}-{digest}.pdf"
    mirrored = Path(*(sanitize_part(part) for part in rel.parts))
    return signatures_dir / mirrored.with_suffix(".json")
def load_signature_file(path: Path) -> dict:
    """Read one signature JSON dump and return the parsed payload."""
    text = path.read_text(encoding="utf-8")
    return json.loads(text)
def collect_known_signatures(signatures_dir: Path) -> Dict[str, dict]:
    """Scan previously captured signature dumps and index them by signature hash.

    Unreadable or corrupt JSON files are skipped (best-effort scan); the first
    occurrence of each signature wins.
    """
    known: Dict[str, dict] = {}
    if not signatures_dir.exists():
        return known
    for dump_path in signatures_dir.rglob("*.json"):
        try:
            with dump_path.open("r", encoding="utf-8") as handle:
                payload = json.load(handle)
        except Exception:
            # Ignore files that fail to parse; they will be regenerated later.
            continue
        pdf = payload.get("pdf")
        for font_record in payload.get("fonts", []):
            signature = font_record.get("signature")
            if not signature or signature in known:
                continue
            known[signature] = {
                "signature": signature,
                "alias": font_record.get("alias"),
                "baseName": font_record.get("baseName"),
                "glyphCount": font_record.get("glyphCount"),
                "glyphCoverage": font_record.get("glyphCoverage"),
                "samplePdf": pdf,
                "signatureJson": str(dump_path),
            }
    return known
def run_signature_tool(
    gradle_cmd: str, pdf: Path, output_path: Path, pretty: bool, cwd: Path
) -> None:
    """Invoke the Gradle-hosted Type3SignatureTool for a single PDF.

    Writes the tool's JSON output to *output_path* (parent directories are
    created first). Raises RuntimeError carrying the captured stderr when the
    Gradle invocation exits non-zero.

    NOTE(review): shlex.quote produces POSIX-style quoting; presumably the
    harvest runs on POSIX hosts, but with gradlew.bat on Windows the quoting
    may not survive cmd.exe -- confirm before relying on it there.
    """
    output_path.parent.mkdir(parents=True, exist_ok=True)
    args = f"--pdf {shlex.quote(str(pdf))} --output {shlex.quote(str(output_path))}"
    if pretty:
        args += " --pretty"
    # Use shell invocation so the quoted --args string is parsed correctly by Gradle.
    cmd = f"{gradle_cmd} -q :proprietary:type3SignatureTool --args=\"{args}\""
    completed = subprocess.run(
        cmd,
        shell=True,  # deliberate: --args must reach Gradle as one quoted token
        cwd=cwd,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True,
    )
    if completed.returncode != 0:
        raise RuntimeError(
            f"Gradle Type3SignatureTool failed for {pdf}:\n{completed.stderr.strip()}"
        )
def extract_fonts_from_payload(payload: dict) -> List[dict]:
    """Flatten a signature dump into per-font summary dicts.

    Fonts without a "signature" value are dropped; every surviving record is
    tagged with the originating PDF path from the payload.
    """
    source_pdf = payload.get("pdf")
    return [
        {
            "signature": record.get("signature"),
            "alias": record.get("alias"),
            "baseName": record.get("baseName"),
            "glyphCount": record.get("glyphCount"),
            "glyphCoverage": record.get("glyphCoverage"),
            "samplePdf": source_pdf,
        }
        for record in payload.get("fonts", [])
        if record.get("signature")
    ]
def write_report(report_path: Path, fonts_by_signature: Dict[str, dict]) -> None:
    """Write the harvest summary JSON listing every unique font signature.

    Fonts are emitted sorted by signature so successive runs produce stable,
    diff-friendly output. Parent directories are created as needed.
    """
    ordered = sorted(fonts_by_signature.values(), key=lambda entry: entry["signature"])
    # datetime.utcnow() is deprecated (and naive); use an aware UTC timestamp
    # and keep the original trailing-"Z" wire format.
    timestamp = dt.datetime.now(dt.timezone.utc).isoformat(timespec="seconds")
    report = {
        "generatedAt": timestamp.replace("+00:00", "Z"),
        "totalSignatures": len(ordered),
        "fonts": ordered,
    }
    report_path.parent.mkdir(parents=True, exist_ok=True)
    with report_path.open("w", encoding="utf-8") as handle:
        json.dump(report, handle, indent=2)
def main() -> None:
    """Harvest Type3 signatures from every discovered PDF and refresh the report.

    Cached signature JSON is reused unless --force is given. A cached file that
    fails to parse is re-harvested (previously it was warned about and then
    skipped forever, so a corrupt cache silently excluded its PDF).
    """
    args = parse_args()
    signatures_dir = Path(args.signatures_dir).resolve()
    report_path = Path(args.report).resolve()
    pdfs = discover_pdfs(args.input)
    known = collect_known_signatures(signatures_dir)
    newly_added: List[Tuple[str, str]] = []
    for pdf in pdfs:
        signature_path = derive_signature_path(pdf, signatures_dir)
        payload = None
        if signature_path.exists() and not args.force:
            try:
                payload = load_signature_file(signature_path)
            except Exception as exc:
                print(f"[WARN] Failed to parse cached signature {signature_path}: {exc}")
                payload = None
        if payload is None:
            # No usable cache (missing, corrupt, or --force): run the tool.
            try:
                run_signature_tool(args.gradle_cmd, pdf, signature_path, args.pretty, REPO_ROOT)
            except Exception as exc:
                print(f"[ERROR] Harvest failed for {pdf}: {exc}", file=sys.stderr)
                continue
            payload = load_signature_file(signature_path)
        if not payload:
            continue
        for font in extract_fonts_from_payload(payload):
            signature = font["signature"]
            if signature in known:
                continue
            font["signatureJson"] = str(signature_path)
            known[signature] = font
            newly_added.append((signature, pdf.name))
    write_report(report_path, known)
    print(
        f"Processed {len(pdfs)} PDFs. "
        f"Captured {len(newly_added)} new Type3 font signatures "
        f"(total unique signatures: {len(known)})."
    )
    if newly_added:
        print("New signatures:")
        for signature, sample in newly_added:
            print(f"  {signature} ({sample})")


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,75 @@
#!/usr/bin/env python3
"""Build a Type3 font catalogue from sample PDFs."""
import argparse
import json
import subprocess
from pathlib import Path
def run(cmd, cwd=None):
    """Run *cmd* (an argv list), returning captured stdout; raise on non-zero exit."""
    completed = subprocess.run(cmd, cwd=cwd, capture_output=True, text=True)
    if completed.returncode == 0:
        return completed.stdout
    raise RuntimeError(f"Command {' '.join(cmd)} failed: {completed.stderr}")
def parse_pdffonts(output):
    """Parse ``pdffonts`` output and return (font_name, encoding) for Type 3 fonts.

    pdffonts prints fixed columns: name, type, encoding, emb, sub, uni,
    object, ID. The type spans two tokens (e.g. "Type 3"), so after locating
    the "Type" token the subtype is the next token and the encoding the one
    after that.

    NOTE(review): whitespace-splitting assumes font names contain no spaces;
    embedded-space names would mis-parse -- confirm against real dumps.
    """
    lines = output.splitlines()
    entries = []
    # Skip the two header lines (column titles + dashed underline).
    for line in lines[2:]:
        if not line.strip():
            continue
        parts = line.split()
        if "Type" not in parts:
            continue
        idx = parts.index("Type")
        type_value = parts[idx + 1] if idx + 1 < len(parts) else ""
        if not type_value.startswith("3"):
            continue
        font_name = parts[0]
        # BUG FIX: the encoding is the column right after the type subtype;
        # parts[-2] was the PDF object number, not the encoding.
        encoding = parts[idx + 2] if idx + 2 < len(parts) else ""
        entries.append((font_name, encoding))
    return entries
def main():
    """Index Type3 fonts found in the sample PDFs into a JSON catalogue."""
    parser = argparse.ArgumentParser(description="Index Type3 fonts from sample PDFs")
    parser.add_argument(
        "--samples",
        default="app/core/src/main/resources/type3/samples",
        help="Directory containing sample PDFs",
    )
    parser.add_argument(
        "--output",
        default="app/core/src/main/resources/type3/catalogue.json",
        help="Output JSON file",
    )
    args = parser.parse_args()
    samples_dir = Path(args.samples)
    out_path = Path(args.output)
    out_path.parent.mkdir(parents=True, exist_ok=True)
    catalogue = []
    # Non-recursive scan: only top-level sample PDFs are indexed.
    for pdf in sorted(samples_dir.glob("*.pdf")):
        try:
            output = run(["pdffonts", str(pdf)])
        except Exception as exc:
            # pdffonts missing or the PDF unreadable -- skip and keep going.
            print(f"Skipping {pdf.name}: {exc}")
            continue
        for font_name, encoding in parse_pdffonts(output):
            catalogue.append(
                {
                    "source": pdf.name,
                    "fontName": font_name,
                    "encoding": encoding,
                }
            )
    with out_path.open("w", encoding="utf-8") as handle:
        json.dump(catalogue, handle, indent=2)
    print(f"Wrote {len(catalogue)} entries to {out_path}")


# Script entry point.
if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,97 @@
#!/usr/bin/env python3
"""
Summarize captured Type3 signature dumps as a Markdown inventory.
Usage:
scripts/summarize_type3_signatures.py \
--input docs/type3/signatures \
--output docs/type3/signature_inventory.md
"""
from __future__ import annotations
import argparse
import json
from collections import defaultdict
from pathlib import Path
from typing import Dict, List
def parse_args() -> argparse.Namespace:
    """Parse CLI options: --input signature directory and --output Markdown path."""
    parser = argparse.ArgumentParser(description="Summarize Type3 signature JSON dumps.")
    parser.add_argument(
        "--input",
        default="docs/type3/signatures",
        help="Directory containing signature JSON files (default: %(default)s)",
    )
    parser.add_argument(
        "--output",
        default="docs/type3/signature_inventory.md",
        help="Markdown file to write (default: %(default)s)",
    )
    return parser.parse_args()
def load_signatures(directory: Path) -> Dict[str, List[dict]]:
    """Group every captured font record by its lower-cased alias.

    Scans the top level of *directory* for ``*.json`` dumps in sorted order and
    returns alias -> list of per-font summary entries.
    """
    grouped: Dict[str, List[dict]] = defaultdict(list)
    for dump in sorted(directory.glob("*.json")):
        payload = json.loads(dump.read_text(encoding="utf-8"))
        source_pdf = payload.get("pdf") or dump.name
        for font in payload.get("fonts", []):
            alias = (font.get("alias") or font.get("baseName") or "unknown").lower()
            grouped[alias].append(
                {
                    "source": source_pdf,
                    "file": dump.name,
                    "alias": alias,
                    "baseName": font.get("baseName"),
                    "signature": font.get("signature"),
                    "glyphCount": font.get("glyphCount"),
                    "glyphCoverage": font.get("glyphCoverage"),
                }
            )
    return grouped
def write_markdown(inventory: Dict[str, List[dict]], output: Path, input_dir: Path) -> None:
    """Render the alias-grouped inventory as a Markdown document at *output*."""
    doc: List[str] = [
        "# Type3 Signature Inventory",
        "",
        f"_Generated from `{input_dir}`. "
        "Run `scripts/summarize_type3_signatures.py` after capturing new samples._",
        "",
    ]
    for alias in sorted(inventory):
        doc.append(f"## Alias: `{alias}`")
        doc.append("")
        doc.append("| Signature | Samples | Glyph Count | Coverage (first 10) |")
        doc.append("| --- | --- | --- | --- |")
        for entry in inventory[alias]:
            signature = entry.get("signature") or ""
            sample = Path(entry["source"]).name
            count = entry.get("glyphCount")
            glyph_count = count if count is not None else ""
            coverage = entry.get("glyphCoverage") or []
            preview = ", ".join(str(code) for code in coverage[:10])
            doc.append(f"| `{signature}` | `{sample}` | {glyph_count} | {preview} |")
        doc.append("")
    output.parent.mkdir(parents=True, exist_ok=True)
    output.write_text("\n".join(doc), encoding="utf-8")
def main() -> None:
    """CLI entry point: read signature dumps and write the Markdown inventory."""
    args = parse_args()
    input_dir = Path(args.input)
    if not input_dir.exists():
        raise SystemExit(f"Input directory not found: {input_dir}")
    inventory = load_signatures(input_dir)
    output_path = Path(args.output)
    write_markdown(inventory, output_path, input_dir)
    print(f"Wrote inventory for {len(inventory)} aliases to {output_path}")


# Script entry point.
if __name__ == "__main__":
    main()

481
scripts/type3_to_cff.py Normal file
View File

@@ -0,0 +1,481 @@
#!/usr/bin/env python3
"""
Convert Stirling PDF Type3 glyph JSON into synthesised fonts using fontTools.
The input JSON is expected to contain:
- fontId, pageNumber (optional metadata)
- fontMatrix: 3x3 matrix describing the Type3 glyph transform
- glyphs: array of glyph records with keys:
name, code, advanceWidth, bbox, unicode, outline (list of commands)
The script produces an OpenType CFF font and, when requested, a companion
TrueType font for web-preview usage. Only the fontTools package is required,
avoiding heavyweight build dependencies such as fontmake/ufoLib2.
"""
from __future__ import annotations
import argparse
import json
import math
import sys
from dataclasses import dataclass
from pathlib import Path
from typing import Dict, Iterable, List, Optional, Sequence, Tuple
from fontTools.fontBuilder import FontBuilder
from fontTools.misc.fixedTools import otRound
from fontTools.pens.cu2quPen import Cu2QuPen
from fontTools.pens.t2CharStringPen import T2CharStringPen
from fontTools.pens.ttGlyphPen import TTGlyphPen
Command = Dict[str, object]
Matrix = Tuple[float, float, float, float, float, float]
@dataclass
class GlyphSource:
    """One raw glyph record as read from the backend's glyph JSON."""

    name: str  # glyph name used in the generated font
    width: float  # raw advance width (validated later by resolve_width)
    unicode: Optional[int]  # mapped Unicode code point, if any
    char_code: Optional[int]  # original Type3 character code, if any
    outline: Sequence[Command]  # drawing commands (dicts with cmd M/L/C/Q/Z)
@dataclass
class GlyphBuildResult:
    """Per-glyph build artefacts shared by the OTF and TTF output paths."""

    name: str  # final glyph name
    width: int  # resolved, positive advance width
    charstring: object  # T2 charstring for the CFF table
    ttf_glyph: Optional[object]  # quadratic glyph for the glyf table, if built
    unicode: Optional[int]  # Unicode code point for the cmap, if any
    char_code: Optional[int]  # fallback character code for the cmap
    bounds: Optional[Tuple[float, float, float, float]]  # (xMin, yMin, xMax, yMax)
def parse_args() -> argparse.Namespace:
    """Parse CLI options controlling input JSON, output font paths and metrics."""
    parser = argparse.ArgumentParser(description="Synthesize fonts from Type3 glyph JSON.")
    parser.add_argument("--input", required=True, help="Path to glyph JSON emitted by the backend")
    parser.add_argument("--otf-output", required=True, help="Destination path for the CFF/OTF font")
    parser.add_argument("--ttf-output", help="Optional destination path for a TrueType font")
    parser.add_argument("--family-name", default="Type3 Synth", help="Family name for the output")
    parser.add_argument("--style-name", default="Regular", help="Style name for the output")
    parser.add_argument("--units-per-em", type=int, default=1000, help="Units per EM value")
    parser.add_argument("--cu2qu-error", type=float, default=1.0, help="Max error for cubic→quadratic conversion")
    return parser.parse_args()
def load_json(path: Path) -> Dict[str, object]:
    """Load the glyph JSON dump, exiting with status 2 on any read/parse failure."""
    try:
        raw = path.read_text(encoding="utf-8")
        return json.loads(raw)
    except Exception as exc:  # pragma: no cover - fatal configuration error
        print(f"ERROR: Failed to load glyph JSON '{path}': {exc}", file=sys.stderr)
        sys.exit(2)
def parse_font_matrix(rows: Optional[Iterable[Iterable[float]]]) -> Matrix:
    """
    Retrieve the raw 2×3 FontMatrix entries for diagnostics. Type3 glyph
    outlines in our extractor are emitted in their native coordinate system, so
    the returned matrix is currently informational only.

    Returns the identity matrix (1, 0, 0, 1, 0, 0) for missing or malformed
    input instead of raising.
    """
    identity: Matrix = (1.0, 0.0, 0.0, 1.0, 0.0, 0.0)
    if not rows:
        return identity
    values: List[List[float]] = []
    for row in rows:
        try:
            values.append([float(col) for col in row])
        except (TypeError, ValueError):
            return identity
    # Need three rows of at least two columns each. The original check missed
    # the third row's length, so values[2][1] could raise IndexError below.
    if len(values) < 3 or any(len(values[i]) < 2 for i in range(3)):
        return identity
    return (
        float(values[0][0]),
        float(values[0][1]),
        float(values[1][0]),
        float(values[1][1]),
        float(values[2][0]),
        float(values[2][1]),
    )
def resolve_width(raw_width: float, default: int) -> int:
    """Coerce a raw advance width to a positive int, falling back to *default*.

    Non-numeric, non-finite, and non-positive inputs (including values that
    round down to zero) all yield the fallback.
    """
    try:
        numeric = float(raw_width)
    except (TypeError, ValueError):
        return default
    if not math.isfinite(numeric):
        return default
    if numeric <= 0:
        return default
    rounded = otRound(numeric)
    if rounded > 0:
        return rounded
    return default
def quadratic_to_cubic(
    current: Tuple[float, float],
    ctrl: Tuple[float, float],
    end: Tuple[float, float],
) -> Tuple[Tuple[float, float], Tuple[float, float], Tuple[float, float]]:
    """
    Convert a quadratic Bézier segment to cubic control points.

    Standard degree elevation: each cubic control point lies two thirds of the
    way from its endpoint towards the quadratic control point.
    """
    two_thirds = 2.0 / 3.0
    first = (
        current[0] + two_thirds * (ctrl[0] - current[0]),
        current[1] + two_thirds * (ctrl[1] - current[1]),
    )
    second = (
        end[0] + two_thirds * (ctrl[0] - end[0]),
        end[1] + two_thirds * (ctrl[1] - end[1]),
    )
    return first, second, end
def iterate_glyphs(data: Dict[str, object]) -> List[GlyphSource]:
    """Validate and normalise raw glyph records into GlyphSource values.

    Invalid fields degrade gracefully: non-dict records are dropped, missing
    names become g<index>, bad widths become 1000.0, and out-of-range
    unicode/char codes become None.
    """
    sources: List[GlyphSource] = []
    for index, record in enumerate(data.get("glyphs") or [], start=1):
        if not isinstance(record, dict):
            continue
        name = record.get("name")
        if not isinstance(name, str) or not name:
            name = f"g{index}"
        width = record.get("advanceWidth")
        if not isinstance(width, (int, float)) or math.isnan(width):
            width = 1000.0
        unicode_value = record.get("unicode")
        if not isinstance(unicode_value, int) or unicode_value <= 0:
            unicode_value = None
        # Char code fallbacks: charCode, then code, then charCodeRaw.
        char_code = record.get("charCode")
        if not isinstance(char_code, int):
            char_code = record.get("code")
        if not isinstance(char_code, int):
            char_code = record.get("charCodeRaw")
        if not isinstance(char_code, int) or not (0 <= char_code <= 0x10FFFF):
            char_code = None
        outline = record.get("outline")
        if not isinstance(outline, list):
            outline = []
        sources.append(
            GlyphSource(
                name=name,
                width=float(width),
                unicode=unicode_value,
                char_code=char_code,
                outline=outline,
            )
        )
    return sources
def build_cff_charstring(
    glyph: GlyphSource,
    width: int,
) -> Tuple[object, Optional[Tuple[float, float, float, float]]]:
    """Draw the glyph outline into a Type2 charstring, tracking a bounding box.

    Returns (charstring, bbox); bbox is (xMin, yMin, xMax, yMax) or None when
    nothing was drawn. Unsupported outline commands are ignored silently.
    Quadratic segments are degree-elevated to cubics since CFF stores cubics.

    NOTE(review): Bézier control points are folded into the bbox, so it can be
    looser than the tight outline bounds -- confirm that is acceptable.
    """
    pen = T2CharStringPen(width=width, glyphSet=None)
    # Running [xMin, yMin, xMax, yMax]; starts inverted so any point updates it.
    bounds = [math.inf, math.inf, -math.inf, -math.inf]
    def update_bounds(point: Tuple[float, float]) -> None:
        # Grow the running box to contain *point*.
        x, y = point
        bounds[0] = min(bounds[0], x)
        bounds[1] = min(bounds[1], y)
        bounds[2] = max(bounds[2], x)
        bounds[3] = max(bounds[3], y)
    current: Optional[Tuple[float, float]] = None
    start_point: Optional[Tuple[float, float]] = None
    open_path = False
    for command in glyph.outline:
        if not isinstance(command, dict):
            continue
        op = command.get("cmd")
        if op == "M":
            # A new subpath implicitly ends any open one.
            if open_path:
                pen.endPath()
                open_path = False
            point = (float(command.get("x", 0.0)), float(command.get("y", 0.0)))
            pen.moveTo(point)
            update_bounds(point)
            current = point
            start_point = point
            open_path = True
        elif op == "L" and current is not None:
            # Missing coordinates fall back to the current position.
            point = (float(command.get("x", current[0])), float(command.get("y", current[1])))
            pen.lineTo(point)
            update_bounds(point)
            current = point
        elif op == "C" and current is not None:
            ctrl1 = (
                float(command.get("x1", current[0])),
                float(command.get("y1", current[1])),
            )
            ctrl2 = (
                float(command.get("x2", current[0])),
                float(command.get("y2", current[1])),
            )
            end = (
                float(command.get("x", current[0])),
                float(command.get("y", current[1])),
            )
            pen.curveTo(ctrl1, ctrl2, end)
            update_bounds(ctrl1)
            update_bounds(ctrl2)
            update_bounds(end)
            current = end
        elif op == "Q" and current is not None:
            ctrl = (
                float(command.get("x1", current[0])),
                float(command.get("y1", current[1])),
            )
            end = (
                float(command.get("x", current[0])),
                float(command.get("y", current[1])),
            )
            # CFF has no quadratic operator: elevate to an equivalent cubic.
            c1, c2, end_point = quadratic_to_cubic(current, ctrl, end)
            pen.curveTo(c1, c2, end_point)
            update_bounds(ctrl)
            update_bounds(end_point)
            current = end_point
        elif op == "Z" and open_path:
            pen.closePath()
            open_path = False
            # Closing returns the pen to the subpath start point.
            if start_point is not None:
                current = start_point
        # Ignore unsupported commands silently.
    if open_path:
        pen.endPath()
    charstring = pen.getCharString()
    bbox = None
    # Only report a bbox when at least one point was recorded.
    if bounds[0] <= bounds[2] and bounds[1] <= bounds[3]:
        bbox = (bounds[0], bounds[1], bounds[2], bounds[3])
    return charstring, bbox
def build_ttf_glyph(glyph: GlyphSource, max_error: float) -> Optional[object]:
    """Draw the glyph outline as a TrueType (quadratic) glyph.

    Cubic segments are approximated by quadratics via Cu2QuPen using
    *max_error*. Returns None when the pen fails to produce a glyph.

    NOTE(review): after "Z" this builder ignores L/C/Q until the next "M"
    (current_exists is False), while build_cff_charstring keeps drawing from
    the subpath start -- presumably harmless since well-formed outlines always
    restart with "M", but confirm against real inputs.
    """
    pen = TTGlyphPen(glyphSet=None)
    draw_pen = Cu2QuPen(pen, max_error, reverse_direction=False)
    current_exists = False  # True while a subpath is open
    for command in glyph.outline:
        if not isinstance(command, dict):
            continue
        op = command.get("cmd")
        if op == "M":
            x = float(command.get("x", 0.0))
            y = float(command.get("y", 0.0))
            draw_pen.moveTo((x, y))
            current_exists = True
        elif op == "L" and current_exists:
            x = float(command.get("x", 0.0))
            y = float(command.get("y", 0.0))
            draw_pen.lineTo((x, y))
        elif op == "C" and current_exists:
            ctrl1 = (float(command.get("x1", 0.0)), float(command.get("y1", 0.0)))
            ctrl2 = (float(command.get("x2", 0.0)), float(command.get("y2", 0.0)))
            end = (float(command.get("x", 0.0)), float(command.get("y", 0.0)))
            draw_pen.curveTo(ctrl1, ctrl2, end)
        elif op == "Q" and current_exists:
            ctrl = (float(command.get("x1", 0.0)), float(command.get("y1", 0.0)))
            end = (float(command.get("x", 0.0)), float(command.get("y", 0.0)))
            draw_pen.qCurveTo(ctrl, end)
        elif op == "Z" and current_exists:
            draw_pen.closePath()
            current_exists = False
    # An unterminated subpath is ended (left open) rather than closed.
    if current_exists:
        draw_pen.endPath()
    try:
        glyph_obj = pen.glyph()
    except Exception:
        # Degenerate outlines can make the pen fail; caller substitutes a stub.
        return None
    return glyph_obj
def synthesise_fonts(
    data: Dict[str, object],
    otf_output: Path,
    ttf_output: Optional[Path],
    family_name: str,
    style_name: str,
    units_per_em: int,
    cu2qu_error: float,
) -> None:
    """Build an OTF (CFF) font -- and optionally a TTF -- from glyph JSON.

    Vertical metrics are derived from the union of glyph bounding boxes, with
    fallbacks of 0.8em ascent / -0.2em descent. Glyphs are mapped by their
    Unicode value, then by raw char code, then by sequential private-use
    codepoints starting at U+F000. Raises RuntimeError when no glyphs exist.
    """
    # Informational only -- outlines are already in native glyph coordinates.
    _font_matrix = parse_font_matrix(data.get("fontMatrix"))
    glyphs = iterate_glyphs(data)
    results: List[GlyphBuildResult] = []
    global_y_min = math.inf
    global_y_max = -math.inf
    default_width = max(1, units_per_em // 2)
    for glyph in glyphs:
        width = resolve_width(glyph.width, default_width)
        charstring, bounds = build_cff_charstring(glyph, width)
        ttf_glyph = None
        if ttf_output is not None:
            ttf_glyph = build_ttf_glyph(glyph, cu2qu_error)
            if ttf_glyph is not None:
                ttf_glyph.width = width
        if bounds is not None:
            global_y_min = min(global_y_min, bounds[1])
            global_y_max = max(global_y_max, bounds[3])
        results.append(
            GlyphBuildResult(
                name=glyph.name,
                width=width,
                charstring=charstring,
                ttf_glyph=ttf_glyph,
                unicode=glyph.unicode,
                char_code=glyph.char_code,
                bounds=bounds,
            )
        )
    if not results:
        raise RuntimeError("No glyphs provided in input JSON")
    # Vertical metrics: observed extremes, falling back to conventional ratios.
    ascent = global_y_max if math.isfinite(global_y_max) else units_per_em * 0.8
    descent = global_y_min if math.isfinite(global_y_min) else -units_per_em * 0.2
    ascent = otRound(ascent)
    descent = otRound(descent)
    if ascent <= 0:
        ascent = otRound(units_per_em * 0.8)
    if descent >= 0:
        descent = -otRound(units_per_em * 0.2)
    glyph_order = [".notdef"] + [result.name for result in results]
    horizontal_metrics = {result.name: (result.width, 0) for result in results}
    horizontal_metrics[".notdef"] = (default_width, 0)
    # cmap priority: explicit unicode, then raw char code, then PUA fallback.
    cmap: Dict[int, str] = {}
    next_private = 0xF000
    for result in results:
        code_point = result.unicode
        if code_point is None:
            raw_code = result.char_code
            if raw_code is not None:
                code_point = raw_code
            else:
                code_point = next_private
                next_private += 1
        cmap[code_point] = result.name
    # Empty .notdef charstring at the default advance width.
    notdef_pen = T2CharStringPen(width=default_width, glyphSet=None)
    notdef_pen.endPath()
    charstrings = {result.name: result.charstring for result in results}
    charstrings[".notdef"] = notdef_pen.getCharString()
    name_table_entries = {
        "familyName": family_name,
        "styleName": style_name,
        "psName": f"{family_name.replace(' ', '')}-{style_name}",
        "fullName": f"{family_name} {style_name}",
    }
    # Build OTF (CFF) font.
    fb = FontBuilder(units_per_em, isTTF=False)
    fb.setupGlyphOrder(glyph_order)
    fb.setupCharacterMap(cmap)
    fb.setupHorizontalMetrics(horizontal_metrics)
    fb.setupHorizontalHeader(ascent=ascent, descent=descent)
    fb.setupOS2(
        sTypoAscender=ascent,
        sTypoDescender=descent,
        usWinAscent=max(ascent, 0),
        usWinDescent=abs(min(descent, 0)),
        sxHeight=otRound(units_per_em * 0.5),
        sCapHeight=otRound(units_per_em * 0.7),
    )
    fb.setupNameTable(name_table_entries)
    fb.setupPost()
    fb.setupCFF(
        name_table_entries["psName"],
        {
            "FullName": name_table_entries["fullName"],
            "FamilyName": name_table_entries["familyName"],
            "Weight": style_name,
        },
        charstrings,
        {"BlueValues": []},
    )
    fb.font.save(str(otf_output))
    if ttf_output is None:
        return
    # Build the companion TTF; glyphs that failed quadratic conversion get a
    # degenerate placeholder outline so the glyf table stays complete.
    glyph_objects: Dict[str, object] = {}
    empty_pen = TTGlyphPen(None)
    empty_pen.moveTo((0, 0))
    empty_pen.lineTo((0, 0))
    empty_pen.closePath()
    empty_glyph = empty_pen.glyph()
    empty_glyph.width = default_width
    glyph_objects[".notdef"] = empty_glyph
    for result in results:
        glyph_obj = result.ttf_glyph
        if glyph_obj is None:
            temp_pen = TTGlyphPen(None)
            temp_pen.moveTo((0, 0))
            temp_pen.lineTo((0, 0))
            temp_pen.closePath()
            glyph_obj = temp_pen.glyph()
            glyph_obj.width = result.width
        glyph_objects[result.name] = glyph_obj
    ttf_fb = FontBuilder(units_per_em, isTTF=True)
    ttf_fb.setupGlyphOrder(glyph_order)
    ttf_fb.setupCharacterMap(cmap)
    ttf_fb.setupHorizontalMetrics(horizontal_metrics)
    ttf_fb.setupHorizontalHeader(ascent=ascent, descent=descent)
    ttf_fb.setupOS2(
        sTypoAscender=ascent,
        sTypoDescender=descent,
        usWinAscent=max(ascent, 0),
        usWinDescent=abs(min(descent, 0)),
        sxHeight=otRound(units_per_em * 0.5),
        sCapHeight=otRound(units_per_em * 0.7),
    )
    ttf_fb.setupNameTable(name_table_entries)
    ttf_fb.setupPost()
    ttf_fb.setupGlyf(glyph_objects)
    ttf_fb.setupDummyDSIG()
    ttf_fb.font.save(str(ttf_output))
def main() -> None:
    """CLI entry point: load glyph JSON and synthesise the requested fonts.

    On failure, partially written outputs are removed before exiting 1 so a
    broken font never survives on disk.
    """
    args = parse_args()
    input_path = Path(args.input).resolve()
    otf_output = Path(args.otf_output).resolve()
    ttf_output = Path(args.ttf_output).resolve() if args.ttf_output else None
    data = load_json(input_path)
    try:
        synthesise_fonts(
            data=data,
            otf_output=otf_output,
            ttf_output=ttf_output,
            family_name=args.family_name,
            style_name=args.style_name,
            units_per_em=args.units_per_em,
            cu2qu_error=args.cu2qu_error,
        )
    except Exception as exc:
        print(f"ERROR: Failed to generate fonts: {exc}", file=sys.stderr)
        # Clean up partial artefacts so callers never see a half-written font.
        if otf_output.exists():
            otf_output.unlink()
        if ttf_output and ttf_output.exists():
            ttf_output.unlink()
        sys.exit(1)
    # Status goes to stderr so stdout stays clean for tooling.
    message = f"Generated font at {otf_output}"
    if ttf_output:
        message += f" and {ttf_output}"
    print(message, file=sys.stderr)


# Script entry point.
if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,243 @@
#!/usr/bin/env python3
"""
Synchronize Type3 library index entries with captured signature dumps.
The script scans docs/type3/signatures/*.json (or a custom --signatures-dir),
matches each font by alias/signature to app/core/src/main/resources/type3/library/index.json,
and updates the entry's signatures / glyphCoverage / aliases / source fields.
Usage:
scripts/update_type3_library.py --apply
Run without --apply to see a dry-run summary.
"""
from __future__ import annotations
import argparse
import json
import sys
from collections import defaultdict
from pathlib import Path
from typing import Dict, Iterable, List, Optional, Tuple
REPO_ROOT = Path(__file__).resolve().parents[1]
DEFAULT_SIGNATURES = REPO_ROOT / "docs" / "type3" / "signatures"
DEFAULT_INDEX = (
REPO_ROOT / "app" / "core" / "src" / "main" / "resources" / "type3" / "library" / "index.json"
)
def normalize_alias(value: Optional[str]) -> Optional[str]:
    """Lower-case an alias and strip a subset prefix ("ABCDEF+Name" -> "name").

    Returns None for empty/blank input or when nothing remains after
    normalisation. A trailing "+" is kept since there is no suffix to keep.
    """
    if not value:
        return None
    trimmed = value.strip()
    plus = trimmed.find("+")
    if 0 <= plus < len(trimmed) - 1:
        trimmed = trimmed[plus + 1 :]
    return trimmed.lower() or None
def load_json(path: Path):
    """Parse a UTF-8 JSON file and return the decoded value."""
    return json.loads(path.read_text(encoding="utf-8"))
def dump_json(path: Path, data) -> None:
    """Write *data* as 2-space-indented JSON with a trailing newline."""
    path.parent.mkdir(parents=True, exist_ok=True)
    serialized = json.dumps(data, indent=2) + "\n"
    path.write_text(serialized, encoding="utf-8")
def iter_signature_fonts(signature_file: Path):
    """Yield one normalised record per font found in a signature dump."""
    payload = load_json(signature_file)
    pdf_source = payload.get("pdf")
    for font in payload.get("fonts", []):
        raw_alias = font.get("alias") or font.get("baseName")
        normalized = normalize_alias(raw_alias) or normalize_alias(font.get("baseName"))
        record = {
            "alias_raw": raw_alias,
            "alias": normalized,
            "baseName": font.get("baseName"),
            "signature": font.get("signature"),
            "glyphCoverage": font.get("glyphCoverage") or [],
            "pdf": pdf_source,
            "file": signature_file,
        }
        yield record
def make_alias_index(entries: List[Dict]) -> Tuple[Dict[str, Dict], Dict[str, Dict]]:
    """Build lookup tables mapping normalised aliases and signatures to entries.

    First-seen entries win (setdefault), so a duplicated alias or signature
    keeps its original owner.
    """
    by_alias: Dict[str, Dict] = {}
    by_signature: Dict[str, Dict] = {}
    for entry in entries:
        for alias in entry.get("aliases", []) or []:
            normalized = normalize_alias(alias)
            if normalized:
                by_alias.setdefault(normalized, entry)
        label_alias = normalize_alias(entry.get("label"))
        if label_alias:
            by_alias.setdefault(label_alias, entry)
        for signature in entry.get("signatures", []) or []:
            by_signature.setdefault(signature.lower(), entry)
    return by_alias, by_signature
def ensure_list(container: Dict, key: str) -> List:
    """Return container[key] as a list, installing a fresh one when absent.

    Any existing non-list value is replaced by an empty list.
    """
    existing = container.get(key)
    if isinstance(existing, list):
        return existing
    fresh: List = []
    container[key] = fresh
    return fresh
def merge_sorted_unique(values: Iterable[int]) -> List[int]:
    """Return the distinct int members of *values* in ascending order; non-ints are dropped."""
    ints = {int(item) for item in values if isinstance(item, int)}
    return sorted(ints)
def normalize_source_path(pdf_path: Optional[str]) -> Optional[str]:
    """Render a PDF path repo-relative (when possible) with forward slashes."""
    if not pdf_path:
        return None
    try:
        candidate = Path(pdf_path).relative_to(REPO_ROOT)
    except Exception:
        # Outside the repo (or not relativisable): keep the path as given.
        candidate = Path(pdf_path)
    return str(candidate).replace("\\", "/")
def update_library(
    signatures_dir: Path, index_path: Path, apply_changes: bool
) -> Tuple[int, int, List[Tuple[str, Path]]]:
    """Merge captured signature dumps into the library index.

    Each font from *signatures_dir* is matched to an index entry by signature
    first, then by normalised alias. Matched entries absorb new signatures,
    aliases, glyph coverage, and (only if absent) a source PDF path. The index
    file is rewritten only when *apply_changes* is True and something changed.

    Returns (modifications, updated_entry_count, unmatched) where unmatched
    lists fonts that have no library entry yet.
    """
    entries = load_json(index_path)
    alias_index, signature_index = make_alias_index(entries)
    modifications = 0
    updated_entries = set()
    unmatched: List[Tuple[str, Path]] = []
    signature_files = sorted(signatures_dir.glob("*.json"))
    if not signature_files:
        print(f"No signature JSON files found under {signatures_dir}", file=sys.stderr)
        return 0, 0, unmatched
    for sig_file in signature_files:
        for font in iter_signature_fonts(sig_file):
            signature = font["signature"]
            norm_signature = signature.lower() if signature else None
            alias = font["alias"]
            entry = None
            # Signature match is authoritative; alias match is the fallback.
            if norm_signature and norm_signature in signature_index:
                entry = signature_index[norm_signature]
            elif alias and alias in alias_index:
                entry = alias_index[alias]
            if entry is None:
                unmatched.append((font.get("baseName") or font.get("alias_raw") or "unknown", sig_file))
                continue
            entry_modified = False
            # Signatures
            if signature:
                signature_list = ensure_list(entry, "signatures")
                if signature not in signature_list:
                    signature_list.append(signature)
                    entry_modified = True
                    # Later fonts with this signature resolve to the same entry.
                    signature_index[signature.lower()] = entry
            # Aliases
            alias_raw = font.get("alias_raw")
            if alias_raw:
                aliases = ensure_list(entry, "aliases")
                if alias_raw not in aliases:
                    aliases.append(alias_raw)
                    entry_modified = True
                    normalized = normalize_alias(alias_raw)
                    if normalized:
                        alias_index.setdefault(normalized, entry)
            # Glyph coverage
            coverage = font.get("glyphCoverage") or []
            if coverage:
                existing = set(entry.get("glyphCoverage", []))
                merged = merge_sorted_unique(list(existing) + coverage)
                if merged != entry.get("glyphCoverage"):
                    entry["glyphCoverage"] = merged
                    entry_modified = True
            # Source PDF
            pdf_source = normalize_source_path(font.get("pdf"))
            if pdf_source and not entry.get("source"):
                entry["source"] = pdf_source
                entry_modified = True
            if entry_modified:
                # One "modification" is counted per changed font record.
                modifications += 1
                updated_entries.add(entry.get("id", "<unknown>"))
    if apply_changes and modifications > 0:
        dump_json(index_path, entries)
    return modifications, len(updated_entries), unmatched
def parse_args() -> argparse.Namespace:
    """Parse CLI options: signatures dir, index path, and the --apply switch."""
    parser = argparse.ArgumentParser(description="Update Type3 library index using signature dumps.")
    parser.add_argument(
        "--signatures-dir",
        type=Path,
        default=DEFAULT_SIGNATURES,
        help=f"Directory containing signature JSON files (default: {DEFAULT_SIGNATURES})",
    )
    parser.add_argument(
        "--index",
        type=Path,
        default=DEFAULT_INDEX,
        help=f"Path to type3/library/index.json (default: {DEFAULT_INDEX})",
    )
    parser.add_argument(
        "--apply",
        action="store_true",
        help="Write changes back to the index file. Without this flag the script runs in dry-run mode.",
    )
    return parser.parse_args()
def main() -> None:
    """CLI entry point: validate paths, run the merge, and print a summary."""
    args = parse_args()
    # Relative paths are anchored at the repo root, not the current directory.
    signatures_dir = args.signatures_dir if args.signatures_dir.is_absolute() else (REPO_ROOT / args.signatures_dir)
    index_path = args.index if args.index.is_absolute() else (REPO_ROOT / args.index)
    if not signatures_dir.exists():
        print(f"Signature directory not found: {signatures_dir}", file=sys.stderr)
        sys.exit(2)
    if not index_path.exists():
        print(f"Index file not found: {index_path}", file=sys.stderr)
        sys.exit(2)
    modifications, updated_entries, unmatched = update_library(
        signatures_dir, index_path, apply_changes=args.apply
    )
    mode = "APPLIED" if args.apply else "DRY-RUN"
    print(
        f"[{mode}] Processed signatures under {signatures_dir}. "
        f"Updated entries: {updated_entries}, individual modifications: {modifications}."
    )
    if unmatched:
        print("\nUnmatched fonts (no library entry yet):")
        for alias, sig_file in unmatched:
            print(f"  - {alias} (from {sig_file})")
        print("Add these fonts to index.json with the proper payload before rerunning.")
    if modifications == 0:
        print("No changes detected; index.json already matches captured signatures.")


# Script entry point.
if __name__ == "__main__":
    main()