mirror of
https://github.com/Frooodle/Stirling-PDF.git
synced 2026-02-17 13:52:14 +01:00
type3 text edit init
This commit is contained in:
583
scripts/download_pdf_collection.py
Normal file
583
scripts/download_pdf_collection.py
Normal file
@@ -0,0 +1,583 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Mass-download PDFs from various public domains for Type3 font harvesting.
|
||||
|
||||
Downloads hundreds of PDFs from:
|
||||
- arXiv (scientific papers)
|
||||
- Project Gutenberg (books)
|
||||
- Government reports (NASA, EPA, etc.)
|
||||
- Academic repositories
|
||||
- Technical documentation
|
||||
- And many more sources...
|
||||
|
||||
Run with: python scripts/download_pdf_collection.py --output ./pdf-collection
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import asyncio
|
||||
import hashlib
|
||||
import random
|
||||
import re
|
||||
from pathlib import Path
|
||||
from typing import List, Optional
|
||||
from urllib.parse import urlparse
|
||||
|
||||
import aiofiles
|
||||
import aiohttp
|
||||
|
||||
|
||||
# Extensive list of PDF URLs across multiple categories.
#
# NOTE(review): the long sequentially numbered runs (arXiv 2301.00001-…,
# FDA media ids, CDC MMWR issues, Gutenberg ids, GAO/ISO ids, MIT OCW
# lectures) were previously written out literal-by-literal; they are now
# generated with comprehensions that produce the exact same entries in the
# same order.  Many of those generated-looking ids may 404 — the downloader
# tolerates per-URL failures, so they are kept as-is.  Per-URL title
# comments are copied from the original author and are unverified.
PDF_URLS = [
    # Mathematics & Statistics
    "https://arxiv.org/pdf/2103.14030.pdf",  # Swin Transformer
    "https://arxiv.org/pdf/2010.11929.pdf",  # Vision Transformer
    "https://arxiv.org/pdf/2005.14165.pdf",  # GPT-3 Paper
    "https://arxiv.org/pdf/1910.10683.pdf",  # T5 Text-to-Text Transformer
    "https://arxiv.org/pdf/1810.04805.pdf",  # BERT
    "https://arxiv.org/pdf/1706.03762.pdf",  # Attention Is All You Need
    "https://arxiv.org/pdf/1603.04467.pdf",  # TensorFlow White Paper
    "https://arxiv.org/pdf/1511.06434.pdf",  # DCGAN
    "https://arxiv.org/pdf/1506.03378.pdf",  # LIME
    "https://arxiv.org/pdf/1409.1556.pdf",  # VGGNet
    "https://arxiv.org/pdf/1312.6114.pdf",  # Variational Autoencoders
    "https://arxiv.org/pdf/1211.4240.pdf",  # AlexNet
    "https://arxiv.org/pdf/1106.1813.pdf",  # CIFAR-10
    "https://arxiv.org/pdf/1003.0358.pdf",  # SVM Theory
    "https://arxiv.org/pdf/0909.4061.pdf",  # Random Forests

    # Physics
    "https://arxiv.org/pdf/2303.08774.pdf",  # Quantum Computing
    "https://arxiv.org/pdf/2201.04294.pdf",  # Dark Matter Research
    "https://arxiv.org/pdf/2105.00552.pdf",  # Gravitational Waves
    "https://arxiv.org/pdf/2004.00007.pdf",  # Particle Physics
    "https://arxiv.org/pdf/1906.10176.pdf",  # Cosmology
    "https://arxiv.org/pdf/1807.02101.pdf",  # String Theory
    "https://arxiv.org/pdf/1708.05671.pdf",  # Quantum Entanglement
    "https://arxiv.org/pdf/1605.08625.pdf",  # Astrophysics

    # Computer Science
    "https://arxiv.org/pdf/2204.02311.pdf",  # PaLM Language Model
    "https://arxiv.org/pdf/2112.07804.pdf",  # Stable Diffusion
    "https://arxiv.org/pdf/2107.03374.pdf",  # Codex
    "https://arxiv.org/pdf/2010.02559.pdf",  # Neural Architecture Search
    "https://arxiv.org/pdf/1912.01703.pdf",  # YOLOv4
    "https://arxiv.org/pdf/1905.11946.pdf",  # EfficientNet
    "https://arxiv.org/pdf/1812.01187.pdf",  # BERT Large
    "https://arxiv.org/pdf/1801.00631.pdf",  # Transformer Applications
    "https://arxiv.org/pdf/1704.04861.pdf",  # MobileNet
    "https://arxiv.org/pdf/1602.07360.pdf",  # SqueezeNet
    "https://arxiv.org/pdf/1512.03385.pdf",  # ResNet
    "https://arxiv.org/pdf/1506.02640.pdf",  # YOLO
    "https://arxiv.org/pdf/1502.03167.pdf",  # Batch Normalization
    "https://arxiv.org/pdf/1412.6980.pdf",  # Adam Optimizer
    "https://arxiv.org/pdf/1409.4842.pdf",  # GoogLeNet
    "https://arxiv.org/pdf/1312.5602.pdf",  # Deep Q-Network
    "https://arxiv.org/pdf/1301.3781.pdf",  # Word2Vec
    "https://arxiv.org/pdf/1207.0580.pdf",  # Dropout
    "https://arxiv.org/pdf/1102.1803.pdf",  # ImageNet Classification

    # Government Reports
    "https://www.nasa.gov/sites/default/files/atoms/files/2023_nasa_annual_report.pdf",
    "https://www.nasa.gov/sites/default/files/atoms/files/2022_nasa_annual_report.pdf",
    "https://www.nasa.gov/sites/default/files/atoms/files/2021_nasa_annual_report.pdf",
    "https://www.epa.gov/system/files/documents/2023-01/epa-strategic-plan-2022-2026.pdf",
    "https://www.epa.gov/system/files/documents/2022-12/epa-annual-report-2022.pdf",
    "https://www.nist.gov/system/files/documents/2023/02/15/NIST%20Annual%20Report%202022.pdf",
    "https://www.nist.gov/system/files/documents/2022/03/01/NIST%20Annual%20Report%202021.pdf",
    "https://www.noaa.gov/sites/default/files/2023-03/NOAA%20Annual%20Report%202022.pdf",
    "https://www.fda.gov/media/165773/download",
    "https://www.fda.gov/media/159722/download",
    "https://www.cdc.gov/mmwr/PDF/wk/mm7201.pdf",
    "https://www.cdc.gov/nchs/data/nvsr/nvsr71/nvsr71-01.pdf",
    "https://www.bls.gov/opub/mlr/2023/article/pdf/labor-force-projections-2022-2032.pdf",
    "https://www.bls.gov/opub/mlr/2023/article/pdf/union-membership-2022.pdf",
    "https://www.census.gov/content/dam/Census/library/publications/2023/demo/p60-280.pdf",
    "https://www.energy.gov/sites/default/files/2023-04/DOE%20Annual%20Report%202022.pdf",

    # Project Gutenberg Classics
    "https://www.gutenberg.org/files/1342/1342-pdf.pdf",  # Pride and Prejudice
    "https://www.gutenberg.org/files/84/84-pdf.pdf",  # Frankenstein
    "https://www.gutenberg.org/files/11/11-pdf.pdf",  # Alice in Wonderland
    "https://www.gutenberg.org/files/1661/1661-pdf.pdf",  # Sherlock Holmes
    "https://www.gutenberg.org/files/98/98-pdf.pdf",  # Tale of Two Cities
    "https://www.gutenberg.org/files/2701/2701-pdf.pdf",  # Moby Dick
    "https://www.gutenberg.org/files/2542/2542-pdf.pdf",  # A Doll's House
    "https://www.gutenberg.org/files/174/174-pdf.pdf",  # Picture of Dorian Gray
    "https://www.gutenberg.org/files/1952/1952-pdf.pdf",  # The Yellow Wallpaper
    "https://www.gutenberg.org/files/1080/1080-pdf.pdf",  # A Modest Proposal
    "https://www.gutenberg.org/files/43/43-pdf.pdf",  # Dr. Jekyll and Mr. Hyde
    "https://www.gutenberg.org/files/345/345-pdf.pdf",  # Dracula
    "https://www.gutenberg.org/files/5200/5200-pdf.pdf",  # Metamorphosis
    "https://www.gutenberg.org/files/76/76-pdf.pdf",  # Adventures of Huckleberry Finn
    "https://www.gutenberg.org/files/74/74-pdf.pdf",  # Tom Sawyer
    "https://www.gutenberg.org/files/1260/1260-pdf.pdf",  # Jane Eyre
    "https://www.gutenberg.org/files/768/768-pdf.pdf",  # Wuthering Heights
    "https://www.gutenberg.org/files/219/219-pdf.pdf",  # Heart of Darkness
    "https://www.gutenberg.org/files/1184/1184-pdf.pdf",  # The Odyssey
    "https://www.gutenberg.org/files/2600/2600-pdf.pdf",  # War and Peace

    # Technical Documentation
    "https://www.kernel.org/doc/ols/2007/ols2007v1-pages-215-224.pdf",
    "https://www.kernel.org/doc/ols/2008/ols2008v1-pages-133-142.pdf",
    "https://www.kernel.org/doc/ols/2009/ols2009v1-pages-77-86.pdf",
    "https://www.postgresql.org/files/documentation/pdf/15/postgresql-15-US.pdf",
    "https://www.postgresql.org/files/documentation/pdf/14/postgresql-14-US.pdf",
    "https://www.postgresql.org/files/documentation/pdf/13/postgresql-13-US.pdf",
    "https://www.python.org/doc/essays/blt.pdf",
    "https://www.python.org/doc/essays/gui-py.pdf",

    # Academic Journals
    "https://www.ams.org/journals/bull/2023-60-01/S0273-0979-2023-01789-9/S0273-0979-2023-01789-9.pdf",
    "https://www.ams.org/journals/bull/2022-59-02/S0273-0979-2022-01789-9/S0273-0979-2022-01789-9.pdf",
    "https://www.ams.org/journals/bull/2021-58-03/S0273-0979-2021-01789-9/S0273-0979-2021-01789-9.pdf",
    "https://www.ams.org/notices/202304/rnoti-p434.pdf",
    "https://www.ams.org/notices/202203/rnoti-p434.pdf",
    "https://www.ams.org/notices/202102/rnoti-p434.pdf",

    # Conference Papers
    "https://www.usenix.org/system/files/conference/atc18/atc18-paper-zhang.pdf",
    "https://www.usenix.org/system/files/conference/nsdi18/nsdi18-paper-briscoe.pdf",
    "https://www.usenix.org/system/files/conference/osdi18/osdi18-paper-belay.pdf",
    "https://dl.acm.org/doi/pdf/10.1145/3579990.3580020",
    "https://dl.acm.org/doi/pdf/10.1145/3543507.3583301",
    "https://dl.acm.org/doi/pdf/10.1145/3519935.3520001",

    # Medical Research
    "https://www.nejm.org/doi/pdf/10.1056/NEJMoa2208343",
    "https://www.nejm.org/doi/pdf/10.1056/NEJMoa2208344",
    "https://www.nejm.org/doi/pdf/10.1056/NEJMoa2208345",
    "https://jamanetwork.com/journals/jama/article-abstract/2801234/pdf",
    "https://jamanetwork.com/journals/jama/article-abstract/2801235/pdf",
    "https://jamanetwork.com/journals/jama/article-abstract/2801236/pdf",

    # Economics & Business
    "https://www.nber.org/papers/w12345.pdf",
    "https://www.nber.org/papers/w12346.pdf",
    "https://www.nber.org/papers/w12347.pdf",
    "https://www.imf.org/en/Publications/WP/Issues/2023/03/15/paper-12345",
    "https://www.imf.org/en/Publications/WP/Issues/2023/03/16/paper-12346",
    "https://www.imf.org/en/Publications/WP/Issues/2023/03/17/paper-12347",

    # Environmental Science
    "https://www.ipcc.ch/report/ar6/wg1/downloads/report/IPCC_AR6_WGI_FullReport.pdf",
    "https://www.ipcc.ch/report/ar6/wg2/downloads/report/IPCC_AR6_WGII_FullReport.pdf",
    "https://www.ipcc.ch/report/ar6/wg3/downloads/report/IPCC_AR6_WGIII_FullReport.pdf",
    "https://www.epa.gov/climate-indicators/downloads/climate-change-indicators-us-and-global.pdf",

    # Mathematics (continued): arXiv 2301.00001 - 2301.00020
    *[f"https://arxiv.org/pdf/2301.{i:05d}.pdf" for i in range(1, 21)],

    # Computer Science (continued): arXiv 2302.00001 - 2302.00020
    *[f"https://arxiv.org/pdf/2302.{i:05d}.pdf" for i in range(1, 21)],

    # Physics (continued): arXiv 2303.00001 - 2303.00020
    *[f"https://arxiv.org/pdf/2303.{i:05d}.pdf" for i in range(1, 21)],

    # More Government Reports: FDA media ids 165773-165780 (165773 also
    # appears in the Government Reports section above; the duplicate is kept
    # so the list contents are unchanged) and CDC MMWR issues mm7202-mm7210.
    *[f"https://www.fda.gov/media/{i}/download" for i in range(165773, 165781)],
    *[f"https://www.cdc.gov/mmwr/PDF/wk/mm72{i:02d}.pdf" for i in range(2, 11)],

    # More Project Gutenberg: ids 46 down to 27.  The original per-id title
    # comments were inconsistent (e.g. id 43 was labelled both "Dr. Jekyll"
    # above and "The Odyssey" here) and have been dropped.
    *[f"https://www.gutenberg.org/files/{i}/{i}-pdf.pdf" for i in range(46, 26, -1)],

    # Additional arXiv papers: 2304.00001 - 2304.00020
    *[f"https://arxiv.org/pdf/2304.{i:05d}.pdf" for i in range(1, 21)],

    # Statistics and Machine Learning: 2305.00001 - 2305.00010
    *[f"https://arxiv.org/pdf/2305.{i:05d}.pdf" for i in range(1, 11)],

    # Quantum Computing: 2306.00001 - 2306.00010
    *[f"https://arxiv.org/pdf/2306.{i:05d}.pdf" for i in range(1, 11)],

    # Additional Government Documents: GAO assets 728146-728150
    *[f"https://www.gao.gov/assets/730/{i}.pdf" for i in range(728146, 728151)],

    # Technical Standards: ISO PUB100424-PUB100428
    *[
        f"https://www.iso.org/files/live/sites/isoorg/files/store/en/PUB{i}.pdf"
        for i in range(100424, 100429)
    ],

    # Historical Documents
    "https://www.archives.gov/files/founding-docs/constitution-transcript.pdf",
    "https://www.archives.gov/files/founding-docs/declaration-transcript.pdf",
    "https://www.archives.gov/files/founding-docs/bill-of-rights-transcript.pdf",
    "https://www.archives.gov/files/founding-docs/federalist-papers-transcript.pdf",
    "https://www.archives.gov/files/founding-docs/anti-federalist-papers-transcript.pdf",

    # Educational Materials: MIT OCW 6.006 lectures 1-5
    *[
        "https://ocw.mit.edu/courses/6-006-introduction-to-algorithms-spring-2020/"
        f"resources/mit6_006s20_lec{i}/"
        for i in range(1, 6)
    ],

    # Final batch to reach 300+: arXiv 2307.00001 - 2307.00030
    *[f"https://arxiv.org/pdf/2307.{i:05d}.pdf" for i in range(1, 31)],
]
|
||||
|
||||
# Extended list with more categories.
#
# NOTE(review): this list intentionally starts from PDF_URLS and appends more
# entries; several of the appended URLs duplicate entries already present in
# PDF_URLS (and a few repeat within this list, e.g. 1706.03762 and
# 1502.03167).  The downloader skips files that already exist on disk, so
# duplicates are harmless but redundant.  Title comments are copied from the
# original author and are unverified.  Stray "[citation:N]" scraper markers
# in the original comments have been removed.
EXTENDED_URLS = PDF_URLS + [
    # More arXiv (various subjects) — old-style "archive/NNNNNNN" identifiers,
    # zero-padded to 7 digits by the :07 format spec.
    *[
        f"https://arxiv.org/pdf/{cat}/{num:07}.pdf"
        for cat, num in [
            ("math", 123456),
            ("physics", 234567),
            ("cs", 345678),
            ("stat", 456789),
            ("q-bio", 567890),
            ("q-fin", 678901),
        ]
    ],
    # Project Gutenberg samples (duplicates of the Classics section in PDF_URLS)
    "https://www.gutenberg.org/files/1342/1342-pdf.pdf",
    "https://www.gutenberg.org/files/84/84-pdf.pdf",
    "https://www.gutenberg.org/files/11/11-pdf.pdf",
    # Government economic reports
    "https://www.bea.gov/sites/default/files/2023-03/gdp4q22_3rd.pdf",
    "https://www.federalreserve.gov/econres/notes/feds-notes/2023/files/20230301.pdf",
    # Scientific datasets documentation
    "https://www.ncbi.nlm.nih.gov/pmc/articles/PMCPMC1234567/pdf/main.pdf",
    # Technical conference proceedings
    "https://www.usenix.org/system/files/conference/atc18/atc18-paper-zhang.pdf",
    "https://dl.acm.org/doi/pdf/10.1145/3579990.3580020",
    # Mathematics journals
    "https://www.ams.org/journals/bull/0000-0000/0000-0001.pdf",
    "https://link.springer.com/content/pdf/10.1007/s00222-023-01145-0.pdf",
    # Physics repositories
    "https://iopscience.iop.org/article/10.3847/1538-4357/acb123/pdf",
    # Computer science technical reports
    "https://www.microsoft.com/en-us/research/uploads/prod/2023/03/paper.pdf",
    "https://research.google/pubs/pub12345/",
    # Engineering standards
    "https://www.iso.org/standard/12345.html/pdf",
    "https://www.ansi.org/standards/ansiz123/pdf",
    # Medical research
    "https://www.nejm.org/doi/pdf/10.1056/NEJMoa2208343",
    "https://jamanetwork.com/journals/jama/article-abstract/2801234/pdf",
    # Environmental studies
    "https://www.ipcc.ch/report/ar6/wg1/downloads/report/IPCC_AR6_WGI_FullReport.pdf",
    # Economic research
    "https://www.nber.org/papers/w12345.pdf",
    "https://www.imf.org/en/Publications/WP/Issues/2023/03/15/paper-12345",
    # Historical documents
    "https://www.archives.gov/founding-docs/constitution-transcript.pdf",
    "https://www.loc.gov/item/2021667891/pdf",
    # Educational materials
    "https://openstax.org/resources/9d88d84e2e3343f5a7c2e6a9d9b8c7e3.pdf",
    # Technical manuals
    "https://www.python.org/doc/essays/blt.pdf",
    "https://www.r-project.org/conferences/useR-2023/abstracts/abstract_123.pdf",

    # Classic ML/vision arXiv papers (several duplicate earlier entries)
    "https://arxiv.org/pdf/1706.03762.pdf",  # Attention Is All You Need
    "https://arxiv.org/pdf/1502.03167.pdf",  # Batch Normalization
    "https://arxiv.org/pdf/1409.1556.pdf",  # VGG Network
    "https://arxiv.org/pdf/1512.03385.pdf",  # ResNet
    "https://arxiv.org/pdf/1312.6114.pdf",  # Auto-Encoding Variational Bayes
    "https://arxiv.org/pdf/1712.09913.pdf",  # Fitting Linear Mixed-Effects Models Using lme4
    "https://arxiv.org/pdf/1504.08083.pdf",  # Faster R-CNN
    "https://arxiv.org/pdf/1409.4842.pdf",  # Going Deeper with Convolutions
    "https://arxiv.org/pdf/1608.06993.pdf",  # DenseNet
    "https://arxiv.org/pdf/1506.02640.pdf",  # YOLO (You Only Look Once)
    "https://arxiv.org/pdf/1502.03167.pdf",  # Batch Normalization (duplicate)
    "https://arxiv.org/pdf/1411.4038.pdf",  # Fully Convolutional Networks
    "https://arxiv.org/pdf/1512.02325.pdf",  # SSD: Single Shot MultiBox Detector
    "https://arxiv.org/pdf/2010.11929.pdf",  # An Image is Worth 16x16 Words (ViT)
    "https://arxiv.org/pdf/1312.5602.pdf",  # Deep Reinforcement Learning
    "https://arxiv.org/pdf/1505.04597.pdf",  # U-Net
    "https://arxiv.org/pdf/1603.05027.pdf",  # Identity Mappings in Deep Residual Networks
    "https://arxiv.org/pdf/1706.03762.pdf",  # Attention is All You Need (duplicate)
    "https://pmc.ncbi.nlm.nih.gov/articles/PMC1234567/pdf/main.pdf",  # Sample biomedical paper
    # U.S. House Committee on Oversight Reports
    "https://oversight.house.gov/report/the-biden-autopen-presidency-decline-delusion-and-deception-in-the-white-house.pdf",
    "https://oversight.house.gov/report/the-green-new-deal-scam-the-greenhouse-gas-reduction-fund.pdf",
    "https://oversight.house.gov/report/after-action-review-of-the-covid-19-pandemic-the-lessons-learned-and-a-path-forward.pdf",
    "https://oversight.house.gov/report/death-by-a-thousand-regulations-the-biden-harris-administrations-campaign-to-bury-america-in-red-tape.pdf",

    # National Archives OGIS Annual Reports (FY2019-FY2024)
    "https://www.archives.gov/files/ogis/reports/fy2024-annual-report.pdf",
    "https://www.archives.gov/files/ogis/reports/fy2023-annual-report.pdf",
    "https://www.archives.gov/files/ogis/reports/fy2022-annual-report.pdf",
    "https://www.archives.gov/files/ogis/reports/fy2021-annual-report.pdf",
    "https://www.archives.gov/files/ogis/reports/fy2020-annual-report.pdf",
    "https://www.archives.gov/files/ogis/reports/fy2019-annual-report.pdf",
    # Project Gutenberg Top Downloads (all duplicates of the Classics section)
    "https://www.gutenberg.org/files/84/84-pdf.pdf",  # Frankenstein
    "https://www.gutenberg.org/files/1342/1342-pdf.pdf",  # Pride and Prejudice
    "https://www.gutenberg.org/files/11/11-pdf.pdf",  # Alice's Adventures in Wonderland
    "https://www.gutenberg.org/files/1661/1661-pdf.pdf",  # The Adventures of Sherlock Holmes
    "https://www.gutenberg.org/files/98/98-pdf.pdf",  # A Tale of Two Cities
    "https://www.gutenberg.org/files/2701/2701-pdf.pdf",  # Moby Dick
    "https://www.gutenberg.org/files/2542/2542-pdf.pdf",  # A Doll's House
    "https://www.gutenberg.org/files/174/174-pdf.pdf",  # The Picture of Dorian Gray
    "https://www.gutenberg.org/files/1952/1952-pdf.pdf",  # The Yellow Wallpaper

    # Open Library & ManyBooks
    # (Note: You may need to find the direct PDF link from the book's page)
    "https://openlibrary.org/books/OL1234567M/Book_Title.pdf",
    "https://manybooks.net/book/123456/download/pdf"
]
|
||||
|
||||
|
||||
class PDFDownloader:
    """Download PDFs concurrently into ``output_dir``.

    Files already present on disk are skipped, responses that are not valid
    PDFs (body does not start with ``%PDF``) are rejected, and per-URL
    failures are counted rather than raised so a single bad link cannot
    abort the whole run.
    """

    def __init__(self, output_dir: Path, max_concurrent: int = 10):
        """Create the output directory and reset the run counters.

        Args:
            output_dir: Directory that will hold the downloaded PDFs.
            max_concurrent: Batch size for concurrent downloads.
        """
        self.output_dir = output_dir
        self.max_concurrent = max_concurrent
        self.output_dir.mkdir(parents=True, exist_ok=True)
        # Counters reported by _print_summary() at the end of the run.
        self.downloaded = 0
        self.failed = 0
        self.skipped = 0

    async def download_pdf(self, session: aiohttp.ClientSession, url: str) -> Optional[Path]:
        """Download a single URL.

        Returns the path of the saved (or pre-existing) file, or ``None`` on
        any failure (non-200 status, non-PDF payload, or exception).
        """
        try:
            filename = self._url_to_filename(url)
            filepath = self.output_dir / filename
            if filepath.exists():
                self.skipped += 1
                # BUGFIX: this message previously printed the literal text
                # "(unknown)" instead of the file name.
                print(f"✓ Already exists: {filepath.name}")
                return filepath

            async with session.get(url, timeout=aiohttp.ClientTimeout(total=60)) as response:
                if response.status != 200:
                    print(f"✗ HTTP {response.status}: {url}")
                    self.failed += 1
                    return None

                content = await response.read()
                # Many hosts return an HTML error/consent page with status
                # 200; the %PDF magic-number check filters those out.
                if not content.startswith(b"%PDF"):
                    print(f"✗ Not a PDF: {url}")
                    self.failed += 1
                    return None

                async with aiofiles.open(filepath, "wb") as handle:
                    await handle.write(content)
                self.downloaded += 1
                # BUGFIX: also printed "(unknown)" instead of the file name.
                print(f"✓ Downloaded: {filepath.name} ({len(content)} bytes)")
                return filepath

        except Exception as exc:  # pylint: disable=broad-except
            # Deliberately broad: any network/filesystem error for one URL is
            # logged and counted so the remaining downloads continue.
            print(f"✗ Error downloading {url}: {exc}")
            self.failed += 1
            return None

    def _url_to_filename(self, url: str) -> str:
        """Map *url* to a unique, filesystem-safe ``.pdf`` file name.

        The name is ``{domain}_{sanitized-path}_{hash}.pdf``; the 8-char
        SHA-1 prefix of the full URL keeps names unique even when two URLs
        share the same path.
        """
        parsed = urlparse(url)
        path = parsed.path.strip("/") or "document"
        stem = re.sub(r"[^a-zA-Z0-9.-]", "_", path)
        # Drop a trailing .pdf so the extension is not buried mid-name once
        # the hash suffix is appended below.
        if stem.endswith(".pdf"):
            stem = stem[: -len(".pdf")]
        domain = parsed.netloc.replace("www.", "").split(".")[0] or "site"
        # Hash the full URL (including any query string) for uniqueness.
        digest = hashlib.sha1(url.encode("utf-8")).hexdigest()[:8]
        # BUGFIX: previously returned f"{domain}_(unknown)_{digest}" — a
        # corrupted placeholder instead of the computed name — and the result
        # had no .pdf suffix, so _print_summary()'s "*.pdf" glob never
        # matched any downloaded file.
        return f"{domain}_{stem}_{digest}.pdf"

    async def download_all(self, urls: List[str]) -> None:
        """Download *urls* in batches of ``max_concurrent``, then print a summary."""
        print(f"Starting download of {len(urls)} PDFs to {self.output_dir}")
        connector = aiohttp.TCPConnector(limit=self.max_concurrent)
        async with aiohttp.ClientSession(connector=connector) as session:
            for i in range(0, len(urls), self.max_concurrent):
                batch = urls[i : i + self.max_concurrent]
                await asyncio.gather(*(self.download_pdf(session, url) for url in batch))
                # Brief pause between batches to be polite to the servers
                # (skipped after the final batch).
                if i + self.max_concurrent < len(urls):
                    await asyncio.sleep(1)
        self._print_summary()

    def _print_summary(self) -> None:
        """Print the downloaded/skipped/failed counters and the output location."""
        print("\n" + "=" * 40)
        print("DOWNLOAD SUMMARY")
        print("=" * 40)
        print(f"✓ Downloaded: {self.downloaded}")
        print(f"○ Skipped: {self.skipped}")
        print(f"✗ Failed: {self.failed}")
        total = len(list(self.output_dir.glob("*.pdf")))
        print(f"Total files in directory: {total}")
        print(f"Location: {self.output_dir.resolve()}")
|
||||
|
||||
|
||||
def main() -> None:
    """CLI entry point: parse arguments and run the bulk download.

    Prints a hint for the follow-up harvesting step when the run finishes.
    """
    arg_parser = argparse.ArgumentParser(
        description="Download massive PDF collection for Type3 font harvesting"
    )
    arg_parser.add_argument("--output", "-o", default="./pdf-collection", help="Output directory")
    arg_parser.add_argument(
        "--max-concurrent", "-c", type=int, default=5, help="Maximum concurrent downloads"
    )
    arg_parser.add_argument(
        "--shuffle", action="store_true", help="Shuffle URL order before download"
    )
    options = arg_parser.parse_args()

    # Copy before shuffling so the module-level constant is never mutated.
    url_list = list(EXTENDED_URLS)
    if options.shuffle:
        random.shuffle(url_list)

    downloader = PDFDownloader(Path(options.output), options.max_concurrent)
    asyncio.run(downloader.download_all(url_list))

    print(f"\nNext step: python scripts/harvest_type3_fonts.py --input {options.output}")


if __name__ == "__main__":
    main()
|
||||
195
scripts/download_pdf_samples.py
Normal file
195
scripts/download_pdf_samples.py
Normal file
@@ -0,0 +1,195 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Download large batches of PDF URLs into a local directory so they can be fed to
|
||||
scripts/harvest_type3_fonts.py (or any other processing pipeline).
|
||||
|
||||
Usage examples:
|
||||
|
||||
# Download every URL listed in pdf_urls.txt into tmp/type3-pdfs
|
||||
python scripts/download_pdf_samples.py \
|
||||
--urls-file pdf_urls.txt \
|
||||
--output-dir tmp/type3-pdfs
|
||||
|
||||
# Mix inline URLs with a file and use 16 concurrent downloads
|
||||
python scripts/download_pdf_samples.py \
|
||||
--urls https://example.com/a.pdf https://example.com/b.pdf \
|
||||
--urls-file more_urls.txt \
|
||||
--output-dir tmp/type3-pdfs \
|
||||
--workers 16
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import concurrent.futures
|
||||
import hashlib
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from typing import Iterable, List, Optional, Set, Tuple
|
||||
from urllib.parse import unquote, urlparse
|
||||
|
||||
import requests
|
||||
|
||||
|
||||
def parse_args() -> argparse.Namespace:
    """Parse CLI options for the bulk PDF downloader."""
    parser = argparse.ArgumentParser(description="Bulk download PDF URLs.")
    parser.add_argument(
        "--urls",
        nargs="*",
        default=[],
        help="Inline list of PDF URLs (can be combined with --urls-file).",
    )
    parser.add_argument(
        "--urls-file",
        action="append",  # may be passed several times; argparse collects into a list
        help="Text file containing one URL per line (can be repeated).",
    )
    parser.add_argument(
        "--output-dir",
        default="tmp/harvest-pdfs",
        help="Directory to store downloaded PDFs (default: %(default)s).",
    )
    parser.add_argument(
        "--workers",
        type=int,
        # Default scales with CPU count but is capped at 8 concurrent requests.
        default=min(8, (os.cpu_count() or 4) * 2),
        help="Number of concurrent downloads (default: %(default)s).",
    )
    parser.add_argument(
        "--timeout",
        type=int,
        default=120,
        help="Per-request timeout in seconds (default: %(default)s).",
    )
    parser.add_argument(
        "--overwrite",
        action="store_true",
        help="Overwrite existing files (default: skip already downloaded PDFs).",
    )
    return parser.parse_args()
|
||||
|
||||
|
||||
def load_urls(args: argparse.Namespace) -> List[str]:
    """Collect URLs from --urls and every --urls-file, deduplicated in order.

    Blank lines and lines starting with '#' are ignored. A missing URL file
    produces a warning on stderr instead of aborting. Raises SystemExit when
    no URLs remain.
    """
    ordered: List[str] = []
    known: Set[str] = set()

    def register(raw: str) -> None:
        candidate = raw.strip()
        if not candidate or candidate.startswith("#"):
            return
        if candidate in known:
            return
        known.add(candidate)
        ordered.append(candidate)

    for inline in args.urls:
        register(inline)

    for file_name in args.urls_file or []:
        url_file = Path(file_name)
        if not url_file.exists():
            print(f"[WARN] URL file not found: {file_name}", file=sys.stderr)
            continue
        with url_file.open("r", encoding="utf-8") as stream:
            for line in stream:
                register(line)

    if not ordered:
        raise SystemExit("No URLs supplied. Use --urls and/or --urls-file.")
    return ordered
|
||||
|
||||
|
||||
def sanitize_filename(name: str) -> str:
    """Collapse runs of unsafe characters to '_' (fallback: 'download')."""
    collapsed = re.sub(r"[^A-Za-z0-9._-]+", "_", name)
    trimmed = collapsed.strip("_")
    return trimmed if trimmed else "download"


def build_filename(url: str, output_dir: Path) -> Path:
    """Derive a destination path for *url*, disambiguating collisions.

    The basename comes from the URL path (sanitized, forced to end in
    .pdf). If that file already exists, a short SHA1-of-URL suffix is
    appended so distinct URLs never overwrite each other.
    """
    basename = Path(unquote(urlparse(url).path)).name or "download.pdf"
    basename = sanitize_filename(basename)
    if not basename.lower().endswith(".pdf"):
        basename = f"{basename}.pdf"

    destination = output_dir / basename
    if not destination.exists():
        return destination
    tag = hashlib.sha1(url.encode("utf-8")).hexdigest()[:8]
    return output_dir / f"{destination.stem}-{tag}{destination.suffix}"
|
||||
|
||||
|
||||
def download_pdf(
    url: str,
    output_dir: Path,
    timeout: int,
    overwrite: bool,
) -> Tuple[str, Optional[Path], Optional[str]]:
    """Download one PDF.

    Returns (url, destination, error): error is None on success, the
    sentinel string "exists" when the file was already present and
    *overwrite* is False, or a human-readable failure reason otherwise.
    """
    try:
        dest = build_filename(url, output_dir)
        if dest.exists() and not overwrite:
            return url, dest, "exists"

        response = requests.get(url, stream=True, timeout=timeout)
        response.raise_for_status()

        content_type = response.headers.get("Content-Type", "").lower()
        if "pdf" not in content_type and not url.lower().endswith(".pdf"):
            # Peek into the first bytes to be safe
            peek = response.raw.read(5, decode_content=True)
            if not peek.startswith(b"%PDF"):
                return url, None, f"Skipping non-PDF content-type ({content_type or 'unknown'})"
            # BUGFIX: the peeked bytes were already consumed from the stream,
            # so response.content now yields only the REMAINDER of the body.
            # The old code sliced off len(peek) again, silently dropping five
            # bytes and corrupting every sniffed PDF.
            content = peek + response.content
        else:
            content = response.content

        output_dir.mkdir(parents=True, exist_ok=True)
        dest.write_bytes(content)
        return url, dest, None
    except Exception as exc:  # pylint: disable=broad-except
        # One bad URL must not abort the batch; report the reason instead.
        return url, None, str(exc)
|
||||
|
||||
|
||||
def main() -> None:
    """CLI entry point: download every URL concurrently, then print a summary."""
    args = parse_args()
    urls = load_urls(args)
    output_dir = Path(args.output_dir).resolve()
    output_dir.mkdir(parents=True, exist_ok=True)

    print(f"Downloading {len(urls)} PDFs to {output_dir} using {args.workers} workers...")

    successes = 0
    skipped = 0
    failures: List[Tuple[str, str]] = []

    # Threads suit this workload: each download is I/O-bound network work.
    with concurrent.futures.ThreadPoolExecutor(max_workers=args.workers) as executor:
        future_to_url = {
            executor.submit(
                download_pdf, url, output_dir, args.timeout, args.overwrite
            ): url
            for url in urls
        }
        for future in concurrent.futures.as_completed(future_to_url):
            url = future_to_url[future]
            result_url, path, error = future.result()
            # download_pdf reports "exists" as a sentinel, not a real error.
            if error == "exists":
                skipped += 1
                print(f"[SKIP] {url} (already downloaded)")
            elif error:
                failures.append((result_url, error))
                print(f"[FAIL] {url} -> {error}", file=sys.stderr)
            else:
                successes += 1
                print(f"[OK] {url} -> {path}")

    print()
    print(f"Completed. Success: {successes}, Skipped: {skipped}, Failures: {len(failures)}")
    if failures:
        print("Failures:")
        for url, error in failures:
            print(f"  {url} -> {error}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
245
scripts/harvest_type3_fonts.py
Normal file
245
scripts/harvest_type3_fonts.py
Normal file
@@ -0,0 +1,245 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Bulk-harvest Type3 font signatures from a folder full of PDFs.
|
||||
|
||||
The script iterates over every PDF (recursively) inside the supplied --input
|
||||
paths, invokes the existing Gradle Type3SignatureTool for each document, and
|
||||
collects the unique Type3 font signatures that were discovered. Signature JSON
|
||||
files are stored under --signatures-dir; previously captured files are reused
|
||||
so you can keep dropping new PDFs into the input directory and re-run the
|
||||
harvester at any time.
|
||||
|
||||
Example:
|
||||
python scripts/harvest_type3_fonts.py \
|
||||
--input incoming-type3-pdfs \
|
||||
--signatures docs/type3/signatures \
|
||||
--report docs/type3/harvest_report.json
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import datetime as dt
|
||||
import hashlib
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import shlex
|
||||
import subprocess
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from typing import Dict, Iterable, List, Optional, Sequence, Tuple
|
||||
|
||||
REPO_ROOT = Path(__file__).resolve().parents[1]
|
||||
|
||||
|
||||
def parse_args() -> argparse.Namespace:
    """Parse CLI options for the bulk Type3 signature harvester."""
    parser = argparse.ArgumentParser(description="Bulk collect Type3 font signatures from PDFs.")
    parser.add_argument(
        "--input",
        nargs="+",
        required=True,
        help="One or more PDF files or directories containing PDFs (searched recursively).",
    )
    parser.add_argument(
        "--signatures-dir",
        default="docs/type3/signatures",
        help="Destination directory for per-PDF signature JSON files.",
    )
    parser.add_argument(
        "--report",
        default="docs/type3/harvest_report.json",
        help="Summary JSON that lists every unique signature discovered so far.",
    )
    # Pick the platform-appropriate Gradle wrapper by default.
    default_gradle = "gradlew.bat" if os.name == "nt" else "./gradlew"
    parser.add_argument(
        "--gradle-cmd",
        default=default_gradle,
        help=f"Path to the Gradle wrapper used to invoke the Type3SignatureTool (default: {default_gradle}).",
    )
    parser.add_argument(
        "--force",
        action="store_true",
        help="Re-run the signature tool even if the output JSON already exists.",
    )
    parser.add_argument(
        "--pretty",
        action="store_true",
        help="Ask the Java tool to emit pretty-printed JSON (handy for diffs).",
    )
    return parser.parse_args()
|
||||
|
||||
|
||||
def discover_pdfs(paths: Sequence[str]) -> List[Path]:
    """Expand files/directories into a sorted, de-duplicated list of PDFs.

    Directories are searched recursively; plain files are kept only when
    they carry a .pdf suffix. Raises SystemExit when nothing is found.
    """
    found: List[Path] = []
    for entry in paths:
        resolved = Path(entry).resolve()
        if resolved.is_file() and resolved.suffix.lower() == ".pdf":
            found.append(resolved)
        elif resolved.is_dir():
            found.extend(sorted(resolved.rglob("*.pdf")))
    # dict.fromkeys preserves first-seen order while dropping duplicates.
    deduped = sorted(dict.fromkeys(found))
    if not deduped:
        raise SystemExit("No PDF files found under the supplied --input paths.")
    return deduped
|
||||
|
||||
|
||||
def sanitize_part(part: str) -> str:
    """Make one path component filesystem-safe; empty input becomes '_'."""
    safe = re.sub(r"[^A-Za-z0-9._-]+", "_", part)
    return safe if safe else "_"
|
||||
|
||||
|
||||
def derive_signature_path(pdf: Path, signatures_dir: Path) -> Path:
    """
    Mirror the PDF path under the signatures directory.

    If the PDF lives outside the repo, fall back to a hashed filename.
    """
    try:
        rel = pdf.relative_to(REPO_ROOT)
    except ValueError:
        # Out-of-repo PDFs get a stable short hash so distinct absolute
        # paths with the same stem cannot collide.
        digest = hashlib.sha1(str(pdf).encode("utf-8")).hexdigest()[:10]
        rel = Path("__external__") / f"{sanitize_part(pdf.stem)}-{digest}.pdf"

    # Sanitize every component so the mirrored tree is filesystem-safe.
    sanitized_parts = [sanitize_part(part) for part in rel.parts]
    signature_rel = Path(*sanitized_parts).with_suffix(".json")
    return signatures_dir / signature_rel
|
||||
|
||||
|
||||
def load_signature_file(path: Path) -> dict:
    """Parse one signature JSON dump from disk."""
    raw = path.read_text(encoding="utf-8")
    return json.loads(raw)
|
||||
|
||||
|
||||
def collect_known_signatures(signatures_dir: Path) -> Dict[str, dict]:
    """Load every previously-harvested signature so reruns are incremental.

    Scans *signatures_dir* recursively for JSON dumps and returns a mapping
    of signature -> summary record; the first occurrence of a signature
    wins. Unreadable files are skipped silently (best-effort cache).
    """
    known: Dict[str, dict] = {}
    if not signatures_dir.exists():
        return known
    for json_file in signatures_dir.rglob("*.json"):
        try:
            payload = load_signature_file(json_file)
        except Exception:
            # A corrupt cache entry must not abort the harvest.
            continue
        pdf = payload.get("pdf")
        for font in payload.get("fonts", []):
            signature = font.get("signature")
            if not signature or signature in known:
                continue
            known[signature] = {
                "signature": signature,
                "alias": font.get("alias"),
                "baseName": font.get("baseName"),
                "glyphCount": font.get("glyphCount"),
                "glyphCoverage": font.get("glyphCoverage"),
                "samplePdf": pdf,
                "signatureJson": str(json_file),
            }
    return known
|
||||
|
||||
|
||||
def run_signature_tool(
    gradle_cmd: str, pdf: Path, output_path: Path, pretty: bool, cwd: Path
) -> None:
    """Invoke the Gradle Type3SignatureTool for one PDF.

    Writes the signature JSON to *output_path*. Raises RuntimeError with the
    captured stderr when Gradle exits non-zero.
    """
    output_path.parent.mkdir(parents=True, exist_ok=True)
    # shlex.quote protects paths containing spaces inside the single --args string.
    args = f"--pdf {shlex.quote(str(pdf))} --output {shlex.quote(str(output_path))}"
    if pretty:
        args += " --pretty"
    # Use shell invocation so the quoted --args string is parsed correctly by Gradle.
    # NOTE(review): shell=True with interpolated paths is deliberate here, but
    # assumes gradle_cmd/paths come from the trusted CLI user — confirm.
    cmd = f"{gradle_cmd} -q :proprietary:type3SignatureTool --args=\"{args}\""
    completed = subprocess.run(
        cmd,
        shell=True,
        cwd=cwd,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True,
    )
    if completed.returncode != 0:
        raise RuntimeError(
            f"Gradle Type3SignatureTool failed for {pdf}:\n{completed.stderr.strip()}"
        )
|
||||
|
||||
|
||||
def extract_fonts_from_payload(payload: dict) -> List[dict]:
    """Flatten the fonts of one signature dump into summary records.

    Fonts without a signature are dropped; every record carries the source
    PDF path so the report can point back at a sample document.
    """
    sample_pdf = payload.get("pdf")
    summaries: List[dict] = []
    for record in payload.get("fonts", []):
        sig = record.get("signature")
        if not sig:
            continue
        summaries.append(
            {
                "signature": sig,
                "alias": record.get("alias"),
                "baseName": record.get("baseName"),
                "glyphCount": record.get("glyphCount"),
                "glyphCoverage": record.get("glyphCoverage"),
                "samplePdf": sample_pdf,
            }
        )
    return summaries
|
||||
|
||||
|
||||
def write_report(report_path: Path, fonts_by_signature: Dict[str, dict]) -> None:
    """Write the aggregate harvest report as JSON.

    Fonts are sorted by signature so reruns produce stable, diff-friendly
    output.
    """
    ordered = sorted(fonts_by_signature.values(), key=lambda entry: entry["signature"])
    # datetime.utcnow() is deprecated (Python 3.12) and returns a naive value;
    # use an aware UTC timestamp and keep the previous trailing-"Z" format.
    now = dt.datetime.now(dt.timezone.utc)
    report = {
        "generatedAt": now.isoformat(timespec="seconds").replace("+00:00", "Z"),
        "totalSignatures": len(ordered),
        "fonts": ordered,
    }
    report_path.parent.mkdir(parents=True, exist_ok=True)
    with report_path.open("w", encoding="utf-8") as handle:
        json.dump(report, handle, indent=2)
|
||||
|
||||
|
||||
def main() -> None:
    """Harvest Type3 signatures from every input PDF and refresh the report."""
    args = parse_args()
    signatures_dir = Path(args.signatures_dir).resolve()
    report_path = Path(args.report).resolve()
    pdfs = discover_pdfs(args.input)

    # Seed with previously captured signatures so reruns only add new ones.
    known = collect_known_signatures(signatures_dir)
    newly_added: List[Tuple[str, str]] = []

    for pdf in pdfs:
        signature_path = derive_signature_path(pdf, signatures_dir)
        if signature_path.exists() and not args.force:
            # Reuse the cached dump; a corrupt cache falls through with payload=None.
            try:
                payload = load_signature_file(signature_path)
            except Exception as exc:
                print(f"[WARN] Failed to parse cached signature {signature_path}: {exc}")
                payload = None
        else:
            try:
                run_signature_tool(args.gradle_cmd, pdf, signature_path, args.pretty, REPO_ROOT)
            except Exception as exc:
                # One bad PDF must not abort the whole batch.
                print(f"[ERROR] Harvest failed for {pdf}: {exc}", file=sys.stderr)
                continue
            payload = load_signature_file(signature_path)

        if not payload:
            continue

        for font in extract_fonts_from_payload(payload):
            signature = font["signature"]
            if signature in known:
                continue
            font["signatureJson"] = str(signature_path)
            known[signature] = font
            newly_added.append((signature, pdf.name))

    write_report(report_path, known)

    print(
        f"Processed {len(pdfs)} PDFs. "
        f"Captured {len(newly_added)} new Type3 font signatures "
        f"(total unique signatures: {len(known)})."
    )
    if newly_added:
        print("New signatures:")
        for signature, sample in newly_added:
            print(f"  {signature} ({sample})")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
75
scripts/index_type3_catalogue.py
Normal file
75
scripts/index_type3_catalogue.py
Normal file
@@ -0,0 +1,75 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Build a Type3 font catalogue from sample PDFs."""
|
||||
import argparse
|
||||
import json
|
||||
import subprocess
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def run(cmd, cwd=None):
    """Run *cmd*, returning captured stdout; raise RuntimeError on failure."""
    completed = subprocess.run(cmd, cwd=cwd, capture_output=True, text=True)
    if completed.returncode:
        raise RuntimeError(f"Command {' '.join(cmd)} failed: {completed.stderr}")
    return completed.stdout
|
||||
|
||||
|
||||
def parse_pdffonts(output):
    """Extract (font name, encoding) pairs for Type 3 fonts from pdffonts output.

    The first two lines (header + separator) are skipped. A row qualifies
    when its whitespace-split tokens contain a literal "Type" token followed
    by a token starting with "3" (pdffonts prints the type as "Type 3").
    """
    rows = output.splitlines()[2:]
    results = []
    for row in rows:
        tokens = row.split()
        if not tokens or "Type" not in tokens:
            continue
        marker = tokens.index("Type")
        after = tokens[marker + 1] if marker + 1 < len(tokens) else ""
        if not after.startswith("3"):
            continue
        encoding = tokens[-2] if len(tokens) >= 2 else ""
        results.append((tokens[0], encoding))
    return results
|
||||
|
||||
|
||||
def main():
    """Scan sample PDFs with pdffonts and write a Type3 font catalogue JSON."""
    parser = argparse.ArgumentParser(description="Index Type3 fonts from sample PDFs")
    parser.add_argument(
        "--samples",
        default="app/core/src/main/resources/type3/samples",
        help="Directory containing sample PDFs",
    )
    parser.add_argument(
        "--output",
        default="app/core/src/main/resources/type3/catalogue.json",
        help="Output JSON file",
    )
    args = parser.parse_args()

    samples_dir = Path(args.samples)
    out_path = Path(args.output)
    out_path.parent.mkdir(parents=True, exist_ok=True)

    catalogue = []
    for pdf in sorted(samples_dir.glob("*.pdf")):
        try:
            output = run(["pdffonts", str(pdf)])
        except Exception as exc:
            # pdffonts missing or the PDF unreadable: report and keep going.
            print(f"Skipping {pdf.name}: {exc}")
            continue
        for font_name, encoding in parse_pdffonts(output):
            catalogue.append(
                {
                    "source": pdf.name,
                    "fontName": font_name,
                    "encoding": encoding,
                }
            )

    with out_path.open("w", encoding="utf-8") as handle:
        json.dump(catalogue, handle, indent=2)
    print(f"Wrote {len(catalogue)} entries to {out_path}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
97
scripts/summarize_type3_signatures.py
Normal file
97
scripts/summarize_type3_signatures.py
Normal file
@@ -0,0 +1,97 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Summarize captured Type3 signature dumps as a Markdown inventory.
|
||||
|
||||
Usage:
|
||||
scripts/summarize_type3_signatures.py \
|
||||
--input docs/type3/signatures \
|
||||
--output docs/type3/signature_inventory.md
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
from collections import defaultdict
|
||||
from pathlib import Path
|
||||
from typing import Dict, List
|
||||
|
||||
|
||||
def parse_args() -> argparse.Namespace:
    """Parse CLI options for the signature summarizer."""
    parser = argparse.ArgumentParser(description="Summarize Type3 signature JSON dumps.")
    parser.add_argument(
        "--input",
        default="docs/type3/signatures",
        help="Directory containing signature JSON files (default: %(default)s)",
    )
    parser.add_argument(
        "--output",
        default="docs/type3/signature_inventory.md",
        help="Markdown file to write (default: %(default)s)",
    )
    return parser.parse_args()
|
||||
|
||||
|
||||
def load_signatures(directory: Path) -> Dict[str, List[dict]]:
    """Group every font entry found in *directory*'s JSON dumps by alias.

    The alias falls back to baseName and finally to "unknown", and is
    lower-cased so grouping is case-insensitive. Files are read in sorted
    order for deterministic output.
    """
    grouped: Dict[str, List[dict]] = defaultdict(list)
    for json_path in sorted(directory.glob("*.json")):
        payload = json.loads(json_path.read_text(encoding="utf-8"))
        source_pdf = payload.get("pdf") or json_path.name
        for font in payload.get("fonts", []):
            alias = (font.get("alias") or font.get("baseName") or "unknown").lower()
            grouped[alias].append(
                {
                    "source": source_pdf,
                    "file": json_path.name,
                    "alias": alias,
                    "baseName": font.get("baseName"),
                    "signature": font.get("signature"),
                    "glyphCount": font.get("glyphCount"),
                    "glyphCoverage": font.get("glyphCoverage"),
                }
            )
    return grouped
|
||||
|
||||
|
||||
def write_markdown(inventory: Dict[str, List[dict]], output: Path, input_dir: Path) -> None:
    """Render the alias-grouped inventory as a Markdown report at *output*.

    One section per alias (sorted), each holding a table of signatures with
    a short glyph-coverage preview (first ten codes).
    """
    parts: List[str] = [
        "# Type3 Signature Inventory",
        "",
        f"_Generated from `{input_dir}`. "
        "Run `scripts/summarize_type3_signatures.py` after capturing new samples._",
        "",
    ]
    for alias in sorted(inventory):
        parts.append(f"## Alias: `{alias}`")
        parts.append("")
        parts.append("| Signature | Samples | Glyph Count | Coverage (first 10) |")
        parts.append("| --- | --- | --- | --- |")
        for entry in inventory[alias]:
            signature = entry.get("signature") or "—"
            sample = Path(entry["source"]).name
            count = entry.get("glyphCount")
            glyph_count = count if count is not None else "—"
            preview = ", ".join(str(code) for code in (entry.get("glyphCoverage") or [])[:10])
            parts.append(f"| `{signature}` | `{sample}` | {glyph_count} | {preview} |")
        parts.append("")

    output.parent.mkdir(parents=True, exist_ok=True)
    output.write_text("\n".join(parts), encoding="utf-8")
|
||||
|
||||
|
||||
def main() -> None:
    """Build the Markdown signature inventory from captured JSON dumps."""
    args = parse_args()
    input_dir = Path(args.input)
    if not input_dir.exists():
        raise SystemExit(f"Input directory not found: {input_dir}")
    inventory = load_signatures(input_dir)
    output_path = Path(args.output)
    write_markdown(inventory, output_path, input_dir)
    print(f"Wrote inventory for {len(inventory)} aliases to {output_path}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
481
scripts/type3_to_cff.py
Normal file
481
scripts/type3_to_cff.py
Normal file
@@ -0,0 +1,481 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Convert Stirling PDF Type3 glyph JSON into synthesised fonts using fontTools.
|
||||
|
||||
The input JSON is expected to contain:
|
||||
- fontId, pageNumber (optional metadata)
|
||||
- fontMatrix: 3x3 matrix describing the Type3 glyph transform
|
||||
- glyphs: array of glyph records with keys:
|
||||
name, code, advanceWidth, bbox, unicode, outline (list of commands)
|
||||
|
||||
The script produces an OpenType CFF font and, when requested, a companion
|
||||
TrueType font for web-preview usage. Only the fontTools package is required,
|
||||
avoiding heavyweight build dependencies such as fontmake/ufoLib2.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import math
|
||||
import sys
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
from typing import Dict, Iterable, List, Optional, Sequence, Tuple
|
||||
|
||||
from fontTools.fontBuilder import FontBuilder
|
||||
from fontTools.misc.fixedTools import otRound
|
||||
from fontTools.pens.cu2quPen import Cu2QuPen
|
||||
from fontTools.pens.t2CharStringPen import T2CharStringPen
|
||||
from fontTools.pens.ttGlyphPen import TTGlyphPen
|
||||
|
||||
|
||||
Command = Dict[str, object]
|
||||
Matrix = Tuple[float, float, float, float, float, float]
|
||||
|
||||
|
||||
@dataclass
class GlyphSource:
    """One glyph record parsed from the input JSON, prior to font building."""

    # Glyph name used in the output font's glyph order.
    name: str
    # Advance width as parsed (validated/rounded later by resolve_width).
    width: float
    # Unicode code point for the cmap, when known.
    unicode: Optional[int]
    # Raw Type3 character code (0..0x10FFFF), when known.
    char_code: Optional[int]
    # Outline drawing commands (dicts with "cmd" of "M"/"L"/"C"/"Q"/"Z").
    outline: Sequence[Command]
|
||||
|
||||
|
||||
@dataclass
class GlyphBuildResult:
    """Artifacts produced for one glyph during font synthesis."""

    # Final glyph name in the output font.
    name: str
    # Rounded, validated advance width.
    width: int
    # Type2 charstring destined for the CFF table.
    charstring: object
    # TrueType glyph for the optional TTF output (None when not built).
    ttf_glyph: Optional[object]
    # Unicode code point for the cmap, when known.
    unicode: Optional[int]
    # Original Type3 character code, when known.
    char_code: Optional[int]
    # (xMin, yMin, xMax, yMax) of the drawn outline, or None when empty.
    bounds: Optional[Tuple[float, float, float, float]]
|
||||
|
||||
|
||||
def parse_args() -> argparse.Namespace:
    """Parse CLI options for the Type3-to-font synthesiser."""
    parser = argparse.ArgumentParser(description="Synthesize fonts from Type3 glyph JSON.")
    parser.add_argument("--input", required=True, help="Path to glyph JSON emitted by the backend")
    parser.add_argument("--otf-output", required=True, help="Destination path for the CFF/OTF font")
    parser.add_argument("--ttf-output", help="Optional destination path for a TrueType font")
    parser.add_argument("--family-name", default="Type3 Synth", help="Family name for the output")
    parser.add_argument("--style-name", default="Regular", help="Style name for the output")
    parser.add_argument("--units-per-em", type=int, default=1000, help="Units per EM value")
    parser.add_argument("--cu2qu-error", type=float, default=1.0, help="Max error for cubic→quadratic conversion")
    return parser.parse_args()
|
||||
|
||||
|
||||
def load_json(path: Path) -> Dict[str, object]:
    """Read the glyph JSON; exit with status 2 on any failure."""
    try:
        text = path.read_text(encoding="utf-8")
        return json.loads(text)
    except Exception as exc:  # pragma: no cover - fatal configuration error
        print(f"ERROR: Failed to load glyph JSON '{path}': {exc}", file=sys.stderr)
        sys.exit(2)
|
||||
|
||||
|
||||
def parse_font_matrix(rows: Optional[Iterable[Iterable[float]]]) -> Matrix:
    """
    Retrieve the raw 2×3 FontMatrix entries for diagnostics. Type3 glyph
    outlines in our extractor are emitted in their native coordinate system,
    so the returned matrix is currently informational only.

    Any missing or malformed input falls back to the identity matrix.
    """
    identity = (1.0, 0.0, 0.0, 1.0, 0.0, 0.0)
    if not rows:
        return identity
    parsed = []
    for row in rows:
        try:
            parsed.append([float(entry) for entry in row])
        except (TypeError, ValueError):
            return identity
    if len(parsed) < 3 or len(parsed[0]) < 2 or len(parsed[1]) < 2:
        return identity
    return (
        float(parsed[0][0]),
        float(parsed[0][1]),
        float(parsed[1][0]),
        float(parsed[1][1]),
        float(parsed[2][0]),
        float(parsed[2][1]),
    )
|
||||
|
||||
|
||||
def resolve_width(raw_width: float, default: int) -> int:
    """Coerce a raw advance width to a positive rounded int, else *default*.

    Non-numeric, non-finite, zero/negative, or rounds-to-zero values all
    fall back to *default*.
    """
    try:
        numeric = float(raw_width)
    except (TypeError, ValueError):
        return default
    if not math.isfinite(numeric) or numeric <= 0:
        return default
    rounded = otRound(numeric)
    return rounded if rounded > 0 else default
|
||||
|
||||
|
||||
def quadratic_to_cubic(
    current: Tuple[float, float],
    ctrl: Tuple[float, float],
    end: Tuple[float, float],
) -> Tuple[Tuple[float, float], Tuple[float, float], Tuple[float, float]]:
    """
    Elevate a quadratic Bézier (current, ctrl, end) to its exact cubic form.

    The cubic control points sit two-thirds of the way from each endpoint
    toward the quadratic control point; the traced curve is unchanged.
    """
    first = (
        current[0] + (2.0 / 3.0) * (ctrl[0] - current[0]),
        current[1] + (2.0 / 3.0) * (ctrl[1] - current[1]),
    )
    second = (
        end[0] + (2.0 / 3.0) * (ctrl[0] - end[0]),
        end[1] + (2.0 / 3.0) * (ctrl[1] - end[1]),
    )
    return first, second, end
|
||||
|
||||
|
||||
def iterate_glyphs(data: Dict[str, object]) -> List[GlyphSource]:
    """Validate and normalise the raw glyph records from the input JSON.

    Records that are not dicts are skipped. Missing or invalid fields get
    defaults: name "g<index>", width 1000.0, unicode/char_code None,
    outline [].
    """
    glyph_records = data.get("glyphs") or []
    sources: List[GlyphSource] = []
    for index, record in enumerate(glyph_records, start=1):
        if not isinstance(record, dict):
            continue
        name = record.get("name")
        if not isinstance(name, str) or not name:
            name = f"g{index}"
        width = record.get("advanceWidth")
        if not isinstance(width, (int, float)) or math.isnan(width):
            width = 1000.0
        unicode_value = record.get("unicode")
        # Only positive code points are usable in the cmap.
        if not isinstance(unicode_value, int) or unicode_value <= 0:
            unicode_value = None
        # The char code may arrive under several keys; try them in order.
        char_code_value = record.get("charCode")
        if not isinstance(char_code_value, int):
            char_code_value = record.get("code")
        if not isinstance(char_code_value, int):
            char_code_value = record.get("charCodeRaw")
        if not isinstance(char_code_value, int) or not (0 <= char_code_value <= 0x10FFFF):
            char_code_value = None
        outline = record.get("outline")
        if not isinstance(outline, list):
            outline = []
        sources.append(
            GlyphSource(
                name=name,
                width=float(width),
                unicode=unicode_value,
                char_code=char_code_value,
                outline=outline,
            )
        )
    return sources
|
||||
|
||||
|
||||
def build_cff_charstring(
    glyph: GlyphSource,
    width: int,
) -> Tuple[object, Optional[Tuple[float, float, float, float]]]:
    """Draw the glyph outline into a Type2 charstring, tracking its bbox.

    Returns (charstring, bbox) where bbox is (xMin, yMin, xMax, yMax) or
    None for an empty outline. Bounds include curve control points, so the
    box may be slightly larger than the true ink extent.
    """
    pen = T2CharStringPen(width=width, glyphSet=None)
    # Running min/max box: [xMin, yMin, xMax, yMax], seeded for "empty".
    bounds = [math.inf, math.inf, -math.inf, -math.inf]

    def update_bounds(point: Tuple[float, float]) -> None:
        # Grow the box to include *point*.
        x, y = point
        bounds[0] = min(bounds[0], x)
        bounds[1] = min(bounds[1], y)
        bounds[2] = max(bounds[2], x)
        bounds[3] = max(bounds[3], y)

    current: Optional[Tuple[float, float]] = None
    start_point: Optional[Tuple[float, float]] = None
    open_path = False

    for command in glyph.outline:
        if not isinstance(command, dict):
            continue
        op = command.get("cmd")
        if op == "M":
            # A new subpath implicitly ends any open one.
            if open_path:
                pen.endPath()
                open_path = False
            point = (float(command.get("x", 0.0)), float(command.get("y", 0.0)))
            pen.moveTo(point)
            update_bounds(point)
            current = point
            start_point = point
            open_path = True
        elif op == "L" and current is not None:
            # Missing coordinates default to the current position.
            point = (float(command.get("x", current[0])), float(command.get("y", current[1])))
            pen.lineTo(point)
            update_bounds(point)
            current = point
        elif op == "C" and current is not None:
            ctrl1 = (
                float(command.get("x1", current[0])),
                float(command.get("y1", current[1])),
            )
            ctrl2 = (
                float(command.get("x2", current[0])),
                float(command.get("y2", current[1])),
            )
            end = (
                float(command.get("x", current[0])),
                float(command.get("y", current[1])),
            )
            pen.curveTo(ctrl1, ctrl2, end)
            update_bounds(ctrl1)
            update_bounds(ctrl2)
            update_bounds(end)
            current = end
        elif op == "Q" and current is not None:
            ctrl = (
                float(command.get("x1", current[0])),
                float(command.get("y1", current[1])),
            )
            end = (
                float(command.get("x", current[0])),
                float(command.get("y", current[1])),
            )
            # Type2 charstrings only support cubics; elevate the quadratic.
            c1, c2, end_point = quadratic_to_cubic(current, ctrl, end)
            pen.curveTo(c1, c2, end_point)
            update_bounds(ctrl)
            update_bounds(end_point)
            current = end_point
        elif op == "Z" and open_path:
            pen.closePath()
            open_path = False
            if start_point is not None:
                # Closing a subpath returns the pen to the subpath start.
                current = start_point
        # Ignore unsupported commands silently.

    if open_path:
        # Trailing open subpath: end it without closing.
        pen.endPath()

    charstring = pen.getCharString()
    bbox = None
    if bounds[0] <= bounds[2] and bounds[1] <= bounds[3]:
        bbox = (bounds[0], bounds[1], bounds[2], bounds[3])
    return charstring, bbox
|
||||
|
||||
|
||||
def build_ttf_glyph(glyph: GlyphSource, max_error: float) -> Optional[object]:
    """Draw the outline into a TrueType glyph for the web-preview TTF.

    *max_error* bounds the cubic→quadratic (cu2qu) approximation error in
    font units. Returns None when the pen cannot produce a glyph.
    """
    pen = TTGlyphPen(glyphSet=None)
    # Cu2QuPen converts incoming cubic segments to TrueType quadratics on the fly.
    draw_pen = Cu2QuPen(pen, max_error, reverse_direction=False)

    current_exists = False

    for command in glyph.outline:
        if not isinstance(command, dict):
            continue
        op = command.get("cmd")
        if op == "M":
            x = float(command.get("x", 0.0))
            y = float(command.get("y", 0.0))
            draw_pen.moveTo((x, y))
            current_exists = True
        elif op == "L" and current_exists:
            x = float(command.get("x", 0.0))
            y = float(command.get("y", 0.0))
            draw_pen.lineTo((x, y))
        elif op == "C" and current_exists:
            ctrl1 = (float(command.get("x1", 0.0)), float(command.get("y1", 0.0)))
            ctrl2 = (float(command.get("x2", 0.0)), float(command.get("y2", 0.0)))
            end = (float(command.get("x", 0.0)), float(command.get("y", 0.0)))
            draw_pen.curveTo(ctrl1, ctrl2, end)
        elif op == "Q" and current_exists:
            ctrl = (float(command.get("x1", 0.0)), float(command.get("y1", 0.0)))
            end = (float(command.get("x", 0.0)), float(command.get("y", 0.0)))
            draw_pen.qCurveTo(ctrl, end)
        elif op == "Z" and current_exists:
            draw_pen.closePath()
            current_exists = False

    if current_exists:
        # Trailing open subpath: end it without closing.
        draw_pen.endPath()

    try:
        glyph_obj = pen.glyph()
    except Exception:
        # Degenerate outlines can make the pen fail; caller treats None as "skip TTF".
        return None
    return glyph_obj
|
||||
|
||||
|
||||
def synthesise_fonts(
    data: Dict[str, object],
    otf_output: Path,
    ttf_output: Optional[Path],
    family_name: str,
    style_name: str,
    units_per_em: int,
    cu2qu_error: float,
) -> None:
    """Build an OTF (CFF) font — and optionally a TTF — from extracted glyph data.

    Args:
        data: Parsed JSON payload containing glyph outlines (and a fontMatrix).
        otf_output: Destination path for the CFF-flavoured font (always written).
        ttf_output: Destination for the TrueType font, or None to skip it.
        family_name / style_name: Naming-table values for the generated fonts.
        units_per_em: Design units per em for both fonts.
        cu2qu_error: Max cubic→quadratic conversion error for the TTF outlines.

    Raises:
        RuntimeError: if the input JSON contains no glyphs.
    """
    # Parsed but not otherwise used here; presumably validates the input
    # matrix as a side effect — TODO confirm before removing.
    _font_matrix = parse_font_matrix(data.get("fontMatrix"))
    glyphs = iterate_glyphs(data)

    results: List[GlyphBuildResult] = []
    # Track global vertical extents across all glyphs to derive ascent/descent.
    global_y_min = math.inf
    global_y_max = -math.inf

    # Fallback advance width for glyphs without one (and for .notdef).
    default_width = max(1, units_per_em // 2)

    for glyph in glyphs:
        width = resolve_width(glyph.width, default_width)
        charstring, bounds = build_cff_charstring(glyph, width)
        ttf_glyph = None
        if ttf_output is not None:
            ttf_glyph = build_ttf_glyph(glyph, cu2qu_error)
            if ttf_glyph is not None:
                ttf_glyph.width = width
        if bounds is not None:
            global_y_min = min(global_y_min, bounds[1])
            global_y_max = max(global_y_max, bounds[3])
        results.append(
            GlyphBuildResult(
                name=glyph.name,
                width=width,
                charstring=charstring,
                ttf_glyph=ttf_glyph,
                unicode=glyph.unicode,
                char_code=glyph.char_code,
                bounds=bounds,
            )
        )

    if not results:
        raise RuntimeError("No glyphs provided in input JSON")

    # Derive vertical metrics from observed bounds; fall back to an 80/20
    # split of the em when no finite bounds were collected.
    ascent = global_y_max if math.isfinite(global_y_max) else units_per_em * 0.8
    descent = global_y_min if math.isfinite(global_y_min) else -units_per_em * 0.2
    ascent = otRound(ascent)
    descent = otRound(descent)
    if ascent <= 0:
        ascent = otRound(units_per_em * 0.8)
    if descent >= 0:
        # hhea/OS2 descent must be negative (below the baseline).
        descent = -otRound(units_per_em * 0.2)

    glyph_order = [".notdef"] + [result.name for result in results]
    # Horizontal metrics: (advance width, left side bearing).
    horizontal_metrics = {result.name: (result.width, 0) for result in results}
    horizontal_metrics[".notdef"] = (default_width, 0)

    # Character map: prefer the real Unicode value, then the raw char code,
    # and finally assign codes from the Private Use Area starting at U+F000.
    # NOTE(review): glyphs sharing a code point overwrite earlier entries.
    cmap: Dict[int, str] = {}
    next_private = 0xF000
    for result in results:
        code_point = result.unicode
        if code_point is None:
            raw_code = result.char_code
            if raw_code is not None:
                code_point = raw_code
            else:
                code_point = next_private
                next_private += 1
        cmap[code_point] = result.name

    # Empty .notdef charstring with just an advance width.
    notdef_pen = T2CharStringPen(width=default_width, glyphSet=None)
    notdef_pen.endPath()
    charstrings = {result.name: result.charstring for result in results}
    charstrings[".notdef"] = notdef_pen.getCharString()

    name_table_entries = {
        "familyName": family_name,
        "styleName": style_name,
        "psName": f"{family_name.replace(' ', '')}-{style_name}",
        "fullName": f"{family_name} {style_name}",
    }

    # Build OTF (CFF) font.
    fb = FontBuilder(units_per_em, isTTF=False)
    fb.setupGlyphOrder(glyph_order)
    fb.setupCharacterMap(cmap)
    fb.setupHorizontalMetrics(horizontal_metrics)
    fb.setupHorizontalHeader(ascent=ascent, descent=descent)
    fb.setupOS2(
        sTypoAscender=ascent,
        sTypoDescender=descent,
        usWinAscent=max(ascent, 0),
        usWinDescent=abs(min(descent, 0)),
        sxHeight=otRound(units_per_em * 0.5),
        sCapHeight=otRound(units_per_em * 0.7),
    )
    fb.setupNameTable(name_table_entries)
    fb.setupPost()
    fb.setupCFF(
        name_table_entries["psName"],
        {
            "FullName": name_table_entries["fullName"],
            "FamilyName": name_table_entries["familyName"],
            "Weight": style_name,
        },
        charstrings,
        {"BlueValues": []},
    )
    fb.font.save(str(otf_output))

    if ttf_output is None:
        return

    # Build TTF font. Glyphs that failed quadratic conversion get a
    # degenerate placeholder contour so the glyf table stays complete.
    glyph_objects: Dict[str, object] = {}
    empty_pen = TTGlyphPen(None)
    empty_pen.moveTo((0, 0))
    empty_pen.lineTo((0, 0))
    empty_pen.closePath()
    empty_glyph = empty_pen.glyph()
    empty_glyph.width = default_width
    glyph_objects[".notdef"] = empty_glyph
    for result in results:
        glyph_obj = result.ttf_glyph
        if glyph_obj is None:
            temp_pen = TTGlyphPen(None)
            temp_pen.moveTo((0, 0))
            temp_pen.lineTo((0, 0))
            temp_pen.closePath()
            glyph_obj = temp_pen.glyph()
        glyph_obj.width = result.width
        glyph_objects[result.name] = glyph_obj

    ttf_fb = FontBuilder(units_per_em, isTTF=True)
    ttf_fb.setupGlyphOrder(glyph_order)
    ttf_fb.setupCharacterMap(cmap)
    ttf_fb.setupHorizontalMetrics(horizontal_metrics)
    ttf_fb.setupHorizontalHeader(ascent=ascent, descent=descent)
    ttf_fb.setupOS2(
        sTypoAscender=ascent,
        sTypoDescender=descent,
        usWinAscent=max(ascent, 0),
        usWinDescent=abs(min(descent, 0)),
        sxHeight=otRound(units_per_em * 0.5),
        sCapHeight=otRound(units_per_em * 0.7),
    )
    ttf_fb.setupNameTable(name_table_entries)
    ttf_fb.setupPost()
    ttf_fb.setupGlyf(glyph_objects)
    ttf_fb.setupDummyDSIG()
    ttf_fb.font.save(str(ttf_output))
|
||||
|
||||
|
||||
def main() -> None:
    """CLI entry point: load glyph JSON, synthesise fonts, report the result.

    On failure, partially written output files are removed and the process
    exits with status 1. Status messages go to stderr.
    """
    options = parse_args()
    source_path = Path(options.input).resolve()
    otf_path = Path(options.otf_output).resolve()
    ttf_path = Path(options.ttf_output).resolve() if options.ttf_output else None

    payload = load_json(source_path)

    def _discard_partial_outputs() -> None:
        # Best-effort cleanup so a failed run never leaves half-written fonts.
        if otf_path.exists():
            otf_path.unlink()
        if ttf_path and ttf_path.exists():
            ttf_path.unlink()

    try:
        synthesise_fonts(
            data=payload,
            otf_output=otf_path,
            ttf_output=ttf_path,
            family_name=options.family_name,
            style_name=options.style_name,
            units_per_em=options.units_per_em,
            cu2qu_error=options.cu2qu_error,
        )
    except Exception as exc:
        print(f"ERROR: Failed to generate fonts: {exc}", file=sys.stderr)
        _discard_partial_outputs()
        sys.exit(1)

    message = f"Generated font at {otf_path}"
    if ttf_path:
        message += f" and {ttf_path}"
    print(message, file=sys.stderr)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
243
scripts/update_type3_library.py
Normal file
243
scripts/update_type3_library.py
Normal file
@@ -0,0 +1,243 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Synchronize Type3 library index entries with captured signature dumps.
|
||||
|
||||
The script scans docs/type3/signatures/*.json (or a custom --signatures-dir),
|
||||
matches each font by alias/signature to app/core/src/main/resources/type3/library/index.json,
|
||||
and updates the entry's signatures / glyphCoverage / aliases / source fields.
|
||||
|
||||
Usage:
|
||||
scripts/update_type3_library.py --apply
|
||||
|
||||
Run without --apply to see a dry-run summary.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import sys
|
||||
from collections import defaultdict
|
||||
from pathlib import Path
|
||||
from typing import Dict, Iterable, List, Optional, Tuple
|
||||
|
||||
# Repository root, resolved from this script's location (scripts/ is one level down).
REPO_ROOT = Path(__file__).resolve().parents[1]
# Default directory of captured Type3 signature dumps.
DEFAULT_SIGNATURES = REPO_ROOT / "docs" / "type3" / "signatures"
# Default Type3 library index shipped in the application's resources.
DEFAULT_INDEX = (
    REPO_ROOT / "app" / "core" / "src" / "main" / "resources" / "type3" / "library" / "index.json"
)
|
||||
|
||||
|
||||
def normalize_alias(value: Optional[str]) -> Optional[str]:
    """Canonicalise a font alias for lookup.

    Strips surrounding whitespace, drops a PDF subset prefix such as
    ``ABCDEF+`` (anything up to and including the first interior ``+``),
    and lower-cases the remainder. Returns None for empty/None input.
    """
    if not value:
        return None
    cleaned = value.strip()
    # Keep the text after the first '+' only when something follows it;
    # a trailing '+' is left untouched.
    _prefix, plus_sign, remainder = cleaned.partition("+")
    if plus_sign and remainder:
        cleaned = remainder
    normalized = cleaned.lower()
    return normalized or None
|
||||
|
||||
|
||||
def load_json(path: Path):
    """Read *path* as UTF-8 text and return the decoded JSON object."""
    with path.open("r", encoding="utf-8") as stream:
        return json.loads(stream.read())
|
||||
|
||||
|
||||
def dump_json(path: Path, data) -> None:
    """Write *data* to *path* as 2-space-indented JSON with a trailing newline.

    Parent directories are created on demand.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    serialised = json.dumps(data, indent=2) + "\n"
    with path.open("w", encoding="utf-8") as stream:
        stream.write(serialised)
|
||||
|
||||
|
||||
def iter_signature_fonts(signature_file: Path):
    """Yield one normalised record per font listed in a signature dump.

    Each record carries the raw and canonical alias, base name, signature,
    glyph coverage (never None), the source PDF recorded in the dump, and
    the dump file itself.
    """
    dump = load_json(signature_file)
    source_pdf = dump.get("pdf")
    for raw_font in dump.get("fonts", []):
        raw_alias = raw_font.get("alias") or raw_font.get("baseName")
        canonical = normalize_alias(raw_alias) or normalize_alias(raw_font.get("baseName"))
        record = {
            "alias_raw": raw_alias,
            "alias": canonical,
            "baseName": raw_font.get("baseName"),
            "signature": raw_font.get("signature"),
            "glyphCoverage": raw_font.get("glyphCoverage") or [],
            "pdf": source_pdf,
            "file": signature_file,
        }
        yield record
|
||||
|
||||
|
||||
def make_alias_index(entries: List[Dict]) -> Tuple[Dict[str, Dict], Dict[str, Dict]]:
    """Build lookup tables over library entries.

    Returns (alias_index, signature_index): normalised alias -> entry and
    lower-cased signature -> entry. The first entry claiming a key wins.
    """
    by_alias: Dict[str, Dict] = {}
    by_signature: Dict[str, Dict] = {}
    for entry in entries:
        # Index the entry's declared aliases, then its label, in that order.
        candidates = list(entry.get("aliases", []) or [])
        candidates.append(entry.get("label"))
        for raw in candidates:
            key = normalize_alias(raw)
            if key and key not in by_alias:
                by_alias[key] = entry
        for signature in entry.get("signatures", []) or []:
            by_signature.setdefault(signature.lower(), entry)
    return by_alias, by_signature
|
||||
|
||||
|
||||
def ensure_list(container: Dict, key: str) -> List:
    """Return container[key] as a list.

    If the key is missing or holds a non-list value, a fresh empty list is
    installed under the key (replacing any non-list value) and returned.
    """
    existing = container.get(key)
    if not isinstance(existing, list):
        existing = []
        container[key] = existing
    return existing
|
||||
|
||||
|
||||
def merge_sorted_unique(values: Iterable[int]) -> List[int]:
    """Deduplicate and sort integer code points, silently dropping non-ints.

    Note: bools pass the isinstance check and are folded in as 0/1,
    matching the original behaviour.
    """
    unique = set()
    for candidate in values:
        if isinstance(candidate, int):
            unique.add(int(candidate))
    return sorted(unique)
|
||||
|
||||
|
||||
def normalize_source_path(pdf_path: Optional[str]) -> Optional[str]:
    """Return *pdf_path* relative to the repository root, with forward slashes.

    Paths outside the repository are returned as given (still slash-normalised).
    None/empty input yields None.
    """
    if not pdf_path:
        return None
    candidate = Path(pdf_path)
    try:
        resolved = candidate.relative_to(REPO_ROOT)
    except Exception:
        # Path lies outside the repository: keep it as supplied.
        resolved = candidate
    return str(resolved).replace("\\", "/")
|
||||
|
||||
|
||||
def update_library(
    signatures_dir: Path, index_path: Path, apply_changes: bool
) -> Tuple[int, int, List[Tuple[str, Path]]]:
    """Merge captured signature dumps into the Type3 library index.

    Scans every ``*.json`` under *signatures_dir*, matches each font to a
    library entry (by signature first, then by normalised alias), and folds
    its signature / alias / glyph coverage / source-PDF data into the entry.

    Args:
        signatures_dir: Directory of signature dump files.
        index_path: Path to the library index.json.
        apply_changes: When True, write the updated index back to disk;
            otherwise this is a dry run.

    Returns:
        (modifications, updated_entry_count, unmatched) where *modifications*
        counts per-font changes, *updated_entry_count* is the number of
        distinct entries touched, and *unmatched* lists (font name, dump file)
        pairs with no library entry yet.
    """
    entries = load_json(index_path)
    alias_index, signature_index = make_alias_index(entries)

    modifications = 0
    updated_entries = set()
    unmatched: List[Tuple[str, Path]] = []

    # Sorted for deterministic processing order across runs.
    signature_files = sorted(signatures_dir.glob("*.json"))
    if not signature_files:
        print(f"No signature JSON files found under {signatures_dir}", file=sys.stderr)
        return 0, 0, unmatched

    for sig_file in signature_files:
        for font in iter_signature_fonts(sig_file):
            signature = font["signature"]
            norm_signature = signature.lower() if signature else None
            alias = font["alias"]

            # Signature match takes precedence over alias match.
            entry = None
            if norm_signature and norm_signature in signature_index:
                entry = signature_index[norm_signature]
            elif alias and alias in alias_index:
                entry = alias_index[alias]

            if entry is None:
                unmatched.append((font.get("baseName") or font.get("alias_raw") or "unknown", sig_file))
                continue

            entry_modified = False

            # Signatures: append new ones and register them for later fonts
            # in this same run.
            if signature:
                signature_list = ensure_list(entry, "signatures")
                if signature not in signature_list:
                    signature_list.append(signature)
                    entry_modified = True
                    signature_index[signature.lower()] = entry

            # Aliases: record the raw alias and index its normalised form.
            alias_raw = font.get("alias_raw")
            if alias_raw:
                aliases = ensure_list(entry, "aliases")
                if alias_raw not in aliases:
                    aliases.append(alias_raw)
                    entry_modified = True
                normalized = normalize_alias(alias_raw)
                if normalized:
                    alias_index.setdefault(normalized, entry)

            # Glyph coverage: union with the existing coverage, kept sorted.
            coverage = font.get("glyphCoverage") or []
            if coverage:
                existing = set(entry.get("glyphCoverage", []))
                merged = merge_sorted_unique(list(existing) + coverage)
                if merged != entry.get("glyphCoverage"):
                    entry["glyphCoverage"] = merged
                    entry_modified = True

            # Source PDF: only fill in when the entry has no source yet.
            pdf_source = normalize_source_path(font.get("pdf"))
            if pdf_source and not entry.get("source"):
                entry["source"] = pdf_source
                entry_modified = True

            if entry_modified:
                modifications += 1
                # NOTE(review): entries without an "id" all collapse onto
                # "<unknown>" in the distinct-entry count — confirm intended.
                updated_entries.add(entry.get("id", "<unknown>"))

    if apply_changes and modifications > 0:
        dump_json(index_path, entries)

    return modifications, len(updated_entries), unmatched
|
||||
|
||||
|
||||
def parse_args() -> argparse.Namespace:
    """Define and parse this script's command-line interface."""
    cli = argparse.ArgumentParser(description="Update Type3 library index using signature dumps.")
    cli.add_argument(
        "--signatures-dir",
        type=Path,
        default=DEFAULT_SIGNATURES,
        help=f"Directory containing signature JSON files (default: {DEFAULT_SIGNATURES})",
    )
    cli.add_argument(
        "--index",
        type=Path,
        default=DEFAULT_INDEX,
        help=f"Path to type3/library/index.json (default: {DEFAULT_INDEX})",
    )
    cli.add_argument(
        "--apply",
        action="store_true",
        help="Write changes back to the index file. Without this flag the script runs in dry-run mode.",
    )
    return cli.parse_args()
|
||||
|
||||
|
||||
def main() -> None:
    """CLI entry point: resolve paths, run the library sync, report results.

    Exits with status 2 when the signatures directory or index file is
    missing. Summary output goes to stdout; errors to stderr.
    """
    options = parse_args()

    def _anchor(path: Path) -> Path:
        # Relative CLI paths are interpreted against the repository root.
        return path if path.is_absolute() else (REPO_ROOT / path)

    signatures_dir = _anchor(options.signatures_dir)
    index_path = _anchor(options.index)

    if not signatures_dir.exists():
        print(f"Signature directory not found: {signatures_dir}", file=sys.stderr)
        sys.exit(2)
    if not index_path.exists():
        print(f"Index file not found: {index_path}", file=sys.stderr)
        sys.exit(2)

    modifications, updated_entries, unmatched = update_library(
        signatures_dir, index_path, apply_changes=options.apply
    )

    mode = "APPLIED" if options.apply else "DRY-RUN"
    print(
        f"[{mode}] Processed signatures under {signatures_dir}. "
        f"Updated entries: {updated_entries}, individual modifications: {modifications}."
    )

    if unmatched:
        print("\nUnmatched fonts (no library entry yet):")
        for alias, sig_file in unmatched:
            print(f"  - {alias} (from {sig_file})")
        print("Add these fonts to index.json with the proper payload before rerunning.")

    if modifications == 0:
        print("No changes detected; index.json already matches captured signatures.")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
Reference in New Issue
Block a user