112 lines
3.7 KiB
Python
112 lines
3.7 KiB
Python
#!/usr/bin/env python3
|
|
# -*- coding: utf-8 -*-
|
|
"""
Compress PDF — output DPI and JPEG quality are chosen automatically
based on the detected resolution of the source PDF.

Usage:  python compress_pdf.py <input.pdf> [output.pdf]
        python compress_pdf.py    (processes all PDFs in the folder containing this script)

Default output filename: original_name (<size> kB).pdf, e.g. "original_name (139 kB).pdf"
"""
|
|
|
|
import sys
|
|
import fitz
|
|
from pathlib import Path
|
|
|
|
# ------------------------------------------------------------------
# COMPRESSION TABLE
#
# Maps the detected source DPI to the settings used for the output.
# Every row is
#     (src_dpi_min, src_dpi_max, out_dpi, jpeg_quality)
# with both DPI bounds inclusive. Rows are checked from top to bottom
# and the first row that matches is used.
# ------------------------------------------------------------------
COMPRESSION_TABLE = [
    # very low res — already small, compress hard
    (0, 99, 72, 60),
    # low res
    (100, 149, 100, 70),
    # standard scan (our tested sweet spot)
    (150, 249, 150, 80),
    # good scan — downsampling to 150 is fine
    (250, 399, 150, 80),
    # high res scan
    (400, 599, 200, 85),
    # very high res / professional scan
    (600, 9999, 150, 80),
]
|
|
|
|
|
|
def detect_source_dpi(src: fitz.Document) -> int:
    """Estimate the resolution of *src* from the first page's largest image.

    The image with the most pixels on page one is measured against the
    page size (in inches), i.e. it is treated as if it filled the whole
    page. The horizontal and vertical estimates are averaged and rounded.

    Returns 150 when the first page contains no raster images.
    """
    first_page = src[0]
    embedded = first_page.get_images(full=True)
    if not embedded:
        # Nothing to measure — fall back to the default resolution.
        return 150

    # Entries 2 and 3 of an image record are used as its pixel width and
    # height. Keep the first record with the biggest width * height.
    largest = embedded[0]
    for record in embedded[1:]:
        if record[2] * record[3] > largest[2] * largest[3]:
            largest = record
    px_wide, px_high = largest[2], largest[3]

    # PDF page dimensions are in points; 72 points make one inch.
    bounds = first_page.rect
    inches_wide = bounds.width / 72.0
    inches_high = bounds.height / 72.0

    # A zero-sized page dimension contributes 0 instead of dividing by zero.
    if inches_wide:
        horizontal_dpi = px_wide / inches_wide
    else:
        horizontal_dpi = 0
    if inches_high:
        vertical_dpi = px_high / inches_high
    else:
        vertical_dpi = 0

    return round((horizontal_dpi + vertical_dpi) / 2)
|
|
|
|
|
|
def pick_settings(source_dpi: int, table=None) -> tuple[int, int]:
    """Return the ``(output DPI, JPEG quality)`` pair for a source DPI.

    Rows are scanned top to bottom; the first row whose inclusive
    ``[min_dpi, max_dpi]`` range contains ``source_dpi`` wins. When no
    row matches, the settings of the last row are returned.

    Args:
        source_dpi: Estimated resolution of the source document.
        table: Optional sequence of
            ``(min_dpi, max_dpi, out_dpi, jpeg_quality)`` rows. When
            omitted, the module-level ``COMPRESSION_TABLE`` is used.

    Returns:
        Tuple ``(out_dpi, jpeg_quality)``.
    """
    rows = COMPRESSION_TABLE if table is None else table
    for min_dpi, max_dpi, out_dpi, quality in rows:
        if min_dpi <= source_dpi <= max_dpi:
            return out_dpi, quality
    # No range contained the value: fall back to the last row.
    return rows[-1][2], rows[-1][3]
|
|
|
|
|
|
def compress(input_path: Path, output_path: Path = None):
    """Write a compressed copy of a PDF by re-rendering its pages as JPEGs.

    The source resolution is estimated with ``detect_source_dpi`` and the
    output DPI / JPEG quality are looked up with ``pick_settings``. Every
    page is rendered to an RGB pixmap at the output DPI, encoded as JPEG
    and placed on a new page of the same size, so each output page
    consists of a single raster image.

    Args:
        input_path: PDF file to compress.
        output_path: Destination file. When ``None``, the result is
            written next to the input as ``"<stem> (<size> kB).pdf"``.
            An existing destination is overwritten.

    The result is first saved to ``<input stem>.tmp.pdf`` beside the input
    and then moved onto the destination. Progress is printed to stdout.
    """
    tmp = input_path.with_suffix(".tmp.pdf")

    src = fitz.open(input_path)
    try:
        # Measure the original before anything is written: if output_path
        # is the input file itself, its old size is gone after the move.
        orig_kb = round(input_path.stat().st_size / 1024)

        source_dpi = detect_source_dpi(src)
        out_dpi, jpeg_quality = pick_settings(source_dpi)

        print(f" zdroj ~{source_dpi} DPI -> komprese {out_dpi} DPI / JPEG q{jpeg_quality}")

        # Pages are measured in points (72 per inch); scale to the target DPI.
        zoom = out_dpi / 72.0
        mat = fitz.Matrix(zoom, zoom)

        out_doc = fitz.open()
        try:
            for page in src:
                pix = page.get_pixmap(matrix=mat, colorspace=fitz.csRGB)
                img_bytes = pix.tobytes("jpeg", jpg_quality=jpeg_quality)

                # Wrap the JPEG in a one-page PDF; the intermediate
                # documents are closed as soon as they are no longer needed.
                jpeg_doc = fitz.open("jpeg", img_bytes)
                try:
                    pdf_bytes = jpeg_doc.convert_to_pdf()
                finally:
                    jpeg_doc.close()

                img_doc = fitz.open("pdf", pdf_bytes)
                try:
                    rect = page.rect
                    new_page = out_doc.new_page(width=rect.width, height=rect.height)
                    new_page.show_pdf_page(new_page.rect, img_doc, 0)
                finally:
                    img_doc.close()

            try:
                out_doc.save(tmp, deflate=True, garbage=4)
            except BaseException:
                # Do not leave a partially written temp file behind.
                tmp.unlink(missing_ok=True)
                raise
        finally:
            out_doc.close()
    finally:
        src.close()

    size_kb = round(tmp.stat().st_size / 1024)

    if output_path is None:
        output_path = input_path.parent / f"{input_path.stem} ({size_kb} kB).pdf"

    # Path.replace overwrites an existing destination in one step.
    tmp.replace(output_path)

    # Guard against originals that round down to 0 kB.
    saving = (1 - size_kb / orig_kb) * 100 if orig_kb else 0.0
    print(f" {input_path.name} -> {output_path.name} (bylo {orig_kb} kB, uspora {saving:.0f}%)")
|
|
|
|
|
|
if __name__ == "__main__":
    cli_args = sys.argv[1:]
    if cli_args:
        # Single-file mode: <input.pdf> [output.pdf]
        target = Path(cli_args[0])
        destination = Path(cli_args[1]) if len(cli_args) > 1 else None
        compress(target, destination)
    else:
        # Batch mode: every PDF next to this script, skipping files whose
        # name ends with ").pdf" (the default output naming) and any PDF
        # sharing the script's own stem.
        script = Path(__file__)
        candidates = []
        for entry in script.parent.glob("*.pdf"):
            if entry.name.endswith(").pdf"):
                continue
            if entry.stem == script.stem:
                continue
            candidates.append(entry)

        if not candidates:
            print("Zadne PDF k zpracovani.")
        for entry in candidates:
            compress(entry)
|