Files
Vladimir Buzalka daad4adeab notebookvb
2026-04-29 06:55:23 +02:00

112 lines
3.7 KiB
Python

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Compress PDF — output DPI and JPEG quality are chosen automatically
based on the detected resolution of the source PDF.
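Usage:
    python compress_pdf.py <input.pdf> [output.pdf]
    python compress_pdf.py
        (no arguments: processes every PDF in the folder containing this
        script, skipping files whose name already ends in ").pdf")

Default output filename: <original name> (<size> kB).pdf, e.g. scan (139 kB).pdf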
"""
import sys
import fitz
from pathlib import Path
# ------------------------------------------------------------------
# COMPRESSION TABLE
#
# Maps the detected source DPI to the settings used for the output.
# Row layout:
#
#     (src_dpi_min, src_dpi_max, out_dpi, jpeg_quality)
#
# Rows are evaluated top-to-bottom; the first matching row wins.
# ------------------------------------------------------------------
COMPRESSION_TABLE = [
    # very low res — already small, compress hard
    (0, 99, 72, 60),
    # low res
    (100, 149, 100, 70),
    # standard scan (our tested sweet spot)
    (150, 249, 150, 80),
    # good scan — downsampling to 150 is fine
    (250, 399, 150, 80),
    # high res scan
    (400, 599, 200, 85),
    # very high res / professional scan
    # NOTE(review): this row outputs a lower DPI and quality than the
    # 400-599 row above — confirm that this is intentional.
    (600, 9999, 150, 80),
]
def detect_source_dpi(src: fitz.Document) -> int:
    """Estimate the source resolution from the first page of *src*.

    The largest raster image on the first page (by pixel area) is measured
    against the page size, i.e. the image is treated as if it spanned the
    whole page. The horizontal and vertical estimates are averaged and
    rounded to the nearest integer.

    Returns 150 when the first page holds no raster images.
    """
    first_page = src[0]
    page_images = first_page.get_images(full=True)
    if not page_images:
        # Nothing to measure — fall back to the default resolution.
        return 150

    # Entries carry the pixel width at index 2 and the pixel height at
    # index 3. Keep the first image with the greatest pixel area.
    largest = page_images[0]
    for candidate in page_images[1:]:
        if candidate[2] * candidate[3] > largest[2] * largest[3]:
            largest = candidate
    pixel_width, pixel_height = largest[2], largest[3]

    # Page dimensions are in points; 72 points make one inch.
    width_inches = first_page.rect.width / 72.0
    height_inches = first_page.rect.height / 72.0

    # A zero-sized page contributes 0 instead of dividing by zero.
    if width_inches:
        horizontal_dpi = pixel_width / width_inches
    else:
        horizontal_dpi = 0
    if height_inches:
        vertical_dpi = pixel_height / height_inches
    else:
        vertical_dpi = 0

    return round((horizontal_dpi + vertical_dpi) / 2)
def pick_settings(source_dpi: int) -> tuple[int, int]:
    """Return ``(out_dpi, jpeg_quality)`` for the given source DPI.

    The first row of ``COMPRESSION_TABLE`` whose inclusive
    ``[src_dpi_min, src_dpi_max]`` range contains *source_dpi* is used.
    When no row matches, the settings of the last row are returned.
    """
    last_row = COMPRESSION_TABLE[-1]
    fallback = (last_row[2], last_row[3])
    matching = (
        (row[2], row[3])
        for row in COMPRESSION_TABLE
        if row[0] <= source_dpi <= row[1]
    )
    return next(matching, fallback)
def compress(input_path: Path, output_path: "Path | None" = None):
    """Rasterise every page of a PDF to JPEG and write a smaller PDF.

    The output DPI and JPEG quality are chosen from the estimated
    resolution of the source (``detect_source_dpi`` / ``pick_settings``).
    Each page is rendered to an RGB pixmap, encoded as JPEG and placed on
    a new page with the same dimensions as the original page.

    Args:
        input_path: PDF file to compress.
        output_path: Destination file. When omitted, the result is written
            next to the input as ``"<stem> (<size> kB).pdf"``. An existing
            file at the destination is overwritten.

    The result is first saved to ``<input stem>.tmp.pdf`` beside the input
    and then moved onto the destination. Progress is printed to stdout.
    """
    out_doc = fitz.open()
    try:
        # The context manager closes the source even if rendering fails.
        with fitz.open(input_path) as src:
            # Measure the original now, so the reported saving stays
            # correct even when the output replaces the input file.
            orig_kb = round(input_path.stat().st_size / 1024)

            source_dpi = detect_source_dpi(src)
            out_dpi, jpeg_quality = pick_settings(source_dpi)
            print(f" zdroj ~{source_dpi} DPI -> komprese {out_dpi} DPI / JPEG q{jpeg_quality}")

            # Page coordinates are in points (1/72 inch).
            zoom = out_dpi / 72.0
            mat = fitz.Matrix(zoom, zoom)

            for page in src:
                pix = page.get_pixmap(matrix=mat, colorspace=fitz.csRGB)
                img_bytes = pix.tobytes("jpeg", jpg_quality=jpeg_quality)

                # Wrap the JPEG in a one-page PDF; close the intermediate
                # documents instead of leaking one pair per page.
                with fitz.open("jpeg", img_bytes) as jpeg_doc:
                    pdf_bytes = jpeg_doc.convert_to_pdf()
                with fitz.open("pdf", pdf_bytes) as img_doc:
                    rect = page.rect
                    new_page = out_doc.new_page(width=rect.width, height=rect.height)
                    new_page.show_pdf_page(new_page.rect, img_doc, 0)

        tmp = input_path.with_suffix(".tmp.pdf")
        out_doc.save(tmp, deflate=True, garbage=4)
    finally:
        out_doc.close()

    size_kb = round(tmp.stat().st_size / 1024)
    if output_path is None:
        output_path = input_path.parent / f"{input_path.stem} ({size_kb} kB).pdf"

    # Path.replace overwrites an existing destination in a single step, so
    # there is no window in which the destination has been deleted but the
    # new file is not yet in place.
    tmp.replace(output_path)

    # Inputs smaller than 512 bytes round to 0 kB; avoid dividing by zero.
    saving = (1 - size_kb / orig_kb) * 100 if orig_kb else 0.0
    print(f" {input_path.name} -> {output_path.name} (bylo {orig_kb} kB, uspora {saving:.0f}%)")
if __name__ == "__main__":
    # Command-line entry point.
    #   with arguments:    compress <input.pdf>, optionally to [output.pdf]
    #   without arguments: compress every PDF next to this script
    cli_args = sys.argv[1:]
    if cli_args:
        source_pdf = Path(cli_args[0])
        target_pdf = Path(cli_args[1]) if len(cli_args) > 1 else None
        compress(source_pdf, target_pdf)
    else:
        script_path = Path(__file__)
        # Collect the work list before compressing anything, because each
        # run adds a new PDF to the same folder. Files ending in ").pdf"
        # (the generated "name (NNN kB).pdf" pattern) and a PDF sharing
        # the script's own stem are left out.
        pending = [
            candidate
            for candidate in script_path.parent.glob("*.pdf")
            if not candidate.name.endswith(").pdf")
            and candidate.stem != script_path.stem
        ]
        if not pending:
            print("Zadne PDF k zpracovani.")
        for pdf_file in pending:
            compress(pdf_file)