# medevio/50 Různé testy/MinimizeOptimizePDF/compress_pdf.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Compress PDF — output DPI and JPEG quality are chosen automatically
based on the detected resolution of the source PDF.

Usage: python compress_pdf.py <input.pdf> [output.pdf]
       python compress_pdf.py              (processes all PDFs in the script's folder)
Output filename: original_name (139 kB).pdf
"""

import sys
import fitz
from pathlib import Path

# ==============================
# COMPRESSION TABLE
# Detected source DPI -> (output DPI, JPEG quality)
# Rows are evaluated top-to-bottom; first match wins.
# ==============================
#
#  src_dpi_min  src_dpi_max  out_dpi  jpeg_quality
COMPRESSION_TABLE = [
    # Each row: (src_dpi_min, src_dpi_max, out_dpi, jpeg_quality); both DPI bounds inclusive.
    (0, 99, 72, 60),  # very low resolution: already small, so compress hard
    (100, 149, 100, 70),  # low resolution
    (150, 249, 150, 80),  # standard scan (the tested sweet spot)
    (250, 399, 150, 80),  # good scan: downsampling to 150 DPI is fine
    (400, 599, 200, 85),  # high-resolution scan
    # NOTE(review): output DPI drops back to 150 here, below the 200 used
    # for the 400-599 range above; confirm this is intended.
    (600, 9999, 150, 80),  # very high resolution / professional scan
]


def detect_source_dpi(src: fitz.Document) -> int:
    """Estimate the source resolution from the first page's biggest image.

    The pixel dimensions of the largest image (by pixel area) listed on the
    first page are divided by the page size in inches, and the horizontal and
    vertical results are averaged. The estimate relates the image to the
    whole page size, not to the area the image actually occupies on the page.

    Returns 150 when the first page lists no images.
    """
    first_page = src[0]
    image_entries = first_page.get_images(full=True)
    if not image_entries:
        # Nothing raster to measure, so fall back to the default resolution.
        return 150

    def pixel_area(entry):
        # Entries carry the pixel width at index 2 and the height at index 3.
        return entry[2] * entry[3]

    largest = max(image_entries, key=pixel_area)
    px_wide, px_high = largest[2], largest[3]

    # PDF units are points; 72 points make one inch.
    inches_wide = first_page.rect.width / 72.0
    inches_high = first_page.rect.height / 72.0

    # A zero-sized page dimension contributes 0 instead of dividing by zero.
    if inches_wide:
        horizontal_dpi = px_wide / inches_wide
    else:
        horizontal_dpi = 0
    if inches_high:
        vertical_dpi = px_high / inches_high
    else:
        vertical_dpi = 0

    return round((horizontal_dpi + vertical_dpi) / 2)


def pick_settings(source_dpi: int) -> tuple[int, int]:
    """Return (output DPI, JPEG quality) for the given source DPI.

    COMPRESSION_TABLE is scanned in order and the first row whose inclusive
    [min, max] range contains source_dpi wins. If no row matches, the
    settings of the last row are used.
    """
    last_row = COMPRESSION_TABLE[-1]
    chosen = next(
        (row for row in COMPRESSION_TABLE if row[0] <= source_dpi <= row[1]),
        last_row,
    )
    return chosen[2], chosen[3]


def compress(input_path: Path, output_path: Path = None):
    """Rasterize every page of a PDF to JPEG and save the result as a new PDF.

    The output DPI and JPEG quality are chosen from the detected source
    resolution via detect_source_dpi() and pick_settings(). Each page is
    rendered to an RGB pixmap, encoded as JPEG and placed on a new page with
    the same size as the original page.

    Args:
        input_path: PDF file to compress.
        output_path: Destination file. When omitted, the result is written
            next to the input as "<stem> (<size> kB).pdf". An existing file
            at the destination is overwritten.

    Raises:
        Whatever fitz or the filesystem raises; a partially written temporary
        file is removed before a save error propagates.
    """
    # Measure the original up front: if output_path points at the input file
    # it is about to be overwritten and could no longer be measured.
    orig_bytes = input_path.stat().st_size

    # The result is first written next to the input and moved into place
    # only after it has been saved completely.
    tmp = input_path.with_suffix(".tmp.pdf")

    # Context managers close both documents even when rendering fails.
    with fitz.open(input_path) as src, fitz.open() as out_doc:
        source_dpi = detect_source_dpi(src)
        out_dpi, jpeg_quality = pick_settings(source_dpi)

        print(f"  zdroj ~{source_dpi} DPI  ->  komprese {out_dpi} DPI / JPEG q{jpeg_quality}")

        # Page coordinates are in points (72 per inch), so this zoom factor
        # renders at out_dpi pixels per inch.
        zoom = out_dpi / 72.0
        mat = fitz.Matrix(zoom, zoom)

        for page in src:
            pix = page.get_pixmap(matrix=mat, colorspace=fitz.csRGB)
            img_bytes = pix.tobytes("jpeg", jpg_quality=jpeg_quality)

            # Wrap the JPEG in a one-page PDF so it can be placed on the
            # output page; both helper documents are closed right away.
            with fitz.open("jpeg", img_bytes) as jpeg_doc:
                pdf_bytes = jpeg_doc.convert_to_pdf()
            with fitz.open("pdf", pdf_bytes) as img_doc:
                rect = page.rect
                new_page = out_doc.new_page(width=rect.width, height=rect.height)
                new_page.show_pdf_page(new_page.rect, img_doc, 0)

        try:
            out_doc.save(tmp, deflate=True, garbage=4)
        except Exception:
            # Do not leave a half-written temporary file behind.
            tmp.unlink(missing_ok=True)
            raise

    new_bytes = tmp.stat().st_size
    size_kb = round(new_bytes / 1024)

    if output_path is None:
        output_path = input_path.parent / f"{input_path.stem} ({size_kb} kB).pdf"

    # replace() overwrites an existing destination in a single step, so no
    # separate unlink is needed.
    tmp.replace(output_path)

    orig_kb = round(orig_bytes / 1024)
    # Compute the ratio from bytes: the rounded kB value is 0 for inputs
    # smaller than 512 bytes and would cause a division by zero.
    saving = (1 - new_bytes / orig_bytes) * 100 if orig_bytes else 0.0
    print(f"  {input_path.name}  ->  {output_path.name}  (bylo {orig_kb} kB, uspora {saving:.0f}%)")


if __name__ == "__main__":
    cli_args = sys.argv[1:]
    if cli_args:
        # Explicit mode: first argument is the input, optional second the output.
        source = Path(cli_args[0])
        target = Path(cli_args[1]) if len(cli_args) > 1 else None
        compress(source, target)
    else:
        # Batch mode: process the PDFs that sit next to this script.
        script = Path(__file__)
        candidates = []
        for entry in script.parent.glob("*.pdf"):
            # Names ending in ").pdf" match the "<stem> (<size> kB).pdf"
            # pattern that compress() uses for its default output name.
            if entry.name.endswith(").pdf"):
                continue
            # Skip a PDF that shares this script's base name.
            if entry.stem == script.stem:
                continue
            candidates.append(entry)

        if not candidates:
            print("Zadne PDF k zpracovani.")
        # The list is complete before compression starts, so files created
        # during the run are not picked up.
        for entry in candidates:
            compress(entry)