#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Compress PDF — output DPI and JPEG quality are chosen automatically
based on the detected resolution of the source PDF.

Every page is rasterized and re-embedded as a single JPEG image, so text
and vector content in the source become part of the page image.

Usage:
    python compress_pdf.py <input.pdf> [output.pdf]
    python compress_pdf.py        (processes all PDFs in the folder containing this script)

Output filename: original_name (139 kB).pdf
"""

import sys
from contextlib import closing
from pathlib import Path
from typing import Optional

import fitz

# ==============================
# COMPRESSION TABLE
# Detected source DPI -> (output DPI, JPEG quality)
# Rows are evaluated top-to-bottom; first match wins.
# ==============================
#
#   src_dpi_min  src_dpi_max  out_dpi  jpeg_quality
COMPRESSION_TABLE = [
    (0, 99, 72, 60),       # very low res — already small, compress hard
    (100, 149, 100, 70),   # low res
    (150, 249, 150, 80),   # standard scan (our tested sweet spot)
    (250, 399, 150, 80),   # good scan — downsample to 150 is fine
    (400, 599, 200, 85),   # high res scan
    (600, 9999, 150, 80),  # very high res / professional scan
]

# DPI assumed when the source gives nothing to measure (no pages or no
# raster images on the first page).
DEFAULT_SOURCE_DPI = 150


def detect_source_dpi(src: fitz.Document) -> int:
    """Estimate source DPI from the largest image on the first page.

    The estimate divides the image's pixel size by the page size in inches,
    i.e. it treats the largest image as if it covered the whole page.

    Args:
        src: An open PyMuPDF document.

    Returns:
        The rounded mean of the horizontal and vertical DPI estimates, or
        ``DEFAULT_SOURCE_DPI`` when the document has no pages or the first
        page has no raster images.
    """
    if len(src) == 0:
        return DEFAULT_SOURCE_DPI  # nothing to inspect

    page = src[0]
    images = page.get_images(full=True)
    if not images:
        return DEFAULT_SOURCE_DPI  # no raster images — use default

    # Entries of get_images() carry width at index 2 and height at index 3.
    best = max(images, key=lambda img: img[2] * img[3])
    img_w_px, img_h_px = best[2], best[3]

    # Page size in inches (1 point = 1/72 inch)
    page_w_in = page.rect.width / 72.0
    page_h_in = page.rect.height / 72.0

    dpi_x = img_w_px / page_w_in if page_w_in else 0
    dpi_y = img_h_px / page_h_in if page_h_in else 0
    return round((dpi_x + dpi_y) / 2)


def pick_settings(source_dpi: int) -> tuple[int, int]:
    """Return ``(output DPI, JPEG quality)`` for a detected source DPI.

    The first row of ``COMPRESSION_TABLE`` whose inclusive range contains
    *source_dpi* wins; values outside every range use the last row.
    """
    for min_dpi, max_dpi, out_dpi, quality in COMPRESSION_TABLE:
        if min_dpi <= source_dpi <= max_dpi:
            return out_dpi, quality
    # fallback to last row
    return COMPRESSION_TABLE[-1][2], COMPRESSION_TABLE[-1][3]


def _append_rasterized_page(out_doc, page, matrix, jpeg_quality: int) -> None:
    """Render *page* to a JPEG and append it to *out_doc* at the same page size."""
    pix = page.get_pixmap(matrix=matrix, colorspace=fitz.csRGB)
    img_bytes = pix.tobytes("jpeg", jpg_quality=jpeg_quality)

    # Wrap the JPEG in a one-page PDF so it can be placed with show_pdf_page().
    with closing(fitz.open("jpeg", img_bytes)) as jpeg_doc:
        pdf_bytes = jpeg_doc.convert_to_pdf()

    with closing(fitz.open("pdf", pdf_bytes)) as img_doc:
        rect = page.rect
        new_page = out_doc.new_page(width=rect.width, height=rect.height)
        new_page.show_pdf_page(new_page.rect, img_doc, 0)


def _write_compressed(input_path: Path, tmp_path: Path) -> None:
    """Rasterize every page of *input_path* and save the result to *tmp_path*.

    Both documents are closed on every exit path. If saving fails, the
    partially written *tmp_path* is removed before the error propagates.
    """
    with closing(fitz.open()) as out_doc:
        with closing(fitz.open(input_path)) as src:
            source_dpi = detect_source_dpi(src)
            out_dpi, jpeg_quality = pick_settings(source_dpi)
            print(f" zdroj ~{source_dpi} DPI -> komprese {out_dpi} DPI / JPEG q{jpeg_quality}")

            zoom = out_dpi / 72.0  # PDF user space is 72 units per inch
            matrix = fitz.Matrix(zoom, zoom)
            for page in src:
                _append_rasterized_page(out_doc, page, matrix, jpeg_quality)

        try:
            out_doc.save(tmp_path, deflate=True, garbage=4)
        except BaseException:
            tmp_path.unlink(missing_ok=True)
            raise


def compress(input_path: Path, output_path: Optional[Path] = None):
    """Compress *input_path* into a rasterized, JPEG-based PDF.

    The result is first written to ``<stem>.tmp.pdf`` next to the input and
    then moved onto the final name, replacing any existing file there.

    Args:
        input_path: PDF to compress.
        output_path: Destination file. When ``None``, the output is placed
            next to the input as ``"<stem> (<size> kB).pdf"``.

    Raises:
        Whatever PyMuPDF or the filesystem raises while opening, rendering,
        saving or moving the files; the scratch file is removed if saving
        or the final move fails.
    """
    tmp = input_path.with_suffix(".tmp.pdf")
    _write_compressed(input_path, tmp)

    try:
        size_kb = round(tmp.stat().st_size / 1024)
        # Read the original size BEFORE moving the result into place, so the
        # report stays correct even when output_path is the input file itself.
        orig_kb = round(input_path.stat().st_size / 1024)

        if output_path is None:
            output_path = input_path.parent / f"{input_path.stem} ({size_kb} kB).pdf"

        # Path.replace overwrites an existing destination in one step.
        tmp.replace(output_path)
    except BaseException:
        tmp.unlink(missing_ok=True)
        raise

    # Inputs under 512 bytes round to 0 kB; avoid dividing by zero.
    saving = (1 - size_kb / orig_kb) * 100 if orig_kb else 0.0
    print(f" {input_path.name} -> {output_path.name} (bylo {orig_kb} kB, uspora {saving:.0f}%)")


def main() -> None:
    """Command-line entry point; see the module docstring for usage."""
    if len(sys.argv) >= 2:
        inp = Path(sys.argv[1])
        out = Path(sys.argv[2]) if len(sys.argv) >= 3 else None
        compress(inp, out)
        return

    script = Path(__file__)
    # Skip files that already look like outputs ("name (123 kB).pdf") and any
    # PDF sharing this script's stem.
    pdfs = [
        p
        for p in script.parent.glob("*.pdf")
        if not p.name.endswith(").pdf") and p.stem != script.stem
    ]
    if not pdfs:
        print("Zadne PDF k zpracovani.")
    for pdf in pdfs:
        compress(pdf)


if __name__ == "__main__":
    main()