#!/usr/bin/env python3 r""" Verifikace fotek — pro každou položku v `photos` zkusí soubor otevřít a dekódovat, zapíše nalezené chyby do tabulky `photo_errors` a označí `photos.verified_at = NOW()`. Cesty v DB jsou v nativním Tower1 formátu (/mnt/user/...) a skript je přemapuje podle prostředí (Windows UNC / Tower1 native / tower remote mount) — stejně jako collect_pictures_*.py a generate_thumbnails.py. Kategorie chyb (sloupec photo_errors.severity): critical — soubor je nepoužitelný (file_missing, zero_size, invalid_format, decode_failed, read_error) warning — soubor lze použít, ale má vadu (truncated, exif_parse_error, dimension_mismatch, size_suspicious) info — jen poznámka (missing_exif, missing_datetime_original, missing_gps) Usage: python verify_photos.py [--batch-size 500] [--dry-run] Konfigurace přes proměnné na začátku souboru (MAX_PHOTOS, RESET, ...). """ import argparse import logging import os import platform import socket import sys import time from pathlib import Path import psycopg2 import psycopg2.extras from dotenv import load_dotenv from PIL import Image, ImageFile, UnidentifiedImageError # .env hledáme nejprve vedle skriptu, pak v rodičovském adresáři (root projektu) _here = Path(__file__).parent for _env in (_here / ".env", _here.parent / ".env"): if _env.is_file(): load_dotenv(_env) break # ── Konfigurace ────────────────────────────────────────────────────────────── # Maximální počet fotek ke zpracování (0 = všechny) MAX_PHOTOS = 1000 # Pokud True, na začátku skriptu se smažou všechny záznamy v photo_errors # a vyresetuje verified_at v photos. Pak proběhne plná verifikace. RESET = False # Pokud True, re-verifikují se i fotky, které už verified_at mají. # Pokud False (default), zpracují se jen fotky s verified_at IS NULL. 
REVERIFY_ALL = False

BATCH_SIZE = 500

# Thresholds for the `size_suspicious` warning
SIZE_MIN_BYTES = 10 * 1024            # < 10 KB is suspiciously small
SIZE_MAX_BYTES = 50 * 1024 * 1024     # > 50 MB is suspiciously large

# Canonical prefix used in the DB (native Tower1 path)
DB_SOURCE_BASE = "/mnt/user/ZalohaVsechObrazku"

# Physical location of the same tree, depending on where the script runs
if platform.system() == "Windows":
    LOCAL_SOURCE_BASE = Path(r"\\Tower1\ZalohaVsechObrazku")
else:
    hostname = socket.gethostname()
    if hostname == "Tower1":
        LOCAL_SOURCE_BASE = Path("/mnt/user/ZalohaVsechObrazku")
    else:
        LOCAL_SOURCE_BASE = Path("/mnt/remotes/TOWER1.LAN_ZalohaVsechObrazku")

# ── Logging ──────────────────────────────────────────────────────────────────

LOG_FILE = Path(__file__).parent / "verify_photos.log"

# Log both to stdout and to a file placed next to the script.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    handlers=[
        logging.StreamHandler(sys.stdout),
        logging.FileHandler(LOG_FILE, encoding="utf-8"),
    ],
)
log = logging.getLogger(__name__)


# ── Helpers ──────────────────────────────────────────────────────────────────

def get_conn():
    """Open a new psycopg2 connection.

    Each setting is read from the DB_* environment variable first, then from
    the matching PG* variable, and finally falls back to the hard-coded
    default below.
    """
    return psycopg2.connect(
        host=os.getenv("DB_HOST") or os.getenv("PGHOST", "192.168.1.76"),
        port=int(os.getenv("DB_PORT") or os.getenv("PGPORT", 5432)),
        dbname=os.getenv("DB_NAME") or os.getenv("PGDATABASE", "fotky_buzalkovi"),
        user=os.getenv("DB_USER") or os.getenv("PGUSER", "vladimir.buzalka"),
        password=os.getenv("DB_PASSWORD") or os.getenv("PGPASSWORD", ""),
    )


def db_path_to_local(db_path: str) -> Path:
    """Convert a DB path (Tower1 native) into the local path used for reading.

    Paths located under DB_SOURCE_BASE are re-rooted at LOCAL_SOURCE_BASE.
    Any other non-empty path is returned unchanged as a Path; an empty or
    None value yields an empty ``Path()``.
    """
    if not db_path:
        return Path()
    # Require a path-separator boundary after the prefix so that a sibling
    # directory sharing the same leading characters (e.g. "<base>_old/...")
    # is not remapped by mistake.
    if db_path == DB_SOURCE_BASE or db_path.startswith(DB_SOURCE_BASE + "/"):
        relative = db_path[len(DB_SOURCE_BASE):]
        return LOCAL_SOURCE_BASE / relative.lstrip("/")
    return Path(db_path)


# ── Verification ─────────────────────────────────────────────────────────────

def verify_file(local_path: Path, row) -> list[tuple[str, str, str]]:
    """Return a list of (severity, error_code, error_message) for one file.

    ``row`` is a mapping read with ``.get()``; the keys used are ``width``,
    ``height``, ``exif_raw``, ``gps_lat`` and ``gps_lon``. Critical findings
    end the check immediately, because the later steps need a readable image.
    An empty list means no problem was found.
    """
    errors: list[tuple[str, str, str]] = []

    # 1) Existence
    if not local_path.is_file():
        errors.append(("critical", "file_missing", f"Path not found: {local_path}"))
        return errors  # nothing more can be checked without a file

    # 2) Size on disk
    try:
        actual_size = local_path.stat().st_size
    except OSError as e:
        errors.append(("critical", "read_error", f"stat() failed: {e}"))
        return errors

    if actual_size == 0:
        errors.append(("critical", "zero_size", "File is empty (0 bytes)"))
        return errors
    if actual_size < SIZE_MIN_BYTES:
        errors.append(("warning", "size_suspicious",
                       f"Very small: {actual_size} B (< {SIZE_MIN_BYTES} B)"))
    elif actual_size > SIZE_MAX_BYTES:
        errors.append(("warning", "size_suspicious",
                       f"Very large: {actual_size} B (> {SIZE_MAX_BYTES} B)"))

    # 3) Pillow verify() — header-level check; also remember the dimensions
    #    reported by the file for the comparison in step 5.
    header_w = header_h = None
    try:
        with Image.open(local_path) as img:
            header_w, header_h = img.size
            img.verify()
    except UnidentifiedImageError as e:
        # Must be caught before OSError: it is listed first so that it is
        # reported as invalid_format rather than read_error.
        errors.append(("critical", "invalid_format", f"Unidentified image: {e}"))
        return errors
    except OSError as e:
        errors.append(("critical", "read_error", f"IO error during verify: {e}"))
        return errors
    except Exception as e:
        errors.append(("critical", "invalid_format",
                       f"verify() failed: {type(e).__name__}: {e}"))
        return errors

    # 4) Full decode — catches truncation and decode failures.
    #    First strictly (LOAD_TRUNCATED_IMAGES = False). If that fails on
    #    truncation, retry tolerantly: if the retry succeeds it is only a
    #    warning, otherwise a critical decode_failed.
    truncated = False
    ImageFile.LOAD_TRUNCATED_IMAGES = False
    try:
        # The image is reopened because the handle used for verify() above
        # has already been closed by its `with` block.
        with Image.open(local_path) as img:
            img.load()
    except OSError as e:
        msg = str(e).lower()
        if "truncated" in msg:
            truncated = True
        else:
            errors.append(("critical", "decode_failed", f"load() OSError: {e}"))
            return errors
    except Exception as e:
        errors.append(("critical", "decode_failed",
                       f"load() {type(e).__name__}: {e}"))
        return errors

    if truncated:
        # Retry with tolerance for truncated data
        ImageFile.LOAD_TRUNCATED_IMAGES = True
        try:
            with Image.open(local_path) as img:
                img.load()
            errors.append(("warning", "truncated",
                           "Image is truncated but loads with LOAD_TRUNCATED_IMAGES"))
        except Exception as e:
            errors.append(("critical", "decode_failed",
                           f"Truncated and unrecoverable: {type(e).__name__}: {e}"))
            return errors
        finally:
            # The flag is module-global in Pillow; always restore strict mode.
            ImageFile.LOAD_TRUNCATED_IMAGES = False

    # 5) dimension_mismatch — compare DB width/height with the file header
    db_w = row.get("width")
    db_h = row.get("height")
    if db_w and db_h and header_w and header_h:
        if (db_w, db_h) != (header_w, header_h):
            errors.append(("warning", "dimension_mismatch",
                           f"DB says {db_w}x{db_h}, file header says {header_w}x{header_h}"))

    # 6) Info checks, based only on what is stored in the DB row
    if not row.get("exif_raw"):
        errors.append(("info", "missing_exif", "No EXIF metadata"))
    else:
        exif = row["exif_raw"]
        if "EXIF DateTimeOriginal" not in exif:
            errors.append(("info", "missing_datetime_original",
                           "No EXIF DateTimeOriginal"))
    if row.get("gps_lat") is None or row.get("gps_lon") is None:
        errors.append(("info", "missing_gps", "No GPS coordinates"))

    return errors


def _progress(msg: str) -> None:
    """Write an in-place single-line progress message to stdout."""
    # \r = return to the start of the line, \033[K = erase the rest of the line
    sys.stdout.write(f"\r{msg}\033[K")
    sys.stdout.flush()


def _progress_clear() -> None:
    """End the progress line with a newline so the next log record starts clean."""
    sys.stdout.write("\n")
    sys.stdout.flush()


# ── Processing ───────────────────────────────────────────────────────────────
def reset_state(conn, dry_run: bool) -> None:
    """Handle RESET=True: empty photo_errors and clear photos.verified_at.

    In dry-run mode only the intended action is logged and nothing is
    written to the database.
    """
    log.warning("RESET=True — mažu photo_errors a vynuluju verified_at.")
    if dry_run:
        log.info("[DRY RUN] Would TRUNCATE photo_errors and UPDATE photos SET verified_at = NULL")
        return
    with conn.cursor() as cur:
        cur.execute("TRUNCATE photo_errors RESTART IDENTITY")
        cur.execute("UPDATE photos SET verified_at = NULL WHERE verified_at IS NOT NULL")
        affected = cur.rowcount
    conn.commit()
    log.info("Reset done. Cleared verified_at on %d rows.", affected)


def _verify_batch(conn, batch_size, dry_run, batch_num=0, total_so_far=0, after_id=None):
    """Verify up to ``batch_size`` photos with ``id > after_id``.

    Rows are selected in ``id`` order. Unless REVERIFY_ALL is set, only rows
    with ``verified_at IS NULL`` are selected. For every row the file is
    checked with ``verify_file``; outside dry-run mode the photo's rows in
    ``photo_errors`` are replaced, ``verified_at`` is set and the change is
    committed per photo.

    Returns a ``(processed, last_id)`` tuple. ``last_id`` is the id of the
    last row handled, or ``after_id`` unchanged when no row matched; pass it
    back as ``after_id`` to continue with the following rows.
    """
    # Only constant SQL fragments are interpolated; all values are bound.
    conditions = []
    params = []
    if not REVERIFY_ALL:
        conditions.append("verified_at IS NULL")
    if after_id is not None:
        # Keyset pagination: in dry-run mode (nothing is written) and with
        # REVERIFY_ALL (no verified_at filter) the plain query would return
        # the same leading rows for every batch.
        conditions.append("id > %s")
        params.append(after_id)
    where = "WHERE " + " AND ".join(conditions) if conditions else ""
    params.append(batch_size)

    with conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur:
        cur.execute(
            f"""
            SELECT id, file_path, width, height, gps_lat, gps_lon, exif_raw
            FROM photos
            {where}
            ORDER BY id
            LIMIT %s
            """,
            params,
        )
        rows = cur.fetchall()

    if not rows:
        return 0, after_id

    batch_total = len(rows)
    processed = 0
    crit_n = warn_n = info_n = 0
    t_start = time.time()

    for idx, row in enumerate(rows, 1):
        photo_id = row["id"]
        local_path = db_path_to_local(row["file_path"])

        try:
            errors = verify_file(local_path, row)
        except Exception:
            # End the progress line first so the traceback starts on a clean
            # line, then record the failure as a critical error for the photo.
            _progress_clear()
            log.exception("Unexpected error verifying id=%d (%s)", photo_id, local_path)
            errors = [("critical", "read_error", "Unexpected exception during verification")]

        if dry_run:
            for sev, code, msg in errors:
                _progress_clear()
                log.info("[DRY RUN] id=%d %s/%s: %s", photo_id, sev, code, msg)
        else:
            with conn.cursor() as cur:
                # Replace any findings from an earlier run for this photo.
                cur.execute("DELETE FROM photo_errors WHERE photo_id = %s", (photo_id,))
                if errors:
                    psycopg2.extras.execute_values(
                        cur,
                        "INSERT INTO photo_errors (photo_id, severity, error_code, error_message) VALUES %s",
                        [(photo_id, sev, code, msg) for sev, code, msg in errors],
                    )
                cur.execute("UPDATE photos SET verified_at = NOW() WHERE id = %s", (photo_id,))
            conn.commit()

        processed += 1
        for sev, _, _ in errors:
            if sev == "critical":
                crit_n += 1
            elif sev == "warning":
                warn_n += 1
            elif sev == "info":
                info_n += 1

        # Single-line progress
        elapsed = time.time() - t_start
        rate = processed / elapsed if elapsed > 0 else 0
        name = local_path.name[:40]
        _progress(
            f"[batch {batch_num}] {idx}/{batch_total} "
            f"total={total_so_far + processed} "
            f"id={photo_id} "
            f"crit={crit_n} warn={warn_n} info={info_n} "
            f"{rate:.1f}/s {name}"
        )

    _progress_clear()
    # Rows are ordered by id, so the last row carries the highest id seen.
    return processed, rows[-1]["id"]


def process_batch(conn, batch_size: int, dry_run: bool,
                  batch_num: int = 0, total_so_far: int = 0,
                  after_id=None) -> int:
    """Verify one batch of photos and return how many were processed.

    ``batch_num`` and ``total_so_far`` are only used in the progress line.
    ``after_id`` (optional) restricts the batch to rows with a greater id;
    with the default ``None`` the batch starts at the lowest matching id.
    Returns 0 when no row matched.
    """
    processed, _ = _verify_batch(conn, batch_size, dry_run,
                                 batch_num=batch_num,
                                 total_so_far=total_so_far,
                                 after_id=after_id)
    return processed


def main():
    """Parse the command line and verify photos batch by batch.

    Stops when MAX_PHOTOS photos have been processed (if MAX_PHOTOS > 0) or
    when a batch returns no rows. The connection is always closed at the end,
    including after Ctrl+C.
    """
    parser = argparse.ArgumentParser(description="Verify photos and log errors")
    parser.add_argument("--batch-size", type=int, default=BATCH_SIZE)
    parser.add_argument("--dry-run", action="store_true",
                        help="Don't write to DB, only log what would happen")
    args = parser.parse_args()

    limit = MAX_PHOTOS

    log.info("=" * 60)
    log.info("Starting photo verification")
    log.info("  batch_size=%d, dry_run=%s, limit=%s, reverify_all=%s",
             args.batch_size, args.dry_run, limit or "all", REVERIFY_ALL)
    log.info("  hostname=%s, platform=%s", socket.gethostname(), platform.system())
    log.info("  source base (local): %s", LOCAL_SOURCE_BASE)

    conn = get_conn()
    total_processed = 0
    batch_num = 0
    last_id = None  # id cursor carried from one batch to the next

    try:
        if RESET:
            reset_state(conn, args.dry_run)

        while True:
            remaining = args.batch_size
            if limit > 0:
                remaining = min(args.batch_size, limit - total_processed)
                if remaining <= 0:
                    log.info("Limit %d reached. Done.", limit)
                    break

            batch_num += 1
            t0 = time.time()
            count, last_id = _verify_batch(conn, remaining, args.dry_run,
                                           batch_num=batch_num,
                                           total_so_far=total_processed,
                                           after_id=last_id)
            elapsed = time.time() - t0

            if count == 0:
                log.info("No more photos to verify. Done.")
                break

            total_processed += count
            log.info(
                "Batch %d: verified %d photos in %.1fs (total: %d, %.1f/s)",
                batch_num, count, elapsed, total_processed,
                count / elapsed if elapsed > 0 else 0,
            )

    except KeyboardInterrupt:
        log.info("Interrupted by user. Total verified: %d", total_processed)
    finally:
        conn.close()

    log.info("Finished. Total verified: %d", total_processed)


if __name__ == "__main__":
    main()