Files
fotkyBuzalkovi/30 SběrDat/verify_photos.py
T
administrator 97c1796753 notebookVb
2026-05-29 06:21:55 +02:00

348 lines
13 KiB
Python

#!/usr/bin/env python3
r"""
Verifikace fotek — pro každou položku v `photos` zkusí soubor otevřít a dekódovat,
zapíše nalezené chyby do tabulky `photo_errors` a označí `photos.verified_at = NOW()`.
Cesty v DB jsou v nativním Tower1 formátu (/mnt/user/...) a skript je přemapuje
podle prostředí (Windows UNC / Tower1 native / tower remote mount) — stejně
jako collect_pictures_*.py a generate_thumbnails.py.
Kategorie chyb (sloupec photo_errors.severity):
critical — soubor je nepoužitelný (file_missing, zero_size, invalid_format,
decode_failed, read_error)
warning — soubor lze použít, ale má vadu (truncated, exif_parse_error,
dimension_mismatch, size_suspicious)
info — jen poznámka (missing_exif, missing_datetime_original, missing_gps)
Usage:
python verify_photos.py [--batch-size 500] [--dry-run]
Konfigurace přes proměnné na začátku souboru (MAX_PHOTOS, RESET, ...).
"""
import argparse
import logging
import os
import platform
import socket
import sys
import time
from pathlib import Path
import psycopg2
import psycopg2.extras
from dotenv import load_dotenv
from PIL import Image, ImageFile, UnidentifiedImageError
# Look for .env next to this script first, then in the parent directory
# (the project root). Only the first file found is loaded.
_here = Path(__file__).parent
_env = _here / ".env"
if not _env.is_file():
    _env = _here.parent / ".env"
if _env.is_file():
    load_dotenv(_env)
# ── Configuration ────────────────────────────────────────────────────────────
# Maximum number of photos to process in one run (0 = no limit).
MAX_PHOTOS = 10

# When True, the script first deletes every row in photo_errors and clears
# photos.verified_at, then runs a full verification.
RESET = False

# When True, photos that already have verified_at set are verified again.
# When False (default), only photos with verified_at IS NULL are processed.
REVERIFY_ALL = False

BATCH_SIZE = 500

# Thresholds for the `size_suspicious` warning.
SIZE_MIN_BYTES = 10 * 1024  # anything under 10 KB is suspiciously small
SIZE_MAX_BYTES = 50 * 1024 * 1024  # anything over 50 MB is suspiciously large

# Canonical path prefix as stored in the DB (native Tower1 path).
DB_SOURCE_BASE = "/mnt/user/ZalohaVsechObrazku"

# Physical location of the same tree, chosen by the environment we run in:
# Windows uses the UNC share; on other systems the host named "Tower1" uses
# the native path and any other host uses the remote mount.
if platform.system() == "Windows":
    LOCAL_SOURCE_BASE = Path(r"\\Tower1\ZalohaVsechObrazku")
else:
    hostname = socket.gethostname()
    LOCAL_SOURCE_BASE = Path(
        "/mnt/user/ZalohaVsechObrazku"
        if hostname == "Tower1"
        else "/mnt/remotes/TOWER1.LAN_ZalohaVsechObrazku"
    )
# ── Logging ──────────────────────────────────────────────────────────────────
# Every record goes both to stdout and to a UTF-8 log file that sits in the
# same directory as this script.
LOG_FILE = Path(__file__).with_name("verify_photos.log")

logging.basicConfig(
    handlers=[
        logging.StreamHandler(sys.stdout),
        logging.FileHandler(LOG_FILE, encoding="utf-8"),
    ],
    format="%(asctime)s [%(levelname)s] %(message)s",
    level=logging.INFO,
)

log = logging.getLogger(__name__)
# ── Pomocné funkce ───────────────────────────────────────────────────────────
def get_conn():
    """Open and return a new psycopg2 connection configured from the environment.

    Each setting is taken from its ``DB_*`` variable when that is set and
    non-empty, otherwise from the matching ``PG*`` variable, otherwise from
    the built-in default listed below.
    """

    def _setting(primary, fallback, default):
        # An empty primary value counts as "unset" and falls through.
        return os.getenv(primary) or os.getenv(fallback, default)

    return psycopg2.connect(
        host=_setting("DB_HOST", "PGHOST", "192.168.1.76"),
        port=int(_setting("DB_PORT", "PGPORT", 5432)),
        dbname=_setting("DB_NAME", "PGDATABASE", "fotky_buzalkovi"),
        user=_setting("DB_USER", "PGUSER", "vladimir.buzalka"),
        password=_setting("DB_PASSWORD", "PGPASSWORD", ""),
    )
def db_path_to_local(db_path: str, db_base=None, local_base=None) -> Path:
    """Translate a path stored in the DB into a path readable on this machine.

    Args:
        db_path: Path as stored in the database. May be empty or None.
        db_base: Prefix that DB paths are rooted at. Defaults to the
            module-level ``DB_SOURCE_BASE``.
        local_base: Local directory corresponding to ``db_base``. Defaults to
            the module-level ``LOCAL_SOURCE_BASE``.

    Returns:
        ``local_base`` joined with the part of ``db_path`` below ``db_base``
        when ``db_path`` lies inside ``db_base``; ``Path(db_path)`` unchanged
        when it lies elsewhere; an empty ``Path()`` when ``db_path`` is falsy.
    """
    if not db_path:
        return Path()

    base = (DB_SOURCE_BASE if db_base is None else db_base).rstrip("/")

    # Match on a path-component boundary. A bare startswith() would also
    # remap sibling directories that merely share the prefix
    # (e.g. "<base>Old/...").
    if db_path == base or db_path.startswith(base + "/"):
        root = Path(LOCAL_SOURCE_BASE if local_base is None else local_base)
        return root / db_path[len(base):].lstrip("/")

    return Path(db_path)
# ── Verifikace ───────────────────────────────────────────────────────────────
def verify_file(local_path: Path, row) -> list[tuple[str, str, str]]:
    """Check one photo file and return every problem found.

    The checks run in order and stop at the first critical finding:
    existence, on-disk size, Pillow header verification, full decode, and
    finally comparisons against the DB row (dimensions, EXIF, GPS).

    Args:
        local_path: Location of the file on this machine.
        row: Mapping for the photo's DB row; read via ``.get()`` for the keys
            ``width``, ``height``, ``exif_raw``, ``gps_lat`` and ``gps_lon``.

    Returns:
        A list of ``(severity, error_code, error_message)`` tuples, empty when
        nothing was found. Severity is ``"critical"``, ``"warning"`` or
        ``"info"``.
    """
    errors: list[tuple[str, str, str]] = []

    # 1) Existence — nothing else can be checked without a file.
    if not local_path.is_file():
        errors.append(("critical", "file_missing", f"Path not found: {local_path}"))
        return errors

    # 2) Size on disk.
    try:
        actual_size = local_path.stat().st_size
    except OSError as e:
        errors.append(("critical", "read_error", f"stat() failed: {e}"))
        return errors
    if actual_size == 0:
        errors.append(("critical", "zero_size", "File is empty (0 bytes)"))
        return errors
    errors.extend(_size_findings(actual_size))

    # 3) Pillow verify() — header-level check. The size is read before
    #    verify() is called and is later compared with the DB dimensions.
    header_w = header_h = None
    try:
        with Image.open(local_path) as img:
            header_w, header_h = img.size
            img.verify()
    except UnidentifiedImageError as e:
        errors.append(("critical", "invalid_format", f"Unidentified image: {e}"))
        return errors
    except OSError as e:
        errors.append(("critical", "read_error", f"IO error during verify: {e}"))
        return errors
    except Exception as e:
        errors.append(("critical", "invalid_format", f"verify() failed: {type(e).__name__}: {e}"))
        return errors

    # 4) Full decode — detects truncated and undecodable image data.
    decode_findings = _full_decode_findings(local_path)
    errors.extend(decode_findings)
    if any(severity == "critical" for severity, _, _ in decode_findings):
        return errors

    # 5) + 6) Comparisons against what the DB row says about the photo.
    errors.extend(_metadata_findings(row, header_w, header_h))
    return errors


def _size_findings(actual_size: int) -> list[tuple[str, str, str]]:
    """Return a ``size_suspicious`` warning when the size is outside the limits."""
    if actual_size < SIZE_MIN_BYTES:
        return [("warning", "size_suspicious",
                 f"Very small: {actual_size} B (< {SIZE_MIN_BYTES} B)")]
    if actual_size > SIZE_MAX_BYTES:
        return [("warning", "size_suspicious",
                 f"Very large: {actual_size} B (> {SIZE_MAX_BYTES} B)")]
    return []


def _full_decode_findings(local_path: Path) -> list[tuple[str, str, str]]:
    """Decode the whole image, strictly first and then tolerating truncation.

    Returns an empty list when the strict decode succeeds, a ``truncated``
    warning when only the tolerant decode succeeds, and a critical
    ``decode_failed`` entry otherwise.
    """
    # LOAD_TRUNCATED_IMAGES is a process-wide Pillow switch; remember the
    # caller's value and put it back no matter how this function exits.
    previous_flag = ImageFile.LOAD_TRUNCATED_IMAGES
    try:
        ImageFile.LOAD_TRUNCATED_IMAGES = False
        try:
            with Image.open(local_path) as img:
                img.load()
            return []
        except OSError as e:
            # Truncation is recognised by the exception message text.
            if "truncated" not in str(e).lower():
                return [("critical", "decode_failed", f"load() OSError: {e}")]
        except Exception as e:
            return [("critical", "decode_failed",
                     f"load() {type(e).__name__}: {e}")]

        # The strict decode failed only because of truncation: retry tolerantly.
        ImageFile.LOAD_TRUNCATED_IMAGES = True
        try:
            with Image.open(local_path) as img:
                img.load()
        except Exception as e:
            return [("critical", "decode_failed",
                     f"Truncated and unrecoverable: {type(e).__name__}: {e}")]
        return [("warning", "truncated",
                 "Image is truncated but loads with LOAD_TRUNCATED_IMAGES")]
    finally:
        ImageFile.LOAD_TRUNCATED_IMAGES = previous_flag


def _metadata_findings(row, header_w, header_h) -> list[tuple[str, str, str]]:
    """Compare the file header size and the DB row's metadata fields."""
    findings: list[tuple[str, str, str]] = []

    # dimension_mismatch — only checked when both sides have usable values.
    db_w = row.get("width")
    db_h = row.get("height")
    if db_w and db_h and header_w and header_h:
        if (db_w, db_h) != (header_w, header_h):
            findings.append(("warning", "dimension_mismatch",
                             f"DB says {db_w}x{db_h}, file header says {header_w}x{header_h}"))

    # Informational notes about metadata stored in the DB row.
    exif = row.get("exif_raw")
    if not exif:
        findings.append(("info", "missing_exif", "No EXIF metadata"))
    elif "EXIF DateTimeOriginal" not in exif:
        findings.append(("info", "missing_datetime_original",
                         "No EXIF DateTimeOriginal"))
    if row.get("gps_lat") is None or row.get("gps_lon") is None:
        findings.append(("info", "missing_gps", "No GPS coordinates"))
    return findings
# ── Zpracování ───────────────────────────────────────────────────────────────
def reset_state(conn, dry_run: bool) -> None:
    """Wipe previous verification results so that a full run starts clean.

    Truncates ``photo_errors`` (restarting its identity sequence) and sets
    ``photos.verified_at`` back to NULL, then commits. With ``dry_run`` the
    intended action is only logged and the database is left untouched.
    """
    log.warning("RESET=True — mažu photo_errors a vynuluju verified_at.")

    if not dry_run:
        statements = (
            "TRUNCATE photo_errors RESTART IDENTITY",
            "UPDATE photos SET verified_at = NULL WHERE verified_at IS NOT NULL",
        )
        with conn.cursor() as cur:
            for sql in statements:
                cur.execute(sql)
            # rowcount belongs to the last statement executed, i.e. the UPDATE.
            cleared = cur.rowcount
        conn.commit()
        log.info("Reset done. Cleared verified_at on %d rows.", cleared)
        return

    log.info("[DRY RUN] Would TRUNCATE photo_errors and UPDATE photos SET verified_at = NULL")
def process_batch(conn, batch_size: int, dry_run: bool, after_id=None) -> int:
    """Verify one batch of photos and return how many were processed.

    Thin wrapper around ``_process_batch_after`` that keeps the original
    return type. ``after_id`` is optional; when given, only photos with a
    larger ``id`` are considered.
    """
    processed, _ = _process_batch_after(conn, batch_size, dry_run, after_id)
    return processed


def _process_batch_after(conn, batch_size: int, dry_run: bool, after_id=None):
    """Verify up to ``batch_size`` photos whose id is greater than ``after_id``.

    Args:
        conn: Open psycopg2 connection.
        batch_size: Maximum number of rows to fetch (SQL ``LIMIT``).
        dry_run: When True, findings are only logged; nothing is written.
        after_id: Id of the last photo handled by the previous batch, or None
            to start from the lowest id.

    Returns:
        ``(processed, last_id)`` — the number of photos handled and the id of
        the last one, or ``(0, after_id)`` when no rows matched.
    """
    # Page by id as well as by verified_at. In dry-run mode and with
    # REVERIFY_ALL nothing ever removes a row from the result set, so without
    # the id cursor every batch would return the same first rows again.
    conditions = []
    params = []
    if after_id is not None:
        conditions.append("id > %s")
        params.append(after_id)
    if not REVERIFY_ALL:
        conditions.append("verified_at IS NULL")
    where = ("WHERE " + " AND ".join(conditions)) if conditions else ""
    params.append(batch_size)

    with conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur:
        # `where` is assembled only from the constant fragments above; all
        # values are passed as bound parameters.
        cur.execute(
            f"""
            SELECT id, file_path, width, height, gps_lat, gps_lon, exif_raw
            FROM photos
            {where}
            ORDER BY id
            LIMIT %s
            """,
            params,
        )
        rows = cur.fetchall()

    if not rows:
        return 0, after_id

    processed = 0
    last_id = after_id
    for row in rows:
        photo_id = row["id"]
        local_path = db_path_to_local(row["file_path"])

        try:
            errors = verify_file(local_path, row)
        except Exception:
            # One unreadable photo must not abort the run; record it instead.
            log.exception("Unexpected error verifying id=%d (%s)", photo_id, local_path)
            errors = [("critical", "read_error", "Unexpected exception during verification")]

        if dry_run:
            for sev, code, msg in errors:
                log.info("[DRY RUN] id=%d %s/%s: %s", photo_id, sev, code, msg)
        else:
            with conn.cursor() as cur:
                # Replace any earlier findings so the table always holds a
                # fresh snapshot for this photo.
                cur.execute("DELETE FROM photo_errors WHERE photo_id = %s", (photo_id,))
                if errors:
                    psycopg2.extras.execute_values(
                        cur,
                        "INSERT INTO photo_errors (photo_id, severity, error_code, error_message) VALUES %s",
                        [(photo_id, sev, code, msg) for sev, code, msg in errors],
                    )
                cur.execute("UPDATE photos SET verified_at = NOW() WHERE id = %s", (photo_id,))
            # Commit per photo so an interrupted run keeps finished work.
            conn.commit()

        processed += 1
        last_id = photo_id

        if errors:
            crit = sum(1 for s, _, _ in errors if s == "critical")
            warn = sum(1 for s, _, _ in errors if s == "warning")
            info = sum(1 for s, _, _ in errors if s == "info")
            log.info("id=%d: %d errors (crit=%d, warn=%d, info=%d) — %s",
                     photo_id, len(errors), crit, warn, info, local_path.name)

    return processed, last_id


def main():
    """Command-line entry point: verify photos batch by batch until done.

    Stops when ``MAX_PHOTOS`` photos have been handled (0 means no limit),
    when a batch comes back empty, or on Ctrl+C. The DB connection is closed
    in every case.
    """
    parser = argparse.ArgumentParser(description="Verify photos and log errors")
    parser.add_argument("--batch-size", type=int, default=BATCH_SIZE)
    parser.add_argument("--dry-run", action="store_true",
                        help="Don't write to DB, only log what would happen")
    args = parser.parse_args()
    if args.batch_size < 1:
        # A zero or negative LIMIT would end the run at once or fail in SQL.
        parser.error("--batch-size must be a positive integer")

    limit = MAX_PHOTOS
    log.info("=" * 60)
    log.info("Starting photo verification")
    log.info(" batch_size=%d, dry_run=%s, limit=%s, reverify_all=%s",
             args.batch_size, args.dry_run, limit or "all", REVERIFY_ALL)
    log.info(" hostname=%s, platform=%s", socket.gethostname(), platform.system())
    log.info(" source base (local): %s", LOCAL_SOURCE_BASE)

    conn = get_conn()
    total_processed = 0
    batch_num = 0
    last_id = None  # id cursor carried from one batch to the next
    try:
        if RESET:
            reset_state(conn, args.dry_run)
        while True:
            remaining = args.batch_size
            if limit > 0:
                remaining = min(args.batch_size, limit - total_processed)
                if remaining <= 0:
                    log.info("Limit %d reached. Done.", limit)
                    break

            batch_num += 1
            t0 = time.time()
            count, last_id = _process_batch_after(conn, remaining, args.dry_run, last_id)
            elapsed = time.time() - t0

            if count == 0:
                log.info("No more photos to verify. Done.")
                break

            total_processed += count
            log.info(
                "Batch %d: verified %d photos in %.1fs (total: %d, %.1f/s)",
                batch_num, count, elapsed, total_processed,
                count / elapsed if elapsed > 0 else 0,
            )
    except KeyboardInterrupt:
        log.info("Interrupted by user. Total verified: %d", total_processed)
    finally:
        conn.close()
    log.info("Finished. Total verified: %d", total_processed)


if __name__ == "__main__":
    main()