diff --git a/30 SběrDat/verify_photos.md b/30 SběrDat/verify_photos.md new file mode 100644 index 0000000..0e9fb78 --- /dev/null +++ b/30 SběrDat/verify_photos.md @@ -0,0 +1,181 @@ +# verify_photos.py + +Verifikace fotek v databázi `fotky_buzalkovi` — pro každý záznam v tabulce +`photos` zkusí soubor otevřít a dekódovat, zapíše nalezené chyby do tabulky +`photo_errors` a označí `photos.verified_at = NOW()`. + +## Závislosti + +- Migrace `migrations/002_add_photo_errors.sql` (tabulka `photo_errors` + + sloupec `photos.verified_at` + partial index). +- `.env` s `DB_HOST`, `DB_PORT`, `DB_NAME`, `DB_USER`, `DB_PASSWORD` + (hledá se vedle skriptu i v rodičovském adresáři). +- Python balíčky: `psycopg2-binary`, `Pillow`, `python-dotenv`. + +## Cesty napříč prostředími + +Stejný princip jako `collect_pictures_*.py` a `generate_thumbnails.py`: + +| Prostředí | Lokální mount | +|---|---| +| Windows | `\\Tower1\ZalohaVsechObrazku\...` | +| Tower1 (Unraid) | `/mnt/user/ZalohaVsechObrazku/...` | +| tower / ostatní Linux | `/mnt/remotes/TOWER1.LAN_ZalohaVsechObrazku/...` | + +V DB jsou cesty vždy v nativním Tower1 formátu (`/mnt/user/...`) — skript +si je za běhu přemapuje. + +## Konfigurace (na začátku skriptu) + +```python +MAX_PHOTOS = 0 # 0 = všechny, jinak limit +RESET = False # True = TRUNCATE photo_errors + verified_at = NULL +REVERIFY_ALL = False # True = ignoruj verified_at, projdi všechny znovu +BATCH_SIZE = 500 +SIZE_MIN_BYTES = 10 * 1024 # < 10 KB → size_suspicious +SIZE_MAX_BYTES = 50 * 1024 * 1024 # > 50 MB → size_suspicious +``` + +## Spuštění + +```bash +python verify_photos.py # produkce +python verify_photos.py --dry-run # nic neukládá, jen loguje +python verify_photos.py --batch-size 200 +``` + +## Co skript dělá s každou fotkou + +1. **Existence souboru** — `local_path.is_file()` +2. **Velikost** — `stat().st_size` +3. **Pillow `verify()`** — kontrola JPEG/PNG hlavičky +4. **Pillow `load()` (strict)** — plný dekód pixelů. Pokud spadne na + truncation, zkusí znovu s `ImageFile.LOAD_TRUNCATED_IMAGES = True`. +5. **`dimension_mismatch`** — porovná `photos.width/height` s hlavičkou + dekódovaného obrázku. +6. **Info checks** — `missing_exif`, `missing_datetime_original`, `missing_gps`. +7. Pro každou fotku v jedné transakci: + - `DELETE FROM photo_errors WHERE photo_id = ?` + - `INSERT INTO photo_errors (...)` — všechny nalezené chyby + - `UPDATE photos SET verified_at = NOW() WHERE id = ?` + +## Kategorie chyb + +### 🔴 `critical` — soubor je nepoužitelný + +| `error_code` | Význam | +|---|---| +| `file_missing` | Cesta v DB existuje, soubor na disku ne | +| `read_error` | IOError / Permission / SMB timeout / `stat()` selhal | +| `invalid_format` | Pillow neumí soubor rozpoznat ani otevřít | +| `decode_failed` | Hlavička OK, ale `load()` praskne (vážně poškozená data) | +| `zero_size` | `file_size == 0` | + +### 🟡 `warning` — soubor lze použít, ale má vadu + +| `error_code` | Význam | +|---|---| +| `truncated` | `load()` strict praskne, ale s `LOAD_TRUNCATED_IMAGES=True` dojede | +| `exif_parse_error` | EXIF blok má vadu (zatím rezervováno) | +| `dimension_mismatch` | DB `width/height` ≠ skutečné rozměry souboru | +| `size_suspicious` | < 10 KB nebo > 50 MB | + +### 🔵 `info` — jen poznámka, nic není rozbité + +| `error_code` | Význam | +|---|---| +| `missing_exif` | Žádný `exif_raw` | +| `missing_datetime_original` | Není `EXIF DateTimeOriginal` | +| `missing_gps` | Žádné `gps_lat` / `gps_lon` | + +## Výběr fotek + +- Default: `WHERE verified_at IS NULL` (díky partial indexu rychlé) +- `REVERIFY_ALL=True` → projede všechny +- `RESET=True` → `TRUNCATE photo_errors` + `verified_at = NULL`, pak plná verifikace + +## Idempotence + +Před zápisem se vždy smažou všechny `photo_errors` pro danou `photo_id`, +takže každý běh nahradí předchozí výsledek. Historie chyb se neudržuje. + +## Užitečné dotazy + +### Přehled chyb podle typu + +```sql +SELECT severity, error_code, COUNT(*) AS cnt +FROM photo_errors +GROUP BY 1, 2 +ORDER BY 1, 3 DESC; +``` + +### Počet unikátních postižených fotek + +```sql +SELECT severity, COUNT(DISTINCT photo_id) AS photos +FROM photo_errors +GROUP BY severity; +``` + +### Truncated fotky + +```sql +SELECT p.id, p.file_path, p.file_size +FROM photo_errors e +JOIN photos p ON p.id = e.photo_id +WHERE e.error_code = 'truncated' +ORDER BY p.file_size DESC; +``` + +### Kritické chyby + +```sql +SELECT p.id, p.file_path, e.error_code, e.error_message +FROM photo_errors e +JOIN photos p ON p.id = e.photo_id +WHERE e.severity = 'critical' +ORDER BY e.error_code, p.id; +``` + +### Pokrok verifikace + +```sql +SELECT + COUNT(*) FILTER (WHERE verified_at IS NOT NULL) AS verified, + COUNT(*) FILTER (WHERE verified_at IS NULL) AS remaining, + COUNT(*) AS total +FROM photos; +``` + +### Fotky bez nalezených chyb (čisté) + +```sql +SELECT COUNT(*) FROM photos p +WHERE p.verified_at IS NOT NULL + AND NOT EXISTS (SELECT 1 FROM photo_errors e WHERE e.photo_id = p.id); +``` + +## Doporučený workflow při prvním spuštění + +1. Spustit migraci `002_add_photo_errors.sql` v Navicatu. +2. `MAX_PHOTOS=20`, `--dry-run` — vidíš v logu, co by se zapsalo. +3. `MAX_PHOTOS=100`, ostrý běh — ověříš zápis do DB. +4. `MAX_PHOTOS=0`, ostrý běh — plná verifikace. + +## Schéma `photo_errors` + +```sql +CREATE TABLE photo_errors ( + id BIGSERIAL PRIMARY KEY, + photo_id BIGINT NOT NULL REFERENCES photos(id) ON DELETE CASCADE, + severity VARCHAR(20) NOT NULL CHECK (severity IN ('critical', 'warning', 'info')), + error_code VARCHAR(50) NOT NULL, + error_message TEXT, + detected_at TIMESTAMPTZ NOT NULL DEFAULT NOW() +); + +CREATE INDEX idx_photo_errors_photo_id ON photo_errors (photo_id); +CREATE INDEX idx_photo_errors_severity ON photo_errors (severity); +CREATE INDEX idx_photo_errors_code ON photo_errors (error_code); +``` diff --git a/30 SběrDat/verify_photos.py b/30 SběrDat/verify_photos.py new file mode 100644 index 0000000..dbc7185 --- /dev/null +++ b/30 SběrDat/verify_photos.py @@ -0,0 +1,347 @@ +#!/usr/bin/env python3 +r""" +Verifikace fotek — pro každou položku v `photos` zkusí soubor otevřít a dekódovat, +zapíše nalezené chyby do tabulky `photo_errors` a označí `photos.verified_at = NOW()`. + +Cesty v DB jsou v nativním Tower1 formátu (/mnt/user/...) a skript je přemapuje +podle prostředí (Windows UNC / Tower1 native / tower remote mount) — stejně +jako collect_pictures_*.py a generate_thumbnails.py. + +Kategorie chyb (sloupec photo_errors.severity): + critical — soubor je nepoužitelný (file_missing, zero_size, invalid_format, + decode_failed, read_error) + warning — soubor lze použít, ale má vadu (truncated, exif_parse_error, + dimension_mismatch, size_suspicious) + info — jen poznámka (missing_exif, missing_datetime_original, missing_gps) + +Usage: + python verify_photos.py [--batch-size 500] [--dry-run] + +Konfigurace přes proměnné na začátku souboru (MAX_PHOTOS, RESET, ...). +""" + +import argparse +import logging +import os +import platform +import socket +import sys +import time +from pathlib import Path + +import psycopg2 +import psycopg2.extras +from dotenv import load_dotenv +from PIL import Image, ImageFile, UnidentifiedImageError + +# .env hledáme nejprve vedle skriptu, pak v rodičovském adresáři (root projektu) +_here = Path(__file__).parent +for _env in (_here / ".env", _here.parent / ".env"): + if _env.is_file(): + load_dotenv(_env) + break + +# ── Konfigurace ────────────────────────────────────────────────────────────── + +# Maximální počet fotek ke zpracování (0 = všechny) +MAX_PHOTOS = 10 + +# Pokud True, na začátku skriptu se smažou všechny záznamy v photo_errors +# a vyresetuje verified_at v photos. Pak proběhne plná verifikace. +RESET = False + +# Pokud True, re-verifikují se i fotky, které už verified_at mají. +# Pokud False (default), zpracují se jen fotky s verified_at IS NULL. +REVERIFY_ALL = False + +BATCH_SIZE = 500 + +# Limity pro `size_suspicious` warning +SIZE_MIN_BYTES = 10 * 1024 # < 10 KB je podezřele malé +SIZE_MAX_BYTES = 50 * 1024 * 1024 # > 50 MB je podezřele velké + +# Kanonický prefix pro DB (nativní Tower1 cesta) +DB_SOURCE_BASE = "/mnt/user/ZalohaVsechObrazku" + +# Fyzické cesty podle prostředí +if platform.system() == "Windows": + LOCAL_SOURCE_BASE = Path(r"\\Tower1\ZalohaVsechObrazku") +else: + hostname = socket.gethostname() + if hostname == "Tower1": + LOCAL_SOURCE_BASE = Path("/mnt/user/ZalohaVsechObrazku") + else: + LOCAL_SOURCE_BASE = Path("/mnt/remotes/TOWER1.LAN_ZalohaVsechObrazku") + +# ── Logging ────────────────────────────────────────────────────────────────── + +LOG_FILE = Path(__file__).parent / "verify_photos.log" + +logging.basicConfig( + level=logging.INFO, + format="%(asctime)s [%(levelname)s] %(message)s", + handlers=[ + logging.StreamHandler(sys.stdout), + logging.FileHandler(LOG_FILE, encoding="utf-8"), + ], +) +log = logging.getLogger(__name__) + +# ── Pomocné funkce ─────────────────────────────────────────────────────────── + + +def get_conn(): + return psycopg2.connect( + host=os.getenv("DB_HOST") or os.getenv("PGHOST", "192.168.1.76"), + port=int(os.getenv("DB_PORT") or os.getenv("PGPORT", 5432)), + dbname=os.getenv("DB_NAME") or os.getenv("PGDATABASE", "fotky_buzalkovi"), + user=os.getenv("DB_USER") or os.getenv("PGUSER", "vladimir.buzalka"), + password=os.getenv("DB_PASSWORD") or os.getenv("PGPASSWORD", ""), + ) + + +def db_path_to_local(db_path: str) -> Path: + """Převede cestu z DB (Tower1 nativní) na lokální cestu pro čtení.""" + if db_path and db_path.startswith(DB_SOURCE_BASE): + relative = db_path[len(DB_SOURCE_BASE):] + return LOCAL_SOURCE_BASE / relative.lstrip("/") + return Path(db_path) if db_path else Path() + + +# ── Verifikace ─────────────────────────────────────────────────────────────── + + +def verify_file(local_path: Path, row) -> list[tuple[str, str, str]]: + """Vrátí seznam (severity, error_code, error_message) pro jeden soubor.""" + errors: list[tuple[str, str, str]] = [] + + # 1) Existence + if not local_path.is_file(): + errors.append(("critical", "file_missing", f"Path not found: {local_path}")) + return errors # bez souboru nemá smysl pokračovat + + # 2) Velikost na disku + try: + actual_size = local_path.stat().st_size + except OSError as e: + errors.append(("critical", "read_error", f"stat() failed: {e}")) + return errors + + if actual_size == 0: + errors.append(("critical", "zero_size", "File is empty (0 bytes)")) + return errors + + if actual_size < SIZE_MIN_BYTES: + errors.append(("warning", "size_suspicious", + f"Very small: {actual_size} B (< {SIZE_MIN_BYTES} B)")) + elif actual_size > SIZE_MAX_BYTES: + errors.append(("warning", "size_suspicious", + f"Very large: {actual_size} B (> {SIZE_MAX_BYTES} B)")) + + # 3) Pillow verify() — kontrola JPEG/PNG hlavičky + header_w = header_h = None + try: + with Image.open(local_path) as img: + header_w, header_h = img.size + img.verify() + except UnidentifiedImageError as e: + errors.append(("critical", "invalid_format", f"Unidentified image: {e}")) + return errors + except OSError as e: + errors.append(("critical", "read_error", f"IO error during verify: {e}")) + return errors + except Exception as e: + errors.append(("critical", "invalid_format", f"verify() failed: {type(e).__name__}: {e}")) + return errors + + # 4) Plný dekód — chytne truncation a decode_failed. + # Nejprve striktně (LOAD_TRUNCATED_IMAGES = False). Pokud spadne na + # truncation, zkusíme znovu s tolerancí — pokud dojede, je to warning, + # jinak critical decode_failed. + truncated = False + ImageFile.LOAD_TRUNCATED_IMAGES = False + try: + with Image.open(local_path) as img: + img.load() + except OSError as e: + msg = str(e).lower() + if "truncated" in msg: + truncated = True + else: + errors.append(("critical", "decode_failed", + f"load() OSError: {e}")) + return errors + except Exception as e: + errors.append(("critical", "decode_failed", + f"load() {type(e).__name__}: {e}")) + return errors + + if truncated: + # Zkus s tolerancí + ImageFile.LOAD_TRUNCATED_IMAGES = True + try: + with Image.open(local_path) as img: + img.load() + errors.append(("warning", "truncated", + "Image is truncated but loads with LOAD_TRUNCATED_IMAGES")) + except Exception as e: + errors.append(("critical", "decode_failed", + f"Truncated and unrecoverable: {type(e).__name__}: {e}")) + return errors + finally: + ImageFile.LOAD_TRUNCATED_IMAGES = False + + # 5) dimension_mismatch — DB má width/height, porovnej s hlavičkou + db_w = row.get("width") + db_h = row.get("height") + if db_w and db_h and header_w and header_h: + if (db_w, db_h) != (header_w, header_h): + errors.append(("warning", "dimension_mismatch", + f"DB says {db_w}x{db_h}, file header says {header_w}x{header_h}")) + + # 6) Info checks (vždy zapnuté podle volby) + if not row.get("exif_raw"): + errors.append(("info", "missing_exif", "No EXIF metadata")) + else: + exif = row["exif_raw"] + if "EXIF DateTimeOriginal" not in exif: + errors.append(("info", "missing_datetime_original", + "No EXIF DateTimeOriginal")) + + if row.get("gps_lat") is None or row.get("gps_lon") is None: + errors.append(("info", "missing_gps", "No GPS coordinates")) + + return errors + + +# ── Zpracování ─────────────────────────────────────────────────────────────── + + +def reset_state(conn, dry_run: bool) -> None: + """RESET=True: smaže photo_errors a vynuluje verified_at.""" + log.warning("RESET=True — mažu photo_errors a vynuluju verified_at.") + if dry_run: + log.info("[DRY RUN] Would TRUNCATE photo_errors and UPDATE photos SET verified_at = NULL") + return + with conn.cursor() as cur: + cur.execute("TRUNCATE photo_errors RESTART IDENTITY") + cur.execute("UPDATE photos SET verified_at = NULL WHERE verified_at IS NOT NULL") + affected = cur.rowcount + conn.commit() + log.info("Reset done. Cleared verified_at on %d rows.", affected) + + +def process_batch(conn, batch_size: int, dry_run: bool) -> int: + where = "" if REVERIFY_ALL else "WHERE verified_at IS NULL" + with conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur: + cur.execute( + f""" + SELECT id, file_path, width, height, gps_lat, gps_lon, exif_raw + FROM photos + {where} + ORDER BY id + LIMIT %s + """, + (batch_size,), + ) + rows = cur.fetchall() + + if not rows: + return 0 + + processed = 0 + for row in rows: + photo_id = row["id"] + local_path = db_path_to_local(row["file_path"]) + + try: + errors = verify_file(local_path, row) + except Exception: + log.exception("Unexpected error verifying id=%d (%s)", photo_id, local_path) + errors = [("critical", "read_error", "Unexpected exception during verification")] + + if dry_run: + for sev, code, msg in errors: + log.info("[DRY RUN] id=%d %s/%s: %s", photo_id, sev, code, msg) + else: + with conn.cursor() as cur: + # Smažeme staré záznamy pro tuto fotku — vždy chceme čerstvý snapshot. + cur.execute("DELETE FROM photo_errors WHERE photo_id = %s", (photo_id,)) + if errors: + psycopg2.extras.execute_values( + cur, + "INSERT INTO photo_errors (photo_id, severity, error_code, error_message) VALUES %s", + [(photo_id, sev, code, msg) for sev, code, msg in errors], + ) + cur.execute("UPDATE photos SET verified_at = NOW() WHERE id = %s", (photo_id,)) + conn.commit() + + processed += 1 + + if errors: + crit = sum(1 for s, _, _ in errors if s == "critical") + warn = sum(1 for s, _, _ in errors if s == "warning") + info = sum(1 for s, _, _ in errors if s == "info") + log.info("id=%d: %d errors (crit=%d, warn=%d, info=%d) — %s", + photo_id, len(errors), crit, warn, info, local_path.name) + + return processed + + +def main(): + parser = argparse.ArgumentParser(description="Verify photos and log errors") + parser.add_argument("--batch-size", type=int, default=BATCH_SIZE) + parser.add_argument("--dry-run", action="store_true", + help="Don't write to DB, only log what would happen") + args = parser.parse_args() + + limit = MAX_PHOTOS + + log.info("=" * 60) + log.info("Starting photo verification") + log.info(" batch_size=%d, dry_run=%s, limit=%s, reverify_all=%s", + args.batch_size, args.dry_run, limit or "all", REVERIFY_ALL) + log.info(" hostname=%s, platform=%s", socket.gethostname(), platform.system()) + log.info(" source base (local): %s", LOCAL_SOURCE_BASE) + + conn = get_conn() + total_processed = 0 + batch_num = 0 + + try: + if RESET: + reset_state(conn, args.dry_run) + + while True: + remaining = args.batch_size + if limit > 0: + remaining = min(args.batch_size, limit - total_processed) + if remaining <= 0: + log.info("Limit %d reached. Done.", limit) + break + + batch_num += 1 + t0 = time.time() + count = process_batch(conn, remaining, args.dry_run) + elapsed = time.time() - t0 + + if count == 0: + log.info("No more photos to verify. Done.") + break + + total_processed += count + log.info( + "Batch %d: verified %d photos in %.1fs (total: %d, %.1f/s)", + batch_num, count, elapsed, total_processed, + count / elapsed if elapsed > 0 else 0, + ) + except KeyboardInterrupt: + log.info("Interrupted by user. Total verified: %d", total_processed) + finally: + conn.close() + + log.info("Finished. Total verified: %d", total_processed) + + +if __name__ == "__main__": + main() diff --git a/migrations/002_add_photo_errors.sql b/migrations/002_add_photo_errors.sql new file mode 100644 index 0000000..cf7d8ac --- /dev/null +++ b/migrations/002_add_photo_errors.sql @@ -0,0 +1,25 @@ +-- Migration: Add photo_errors table + verified_at column to photos +-- Run v Navicatu nebo přes Python (CREATE INDEX CONCURRENTLY musí být mimo transakci). + +-- ── photo_errors ──────────────────────────────────────────────────────────── + +CREATE TABLE IF NOT EXISTS photo_errors ( + id BIGSERIAL PRIMARY KEY, + photo_id BIGINT NOT NULL REFERENCES photos(id) ON DELETE CASCADE, + severity VARCHAR(20) NOT NULL CHECK (severity IN ('critical', 'warning', 'info')), + error_code VARCHAR(50) NOT NULL, + error_message TEXT, + detected_at TIMESTAMPTZ NOT NULL DEFAULT NOW() +); + +CREATE INDEX IF NOT EXISTS idx_photo_errors_photo_id ON photo_errors (photo_id); +CREATE INDEX IF NOT EXISTS idx_photo_errors_severity ON photo_errors (severity); +CREATE INDEX IF NOT EXISTS idx_photo_errors_code ON photo_errors (error_code); + +-- ── photos.verified_at ────────────────────────────────────────────────────── + +ALTER TABLE photos ADD COLUMN IF NOT EXISTS verified_at TIMESTAMPTZ; + +-- Partiální index — verifikační skript bude často filtrovat WHERE verified_at IS NULL. +CREATE INDEX CONCURRENTLY IF NOT EXISTS idx_photos_verified_at_null + ON photos (id) WHERE verified_at IS NULL;