# fotkyBuzalkovi/explore_photos.py

"""
Explorační skript: projde všechny fotky v demo_fotky/ a vytáhne maximum dat.
Výstup do konzole + JSON soubor pro detailní analýzu.
"""
import hashlib
import json
import os
import re
import sys
from datetime import datetime
from pathlib import Path

# Windows console: force UTF-8 output so the report text prints correctly.
# Each stream is handled on its own and defensively: a stream can be None,
# can report no encoding, or can be a replacement object that has no
# ``reconfigure`` method; none of those cases should crash the script.
for _stream in (sys.stdout, sys.stderr):
    _encoding = (getattr(_stream, "encoding", None) or "").lower().replace("_", "-")
    if _encoding not in ("utf-8", "utf8") and hasattr(_stream, "reconfigure"):
        _stream.reconfigure(encoding="utf-8")
del _stream, _encoding

import exifread
import imagehash
from PIL import Image, ImageOps, IptcImagePlugin
from PIL.ExifTags import TAGS, GPSTAGS

# Input folder with the photos to explore and the path of the JSON report;
# both are siblings of this script file.
PHOTOS_DIR = Path(__file__).with_name("demo_fotky")
OUTPUT_JSON = Path(__file__).with_name("photo_exploration.json")


def file_hash_sha256(path: Path, chunk_size: int = 65536) -> str:
    """Return the SHA-256 hex digest of the file's raw bytes.

    The file is read in pieces of ``chunk_size`` bytes, so it is never held
    in memory as a whole. Equal digests indicate exact byte-for-byte copies.
    """
    digest = hashlib.sha256()
    with open(path, "rb") as stream:
        # iter() with a sentinel keeps reading until read() returns b"" (EOF).
        for block in iter(lambda: stream.read(chunk_size), b""):
            digest.update(block)
    return digest.hexdigest()


def pixel_hash_sha256(path: Path) -> str | None:
    """Hash dekódovaných pixelů - identita fotky nezávisle na metadatech.
    Aplikuje EXIF orientation pro konzistenci."""
    try:
        with Image.open(path) as img:
            img = ImageOps.exif_transpose(img)
            if img.mode != "RGB":
                img = img.convert("RGB")
            return hashlib.sha256(img.tobytes()).hexdigest()
    except Exception as e:
        return None


def perceptual_hashes(path: Path) -> dict:
    """Compute perceptual hashes used to spot visually similar photos.

    Returns a dict with the keys ``phash``, ``dhash``, ``ahash`` and
    ``whash`` (each the string form of the corresponding ``imagehash``
    result, computed on the EXIF-transposed image). Hashes are meant to be
    compared by Hamming distance. If anything fails, the hashes computed so
    far are kept and the error text is stored under ``_error``.
    """
    result = {}
    # (output key, name of the imagehash function); looked up lazily so a
    # failure leaves the earlier hashes in place.
    algorithms = (
        ("phash", "phash"),
        ("dhash", "dhash"),
        ("ahash", "average_hash"),
        ("whash", "whash"),
    )
    try:
        with Image.open(path) as source:
            upright = ImageOps.exif_transpose(source)
            for key, function_name in algorithms:
                result[key] = str(getattr(imagehash, function_name)(upright))
    except Exception as exc:
        result["_error"] = str(exc)
    return result


def iptc_info(path: Path) -> dict:
    """Read IPTC metadata (keywords, title, description, author, ...).

    Returns a dict keyed by a readable field name; tags without a known name
    are keyed as ``IPTC<tag>``. Byte values (and byte items inside lists) are
    decoded as UTF-8 with replacement of undecodable bytes. An image without
    IPTC data yields ``{}``; on failure the error text is stored under
    ``_error``.
    """
    # Maps the numeric IPTC tag keys to readable names.
    readable_names = {
        (2, 5):   "ObjectName",       # Title
        (2, 10):  "Urgency",
        (2, 15):  "Category",
        (2, 20):  "SupplementalCategories",
        (2, 25):  "Keywords",
        (2, 40):  "SpecialInstructions",
        (2, 55):  "DateCreated",
        (2, 60):  "TimeCreated",
        (2, 80):  "Byline",            # Creator/Author
        (2, 85):  "BylineTitle",
        (2, 90):  "City",
        (2, 92):  "SubLocation",
        (2, 95):  "ProvinceState",
        (2, 100): "CountryCode",
        (2, 101): "CountryName",
        (2, 103): "OriginalTransmissionReference",
        (2, 105): "Headline",
        (2, 110): "Credit",
        (2, 115): "Source",
        (2, 116): "Copyright",
        (2, 118): "Contact",
        (2, 120): "Caption",           # Description
        (2, 122): "WriterEditor",
    }

    def to_text(item):
        # Decode bytes; leave every other value untouched.
        if isinstance(item, bytes):
            return item.decode("utf-8", errors="replace")
        return item

    fields = {}
    try:
        with Image.open(path) as img:
            records = IptcImagePlugin.getiptcinfo(img)
            if not records:
                return {}
            for tag, payload in records.items():
                field_name = readable_names.get(tag, f"IPTC{tag}")
                if isinstance(payload, list):
                    payload = [to_text(entry) for entry in payload]
                else:
                    payload = to_text(payload)
                fields[field_name] = payload
    except Exception as exc:
        fields["_error"] = str(exc)
    return fields


def xmp_info(path: Path) -> dict:
    """XMP metadata - moderní alternativa IPTC, často keywords/rating/regions."""
    out = {}
    try:
        with Image.open(path) as img:
            xmp_raw = img.info.get("xmp")
            if not xmp_raw:
                return {}
            if isinstance(xmp_raw, bytes):
                xmp_raw = xmp_raw.decode("utf-8", errors="replace")

            # Velmi jednoduchý parser - vytáhne nejčastější pole regexem
            patterns = {
                "creator_tool":  r'xmp:CreatorTool="([^"]+)"',
                "create_date":   r'xmp:CreateDate="([^"]+)"',
                "modify_date":   r'xmp:ModifyDate="([^"]+)"',
                "rating":        r'xmp:Rating="([^"]+)"',
                "label":         r'xmp:Label="([^"]+)"',
                "title":         r'<dc:title[^>]*>.*?<rdf:li[^>]*>([^<]+)</rdf:li>',
                "description":   r'<dc:description[^>]*>.*?<rdf:li[^>]*>([^<]+)</rdf:li>',
                "creator":       r'<dc:creator[^>]*>.*?<rdf:li[^>]*>([^<]+)</rdf:li>',
                "subject_keywords": r'<dc:subject[^>]*>(.*?)</dc:subject>',
            }
            for name, pat in patterns.items():
                m = re.search(pat, xmp_raw, re.DOTALL)
                if m:
                    out[name] = m.group(1).strip()

            # Keywords z dc:subject - vytáhnout jednotlivé rdf:li
            if "subject_keywords" in out:
                kws = re.findall(r'<rdf:li[^>]*>([^<]+)</rdf:li>', out["subject_keywords"])
                out["subject_keywords"] = kws

            # Apple regions (rozpoznané obličeje s pozicí)
            face_count = len(re.findall(r'mwg-rs:Type="Face"', xmp_raw))
            if face_count:
                out["face_regions_count"] = face_count

            # Délka raw XMP pro představu
            out["_xmp_length_bytes"] = len(xmp_raw)
    except Exception as e:
        out["_error"] = str(e)
    return out


def filesystem_info(path: Path) -> dict:
    """Collect basic file-system facts about ``path``.

    Returns name, full path, size (bytes and MB rounded to two decimals),
    ``st_mtime``/``st_ctime`` as naive local ISO-8601 strings, and the
    lower-cased extension. Raises ``OSError`` if the file cannot be stat-ed.
    """
    st = path.stat()
    size = st.st_size

    def iso(timestamp):
        return datetime.fromtimestamp(timestamp).isoformat()

    return dict(
        file_name=path.name,
        file_path=str(path),
        file_size_bytes=size,
        file_size_mb=round(size / 1024 / 1024, 2),
        mtime=iso(st.st_mtime),
        ctime=iso(st.st_ctime),
        extension=path.suffix.lower(),
    )


def pillow_info(path: Path) -> dict:
    """Gather what Pillow reports about the image itself.

    Covers format, mode, dimensions, megapixels, a transparency hint, DPI,
    presence of an ICC profile / EXIF block, an XMP snippet (at most 500
    characters) and whether ``img.info`` has a ``thumbnail`` entry. If
    reading fails, the fields collected so far are kept and the error text
    is stored under ``error``.
    """
    result = {}
    try:
        with Image.open(path) as img:
            meta = img.info
            result["format"] = img.format
            result["mode"] = img.mode
            width, height = img.width, img.height
            result["width"] = width
            result["height"] = height
            result["megapixels"] = round((width * height) / 1_000_000, 2)
            # Either an alpha-carrying mode or an explicit transparency entry.
            result["has_transparency"] = img.mode in ("RGBA", "LA") or "transparency" in meta
            result["dpi"] = meta.get("dpi")
            result["icc_profile_present"] = "icc_profile" in meta
            result["exif_present"] = bool(img.getexif())

            # XMP packet: keep only a short preview here.
            if "xmp" in meta:
                snippet = meta["xmp"]
                if isinstance(snippet, bytes):
                    snippet = snippet[:500].decode("utf-8", errors="ignore")
                result["xmp_snippet"] = str(snippet)[:500]

            # Is a "thumbnail" entry present in the image info?
            result["has_embedded_thumbnail"] = "thumbnail" in meta
    except Exception as exc:
        result["error"] = str(exc)
    return result


def pillow_exif(path: Path) -> dict:
    """Read EXIF tags through Pillow, keyed by readable tag names.

    Unknown tag ids are keyed as ``Tag<id>``. GPS data is returned as a
    nested dict under ``GPSInfo`` with readable GPS tag names (unknown ones
    as ``GPSTag<id>``). All values are passed through ``_serializable``.
    An image without EXIF yields ``{}``; on failure the tags collected so far
    are kept and the error text is stored under ``_error``.
    """
    out = {}
    try:
        with Image.open(path) as img:
            exif = img.getexif()
            if not exif:
                return {}
            for tag_id, value in exif.items():
                tag = TAGS.get(tag_id, f"Tag{tag_id}")
                if tag == "GPSInfo":
                    # Depending on the Pillow version the GPSInfo entry is
                    # either the GPS dict itself or only the integer offset
                    # of the GPS IFD; in the latter case calling .items() on
                    # it would raise and abort the whole loop, so resolve the
                    # IFD explicitly.
                    gps_ifd = value if hasattr(value, "items") else exif.get_ifd(tag_id)
                    out[tag] = {
                        GPSTAGS.get(gps_tag_id, f"GPSTag{gps_tag_id}"): _serializable(gps_value)
                        for gps_tag_id, gps_value in gps_ifd.items()
                    }
                else:
                    out[tag] = _serializable(value)
    except Exception as e:
        out["_error"] = str(e)
    return out


def exifread_tags(path: Path) -> dict:
    """Read EXIF tags through ExifRead (``details=True``) as strings.

    Entries whose key contains ``Thumbnail`` are skipped, except those that
    also contain ``JPEGInterchangeFormat``; this keeps the binary thumbnail
    out of the result. On failure the tags collected so far are kept and the
    error text is stored under ``_error``.
    """
    collected = {}
    try:
        with open(path, "rb") as handle:
            for name, tag in exifread.process_file(handle, details=True).items():
                thumbnail_entry = "Thumbnail" in name and "JPEGInterchangeFormat" not in name
                if not thumbnail_entry:
                    collected[name] = str(tag)
    except Exception as exc:
        collected["_error"] = str(exc)
    return collected


def _serializable(v):
    """Convert a Pillow EXIF value (IFDRational, bytes, ...) to a JSON-friendly one.

    - ``None``, ``bool``, ``int`` and ``str`` are returned unchanged.
    - ``bytes`` are cut to 200 bytes and decoded as UTF-8 with replacement.
    - tuples/lists become lists, dicts get string keys; both recurse.
    - rational-like objects (having ``numerator`` and ``denominator``)
      become floats.
    - non-finite floats (NaN/inf, e.g. from a 0/0 rational) and anything
      ``json.dumps`` rejects are returned as their ``str()`` form.
    """
    # Plain scalars first: int and bool also expose numerator/denominator and
    # would otherwise be turned into floats (100 -> 100.0, True -> 1.0).
    if v is None or isinstance(v, (bool, int, str)):
        return v
    if isinstance(v, bytes):
        return v[:200].decode("utf-8", errors="replace")
    if isinstance(v, (tuple, list)):
        return [_serializable(x) for x in v]
    if isinstance(v, dict):
        return {str(k): _serializable(val) for k, val in v.items()}

    infinities = (float("inf"), float("-inf"))
    if isinstance(v, float):
        # NaN != NaN; NaN/Infinity are not valid JSON, so emit them as text.
        return str(v) if v != v or v in infinities else v
    if hasattr(v, "numerator") and hasattr(v, "denominator"):
        try:
            as_float = float(v)
        except Exception:
            return str(v)
        if as_float != as_float or as_float in infinities:
            return str(v)
        return as_float
    try:
        json.dumps(v)
        return v
    except (TypeError, ValueError):
        return str(v)


def explore_photo(path: Path) -> dict:
    """Run every extractor on one photo and bundle the results.

    The returned dict has the sections ``filesystem``, ``hashes`` (file hash,
    pixel hash and the perceptual hashes), ``pillow``, ``exif_pillow``,
    ``exif_exifread``, ``iptc`` and ``xmp``.
    """
    report = {"filesystem": filesystem_info(path)}

    hashes = {
        "sha256_file": file_hash_sha256(path),
        "sha256_pixels": pixel_hash_sha256(path),
    }
    hashes.update(perceptual_hashes(path))
    report["hashes"] = hashes

    report["pillow"] = pillow_info(path)
    report["exif_pillow"] = pillow_exif(path)
    report["exif_exifread"] = exifread_tags(path)
    report["iptc"] = iptc_info(path)
    report["xmp"] = xmp_info(path)
    return report


def hamming_distance(h1: str, h2: str) -> int:
    """Count the bits that differ between two hex-encoded hashes."""
    left = int(h1, 16)
    right = int(h2, 16)
    differing_bits = left ^ right
    return format(differing_bits, "b").count("1")


def _print_photo_details(index: int, photo: dict) -> None:
    """Print the per-photo section: size, hashes, tag counts and key fields."""
    fs = photo["filesystem"]
    pi = photo["pillow"]
    h = photo["hashes"]
    er = photo["exif_exifread"]
    iptc = photo["iptc"]
    xmp = photo["xmp"]

    print(f"[{index}] {fs['file_name']}")
    print(f"    Velikost:    {fs['file_size_mb']} MB ({pi.get('width')}x{pi.get('height')}, {pi.get('megapixels')} Mpx)")
    print(f"    Formát:      {pi.get('format')} / mode={pi.get('mode')}")
    print(f"    sha256_file:   {h['sha256_file'][:16]}...")
    print(f"    sha256_pixels: {(h.get('sha256_pixels') or 'N/A')[:16]}...")
    print(f"    phash:       {h.get('phash')}    (perceptual)")
    print(f"    EXIF tagů:   ExifRead={len(er)}, Pillow={len(photo['exif_pillow'])}")
    # Keys starting with "_" are bookkeeping entries, not metadata fields.
    print(f"    IPTC polí:   {sum(1 for k in iptc if not k.startswith('_'))}")
    print(f"    XMP polí:    {sum(1 for k in xmp if not k.startswith('_'))}")

    # The highlights are taken from the ExifRead output.
    interesting = {
        "Kamera":     f"{er.get('Image Make', '')} {er.get('Image Model', '')}".strip(),
        "Objektiv":   er.get("EXIF LensModel"),
        "Datum":      er.get("EXIF DateTimeOriginal") or er.get("Image DateTime"),
        "TZ offset":  er.get("EXIF OffsetTimeOriginal") or er.get("EXIF OffsetTime"),
        "Clona":      er.get("EXIF FNumber"),
        "ISO":        er.get("EXIF ISOSpeedRatings"),
        "Expozice":   er.get("EXIF ExposureTime"),
        "Ohnisko mm": er.get("EXIF FocalLength"),
        "Flash":      er.get("EXIF Flash"),
        "GPS lat":    er.get("GPS GPSLatitude"),
        "GPS lon":    er.get("GPS GPSLongitude"),
        "Software":   er.get("Image Software"),
    }
    for k, v in interesting.items():
        if v and str(v).strip():
            print(f"    {k:12s}: {v}")

    # IPTC / XMP: print every real field that is present.
    for k, v in iptc.items():
        if not k.startswith("_"):
            print(f"    IPTC.{k:8s}: {v}")
    for k, v in xmp.items():
        if not k.startswith("_"):
            print(f"    XMP.{k:9s}: {v}")
    print()


def _similarity_grid_lines(photos: list[dict]) -> list[str]:
    """Build the aligned header and rows of the pairwise phash distance table."""
    count = len(photos)
    # Every label and cell is padded to a common width so the columns stay
    # aligned no matter how many photos there are.
    label_width = len(f"[{count}]")
    cell_width = max(6, label_width + 2)
    indent, gap = "   ", "  "
    phashes = [photo["hashes"].get("phash") for photo in photos]

    def cell(text: str) -> str:
        return text.rjust(cell_width)

    header_cells = "".join(cell(f"[{j}] ") for j in range(1, count + 1))
    lines = [" " * (len(indent) + label_width + len(gap)) + header_cells]
    for i, own in enumerate(phashes):
        cells = []
        for j, other in enumerate(phashes):
            if i == j:
                cells.append(cell("- "))
            elif own and other:
                d = hamming_distance(own, other)
                marker = "*" if d <= 10 else " "
                cells.append(cell(f"{d}{marker}"))
            else:
                cells.append(cell("N/A "))
        lines.append(indent + f"[{i + 1}]".rjust(label_width) + gap + "".join(cells))
    return lines


def print_summary(photos: list[dict]) -> None:
    """Print a console report for the explored photos.

    The report consists of an overview (photo count, number of distinct EXIF
    tags seen by Pillow and by ExifRead), one detail section per photo, and a
    table of pairwise phash Hamming distances in which pairs with a distance
    of at most 10 are marked with ``*``.

    ``photos`` is a list of dicts as produced by ``explore_photo``.
    """
    rule = "=" * 70
    print(f"\n{rule}")
    print(f"PŘEHLED: {len(photos)} fotek")
    print(f"{rule}\n")

    # Which EXIF tags occur anywhere in the batch?
    all_pillow_keys = set()
    all_exifread_keys = set()
    for photo in photos:
        all_pillow_keys.update(photo["exif_pillow"])
        all_exifread_keys.update(photo["exif_exifread"])

    print(f"Unikátní EXIF tagy (Pillow):    {len(all_pillow_keys)}")
    print(f"Unikátní EXIF tagy (ExifRead):  {len(all_exifread_keys)}")
    print()

    for index, photo in enumerate(photos, 1):
        _print_photo_details(index, photo)

    # Table of perceptual similarity (Hamming distance between phashes).
    print(rule)
    print("PERCEPTUÁLNÍ PODOBNOST (phash Hamming distance)")
    print("Hodnota 0-10 = vizuálně velmi podobné, >20 = odlišné")
    print(rule)
    for line in _similarity_grid_lines(photos):
        print(line)
    print("\n   * = vizuálně podobné fotky (možná duplikát po editaci)")
    print()


def main():
    """Explore every photo in ``PHOTOS_DIR`` and write the JSON report.

    Prints an error/warning and returns early when the folder is missing or
    holds no file with a supported image extension. Photos whose exploration
    raises are reported as ``FAIL`` and left out of the results. The summary
    is printed to the console and the full data is written to
    ``OUTPUT_JSON``.
    """
    if not PHOTOS_DIR.exists():
        print(f"[ERROR] Složka neexistuje: {PHOTOS_DIR}")
        return

    photo_suffixes = {".jpg", ".jpeg", ".png", ".heic", ".tiff", ".tif", ".webp"}
    candidates = sorted(
        entry
        for entry in PHOTOS_DIR.iterdir()
        if entry.is_file() and entry.suffix.lower() in photo_suffixes
    )

    if not candidates:
        print(f"[WARN] Žádné fotky v {PHOTOS_DIR}")
        return

    print(f"Nalezeno {len(candidates)} fotek v {PHOTOS_DIR}\n")

    photos = []
    for photo_path in candidates:
        print(f"  zpracovávám: {photo_path.name} ...", end=" ", flush=True)
        try:
            record = explore_photo(photo_path)
            photos.append(record)
            print("OK")
        except Exception as exc:
            print(f"FAIL: {exc}")

    print_summary(photos)

    # Persist everything as JSON for detailed analysis; values json cannot
    # encode natively are written via str().
    with OUTPUT_JSON.open("w", encoding="utf-8") as sink:
        json.dump(photos, sink, indent=2, ensure_ascii=False, default=str)
    print(f"\n[OK] Detailní data uložena: {OUTPUT_JSON}")


# Run the exploration only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()