notebookVb

2026-05-24 07:59:25 +02:00
parent 662c890257
commit 7e05384c1f
10 changed files with 87 additions and 1261 deletions
@@ -0,0 +1,398 @@
+"""
+Explorační skript: projde všechny fotky v demo_fotky/ a vytáhne maximum dat.
+Výstup do konzole + JSON soubor pro detailní analýzu.
+"""
+import hashlib
+import json
+import os
+import re
+import sys
+from datetime import datetime
+from pathlib import Path
+
+# Windows console: force UTF-8 so the non-ASCII report text can be printed.
+# Each stream is checked on its own; a stream that is missing (None), has no
+# encoding, or offers no ``reconfigure`` method is left untouched instead of
+# crashing the script at import time.
+for _stream in (sys.stdout, sys.stderr):
+    _encoding = (getattr(_stream, "encoding", None) or "").lower()
+    if _encoding != "utf-8" and hasattr(_stream, "reconfigure"):
+        _stream.reconfigure(encoding="utf-8")
+
+import exifread
+import imagehash
+from PIL import Image, ImageOps, IptcImagePlugin
+from PIL.ExifTags import TAGS, GPSTAGS
+
+# Input folder with the photos and the path of the JSON report; both are
+# siblings of this script file.
+PHOTOS_DIR = Path(__file__).with_name("demo_fotky")
+OUTPUT_JSON = Path(__file__).with_name("photo_exploration.json")
+
+
+def file_hash_sha256(path: Path, chunk_size: int = 65536) -> str:
+    """Return the SHA-256 hex digest of the file's raw bytes.
+
+    Intended for detecting exact copies. The file is read in pieces of
+    ``chunk_size`` bytes instead of being loaded whole.
+    """
+    digest = hashlib.sha256()
+    with open(path, "rb") as stream:
+        # iter() with a sentinel stops at the first empty read (end of file).
+        for piece in iter(lambda: stream.read(chunk_size), b""):
+            digest.update(piece)
+    return digest.hexdigest()
+
+
+def pixel_hash_sha256(path: Path) -> str | None:
+    """Return the SHA-256 hex digest of the decoded pixels, or None on failure.
+
+    The image is passed through ``ImageOps.exif_transpose`` (EXIF orientation)
+    and converted to RGB before hashing, so the digest is meant to identify
+    the picture independently of its metadata. Any exception while opening or
+    decoding the file results in None.
+
+    NOTE(review): only the pixel bytes are hashed, not the image dimensions.
+    """
+    try:
+        with Image.open(path) as source:
+            upright = ImageOps.exif_transpose(source)
+            rgb = upright if upright.mode == "RGB" else upright.convert("RGB")
+            pixel_bytes = rgb.tobytes()
+    except Exception:
+        return None
+    return hashlib.sha256(pixel_bytes).hexdigest()
+
+
+def perceptual_hashes(path: Path) -> dict:
+    """Compute perceptual hashes used to spot visually similar photos.
+
+    Returns a dict with the keys ``phash``, ``dhash``, ``ahash`` and ``whash``
+    (string form of the imagehash results, compared via Hamming distance).
+    Per the original author's note each hash is 64-bit. If anything fails,
+    the hashes computed so far are kept and the message is stored under
+    ``_error``.
+    """
+    # Output key -> name of the imagehash function that produces it.
+    hashers = (
+        ("phash", "phash"),
+        ("dhash", "dhash"),
+        ("ahash", "average_hash"),
+        ("whash", "whash"),
+    )
+    result = {}
+    try:
+        with Image.open(path) as source:
+            upright = ImageOps.exif_transpose(source)
+            for key, function_name in hashers:
+                result[key] = str(getattr(imagehash, function_name)(upright))
+    except Exception as exc:
+        result["_error"] = str(exc)
+    return result
+
+
+def iptc_info(path: Path) -> dict:
+    """Read IPTC metadata (keywords, title, description, author, ...).
+
+    Returns a dict keyed by readable field names; records without a known
+    name are keyed as ``IPTC<key>``. Byte values (also inside lists) are
+    decoded as UTF-8 with replacement characters. An empty dict means no IPTC
+    data was found; on failure the message is stored under ``_error``.
+    """
+    # Numeric IPTC (record, dataset) pairs mapped to readable names.
+    readable_names = {
+        (2, 5): "ObjectName",  # Title
+        (2, 10): "Urgency",
+        (2, 15): "Category",
+        (2, 20): "SupplementalCategories",
+        (2, 25): "Keywords",
+        (2, 40): "SpecialInstructions",
+        (2, 55): "DateCreated",
+        (2, 60): "TimeCreated",
+        (2, 80): "Byline",  # Creator/Author
+        (2, 85): "BylineTitle",
+        (2, 90): "City",
+        (2, 92): "SubLocation",
+        (2, 95): "ProvinceState",
+        (2, 100): "CountryCode",
+        (2, 101): "CountryName",
+        (2, 103): "OriginalTransmissionReference",
+        (2, 105): "Headline",
+        (2, 110): "Credit",
+        (2, 115): "Source",
+        (2, 116): "Copyright",
+        (2, 118): "Contact",
+        (2, 120): "Caption",  # Description
+        (2, 122): "WriterEditor",
+    }
+
+    def as_text(item):
+        # Only bytes are decoded; every other type passes through unchanged.
+        if isinstance(item, bytes):
+            return item.decode("utf-8", errors="replace")
+        return item
+
+    result = {}
+    try:
+        with Image.open(path) as img:
+            records = IptcImagePlugin.getiptcinfo(img)
+            if not records:
+                return {}
+            for record_key, payload in records.items():
+                field = readable_names.get(record_key, f"IPTC{record_key}")
+                if isinstance(payload, list):
+                    payload = [as_text(entry) for entry in payload]
+                else:
+                    payload = as_text(payload)
+                result[field] = payload
+    except Exception as exc:
+        result["_error"] = str(exc)
+    return result
+
+
+def xmp_info(path: Path) -> dict:
+    """Extract a few common fields from the image's embedded XMP packet.
+
+    The packet is taken from ``img.info["xmp"]`` and scanned with regular
+    expressions (this is not a real XML parser). Possible keys in the result:
+    ``creator_tool``, ``create_date``, ``modify_date``, ``rating``, ``label``,
+    ``title``, ``description``, ``creator``, ``subject_keywords`` (list of
+    strings), ``face_regions_count`` and ``_xmp_length_bytes``.
+
+    Returns an empty dict when the image carries no XMP. On failure the
+    message is stored under ``_error`` next to whatever was parsed so far.
+    """
+    out = {}
+    try:
+        with Image.open(path) as img:
+            xmp_raw = img.info.get("xmp")
+            if not xmp_raw:
+                return {}
+            if isinstance(xmp_raw, bytes):
+                # Measure the packet before decoding: the length of the decoded
+                # text is a character count, which differs from the byte size
+                # as soon as the packet contains non-ASCII data.
+                xmp_length_bytes = len(xmp_raw)
+                xmp_raw = xmp_raw.decode("utf-8", errors="replace")
+            else:
+                xmp_length_bytes = len(xmp_raw.encode("utf-8", errors="replace"))
+
+            # Very simple parser - pulls the most common fields out by regex.
+            patterns = {
+                "creator_tool":  r'xmp:CreatorTool="([^"]+)"',
+                "create_date":   r'xmp:CreateDate="([^"]+)"',
+                "modify_date":   r'xmp:ModifyDate="([^"]+)"',
+                "rating":        r'xmp:Rating="([^"]+)"',
+                "label":         r'xmp:Label="([^"]+)"',
+                "title":         r'<dc:title[^>]*>.*?<rdf:li[^>]*>([^<]+)</rdf:li>',
+                "description":   r'<dc:description[^>]*>.*?<rdf:li[^>]*>([^<]+)</rdf:li>',
+                "creator":       r'<dc:creator[^>]*>.*?<rdf:li[^>]*>([^<]+)</rdf:li>',
+                "subject_keywords": r'<dc:subject[^>]*>(.*?)</dc:subject>',
+            }
+            for name, pat in patterns.items():
+                m = re.search(pat, xmp_raw, re.DOTALL)
+                if m:
+                    out[name] = m.group(1).strip()
+
+            # Keywords from dc:subject - split into the individual rdf:li items.
+            if "subject_keywords" in out:
+                kws = re.findall(r'<rdf:li[^>]*>([^<]+)</rdf:li>', out["subject_keywords"])
+                out["subject_keywords"] = kws
+
+            # Face regions: count the mwg-rs:Type="Face" entries.
+            face_count = len(re.findall(r'mwg-rs:Type="Face"', xmp_raw))
+            if face_count:
+                out["face_regions_count"] = face_count
+
+            # Size of the raw XMP packet, for a rough idea of how much is there.
+            out["_xmp_length_bytes"] = xmp_length_bytes
+    except Exception as e:
+        out["_error"] = str(e)
+    return out
+
+
+def filesystem_info(path: Path) -> dict:
+    """Collect basic file-system facts about ``path``.
+
+    Returns the file name, the path as a string, the size in bytes and in MB
+    (rounded to two decimals), ``st_mtime`` and ``st_ctime`` as ISO strings
+    produced by ``datetime.fromtimestamp`` (naive timestamps), and the
+    lower-cased extension.
+    """
+    st = path.stat()
+    size = st.st_size
+    modified, changed = (
+        datetime.fromtimestamp(stamp).isoformat()
+        for stamp in (st.st_mtime, st.st_ctime)
+    )
+    return dict(
+        file_name=path.name,
+        file_path=str(path),
+        file_size_bytes=size,
+        file_size_mb=round(size / 1024 / 1024, 2),
+        mtime=modified,
+        ctime=changed,
+        extension=path.suffix.lower(),
+    )
+
+
+def pillow_info(path: Path) -> dict:
+    """Return basic image facts as reported by Pillow.
+
+    Covers format, mode, pixel dimensions, megapixels, a transparency flag,
+    DPI, whether an ICC profile / EXIF / embedded thumbnail is present, and
+    (when XMP exists) the first 500 characters of the packet. On failure the
+    message is stored under ``error`` (no leading underscore) next to the
+    values gathered so far.
+    """
+    details = {}
+    try:
+        with Image.open(path) as img:
+            meta = img.info
+            details["format"] = img.format
+            details["mode"] = img.mode
+            details["width"] = img.width
+            details["height"] = img.height
+            details["megapixels"] = round((img.width * img.height) / 1_000_000, 2)
+            details["has_transparency"] = img.mode in ("RGBA", "LA") or "transparency" in meta
+            details["dpi"] = meta.get("dpi")
+            details["icc_profile_present"] = "icc_profile" in meta
+            details["exif_present"] = bool(img.getexif())
+
+            # XMP packet: keep only a short snippet of it.
+            if "xmp" in meta:
+                packet = meta["xmp"]
+                if isinstance(packet, bytes):
+                    # Cut before decoding; undecodable bytes are dropped.
+                    packet = packet[:500].decode("utf-8", errors="ignore")
+                details["xmp_snippet"] = str(packet)[:500]
+
+            # Is a thumbnail listed in the image info?
+            details["has_embedded_thumbnail"] = "thumbnail" in meta
+    except Exception as exc:
+        details["error"] = str(exc)
+    return details
+
+
+def pillow_exif(path: Path) -> dict:
+    """Read EXIF through Pillow and return it keyed by readable tag names.
+
+    Tags without a known name are keyed as ``Tag<id>``. The ``GPSInfo`` entry
+    becomes a nested dict keyed by GPS tag names (``GPSTag<id>`` when
+    unknown). All values are passed through ``_serializable``. An empty dict
+    means no EXIF was found; on failure the message is stored under
+    ``_error`` next to the tags read so far.
+    """
+    out = {}
+    try:
+        with Image.open(path) as img:
+            exif = img.getexif()
+            if not exif:
+                return {}
+            for tag_id, value in exif.items():
+                tag = TAGS.get(tag_id, f"Tag{tag_id}")
+                if tag == "GPSInfo":
+                    # The GPSInfo entry may be just an integer pointer to the
+                    # GPS IFD rather than the decoded mapping. Calling .items()
+                    # on it would raise and abort the whole loop, so load the
+                    # IFD explicitly in that case.
+                    gps_ifd = value if hasattr(value, "items") else exif.get_ifd(tag_id)
+                    out[tag] = {
+                        GPSTAGS.get(gps_tag_id, f"GPSTag{gps_tag_id}"): _serializable(gps_value)
+                        for gps_tag_id, gps_value in gps_ifd.items()
+                    }
+                else:
+                    out[tag] = _serializable(value)
+    except Exception as e:
+        out["_error"] = str(e)
+    return out
+
+
+def exifread_tags(path: Path) -> dict:
+    """Read EXIF tags with ExifRead and return them as strings.
+
+    Per the original author's note ExifRead often yields more tags than
+    Pillow, including detailed MakerNote data. Keys containing ``Thumbnail``
+    are skipped (binary thumbnail) unless they also contain
+    ``JPEGInterchangeFormat``. On failure the message is stored under
+    ``_error`` next to the tags read so far.
+    """
+    collected = {}
+    try:
+        with open(path, "rb") as stream:
+            for key, tag_value in exifread.process_file(stream, details=True).items():
+                # Keep everything except thumbnail entries; the
+                # JPEGInterchangeFormat thumbnail tags are kept as well.
+                if "Thumbnail" not in key or "JPEGInterchangeFormat" in key:
+                    collected[key] = str(tag_value)
+    except Exception as exc:
+        collected["_error"] = str(exc)
+    return collected
+
+
+def _serializable(v):
+    """Convert a Pillow EXIF value (IFDRational, bytes, ...) to a JSON-friendly one.
+
+    - bytes: first 200 bytes decoded as UTF-8 with replacement characters
+    - tuple/list: list with every element converted recursively
+    - dict: keys turned into strings, values converted recursively
+    - int/bool: returned unchanged
+    - rational-like objects (having ``numerator`` and ``denominator``): float
+    - anything else: returned as is when ``json.dumps`` accepts it, otherwise
+      ``str(v)``; non-finite floats (NaN, infinity) are also turned into
+      strings because they are not valid JSON
+    """
+    if isinstance(v, bytes):
+        return v[:200].decode("utf-8", errors="replace")
+    if isinstance(v, (tuple, list)):
+        return [_serializable(x) for x in v]
+    if isinstance(v, dict):
+        return {str(k): _serializable(val) for k, val in v.items()}
+    if isinstance(v, int):
+        # Plain ints (and bools) also expose numerator/denominator; without
+        # this guard they would be silently turned into floats below.
+        return v
+    if hasattr(v, "numerator") and hasattr(v, "denominator"):
+        try:
+            v = float(v)
+        except Exception:
+            return str(v)
+    try:
+        # allow_nan=False makes NaN/inf raise ValueError -> stored as a string.
+        json.dumps(v, allow_nan=False)
+        return v
+    except (TypeError, ValueError):
+        return str(v)
+
+
+def explore_photo(path: Path) -> dict:
+    """Gather everything this script can extract from a single photo.
+
+    The result has the sections ``filesystem``, ``hashes`` (file hash, pixel
+    hash and the perceptual hashes merged in), ``pillow``, ``exif_pillow``,
+    ``exif_exifread``, ``iptc`` and ``xmp``.
+    """
+    report = {"filesystem": filesystem_info(path)}
+
+    hashes = {
+        "sha256_file": file_hash_sha256(path),
+        "sha256_pixels": pixel_hash_sha256(path),
+    }
+    hashes.update(perceptual_hashes(path))
+    report["hashes"] = hashes
+
+    report["pillow"] = pillow_info(path)
+    report["exif_pillow"] = pillow_exif(path)
+    report["exif_exifread"] = exifread_tags(path)
+    report["iptc"] = iptc_info(path)
+    report["xmp"] = xmp_info(path)
+    return report
+
+
+def hamming_distance(h1: str, h2: str) -> int:
+    """Return the number of differing bits between two hex-encoded hashes."""
+    differing_bits = int(h1, 16) ^ int(h2, 16)
+    return differing_bits.bit_count()
+
+
+def print_summary(photos: list[dict]) -> None:
+    """Print the console report for all explored photos.
+
+    ``photos`` is a list of dicts with the sections ``filesystem``,
+    ``pillow``, ``hashes``, ``exif_pillow``, ``exif_exifread``, ``iptc`` and
+    ``xmp``. The report consists of a header, the number of unique EXIF tags
+    across all photos, one detail section per photo and a matrix of pairwise
+    phash Hamming distances.
+    """
+    print(f"\n{'=' * 70}")
+    print(f"PŘEHLED: {len(photos)} fotek")
+    print(f"{'=' * 70}\n")
+
+    # Which EXIF tags occur across the whole set of photos?
+    all_pillow_keys = set()
+    all_exifread_keys = set()
+    for p in photos:
+        all_pillow_keys.update(p["exif_pillow"])
+        all_exifread_keys.update(p["exif_exifread"])
+
+    print(f"Unikátní EXIF tagy (Pillow):    {len(all_pillow_keys)}")
+    print(f"Unikátní EXIF tagy (ExifRead):  {len(all_exifread_keys)}")
+    print()
+
+    for i, p in enumerate(photos, 1):
+        _print_photo_details(i, p)
+
+    _print_similarity_matrix(photos)
+
+
+def _print_photo_details(i: int, p: dict) -> None:
+    """Print the detail section of one photo (``i`` is its 1-based number)."""
+    fs = p["filesystem"]
+    pi = p["pillow"]
+    h = p["hashes"]
+    er = p["exif_exifread"]
+    print(f"[{i}] {fs['file_name']}")
+    print(f"    Velikost:    {fs['file_size_mb']} MB ({pi.get('width')}x{pi.get('height')}, {pi.get('megapixels')} Mpx)")
+    print(f"    Formát:      {pi.get('format')} / mode={pi.get('mode')}")
+    print(f"    sha256_file:   {h['sha256_file'][:16]}...")
+    print(f"    sha256_pixels: {(h.get('sha256_pixels') or 'N/A')[:16]}...")
+    print(f"    phash:       {h.get('phash')}    (perceptual)")
+    print(f"    EXIF tagů:   ExifRead={len(er)}, Pillow={len(p['exif_pillow'])}")
+    print(f"    IPTC polí:   {sum(1 for k in p['iptc'] if not k.startswith('_'))}")
+    print(f"    XMP polí:    {sum(1 for k in p['xmp'] if not k.startswith('_'))}")
+
+    # The highlights are taken from ExifRead (original author's note: it is
+    # more reliable here, Pillow has a GPS bug).
+    interesting = {
+        "Kamera":     f"{er.get('Image Make', '')} {er.get('Image Model', '')}".strip(),
+        "Objektiv":   er.get("EXIF LensModel"),
+        "Datum":      er.get("EXIF DateTimeOriginal") or er.get("Image DateTime"),
+        "TZ offset":  er.get("EXIF OffsetTimeOriginal") or er.get("EXIF OffsetTime"),
+        "Clona":      er.get("EXIF FNumber"),
+        "ISO":        er.get("EXIF ISOSpeedRatings"),
+        "Expozice":   er.get("EXIF ExposureTime"),
+        "Ohnisko mm": er.get("EXIF FocalLength"),
+        "Flash":      er.get("EXIF Flash"),
+        "GPS lat":    er.get("GPS GPSLatitude"),
+        "GPS lon":    er.get("GPS GPSLongitude"),
+        "Software":   er.get("Image Software"),
+    }
+    for k, v in interesting.items():
+        if v and str(v).strip():
+            print(f"    {k:12s}: {v}")
+
+    # IPTC / XMP - print every field except the internal "_..." entries.
+    for k, v in p["iptc"].items():
+        if not k.startswith("_"):
+            print(f"    IPTC.{k:8s}: {v}")
+    for k, v in p["xmp"].items():
+        if not k.startswith("_"):
+            print(f"    XMP.{k:9s}: {v}")
+    print()
+
+
+def _print_similarity_matrix(photos: list[dict]) -> None:
+    """Print the matrix of pairwise phash Hamming distances."""
+    print(f"{'=' * 70}")
+    print("PERCEPTUÁLNÍ PODOBNOST (phash Hamming distance)")
+    print("Hodnota 0-10 = vizuálně velmi podobné, >20 = odlišné")
+    print(f"{'=' * 70}")
+    phashes = [p["hashes"].get("phash") for p in photos]
+    labels = [f"[{i}]" for i in range(1, len(photos) + 1)]
+
+    # Every cell is 6 characters wide (value right-aligned in the first five,
+    # the sixth holds the "*" marker), so header, diagonal, distance and N/A
+    # cells all line up in the same columns.
+    print(" " * 8 + "".join(f"{label:>5} " for label in labels))
+    for i, (label, h1) in enumerate(zip(labels, phashes)):
+        cells = []
+        for j, h2 in enumerate(phashes):
+            if i == j:
+                cells.append("    - ")
+            elif h1 and h2:
+                d = hamming_distance(h1, h2)
+                marker = "*" if d <= 10 else " "
+                cells.append(f"  {d:3d}{marker}")
+            else:
+                cells.append("  N/A ")
+        print(f"{label:>6}  " + "".join(cells))
+    print("\n   * = vizuálně podobné fotky (možná duplikát po editaci)")
+    print()
+
+
+def main():
+    """Explore every photo in ``PHOTOS_DIR``, print the summary, save the JSON.
+
+    Returns early with a message when the folder does not exist or contains
+    no file with a supported extension. A photo whose exploration raises is
+    reported as FAIL and left out of the results.
+    """
+    if not PHOTOS_DIR.exists():
+        print(f"[ERROR] Složka neexistuje: {PHOTOS_DIR}")
+        return
+
+    photo_suffixes = {".jpg", ".jpeg", ".png", ".heic", ".tiff", ".tif", ".webp"}
+    files = sorted(
+        entry
+        for entry in PHOTOS_DIR.iterdir()
+        if entry.is_file() and entry.suffix.lower() in photo_suffixes
+    )
+
+    if not files:
+        print(f"[WARN] Žádné fotky v {PHOTOS_DIR}")
+        return
+
+    print(f"Nalezeno {len(files)} fotek v {PHOTOS_DIR}\n")
+
+    photos = []
+    for photo_path in files:
+        print(f"  zpracovávám: {photo_path.name} ...", end=" ", flush=True)
+        try:
+            record = explore_photo(photo_path)
+            photos.append(record)
+            print("OK")
+        except Exception as exc:
+            print(f"FAIL: {exc}")
+
+    print_summary(photos)
+
+    # Save everything to JSON for detailed analysis.
+    with OUTPUT_JSON.open("w", encoding="utf-8") as sink:
+        json.dump(photos, sink, indent=2, ensure_ascii=False, default=str)
+    print(f"\n[OK] Detailní data uložena: {OUTPUT_JSON}")
+
+
+# Run the exploration only when this file is executed as a script.
+if __name__ == "__main__":
+    main()