"""
Exploration script: walks every photo in demo_fotky/ and extracts as much
data as possible.

Output goes to the console plus a JSON file for detailed analysis.
"""

import hashlib
import json
import math
import os
import re
import sys
from datetime import datetime
from pathlib import Path

import exifread
import imagehash
from PIL import Image, ImageOps, IptcImagePlugin
from PIL.ExifTags import TAGS, GPSTAGS


def _force_utf8_console() -> None:
    """Switch stdout/stderr to UTF-8 (needed on Windows consoles).

    Each stream is handled on its own, and streams that have no encoding or
    no ``reconfigure`` method (e.g. replaced by a test harness) are skipped
    instead of raising at import time.
    """
    for stream in (sys.stdout, sys.stderr):
        encoding = (getattr(stream, "encoding", None) or "").lower()
        reconfigure = getattr(stream, "reconfigure", None)
        if encoding.replace("-", "") != "utf8" and callable(reconfigure):
            reconfigure(encoding="utf-8")


_force_utf8_console()

PHOTOS_DIR = Path(__file__).parent / "demo_fotky"
OUTPUT_JSON = Path(__file__).parent / "photo_exploration.json"

# File suffixes (lower-case) that main() treats as photos.
PHOTO_EXTENSIONS = frozenset(
    {".jpg", ".jpeg", ".png", ".heic", ".tiff", ".tif", ".webp"}
)

# Mapping of numeric IPTC (record, dataset) tags to readable names.
_IPTC_NAMES = {
    (2, 5): "ObjectName",  # Title
    (2, 10): "Urgency",
    (2, 15): "Category",
    (2, 20): "SupplementalCategories",
    (2, 25): "Keywords",
    (2, 40): "SpecialInstructions",
    (2, 55): "DateCreated",
    (2, 60): "TimeCreated",
    (2, 80): "Byline",  # Creator/Author
    (2, 85): "BylineTitle",
    (2, 90): "City",
    (2, 92): "SubLocation",
    (2, 95): "ProvinceState",
    (2, 100): "CountryCode",
    (2, 101): "CountryName",
    (2, 103): "OriginalTransmissionReference",
    (2, 105): "Headline",
    (2, 110): "Credit",
    (2, 115): "Source",
    (2, 116): "Copyright",
    (2, 118): "Contact",
    (2, 120): "Caption",  # Description
    (2, 122): "WriterEditor",
}

# Simple XMP fields stored as XML attributes on rdf:Description.
_XMP_ATTRIBUTE_PATTERNS = {
    "creator_tool": re.compile(r'xmp:CreatorTool="([^"]+)"'),
    "create_date": re.compile(r'xmp:CreateDate="([^"]+)"'),
    "modify_date": re.compile(r'xmp:ModifyDate="([^"]+)"'),
    "rating": re.compile(r'xmp:Rating="([^"]+)"'),
    "label": re.compile(r'xmp:Label="([^"]+)"'),
}

# Dublin Core elements whose value is the first rdf:li of their container.
_XMP_FIRST_ITEM_ELEMENTS = {
    "title": "dc:title",
    "description": "dc:description",
    "creator": "dc:creator",
}

_RDF_LI_PATTERN = re.compile(r"<rdf:li[^>]*>([^<]+)")
_XMP_FACE_PATTERN = re.compile(r'mwg-rs:Type="Face"')


def file_hash_sha256(path: Path, chunk_size: int = 65536) -> str:
    """Return the SHA-256 of the whole file (detects exact copies).

    The file is read in ``chunk_size`` pieces so large photos are never
    loaded into memory at once.
    """
    digest = hashlib.sha256()
    with open(path, "rb") as f:
        while chunk := f.read(chunk_size):
            digest.update(chunk)
    return digest.hexdigest()


def pixel_hash_sha256(path: Path) -> str | None:
    """Return the SHA-256 of the decoded pixels, or None if decoding fails.

    Identifies a photo independently of its metadata. The EXIF orientation
    is applied and the image is converted to RGB first so the result is
    consistent.
    """
    try:
        with Image.open(path) as img:
            img = ImageOps.exif_transpose(img)
            if img.mode != "RGB":
                img = img.convert("RGB")
            return hashlib.sha256(img.tobytes()).hexdigest()
    except Exception:
        # Deliberate best effort: an undecodable file simply has no pixel hash.
        return None


def perceptual_hashes(path: Path) -> dict:
    """Return perceptual hashes used to detect visually similar photos.

    Keys are ``phash``, ``dhash``, ``ahash`` and ``whash``; values are the
    string form of the imagehash result and are compared via Hamming
    distance. If anything fails, the hashes computed so far are kept and
    the message is stored under ``_error``.
    """
    algorithms = (
        ("phash", imagehash.phash),
        ("dhash", imagehash.dhash),
        ("ahash", imagehash.average_hash),
        ("whash", imagehash.whash),
    )
    out = {}
    try:
        with Image.open(path) as img:
            img = ImageOps.exif_transpose(img)
            for name, compute in algorithms:
                out[name] = str(compute(img))
    except Exception as e:
        out["_error"] = str(e)
    return out


def iptc_info(path: Path) -> dict:
    """Return IPTC metadata (keywords, title, description, author, ...).

    Known tags are renamed via ``_IPTC_NAMES``; unknown ones are keyed as
    ``IPTC<tag>``. Byte values are decoded as UTF-8 with replacement.
    Returns an empty dict when the image carries no IPTC data; on failure
    the message is stored under ``_error``.
    """
    out = {}
    try:
        with Image.open(path) as img:
            raw = IptcImagePlugin.getiptcinfo(img)
            if not raw:
                return {}
            for key, value in raw.items():
                name = _IPTC_NAMES.get(key, f"IPTC{key}")
                if isinstance(value, bytes):
                    value = value.decode("utf-8", errors="replace")
                elif isinstance(value, list):
                    value = [
                        v.decode("utf-8", errors="replace")
                        if isinstance(v, bytes)
                        else v
                        for v in value
                    ]
                out[name] = value
    except Exception as e:
        out["_error"] = str(e)
    return out


def _xmp_list_items(xmp_text: str, element: str) -> list[str]:
    """Return the stripped rdf:li texts inside the first ``element``."""
    container = re.search(
        rf"<{element}[^>]*>(.*?)</{element}>", xmp_text, re.DOTALL
    )
    if not container:
        return []
    items = (item.strip() for item in _RDF_LI_PATTERN.findall(container.group(1)))
    return [item for item in items if item]


def xmp_info(path: Path) -> dict:
    """Return a handful of common XMP fields extracted with regexes.

    XMP is the modern alternative to IPTC and often carries keywords,
    ratings and regions. This is a very simple parser, not a full XML one:

    * ``xmp:*`` attributes listed in ``_XMP_ATTRIBUTE_PATTERNS``,
    * first ``rdf:li`` of ``dc:title`` / ``dc:description`` / ``dc:creator``,
    * all ``rdf:li`` of ``dc:subject`` as ``subject_keywords`` (only when
      at least one keyword is present),
    * ``face_regions_count`` - number of ``mwg-rs:Type="Face"`` regions,
    * ``_xmp_length_bytes`` - size of the raw packet in bytes.

    Returns an empty dict when the image has no XMP packet; on failure the
    message is stored under ``_error``.
    """
    out = {}
    try:
        with Image.open(path) as img:
            xmp_raw = img.info.get("xmp")
        if not xmp_raw:
            return {}

        # Measure the packet before decoding so the value really is bytes.
        if isinstance(xmp_raw, bytes):
            xmp_length = len(xmp_raw)
            xmp_text = xmp_raw.decode("utf-8", errors="replace")
        else:
            xmp_text = str(xmp_raw)
            xmp_length = len(xmp_text.encode("utf-8"))

        for name, pattern in _XMP_ATTRIBUTE_PATTERNS.items():
            match = pattern.search(xmp_text)
            if match:
                out[name] = match.group(1).strip()

        for name, element in _XMP_FIRST_ITEM_ELEMENTS.items():
            items = _xmp_list_items(xmp_text, element)
            if items:
                out[name] = items[0]

        # Keywords from dc:subject - one entry per rdf:li.
        keywords = _xmp_list_items(xmp_text, "dc:subject")
        if keywords:
            out["subject_keywords"] = keywords

        # Face regions (recognized faces with their position).
        face_count = len(_XMP_FACE_PATTERN.findall(xmp_text))
        if face_count:
            out["face_regions_count"] = face_count

        # Raw XMP length, to give an idea of how much data is in there.
        out["_xmp_length_bytes"] = xmp_length
    except Exception as e:
        out["_error"] = str(e)
    return out


def filesystem_info(path: Path) -> dict:
    """Return name, path, size, timestamps and suffix from ``path.stat()``.

    Timestamps are naive local-time ISO strings. Note that ``st_ctime`` is
    platform dependent (creation time on Windows, metadata change time on
    Unix).

    Raises:
        OSError: if the file cannot be stat'ed.
    """
    stat = path.stat()
    return {
        "file_name": path.name,
        "file_path": str(path),
        "file_size_bytes": stat.st_size,
        "file_size_mb": round(stat.st_size / 1024 / 1024, 2),
        "mtime": datetime.fromtimestamp(stat.st_mtime).isoformat(),
        "ctime": datetime.fromtimestamp(stat.st_ctime).isoformat(),
        "extension": path.suffix.lower(),
    }


def pillow_info(path: Path) -> dict:
    """Return basic image properties as reported by Pillow.

    Covers format, mode, dimensions, megapixels, transparency, DPI and the
    presence of an ICC profile, EXIF block, XMP packet (first 500 chars as
    ``xmp_snippet``) and a ``thumbnail`` entry in ``img.info``. On failure
    the message is stored under ``error``.
    """
    info = {}
    try:
        with Image.open(path) as img:
            info["format"] = img.format
            info["mode"] = img.mode
            info["width"] = img.width
            info["height"] = img.height
            info["megapixels"] = round((img.width * img.height) / 1_000_000, 2)
            info["has_transparency"] = (
                img.mode in ("RGBA", "LA") or "transparency" in img.info
            )
            info["dpi"] = img.info.get("dpi")
            info["icc_profile_present"] = "icc_profile" in img.info
            info["exif_present"] = bool(img.getexif())
            # XMP (common in JPEGs written by Adobe tools).
            if "xmp" in img.info:
                xmp_raw = img.info["xmp"]
                if isinstance(xmp_raw, bytes):
                    xmp_raw = xmp_raw[:500].decode("utf-8", errors="ignore")
                info["xmp_snippet"] = str(xmp_raw)[:500]
            # Embedded thumbnail?
            info["has_embedded_thumbnail"] = "thumbnail" in img.info
    except Exception as e:
        info["error"] = str(e)
    return info


def pillow_exif(path: Path) -> dict:
    """Return the EXIF tags Pillow exposes, keyed by readable names.

    Unknown tags are keyed as ``Tag<id>``. GPS data is returned as a nested
    dict under ``GPSInfo``. Returns an empty dict when there is no EXIF; on
    failure the message is stored under ``_error``.
    """
    out = {}
    try:
        with Image.open(path) as img:
            exif = img.getexif()
            if not exif:
                return {}
            for tag_id, value in exif.items():
                tag = TAGS.get(tag_id, f"Tag{tag_id}")
                if tag == "GPSInfo":
                    # getexif() may hand back only the integer offset of the
                    # GPS IFD; resolve it to the actual tag mapping.
                    gps_ifd = (
                        value if isinstance(value, dict) else exif.get_ifd(tag_id)
                    )
                    out[tag] = {
                        GPSTAGS.get(gps_id, f"GPSTag{gps_id}"): _serializable(
                            gps_value
                        )
                        for gps_id, gps_value in gps_ifd.items()
                    }
                else:
                    out[tag] = _serializable(value)
    except Exception as e:
        out["_error"] = str(e)
    return out


def exifread_tags(path: Path) -> dict:
    """Return all tags found by ExifRead as strings.

    ExifRead often yields more tags than Pillow, including detailed
    MakerNote entries. Thumbnail entries are skipped (binary payload),
    except those whose key contains ``JPEGInterchangeFormat``. On failure
    the message is stored under ``_error``.
    """
    out = {}
    try:
        with open(path, "rb") as f:
            tags = exifread.process_file(f, details=True)
        for key, value in tags.items():
            # Skip the binary thumbnail.
            if "Thumbnail" in key and "JPEGInterchangeFormat" not in key:
                continue
            out[key] = str(value)
    except Exception as e:
        out["_error"] = str(e)
    return out


def _serializable(v):
    """Convert a Pillow EXIF value into something JSON-friendly.

    Pillow sometimes returns IFDRational, bytes and similar objects.

    * ``None``, ``bool``, ``int`` and ``str`` pass through unchanged
      (ints also expose numerator/denominator, so they must be handled
      before the rational branch or they would turn into floats).
    * Non-finite floats become strings, because NaN/Infinity are not valid
      JSON.
    * ``bytes`` are truncated to 200 bytes and decoded as UTF-8.
    * Tuples/lists and dicts are converted recursively (dict keys become
      strings).
    * Rational-like objects become floats, or strings when the conversion
      fails or is not finite (e.g. zero denominator).
    * Anything else is returned as-is if ``json.dumps`` accepts it,
      otherwise as ``str(v)``.
    """
    if v is None or isinstance(v, (bool, int, str)):
        return v
    if isinstance(v, float):
        return v if math.isfinite(v) else str(v)
    if isinstance(v, bytes):
        return v[:200].decode("utf-8", errors="replace")
    if isinstance(v, (tuple, list)):
        return [_serializable(x) for x in v]
    if isinstance(v, dict):
        return {str(k): _serializable(val) for k, val in v.items()}
    if hasattr(v, "numerator") and hasattr(v, "denominator"):
        try:
            as_float = float(v)
        except (TypeError, ValueError, ZeroDivisionError, OverflowError):
            return str(v)
        return as_float if math.isfinite(as_float) else str(v)
    try:
        json.dumps(v)
        return v
    except (TypeError, ValueError):
        return str(v)


def explore_photo(path: Path) -> dict:
    """Collect every available piece of information about one photo.

    Raises:
        OSError: if the file cannot be stat'ed or read for hashing; the
            individual metadata extractors report their own failures
            inside their result dicts instead of raising.
    """
    return {
        "filesystem": filesystem_info(path),
        "hashes": {
            "sha256_file": file_hash_sha256(path),
            "sha256_pixels": pixel_hash_sha256(path),
            **perceptual_hashes(path),
        },
        "pillow": pillow_info(path),
        "exif_pillow": pillow_exif(path),
        "exif_exifread": exifread_tags(path),
        "iptc": iptc_info(path),
        "xmp": xmp_info(path),
    }


def hamming_distance(h1: str, h2: str) -> int:
    """Return the Hamming distance between two hex perceptual hashes.

    Raises:
        ValueError: if either argument is not a hexadecimal string.
    """
    return bin(int(h1, 16) ^ int(h2, 16)).count("1")


def _public_field_count(fields: dict) -> int:
    """Count keys that are not internal (internal keys start with ``_``)."""
    return sum(1 for key in fields if not key.startswith("_"))


def _print_photo_details(index: int, photo: dict) -> None:
    """Print the per-photo section of the summary."""
    fs = photo["filesystem"]
    pi = photo["pillow"]
    h = photo["hashes"]
    er = photo["exif_exifread"]

    print(f"[{index}] {fs['file_name']}")
    print(f" Velikost: {fs['file_size_mb']} MB ({pi.get('width')}x{pi.get('height')}, {pi.get('megapixels')} Mpx)")
    print(f" Formát: {pi.get('format')} / mode={pi.get('mode')}")
    print(f" sha256_file: {h['sha256_file'][:16]}...")
    print(f" sha256_pixels: {(h.get('sha256_pixels') or 'N/A')[:16]}...")
    print(f" phash: {h.get('phash')} (perceptual)")
    print(f" EXIF tagů: ExifRead={len(er)}, Pillow={len(photo['exif_pillow'])}")
    print(f" IPTC polí: {_public_field_count(photo['iptc'])}")
    print(f" XMP polí: {_public_field_count(photo['xmp'])}")

    # The highlights are taken from ExifRead, which the author found more
    # reliable than Pillow (notably for GPS data).
    interesting = {
        "Kamera": f"{er.get('Image Make', '')} {er.get('Image Model', '')}".strip(),
        "Objektiv": er.get("EXIF LensModel"),
        "Datum": er.get("EXIF DateTimeOriginal") or er.get("Image DateTime"),
        "TZ offset": er.get("EXIF OffsetTimeOriginal") or er.get("EXIF OffsetTime"),
        "Clona": er.get("EXIF FNumber"),
        "ISO": er.get("EXIF ISOSpeedRatings"),
        "Expozice": er.get("EXIF ExposureTime"),
        "Ohnisko mm": er.get("EXIF FocalLength"),
        "Flash": er.get("EXIF Flash"),
        "GPS lat": er.get("GPS GPSLatitude"),
        "GPS lon": er.get("GPS GPSLongitude"),
        "Software": er.get("Image Software"),
    }
    for k, v in interesting.items():
        if v and str(v).strip():
            print(f" {k:12s}: {v}")

    # IPTC / XMP - print everything that is there.
    for k, v in photo["iptc"].items():
        if not k.startswith("_"):
            print(f" IPTC.{k:8s}: {v}")
    for k, v in photo["xmp"].items():
        if not k.startswith("_"):
            print(f" XMP.{k:9s}: {v}")
    print()


def _print_similarity_table(photos: list[dict]) -> None:
    """Print the pairwise phash Hamming-distance matrix.

    Every cell is right-aligned to a fixed width so the columns line up
    regardless of the cell content; pairs with distance <= 10 are marked
    with ``*``.
    """
    cell_width = 6
    print(f"{'=' * 70}")
    print("PERCEPTUÁLNÍ PODOBNOST (phash Hamming distance)")
    print("Hodnota 0-10 = vizuálně velmi podobné, >20 = odlišné")
    print(f"{'=' * 70}")

    labels = [f"[{i}]" for i in range(1, len(photos) + 1)]
    phashes = [p["hashes"].get("phash") for p in photos]

    print(" " * cell_width + "".join(f"{label:>{cell_width}}" for label in labels))
    for i, (label, h1) in enumerate(zip(labels, phashes)):
        cells = []
        for j, h2 in enumerate(phashes):
            if i == j:
                text = "- "
            elif h1 and h2:
                d = hamming_distance(h1, h2)
                text = f"{d}{'*' if d <= 10 else ' '}"
            else:
                text = "N/A "
            cells.append(f"{text:>{cell_width}}")
        print(f"{label:>{cell_width}}" + "".join(cells))

    print("\n * = vizuálně podobné fotky (možná duplikát po editaci)")
    print()


def print_summary(photos: list[dict]) -> None:
    """Print a human-readable overview of all explored photos.

    Shows the number of distinct EXIF tags seen across all photos, a detail
    section per photo, and the perceptual-similarity table.
    """
    print(f"\n{'=' * 70}")
    print(f"PŘEHLED: {len(photos)} fotek")
    print(f"{'=' * 70}\n")

    # Which EXIF tags exist across the photos?
    all_pillow_keys = set()
    all_exifread_keys = set()
    for p in photos:
        all_pillow_keys.update(p["exif_pillow"])
        all_exifread_keys.update(p["exif_exifread"])

    print(f"Unikátní EXIF tagy (Pillow): {len(all_pillow_keys)}")
    print(f"Unikátní EXIF tagy (ExifRead): {len(all_exifread_keys)}")
    print()

    for i, p in enumerate(photos, 1):
        _print_photo_details(i, p)

    _print_similarity_table(photos)


def main():
    """Explore every photo in PHOTOS_DIR, print a summary and write the JSON.

    A photo that fails to process is reported and skipped; the remaining
    photos are still summarized and saved.
    """
    # is_dir() rather than exists(): a plain file at that path would make
    # iterdir() raise.
    if not PHOTOS_DIR.is_dir():
        print(f"[ERROR] Složka neexistuje: {PHOTOS_DIR}")
        return

    files = sorted(
        p
        for p in PHOTOS_DIR.iterdir()
        if p.is_file() and p.suffix.lower() in PHOTO_EXTENSIONS
    )

    if not files:
        print(f"[WARN] Žádné fotky v {PHOTOS_DIR}")
        return

    print(f"Nalezeno {len(files)} fotek v {PHOTOS_DIR}\n")

    photos = []
    for f in files:
        print(f" zpracovávám: {f.name} ...", end=" ", flush=True)
        try:
            photos.append(explore_photo(f))
            print("OK")
        except Exception as e:
            print(f"FAIL: {e}")

    print_summary(photos)

    # Save to JSON for detailed analysis.
    with open(OUTPUT_JSON, "w", encoding="utf-8") as f:
        json.dump(photos, f, indent=2, ensure_ascii=False, default=str)
    print(f"\n[OK] Detailní data uložena: {OUTPUT_JSON}")


if __name__ == "__main__":
    main()