Files
fotkyBuzalkovi/explore_photos.py
T
administrator b6aba06baa notebookVb
2026-05-21 07:11:54 +02:00

399 lines
14 KiB
Python

"""
Explorační skript: projde všechny fotky v demo_fotky/ a vytáhne maximum dat.
Výstup do konzole + JSON soubor pro detailní analýzu.
"""
import hashlib
import json
import os
import re
import sys
from datetime import datetime
from pathlib import Path
# Windows console: force UTF-8 so non-ASCII output prints correctly.
# Each stream is checked on its own (stderr may differ from stdout), and the
# check tolerates streams whose ``encoding`` is None or that were replaced by
# an object without ``reconfigure``.
for _stream in (sys.stdout, sys.stderr):
    _encoding = (getattr(_stream, "encoding", None) or "").lower()
    if _encoding != "utf-8" and hasattr(_stream, "reconfigure"):
        _stream.reconfigure(encoding="utf-8")
# Do not leave the loop helpers behind as module attributes.
del _stream, _encoding
import exifread
import imagehash
from PIL import Image, ImageOps, IptcImagePlugin
from PIL.ExifTags import TAGS, GPSTAGS
# Both locations sit next to this script: the folder that is scanned for
# photos and the JSON report that receives the detailed results.
_SCRIPT_DIR = Path(__file__).parent
PHOTOS_DIR = _SCRIPT_DIR / "demo_fotky"
OUTPUT_JSON = _SCRIPT_DIR / "photo_exploration.json"
def file_hash_sha256(path: Path, chunk_size: int = 65536) -> str:
    """Return the SHA-256 hex digest of the file's raw bytes.

    Used to detect exact copies of a file. The file is read in pieces of
    ``chunk_size`` bytes so it never has to be loaded into memory at once.
    """
    digest = hashlib.sha256()
    with open(path, "rb") as stream:
        # iter() with a sentinel stops at the first empty read (end of file).
        for piece in iter(lambda: stream.read(chunk_size), b""):
            digest.update(piece)
    return digest.hexdigest()
def pixel_hash_sha256(path: Path) -> str | None:
"""Hash dekódovaných pixelů - identita fotky nezávisle na metadatech.
Aplikuje EXIF orientation pro konzistenci."""
try:
with Image.open(path) as img:
img = ImageOps.exif_transpose(img)
if img.mode != "RGB":
img = img.convert("RGB")
return hashlib.sha256(img.tobytes()).hexdigest()
except Exception as e:
return None
def perceptual_hashes(path: Path) -> dict:
    """Compute perceptual hashes used to spot visually similar photos.

    Returns a dict with the string forms of the ``phash``, ``dhash``,
    ``ahash`` and ``whash`` values computed by ``imagehash`` on the
    EXIF-transposed image. Hashes are meant to be compared by Hamming
    distance. If anything fails, the hashes computed so far are kept and the
    exception text is stored under ``_error``.
    """
    # (result key, name of the imagehash function) in the order they are computed.
    algorithms = (
        ("phash", "phash"),
        ("dhash", "dhash"),
        ("ahash", "average_hash"),
        ("whash", "whash"),
    )
    hashes = {}
    try:
        with Image.open(path) as source:
            upright = ImageOps.exif_transpose(source)
            for key, function_name in algorithms:
                hashes[key] = str(getattr(imagehash, function_name)(upright))
    except Exception as exc:
        hashes["_error"] = str(exc)
    return hashes
def iptc_info(path: Path) -> dict:
    """Read IPTC metadata (keywords, title, description, author, ...).

    Returns a dict keyed by readable field names; IPTC keys without a known
    name are stored as ``IPTC<key>``. Byte values (also inside lists) are
    decoded as UTF-8 with replacement characters. An image without IPTC data
    yields ``{}``; on failure the exception text is stored under ``_error``.
    """
    # Numeric IPTC (record, dataset) keys mapped to readable names.
    field_names = {
        (2, 5): "ObjectName",  # Title
        (2, 10): "Urgency",
        (2, 15): "Category",
        (2, 20): "SupplementalCategories",
        (2, 25): "Keywords",
        (2, 40): "SpecialInstructions",
        (2, 55): "DateCreated",
        (2, 60): "TimeCreated",
        (2, 80): "Byline",  # Creator/Author
        (2, 85): "BylineTitle",
        (2, 90): "City",
        (2, 92): "SubLocation",
        (2, 95): "ProvinceState",
        (2, 100): "CountryCode",
        (2, 101): "CountryName",
        (2, 103): "OriginalTransmissionReference",
        (2, 105): "Headline",
        (2, 110): "Credit",
        (2, 115): "Source",
        (2, 116): "Copyright",
        (2, 118): "Contact",
        (2, 120): "Caption",  # Description
        (2, 122): "WriterEditor",
    }

    def as_text(item):
        """Decode bytes to text; hand every other value back unchanged."""
        if isinstance(item, bytes):
            return item.decode("utf-8", errors="replace")
        return item

    fields = {}
    try:
        with Image.open(path) as img:
            records = IptcImagePlugin.getiptcinfo(img)
        if records:
            for key, value in records.items():
                label = field_names.get(key, f"IPTC{key}")
                if isinstance(value, list):
                    fields[label] = [as_text(entry) for entry in value]
                else:
                    fields[label] = as_text(value)
    except Exception as exc:
        fields["_error"] = str(exc)
    return fields
def xmp_info(path: Path) -> dict:
    """Extract commonly used XMP fields from the image's embedded XMP packet.

    The packet is taken from Pillow's ``img.info["xmp"]`` and scanned with
    regular expressions instead of a real XML parser, so only the fields
    listed in ``patterns`` are picked up.

    Returns:
        ``{}`` when the image carries no XMP packet. Otherwise a dict holding
        any of ``creator_tool``, ``create_date``, ``modify_date``, ``rating``,
        ``label``, ``title``, ``description``, ``creator`` (strings),
        ``subject_keywords`` (list of strings), ``face_regions_count`` (int,
        only when non-zero) and ``_xmp_length_bytes`` (size of the raw packet
        in bytes). On failure the exception text is stored under ``_error``
        next to whatever was collected so far.
    """
    out = {}
    try:
        with Image.open(path) as img:
            xmp_raw = img.info.get("xmp")
        if not xmp_raw:
            return {}
        if isinstance(xmp_raw, bytes):
            # Measure before decoding: after decoding, len() counts characters,
            # which is smaller than the byte size for non-ASCII packets.
            raw_length = len(xmp_raw)
            xmp_text = xmp_raw.decode("utf-8", errors="replace")
        else:
            xmp_text = xmp_raw
            raw_length = len(xmp_text.encode("utf-8", errors="replace"))

        # Very simple parser: pull the most common fields out with regexes.
        patterns = {
            "creator_tool": r'xmp:CreatorTool="([^"]+)"',
            "create_date": r'xmp:CreateDate="([^"]+)"',
            "modify_date": r'xmp:ModifyDate="([^"]+)"',
            "rating": r'xmp:Rating="([^"]+)"',
            "label": r'xmp:Label="([^"]+)"',
            "title": r'<dc:title[^>]*>.*?<rdf:li[^>]*>([^<]+)</rdf:li>',
            "description": r'<dc:description[^>]*>.*?<rdf:li[^>]*>([^<]+)</rdf:li>',
            "creator": r'<dc:creator[^>]*>.*?<rdf:li[^>]*>([^<]+)</rdf:li>',
            "subject_keywords": r'<dc:subject[^>]*>(.*?)</dc:subject>',
        }
        for name, pat in patterns.items():
            m = re.search(pat, xmp_text, re.DOTALL)
            if m:
                out[name] = m.group(1).strip()

        # dc:subject was captured as one blob; split it into its rdf:li items.
        if "subject_keywords" in out:
            out["subject_keywords"] = re.findall(
                r'<rdf:li[^>]*>([^<]+)</rdf:li>', out["subject_keywords"]
            )

        # Region metadata: count the regions tagged as faces.
        face_count = len(re.findall(r'mwg-rs:Type="Face"', xmp_text))
        if face_count:
            out["face_regions_count"] = face_count

        # Size of the raw packet, as a rough indication of how much XMP there is.
        out["_xmp_length_bytes"] = raw_length
    except Exception as e:
        out["_error"] = str(e)
    return out
def filesystem_info(path: Path) -> dict:
    """Collect basic file-system facts about ``path``.

    Returns name, full path, size (bytes and MiB rounded to two decimals),
    ``mtime``/``ctime`` from ``os.stat`` as local-time ISO strings, and the
    lower-cased file extension. Raises whatever ``Path.stat`` raises.
    """

    def iso(timestamp: float) -> str:
        """Format a POSIX timestamp as a naive local ISO-8601 string."""
        return datetime.fromtimestamp(timestamp).isoformat()

    st = path.stat()
    size_bytes = st.st_size

    info = {}
    info["file_name"] = path.name
    info["file_path"] = str(path)
    info["file_size_bytes"] = size_bytes
    info["file_size_mb"] = round(size_bytes / 1024 / 1024, 2)
    info["mtime"] = iso(st.st_mtime)
    info["ctime"] = iso(st.st_ctime)
    info["extension"] = path.suffix.lower()
    return info
def pillow_info(path: Path) -> dict:
    """Describe the image as Pillow sees it.

    Reports format, mode, dimensions, megapixels, transparency, DPI, and
    whether an ICC profile, EXIF data, XMP data (first 500 characters as
    ``xmp_snippet``) or a ``thumbnail`` info entry are present. If anything
    fails, the fields gathered so far are kept and the exception text is
    stored under ``error``.
    """
    details = {}
    try:
        with Image.open(path) as img:
            meta = img.info
            details["format"] = img.format
            details["mode"] = img.mode
            details["width"] = img.width
            details["height"] = img.height
            details["megapixels"] = round((img.width * img.height) / 1_000_000, 2)
            alpha_mode = img.mode in ("RGBA", "LA")
            details["has_transparency"] = alpha_mode or "transparency" in meta
            details["dpi"] = meta.get("dpi")
            details["icc_profile_present"] = "icc_profile" in meta
            details["exif_present"] = bool(img.getexif())
            # XMP packet (the original author notes it is common in Adobe JPEGs).
            if "xmp" in meta:
                snippet = meta["xmp"]
                if isinstance(snippet, bytes):
                    snippet = snippet[:500].decode("utf-8", errors="ignore")
                details["xmp_snippet"] = str(snippet)[:500]
            # Does the info dict carry a "thumbnail" entry?
            details["has_embedded_thumbnail"] = "thumbnail" in meta
    except Exception as exc:
        details["error"] = str(exc)
    return details
def pillow_exif(path: Path) -> dict:
    """Read EXIF via Pillow, keyed by readable tag names.

    Unknown tag ids are stored as ``Tag<id>``. ``GPSInfo`` becomes a nested
    dict keyed by GPS tag names (``GPSTag<id>`` when unknown). All values go
    through ``_serializable`` so the result can be written as JSON.

    Returns:
        ``{}`` when the image has no EXIF data. On failure the tags gathered
        so far are kept and the exception text is stored under ``_error``.
    """
    out = {}
    try:
        with Image.open(path) as img:
            exif = img.getexif()
            if not exif:
                return {}
            for tag_id, value in exif.items():
                tag = TAGS.get(tag_id, f"Tag{tag_id}")
                if tag == "GPSInfo":
                    # Depending on the Pillow version the value is either the
                    # GPS dict itself or just the integer offset of the GPS
                    # IFD. Calling .items() on the offset used to abort the
                    # whole function, so resolve the offset through get_ifd().
                    gps_ifd = value if hasattr(value, "items") else exif.get_ifd(tag_id)
                    out[tag] = {
                        GPSTAGS.get(gps_tag_id, f"GPSTag{gps_tag_id}"): _serializable(gps_value)
                        for gps_tag_id, gps_value in gps_ifd.items()
                    }
                else:
                    out[tag] = _serializable(value)
    except Exception as e:
        out["_error"] = str(e)
    return out
def exifread_tags(path: Path) -> dict:
    """Read EXIF via ExifRead and return every tag as a string.

    The original author notes that ExifRead often yields more tags than
    Pillow, including detailed MakerNote entries. Tags whose name contains
    ``Thumbnail`` are dropped (binary thumbnail data), except the
    ``JPEGInterchangeFormat`` ones. On failure the exception text is stored
    under ``_error``.
    """
    collected = {}
    try:
        with open(path, "rb") as handle:
            parsed = exifread.process_file(handle, details=True)
        for name, tag_value in parsed.items():
            thumbnail_blob = "Thumbnail" in name and "JPEGInterchangeFormat" not in name
            if not thumbnail_blob:
                collected[name] = str(tag_value)
    except Exception as exc:
        collected["_error"] = str(exc)
    return collected
def _serializable(v):
    """Convert a value returned by Pillow into something ``json`` can write.

    Conversion rules, applied in this order:
      * ``None``, ``bool``, ``int`` and ``str`` are returned unchanged;
      * ``float`` is returned unchanged unless it is NaN/infinite, which
        becomes ``None`` (NaN and Infinity are not valid strict JSON);
      * ``bytes`` are cut to the first 200 bytes and decoded as UTF-8 with
        replacement characters;
      * tuples/lists become lists, dicts get string keys, both recursively;
      * rational-like objects (anything with ``numerator``/``denominator``,
        e.g. Pillow's IFDRational) become ``float``, or ``None`` when the
        result is not finite (zero denominator), or ``str(v)`` when the
        conversion raises;
      * anything else is returned as-is if ``json.dumps`` accepts it,
        otherwise as ``str(v)``.
    """
    import math  # local import: only this helper needs it

    # Plain JSON scalars must be handled before the rational branch: int and
    # bool also expose numerator/denominator and would otherwise be turned
    # into floats (100 -> 100.0, True -> 1.0).
    if v is None or isinstance(v, (bool, int, str)):
        return v
    if isinstance(v, float):
        return v if math.isfinite(v) else None
    if isinstance(v, bytes):
        return v[:200].decode("utf-8", errors="replace")
    if isinstance(v, (tuple, list)):
        return [_serializable(x) for x in v]
    if isinstance(v, dict):
        return {str(k): _serializable(val) for k, val in v.items()}
    if hasattr(v, "numerator") and hasattr(v, "denominator"):
        try:
            as_float = float(v)
        except Exception:
            return str(v)
        return as_float if math.isfinite(as_float) else None
    try:
        json.dumps(v)
        return v
    except (TypeError, ValueError):
        return str(v)
def explore_photo(path: Path) -> dict:
    """Gather everything this script can extract from one photo.

    The result has the sections ``filesystem``, ``hashes`` (file digest,
    pixel digest and the perceptual hashes merged into one dict), ``pillow``,
    ``exif_pillow``, ``exif_exifread``, ``iptc`` and ``xmp``.
    """
    record = {"filesystem": filesystem_info(path)}

    hashes = {
        "sha256_file": file_hash_sha256(path),
        "sha256_pixels": pixel_hash_sha256(path),
    }
    hashes.update(perceptual_hashes(path))
    record["hashes"] = hashes

    record["pillow"] = pillow_info(path)
    record["exif_pillow"] = pillow_exif(path)
    record["exif_exifread"] = exifread_tags(path)
    record["iptc"] = iptc_info(path)
    record["xmp"] = xmp_info(path)
    return record
def hamming_distance(h1: str, h2: str) -> int:
    """Return the number of differing bits between two hex-encoded hashes."""
    differing_bits = int(h1, 16) ^ int(h2, 16)
    return format(differing_bits, "b").count("1")
def print_summary(photos: list[dict]) -> None:
    """Print a console report for the explored photos.

    The report has three parts: a header with the number of photos and the
    count of distinct EXIF tag names per library, one detail section per
    photo, and a matrix of pairwise phash Hamming distances in which pairs
    with a distance of 10 or less are starred.

    ``photos`` holds dicts shaped like the result of ``explore_photo``.
    """
    rule = "=" * 70
    total = len(photos)

    print(f"\n{rule}")
    print(f"PŘEHLED: {total} fotek")
    print(f"{rule}\n")

    # Which EXIF tag names occur anywhere across the photos?
    pillow_tag_names = set()
    exifread_tag_names = set()
    for photo in photos:
        pillow_tag_names.update(photo["exif_pillow"].keys())
        exifread_tag_names.update(photo["exif_exifread"].keys())
    print(f"Unikátní EXIF tagy (Pillow): {len(pillow_tag_names)}")
    print(f"Unikátní EXIF tagy (ExifRead): {len(exifread_tag_names)}")
    print()

    for number, photo in enumerate(photos, 1):
        fs = photo["filesystem"]
        pil = photo["pillow"]
        hashes = photo["hashes"]
        er = photo["exif_exifread"]

        print(f"[{number}] {fs['file_name']}")
        print(f" Velikost: {fs['file_size_mb']} MB ({pil.get('width')}x{pil.get('height')}, {pil.get('megapixels')} Mpx)")
        print(f" Formát: {pil.get('format')} / mode={pil.get('mode')}")
        print(f" sha256_file: {hashes['sha256_file'][:16]}...")
        pixel_digest = hashes.get("sha256_pixels") or "N/A"
        print(f" sha256_pixels: {pixel_digest[:16]}...")
        print(f" phash: {hashes.get('phash')} (perceptual)")
        print(f" EXIF tagů: ExifRead={len(er)}, Pillow={len(photo['exif_pillow'])}")
        iptc_field_count = sum(1 for key in photo["iptc"] if not key.startswith("_"))
        print(f" IPTC polí: {iptc_field_count}")
        xmp_field_count = sum(1 for key in photo["xmp"] if not key.startswith("_"))
        print(f" XMP polí: {xmp_field_count}")

        # Highlights come from the ExifRead tags; the original author
        # considers them more reliable (notes a Pillow GPS bug).
        highlights = [
            ("Kamera", f"{er.get('Image Make', '')} {er.get('Image Model', '')}".strip()),
            ("Objektiv", er.get("EXIF LensModel")),
            ("Datum", er.get("EXIF DateTimeOriginal") or er.get("Image DateTime")),
            ("TZ offset", er.get("EXIF OffsetTimeOriginal") or er.get("EXIF OffsetTime")),
            ("Clona", er.get("EXIF FNumber")),
            ("ISO", er.get("EXIF ISOSpeedRatings")),
            ("Expozice", er.get("EXIF ExposureTime")),
            ("Ohnisko mm", er.get("EXIF FocalLength")),
            ("Flash", er.get("EXIF Flash")),
            ("GPS lat", er.get("GPS GPSLatitude")),
            ("GPS lon", er.get("GPS GPSLongitude")),
            ("Software", er.get("Image Software")),
        ]
        for label, shown in highlights:
            if shown and str(shown).strip():
                print(f" {label:12s}: {shown}")

        # IPTC / XMP: print every field except the internal "_" entries.
        for field, shown in (photo["iptc"] or {}).items():
            if not field.startswith("_"):
                print(f" IPTC.{field:8s}: {shown}")
        for field, shown in (photo["xmp"] or {}).items():
            if not field.startswith("_"):
                print(f" XMP.{field:9s}: {shown}")
        print()

    # Matrix of perceptual similarity (Hamming distance between phash values).
    print(rule)
    print("PERCEPTUÁLNÍ PODOBNOST (phash Hamming distance)")
    print("Hodnota 0-10 = vizuálně velmi podobné, >20 = odlišné")
    print(rule)

    print(" " + "".join(f" [{column}]" for column in range(1, total + 1)))
    phashes = [photo["hashes"].get("phash") for photo in photos]
    for i, own in enumerate(phashes):
        cells = []
        for j, other in enumerate(phashes):
            if i == j:
                cells.append(" -")
            elif own and other:
                distance = hamming_distance(own, other)
                star = "*" if distance <= 10 else " "
                cells.append(f" {distance:3d}{star}")
            else:
                cells.append(" N/A ")
        print(f" [{i + 1}] " + "".join(cells))
    print("\n * = vizuálně podobné fotky (možná duplikát po editaci)")
    print()
def main():
    """Explore every photo in PHOTOS_DIR, print the report and save the JSON.

    Only regular files with a known image extension are processed. A photo
    whose exploration raises is reported as FAIL and left out of the results.
    Returns early with a message when the folder is missing or holds no photos.
    """
    if not PHOTOS_DIR.exists():
        print(f"[ERROR] Složka neexistuje: {PHOTOS_DIR}")
        return

    photo_suffixes = {".jpg", ".jpeg", ".png", ".heic", ".tiff", ".tif", ".webp"}
    files = sorted(
        entry
        for entry in PHOTOS_DIR.iterdir()
        if entry.is_file() and entry.suffix.lower() in photo_suffixes
    )
    if not files:
        print(f"[WARN] Žádné fotky v {PHOTOS_DIR}")
        return

    print(f"Nalezeno {len(files)} fotek v {PHOTOS_DIR}\n")
    photos = []
    for photo_path in files:
        print(f" zpracovávám: {photo_path.name} ...", end=" ", flush=True)
        try:
            record = explore_photo(photo_path)
            photos.append(record)
            print("OK")
        except Exception as exc:
            # Top-level boundary: one broken photo must not stop the whole run.
            print(f"FAIL: {exc}")

    print_summary(photos)

    # Save everything as JSON for detailed analysis.
    with open(OUTPUT_JSON, "w", encoding="utf-8") as report:
        json.dump(photos, report, indent=2, ensure_ascii=False, default=str)
    print(f"\n[OK] Detailní data uložena: {OUTPUT_JSON}")


if __name__ == "__main__":
    main()