399 lines
14 KiB
Python
399 lines
14 KiB
Python
"""
|
|
Exploration script: walks every photo in demo_fotky/ and extracts as much data as possible.
Output goes to the console plus a JSON file for detailed analysis.
|
|
"""
|
|
import hashlib
|
|
import json
|
|
import os
|
|
import re
|
|
import sys
|
|
from datetime import datetime
|
|
from pathlib import Path
|
|
|
|
# Windows console: force UTF-8 on the standard streams so the non-ASCII
# (Czech) text this script prints can be encoded.
# Each stream is checked on its own. A stream that is None, reports no
# encoding, or has no reconfigure() method (e.g. it was replaced by a custom
# object) is skipped instead of raising at import time.
for _stream in (sys.stdout, sys.stderr):
    _encoding = (getattr(_stream, "encoding", None) or "").lower()
    if _encoding != "utf-8" and hasattr(_stream, "reconfigure"):
        _stream.reconfigure(encoding="utf-8")
|
|
|
|
import exifread
|
|
import imagehash
|
|
from PIL import Image, ImageOps, IptcImagePlugin
|
|
from PIL.ExifTags import TAGS, GPSTAGS
|
|
|
|
# Both paths are anchored to the directory that contains this script.
_SCRIPT_DIR = Path(__file__).parent
PHOTOS_DIR = _SCRIPT_DIR / "demo_fotky"  # folder scanned for photos
OUTPUT_JSON = _SCRIPT_DIR / "photo_exploration.json"  # where the JSON dump is written
|
|
|
|
|
|
def file_hash_sha256(path: Path, chunk_size: int = 65536) -> str:
    """Return the SHA-256 hex digest of the file's raw bytes.

    Used to detect exact copies: the digest covers the whole file content,
    metadata included. The file is read in pieces of ``chunk_size`` bytes
    and fed to the hash incrementally.
    """
    digest = hashlib.sha256()
    with open(path, "rb") as stream:
        # iter() with a sentinel stops at the first empty read, i.e. at EOF.
        for block in iter(lambda: stream.read(chunk_size), b""):
            digest.update(block)
    return digest.hexdigest()
|
|
|
|
|
|
def pixel_hash_sha256(path: Path) -> str | None:
|
|
"""Hash dekódovaných pixelů - identita fotky nezávisle na metadatech.
|
|
Aplikuje EXIF orientation pro konzistenci."""
|
|
try:
|
|
with Image.open(path) as img:
|
|
img = ImageOps.exif_transpose(img)
|
|
if img.mode != "RGB":
|
|
img = img.convert("RGB")
|
|
return hashlib.sha256(img.tobytes()).hexdigest()
|
|
except Exception as e:
|
|
return None
|
|
|
|
|
|
def perceptual_hashes(path: Path) -> dict:
    """Compute perceptual hashes used to spot visually similar photos.

    Returns a dict with the string forms of ``imagehash.phash``, ``dhash``,
    ``average_hash`` and ``whash`` under the keys ``phash``, ``dhash``,
    ``ahash`` and ``whash``. The hashes are computed on the image after the
    EXIF orientation has been applied. They are meant to be compared by
    Hamming distance (presumably 64-bit each with imagehash's default
    settings - confirm against the installed version).

    On any exception the message is stored under ``_error``; hashes computed
    before the failure are kept.
    """
    hashes = {}
    try:
        with Image.open(path) as source:
            upright = ImageOps.exif_transpose(source)
            # (result key, imagehash function name); the order fixes the key
            # order of the result.
            for key, function_name in (
                ("phash", "phash"),
                ("dhash", "dhash"),
                ("ahash", "average_hash"),
                ("whash", "whash"),
            ):
                hashes[key] = str(getattr(imagehash, function_name)(upright))
    except Exception as exc:
        hashes["_error"] = str(exc)
    return hashes
|
|
|
|
|
|
def iptc_info(path: Path) -> dict:
    """Read IPTC metadata (keywords, title, description, author, ...).

    Returns a dict keyed by a readable field name. Datasets that are not in
    the table below are keyed as ``IPTC<key>`` where ``<key>`` is the raw
    (record, dataset) tuple. ``bytes`` values, and ``bytes`` items inside
    list values, are decoded as UTF-8 with undecodable bytes replaced.

    Returns ``{}`` when Pillow reports no IPTC data. On any exception the
    message is stored under ``_error``.
    """
    # Numeric IPTC (record, dataset) pairs -> readable names.
    field_names = {
        (2, 5): "ObjectName",  # Title
        (2, 10): "Urgency",
        (2, 15): "Category",
        (2, 20): "SupplementalCategories",
        (2, 25): "Keywords",
        (2, 40): "SpecialInstructions",
        (2, 55): "DateCreated",
        (2, 60): "TimeCreated",
        (2, 80): "Byline",  # Creator/Author
        (2, 85): "BylineTitle",
        (2, 90): "City",
        (2, 92): "SubLocation",
        (2, 95): "ProvinceState",
        (2, 100): "CountryCode",
        (2, 101): "CountryName",
        (2, 103): "OriginalTransmissionReference",
        (2, 105): "Headline",
        (2, 110): "Credit",
        (2, 115): "Source",
        (2, 116): "Copyright",
        (2, 118): "Contact",
        (2, 120): "Caption",  # Description
        (2, 122): "WriterEditor",
    }

    def to_text(item):
        # Only bytes are decoded; every other type passes through untouched.
        if isinstance(item, bytes):
            return item.decode("utf-8", errors="replace")
        return item

    result = {}
    try:
        with Image.open(path) as img:
            records = IptcImagePlugin.getiptcinfo(img)
        if not records:
            return {}
        for tag, raw_value in records.items():
            if isinstance(raw_value, list):
                decoded = [to_text(entry) for entry in raw_value]
            else:
                decoded = to_text(raw_value)
            result[field_names.get(tag, f"IPTC{tag}")] = decoded
    except Exception as exc:
        result["_error"] = str(exc)
    return result
|
|
|
|
|
|
def xmp_info(path: Path) -> dict:
|
|
"""XMP metadata - moderní alternativa IPTC, často keywords/rating/regions."""
|
|
out = {}
|
|
try:
|
|
with Image.open(path) as img:
|
|
xmp_raw = img.info.get("xmp")
|
|
if not xmp_raw:
|
|
return {}
|
|
if isinstance(xmp_raw, bytes):
|
|
xmp_raw = xmp_raw.decode("utf-8", errors="replace")
|
|
|
|
# Velmi jednoduchý parser - vytáhne nejčastější pole regexem
|
|
patterns = {
|
|
"creator_tool": r'xmp:CreatorTool="([^"]+)"',
|
|
"create_date": r'xmp:CreateDate="([^"]+)"',
|
|
"modify_date": r'xmp:ModifyDate="([^"]+)"',
|
|
"rating": r'xmp:Rating="([^"]+)"',
|
|
"label": r'xmp:Label="([^"]+)"',
|
|
"title": r'<dc:title[^>]*>.*?<rdf:li[^>]*>([^<]+)</rdf:li>',
|
|
"description": r'<dc:description[^>]*>.*?<rdf:li[^>]*>([^<]+)</rdf:li>',
|
|
"creator": r'<dc:creator[^>]*>.*?<rdf:li[^>]*>([^<]+)</rdf:li>',
|
|
"subject_keywords": r'<dc:subject[^>]*>(.*?)</dc:subject>',
|
|
}
|
|
for name, pat in patterns.items():
|
|
m = re.search(pat, xmp_raw, re.DOTALL)
|
|
if m:
|
|
out[name] = m.group(1).strip()
|
|
|
|
# Keywords z dc:subject - vytáhnout jednotlivé rdf:li
|
|
if "subject_keywords" in out:
|
|
kws = re.findall(r'<rdf:li[^>]*>([^<]+)</rdf:li>', out["subject_keywords"])
|
|
out["subject_keywords"] = kws
|
|
|
|
# Apple regions (rozpoznané obličeje s pozicí)
|
|
face_count = len(re.findall(r'mwg-rs:Type="Face"', xmp_raw))
|
|
if face_count:
|
|
out["face_regions_count"] = face_count
|
|
|
|
# Délka raw XMP pro představu
|
|
out["_xmp_length_bytes"] = len(xmp_raw)
|
|
except Exception as e:
|
|
out["_error"] = str(e)
|
|
return out
|
|
|
|
|
|
def filesystem_info(path: Path) -> dict:
    """Collect basic file-system facts about ``path``.

    Returns name, full path, size in bytes and in MiB (rounded to two
    decimals), ``st_mtime`` and ``st_ctime`` as ISO strings, and the
    lower-cased extension. The timestamps are built with
    ``datetime.fromtimestamp`` without a time zone, i.e. they are naive.
    NOTE(review): the meaning of ``st_ctime`` is platform dependent - confirm
    before treating it as a creation time.

    Raises whatever ``path.stat()`` raises (e.g. for a missing file).
    """
    st = path.stat()
    size = st.st_size

    def iso(timestamp):
        return datetime.fromtimestamp(timestamp).isoformat()

    info = {"file_name": path.name, "file_path": str(path)}
    info["file_size_bytes"] = size
    info["file_size_mb"] = round(size / 1024 / 1024, 2)
    info["mtime"] = iso(st.st_mtime)
    info["ctime"] = iso(st.st_ctime)
    info["extension"] = path.suffix.lower()
    return info
|
|
|
|
|
|
def pillow_info(path: Path) -> dict:
    """Report what Pillow knows about the image without decoding metadata.

    Keys: ``format``, ``mode``, ``width``, ``height``, ``megapixels``,
    ``has_transparency``, ``dpi``, ``icc_profile_present``, ``exif_present``,
    optionally ``xmp_snippet`` (first 500 characters of the XMP packet) and
    ``has_embedded_thumbnail``.

    On any exception the message is stored under ``error`` (note: no leading
    underscore here); values gathered before the failure are kept.
    """
    details = {}
    try:
        with Image.open(path) as img:
            details["format"] = img.format
            details["mode"] = img.mode
            details["width"] = img.width
            details["height"] = img.height
            pixel_count = img.width * img.height
            details["megapixels"] = round(pixel_count / 1_000_000, 2)
            has_alpha_mode = img.mode in ("RGBA", "LA")
            details["has_transparency"] = has_alpha_mode or "transparency" in img.info
            details["dpi"] = img.info.get("dpi")
            details["icc_profile_present"] = "icc_profile" in img.info
            details["exif_present"] = bool(img.getexif())

            # XMP packet (often present in JPEGs written by Adobe tools).
            if "xmp" in img.info:
                packet = img.info["xmp"]
                if isinstance(packet, bytes):
                    # Cut before decoding so a huge packet is never decoded
                    # in full; a character split by the cut is dropped.
                    text = packet[:500].decode("utf-8", errors="ignore")
                else:
                    text = str(packet)
                details["xmp_snippet"] = text[:500]

            # Is there a "thumbnail" entry in Pillow's info dict?
            details["has_embedded_thumbnail"] = "thumbnail" in img.info
    except Exception as exc:
        details["error"] = str(exc)
    return details
|
|
|
|
|
|
def pillow_exif(path: Path) -> dict:
    """Read EXIF through Pillow and return it keyed by readable tag names.

    Tags are named via ``PIL.ExifTags.TAGS`` (unknown ids become
    ``Tag<id>``) and every value is passed through ``_serializable``.
    GPS data is returned as a nested dict under ``GPSInfo`` with names from
    ``GPSTAGS`` (unknown ids become ``GPSTag<id>``). Tags stored in the Exif
    sub-IFD are merged into the top level without overriding a tag of the
    same name that was already read from the main IFD.

    Returns ``{}`` when the image carries no EXIF. On any exception the
    message is stored under ``_error``; tags read before the failure are kept.
    """
    out = {}
    try:
        with Image.open(path) as img:
            exif = img.getexif()
            if not exif:
                return {}
            get_ifd = getattr(exif, "get_ifd", None)

            for tag_id, value in exif.items():
                tag = TAGS.get(tag_id, f"Tag{tag_id}")
                if tag == "GPSInfo":
                    # getexif() may hand back only the IFD offset (an int)
                    # for this tag; the GPS tags themselves then have to be
                    # fetched with get_ifd(). Calling .items() on the int
                    # used to abort the whole extraction.
                    if isinstance(value, dict):
                        gps_raw = value
                    else:
                        gps_raw = get_ifd(tag_id)
                    out[tag] = {
                        GPSTAGS.get(gps_tag_id, f"GPSTag{gps_tag_id}"): _serializable(gps_value)
                        for gps_tag_id, gps_value in gps_raw.items()
                    }
                else:
                    out[tag] = _serializable(value)

            # 0x8769 is the pointer to the Exif sub-IFD. Its tags are not
            # necessarily part of exif.items(), so read them explicitly.
            if get_ifd is not None:
                for tag_id, value in get_ifd(0x8769).items():
                    out.setdefault(TAGS.get(tag_id, f"Tag{tag_id}"), _serializable(value))
    except Exception as e:
        out["_error"] = str(e)
    return out
|
|
|
|
|
|
def exifread_tags(path: Path) -> dict:
    """Read EXIF with the ExifRead library and return ``{tag name: str(value)}``.

    ``process_file`` is called with ``details=True``. Every key containing
    ``"Thumbnail"`` is dropped (this is what keeps the binary thumbnail out
    of the result), except keys that also contain ``"JPEGInterchangeFormat"``.

    On any exception the message is stored under ``_error``.
    """
    collected = {}
    try:
        with open(path, "rb") as stream:
            parsed = exifread.process_file(stream, details=True)
        for name, tag in parsed.items():
            wanted = "Thumbnail" not in name or "JPEGInterchangeFormat" in name
            if wanted:
                collected[name] = str(tag)
    except Exception as exc:
        collected["_error"] = str(exc)
    return collected
|
|
|
|
|
|
def _serializable(v):
|
|
"""Pillow vrací občas IFDRational, bytes apod. → převést na JSON-friendly."""
|
|
if isinstance(v, bytes):
|
|
return v[:200].decode("utf-8", errors="replace")
|
|
if isinstance(v, (tuple, list)):
|
|
return [_serializable(x) for x in v]
|
|
if isinstance(v, dict):
|
|
return {str(k): _serializable(val) for k, val in v.items()}
|
|
if hasattr(v, "numerator") and hasattr(v, "denominator"):
|
|
try:
|
|
return float(v)
|
|
except Exception:
|
|
return str(v)
|
|
try:
|
|
json.dumps(v)
|
|
return v
|
|
except (TypeError, ValueError):
|
|
return str(v)
|
|
|
|
|
|
def explore_photo(path: Path) -> dict:
    """Run every extractor on one photo and bundle the results in one dict.

    Sections: ``filesystem``, ``hashes`` (file hash, pixel hash and the
    perceptual hashes merged in), ``pillow``, ``exif_pillow``,
    ``exif_exifread``, ``iptc`` and ``xmp``.
    """
    report = {"filesystem": filesystem_info(path)}

    hashes = {
        "sha256_file": file_hash_sha256(path),
        "sha256_pixels": pixel_hash_sha256(path),
    }
    # Perceptual hashes (and a possible "_error" entry) share this section.
    hashes.update(perceptual_hashes(path))
    report["hashes"] = hashes

    report["pillow"] = pillow_info(path)
    report["exif_pillow"] = pillow_exif(path)
    report["exif_exifread"] = exifread_tags(path)
    report["iptc"] = iptc_info(path)
    report["xmp"] = xmp_info(path)
    return report
|
|
|
|
|
|
def hamming_distance(h1: str, h2: str) -> int:
|
|
"""Hamming distance mezi dvěma hex perceptual hashes."""
|
|
return bin(int(h1, 16) ^ int(h2, 16)).count("1")
|
|
|
|
|
|
def _count_tags(section: dict) -> int:
    """Count the entries of an extractor result, ignoring "_"-prefixed bookkeeping keys."""
    return sum(1 for key in section if not key.startswith("_"))


def _print_photo(index: int, photo: dict) -> None:
    """Print the per-photo part of the summary for one explore_photo() result."""
    fs = photo["filesystem"]
    pi = photo["pillow"]
    h = photo["hashes"]
    er = photo["exif_exifread"]
    print(f"[{index}] {fs['file_name']}")
    print(f" Velikost: {fs['file_size_mb']} MB ({pi.get('width')}x{pi.get('height')}, {pi.get('megapixels')} Mpx)")
    print(f" Formát: {pi.get('format')} / mode={pi.get('mode')}")
    print(f" sha256_file: {h['sha256_file'][:16]}...")
    print(f" sha256_pixels: {(h.get('sha256_pixels') or 'N/A')[:16]}...")
    print(f" phash: {h.get('phash')} (perceptual)")
    # "_error" markers are not tags, so they are left out of every count.
    print(f" EXIF tagů: ExifRead={_count_tags(er)}, Pillow={_count_tags(photo['exif_pillow'])}")
    print(f" IPTC polí: {_count_tags(photo['iptc'])}")
    print(f" XMP polí: {_count_tags(photo['xmp'])}")

    # The highlights are taken from the ExifRead result (the original author
    # considers it more reliable than the Pillow one, which has a GPS problem).
    interesting = {
        "Kamera": f"{er.get('Image Make', '')} {er.get('Image Model', '')}".strip(),
        "Objektiv": er.get("EXIF LensModel"),
        "Datum": er.get("EXIF DateTimeOriginal") or er.get("Image DateTime"),
        "TZ offset": er.get("EXIF OffsetTimeOriginal") or er.get("EXIF OffsetTime"),
        "Clona": er.get("EXIF FNumber"),
        "ISO": er.get("EXIF ISOSpeedRatings"),
        "Expozice": er.get("EXIF ExposureTime"),
        "Ohnisko mm": er.get("EXIF FocalLength"),
        "Flash": er.get("EXIF Flash"),
        "GPS lat": er.get("GPS GPSLatitude"),
        "GPS lon": er.get("GPS GPSLongitude"),
        "Software": er.get("Image Software"),
    }
    for k, v in interesting.items():
        if v and str(v).strip():
            print(f" {k:12s}: {v}")

    # IPTC / XMP: print everything that is there, except bookkeeping keys.
    for k, v in photo["iptc"].items():
        if not k.startswith("_"):
            print(f" IPTC.{k:8s}: {v}")
    for k, v in photo["xmp"].items():
        if not k.startswith("_"):
            print(f" XMP.{k:9s}: {v}")
    print()


def _print_similarity_matrix(photos: list[dict]) -> None:
    """Print the pairwise phash Hamming-distance table for all photos."""
    print(f"{'=' * 70}")
    print("PERCEPTUÁLNÍ PODOBNOST (phash Hamming distance)")
    print("Hodnota 0-10 = vizuálně velmi podobné, >20 = odlišné")
    print(f"{'=' * 70}")
    phashes = [photo["hashes"].get("phash") for photo in photos]
    header = " " + "".join(f" [{i+1}]" for i in range(len(phashes)))
    print(header)
    for i, h1 in enumerate(phashes):
        cells = []
        for j, h2 in enumerate(phashes):
            if i == j:
                cells.append(" -")
            elif h1 and h2:
                d = hamming_distance(h1, h2)
                marker = "*" if d <= 10 else " "
                cells.append(f" {d:3d}{marker}")
            else:
                # at least one of the two photos has no phash
                cells.append(" N/A ")
        print(f" [{i+1}] " + "".join(cells))
    print("\n * = vizuálně podobné fotky (možná duplikát po editaci)")
    print()


def print_summary(photos: list[dict]) -> None:
    """Print a console overview of all explored photos.

    ``photos`` is a list of dicts shaped like the result of
    ``explore_photo``. The output has three parts: the number of distinct
    EXIF tag names seen by each library, a section per photo, and a table of
    pairwise phash Hamming distances in which pairs with a distance of at
    most 10 are marked with ``*``.

    Keys starting with ``_`` (such as ``_error``) are bookkeeping entries and
    are excluded from all tag and field counts.
    """
    print(f"\n{'=' * 70}")
    print(f"PŘEHLED: {len(photos)} fotek")
    print(f"{'=' * 70}\n")

    # Which EXIF tags exist across all photos?
    all_pillow_keys = set()
    all_exifread_keys = set()
    for p in photos:
        all_pillow_keys.update(k for k in p["exif_pillow"] if not k.startswith("_"))
        all_exifread_keys.update(k for k in p["exif_exifread"] if not k.startswith("_"))

    print(f"Unikátní EXIF tagy (Pillow): {len(all_pillow_keys)}")
    print(f"Unikátní EXIF tagy (ExifRead): {len(all_exifread_keys)}")
    print()

    for i, p in enumerate(photos, 1):
        _print_photo(i, p)

    _print_similarity_matrix(photos)
|
|
|
|
|
|
def main():
    """Explore every photo in PHOTOS_DIR, print the summary and dump the JSON.

    Files are selected by extension (case-insensitive) and processed in
    sorted order. A photo whose exploration raises is reported as ``FAIL``
    on the console and left out of the results. Returns early, without
    writing the JSON file, when the folder is missing or holds no photos.
    """
    photo_extensions = {".jpg", ".jpeg", ".png", ".heic", ".tiff", ".tif", ".webp"}

    # is_dir() rather than exists(): a regular file with that name would pass
    # exists() and then make iterdir() raise.
    if not PHOTOS_DIR.is_dir():
        print(f"[ERROR] Složka neexistuje: {PHOTOS_DIR}")
        return

    files = sorted(
        p for p in PHOTOS_DIR.iterdir()
        if p.is_file() and p.suffix.lower() in photo_extensions
    )

    if not files:
        print(f"[WARN] Žádné fotky v {PHOTOS_DIR}")
        return

    print(f"Nalezeno {len(files)} fotek v {PHOTOS_DIR}\n")

    photos = []
    for f in files:
        print(f" zpracovávám: {f.name} ...", end=" ", flush=True)
        try:
            photos.append(explore_photo(f))
            print("OK")
        except Exception as e:
            # Best effort: one broken file must not stop the whole run.
            print(f"FAIL: {e}")

    print_summary(photos)

    # Save everything to JSON for detailed analysis. default=str is the
    # fallback for any value the extractors left in a non-JSON type.
    with open(OUTPUT_JSON, "w", encoding="utf-8") as f:
        json.dump(photos, f, indent=2, ensure_ascii=False, default=str)
    print(f"\n[OK] Detailní data uložena: {OUTPUT_JSON}")
|
|
|
|
|
|
# Run the exploration only when the file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    main()
|