notebookVb
This commit is contained in:
@@ -0,0 +1,398 @@
|
||||
"""
|
||||
Explorační skript: projde všechny fotky v demo_fotky/ a vytáhne maximum dat.
|
||||
Výstup do konzole + JSON soubor pro detailní analýzu.
|
||||
"""
|
||||
import hashlib
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
|
||||
# Windows console: force UTF-8 so the non-ASCII text printed by this script
# does not fail on a legacy code page.
def _ensure_utf8(stream) -> None:
    """Switch a text stream to UTF-8 if it currently uses another encoding.

    Each stream is checked on its own, so stderr is fixed even when stdout
    is already UTF-8. Streams that are missing (``None``), that report no
    encoding, or that cannot be reconfigured (replaced stdout objects) are
    left untouched instead of raising at import time.
    """
    if stream is None or not hasattr(stream, "reconfigure"):
        return
    encoding = (getattr(stream, "encoding", None) or "").lower().replace("_", "-")
    # "utf8" and "utf-8" are spellings of the same codec.
    if encoding not in ("utf-8", "utf8"):
        stream.reconfigure(encoding="utf-8")


_ensure_utf8(sys.stdout)
_ensure_utf8(sys.stderr)
|
||||
|
||||
import exifread
|
||||
import imagehash
|
||||
from PIL import Image, ImageOps, IptcImagePlugin
|
||||
from PIL.ExifTags import TAGS, GPSTAGS
|
||||
|
||||
# Input folder with the sample photos; it sits next to this script.
PHOTOS_DIR = Path(__file__).with_name("demo_fotky")
# JSON report with the full per-photo data, written next to this script.
OUTPUT_JSON = Path(__file__).with_name("photo_exploration.json")
|
||||
|
||||
|
||||
def file_hash_sha256(path: Path, chunk_size: int = 65536) -> str:
    """Return the SHA-256 hex digest of the file's raw bytes.

    Used to detect exact copies. The file is read in pieces of
    ``chunk_size`` bytes, so it is never loaded into memory as a whole.
    """
    digest = hashlib.sha256()
    with open(path, "rb") as stream:
        # iter() with a sentinel stops at the first empty read (end of file).
        for block in iter(lambda: stream.read(chunk_size), b""):
            digest.update(block)
    return digest.hexdigest()
|
||||
|
||||
|
||||
def pixel_hash_sha256(path: Path) -> str | None:
    """Return a SHA-256 hex digest of the decoded pixels, or ``None``.

    Identifies the picture independently of its metadata. The EXIF
    orientation is applied and the image is converted to RGB before
    hashing, for consistency. Any failure to open or decode the file is
    deliberately swallowed and reported as ``None``.
    """
    try:
        with Image.open(path) as source:
            upright = ImageOps.exif_transpose(source)
            rgb = upright if upright.mode == "RGB" else upright.convert("RGB")
            return hashlib.sha256(rgb.tobytes()).hexdigest()
    except Exception:
        return None
|
||||
|
||||
|
||||
def perceptual_hashes(path: Path) -> dict:
    """Compute perceptual hashes used to spot visually similar photos.

    Returns a dict with the keys ``phash``, ``dhash``, ``ahash`` and
    ``whash`` holding the string form of each ``imagehash`` result; two
    hashes are compared by their Hamming distance. The EXIF orientation is
    applied first. If anything fails, the hashes computed so far are kept
    and the message is stored under ``_error``.
    """
    # (output key, name of the imagehash function), in the order computed.
    recipes = (
        ("phash", "phash"),
        ("dhash", "dhash"),
        ("ahash", "average_hash"),
        ("whash", "whash"),
    )
    result = {}
    try:
        with Image.open(path) as source:
            upright = ImageOps.exif_transpose(source)
            for key, func_name in recipes:
                result[key] = str(getattr(imagehash, func_name)(upright))
    except Exception as exc:
        result["_error"] = str(exc)
    return result
|
||||
|
||||
|
||||
def iptc_info(path: Path) -> dict:
    """Read IPTC metadata (keywords, title, description, author, ...).

    Returns a dict keyed by a readable field name; records without a known
    name are keyed ``IPTC(record, dataset)``. Byte values, and byte items
    inside lists, are decoded as UTF-8 with replacement of invalid bytes.
    Returns ``{}`` when the image has no IPTC block; on failure the fields
    read so far are kept and the message is stored under ``_error``.
    """
    # Numeric IPTC (record, dataset) pairs mapped to readable names.
    record_names = {
        (2, 5): "ObjectName",  # Title
        (2, 10): "Urgency",
        (2, 15): "Category",
        (2, 20): "SupplementalCategories",
        (2, 25): "Keywords",
        (2, 40): "SpecialInstructions",
        (2, 55): "DateCreated",
        (2, 60): "TimeCreated",
        (2, 80): "Byline",  # Creator/Author
        (2, 85): "BylineTitle",
        (2, 90): "City",
        (2, 92): "SubLocation",
        (2, 95): "ProvinceState",
        (2, 100): "CountryCode",
        (2, 101): "CountryName",
        (2, 103): "OriginalTransmissionReference",
        (2, 105): "Headline",
        (2, 110): "Credit",
        (2, 115): "Source",
        (2, 116): "Copyright",
        (2, 118): "Contact",
        (2, 120): "Caption",  # Description
        (2, 122): "WriterEditor",
    }

    def as_text(item):
        # Decode raw bytes; leave every other value as it is.
        if isinstance(item, bytes):
            return item.decode("utf-8", errors="replace")
        return item

    fields = {}
    try:
        with Image.open(path) as source:
            records = IptcImagePlugin.getiptcinfo(source)
            if not records:
                return {}
            for tag, payload in records.items():
                label = record_names.get(tag, f"IPTC{tag}")
                if isinstance(payload, list):
                    payload = [as_text(entry) for entry in payload]
                else:
                    payload = as_text(payload)
                fields[label] = payload
    except Exception as exc:
        fields["_error"] = str(exc)
    return fields
|
||||
|
||||
|
||||
def xmp_info(path: Path) -> dict:
    """Extract a few common fields from the image's embedded XMP packet.

    XMP is the modern alternative to IPTC and often carries keywords,
    rating and face regions. The packet is taken from ``img.info["xmp"]``
    and scanned with regular expressions (a deliberately simple parser, not
    a real XML parser): only the attribute form of the ``xmp:*`` fields and
    the first ``rdf:li`` of ``dc:title`` / ``dc:description`` /
    ``dc:creator`` are recognised.

    Returns:
        ``{}`` when the image has no XMP packet. Otherwise a dict that may
        contain ``creator_tool``, ``create_date``, ``modify_date``,
        ``rating``, ``label``, ``title``, ``description``, ``creator``,
        ``subject_keywords`` (list of strings), ``face_regions_count``
        (only when non-zero) and ``_xmp_length_bytes``. On failure the
        fields found so far are kept and the message is stored under
        ``_error``.
    """

    def first_item(tag: str) -> str:
        # Pattern for the first <rdf:li> inside <tag>...</tag>. The search
        # is stopped at the closing tag (and self-closing <tag/> is
        # rejected) so that an empty element cannot pick up an <rdf:li>
        # belonging to a later, unrelated element.
        return (
            rf"<{tag}[^>]*(?<!/)>"
            rf"(?:(?!</{tag}>).)*?"
            r"<rdf:li[^>]*>([^<]+)</rdf:li>"
        )

    out = {}
    try:
        with Image.open(path) as img:
            xmp_raw = img.info.get("xmp")
            if not xmp_raw:
                return {}

            # Measure the packet in bytes, as the output key promises,
            # rather than counting characters of the decoded text.
            if isinstance(xmp_raw, bytes):
                xmp_length = len(xmp_raw)
                xmp_text = xmp_raw.decode("utf-8", errors="replace")
            else:
                xmp_text = xmp_raw
                xmp_length = len(xmp_text.encode("utf-8", errors="replace"))

            # Very simple parser: pull the most common fields out by regex.
            patterns = {
                "creator_tool": r'xmp:CreatorTool="([^"]+)"',
                "create_date": r'xmp:CreateDate="([^"]+)"',
                "modify_date": r'xmp:ModifyDate="([^"]+)"',
                "rating": r'xmp:Rating="([^"]+)"',
                "label": r'xmp:Label="([^"]+)"',
                "title": first_item("dc:title"),
                "description": first_item("dc:description"),
                "creator": first_item("dc:creator"),
                "subject_keywords": r'<dc:subject[^>]*>(.*?)</dc:subject>',
            }
            for name, pat in patterns.items():
                m = re.search(pat, xmp_text, re.DOTALL)
                if m:
                    out[name] = m.group(1).strip()

            # Keywords from dc:subject: split into the individual rdf:li items.
            if "subject_keywords" in out:
                out["subject_keywords"] = re.findall(
                    r'<rdf:li[^>]*>([^<]+)</rdf:li>', out["subject_keywords"]
                )

            # Regions (recognised faces with a position).
            face_count = len(re.findall(r'mwg-rs:Type="Face"', xmp_text))
            if face_count:
                out["face_regions_count"] = face_count

            # Size of the raw XMP packet, to give an idea of how much is there.
            out["_xmp_length_bytes"] = xmp_length
    except Exception as e:
        out["_error"] = str(e)
    return out
|
||||
|
||||
|
||||
def filesystem_info(path: Path) -> dict:
    """Collect basic filesystem facts about ``path`` from a single ``stat()``.

    Returns the file name, full path, size in bytes and in MB (bytes divided
    by 1024 twice, rounded to two decimals), ``st_mtime`` and ``st_ctime``
    as naive local ISO-8601 strings, and the lower-cased extension.
    Raises ``OSError`` if the file cannot be stat'ed.
    """
    status = path.stat()
    size = status.st_size

    def iso(timestamp):
        return datetime.fromtimestamp(timestamp).isoformat()

    details = {"file_name": path.name, "file_path": str(path)}
    details["file_size_bytes"] = size
    details["file_size_mb"] = round(size / 1024 / 1024, 2)
    details["mtime"] = iso(status.st_mtime)
    # NOTE(review): what st_ctime means depends on the platform - confirm
    # before treating it as a creation time.
    details["ctime"] = iso(status.st_ctime)
    details["extension"] = path.suffix.lower()
    return details
|
||||
|
||||
|
||||
def pillow_info(path: Path) -> dict:
    """Summarise what Pillow reports about the image container itself.

    Records format, mode, dimensions, megapixels, a transparency flag, DPI,
    presence of an ICC profile / EXIF / embedded thumbnail, and the first
    500 characters of the XMP packet when one exists. Values are stored one
    by one, so if Pillow raises part-way the entries gathered so far are
    kept and the message is stored under ``error``.
    """
    summary = {}
    try:
        with Image.open(path) as picture:
            width, height = picture.width, picture.height
            summary["format"] = picture.format
            summary["mode"] = picture.mode
            summary["width"] = width
            summary["height"] = height
            summary["megapixels"] = round((width * height) / 1_000_000, 2)
            summary["has_transparency"] = (
                picture.mode in ("RGBA", "LA") or "transparency" in picture.info
            )
            summary["dpi"] = picture.info.get("dpi")
            summary["icc_profile_present"] = "icc_profile" in picture.info
            summary["exif_present"] = bool(picture.getexif())

            # XMP (often present in JPGs written by Adobe tools).
            if "xmp" in picture.info:
                packet = picture.info["xmp"]
                # Cut raw bytes before decoding so only the head is decoded.
                snippet = (
                    packet[:500].decode("utf-8", errors="ignore")
                    if isinstance(packet, bytes)
                    else packet
                )
                summary["xmp_snippet"] = str(snippet)[:500]

            # Is there an embedded thumbnail?
            summary["has_embedded_thumbnail"] = "thumbnail" in picture.info
    except Exception as exc:
        summary["error"] = str(exc)
    return summary
|
||||
|
||||
|
||||
def pillow_exif(path: Path) -> dict:
    """Read EXIF through Pillow and return it keyed by readable tag names.

    Unknown tag ids are keyed ``Tag<id>``. The GPS data is returned as a
    nested dict under ``GPSInfo`` with readable GPS tag names (unknown ids
    as ``GPSTag<id>``). All values are passed through ``_serializable`` so
    the result can be dumped to JSON.

    Returns ``{}`` when the image has no EXIF. On failure the tags read so
    far are kept and the message is stored under ``_error``.
    """
    out = {}
    try:
        with Image.open(path) as img:
            exif = img.getexif()
            if not exif:
                return {}
            # Snapshot the items first: get_ifd() below is called while
            # looping and may touch the Exif object's internal state.
            for tag_id, value in list(exif.items()):
                tag = TAGS.get(tag_id, f"Tag{tag_id}")
                if tag == "GPSInfo":
                    # Current Pillow exposes GPSInfo as an integer offset
                    # into the file, not as a mapping; the actual GPS IFD
                    # has to be fetched with get_ifd(). Calling .items() on
                    # the offset used to raise and abort the whole loop.
                    # A dict value (older Pillow) is still used directly.
                    gps_ifd = value if isinstance(value, dict) else exif.get_ifd(tag_id)
                    out[tag] = {
                        GPSTAGS.get(gps_tag_id, f"GPSTag{gps_tag_id}"): _serializable(gps_value)
                        for gps_tag_id, gps_value in gps_ifd.items()
                    }
                else:
                    out[tag] = _serializable(value)
    except Exception as e:
        out["_error"] = str(e)
    return out
|
||||
|
||||
|
||||
def exifread_tags(path: Path) -> dict:
    """Read EXIF with ExifRead and return every tag as a string.

    ExifRead is called with ``details=True`` and often yields more tags than
    Pillow, including detailed MakerNote entries. Keys containing
    "Thumbnail" are dropped (this skips the binary thumbnail) unless they
    also contain "JPEGInterchangeFormat". On failure the message is stored
    under ``_error``.
    """
    collected = {}
    try:
        with open(path, "rb") as stream:
            parsed = exifread.process_file(stream, details=True)
            for name, tag in parsed.items():
                keep = "Thumbnail" not in name or "JPEGInterchangeFormat" in name
                if keep:
                    collected[name] = str(tag)
    except Exception as exc:
        collected["_error"] = str(exc)
    return collected
|
||||
|
||||
|
||||
def _serializable(v):
    """Convert a value returned by Pillow into something JSON-friendly.

    Pillow sometimes returns IFDRational, bytes and similar objects.

    * ``None``, ``bool``, ``int``, ``float`` and ``str`` are returned
      unchanged. They are checked first because ``int`` and ``bool`` also
      expose ``numerator`` / ``denominator`` and would otherwise be turned
      into floats by the rational branch below.
    * ``bytes`` are cut to the first 200 bytes and decoded as UTF-8 with
      replacement of invalid sequences.
    * Tuples and lists become lists, dicts get string keys; both are
      converted recursively.
    * Rational-like objects (anything else with ``numerator`` and
      ``denominator``) become ``float``; when the conversion fails or the
      result is not finite, ``str(v)`` is returned instead, because
      ``json`` would write NaN / Infinity, which is not valid JSON.
    * Anything else is returned as-is if ``json.dumps`` accepts it,
      otherwise as ``str(v)``.
    """
    if v is None or isinstance(v, (bool, int, float, str)):
        return v
    if isinstance(v, bytes):
        return v[:200].decode("utf-8", errors="replace")
    if isinstance(v, (tuple, list)):
        return [_serializable(x) for x in v]
    if isinstance(v, dict):
        return {str(k): _serializable(val) for k, val in v.items()}
    if hasattr(v, "numerator") and hasattr(v, "denominator"):
        try:
            as_float = float(v)
        except Exception:
            return str(v)
        # NaN is the only value not equal to itself.
        if as_float != as_float or as_float in (float("inf"), float("-inf")):
            return str(v)
        return as_float
    try:
        json.dumps(v)
        return v
    except (TypeError, ValueError):
        return str(v)
|
||||
|
||||
|
||||
def explore_photo(path: Path) -> dict:
    """Run every extractor on one photo and bundle the results.

    The returned dict has the sections ``filesystem``, ``hashes`` (file
    hash, pixel hash and the perceptual hashes merged in), ``pillow``,
    ``exif_pillow``, ``exif_exifread``, ``iptc`` and ``xmp``.
    """
    report = {"filesystem": filesystem_info(path)}

    hashes = {
        "sha256_file": file_hash_sha256(path),
        "sha256_pixels": pixel_hash_sha256(path),
    }
    hashes.update(perceptual_hashes(path))
    report["hashes"] = hashes

    report["pillow"] = pillow_info(path)
    report["exif_pillow"] = pillow_exif(path)
    report["exif_exifread"] = exifread_tags(path)
    report["iptc"] = iptc_info(path)
    report["xmp"] = xmp_info(path)
    return report
|
||||
|
||||
|
||||
def hamming_distance(h1: str, h2: str) -> int:
|
||||
"""Hamming distance mezi dvěma hex perceptual hashes."""
|
||||
return bin(int(h1, 16) ^ int(h2, 16)).count("1")
|
||||
|
||||
|
||||
def print_summary(photos: list[dict]) -> None:
    """Print a console overview of all explored photos.

    Three parts are printed: a header with the number of distinct EXIF tag
    names seen by Pillow and by ExifRead, one section per photo (size,
    format, hashes, tag counts, selected EXIF values, all IPTC/XMP fields
    whose key does not start with "_"), and a matrix of pairwise phash
    Hamming distances where values of 10 or less are flagged with "*".

    Each item of ``photos`` is a dict as produced by ``explore_photo``.
    """
    rule = "=" * 70

    def visible_count(mapping):
        # Keys starting with "_" are internal (errors, sizes), not fields.
        return len([key for key in mapping if not key.startswith("_")])

    print(f"\n{rule}")
    print(f"PŘEHLED: {len(photos)} fotek")
    print(f"{rule}\n")

    # Which EXIF tags exist across all photos?
    pillow_tag_names = set().union(*(photo["exif_pillow"] for photo in photos))
    exifread_tag_names = set().union(*(photo["exif_exifread"] for photo in photos))

    print(f"Unikátní EXIF tagy (Pillow): {len(pillow_tag_names)}")
    print(f"Unikátní EXIF tagy (ExifRead): {len(exifread_tag_names)}")
    print()

    for position, photo in enumerate(photos, 1):
        fs_info = photo["filesystem"]
        img_info = photo["pillow"]
        hashes = photo["hashes"]
        tags = photo["exif_exifread"]
        pixel_digest = hashes.get("sha256_pixels") or "N/A"

        print(f"[{position}] {fs_info['file_name']}")
        print(f" Velikost: {fs_info['file_size_mb']} MB ({img_info.get('width')}x{img_info.get('height')}, {img_info.get('megapixels')} Mpx)")
        print(f" Formát: {img_info.get('format')} / mode={img_info.get('mode')}")
        print(f" sha256_file: {hashes['sha256_file'][:16]}...")
        print(f" sha256_pixels: {pixel_digest[:16]}...")
        print(f" phash: {hashes.get('phash')} (perceptual)")
        print(f" EXIF tagů: ExifRead={len(tags)}, Pillow={len(photo['exif_pillow'])}")
        print(f" IPTC polí: {visible_count(photo['iptc'])}")
        print(f" XMP polí: {visible_count(photo['xmp'])}")

        # The ExifRead values are used here; the original author considered
        # them more reliable (Pillow had a GPS bug).
        highlights = [
            ("Kamera", f"{tags.get('Image Make', '')} {tags.get('Image Model', '')}".strip()),
            ("Objektiv", tags.get("EXIF LensModel")),
            ("Datum", tags.get("EXIF DateTimeOriginal") or tags.get("Image DateTime")),
            ("TZ offset", tags.get("EXIF OffsetTimeOriginal") or tags.get("EXIF OffsetTime")),
            ("Clona", tags.get("EXIF FNumber")),
            ("ISO", tags.get("EXIF ISOSpeedRatings")),
            ("Expozice", tags.get("EXIF ExposureTime")),
            ("Ohnisko mm", tags.get("EXIF FocalLength")),
            ("Flash", tags.get("EXIF Flash")),
            ("GPS lat", tags.get("GPS GPSLatitude")),
            ("GPS lon", tags.get("GPS GPSLongitude")),
            ("Software", tags.get("Image Software")),
        ]
        for label, shown in highlights:
            # Skip missing values and strings that are only whitespace.
            if not shown or not str(shown).strip():
                continue
            print(f" {label:12s}: {shown}")

        # IPTC / XMP: print everything that is there.
        for field, content in photo["iptc"].items():
            if field.startswith("_"):
                continue
            print(f" IPTC.{field:8s}: {content}")
        for field, content in photo["xmp"].items():
            if field.startswith("_"):
                continue
            print(f" XMP.{field:9s}: {content}")
        print()

    # Table of perceptual similarity (Hamming distance between phashes).
    print(f"{rule}")
    print("PERCEPTUÁLNÍ PODOBNOST (phash Hamming distance)")
    print("Hodnota 0-10 = vizuálně velmi podobné, >20 = odlišné")
    print(f"{rule}")

    phashes = [photo["hashes"].get("phash") for photo in photos]
    print(" " + "".join(f" [{col + 1}]" for col in range(len(phashes))))
    for row_idx, row_hash in enumerate(phashes):
        cells = [f" [{row_idx + 1}] "]
        for col_idx, col_hash in enumerate(phashes):
            if row_idx == col_idx:
                cells.append(" -")
            elif row_hash and col_hash:
                distance = hamming_distance(row_hash, col_hash)
                flag = "*" if distance <= 10 else " "
                cells.append(f" {distance:3d}{flag}")
            else:
                # At least one of the two photos has no phash.
                cells.append(" N/A ")
        print("".join(cells))
    print("\n * = vizuálně podobné fotky (možná duplikát po editaci)")
    print()
|
||||
|
||||
|
||||
def main():
    """Explore every supported image in ``PHOTOS_DIR`` and report on it.

    Prints progress and a summary to the console and writes the complete
    per-photo data to ``OUTPUT_JSON``. Returns early with a message when
    the folder is missing or contains no supported image. A photo whose
    exploration raises is reported as FAIL and left out of the results.
    """
    if not PHOTOS_DIR.exists():
        print(f"[ERROR] Složka neexistuje: {PHOTOS_DIR}")
        return

    supported = {".jpg", ".jpeg", ".png", ".heic", ".tiff", ".tif", ".webp"}
    candidates = [
        entry
        for entry in PHOTOS_DIR.iterdir()
        if entry.is_file() and entry.suffix.lower() in supported
    ]
    candidates.sort()

    if not candidates:
        print(f"[WARN] Žádné fotky v {PHOTOS_DIR}")
        return

    print(f"Nalezeno {len(candidates)} fotek v {PHOTOS_DIR}\n")

    results = []
    for item in candidates:
        # No newline yet: the OK / FAIL verdict is printed on the same line.
        print(f" zpracovávám: {item.name} ...", end=" ", flush=True)
        try:
            record = explore_photo(item)
            results.append(record)
            print("OK")
        except Exception as exc:
            print(f"FAIL: {exc}")

    print_summary(results)

    # Save everything to JSON for detailed analysis.
    with OUTPUT_JSON.open("w", encoding="utf-8") as sink:
        json.dump(results, sink, indent=2, ensure_ascii=False, default=str)
    print(f"\n[OK] Detailní data uložena: {OUTPUT_JSON}")
|
||||
|
||||
|
||||
# Run the exploration only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user