#!/usr/bin/env python3
# Provenance: reconstructed from a whitespace-collapsed git patch
#   commit  c0aa4fb6849c560d97fd3f17b91d9de628e6f91c
#   author  administrator
#   date    Fri, 22 May 2026 20:05:14 +0200
#   subject [PATCH] notebookVb
# The same patch creates this file (import_to_db.py) and appends the following
# to .gitignore under the comment
# "Generovaná data (velké soubory, nepatří do gitu)"
# (generated data: large files that do not belong in git):
#   output/
#   photo_exploration.json
"""
Import JSONL metadata into PostgreSQL (fotky_buzalkovi).

Usage:
    python import_to_db.py                       # output/10_metadata.jsonl
    python import_to_db.py output/other_file.jsonl

What it does:
    1. Creates the target database (default 'fotky_buzalkovi') if it does
       not exist.
    2. Creates the tables photos / tags / photo_tags (IF NOT EXISTS).
    3. Imports the records in batches (ON CONFLICT DO NOTHING, so running
       the import repeatedly is safe).
"""

import json
import os
import re
import struct
import sys
from datetime import datetime, timezone, timedelta
from pathlib import Path
from typing import Optional

import psycopg2
import psycopg2.extras
from psycopg2 import sql
from psycopg2.extras import execute_values

# ──────────────────────────────────────────────────────────────────────────────
# Configuration from .env (if python-dotenv is installed) or the environment
# ──────────────────────────────────────────────────────────────────────────────
try:
    from dotenv import load_dotenv
    load_dotenv()
except ImportError:
    pass  # without dotenv we read os.environ or fall back to the defaults

# NOTE(review): the fallback host and user name are site-specific values baked
# into the source; consider requiring them from the environment instead.
DB_CONFIG = {
    "host": os.getenv("DB_HOST", "192.168.1.76"),
    "port": int(os.getenv("DB_PORT", "5432")),
    "user": os.getenv("DB_USER", "vladimir.buzalka"),
    "password": os.getenv("DB_PASSWORD", ""),
    "dbname": os.getenv("DB_NAME", "fotky_buzalkovi"),
}

DEFAULT_JSONL = Path(__file__).parent / "output" / "10_metadata.jsonl"
BATCH_SIZE = 500

# ──────────────────────────────────────────────────────────────────────────────
# Schema
# ──────────────────────────────────────────────────────────────────────────────
# NOTE(review): UNIQUE(name, parent_tag_id) on `tags` does not prevent duplicate
# root tags, because PostgreSQL treats NULL parent_tag_id values as distinct.
# Nothing in this script inserts tags, so the schema is left as it was.
SCHEMA_SQL = """
CREATE TABLE IF NOT EXISTS photos (
    id BIGSERIAL PRIMARY KEY,

    -- identita (3 úrovně)
    sha256_file CHAR(64) UNIQUE NOT NULL, -- byte-přesná kopie
    sha256_pixels CHAR(64), -- stejná fotka po změně metadat
    phash BIGINT, -- vizuální podobnost (Hamming)
    dhash BIGINT, -- doplňkový perceptuální hash

    -- soubor
    file_path VARCHAR(2000) NOT NULL,
    file_path_relative VARCHAR(2000),
    file_name VARCHAR(500) NOT NULL,
    file_stem VARCHAR(500),
    file_ext VARCHAR(20),
    file_size BIGINT, -- bytes
    mime_type VARCHAR(50),
    format VARCHAR(20), -- JPEG, PNG, HEIC…
    mode VARCHAR(20), -- RGB, RGBA…
    width INT,
    height INT,
    megapixels NUMERIC(8,2),
    has_transparency BOOLEAN DEFAULT FALSE,
    icc_profile BOOLEAN DEFAULT FALSE,
    embedded_thumbnail BOOLEAN DEFAULT FALSE,

    -- časy
    taken_at TIMESTAMPTZ, -- preferovaně z EXIF (s TZ)
    taken_at_source VARCHAR(20), -- 'exif' / 'mtime' / 'unknown'
    mtime TIMESTAMPTZ, -- filesystem mtime
    collected_at TIMESTAMPTZ, -- kdy jsme skenovali

    -- technika (z EXIF)
    camera_make VARCHAR(100),
    camera_model VARCHAR(255),
    lens_model VARCHAR(255),
    iso INT,
    aperture NUMERIC(5,2),
    exposure_time VARCHAR(30), -- "1/500"
    focal_length_mm NUMERIC(6,2),

    -- GPS (NULL pokud chybí)
    gps_lat NUMERIC(10,7),
    gps_lon NUMERIC(10,7),
    gps_altitude NUMERIC(7,2),

    -- klasifikace
    is_screenshot BOOLEAN DEFAULT FALSE,
    face_count INT, -- z XMP / AI (zatím NULL)

    -- raw metadata jako JSONB pro dotazy a budoucí rozšíření
    exif_raw JSONB,
    iptc_raw JSONB,
    xmp_raw JSONB,

    -- import / zpracování
    imported_at TIMESTAMPTZ DEFAULT NOW(),
    processing_status VARCHAR(50) DEFAULT 'pending'
);

-- Indexy
CREATE INDEX IF NOT EXISTS idx_photos_sha256_pixels ON photos(sha256_pixels);
CREATE INDEX IF NOT EXISTS idx_photos_phash ON photos(phash);
CREATE INDEX IF NOT EXISTS idx_photos_taken_at ON photos(taken_at);
CREATE INDEX IF NOT EXISTS idx_photos_camera_model ON photos(camera_model);
CREATE INDEX IF NOT EXISTS idx_photos_file_name ON photos(file_name);
CREATE INDEX IF NOT EXISTS idx_photos_file_ext ON photos(file_ext);
CREATE INDEX IF NOT EXISTS idx_photos_exif_gin ON photos USING GIN (exif_raw);

-- Tagy (hierarchické: místo > Praha > Karlův most)
CREATE TABLE IF NOT EXISTS tags (
    id SERIAL PRIMARY KEY,
    name VARCHAR(100) NOT NULL,
    parent_tag_id INT REFERENCES tags(id),
    UNIQUE(name, parent_tag_id)
);

-- Vazební tabulka foto ↔ tag
CREATE TABLE IF NOT EXISTS photo_tags (
    photo_id BIGINT REFERENCES photos(id) ON DELETE CASCADE,
    tag_id INT REFERENCES tags(id) ON DELETE CASCADE,
    source VARCHAR(20), -- 'manual' / 'iptc' / 'xmp' / 'auto'
    created_at TIMESTAMPTZ DEFAULT NOW(),
    PRIMARY KEY (photo_id, tag_id)
);
"""

# ──────────────────────────────────────────────────────────────────────────────
# Parsing helpers
# ──────────────────────────────────────────────────────────────────────────────

# Leading "+HH:MM" / "-HH:MM" of an EXIF OffsetTime* value.
_OFFSET_RE = re.compile(r"([+-])(\d{2}):(\d{2})")


def hex_to_int64(hex_str: str) -> Optional[int]:
    """
    Convert a hex hash string (pHash/dHash) to a signed 64-bit integer.

    The photos.phash / photos.dhash columns are BIGINT (signed), while a
    16-digit hex hash such as "9ab964e46386999b" is an unsigned 64-bit value,
    so the bit pattern is reinterpreted as signed. Only the low 64 bits of
    the parsed number are kept.

    Returns None for an empty/None input or a string that is not valid hex.
    """
    if not hex_str:
        return None
    try:
        unsigned = int(str(hex_str).strip(), 16) & 0xFFFFFFFFFFFFFFFF
    except (TypeError, ValueError):
        return None
    # Pack as unsigned, unpack as signed: same 8 bytes, two's-complement view.
    return struct.unpack("q", struct.pack("Q", unsigned))[0]


def parse_fraction(s) -> Optional[float]:
    """
    Parse a number that may be written as a fraction string.

        "3/4"   → 0.75
        "1.75"  → 1.75
        "28/10" → 2.8

    Returns None for None, for a zero denominator, and for anything that
    cannot be parsed as a float.
    """
    if s is None:
        return None
    text = str(s).strip()
    try:
        if "/" in text:
            numerator, denominator = text.split("/", 1)
            divisor = float(denominator)
            return float(numerator) / divisor if divisor != 0 else None
        return float(text)
    except (TypeError, ValueError):
        return None


def parse_exif_datetime(dt_str, offset_str=None) -> Optional[datetime]:
    """
    Parse an EXIF timestamp "YYYY:MM:DD HH:MM:SS" plus an optional "+HH:MM"
    offset into a timezone-aware datetime.

    When no (parsable) offset is given, the value is tagged as UTC so that a
    naive datetime is never returned. Returns None when the input is empty or
    does not form a valid date/time (or offset).
    """
    if not dt_str:
        return None
    try:
        raw = str(dt_str).strip()
        # The first two separators of the EXIF date part are ':' — turn
        # "2026:05:18" into "2026-05-18".
        date_part = raw[:10].replace(":", "-")
        time_part = raw[11:19] if len(raw) >= 19 else "00:00:00"
        naive = datetime.strptime(f"{date_part} {time_part}", "%Y-%m-%d %H:%M:%S")

        if offset_str:
            match = _OFFSET_RE.match(str(offset_str).strip())
            if match:
                sign = 1 if match.group(1) == "+" else -1
                tz = timezone(timedelta(hours=sign * int(match.group(2)),
                                        minutes=sign * int(match.group(3))))
                return naive.replace(tzinfo=tz)

        # Without an offset we store the value as UTC (better than naive).
        return naive.replace(tzinfo=timezone.utc)
    except (TypeError, ValueError):
        # strptime() rejects malformed dates; timezone() rejects offsets of
        # 24 hours or more.
        return None


def parse_gps_coord(coord_str, ref: Optional[str] = None) -> Optional[float]:
    """
    Parse a GPS coordinate into signed decimal degrees, rounded to 7 places.

    Accepted formats: "[46, 5, 2762/100]", "46.083333", "46/1, 5/1, 276/100"
    (degrees[, minutes[, seconds]], each optionally a fraction). A `ref` of
    "S" or "W" (case-insensitive) negates the result.

    Returns None when the input is empty or the degrees part is unparsable.
    Unparsable minutes/seconds are treated as 0.
    """
    if not coord_str:
        return None
    parts = [p.strip() for p in str(coord_str).strip().strip("[]").split(",")]

    degrees = parse_fraction(parts[0])
    if degrees is None:
        return None
    minutes = parse_fraction(parts[1]) if len(parts) > 1 else 0.0
    seconds = parse_fraction(parts[2]) if len(parts) > 2 else 0.0

    value = degrees + (minutes or 0.0) / 60.0 + (seconds or 0.0) / 3600.0
    if ref and str(ref).upper() in ("S", "W"):
        value = -value
    return round(value, 7)


def parse_iso(raw) -> Optional[int]:
    """
    Parse an ISO speed value given as '800', '[800]', '[800, 0]' and similar.

    Only the first list element is used. Returns None when it is missing or
    not numeric.
    """
    if raw is None:
        return None
    first = str(raw).strip().strip("[]").split(",")[0].strip()
    try:
        return int(float(first))
    except (TypeError, ValueError, OverflowError):
        return None


def clean_nullbytes(obj):
    """
    Recursively remove the null byte \\x00 from all strings in `obj`.

    PostgreSQL rejects \\u0000 in text / JSONB values. Strings, dicts (both
    keys and values) and lists are processed; any other object is returned
    unchanged.
    """
    if isinstance(obj, str):
        return obj.replace("\x00", "")
    if isinstance(obj, dict):
        # Keys end up in the JSONB document as well, so they must be cleaned too.
        return {clean_nullbytes(k): clean_nullbytes(v) for k, v in obj.items()}
    if isinstance(obj, list):
        return [clean_nullbytes(v) for v in obj]
    return obj


MIME_MAP = {
    "JPEG": "image/jpeg", "JPG": "image/jpeg",
    "PNG": "image/png",
    "GIF": "image/gif",
    "WEBP": "image/webp",
    "HEIF": "image/heif", "HEIC": "image/heif",
    "TIFF": "image/tiff", "TIF": "image/tiff",
    "BMP": "image/bmp",
}

# ──────────────────────────────────────────────────────────────────────────────
# Extraction of the column values from one JSONL record
# ──────────────────────────────────────────────────────────────────────────────

# Column order of the INSERT statement; extract_fields() returns values in
# exactly this order.
COLUMNS = [
    "sha256_file", "sha256_pixels", "phash", "dhash",
    "file_path", "file_path_relative", "file_name", "file_stem", "file_ext",
    "file_size", "mime_type", "format", "mode", "width", "height", "megapixels",
    "has_transparency", "icc_profile", "embedded_thumbnail",
    "taken_at", "taken_at_source", "mtime", "collected_at",
    "camera_make", "camera_model", "lens_model",
    "iso", "aperture", "exposure_time", "focal_length_mm",
    "gps_lat", "gps_lon", "gps_altitude",
    "is_screenshot", "face_count",
    "exif_raw", "iptc_raw", "xmp_raw",
]


def _parse_iso_timestamp(value) -> Optional[datetime]:
    """Parse an ISO-8601 string with datetime.fromisoformat(); None if missing/invalid."""
    if not value:
        return None
    try:
        return datetime.fromisoformat(value)
    except (TypeError, ValueError):
        return None


def _opt_int(value) -> Optional[int]:
    """Return int(value), or None when the value is missing (None or "").

    A legitimate 0 (e.g. an empty file) is kept as 0 rather than mapped to None.
    Raises ValueError/TypeError for values that are present but not integers.
    """
    if value is None or value == "":
        return None
    return int(value)


def _clean_str(value) -> Optional[str]:
    """Return the stripped string form of `value`, or None when it is empty."""
    return str(value or "").strip() or None


def _json_or_none(data) -> Optional[str]:
    """Serialize a non-empty metadata dict to JSON text; None for an empty one."""
    return json.dumps(data, ensure_ascii=False) if data else None


def extract_fields(rec: dict) -> tuple:
    """
    Map one parsed JSONL record to a tuple of values ordered like COLUMNS.

    Reads the top-level file attributes of `rec` and derives the camera,
    time and GPS columns from its "exif" dict (keys such as
    "EXIF DateTimeOriginal", "GPS GPSLatitude") and the classification
    columns from its "xmp" dict. The exif/iptc/xmp dicts are also stored
    as JSON text with null bytes removed.

    Raises:
        ValueError: when `sha256_file` is missing (the column is NOT NULL and
            one such row would make the whole batch insert fail), or when
            file_size / width / height are present but not integers.
    """
    sha256_file = rec.get("sha256_file")
    if not sha256_file:
        raise ValueError("chybí povinné pole sha256_file")

    exif = clean_nullbytes(rec.get("exif") or {})
    iptc = clean_nullbytes(rec.get("iptc") or {})
    xmp = clean_nullbytes(rec.get("xmp") or {})

    # ---- taken_at: EXIF first, filesystem mtime as the fallback --------------
    dt_orig = exif.get("EXIF DateTimeOriginal") or exif.get("Image DateTime")
    dt_offset = exif.get("EXIF OffsetTimeOriginal") or exif.get("EXIF OffsetTime")
    taken_at = parse_exif_datetime(dt_orig, dt_offset)
    taken_at_source = "exif" if taken_at else None

    mtime = _parse_iso_timestamp(rec.get("mtime"))
    if not taken_at and mtime:
        taken_at = mtime
        taken_at_source = "mtime"

    collected_at = _parse_iso_timestamp(rec.get("collected_at"))

    # ---- camera / optics -----------------------------------------------------
    aperture_val = parse_fraction(exif.get("EXIF FNumber"))
    aperture = round(aperture_val, 2) if aperture_val is not None else None

    exposure_raw = exif.get("EXIF ExposureTime")
    exposure_time = str(exposure_raw).strip() if exposure_raw else None

    # Only the first whitespace-separated token is parsed (drops a trailing
    # unit, if any). A whitespace-only value has no tokens and yields None.
    focal_raw = exif.get("EXIF FocalLength")
    focal_tokens = str(focal_raw).split() if focal_raw else []
    focal_val = parse_fraction(focal_tokens[0]) if focal_tokens else None
    focal_length_mm = round(focal_val, 2) if focal_val is not None else None

    # ---- GPS -----------------------------------------------------------------
    altitude = parse_fraction(exif.get("GPS GPSAltitude"))
    # An altitude reference of "1" makes the stored altitude negative.
    if altitude is not None and str(exif.get("GPS GPSAltitudeRef", "0")) == "1":
        altitude = -altitude
    gps_altitude = round(altitude, 2) if altitude is not None else None

    # ---- classification ------------------------------------------------------
    xmp_desc = str(
        xmp.get("description") or xmp.get("dc:description") or ""
    ).lower()

    face_count = None
    if "face_regions_count" in xmp:
        try:
            face_count = int(xmp["face_regions_count"])
        except (TypeError, ValueError):
            pass  # unparsable count is stored as NULL

    # ---- file info -----------------------------------------------------------
    fmt = (rec.get("format") or "").strip()
    # Unknown formats fall back to "image/<format>"; no format → no MIME type.
    mime_type = MIME_MAP.get(fmt.upper(), f"image/{fmt.lower()}" if fmt else None)

    fields = {
        "sha256_file": sha256_file,
        "sha256_pixels": rec.get("sha256_pixels"),
        "phash": hex_to_int64(rec.get("phash")),
        "dhash": hex_to_int64(rec.get("dhash")),
        "file_path": rec.get("file_path", ""),
        "file_path_relative": rec.get("file_path_relative"),
        "file_name": rec.get("file_name", ""),
        "file_stem": rec.get("file_stem"),
        "file_ext": (rec.get("file_ext") or "").lower().strip() or None,
        "file_size": _opt_int(rec.get("file_size")),
        "mime_type": mime_type,
        "format": fmt or None,
        "mode": rec.get("mode"),
        "width": _opt_int(rec.get("width")),
        "height": _opt_int(rec.get("height")),
        "megapixels": rec.get("megapixels"),
        "has_transparency": bool(rec.get("has_transparency")),
        "icc_profile": bool(rec.get("icc_profile")),
        "embedded_thumbnail": bool(rec.get("embedded_thumbnail")),
        "taken_at": taken_at,
        "taken_at_source": taken_at_source,
        "mtime": mtime,
        "collected_at": collected_at,
        "camera_make": _clean_str(exif.get("Image Make", "")),
        "camera_model": _clean_str(exif.get("Image Model", "")),
        "lens_model": _clean_str(exif.get("EXIF LensModel", "")),
        "iso": parse_iso(exif.get("EXIF ISOSpeedRatings")),
        "aperture": aperture,
        "exposure_time": exposure_time,
        "focal_length_mm": focal_length_mm,
        "gps_lat": parse_gps_coord(
            exif.get("GPS GPSLatitude"),
            exif.get("GPS GPSLatitudeRef"),
        ),
        "gps_lon": parse_gps_coord(
            exif.get("GPS GPSLongitude"),
            exif.get("GPS GPSLongitudeRef"),
        ),
        "gps_altitude": gps_altitude,
        "is_screenshot": "screenshot" in xmp_desc,
        "face_count": face_count,
        "exif_raw": _json_or_none(exif),
        "iptc_raw": _json_or_none(iptc),
        "xmp_raw": _json_or_none(xmp),
    }
    return tuple(fields[c] for c in COLUMNS)


# ──────────────────────────────────────────────────────────────────────────────
# DB helpers
# ──────────────────────────────────────────────────────────────────────────────

# "VALUES %s" is the placeholder execute_values() expands into the row list.
INSERT_SQL = f"""
INSERT INTO photos ({", ".join(COLUMNS)})
VALUES %s
ON CONFLICT (sha256_file) DO NOTHING
RETURNING id
"""


def ensure_database():
    """
    Create the target database (DB_CONFIG["dbname"]) if it does not exist yet.

    Connects to the "postgres" maintenance database with the same
    credentials. This step is best-effort: any failure is reported as a
    warning and the import continues, so the later connection to the target
    database decides whether the run can proceed.
    """
    target_db = DB_CONFIG["dbname"]
    conn = None
    try:
        conn = psycopg2.connect(**{**DB_CONFIG, "dbname": "postgres"})
        # CREATE DATABASE cannot run inside a transaction block.
        conn.autocommit = True
        with conn.cursor() as cur:
            cur.execute("SELECT 1 FROM pg_database WHERE datname = %s", (target_db,))
            if cur.fetchone():
                print(f" Databáze '{target_db}' existuje.")
            else:
                # The name comes from the environment: quote it as an
                # identifier instead of pasting it into the statement.
                cur.execute(
                    sql.SQL("CREATE DATABASE {} ENCODING = 'UTF8'").format(
                        sql.Identifier(target_db)
                    )
                )
                print(f" Databáze '{target_db}' vytvořena.")
    except Exception as e:  # deliberate best-effort boundary, see docstring
        print(f" [WARN] Nelze ověřit/vytvořit databázi: {e}")
        print(f" Ujistěte se, že databáze '{target_db}' existuje ručně.")
    finally:
        if conn is not None:
            conn.close()


def create_schema(conn):
    """Execute SCHEMA_SQL on `conn` and commit (tables and indexes, IF NOT EXISTS)."""
    with conn.cursor() as cur:
        cur.execute(SCHEMA_SQL)
    conn.commit()
    print(" Schéma OK (tabulky a indexy vytvořeny / již existují).")


# ──────────────────────────────────────────────────────────────────────────────
# Main import
# ──────────────────────────────────────────────────────────────────────────────

def import_jsonl(jsonl_path: Path):
    """
    Import every record of the JSONL file at `jsonl_path` into `photos`.

    Steps: ensure the database exists, create the schema, then read the file
    line by line and insert the records in batches of BATCH_SIZE. Blank lines
    are skipped. A line that cannot be parsed or converted is counted as an
    error (the first 10 are printed) and skipped.

    Each batch is committed as soon as it is inserted, so an interrupted run
    keeps the rows already written; the insert is ON CONFLICT DO NOTHING, so
    the import can simply be started again. The connection is always closed,
    also when a database error propagates out of this function.
    """
    banner = "=" * 60
    print(f"\n{banner}")
    print(" FotkyBuzalkovi — import do PostgreSQL")
    print(f" Soubor : {jsonl_path}")
    print(f" DB : {DB_CONFIG['host']}:{DB_CONFIG['port']}/{DB_CONFIG['dbname']}")
    print(f"{banner}\n")

    print("1) Kontrola databáze...")
    ensure_database()

    print("2) Připojení a vytvoření schématu...")
    conn = psycopg2.connect(**DB_CONFIG)
    try:
        conn.autocommit = False
        create_schema(conn)

        print("3) Import záznamů...")
        total = inserted = errors = 0
        batch: list = []

        with open(jsonl_path, encoding="utf-8") as f:
            for lineno, line in enumerate(f, 1):
                line = line.strip()
                if not line:
                    continue

                try:
                    batch.append(extract_fields(json.loads(line)))
                except Exception as e:
                    # Per-line boundary: one bad record must not stop the import.
                    errors += 1
                    if errors <= 10:
                        print(f"\n [CHYBA] řádek {lineno}: {e}")
                    continue

                if len(batch) >= BATCH_SIZE:
                    inserted += _flush(conn, batch)
                    conn.commit()
                    total += len(batch)
                    batch = []
                    _progress(total, inserted, errors)

        # remainder that did not fill a whole batch
        if batch:
            inserted += _flush(conn, batch)
            total += len(batch)

        conn.commit()
    finally:
        # Closing without commit discards whatever the failed batch wrote.
        conn.close()

    print(f"\n\n{banner}")
    print(" Hotovo!")
    print(f" Zpracováno : {total:>8}")
    print(f" Vloženo : {inserted:>8}")
    print(f" Duplicity : {total - inserted:>8} (přeskočeno)")
    print(f" Chyby : {errors:>8}")
    print(f"{banner}\n")


def _flush(conn, batch: list) -> int:
    """
    Insert `batch` (a non-empty list of extract_fields() tuples) and return
    the number of rows actually inserted; rows skipped by ON CONFLICT return
    no id and are therefore not counted. Does not commit.
    """
    with conn.cursor() as cur:
        # page_size defaults to 100; passing the batch length sends the whole
        # batch as a single INSERT statement.
        rows = execute_values(
            cur, INSERT_SQL, batch, page_size=len(batch), fetch=True
        )
    return len(rows)


def _progress(total: int, inserted: int, errors: int):
    """Print a one-line progress counter that overwrites itself (ends with \\r)."""
    print(f" {total:>8} řádků | {inserted:>8} vloženo | {errors} chyb", end="\r")


def main(argv: Optional[list] = None) -> int:
    """
    Command-line entry point.

    Uses the first argument as the JSONL path, or DEFAULT_JSONL when none is
    given. Returns the process exit code: 1 when the file does not exist,
    0 after a completed import.
    """
    args = sys.argv[1:] if argv is None else argv
    path = Path(args[0]) if args else DEFAULT_JSONL
    if not path.exists():
        print(f"[ERROR] Soubor nenalezen: {path}")
        return 1
    import_jsonl(path)
    return 0


# ──────────────────────────────────────────────────────────────────────────────
if __name__ == "__main__":
    sys.exit(main())