This commit is contained in:
2026-05-22 06:33:37 +02:00
parent ad168aa5ac
commit 4d796a5801
2 changed files with 4135 additions and 6 deletions
+35 -6
View File
@@ -39,7 +39,9 @@ if sys.stdout.encoding and sys.stdout.encoding.lower() != "utf-8":
sys.stdout.reconfigure(encoding="utf-8")
sys.stderr.reconfigure(encoding="utf-8")
import logging
import exifread
logging.getLogger("exifread").setLevel(logging.CRITICAL)
import imagehash
from PIL import Image, ImageOps, IptcImagePlugin
@@ -53,6 +55,9 @@ OUTPUT_DIR = Path(__file__).parent / "output"
# Default path of the JSONL output file (default value of the --output option).
OUTPUT_JSONL = OUTPUT_DIR / "10_metadata.jsonl"
# Error log path inside OUTPUT_DIR; presumably the file main() writes failed
# paths to — TODO confirm against the code that opens it.
ERROR_LOG = OUTPUT_DIR / "10_errors.log"
# Default for --resume (skip files that are already present in the JSONL).
# NOTE(review): --resume is declared with action="store_true", so with a True
# default the flag cannot be switched off from the command line.
RESUME = True
# Default for --workers (number of parallel threads).
WORKERS = 2
IPTC_TAG_NAMES = {
(2, 5): "ObjectName",
(2, 10): "Urgency",
@@ -133,6 +138,30 @@ def extract_gps(raw_tags: dict) -> dict:
return result
# ---------------------------------------------------------------------------
# JSON serializace
# ---------------------------------------------------------------------------
def _make_serializable(obj):
    """Recursively convert *obj* into a structure ``json.dumps`` accepts.

    Conversion rules, applied in this order:

    * ``None``, ``bool``, ``int``, ``float`` and ``str`` are returned as-is.
    * Rational-like objects (anything exposing ``numerator`` and
      ``denominator``) become ``float``; if that conversion fails, their
      ``str()`` is used instead.
    * ``dict`` -> ``dict`` with keys stringified and values converted.
    * ``list`` / ``tuple`` -> ``list`` with every item converted.
    * ``bytes`` -> the first 200 bytes decoded as UTF-8, with undecodable
      bytes replaced.
    * Anything else is returned unchanged when ``json.dumps`` accepts it,
      otherwise its ``str()`` is returned.
    """
    # JSON scalars must be handled before the rational duck-typing below:
    # ``int`` and ``bool`` also expose ``numerator``/``denominator`` and would
    # otherwise be silently turned into floats (5 -> 5.0, True -> 1.0).
    if obj is None or isinstance(obj, (bool, int, float, str)):
        return obj
    if hasattr(obj, "numerator") and hasattr(obj, "denominator"):
        try:
            return float(obj)
        except Exception:
            # Deliberate best effort: a value that cannot be converted must
            # not abort serialization of the whole record.
            return str(obj)
    if isinstance(obj, dict):
        return {str(k): _make_serializable(v) for k, v in obj.items()}
    if isinstance(obj, (list, tuple)):
        return [_make_serializable(x) for x in obj]
    if isinstance(obj, bytes):
        # Truncate before decoding so large binary blobs do not bloat the output.
        return obj[:200].decode("utf-8", errors="replace")
    try:
        # Probe: keep the object as-is when the encoder already understands it.
        json.dumps(obj)
        return obj
    except (TypeError, ValueError):
        return str(obj)
# ---------------------------------------------------------------------------
# Hashe
# ---------------------------------------------------------------------------
@@ -411,14 +440,14 @@ def main():
help=f"Zdrojová složka (default: {SOURCE})")
parser.add_argument("--output", type=Path, default=OUTPUT_JSONL,
help=f"Výstupní JSONL soubor (default: {OUTPUT_JSONL})")
parser.add_argument("--resume", action="store_true",
help="Přeskočit soubory, které jsou již v JSONL")
parser.add_argument("--resume", action="store_true", default=RESUME,
help=f"Přeskočit soubory, které jsou již v JSONL (default: {RESUME})")
parser.add_argument("--dry-run", action="store_true",
help="Jen spočítat soubory, nic nezpracovat")
parser.add_argument("--limit", type=int, default=0,
help="Zpracovat maximálně N fotek (0 = vše)")
parser.add_argument("--workers", type=int, default=1,
help="Počet paralelních vláken (default: 1)")
parser.add_argument("--workers", type=int, default=WORKERS,
help=f"Počet paralelních vláken (default: {WORKERS})")
args = parser.parse_args()
source: Path = args.source
@@ -511,7 +540,7 @@ def main():
progress.tick(ok=not has_error)
if has_error:
err_f.write(f"{path}\t{record.get('fatal_error') or record.get('pil_error')}\n")
batch.append(json.dumps(record, ensure_ascii=False))
batch.append(json.dumps(_make_serializable(record), ensure_ascii=False))
if len(batch) >= flush_every:
out_f.write("\n".join(batch) + "\n")
batch.clear()
@@ -531,7 +560,7 @@ def main():
progress.tick(ok=not has_error)
if has_error:
err_f.write(f"{path}\t{record.get('fatal_error') or record.get('pil_error')}\n")
batch.append(json.dumps(record, ensure_ascii=False))
batch.append(json.dumps(_make_serializable(record), ensure_ascii=False))
if len(batch) >= flush_every:
out_f.write("\n".join(batch) + "\n")
batch.clear()
File diff suppressed because one or more lines are too long