w22
This commit is contained in:
+35
-6
@@ -39,7 +39,9 @@ if sys.stdout.encoding and sys.stdout.encoding.lower() != "utf-8":
|
||||
sys.stdout.reconfigure(encoding="utf-8")
|
||||
sys.stderr.reconfigure(encoding="utf-8")
|
||||
|
||||
import logging
|
||||
import exifread
|
||||
logging.getLogger("exifread").setLevel(logging.CRITICAL)
|
||||
import imagehash
|
||||
from PIL import Image, ImageOps, IptcImagePlugin
|
||||
|
||||
@@ -53,6 +55,9 @@ OUTPUT_DIR = Path(__file__).parent / "output"
|
||||
OUTPUT_JSONL = OUTPUT_DIR / "10_metadata.jsonl"
|
||||
ERROR_LOG = OUTPUT_DIR / "10_errors.log"
|
||||
|
||||
RESUME = True
|
||||
WORKERS = 2
|
||||
|
||||
IPTC_TAG_NAMES = {
|
||||
(2, 5): "ObjectName",
|
||||
(2, 10): "Urgency",
|
||||
@@ -133,6 +138,30 @@ def extract_gps(raw_tags: dict) -> dict:
|
||||
return result
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# JSON serializace
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _make_serializable(obj):
    """Recursively convert a value into something ``json.dumps`` can encode.

    Conversion rules, applied in this order:

    * ``None``, ``bool``, ``int``, ``float`` and ``str`` are returned unchanged.
    * Rational-like objects (anything exposing both ``numerator`` and
      ``denominator``, e.g. IFDRational) become ``float``; if that conversion
      raises, their ``str()`` form is used instead.
    * ``dict`` becomes a new dict with ``str`` keys and converted values.
    * ``list`` and ``tuple`` become a list of converted items.
    * ``bytes`` are cut to the first 200 bytes and decoded as UTF-8, with
      undecodable bytes replaced.
    * Anything else is returned as-is when ``json.dumps`` accepts it,
      otherwise as ``str(obj)``.

    Args:
        obj: Arbitrary value, possibly nested.

    Returns:
        A structure built only from dict, list, str, int, float, bool and
        None (or objects ``json.dumps`` already accepts).
    """
    # Native JSON scalars pass through untouched.  This guard has to come
    # before the rational check: built-in int and bool also expose
    # ``numerator``/``denominator`` and would otherwise be emitted as floats
    # (4000 -> 4000.0, True -> 1.0), losing their type and, for very large
    # integers, precision.
    if obj is None or isinstance(obj, (bool, int, float, str)):
        return obj

    if hasattr(obj, "numerator") and hasattr(obj, "denominator"):
        try:
            return float(obj)
        except Exception:
            # Deliberate best effort: rational types from third-party
            # libraries may fail in library-specific ways (e.g. a zero
            # denominator); keep a textual form rather than dropping the value.
            return str(obj)

    if isinstance(obj, dict):
        return {str(k): _make_serializable(v) for k, v in obj.items()}

    if isinstance(obj, (list, tuple)):
        return [_make_serializable(x) for x in obj]

    if isinstance(obj, bytes):
        # Cap the payload so large binary blobs do not bloat the output.
        return obj[:200].decode("utf-8", errors="replace")

    try:
        # Probe: if the encoder accepts the object, keep it unchanged.
        json.dumps(obj)
        return obj
    except (TypeError, ValueError):
        return str(obj)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Hashe
|
||||
# ---------------------------------------------------------------------------
|
||||
@@ -411,14 +440,14 @@ def main():
|
||||
help=f"Zdrojová složka (default: {SOURCE})")
|
||||
parser.add_argument("--output", type=Path, default=OUTPUT_JSONL,
|
||||
help=f"Výstupní JSONL soubor (default: {OUTPUT_JSONL})")
|
||||
parser.add_argument("--resume", action="store_true",
|
||||
help="Přeskočit soubory, které jsou již v JSONL")
|
||||
parser.add_argument("--resume", action="store_true", default=RESUME,
|
||||
help=f"Přeskočit soubory, které jsou již v JSONL (default: {RESUME})")
|
||||
parser.add_argument("--dry-run", action="store_true",
|
||||
help="Jen spočítat soubory, nic nezpracovat")
|
||||
parser.add_argument("--limit", type=int, default=0,
|
||||
help="Zpracovat maximálně N fotek (0 = vše)")
|
||||
parser.add_argument("--workers", type=int, default=1,
|
||||
help="Počet paralelních vláken (default: 1)")
|
||||
parser.add_argument("--workers", type=int, default=WORKERS,
|
||||
help=f"Počet paralelních vláken (default: {WORKERS})")
|
||||
args = parser.parse_args()
|
||||
|
||||
source: Path = args.source
|
||||
@@ -511,7 +540,7 @@ def main():
|
||||
progress.tick(ok=not has_error)
|
||||
if has_error:
|
||||
err_f.write(f"{path}\t{record.get('fatal_error') or record.get('pil_error')}\n")
|
||||
batch.append(json.dumps(record, ensure_ascii=False))
|
||||
batch.append(json.dumps(_make_serializable(record), ensure_ascii=False))
|
||||
if len(batch) >= flush_every:
|
||||
out_f.write("\n".join(batch) + "\n")
|
||||
batch.clear()
|
||||
@@ -531,7 +560,7 @@ def main():
|
||||
progress.tick(ok=not has_error)
|
||||
if has_error:
|
||||
err_f.write(f"{path}\t{record.get('fatal_error') or record.get('pil_error')}\n")
|
||||
batch.append(json.dumps(record, ensure_ascii=False))
|
||||
batch.append(json.dumps(_make_serializable(record), ensure_ascii=False))
|
||||
if len(batch) >= flush_every:
|
||||
out_f.write("\n".join(batch) + "\n")
|
||||
batch.clear()
|
||||
|
||||
File diff suppressed because one or more lines are too long
Reference in New Issue
Block a user