w22
This commit is contained in:
+35
-6
@@ -39,7 +39,9 @@ if sys.stdout.encoding and sys.stdout.encoding.lower() != "utf-8":
|
|||||||
sys.stdout.reconfigure(encoding="utf-8")
|
sys.stdout.reconfigure(encoding="utf-8")
|
||||||
sys.stderr.reconfigure(encoding="utf-8")
|
sys.stderr.reconfigure(encoding="utf-8")
|
||||||
|
|
||||||
|
import logging
|
||||||
import exifread
|
import exifread
|
||||||
|
logging.getLogger("exifread").setLevel(logging.CRITICAL)
|
||||||
import imagehash
|
import imagehash
|
||||||
from PIL import Image, ImageOps, IptcImagePlugin
|
from PIL import Image, ImageOps, IptcImagePlugin
|
||||||
|
|
||||||
@@ -53,6 +55,9 @@ OUTPUT_DIR = Path(__file__).parent / "output"
|
|||||||
OUTPUT_JSONL = OUTPUT_DIR / "10_metadata.jsonl"
|
OUTPUT_JSONL = OUTPUT_DIR / "10_metadata.jsonl"
|
||||||
ERROR_LOG = OUTPUT_DIR / "10_errors.log"
|
ERROR_LOG = OUTPUT_DIR / "10_errors.log"
|
||||||
|
|
||||||
|
RESUME = True
|
||||||
|
WORKERS = 2
|
||||||
|
|
||||||
IPTC_TAG_NAMES = {
|
IPTC_TAG_NAMES = {
|
||||||
(2, 5): "ObjectName",
|
(2, 5): "ObjectName",
|
||||||
(2, 10): "Urgency",
|
(2, 10): "Urgency",
|
||||||
@@ -133,6 +138,30 @@ def extract_gps(raw_tags: dict) -> dict:
|
|||||||
return result
|
return result
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# JSON serializace
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
def _make_serializable(obj):
|
||||||
|
"""Rekurzivně převede vše co JSON nezná (IFDRational, bytes, tuple…) na základní typy."""
|
||||||
|
if hasattr(obj, "numerator") and hasattr(obj, "denominator"):
|
||||||
|
try:
|
||||||
|
return float(obj)
|
||||||
|
except Exception:
|
||||||
|
return str(obj)
|
||||||
|
if isinstance(obj, dict):
|
||||||
|
return {str(k): _make_serializable(v) for k, v in obj.items()}
|
||||||
|
if isinstance(obj, (list, tuple)):
|
||||||
|
return [_make_serializable(x) for x in obj]
|
||||||
|
if isinstance(obj, bytes):
|
||||||
|
return obj[:200].decode("utf-8", errors="replace")
|
||||||
|
try:
|
||||||
|
json.dumps(obj)
|
||||||
|
return obj
|
||||||
|
except (TypeError, ValueError):
|
||||||
|
return str(obj)
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
# Hashe
|
# Hashe
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
@@ -411,14 +440,14 @@ def main():
|
|||||||
help=f"Zdrojová složka (default: {SOURCE})")
|
help=f"Zdrojová složka (default: {SOURCE})")
|
||||||
parser.add_argument("--output", type=Path, default=OUTPUT_JSONL,
|
parser.add_argument("--output", type=Path, default=OUTPUT_JSONL,
|
||||||
help=f"Výstupní JSONL soubor (default: {OUTPUT_JSONL})")
|
help=f"Výstupní JSONL soubor (default: {OUTPUT_JSONL})")
|
||||||
parser.add_argument("--resume", action="store_true",
|
parser.add_argument("--resume", action="store_true", default=RESUME,
|
||||||
help="Přeskočit soubory, které jsou již v JSONL")
|
help=f"Přeskočit soubory, které jsou již v JSONL (default: {RESUME})")
|
||||||
parser.add_argument("--dry-run", action="store_true",
|
parser.add_argument("--dry-run", action="store_true",
|
||||||
help="Jen spočítat soubory, nic nezpracovat")
|
help="Jen spočítat soubory, nic nezpracovat")
|
||||||
parser.add_argument("--limit", type=int, default=0,
|
parser.add_argument("--limit", type=int, default=0,
|
||||||
help="Zpracovat maximálně N fotek (0 = vše)")
|
help="Zpracovat maximálně N fotek (0 = vše)")
|
||||||
parser.add_argument("--workers", type=int, default=1,
|
parser.add_argument("--workers", type=int, default=WORKERS,
|
||||||
help="Počet paralelních vláken (default: 1)")
|
help=f"Počet paralelních vláken (default: {WORKERS})")
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
source: Path = args.source
|
source: Path = args.source
|
||||||
@@ -511,7 +540,7 @@ def main():
|
|||||||
progress.tick(ok=not has_error)
|
progress.tick(ok=not has_error)
|
||||||
if has_error:
|
if has_error:
|
||||||
err_f.write(f"{path}\t{record.get('fatal_error') or record.get('pil_error')}\n")
|
err_f.write(f"{path}\t{record.get('fatal_error') or record.get('pil_error')}\n")
|
||||||
batch.append(json.dumps(record, ensure_ascii=False))
|
batch.append(json.dumps(_make_serializable(record), ensure_ascii=False))
|
||||||
if len(batch) >= flush_every:
|
if len(batch) >= flush_every:
|
||||||
out_f.write("\n".join(batch) + "\n")
|
out_f.write("\n".join(batch) + "\n")
|
||||||
batch.clear()
|
batch.clear()
|
||||||
@@ -531,7 +560,7 @@ def main():
|
|||||||
progress.tick(ok=not has_error)
|
progress.tick(ok=not has_error)
|
||||||
if has_error:
|
if has_error:
|
||||||
err_f.write(f"{path}\t{record.get('fatal_error') or record.get('pil_error')}\n")
|
err_f.write(f"{path}\t{record.get('fatal_error') or record.get('pil_error')}\n")
|
||||||
batch.append(json.dumps(record, ensure_ascii=False))
|
batch.append(json.dumps(_make_serializable(record), ensure_ascii=False))
|
||||||
if len(batch) >= flush_every:
|
if len(batch) >= flush_every:
|
||||||
out_f.write("\n".join(batch) + "\n")
|
out_f.write("\n".join(batch) + "\n")
|
||||||
batch.clear()
|
batch.clear()
|
||||||
|
|||||||
File diff suppressed because one or more lines are too long
Reference in New Issue
Block a user