# ============================================================
# seaweed_backfill_v1.0.py
# Version: 1.0
# Date: 2026-06-15
# Description: One-off backfill — uploads to the SeaweedFS Filer
#   every document in VTMF.documents that is on disk
#   (downloaded=True, file!=null) but has no seaweed_path yet.
#   Placeholders and records without a file are skipped.
#   Safe to run repeatedly — the HEAD check deduplicates, and an
#   interrupted run picks up where it left off next time.
# ============================================================

import hashlib
import mimetypes
import sys
import urllib.error
import urllib.request
from datetime import datetime, timezone
from pathlib import Path

from pymongo import MongoClient, ASCENDING

MONGO_URI = "mongodb://192.168.1.76:27017"
MONGO_DB = "VTMF"
MONGO_COLL = "documents"

SEAWEED_FILER = "http://192.168.1.50:8888"
SEAWEED_PREFIX = "/vtmf-documents"


def log(msg):
    """Print *msg* to stdout and flush immediately."""
    print(msg, flush=True)


def sw_path(sha256):
    """Return the Filer path for a hex digest: PREFIX/<0:2>/<2:4>/<digest>."""
    return f"{SEAWEED_PREFIX}/{sha256[:2]}/{sha256[2:4]}/{sha256}"


def seaweed_store(data, mime="application/octet-stream", sha256=None):
    """Store *data* in the Filer under its content hash (HEAD check + PUT).

    Args:
        data: File content as bytes.
        mime: Value sent as the Content-Type header on upload.
        sha256: Optional precomputed hex SHA-256 of *data*; computed here
            when omitted, so existing two-argument callers are unaffected.

    Returns:
        (path, url, uploaded) — ``uploaded`` is False when the HEAD request
        succeeded, i.e. an object already exists at that path (dedup hit).

    Raises:
        urllib.error.HTTPError: HEAD answered with an error status other
            than 404, or the PUT was rejected.
        urllib.error.URLError: The Filer could not be reached.
    """
    if sha256 is None:
        sha256 = hashlib.sha256(data).hexdigest()
    path = sw_path(sha256)
    url = SEAWEED_FILER + path

    head = urllib.request.Request(url, method="HEAD")
    try:
        # Context manager closes the response instead of leaving it to GC.
        with urllib.request.urlopen(head, timeout=10):
            pass
    except urllib.error.HTTPError as e:
        # HTTPError wraps the response object, so release it as well.
        e.close()
        if e.code != 404:
            raise
    else:
        return path, url, False  # dedup hit

    put = urllib.request.Request(
        url,
        data=data,
        method="PUT",
        headers={"Content-Type": mime},
    )
    with urllib.request.urlopen(put, timeout=120):
        pass
    return path, url, True


def main():
    """Upload every pending document to the Filer and record the result.

    Selects documents with downloaded=True, placeholder != True, a non-null
    ``file`` and no ``seaweed_path``, uploads each file that exists on disk,
    and writes sha256 / seaweed_path / seaweed_url / seaweed_synced_at back
    to the document. Per-document errors are logged and counted, not fatal.

    Exits the process with status 1 if any document failed, otherwise 0.
    """
    client = MongoClient(MONGO_URI, serverSelectionTimeoutMS=5000)
    try:
        # Fail fast when the server is unreachable.
        client.admin.command("ping")
        coll = client[MONGO_DB][MONGO_COLL]
        log(f"[ok] Mongo připojeno: {MONGO_URI} / {MONGO_DB}.{MONGO_COLL}")

        query = {
            "downloaded": True,
            "placeholder": {"$ne": True},
            # Equality with None matches both a null and a missing field.
            "seaweed_path": None,
            "file": {"$ne": None},
        }
        # Only _id and file are read below, so fetch nothing else.
        cursor = coll.find(query, {"file": 1}).sort(
            [("vtmf", ASCENDING), ("version", ASCENDING)]
        )
        # Materialized up front so the progress counter can show a total.
        todo = list(cursor)
        total = len(todo)
        log(f"[i] Ke zpracování: {total} dokumentů\n")

        uploaded = dedup = skipped = failed = 0
        for n, doc in enumerate(todo, 1):
            key = doc["_id"]
            path = doc.get("file")
            log(f"[{n}/{total}] {key}")

            if not path or not Path(path).exists():
                log(f" [!] Soubor nenalezen na disku: {path} — přeskočeno.")
                skipped += 1
                continue

            # Top-level boundary: one bad document must not stop the run.
            try:
                data = Path(path).read_bytes()
                mime = (mimetypes.guess_type(path)[0]
                        or "application/octet-stream")
                sha256_hex = hashlib.sha256(data).hexdigest()
                # Pass the digest along so the content is hashed only once.
                sw_p, sw_url, was_new = seaweed_store(
                    data, mime, sha256=sha256_hex)
                coll.update_one({"_id": key}, {"$set": {
                    "sha256": sha256_hex,
                    "seaweed_path": sw_p,
                    "seaweed_url": sw_url,
                    # PyMongo stores naive datetimes as if they were UTC,
                    # so use an aware UTC timestamp rather than local time.
                    "seaweed_synced_at": datetime.now(timezone.utc),
                }})
            except Exception as e:
                failed += 1
                log(f" [!] Chyba: {e}")
                continue

            if was_new:
                uploaded += 1
                log(f" [ok] Nahráno → {sw_p}")
            else:
                dedup += 1
                log(f" [i] Dedup hit → {sw_p}")

        log(f"\n{'='*60}")
        log(f" Hotovo: {uploaded} nahráno, {dedup} dedup, "
            f"{skipped} bez souboru, {failed} chyb.")
        log(f"{'='*60}")
    finally:
        client.close()

    sys.exit(1 if failed else 0)


if __name__ == "__main__":
    main()