# ============================================================
# seaweed_backfill_v1.1.py
# Version: 1.1
# Date: 2026-06-15
# v1.1: retry 3x with a 5 s pause on HTTP 5xx (transient server error)
# Description: One-off backfill — uploads to the SeaweedFS Filer
#   every document from VTMF.documents that is on disk
#   (downloaded=True, file!=null) but does not yet have a seaweed_path.
#   Placeholders and records without a file are skipped.
#   Safe to run repeatedly — the HEAD check provides dedup, so an
#   interrupted run simply picks up where it left off next time.
# ============================================================

import hashlib
import mimetypes
import sys
import time
import urllib.error
import urllib.request
from datetime import datetime
from pathlib import Path
from typing import Optional, Tuple

MONGO_URI = "mongodb://192.168.1.76:27017"
MONGO_DB = "VTMF"
MONGO_COLL = "documents"

SEAWEED_FILER = "http://192.168.1.50:8888"
SEAWEED_PREFIX = "/vtmf-documents"

MAX_ATTEMPTS = 3
RETRY_PAUSE = 5  # seconds between attempts on HTTP 5xx


def log(msg):
    """Print *msg* and flush immediately so progress is visible when piped."""
    print(msg, flush=True)


def sw_path(sha256: str) -> str:
    """Return the Filer path for a hex digest, sharded by its first two byte pairs.

    Example: ``abcd12...`` -> ``/vtmf-documents/ab/cd/abcd12...``.
    """
    return f"{SEAWEED_PREFIX}/{sha256[:2]}/{sha256[2:4]}/{sha256}"


def seaweed_store(
    data: bytes,
    mime: str = "application/octet-stream",
    sha256: Optional[str] = None,
) -> Tuple[str, str, bool]:
    """Store *data* in the SeaweedFS Filer under its content hash.

    A HEAD request is issued first; if the object already exists nothing is
    uploaded (dedup hit). Otherwise the data is PUT, retrying up to
    ``MAX_ATTEMPTS`` times with ``RETRY_PAUSE`` seconds between attempts when
    the server answers with HTTP 5xx.

    Args:
        data: Raw file content to store.
        mime: Value sent as the ``Content-Type`` header of the PUT.
        sha256: Optional precomputed hex SHA-256 of *data*; computed here when
            omitted. Lets callers that already hashed the data skip a second
            pass over it.

    Returns:
        ``(path, url, uploaded)`` — the Filer path, the full URL, and ``True``
        when the data was uploaded by this call (``False`` on a dedup hit).

    Raises:
        urllib.error.HTTPError: HEAD answered with an error other than 404,
            PUT answered with a status below 500, or PUT still answered 5xx
            on the last attempt.
        urllib.error.URLError: Connection-level failures; these are not caught
            here and therefore not retried.
    """
    if sha256 is None:
        sha256 = hashlib.sha256(data).hexdigest()
    path = sw_path(sha256)
    url = SEAWEED_FILER + path

    try:
        head_req = urllib.request.Request(url, method="HEAD")
        # Only the status matters; the context manager closes the response.
        with urllib.request.urlopen(head_req, timeout=10):
            pass
        return path, url, False  # dedup hit
    except urllib.error.HTTPError as e:
        if e.code != 404:
            raise

    last_err = None
    for attempt in range(1, MAX_ATTEMPTS + 1):
        put_req = urllib.request.Request(
            url, data=data, method="PUT", headers={"Content-Type": mime})
        try:
            with urllib.request.urlopen(put_req, timeout=120):
                pass
            return path, url, True
        except urllib.error.HTTPError as e:
            if e.code < 500:
                raise  # 4xx — retrying is pointless
            last_err = e
            if attempt < MAX_ATTEMPTS:
                log(f" [!] HTTP {e.code} (pokus {attempt}/{MAX_ATTEMPTS}), čekám {RETRY_PAUSE}s...")
                time.sleep(RETRY_PAUSE)
    raise last_err


def _format_size(num_bytes: int) -> str:
    """Format a byte count as whole KB below 1 MB, otherwise MB with one decimal."""
    size_kb = num_bytes / 1024
    if size_kb < 1024:
        return f"{size_kb:.0f} KB"
    return f"{size_kb / 1024:.1f} MB"


def _backfill_document(coll, doc, prefix: str) -> bool:
    """Upload one document's file and record the result on its Mongo record.

    *prefix* is the ``[n/total] key`` label used for the progress line.
    Returns ``True`` when the file was uploaded, ``False`` on a dedup hit.
    Any exception from reading, uploading or updating propagates to the caller.
    """
    path = doc["file"]
    data = Path(path).read_bytes()
    size_str = _format_size(len(data))
    extension = Path(path).suffix.lstrip(".").upper()
    # `desc` may be stored as an explicit null; treat that like a missing key.
    desc = (doc.get("desc") or "")[:60]
    log(f"{prefix} ({size_str} {extension}) {desc}")

    mime = mimetypes.guess_type(path)[0] or "application/octet-stream"
    sha256_hex = hashlib.sha256(data).hexdigest()
    sw_p, sw_url, was_new = seaweed_store(data, mime, sha256=sha256_hex)

    coll.update_one({"_id": doc["_id"]}, {"$set": {
        "sha256": sha256_hex,
        "seaweed_path": sw_p,
        "seaweed_url": sw_url,
        # NOTE(review): naive local time; PyMongo stores naive datetimes as
        # if they were UTC — confirm this matches the other writers of
        # this field before changing it.
        "seaweed_synced_at": datetime.now(),
    }})

    if was_new:
        log(f" [ok] Nahráno ({size_str}) → {sw_p}")
    else:
        log(f" [i] Dedup hit ({size_str}) → {sw_p}")
    return was_new


def main():
    """Backfill every eligible document and exit with 1 if any of them failed.

    Eligible documents are downloaded, non-placeholder records with a file
    path and no ``seaweed_path`` yet. Records whose file is missing on disk
    are skipped; errors on individual documents are logged and counted
    without aborting the run.
    """
    # Imported here so the HTTP helpers above remain importable on machines
    # that do not have the MongoDB driver installed.
    from pymongo import ASCENDING, MongoClient

    client = MongoClient(MONGO_URI, serverSelectionTimeoutMS=5000)
    try:
        client.admin.command("ping")  # fail fast if the server is unreachable
        coll = client[MONGO_DB][MONGO_COLL]
        log(f"[ok] Mongo připojeno: {MONGO_URI} / {MONGO_DB}.{MONGO_COLL}")

        query = {
            "downloaded": True,
            "placeholder": {"$ne": True},
            # Equality with None matches both a null and a missing field.
            "seaweed_path": None,
            "file": {"$ne": None},
        }
        # Materialised up front because the total is shown in every progress line.
        todo = list(coll.find(query).sort([("vtmf", ASCENDING), ("version", ASCENDING)]))
        log(f"[i] Ke zpracování: {len(todo)} dokumentů\n")

        uploaded = dedup = skipped = failed = 0
        for n, doc in enumerate(todo, 1):
            key = doc["_id"]
            path = doc.get("file")
            prefix = f"[{n}/{len(todo)}] {key}"

            if not path or not Path(path).exists():
                log(f"{prefix} [!] Soubor nenalezen na disku — přeskočeno.")
                skipped += 1
                continue

            # Per-document boundary: one bad record must not stop the backfill.
            try:
                was_new = _backfill_document(coll, doc, prefix)
            except Exception as e:
                failed += 1
                log(f" [!] Chyba: {e}")
                continue

            if was_new:
                uploaded += 1
            else:
                dedup += 1
    finally:
        client.close()

    log(f"\n{'='*60}")
    log(f" Hotovo: {uploaded} nahráno, {dedup} dedup, "
        f"{skipped} bez souboru, {failed} chyb.")
    log(f"{'='*60}")
    sys.exit(1 if failed else 0)


if __name__ == "__main__":
    main()