# Source: janssen/VTMFDownloadFiles/seaweed_backfill_v1.1.py
# Listing metadata: 2026-06-15 16:10:47 +02:00 · 141 lines · 4.4 KiB · Python

# ============================================================
# seaweed_backfill_v1.1.py
# Verze: 1.1
# Datum: 2026-06-15
# v1.1: retry 3x s 5s pauzou při HTTP 5xx (přechodná chyba serveru)
# Popis: Jednorázový backfill — nahraje do SeaweedFS Filer
# všechny dokumenty z VTMF.documents, které jsou na disku
# (downloaded=True, file!=null) ale ještě nemají seaweed_path.
# Placeholdery a záznamy bez souboru přeskočí.
# Lze spustit opakovaně — HEAD check zajistí dedup,
# přerušení kdykoli naváže příště.
# ============================================================
import hashlib
import mimetypes
import sys
import time
import urllib.error
import urllib.request
from datetime import datetime
from pathlib import Path
from pymongo import MongoClient, ASCENDING
# MongoDB connection string, database and collection read and updated by this script.
MONGO_URI: str = "mongodb://192.168.1.76:27017"
MONGO_DB: str = "VTMF"
MONGO_COLL: str = "documents"

# Base URL of the SeaweedFS Filer and the path prefix prepended to every
# stored document path.
SEAWEED_FILER: str = "http://192.168.1.50:8888"
SEAWEED_PREFIX: str = "/vtmf-documents"
def log(msg: object) -> None:
    """Print *msg* to standard output and flush immediately.

    Flushing on every call keeps progress lines visible in real time when
    the output is piped or redirected.
    """
    print(msg, flush=True)
def sw_path(sha256: str) -> str:
    """Return the Filer path for a content hash.

    The path is ``SEAWEED_PREFIX/<chars 0-1>/<chars 2-3>/<full hash>``, i.e.
    the first four characters of *sha256* become two directory levels.
    """
    first_level = sha256[:2]
    second_level = sha256[2:4]
    return "/".join((SEAWEED_PREFIX, first_level, second_level, sha256))
# Retry policy for uploads that fail with an HTTP 5xx response.
MAX_ATTEMPTS: int = 3
RETRY_PAUSE: int = 5  # seconds to wait between attempts after a 5xx response
def seaweed_store(data, mime="application/octet-stream"):
    """Store *data* in the SeaweedFS Filer under a content-addressed path.

    The target path is derived from the SHA-256 of *data* (see ``sw_path``).
    A HEAD request is issued first; if the object already exists nothing is
    uploaded. Otherwise the data is sent with PUT, retrying up to
    ``MAX_ATTEMPTS`` times with a ``RETRY_PAUSE`` second pause when the
    server answers with HTTP 5xx.

    Args:
        data: Raw bytes of the file to store.
        mime: Value sent as the ``Content-Type`` header of the PUT request.

    Returns:
        A tuple ``(path, url, uploaded)`` where *uploaded* is ``False`` when
        the HEAD check found an existing object (dedup hit) and ``True``
        when the data was uploaded by this call.

    Raises:
        urllib.error.HTTPError: HEAD answered with an error other than 404,
            PUT answered with a status below 500, or PUT still answered with
            5xx on the final attempt.
        urllib.error.URLError: Connection-level failures; these are not
            retried and propagate to the caller.
    """
    sha256 = hashlib.sha256(data).hexdigest()
    path = sw_path(sha256)
    url = SEAWEED_FILER + path

    head_request = urllib.request.Request(url, method="HEAD")
    try:
        # Only the status matters; the context manager closes the response
        # so the socket is not leaked.
        with urllib.request.urlopen(head_request, timeout=10):
            return path, url, False  # dedup hit
    except urllib.error.HTTPError as e:
        if e.code != 404:
            raise

    # Always make at least one attempt, even if MAX_ATTEMPTS is misconfigured.
    attempts = max(1, MAX_ATTEMPTS)
    for attempt in range(1, attempts + 1):
        put_request = urllib.request.Request(
            url,
            data=data,
            method="PUT",
            headers={"Content-Type": mime},
        )
        try:
            with urllib.request.urlopen(put_request, timeout=120):
                return path, url, True
        except urllib.error.HTTPError as e:
            # 4xx will not succeed on retry; on the last attempt re-raise
            # the 5xx error with its original traceback.
            if e.code < 500 or attempt >= attempts:
                raise
            log(f" [!] HTTP {e.code} (pokus {attempt}/{attempts}), čekám {RETRY_PAUSE}s...")
            time.sleep(RETRY_PAUSE)
    # Not reached: every iteration either returns or raises.
def _format_size(num_bytes):
    """Return *num_bytes* as ``"<n> KB"`` below 1024 KB, otherwise ``"<n.n> MB"``."""
    size_kb = num_bytes / 1024
    if size_kb < 1024:
        return f"{size_kb:.0f} KB"
    return f"{size_kb / 1024:.1f} MB"


def main():
    """Upload every not-yet-synced document to SeaweedFS and record the result.

    Selects documents that are marked as downloaded, are not placeholders,
    have a ``file`` value and no ``seaweed_path`` yet. For each one the file
    is read from disk, stored via ``seaweed_store`` and the document is
    updated with ``sha256``, ``seaweed_path``, ``seaweed_url`` and
    ``seaweed_synced_at``. Documents whose file is missing are skipped; any
    other per-document error is logged and counted without stopping the run.

    Exits the process with status 1 if at least one document failed,
    otherwise 0.
    """
    uploaded = dedup = skipped = failed = 0
    client = MongoClient(MONGO_URI, serverSelectionTimeoutMS=5000)
    try:
        # Fail fast if the server is unreachable.
        client.admin.command("ping")
        coll = client[MONGO_DB][MONGO_COLL]
        log(f"[ok] Mongo připojeno: {MONGO_URI} / {MONGO_DB}.{MONGO_COLL}")

        query = {
            "downloaded": True,
            "placeholder": {"$ne": True},
            # None matches both an explicit null and a missing field.
            "seaweed_path": None,
            "file": {"$ne": None},
        }
        # Materialised up front so the total is known for progress output.
        todo = list(coll.find(query).sort([("vtmf", ASCENDING), ("version", ASCENDING)]))
        total = len(todo)
        log(f"[i] Ke zpracování: {total} dokumentů\n")

        for n, doc in enumerate(todo, 1):
            key = doc["_id"]
            # Everything per-document lives inside the try so that a single
            # malformed record cannot abort the whole backfill.
            try:
                path = doc.get("file")
                # is_file() also rejects directories, which could not be read anyway.
                if not path or not Path(path).is_file():
                    log(f"[{n}/{total}] {key} [!] Soubor nenalezen na disku — přeskočeno.")
                    skipped += 1
                    continue

                data = Path(path).read_bytes()
                size_str = _format_size(len(data))
                extension = Path(path).suffix.lstrip(".").upper()
                # "desc" may be stored as null; fall back to an empty string
                # instead of failing on the slice.
                desc = str(doc.get("desc") or "")[:60]
                log(f"[{n}/{total}] {key} ({size_str} {extension}) {desc}")

                mime = mimetypes.guess_type(path)[0] or "application/octet-stream"
                sha256_hex = hashlib.sha256(data).hexdigest()
                sw_p, sw_url, was_new = seaweed_store(data, mime)
                coll.update_one({"_id": key}, {"$set": {
                    "sha256": sha256_hex,
                    "seaweed_path": sw_p,
                    "seaweed_url": sw_url,
                    # NOTE(review): datetime.now() is naive local time, and
                    # PyMongo stores naive datetimes as if they were UTC.
                    # Confirm how other writers fill timestamp fields before
                    # switching to an aware UTC value.
                    "seaweed_synced_at": datetime.now(),
                }})
                if was_new:
                    uploaded += 1
                    log(f" [ok] Nahráno ({size_str}) → {sw_p}")
                else:
                    dedup += 1
                    log(f" [i] Dedup hit ({size_str}) → {sw_p}")
            except Exception as e:
                failed += 1
                log(f" [!] Chyba: {e}")
    finally:
        # Release the connection pool even when ping or the query raises.
        client.close()

    log(f"\n{'='*60}")
    log(f" Hotovo: {uploaded} nahráno, {dedup} dedup, "
        f"{skipped} bez souboru, {failed} chyb.")
    log(f"{'='*60}")
    sys.exit(1 if failed else 0)
# Run the backfill only when executed as a script, not when imported.
if __name__ == "__main__":
    main()