141 lines
4.4 KiB
Python
141 lines
4.4 KiB
Python
# ============================================================
|
|
# seaweed_backfill_v1.1.py
|
|
# Verze: 1.1
|
|
# Datum: 2026-06-15
|
|
# v1.1: retry 3x s 5s pauzou při HTTP 5xx (přechodná chyba serveru)
|
|
# Popis: Jednorázový backfill — nahraje do SeaweedFS Filer
|
|
# všechny dokumenty z VTMF.documents, které jsou na disku
|
|
# (downloaded=True, file!=null) ale ještě nemají seaweed_path.
|
|
# Placeholdery a záznamy bez souboru přeskočí.
|
|
# Lze spustit opakovaně — HEAD check zajistí dedup,
|
|
# přerušení kdykoli naváže příště.
|
|
# ============================================================
|
|
|
|
import hashlib
|
|
import mimetypes
|
|
import sys
|
|
import time
|
|
import urllib.error
|
|
import urllib.request
|
|
from datetime import datetime
|
|
from pathlib import Path
|
|
|
|
from pymongo import MongoClient, ASCENDING
|
|
|
|
# MongoDB connection URI handed to MongoClient in main().
MONGO_URI = "mongodb://192.168.1.76:27017"
|
|
# Name of the database selected on the Mongo client.
MONGO_DB = "VTMF"
|
|
# Collection whose documents are queried and updated by the backfill.
MONGO_COLL = "documents"
|
|
|
|
# Base URL of the SeaweedFS Filer; object paths (starting with "/") are
# appended directly, so it carries no trailing slash.
SEAWEED_FILER = "http://192.168.1.50:8888"
|
|
# Path prefix under which every stored object is placed (see sw_path()).
SEAWEED_PREFIX = "/vtmf-documents"
|
|
|
|
|
|
def log(msg):
    """Print ``msg`` to standard output and flush immediately.

    Flushing on every call keeps the progress output visible in real time
    even when stdout is piped or redirected.
    """
    print(msg, flush=True)
|
|
|
|
|
|
def sw_path(sha256):
    """Return the Filer path for an object identified by its SHA-256 hex digest.

    The path is ``SEAWEED_PREFIX`` followed by two directory levels taken
    from the first four characters of the digest (two characters each) and
    finally the full digest as the object name.
    """
    level_one, level_two = sha256[:2], sha256[2:4]
    return f"{SEAWEED_PREFIX}/{level_one}/{level_two}/{sha256}"
|
|
|
|
|
|
# Total number of PUT attempts per object, the first try included.
MAX_ATTEMPTS = 3
|
|
RETRY_PAUSE = 5 # seconds to wait between attempts after an HTTP 5xx
|
|
|
|
|
|
def seaweed_store(data, mime="application/octet-stream"):
    """Store ``data`` in SeaweedFS under its content hash, skipping known blobs.

    A HEAD request first checks whether an object already exists at the
    content-addressed path; if it does, nothing is uploaded. Otherwise the
    bytes are sent with PUT, and an HTTP 5xx answer is retried until
    ``MAX_ATTEMPTS`` attempts have been made, pausing ``RETRY_PAUSE`` seconds
    between attempts.

    Args:
        data: Raw bytes of the file to store.
        mime: Value sent as the ``Content-Type`` header of the PUT request.

    Returns:
        A tuple ``(path, url, uploaded)`` where ``path`` is the Filer path,
        ``url`` is ``SEAWEED_FILER + path`` and ``uploaded`` is ``True`` when
        the object was PUT by this call, ``False`` when it already existed.

    Raises:
        urllib.error.HTTPError: The HEAD check failed with a status other
            than 404, the PUT failed with a status below 500, or the final
            PUT attempt still answered with 5xx.
        ValueError: ``MAX_ATTEMPTS`` is smaller than 1, so no PUT was tried.

    Other errors raised by ``urlopen`` (for example ``urllib.error.URLError``)
    are not caught here and propagate to the caller.
    """
    sha256 = hashlib.sha256(data).hexdigest()
    path = sw_path(sha256)
    url = SEAWEED_FILER + path

    # Existence check: a successful HEAD means the blob is already stored.
    head_request = urllib.request.Request(url, method="HEAD")
    try:
        # The context manager closes the response instead of leaking it.
        with urllib.request.urlopen(head_request, timeout=10):
            pass
    except urllib.error.HTTPError as e:
        if e.code != 404:
            raise
        # 404 is the expected "not stored yet" answer; release its response.
        e.close()
    else:
        return path, url, False  # dedup hit

    for attempt in range(1, MAX_ATTEMPTS + 1):
        put_request = urllib.request.Request(
            url, data=data, method="PUT", headers={"Content-Type": mime})
        try:
            with urllib.request.urlopen(put_request, timeout=120):
                pass
            return path, url, True
        except urllib.error.HTTPError as e:
            # 4xx — retrying is pointless; a 5xx on the last attempt is final.
            if e.code < 500 or attempt == MAX_ATTEMPTS:
                raise
            # Release the failed response before waiting for the next try.
            e.close()
            log(f" [!] HTTP {e.code} (pokus {attempt}/{MAX_ATTEMPTS}), čekám {RETRY_PAUSE}s...")
            time.sleep(RETRY_PAUSE)

    # Only reachable when the loop body never ran.
    raise ValueError("MAX_ATTEMPTS must be at least 1")
|
|
|
|
|
|
def main():
    """Backfill documents that are on disk but not yet stored in SeaweedFS.

    Selects documents with ``downloaded=True``, a non-null ``file``, no
    ``seaweed_path`` and no ``placeholder=True`` flag, ordered by ``vtmf`` and
    ``version``. Each file is read from disk and passed to ``seaweed_store``;
    on success the document is updated with ``sha256``, ``seaweed_path``,
    ``seaweed_url`` and ``seaweed_synced_at``.

    Documents whose file is missing on disk are skipped. Any other error for
    a single document is logged and counted, and processing continues with
    the next one.

    Exits the process with status 1 when at least one document failed,
    otherwise with status 0.
    """
    # The context manager closes the client's connections when the run ends.
    with MongoClient(MONGO_URI, serverSelectionTimeoutMS=5000) as client:
        # Issue a command right away so connection problems surface here.
        client.admin.command("ping")
        coll = client[MONGO_DB][MONGO_COLL]
        log(f"[ok] Mongo připojeno: {MONGO_URI} / {MONGO_DB}.{MONGO_COLL}")

        query = {
            "downloaded": True,
            "placeholder": {"$ne": True},
            # In MongoDB an equality match on None selects documents where
            # the field is null or missing.
            "seaweed_path": None,
            "file": {"$ne": None},
        }
        # Materialised up front: the total is needed for the progress prefix.
        todo = list(coll.find(query).sort([("vtmf", ASCENDING), ("version", ASCENDING)]))
        total = len(todo)
        log(f"[i] Ke zpracování: {total} dokumentů\n")

        uploaded = dedup = skipped = failed = 0

        for n, doc in enumerate(todo, 1):
            key = doc["_id"]
            path = doc.get("file")

            if not path or not Path(path).exists():
                log(f"[{n}/{total}] {key} [!] Soubor nenalezen na disku — přeskočeno.")
                skipped += 1
                continue

            try:
                file_path = Path(path)
                data = file_path.read_bytes()
                size_kb = len(data) / 1024
                size_str = f"{size_kb:.0f} KB" if size_kb < 1024 else f"{size_kb / 1024:.1f} MB"
                extension = file_path.suffix.lstrip(".").upper()
                # "desc" may be stored as null or a non-string; slicing it
                # directly would raise and mark the document as failed on
                # every run before the upload is even attempted.
                description = str(doc.get("desc") or "")[:60]
                log(f"[{n}/{total}] {key} ({size_str} {extension}) {description}")
                mime = mimetypes.guess_type(path)[0] or "application/octet-stream"
                sha256_hex = hashlib.sha256(data).hexdigest()

                sw_p, sw_url, was_new = seaweed_store(data, mime)

                coll.update_one({"_id": key}, {"$set": {
                    "sha256": sha256_hex,
                    "seaweed_path": sw_p,
                    "seaweed_url": sw_url,
                    # NOTE(review): this is a naive local timestamp; PyMongo
                    # stores naive datetimes as if they were UTC. Confirm this
                    # matches how other writers of the collection record time.
                    "seaweed_synced_at": datetime.now(),
                }})

                if was_new:
                    uploaded += 1
                    log(f" [ok] Nahráno ({size_str}) → {sw_p}")
                else:
                    dedup += 1
                    log(f" [i] Dedup hit ({size_str}) → {sw_p}")

            except Exception as e:
                # Per-document boundary: one bad document must not stop the
                # run; the failure is reported through the exit status.
                failed += 1
                log(f" [!] Chyba: {e}")

    log(f"\n{'='*60}")
    log(f" Hotovo: {uploaded} nahráno, {dedup} dedup, "
        f"{skipped} bez souboru, {failed} chyb.")
    log(f"{'='*60}")

    sys.exit(1 if failed else 0)
|
|
|
|
|
|
# Run the backfill only when this file is executed as a script.
if __name__ == "__main__":
    main()
|