122 lines
3.6 KiB
Python
122 lines
3.6 KiB
Python
# ============================================================
# seaweed_backfill_v1.0.py
# Version: 1.0
# Date: 2026-06-15
# Description: One-off backfill — uploads to the SeaweedFS Filer
#   all documents from VTMF.documents that are on disk
#   (downloaded=True, file!=null) but do not yet have a seaweed_path.
#   Placeholders and records without a file are skipped.
#   Can be run repeatedly — the HEAD check ensures dedup,
#   and a run interrupted at any point resumes on the next run.
# ============================================================
|
|
|
|
import hashlib
|
|
import mimetypes
|
|
import sys
|
|
import urllib.error
|
|
import urllib.request
|
|
from datetime import datetime
|
|
from pathlib import Path
|
|
|
|
from pymongo import MongoClient, ASCENDING
|
|
|
|
# MongoDB source: server URI, database name and collection name.
MONGO_URI: str = "mongodb://192.168.1.76:27017"
MONGO_DB: str = "VTMF"
MONGO_COLL: str = "documents"

# SeaweedFS Filer target: base URL and the path prefix prepended to every
# stored object path.
SEAWEED_FILER: str = "http://192.168.1.50:8888"
SEAWEED_PREFIX: str = "/vtmf-documents"
|
|
|
|
|
|
def log(msg: object) -> None:
    """Print *msg* to standard output and flush immediately.

    Flushing on every call keeps progress lines visible right away even when
    stdout is buffered (e.g. redirected to a file or a pipe).
    """
    print(msg, flush=True)
|
|
|
|
|
|
def sw_path(sha256):
    """Return the Filer path for a content hash.

    The path has the form ``<SEAWEED_PREFIX>/<aa>/<bb>/<sha256>`` where
    ``aa`` and ``bb`` are the first and second pair of characters of
    *sha256*, giving a two-level directory fan-out.
    """
    level_one = sha256[:2]
    level_two = sha256[2:4]
    return f"{SEAWEED_PREFIX}/{level_one}/{level_two}/{sha256}"
|
|
|
|
|
|
def seaweed_store(data, mime="application/octet-stream"):
    """Store *data* in the SeaweedFS Filer under its content-addressed path.

    A HEAD request is sent first; when the object already exists the upload
    is skipped (dedup hit). Otherwise the bytes are uploaded with a PUT.

    Args:
        data: Raw bytes of the file to store.
        mime: Value sent as the ``Content-Type`` header of the PUT.

    Returns:
        A tuple ``(path, url, uploaded)``: *path* is the Filer path built by
        ``sw_path`` from the SHA-256 of *data*, *url* is
        ``SEAWEED_FILER + path``, and *uploaded* is True when a PUT was
        performed and False on a dedup hit.

    Raises:
        urllib.error.HTTPError: If the HEAD check answers with an error
            status other than 404, or if the PUT is rejected.
        Connection-level errors raised by ``urlopen`` propagate unchanged.
    """
    sha256 = hashlib.sha256(data).hexdigest()
    path = sw_path(sha256)
    url = SEAWEED_FILER + path

    head_request = urllib.request.Request(url, method="HEAD")
    try:
        # The response is used as a context manager so its connection is
        # released instead of lingering until garbage collection.
        with urllib.request.urlopen(head_request, timeout=10):
            return path, url, False  # dedup hit
    except urllib.error.HTTPError as e:
        if e.code != 404:
            raise
        # 404 only means "not stored yet"; the error object wraps an open
        # response, so release it before continuing with the upload.
        e.close()

    put_request = urllib.request.Request(
        url,
        data=data,
        method="PUT",
        headers={"Content-Type": mime},
    )
    # The response body is not needed; the `with` only closes the connection.
    with urllib.request.urlopen(put_request, timeout=120):
        pass
    return path, url, True
|
|
|
|
|
|
def main():
    """Upload every on-disk document without a ``seaweed_path`` to SeaweedFS.

    Selects documents with ``downloaded=True``, a non-null ``file``, no
    ``seaweed_path`` and ``placeholder`` not True, ordered by ``vtmf`` and
    ``version``. For each one whose file exists on disk, the bytes are stored
    via ``seaweed_store`` and the document is updated with ``sha256``,
    ``seaweed_path``, ``seaweed_url`` and ``seaweed_synced_at``. A failure on
    one document is logged and counted; processing continues with the next.

    Exits the process with status 1 if any document failed, otherwise 0.
    """
    client = MongoClient(MONGO_URI, serverSelectionTimeoutMS=5000)
    try:
        # MongoClient connects lazily; the ping makes connection problems
        # surface here rather than in the middle of the run.
        client.admin.command("ping")
        coll = client[MONGO_DB][MONGO_COLL]
        log(f"[ok] Mongo připojeno: {MONGO_URI} / {MONGO_DB}.{MONGO_COLL}")

        query = {
            "downloaded": True,
            "placeholder": {"$ne": True},
            "seaweed_path": None,
            "file": {"$ne": None},
        }
        # Only _id and file are read below, so project the rest away instead
        # of holding every full document in memory.
        cursor = coll.find(query, {"file": 1}).sort(
            [("vtmf", ASCENDING), ("version", ASCENDING)])
        # Materialized up front: the total is needed for progress output.
        todo = list(cursor)
        total = len(todo)
        log(f"[i] Ke zpracování: {total} dokumentů\n")

        uploaded = dedup = skipped = failed = 0

        for n, doc in enumerate(todo, 1):
            key = doc["_id"]
            path = doc.get("file")
            log(f"[{n}/{total}] {key}")

            file_path = Path(path) if path else None
            if file_path is None or not file_path.exists():
                log(f" [!] Soubor nenalezen na disku: {path} — přeskočeno.")
                skipped += 1
                continue

            # Per-document boundary: any error is logged and counted so one
            # bad record does not abort the whole backfill.
            try:
                data = file_path.read_bytes()
                mime = mimetypes.guess_type(path)[0] or "application/octet-stream"
                sha256_hex = hashlib.sha256(data).hexdigest()

                sw_p, sw_url, was_new = seaweed_store(data, mime)

                # NOTE(review): datetime.now() is naive local time, and
                # PyMongo stores naive datetimes as if they were UTC.
                # Confirm this matches how other writers fill
                # seaweed_synced_at before switching to an aware UTC value.
                coll.update_one({"_id": key}, {"$set": {
                    "sha256": sha256_hex,
                    "seaweed_path": sw_p,
                    "seaweed_url": sw_url,
                    "seaweed_synced_at": datetime.now(),
                }})

                if was_new:
                    uploaded += 1
                    log(f" [ok] Nahráno → {sw_p}")
                else:
                    dedup += 1
                    log(f" [i] Dedup hit → {sw_p}")

            except Exception as e:
                failed += 1
                log(f" [!] Chyba: {e}")

        log(f"\n{'='*60}")
        log(f" Hotovo: {uploaded} nahráno, {dedup} dedup, "
            f"{skipped} bez souboru, {failed} chyb.")
        log(f"{'='*60}")
    finally:
        # Release the connection pool on every exit path, including errors.
        client.close()

    sys.exit(1 if failed else 0)
|
|
|
|
|
|
# Run the backfill only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()
|