304 lines
11 KiB
Python
304 lines
11 KiB
Python
#!/usr/bin/env python3
r"""
EUNI -> Plex (Other Videos) export.

Downloads videos from the SeaweedFS filer to the Plex share and names them for
the EUNI library.

Naming scheme:
  single-video course : "<Course title> - <Surname> (<year>).mp4"
  multi-video course  : "<Course title> - <NN> <segment> (<year>).mp4"  (author omitted)

Dependencies:
  pip install pymongo requests

Usage:
  python plex_export.py --dry-run             # only prints the plan, downloads nothing
  python plex_export.py                       # downloads to the default path (\\tower\PlexBinHex\EUNI)
  python plex_export.py --dest D:\plex\EUNI   # different destination folder
  python plex_export.py --limit 5             # downloads only the first 5 (test)

The script is idempotent: it downloads via a .part file, verifies the size and
skips files that are already complete.
If it crashes or you interrupt it (Ctrl-C), just run it again and it finishes the rest.
The target machine needs network access to: Mongo 192.168.1.76,
filer 192.168.1.50:8888 and the destination share.
"""
|
||
|
||
import argparse
|
||
import re
|
||
import sys
|
||
import time
|
||
from datetime import datetime
|
||
from pathlib import Path
|
||
from urllib.parse import quote
|
||
|
||
import requests
|
||
from pymongo import MongoClient
|
||
|
||
# --- let Czech text print on a Windows console (cp1252) without crashing ---
for _s in (sys.stdout, sys.stderr):
    try:
        _s.reconfigure(encoding="utf-8", errors="replace")
    except Exception:
        # Best effort: if the stream cannot be reconfigured, leave it as it is.
        continue
|
||
|
||
# Defaults for the --mongo, --filer and --dest command-line options.
DEF_MONGO = "192.168.1.76"
DEF_FILER = "http://192.168.1.50:8888/"
DEF_DEST = r"\\tower\PlexBinHex\EUNI"

# Safe length of a file name without extension (stays below MAX_PATH).
MAX_NAME = 190

# Titles written in front of a name; compared lower-cased with dots stripped.
TITLES = set(
    "prof doc prim dr mudr mvdr pharmdr phdr rndr mgr bc msc md et ing".split()
)

# Degrees / post-nominals written after a name; compared lower-cased.
DEGREES = set(
    "ph.d. ph.d phd. phd csc. csc drsc. drsc "
    "mba msc. msc m.d. md febo fesc fesc. feso "
    "fean mha ph. d. fcma facp fefim frsph "
    "febtm febu agaf dr.".split()
)
|
||
|
||
# Characters removed from generated file names (the set reserved on Windows).
ILLEGAL = re.compile(r'[\\/:*?"<>|]')

# Full-width variants of those characters as they appear in the original file
# names (vimeo/youtube downloader).  The keys are written as \u escapes on
# purpose: if they are ever normalised to their ASCII look-alikes the table
# stops matching anything full-width and the quote entry becomes a syntax
# error.
FULLWIDTH = str.maketrans({
    "\uff1f": "",    # FULLWIDTH QUESTION MARK
    "\uff1a": " ",   # FULLWIDTH COLON
    "\uff0f": "-",   # FULLWIDTH SOLIDUS
    "\uff0a": "",    # FULLWIDTH ASTERISK
    "\uff1c": "",    # FULLWIDTH LESS-THAN SIGN
    "\uff1e": "",    # FULLWIDTH GREATER-THAN SIGN
    "\uff5c": "-",   # FULLWIDTH VERTICAL LINE
    "\uff02": "",    # FULLWIDTH QUOTATION MARK
})

# Trailing "[id]" tag at the very end of a name (presumably the id the
# downloader appends), including the whitespace around it.
VIMEO_TAIL = re.compile(r"\s*\[[0-9A-Za-z_\-]+\]\s*$")
|
||
|
||
|
||
def surname(autor):
    """Return the surname part of an author string, or None.

    Only the text before the first comma is used.  Leading tokens found in
    TITLES and trailing tokens found in DEGREES are dropped.  A single
    remaining token is returned as is; otherwise everything after the first
    token (the given name) is returned, which keeps double surnames intact.
    """
    if not autor:
        return None
    words = autor.split(",")[0].strip().split()

    # Skip titles at the front ...
    first = 0
    while first < len(words) and words[first].lower().strip(".") in TITLES:
        first += 1
    # ... and degrees at the back.
    last = len(words)
    while last > first and words[last - 1].lower().strip(",") in DEGREES:
        last -= 1

    core = words[first:last]
    if not core:
        return None
    if len(core) == 1:
        return core[0]
    return " ".join(core[1:])
|
||
|
||
|
||
def sanitize(name):
    """Turn *name* into a string usable as a file name.

    Applies the FULLWIDTH translation, removes every ILLEGAL character,
    collapses whitespace runs to a single space and strips surrounding
    whitespace as well as trailing dots/spaces.
    """
    cleaned = ILLEGAL.sub("", name.translate(FULLWIDTH))
    collapsed = re.sub(r"\s+", " ", cleaned).strip()
    return collapsed.rstrip(". ")
|
||
|
||
|
||
def seq_num(soubor):
    """Extract an explicit sequence number (1-30) from a file name, or None.

    Only the base name of *soubor* is inspected (backslashes are treated as
    path separators).  The patterns are tried in order; the first one whose
    number lies in 1..30 wins.
    """
    b = Path(soubor.replace("\\", "/")).name
    # Drop the extension BEFORE the "[id]" tail: VIMEO_TAIL is anchored at the
    # end of the string, so on "Title [id].mp4" it can only match once ".mp4"
    # is gone.  Otherwise digits inside the id (e.g. "[ab_12_cd]") could be
    # mistaken for a sequence number.
    b = re.sub(r"\.mp4", "", b, flags=re.I)
    b = VIMEO_TAIL.sub("", b)
    for pat in (r"^\s*(\d{1,2})[\.\)]",      # "1. Tonometrie", "2) ..."
                r"[ _]p(\d{1,2})\b",         # " p01 ...", "_p02_"
                r"[ _](\d{1,2})[ _]",        # "TABAK_01_", "Meluzinova 3 "
                r"\b(?:cast|část|díl|dil|part)[ _]*(\d{1,2})\b",
                r"[ _](\d{1,2})$"):          # "... 2"
        m = re.search(pat, b, flags=re.I)
        if m:
            n = int(m.group(1))
            if 1 <= n <= 30:
                return n
    return None
|
||
|
||
|
||
def src_id(klic):
    """Return the numeric vimeo id found in *klic* as an int, else None.

    The id is used as an approximation of the upload order.
    """
    found = re.search(r"vimeo:(\d+)", klic if klic else "")
    if found is None:
        return None
    return int(found.group(1))
|
||
|
||
|
||
def order_items(items):
    """Put the videos of one course into a logical order.

    If every item carries an explicit, unique sequence number in its file
    name, sort by that number.  Otherwise fall back to the vimeo id (upload
    order); items without a vimeo id come last, sorted by file name.
    """
    numbers = [seq_num(it.get("soubor", "")) for it in items]
    if None not in numbers and len(numbers) == len(set(numbers)):
        paired = sorted(zip(numbers, items), key=lambda pair: pair[0])
        return [it for _, it in paired]

    def upload_key(it):
        vimeo_id = src_id(it.get("klic"))
        if vimeo_id is None:
            return (1, 0, it.get("soubor", ""))
        return (0, vimeo_id, "")

    return sorted(items, key=upload_key)
|
||
|
||
|
||
def seg_label(soubor, nazev):
    """Derive a short segment label from a video file name.

    Takes the base name of *soubor*, removes ".mp4" and a trailing "[id]"
    tag, strips the first matching course-title prefix built from *nazev*,
    replaces underscores, drops a leading "NN." / "NN)" counter and collapses
    whitespace.  Returns "" when nothing meaningful is left (empty, a long
    alphanumeric id, or a generic word).
    """
    label = Path(soubor.replace("\\", "/")).name
    label = re.sub(r"\.mp4\s*", " ", label, flags=re.I)
    label = VIMEO_TAIL.sub("", label).strip()

    # Longest / most specific prefixes first; only the first hit is removed.
    prefixes = (
        f"EUNI kurz - {nazev} - studijní materiál -",
        f"EUNI kurz - {nazev} - studijní materiál",
        f"EUNI kurz - {nazev} -",
        f"EUNI kurz - {nazev}",
        f"{nazev} -",
        f"{nazev}-",
        nazev,
    )
    lowered = label.lower()
    hit = next((p for p in prefixes if lowered.startswith(p.lower())), None)
    if hit is not None:
        label = label[len(hit):].strip(" -")

    label = label.replace("_", " ").strip()
    # A duplicated sequence number is dropped (the caller adds its own NN).
    label = re.sub(r"^\s*\d{1,2}[\.\)]\s*", "", label)
    label = re.sub(r"\s+", " ", label).strip()

    if not label:
        return ""
    if re.fullmatch(r"[0-9A-Za-z]{16,}", label):
        return ""
    if label.lower() in {"studijní materiál", "zaznam", "záznam", "video"}:
        return ""
    return label
|
||
|
||
|
||
def clip(stem):
    """Shorten an over-long name (without ".mp4") to at most MAX_NAME chars.

    After cutting, trailing spaces, dashes and dots are removed.
    """
    if len(stem) > MAX_NAME:
        return stem[:MAX_NAME].rstrip(" -.")
    return stem
|
||
|
||
|
||
def filer_url(filer, seaweed_path):
    """Build the download URL for *seaweed_path* on the filer at *filer*.

    Each path segment is percent-encoded separately so the "/" separators
    stay intact.  Leading slashes of *seaweed_path* are dropped first, so an
    absolute path does not produce an empty first segment ("host//path").
    """
    segments = seaweed_path.lstrip("/").split("/")
    enc = "/".join(quote(p) for p in segments)
    return filer.rstrip("/") + "/" + enc
|
||
|
||
|
||
SLICE = 16 << 20  # 16 MiB per Range request (a full GET from the filer is pathologically slow)


def download_resumable(url, dst, size, retries=8):
    """Download *url* to *dst* in Range slices via a ".part" file.

    (Author's note: the filer serves a plain full GET roughly 50x slower.)
    After an interruption the download continues where the ".part" file
    ends.  The ".part" file is renamed to *dst* only when its size equals
    *size*.

    Args:
        url: source URL.
        dst: pathlib.Path of the final file.
        size: expected size in bytes.
        retries: number of consecutive failed attempts tolerated.

    Returns:
        (ok, last_error) - *ok* is True when *dst* was written; *last_error*
        is the most recent exception, or None if none occurred.
    """
    tmp = dst.with_suffix(".part")
    have = tmp.stat().st_size if tmp.exists() else 0
    if have > size:  # damaged leftover -> start from scratch
        tmp.unlink()
        have = 0
    last = None
    fails = 0
    while have < size:
        end = min(have + SLICE, size) - 1
        try:
            with requests.get(url, headers={"Range": f"bytes={have}-{end}"},
                              stream=True, timeout=(15, 90)) as r:
                if r.status_code not in (200, 206):
                    r.raise_for_status()
                    # Not an HTTP error, but not content we may append either
                    # (e.g. 204): never write such a body into the .part file.
                    raise requests.HTTPError(
                        f"unexpected HTTP status {r.status_code}", response=r)
                if r.status_code == 200:  # filer ignored Range -> whole file
                    tmp.unlink(missing_ok=True)
                    with open(tmp, "wb") as f:
                        for chunk in r.iter_content(1 << 20):
                            f.write(chunk)
                    break
                with open(tmp, "ab") as f:
                    for chunk in r.iter_content(1 << 20):
                        f.write(chunk)
            got = tmp.stat().st_size
            if got <= have:
                # A slice that delivered nothing must count as a failure,
                # otherwise the loop would spin forever without progress.
                raise OSError(f"no data received for bytes {have}-{end}")
            have = got
            fails = 0  # slice succeeded -> reset the retry counter
        except Exception as e:
            last = e
            fails += 1
            if fails > retries:
                break
            time.sleep(min(2 ** fails, 20))
            have = tmp.stat().st_size if tmp.exists() else 0
    ok = tmp.exists() and tmp.stat().st_size == size
    if ok:
        tmp.replace(dst)
    elif last is None:
        # Failed without an exception (size mismatch): give the caller a reason.
        got = tmp.stat().st_size if tmp.exists() else "no file"
        last = OSError(f"size mismatch: expected {size} bytes, have {got}")
    return ok, last
|
||
|
||
|
||
def _unique_names(plan):
    """Return *plan* sorted by file name, with colliding names made unique.

    Names are compared case-insensitively.  The first entry (ordered by file
    name, then source path) keeps its name; later ones get " [2]", " [3]", ...
    inserted before ".mp4", with the stem shortened so it stays within
    MAX_NAME.
    """
    taken = set()
    result = []
    for path, fn, size in sorted(plan, key=lambda x: (x[1], str(x[0]))):
        stem = fn[:-len(".mp4")]
        candidate = fn
        n = 1
        while candidate.casefold() in taken:
            n += 1
            tag = f" [{n}]"
            candidate = stem[:MAX_NAME - len(tag)].rstrip(" -.") + tag + ".mp4"
        taken.add(candidate.casefold())
        result.append((path, candidate, size))
    result.sort(key=lambda x: x[1])
    return result


def build_plan(db):
    """Build the export plan from the Mongo database *db*.

    Reads all documents of ``db.kurzy`` and those ``db.materialy`` documents
    with ``druh == "video"`` and a non-empty ``seaweed_fids``, groups the
    videos by ``kurz_id`` and derives a target file name for each:

      * one video in the course : "<title> - <surname> (<year>).mp4"
      * several videos          : "<title> - <NN> <segment> (<year>).mp4"

    Author and year are left out when unknown.  Names that would collide
    (identical title/author/year, or truncation by ``clip``) are made unique,
    because two plan entries sharing one target would overwrite each other
    and be downloaded again on every run.

    Returns:
        (by_course, plan) - *by_course* maps kurz_id to its video documents;
        *plan* is a list of (seaweed_path, filename, size) sorted by filename.
    """
    kurzy = {k["_id"]: k for k in db.kurzy.find({})}
    vids = list(db.materialy.find(
        {"druh": "video", "seaweed_fids": {"$exists": True, "$ne": []}},
        {"kurz_id": 1, "soubor": 1, "seaweed_path": 1, "seaweed_size": 1,
         "klic": 1}))

    by_course = {}
    for v in vids:
        by_course.setdefault(v["kurz_id"], []).append(v)

    plan = []  # (seaweed_path, filename, size)
    for kid, items in by_course.items():
        k = kurzy.get(kid, {})
        # Fall back to the first file name, then to the course id; str() keeps
        # sanitize() working when the id is not a string.
        nazev = sanitize(k.get("nazev") or items[0].get("soubor") or str(kid))
        autor = surname(k.get("autor"))
        dp = k.get("datum_publikace")
        rok = dp.year if isinstance(dp, datetime) else None
        ystr = f" ({rok})" if rok else ""

        if len(items) == 1:
            v = items[0]
            who = f" - {autor}" if autor else ""
            fn = clip(sanitize(f"{nazev}{who}{ystr}")) + ".mp4"
            plan.append((v["seaweed_path"], fn, v["seaweed_size"]))
        else:
            for i, v in enumerate(order_items(items), 1):
                lbl = seg_label(v.get("soubor", ""), k.get("nazev") or "")
                mid = f" - {i:02d} {lbl}" if lbl else f" - {i:02d}"
                fn = clip(sanitize(f"{nazev}{mid}{ystr}")) + ".mp4"
                plan.append((v["seaweed_path"], fn, v["seaweed_size"]))

    return by_course, _unique_names(plan)
|
||
|
||
|
||
def main():
    """Command-line entry point: build the export plan and download it.

    Options: --dry-run (print the plan only), --dest, --mongo, --filer and
    --limit N (stop after N successful downloads).  Files already present
    with the expected size are skipped.  Every download result and the final
    summary are appended to "_export_log.txt" in the destination folder.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--dry-run", action="store_true")
    ap.add_argument("--dest", default=DEF_DEST)
    ap.add_argument("--mongo", default=DEF_MONGO)
    ap.add_argument("--filer", default=DEF_FILER)
    ap.add_argument("--limit", type=int, default=0, help="stahne jen N souboru")
    args = ap.parse_args()

    # The database is only needed to build the plan; close the client right
    # away instead of keeping the connection open for the whole download.
    cli = MongoClient(args.mongo, serverSelectionTimeoutMS=5000)
    try:
        by_course, plan = build_plan(cli["EUNI"])
    finally:
        cli.close()
    total = sum(p[2] for p in plan)
    print(f"Kurzu s videem: {len(by_course)} | souboru k exportu: {len(plan)} "
          f"| celkem {total/1024**3:.1f} GiB\n")

    if args.dry_run:
        for _path, fn, size in plan:
            print(f"{size/1024**2:8.1f} MB {fn}")
        print(f"\n[DRY-RUN] nic nestazeno. Celkem {len(plan)} souboru, "
              f"{total/1024**3:.1f} GiB.")
        return

    dest = Path(args.dest)
    dest.mkdir(parents=True, exist_ok=True)
    log_path = dest / "_export_log.txt"

    # Preflight: can the filer be reached at all?
    try:
        requests.get(args.filer, timeout=8).close()
        print(f"Filer OK: {args.filer}\n", flush=True)
    except Exception as e:
        sys.exit(f"NEDOSTUPNY FILER {args.filer} :: {e}\n"
                 f"Zkontroluj sit / VPN na 192.168.1.50:8888 z tohoto stroje.")

    done = skipped = failed = 0
    dl_bytes = 0
    t0 = time.time()
    with open(log_path, "a", encoding="utf-8") as log:
        log.write(f"\n=== RUN {datetime.now():%Y-%m-%d %H:%M} | "
                  f"{len(plan)} planned | dest={dest} ===\n")
        for n, (path, fn, size) in enumerate(plan, 1):
            dst = dest / fn
            if dst.exists() and dst.stat().st_size == size:
                skipped += 1  # already complete from an earlier run
                continue
            print(f"[{n}/{len(plan)}] ↓ {size/1024**2:.1f}MB {fn}", flush=True)
            url = filer_url(args.filer, path)
            ts = time.time()
            ok, err = download_resumable(url, dst, size)
            if ok:
                done += 1
                dl_bytes += size
                sp = size / 1024**2 / max(time.time() - ts, 0.1)
                msg = (f"[{n}/{len(plan)}] OK {size/1024**2:.1f}MB "
                       f"({sp:.1f} MB/s) {fn}")
            else:
                failed += 1
                msg = f"[{n}/{len(plan)}] FAIL {fn} :: {err}"
            print(msg, flush=True)
            log.write(msg + "\n")
            log.flush()  # keep the log usable if the run is interrupted
            if args.limit and done >= args.limit:
                break
        dt = time.time() - t0
        summary = (f"HOTOVO: {done} stazeno ({dl_bytes/1024**3:.1f} GiB), "
                   f"{skipped} preskoceno, {failed} chyb, {dt/60:.1f} min")
        print("\n" + summary)
        log.write(summary + "\n")


if __name__ == "__main__":
    main()
|