Files
janssen/EUNI/plex_export.py
2026-06-19 11:28:11 +02:00

304 lines
11 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
r"""
EUNI -> Plex (Other Videos) export.
Stahuje videa z SeaweedFS fileru na Plex share a pojmenuje je pro knihovnu EUNI.
Schema nazvu:
single-video kurz : "<Nazev kurzu> - <Prijmeni> (<rok>).mp4"
multi-video kurz : "<Nazev kurzu> - <NN> <segment> (<rok>).mp4" (autor vynechan)
Zavislosti:
pip install pymongo requests
Pouziti:
python plex_export.py --dry-run # jen vypise plan, nic nestahuje
python plex_export.py # stahuje na default cestu (\\tower\PlexBinHex\EUNI)
python plex_export.py --dest D:\plex\EUNI # jina cilova slozka
python plex_export.py --limit 5 # stahne jen prvnich 5 (test)
Skript je idempotentni: stahuje pres .part, overuje velikost, hotove preskakuje.
Kdyz spadne nebo ho prerusis (Ctrl-C), staci spustit znovu a dojede zbytek.
Cilovy stroj potrebuje sit na: Mongo 192.168.1.76, filer 192.168.1.50:8888 a cilovy share.
"""
import argparse
import re
import sys
import time
from datetime import datetime
from pathlib import Path
from urllib.parse import quote
import requests
from pymongo import MongoClient
# --- make Czech text print on a Windows console (cp1252) without crashing ---
for _s in (sys.stdout, sys.stderr):
    try:
        _s.reconfigure(encoding="utf-8", errors="replace")
    except Exception:
        # Best effort: leave the stream as it is if it cannot be reconfigured.
        pass
# Defaults for the --mongo, --filer and --dest command-line options.
DEF_MONGO = "192.168.1.76"
DEF_FILER = "http://192.168.1.50:8888/"
DEF_DEST = r"\\tower\PlexBinHex\EUNI"
MAX_NAME = 190 # safe file-name length (below MAX_PATH)
# Lower-cased title tokens (trailing dot removed) that surname() drops from
# the front of an author string.
TITLES = {"prof", "doc", "prim", "dr", "mudr", "mvdr", "pharmdr", "phdr",
          "rndr", "mgr", "bc", "msc", "md", "et", "ing"}
# Lower-cased degree tokens that surname() drops from the end of an author
# string.
DEGREES = {"ph.d.", "ph.d", "phd.", "phd", "csc.", "csc", "drsc.", "drsc",
           "mba", "msc.", "msc", "m.d.", "md", "febo", "fesc", "fesc.", "feso",
           "fean", "mha", "ph.", "d.", "fcma", "facp", "fefim", "frsph",
           "febtm", "febu", "agaf", "dr."}
# Characters that sanitize() removes from generated file names.
ILLEGAL = re.compile(r'[\\/:*?"<>|]')
# plnosirkove varianty znaku z puvodnich nazvu (vimeo/youtube downloader)
FULLWIDTH = str.maketrans({"": "", "": " ", "": "-", "": "",
"": "", "": "", "": "-", "": ""})
# Trailing "[...]" tag (letters, digits, "_" and "-") at the very end of a name.
# NOTE(review): presumably the source-video id appended by the downloader - confirm.
VIMEO_TAIL = re.compile(r"\s*\[[0-9A-Za-z_\-]+\]\s*$")
def surname(autor):
    """Return the surname part of an author string, or None if nothing is left.

    Only the text before the first comma is considered. Leading tokens found
    in TITLES and trailing tokens found in DEGREES are dropped. A single
    remaining token is returned as is; otherwise everything after the first
    remaining token (the given name) is returned, so double surnames survive.
    """
    if not autor:
        return None
    words = autor.split(",")[0].split()
    start, stop = 0, len(words)
    # Skip titles at the front ("prof.", "MUDr.", ...).
    while start < stop and words[start].lower().strip(".") in TITLES:
        start += 1
    # Skip degrees at the back ("Ph.D.", "CSc.", ...).
    while stop > start and words[stop - 1].lower().strip(",") in DEGREES:
        stop -= 1
    core = words[start:stop]
    if not core:
        return None
    if len(core) == 1:
        return core[0]
    return " ".join(core[1:])
def sanitize(name):
    """Return *name* cleaned up for use as a file name.

    Applies the FULLWIDTH translation table, removes every character matched
    by ILLEGAL, collapses whitespace runs to one space, strips the ends and
    finally removes trailing dots and spaces.
    """
    cleaned = ILLEGAL.sub("", name.translate(FULLWIDTH))
    collapsed = re.sub(r"\s+", " ", cleaned)
    return collapsed.strip().rstrip(". ")
def seq_num(soubor):
    """Extract an explicit sequence number (1-30) from a file name, or None.

    Works on the base name with the trailing "[...]" tag and any ".mp4"
    removed. Patterns are tried in order; for each pattern only its first
    match is examined, and a number outside 1..30 makes the search move on
    to the next pattern.
    """
    stem = Path(soubor.replace("\\", "/")).name
    stem = VIMEO_TAIL.sub("", stem)
    stem = re.sub(r"\.mp4", "", stem, flags=re.I)
    patterns = (
        r"^\s*(\d{1,2})[\.\)]",  # "1. Tonometrie", "2) ..."
        r"[ _]p(\d{1,2})\b",  # "p01 ...", "_p02_"
        r"[ _](\d{1,2})[ _]",  # "TABAK_01_", "Meluzinova 3 "
        r"\b(?:cast|část|díl|dil|part)[ _]*(\d{1,2})\b",
        r"[ _](\d{1,2})$",  # "... 2"
    )
    for pattern in patterns:
        hit = re.search(pattern, stem, flags=re.I)
        if hit is None:
            continue
        number = int(hit.group(1))
        if 1 <= number <= 30:
            return number
    return None
def src_id(klic):
    """Return the numeric id following "vimeo:" in *klic* as an int, else None.

    A falsy *klic* (None or empty) is treated as an empty string. The original
    author notes the id roughly follows upload order.
    """
    found = re.search(r"vimeo:(\d+)", klic or "")
    if found is None:
        return None
    return int(found.group(1))
def order_items(items):
    """Sort the videos of one course into a logical order.

    If every item yields a sequence number via seq_num() and the numbers are
    all distinct, items are ordered by that number. Otherwise items with a
    vimeo id (see src_id) come first, ordered by id, followed by the rest
    ordered by their "soubor" value.
    """
    numbered = [(seq_num(it.get("soubor", "")), it) for it in items]
    nums = [num for num, _ in numbered]
    if None not in nums and len(set(nums)) == len(nums):
        ranked = sorted(numbered, key=lambda pair: pair[0])
        return [it for _, it in ranked]

    # Fallback: upload order on vimeo, otherwise alphabetical by file name.
    def upload_key(it):
        sid = src_id(it.get("klic"))
        if sid is None:
            return (1, 0, it.get("soubor", ""))
        return (0, sid, "")

    return sorted(items, key=upload_key)
def seg_label(soubor, nazev):
    """Derive a short segment label from a file name, or "" if none is usable.

    Takes the base name of *soubor*, removes ".mp4" and the trailing "[...]"
    tag, strips the first matching course-name prefix built from *nazev*
    (compared case-insensitively), turns underscores into spaces, drops a
    leading "NN." / "NN)" number and collapses whitespace. Returns "" when
    the result is empty, is a run of 16+ ASCII alphanumerics, or is one of a
    few generic words.
    """
    label = Path(soubor.replace("\\", "/")).name
    label = re.sub(r"\.mp4\s*", " ", label, flags=re.I)
    label = VIMEO_TAIL.sub("", label).strip()

    prefixes = (
        f"EUNI kurz - {nazev} - studijní materiál -",
        f"EUNI kurz - {nazev} - studijní materiál",
        f"EUNI kurz - {nazev} -",
        f"EUNI kurz - {nazev}",
        f"{nazev} -",
        f"{nazev}-",
        nazev,
    )
    lowered = label.lower()
    matched = next((p for p in prefixes if lowered.startswith(p.lower())), None)
    if matched is not None:
        label = label[len(matched):].strip(" -")

    label = label.replace("_", " ").strip()
    # Drop a duplicated sequence number; the caller adds its own NN.
    label = re.sub(r"^\s*\d{1,2}[\.\)]\s*", "", label)
    label = re.sub(r"\s+", " ", label).strip()

    generic = {"studijní materiál", "zaznam", "záznam", "video"}
    if not label:
        return ""
    if re.fullmatch(r"[0-9A-Za-z]{16,}", label):
        return ""
    if label.lower() in generic:
        return ""
    return label
def clip(stem):
    """Shorten an over-long name (without ".mp4") to at most MAX_NAME chars.

    When truncation happens, trailing spaces, dashes and dots are removed
    from the cut result; shorter names are returned unchanged.
    """
    if len(stem) <= MAX_NAME:
        return stem
    return stem[:MAX_NAME].rstrip(" -.")
def filer_url(filer, seaweed_path):
    """Join the filer base URL with a percent-encoded *seaweed_path*.

    Each "/"-separated segment of the path is quoted on its own; trailing
    slashes of *filer* are dropped before a single "/" is inserted.
    """
    segments = [quote(part) for part in seaweed_path.split("/")]
    base = filer.rstrip("/")
    return f"{base}/{'/'.join(segments)}"
SLICE = 16 << 20 # 16 MiB per Range request (author's note: a full GET from the filer is pathologically slow)
def download_resumable(url, dst, size, retries=8):
    """Download *url* into *dst* in Range slices, resuming from a ".part" file.

    The file is fetched SLICE bytes at a time (the original author notes the
    filer serves a plain full GET far slower) and appended to a sibling
    ".part" file, which is renamed to *dst* only once its size equals *size*.
    After an error the download continues from the bytes already on disk.

    Args:
        url: Full URL of the file on the filer.
        dst: pathlib.Path of the final file.
        size: Expected size in bytes; used both for ranges and verification.
        retries: Number of consecutive failures tolerated before giving up.

    Returns:
        (ok, last_error): ok is True when *dst* was written with the expected
        size; last_error is the last exception seen (None if there was none).
        On failure last_error is always set.
    """
    tmp = dst.with_suffix(".part")
    have = tmp.stat().st_size if tmp.exists() else 0
    if have > size:  # corrupt leftover -> start from scratch
        tmp.unlink()
        have = 0
    last = None
    fails = 0
    while have < size:
        end = min(have + SLICE, size) - 1
        try:
            with requests.get(url, headers={"Range": f"bytes={have}-{end}"},
                              stream=True, timeout=(15, 90)) as r:
                if r.status_code not in (206, 200):
                    r.raise_for_status()
                    # raise_for_status() only raises for 4xx/5xx; any other
                    # status carries no requested bytes, so never append it.
                    raise OSError(f"unexpected HTTP status {r.status_code}")
                if r.status_code == 200:  # filer ignored Range -> whole file
                    tmp.unlink(missing_ok=True)
                    with open(tmp, "wb") as f:
                        for chunk in r.iter_content(1 << 20):
                            f.write(chunk)
                    break
                with open(tmp, "ab") as f:
                    for chunk in r.iter_content(1 << 20):
                        f.write(chunk)
            got = tmp.stat().st_size
            if got <= have:
                # An empty slice must count as a failure; otherwise the retry
                # counter is reset without progress and the loop never ends.
                raise OSError(f"no data received for bytes {have}-{end}")
            have = got
            fails = 0  # slice succeeded -> reset the retry counter
        except Exception as e:
            last = e
            fails += 1
            if fails > retries:
                break
            time.sleep(min(2 ** fails, 20))
            have = tmp.stat().st_size if tmp.exists() else 0
    ok = tmp.exists() and tmp.stat().st_size == size
    if ok:
        tmp.replace(dst)
    elif last is None:
        # Finished without an exception but with the wrong size (short 200
        # body, overshoot, or nothing to fetch): give the caller a reason.
        if tmp.exists():
            last = OSError(f"size mismatch: .part has {tmp.stat().st_size} "
                           f"bytes, expected {size}")
        else:
            last = OSError("nothing downloaded (no .part file)")
    return ok, last
def build_plan(db):
    """Build the export plan from the "kurzy" and "materialy" collections.

    Videos (druh == "video" with a non-empty "seaweed_fids") are grouped by
    "kurz_id". A course with one video is named "<course> - <surname> (<year>)";
    a course with several is named "<course> - <NN> <segment> (<year>)" in the
    order given by order_items(). Missing parts are simply left out.

    Returns:
        (by_course, plan): by_course maps kurz_id to its list of video
        documents; plan is a list of (seaweed_path, filename, size) tuples
        sorted by filename.
    """
    courses = {doc["_id"]: doc for doc in db.kurzy.find({})}
    query = {"druh": "video", "seaweed_fids": {"$exists": True, "$ne": []}}
    fields = {"kurz_id": 1, "soubor": 1, "seaweed_path": 1, "seaweed_size": 1,
              "klic": 1}
    videos = list(db.materialy.find(query, fields))

    by_course = {}
    for video in videos:
        bucket = by_course.get(video["kurz_id"])
        if bucket is None:
            bucket = by_course[video["kurz_id"]] = []
        bucket.append(video)

    plan = []  # (seaweed_path, filename, size)
    for kid, items in by_course.items():
        course = courses.get(kid, {})
        raw_title = course.get("nazev")
        nazev = sanitize(raw_title or items[0].get("soubor", kid))
        autor = surname(course.get("autor"))
        published = course.get("datum_publikace")
        rok = published.year if isinstance(published, datetime) else None
        ystr = f" ({rok})" if rok else ""

        if len(items) == 1:
            only = items[0]
            who = f" - {autor}" if autor else ""
            fn = clip(sanitize(f"{nazev}{who}{ystr}")) + ".mp4"
            plan.append((only["seaweed_path"], fn, only["seaweed_size"]))
            continue

        # Multi-video course: the author is omitted, segments are numbered.
        for idx, video in enumerate(order_items(items), 1):
            lbl = seg_label(video.get("soubor", ""), raw_title or "")
            mid = f" - {idx:02d} {lbl}" if lbl else f" - {idx:02d}"
            fn = clip(sanitize(f"{nazev}{mid}{ystr}")) + ".mp4"
            plan.append((video["seaweed_path"], fn, video["seaweed_size"]))

    plan.sort(key=lambda entry: entry[1])
    return by_course, plan
def main():
    """Command-line entry point: plan the export, then download each file.

    With --dry-run only the plan is printed. Otherwise the destination
    folder is created, the filer is probed once, and every planned file is
    downloaded unless a file of the expected size already exists. Results
    are appended to "_export_log.txt" in the destination folder.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--dry-run", action="store_true")
    parser.add_argument("--dest", default=DEF_DEST)
    parser.add_argument("--mongo", default=DEF_MONGO)
    parser.add_argument("--filer", default=DEF_FILER)
    parser.add_argument("--limit", type=int, default=0, help="stahne jen N souboru")
    args = parser.parse_args()

    client = MongoClient(args.mongo, serverSelectionTimeoutMS=5000)
    by_course, plan = build_plan(client["EUNI"])
    count = len(plan)
    total = sum(entry[2] for entry in plan)
    print(f"Kurzu s videem: {len(by_course)} | souboru k exportu: {count} "
          f"| celkem {total/1024**3:.1f} GiB\n")

    if args.dry_run:
        for _path, fn, size in plan:
            print(f"{size/1024**2:8.1f} MB {fn}")
        print(f"\n[DRY-RUN] nic nestazeno. Celkem {count} souboru, "
              f"{total/1024**3:.1f} GiB.")
        return

    dest = Path(args.dest)
    dest.mkdir(parents=True, exist_ok=True)
    log_path = dest / "_export_log.txt"

    # Preflight: can the filer be reached at all?
    try:
        requests.get(args.filer, timeout=8)
        print(f"Filer OK: {args.filer}\n", flush=True)
    except Exception as e:
        sys.exit(f"NEDOSTUPNY FILER {args.filer} :: {e}\n"
                 f"Zkontroluj sit / VPN na 192.168.1.50:8888 z tohoto stroje.")

    done = 0
    skipped = 0
    failed = 0
    dl_bytes = 0
    started = time.time()
    with open(log_path, "a", encoding="utf-8") as log:
        log.write(f"\n=== RUN {datetime.now():%Y-%m-%d %H:%M} | "
                  f"{count} planned | dest={dest} ===\n")
        for n, (path, fn, size) in enumerate(plan, 1):
            dst = dest / fn
            # Already exported with the expected size -> nothing to do.
            if dst.exists() and dst.stat().st_size == size:
                skipped += 1
                continue

            tag = f"[{n}/{count}]"
            mib = size / 1024**2
            print(f"{tag} ↓ {mib:.1f}MB {fn}", flush=True)
            url = filer_url(args.filer, path)
            file_started = time.time()
            ok, err = download_resumable(url, dst, size)
            if not ok:
                failed += 1
                msg = f"{tag} FAIL {fn} :: {err}"
            else:
                done += 1
                dl_bytes += size
                speed = mib / max(time.time() - file_started, 0.1)
                msg = f"{tag} OK {mib:.1f}MB ({speed:.1f} MB/s) {fn}"
            print(msg, flush=True)
            log.write(msg + "\n")
            log.flush()
            # --limit counts successful downloads only.
            if args.limit and done >= args.limit:
                break

        elapsed = time.time() - started
        summary = (f"HOTOVO: {done} stazeno ({dl_bytes/1024**3:.1f} GiB), "
                   f"{skipped} preskoceno, {failed} chyb, {elapsed/60:.1f} min")
        print("\n" + summary)
        log.write(summary + "\n")
# Run the export only when executed as a script, not when imported.
if __name__ == "__main__":
    main()