Files
janssen/EUNI/plex_export.py
T
2026-06-19 05:31:31 +02:00

266 lines
9.8 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
r"""
EUNI -> Plex (Other Videos) export.
Stahuje videa z SeaweedFS fileru na Plex share a pojmenuje je pro knihovnu EUNI.
Schema nazvu:
single-video kurz : "<Nazev kurzu> - <Prijmeni> (<rok>).mp4"
multi-video kurz : "<Nazev kurzu> - <NN> <segment> (<rok>).mp4" (autor vynechan)
Zavislosti:
pip install pymongo requests
Pouziti:
python plex_export.py --dry-run # jen vypise plan, nic nestahuje
python plex_export.py # stahuje na default cestu (\\tower\PlexBinHex\EUNI)
python plex_export.py --dest D:\plex\EUNI # jina cilova slozka
python plex_export.py --limit 5 # stahne jen prvnich 5 (test)
Skript je idempotentni: stahuje pres .part, overuje velikost, hotove preskakuje.
Kdyz spadne nebo ho prerusis (Ctrl-C), staci spustit znovu a dojede zbytek.
Cilovy stroj potrebuje sit na: Mongo 192.168.1.76, filer 192.168.1.50:8888 a cilovy share.
"""
import argparse
import re
import sys
import time
from datetime import datetime
from pathlib import Path
from urllib.parse import quote
import requests
from pymongo import MongoClient
# --- let Czech text print on a Windows console (cp1252) without crashing ---
for _s in (sys.stdout, sys.stderr):
    try:
        _s.reconfigure(errors="replace", encoding="utf-8")
    except Exception:
        # Best effort only: a stream that cannot be reconfigured is left
        # exactly as it was.
        continue
# Defaults for the --mongo, --filer and --dest command-line options.
DEF_MONGO = "192.168.1.76"
DEF_FILER = "http://192.168.1.50:8888/"
DEF_DEST = r"\\tower\PlexBinHex\EUNI"
MAX_NAME = 190 # safe file-name length (below MAX_PATH); used by clip()
# Lower-cased tokens (dots stripped) that surname() drops from the FRONT of
# an author string.
TITLES = {"prof", "doc", "prim", "dr", "mudr", "mvdr", "pharmdr", "phdr",
"rndr", "mgr", "bc", "msc", "md", "et", "ing"}
# Lower-cased tokens that surname() drops from the END of an author string.
DEGREES = {"ph.d.", "ph.d", "phd.", "phd", "csc.", "csc", "drsc.", "drsc",
"mba", "msc.", "msc", "m.d.", "md", "febo", "fesc", "fesc.", "feso",
"fean", "mha", "ph.", "d.", "fcma", "facp", "fefim", "frsph",
"febtm", "febu", "agaf", "dr."}
# Characters that sanitize() removes from file names.
ILLEGAL = re.compile(r'[\\/:*?"<>|]')
# full-width variants of characters from the original names (vimeo/youtube downloader)
# NOTE(review): in this view every key below is an empty string, and
# str.maketrans() only accepts string keys of length 1. Presumably the raw
# file holds one full-width character per key and they were lost when the
# file was rendered - confirm against the raw file before running.
FULLWIDTH = str.maketrans({"": "", "": " ", "": "-", "": "",
"": "", "": "", "": "-", "": ""})
# Trailing "[<id>]" tag (letters, digits, "_" or "-") at the end of a name.
VIMEO_TAIL = re.compile(r"\s*\[[0-9A-Za-z_\-]+\]\s*$")
def surname(autor):
    """Extract the surname part from a course author string.

    Only the text before the first comma is considered. Leading tokens that
    appear in TITLES (compared lower-cased, surrounding dots stripped) and
    trailing tokens that appear in DEGREES (compared lower-cased, surrounding
    commas stripped) are discarded. Of what is left, the first token is
    treated as the given name and everything after it is returned, so
    multi-word surnames stay intact.

    Returns None when ``autor`` is empty or nothing but titles/degrees
    remains; a single remaining token is returned unchanged.
    """
    if not autor:
        return None
    words = autor.split(",")[0].strip().split()
    first, last = 0, len(words)
    # Trim by moving two indices instead of popping from the list.
    while first < last and words[first].lower().strip(".") in TITLES:
        first += 1
    while last > first and words[last - 1].lower().strip(",") in DEGREES:
        last -= 1
    core = words[first:last]
    if not core:
        return None
    if len(core) == 1:
        return core[0]
    # Everything after the given name -> keeps double surnames together.
    return " ".join(core[1:])
def sanitize(name):
    """Turn arbitrary text into a string usable as a file name.

    Applies the FULLWIDTH translation table, removes every character matched
    by ILLEGAL, collapses whitespace runs to a single space, trims both ends
    and finally strips trailing dots and spaces.
    """
    cleaned = ILLEGAL.sub("", name.translate(FULLWIDTH))
    collapsed = re.sub(r"\s+", " ", cleaned)
    return collapsed.strip().rstrip(". ")
def seq_num(soubor):
    """Return the explicit sequence number (1-30) found in a file name, or None.

    Only the last path component is inspected (backslashes count as path
    separators); a trailing "[id]" tag and every ".mp4" are removed first.
    The patterns are tried in order and the first one that yields a number
    in the range 1..30 wins.
    """
    stem = Path(soubor.replace("\\", "/")).name
    stem = VIMEO_TAIL.sub("", stem)
    stem = re.sub(r"\.mp4", "", stem, flags=re.I)
    patterns = (
        r"^\s*(\d{1,2})[\.\)]",  # "1. Tonometrie", "2) ..."
        r"[ _]p(\d{1,2})\b",  # " p01 ...", "_p02_"
        r"[ _](\d{1,2})[ _]",  # "TABAK_01_", "Meluzinova 3 "
        r"\b(?:cast|část|díl|dil|part)[ _]*(\d{1,2})\b",
        r"[ _](\d{1,2})$",  # "... 2"
    )
    for pattern in patterns:
        hit = re.search(pattern, stem, flags=re.I)
        if hit is None:
            continue
        number = int(hit.group(1))
        # An out-of-range match does not end the search; later patterns
        # still get their chance.
        if 1 <= number <= 30:
            return number
    return None
def src_id(klic):
    """Return the numeric id following "vimeo:" in ``klic``, or None.

    A missing/empty key yields None. order_items() uses this id as a
    fallback sort key (the original author treats it as roughly the upload
    order).
    """
    found = re.search(r"vimeo:(\d+)", klic or "")
    if found is None:
        return None
    return int(found.group(1))
def order_items(items):
    """Sort the videos of one course into a logical order.

    When every item carries an explicit sequence number (see seq_num) and
    the numbers are all distinct, the items are ordered by that number.
    Otherwise items with a vimeo id come first, ordered by id, followed by
    the remaining items ordered alphabetically by their "soubor" value.
    """
    numbers = [seq_num(item.get("soubor", "")) for item in items]
    if None not in numbers and len(set(numbers)) == len(numbers):
        ranked = sorted(zip(numbers, items), key=lambda pair: pair[0])
        return [item for _, item in ranked]

    # Fallback: vimeo id order, otherwise alphabetical by file name.
    def fallback_key(item):
        vimeo_id = src_id(item.get("klic"))
        if vimeo_id is None:
            return (1, 0, item.get("soubor", ""))
        return (0, vimeo_id, "")

    return sorted(items, key=fallback_key)
def seg_label(soubor, nazev):
    """Derive a short segment label for one video of a multi-video course.

    Starts from the last path component of ``soubor``, replaces ".mp4" (and
    whitespace after it) with a space, drops a trailing "[id]" tag, removes
    the first matching course-name prefix (case-insensitive), turns
    underscores into spaces, strips a leading "NN." / "NN)" counter and
    collapses whitespace.

    Returns "" when nothing meaningful is left: an empty string, a single
    alphanumeric run of 16+ characters, or a generic placeholder word.
    """
    label = Path(soubor.replace("\\", "/")).name
    label = re.sub(r"\.mp4\s*", " ", label, flags=re.I)
    label = VIMEO_TAIL.sub("", label).strip()

    # Longest/most specific prefixes first; only the first hit is removed.
    prefixes = (
        f"EUNI kurz - {nazev} - studijní materiál -",
        f"EUNI kurz - {nazev} - studijní materiál",
        f"EUNI kurz - {nazev} -",
        f"EUNI kurz - {nazev}",
        f"{nazev} -",
        f"{nazev}-",
        nazev,
    )
    lowered = label.lower()
    for prefix in prefixes:
        if lowered.startswith(prefix.lower()):
            label = label[len(prefix):].strip(" -")
            break

    label = label.replace("_", " ").strip()
    # Duplicate sequence counter (the caller adds its own NN).
    label = re.sub(r"^\s*\d{1,2}[\.\)]\s*", "", label)
    label = re.sub(r"\s+", " ", label).strip()

    placeholders = {"studijní materiál", "zaznam", "záznam", "video"}
    if not label:
        return ""
    if re.fullmatch(r"[0-9A-Za-z]{16,}", label):
        return ""
    if label.lower() in placeholders:
        return ""
    return label
def clip(stem):
    """Shorten an over-long name (without ".mp4") to at most MAX_NAME characters.

    Names that already fit are returned unchanged; a truncated name has
    trailing spaces, dashes and dots removed.
    """
    if len(stem) <= MAX_NAME:
        return stem
    return stem[:MAX_NAME].rstrip(" -.")
def filer_url(filer, seaweed_path):
    """Build the download URL of a stored file.

    Args:
        filer: Base URL of the filer, with or without a trailing slash.
        seaweed_path: Path of the file; "/" separates its segments.

    Returns:
        ``filer`` joined with the path by exactly one "/", each path segment
        percent-encoded on its own so that "/" keeps its separator role.
    """
    # A leading "/" would otherwise produce an empty first segment and thus
    # a "//" right after the host part of the URL.
    segments = seaweed_path.lstrip("/").split("/")
    encoded = "/".join(quote(segment) for segment in segments)
    return filer.rstrip("/") + "/" + encoded
def _dedupe_filenames(plan):
    """Return the plan sorted by file name, with colliding names made unique."""
    taken = set()
    unique = []
    # Walk the entries in (file name, source path) order so that, for an
    # unchanged set of entries, the same source gets the same suffix on
    # every run.
    for path, fn, size in sorted(plan, key=lambda e: (e[1], str(e[0]))):
        stem, ext = fn[:-4], fn[-4:]  # every planned name ends in ".mp4"
        candidate, n = fn, 1
        # Compared lower-cased so names differing only in case also count
        # as a collision.
        while candidate.lower() in taken:
            n += 1
            candidate = f"{stem} [{n}]{ext}"
        taken.add(candidate.lower())
        unique.append((path, candidate, size))
    unique.sort(key=lambda e: e[1])
    return unique


def build_plan(db):
    """Build the export plan from the ``kurzy`` and ``materialy`` collections.

    Videos (``druh == "video"`` with a non-empty ``seaweed_fids``) are
    grouped by ``kurz_id`` and named as follows:

    * single-video course: "<course name> - <surname> (<year>).mp4"
    * multi-video course:  "<course name> - <NN> <segment> (<year>).mp4"

    Author and year are omitted when unavailable. Names that would collide
    (for example identical title/author/year, or long titles clipped to the
    same stem) get a " [n]" suffix; otherwise they would overwrite each
    other in the destination folder.

    Args:
        db: Database object exposing the ``kurzy`` and ``materialy``
            collections.

    Returns:
        ``(by_course, plan)`` where ``by_course`` maps a course id to its
        list of video documents and ``plan`` is a list of
        ``(seaweed_path, filename, size)`` tuples sorted by file name.
    """
    kurzy = {k["_id"]: k for k in db.kurzy.find({})}
    vids = db.materialy.find(
        {"druh": "video", "seaweed_fids": {"$exists": True, "$ne": []}},
        {"kurz_id": 1, "soubor": 1, "seaweed_path": 1, "seaweed_size": 1,
         "klic": 1})
    by_course = {}
    for v in vids:
        by_course.setdefault(v["kurz_id"], []).append(v)

    plan = []  # (seaweed_path, filename, size)
    for kid, items in by_course.items():
        kurz = kurzy.get(kid, {})
        # Fall back to the first file name, then to the course id; str()
        # because the id need not be a string and sanitize() needs one.
        nazev = sanitize(
            kurz.get("nazev") or items[0].get("soubor") or str(kid))
        autor = surname(kurz.get("autor"))
        dp = kurz.get("datum_publikace")
        rok = dp.year if isinstance(dp, datetime) else None
        ystr = f" ({rok})" if rok else ""
        if len(items) == 1:
            v = items[0]
            who = f" - {autor}" if autor else ""
            fn = clip(sanitize(f"{nazev}{who}{ystr}")) + ".mp4"
            plan.append((v["seaweed_path"], fn, v["seaweed_size"]))
        else:
            for i, v in enumerate(order_items(items), 1):
                lbl = seg_label(v.get("soubor", ""), kurz.get("nazev") or "")
                mid = f" - {i:02d} {lbl}" if lbl else f" - {i:02d}"
                fn = clip(sanitize(f"{nazev}{mid}{ystr}")) + ".mp4"
                plan.append((v["seaweed_path"], fn, v["seaweed_size"]))
    return by_course, _dedupe_filenames(plan)
def main():
    """Command-line entry point: build the export plan and download it.

    Options: --dry-run (print the plan only), --dest (target folder),
    --mongo (MongoDB host), --filer (filer base URL) and --limit (stop after
    N successful downloads; 0 means no limit).

    Each file is streamed to a ".part" file and renamed to its final name
    only when the number of bytes received equals the planned size. Files
    that already exist with the planned size are skipped, so the script can
    simply be re-run after an interruption. Every attempt is appended to
    "_export_log.txt" in the destination folder.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--dry-run", action="store_true")
    ap.add_argument("--dest", default=DEF_DEST)
    ap.add_argument("--mongo", default=DEF_MONGO)
    ap.add_argument("--filer", default=DEF_FILER)
    ap.add_argument("--limit", type=int, default=0, help="stahne jen N souboru")
    args = ap.parse_args()

    cli = MongoClient(args.mongo, serverSelectionTimeoutMS=5000)
    try:
        by_course, plan = build_plan(cli["EUNI"])
    finally:
        # The database is only needed to build the plan.
        cli.close()

    total = sum(p[2] for p in plan)
    print(f"Kurzu s videem: {len(by_course)} | souboru k exportu: {len(plan)} "
          f"| celkem {total/1024**3:.1f} GiB\n")

    if args.dry_run:
        for _, fn, size in plan:
            print(f"{size/1024**2:8.1f} MB {fn}")
        print(f"\n[DRY-RUN] nic nestazeno. Celkem {len(plan)} souboru, "
              f"{total/1024**3:.1f} GiB.")
        return

    dest = Path(args.dest)
    dest.mkdir(parents=True, exist_ok=True)
    log_path = dest / "_export_log.txt"

    # Preflight: is the filer reachable at all?
    try:
        requests.get(args.filer, timeout=8)
        print(f"Filer OK: {args.filer}\n", flush=True)
    except Exception as e:
        sys.exit(f"NEDOSTUPNY FILER {args.filer} :: {e}\n"
                 f"Zkontroluj sit / VPN na 192.168.1.50:8888 z tohoto stroje.")

    done = skipped = failed = 0
    dl_bytes = 0
    t0 = time.time()
    with open(log_path, "a", encoding="utf-8") as log:
        log.write(f"\n=== RUN {datetime.now():%Y-%m-%d %H:%M} | "
                  f"{len(plan)} planned | dest={dest} ===\n")
        for n, (path, fn, size) in enumerate(plan, 1):
            dst = dest / fn
            # Already exported with the expected size -> nothing to do.
            if dst.exists() and dst.stat().st_size == size:
                skipped += 1
                continue
            print(f"[{n}/{len(plan)}] ↓ {size/1024**2:.1f}MB {fn}", flush=True)
            try:
                url = filer_url(args.filer, path)
                ts = time.time()
                tmp = dst.with_suffix(".part")
                received = 0
                # timeout=(connect, read) -> a stalled connection fails fast
                with requests.get(url, stream=True, timeout=(15, 90)) as r:
                    r.raise_for_status()
                    with open(tmp, "wb") as f:
                        for chunk in r.iter_content(1 << 20):
                            f.write(chunk)
                            received += len(chunk)
                # Never promote a short (or otherwise unexpected) download
                # to its final name; it is reported as a failure instead.
                if received != size:
                    raise IOError(f"nesedi velikost: stazeno {received} B, "
                                  f"ocekavano {size} B")
                tmp.replace(dst)
                done += 1
                dl_bytes += size
                sp = size / 1024**2 / max(time.time() - ts, 0.1)
                msg = f"[{n}/{len(plan)}] OK {size/1024**2:.1f}MB ({sp:.1f} MB/s) {fn}"
            except Exception as e:
                # One failed file must not abort the whole run.
                failed += 1
                msg = f"[{n}/{len(plan)}] FAIL {fn} :: {e}"
            print(msg, flush=True)
            log.write(msg + "\n")
            log.flush()
            if args.limit and done >= args.limit:
                break
        dt = time.time() - t0
        summary = (f"HOTOVO: {done} stazeno ({dl_bytes/1024**3:.1f} GiB), "
                   f"{skipped} preskoceno, {failed} chyb, {dt/60:.1f} min")
        print("\n" + summary)
        log.write(summary + "\n")
# Run the export only when executed as a script, not when imported.
if __name__ == "__main__":
    main()