notebook
This commit is contained in:
@@ -0,0 +1,252 @@
|
||||
#!/usr/bin/env python3
|
||||
r"""
|
||||
EUNI -> Plex (Other Videos) export.
|
||||
|
||||
Stahuje videa z SeaweedFS fileru na Plex share a pojmenuje je pro knihovnu EUNI.
|
||||
|
||||
Schema nazvu:
|
||||
single-video kurz : "<Nazev kurzu> - <Prijmeni> (<rok>).mp4"
|
||||
multi-video kurz : "<Nazev kurzu> - <NN> <segment> (<rok>).mp4" (autor vynechan)
|
||||
|
||||
Zavislosti:
|
||||
pip install pymongo requests
|
||||
|
||||
Pouziti:
|
||||
python plex_export.py --dry-run # jen vypise plan, nic nestahuje
|
||||
python plex_export.py # stahuje na default cestu (\\tower\PlexBinHex\EUNI)
|
||||
python plex_export.py --dest D:\plex\EUNI # jina cilova slozka
|
||||
python plex_export.py --limit 5 # stahne jen prvnich 5 (test)
|
||||
|
||||
Skript je idempotentni: stahuje pres .part, overuje velikost, hotove preskakuje.
|
||||
Kdyz spadne nebo ho prerusis (Ctrl-C), staci spustit znovu a dojede zbytek.
|
||||
Cilovy stroj potrebuje sit na: Mongo 192.168.1.76, filer 192.168.1.50:8888 a cilovy share.
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import re
|
||||
import sys
|
||||
import time
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from urllib.parse import quote
|
||||
|
||||
import requests
|
||||
from pymongo import MongoClient
|
||||
|
||||
# Make Czech text printable on a Windows console (cp1252) without crashing:
# switch both standard streams to UTF-8 and replace unencodable characters.
for _s in sys.stdout, sys.stderr:
    try:
        _s.reconfigure(errors="replace", encoding="utf-8")
    except Exception:
        # Best effort only: a stream that cannot be reconfigured is left as is.
        continue
|
||||
|
||||
# Defaults for the --mongo, --filer and --dest command-line options.
DEF_MONGO = "192.168.1.76"
DEF_FILER = "http://192.168.1.50:8888/"
DEF_DEST = r"\\tower\PlexBinHex\EUNI"
MAX_NAME = 190  # safe file-name length (keeps the full path under MAX_PATH)

# Academic titles written in front of a name; compared lower-cased with
# surrounding dots stripped.
TITLES = {"prof", "doc", "prim", "dr", "mudr", "mvdr", "pharmdr", "phdr",
          "rndr", "mgr", "bc", "msc", "md", "et", "ing"}
# Degrees / post-nominals written after a name; compared lower-cased.
DEGREES = {"ph.d.", "ph.d", "phd.", "phd", "csc.", "csc", "drsc.", "drsc",
           "mba", "msc.", "msc", "m.d.", "md", "febo", "fesc", "fesc.", "feso",
           "fean", "mha", "ph.", "d.", "fcma", "facp", "fefim", "frsph",
           "febtm", "febu", "agaf", "dr."}

# ASCII characters that are not allowed in Windows file names.
ILLEGAL = re.compile(r'[\\/:*?"<>|]')
# Full-width variants of those characters, as found in the original file names
# (vimeo/youtube downloader).  The keys are written as explicit escapes so the
# table cannot be silently folded to ASCII by an editor or a copy/paste.
FULLWIDTH = str.maketrans({
    "\uff1f": "",    # FULLWIDTH QUESTION MARK
    "\uff1a": " ",   # FULLWIDTH COLON
    "\uff0f": "-",   # FULLWIDTH SOLIDUS
    "\uff0a": "",    # FULLWIDTH ASTERISK
    "\uff1c": "",    # FULLWIDTH LESS-THAN SIGN
    "\uff1e": "",    # FULLWIDTH GREATER-THAN SIGN
    "\uff5c": "-",   # FULLWIDTH VERTICAL LINE
    "\uff02": "",    # FULLWIDTH QUOTATION MARK
})
# Trailing "[<id>]" tag at the end of a name.
VIMEO_TAIL = re.compile(r"\s*\[[0-9A-Za-z_\-]+\]\s*$")
|
||||
|
||||
|
||||
def surname(autor):
    """Return the surname part of an author string, or None.

    Only the text before the first comma is considered.  Leading tokens found
    in TITLES and trailing tokens found in DEGREES are dropped.  If a single
    token remains it is returned as is; otherwise everything after the first
    remaining token (taken to be the given name) is returned, which keeps
    double surnames intact.
    """
    if not autor:
        return None

    words = autor.split(",")[0].strip().split()
    first, last = 0, len(words)

    # Skip academic titles in front of the name.
    while first < last and words[first].lower().strip(".") in TITLES:
        first += 1
    # Skip degrees behind the name.
    while last > first and words[last - 1].lower().strip(",") in DEGREES:
        last -= 1

    core = words[first:last]
    if not core:
        return None
    return core[0] if len(core) == 1 else " ".join(core[1:])
|
||||
|
||||
|
||||
def sanitize(name):
    """Turn *name* into text usable as a file name.

    Applies the FULLWIDTH translation table, deletes every character matched
    by ILLEGAL, collapses whitespace runs into single spaces, trims the result
    and finally strips trailing dots and spaces.
    """
    translated = name.translate(FULLWIDTH)
    legal_only = ILLEGAL.sub("", translated)
    single_spaced = re.sub(r"\s+", " ", legal_only)
    return single_spaced.strip().rstrip(". ")
|
||||
|
||||
|
||||
def seq_num(soubor):
    """Extract an explicit sequence number (1-30) from a file name, or None.

    Only the last path component of *soubor* is inspected (both "/" and "\\"
    are treated as separators).  The patterns are tried in order and the
    first match whose number lies in 1..30 wins.
    """
    stem = Path(soubor.replace("\\", "/")).name
    # Drop the extension BEFORE the bracketed id.  VIMEO_TAIL is anchored at
    # the end of the string, so for "Title [id].mp4" it only matches once
    # ".mp4" is gone; otherwise digits inside the id could be mistaken for a
    # sequence number.
    stem = re.sub(r"\.mp4", "", stem, flags=re.I)
    stem = VIMEO_TAIL.sub("", stem)

    patterns = (
        r"^\s*(\d{1,2})[\.\)]",        # "1. Tonometrie", "2) ..."
        r"[ _]p(\d{1,2})\b",           # "p01 ...", "_p02_"
        r"[ _](\d{1,2})[ _]",          # "TABAK_01_", "Meluzinova 3 "
        r"\b(?:cast|část|díl|dil|part)[ _]*(\d{1,2})\b",
        r"[ _](\d{1,2})$",             # "... 2"
    )
    for pat in patterns:
        m = re.search(pat, stem, flags=re.I)
        if m:
            n = int(m.group(1))
            if 1 <= n <= 30:
                return n
    return None
|
||||
|
||||
|
||||
def src_id(klic):
    """Return the numeric id following "vimeo:" in *klic* as an int, else None.

    The id is used by the caller as an approximation of upload order.
    """
    if not klic:
        return None
    found = re.search(r"vimeo:(\d+)", klic)
    if found is None:
        return None
    return int(found.group(1))
|
||||
|
||||
|
||||
def order_items(items):
    """Sort the videos of one course into a logical order.

    If every item has an explicit sequence number (see seq_num) and the
    numbers are all distinct, the items are ordered by that number.
    Otherwise items with a vimeo id come first, ordered by id, followed by
    the remaining items ordered alphabetically by "soubor".
    """
    numbered = [(seq_num(item.get("soubor", "")), item) for item in items]
    numbers = [num for num, _ in numbered]

    if None not in numbers and len(set(numbers)) == len(numbers):
        numbered.sort(key=lambda pair: pair[0])
        return [item for _, item in numbered]

    def upload_key(item):
        # Fallback: upload order on vimeo, otherwise alphabetical by name.
        vimeo_id = src_id(item.get("klic"))
        if vimeo_id is None:
            return (1, 0, item.get("soubor", ""))
        return (0, vimeo_id, "")

    return sorted(items, key=upload_key)
|
||||
|
||||
|
||||
def seg_label(soubor, nazev):
    """Derive a short segment label for one video of a multi-video course.

    Starts from the last path component of *soubor*, removes ".mp4" and a
    trailing "[id]" tag, cuts off the first matching course-name prefix built
    from *nazev*, turns underscores into spaces, drops a leading "NN." /
    "NN)" counter and collapses whitespace.  Returns "" when nothing useful
    is left (empty, a long alphanumeric id, or a generic word).
    """
    label = Path(soubor.replace("\\", "/")).name
    label = re.sub(r"\.mp4\s*", " ", label, flags=re.I)
    label = VIMEO_TAIL.sub("", label).strip()

    # Longest / most specific prefixes first; only the first hit is removed.
    prefixes = (
        f"EUNI kurz - {nazev} - studijní materiál -",
        f"EUNI kurz - {nazev} - studijní materiál",
        f"EUNI kurz - {nazev} -",
        f"EUNI kurz - {nazev}",
        f"{nazev} -",
        f"{nazev}-",
        nazev,
    )
    lowered = label.lower()
    hit = next((p for p in prefixes if lowered.startswith(p.lower())), None)
    if hit is not None:
        label = label[len(hit):].strip(" -")

    label = label.replace("_", " ").strip()
    # A leading counter would duplicate the NN the caller adds itself.
    label = re.sub(r"^\s*\d{1,2}[\.\)]\s*", "", label)
    label = re.sub(r"\s+", " ", label).strip()

    if not label:
        return ""
    if re.fullmatch(r"[0-9A-Za-z]{16,}", label):
        return ""
    if label.lower() in {"studijní materiál", "zaznam", "záznam", "video"}:
        return ""
    return label
|
||||
|
||||
|
||||
def clip(stem):
    """Shorten an over-long name (without ".mp4") to at most MAX_NAME characters.

    Names within the limit are returned unchanged; a truncated name has
    trailing spaces, dashes and dots removed.
    """
    if len(stem) <= MAX_NAME:
        return stem
    return stem[:MAX_NAME].rstrip(" -.")
|
||||
|
||||
|
||||
def filer_url(filer, seaweed_path):
    """Build the download URL of *seaweed_path* on the filer at *filer*.

    Each path segment is percent-encoded separately.  Leading slashes of
    *seaweed_path* are dropped so that an absolute path does not produce a
    double slash ("host//dir/file") right after the filer base URL.
    """
    segments = seaweed_path.lstrip("/").split("/")
    encoded = "/".join(quote(segment) for segment in segments)
    return filer.rstrip("/") + "/" + encoded
|
||||
|
||||
|
||||
def _dedupe_filenames(plan):
    """Return *plan* sorted by file name with colliding names made unique.

    Names are compared case-insensitively.  The second and later entries
    sharing a name get a " [n]" tag in front of ".mp4"; entries are visited
    in (file name, source path) order so the tags are the same on every run.
    """
    ext = ".mp4"
    taken = set()
    unique = []
    for path, fn, size in sorted(plan, key=lambda e: (e[1], str(e[0]))):
        candidate, n = fn, 1
        while candidate.casefold() in taken:
            n += 1
            tag = f" [{n}]"
            stem = fn[:-len(ext)]
            # Keep the tagged stem within the same length budget as clip().
            candidate = stem[:MAX_NAME - len(tag)].rstrip(" -.") + tag + ext
        taken.add(candidate.casefold())
        unique.append((path, candidate, size))
    unique.sort(key=lambda e: e[1])
    return unique


def build_plan(db):
    """Build the export plan from the ``kurzy`` and ``materialy`` collections.

    Selects materials with ``druh == "video"`` and a non-empty
    ``seaweed_fids``, groups them by ``kurz_id`` and names each one:

    * course with one video:   "<title> - <surname> (<year>).mp4"
    * course with more videos: "<title> - <NN> <segment> (<year>).mp4"

    The author and year parts are omitted when unknown.  Returns
    ``(by_course, plan)`` where *by_course* maps course id to its material
    documents and *plan* is a list of ``(seaweed_path, filename, size)``
    tuples sorted by file name, with file names guaranteed to be distinct.
    """
    kurzy = {k["_id"]: k for k in db.kurzy.find({})}
    vids = db.materialy.find(
        {"druh": "video", "seaweed_fids": {"$exists": True, "$ne": []}},
        {"kurz_id": 1, "soubor": 1, "seaweed_path": 1, "seaweed_size": 1,
         "klic": 1})

    by_course = {}
    for v in vids:
        by_course.setdefault(v["kurz_id"], []).append(v)

    plan = []  # (seaweed_path, filename, size)
    for kid, items in by_course.items():
        k = kurzy.get(kid, {})
        # Fall back to the first file name, then to the course id; str() is
        # needed because the id is not necessarily a string.
        nazev = sanitize(k.get("nazev") or items[0].get("soubor") or str(kid))
        autor = surname(k.get("autor"))
        dp = k.get("datum_publikace")
        rok = dp.year if isinstance(dp, datetime) else None
        ystr = f" ({rok})" if rok else ""

        if len(items) == 1:
            v = items[0]
            who = f" - {autor}" if autor else ""
            fn = clip(sanitize(f"{nazev}{who}{ystr}")) + ".mp4"
            plan.append((v["seaweed_path"], fn, v["seaweed_size"]))
        else:
            for i, v in enumerate(order_items(items), 1):
                lbl = seg_label(v.get("soubor", ""), k.get("nazev") or "")
                mid = f" - {i:02d} {lbl}" if lbl else f" - {i:02d}"
                fn = clip(sanitize(f"{nazev}{mid}{ystr}")) + ".mp4"
                plan.append((v["seaweed_path"], fn, v["seaweed_size"]))

    # Two videos must never share a target name: the later download would
    # overwrite the earlier one in the destination folder.
    return by_course, _dedupe_filenames(plan)
|
||||
|
||||
|
||||
def _download_verified(url, dst, size):
    """Stream *url* into *dst* through a ".part" file and verify its size.

    The temporary file is renamed onto *dst* only when exactly *size* bytes
    were received.  On any failure (including Ctrl-C) the temporary file is
    removed and the exception propagates to the caller.
    """
    tmp = dst.with_suffix(".part")
    try:
        written = 0
        with requests.get(url, stream=True, timeout=300) as r:
            r.raise_for_status()
            with open(tmp, "wb") as f:
                for chunk in r.iter_content(1 << 20):
                    f.write(chunk)
                    written += len(chunk)
        # A dropped connection can end the stream early without an error,
        # so the byte count is the only reliable completeness check.
        if written != size:
            raise OSError(f"size mismatch: got {written} B, expected {size} B")
        tmp.replace(dst)
    except BaseException:
        try:
            tmp.unlink()
        except OSError:
            pass  # nothing to clean up, or the share is gone; keep the original error
        raise


def main():
    """Command-line entry point: plan the export and download the videos.

    With --dry-run only the plan is printed.  Otherwise every planned file
    that is not already present in the destination with the expected size is
    downloaded; progress is printed and appended to "_export_log.txt" in the
    destination folder.  --limit N stops after N successful downloads.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--dry-run", action="store_true")
    ap.add_argument("--dest", default=DEF_DEST)
    ap.add_argument("--mongo", default=DEF_MONGO)
    ap.add_argument("--filer", default=DEF_FILER)
    ap.add_argument("--limit", type=int, default=0, help="stahne jen N souboru")
    args = ap.parse_args()

    cli = MongoClient(args.mongo, serverSelectionTimeoutMS=5000)
    try:
        by_course, plan = build_plan(cli["EUNI"])
    finally:
        # The plan is fully materialised; the connection is no longer needed.
        cli.close()
    total = sum(p[2] for p in plan)
    print(f"Kurzu s videem: {len(by_course)} | souboru k exportu: {len(plan)} "
          f"| celkem {total/1024**3:.1f} GiB\n")

    if args.dry_run:
        for _path, fn, size in plan:
            print(f"{size/1024**2:8.1f} MB {fn}")
        print(f"\n[DRY-RUN] nic nestazeno. Celkem {len(plan)} souboru, "
              f"{total/1024**3:.1f} GiB.")
        return

    dest = Path(args.dest)
    dest.mkdir(parents=True, exist_ok=True)
    log_path = dest / "_export_log.txt"
    done = skipped = failed = 0
    dl_bytes = 0
    t0 = time.time()
    with open(log_path, "a", encoding="utf-8") as log:
        log.write(f"\n=== RUN {datetime.now():%Y-%m-%d %H:%M} | "
                  f"{len(plan)} planned | dest={dest} ===\n")
        for n, (path, fn, size) in enumerate(plan, 1):
            dst = dest / fn
            # Already exported with the expected size -> nothing to do.
            if dst.exists() and dst.stat().st_size == size:
                skipped += 1
                continue
            try:
                _download_verified(filer_url(args.filer, path), dst, size)
                done += 1
                dl_bytes += size
                msg = f"[{n}/{len(plan)}] OK {size/1024**2:.1f}MB {fn}"
            except Exception as e:
                # One failed file must not stop the whole export.
                failed += 1
                msg = f"[{n}/{len(plan)}] FAIL {fn} :: {e}"
            print(msg, flush=True)
            log.write(msg + "\n")
            log.flush()
            if args.limit and done >= args.limit:
                break
        dt = time.time() - t0
        summary = (f"HOTOVO: {done} stazeno ({dl_bytes/1024**3:.1f} GiB), "
                   f"{skipped} preskoceno, {failed} chyb, {dt/60:.1f} min")
        print("\n" + summary)
        log.write(summary + "\n")
|
||||
|
||||
|
||||
# Run the export only when the file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user