"""Download the raw NNTP article bodies of one multi-part Usenet post.

The script looks up, in the ``articles`` table, every article of ``GROUP``
whose subject contains ``SUBJECT_KEY``, extracts the part number from the
``(n/total)`` marker in the subject, and fetches each article body from
news.eweka.nl into ``RAW_DIR/part_NNN.raw`` using a small thread pool.
"""
import os
import re
# NOTE: nntplib was deprecated in Python 3.11 and removed in 3.13 (PEP 594),
# so this script needs Python <= 3.12 (or a backport providing the module).
import nntplib
from concurrent.futures import ThreadPoolExecutor, as_completed

from dotenv import load_dotenv

from db import get_conn

# ================= CONFIG =================
GROUP = "alt.binaries.e-book.magazines"
SUBJECT_KEY = "PC Pro 2011-07.pdf"
RAW_DIR = r"downloads/raw"
MAX_WORKERS = 5
# ==========================================

load_dotenv()
EWEKA_USER = os.getenv("EWEKA_USER")
EWEKA_PASS = os.getenv("EWEKA_PASS")

os.makedirs(RAW_DIR, exist_ok=True)

# ---------- DB: load parts ----------
conn = get_conn()
try:
    cur = conn.cursor()
    try:
        cur.execute(
            """
            SELECT article_number, metadata->>'subject'
            FROM articles
            WHERE newsgroup = %s
              AND metadata->>'subject' LIKE %s
            ORDER BY article_number
            """,
            (GROUP, f"%{SUBJECT_KEY}%"),
        )
        rows = cur.fetchall()
    finally:
        cur.close()
finally:
    # The rows are fully materialised, so the connection is no longer needed
    # while the (potentially long) downloads run.
    conn.close()

# Matches the "(part/total)" marker in a subject; only the part is captured.
part_re = re.compile(r"\((\d+)\s*/\s*\d+\)")

parts = []
for art_num, subject in rows:
    m = part_re.search(subject or "")
    if not m:
        # Subjects without a part marker cannot be placed in the sequence.
        continue
    part_no = int(m.group(1))
    parts.append((part_no, art_num))

parts.sort(key=lambda x: x[0])

print(f"📦 Parts to download: {len(parts)}")


# ---------- worker ----------
def download_part(part_no: int, art_num: int) -> str:
    """Fetch one article body and store it as ``part_NNN.raw`` in RAW_DIR.

    Args:
        part_no: Part number taken from the subject; used for the file name.
        art_num: Article number within ``GROUP`` to request from the server.

    Returns:
        A status message saying whether the part was skipped (the output
        file already exists) or downloaded.

    Raises:
        nntplib.NNTPError: If the server rejects the login, group selection
            or body request.
        OSError: On network or file-system failures.
    """
    out_path = os.path.join(RAW_DIR, f"part_{part_no:03d}.raw")

    if os.path.exists(out_path):
        return f"⏭️ part {part_no} exists"

    with nntplib.NNTP_SSL(
        "news.eweka.nl",
        563,
        EWEKA_USER,
        EWEKA_PASS,
        readermode=True,
        timeout=120,
    ) as nntp:
        # Article numbers are only meaningful relative to a selected group.
        nntp.group(GROUP)
        _, info = nntp.body(art_num)

    # Write to a per-article temp file and rename it into place, so that an
    # interrupted write never leaves a truncated part_NNN.raw that later runs
    # would skip as "exists". The article number keeps the temp name unique
    # when two articles carry the same part number.
    tmp_path = f"{out_path}.{art_num}.tmp"
    try:
        with open(tmp_path, "wb") as f:
            # nntplib returns the lines without terminators; store them
            # LF-terminated.
            f.writelines(line + b"\n" for line in info.lines)
        os.replace(tmp_path, out_path)
    finally:
        # After a successful replace the temp file is gone; this only removes
        # the partial file left behind by a failure.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)

    return f"⬇️ part {part_no} done"


# ---------- parallel execution ----------
errors = 0

with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
    # Map each future to its part number so failures can be attributed.
    futures = {
        executor.submit(download_part, part_no, art_num): part_no
        for part_no, art_num in parts
    }

    for future in as_completed(futures):
        part_no = futures[future]
        try:
            msg = future.result()
        except Exception as e:
            # Top-level boundary: report the failed part and keep going so
            # one bad article does not abort the remaining downloads.
            errors += 1
            print(f"❌ ERROR part {part_no}: {e}")
        else:
            print(msg)

print("🎉 DONE")
print(f"⚠️ Errors: {errors}")