"""Download the raw NNTP bodies of every part of one multi-part Usenet post.

The article numbers are looked up in the local ``articles`` table by newsgroup
and subject, ordered by the ``(part/total)`` counter found in each subject,
and each article body is stored unmodified as ``RAW_DIR/part_NNN.raw``.
Parts that already exist on disk are skipped, so the script can be re-run to
resume an interrupted download.

NOTE: ``nntplib`` was removed from the standard library in Python 3.13
(PEP 594); this script needs Python <= 3.12 or a backport of the module.
"""
import nntplib
import os
import re
from contextlib import closing

from dotenv import load_dotenv

from db import get_conn

# ================= CONFIG =================
GROUP = "alt.binaries.e-book.magazines"
SUBJECT_KEY = "PC Pro 2011-07.pdf"
RAW_DIR = r"downloads/raw"
NNTP_HOST = "news.eweka.nl"
NNTP_PORT = 563
# ==========================================

# Matches the "(part/total)" counter in a subject line; captures the part.
PART_RE = re.compile(r"\((\d+)\s*/\s*\d+\)")


def _part_number(subject) -> int:
    """Return the part number parsed from *subject*.

    Raises:
        RuntimeError: if *subject* is None/empty or has no "(n/m)" counter.
    """
    match = PART_RE.search(subject or "")
    if not match:
        raise RuntimeError(f"Cannot parse part number from subject: {subject}")
    return int(match.group(1))


def _part_path(part_no: int) -> str:
    """Return the output path of the raw file for part *part_no*."""
    return os.path.join(RAW_DIR, f"part_{part_no:03d}.raw")


def _find_parts() -> list:
    """Return ``(part_no, article_number)`` pairs sorted by part number.

    Queries the ``articles`` table for rows in GROUP whose subject contains
    SUBJECT_KEY.

    Raises:
        RuntimeError: if any matching subject has no parsable part counter.
    """
    # --- DB: find the articles ---
    # Assumes get_conn() returns a DB-API 2.0 connection (it is used that way
    # here); closing() guarantees cursor and connection are released even if
    # the query raises.
    with closing(get_conn()) as conn, closing(conn.cursor()) as cur:
        cur.execute(
            """
            SELECT article_number, metadata->>'subject'
            FROM articles
            WHERE newsgroup = %s
              AND metadata->>'subject' LIKE %s
            ORDER BY article_number
            """,
            (GROUP, f"%{SUBJECT_KEY}%"),
        )
        rows = cur.fetchall()

    # The sort is stable, so rows sharing a part number keep their
    # article_number order from the query.
    return sorted(
        ((_part_number(subject), art_num) for art_num, subject in rows),
        key=lambda pair: pair[0],
    )


def _download_parts(parts, user: str, password: str) -> None:
    """Fetch the body of every part not yet on disk and store it in RAW_DIR.

    Args:
        parts: iterable of ``(part_no, article_number)`` pairs.
        user: NNTP account name.
        password: NNTP account password.
    """
    with nntplib.NNTP_SSL(
        NNTP_HOST, NNTP_PORT, user, password, readermode=True
    ) as nntp:
        nntp.group(GROUP)

        for part_no, art_num in parts:
            out_path = _part_path(part_no)
            # Checked per iteration (not only up front) so that a part number
            # occurring twice is downloaded once and the first copy is kept.
            if os.path.exists(out_path):
                continue

            print(f"⬇️ Downloading RAW part {part_no} (article {art_num})")

            _, info = nntp.body(art_num)

            # Write to a temporary name and rename afterwards: a crash or
            # network error mid-write must not leave a truncated part_NNN.raw
            # behind, because existing files are skipped on the next run.
            tmp_path = out_path + ".tmp"
            with open(tmp_path, "wb") as f:
                # info.lines come without line terminators; restore "\n".
                f.writelines(line + b"\n" for line in info.lines)
            os.replace(tmp_path, out_path)


def main() -> None:
    """Look up the parts in the database and download the missing ones.

    Raises:
        RuntimeError: if the NNTP credentials are not configured or a subject
            cannot be parsed.
    """
    load_dotenv()
    user = os.getenv("EWEKA_USER")
    password = os.getenv("EWEKA_PASS")
    if not user or not password:
        # Without credentials nntplib would connect without logging in and
        # fail later with a much less obvious server error.
        raise RuntimeError(
            "EWEKA_USER and EWEKA_PASS must be set (environment or .env)"
        )

    os.makedirs(RAW_DIR, exist_ok=True)

    parts = _find_parts()
    print(f"📦 Parts to download: {len(parts)}")

    # --- NNTP download ---
    # Only open (and authenticate) a connection when something is missing.
    if any(not os.path.exists(_part_path(part_no)) for part_no, _ in parts):
        _download_parts(parts, user, password)

    print("🎉 DONE – RAW NNTP data stored")


if __name__ == "__main__":
    main()