import os
import re
import nntplib

from dotenv import load_dotenv

from db import get_conn

# ================== CONFIG ==================
GROUP = "alt.binaries.e-book.magazines"
SUBJECT_KEY = "PC Pro 2011-07.pdf"
OUT_DIR = r"downloads/PC_Pro_2011-07"
# ============================================

load_dotenv()

EWEKA_USER = os.getenv("EWEKA_USER")
EWEKA_PASS = os.getenv("EWEKA_PASS")

os.makedirs(OUT_DIR, exist_ok=True)

# yEnc control lines that are collected as headers instead of being decoded.
_YENC_CONTROL_PREFIXES = (b"=ybegin", b"=ypart", b"=yend")

# Translation tables for yEnc decoding: a plain byte is offset by 42; the byte
# following the "=" escape character is offset by an additional 64.
_YENC_PLAIN = bytes((b - 42) & 0xFF for b in range(256))
_YENC_ESCAPED = bytes((b - 64 - 42) & 0xFF for b in range(256))


def yenc_decode_and_extract_headers(
    lines: list[bytes], *, dot_stuffed: bool = False
) -> tuple[bytes, list[str]]:
    """
    Decode yEnc BODY lines and extract yEnc headers.

    Args:
        lines: Article body lines without line terminators.
        dot_stuffed: Set to True only when ``lines`` are raw wire lines that
            still carry NNTP dot-stuffing. Lines returned by
            ``nntplib``'s ``body()`` have already been unstuffed, so the
            default leaves a leading "." alone; stripping it a second time
            would drop a real data byte.

    Returns:
        (decoded_bytes, yenc_header_lines)
    """
    out = bytearray()
    yenc_headers: list[str] = []

    for line in lines:
        # --- undo NNTP dot-stuffing (only for raw wire lines) ---
        if dot_stuffed and line.startswith(b"."):
            line = line[1:]

        # --- capture yEnc control lines ---
        if line.startswith(_YENC_CONTROL_PREFIXES):
            yenc_headers.append(line.decode("latin-1"))
            continue

        # --- yEnc decode ---
        # Translate each run of plain bytes in one call and handle the byte
        # after every "=" escape individually.
        pos = 0
        while True:
            esc = line.find(b"=", pos)
            if esc < 0:
                out += line[pos:].translate(_YENC_PLAIN)
                break
            out += line[pos:esc].translate(_YENC_PLAIN)
            if esc + 1 >= len(line):
                # Dangling escape character at end of line: nothing to decode.
                break
            out.append(_YENC_ESCAPED[line[esc + 1]])
            pos = esc + 2

    return bytes(out), yenc_headers


# ------------------ DB ------------------
conn = get_conn()
cur = conn.cursor()

cur.execute(
    """
    SELECT article_number, metadata->>'subject'
    FROM articles
    WHERE newsgroup = %s
      AND metadata->>'subject' LIKE %s
    ORDER BY article_number
    """,
    (GROUP, f"%{SUBJECT_KEY}%"),
)

rows = cur.fetchall()

# parse part number from subject, e.g. "(12/345)" -> 12
part_re = re.compile(r"\((\d+)\s*/\s*\d+\)")

parts = []
for art_num, subject in rows:
    m = part_re.search(subject or "")
    if not m:
        raise RuntimeError(f"Cannot parse part number from subject: {subject}")
    part_no = int(m.group(1))
    parts.append((part_no, art_num))

parts.sort(key=lambda x: x[0])

# ------------------ NNTP ------------------
with nntplib.NNTP_SSL(
    "news.eweka.nl",
    563,
    EWEKA_USER,
    EWEKA_PASS,
    readermode=True
) as nntp:

    nntp.group(GROUP)

    for part_no, art_num in parts:
        bin_path = os.path.join(OUT_DIR, f"part_{part_no:03d}.bin")
        hdr_path = os.path.join(OUT_DIR, f"part_{part_no:03d}.yEncHeader")

        # Skip parts already downloaded by a previous run.
        if os.path.exists(bin_path) and os.path.exists(hdr_path):
            continue

        _, info = nntp.body(art_num)

        # info.lines come from nntplib already unstuffed, so the decoder's
        # default (dot_stuffed=False) is the correct mode here.
        decoded, headers = yenc_decode_and_extract_headers(info.lines)

        with open(bin_path, "wb") as f:
            f.write(decoded)

        with open(hdr_path, "w", encoding="utf-8") as f:
            f.write("\n".join(headers))