"""Download a multi-part yEnc-encoded Usenet post and reassemble it.

Article numbers are looked up in the project's PostgreSQL ``articles`` table,
each article body is fetched over NNTP, yEnc-decoded into ``OUT_DIR`` and the
decoded parts are finally concatenated into ``FINAL_PDF``.

Script-only dependencies (``nntplib``, ``dotenv``, ``db``) are imported inside
the functions that use them so that ``yenc_decode_lines`` stays importable
without a database, python-dotenv, or ``nntplib`` (which was removed from the
standard library in Python 3.13).
"""
import os
import re
from contextlib import closing

# yEnc adds 42 (mod 256) to every byte; an escaped byte ("=X") carries an
# additional +64 on top of that.
_YENC_OFFSET = 42
_YENC_ESCAPE_OFFSET = 64
# Translation table for un-escaped payload bytes (one C-level pass per run).
_YENC_PLAIN_TABLE = bytes((b - _YENC_OFFSET) & 0xFF for b in range(256))


def _yenc_decode_line(line: bytes, out: bytearray) -> None:
    """Append the decoded payload of a single yEnc data line to *out*."""
    pos = 0
    end = len(line)
    while pos < end:
        esc = line.find(b"=", pos)
        if esc < 0:
            # No more escapes: the rest of the line is plain payload.
            out.extend(line[pos:].translate(_YENC_PLAIN_TABLE))
            return
        out.extend(line[pos:esc].translate(_YENC_PLAIN_TABLE))
        if esc + 1 >= end:
            # Dangling "=" at end of line: nothing to un-escape, drop it.
            return
        out.append(
            (line[esc + 1] - _YENC_ESCAPE_OFFSET - _YENC_OFFSET) & 0xFF
        )
        pos = esc + 2


def yenc_decode_lines(
    lines: list[bytes],
    debug: bool = False,
    *,
    dot_stuffed: bool = False,
) -> bytes:
    """Decode the yEnc payload contained in NNTP BODY lines.

    Args:
        lines: Body lines of one article. Trailing CR/LF is tolerated and
            stripped (raw CR/LF can never be yEnc payload, they are always
            escaped by the encoder).
        debug: When true, print a trace of control lines and counters.
        dot_stuffed: Set to true only for raw wire lines that still carry
            NNTP dot-stuffing; a leading ".." is then collapsed to ".".
            Lines returned by ``nntplib`` are already un-stuffed, so the
            default leaves leading dots alone -- stripping them again would
            delete real payload bytes.

    Returns:
        The decoded bytes of all payload lines, concatenated in order.

    Only lines between ``=ybegin`` and ``=yend`` are decoded. If the article
    contains no ``=ybegin`` line at all, every non-control line is decoded
    and a warning is printed.
    """
    out = bytearray()
    has_ybegin = any(line.startswith(b"=ybegin") for line in lines)
    saw_ybegin = False
    # Without any =ybegin we fall back to treating every line as payload.
    in_payload = not has_ybegin
    data_lines = 0

    for idx, raw_line in enumerate(lines):
        line = raw_line.rstrip(b"\r\n")

        # --- NNTP dot-stuffing (raw wire data only) ---
        if dot_stuffed and line.startswith(b".."):
            if debug:
                print(f" [dot] line {idx}: '..' -> '.'")
            line = line[1:]

        # --- yEnc control lines ---
        if line.startswith(b"=ybegin"):
            saw_ybegin = True
            in_payload = True
            if debug:
                print(" [yEnc] =ybegin detected")
            continue
        if line.startswith(b"=ypart"):
            if debug:
                print(" [yEnc] =ypart detected")
            continue
        if line.startswith(b"=yend"):
            # After =yend only the no-=ybegin fallback keeps decoding.
            in_payload = not has_ybegin
            if debug:
                print(" [yEnc] =yend detected")
            continue

        if not in_payload:
            # Text outside the yEnc envelope (preamble, signature, ...).
            continue

        # --- actual yEnc data ---
        data_lines += 1
        _yenc_decode_line(line, out)

    if debug:
        print(f" [yEnc] saw_ybegin={saw_ybegin}, decoded_data_lines={data_lines}")
        print(f" [yEnc] decoded_bytes={len(out)}")

    if not saw_ybegin:
        print("⚠️ WARNING: yEnc decoder used but =ybegin was NOT seen")

    return bytes(out)


# ================== CONFIG ==================
GROUP = "alt.binaries.e-book.magazines"
SUBJECT_KEY = "PC Pro 2011-07.pdf"
OUT_DIR = r"downloads/PC_Pro_2011-07"
FINAL_PDF = r"downloads/PC_Pro_2011-07.pdf"
# ============================================

# Matches the "(part/total)" marker in a subject line.
_PART_RE = re.compile(r"\((\d+)\s*/\s*(\d+)\)")


def _part_path(part_no: int) -> str:
    """Return the path of the decoded file for part *part_no*."""
    return os.path.join(OUT_DIR, f"part_{part_no:03d}.bin")


def _fetch_parts() -> list[tuple[int, int]]:
    """Load ``(part_no, article_number)`` pairs from the DB, sorted by part.

    Raises:
        RuntimeError: If a subject has no "(n/N)" marker or no article
            matches ``SUBJECT_KEY``.
    """
    from db import get_conn  # project-local, needed only when run as a script

    print("🔌 Connecting to PostgreSQL...")
    with closing(get_conn()) as conn, closing(conn.cursor()) as cur:
        # --- load article numbers + subject ---
        cur.execute(
            """
            SELECT article_number, metadata->>'subject'
            FROM articles
            WHERE newsgroup = %s
              AND metadata->>'subject' LIKE %s
            ORDER BY article_number
            """,
            (GROUP, f"%{SUBJECT_KEY}%"),
        )
        rows = cur.fetchall()

    print(f"📦 Found {len(rows)} parts")

    # --- parse part number from subject ---
    parts: dict[int, int] = {}
    expected_total = 0
    for art_num, subject in rows:
        m = _PART_RE.search(subject or "")
        if not m:
            raise RuntimeError(f"Cannot parse part number from subject: {subject}")
        part_no = int(m.group(1))
        expected_total = max(expected_total, int(m.group(2)))
        if part_no in parts:
            # Reposts would otherwise be written twice during assembly.
            print(
                f"⚠️ Duplicate part {part_no} (article {art_num}) ignored, "
                f"keeping article {parts[part_no]}"
            )
            continue
        parts[part_no] = art_num

    if not parts:
        raise RuntimeError(f"No articles matching {SUBJECT_KEY!r} found in {GROUP}")

    missing = [n for n in range(1, expected_total + 1) if n not in parts]
    if missing:
        print(
            f"⚠️ WARNING: {len(missing)} of {expected_total} parts are missing "
            f"(first missing: {missing[:10]}); the assembled file will be incomplete"
        )

    # sort by part number (1..N)
    return sorted(parts.items())


def _download_parts(parts: list[tuple[int, int]], user: str, password: str) -> None:
    """Fetch and decode every part that is not already present in OUT_DIR."""
    import nntplib  # removed from the stdlib in Python 3.13

    print("🔌 Connecting to Eweka NNTP...")
    with nntplib.NNTP_SSL(
        "news.eweka.nl", 563, user, password, readermode=True
    ) as nntp:
        nntp.group(GROUP)
        total = len(parts)

        for idx, (part_no, art_num) in enumerate(parts, start=1):
            out_path = _part_path(part_no)
            if os.path.exists(out_path):
                print(f"⏭️ [{idx}/{total}] part {part_no} already exists, skipping")
                continue

            print(f"⬇️ [{idx}/{total}] Downloading part {part_no} (article {art_num})")
            _resp, info = nntp.body(art_num)
            print(f" BODY lines received: {len(info.lines)}")

            # quick look at the first few lines
            for ln in info.lines[:3]:
                print(" RAW:", ln[:80])

            decoded = yenc_decode_lines(info.lines, debug=True)
            print(f" RESULT bytes: {len(decoded)}")

            # Write to a temp name first so an interrupted run never leaves a
            # truncated part that the "already exists" check would accept.
            tmp_path = out_path + ".tmp"
            with open(tmp_path, "wb") as f:
                f.write(decoded)
            os.replace(tmp_path, out_path)


def _assemble(parts: list[tuple[int, int]]) -> None:
    """Concatenate the decoded parts, in part order, into FINAL_PDF."""
    print("🧩 Assembling final PDF...")
    with open(FINAL_PDF, "wb") as out:
        for part_no, _ in parts:
            with open(_part_path(part_no), "rb") as pf:
                out.write(pf.read())


def main() -> None:
    """Run the full lookup -> download -> assemble pipeline.

    Raises:
        RuntimeError: If the Eweka credentials are not configured, or the
            article lookup fails (see ``_fetch_parts``).
    """
    from dotenv import load_dotenv  # third-party, needed only when run as a script

    load_dotenv()
    user = os.getenv("EWEKA_USER")
    password = os.getenv("EWEKA_PASS")
    if not user or not password:
        raise RuntimeError("EWEKA_USER and EWEKA_PASS must be set (environment or .env)")

    os.makedirs(OUT_DIR, exist_ok=True)

    parts = _fetch_parts()
    _download_parts(parts, user, password)
    _assemble(parts)

    print("🎉 DONE")
    print(f"📄 Final PDF: {FINAL_PDF}")


if __name__ == "__main__":
    main()