# newsgroups/23 ulozeni a slepeni.py

import os
import re
import nntplib
from dotenv import load_dotenv
from db import get_conn

# ================== CONFIG ==================
# Newsgroup that is queried in the database and selected on the server.
GROUP = "alt.binaries.e-book.magazines"
# Substring that an article subject must contain to be picked up.
SUBJECT_KEY = "PC Pro 2011-07.pdf"
# Directory that receives the per-part output files.
OUT_DIR = r"downloads/PC_Pro_2011-07"
# ============================================

# Credentials are read from the environment; load_dotenv() runs first so that
# values it provides (presumably from a local .env file) are visible here.
load_dotenv()
EWEKA_USER = os.environ.get("EWEKA_USER")
EWEKA_PASS = os.environ.get("EWEKA_PASS")

# Create the output directory up front; an already existing one is fine.
os.makedirs(OUT_DIR, exist_ok=True)


# Lookup table for unescaped yEnc bytes: encoded value -> (value - 42) mod 256.
_YENC_PLAIN_TABLE = bytes((value - 42) & 0xFF for value in range(256))


def yenc_decode_and_extract_headers(
    lines: list[bytes], *, dot_stuffed: bool = False
) -> tuple[bytes, list[str]]:
    """
    Decode yEnc BODY lines and extract yEnc headers.

    Args:
        lines: Article body lines. Trailing CR/LF on a line is ignored.
        dot_stuffed: Set to True only when ``lines`` come straight off the
            wire and still carry NNTP dot-stuffing (a doubled leading ".").
            Lines returned by ``nntplib`` are already un-stuffed, so the
            default leaves leading dots alone; stripping them again would
            drop a real payload byte (encoded "." is data byte 0x04).

    Returns: (decoded_bytes, yenc_header_lines)
        ``decoded_bytes`` is the decoded payload of all non-control lines in
        order; ``yenc_header_lines`` holds the ``=ybegin`` / ``=ypart`` /
        ``=yend`` lines decoded as latin-1.
    """
    out = bytearray()
    yenc_headers = []

    for line in lines:
        # --- undo NNTP dot-stuffing (only for raw, still-stuffed input) ---
        if dot_stuffed and line.startswith(b".."):
            line = line[1:]

        # Raw CR/LF are never payload in yEnc (both are always escaped), so a
        # line terminator left on the line must not be decoded as data.
        line = line.rstrip(b"\r\n")

        # --- capture yEnc control lines ---
        if line.startswith((b"=ybegin", b"=ypart", b"=yend")):
            yenc_headers.append(line.decode("latin-1"))
            continue

        # --- yEnc decode ---
        # Translate each run of plain bytes in one call and handle the escape
        # sequences ("=" followed by one byte) individually.
        pos = 0
        end = len(line)
        while pos < end:
            esc = line.find(b"=", pos)
            if esc < 0:
                out += line[pos:].translate(_YENC_PLAIN_TABLE)
                break
            out += line[pos:esc].translate(_YENC_PLAIN_TABLE)
            if esc + 1 >= end:
                # Dangling escape at end of line: nothing left to decode.
                break
            # Escaped byte: undo the extra +64, then the regular +42.
            out.append((line[esc + 1] - 64 - 42) & 0xFF)
            pos = esc + 2

    return bytes(out), yenc_headers


# ------------------ DB ------------------
conn = get_conn()
cur = conn.cursor()

# Select article number and subject of every stored article in GROUP whose
# subject contains SUBJECT_KEY, ordered by article number.
query = """
    SELECT article_number, metadata->>'subject'
    FROM articles
    WHERE newsgroup = %s
      AND metadata->>'subject' LIKE %s
    ORDER BY article_number
"""
params = (GROUP, f"%{SUBJECT_KEY}%")
cur.execute(query, params)

rows = cur.fetchall()

# The part number is the first number of the "(n/total)" marker in the subject.
part_re = re.compile(r"\((\d+)\s*/\s*\d+\)")
parts = []

for art_num, subject in rows:
    found = part_re.search(subject or "")
    if found is None:
        # A subject without a part marker cannot be ordered; abort the run.
        raise RuntimeError(f"Cannot parse part number from subject: {subject}")
    parts.append((int(found.group(1)), art_num))

# Order the (part number, article number) pairs by part number.
parts.sort(key=lambda pair: pair[0])

# ------------------ NNTP ------------------
with nntplib.NNTP_SSL(
    "news.eweka.nl", 563, EWEKA_USER, EWEKA_PASS, readermode=True
) as nntp:

    # Select the group before requesting bodies by article number.
    nntp.group(GROUP)

    for part_no, art_num in parts:
        # Both output files of a part share the same zero-padded stem.
        stem = os.path.join(OUT_DIR, f"part_{part_no:03d}")
        bin_path = stem + ".bin"
        hdr_path = stem + ".yEncHeader"

        # A part counts as done only when both of its files are present.
        already_saved = os.path.exists(bin_path) and os.path.exists(hdr_path)
        if already_saved:
            continue

        article = nntp.body(art_num)[1]
        payload, control_lines = yenc_decode_and_extract_headers(article.lines)

        # Decoded payload goes into the .bin file ...
        with open(bin_path, "wb") as bin_file:
            bin_file.write(payload)

        # ... and the yEnc control lines into the text file next to it.
        with open(hdr_path, "w", encoding="utf-8") as hdr_file:
            hdr_file.write("\n".join(control_lines))