Files
newsgroups/23 ulozeni a slepeni.py
2025-12-28 11:34:15 +01:00

110 lines
2.8 KiB
Python

import os
import re
import nntplib
from dotenv import load_dotenv
from db import get_conn
# ================== CONFIG ==================
# Newsgroup that is queried in the database and selected on the NNTP server.
GROUP = "alt.binaries.e-book.magazines"
# Substring an article subject must contain to count as part of the posting.
SUBJECT_KEY = "PC Pro 2011-07.pdf"
# Directory that receives the decoded parts and their yEnc header files.
OUT_DIR = r"downloads/PC_Pro_2011-07"
# ============================================

# Credentials are taken from the process environment; load_dotenv() runs
# first so that values supplied through python-dotenv are visible to the
# lookups below. Either value is None when the variable is not set.
load_dotenv()
EWEKA_USER = os.environ.get("EWEKA_USER")
EWEKA_PASS = os.environ.get("EWEKA_PASS")

# Create the output directory (and any missing parents); an already
# existing directory is not an error.
os.makedirs(OUT_DIR, exist_ok=True)
def yenc_decode_and_extract_headers(
    lines: list[bytes],
    *,
    dot_stuffed: bool = False,
) -> tuple[bytes, list[str]]:
    """
    Decode yEnc BODY lines and extract yEnc headers.

    Args:
        lines: Article body lines without their line terminators.
        dot_stuffed: Set to True only when ``lines`` still carry NNTP
            dot-stuffing (a leading "." doubled to ".." on the wire).
            Lines returned by ``nntplib`` are already unstuffed, so the
            default is False; unstuffing them a second time would delete a
            genuine leading "." data byte and corrupt the decoded output.

    Returns: (decoded_bytes, yenc_header_lines)
        decoded_bytes is the concatenation of all decoded data lines;
        yenc_header_lines holds every "=ybegin" / "=ypart" / "=yend" line,
        decoded as latin-1, in the order encountered.
    """
    control_prefixes = (b"=ybegin", b"=ypart", b"=yend")
    out = bytearray()
    yenc_headers = []
    for line in lines:
        # --- undo NNTP dot-stuffing (raw wire lines only) ---
        # Dot-stuffing only ever doubles a leading dot, so exactly one dot
        # is removed and a single leading dot is always real data.
        if dot_stuffed and line.startswith(b".."):
            line = line[1:]
        # --- capture yEnc control lines ---
        if line.startswith(control_prefixes):
            yenc_headers.append(line.decode("latin-1"))
            continue
        # --- yEnc decode ---
        pending = iter(line)
        for byte in pending:
            if byte == 0x3D:  # "=" escapes the following byte
                byte = next(pending, None)
                if byte is None:
                    # Dangling escape at end of line: nothing left to decode.
                    break
                byte = (byte - 64) & 0xFF
            out.append((byte - 42) & 0xFF)
    return bytes(out), yenc_headers
# ------------------ DB ------------------
# Look up every stored article of GROUP whose subject contains SUBJECT_KEY.
# Both values are passed as query parameters, never formatted into the SQL.
conn = get_conn()
try:
    cur = conn.cursor()
    try:
        cur.execute("""
SELECT article_number, metadata->>'subject'
FROM articles
WHERE newsgroup = %s
AND metadata->>'subject' LIKE %s
ORDER BY article_number
""", (GROUP, f"%{SUBJECT_KEY}%"))
        rows = cur.fetchall()
    finally:
        cur.close()
finally:
    # All rows are in memory now; release the connection instead of keeping
    # it open for the whole download phase that follows.
    conn.close()

# parse part number from subject
# The first "(<part>/<total>)" group in the subject supplies the part number.
part_re = re.compile(r"\((\d+)\s*/\s*\d+\)")
parts = []
for art_num, subject in rows:
    # A NULL subject is treated as an empty string so it fails the match
    # below with a clear error instead of a TypeError.
    m = part_re.search(subject or "")
    if not m:
        raise RuntimeError(f"Cannot parse part number from subject: {subject}")
    part_no = int(m.group(1))
    parts.append((part_no, art_num))

# Order by part number; the sort is stable, so equal part numbers keep the
# article_number order delivered by the query.
parts.sort(key=lambda x: x[0])
# ------------------ NNTP ------------------
# Download each part, decode it and store the payload next to its yEnc
# control lines. A part counts as done only when both of its files exist.
with nntplib.NNTP_SSL(
    "news.eweka.nl",
    563,
    EWEKA_USER,
    EWEKA_PASS,
    readermode=True
) as nntp:
    nntp.group(GROUP)
    for part_no, art_num in parts:
        bin_path = os.path.join(OUT_DIR, f"part_{part_no:03d}.bin")
        hdr_path = os.path.join(OUT_DIR, f"part_{part_no:03d}.yEncHeader")
        # Resume support: skip parts that an earlier run already finished.
        if os.path.exists(bin_path) and os.path.exists(hdr_path):
            continue
        _, info = nntp.body(art_num)
        decoded, headers = yenc_decode_and_extract_headers(info.lines)
        # Write to temporary names first and publish with os.replace() only
        # after both files are complete. An interrupted run can therefore
        # never leave a truncated file pair that the resume check above
        # would mistake for a finished part.
        tmp_bin_path = bin_path + ".tmp"
        tmp_hdr_path = hdr_path + ".tmp"
        with open(tmp_bin_path, "wb") as f:
            f.write(decoded)
        with open(tmp_hdr_path, "w", encoding="utf-8") as f:
            f.write("\n".join(headers))
        os.replace(tmp_bin_path, bin_path)
        os.replace(tmp_hdr_path, hdr_path)