# File viewer metadata: 110 lines, 2.8 KiB, Python
import os
|
|
import re
|
|
import nntplib
|
|
from dotenv import load_dotenv
|
|
from db import get_conn
|
|
|
|
# ================== CONFIG ==================
# GROUP       - newsgroup the articles are read from
# SUBJECT_KEY - fragment that must appear in an article's subject
# OUT_DIR     - directory that receives the downloaded part files
GROUP = "alt.binaries.e-book.magazines"
SUBJECT_KEY = "PC Pro 2011-07.pdf"
OUT_DIR = r"downloads/PC_Pro_2011-07"
# ============================================

# load_dotenv() runs before the credentials are read from the environment
# (presumably it fills os.environ from a local .env file -- confirm against
# the python-dotenv documentation). Either value is None when it is not set.
load_dotenv()
EWEKA_USER = os.environ.get("EWEKA_USER")
EWEKA_PASS = os.environ.get("EWEKA_PASS")

# Create the output directory up front; an already existing one is fine.
os.makedirs(OUT_DIR, exist_ok=True)
|
|
|
|
|
|
def yenc_decode_and_extract_headers(
    lines: list[bytes],
    *,
    unstuff_dots: bool = False,
) -> tuple[bytes, list[str]]:
    """Decode yEnc BODY lines and extract the yEnc control lines.

    Lines starting with ``=ybegin``, ``=ypart`` or ``=yend`` are collected
    (decoded as latin-1) and not treated as payload. Every other line is
    yEnc-decoded: ``=`` escapes the following byte (subtract 64), and every
    byte is then shifted back by 42, modulo 256. Text outside the
    ``=ybegin``/``=yend`` envelope is not filtered out.

    Args:
        lines: Body lines without line terminators, e.g. ``info.lines`` as
            returned by ``nntplib``'s ``NNTP.body()``.
        unstuff_dots: Set to True only when ``lines`` are raw wire lines that
            still carry NNTP dot-stuffing; one leading dot is then removed
            from lines starting with ``..``. Leave False for ``nntplib``
            output, which has already been un-stuffed -- stripping again
            would drop a genuine payload byte from any line whose first
            encoded byte is ``.``.

    Returns:
        ``(decoded_bytes, yenc_header_lines)``.
    """
    escape = 0x3D  # ord("=")
    control_prefixes = (b"=ybegin", b"=ypart", b"=yend")

    out = bytearray()
    yenc_headers = []

    for line in lines:
        # --- undo NNTP dot-stuffing (raw wire lines only) ---
        if unstuff_dots and line.startswith(b".."):
            line = line[1:]

        # --- capture yEnc control lines ---
        if line.startswith(control_prefixes):
            yenc_headers.append(line.decode("latin-1"))
            continue

        # --- yEnc decode ---
        stream = iter(line)
        for value in stream:
            if value == escape:
                value = next(stream, None)
                if value is None:
                    # Dangling escape at end of line: nothing left to decode.
                    break
                value = (value - 64) & 0xFF
            out.append((value - 42) & 0xFF)

    return bytes(out), yenc_headers
|
|
|
|
|
|
# ------------------ DB ------------------
# Fetch (article_number, subject) for every stored article in GROUP whose
# subject contains SUBJECT_KEY. The cursor and connection are closed as soon
# as the rows are fetched so they are not held open during the NNTP download.
conn = get_conn()
try:
    cur = conn.cursor()
    try:
        cur.execute("""
            SELECT article_number, metadata->>'subject'
            FROM articles
            WHERE newsgroup = %s
            AND metadata->>'subject' LIKE %s
            ORDER BY article_number
        """, (GROUP, f"%{SUBJECT_KEY}%"))

        rows = cur.fetchall()
    finally:
        cur.close()
finally:
    conn.close()

# parse part number from subject: the first "(N/M)" group, keeping N
part_re = re.compile(r"\((\d+)\s*/\s*\d+\)")
parts = []

for art_num, subject in rows:
    m = part_re.search(subject or "")
    if not m:
        # Fail fast: without a part number the article cannot be ordered.
        raise RuntimeError(f"Cannot parse part number from subject: {subject}")
    part_no = int(m.group(1))
    parts.append((part_no, art_num))

# Download in part order rather than article-number order.
parts.sort(key=lambda x: x[0])
|
|
|
|
# ------------------ NNTP ------------------
# Download every part listed in `parts`, decode it, and store the payload and
# its yEnc control lines side by side in OUT_DIR.
# NOTE: nntplib is deprecated since Python 3.11 and removed in Python 3.13.
with nntplib.NNTP_SSL(
    "news.eweka.nl",
    563,
    EWEKA_USER,
    EWEKA_PASS,
    readermode=True,
) as nntp:

    nntp.group(GROUP)

    for part_no, art_num in parts:
        bin_path = os.path.join(OUT_DIR, f"part_{part_no:03d}.bin")
        hdr_path = os.path.join(OUT_DIR, f"part_{part_no:03d}.yEncHeader")

        # Resume support: a part counts as done once both files exist.
        if os.path.exists(bin_path) and os.path.exists(hdr_path):
            continue

        _, info = nntp.body(art_num)

        decoded, headers = yenc_decode_and_extract_headers(info.lines)

        # Write to a temporary name and rename into place, so an interrupted
        # run never leaves a truncated file that the resume check above would
        # mistake for a completed part.
        tmp_path = bin_path + ".tmp"
        with open(tmp_path, "wb") as f:
            f.write(decoded)
        os.replace(tmp_path, bin_path)

        tmp_path = hdr_path + ".tmp"
        with open(tmp_path, "w", encoding="utf-8") as f:
            f.write("\n".join(headers))
        os.replace(tmp_path, hdr_path)
|