Files
newsgroups/25 stahni raw.py
2025-12-28 11:34:15 +01:00

75 lines
1.7 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import os
import re
import nntplib
from dotenv import load_dotenv
from db import get_conn
# ================= CONFIG =================
# Newsgroup that is both queried in the database and selected on the server.
GROUP = "alt.binaries.e-book.magazines"
# Substring an article subject must contain to be picked up.
SUBJECT_KEY = "PC Pro 2011-07.pdf"
# Directory that receives one raw file per downloaded part.
RAW_DIR = r"downloads/raw"
# ==========================================

# load_dotenv() (python-dotenv) is expected to populate the process
# environment from a .env file; the credentials are then read from the
# environment and are None when the variables are not set.
load_dotenv()
EWEKA_USER = os.environ.get("EWEKA_USER")
EWEKA_PASS = os.environ.get("EWEKA_PASS")

# Make sure the output directory exists before anything is written to it.
os.makedirs(RAW_DIR, exist_ok=True)
# --- DB: find the articles ---
# Fetch (article_number, subject) for every article stored for GROUP whose
# subject contains SUBJECT_KEY, ordered by article number. The result is
# kept in `rows` for the parsing step that follows.
#
# The cursor and the connection are closed as soon as the rows are fetched
# (also when the query fails) so the database session is not held open
# during the long-running NNTP download. Nothing else in the visible script
# uses `conn` or `cur` after this point.
#
# NOTE: SUBJECT_KEY is embedded in a LIKE pattern, so any '%' or '_' it
# contains would act as a wildcard rather than a literal character.
conn = get_conn()
try:
    cur = conn.cursor()
    try:
        cur.execute("""
SELECT article_number, metadata->>'subject'
FROM articles
WHERE newsgroup = %s
AND metadata->>'subject' LIKE %s
ORDER BY article_number
""", (GROUP, f"%{SUBJECT_KEY}%"))
        rows = cur.fetchall()
    finally:
        cur.close()
finally:
    conn.close()
# --- parse part number from subject ---
# Matches a "(part/total)" marker and captures the part index.
part_re = re.compile(r"\((\d+)\s*/\s*\d+\)")


def _part_number(subject):
    """Return the part index taken from the "(part/total)" marker in *subject*.

    A missing subject (None) is treated as an empty string.

    Raises:
        RuntimeError: if no "(part/total)" marker is found.
    """
    match = part_re.search(subject or "")
    if match is None:
        raise RuntimeError(f"Cannot parse part number from subject: {subject}")
    return int(match.group(1))


# (part_no, article_number) pairs ordered by part number; the sort is stable,
# so pairs with equal part numbers keep the order the query returned them in.
parts = sorted(
    ((_part_number(subject), art_num) for art_num, subject in rows),
    key=lambda pair: pair[0],
)
print(f"📦 Parts to download: {len(parts)}")
# --- NNTP download ---
# Download the body of every selected article and store it as
# RAW_DIR/part_NNN.raw, one body line per output line terminated by b"\n".
# Parts whose output file already exists are skipped, so an interrupted run
# can simply be restarted.
#
# NOTE(review): nntplib is deprecated since Python 3.11 and removed in 3.13
# (PEP 594); this script needs an interpreter that still ships it.
with nntplib.NNTP_SSL(
    "news.eweka.nl",
    563,
    EWEKA_USER,
    EWEKA_PASS,
    readermode=True,
) as nntp:
    nntp.group(GROUP)
    for part_no, art_num in parts:
        out_path = os.path.join(RAW_DIR, f"part_{part_no:03d}.raw")
        if os.path.exists(out_path):
            continue
        print(f"⬇️ Downloading RAW part {part_no} (article {art_num})")
        _, info = nntp.body(art_num)
        # Write to a temporary name and rename only once the part is
        # complete. The skip check above treats any existing out_path as
        # finished, so a crash in the middle of a direct write would leave a
        # truncated part that is never downloaded again.
        tmp_path = out_path + ".tmp"
        with open(tmp_path, "wb") as f:
            f.writelines(line + b"\n" for line in info.lines)
        os.replace(tmp_path, out_path)
print("🎉 DONE RAW NNTP data stored")