# Listing header captured from the file viewer: 75 lines, 1.7 KiB, Python.
import nntplib
import os
import re
from contextlib import closing

from dotenv import load_dotenv

from db import get_conn

# ================= CONFIG =================
# Newsgroup whose indexed articles are searched and downloaded.
GROUP = "alt.binaries.e-book.magazines"
# Substring the article subject must contain (used in a SQL LIKE filter).
SUBJECT_KEY = "PC Pro 2011-07.pdf"
# Directory that receives one raw file per downloaded part.
RAW_DIR = r"downloads/raw"
# ==========================================

# Credentials are read from the environment after load_dotenv() has run.
load_dotenv()
EWEKA_USER = os.getenv("EWEKA_USER")
EWEKA_PASS = os.getenv("EWEKA_PASS")

# Fail fast: without credentials nntplib connects without authenticating and
# the failure only shows up later as an opaque server error.
if not EWEKA_USER or not EWEKA_PASS:
    raise RuntimeError(
        "EWEKA_USER and EWEKA_PASS must be set (environment or .env file)"
    )

os.makedirs(RAW_DIR, exist_ok=True)

# --- DB: find the articles ---
# closing() releases the cursor and the connection as soon as the rows have
# been fetched (and also if the query raises), so no DB resources stay open
# during the NNTP download that follows.
with closing(get_conn()) as conn, closing(conn.cursor()) as cur:
    cur.execute(
        """
        SELECT article_number, metadata->>'subject'
        FROM articles
        WHERE newsgroup = %s
        AND metadata->>'subject' LIKE %s
        ORDER BY article_number
        """,
        (GROUP, f"%{SUBJECT_KEY}%"),
    )
    # Each row is (article_number, subject); subject may be NULL/None.
    rows = cur.fetchall()

# --- parse part number from subject ---
# Matches a "(N/M)" marker and captures N.
part_re = re.compile(r"\((\d+)\s*/\s*\d+\)")


def _part_number(subject):
    """Return N from the first "(N/M)" marker in *subject*.

    A missing (None) subject is treated as an empty string.

    Raises:
        RuntimeError: if the subject contains no "(N/M)" marker.
    """
    found = part_re.search(subject or "")
    if found is None:
        raise RuntimeError(f"Cannot parse part number from subject: {subject}")
    return int(found.group(1))


# (part_no, article_number) pairs ordered by part number; the sort is stable,
# so equal part numbers keep the row order returned by the query.
parts = sorted(
    ((_part_number(subject), art_num) for art_num, subject in rows),
    key=lambda pair: pair[0],
)

print(f"📦 Parts to download: {len(parts)}")

# --- NNTP download ---
# Fetch the body of every part that is not on disk yet and store it as
# RAW_DIR/part_NNN.raw, one b"\n"-terminated line per body line.
with nntplib.NNTP_SSL(
    "news.eweka.nl",
    563,
    user=EWEKA_USER,
    password=EWEKA_PASS,
    readermode=True,
) as nntp:

    # Select the group so articles can be addressed by article number.
    nntp.group(GROUP)

    for part_no, art_num in parts:
        out_path = os.path.join(RAW_DIR, f"part_{part_no:03d}.raw")

        # Resume support: a part that already exists is not fetched again.
        if os.path.exists(out_path):
            continue

        print(f"⬇️ Downloading RAW part {part_no} (article {art_num})")

        _, info = nntp.body(art_num)

        # Write to a temporary file and rename it into place, so an
        # interrupted write can never leave a truncated part_NNN.raw that
        # the exists-check above would skip on the next run.
        tmp_path = out_path + ".tmp"
        with open(tmp_path, "wb") as f:
            f.writelines(line + b"\n" for line in info.lines)
        os.replace(tmp_path, out_path)

print("🎉 DONE – RAW NNTP data stored")