This commit is contained in:
2025-12-28 11:34:15 +01:00
parent f8dc6566bc
commit 64472e59ba
6 changed files with 488 additions and 143 deletions

74
25 stahni raw.py Normal file
View File

@@ -0,0 +1,74 @@
import os
import re
import nntplib
from dotenv import load_dotenv
from db import get_conn
# ================= CONFIG =================
GROUP = "alt.binaries.e-book.magazines"
SUBJECT_KEY = "PC Pro 2011-07.pdf"
RAW_DIR = r"downloads/raw"
# ==========================================
load_dotenv()
EWEKA_USER = os.getenv("EWEKA_USER")
EWEKA_PASS = os.getenv("EWEKA_PASS")
os.makedirs(RAW_DIR, exist_ok=True)
# --- DB: najdi články ---
conn = get_conn()
cur = conn.cursor()
cur.execute("""
SELECT article_number, metadata->>'subject'
FROM articles
WHERE newsgroup = %s
AND metadata->>'subject' LIKE %s
ORDER BY article_number
""", (GROUP, f"%{SUBJECT_KEY}%"))
rows = cur.fetchall()
# --- parse part number from subject ---
part_re = re.compile(r"\((\d+)\s*/\s*\d+\)")
parts = []
for art_num, subject in rows:
m = part_re.search(subject or "")
if not m:
raise RuntimeError(f"Cannot parse part number from subject: {subject}")
part_no = int(m.group(1))
parts.append((part_no, art_num))
parts.sort(key=lambda x: x[0])
print(f"📦 Parts to download: {len(parts)}")
# --- NNTP download ---
with nntplib.NNTP_SSL(
"news.eweka.nl",
563,
EWEKA_USER,
EWEKA_PASS,
readermode=True
) as nntp:
nntp.group(GROUP)
for part_no, art_num in parts:
out_path = os.path.join(RAW_DIR, f"part_{part_no:03d}.raw")
if os.path.exists(out_path):
continue
print(f"⬇️ Downloading RAW part {part_no} (article {art_num})")
_, info = nntp.body(art_num)
with open(out_path, "wb") as f:
for line in info.lines:
f.write(line)
f.write(b"\n")
print("🎉 DONE RAW NNTP data stored")