z230
This commit is contained in:
74
25 stahni raw.py
Normal file
74
25 stahni raw.py
Normal file
@@ -0,0 +1,74 @@
|
||||
import os
|
||||
import re
|
||||
import nntplib
|
||||
from dotenv import load_dotenv
|
||||
from db import get_conn
|
||||
|
||||
# ================= CONFIG =================
# Newsgroup whose articles are looked up in the DB and fetched over NNTP.
GROUP = "alt.binaries.e-book.magazines"
# Text an article subject has to contain to be selected.
SUBJECT_KEY = "PC Pro 2011-07.pdf"
# Directory that receives one raw body file per part.
RAW_DIR = r"downloads/raw"
# ==========================================

# load_dotenv() runs before the credentials are read from the environment.
load_dotenv()

# Either value is None when the corresponding variable is not set.
EWEKA_USER = os.environ.get("EWEKA_USER")
EWEKA_PASS = os.environ.get("EWEKA_PASS")

# Create the output directory up front; an existing directory is fine.
os.makedirs(RAW_DIR, exist_ok=True)
|
||||
|
||||
# --- DB: find the articles ---
# Select (article_number, subject) for every stored article of GROUP whose
# subject contains SUBJECT_KEY, ordered by article number. Both values are
# passed as bound parameters, never interpolated into the SQL text.
conn = get_conn()
try:
    cur = conn.cursor()
    try:
        cur.execute("""
            SELECT article_number, metadata->>'subject'
            FROM articles
            WHERE newsgroup = %s
              AND metadata->>'subject' LIKE %s
            ORDER BY article_number
        """, (GROUP, f"%{SUBJECT_KEY}%"))

        rows = cur.fetchall()
    finally:
        cur.close()
finally:
    # The result set is fully held in `rows`; release the connection now
    # instead of keeping it open for the whole NNTP download that follows.
    conn.close()
|
||||
|
||||
# --- parse part number from subject ---
# Matches a "(N/M)" marker and captures N, e.g. "(3/45)" -> "3".
part_re = re.compile(r"\((\d+)\s*/\s*\d+\)")


def _part_number(subject):
    """Return N from the first "(N/M)" marker in *subject*.

    A missing (None) subject is treated as an empty string.

    Raises:
        RuntimeError: if the subject contains no "(N/M)" marker.
    """
    found = part_re.search(subject or "")
    if found is None:
        raise RuntimeError(f"Cannot parse part number from subject: {subject}")
    return int(found.group(1))


# (part number, article number) pairs ordered by part number; sorted() is
# stable, so rows sharing a part number keep their query order.
parts = sorted(
    ((_part_number(subject_text), article_no) for article_no, subject_text in rows),
    key=lambda pair: pair[0],
)

print(f"📦 Parts to download: {len(parts)}")
|
||||
|
||||
# --- NNTP download ---
# NOTE: nntplib was deprecated in Python 3.11 and removed in 3.13 (PEP 594);
# this script needs an interpreter that still provides it.
with nntplib.NNTP_SSL(
    "news.eweka.nl",
    563,
    EWEKA_USER,
    EWEKA_PASS,
    readermode=True,
) as nntp:

    # Select the group before requesting articles by number.
    nntp.group(GROUP)

    for part_no, art_num in parts:
        out_path = os.path.join(RAW_DIR, f"part_{part_no:03d}.raw")

        # Resume support: a part already present on disk is not fetched again.
        if os.path.exists(out_path):
            continue

        print(f"⬇️ Downloading RAW part {part_no} (article {art_num})")

        _, info = nntp.body(art_num)

        # Write to a temporary sibling and rename afterwards, so an
        # interrupted run never leaves a truncated file that the
        # exists-check above would mistake for a finished part.
        tmp_path = out_path + ".part"
        with open(tmp_path, "wb") as f:
            # Each body line is stored followed by a single "\n".
            f.writelines(line + b"\n" for line in info.lines)
        os.replace(tmp_path, out_path)

print("🎉 DONE – RAW NNTP data stored")
|
||||
Reference in New Issue
Block a user