94 lines
2.2 KiB
Python
94 lines
2.2 KiB
Python
import os
|
|
import re
|
|
import nntplib
|
|
from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
from dotenv import load_dotenv
|
|
from db import get_conn
|
|
|
|
# ================= CONFIG =================
# Newsgroup to select before fetching, and the subject fragment used to pick
# the articles of interest out of the database.
GROUP = "alt.binaries.e-book.magazines"
SUBJECT_KEY = "PC Pro 2011-07.pdf"

# Directory that receives one raw file per downloaded part.
RAW_DIR = "downloads/raw"

# Size of the download thread pool.
MAX_WORKERS = 5
# ==========================================

# Run the dotenv loader first, then read the Eweka credentials from the
# process environment (each one is None when the variable is not set).
load_dotenv()
EWEKA_USER = os.environ.get("EWEKA_USER")
EWEKA_PASS = os.environ.get("EWEKA_PASS")

# Make sure the output directory exists before any worker writes into it.
os.makedirs(RAW_DIR, exist_ok=True)
|
|
|
|
# ---------- DB: load parts ----------
# Fetch (article_number, subject) for every article in GROUP whose subject
# contains SUBJECT_KEY, ordered by article number. The cursor and connection
# are closed as soon as the rows are in memory; nothing after this point
# queries the database again.
conn = get_conn()
try:
    cur = conn.cursor()
    try:
        cur.execute("""
            SELECT article_number, metadata->>'subject'
            FROM articles
            WHERE newsgroup = %s
            AND metadata->>'subject' LIKE %s
            ORDER BY article_number
        """, (GROUP, f"%{SUBJECT_KEY}%"))
        rows = cur.fetchall()
    finally:
        cur.close()
finally:
    conn.close()

# Captures N from a "(N/M)" marker in the subject.
# NOTE(review): search() uses the first "(N/M)" it finds; confirm subjects
# never carry an earlier counter written in the same form.
part_re = re.compile(r"\((\d+)\s*/\s*\d+\)")

# part number -> article number. Only the first article seen for a given part
# number is kept (rows arrive ordered by article number), so two workers are
# never handed the same output file.
article_by_part = {}
duplicates = 0

for art_num, subject in rows:
    m = part_re.search(subject or "")
    if not m:
        continue  # subject has no part marker
    part_no = int(m.group(1))
    if part_no in article_by_part:
        duplicates += 1
        continue
    article_by_part[part_no] = art_num

# List of (part_no, art_num) tuples in ascending part order.
parts = sorted(article_by_part.items())

if duplicates:
    print(f"⚠️ Skipped {duplicates} duplicate part article(s)")
print(f"📦 Parts to download: {len(parts)}")
|
|
|
|
# ---------- worker ----------
|
|
def download_part(part_no: int, art_num: int) -> str:
    """Download the body of one article into RAW_DIR.

    The target file is ``part_<NNN>.raw`` (part number zero-padded to three
    digits). If that file already exists nothing is downloaded.

    Args:
        part_no: Part number, used for the output file name and the message.
        art_num: Article number passed to the server's BODY command after
            GROUP has been selected.

    Returns:
        A status message: "exists" when the file was already present,
        "done" after a successful download.

    Raises:
        Exceptions raised by nntplib (connection, login, missing article) or
        by the file operations propagate to the caller unchanged.
    """
    out_path = os.path.join(RAW_DIR, f"part_{part_no:03d}.raw")

    if os.path.exists(out_path):
        return f"⏭️ part {part_no} exists"

    with nntplib.NNTP_SSL(
        "news.eweka.nl",
        563,
        EWEKA_USER,
        EWEKA_PASS,
        readermode=True,
        timeout=120,
    ) as nntp:
        nntp.group(GROUP)
        _, info = nntp.body(art_num)

    # Write to a temp file first and rename it into place only once it is
    # complete. A half-written file must never sit at out_path, because the
    # exists() check above would treat it as finished on every later run.
    # The article number keeps the temp name unique per task.
    tmp_path = f"{out_path}.{art_num}.tmp"
    try:
        with open(tmp_path, "wb") as f:
            # One newline after every body line, as delivered by nntplib.
            f.writelines(line + b"\n" for line in info.lines)
        os.replace(tmp_path, out_path)
    except BaseException:
        # Best-effort cleanup of the temp file; the original error is what
        # the caller needs to see.
        try:
            os.remove(tmp_path)
        except OSError:
            pass
        raise

    return f"⬇️ part {part_no} done"
|
|
|
|
# ---------- parallel execution ----------
# Number of parts whose download raised an exception.
errors = 0

with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
    # Map each future to its part number: as_completed() yields futures in
    # completion order, so this is the only way to tell which part failed.
    futures = {
        executor.submit(download_part, part_no, art_num): part_no
        for part_no, art_num in parts
    }

    for future in as_completed(futures):
        try:
            msg = future.result()
        except Exception as e:
            # Top-level boundary: report the failure and keep draining the
            # remaining downloads instead of aborting the whole run.
            errors += 1
            print(f"❌ ERROR part {futures[future]}: {e}")
        else:
            print(msg)

print("🎉 DONE")
print(f"⚠️ Errors: {errors}")
|