z230
This commit is contained in:
93
25 stahni raw multithread.py
Normal file
93
25 stahni raw multithread.py
Normal file
@@ -0,0 +1,93 @@
|
||||
import os
|
||||
import re
|
||||
import nntplib
|
||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||
from dotenv import load_dotenv
|
||||
from db import get_conn
|
||||
|
||||
# ================= CONFIG =================
|
||||
GROUP = "alt.binaries.e-book.magazines"
|
||||
SUBJECT_KEY = "PC Pro 2011-07.pdf"
|
||||
RAW_DIR = r"downloads/raw"
|
||||
MAX_WORKERS = 5
|
||||
# ==========================================
|
||||
|
||||
load_dotenv()
|
||||
EWEKA_USER = os.getenv("EWEKA_USER")
|
||||
EWEKA_PASS = os.getenv("EWEKA_PASS")
|
||||
|
||||
os.makedirs(RAW_DIR, exist_ok=True)
|
||||
|
||||
# ---------- DB: load parts ----------
|
||||
conn = get_conn()
|
||||
cur = conn.cursor()
|
||||
|
||||
cur.execute("""
|
||||
SELECT article_number, metadata->>'subject'
|
||||
FROM articles
|
||||
WHERE newsgroup = %s
|
||||
AND metadata->>'subject' LIKE %s
|
||||
ORDER BY article_number
|
||||
""", (GROUP, f"%{SUBJECT_KEY}%"))
|
||||
|
||||
rows = cur.fetchall()
|
||||
|
||||
part_re = re.compile(r"\((\d+)\s*/\s*\d+\)")
|
||||
parts = []
|
||||
|
||||
for art_num, subject in rows:
|
||||
m = part_re.search(subject or "")
|
||||
if not m:
|
||||
continue
|
||||
part_no = int(m.group(1))
|
||||
parts.append((part_no, art_num))
|
||||
|
||||
parts.sort(key=lambda x: x[0])
|
||||
|
||||
print(f"📦 Parts to download: {len(parts)}")
|
||||
|
||||
# ---------- worker ----------
|
||||
def download_part(part_no: int, art_num: int):
|
||||
out_path = os.path.join(RAW_DIR, f"part_{part_no:03d}.raw")
|
||||
|
||||
if os.path.exists(out_path):
|
||||
return f"⏭️ part {part_no} exists"
|
||||
|
||||
with nntplib.NNTP_SSL(
|
||||
"news.eweka.nl",
|
||||
563,
|
||||
EWEKA_USER,
|
||||
EWEKA_PASS,
|
||||
readermode=True,
|
||||
timeout=120
|
||||
) as nntp:
|
||||
|
||||
nntp.group(GROUP)
|
||||
_, info = nntp.body(art_num)
|
||||
|
||||
with open(out_path, "wb") as f:
|
||||
for line in info.lines:
|
||||
f.write(line)
|
||||
f.write(b"\n")
|
||||
|
||||
return f"⬇️ part {part_no} done"
|
||||
|
||||
# ---------- parallel execution ----------
|
||||
errors = 0
|
||||
|
||||
with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
|
||||
futures = [
|
||||
executor.submit(download_part, part_no, art_num)
|
||||
for part_no, art_num in parts
|
||||
]
|
||||
|
||||
for future in as_completed(futures):
|
||||
try:
|
||||
msg = future.result()
|
||||
print(msg)
|
||||
except Exception as e:
|
||||
errors += 1
|
||||
print(f"❌ ERROR: {e}")
|
||||
|
||||
print("🎉 DONE")
|
||||
print(f"⚠️ Errors: {errors}")
|
||||
Reference in New Issue
Block a user