"""Ingest overview metadata for the most recent articles of a Usenet group.

Connects to the Eweka NNTP server over SSL, pulls XOVER overview data in
batches, and inserts one row per article into the ``articles`` PostgreSQL
table (duplicates skipped via ``ON CONFLICT (message_id) DO NOTHING``).

NOTE(review): ``nntplib`` was removed from the stdlib in Python 3.13 —
pin Python <= 3.12 or vendor the module.
"""

import nntplib
import os
from datetime import datetime, UTC

from dotenv import load_dotenv
from psycopg.types.json import Json

from db import get_conn

# ================= CONFIG =================
GROUP = "alt.binaries.e-book.magazines"
TOTAL_ARTICLES = 100_000
BATCH_SIZE = 1_000
# =========================================


def sanitize(value):
    """Round-trip *value* through UTF-8 so lone surrogates become U+FFFD.

    NNTP overview headers may decode to lone surrogates, which PostgreSQL
    rejects in text/JSON columns; ``surrogatepass`` lets them be encoded,
    and decoding with ``errors="replace"`` maps them to the replacement
    character. Non-string values are returned unchanged.
    """
    if isinstance(value, str):
        return value.encode("utf-8", errors="surrogatepass") \
                    .decode("utf-8", errors="replace")
    return value


def main():
    """Fetch the last TOTAL_ARTICLES overviews of GROUP into PostgreSQL."""
    load_dotenv()

    eweka_user = os.getenv("EWEKA_USER")
    eweka_pass = os.getenv("EWEKA_PASS")
    # Fail fast with a clear message instead of an opaque NNTP auth error.
    if not eweka_user or not eweka_pass:
        raise SystemExit("EWEKA_USER and EWEKA_PASS must be set in the environment")

    print("🔌 Connecting to PostgreSQL...")
    conn = get_conn()
    conn.autocommit = True

    print("🔌 Connecting to Eweka NNTP...")
    try:
        with conn.cursor() as cur, nntplib.NNTP_SSL(
            host="news.eweka.nl",
            port=563,
            user=eweka_user,
            password=eweka_pass,
            readermode=True,
        ) as nntp:
            # --- GROUP ---
            resp, count, first, last, name = nntp.group(GROUP)
            first = int(first)
            last = int(last)
            print(f"📂 Group: {name}")
            print(f"📐 Range: {first} – {last}")

            # Start so at most TOTAL_ARTICLES articles are fetched, clamped
            # to the group's first available article number.
            start_global = max(first, last - TOTAL_ARTICLES + 1)
            print(f"🎯 Target range: {start_global} – {last}")

            for batch_start in range(start_global, last + 1, BATCH_SIZE):
                batch_end = min(batch_start + BATCH_SIZE - 1, last)
                print(f"📜 XOVER {batch_start}-{batch_end}")

                try:
                    resp, overviews = nntp.xover(batch_start, batch_end)
                except Exception as e:
                    # Deliberate best-effort: a failed range is logged and
                    # skipped, not fatal to the whole ingest run.
                    print(f"⚠️ XOVER failed for {batch_start}-{batch_end}: {e}")
                    continue

                rows = []
                for art_num, fields in overviews:
                    clean_fields = {k: sanitize(v) for k, v in fields.items()}
                    metadata = {
                        "group": GROUP,
                        "article_number": art_num,
                        **clean_fields,
                    }
                    rows.append((
                        GROUP,
                        art_num,
                        # BUG FIX: use the sanitized header — the raw value may
                        # carry lone surrogates that PostgreSQL rejects, and it
                        # must match the message-id stored inside ``metadata``.
                        clean_fields.get("message-id"),
                        Json(metadata),
                        datetime.now(UTC),
                    ))

                if rows:
                    cur.executemany(
                        """
                        INSERT INTO articles
                            (newsgroup, article_number, message_id, metadata, fetched_at)
                        VALUES (%s, %s, %s, %s, %s)
                        ON CONFLICT (message_id) DO NOTHING
                        """,
                        rows,
                    )
    finally:
        # Always release the DB connection, even if NNTP work blows up.
        conn.close()

    print("🎉 DONE – last 100k articles ingested")


if __name__ == "__main__":
    main()