This commit is contained in:
2025-12-27 17:24:30 +01:00
parent ea485bad29
commit f8dc6566bc
7 changed files with 542 additions and 0 deletions

View File

@@ -0,0 +1,93 @@
import nntplib
import os
from dotenv import load_dotenv
from datetime import datetime, UTC
from db import get_conn
from psycopg.types.json import Json
def sanitize(value: object) -> object:
    """Return *value* with characters PostgreSQL cannot store replaced.

    Non-string values are returned unchanged. For strings:

    * Lone surrogate code points (which cannot be encoded as strict UTF-8)
      are turned into U+FFFD replacement characters by round-tripping the
      text through UTF-8.
    * NUL characters are replaced with U+FFFD as well, because PostgreSQL
      rejects them in ``text`` values and (as ``\\u0000``) in ``jsonb``.
    """
    if not isinstance(value, str):
        return value
    # "surrogatepass" lets lone surrogates through the encoder; the strict
    # decoder then sees invalid byte sequences and substitutes U+FFFD.
    encoded = value.encode("utf-8", errors="surrogatepass")
    text = encoded.decode("utf-8", errors="replace")
    return text.replace("\x00", "\ufffd")
# ---------------- Settings ----------------
# Newsgroup whose overview data is ingested.
GROUP = "alt.binaries.e-book.magazines"
# Number of most recent article numbers to cover.
TOTAL_ARTICLES = 100_000
# Number of article numbers requested per XOVER call.
BATCH_SIZE = 1_000
# ------------------------------------------

# load_dotenv() runs before the credentials are read so that any values it
# places in the process environment are visible to the lookups below.
load_dotenv()
EWEKA_USER = os.environ.get("EWEKA_USER")
EWEKA_PASS = os.environ.get("EWEKA_PASS")
def _overview_row(art_num, fields):
    """Build one ``articles`` row tuple from a single XOVER entry.

    *fields* is the overview dict returned for article *art_num*. Every
    field is passed through ``sanitize`` once, and both the JSON metadata
    and the ``message_id`` column are taken from that sanitized copy so the
    two columns always agree and neither carries unsanitized text.
    """
    clean_fields = {key: sanitize(val) for key, val in fields.items()}
    metadata = {
        "group": GROUP,
        "article_number": art_num,
        **clean_fields,
    }
    return (
        GROUP,
        art_num,
        clean_fields.get("message-id"),
        Json(metadata),  # wrap the dict so the driver sends it as JSON
        datetime.now(UTC),
    )


print("🔌 Connecting to PostgreSQL...")
conn = get_conn()
conn.autocommit = True
cur = conn.cursor()
try:
    print("🔌 Connecting to Eweka NNTP...")
    with nntplib.NNTP_SSL(
        host="news.eweka.nl",
        port=563,
        user=EWEKA_USER,
        password=EWEKA_PASS,
        readermode=True,
    ) as nntp:
        # --- GROUP ---
        _, _, first, last, name = nntp.group(GROUP)
        first = int(first)
        last = int(last)
        print(f"📂 Group: {name}")
        print(f"📐 Range: {first} {last}")

        # Only the newest TOTAL_ARTICLES article numbers are requested,
        # clamped to the first article number the server reports.
        start_global = max(first, last - TOTAL_ARTICLES + 1)
        print(f"🎯 Target range: {start_global} {last}")

        for batch_start in range(start_global, last + 1, BATCH_SIZE):
            batch_end = min(batch_start + BATCH_SIZE - 1, last)
            print(f"📜 XOVER {batch_start}-{batch_end}")
            try:
                _, overviews = nntp.xover(batch_start, batch_end)
            except Exception as e:
                # Best effort: a failed batch is reported and skipped so
                # the remaining ranges are still attempted.
                print(f"⚠️ XOVER failed for {batch_start}-{batch_end}: {e}")
                continue

            rows = [
                _overview_row(art_num, fields)
                for art_num, fields in overviews
            ]
            if rows:
                # Rows whose message_id already exists are skipped by the
                # ON CONFLICT clause rather than raising.
                cur.executemany(
                    """
                    INSERT INTO articles
                    (newsgroup, article_number, message_id, metadata, fetched_at)
                    VALUES (%s, %s, %s, %s, %s)
                    ON CONFLICT (message_id) DO NOTHING
                    """,
                    rows,
                )
finally:
    # Release the database resources even if the NNTP session or an
    # insert raises.
    cur.close()
    conn.close()
print("🎉 DONE last 100k articles ingested")