Files
newsgroups/21 poslednich 100k.py
2025-12-27 17:24:30 +01:00

95 lines
2.5 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import nntplib
import os
from dotenv import load_dotenv
from datetime import datetime, UTC
from db import get_conn
from psycopg.types.json import Json
def sanitize(value):
    """Return *value* in a form that is safe to encode as UTF-8 and store.

    Non-string values are returned unchanged. For strings:

    * Lone surrogate code points (for example those left behind when
      undecodable bytes were decoded with the ``surrogateescape`` handler)
      are replaced with U+FFFD, because they cannot be encoded as strict
      UTF-8.
    * NUL characters are replaced with U+FFFD, because PostgreSQL ``text``
      and ``jsonb`` values cannot contain U+0000.

    Args:
        value: Any object; only ``str`` instances are modified.

    Returns:
        The cleaned string, or *value* itself when it is not a ``str``.
    """
    if not isinstance(value, str):
        return value
    # "surrogatepass" lets lone surrogates through the encoder as byte
    # sequences that are invalid UTF-8; decoding with "replace" then turns
    # those bytes into U+FFFD instead of raising.
    raw = value.encode("utf-8", errors="surrogatepass")
    cleaned = raw.decode("utf-8", errors="replace")
    return cleaned.replace("\x00", "\ufffd")
# ================= CONFIG =================
# Newsgroup whose overview data is ingested.
GROUP = "alt.binaries.e-book.magazines"
# Maximum number of article numbers scanned in one run, counted from the
# first article number the server reports for the group.
TOTAL_ARTICLES = 50_000_000
# Number of article numbers requested per XOVER command.
BATCH_SIZE = 10_000
# =========================================

# Load settings from a .env file (if present) into the process environment
# before reading the credentials.
load_dotenv()
EWEKA_USER = os.getenv("EWEKA_USER")
EWEKA_PASS = os.getenv("EWEKA_PASS")
# nntplib skips the login step when no user is supplied, so a missing
# variable would only surface later as an authentication error from the
# server. Stop here with a clear message instead.
if not EWEKA_USER or not EWEKA_PASS:
    raise SystemExit(
        "EWEKA_USER and EWEKA_PASS must be set (environment or .env file)"
    )
# Ingest NNTP overview (XOVER) data for GROUP into the `articles` table.
#
# The script scans article numbers from the group's first article up to
# TOTAL_ARTICLES numbers (capped at the group's last article), fetching
# BATCH_SIZE numbers per XOVER command and bulk-inserting each batch.
print("🔌 Connecting to PostgreSQL...")
conn = get_conn()
try:
    # With autocommit on, each executemany() batch is committed on its own,
    # so an interrupted run keeps the batches that were already written.
    conn.autocommit = True
    cur = conn.cursor()

    print("🔌 Connecting to Eweka NNTP...")
    with nntplib.NNTP_SSL(
        host="news.eweka.nl",
        port=563,
        user=EWEKA_USER,
        password=EWEKA_PASS,
        readermode=True,
    ) as nntp:
        # --- GROUP ---
        # group() returns (response, count, first, last, name); only the
        # article-number bounds and the group name are needed here.
        _, _, first, last, name = nntp.group(GROUP)
        first = int(first)
        last = int(last)

        start_global = first
        end_global = min(first + TOTAL_ARTICLES - 1, last)
        print(f"🎯 Target range: {start_global} {end_global}")
        print(f"📂 Group: {name}")
        print(f"📐 Range: {first} {last}")

        skipped_batches = 0
        for batch_start in range(start_global, end_global + 1, BATCH_SIZE):
            batch_end = min(batch_start + BATCH_SIZE - 1, end_global)
            print(f"📜 XOVER {batch_start}-{batch_end}")
            try:
                _, overviews = nntp.xover(batch_start, batch_end)
            except Exception as e:
                # Deliberate best effort: a failed range is reported and
                # skipped so one bad batch does not abort the whole run.
                print(f"⚠️ XOVER failed for {batch_start}-{batch_end}: {e}")
                skipped_batches += 1
                continue

            rows = []
            for art_num, fields in overviews:
                clean_fields = {k: sanitize(v) for k, v in fields.items()}
                metadata = {
                    "group": GROUP,
                    "article_number": art_num,
                    **clean_fields,
                }
                rows.append((
                    GROUP,
                    art_num,
                    # Use the sanitized value: the raw header may contain
                    # characters that cannot be sent to the database.
                    clean_fields.get("message-id"),
                    Json(metadata),  # stored as a JSON value
                    datetime.now(UTC),
                ))

            if rows:
                # Rows whose message_id already exists are silently skipped,
                # which makes re-running over the same range harmless.
                cur.executemany(
                    """
                    INSERT INTO articles
                    (newsgroup, article_number, message_id, metadata, fetched_at)
                    VALUES (%s, %s, %s, %s, %s)
                    ON CONFLICT (message_id) DO NOTHING
                    """,
                    rows,
                )
finally:
    # Release the database connection even when the NNTP session or an
    # insert raises.
    conn.close()

print(
    f"🎉 DONE articles {start_global}-{end_global} processed "
    f"({skipped_batches} batch(es) skipped)"
)