98 lines
2.5 KiB
Python
98 lines
2.5 KiB
Python
import nntplib
|
||
import os
|
||
from dotenv import load_dotenv
|
||
from datetime import datetime, UTC
|
||
from db import get_conn
|
||
from psycopg.types.json import Json
|
||
|
||
|
||
def sanitize(value):
    """Return *value* in a form that can always be encoded as UTF-8.

    Strings are round-tripped through UTF-8: lone surrogates are let
    through the encoder by ``surrogatepass``, and the invalid byte
    sequences they produce are replaced with U+FFFD when decoding with
    ``errors="replace"``. Any non-string value is returned unchanged.
    """
    if not isinstance(value, str):
        return value

    raw = value.encode("utf-8", errors="surrogatepass")
    return raw.decode("utf-8", errors="replace")
|
||
|
||
|
||
# ================= CONFIG =================
# Newsgroup whose overview records are fetched and stored.
GROUP = "alt.binaries.e-book.magazines"
# NOTE(review): not read by any active statement visible in this script.
TOTAL_ARTICLES = 75_000_000
# Number of consecutive article numbers requested per XOVER call.
BATCH_SIZE = 1_000
# Article number at which the ingestion starts.
FIRST = 40_805_000
# =========================================

# Load variables from a .env file before the credentials are read.
load_dotenv()

# Usenet provider credentials; either is None when the variable is unset.
EWEKA_USER = os.environ.get("EWEKA_USER")
EWEKA_PASS = os.environ.get("EWEKA_PASS")
|
||
|
||
# Ingest NNTP overview records for GROUP into the `articles` table.
#
# Flow: open the PostgreSQL connection, open an authenticated NNTP-over-TLS
# session, select the group, then walk the article-number range in
# BATCH_SIZE steps, issuing one XOVER per step and bulk-inserting the
# returned overview rows. Rows whose message_id already exists are skipped
# by the ON CONFLICT clause, so the script can be re-run over the same range.
print("🔌 Connecting to PostgreSQL...")
conn = get_conn()
cur = None

try:
    # Each executemany() is committed on its own; a failure in a later
    # batch does not roll back batches that were already stored.
    conn.autocommit = True
    cur = conn.cursor()

    print("🔌 Connecting to Eweka NNTP...")
    with nntplib.NNTP_SSL(
        host="news.eweka.nl",
        port=563,
        user=EWEKA_USER,
        password=EWEKA_PASS,
        readermode=True,
    ) as nntp:

        # --- GROUP ---
        resp, count, first, last, name = nntp.group(GROUP)
        first = int(first)
        last = int(last)

        # Start at FIRST, but never below the low-water mark the server
        # reports for the group: no articles exist below it, so XOVER
        # requests there would only be wasted round trips.
        start_global = max(FIRST, first)
        # Run up to the newest article currently reported by the server.
        end_global = last

        print(f"🎯 Target range: {start_global} – {end_global}")

        print(f"📂 Group: {name}")
        print(f"📐 Range: {first} – {last}")

        for batch_start in range(start_global, end_global + 1, BATCH_SIZE):
            batch_end = min(batch_start + BATCH_SIZE - 1, end_global)

            print(f"📜 XOVER {batch_start}-{batch_end}")

            try:
                resp, overviews = nntp.xover(batch_start, batch_end)
            except Exception as e:
                # Best effort: report the failed range and move on to the
                # next batch instead of aborting the whole run.
                print(f"⚠️ XOVER failed for {batch_start}-{batch_end}: {e}")
                continue

            rows = []
            for art_num, fields in overviews:
                # Header values may contain lone surrogates; clean every
                # string before it is sent to the database.
                clean_fields = {k: sanitize(v) for k, v in fields.items()}

                metadata = {
                    "group": GROUP,
                    "article_number": art_num,
                    **clean_fields,
                }

                rows.append((
                    GROUP,
                    art_num,
                    # Use the sanitized value here too: the raw header would
                    # bypass the clean-up applied to the metadata above.
                    clean_fields.get("message-id"),
                    Json(metadata),  # 👈 wrapped so the dict is sent as JSON
                    datetime.now(UTC),
                ))

            if rows:
                cur.executemany(
                    """
                    INSERT INTO articles
                        (newsgroup, article_number, message_id, metadata, fetched_at)
                    VALUES (%s, %s, %s, %s, %s)
                    ON CONFLICT (message_id) DO NOTHING
                    """,
                    rows,
                )
finally:
    # Release database resources on success and on any error path.
    if cur is not None:
        cur.close()
    conn.close()

print(f"🎉 DONE – articles {start_global} – {end_global} processed")
|