z230
This commit is contained in:
93
20 Alt binaries ebook magazines.py
Normal file
93
20 Alt binaries ebook magazines.py
Normal file
@@ -0,0 +1,93 @@
|
||||
import nntplib
|
||||
import os
|
||||
from dotenv import load_dotenv
|
||||
from datetime import datetime, UTC
|
||||
from db import get_conn
|
||||
from psycopg.types.json import Json
|
||||
|
||||
|
||||
def sanitize(value):
    """Return *value* in a form that can always be encoded as UTF-8.

    Strings are round-tripped through UTF-8: encoding uses
    ``surrogatepass`` so lone surrogate code points are written out as raw
    bytes instead of raising, and decoding uses ``replace`` so those bytes
    come back as U+FFFD replacement characters.  Anything that is not a
    ``str`` is handed back unchanged.
    """
    if not isinstance(value, str):
        return value

    raw_bytes = value.encode("utf-8", errors="surrogatepass")
    return raw_bytes.decode("utf-8", errors="replace")
|
||||
|
||||
|
||||
# ================= CONFIG =================
# Newsgroup whose overview data is ingested.
GROUP = "alt.binaries.e-book.magazines"
# Size of the window of newest article numbers to cover.
TOTAL_ARTICLES = 100_000
# Number of article numbers requested per XOVER command.
BATCH_SIZE = 1_000
# =========================================

# Load variables from a .env file (if any) into the process environment.
load_dotenv()

EWEKA_USER = os.getenv("EWEKA_USER")
EWEKA_PASS = os.getenv("EWEKA_PASS")

# Fail fast on missing credentials: os.getenv() returns None for unset
# variables, and nntplib only logs in when a user is supplied, so the
# problem would otherwise surface later as an opaque server-side error --
# after the database connection below has already been opened.
if not EWEKA_USER or not EWEKA_PASS:
    raise SystemExit(
        "EWEKA_USER and EWEKA_PASS must be set in the environment or .env file"
    )

print("🔌 Connecting to PostgreSQL...")
conn = get_conn()
# Autocommit: this script never calls commit() itself, so each statement
# has to be persisted as soon as it is executed.
conn.autocommit = True
cur = conn.cursor()
|
||||
|
||||
print("🔌 Connecting to Eweka NNTP...")
try:
    with nntplib.NNTP_SSL(
        host="news.eweka.nl",
        port=563,
        user=EWEKA_USER,
        password=EWEKA_PASS,
        readermode=True,
    ) as nntp:

        # --- GROUP ---
        # Select the group; only the article-number bounds and the group
        # name are needed from the reply.
        _, _, first, last, name = nntp.group(GROUP)
        first = int(first)
        last = int(last)

        print(f"📂 Group: {name}")
        print(f"📐 Range: {first} – {last}")

        # Cover the newest TOTAL_ARTICLES article numbers, clamped so the
        # window never starts before the first number the server reports.
        start_global = max(first, last - TOTAL_ARTICLES + 1)
        print(f"🎯 Target range: {start_global} – {last}")

        for batch_start in range(start_global, last + 1, BATCH_SIZE):
            # Inclusive upper bound of this batch, clamped to the group's end.
            batch_end = min(batch_start + BATCH_SIZE - 1, last)

            print(f"📜 XOVER {batch_start}-{batch_end}")

            try:
                _, overviews = nntp.xover(batch_start, batch_end)
            except Exception as e:
                # Best effort: report the failed batch and move on to the next.
                print(f"⚠️ XOVER failed for {batch_start}-{batch_end}: {e}")
                continue

            rows = []
            for art_num, fields in overviews:
                clean_fields = {k: sanitize(v) for k, v in fields.items()}

                metadata = {
                    "group": GROUP,
                    "article_number": art_num,
                    **clean_fields,
                }

                rows.append((
                    GROUP,
                    art_num,
                    # Read from the sanitized copy so the message_id column
                    # matches the value stored in metadata and cannot carry
                    # un-encodable surrogate code points to the database.
                    clean_fields.get("message-id"),
                    Json(metadata),  # wrap the dict so it is sent as JSON
                    datetime.now(UTC),
                ))

            if rows:
                # Rows whose message_id already exists are skipped by the
                # ON CONFLICT clause.
                cur.executemany(
                    """
                    INSERT INTO articles
                    (newsgroup, article_number, message_id, metadata, fetched_at)
                    VALUES (%s, %s, %s, %s, %s)
                    ON CONFLICT (message_id) DO NOTHING
                    """,
                    rows,
                )
finally:
    # Release the database resources whether ingestion finished or failed.
    cur.close()
    conn.close()

print("🎉 DONE – last 100k articles ingested")
|
||||
Reference in New Issue
Block a user