157 lines
6.0 KiB
Python
157 lines
6.0 KiB
Python
import nntplib
|
||
import os
|
||
import time
|
||
import socket
|
||
from dotenv import load_dotenv
|
||
from datetime import datetime, UTC
|
||
from db import get_conn
|
||
from psycopg.types.json import Json
|
||
|
||
# --- ZVÝŠENÍ LIMITŮ PRO STABILITU ---
|
||
nntplib._MAXLINE = 1048576 # 1MB limit pro dlouhé hlavičky
|
||
socket.setdefaulttimeout(30) # Prevence nekonečného čekání na síť
|
||
|
||
# ================= CONFIG =================
|
||
GROUP = "alt.binaries.e-book.magazines"
|
||
BATCH_SIZE = 5000 # Menší batch pro častější feedback v konzoli
|
||
# =========================================
|
||
|
||
load_dotenv()
|
||
EWEKA_USER = os.getenv("EWEKA_USER")
|
||
EWEKA_PASS = os.getenv("EWEKA_PASS")
|
||
|
||
|
||
def get_current_frontier(cur):
|
||
print(f"🔍 Hledám v databázi hranici spodního bloku pro skupinu {GROUP}...")
|
||
cur.execute("""
|
||
SELECT MAX(article_number)
|
||
FROM articles
|
||
WHERE newsgroup = %s AND article_number < 10000000
|
||
""", (GROUP,))
|
||
res = cur.fetchone()[0]
|
||
val = int(res) if res else None
|
||
print(f"📍 Nalezena hranice: {val if val else 'Žádná (začínám od nuly)'}")
|
||
return val
|
||
|
||
|
||
def is_batch_missing(cur, start, end):
|
||
# Tady print nedáváme, aby to nespamovalo 1000x za sekundu
|
||
cur.execute("""
|
||
SELECT EXISTS (
|
||
SELECT 1 FROM articles
|
||
WHERE newsgroup = %s AND article_number BETWEEN %s AND %s
|
||
)
|
||
""", (GROUP, start, end))
|
||
return not cur.fetchone()[0]
|
||
|
||
|
||
def fast_fill():
|
||
print(f"🚀 Startuji proces: {datetime.now().strftime('%H:%M:%S')}")
|
||
|
||
conn = get_conn()
|
||
conn.autocommit = True
|
||
cur = conn.cursor()
|
||
|
||
current_id = get_current_frontier(cur)
|
||
if not current_id:
|
||
print("⚠️ Varování: Spodní hranice nenalezena, zkusím se zeptat serveru na nejstarší článek.")
|
||
|
||
print(f"🔌 Připojuji se k Eweka NNTP SSL (news.eweka.nl)...")
|
||
try:
|
||
with nntplib.NNTP_SSL("news.eweka.nl", 563, user=EWEKA_USER, password=EWEKA_PASS) as nntp:
|
||
resp, count, first, last, name = nntp.group(GROUP)
|
||
first, last = int(first), int(last)
|
||
|
||
if not current_id:
|
||
current_id = first
|
||
|
||
print(f"📊 INFO O SKUPINĚ:")
|
||
print(f" Název: {name}")
|
||
print(f" Článků na serveru: {count}")
|
||
print(f" Rozsah: {first} až {last}")
|
||
print(f" Aktuální pozice: {current_id}")
|
||
print(f" Zbývá zpracovat cca: {last - current_id} článků")
|
||
print("-" * 50)
|
||
|
||
total_downloaded = 0
|
||
start_process_time = time.time()
|
||
|
||
while current_id < last:
|
||
batch_start = current_id + 1
|
||
batch_end = min(batch_start + BATCH_SIZE - 1, last)
|
||
|
||
# 1. KROK: Kontrola existence
|
||
if not is_batch_missing(cur, batch_start, batch_end):
|
||
# Zkusíme skočit dál
|
||
cur.execute(
|
||
"SELECT MAX(article_number) FROM articles WHERE newsgroup = %s AND article_number BETWEEN %s AND %s",
|
||
(GROUP, batch_start, batch_end + 100000))
|
||
|
||
res = cur.fetchone()[0]
|
||
if res:
|
||
current_id = int(res)
|
||
print(f"⏩ [SKIP] Data existují. Přeskakuji na ID: {current_id}", end="\r")
|
||
else:
|
||
current_id = batch_end # Sychr
|
||
continue
|
||
|
||
# 2. KROK: Stahování (tady dáváme hodně printů)
|
||
print(f"\n💎 [DÍRA] Nalezena mezera: {batch_start} - {batch_end}")
|
||
|
||
# --- SÍŤOVÁ ČÁST ---
|
||
print(f" 📡 NNTP XOVER...", end=" ", flush=True)
|
||
t_net_start = time.time()
|
||
try:
|
||
resp, overviews = nntp.xover(batch_start, batch_end)
|
||
t_net_end = time.time()
|
||
print(f"OK ({len(overviews)} hlaviček za {t_net_end - t_net_start:.2f}s)")
|
||
|
||
rows = []
|
||
for art_num, fields in overviews:
|
||
rows.append((
|
||
GROUP, art_num, fields.get("message-id"),
|
||
Json({"group": GROUP, "article_number": art_num, **fields}),
|
||
datetime.now(UTC)
|
||
))
|
||
|
||
# --- DATABÁZOVÁ ČÁST ---
|
||
if rows:
|
||
print(f" 💾 POSTGRES INSERT...", end=" ", flush=True)
|
||
t_db_start = time.time()
|
||
cur.executemany(
|
||
"""
|
||
INSERT INTO articles (newsgroup, article_number, message_id, metadata, fetched_at)
|
||
VALUES (%s, %s, %s, %s, %s) ON CONFLICT DO NOTHING
|
||
""", rows
|
||
)
|
||
t_db_end = time.time()
|
||
total_downloaded += len(rows)
|
||
print(f"OK ({t_db_end - t_db_start:.2f}s)")
|
||
else:
|
||
print(f" ℹ️ Žádná data k uložení (prázdný rozsah na serveru)")
|
||
|
||
current_id = batch_end
|
||
|
||
except Exception as e:
|
||
print(f"\n❌ CHYBA v bloku {batch_start}-{batch_end}: {e}")
|
||
print(" Zkouším popojít o jeden BATCH dál...")
|
||
current_id += BATCH_SIZE
|
||
time.sleep(2) # Krátká pauza při chybě
|
||
|
||
except Exception as e:
|
||
print(f"\n💥 KRITICKÁ CHYBA PŘIPOJENÍ: {e}")
|
||
finally:
|
||
total_time = time.time() - start_process_time
|
||
print("\n" + "=" * 50)
|
||
print(f"🏁 KONEC RELACE")
|
||
print(f"⏱️ Celkový čas: {total_time:.1f} sekund")
|
||
print(f"📦 Celkem staženo nových hlaviček: {total_downloaded}")
|
||
if total_time > 0:
|
||
print(f"🚀 Průměrná rychlost: {total_downloaded / total_time:.1f} hlaviček/s")
|
||
print("=" * 50)
|
||
cur.close()
|
||
conn.close()
|
||
|
||
|
||
if __name__ == "__main__":
|
||
fast_fill() |