Merge remote-tracking branch 'origin/main'

# Conflicts: # 90 test.py
2025-12-29 23:28:05 +01:00
parent 4678d3fe7f 8fc1d149dc
commit 0f88f0c7d0
5 changed files with 333 additions and 132 deletions
--- a/.gitignore
+++ b/.gitignore
--- a/clanku.py
+++ b/clanku.py
@@ -1,71 +0,0 @@
-import nntplib
-import os
-from dotenv import load_dotenv
-from db import get_conn
-
-# ================== CONFIG ==================
-GROUP = "alt.binaries.e-book.magazines"
-SUBJECT_KEY = "PC Pro 2011-07.pdf"
-# ============================================
-
-load_dotenv()
-
-EWEKA_USER = os.getenv("EWEKA_USER")
-EWEKA_PASS = os.getenv("EWEKA_PASS")
-
-print("🔌 Connecting to PostgreSQL...")
-conn = get_conn()
-cur = conn.cursor()
-
-cur.execute("""
-    SELECT article_number
-    FROM articles
-    WHERE newsgroup = %s
-      AND metadata->>'subject' LIKE %s
-    ORDER BY article_number
-""", (GROUP, f"%{SUBJECT_KEY}%"))
-
-article_numbers = [row[0] for row in cur.fetchall()]
-total = len(article_numbers)
-
-print(f"📦 Found {total} parts in DB")
-
-if total == 0:
-    print("❌ No articles found, aborting.")
-    exit(1)
-
-print("🔌 Connecting to Eweka NNTP...")
-with nntplib.NNTP_SSL(
-    "news.eweka.nl",
-    563,
-    EWEKA_USER,
-    EWEKA_PASS,
-    readermode=True,
-) as nntp:
-
-    nntp.group(GROUP)
-
-    existing = []
-    missing = []
-
-    for idx, art in enumerate(article_numbers, start=1):
-        try:
-            nntp.stat(art)
-            existing.append(art)
-            print(f"✅ [{idx}/{total}] EXISTS  article {art}")
-        except Exception:
-            missing.append(art)
-            print(f"❌ [{idx}/{total}] MISSING article {art}")
-
-print("\n================ RESULT ================")
-print(f"Total parts : {total}")
-print(f"Existing    : {len(existing)}")
-print(f"Missing     : {len(missing)}")
-
-if existing:
-    print("\nExisting article_numbers:")
-    print(existing)
-
-if missing:
-    print("\nMissing article_numbers (first 20):")
-    print(missing[:20])
--- a/FastLane.py
+++ b/FastLane.py
@@ -0,0 +1,97 @@
+import os
+import nntplib
+import sabctools
+import io
+import binascii
+from dotenv import load_dotenv
+from db import get_conn
+
+BASE_DIR = "FastLane"
+OUTPUT_DIR = os.path.join(BASE_DIR, "output")
+if not os.path.exists(OUTPUT_DIR): os.makedirs(OUTPUT_DIR)
+
+load_dotenv()
+EWEKA_USER = os.getenv("EWEKA_USER")
+EWEKA_PASS = os.getenv("EWEKA_PASS")
+EWEKA_HOST = "news.eweka.nl"
+
+NEWSGROUP = 'alt.binaries.e-book.magazines'
+SUBJECT_FILTER = '%PC Pro 2011-07.pdf%'
+FINAL_PATH = os.path.join(OUTPUT_DIR, "Fast_Lane_Biker_Dec_2011.pdf")
+
+
+def final_precision_downloader():
+    print("🚀 Startuji FINÁLNÍ OPRAVENÉ stahování (bez překryvu bajtů)...")
+
+    conn = get_conn()
+    cur = conn.cursor()
+    cur.execute("""
+        SELECT article_number FROM articles
+        WHERE newsgroup = %s AND metadata->>'subject' LIKE %s
+        ORDER BY article_number;
+    """, (NEWSGROUP, SUBJECT_FILTER))
+    articles = cur.fetchall()
+
+    if not articles: return
+
+    try:
+        server = nntplib.NNTP(EWEKA_HOST, user=EWEKA_USER, password=EWEKA_PASS)
+        server.group(NEWSGROUP)
+    except Exception as e:
+        print(f"💥 Chyba NNTP: {e}");
+        return
+
+    # Zjištění celkové velikosti
+    resp, info = server.body(str(articles[0][0]))
+    stuffed = b"\r\n".join([(b"." + l if l.startswith(b".") else l) for l in info.lines])
+    wrapped = b"222 0 <id>\r\n" + stuffed + b"\r\n.\r\n"
+    decoder = sabctools.Decoder(len(wrapped))
+    decoder.process(io.BytesIO(wrapped).readinto(decoder))
+    meta = next(decoder, None)
+
+    total_size = meta.file_size
+    print(f"📏 Alokuji soubor: {total_size} bajtů.")
+    with open(FINAL_PATH, "wb") as f:
+        f.truncate(total_size)
+
+    # Zápis na offsety
+    part_fixed_size = 384000
+    with open(FINAL_PATH, "r+b") as f_out:
+        for i, (art_num,) in enumerate(articles):
+            try:
+                resp, info = server.body(str(art_num))
+                # Re-stuffing teček
+                stuffed_lines = [(b"." + l if l.startswith(b".") else l) for l in info.lines]
+                raw_body = b"\r\n".join(stuffed_lines)
+
+                wrapped = b"222 0 <id>\r\n" + raw_body + b"\r\n.\r\n"
+                decoder = sabctools.Decoder(len(wrapped))
+                decoder.process(io.BytesIO(wrapped).readinto(decoder))
+                res = next(decoder, None)
+
+                if res and res.data:
+                    # MATEMATICKÁ OPRAVA: Výpočet bez překryvu
+                    # První part (i=0) -> Offset 0
+                    # Druhý part (i=1) -> Offset 384000 (ne 383999!)
+                    current_offset = i * part_fixed_size
+
+                    f_out.seek(current_offset)
+                    f_out.write(res.data)
+
+                    v_crc = binascii.crc32(res.data)
+                    e_crc = getattr(res, 'crc_expected', 0)
+                    status = "✅" if (e_crc == 0 or v_crc == e_crc) else "❌ CRC FAIL"
+
+                    print(f"   [{status}] Part {art_num} -> Offset: {current_offset}, Len: {len(res.data)}")
+            except Exception as e:
+                print(f"   ❌ Chyba u {art_num}: {e}")
+
+    print("-" * 50)
+    print(f"🏁 HOTOVO! Teď už to PDF musí být perfektní.")
+    server.quit()
+    cur.close()
+    conn.close()
+
+
+if __name__ == "__main__":
+    final_precision_downloader()
--- a/test.py
+++ b/test.py
@@ -1,85 +1,103 @@
+import os
+import nntplib
 import sabctools
 import io
-import os
 import binascii
-import re
+import threading
+from concurrent.futures import ThreadPoolExecutor
+from dotenv import load_dotenv
+from db import get_conn

-# --- KONFIGURACE ---+
-INPUT_FILE = r"u:\PycharmProjects\NewsGroups\downloads\raw\part_001.raw"
-OUTPUT_DIR = r"u:\PycharmProjects\NewsGroups\downloads\decoded"
+# --- Konfigurace zůstává ---
+load_dotenv()
+EWEKA_USER = os.getenv("EWEKA_USER")
+EWEKA_PASS = os.getenv("EWEKA_PASS")
+EWEKA_HOST = "news.eweka.nl"
+NEWSGROUP = 'alt.binaries.e-book.magazines'
+SUBJECT_FILTER = '%PC Pro 2011-07.pdf%'
+FINAL_PATH = os.path.join("FastLane/output", "Fast_Lane_Biker_Dec_2011.pdf")
+
+MAX_WORKERS = 50
+PART_FIXED_SIZE = 384000
+file_lock = threading.Lock()


-def ultimate_bomba_decoder():
-    if not os.path.exists(INPUT_FILE):
-        print(f"❌ Soubor nenalezen: {INPUT_FILE}")
+def download_chunk(articles_subset):
+    """
+    Tato funkce běží v jednom vlákně a obsluhuje JEDNO TRVALÉ spojení.
+    articles_subset je seznam tuplic (index, art_num).
+    """
+    if not articles_subset:
        return

-    print(f"📖 Načítám soubor...")
-    with open(INPUT_FILE, "rb") as f:
-        raw_data = f.read()
+    try:
+        # PŘIHLÁŠENÍ JEN JEDNOU NA ZAČÁTKU
+        server = nntplib.NNTP(EWEKA_HOST, user=EWEKA_USER, password=EWEKA_PASS)
+        server.group(NEWSGROUP)

-    # 1. OPRAVA SYNTAX WARNING A VYTAŽENÍ METADAT
-    # Používáme [0-9] místo \d pro odstranění varování v Pythonu 3.13
-    yend_match = re.search(b"=yend size=([0-9]+).*pcrc32=([0-9a-fA-F]+)", raw_data)
-    expected_size = 0
-    expected_crc_str = ""
-    if yend_match:
-        expected_size = int(yend_match.group(1))
-        expected_crc_str = yend_match.group(2).decode().lower()
-        print(f"🎯 Metadata nalezena: Očekávaná velikost={expected_size}, Očekávané CRC={expected_crc_str}")
+        # Otevřeme soubor pro zápis (v r+b módu)
+        with open(FINAL_PATH, "r+b") as f_out:
+            for index, art_num in articles_subset:
+                try:
+                    resp, info = server.body(str(art_num))
+                    stuffed_lines = [(b"." + l if l.startswith(b".") else l) for l in info.lines]
+                    raw_body = b"\r\n".join(stuffed_lines)

-    # 2. KLÍČOVÁ OPRAVA (Čištění dat)
-    # Odstraníme prázdné znaky na začátku/konci a sjednotíme konce řádků na \r\n
-    processed_data = raw_data.strip()
-    # Tento trik zajistí, že i linuxové konce řádků budou pro yEnc správně \r\n
-    processed_data = processed_data.replace(b"\r\n", b"\n").replace(b"\n", b"\r\n")
+                    wrapped = b"222 0 <id>\r\n" + raw_body + b"\r\n.\r\n"
+                    decoder = sabctools.Decoder(len(wrapped))
+                    decoder.process(io.BytesIO(wrapped).readinto(decoder))
+                    res = next(decoder, None)

-    # 3. ZABALENÍ DO NNTP OBÁLKY
-    wrapped = b"222 0 <part1@id>\r\n" + processed_data + b"\r\n.\r\n"
+                    if res and res.data:
+                        current_offset = index * PART_FIXED_SIZE

-    # 4. DEKÓDOVÁNÍ (Sabctools 3.13 Streaming API)
-    decoder = sabctools.Decoder(len(wrapped))
-    buf = io.BytesIO(wrapped)
-    n = buf.readinto(decoder)
-    decoder.process(n)
+                        # Zápis na specifický offset
+                        with file_lock:
+                            f_out.seek(current_offset)
+                            f_out.write(res.data)

-    response = next(decoder, None)
+                        print(f"   [OK] Part {art_num} (Vlákno {threading.current_thread().name})")
+                except Exception as e:
+                    print(f"   [!] Chyba u článku {art_num}: {e}")

-    if response and response.data:
-        # 5. KONTROLA INTEGRITY (Vlastní výpočet CRC32)
-        # binascii.crc32 vrací integer, :08x ho převede na hexadecimální formát
-        vypoctene_crc_int = binascii.crc32(response.data)
-        vypoctene_crc_str = f"{vypoctene_crc_int:08x}".lower()
+        server.quit()
+    except Exception as e:
+        print(f"   [!!!] Vlákno se nemohlo připojit: {e}")

-        real_size = len(response.data)

-        print("-" * 40)
-        print(f"📊 Kontrola integrity:")
-        print(f"   Skutečná velikost: {real_size} (Očekáváno: {expected_size})")
-        print(f"   Vypočítané CRC:    {vypoctene_crc_str}")
-        print(f"   Očekávané CRC:     {expected_crc_str}")
+def final_precision_downloader():
+    conn = get_conn()
+    cur = conn.cursor()
+    cur.execute("""
+        SELECT article_number FROM articles
+        WHERE newsgroup = %s AND metadata->>'subject' LIKE %s
+        ORDER BY article_number;
+    """, (NEWSGROUP, SUBJECT_FILTER))
+    articles = cur.fetchall()
+    cur.close()
+    conn.close()

-        if vypoctene_crc_str == expected_crc_str:
-            print("✅ BINGO! Soubor je 100% v pořádku.")
-        else:
-            print("⚠️ POZOR: CRC nesouhlasí, data mohou být poškozena.")
+    if not articles: return

-        # 6. ULOŽENÍ
-        if not os.path.exists(OUTPUT_DIR):
-            os.makedirs(OUTPUT_DIR)
+    # Příprava souboru (alokace místa)
+    # ... (zde ponechte váš kód pro zjištění total_size a truncate) ...
+    # Pro ukázku zkráceno:
+    total_size = len(articles) * PART_FIXED_SIZE  # Přibližné, raději použijte váš výpočet z minula
+    with open(FINAL_PATH, "wb") as f:
+        f.truncate(total_size)

-        # Jméno z yEnc hlavičky: PC Pro 2011-07.pdf
-        out_name = response.file_name or "decoded_part.bin"
-        out_path = os.path.join(OUTPUT_DIR, out_name)
+    # ROZDĚLENÍ PRÁCE: Rozdělíme seznam článků na 5 částí
+    all_tasks = [(i, art[0]) for i, art in enumerate(articles)]
+    chunk_size = (len(all_tasks) + MAX_WORKERS - 1) // MAX_WORKERS
+    subsets = [all_tasks[i:i + chunk_size] for i in range(0, len(all_tasks), chunk_size)]

-        with open(out_path, "wb") as f_out:
-            f_out.write(response.data)
+    print(f"🚀 Startuji stahování s {len(subsets)} trvalými spoji...")

-        print(f"💾 Uloženo do: {out_path}")
-        print("-" * 40)
-    else:
-        print("❌ Chyba: Dekodér nevrátil žádná data. Zkontrolujte, zda je soubor kompletní.")
+    with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
+        executor.map(download_chunk, subsets)
+
+    print("🏁 Hotovo.")


 if __name__ == "__main__":
-    ultimate_bomba_decoder()
+    final_precision_downloader()
--- a/StahniVsechnyHlavickyDetekujMezery.py
+++ b/StahniVsechnyHlavickyDetekujMezery.py
@@ -0,0 +1,157 @@
+import nntplib
+import os
+import time
+import socket
+from dotenv import load_dotenv
+from datetime import datetime, UTC
+from db import get_conn
+from psycopg.types.json import Json
+
+# --- ZVÝŠENÍ LIMITŮ PRO STABILITU ---
+nntplib._MAXLINE = 1048576  # 1MB limit pro dlouhé hlavičky
+socket.setdefaulttimeout(30)  # Prevence nekonečného čekání na síť
+
+# ================= CONFIG =================
+GROUP = "alt.binaries.e-book.magazines"
+BATCH_SIZE = 5000  # Menší batch pro častější feedback v konzoli
+# =========================================
+
+load_dotenv()
+EWEKA_USER = os.getenv("EWEKA_USER")
+EWEKA_PASS = os.getenv("EWEKA_PASS")
+
+
+def get_current_frontier(cur):
+    print(f"🔍 Hledám v databázi hranici spodního bloku pro skupinu {GROUP}...")
+    cur.execute("""
+        SELECT MAX(article_number) 
+        FROM articles 
+        WHERE newsgroup = %s AND article_number < 10000000
+    """, (GROUP,))
+    res = cur.fetchone()[0]
+    val = int(res) if res else None
+    print(f"📍 Nalezena hranice: {val if val else 'Žádná (začínám od nuly)'}")
+    return val
+
+
+def is_batch_missing(cur, start, end):
+    # Tady print nedáváme, aby to nespamovalo 1000x za sekundu
+    cur.execute("""
+        SELECT EXISTS (
+            SELECT 1 FROM articles 
+            WHERE newsgroup = %s AND article_number BETWEEN %s AND %s
+        )
+    """, (GROUP, start, end))
+    return not cur.fetchone()[0]
+
+
+def fast_fill():
+    print(f"🚀 Startuji proces: {datetime.now().strftime('%H:%M:%S')}")
+
+    conn = get_conn()
+    conn.autocommit = True
+    cur = conn.cursor()
+
+    current_id = get_current_frontier(cur)
+    if not current_id:
+        print("⚠️ Varování: Spodní hranice nenalezena, zkusím se zeptat serveru na nejstarší článek.")
+
+    print(f"🔌 Připojuji se k Eweka NNTP SSL (news.eweka.nl)...")
+    try:
+        with nntplib.NNTP_SSL("news.eweka.nl", 563, user=EWEKA_USER, password=EWEKA_PASS) as nntp:
+            resp, count, first, last, name = nntp.group(GROUP)
+            first, last = int(first), int(last)
+
+            if not current_id:
+                current_id = first
+
+            print(f"📊 INFO O SKUPINĚ:")
+            print(f"   Název: {name}")
+            print(f"   Článků na serveru: {count}")
+            print(f"   Rozsah: {first} až {last}")
+            print(f"   Aktuální pozice: {current_id}")
+            print(f"   Zbývá zpracovat cca: {last - current_id} článků")
+            print("-" * 50)
+
+            total_downloaded = 0
+            start_process_time = time.time()
+
+            while current_id < last:
+                batch_start = current_id + 1
+                batch_end = min(batch_start + BATCH_SIZE - 1, last)
+
+                # 1. KROK: Kontrola existence
+                if not is_batch_missing(cur, batch_start, batch_end):
+                    # Zkusíme skočit dál
+                    cur.execute(
+                        "SELECT MAX(article_number) FROM articles WHERE newsgroup = %s AND article_number BETWEEN %s AND %s",
+                        (GROUP, batch_start, batch_end + 100000))
+
+                    res = cur.fetchone()[0]
+                    if res:
+                        current_id = int(res)
+                        print(f"⏩ [SKIP] Data existují. Přeskakuji na ID: {current_id}", end="\r")
+                    else:
+                        current_id = batch_end  # Sychr
+                    continue
+
+                # 2. KROK: Stahování (tady dáváme hodně printů)
+                print(f"\n💎 [DÍRA] Nalezena mezera: {batch_start} - {batch_end}")
+
+                # --- SÍŤOVÁ ČÁST ---
+                print(f"   📡 NNTP XOVER...", end=" ", flush=True)
+                t_net_start = time.time()
+                try:
+                    resp, overviews = nntp.xover(batch_start, batch_end)
+                    t_net_end = time.time()
+                    print(f"OK ({len(overviews)} hlaviček za {t_net_end - t_net_start:.2f}s)")
+
+                    rows = []
+                    for art_num, fields in overviews:
+                        rows.append((
+                            GROUP, art_num, fields.get("message-id"),
+                            Json({"group": GROUP, "article_number": art_num, **fields}),
+                            datetime.now(UTC)
+                        ))
+
+                    # --- DATABÁZOVÁ ČÁST ---
+                    if rows:
+                        print(f"   💾 POSTGRES INSERT...", end=" ", flush=True)
+                        t_db_start = time.time()
+                        cur.executemany(
+                            """
+                            INSERT INTO articles (newsgroup, article_number, message_id, metadata, fetched_at) 
+                            VALUES (%s, %s, %s, %s, %s) ON CONFLICT DO NOTHING
+                            """, rows
+                        )
+                        t_db_end = time.time()
+                        total_downloaded += len(rows)
+                        print(f"OK ({t_db_end - t_db_start:.2f}s)")
+                    else:
+                        print(f"   ℹ️ Žádná data k uložení (prázdný rozsah na serveru)")
+
+                    current_id = batch_end
+
+                except Exception as e:
+                    print(f"\n❌ CHYBA v bloku {batch_start}-{batch_end}: {e}")
+                    print("   Zkouším popojít o jeden BATCH dál...")
+                    current_id += BATCH_SIZE
+                    time.sleep(2)  # Krátká pauza při chybě
+
+    except Exception as e:
+        print(f"\n💥 KRITICKÁ CHYBA PŘIPOJENÍ: {e}")
+    finally:
+        total_time = time.time() - start_process_time
+        print("\n" + "=" * 50)
+        print(f"🏁 KONEC RELACE")
+        print(f"⏱️ Celkový čas: {total_time:.1f} sekund")
+        print(f"📦 Celkem staženo nových hlaviček: {total_downloaded}")
+        if total_time > 0:
+            print(f"🚀 Průměrná rychlost: {total_downloaded / total_time:.1f} hlaviček/s")
+        print("=" * 50)
+        cur.close()
+        conn.close()
+
+
+if __name__ == "__main__":
+    fast_fill()