From 1124a1fbe1cb84ee783d81df91fc0116e7828e63 Mon Sep 17 00:00:00 2001 From: Vladimir Buzalka Date: Mon, 29 Dec 2025 22:51:13 +0100 Subject: [PATCH] z230 --- .gitignore | Bin 98 -> 118 bytes 22 stat test posledniho clanku.py | 184 ++++++++++++++++++++++-------- 27 test FastLane.py | 97 ++++++++++++++++ 90 test.py | 140 +++++++++++++---------- 4 files changed, 311 insertions(+), 110 deletions(-) create mode 100644 27 test FastLane.py diff --git a/.gitignore b/.gitignore index 1cb94063dac3def018c0f7f5d4a2fe1753cc76bf..9b14ff17c144409c22cff05487c92f4f762b51d9 100644 GIT binary patch delta 33 ncmYc)llcEHg(07zoFR`P2S_I}q%agS=reFJOk|2-;$i>*tS<-c delta 13 UcmXR*V*3AYqC^Y}F9R0?03}fa_y7O^ diff --git a/22 stat test posledniho clanku.py b/22 stat test posledniho clanku.py index 89eabe2..9c18a04 100644 --- a/22 stat test posledniho clanku.py +++ b/22 stat test posledniho clanku.py @@ -1,71 +1,157 @@ import nntplib import os +import time +import socket from dotenv import load_dotenv +from datetime import datetime, UTC from db import get_conn +from psycopg.types.json import Json -# ================== CONFIG ================== +# --- ZVÝŠENÍ LIMITŮ PRO STABILITU --- +nntplib._MAXLINE = 1048576 # 1MB limit pro dlouhé hlavičky +socket.setdefaulttimeout(30) # Prevence nekonečného čekání na síť + +# ================= CONFIG ================= GROUP = "alt.binaries.e-book.magazines" -SUBJECT_KEY = "PC Pro 2011-07.pdf" -# ============================================ +BATCH_SIZE = 5000 # Menší batch pro častější feedback v konzoli +# ========================================= load_dotenv() - EWEKA_USER = os.getenv("EWEKA_USER") EWEKA_PASS = os.getenv("EWEKA_PASS") -print("🔌 Connecting to PostgreSQL...") -conn = get_conn() -cur = conn.cursor() -cur.execute(""" - SELECT article_number - FROM articles - WHERE newsgroup = %s - AND metadata->>'subject' LIKE %s - ORDER BY article_number -""", (GROUP, f"%{SUBJECT_KEY}%")) +def get_current_frontier(cur): + print(f"🔍 Hledám v databázi hranici spodního bloku pro skupinu {GROUP}...") + cur.execute(""" + SELECT MAX(article_number) + FROM articles + WHERE newsgroup = %s AND article_number < 10000000 + """, (GROUP,)) + res = cur.fetchone()[0] + val = int(res) if res else None + print(f"📍 Nalezena hranice: {val if val else 'Žádná (začínám od nuly)'}") + return val -article_numbers = [row[0] for row in cur.fetchall()] -total = len(article_numbers) -print(f"📦 Found {total} parts in DB") +def is_batch_missing(cur, start, end): + # Tady print nedáváme, aby to nespamovalo 1000x za sekundu + cur.execute(""" + SELECT EXISTS ( + SELECT 1 FROM articles + WHERE newsgroup = %s AND article_number BETWEEN %s AND %s + ) + """, (GROUP, start, end)) + return not cur.fetchone()[0] -if total == 0: - print("❌ No articles found, aborting.") - exit(1) -print("🔌 Connecting to Eweka NNTP...") -with nntplib.NNTP_SSL( - "news.eweka.nl", - 563, - EWEKA_USER, - EWEKA_PASS, - readermode=True, -) as nntp: +def fast_fill(): + print(f"🚀 Startuji proces: {datetime.now().strftime('%H:%M:%S')}") - nntp.group(GROUP) + conn = get_conn() + conn.autocommit = True + cur = conn.cursor() - existing = [] - missing = [] + current_id = get_current_frontier(cur) + if not current_id: + print("⚠️ Varování: Spodní hranice nenalezena, zkusím se zeptat serveru na nejstarší článek.") - for idx, art in enumerate(article_numbers, start=1): - try: - nntp.stat(art) - existing.append(art) - print(f"✅ [{idx}/{total}] EXISTS article {art}") - except Exception: - missing.append(art) - print(f"❌ [{idx}/{total}] MISSING article {art}") + print(f"🔌 Připojuji se k Eweka NNTP SSL (news.eweka.nl)...") + try: + with nntplib.NNTP_SSL("news.eweka.nl", 563, user=EWEKA_USER, password=EWEKA_PASS) as nntp: + resp, count, first, last, name = nntp.group(GROUP) + first, last = int(first), int(last) -print("\n================ RESULT ================") -print(f"Total parts : {total}") -print(f"Existing : {len(existing)}") -print(f"Missing : {len(missing)}") + if not current_id: + current_id = first -if existing: - print("\nExisting article_numbers:") - print(existing) + print(f"📊 INFO O SKUPINĚ:") + print(f" Název: {name}") + print(f" Článků na serveru: {count}") + print(f" Rozsah: {first} až {last}") + print(f" Aktuální pozice: {current_id}") + print(f" Zbývá zpracovat cca: {last - current_id} článků") + print("-" * 50) -if missing: - print("\nMissing article_numbers (first 20):") - print(missing[:20]) + total_downloaded = 0 + start_process_time = time.time() + + while current_id < last: + batch_start = current_id + 1 + batch_end = min(batch_start + BATCH_SIZE - 1, last) + + # 1. KROK: Kontrola existence + if not is_batch_missing(cur, batch_start, batch_end): + # Zkusíme skočit dál + cur.execute( + "SELECT MAX(article_number) FROM articles WHERE newsgroup = %s AND article_number BETWEEN %s AND %s", + (GROUP, batch_start, batch_end + 100000)) + + res = cur.fetchone()[0] + if res: + current_id = int(res) + print(f"⏩ [SKIP] Data existují. Přeskakuji na ID: {current_id}", end="\r") + else: + current_id = batch_end # Sychr + continue + + # 2. KROK: Stahování (tady dáváme hodně printů) + print(f"\n💎 [DÍRA] Nalezena mezera: {batch_start} - {batch_end}") + + # --- SÍŤOVÁ ČÁST --- + print(f" 📡 NNTP XOVER...", end=" ", flush=True) + t_net_start = time.time() + try: + resp, overviews = nntp.xover(batch_start, batch_end) + t_net_end = time.time() + print(f"OK ({len(overviews)} hlaviček za {t_net_end - t_net_start:.2f}s)") + + rows = [] + for art_num, fields in overviews: + rows.append(( + GROUP, art_num, fields.get("message-id"), + Json({"group": GROUP, "article_number": art_num, **fields}), + datetime.now(UTC) + )) + + # --- DATABÁZOVÁ ČÁST --- + if rows: + print(f" 💾 POSTGRES INSERT...", end=" ", flush=True) + t_db_start = time.time() + cur.executemany( + """ + INSERT INTO articles (newsgroup, article_number, message_id, metadata, fetched_at) + VALUES (%s, %s, %s, %s, %s) ON CONFLICT DO NOTHING + """, rows + ) + t_db_end = time.time() + total_downloaded += len(rows) + print(f"OK ({t_db_end - t_db_start:.2f}s)") + else: + print(f" ℹ️ Žádná data k uložení (prázdný rozsah na serveru)") + + current_id = batch_end + + except Exception as e: + print(f"\n❌ CHYBA v bloku {batch_start}-{batch_end}: {e}") + print(" Zkouším popojít o jeden BATCH dál...") + current_id += BATCH_SIZE + time.sleep(2) # Krátká pauza při chybě + + except Exception as e: + print(f"\n💥 KRITICKÁ CHYBA PŘIPOJENÍ: {e}") + finally: + total_time = time.time() - start_process_time + print("\n" + "=" * 50) + print(f"🏁 KONEC RELACE") + print(f"⏱️ Celkový čas: {total_time:.1f} sekund") + print(f"📦 Celkem staženo nových hlaviček: {total_downloaded}") + if total_time > 0: + print(f"🚀 Průměrná rychlost: {total_downloaded / total_time:.1f} hlaviček/s") + print("=" * 50) + cur.close() + conn.close() + + +if __name__ == "__main__": + fast_fill() \ No newline at end of file diff --git a/27 test FastLane.py b/27 test FastLane.py new file mode 100644 index 0000000..c1f016e --- /dev/null +++ b/27 test FastLane.py @@ -0,0 +1,97 @@ +import os +import nntplib +import sabctools +import io +import binascii +from dotenv import load_dotenv +from db import get_conn + +BASE_DIR = "FastLane" +OUTPUT_DIR = os.path.join(BASE_DIR, "output") +if not os.path.exists(OUTPUT_DIR): os.makedirs(OUTPUT_DIR) + +load_dotenv() +EWEKA_USER = os.getenv("EWEKA_USER") +EWEKA_PASS = os.getenv("EWEKA_PASS") +EWEKA_HOST = "news.eweka.nl" + +NEWSGROUP = 'alt.binaries.e-book.magazines' +SUBJECT_FILTER = '%PC Pro 2011-07.pdf%' +FINAL_PATH = os.path.join(OUTPUT_DIR, "Fast_Lane_Biker_Dec_2011.pdf") + + +def final_precision_downloader(): + print("🚀 Startuji FINÁLNÍ OPRAVENÉ stahování (bez překryvu bajtů)...") + + conn = get_conn() + cur = conn.cursor() + cur.execute(""" + SELECT article_number FROM articles + WHERE newsgroup = %s AND metadata->>'subject' LIKE %s + ORDER BY article_number; + """, (NEWSGROUP, SUBJECT_FILTER)) + articles = cur.fetchall() + + if not articles: return + + try: + server = nntplib.NNTP(EWEKA_HOST, user=EWEKA_USER, password=EWEKA_PASS) + server.group(NEWSGROUP) + except Exception as e: + print(f"💥 Chyba NNTP: {e}"); + return + + # Zjištění celkové velikosti + resp, info = server.body(str(articles[0][0])) + stuffed = b"\r\n".join([(b"." + l if l.startswith(b".") else l) for l in info.lines]) + wrapped = b"222 0 \r\n" + stuffed + b"\r\n.\r\n" + decoder = sabctools.Decoder(len(wrapped)) + decoder.process(io.BytesIO(wrapped).readinto(decoder)) + meta = next(decoder, None) + + total_size = meta.file_size + print(f"📏 Alokuji soubor: {total_size} bajtů.") + with open(FINAL_PATH, "wb") as f: + f.truncate(total_size) + + # Zápis na offsety + part_fixed_size = 384000 + with open(FINAL_PATH, "r+b") as f_out: + for i, (art_num,) in enumerate(articles): + try: + resp, info = server.body(str(art_num)) + # Re-stuffing teček + stuffed_lines = [(b"." + l if l.startswith(b".") else l) for l in info.lines] + raw_body = b"\r\n".join(stuffed_lines) + + wrapped = b"222 0 \r\n" + raw_body + b"\r\n.\r\n" + decoder = sabctools.Decoder(len(wrapped)) + decoder.process(io.BytesIO(wrapped).readinto(decoder)) + res = next(decoder, None) + + if res and res.data: + # MATEMATICKÁ OPRAVA: Výpočet bez překryvu + # První part (i=0) -> Offset 0 + # Druhý part (i=1) -> Offset 384000 (ne 383999!) + current_offset = i * part_fixed_size + + f_out.seek(current_offset) + f_out.write(res.data) + + v_crc = binascii.crc32(res.data) + e_crc = getattr(res, 'crc_expected', 0) + status = "✅" if (e_crc == 0 or v_crc == e_crc) else "❌ CRC FAIL" + + print(f" [{status}] Part {art_num} -> Offset: {current_offset}, Len: {len(res.data)}") + except Exception as e: + print(f" ❌ Chyba u {art_num}: {e}") + + print("-" * 50) + print(f"🏁 HOTOVO! Teď už to PDF musí být perfektní.") + server.quit() + cur.close() + conn.close() + + +if __name__ == "__main__": + final_precision_downloader() \ No newline at end of file diff --git a/90 test.py b/90 test.py index 116d95d..7420db2 100644 --- a/90 test.py +++ b/90 test.py @@ -1,85 +1,103 @@ +import os +import nntplib import sabctools import io -import os import binascii -import re +import threading +from concurrent.futures import ThreadPoolExecutor +from dotenv import load_dotenv +from db import get_conn -# --- KONFIGURACE --- -INPUT_FILE = r"u:\PycharmProjects\NewsGroups\downloads\raw\part_001.raw" -OUTPUT_DIR = r"u:\PycharmProjects\NewsGroups\downloads\decoded" +# --- Konfigurace zůstává --- +load_dotenv() +EWEKA_USER = os.getenv("EWEKA_USER") +EWEKA_PASS = os.getenv("EWEKA_PASS") +EWEKA_HOST = "news.eweka.nl" +NEWSGROUP = 'alt.binaries.e-book.magazines' +SUBJECT_FILTER = '%PC Pro 2011-07.pdf%' +FINAL_PATH = os.path.join("FastLane/output", "Fast_Lane_Biker_Dec_2011.pdf") + +MAX_WORKERS = 50 +PART_FIXED_SIZE = 384000 +file_lock = threading.Lock() -def ultimate_bomba_decoder(): - if not os.path.exists(INPUT_FILE): - print(f"❌ Soubor nenalezen: {INPUT_FILE}") +def download_chunk(articles_subset): + """ + Tato funkce běží v jednom vlákně a obsluhuje JEDNO TRVALÉ spojení. + articles_subset je seznam tuplic (index, art_num). + """ + if not articles_subset: return - print(f"📖 Načítám soubor...") - with open(INPUT_FILE, "rb") as f: - raw_data = f.read() + try: + # PŘIHLÁŠENÍ JEN JEDNOU NA ZAČÁTKU + server = nntplib.NNTP(EWEKA_HOST, user=EWEKA_USER, password=EWEKA_PASS) + server.group(NEWSGROUP) - # 1. OPRAVA SYNTAX WARNING A VYTAŽENÍ METADAT - # Používáme [0-9] místo \d pro odstranění varování v Pythonu 3.13 - yend_match = re.search(b"=yend size=([0-9]+).*pcrc32=([0-9a-fA-F]+)", raw_data) - expected_size = 0 - expected_crc_str = "" - if yend_match: - expected_size = int(yend_match.group(1)) - expected_crc_str = yend_match.group(2).decode().lower() - print(f"🎯 Metadata nalezena: Očekávaná velikost={expected_size}, Očekávané CRC={expected_crc_str}") + # Otevřeme soubor pro zápis (v r+b módu) + with open(FINAL_PATH, "r+b") as f_out: + for index, art_num in articles_subset: + try: + resp, info = server.body(str(art_num)) + stuffed_lines = [(b"." + l if l.startswith(b".") else l) for l in info.lines] + raw_body = b"\r\n".join(stuffed_lines) - # 2. KLÍČOVÁ OPRAVA (Čištění dat) - # Odstraníme prázdné znaky na začátku/konci a sjednotíme konce řádků na \r\n - processed_data = raw_data.strip() - # Tento trik zajistí, že i linuxové konce řádků budou pro yEnc správně \r\n - processed_data = processed_data.replace(b"\r\n", b"\n").replace(b"\n", b"\r\n") + wrapped = b"222 0 \r\n" + raw_body + b"\r\n.\r\n" + decoder = sabctools.Decoder(len(wrapped)) + decoder.process(io.BytesIO(wrapped).readinto(decoder)) + res = next(decoder, None) - # 3. ZABALENÍ DO NNTP OBÁLKY - wrapped = b"222 0 \r\n" + processed_data + b"\r\n.\r\n" + if res and res.data: + current_offset = index * PART_FIXED_SIZE - # 4. DEKÓDOVÁNÍ (Sabctools 3.13 Streaming API) - decoder = sabctools.Decoder(len(wrapped)) - buf = io.BytesIO(wrapped) - n = buf.readinto(decoder) - decoder.process(n) + # Zápis na specifický offset + with file_lock: + f_out.seek(current_offset) + f_out.write(res.data) - response = next(decoder, None) + print(f" [OK] Part {art_num} (Vlákno {threading.current_thread().name})") + except Exception as e: + print(f" [!] Chyba u článku {art_num}: {e}") - if response and response.data: - # 5. KONTROLA INTEGRITY (Vlastní výpočet CRC32) - # binascii.crc32 vrací integer, :08x ho převede na hexadecimální formát - vypoctene_crc_int = binascii.crc32(response.data) - vypoctene_crc_str = f"{vypoctene_crc_int:08x}".lower() + server.quit() + except Exception as e: + print(f" [!!!] Vlákno se nemohlo připojit: {e}") - real_size = len(response.data) - print("-" * 40) - print(f"📊 Kontrola integrity:") - print(f" Skutečná velikost: {real_size} (Očekáváno: {expected_size})") - print(f" Vypočítané CRC: {vypoctene_crc_str}") - print(f" Očekávané CRC: {expected_crc_str}") +def final_precision_downloader(): + conn = get_conn() + cur = conn.cursor() + cur.execute(""" + SELECT article_number FROM articles + WHERE newsgroup = %s AND metadata->>'subject' LIKE %s + ORDER BY article_number; + """, (NEWSGROUP, SUBJECT_FILTER)) + articles = cur.fetchall() + cur.close() + conn.close() - if vypoctene_crc_str == expected_crc_str: - print("✅ BINGO! Soubor je 100% v pořádku.") - else: - print("⚠️ POZOR: CRC nesouhlasí, data mohou být poškozena.") + if not articles: return - # 6. ULOŽENÍ - if not os.path.exists(OUTPUT_DIR): - os.makedirs(OUTPUT_DIR) + # Příprava souboru (alokace místa) + # ... (zde ponechte váš kód pro zjištění total_size a truncate) ... + # Pro ukázku zkráceno: + total_size = len(articles) * PART_FIXED_SIZE # Přibližné, raději použijte váš výpočet z minula + with open(FINAL_PATH, "wb") as f: + f.truncate(total_size) - # Jméno z yEnc hlavičky: PC Pro 2011-07.pdf - out_name = response.file_name or "decoded_part.bin" - out_path = os.path.join(OUTPUT_DIR, out_name) + # ROZDĚLENÍ PRÁCE: Rozdělíme seznam článků na 5 částí + all_tasks = [(i, art[0]) for i, art in enumerate(articles)] + chunk_size = (len(all_tasks) + MAX_WORKERS - 1) // MAX_WORKERS + subsets = [all_tasks[i:i + chunk_size] for i in range(0, len(all_tasks), chunk_size)] - with open(out_path, "wb") as f_out: - f_out.write(response.data) + print(f"🚀 Startuji stahování s {len(subsets)} trvalými spoji...") - print(f"💾 Uloženo do: {out_path}") - print("-" * 40) - else: - print("❌ Chyba: Dekodér nevrátil žádná data. Zkontrolujte, zda je soubor kompletní.") + with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor: + executor.map(download_chunk, subsets) + + print("🏁 Hotovo.") if __name__ == "__main__": - ultimate_bomba_decoder() \ No newline at end of file + final_precision_downloader() \ No newline at end of file