Merge remote-tracking branch 'origin/main'

# Conflicts:
#	90 test.py
This commit is contained in:
2025-12-29 23:28:05 +01:00
5 changed files with 333 additions and 132 deletions

BIN
.gitignore vendored

Binary file not shown.

View File

@@ -1,71 +0,0 @@
import nntplib
import os
from dotenv import load_dotenv
from db import get_conn
# ================== CONFIG ==================
GROUP = "alt.binaries.e-book.magazines"
SUBJECT_KEY = "PC Pro 2011-07.pdf"
# ============================================
load_dotenv()
EWEKA_USER = os.getenv("EWEKA_USER")
EWEKA_PASS = os.getenv("EWEKA_PASS")
print("🔌 Connecting to PostgreSQL...")
conn = get_conn()
cur = conn.cursor()
cur.execute("""
SELECT article_number
FROM articles
WHERE newsgroup = %s
AND metadata->>'subject' LIKE %s
ORDER BY article_number
""", (GROUP, f"%{SUBJECT_KEY}%"))
article_numbers = [row[0] for row in cur.fetchall()]
total = len(article_numbers)
print(f"📦 Found {total} parts in DB")
if total == 0:
print("❌ No articles found, aborting.")
exit(1)
print("🔌 Connecting to Eweka NNTP...")
with nntplib.NNTP_SSL(
"news.eweka.nl",
563,
EWEKA_USER,
EWEKA_PASS,
readermode=True,
) as nntp:
nntp.group(GROUP)
existing = []
missing = []
for idx, art in enumerate(article_numbers, start=1):
try:
nntp.stat(art)
existing.append(art)
print(f"✅ [{idx}/{total}] EXISTS article {art}")
except Exception:
missing.append(art)
print(f"❌ [{idx}/{total}] MISSING article {art}")
print("\n================ RESULT ================")
print(f"Total parts : {total}")
print(f"Existing : {len(existing)}")
print(f"Missing : {len(missing)}")
if existing:
print("\nExisting article_numbers:")
print(existing)
if missing:
print("\nMissing article_numbers (first 20):")
print(missing[:20])

97
27 test FastLane.py Normal file
View File

@@ -0,0 +1,97 @@
import os
import nntplib
import sabctools
import io
import binascii
from dotenv import load_dotenv
from db import get_conn
BASE_DIR = "FastLane"
OUTPUT_DIR = os.path.join(BASE_DIR, "output")
if not os.path.exists(OUTPUT_DIR): os.makedirs(OUTPUT_DIR)
load_dotenv()
EWEKA_USER = os.getenv("EWEKA_USER")
EWEKA_PASS = os.getenv("EWEKA_PASS")
EWEKA_HOST = "news.eweka.nl"
NEWSGROUP = 'alt.binaries.e-book.magazines'
SUBJECT_FILTER = '%PC Pro 2011-07.pdf%'
FINAL_PATH = os.path.join(OUTPUT_DIR, "Fast_Lane_Biker_Dec_2011.pdf")
def final_precision_downloader():
print("🚀 Startuji FINÁLNÍ OPRAVENÉ stahování (bez překryvu bajtů)...")
conn = get_conn()
cur = conn.cursor()
cur.execute("""
SELECT article_number FROM articles
WHERE newsgroup = %s AND metadata->>'subject' LIKE %s
ORDER BY article_number;
""", (NEWSGROUP, SUBJECT_FILTER))
articles = cur.fetchall()
if not articles: return
try:
server = nntplib.NNTP(EWEKA_HOST, user=EWEKA_USER, password=EWEKA_PASS)
server.group(NEWSGROUP)
except Exception as e:
print(f"💥 Chyba NNTP: {e}");
return
# Zjištění celkové velikosti
resp, info = server.body(str(articles[0][0]))
stuffed = b"\r\n".join([(b"." + l if l.startswith(b".") else l) for l in info.lines])
wrapped = b"222 0 <id>\r\n" + stuffed + b"\r\n.\r\n"
decoder = sabctools.Decoder(len(wrapped))
decoder.process(io.BytesIO(wrapped).readinto(decoder))
meta = next(decoder, None)
total_size = meta.file_size
print(f"📏 Alokuji soubor: {total_size} bajtů.")
with open(FINAL_PATH, "wb") as f:
f.truncate(total_size)
# Zápis na offsety
part_fixed_size = 384000
with open(FINAL_PATH, "r+b") as f_out:
for i, (art_num,) in enumerate(articles):
try:
resp, info = server.body(str(art_num))
# Re-stuffing teček
stuffed_lines = [(b"." + l if l.startswith(b".") else l) for l in info.lines]
raw_body = b"\r\n".join(stuffed_lines)
wrapped = b"222 0 <id>\r\n" + raw_body + b"\r\n.\r\n"
decoder = sabctools.Decoder(len(wrapped))
decoder.process(io.BytesIO(wrapped).readinto(decoder))
res = next(decoder, None)
if res and res.data:
# MATEMATICKÁ OPRAVA: Výpočet bez překryvu
# První part (i=0) -> Offset 0
# Druhý part (i=1) -> Offset 384000 (ne 383999!)
current_offset = i * part_fixed_size
f_out.seek(current_offset)
f_out.write(res.data)
v_crc = binascii.crc32(res.data)
e_crc = getattr(res, 'crc_expected', 0)
status = "" if (e_crc == 0 or v_crc == e_crc) else "❌ CRC FAIL"
print(f" [{status}] Part {art_num} -> Offset: {current_offset}, Len: {len(res.data)}")
except Exception as e:
print(f" ❌ Chyba u {art_num}: {e}")
print("-" * 50)
print(f"🏁 HOTOVO! Teď už to PDF musí být perfektní.")
server.quit()
cur.close()
conn.close()
if __name__ == "__main__":
final_precision_downloader()

View File

@@ -1,85 +1,103 @@
import os
import nntplib
import sabctools
import io
import os
import binascii
import re
import threading
from concurrent.futures import ThreadPoolExecutor
from dotenv import load_dotenv
from db import get_conn
# --- KONFIGURACE ---+
INPUT_FILE = r"u:\PycharmProjects\NewsGroups\downloads\raw\part_001.raw"
OUTPUT_DIR = r"u:\PycharmProjects\NewsGroups\downloads\decoded"
# --- Konfigurace zůstává ---
load_dotenv()
EWEKA_USER = os.getenv("EWEKA_USER")
EWEKA_PASS = os.getenv("EWEKA_PASS")
EWEKA_HOST = "news.eweka.nl"
NEWSGROUP = 'alt.binaries.e-book.magazines'
SUBJECT_FILTER = '%PC Pro 2011-07.pdf%'
FINAL_PATH = os.path.join("FastLane/output", "Fast_Lane_Biker_Dec_2011.pdf")
MAX_WORKERS = 50
PART_FIXED_SIZE = 384000
file_lock = threading.Lock()
def ultimate_bomba_decoder():
if not os.path.exists(INPUT_FILE):
print(f"❌ Soubor nenalezen: {INPUT_FILE}")
def download_chunk(articles_subset):
"""
Tato funkce běží v jednom vlákně a obsluhuje JEDNO TRVALÉ spojení.
articles_subset je seznam tuplic (index, art_num).
"""
if not articles_subset:
return
print(f"📖 Načítám soubor...")
with open(INPUT_FILE, "rb") as f:
raw_data = f.read()
try:
# PŘIHLÁŠENÍ JEN JEDNOU NA ZAČÁTKU
server = nntplib.NNTP(EWEKA_HOST, user=EWEKA_USER, password=EWEKA_PASS)
server.group(NEWSGROUP)
# 1. OPRAVA SYNTAX WARNING A VYTAŽENÍ METADAT
# Používáme [0-9] místo \d pro odstranění varování v Pythonu 3.13
yend_match = re.search(b"=yend size=([0-9]+).*pcrc32=([0-9a-fA-F]+)", raw_data)
expected_size = 0
expected_crc_str = ""
if yend_match:
expected_size = int(yend_match.group(1))
expected_crc_str = yend_match.group(2).decode().lower()
print(f"🎯 Metadata nalezena: Očekávaná velikost={expected_size}, Očekávané CRC={expected_crc_str}")
# Otevřeme soubor pro zápis (v r+b módu)
with open(FINAL_PATH, "r+b") as f_out:
for index, art_num in articles_subset:
try:
resp, info = server.body(str(art_num))
stuffed_lines = [(b"." + l if l.startswith(b".") else l) for l in info.lines]
raw_body = b"\r\n".join(stuffed_lines)
# 2. KLÍČOVÁ OPRAVA (Čištění dat)
# Odstraníme prázdné znaky na začátku/konci a sjednotíme konce řádků na \r\n
processed_data = raw_data.strip()
# Tento trik zajistí, že i linuxové konce řádků budou pro yEnc správně \r\n
processed_data = processed_data.replace(b"\r\n", b"\n").replace(b"\n", b"\r\n")
wrapped = b"222 0 <id>\r\n" + raw_body + b"\r\n.\r\n"
decoder = sabctools.Decoder(len(wrapped))
decoder.process(io.BytesIO(wrapped).readinto(decoder))
res = next(decoder, None)
# 3. ZABALENÍ DO NNTP OBÁLKY
wrapped = b"222 0 <part1@id>\r\n" + processed_data + b"\r\n.\r\n"
if res and res.data:
current_offset = index * PART_FIXED_SIZE
# 4. DEKÓDOVÁNÍ (Sabctools 3.13 Streaming API)
decoder = sabctools.Decoder(len(wrapped))
buf = io.BytesIO(wrapped)
n = buf.readinto(decoder)
decoder.process(n)
# Zápis na specifický offset
with file_lock:
f_out.seek(current_offset)
f_out.write(res.data)
response = next(decoder, None)
print(f" [OK] Part {art_num} (Vlákno {threading.current_thread().name})")
except Exception as e:
print(f" [!] Chyba u článku {art_num}: {e}")
if response and response.data:
# 5. KONTROLA INTEGRITY (Vlastní výpočet CRC32)
# binascii.crc32 vrací integer, :08x ho převede na hexadecimální formát
vypoctene_crc_int = binascii.crc32(response.data)
vypoctene_crc_str = f"{vypoctene_crc_int:08x}".lower()
server.quit()
except Exception as e:
print(f" [!!!] Vlákno se nemohlo připojit: {e}")
real_size = len(response.data)
print("-" * 40)
print(f"📊 Kontrola integrity:")
print(f" Skutečná velikost: {real_size} (Očekáváno: {expected_size})")
print(f" Vypočítané CRC: {vypoctene_crc_str}")
print(f" Očekávané CRC: {expected_crc_str}")
def final_precision_downloader():
conn = get_conn()
cur = conn.cursor()
cur.execute("""
SELECT article_number FROM articles
WHERE newsgroup = %s AND metadata->>'subject' LIKE %s
ORDER BY article_number;
""", (NEWSGROUP, SUBJECT_FILTER))
articles = cur.fetchall()
cur.close()
conn.close()
if vypoctene_crc_str == expected_crc_str:
print("✅ BINGO! Soubor je 100% v pořádku.")
else:
print("⚠️ POZOR: CRC nesouhlasí, data mohou být poškozena.")
if not articles: return
# 6. ULOŽENÍ
if not os.path.exists(OUTPUT_DIR):
os.makedirs(OUTPUT_DIR)
# Příprava souboru (alokace místa)
# ... (zde ponechte váš kód pro zjištění total_size a truncate) ...
# Pro ukázku zkráceno:
total_size = len(articles) * PART_FIXED_SIZE # Přibližné, raději použijte váš výpočet z minula
with open(FINAL_PATH, "wb") as f:
f.truncate(total_size)
# Jméno z yEnc hlavičky: PC Pro 2011-07.pdf
out_name = response.file_name or "decoded_part.bin"
out_path = os.path.join(OUTPUT_DIR, out_name)
# ROZDĚLENÍ PRÁCE: Rozdělíme seznam článků na 5 částí
all_tasks = [(i, art[0]) for i, art in enumerate(articles)]
chunk_size = (len(all_tasks) + MAX_WORKERS - 1) // MAX_WORKERS
subsets = [all_tasks[i:i + chunk_size] for i in range(0, len(all_tasks), chunk_size)]
with open(out_path, "wb") as f_out:
f_out.write(response.data)
print(f"🚀 Startuji stahování s {len(subsets)} trvalými spoji...")
print(f"💾 Uloženo do: {out_path}")
print("-" * 40)
else:
print("❌ Chyba: Dekodér nevrátil žádná data. Zkontrolujte, zda je soubor kompletní.")
with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
executor.map(download_chunk, subsets)
print("🏁 Hotovo.")
if __name__ == "__main__":
ultimate_bomba_decoder()
final_precision_downloader()

View File

@@ -0,0 +1,157 @@
import nntplib
import os
import time
import socket
from dotenv import load_dotenv
from datetime import datetime, UTC
from db import get_conn
from psycopg.types.json import Json
# --- ZVÝŠENÍ LIMITŮ PRO STABILITU ---
nntplib._MAXLINE = 1048576 # 1MB limit pro dlouhé hlavičky
socket.setdefaulttimeout(30) # Prevence nekonečného čekání na síť
# ================= CONFIG =================
GROUP = "alt.binaries.e-book.magazines"
BATCH_SIZE = 5000 # Menší batch pro častější feedback v konzoli
# =========================================
load_dotenv()
EWEKA_USER = os.getenv("EWEKA_USER")
EWEKA_PASS = os.getenv("EWEKA_PASS")
def get_current_frontier(cur):
print(f"🔍 Hledám v databázi hranici spodního bloku pro skupinu {GROUP}...")
cur.execute("""
SELECT MAX(article_number)
FROM articles
WHERE newsgroup = %s AND article_number < 10000000
""", (GROUP,))
res = cur.fetchone()[0]
val = int(res) if res else None
print(f"📍 Nalezena hranice: {val if val else 'Žádná (začínám od nuly)'}")
return val
def is_batch_missing(cur, start, end):
# Tady print nedáváme, aby to nespamovalo 1000x za sekundu
cur.execute("""
SELECT EXISTS (
SELECT 1 FROM articles
WHERE newsgroup = %s AND article_number BETWEEN %s AND %s
)
""", (GROUP, start, end))
return not cur.fetchone()[0]
def fast_fill():
print(f"🚀 Startuji proces: {datetime.now().strftime('%H:%M:%S')}")
conn = get_conn()
conn.autocommit = True
cur = conn.cursor()
current_id = get_current_frontier(cur)
if not current_id:
print("⚠️ Varování: Spodní hranice nenalezena, zkusím se zeptat serveru na nejstarší článek.")
print(f"🔌 Připojuji se k Eweka NNTP SSL (news.eweka.nl)...")
try:
with nntplib.NNTP_SSL("news.eweka.nl", 563, user=EWEKA_USER, password=EWEKA_PASS) as nntp:
resp, count, first, last, name = nntp.group(GROUP)
first, last = int(first), int(last)
if not current_id:
current_id = first
print(f"📊 INFO O SKUPINĚ:")
print(f" Název: {name}")
print(f" Článků na serveru: {count}")
print(f" Rozsah: {first}{last}")
print(f" Aktuální pozice: {current_id}")
print(f" Zbývá zpracovat cca: {last - current_id} článků")
print("-" * 50)
total_downloaded = 0
start_process_time = time.time()
while current_id < last:
batch_start = current_id + 1
batch_end = min(batch_start + BATCH_SIZE - 1, last)
# 1. KROK: Kontrola existence
if not is_batch_missing(cur, batch_start, batch_end):
# Zkusíme skočit dál
cur.execute(
"SELECT MAX(article_number) FROM articles WHERE newsgroup = %s AND article_number BETWEEN %s AND %s",
(GROUP, batch_start, batch_end + 100000))
res = cur.fetchone()[0]
if res:
current_id = int(res)
print(f"⏩ [SKIP] Data existují. Přeskakuji na ID: {current_id}", end="\r")
else:
current_id = batch_end # Sychr
continue
# 2. KROK: Stahování (tady dáváme hodně printů)
print(f"\n💎 [DÍRA] Nalezena mezera: {batch_start} - {batch_end}")
# --- SÍŤOVÁ ČÁST ---
print(f" 📡 NNTP XOVER...", end=" ", flush=True)
t_net_start = time.time()
try:
resp, overviews = nntp.xover(batch_start, batch_end)
t_net_end = time.time()
print(f"OK ({len(overviews)} hlaviček za {t_net_end - t_net_start:.2f}s)")
rows = []
for art_num, fields in overviews:
rows.append((
GROUP, art_num, fields.get("message-id"),
Json({"group": GROUP, "article_number": art_num, **fields}),
datetime.now(UTC)
))
# --- DATABÁZOVÁ ČÁST ---
if rows:
print(f" 💾 POSTGRES INSERT...", end=" ", flush=True)
t_db_start = time.time()
cur.executemany(
"""
INSERT INTO articles (newsgroup, article_number, message_id, metadata, fetched_at)
VALUES (%s, %s, %s, %s, %s) ON CONFLICT DO NOTHING
""", rows
)
t_db_end = time.time()
total_downloaded += len(rows)
print(f"OK ({t_db_end - t_db_start:.2f}s)")
else:
print(f" Žádná data k uložení (prázdný rozsah na serveru)")
current_id = batch_end
except Exception as e:
print(f"\n❌ CHYBA v bloku {batch_start}-{batch_end}: {e}")
print(" Zkouším popojít o jeden BATCH dál...")
current_id += BATCH_SIZE
time.sleep(2) # Krátká pauza při chybě
except Exception as e:
print(f"\n💥 KRITICKÁ CHYBA PŘIPOJENÍ: {e}")
finally:
total_time = time.time() - start_process_time
print("\n" + "=" * 50)
print(f"🏁 KONEC RELACE")
print(f"⏱️ Celkový čas: {total_time:.1f} sekund")
print(f"📦 Celkem staženo nových hlaviček: {total_downloaded}")
if total_time > 0:
print(f"🚀 Průměrná rychlost: {total_downloaded / total_time:.1f} hlaviček/s")
print("=" * 50)
cur.close()
conn.close()
if __name__ == "__main__":
fast_fill()