z230
This commit is contained in:
BIN
.gitignore
vendored
BIN
.gitignore
vendored
Binary file not shown.
@@ -1,71 +1,157 @@
|
||||
import nntplib
|
||||
import os
|
||||
import time
|
||||
import socket
|
||||
from dotenv import load_dotenv
|
||||
from datetime import datetime, UTC
|
||||
from db import get_conn
|
||||
from psycopg.types.json import Json
|
||||
|
||||
# ================== CONFIG ==================
|
||||
# --- ZVÝŠENÍ LIMITŮ PRO STABILITU ---
|
||||
nntplib._MAXLINE = 1048576 # 1MB limit pro dlouhé hlavičky
|
||||
socket.setdefaulttimeout(30) # Prevence nekonečného čekání na síť
|
||||
|
||||
# ================= CONFIG =================
|
||||
GROUP = "alt.binaries.e-book.magazines"
|
||||
SUBJECT_KEY = "PC Pro 2011-07.pdf"
|
||||
# ============================================
|
||||
BATCH_SIZE = 5000 # Menší batch pro častější feedback v konzoli
|
||||
# =========================================
|
||||
|
||||
load_dotenv()
|
||||
|
||||
EWEKA_USER = os.getenv("EWEKA_USER")
|
||||
EWEKA_PASS = os.getenv("EWEKA_PASS")
|
||||
|
||||
print("🔌 Connecting to PostgreSQL...")
|
||||
|
||||
def get_current_frontier(cur):
|
||||
print(f"🔍 Hledám v databázi hranici spodního bloku pro skupinu {GROUP}...")
|
||||
cur.execute("""
|
||||
SELECT MAX(article_number)
|
||||
FROM articles
|
||||
WHERE newsgroup = %s AND article_number < 10000000
|
||||
""", (GROUP,))
|
||||
res = cur.fetchone()[0]
|
||||
val = int(res) if res else None
|
||||
print(f"📍 Nalezena hranice: {val if val else 'Žádná (začínám od nuly)'}")
|
||||
return val
|
||||
|
||||
|
||||
def is_batch_missing(cur, start, end):
|
||||
# Tady print nedáváme, aby to nespamovalo 1000x za sekundu
|
||||
cur.execute("""
|
||||
SELECT EXISTS (
|
||||
SELECT 1 FROM articles
|
||||
WHERE newsgroup = %s AND article_number BETWEEN %s AND %s
|
||||
)
|
||||
""", (GROUP, start, end))
|
||||
return not cur.fetchone()[0]
|
||||
|
||||
|
||||
def fast_fill():
|
||||
print(f"🚀 Startuji proces: {datetime.now().strftime('%H:%M:%S')}")
|
||||
|
||||
conn = get_conn()
|
||||
conn.autocommit = True
|
||||
cur = conn.cursor()
|
||||
|
||||
cur.execute("""
|
||||
SELECT article_number
|
||||
FROM articles
|
||||
WHERE newsgroup = %s
|
||||
AND metadata->>'subject' LIKE %s
|
||||
ORDER BY article_number
|
||||
""", (GROUP, f"%{SUBJECT_KEY}%"))
|
||||
current_id = get_current_frontier(cur)
|
||||
if not current_id:
|
||||
print("⚠️ Varování: Spodní hranice nenalezena, zkusím se zeptat serveru na nejstarší článek.")
|
||||
|
||||
article_numbers = [row[0] for row in cur.fetchall()]
|
||||
total = len(article_numbers)
|
||||
|
||||
print(f"📦 Found {total} parts in DB")
|
||||
|
||||
if total == 0:
|
||||
print("❌ No articles found, aborting.")
|
||||
exit(1)
|
||||
|
||||
print("🔌 Connecting to Eweka NNTP...")
|
||||
with nntplib.NNTP_SSL(
|
||||
"news.eweka.nl",
|
||||
563,
|
||||
EWEKA_USER,
|
||||
EWEKA_PASS,
|
||||
readermode=True,
|
||||
) as nntp:
|
||||
|
||||
nntp.group(GROUP)
|
||||
|
||||
existing = []
|
||||
missing = []
|
||||
|
||||
for idx, art in enumerate(article_numbers, start=1):
|
||||
print(f"🔌 Připojuji se k Eweka NNTP SSL (news.eweka.nl)...")
|
||||
try:
|
||||
nntp.stat(art)
|
||||
existing.append(art)
|
||||
print(f"✅ [{idx}/{total}] EXISTS article {art}")
|
||||
except Exception:
|
||||
missing.append(art)
|
||||
print(f"❌ [{idx}/{total}] MISSING article {art}")
|
||||
with nntplib.NNTP_SSL("news.eweka.nl", 563, user=EWEKA_USER, password=EWEKA_PASS) as nntp:
|
||||
resp, count, first, last, name = nntp.group(GROUP)
|
||||
first, last = int(first), int(last)
|
||||
|
||||
print("\n================ RESULT ================")
|
||||
print(f"Total parts : {total}")
|
||||
print(f"Existing : {len(existing)}")
|
||||
print(f"Missing : {len(missing)}")
|
||||
if not current_id:
|
||||
current_id = first
|
||||
|
||||
if existing:
|
||||
print("\nExisting article_numbers:")
|
||||
print(existing)
|
||||
print(f"📊 INFO O SKUPINĚ:")
|
||||
print(f" Název: {name}")
|
||||
print(f" Článků na serveru: {count}")
|
||||
print(f" Rozsah: {first} až {last}")
|
||||
print(f" Aktuální pozice: {current_id}")
|
||||
print(f" Zbývá zpracovat cca: {last - current_id} článků")
|
||||
print("-" * 50)
|
||||
|
||||
if missing:
|
||||
print("\nMissing article_numbers (first 20):")
|
||||
print(missing[:20])
|
||||
total_downloaded = 0
|
||||
start_process_time = time.time()
|
||||
|
||||
while current_id < last:
|
||||
batch_start = current_id + 1
|
||||
batch_end = min(batch_start + BATCH_SIZE - 1, last)
|
||||
|
||||
# 1. KROK: Kontrola existence
|
||||
if not is_batch_missing(cur, batch_start, batch_end):
|
||||
# Zkusíme skočit dál
|
||||
cur.execute(
|
||||
"SELECT MAX(article_number) FROM articles WHERE newsgroup = %s AND article_number BETWEEN %s AND %s",
|
||||
(GROUP, batch_start, batch_end + 100000))
|
||||
|
||||
res = cur.fetchone()[0]
|
||||
if res:
|
||||
current_id = int(res)
|
||||
print(f"⏩ [SKIP] Data existují. Přeskakuji na ID: {current_id}", end="\r")
|
||||
else:
|
||||
current_id = batch_end # Sychr
|
||||
continue
|
||||
|
||||
# 2. KROK: Stahování (tady dáváme hodně printů)
|
||||
print(f"\n💎 [DÍRA] Nalezena mezera: {batch_start} - {batch_end}")
|
||||
|
||||
# --- SÍŤOVÁ ČÁST ---
|
||||
print(f" 📡 NNTP XOVER...", end=" ", flush=True)
|
||||
t_net_start = time.time()
|
||||
try:
|
||||
resp, overviews = nntp.xover(batch_start, batch_end)
|
||||
t_net_end = time.time()
|
||||
print(f"OK ({len(overviews)} hlaviček za {t_net_end - t_net_start:.2f}s)")
|
||||
|
||||
rows = []
|
||||
for art_num, fields in overviews:
|
||||
rows.append((
|
||||
GROUP, art_num, fields.get("message-id"),
|
||||
Json({"group": GROUP, "article_number": art_num, **fields}),
|
||||
datetime.now(UTC)
|
||||
))
|
||||
|
||||
# --- DATABÁZOVÁ ČÁST ---
|
||||
if rows:
|
||||
print(f" 💾 POSTGRES INSERT...", end=" ", flush=True)
|
||||
t_db_start = time.time()
|
||||
cur.executemany(
|
||||
"""
|
||||
INSERT INTO articles (newsgroup, article_number, message_id, metadata, fetched_at)
|
||||
VALUES (%s, %s, %s, %s, %s) ON CONFLICT DO NOTHING
|
||||
""", rows
|
||||
)
|
||||
t_db_end = time.time()
|
||||
total_downloaded += len(rows)
|
||||
print(f"OK ({t_db_end - t_db_start:.2f}s)")
|
||||
else:
|
||||
print(f" ℹ️ Žádná data k uložení (prázdný rozsah na serveru)")
|
||||
|
||||
current_id = batch_end
|
||||
|
||||
except Exception as e:
|
||||
print(f"\n❌ CHYBA v bloku {batch_start}-{batch_end}: {e}")
|
||||
print(" Zkouším popojít o jeden BATCH dál...")
|
||||
current_id += BATCH_SIZE
|
||||
time.sleep(2) # Krátká pauza při chybě
|
||||
|
||||
except Exception as e:
|
||||
print(f"\n💥 KRITICKÁ CHYBA PŘIPOJENÍ: {e}")
|
||||
finally:
|
||||
total_time = time.time() - start_process_time
|
||||
print("\n" + "=" * 50)
|
||||
print(f"🏁 KONEC RELACE")
|
||||
print(f"⏱️ Celkový čas: {total_time:.1f} sekund")
|
||||
print(f"📦 Celkem staženo nových hlaviček: {total_downloaded}")
|
||||
if total_time > 0:
|
||||
print(f"🚀 Průměrná rychlost: {total_downloaded / total_time:.1f} hlaviček/s")
|
||||
print("=" * 50)
|
||||
cur.close()
|
||||
conn.close()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
fast_fill()
|
||||
97
27 test FastLane.py
Normal file
97
27 test FastLane.py
Normal file
@@ -0,0 +1,97 @@
|
||||
import os
|
||||
import nntplib
|
||||
import sabctools
|
||||
import io
|
||||
import binascii
|
||||
from dotenv import load_dotenv
|
||||
from db import get_conn
|
||||
|
||||
BASE_DIR = "FastLane"
|
||||
OUTPUT_DIR = os.path.join(BASE_DIR, "output")
|
||||
if not os.path.exists(OUTPUT_DIR): os.makedirs(OUTPUT_DIR)
|
||||
|
||||
load_dotenv()
|
||||
EWEKA_USER = os.getenv("EWEKA_USER")
|
||||
EWEKA_PASS = os.getenv("EWEKA_PASS")
|
||||
EWEKA_HOST = "news.eweka.nl"
|
||||
|
||||
NEWSGROUP = 'alt.binaries.e-book.magazines'
|
||||
SUBJECT_FILTER = '%PC Pro 2011-07.pdf%'
|
||||
FINAL_PATH = os.path.join(OUTPUT_DIR, "Fast_Lane_Biker_Dec_2011.pdf")
|
||||
|
||||
|
||||
def final_precision_downloader():
|
||||
print("🚀 Startuji FINÁLNÍ OPRAVENÉ stahování (bez překryvu bajtů)...")
|
||||
|
||||
conn = get_conn()
|
||||
cur = conn.cursor()
|
||||
cur.execute("""
|
||||
SELECT article_number FROM articles
|
||||
WHERE newsgroup = %s AND metadata->>'subject' LIKE %s
|
||||
ORDER BY article_number;
|
||||
""", (NEWSGROUP, SUBJECT_FILTER))
|
||||
articles = cur.fetchall()
|
||||
|
||||
if not articles: return
|
||||
|
||||
try:
|
||||
server = nntplib.NNTP(EWEKA_HOST, user=EWEKA_USER, password=EWEKA_PASS)
|
||||
server.group(NEWSGROUP)
|
||||
except Exception as e:
|
||||
print(f"💥 Chyba NNTP: {e}");
|
||||
return
|
||||
|
||||
# Zjištění celkové velikosti
|
||||
resp, info = server.body(str(articles[0][0]))
|
||||
stuffed = b"\r\n".join([(b"." + l if l.startswith(b".") else l) for l in info.lines])
|
||||
wrapped = b"222 0 <id>\r\n" + stuffed + b"\r\n.\r\n"
|
||||
decoder = sabctools.Decoder(len(wrapped))
|
||||
decoder.process(io.BytesIO(wrapped).readinto(decoder))
|
||||
meta = next(decoder, None)
|
||||
|
||||
total_size = meta.file_size
|
||||
print(f"📏 Alokuji soubor: {total_size} bajtů.")
|
||||
with open(FINAL_PATH, "wb") as f:
|
||||
f.truncate(total_size)
|
||||
|
||||
# Zápis na offsety
|
||||
part_fixed_size = 384000
|
||||
with open(FINAL_PATH, "r+b") as f_out:
|
||||
for i, (art_num,) in enumerate(articles):
|
||||
try:
|
||||
resp, info = server.body(str(art_num))
|
||||
# Re-stuffing teček
|
||||
stuffed_lines = [(b"." + l if l.startswith(b".") else l) for l in info.lines]
|
||||
raw_body = b"\r\n".join(stuffed_lines)
|
||||
|
||||
wrapped = b"222 0 <id>\r\n" + raw_body + b"\r\n.\r\n"
|
||||
decoder = sabctools.Decoder(len(wrapped))
|
||||
decoder.process(io.BytesIO(wrapped).readinto(decoder))
|
||||
res = next(decoder, None)
|
||||
|
||||
if res and res.data:
|
||||
# MATEMATICKÁ OPRAVA: Výpočet bez překryvu
|
||||
# První part (i=0) -> Offset 0
|
||||
# Druhý part (i=1) -> Offset 384000 (ne 383999!)
|
||||
current_offset = i * part_fixed_size
|
||||
|
||||
f_out.seek(current_offset)
|
||||
f_out.write(res.data)
|
||||
|
||||
v_crc = binascii.crc32(res.data)
|
||||
e_crc = getattr(res, 'crc_expected', 0)
|
||||
status = "✅" if (e_crc == 0 or v_crc == e_crc) else "❌ CRC FAIL"
|
||||
|
||||
print(f" [{status}] Part {art_num} -> Offset: {current_offset}, Len: {len(res.data)}")
|
||||
except Exception as e:
|
||||
print(f" ❌ Chyba u {art_num}: {e}")
|
||||
|
||||
print("-" * 50)
|
||||
print(f"🏁 HOTOVO! Teď už to PDF musí být perfektní.")
|
||||
server.quit()
|
||||
cur.close()
|
||||
conn.close()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
final_precision_downloader()
|
||||
142
90 test.py
142
90 test.py
@@ -1,85 +1,103 @@
|
||||
import os
|
||||
import nntplib
|
||||
import sabctools
|
||||
import io
|
||||
import os
|
||||
import binascii
|
||||
import re
|
||||
import threading
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
from dotenv import load_dotenv
|
||||
from db import get_conn
|
||||
|
||||
# --- KONFIGURACE ---
|
||||
INPUT_FILE = r"u:\PycharmProjects\NewsGroups\downloads\raw\part_001.raw"
|
||||
OUTPUT_DIR = r"u:\PycharmProjects\NewsGroups\downloads\decoded"
|
||||
# --- Konfigurace zůstává ---
|
||||
load_dotenv()
|
||||
EWEKA_USER = os.getenv("EWEKA_USER")
|
||||
EWEKA_PASS = os.getenv("EWEKA_PASS")
|
||||
EWEKA_HOST = "news.eweka.nl"
|
||||
NEWSGROUP = 'alt.binaries.e-book.magazines'
|
||||
SUBJECT_FILTER = '%PC Pro 2011-07.pdf%'
|
||||
FINAL_PATH = os.path.join("FastLane/output", "Fast_Lane_Biker_Dec_2011.pdf")
|
||||
|
||||
MAX_WORKERS = 50
|
||||
PART_FIXED_SIZE = 384000
|
||||
file_lock = threading.Lock()
|
||||
|
||||
|
||||
def ultimate_bomba_decoder():
|
||||
if not os.path.exists(INPUT_FILE):
|
||||
print(f"❌ Soubor nenalezen: {INPUT_FILE}")
|
||||
def download_chunk(articles_subset):
|
||||
"""
|
||||
Tato funkce běží v jednom vlákně a obsluhuje JEDNO TRVALÉ spojení.
|
||||
articles_subset je seznam tuplic (index, art_num).
|
||||
"""
|
||||
if not articles_subset:
|
||||
return
|
||||
|
||||
print(f"📖 Načítám soubor...")
|
||||
with open(INPUT_FILE, "rb") as f:
|
||||
raw_data = f.read()
|
||||
try:
|
||||
# PŘIHLÁŠENÍ JEN JEDNOU NA ZAČÁTKU
|
||||
server = nntplib.NNTP(EWEKA_HOST, user=EWEKA_USER, password=EWEKA_PASS)
|
||||
server.group(NEWSGROUP)
|
||||
|
||||
# 1. OPRAVA SYNTAX WARNING A VYTAŽENÍ METADAT
|
||||
# Používáme [0-9] místo \d pro odstranění varování v Pythonu 3.13
|
||||
yend_match = re.search(b"=yend size=([0-9]+).*pcrc32=([0-9a-fA-F]+)", raw_data)
|
||||
expected_size = 0
|
||||
expected_crc_str = ""
|
||||
if yend_match:
|
||||
expected_size = int(yend_match.group(1))
|
||||
expected_crc_str = yend_match.group(2).decode().lower()
|
||||
print(f"🎯 Metadata nalezena: Očekávaná velikost={expected_size}, Očekávané CRC={expected_crc_str}")
|
||||
# Otevřeme soubor pro zápis (v r+b módu)
|
||||
with open(FINAL_PATH, "r+b") as f_out:
|
||||
for index, art_num in articles_subset:
|
||||
try:
|
||||
resp, info = server.body(str(art_num))
|
||||
stuffed_lines = [(b"." + l if l.startswith(b".") else l) for l in info.lines]
|
||||
raw_body = b"\r\n".join(stuffed_lines)
|
||||
|
||||
# 2. KLÍČOVÁ OPRAVA (Čištění dat)
|
||||
# Odstraníme prázdné znaky na začátku/konci a sjednotíme konce řádků na \r\n
|
||||
processed_data = raw_data.strip()
|
||||
# Tento trik zajistí, že i linuxové konce řádků budou pro yEnc správně \r\n
|
||||
processed_data = processed_data.replace(b"\r\n", b"\n").replace(b"\n", b"\r\n")
|
||||
|
||||
# 3. ZABALENÍ DO NNTP OBÁLKY
|
||||
wrapped = b"222 0 <part1@id>\r\n" + processed_data + b"\r\n.\r\n"
|
||||
|
||||
# 4. DEKÓDOVÁNÍ (Sabctools 3.13 Streaming API)
|
||||
wrapped = b"222 0 <id>\r\n" + raw_body + b"\r\n.\r\n"
|
||||
decoder = sabctools.Decoder(len(wrapped))
|
||||
buf = io.BytesIO(wrapped)
|
||||
n = buf.readinto(decoder)
|
||||
decoder.process(n)
|
||||
decoder.process(io.BytesIO(wrapped).readinto(decoder))
|
||||
res = next(decoder, None)
|
||||
|
||||
response = next(decoder, None)
|
||||
if res and res.data:
|
||||
current_offset = index * PART_FIXED_SIZE
|
||||
|
||||
if response and response.data:
|
||||
# 5. KONTROLA INTEGRITY (Vlastní výpočet CRC32)
|
||||
# binascii.crc32 vrací integer, :08x ho převede na hexadecimální formát
|
||||
vypoctene_crc_int = binascii.crc32(response.data)
|
||||
vypoctene_crc_str = f"{vypoctene_crc_int:08x}".lower()
|
||||
# Zápis na specifický offset
|
||||
with file_lock:
|
||||
f_out.seek(current_offset)
|
||||
f_out.write(res.data)
|
||||
|
||||
real_size = len(response.data)
|
||||
print(f" [OK] Part {art_num} (Vlákno {threading.current_thread().name})")
|
||||
except Exception as e:
|
||||
print(f" [!] Chyba u článku {art_num}: {e}")
|
||||
|
||||
print("-" * 40)
|
||||
print(f"📊 Kontrola integrity:")
|
||||
print(f" Skutečná velikost: {real_size} (Očekáváno: {expected_size})")
|
||||
print(f" Vypočítané CRC: {vypoctene_crc_str}")
|
||||
print(f" Očekávané CRC: {expected_crc_str}")
|
||||
server.quit()
|
||||
except Exception as e:
|
||||
print(f" [!!!] Vlákno se nemohlo připojit: {e}")
|
||||
|
||||
if vypoctene_crc_str == expected_crc_str:
|
||||
print("✅ BINGO! Soubor je 100% v pořádku.")
|
||||
else:
|
||||
print("⚠️ POZOR: CRC nesouhlasí, data mohou být poškozena.")
|
||||
|
||||
# 6. ULOŽENÍ
|
||||
if not os.path.exists(OUTPUT_DIR):
|
||||
os.makedirs(OUTPUT_DIR)
|
||||
def final_precision_downloader():
|
||||
conn = get_conn()
|
||||
cur = conn.cursor()
|
||||
cur.execute("""
|
||||
SELECT article_number FROM articles
|
||||
WHERE newsgroup = %s AND metadata->>'subject' LIKE %s
|
||||
ORDER BY article_number;
|
||||
""", (NEWSGROUP, SUBJECT_FILTER))
|
||||
articles = cur.fetchall()
|
||||
cur.close()
|
||||
conn.close()
|
||||
|
||||
# Jméno z yEnc hlavičky: PC Pro 2011-07.pdf
|
||||
out_name = response.file_name or "decoded_part.bin"
|
||||
out_path = os.path.join(OUTPUT_DIR, out_name)
|
||||
if not articles: return
|
||||
|
||||
with open(out_path, "wb") as f_out:
|
||||
f_out.write(response.data)
|
||||
# Příprava souboru (alokace místa)
|
||||
# ... (zde ponechte váš kód pro zjištění total_size a truncate) ...
|
||||
# Pro ukázku zkráceno:
|
||||
total_size = len(articles) * PART_FIXED_SIZE # Přibližné, raději použijte váš výpočet z minula
|
||||
with open(FINAL_PATH, "wb") as f:
|
||||
f.truncate(total_size)
|
||||
|
||||
print(f"💾 Uloženo do: {out_path}")
|
||||
print("-" * 40)
|
||||
else:
|
||||
print("❌ Chyba: Dekodér nevrátil žádná data. Zkontrolujte, zda je soubor kompletní.")
|
||||
# ROZDĚLENÍ PRÁCE: Rozdělíme seznam článků na 5 částí
|
||||
all_tasks = [(i, art[0]) for i, art in enumerate(articles)]
|
||||
chunk_size = (len(all_tasks) + MAX_WORKERS - 1) // MAX_WORKERS
|
||||
subsets = [all_tasks[i:i + chunk_size] for i in range(0, len(all_tasks), chunk_size)]
|
||||
|
||||
print(f"🚀 Startuji stahování s {len(subsets)} trvalými spoji...")
|
||||
|
||||
with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
|
||||
executor.map(download_chunk, subsets)
|
||||
|
||||
print("🏁 Hotovo.")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
ultimate_bomba_decoder()
|
||||
final_precision_downloader()
|
||||
Reference in New Issue
Block a user