This commit is contained in:
2025-12-28 11:34:15 +01:00
parent f8dc6566bc
commit 64472e59ba
6 changed files with 488 additions and 143 deletions

View File

@@ -3,119 +3,11 @@ import re
import nntplib import nntplib
from dotenv import load_dotenv from dotenv import load_dotenv
from db import get_conn from db import get_conn
def yenc_decode_lines(lines: list[bytes], debug=False, *, dot_stuffed=False) -> bytes:
    """
    Decode the yEnc payload contained in NNTP BODY lines.

    Control lines (``=ybegin``, ``=ypart``, ``=yend``) are skipped; every
    other line is treated as yEnc data and decoded.

    Args:
        lines: Body lines without their line terminators.
        debug: When true, print a trace of what the decoder encountered.
        dot_stuffed: Set to true only when ``lines`` are raw wire lines that
            still carry NNTP dot-stuffing. Lines returned by ``nntplib`` have
            the stuffing removed already, so by default a leading ``.`` is
            kept: stripping it a second time would drop real data bytes.

    Returns:
        The decoded bytes of all data lines, concatenated.
    """
    escape = ord("=")
    out = bytearray()
    saw_ybegin = False
    data_lines = 0

    for idx, line in enumerate(lines):
        # --- NNTP dot-stuffing (only for raw, still-stuffed input) ---
        if dot_stuffed and line.startswith(b"."):
            if debug:
                if line.startswith(b".."):
                    print(f" [dot] line {idx}: '..' -> '.'")
                else:
                    print(f" [dot] line {idx}: '.' removed")
            line = line[1:]

        # --- yEnc control lines ---
        if line.startswith(b"=ybegin"):
            saw_ybegin = True
            if debug:
                print(f" [yEnc] =ybegin detected")
            continue
        if line.startswith(b"=ypart"):
            if debug:
                print(f" [yEnc] =ypart detected")
            continue
        if line.startswith(b"=yend"):
            if debug:
                print(f" [yEnc] =yend detected")
            continue

        # --- actual yEnc data ---
        data_lines += 1
        stream = iter(line)
        for c in stream:
            if c == escape:
                # Escaped byte: the following byte carries an extra +64.
                c = next(stream, None)
                if c is None:
                    # Dangling escape at the end of the line: nothing to emit.
                    break
                c = (c - 64) & 0xFF
            out.append((c - 42) & 0xFF)

    if debug:
        print(f" [yEnc] saw_ybegin={saw_ybegin}, decoded_data_lines={data_lines}")
        print(f" [yEnc] decoded_bytes={len(out)}")
        if not saw_ybegin:
            print("⚠️ WARNING: yEnc decoder used but =ybegin was NOT seen")
    return bytes(out)
# def yenc_decode_lines(lines: list[bytes]) -> bytes:
# """
# Decode yEnc from NNTP BODY lines.
# Handles NNTP dot-stuffing correctly.
# """
# out = bytearray()
#
# for line in lines:
# # --- undo NNTP dot-stuffing ---
# if line.startswith(b".."):
# line = line[1:]
# elif line.startswith(b"."):
# line = line[1:]
#
# # --- skip yEnc control lines ---
# if line.startswith(b"=ybegin"):
# continue
# if line.startswith(b"=ypart"):
# continue
# if line.startswith(b"=yend"):
# continue
#
# i = 0
# length = len(line)
#
# while i < length:
# c = line[i]
#
# if c == ord('='): # yEnc escape
# i += 1
# if i >= length:
# break
# c = (line[i] - 64) & 0xFF
#
# out.append((c - 42) & 0xFF)
# i += 1
#
# return bytes(out)
# ================== CONFIG ================== # ================== CONFIG ==================
GROUP = "alt.binaries.e-book.magazines" GROUP = "alt.binaries.e-book.magazines"
SUBJECT_KEY = "PC Pro 2011-07.pdf" SUBJECT_KEY = "PC Pro 2011-07.pdf"
OUT_DIR = r"downloads/PC_Pro_2011-07" OUT_DIR = r"downloads/PC_Pro_2011-07"
FINAL_PDF = r"downloads/PC_Pro_2011-07.pdf"
# ============================================ # ============================================
load_dotenv() load_dotenv()
@@ -124,11 +16,47 @@ EWEKA_PASS = os.getenv("EWEKA_PASS")
os.makedirs(OUT_DIR, exist_ok=True) os.makedirs(OUT_DIR, exist_ok=True)
print("🔌 Connecting to PostgreSQL...")
def yenc_decode_and_extract_headers(
    lines: list[bytes], *, dot_stuffed: bool = False
) -> tuple[bytes, list[str]]:
    """
    Decode yEnc BODY lines and extract yEnc headers.

    Args:
        lines: Body lines without their line terminators.
        dot_stuffed: Set to true only when ``lines`` are raw wire lines that
            still carry NNTP dot-stuffing. Lines returned by ``nntplib`` have
            the stuffing removed already, so by default a leading ``.`` is
            kept: stripping it a second time would drop real data bytes.

    Returns: (decoded_bytes, yenc_header_lines)
        ``yenc_header_lines`` holds every ``=ybegin`` / ``=ypart`` / ``=yend``
        line, latin-1 decoded, in the order encountered.
    """
    control_prefixes = (b"=ybegin", b"=ypart", b"=yend")
    escape = ord("=")
    out = bytearray()
    yenc_headers = []

    for line in lines:
        # --- undo NNTP dot-stuffing (only for raw, still-stuffed input) ---
        if dot_stuffed and line.startswith(b"."):
            line = line[1:]

        # --- capture yEnc control lines ---
        if line.startswith(control_prefixes):
            yenc_headers.append(line.decode("latin-1"))
            continue

        # --- yEnc decode ---
        stream = iter(line)
        for c in stream:
            if c == escape:
                # Escaped byte: the following byte carries an extra +64.
                c = next(stream, None)
                if c is None:
                    # Dangling escape at the end of the line: nothing to emit.
                    break
                c = (c - 64) & 0xFF
            out.append((c - 42) & 0xFF)

    return bytes(out), yenc_headers
# ------------------ DB ------------------
conn = get_conn() conn = get_conn()
cur = conn.cursor() cur = conn.cursor()
# --- load article numbers + subject ---
cur.execute(""" cur.execute("""
SELECT article_number, metadata->>'subject' SELECT article_number, metadata->>'subject'
FROM articles FROM articles
@@ -138,12 +66,11 @@ cur.execute("""
""", (GROUP, f"%{SUBJECT_KEY}%")) """, (GROUP, f"%{SUBJECT_KEY}%"))
rows = cur.fetchall() rows = cur.fetchall()
print(f"📦 Found {len(rows)} parts")
# --- parse part number from subject --- # parse part number from subject
part_re = re.compile(r"\((\d+)\s*/\s*\d+\)") part_re = re.compile(r"\((\d+)\s*/\s*\d+\)")
parts = [] parts = []
for art_num, subject in rows: for art_num, subject in rows:
m = part_re.search(subject or "") m = part_re.search(subject or "")
if not m: if not m:
@@ -151,10 +78,9 @@ for art_num, subject in rows:
part_no = int(m.group(1)) part_no = int(m.group(1))
parts.append((part_no, art_num)) parts.append((part_no, art_num))
# sort by part number (1..N)
parts.sort(key=lambda x: x[0]) parts.sort(key=lambda x: x[0])
print("🔌 Connecting to Eweka NNTP...") # ------------------ NNTP ------------------
with nntplib.NNTP_SSL( with nntplib.NNTP_SSL(
"news.eweka.nl", "news.eweka.nl",
563, 563,
@@ -165,38 +91,19 @@ with nntplib.NNTP_SSL(
nntp.group(GROUP) nntp.group(GROUP)
for idx, (part_no, art_num) in enumerate(parts, start=1): for part_no, art_num in parts:
out_path = os.path.join(OUT_DIR, f"part_{part_no:03d}.bin") bin_path = os.path.join(OUT_DIR, f"part_{part_no:03d}.bin")
hdr_path = os.path.join(OUT_DIR, f"part_{part_no:03d}.yEncHeader")
if os.path.exists(out_path): if os.path.exists(bin_path) and os.path.exists(hdr_path):
print(f"⏭️ [{idx}/{len(parts)}] part {part_no} already exists, skipping")
continue continue
print(f"⬇️ [{idx}/{len(parts)}] Downloading part {part_no} (article {art_num})") _, info = nntp.body(art_num)
resp, info = nntp.body(art_num) decoded, headers = yenc_decode_and_extract_headers(info.lines)
print(f" BODY lines received: {len(info.lines)}") with open(bin_path, "wb") as f:
# rychlá kontrola prvních řádků
for ln in info.lines[:3]:
print(f" RAW:", ln[:80])
decoded = yenc_decode_lines(info.lines, debug=True)
print(f" RESULT bytes: {len(decoded)}")
with open(out_path, "wb") as f:
f.write(decoded) f.write(decoded)
with open(hdr_path, "w", encoding="utf-8") as f:
print("🧩 Assembling final PDF...") f.write("\n".join(headers))
with open(FINAL_PDF, "wb") as out:
for part_no, _ in parts:
part_path = os.path.join(OUT_DIR, f"part_{part_no:03d}.bin")
with open(part_path, "rb") as pf:
out.write(pf.read())
print("🎉 DONE")
print(f"📄 Final PDF: {FINAL_PDF}")

View File

@@ -0,0 +1,93 @@
import os
import re
# ================= CONFIG =================
PARTS_DIR = r"downloads/PC_Pro_2011-07"
OUTPUT_DIR = r"downloads"
# ==========================================

ybegin_re = re.compile(r"size=(\d+)")
name_re = re.compile(r"name=(.+)")
ypart_re = re.compile(r"begin=(\d+)\s+end=(\d+)")


def safe_output_name(raw_name):
    """
    Reduce a file name taken from a yEnc header to a bare file name.

    The header comes from an untrusted article, so directory components
    (with either separator) are dropped before the name is joined to
    OUTPUT_DIR.

    Raises:
        RuntimeError: if nothing usable is left after sanitising.
    """
    name = raw_name.strip().replace("\\", "/").rsplit("/", 1)[-1]
    if name in ("", ".", ".."):
        raise RuntimeError(f"Unsafe file name in yEnc header: {raw_name!r}")
    return name


def parse_part_header(lines, fname):
    """
    Extract (size, name, begin, end) from the yEnc control lines of one part.

    ``size`` is taken from the first ``=ybegin`` line that carries one,
    ``name`` from the last ``=ybegin`` line that carries one, and
    ``begin``/``end`` from the ``=ypart`` line. Missing values are None.
    ``fname`` is only used in error messages.

    Raises:
        RuntimeError: if a ``=ypart`` line has no parsable begin/end.
    """
    size = name = begin = end = None
    for line in lines:
        if line.startswith("=ybegin"):
            if size is None:
                m = ybegin_re.search(line)
                if m:
                    size = int(m.group(1))
            m = name_re.search(line)
            if m:
                name = m.group(1)
        if line.startswith("=ypart"):
            m = ypart_re.search(line)
            if not m:
                raise RuntimeError(f"Cannot parse ypart in {fname}")
            begin = int(m.group(1))
            end = int(m.group(2))
    return size, name, begin, end


def main():
    """Assemble the decoded parts in PARTS_DIR into one file in OUTPUT_DIR."""
    parts = []
    total_size = None
    final_name = None

    # --- 1. load all yEnc headers ---
    # sorted() only makes the processing order deterministic.
    for fname in sorted(os.listdir(PARTS_DIR)):
        if not fname.endswith(".yEncHeader"):
            continue
        part_id = fname[: -len(".yEncHeader")]
        hdr_path = os.path.join(PARTS_DIR, fname)
        bin_path = os.path.join(PARTS_DIR, f"{part_id}.bin")
        if not os.path.exists(bin_path):
            raise RuntimeError(f"Missing bin file for {fname}")
        with open(hdr_path, "r", encoding="utf-8", errors="replace") as f:
            lines = f.read().splitlines()

        size, name, begin, end = parse_part_header(lines, fname)
        if total_size is None and size is not None:
            total_size = size
        if name is not None:
            final_name = name
        if begin is None or end is None:
            raise RuntimeError(f"Missing begin/end in {fname}")
        parts.append({
            "bin": bin_path,
            "begin": begin,
            "end": end
        })

    # --- sanity checks ---
    if total_size is None or final_name is None:
        raise RuntimeError("Missing ybegin info (size/name)")

    output_path = os.path.join(OUTPUT_DIR, safe_output_name(final_name))
    print(f"📄 Final file: {output_path}")
    print(f"📦 Total size: {total_size} bytes")
    print(f"🧩 Parts: {len(parts)}")

    # --- 2. allocate the target file ---
    with open(output_path, "wb") as f:
        f.truncate(total_size)

    # --- 3. write each part at its offset ---
    with open(output_path, "r+b") as out:
        for p in parts:
            expected_len = p["end"] - p["begin"] + 1
            with open(p["bin"], "rb") as bf:
                data = bf.read()
            if len(data) != expected_len:
                print(
                    f"⚠️ Size mismatch in {os.path.basename(p['bin'])}: "
                    f"expected {expected_len}, got {len(data)}"
                )
            # A mismatch is only reported; the data is written regardless.
            out.seek(p["begin"] - 1)  # yEnc offsets are 1-based
            out.write(data)

    print("🎉 DONE file assembled correctly")


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,93 @@
import os
import re
import nntplib
from concurrent.futures import ThreadPoolExecutor, as_completed
from dotenv import load_dotenv
from db import get_conn
# ================= CONFIG =================
GROUP = "alt.binaries.e-book.magazines"
SUBJECT_KEY = "PC Pro 2011-07.pdf"
RAW_DIR = r"downloads/raw"
MAX_WORKERS = 5
# ==========================================

# Credentials are read from the environment after loading the .env file.
load_dotenv()
EWEKA_USER = os.getenv("EWEKA_USER")
EWEKA_PASS = os.getenv("EWEKA_PASS")

os.makedirs(RAW_DIR, exist_ok=True)

# ---------- DB: load parts ----------
conn = get_conn()
cur = conn.cursor()
subject_pattern = f"%{SUBJECT_KEY}%"
cur.execute("""
SELECT article_number, metadata->>'subject'
FROM articles
WHERE newsgroup = %s
AND metadata->>'subject' LIKE %s
ORDER BY article_number
""", (GROUP, subject_pattern))
rows = cur.fetchall()

# The part number is the N of an "(N/M)" marker in the subject; rows whose
# subject has no such marker are left out.
part_re = re.compile(r"\((\d+)\s*/\s*\d+\)")
parts = sorted(
    (
        (int(m.group(1)), art_num)
        for art_num, subject in rows
        if (m := part_re.search(subject or ""))
    ),
    key=lambda pair: pair[0],
)
print(f"📦 Parts to download: {len(parts)}")
# ---------- worker ----------
def download_part(part_no: int, art_num: int):
    """
    Download the BODY of one article and store it as a raw part file.

    Each call opens its own NNTP connection. The body lines are written to
    ``part_NNN.raw`` in RAW_DIR, each followed by ``\\n``. An existing part
    file is treated as already downloaded and is not fetched again.

    Args:
        part_no: Part number, used for the output file name.
        art_num: Article number passed to the NNTP BODY command.

    Returns:
        A short status message for the caller to print.
    """
    out_path = os.path.join(RAW_DIR, f"part_{part_no:03d}.raw")
    if os.path.exists(out_path):
        return f"⏭️ part {part_no} exists"

    with nntplib.NNTP_SSL(
        "news.eweka.nl",
        563,
        EWEKA_USER,
        EWEKA_PASS,
        readermode=True,
        timeout=120
    ) as nntp:
        nntp.group(GROUP)
        _, info = nntp.body(art_num)

        # Write to a temporary name and move it into place afterwards, so an
        # interrupted write never leaves a truncated part file that the
        # "exists" check above would skip on the next run.
        tmp_path = f"{out_path}.tmp"
        with open(tmp_path, "wb") as f:
            f.writelines(line + b"\n" for line in info.lines)
        os.replace(tmp_path, out_path)

    return f"⬇️ part {part_no} done"
# ---------- parallel execution ----------
# Fan the downloads out over a thread pool and report each result as soon as
# it finishes; a failed part is counted and reported, not fatal.
errors = 0
with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
    futures = [executor.submit(download_part, *part) for part in parts]
    for finished in as_completed(futures):
        try:
            print(finished.result())
        except Exception as e:
            errors += 1
            print(f"❌ ERROR: {e}")

print("🎉 DONE")
print(f"⚠️ Errors: {errors}")

74
25 stahni raw.py Normal file
View File

@@ -0,0 +1,74 @@
import os
import re
import nntplib
from dotenv import load_dotenv
from db import get_conn
# ================= CONFIG =================
GROUP = "alt.binaries.e-book.magazines"
SUBJECT_KEY = "PC Pro 2011-07.pdf"
RAW_DIR = r"downloads/raw"
# ==========================================

# Credentials are read from the environment after loading the .env file.
load_dotenv()
EWEKA_USER = os.getenv("EWEKA_USER")
EWEKA_PASS = os.getenv("EWEKA_PASS")

os.makedirs(RAW_DIR, exist_ok=True)

# --- DB: find the articles ---
conn = get_conn()
cur = conn.cursor()
cur.execute("""
SELECT article_number, metadata->>'subject'
FROM articles
WHERE newsgroup = %s
AND metadata->>'subject' LIKE %s
ORDER BY article_number
""", (GROUP, f"%{SUBJECT_KEY}%"))
rows = cur.fetchall()

# --- parse part number from subject ---
# The part number is the N of an "(N/M)" marker in the subject.
part_re = re.compile(r"\((\d+)\s*/\s*\d+\)")
parts = []
for art_num, subject in rows:
    m = part_re.search(subject or "")
    if not m:
        raise RuntimeError(f"Cannot parse part number from subject: {subject}")
    part_no = int(m.group(1))
    parts.append((part_no, art_num))
parts.sort(key=lambda x: x[0])
print(f"📦 Parts to download: {len(parts)}")

# --- NNTP download ---
with nntplib.NNTP_SSL(
    "news.eweka.nl",
    563,
    EWEKA_USER,
    EWEKA_PASS,
    readermode=True
) as nntp:
    nntp.group(GROUP)
    for part_no, art_num in parts:
        out_path = os.path.join(RAW_DIR, f"part_{part_no:03d}.raw")
        # An existing part file counts as already downloaded.
        if os.path.exists(out_path):
            continue
        print(f"⬇️ Downloading RAW part {part_no} (article {art_num})")
        _, info = nntp.body(art_num)

        # Write to a temporary name and move it into place afterwards, so an
        # interrupted run never leaves a truncated part file that the
        # "exists" check above would skip next time.
        tmp_path = f"{out_path}.tmp"
        with open(tmp_path, "wb") as f:
            f.writelines(line + b"\n" for line in info.lines)
        os.replace(tmp_path, out_path)

print("🎉 DONE RAW NNTP data stored")

93
26 Analyze one.py Normal file
View File

@@ -0,0 +1,93 @@
import binascii
RAW_FILE = r"u:\PycharmProjects\NewsGroups\downloads\raw\part_001.raw"
# --------------------------------------------------
# yEnc decoder (offline, minimal, correct)
# --------------------------------------------------
def yenc_decode(lines: list[bytes], *, dot_stuffed: bool = False) -> bytes:
    """
    Decode yEnc data lines into the original bytes.

    Every line is treated as data; control lines must be filtered out by the
    caller.

    Args:
        lines: yEnc data lines without their line terminators.
        dot_stuffed: Set to true only when ``lines`` are raw wire lines that
            still carry NNTP dot-stuffing. Lines obtained through ``nntplib``
            have the stuffing removed already, so by default a leading ``.``
            is kept: stripping it a second time would drop real data bytes.

    Returns:
        The decoded bytes of all lines, concatenated.
    """
    escape = ord("=")
    out = bytearray()
    for line in lines:
        # undo NNTP dot-stuffing (only for raw, still-stuffed input)
        if dot_stuffed and line.startswith(b"."):
            line = line[1:]

        stream = iter(line)
        for c in stream:
            if c == escape:
                # Escaped byte: the following byte carries an extra +64.
                c = next(stream, None)
                if c is None:
                    # Dangling escape at the end of the line: nothing to emit.
                    break
                c = (c - 64) & 0xFF
            out.append((c - 42) & 0xFF)
    return bytes(out)
# --------------------------------------------------
# 1. read RAW file
# --------------------------------------------------
with open(RAW_FILE, "rb") as f:
    raw_lines = f.read().splitlines()
print(f"📄 RAW lines: {len(raw_lines)}")

# --------------------------------------------------
# 2. split parts
# --------------------------------------------------
# Control lines are kept for inspection; everything else is yEnc data.
ybegin = None
ypart = None
yend = None
data_lines = []
for line in raw_lines:
    if line.startswith(b"=ybegin"):
        ybegin = line.decode(errors="replace")
        continue
    if line.startswith(b"=ypart"):
        ypart = line.decode(errors="replace")
        continue
    if line.startswith(b"=yend"):
        yend = line.decode(errors="replace")
        continue
    data_lines.append(line)
print("🧾 ybegin:", ybegin)
print("🧾 ypart :", ypart)
print("🧾 yend :", yend)
print(f"📦 DATA lines: {len(data_lines)}")

# --------------------------------------------------
# 3. decode yEnc DATA
# --------------------------------------------------
decoded = yenc_decode(data_lines)
print(f"📦 Decoded bytes: {len(decoded)}")

# --------------------------------------------------
# 4. extract pcrc32
# --------------------------------------------------
pcrc = None
# yend stays None when the file has no =yend line at all.
if yend is not None and "pcrc32=" in yend:
    # Take only the hex token itself: other fields may follow it on the
    # line, and the header may use upper case or omit leading zeros.
    fields = yend.split("pcrc32=", 1)[1].split()
    if fields:
        pcrc = fields[0].lower().zfill(8)
if not pcrc:
    raise RuntimeError("No pcrc32 found")

# --------------------------------------------------
# 5. compute CRC32
# --------------------------------------------------
crc_calc = f"{binascii.crc32(decoded) & 0xffffffff:08x}"
print(f"🔎 CRC expected: {pcrc}")
print(f"🔎 CRC computed: {crc_calc}")
if crc_calc == pcrc:
    print("✅ CRC MATCH — PART IS PERFECT")
else:
    print("❌ CRC MISMATCH — DATA IS BROKEN")

85
90 test.py Normal file
View File

@@ -0,0 +1,85 @@
import sabctools
import io
import os
import binascii
import re
# --- KONFIGURACE ---
INPUT_FILE = r"u:\PycharmProjects\NewsGroups\downloads\raw\part_001.raw"
OUTPUT_DIR = r"u:\PycharmProjects\NewsGroups\downloads\decoded"
def ultimate_bomba_decoder():
    """
    Decode one raw yEnc part with sabctools and verify it against its trailer.

    Reads INPUT_FILE, wraps its content in an NNTP-style response, feeds it
    to ``sabctools.Decoder``, compares the CRC32 of the decoded data with the
    ``pcrc32`` value of the ``=yend`` line, and writes the decoded data into
    OUTPUT_DIR. Progress and results are printed; nothing is returned.
    """
    if not os.path.exists(INPUT_FILE):
        print(f"❌ Soubor nenalezen: {INPUT_FILE}")
        return

    print(f"📖 Načítám soubor...")
    with open(INPUT_FILE, "rb") as f:
        raw_data = f.read()

    # 1. Extract the expected size and CRC from the =yend trailer.
    # The pattern has no backslash escapes; the raw literal keeps it that way
    # should one ever be added.
    yend_match = re.search(rb"=yend size=([0-9]+).*pcrc32=([0-9a-fA-F]+)", raw_data)
    expected_size = 0
    expected_crc_str = ""
    if yend_match:
        expected_size = int(yend_match.group(1))
        # Pad to 8 digits so a header that omits leading zeros still
        # compares equal to the computed value.
        expected_crc_str = yend_match.group(2).decode().lower().zfill(8)
        print(f"🎯 Metadata nalezena: Očekávaná velikost={expected_size}, Očekávané CRC={expected_crc_str}")

    # 2. Clean the data: drop surrounding whitespace and normalise every
    # line ending to \r\n (bare \n included).
    processed_data = raw_data.strip()
    processed_data = processed_data.replace(b"\r\n", b"\n").replace(b"\n", b"\r\n")

    # 3. Wrap the data in an NNTP response envelope.
    wrapped = b"222 0 <part1@id>\r\n" + processed_data + b"\r\n.\r\n"

    # 4. Decode (sabctools streaming decoder).
    decoder = sabctools.Decoder(len(wrapped))
    buf = io.BytesIO(wrapped)
    n = buf.readinto(decoder)
    decoder.process(n)
    response = next(decoder, None)

    if response and response.data:
        # 5. Integrity check with our own CRC32 of the decoded data.
        vypoctene_crc_int = binascii.crc32(response.data)
        vypoctene_crc_str = f"{vypoctene_crc_int:08x}".lower()
        real_size = len(response.data)
        print("-" * 40)
        print(f"📊 Kontrola integrity:")
        print(f" Skutečná velikost: {real_size} (Očekáváno: {expected_size})")
        print(f" Vypočítané CRC: {vypoctene_crc_str}")
        print(f" Očekávané CRC: {expected_crc_str}")
        if vypoctene_crc_str == expected_crc_str:
            print("✅ BINGO! Soubor je 100% v pořádku.")
        else:
            print("⚠️ POZOR: CRC nesouhlasí, data mohou být poškozena.")

        # 6. Save the decoded data.
        os.makedirs(OUTPUT_DIR, exist_ok=True)
        # The name comes from the yEnc header of an untrusted article, so
        # keep only its final path component before joining it to OUTPUT_DIR.
        out_name = response.file_name or "decoded_part.bin"
        out_name = os.path.basename(out_name.replace("\\", "/")) or "decoded_part.bin"
        out_path = os.path.join(OUTPUT_DIR, out_name)
        with open(out_path, "wb") as f_out:
            f_out.write(response.data)
        print(f"💾 Uloženo do: {out_path}")
        print("-" * 40)
    else:
        print("❌ Chyba: Dekodér nevrátil žádná data. Zkontrolujte, zda je soubor kompletní.")


if __name__ == "__main__":
    ultimate_bomba_decoder()