z230

2025-12-28 11:34:15 +01:00
parent f8dc6566bc
commit 64472e59ba
6 changed files with 488 additions and 143 deletions
--- a/slepeni.py
+++ b/slepeni.py
@@ -3,119 +3,11 @@ import re
 import nntplib
 from dotenv import load_dotenv
 from db import get_conn
 def yenc_decode_lines(lines: list[bytes], debug=False, dot_stuffed=False) -> bytes:
    """
    Decode the yEnc payload contained in NNTP BODY lines.

    Control lines (``=ybegin``, ``=ypart``, ``=yend``) are skipped; every
    other line is treated as yEnc data and decoded.

    Args:
        lines: Body lines without their line terminators.
        debug: When true, print a trace of what the decoder sees.
        dot_stuffed: Set to true only for lines taken straight off the wire
            that still carry NNTP dot-stuffing; one leading ``.`` is then
            removed from every line that has one.  Lines returned by
            ``nntplib`` (``NNTP.body``) are already un-stuffed, so by default
            they are left untouched: stripping a second time would delete a
            genuine payload byte and corrupt the decoded data.

    Returns:
        The decoded bytes of all data lines, concatenated in order.
    """
    out = bytearray()
    saw_ybegin = False
    data_lines = 0
    for idx, line in enumerate(lines):
        # --- NNTP dot-stuffing (opt-in, see docstring) ---
        if dot_stuffed and line.startswith(b"."):
            if debug:
                print(f"    [dot] line {idx}: leading '.' removed")
            line = line[1:]
        # --- yEnc control lines ---
        if line.startswith(b"=ybegin"):
            saw_ybegin = True
            if debug:
                print("    [yEnc] =ybegin detected")
            continue
        if line.startswith(b"=ypart"):
            if debug:
                print("    [yEnc] =ypart detected")
            continue
        if line.startswith(b"=yend"):
            if debug:
                print("    [yEnc] =yend detected")
            continue
        # --- actual yEnc data ---
        data_lines += 1
        escaped = False
        for byte in line:
            if escaped:
                # Escaped byte: undo the extra +64, then the regular +42.
                out.append((byte - 64 - 42) & 0xFF)
                escaped = False
            elif byte == 0x3D:  # '=' introduces an escaped byte
                escaped = True
            else:
                out.append((byte - 42) & 0xFF)
        # A dangling '=' at the end of a line has nothing to escape and is
        # dropped; `escaped` is reset for the next line.
    if debug:
        print(f"    [yEnc] saw_ybegin={saw_ybegin}, decoded_data_lines={data_lines}")
        print(f"    [yEnc] decoded_bytes={len(out)}")
    if not saw_ybegin:
        print("⚠️  WARNING: yEnc decoder used but =ybegin was NOT seen")
    return bytes(out)
 # def yenc_decode_lines(lines: list[bytes]) -> bytes:
 #     """
 #     Decode yEnc from NNTP BODY lines.
 #     Handles NNTP dot-stuffing correctly.
 #     """
 #     out = bytearray()
 #
 #     for line in lines:
 #         # --- undo NNTP dot-stuffing ---
 #         if line.startswith(b".."):
 #             line = line[1:]
 #         elif line.startswith(b"."):
 #             line = line[1:]
 #
 #         # --- skip yEnc control lines ---
 #         if line.startswith(b"=ybegin"):
 #             continue
 #         if line.startswith(b"=ypart"):
 #             continue
 #         if line.startswith(b"=yend"):
 #             continue
 #
 #         i = 0
 #         length = len(line)
 #
 #         while i < length:
 #             c = line[i]
 #
 #             if c == ord('='):   # yEnc escape
 #                 i += 1
 #                 if i >= length:
 #                     break
 #                 c = (line[i] - 64) & 0xFF
 #
 #             out.append((c - 42) & 0xFF)
 #             i += 1
 #
 #     return bytes(out)
 # ================== CONFIG ==================
 GROUP = "alt.binaries.e-book.magazines"
 SUBJECT_KEY = "PC Pro 2011-07.pdf"
 OUT_DIR = r"downloads/PC_Pro_2011-07"
 FINAL_PDF = r"downloads/PC_Pro_2011-07.pdf"
 # ============================================
 load_dotenv()
@@ -124,11 +16,47 @@ EWEKA_PASS = os.getenv("EWEKA_PASS")
 os.makedirs(OUT_DIR, exist_ok=True)
-print("🔌 Connecting to PostgreSQL...")
+
 def yenc_decode_and_extract_headers(lines: list[bytes], dot_stuffed=False) -> tuple[bytes, list[str]]:
    """
    Decode yEnc BODY lines and collect the yEnc control lines.

    Args:
        lines: Body lines without their line terminators.
        dot_stuffed: Set to true only for lines that still carry NNTP
            dot-stuffing (raw wire data); one leading ``.`` is then removed
            from every line that has one.  Lines returned by ``nntplib``
            (``NNTP.body``) are already un-stuffed, so by default nothing is
            stripped -- removing a second dot would delete a real payload
            byte and shift everything after it.

    Returns:
        ``(decoded_bytes, yenc_header_lines)`` where the second item holds
        the ``=ybegin`` / ``=ypart`` / ``=yend`` lines, latin-1 decoded, in
        the order they appeared.
    """
    out = bytearray()
    yenc_headers = []
    control_prefixes = (b"=ybegin", b"=ypart", b"=yend")
    for line in lines:
        # --- NNTP dot-stuffing (opt-in, see docstring) ---
        if dot_stuffed and line.startswith(b"."):
            line = line[1:]
        # --- capture yEnc control lines ---
        if line.startswith(control_prefixes):
            yenc_headers.append(line.decode("latin-1"))
            continue
        # --- yEnc decode ---
        escaped = False
        for byte in line:
            if escaped:
                # Escaped byte: undo the extra +64, then the regular +42.
                out.append((byte - 64 - 42) & 0xFF)
                escaped = False
            elif byte == 0x3D:  # '=' introduces an escaped byte
                escaped = True
            else:
                out.append((byte - 42) & 0xFF)
        # A dangling '=' at the end of a line is dropped.
    return bytes(out), yenc_headers
 # ------------------ DB ------------------
 conn = get_conn()
 cur = conn.cursor()
 # --- load article numbers + subject ---
 cur.execute("""
    SELECT article_number, metadata->>'subject'
    FROM articles
@@ -138,12 +66,11 @@ cur.execute("""
 """, (GROUP, f"%{SUBJECT_KEY}%"))
 rows = cur.fetchall()
 print(f"📦 Found {len(rows)} parts")
-# --- parse part number from subject ---
+# parse part number from subject
 part_re = re.compile(r"\((\d+)\s*/\s*\d+\)")
 parts = []
 for art_num, subject in rows:
    m = part_re.search(subject or "")
    if not m:
@@ -151,10 +78,9 @@ for art_num, subject in rows:
    part_no = int(m.group(1))
    parts.append((part_no, art_num))
 # sort by part number (1..N)
 parts.sort(key=lambda x: x[0])
-print("🔌 Connecting to Eweka NNTP...")
+# ------------------ NNTP ------------------
 with nntplib.NNTP_SSL(
    "news.eweka.nl",
    563,
@@ -165,38 +91,19 @@ with nntplib.NNTP_SSL(
    nntp.group(GROUP)
-    for idx, (part_no, art_num) in enumerate(parts, start=1):
+    for part_no, art_num in parts:
-        out_path = os.path.join(OUT_DIR, f"part_{part_no:03d}.bin")
+        bin_path = os.path.join(OUT_DIR, f"part_{part_no:03d}.bin")
        hdr_path = os.path.join(OUT_DIR, f"part_{part_no:03d}.yEncHeader")
-        if os.path.exists(out_path):
+        if os.path.exists(bin_path) and os.path.exists(hdr_path):
            print(f"⏭️  [{idx}/{len(parts)}] part {part_no} already exists, skipping")
            continue
-        print(f"⬇️  [{idx}/{len(parts)}] Downloading part {part_no} (article {art_num})")
+        _, info = nntp.body(art_num)
-        resp, info = nntp.body(art_num)
+        decoded, headers = yenc_decode_and_extract_headers(info.lines)
-        print(f"    BODY lines received: {len(info.lines)}")
+        with open(bin_path, "wb") as f:
        # quick check of the first lines
        for ln in info.lines[:3]:
            print(f"    RAW:", ln[:80])
        decoded = yenc_decode_lines(info.lines, debug=True)
        print(f"    RESULT bytes: {len(decoded)}")
        with open(out_path, "wb") as f:
            f.write(decoded)
-
+        with open(hdr_path, "w", encoding="utf-8") as f:
-print("🧩 Assembling final PDF...")
+            f.write("\n".join(headers))
 with open(FINAL_PDF, "wb") as out:
    for part_no, _ in parts:
        part_path = os.path.join(OUT_DIR, f"part_{part_no:03d}.bin")
        with open(part_path, "rb") as pf:
            out.write(pf.read())
 print("🎉 DONE")
 print(f"📄 Final PDF: {FINAL_PDF}")
--- a/slepení2.PY
+++ b/slepení2.PY
@@ -0,0 +1,93 @@
 import os
 import re
 # ================= CONFIG =================
 PARTS_DIR = r"downloads/PC_Pro_2011-07"
 OUTPUT_DIR = r"downloads"
 # ==========================================
 # Assemble the decoded part_NNN.bin files into the final file, placing each
 # part at the offset announced in its saved yEnc header (part_NNN.yEncHeader).
 ybegin_re = re.compile(r"size=(\d+)")
 name_re   = re.compile(r"name=(.+)")
 ypart_re  = re.compile(r"begin=(\d+)\s+end=(\d+)")
 header_suffix = ".yEncHeader"
 parts = []
 total_size = None
 final_name = None
 # --- 1. load all yEnc header files ---
 # sorted() makes it deterministic which header supplies size and name.
 for fname in sorted(os.listdir(PARTS_DIR)):
    if not fname.endswith(header_suffix):
        continue
    part_id = fname[:-len(header_suffix)]
    hdr_path = os.path.join(PARTS_DIR, fname)
    bin_path = os.path.join(PARTS_DIR, f"{part_id}.bin")
    if not os.path.exists(bin_path):
        raise RuntimeError(f"Missing bin file for {fname}")
    with open(hdr_path, "r", encoding="utf-8", errors="replace") as f:
        lines = f.read().splitlines()
    begin = end = None
    for line in lines:
        if line.startswith("=ybegin"):
            if total_size is None:
                m = ybegin_re.search(line)
                if m:
                    total_size = int(m.group(1))
            # Parsed independently of the size, so a header that lacks one
            # of the two fields does not block the other.
            if final_name is None:
                m = name_re.search(line)
                if m:
                    # The name comes from the article and is untrusted: keep
                    # only the last path component so it cannot escape
                    # OUTPUT_DIR (absolute paths, "../", backslashes).
                    candidate = os.path.basename(
                        m.group(1).strip().replace("\\", "/")
                    )
                    if candidate not in ("", ".", ".."):
                        final_name = candidate
        if line.startswith("=ypart"):
            m = ypart_re.search(line)
            if not m:
                raise RuntimeError(f"Cannot parse ypart in {fname}")
            begin = int(m.group(1))
            end   = int(m.group(2))
    if begin is None or end is None:
        raise RuntimeError(f"Missing begin/end in {fname}")
    if begin < 1 or end < begin:
        # Offsets are 1-based; anything else would seek to a bogus position.
        raise RuntimeError(f"Invalid begin/end in {fname}: {begin}..{end}")
    parts.append({
        "bin": bin_path,
        "begin": begin,
        "end": end
    })
 # --- sanity checks ---
 if total_size is None or final_name is None:
    raise RuntimeError("Missing ybegin info (size/name)")
 output_path = os.path.join(OUTPUT_DIR, final_name)
 print(f"📄 Final file: {output_path}")
 print(f"📦 Total size: {total_size} bytes")
 print(f"🧩 Parts: {len(parts)}")
 # --- 2. allocate the target file, 3. write each part at its offset ---
 # One handle is enough: a file opened with "wb" can be pre-sized with
 # truncate() and then written at arbitrary offsets.
 with open(output_path, "wb") as out:
    out.truncate(total_size)
    for p in parts:
        expected_len = p["end"] - p["begin"] + 1
        with open(p["bin"], "rb") as bf:
            data = bf.read()
        if len(data) != expected_len:
            print(
                f"⚠️  Size mismatch in {os.path.basename(p['bin'])}: "
                f"expected {expected_len}, got {len(data)}"
            )
        # shorter data is OK (CRC, padding, end-of-part)
        out.seek(p["begin"] - 1)  # yEnc offsets are 1-based
        out.write(data)
 print("🎉 DONE – file assembled correctly")
--- a/multithread.py
+++ b/multithread.py
@@ -0,0 +1,93 @@
 import os
 import re
 import nntplib
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from dotenv import load_dotenv
 from db import get_conn
 # ================= CONFIG =================
 GROUP = "alt.binaries.e-book.magazines"
 SUBJECT_KEY = "PC Pro 2011-07.pdf"
 RAW_DIR = r"downloads/raw"
 MAX_WORKERS = 5
 # ==========================================
 load_dotenv()
 EWEKA_USER = os.getenv("EWEKA_USER")
 EWEKA_PASS = os.getenv("EWEKA_PASS")
 os.makedirs(RAW_DIR, exist_ok=True)
 # ---------- DB: load parts ----------
 # The database is only needed for this one query, so the cursor and the
 # connection are released as soon as the rows are fetched instead of staying
 # open for the whole download.
 # NOTE(review): assumes get_conn() returns a DB-API 2.0 connection (it is
 # used with .cursor()/.execute()/.fetchall() and %s parameters) -- confirm.
 conn = get_conn()
 try:
    cur = conn.cursor()
    try:
        cur.execute("""
    SELECT article_number, metadata->>'subject'
    FROM articles
    WHERE newsgroup = %s
      AND metadata->>'subject' LIKE %s
    ORDER BY article_number
 """, (GROUP, f"%{SUBJECT_KEY}%"))
        rows = cur.fetchall()
    finally:
        cur.close()
 finally:
    conn.close()
 # The part number is the N of the "(N/M)" marker in the subject.
 part_re = re.compile(r"\((\d+)\s*/\s*\d+\)")
 parts = []
 for art_num, subject in rows:
    m = part_re.search(subject or "")
    if not m:
        # Subjects without an (N/M) marker are skipped.
        continue
    part_no = int(m.group(1))
    parts.append((part_no, art_num))
 parts.sort(key=lambda x: x[0])
 print(f"📦 Parts to download: {len(parts)}")
 # ---------- worker ----------
 def download_part(part_no: int, art_num: int) -> str:
    """
    Download the body of one article and store it as a raw part file.

    Each call opens its own NNTP connection, so the function can be run from
    several worker threads at once.

    Args:
        part_no: Part number, used for the output file name.
        art_num: Article number to fetch from GROUP.

    Returns:
        A short status message (part skipped or part downloaded).

    Raises:
        Whatever ``nntplib`` or the file system raises; nothing is caught.
    """
    out_path = os.path.join(RAW_DIR, f"part_{part_no:03d}.raw")
    if os.path.exists(out_path):
        return f"⏭️  part {part_no} exists"
    with nntplib.NNTP_SSL(
        "news.eweka.nl",
        563,
        EWEKA_USER,
        EWEKA_PASS,
        readermode=True,
        timeout=120
    ) as nntp:
        nntp.group(GROUP)
        _, info = nntp.body(art_num)
    # The connection is already closed here; disk I/O does not hold it open.
    # Write under a temporary name and rename at the end: an interrupted
    # write must not leave a truncated file that the exists() check above
    # would treat as a finished part on the next run.
    tmp_path = f"{out_path}.tmp"
    with open(tmp_path, "wb") as f:
        f.writelines(line + b"\n" for line in info.lines)
    os.replace(tmp_path, out_path)
    return f"⬇️  part {part_no} done"
 # ---------- parallel execution ----------
 # Fan the downloads out over the thread pool and report each result as soon
 # as it is available; a failed part is counted, not fatal.
 errors = 0
 with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
    futures = []
    for job in parts:
        # job is a (part_no, art_num) pair
        futures.append(executor.submit(download_part, *job))
    for done in as_completed(futures):
        try:
            print(done.result())
        except Exception as exc:
            errors = errors + 1
            print(f"❌ ERROR: {exc}")
 print("🎉 DONE")
 print(f"⚠️  Errors: {errors}")
--- a/raw.py
+++ b/raw.py
@@ -0,0 +1,74 @@
 import os
 import re
 import nntplib
 from dotenv import load_dotenv
 from db import get_conn
 # ================= CONFIG =================
 GROUP = "alt.binaries.e-book.magazines"
 SUBJECT_KEY = "PC Pro 2011-07.pdf"
 RAW_DIR = r"downloads/raw"
 # ==========================================
 load_dotenv()
 EWEKA_USER = os.getenv("EWEKA_USER")
 EWEKA_PASS = os.getenv("EWEKA_PASS")
 os.makedirs(RAW_DIR, exist_ok=True)
 # --- DB: find the articles ---
 # The database is only needed for this one query, so the cursor and the
 # connection are released right after the rows are fetched.
 # NOTE(review): assumes get_conn() returns a DB-API 2.0 connection (it is
 # used with .cursor()/.execute()/.fetchall() and %s parameters) -- confirm.
 conn = get_conn()
 try:
    cur = conn.cursor()
    try:
        cur.execute("""
    SELECT article_number, metadata->>'subject'
    FROM articles
    WHERE newsgroup = %s
      AND metadata->>'subject' LIKE %s
    ORDER BY article_number
 """, (GROUP, f"%{SUBJECT_KEY}%"))
        rows = cur.fetchall()
    finally:
        cur.close()
 finally:
    conn.close()
 # --- parse part number from subject ---
 part_re = re.compile(r"\((\d+)\s*/\s*\d+\)")
 parts = []
 for art_num, subject in rows:
    m = part_re.search(subject or "")
    if not m:
        raise RuntimeError(f"Cannot parse part number from subject: {subject}")
    part_no = int(m.group(1))
    parts.append((part_no, art_num))
 parts.sort(key=lambda x: x[0])
 print(f"📦 Parts to download: {len(parts)}")
 # --- NNTP download ---
 with nntplib.NNTP_SSL(
    "news.eweka.nl",
    563,
    EWEKA_USER,
    EWEKA_PASS,
    readermode=True
 ) as nntp:
    nntp.group(GROUP)
    for part_no, art_num in parts:
        out_path = os.path.join(RAW_DIR, f"part_{part_no:03d}.raw")
        if os.path.exists(out_path):
            continue
        print(f"⬇️  Downloading RAW part {part_no} (article {art_num})")
        _, info = nntp.body(art_num)
        # Write under a temporary name and rename at the end: an interrupted
        # run must not leave a truncated file that the exists() check above
        # would skip as already downloaded.
        tmp_path = f"{out_path}.tmp"
        with open(tmp_path, "wb") as f:
            f.writelines(line + b"\n" for line in info.lines)
        os.replace(tmp_path, out_path)
 print("🎉 DONE – RAW NNTP data stored")
--- a/one.py
+++ b/one.py
@@ -0,0 +1,93 @@
 import binascii
 RAW_FILE = r"u:\PycharmProjects\NewsGroups\downloads\raw\part_001.raw"
 # --------------------------------------------------
 # yEnc decoder (offline, minimal, correct)
 # --------------------------------------------------
 def yenc_decode(lines: list[bytes], dot_stuffed=False) -> bytes:
    """
    Decode yEnc data lines (control lines must already be removed).

    Args:
        lines: yEnc data lines without their line terminators.
        dot_stuffed: Set to true only for lines that still carry NNTP
            dot-stuffing (raw wire data); one leading ``.`` is then removed
            from every line that has one.  Lines obtained through
            ``nntplib`` are already un-stuffed, so by default nothing is
            stripped -- removing a second dot would delete a real payload
            byte and break the CRC.

    Returns:
        The decoded bytes of all lines, concatenated in order.  A dangling
        ``=`` at the end of a line has nothing to escape and is dropped
        instead of raising IndexError.
    """
    out = bytearray()
    for line in lines:
        # undo NNTP dot-stuffing (opt-in, see docstring)
        if dot_stuffed and line.startswith(b"."):
            line = line[1:]
        escaped = False
        for byte in line:
            if escaped:
                # Escaped byte: undo the extra +64, then the regular +42.
                out.append((byte - 64 - 42) & 0xFF)
                escaped = False
            elif byte == 0x3D:  # '=' introduces an escaped byte
                escaped = True
            else:
                out.append((byte - 42) & 0xFF)
    return bytes(out)
 # --------------------------------------------------
 # 1. read RAW file
 # --------------------------------------------------
 with open(RAW_FILE, "rb") as f:
    raw_lines = f.read().splitlines()
 print(f"📄 RAW lines: {len(raw_lines)}")
 # --------------------------------------------------
 # 2. split parts
 # --------------------------------------------------
 # Control lines are kept (decoded to text) for reporting; everything else
 # is yEnc data.
 ybegin = None
 ypart = None
 yend = None
 data_lines = []
 for line in raw_lines:
    if line.startswith(b"=ybegin"):
        ybegin = line.decode(errors="replace")
        continue
    if line.startswith(b"=ypart"):
        ypart = line.decode(errors="replace")
        continue
    if line.startswith(b"=yend"):
        yend = line.decode(errors="replace")
        continue
    data_lines.append(line)
 print("🧾 ybegin:", ybegin)
 print("🧾 ypart :", ypart)
 print("🧾 yend  :", yend)
 print(f"📦 DATA lines: {len(data_lines)}")
 # --------------------------------------------------
 # 3. decode yEnc DATA
 # --------------------------------------------------
 decoded = yenc_decode(data_lines)
 print(f"📦 Decoded bytes: {len(decoded)}")
 # --------------------------------------------------
 # 4. extract pcrc32
 # --------------------------------------------------
 pcrc = None
 # yend stays None when the file has no =yend line; report that through the
 # RuntimeError below instead of failing on the `in` test.
 if yend is not None and "pcrc32=" in yend:
    # Keep only the hex token: further fields (e.g. crc32=...) may follow
    # it, and writers differ in letter case and zero padding.
    token = yend.split("pcrc32=")[1].split()
    if token:
        pcrc = token[0].lower().zfill(8)
 if not pcrc:
    raise RuntimeError("No pcrc32 found")
 # --------------------------------------------------
 # 5. compute CRC32
 # --------------------------------------------------
 crc_calc = f"{binascii.crc32(decoded) & 0xffffffff:08x}"
 print(f"🔎 CRC expected: {pcrc}")
 print(f"🔎 CRC computed: {crc_calc}")
 if crc_calc == pcrc:
    print("✅ CRC MATCH — PART IS PERFECT")
 else:
    print("❌ CRC MISMATCH — DATA IS BROKEN")
--- a/test.py
+++ b/test.py
@@ -0,0 +1,85 @@
 import sabctools
 import io
 import os
 import binascii
 import re
 # --- CONFIGURATION ---
 INPUT_FILE = r"u:\PycharmProjects\NewsGroups\downloads\raw\part_001.raw"
 OUTPUT_DIR = r"u:\PycharmProjects\NewsGroups\downloads\decoded"
 def ultimate_bomba_decoder():
    """
    Decode one raw yEnc article with sabctools, check it and save the result.

    Reads INPUT_FILE, takes the expected size and pcrc32 from the ``=yend``
    line, wraps the body in a minimal NNTP response, feeds it to
    ``sabctools.Decoder`` and, if the decoder yields data, prints a size/CRC
    report and writes the payload into OUTPUT_DIR.

    Returns:
        None.  All feedback is printed; a missing input file or an empty
        decoder result is reported and the function simply returns.
    """
    if not os.path.exists(INPUT_FILE):
        print(f"❌ Soubor nenalezen: {INPUT_FILE}")
        return
    print("📖 Načítám soubor...")
    with open(INPUT_FILE, "rb") as f:
        raw_data = f.read()
    # 1. EXTRACT METADATA
    # Raw bytes literal, so the pattern can never trigger an
    # invalid-escape SyntaxWarning.
    yend_match = re.search(rb"=yend size=([0-9]+).*pcrc32=([0-9a-fA-F]+)", raw_data)
    expected_size = 0
    expected_crc_str = ""
    if yend_match:
        expected_size = int(yend_match.group(1))
        expected_crc_str = yend_match.group(2).decode().lower()
        print(f"🎯 Metadata nalezena: Očekávaná velikost={expected_size}, Očekávané CRC={expected_crc_str}")
    # 2. CLEAN THE DATA
    # Drop leading/trailing whitespace and normalise every line ending to
    # \r\n (also for files stored with bare \n).
    processed_data = raw_data.strip()
    processed_data = processed_data.replace(b"\r\n", b"\n").replace(b"\n", b"\r\n")
    # 3. WRAP INTO AN NNTP RESPONSE
    wrapped = b"222 0 <part1@id>\r\n" + processed_data + b"\r\n.\r\n"
    # 4. DECODING (sabctools streaming API)
    # NOTE(review): the call sequence below is kept exactly as it was.
    decoder = sabctools.Decoder(len(wrapped))
    buf = io.BytesIO(wrapped)
    n = buf.readinto(decoder)
    decoder.process(n)
    response = next(decoder, None)
    if response and response.data:
        # 5. INTEGRITY CHECK (own CRC32 computation)
        vypoctene_crc_int = binascii.crc32(response.data)
        vypoctene_crc_str = f"{vypoctene_crc_int:08x}".lower()
        real_size = len(response.data)
        # Compare numerically: the header value may omit leading zeros, in
        # which case a string comparison would report a false mismatch.
        crc_matches = (
            yend_match is not None
            and int(expected_crc_str, 16) == vypoctene_crc_int
        )
        print("-" * 40)
        print("📊 Kontrola integrity:")
        print(f"   Skutečná velikost: {real_size} (Očekáváno: {expected_size})")
        print(f"   Vypočítané CRC:    {vypoctene_crc_str}")
        print(f"   Očekávané CRC:     {expected_crc_str}")
        if crc_matches:
            print("✅ BINGO! Soubor je 100% v pořádku.")
        else:
            print("⚠️ POZOR: CRC nesouhlasí, data mohou být poškozena.")
        # 6. SAVE
        os.makedirs(OUTPUT_DIR, exist_ok=True)
        # The file name comes from the yEnc header and is untrusted: keep
        # only its last path component so it cannot point outside OUTPUT_DIR.
        # NOTE(review): assumes response.file_name is a str (or empty/None).
        out_name = os.path.basename((response.file_name or "").replace("\\", "/"))
        if out_name in ("", ".", ".."):
            out_name = "decoded_part.bin"
        out_path = os.path.join(OUTPUT_DIR, out_name)
        with open(out_path, "wb") as f_out:
            f_out.write(response.data)
        print(f"💾 Uloženo do: {out_path}")
        print("-" * 40)
    else:
        print("❌ Chyba: Dekodér nevrátil žádná data. Zkontrolujte, zda je soubor kompletní.")
 if __name__ == "__main__":
    ultimate_bomba_decoder()