z230

2025-12-28 11:34:15 +01:00
parent f8dc6566bc
commit 64472e59ba
6 changed files with 488 additions and 143 deletions
--- a/slepeni.py
+++ b/slepeni.py
@@ -3,119 +3,11 @@ import re
 import nntplib
 from dotenv import load_dotenv
 from db import get_conn
-def yenc_decode_lines(lines: list[bytes], debug=False) -> bytes:
-    """
-    Decode yEnc from NNTP BODY lines.
-    Handles NNTP dot-stuffing and logs what happens.
-    """
-    out = bytearray()
-    saw_ybegin = False
-    data_lines = 0
-
-    for idx, orig_line in enumerate(lines):
-        line = orig_line
-
-        # --- NNTP dot-stuffing ---
-        if line.startswith(b".."):
-            if debug:
-                print(f"    [dot] line {idx}: '..' -> '.'")
-            line = line[1:]
-        elif line.startswith(b"."):
-            if debug:
-                print(f"    [dot] line {idx}: '.' removed")
-            line = line[1:]
-
-        # --- yEnc control lines ---
-        if line.startswith(b"=ybegin"):
-            saw_ybegin = True
-            if debug:
-                print(f"    [yEnc] =ybegin detected")
-            continue
-
-        if line.startswith(b"=ypart"):
-            if debug:
-                print(f"    [yEnc] =ypart detected")
-            continue
-
-        if line.startswith(b"=yend"):
-            if debug:
-                print(f"    [yEnc] =yend detected")
-            continue
-
-        # --- actual yEnc data ---
-        data_lines += 1
-        i = 0
-        length = len(line)
-
-        while i < length:
-            c = line[i]
-
-            if c == ord('='):
-                i += 1
-                if i >= length:
-                    break
-                c = (line[i] - 64) & 0xFF
-
-            out.append((c - 42) & 0xFF)
-            i += 1
-
-    if debug:
-        print(f"    [yEnc] saw_ybegin={saw_ybegin}, decoded_data_lines={data_lines}")
-        print(f"    [yEnc] decoded_bytes={len(out)}")
-
-    if not saw_ybegin:
-        print("⚠️  WARNING: yEnc decoder used but =ybegin was NOT seen")
-
-    return bytes(out)
-
-
-# def yenc_decode_lines(lines: list[bytes]) -> bytes:
-#     """
-#     Decode yEnc from NNTP BODY lines.
-#     Handles NNTP dot-stuffing correctly.
-#     """
-#     out = bytearray()
-#
-#     for line in lines:
-#         # --- undo NNTP dot-stuffing ---
-#         if line.startswith(b".."):
-#             line = line[1:]
-#         elif line.startswith(b"."):
-#             line = line[1:]
-#
-#         # --- skip yEnc control lines ---
-#         if line.startswith(b"=ybegin"):
-#             continue
-#         if line.startswith(b"=ypart"):
-#             continue
-#         if line.startswith(b"=yend"):
-#             continue
-#
-#         i = 0
-#         length = len(line)
-#
-#         while i < length:
-#             c = line[i]
-#
-#             if c == ord('='):   # yEnc escape
-#                 i += 1
-#                 if i >= length:
-#                     break
-#                 c = (line[i] - 64) & 0xFF
-#
-#             out.append((c - 42) & 0xFF)
-#             i += 1
-#
-#     return bytes(out)
-
-
-

 # ================== CONFIG ==================
 GROUP = "alt.binaries.e-book.magazines"
 SUBJECT_KEY = "PC Pro 2011-07.pdf"
 OUT_DIR = r"downloads/PC_Pro_2011-07"
-FINAL_PDF = r"downloads/PC_Pro_2011-07.pdf"
 # ============================================

 load_dotenv()
@@ -124,11 +16,47 @@ EWEKA_PASS = os.getenv("EWEKA_PASS")

 os.makedirs(OUT_DIR, exist_ok=True)

-print("🔌 Connecting to PostgreSQL...")
+
+def yenc_decode_and_extract_headers(lines: list[bytes], *, dot_stuffed: bool = False):
+    """
+    Decode yEnc BODY lines and extract yEnc headers.
+    lines: body lines without CR/LF; leading dots are kept as data because
+        nntplib's BODY response has already been un-dot-stuffed.
+    dot_stuffed: True only for raw wire lines; a leading ".." becomes ".".
+    Returns: (decoded_bytes, yenc_header_lines)
+    """
+    out = bytearray()
+    yenc_headers = []
+    for line in lines:
+        # Un-stuffing twice would drop a real data byte (0x04 encodes to ".").
+        if dot_stuffed and line.startswith(b".."):
+            line = line[1:]
+
+        # --- capture yEnc control lines ---
+        if line.startswith((b"=ybegin", b"=ypart", b"=yend")):
+            yenc_headers.append(line.decode("latin-1"))
+            continue
+
+        # --- yEnc decode ("=" escapes the next byte) ---
+        i = 0
+        length = len(line)
+        while i < length:
+            c = line[i]
+            if c == ord("="):
+                i += 1
+                if i >= length:
+                    break  # dangling "=" at end of line: nothing to decode
+                c = (line[i] - 64) & 0xFF
+            out.append((c - 42) & 0xFF)
+            i += 1
+
+    return bytes(out), yenc_headers
+
+
+# ------------------ DB ------------------
 conn = get_conn()
 cur = conn.cursor()

-# --- load article numbers + subject ---
 cur.execute("""
    SELECT article_number, metadata->>'subject'
    FROM articles
@@ -138,12 +66,11 @@ cur.execute("""
 """, (GROUP, f"%{SUBJECT_KEY}%"))

 rows = cur.fetchall()
-print(f"📦 Found {len(rows)} parts")

-# --- parse part number from subject ---
+# parse part number from subject
 part_re = re.compile(r"\((\d+)\s*/\s*\d+\)")
-
 parts = []
+
 for art_num, subject in rows:
    m = part_re.search(subject or "")
    if not m:
@@ -151,10 +78,9 @@ for art_num, subject in rows:
    part_no = int(m.group(1))
    parts.append((part_no, art_num))

-# sort by part number (1..N)
 parts.sort(key=lambda x: x[0])

-print("🔌 Connecting to Eweka NNTP...")
+# ------------------ NNTP ------------------
 with nntplib.NNTP_SSL(
    "news.eweka.nl",
    563,
@@ -165,38 +91,19 @@ with nntplib.NNTP_SSL(

    nntp.group(GROUP)

-    for idx, (part_no, art_num) in enumerate(parts, start=1):
-        out_path = os.path.join(OUT_DIR, f"part_{part_no:03d}.bin")
+    for part_no, art_num in parts:
+        bin_path = os.path.join(OUT_DIR, f"part_{part_no:03d}.bin")
+        hdr_path = os.path.join(OUT_DIR, f"part_{part_no:03d}.yEncHeader")

-        if os.path.exists(out_path):
-            print(f"⏭️  [{idx}/{len(parts)}] part {part_no} already exists, skipping")
+        if os.path.exists(bin_path) and os.path.exists(hdr_path):
            continue

-        print(f"⬇️  [{idx}/{len(parts)}] Downloading part {part_no} (article {art_num})")
+        _, info = nntp.body(art_num)

-        resp, info = nntp.body(art_num)
+        decoded, headers = yenc_decode_and_extract_headers(info.lines)

-        print(f"    BODY lines received: {len(info.lines)}")
-
-        # rychlá kontrola prvních řádků
-        for ln in info.lines[:3]:
-            print(f"    RAW:", ln[:80])
-
-        decoded = yenc_decode_lines(info.lines, debug=True)
-
-        print(f"    RESULT bytes: {len(decoded)}")
-
-        with open(out_path, "wb") as f:
+        with open(bin_path, "wb") as f:
            f.write(decoded)

-
-print("🧩 Assembling final PDF...")
-
-with open(FINAL_PDF, "wb") as out:
-    for part_no, _ in parts:
-        part_path = os.path.join(OUT_DIR, f"part_{part_no:03d}.bin")
-        with open(part_path, "rb") as pf:
-            out.write(pf.read())
-
-print("🎉 DONE")
-print(f"📄 Final PDF: {FINAL_PDF}")
+        with open(hdr_path, "w", encoding="utf-8") as f:
+            f.write("\n".join(headers))