z230

2025-12-27 17:24:30 +01:00
parent ea485bad29
commit f8dc6566bc
7 changed files with 542 additions and 0 deletions
--- a/slepeni.py
+++ b/slepeni.py
@@ -0,0 +1,202 @@
+import os
+import re
+import nntplib
+from dotenv import load_dotenv
+from db import get_conn
+_YENC_PLAIN = bytes((b - 42) & 0xFF for b in range(256))  # encoded byte -> decoded byte
+
+
+def yenc_decode_lines(lines: list[bytes], debug=False, *, dot_stuffed=False) -> bytes:
+    """
+    Decode the yEnc payload contained in the lines of an NNTP article body.
+
+    Only lines between ``=ybegin`` and ``=yend`` are decoded; ``=ypart`` lines
+    and trailing CR/LF are skipped. If no ``=ybegin`` is present at all, a
+    warning is printed and every non-control line is decoded as data.
+
+    :param lines: body lines as bytes, e.g. ``info.lines`` from ``nntplib``.
+    :param debug: print a trace of control lines and decoded sizes.
+    :param dot_stuffed: pass True only for raw wire lines that still carry
+        NNTP dot-stuffing. ``nntplib`` already removes it, and stripping dots
+        a second time would delete genuine leading ``.`` data bytes.
+    :return: the decoded payload.
+    """
+    out = bytearray()
+    saw_ybegin = False
+    headerless = not any(ln.startswith(b"=ybegin") for ln in lines)
+    in_payload = headerless  # legacy fallback: no header means "decode everything"
+    data_lines = 0
+
+    for idx, line in enumerate(lines):
+        line = line.rstrip(b"\r\n")  # literal CR/LF is never data (encoders escape it)
+        if dot_stuffed and line.startswith(b".."):
+            if debug:
+                print(f"    [dot] line {idx}: '..' -> '.'")
+            line = line[1:]
+        # --- yEnc control lines ---
+        if line.startswith(b"=ybegin"):
+            saw_ybegin = in_payload = True
+            if debug:
+                print("    [yEnc] =ybegin detected")
+            continue
+        if line.startswith(b"=ypart"):
+            if debug:
+                print("    [yEnc] =ypart detected")
+            continue
+        if line.startswith(b"=yend"):
+            in_payload = headerless
+            if debug:
+                print("    [yEnc] =yend detected")
+            continue
+        if not in_payload:
+            continue  # text outside the =ybegin/=yend envelope is not payload
+
+        # --- actual yEnc data: bulk-translate plain runs, fix up escapes ---
+        data_lines += 1
+        start = 0
+        while (esc := line.find(b"=", start)) >= 0 and esc + 1 < len(line):
+            out += line[start:esc].translate(_YENC_PLAIN)
+            out.append((line[esc + 1] - 106) & 0xFF)  # escaped byte: undo +64, then +42
+            start = esc + 2
+        # esc >= 0 here means a dangling '=' ended the line; it is dropped.
+        out += line[start:esc if esc >= 0 else None].translate(_YENC_PLAIN)
+
+    if debug:
+        print(f"    [yEnc] saw_ybegin={saw_ybegin}, decoded_data_lines={data_lines}")
+        print(f"    [yEnc] decoded_bytes={len(out)}")
+    if not saw_ybegin:
+        print("⚠️  WARNING: yEnc decoder used but =ybegin was NOT seen")
+    return bytes(out)
+
+
+# def yenc_decode_lines(lines: list[bytes]) -> bytes:
+#     """
+#     Decode yEnc from NNTP BODY lines.
+#     Handles NNTP dot-stuffing correctly.
+#     """
+#     out = bytearray()
+#
+#     for line in lines:
+#         # --- undo NNTP dot-stuffing ---
+#         if line.startswith(b".."):
+#             line = line[1:]
+#         elif line.startswith(b"."):
+#             line = line[1:]
+#
+#         # --- skip yEnc control lines ---
+#         if line.startswith(b"=ybegin"):
+#             continue
+#         if line.startswith(b"=ypart"):
+#             continue
+#         if line.startswith(b"=yend"):
+#             continue
+#
+#         i = 0
+#         length = len(line)
+#
+#         while i < length:
+#             c = line[i]
+#
+#             if c == ord('='):   # yEnc escape
+#                 i += 1
+#                 if i >= length:
+#                     break
+#                 c = (line[i] - 64) & 0xFF
+#
+#             out.append((c - 42) & 0xFF)
+#             i += 1
+#
+#     return bytes(out)
+
+
+
+
+# ================== CONFIG ==================
+GROUP = "alt.binaries.e-book.magazines"
+SUBJECT_KEY = "PC Pro 2011-07.pdf"
+OUT_DIR = r"downloads/PC_Pro_2011-07"
+FINAL_PDF = r"downloads/PC_Pro_2011-07.pdf"
+# ============================================
+
+load_dotenv()
+EWEKA_USER = os.getenv("EWEKA_USER")
+EWEKA_PASS = os.getenv("EWEKA_PASS")
+
+os.makedirs(OUT_DIR, exist_ok=True)
+
+# --- load article numbers + subject ---
+PARTS_QUERY = """
+    SELECT article_number, metadata->>'subject'
+    FROM articles
+    WHERE newsgroup = %s
+      AND metadata->>'subject' LIKE %s
+    ORDER BY article_number
+"""
+print("🔌 Connecting to PostgreSQL...")
+conn = get_conn()
+try:  # the database is not used after this fetch, so always release the connection
+    cur = conn.cursor()
+    cur.execute(PARTS_QUERY, (GROUP, f"%{SUBJECT_KEY}%"))
+    rows = cur.fetchall()
+    cur.close()
+finally:
+    conn.close()
+print(f"📦 Found {len(rows)} parts")
+
+# --- parse part number n from a "(n/total)" marker in the subject ---
+part_re = re.compile(r"\((\d+)\s*/\s*\d+\)")
+parts = []
+for art_num, subject in rows:
+    m = part_re.search(subject or "")
+    if not m:
+        raise RuntimeError(f"Cannot parse part number from subject: {subject}")
+    part_no = int(m.group(1))
+    parts.append((part_no, art_num))
+parts.sort(key=lambda x: x[0])  # sort by part number (1..N)
+
+print("🔌 Connecting to Eweka NNTP...")
+with nntplib.NNTP_SSL(
+    "news.eweka.nl",
+    563,
+    EWEKA_USER,
+    EWEKA_PASS,
+    readermode=True,
+) as nntp:
+    nntp.group(GROUP)  # article numbers are relative to the selected group
+
+    for idx, (part_no, art_num) in enumerate(parts, start=1):
+        out_path = os.path.join(OUT_DIR, f"part_{part_no:03d}.bin")
+        tmp_path = out_path + ".tmp"
+
+        if os.path.exists(out_path):
+            print(f"⏭️  [{idx}/{len(parts)}] part {part_no} already exists, skipping")
+            continue
+
+        print(f"⬇️  [{idx}/{len(parts)}] Downloading part {part_no} (article {art_num})")
+        _resp, info = nntp.body(art_num)
+        print(f"    BODY lines received: {len(info.lines)}")
+
+        # quick look at the first few raw lines
+        for ln in info.lines[:3]:
+            print("    RAW:", ln[:80])
+
+        decoded = yenc_decode_lines(info.lines, debug=True)
+        print(f"    RESULT bytes: {len(decoded)}")
+
+        # Write to a temp name and rename: an interrupted write must never leave
+        # a truncated part_NNN.bin that the exists-check above would then skip.
+        with open(tmp_path, "wb") as f:
+            f.write(decoded)
+        os.replace(tmp_path, out_path)
+
+
+print("🧩 Assembling final PDF...")
+# A part number listed twice in `parts` still maps to one file; append it once.
+with open(FINAL_PDF, "wb") as out:
+    for part_no in sorted({number for number, _ in parts}):
+        part_path = os.path.join(OUT_DIR, f"part_{part_no:03d}.bin")
+        with open(part_path, "rb") as pf:
+            out.write(pf.read())
+
+print("🎉 DONE")
+print(f"📄 Final PDF: {FINAL_PDF}")