From f8dc6566bc4b1ee47a70d46c44820c1ba771daf9 Mon Sep 17 00:00:00 2001
From: Vladimir Buzalka <vladimir.buzalka@buzalka.cz>
Date: Sat, 27 Dec 2025 17:24:30 +0100
Subject: [PATCH] z230

---
 .env                               |  10 ++
 10 list of newsgroups.py           |  58 +++++++++
 20 Alt binaries ebook magazines.py |  93 +++++++++++++
 21 poslednich 100k.py              |  94 ++++++++++++++
 22 stat test posledniho clanku.py  |  71 ++++++++++
 23 ulozeni a slepeni.py            | 202 +++++++++++++++++++++++++++++
 db.py                              |  14 ++
 7 files changed, 542 insertions(+)
 create mode 100644 .env
 create mode 100644 10 list of newsgroups.py
 create mode 100644 20 Alt binaries ebook magazines.py
 create mode 100644 21 poslednich 100k.py
 create mode 100644 22 stat test posledniho clanku.py
 create mode 100644 23 ulozeni a slepeni.py
 create mode 100644 db.py

diff --git a/.env b/.env
new file mode 100644
index 0000000..4b10d20
--- /dev/null
+++ b/.env
@@ -0,0 +1,10 @@
+# ===== EWEKA =====
+EWEKA_USER=d6ef27c2d6496b22
+EWEKA_PASS=Vlado7309208104
+
+# ===== POSTGRES =====
+PG_HOST=192.168.1.76
+PG_PORT=5432
+PG_DB=newsgroups
+PG_USER=vladimir.buzalka
+PG_PASS=Vlado7309208104++
diff --git a/10 list of newsgroups.py b/10 list of newsgroups.py
new file mode 100644
index 0000000..83c7db0
--- /dev/null
+++ b/10 list of newsgroups.py	
@@ -0,0 +1,58 @@
+from datetime import datetime, UTC
+import nntplib
+from db import get_conn
+from dotenv import load_dotenv
+import os
+
+load_dotenv()  # loads .env
+EWEKA_USER = os.getenv("EWEKA_USER")
+EWEKA_PASS = os.getenv("EWEKA_PASS")
+
+PROVIDER = "eweka"  # written to the provider column of every row
+
+print("🔌 Connecting to PostgreSQL...")
+conn = get_conn()
+
+conn.autocommit = True  # no explicit commit() is issued anywhere below
+cur = conn.cursor()
+
+print("🔌 Connecting to Eweka NNTP...")
+with nntplib.NNTP_SSL(
+    host="news.eweka.nl",
+    port=563,
+    user=EWEKA_USER,
+    password=EWEKA_PASS,
+    readermode=True,
+) as nntp:
+
+    print("📜 Fetching LIST ACTIVE...")
+    resp, groups = nntp.list()
+    print(f"📦 Received {len(groups)} groups")
+
+    rows = [
+        (
+            name,
+            int(first),
+            int(last),
+            flag,
+            PROVIDER,
+            datetime.now(UTC),
+        )
+        for name, last, first, flag in groups  # entries carry last before first
+    ]
+
+    cur.executemany(  # upsert keyed on name; provider is only set on first insert
+        """
+        INSERT INTO newsgroups
+            (name, first_article, last_article, posting_flag, provider, fetched_at)
+        VALUES (%s, %s, %s, %s, %s, %s)
+        ON CONFLICT (name) DO UPDATE SET
+            first_article = EXCLUDED.first_article,
+            last_article  = EXCLUDED.last_article,
+            posting_flag  = EXCLUDED.posting_flag,
+            fetched_at    = EXCLUDED.fetched_at
+        """,
+        rows,
+    )
+
+print("🎉 DONE")
diff --git a/20 Alt binaries ebook magazines.py b/20 Alt binaries ebook magazines.py
new file mode 100644
index 0000000..89faf8d
--- /dev/null
+++ b/20 Alt binaries ebook magazines.py	
@@ -0,0 +1,93 @@
+import nntplib
+import os
+from dotenv import load_dotenv
+from datetime import datetime, UTC
+from db import get_conn
+from psycopg.types.json import Json
+
+
+def sanitize(value):
+    """Return *value* with lone surrogates turned into U+FFFD; non-str passes through."""
+    if not isinstance(value, str):
+        return value
+    return value.encode("utf-8", errors="surrogatepass").decode("utf-8", errors="replace")
+
+
+# ================= CONFIG =================
+GROUP = "alt.binaries.e-book.magazines"
+TOTAL_ARTICLES = 100_000  # how many of the newest article numbers to scan
+BATCH_SIZE = 1_000  # article numbers requested per XOVER call
+# =========================================
+
+load_dotenv()
+
+EWEKA_USER = os.getenv("EWEKA_USER")
+EWEKA_PASS = os.getenv("EWEKA_PASS")
+
+print("🔌 Connecting to PostgreSQL...")
+conn = get_conn()
+conn.autocommit = True  # no explicit commit() is issued anywhere below
+cur = conn.cursor()
+
+print("🔌 Connecting to Eweka NNTP...")
+with nntplib.NNTP_SSL(
+    host="news.eweka.nl",
+    port=563,
+    user=EWEKA_USER,
+    password=EWEKA_PASS,
+    readermode=True,
+) as nntp:
+
+    # --- GROUP ---
+    resp, count, first, last, name = nntp.group(GROUP)
+    first = int(first)
+    last = int(last)
+
+    print(f"📂 Group: {name}")
+    print(f"📐 Range: {first} – {last}")
+
+    start_global = max(first, last - TOTAL_ARTICLES + 1)  # clamp to the group's first article
+    print(f"🎯 Target range: {start_global} – {last}")
+
+
+    for batch_start in range(start_global, last + 1, BATCH_SIZE):
+        batch_end = min(batch_start + BATCH_SIZE - 1, last)
+
+        print(f"📜 XOVER {batch_start}-{batch_end}")
+
+        try:
+            resp, overviews = nntp.xover(batch_start, batch_end)
+        except Exception as e:  # best effort: report the batch and move on
+            print(f"⚠️ XOVER failed for {batch_start}-{batch_end}: {e}")
+            continue
+
+        rows = []
+        for art_num, fields in overviews:
+            clean_fields = {k: sanitize(v) for k, v in fields.items()}
+
+            metadata = {
+                "group": GROUP,
+                "article_number": art_num,
+                **clean_fields,
+            }
+
+            rows.append((
+                GROUP,
+                art_num,
+                clean_fields.get("message-id"),  # sanitized like metadata, so it can be encoded
+                Json(metadata),  # 👈 HERE: wrapped so the dict is sent as JSON
+                datetime.now(UTC),
+            ))
+
+        if rows:
+            cur.executemany(  # rows whose message_id is already stored are skipped
+                """
+                INSERT INTO articles
+                    (newsgroup, article_number, message_id, metadata, fetched_at)
+                VALUES (%s, %s, %s, %s, %s)
+                ON CONFLICT (message_id) DO NOTHING
+                """,
+                rows,
+            )
+
+print("🎉 DONE – last 100k articles ingested")
diff --git a/21 poslednich 100k.py b/21 poslednich 100k.py
new file mode 100644
index 0000000..7383083
--- /dev/null
+++ b/21 poslednich 100k.py	
@@ -0,0 +1,94 @@
+import nntplib
+import os
+from dotenv import load_dotenv
+from datetime import datetime, UTC
+from db import get_conn
+from psycopg.types.json import Json
+
+
+def sanitize(value):
+    """Return *value* with lone surrogates turned into U+FFFD; non-str passes through."""
+    if not isinstance(value, str):
+        return value
+    return value.encode("utf-8", errors="surrogatepass").decode("utf-8", errors="replace")
+
+
+# ================= CONFIG =================
+GROUP = "alt.binaries.e-book.magazines"
+TOTAL_ARTICLES = 50_000_000  # upper bound on article numbers scanned, counted from the first
+BATCH_SIZE = 10_000  # article numbers requested per XOVER call
+# =========================================
+
+load_dotenv()
+
+EWEKA_USER = os.getenv("EWEKA_USER")
+EWEKA_PASS = os.getenv("EWEKA_PASS")
+
+print("🔌 Connecting to PostgreSQL...")
+conn = get_conn()
+conn.autocommit = True  # no explicit commit() is issued anywhere below
+cur = conn.cursor()
+
+print("🔌 Connecting to Eweka NNTP...")
+with nntplib.NNTP_SSL(
+    host="news.eweka.nl",
+    port=563,
+    user=EWEKA_USER,
+    password=EWEKA_PASS,
+    readermode=True,
+) as nntp:
+
+    # --- GROUP ---
+    resp, count, first, last, name = nntp.group(GROUP)
+    first = int(first)
+    last = int(last)
+
+    start_global = first  # this script walks forward from the oldest article
+    end_global = min(first + TOTAL_ARTICLES - 1, last)  # clamp to the group's last article
+
+    print(f"🎯 Target range: {start_global} – {end_global}")
+
+    print(f"📂 Group: {name}")
+    print(f"📐 Range: {first} – {last}")
+
+    for batch_start in range(start_global, end_global + 1, BATCH_SIZE):
+        batch_end = min(batch_start + BATCH_SIZE - 1, end_global)
+
+        print(f"📜 XOVER {batch_start}-{batch_end}")
+
+        try:
+            resp, overviews = nntp.xover(batch_start, batch_end)
+        except Exception as e:  # best effort: report the batch and move on
+            print(f"⚠️ XOVER failed for {batch_start}-{batch_end}: {e}")
+            continue
+
+        rows = []
+        for art_num, fields in overviews:
+            clean_fields = {k: sanitize(v) for k, v in fields.items()}
+
+            metadata = {
+                "group": GROUP,
+                "article_number": art_num,
+                **clean_fields,
+            }
+
+            rows.append((
+                GROUP,
+                art_num,
+                clean_fields.get("message-id"),  # sanitized like metadata, so it can be encoded
+                Json(metadata),  # 👈 HERE: wrapped so the dict is sent as JSON
+                datetime.now(UTC),
+            ))
+
+        if rows:
+            cur.executemany(  # rows whose message_id is already stored are skipped
+                """
+                INSERT INTO articles
+                    (newsgroup, article_number, message_id, metadata, fetched_at)
+                VALUES (%s, %s, %s, %s, %s)
+                ON CONFLICT (message_id) DO NOTHING
+                """,
+                rows,
+            )
+
+print(f"🎉 DONE – range {start_global} – {end_global} processed")
diff --git a/22 stat test posledniho clanku.py b/22 stat test posledniho clanku.py
new file mode 100644
index 0000000..89eabe2
--- /dev/null
+++ b/22 stat test posledniho clanku.py	
@@ -0,0 +1,71 @@
+import nntplib
+import os
+from dotenv import load_dotenv
+from db import get_conn
+
+# ================== CONFIG ==================
+GROUP = "alt.binaries.e-book.magazines"
+SUBJECT_KEY = "PC Pro 2011-07.pdf"  # substring searched for in the stored subject
+# ============================================
+
+load_dotenv()
+
+EWEKA_USER = os.getenv("EWEKA_USER")
+EWEKA_PASS = os.getenv("EWEKA_PASS")
+
+print("🔌 Connecting to PostgreSQL...")
+conn = get_conn()
+cur = conn.cursor()
+
+cur.execute("""
+    SELECT article_number
+    FROM articles
+    WHERE newsgroup = %s
+      AND metadata->>'subject' LIKE %s
+    ORDER BY article_number
+""", (GROUP, f"%{SUBJECT_KEY}%"))
+
+article_numbers = [row[0] for row in cur.fetchall()]
+total = len(article_numbers)
+
+print(f"📦 Found {total} parts in DB")
+
+if total == 0:
+    print("❌ No articles found, aborting.")
+    raise SystemExit(1)  # exit() is a site-module convenience and may be absent
+
+print("🔌 Connecting to Eweka NNTP...")
+with nntplib.NNTP_SSL(
+    "news.eweka.nl",
+    563,
+    EWEKA_USER,
+    EWEKA_PASS,
+    readermode=True,
+) as nntp:
+
+    nntp.group(GROUP)
+
+    existing = []
+    missing = []
+
+    for idx, art in enumerate(article_numbers, start=1):
+        try:
+            nntp.stat(art)
+            existing.append(art)
+            print(f"✅ [{idx}/{total}] EXISTS  article {art}")
+        except nntplib.NNTPTemporaryError:  # 4xx reply = no such article; other errors propagate
+            missing.append(art)
+            print(f"❌ [{idx}/{total}] MISSING article {art}")
+
+print("\n================ RESULT ================")
+print(f"Total parts : {total}")
+print(f"Existing    : {len(existing)}")
+print(f"Missing     : {len(missing)}")
+
+if existing:
+    print("\nExisting article_numbers:")
+    print(existing)
+
+if missing:
+    print("\nMissing article_numbers (first 20):")
+    print(missing[:20])
diff --git a/23 ulozeni a slepeni.py b/23 ulozeni a slepeni.py
new file mode 100644
index 0000000..382e63a
--- /dev/null
+++ b/23 ulozeni a slepeni.py	
@@ -0,0 +1,202 @@
+import os
+import re
+import nntplib
+from dotenv import load_dotenv
+from db import get_conn
+def yenc_decode_lines(lines: list[bytes], debug=False, *, dot_stuffed=False) -> bytes:
+    """
+    Decode the yEnc payload found in NNTP BODY lines.
+
+    Only lines between ``=ybegin`` and ``=yend`` are decoded; control lines
+    and anything outside that region are skipped. Returns the decoded bytes
+    (empty when no ``=ybegin`` was seen, which also prints a warning).
+
+    ``dot_stuffed`` must stay False for ``nntplib`` results: ``body()`` has
+    already undone NNTP dot-stuffing, and stripping a leading "." again
+    would delete real payload bytes. Set it only for raw wire lines.
+    """
+    out = bytearray()
+    saw_ybegin = False
+    in_data = False
+    data_lines = 0
+
+    for idx, line in enumerate(lines):
+        if dot_stuffed and line.startswith(b".."):
+            if debug:
+                print(f"    [dot] line {idx}: '..' -> '.'")
+            line = line[1:]
+
+        # --- yEnc control lines ---
+        if line.startswith(b"=ybegin"):
+            saw_ybegin = in_data = True
+            if debug:
+                print(f"    [yEnc] =ybegin detected")
+            continue
+        if line.startswith(b"=ypart"):
+            if debug:
+                print(f"    [yEnc] =ypart detected")
+            continue
+        if line.startswith(b"=yend"):
+            in_data = False
+            if debug:
+                print(f"    [yEnc] =yend detected")
+            continue
+        if not in_data:
+            continue  # preamble/trailer text is not yEnc data
+
+        # --- actual yEnc data ---
+        data_lines += 1
+        i = 0
+        length = len(line)
+        while i < length:
+            c = line[i]
+            if c == ord('='):
+                # escape: the next byte carries an extra offset of 64
+                i += 1
+                if i >= length:
+                    break  # dangling escape at end of line: nothing to decode
+                c = (line[i] - 64) & 0xFF
+            out.append((c - 42) & 0xFF)
+            i += 1
+    if debug:
+        print(f"    [yEnc] saw_ybegin={saw_ybegin}, decoded_data_lines={data_lines}")
+        print(f"    [yEnc] decoded_bytes={len(out)}")
+
+    if not saw_ybegin:
+        print("⚠️  WARNING: yEnc decoder used but =ybegin was NOT seen")
+
+    return bytes(out)
+
+
+# def yenc_decode_lines(lines: list[bytes]) -> bytes:
+#     """
+#     Decode yEnc from NNTP BODY lines.
+#     Handles NNTP dot-stuffing correctly.
+#     """
+#     out = bytearray()
+#
+#     for line in lines:
+#         # --- undo NNTP dot-stuffing ---
+#         if line.startswith(b".."):
+#             line = line[1:]
+#         elif line.startswith(b"."):
+#             line = line[1:]
+#
+#         # --- skip yEnc control lines ---
+#         if line.startswith(b"=ybegin"):
+#             continue
+#         if line.startswith(b"=ypart"):
+#             continue
+#         if line.startswith(b"=yend"):
+#             continue
+#
+#         i = 0
+#         length = len(line)
+#
+#         while i < length:
+#             c = line[i]
+#
+#             if c == ord('='):   # yEnc escape
+#                 i += 1
+#                 if i >= length:
+#                     break
+#                 c = (line[i] - 64) & 0xFF
+#
+#             out.append((c - 42) & 0xFF)
+#             i += 1
+#
+#     return bytes(out)
+
+
+
+
+# ================== CONFIG ==================
+GROUP = "alt.binaries.e-book.magazines"
+SUBJECT_KEY = "PC Pro 2011-07.pdf"  # substring searched for in the stored subject
+OUT_DIR = r"downloads/PC_Pro_2011-07"  # one decoded file per part is written here
+FINAL_PDF = r"downloads/PC_Pro_2011-07.pdf"  # concatenation of all part files
+# ============================================
+
+load_dotenv()
+EWEKA_USER = os.getenv("EWEKA_USER")
+EWEKA_PASS = os.getenv("EWEKA_PASS")
+
+os.makedirs(OUT_DIR, exist_ok=True)
+
+print("🔌 Connecting to PostgreSQL...")
+conn = get_conn()
+cur = conn.cursor()
+
+# --- load article numbers + subject ---
+cur.execute("""
+    SELECT article_number, metadata->>'subject'
+    FROM articles
+    WHERE newsgroup = %s
+      AND metadata->>'subject' LIKE %s
+    ORDER BY article_number
+""", (GROUP, f"%{SUBJECT_KEY}%"))
+
+rows = cur.fetchall()
+print(f"📦 Found {len(rows)} parts")
+
+# --- parse part number from subject ---
+part_re = re.compile(r"\((\d+)\s*/\s*\d+\)")  # e.g. "(12/345)" -> captures 12
+
+parts = []
+for art_num, subject in rows:
+    m = part_re.search(subject or "")
+    if not m:
+        raise RuntimeError(f"Cannot parse part number from subject: {subject}")
+    part_no = int(m.group(1))
+    parts.append((part_no, art_num))
+
+# sort by part number (1..N)
+parts.sort(key=lambda x: x[0])
+
+print("🔌 Connecting to Eweka NNTP...")
+with nntplib.NNTP_SSL(
+    "news.eweka.nl",
+    563,
+    EWEKA_USER,
+    EWEKA_PASS,
+    readermode=True
+) as nntp:
+
+    nntp.group(GROUP)
+
+    for idx, (part_no, art_num) in enumerate(parts, start=1):
+        out_path = os.path.join(OUT_DIR, f"part_{part_no:03d}.bin")
+
+        if os.path.exists(out_path):  # resume: any part file already on disk is not re-downloaded
+            print(f"⏭️  [{idx}/{len(parts)}] part {part_no} already exists, skipping")
+            continue
+
+        print(f"⬇️  [{idx}/{len(parts)}] Downloading part {part_no} (article {art_num})")
+
+        resp, info = nntp.body(art_num)
+
+        print(f"    BODY lines received: {len(info.lines)}")
+
+        # quick check of the first lines
+        for ln in info.lines[:3]:
+            print(f"    RAW:", ln[:80])
+
+        decoded = yenc_decode_lines(info.lines, debug=True)
+
+        print(f"    RESULT bytes: {len(decoded)}")
+
+        with open(out_path, "wb") as f:
+            f.write(decoded)
+
+
+print("🧩 Assembling final PDF...")
+
+with open(FINAL_PDF, "wb") as out:
+    for part_no, _ in parts:  # parts is sorted, so files are appended in part order
+        part_path = os.path.join(OUT_DIR, f"part_{part_no:03d}.bin")
+        with open(part_path, "rb") as pf:
+            out.write(pf.read())
+
+print("🎉 DONE")
+print(f"📄 Final PDF: {FINAL_PDF}")
diff --git a/db.py b/db.py
new file mode 100644
index 0000000..b107d59
--- /dev/null
+++ b/db.py
@@ -0,0 +1,14 @@
+from dotenv import load_dotenv
+import os, psycopg
+
+load_dotenv()  # make the PG_* settings from .env visible in os.environ
+
+def get_conn():
+    """Open and return a new psycopg connection configured from PG_* env vars."""
+    env = os.environ.get
+    return psycopg.connect(
+        host=env("PG_HOST"), port=int(env("PG_PORT", 5432)),
+        dbname=env("PG_DB"), user=env("PG_USER"),
+        password=env("PG_PASS"),
+        connect_timeout=5,
+    )