z230
.env (Normal file, 10 lines)
@@ -0,0 +1,10 @@
# ===== EWEKA =====
EWEKA_USER=d6ef27c2d6496b22
EWEKA_PASS=Vlado7309208104

# ===== POSTGRES =====
PG_HOST=192.168.1.76
PG_PORT=5432
PG_DB=newsgroups
PG_USER=vladimir.buzalka
PG_PASS=Vlado7309208104++
10 list of newsgroups.py (Normal file, 58 lines)
@@ -0,0 +1,58 @@
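# Fetches the full LIST ACTIVE from the Eweka NNTP server and upserts every
# newsgroup (name, article range, posting flag) into the `newsgroups` table.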
from datetime import datetime, UTC
import nntplib
from db import get_conn
from dotenv import load_dotenv
import os

load_dotenv()  # load .env
EWEKA_USER = os.getenv("EWEKA_USER")
EWEKA_PASS = os.getenv("EWEKA_PASS")

PROVIDER = "eweka"

print("🔌 Connecting to PostgreSQL...")
conn = get_conn()

conn.autocommit = True
cur = conn.cursor()

print("🔌 Connecting to Eweka NNTP...")
with nntplib.NNTP_SSL(
    host="news.eweka.nl",
    port=563,
    user=EWEKA_USER,
    password=EWEKA_PASS,
    readermode=True,
) as nntp:

    print("📜 Fetching LIST ACTIVE...")
    resp, groups = nntp.list()
    print(f"📦 Received {len(groups)} groups")

    rows = [
        (
            name,
            int(first),
            int(last),
            flag,
            PROVIDER,
            datetime.now(UTC),
        )
        for name, last, first, flag in groups
    ]

    cur.executemany(
        """
        INSERT INTO newsgroups
            (name, first_article, last_article, posting_flag, provider, fetched_at)
        VALUES (%s, %s, %s, %s, %s, %s)
        ON CONFLICT (name) DO UPDATE SET
            first_article = EXCLUDED.first_article,
            last_article = EXCLUDED.last_article,
            posting_flag = EXCLUDED.posting_flag,
            fetched_at = EXCLUDED.fetched_at
        """,
        rows,
    )

print("🎉 DONE")
20 Alt binaries ebook magazines.py (Normal file, 93 lines)
@@ -0,0 +1,93 @@
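# Fetches XOVER overview data for the newest TOTAL_ARTICLES articles of the
# configured group in BATCH_SIZE chunks and upserts them into the `articles` table.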
import nntplib
import os
from dotenv import load_dotenv
from datetime import datetime, UTC
from db import get_conn
from psycopg.types.json import Json


def sanitize(value):
    if isinstance(value, str):
        return value.encode("utf-8", errors="surrogatepass") \
                    .decode("utf-8", errors="replace")
    return value


# ================= CONFIG =================
GROUP = "alt.binaries.e-book.magazines"
TOTAL_ARTICLES = 100_000
BATCH_SIZE = 1_000
# =========================================

load_dotenv()

EWEKA_USER = os.getenv("EWEKA_USER")
EWEKA_PASS = os.getenv("EWEKA_PASS")

print("🔌 Connecting to PostgreSQL...")
conn = get_conn()
conn.autocommit = True
cur = conn.cursor()

print("🔌 Connecting to Eweka NNTP...")
with nntplib.NNTP_SSL(
    host="news.eweka.nl",
    port=563,
    user=EWEKA_USER,
    password=EWEKA_PASS,
    readermode=True,
) as nntp:

    # --- GROUP ---
    resp, count, first, last, name = nntp.group(GROUP)
    first = int(first)
    last = int(last)

    print(f"📂 Group: {name}")
    print(f"📐 Range: {first} – {last}")

    start_global = max(first, last - TOTAL_ARTICLES + 1)
    print(f"🎯 Target range: {start_global} – {last}")

    for batch_start in range(start_global, last + 1, BATCH_SIZE):
        batch_end = min(batch_start + BATCH_SIZE - 1, last)

        print(f"📜 XOVER {batch_start}-{batch_end}")

        try:
            resp, overviews = nntp.xover(batch_start, batch_end)
        except Exception as e:
            print(f"⚠️ XOVER failed for {batch_start}-{batch_end}: {e}")
            continue

        rows = []
        for art_num, fields in overviews:
            clean_fields = {k: sanitize(v) for k, v in fields.items()}

            metadata = {
                "group": GROUP,
                "article_number": art_num,
                **clean_fields,
            }

            rows.append((
                GROUP,
                art_num,
                fields.get("message-id"),
                Json(metadata),  # 👈 HERE – wrap the dict for the JSONB column
                datetime.now(UTC),
            ))

        if rows:
            cur.executemany(
                """
                INSERT INTO articles
                    (newsgroup, article_number, message_id, metadata, fetched_at)
                VALUES (%s, %s, %s, %s, %s)
                ON CONFLICT (message_id) DO NOTHING
                """,
                rows,
            )

print("🎉 DONE – last 100k articles ingested")
21 poslednich 100k.py (Normal file, 94 lines)
@@ -0,0 +1,94 @@
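# Same ingest as the previous script, but walks forward from the oldest article:
# fetches XOVER data for up to TOTAL_ARTICLES articles starting at `first`,
# in BATCH_SIZE chunks, and upserts them into `articles`.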
import nntplib
import os
from dotenv import load_dotenv
from datetime import datetime, UTC
from db import get_conn
from psycopg.types.json import Json


def sanitize(value):
    if isinstance(value, str):
        return value.encode("utf-8", errors="surrogatepass") \
                    .decode("utf-8", errors="replace")
    return value


# ================= CONFIG =================
GROUP = "alt.binaries.e-book.magazines"
TOTAL_ARTICLES = 50_000_000
BATCH_SIZE = 10_000
# =========================================

load_dotenv()

EWEKA_USER = os.getenv("EWEKA_USER")
EWEKA_PASS = os.getenv("EWEKA_PASS")

print("🔌 Connecting to PostgreSQL...")
conn = get_conn()
conn.autocommit = True
cur = conn.cursor()

print("🔌 Connecting to Eweka NNTP...")
with nntplib.NNTP_SSL(
    host="news.eweka.nl",
    port=563,
    user=EWEKA_USER,
    password=EWEKA_PASS,
    readermode=True,
) as nntp:

    # --- GROUP ---
    resp, count, first, last, name = nntp.group(GROUP)
    first = int(first)
    last = int(last)

    start_global = first
    end_global = min(first + TOTAL_ARTICLES - 1, last)

    print(f"🎯 Target range: {start_global} – {end_global}")

    print(f"📂 Group: {name}")
    print(f"📐 Range: {first} – {last}")

    for batch_start in range(start_global, end_global + 1, BATCH_SIZE):
        batch_end = min(batch_start + BATCH_SIZE - 1, end_global)

        print(f"📜 XOVER {batch_start}-{batch_end}")

        try:
            resp, overviews = nntp.xover(batch_start, batch_end)
        except Exception as e:
            print(f"⚠️ XOVER failed for {batch_start}-{batch_end}: {e}")
            continue

        rows = []
        for art_num, fields in overviews:
            clean_fields = {k: sanitize(v) for k, v in fields.items()}

            metadata = {
                "group": GROUP,
                "article_number": art_num,
                **clean_fields,
            }

            rows.append((
                GROUP,
                art_num,
                fields.get("message-id"),
                Json(metadata),  # 👈 HERE – wrap the dict for the JSONB column
                datetime.now(UTC),
            ))

        if rows:
            cur.executemany(
                """
                INSERT INTO articles
                    (newsgroup, article_number, message_id, metadata, fetched_at)
                VALUES (%s, %s, %s, %s, %s)
                ON CONFLICT (message_id) DO NOTHING
                """,
                rows,
            )

print(f"🎉 DONE – articles {start_global}–{end_global} ingested")
22 stat test posledniho clanku.py (Normal file, 71 lines)
@@ -0,0 +1,71 @@
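# Looks up the article numbers of all parts of one post (matched by SUBJECT_KEY)
# in the local DB, then STATs each one on the server to see which parts still exist.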
import nntplib
import os
from dotenv import load_dotenv
from db import get_conn

# ================== CONFIG ==================
GROUP = "alt.binaries.e-book.magazines"
SUBJECT_KEY = "PC Pro 2011-07.pdf"
# ============================================

load_dotenv()

EWEKA_USER = os.getenv("EWEKA_USER")
EWEKA_PASS = os.getenv("EWEKA_PASS")

print("🔌 Connecting to PostgreSQL...")
conn = get_conn()
cur = conn.cursor()

cur.execute("""
    SELECT article_number
    FROM articles
    WHERE newsgroup = %s
      AND metadata->>'subject' LIKE %s
    ORDER BY article_number
""", (GROUP, f"%{SUBJECT_KEY}%"))

article_numbers = [row[0] for row in cur.fetchall()]
total = len(article_numbers)

print(f"📦 Found {total} parts in DB")

if total == 0:
    print("❌ No articles found, aborting.")
    exit(1)

print("🔌 Connecting to Eweka NNTP...")
with nntplib.NNTP_SSL(
    "news.eweka.nl",
    563,
    EWEKA_USER,
    EWEKA_PASS,
    readermode=True,
) as nntp:

    nntp.group(GROUP)

    existing = []
    missing = []

    for idx, art in enumerate(article_numbers, start=1):
        try:
            nntp.stat(art)
            existing.append(art)
            print(f"✅ [{idx}/{total}] EXISTS article {art}")
        except Exception:
            missing.append(art)
            print(f"❌ [{idx}/{total}] MISSING article {art}")

print("\n================ RESULT ================")
print(f"Total parts : {total}")
print(f"Existing    : {len(existing)}")
print(f"Missing     : {len(missing)}")

if existing:
    print("\nExisting article_numbers:")
    print(existing)

if missing:
    print("\nMissing article_numbers (first 20):")
    print(missing[:20])
23 ulozeni a slepeni.py (Normal file, 202 lines)
@@ -0,0 +1,202 @@
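# Downloads every part of one post (matched by SUBJECT_KEY) via NNTP BODY,
# yEnc-decodes each part into OUT_DIR, then concatenates the parts in order
# into the final PDF.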
import os
import re
import nntplib
from dotenv import load_dotenv
from db import get_conn


def yenc_decode_lines(lines: list[bytes], debug=False) -> bytes:
    """
    Decode yEnc from NNTP BODY lines.
    Handles NNTP dot-stuffing and logs what happens.
    """
    out = bytearray()
    saw_ybegin = False
    data_lines = 0

    for idx, orig_line in enumerate(lines):
        line = orig_line

        # --- NNTP dot-stuffing ---
        if line.startswith(b".."):
            if debug:
                print(f"   [dot] line {idx}: '..' -> '.'")
            line = line[1:]
        elif line.startswith(b"."):
            if debug:
                print(f"   [dot] line {idx}: '.' removed")
            line = line[1:]

        # --- yEnc control lines ---
        if line.startswith(b"=ybegin"):
            saw_ybegin = True
            if debug:
                print("   [yEnc] =ybegin detected")
            continue

        if line.startswith(b"=ypart"):
            if debug:
                print("   [yEnc] =ypart detected")
            continue

        if line.startswith(b"=yend"):
            if debug:
                print("   [yEnc] =yend detected")
            continue

        # --- actual yEnc data ---
        data_lines += 1
        i = 0
        length = len(line)

        while i < length:
            c = line[i]

            if c == ord('='):
                i += 1
                if i >= length:
                    break
                c = (line[i] - 64) & 0xFF

            out.append((c - 42) & 0xFF)
            i += 1

    if debug:
        print(f"   [yEnc] saw_ybegin={saw_ybegin}, decoded_data_lines={data_lines}")
        print(f"   [yEnc] decoded_bytes={len(out)}")

    if not saw_ybegin:
        print("⚠️ WARNING: yEnc decoder used but =ybegin was NOT seen")

    return bytes(out)
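
# Illustrative note on the escape arithmetic above (not part of the original
# commit): yEnc encodes each byte as (byte + 42) mod 256; if the result is a
# critical character (NUL, LF, CR, '='), it is written as '=' followed by
# (result + 64) mod 256. For example, 0xD6 encodes to '=' '@' (0x3D 0x40), and
# decoding reverses it: 0x40 - 64 = 0x00, then (0x00 - 42) mod 256 = 0xD6.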


# def yenc_decode_lines(lines: list[bytes]) -> bytes:
#     """
#     Decode yEnc from NNTP BODY lines.
#     Handles NNTP dot-stuffing correctly.
#     """
#     out = bytearray()
#
#     for line in lines:
#         # --- undo NNTP dot-stuffing ---
#         if line.startswith(b".."):
#             line = line[1:]
#         elif line.startswith(b"."):
#             line = line[1:]
#
#         # --- skip yEnc control lines ---
#         if line.startswith(b"=ybegin"):
#             continue
#         if line.startswith(b"=ypart"):
#             continue
#         if line.startswith(b"=yend"):
#             continue
#
#         i = 0
#         length = len(line)
#
#         while i < length:
#             c = line[i]
#
#             if c == ord('='):  # yEnc escape
#                 i += 1
#                 if i >= length:
#                     break
#                 c = (line[i] - 64) & 0xFF
#
#             out.append((c - 42) & 0xFF)
#             i += 1
#
#     return bytes(out)


# ================== CONFIG ==================
GROUP = "alt.binaries.e-book.magazines"
SUBJECT_KEY = "PC Pro 2011-07.pdf"
OUT_DIR = r"downloads/PC_Pro_2011-07"
FINAL_PDF = r"downloads/PC_Pro_2011-07.pdf"
# ============================================

load_dotenv()
EWEKA_USER = os.getenv("EWEKA_USER")
EWEKA_PASS = os.getenv("EWEKA_PASS")

os.makedirs(OUT_DIR, exist_ok=True)

print("🔌 Connecting to PostgreSQL...")
conn = get_conn()
cur = conn.cursor()

# --- load article numbers + subject ---
cur.execute("""
    SELECT article_number, metadata->>'subject'
    FROM articles
    WHERE newsgroup = %s
      AND metadata->>'subject' LIKE %s
    ORDER BY article_number
""", (GROUP, f"%{SUBJECT_KEY}%"))

rows = cur.fetchall()
print(f"📦 Found {len(rows)} parts")

# --- parse part number from subject ---
part_re = re.compile(r"\((\d+)\s*/\s*\d+\)")

parts = []
for art_num, subject in rows:
    m = part_re.search(subject or "")
    if not m:
        raise RuntimeError(f"Cannot parse part number from subject: {subject}")
    part_no = int(m.group(1))
    parts.append((part_no, art_num))

# sort by part number (1..N)
parts.sort(key=lambda x: x[0])

print("🔌 Connecting to Eweka NNTP...")
with nntplib.NNTP_SSL(
    "news.eweka.nl",
    563,
    EWEKA_USER,
    EWEKA_PASS,
    readermode=True
) as nntp:

    nntp.group(GROUP)

    for idx, (part_no, art_num) in enumerate(parts, start=1):
        out_path = os.path.join(OUT_DIR, f"part_{part_no:03d}.bin")

        if os.path.exists(out_path):
            print(f"⏭️ [{idx}/{len(parts)}] part {part_no} already exists, skipping")
            continue

        print(f"⬇️ [{idx}/{len(parts)}] Downloading part {part_no} (article {art_num})")

        resp, info = nntp.body(art_num)

        print(f"   BODY lines received: {len(info.lines)}")

        # quick check of the first few raw lines
        for ln in info.lines[:3]:
            print("   RAW:", ln[:80])

        decoded = yenc_decode_lines(info.lines, debug=True)

        print(f"   RESULT bytes: {len(decoded)}")

        with open(out_path, "wb") as f:
            f.write(decoded)


print("🧩 Assembling final PDF...")

with open(FINAL_PDF, "wb") as out:
    for part_no, _ in parts:
        part_path = os.path.join(OUT_DIR, f"part_{part_no:03d}.bin")
        with open(part_path, "rb") as pf:
            out.write(pf.read())

print("🎉 DONE")
print(f"📄 Final PDF: {FINAL_PDF}")
db.py (Normal file, 14 lines)
@@ -0,0 +1,14 @@
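# Small helper shared by all scripts: builds a psycopg connection from the
# PG_* settings in .env.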
from dotenv import load_dotenv
import os, psycopg

load_dotenv()

def get_conn():
    return psycopg.connect(
        host=os.getenv("PG_HOST"),
        port=int(os.getenv("PG_PORT", 5432)),
        dbname=os.getenv("PG_DB"),
        user=os.getenv("PG_USER"),
        password=os.getenv("PG_PASS"),
        connect_timeout=5,
    )