"""Snapshot the provider's LIST ACTIVE output into the `newsgroups` table."""
from datetime import datetime, UTC
import nntplib
from db import get_conn
from dotenv import load_dotenv
import os

load_dotenv()  # load credentials from .env
EWEKA_USER = os.getenv("EWEKA_USER")
EWEKA_PASS = os.getenv("EWEKA_PASS")

PROVIDER = "eweka"

# Upsert keyed on the group name. On conflict only the article range, the
# posting flag and the fetch timestamp are refreshed; `provider` keeps the
# value written by the first insert.
UPSERT_GROUP_SQL = """
    INSERT INTO newsgroups
        (name, first_article, last_article, posting_flag, provider, fetched_at)
    VALUES (%s, %s, %s, %s, %s, %s)
    ON CONFLICT (name) DO UPDATE SET
        first_article = EXCLUDED.first_article,
        last_article = EXCLUDED.last_article,
        posting_flag = EXCLUDED.posting_flag,
        fetched_at = EXCLUDED.fetched_at
"""


def main() -> None:
    """Fetch the group list from Eweka and upsert one row per newsgroup."""
    print("πŸ”Œ Connecting to PostgreSQL...")
    # The context managers close the cursor and the connection even when the
    # NNTP fetch or the insert fails.
    with get_conn() as conn:
        conn.autocommit = True
        with conn.cursor() as cur:
            print("πŸ”Œ Connecting to Eweka NNTP...")
            with nntplib.NNTP_SSL(
                host="news.eweka.nl",
                port=563,
                user=EWEKA_USER,
                password=EWEKA_PASS,
                readermode=True,
            ) as nntp:
                print("πŸ“œ Fetching LIST ACTIVE...")
                resp, groups = nntp.list()

            print(f"πŸ“¦ Received {len(groups)} groups")

            # One timestamp for the whole snapshot, so every row of this run
            # carries the same fetched_at value.
            fetched_at = datetime.now(UTC)

            # nntplib yields (group, last, first, flag); the table wants
            # first before last.
            rows = [
                (name, int(first), int(last), flag, PROVIDER, fetched_at)
                for name, last, first, flag in groups
            ]

            cur.executemany(UPSERT_GROUP_SQL, rows)

    print("πŸŽ‰ DONE")


if __name__ == "__main__":
    main()
"""Ingest XOVER metadata for the oldest TOTAL_ARTICLES articles of GROUP."""
import nntplib
import os
from dotenv import load_dotenv
from datetime import datetime, UTC
from db import get_conn
from psycopg.types.json import Json


def sanitize(value):
    """
    Return *value* with lone surrogate code points made UTF-8 encodable.

    Strings are round-tripped through UTF-8: surrogates are encoded with
    ``surrogatepass`` and the resulting invalid byte sequences are decoded
    with ``replace``, so they come back as U+FFFD. Non-string values are
    returned unchanged.
    """
    if isinstance(value, str):
        return value.encode("utf-8", errors="surrogatepass") \
                    .decode("utf-8", errors="replace")
    return value


# ================= CONFIG =================
GROUP = "alt.binaries.e-book.magazines"
TOTAL_ARTICLES = 50_000_000
BATCH_SIZE = 10_000
# =========================================

# Articles already stored (same message_id) are left untouched.
INSERT_ARTICLE_SQL = """
    INSERT INTO articles
        (newsgroup, article_number, message_id, metadata, fetched_at)
    VALUES (%s, %s, %s, %s, %s)
    ON CONFLICT (message_id) DO NOTHING
"""

load_dotenv()

EWEKA_USER = os.getenv("EWEKA_USER")
EWEKA_PASS = os.getenv("EWEKA_PASS")


def main() -> None:
    """Walk the group from its first article in BATCH_SIZE steps and store overviews."""
    print("πŸ”Œ Connecting to PostgreSQL...")
    # Context managers close the cursor and the connection on any exit path.
    with get_conn() as conn:
        conn.autocommit = True
        with conn.cursor() as cur:
            print("πŸ”Œ Connecting to Eweka NNTP...")
            with nntplib.NNTP_SSL(
                host="news.eweka.nl",
                port=563,
                user=EWEKA_USER,
                password=EWEKA_PASS,
                readermode=True,
            ) as nntp:

                # --- GROUP ---
                resp, count, first, last, name = nntp.group(GROUP)
                first = int(first)
                last = int(last)

                # Start at the oldest article and cap the span at TOTAL_ARTICLES.
                start_global = first
                end_global = min(first + TOTAL_ARTICLES - 1, last)

                print(f"🎯 Target range: {start_global} – {end_global}")

                print(f"πŸ“‚ Group: {name}")
                print(f"πŸ“ Range: {first} – {last}")

                failed_batches = 0

                for batch_start in range(start_global, end_global + 1, BATCH_SIZE):
                    batch_end = min(batch_start + BATCH_SIZE - 1, end_global)

                    print(f"πŸ“œ XOVER {batch_start}-{batch_end}")

                    try:
                        resp, overviews = nntp.xover(batch_start, batch_end)
                    except nntplib.NNTPError as e:
                        # Server-side refusals for one range are skipped
                        # (best effort). Socket-level failures are not caught:
                        # once the connection is gone every later batch would
                        # fail as well, so the run should stop.
                        print(f"⚠️ XOVER failed for {batch_start}-{batch_end}: {e}")
                        failed_batches += 1
                        continue

                    # One timestamp per batch instead of one clock read per row.
                    fetched_at = datetime.now(UTC)

                    rows = []
                    for art_num, fields in overviews:
                        clean_fields = {k: sanitize(v) for k, v in fields.items()}

                        metadata = {
                            "group": GROUP,
                            "article_number": art_num,
                            **clean_fields,
                        }

                        rows.append((
                            GROUP,
                            art_num,
                            # Use the sanitized value: a message-id holding a
                            # lone surrogate cannot be encoded as UTF-8 and
                            # would fail the whole batch.
                            clean_fields.get("message-id"),
                            Json(metadata),  # wrapper so psycopg sends the dict as JSON
                            fetched_at,
                        ))

                    if rows:
                        cur.executemany(INSERT_ARTICLE_SQL, rows)

    # Report what was actually processed; this script reads the oldest
    # articles of the group, not the last 100k.
    print(
        f"πŸŽ‰ DONE – processed article range {start_global} – {end_global} "
        f"({failed_batches} failed batches)"
    )


if __name__ == "__main__":
    main()
def yenc_decode_lines(
    lines: list[bytes],
    debug: bool = False,
    *,
    dot_stuffed: bool = False,
) -> bytes:
    """
    Decode the yEnc payload found in the lines of one NNTP article body.

    Lines starting with ``=ybegin``, ``=ypart`` or ``=yend`` are treated as
    yEnc control lines and skipped; every other line is decoded as data
    (``=`` escapes the following byte, then 42 is subtracted modulo 256).

    Args:
        lines: Body lines without line terminators, for example the
            ``lines`` attribute returned by ``nntplib.NNTP.body()``.
        debug: When true, print a trace of control lines and byte counts.
        dot_stuffed: Set to true only when ``lines`` come straight off the
            wire and still carry NNTP dot-stuffing; one leading ``.`` is then
            removed from each line that has one. The default is false because
            nntplib already removes dot-stuffing while reading multi-line
            responses, and stripping a second time would delete payload
            bytes from data lines that genuinely start with ``.``.

    Returns:
        The decoded bytes of all data lines, concatenated in order.
    """
    control_keywords = (b"=ybegin", b"=ypart", b"=yend")
    out = bytearray()
    saw_ybegin = False
    data_lines = 0

    for idx, line in enumerate(lines):
        # --- NNTP dot-stuffing (raw wire lines only) ---
        if dot_stuffed and line.startswith(b"."):
            if debug:
                print(f" [dot] line {idx}: '.' removed")
            line = line[1:]

        # --- yEnc control lines carry no payload ---
        keyword = next((k for k in control_keywords if line.startswith(k)), None)
        if keyword is not None:
            if keyword == b"=ybegin":
                saw_ybegin = True
            if debug:
                print(f" [yEnc] {keyword.decode('ascii')} detected")
            continue

        # --- actual yEnc data ---
        data_lines += 1
        encoded = iter(line)
        for c in encoded:
            if c == 0x3D:  # '=' marks an escaped byte
                c = next(encoded, None)
                if c is None:
                    # Dangling escape at the end of the line: nothing to decode.
                    break
                c = (c - 64) & 0xFF
            out.append((c - 42) & 0xFF)

    if debug:
        print(f" [yEnc] saw_ybegin={saw_ybegin}, decoded_data_lines={data_lines}")
        print(f" [yEnc] decoded_bytes={len(out)}")

    if not saw_ybegin:
        # The input is decoded anyway; the caller is only warned.
        print("⚠️ WARNING: yEnc decoder used but =ybegin was NOT seen")

    return bytes(out)
print("πŸ”Œ Connecting to PostgreSQL...")

# --- load article numbers + subject ---
# The connection is only needed for this one query, so it is closed right
# after the rows are fetched.
with get_conn() as conn, conn.cursor() as cur:
    cur.execute("""
        SELECT article_number, metadata->>'subject'
        FROM articles
        WHERE newsgroup = %s
          AND metadata->>'subject' LIKE %s
        ORDER BY article_number
    """, (GROUP, f"%{SUBJECT_KEY}%"))
    rows = cur.fetchall()

print(f"πŸ“¦ Found {len(rows)} parts")

if not rows:
    # Without parts the assembly step would only create an empty file.
    print("❌ No articles found, aborting.")
    raise SystemExit(1)

# --- parse part number from subject ---
# Matches the "(part/total)" counter and captures the part number.
part_re = re.compile(r"\((\d+)\s*/\s*\d+\)")

# part number -> article number. Rows arrive ordered by article number, so
# for a part that matches more than once the lowest article number wins and
# the part is written only once into the final file.
article_by_part = {}
for art_num, subject in rows:
    m = part_re.search(subject or "")
    if not m:
        raise RuntimeError(f"Cannot parse part number from subject: {subject}")
    article_by_part.setdefault(int(m.group(1)), art_num)

# sort by part number (1..N)
parts = sorted(article_by_part.items())

print("πŸ”Œ Connecting to Eweka NNTP...")
with nntplib.NNTP_SSL(
    "news.eweka.nl",
    563,
    EWEKA_USER,
    EWEKA_PASS,
    readermode=True
) as nntp:

    nntp.group(GROUP)

    for idx, (part_no, art_num) in enumerate(parts, start=1):
        out_path = os.path.join(OUT_DIR, f"part_{part_no:03d}.bin")

        if os.path.exists(out_path):
            print(f"⏭️ [{idx}/{len(parts)}] part {part_no} already exists, skipping")
            continue

        print(f"⬇️ [{idx}/{len(parts)}] Downloading part {part_no} (article {art_num})")

        resp, info = nntp.body(art_num)

        print(f" BODY lines received: {len(info.lines)}")

        # quick look at the first lines
        for ln in info.lines[:3]:
            print(" RAW:", ln[:80])

        decoded = yenc_decode_lines(info.lines, debug=True)

        print(f" RESULT bytes: {len(decoded)}")

        # Write to a temporary name and rename afterwards, so an interrupted
        # run never leaves a truncated part that later runs would skip.
        tmp_path = out_path + ".tmp"
        with open(tmp_path, "wb") as f:
            f.write(decoded)
        os.replace(tmp_path, out_path)


print("🧩 Assembling final PDF...")

with open(FINAL_PDF, "wb") as out:
    for part_no, _ in parts:
        part_path = os.path.join(OUT_DIR, f"part_{part_no:03d}.bin")
        with open(part_path, "rb") as pf:
            out.write(pf.read())

print("πŸŽ‰ DONE")
print(f"πŸ“„ Final PDF: {FINAL_PDF}")
from dotenv import load_dotenv
import os
import psycopg

load_dotenv()


def get_conn():
    """
    Open a new PostgreSQL connection configured from the environment.

    Reads PG_HOST, PG_PORT (5432 when unset), PG_DB, PG_USER and PG_PASS and
    passes them to ``psycopg.connect`` together with a 5 second connect
    timeout.
    """
    env = os.getenv
    settings = {
        "host": env("PG_HOST"),
        "port": int(env("PG_PORT", 5432)),
        "dbname": env("PG_DB"),
        "user": env("PG_USER"),
        "password": env("PG_PASS"),
        "connect_timeout": 5,
    }
    return psycopg.connect(**settings)