diff --git a/.env b/.env
new file mode 100644
index 0000000..4b10d20
--- /dev/null
+++ b/.env
@@ -0,0 +1,10 @@
+# ===== EWEKA =====
+EWEKA_USER=d6ef27c2d6496b22
+EWEKA_PASS=Vlado7309208104
+
+# ===== POSTGRES =====
+PG_HOST=192.168.1.76
+PG_PORT=5432
+PG_DB=newsgroups
+PG_USER=vladimir.buzalka
+PG_PASS=Vlado7309208104++
diff --git a/10 list of newsgroups.py b/10 list of newsgroups.py
new file mode 100644
index 0000000..83c7db0
--- /dev/null
+++ b/10 list of newsgroups.py
@@ -0,0 +1,58 @@
+from datetime import datetime, UTC
+import nntplib
+from db import get_conn
+from dotenv import load_dotenv
+import os
+
+load_dotenv()  # load .env
+EWEKA_USER = os.getenv("EWEKA_USER")
+EWEKA_PASS = os.getenv("EWEKA_PASS")
+
+PROVIDER = "eweka"
+
+print("πŸ”Œ Connecting to PostgreSQL...")
+conn = get_conn()
+
+conn.autocommit = True
+cur = conn.cursor()
+
+print("πŸ”Œ Connecting to Eweka NNTP...")
+with nntplib.NNTP_SSL(
+    host="news.eweka.nl",
+    port=563,
+    user=EWEKA_USER,
+    password=EWEKA_PASS,
+    readermode=True,
+) as nntp:
+
+    print("πŸ“œ Fetching LIST ACTIVE...")
+    resp, groups = nntp.list()
+    print(f"πŸ“¦ Received {len(groups)} groups")
+
+    # nntplib LIST ACTIVE rows unpack as (group, last, first, flag)
+    rows = [
+        (
+            name,
+            int(first),
+            int(last),
+            flag,
+            PROVIDER,
+            datetime.now(UTC),
+        )
+        for name, last, first, flag in groups
+    ]
+
+    cur.executemany(
+        """
+        INSERT INTO newsgroups
+            (name, first_article, last_article, posting_flag, provider, fetched_at)
+        VALUES (%s, %s, %s, %s, %s, %s)
+        ON CONFLICT (name) DO UPDATE SET
+            first_article = EXCLUDED.first_article,
+            last_article = EXCLUDED.last_article,
+            posting_flag = EXCLUDED.posting_flag,
+            fetched_at = EXCLUDED.fetched_at
+        """,
+        rows,
+    )
+
+print("πŸŽ‰ DONE")
diff --git a/20 Alt binaries ebook magazines.py b/20 Alt binaries ebook magazines.py
new file mode 100644
index 0000000..89faf8d
--- /dev/null
+++ b/20 Alt binaries ebook magazines.py
@@ -0,0 +1,93 @@
+import nntplib
+import os
+from dotenv import load_dotenv
+from datetime import datetime, UTC
+from db import get_conn
+from psycopg.types.json import Json
+
+
+def sanitize(value):
+    # Strip lone surrogates from overview headers so the value is valid UTF-8.
+    if isinstance(value, str):
+        return value.encode("utf-8", errors="surrogatepass") \
+            .decode("utf-8", errors="replace")
+    return value
+
+
+# ================= CONFIG =================
+GROUP = "alt.binaries.e-book.magazines"
+TOTAL_ARTICLES = 100_000
+BATCH_SIZE = 1_000
+# =========================================
+
+load_dotenv()
+
+EWEKA_USER = os.getenv("EWEKA_USER")
+EWEKA_PASS = os.getenv("EWEKA_PASS")
+
+print("πŸ”Œ Connecting to PostgreSQL...")
+conn = get_conn()
+conn.autocommit = True
+cur = conn.cursor()
+
+print("πŸ”Œ Connecting to Eweka NNTP...")
+with nntplib.NNTP_SSL(
+    host="news.eweka.nl",
+    port=563,
+    user=EWEKA_USER,
+    password=EWEKA_PASS,
+    readermode=True,
+) as nntp:
+
+    # --- GROUP ---
+    resp, count, first, last, name = nntp.group(GROUP)
+    first = int(first)
+    last = int(last)
+
+    print(f"πŸ“‚ Group: {name}")
+    print(f"πŸ“ Range: {first} – {last}")
+
+    # Last TOTAL_ARTICLES articles of the group (clamped to the group start).
+    start_global = max(first, last - TOTAL_ARTICLES + 1)
+    print(f"🎯 Target range: {start_global} – {last}")
+
+    for batch_start in range(start_global, last + 1, BATCH_SIZE):
+        batch_end = min(batch_start + BATCH_SIZE - 1, last)
+
+        print(f"πŸ“œ XOVER {batch_start}-{batch_end}")
+
+        try:
+            resp, overviews = nntp.xover(batch_start, batch_end)
+        except Exception as e:
+            print(f"⚠️ XOVER failed for {batch_start}-{batch_end}: {e}")
+            continue
+
+        rows = []
+        for art_num, fields in overviews:
+            clean_fields = {k: sanitize(v) for k, v in fields.items()}
+
+            metadata = {
+                "group": GROUP,
+                "article_number": art_num,
+                **clean_fields,
+            }
+
+            rows.append((
+                GROUP,
+                art_num,
+                fields.get("message-id"),
+                Json(metadata),  # wrap dict for JSONB insertion
+                datetime.now(UTC),
+            ))
+
+        if rows:
+            cur.executemany(
+                """
+                INSERT INTO articles
+                    (newsgroup, article_number, message_id, metadata, fetched_at)
+                VALUES (%s, %s, %s, %s, %s)
+                ON CONFLICT (message_id) DO NOTHING
+                """,
+                rows,
+            )
+
+print("πŸŽ‰ DONE – last 100k articles ingested")
diff --git a/21 poslednich 100k.py b/21 poslednich 100k.py
new file mode 100644
index 0000000..7383083
--- /dev/null
+++ b/21 poslednich 100k.py
@@ -0,0 +1,94 @@
+import nntplib
+import os
+from dotenv import load_dotenv
+from datetime import datetime, UTC
+from db import get_conn
+from psycopg.types.json import Json
+
+
+def sanitize(value):
+    # Strip lone surrogates from overview headers so the value is valid UTF-8.
+    if isinstance(value, str):
+        return value.encode("utf-8", errors="surrogatepass") \
+            .decode("utf-8", errors="replace")
+    return value
+
+
+# ================= CONFIG =================
+GROUP = "alt.binaries.e-book.magazines"
+TOTAL_ARTICLES = 50_000_000
+BATCH_SIZE = 10_000
+# =========================================
+
+load_dotenv()
+
+EWEKA_USER = os.getenv("EWEKA_USER")
+EWEKA_PASS = os.getenv("EWEKA_PASS")
+
+print("πŸ”Œ Connecting to PostgreSQL...")
+conn = get_conn()
+conn.autocommit = True
+cur = conn.cursor()
+
+print("πŸ”Œ Connecting to Eweka NNTP...")
+with nntplib.NNTP_SSL(
+    host="news.eweka.nl",
+    port=563,
+    user=EWEKA_USER,
+    password=EWEKA_PASS,
+    readermode=True,
+) as nntp:
+
+    # --- GROUP ---
+    resp, count, first, last, name = nntp.group(GROUP)
+    first = int(first)
+    last = int(last)
+
+    # First TOTAL_ARTICLES articles from the group start (clamped to the end).
+    start_global = first
+    end_global = min(first + TOTAL_ARTICLES - 1, last)
+
+    print(f"🎯 Target range: {start_global} – {end_global}")
+
+    print(f"πŸ“‚ Group: {name}")
+    print(f"πŸ“ Range: {first} – {last}")
+
+    for batch_start in range(start_global, end_global + 1, BATCH_SIZE):
+        batch_end = min(batch_start + BATCH_SIZE - 1, end_global)
+
+        print(f"πŸ“œ XOVER {batch_start}-{batch_end}")
+
+        try:
+            resp, overviews = nntp.xover(batch_start, batch_end)
+        except Exception as e:
+            print(f"⚠️ XOVER failed for {batch_start}-{batch_end}: {e}")
+            continue
+
+        rows = []
+        for art_num, fields in overviews:
+            clean_fields = {k: sanitize(v) for k, v in fields.items()}
+
+            metadata = {
+                "group": GROUP,
+                "article_number": art_num,
+                **clean_fields,
+            }
+
+            rows.append((
+                GROUP,
+                art_num,
+                fields.get("message-id"),
+                Json(metadata),  # wrap dict for JSONB insertion
+                datetime.now(UTC),
+            ))
+
+        if rows:
+            cur.executemany(
+                """
+                INSERT INTO articles
+                    (newsgroup, article_number, message_id, metadata, fetched_at)
+                VALUES (%s, %s, %s, %s, %s)
+                ON CONFLICT (message_id) DO NOTHING
+                """,
+                rows,
+            )
+
+print("πŸŽ‰ DONE – target range ingested")
diff --git a/22 stat test posledniho clanku.py b/22 stat test posledniho clanku.py
new file mode 100644
index 0000000..89eabe2
--- /dev/null
+++ b/22 stat test posledniho clanku.py
@@ -0,0 +1,71 @@
+import nntplib
+import os
+from dotenv import load_dotenv
+from db import get_conn
+
+# ================== CONFIG ==================
+GROUP = "alt.binaries.e-book.magazines"
+SUBJECT_KEY = "PC Pro 2011-07.pdf"
+# ============================================
+
+load_dotenv()
+
+EWEKA_USER = os.getenv("EWEKA_USER")
+EWEKA_PASS = os.getenv("EWEKA_PASS")
+
+print("πŸ”Œ Connecting to PostgreSQL...")
+conn = get_conn()
+cur = conn.cursor()
+
+cur.execute("""
+    SELECT article_number
+    FROM articles
+    WHERE newsgroup = %s
+      AND metadata->>'subject' LIKE %s
+    ORDER BY article_number
+""", (GROUP, f"%{SUBJECT_KEY}%"))
+
+article_numbers = [row[0] for row in cur.fetchall()]
+total = len(article_numbers)
+
+print(f"πŸ“¦ Found {total} parts in DB")
+
+if total == 0:
+    print("❌ No articles found, aborting.")
+    exit(1)
+
+print("πŸ”Œ Connecting to Eweka NNTP...")
+with nntplib.NNTP_SSL(
+    "news.eweka.nl",
+    563,
+    EWEKA_USER,
+    EWEKA_PASS,
+    readermode=True,
+) as nntp:
+
+    nntp.group(GROUP)
+
+    existing = []
+    missing = []
+
+    # STAT each article number; a 423/430 reply raises and marks it missing.
+    for idx, art in enumerate(article_numbers, start=1):
+        try:
+            nntp.stat(art)
+            existing.append(art)
+            print(f"βœ… [{idx}/{total}] EXISTS article {art}")
+        except Exception:
+            missing.append(art)
+            print(f"❌ [{idx}/{total}] MISSING article {art}")
+
+print("\n================ RESULT ================")
+print(f"Total parts : {total}")
+print(f"Existing : {len(existing)}")
+print(f"Missing : {len(missing)}")
+
+if existing:
+    print("\nExisting article_numbers:")
+    print(existing)
+
+if missing:
+    print("\nMissing article_numbers (first 20):")
+    print(missing[:20])
diff --git a/23 ulozeni a slepeni.py b/23 ulozeni a slepeni.py
new file mode 100644
index 0000000..382e63a
--- /dev/null
+++ b/23 ulozeni a slepeni.py
@@ -0,0 +1,202 @@
+import os
+import re
+import nntplib
+from dotenv import load_dotenv
+from db import get_conn
+def yenc_decode_lines(lines: list[bytes], debug=False) -> bytes:
+    """
+    Decode yEnc from NNTP BODY lines.
+    Handles NNTP dot-stuffing and logs what happens.
+    """
+    out = bytearray()
+    saw_ybegin = False
+    data_lines = 0
+
+    for idx, orig_line in enumerate(lines):
+        line = orig_line
+
+        # --- NNTP dot-stuffing ---
+        if line.startswith(b".."):
+            if debug:
+                print(f"   [dot] line {idx}: '..' -> '.'")
+            line = line[1:]
+        elif line.startswith(b"."):
+            if debug:
+                print(f"   [dot] line {idx}: '.' removed")
+            line = line[1:]
+
+        # --- yEnc control lines ---
+        if line.startswith(b"=ybegin"):
+            saw_ybegin = True
+            if debug:
+                print(f"   [yEnc] =ybegin detected")
+            continue
+
+        if line.startswith(b"=ypart"):
+            if debug:
+                print(f"   [yEnc] =ypart detected")
+            continue
+
+        if line.startswith(b"=yend"):
+            if debug:
+                print(f"   [yEnc] =yend detected")
+            continue
+
+        # --- actual yEnc data ---
+        data_lines += 1
+        i = 0
+        length = len(line)
+
+        while i < length:
+            c = line[i]
+
+            if c == ord('='):
+                # '=' escapes the next byte: subtract an extra 64 before the
+                # common -42 offset (yEnc spec).
+                i += 1
+                if i >= length:
+                    break
+                c = (line[i] - 64) & 0xFF
+
+            out.append((c - 42) & 0xFF)
+            i += 1
+
+    if debug:
+        print(f"   [yEnc] saw_ybegin={saw_ybegin}, decoded_data_lines={data_lines}")
+        print(f"   [yEnc] decoded_bytes={len(out)}")
+
+    if not saw_ybegin:
+        print("⚠️ WARNING: yEnc decoder used but =ybegin was NOT seen")
+
+    return bytes(out)
+
+
+# NOTE(review): dead duplicate of yenc_decode_lines kept from an earlier
+# iteration — should be deleted in a follow-up commit.
+# def yenc_decode_lines(lines: list[bytes]) -> bytes:
+#     """
+#     Decode yEnc from NNTP BODY lines.
+#     Handles NNTP dot-stuffing correctly.
+#     """
+#     out = bytearray()
+#
+#     for line in lines:
+#         # --- undo NNTP dot-stuffing ---
+#         if line.startswith(b".."):
+#             line = line[1:]
+#         elif line.startswith(b"."):
+#             line = line[1:]
+#
+#         # --- skip yEnc control lines ---
+#         if line.startswith(b"=ybegin"):
+#             continue
+#         if line.startswith(b"=ypart"):
+#             continue
+#         if line.startswith(b"=yend"):
+#             continue
+#
+#         i = 0
+#         length = len(line)
+#
+#         while i < length:
+#             c = line[i]
+#
+#             if c == ord('='):  # yEnc escape
+#                 i += 1
+#                 if i >= length:
+#                     break
+#                 c = (line[i] - 64) & 0xFF
+#
+#             out.append((c - 42) & 0xFF)
+#             i += 1
+#
+#     return bytes(out)
+
+
+# ================== CONFIG ==================
+GROUP = "alt.binaries.e-book.magazines"
+SUBJECT_KEY = "PC Pro 2011-07.pdf"
+OUT_DIR = r"downloads/PC_Pro_2011-07"
+FINAL_PDF = r"downloads/PC_Pro_2011-07.pdf"
+# ============================================
+
+load_dotenv()
+EWEKA_USER = os.getenv("EWEKA_USER")
+EWEKA_PASS = os.getenv("EWEKA_PASS")
+
+os.makedirs(OUT_DIR, exist_ok=True)
+
+print("πŸ”Œ Connecting to PostgreSQL...")
+conn = get_conn()
+cur = conn.cursor()
+
+# --- load article numbers + subject ---
+cur.execute("""
+    SELECT article_number, metadata->>'subject'
+    FROM articles
+    WHERE newsgroup = %s
+      AND metadata->>'subject' LIKE %s
+    ORDER BY article_number
+""", (GROUP, f"%{SUBJECT_KEY}%"))
+
+rows = cur.fetchall()
+print(f"πŸ“¦ Found {len(rows)} parts")
+
+# --- parse part number from subject, e.g. "(12/34)" ---
+part_re = re.compile(r"\((\d+)\s*/\s*\d+\)")
+
+parts = []
+for art_num, subject in rows:
+    m = part_re.search(subject or "")
+    if not m:
+        raise RuntimeError(f"Cannot parse part number from subject: {subject}")
+    part_no = int(m.group(1))
+    parts.append((part_no, art_num))
+
+# sort by part number (1..N)
+parts.sort(key=lambda x: x[0])
+
+print("πŸ”Œ Connecting to Eweka NNTP...")
+with nntplib.NNTP_SSL(
+    "news.eweka.nl",
+    563,
+    EWEKA_USER,
+    EWEKA_PASS,
+    readermode=True
+) as nntp:
+
+    nntp.group(GROUP)
+
+    for idx, (part_no, art_num) in enumerate(parts, start=1):
+        out_path = os.path.join(OUT_DIR, f"part_{part_no:03d}.bin")
+
+        if os.path.exists(out_path):
+            print(f"⏭️ [{idx}/{len(parts)}] part {part_no} already exists, skipping")
+            continue
+
+        print(f"⬇️ [{idx}/{len(parts)}] Downloading part {part_no} (article {art_num})")
+
+        resp, info = nntp.body(art_num)
+
+        print(f"   BODY lines received: {len(info.lines)}")
+
+        # quick sanity check of the first few raw lines
+        for ln in info.lines[:3]:
+            print(f"   RAW:", ln[:80])
+
+        decoded = yenc_decode_lines(info.lines, debug=True)
+
+        print(f"   RESULT bytes: {len(decoded)}")
+
+        with open(out_path, "wb") as f:
+            f.write(decoded)
+
+
+print("🧩 Assembling final PDF...")
+
+with open(FINAL_PDF, "wb") as out:
+    for part_no, _ in parts:
+        part_path = os.path.join(OUT_DIR, f"part_{part_no:03d}.bin")
+        with open(part_path, "rb") as pf:
+            out.write(pf.read())
+
+print("πŸŽ‰ DONE")
+print(f"πŸ“„ Final PDF: {FINAL_PDF}")
diff --git a/db.py b/db.py
new file mode 100644
index 0000000..b107d59
--- /dev/null
+++ b/db.py
@@ -0,0 +1,14 @@
+from dotenv import load_dotenv
+import os, psycopg
+
+load_dotenv()
+
+def get_conn():
+    return psycopg.connect(
+        host=os.getenv("PG_HOST"),
+        port=int(os.getenv("PG_PORT", 5432)),
+        dbname=os.getenv("PG_DB"),
+        user=os.getenv("PG_USER"),
+        password=os.getenv("PG_PASS"),
+        connect_timeout=5,
+    )