This commit is contained in:
2025-12-27 17:24:30 +01:00
parent ea485bad29
commit f8dc6566bc
7 changed files with 542 additions and 0 deletions

10
.env Normal file
View File

@@ -0,0 +1,10 @@
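# NOTE(review): this file contains live credentials and should not be committed.
# Rotate these secrets, add .env to .gitignore, and commit a .env.example with
# placeholder values instead.
# ===== EWEKA =====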
EWEKA_USER=d6ef27c2d6496b22
EWEKA_PASS=Vlado7309208104
# ===== POSTGRES =====
PG_HOST=192.168.1.76
PG_PORT=5432
PG_DB=newsgroups
PG_USER=vladimir.buzalka
PG_PASS=Vlado7309208104++

58
10 list of newsgroups.py Normal file
View File

@@ -0,0 +1,58 @@
from datetime import datetime, UTC
import nntplib
from db import get_conn
from dotenv import load_dotenv
import os
# Fetch the provider's full newsgroup list (LIST ACTIVE) and upsert it into the
# `newsgroups` table, one row per group.

load_dotenv()  # loads .env

EWEKA_USER = os.getenv("EWEKA_USER")
EWEKA_PASS = os.getenv("EWEKA_PASS")
PROVIDER = "eweka"

print("🔌 Connecting to PostgreSQL...")
# The connection is used as a context manager so it is closed when the script
# finishes or fails, instead of lingering until interpreter shutdown.
with get_conn() as conn:
    conn.autocommit = True
    cur = conn.cursor()

    print("🔌 Connecting to Eweka NNTP...")
    with nntplib.NNTP_SSL(
        host="news.eweka.nl",
        port=563,
        user=EWEKA_USER,
        password=EWEKA_PASS,
        readermode=True,
    ) as nntp:
        print("📜 Fetching LIST ACTIVE...")
        resp, groups = nntp.list()

    print(f"📦 Received {len(groups)} groups")

    # A single timestamp for the whole snapshot: every row of one LIST fetch
    # shares the same fetched_at.
    fetched_at = datetime.now(UTC)

    # Each entry unpacks as (name, last, first, flag); the table columns are
    # ordered first_article, last_article, hence the swap below.
    rows = [
        (name, int(first), int(last), flag, PROVIDER, fetched_at)
        for name, last, first, flag in groups
    ]

    cur.executemany(
        """
        INSERT INTO newsgroups
        (name, first_article, last_article, posting_flag, provider, fetched_at)
        VALUES (%s, %s, %s, %s, %s, %s)
        ON CONFLICT (name) DO UPDATE SET
        first_article = EXCLUDED.first_article,
        last_article = EXCLUDED.last_article,
        posting_flag = EXCLUDED.posting_flag,
        fetched_at = EXCLUDED.fetched_at
        """,
        rows,
    )

print("🎉 DONE")

View File

@@ -0,0 +1,93 @@
import nntplib
import os
from dotenv import load_dotenv
from datetime import datetime, UTC
from db import get_conn
from psycopg.types.json import Json
def sanitize(value):
    """Return *value* made safe for strict UTF-8 encoding.

    Non-string values are returned unchanged. Strings are round-tripped
    through UTF-8: encoding with ``surrogatepass`` lets lone surrogates
    through as bytes, and decoding with ``replace`` turns those bytes into
    U+FFFD replacement characters.
    """
    if not isinstance(value, str):
        return value
    raw = value.encode("utf-8", errors="surrogatepass")
    return raw.decode("utf-8", errors="replace")
# Ingest XOVER overview data for the newest TOTAL_ARTICLES articles of GROUP
# into the `articles` table, BATCH_SIZE article numbers per XOVER command.

# ================= CONFIG =================
GROUP = "alt.binaries.e-book.magazines"
TOTAL_ARTICLES = 100_000
BATCH_SIZE = 1_000
# =========================================

load_dotenv()
EWEKA_USER = os.getenv("EWEKA_USER")
EWEKA_PASS = os.getenv("EWEKA_PASS")

print("🔌 Connecting to PostgreSQL...")
# Context manager guarantees the connection is closed when the run ends or fails.
with get_conn() as conn:
    conn.autocommit = True
    cur = conn.cursor()

    print("🔌 Connecting to Eweka NNTP...")
    with nntplib.NNTP_SSL(
        host="news.eweka.nl",
        port=563,
        user=EWEKA_USER,
        password=EWEKA_PASS,
        readermode=True,
    ) as nntp:
        # --- GROUP ---
        resp, count, first, last, name = nntp.group(GROUP)
        first = int(first)
        last = int(last)
        print(f"📂 Group: {name}")
        print(f"📐 Range: {first} {last}")

        # Newest TOTAL_ARTICLES article numbers, clamped to what the group holds.
        start_global = max(first, last - TOTAL_ARTICLES + 1)
        print(f"🎯 Target range: {start_global} {last}")

        for batch_start in range(start_global, last + 1, BATCH_SIZE):
            batch_end = min(batch_start + BATCH_SIZE - 1, last)
            print(f"📜 XOVER {batch_start}-{batch_end}")
            try:
                resp, overviews = nntp.xover(batch_start, batch_end)
            except Exception as e:
                # Best effort: report the failed range and move on to the next one.
                print(f"⚠️ XOVER failed for {batch_start}-{batch_end}: {e}")
                continue

            fetched_at = datetime.now(UTC)
            rows = []
            for art_num, fields in overviews:
                clean_fields = {k: sanitize(v) for k, v in fields.items()}
                metadata = {
                    "group": GROUP,
                    "article_number": art_num,
                    **clean_fields,
                }
                rows.append((
                    GROUP,
                    art_num,
                    # Use the sanitized value: a raw Message-ID with undecodable
                    # bytes would fail to encode when sent to PostgreSQL.
                    clean_fields.get("message-id"),
                    Json(metadata),  # wrap the dict so it is sent as JSON
                    fetched_at,
                ))

            if rows:
                cur.executemany(
                    """
                    INSERT INTO articles
                    (newsgroup, article_number, message_id, metadata, fetched_at)
                    VALUES (%s, %s, %s, %s, %s)
                    ON CONFLICT (message_id) DO NOTHING
                    """,
                    rows,
                )

print("🎉 DONE last 100k articles ingested")

94
21 poslednich 100k.py Normal file
View File

@@ -0,0 +1,94 @@
import nntplib
import os
from dotenv import load_dotenv
from datetime import datetime, UTC
from db import get_conn
from psycopg.types.json import Json
def sanitize(value):
    """Return *value* made safe for strict UTF-8 encoding.

    Non-string values are returned unchanged. Strings are round-tripped
    through UTF-8: encoding with ``surrogatepass`` lets lone surrogates
    through as bytes, and decoding with ``replace`` turns those bytes into
    U+FFFD replacement characters.
    """
    if not isinstance(value, str):
        return value
    raw = value.encode("utf-8", errors="surrogatepass")
    return raw.decode("utf-8", errors="replace")
# Ingest XOVER overview data for up to TOTAL_ARTICLES articles of GROUP,
# starting at the group's first article number, into the `articles` table.

# ================= CONFIG =================
GROUP = "alt.binaries.e-book.magazines"
TOTAL_ARTICLES = 50_000_000
BATCH_SIZE = 10_000
# =========================================

load_dotenv()
EWEKA_USER = os.getenv("EWEKA_USER")
EWEKA_PASS = os.getenv("EWEKA_PASS")

print("🔌 Connecting to PostgreSQL...")
# Context manager guarantees the connection is closed when the run ends or fails.
with get_conn() as conn:
    conn.autocommit = True
    cur = conn.cursor()

    print("🔌 Connecting to Eweka NNTP...")
    with nntplib.NNTP_SSL(
        host="news.eweka.nl",
        port=563,
        user=EWEKA_USER,
        password=EWEKA_PASS,
        readermode=True,
    ) as nntp:
        # --- GROUP ---
        resp, count, first, last, name = nntp.group(GROUP)
        first = int(first)
        last = int(last)

        # Oldest TOTAL_ARTICLES article numbers, clamped to what the group holds.
        start_global = first
        end_global = min(first + TOTAL_ARTICLES - 1, last)
        print(f"🎯 Target range: {start_global} {end_global}")
        print(f"📂 Group: {name}")
        print(f"📐 Range: {first} {last}")

        for batch_start in range(start_global, end_global + 1, BATCH_SIZE):
            batch_end = min(batch_start + BATCH_SIZE - 1, end_global)
            print(f"📜 XOVER {batch_start}-{batch_end}")
            try:
                resp, overviews = nntp.xover(batch_start, batch_end)
            except Exception as e:
                # Best effort: report the failed range and move on to the next one.
                print(f"⚠️ XOVER failed for {batch_start}-{batch_end}: {e}")
                continue

            fetched_at = datetime.now(UTC)
            rows = []
            for art_num, fields in overviews:
                clean_fields = {k: sanitize(v) for k, v in fields.items()}
                metadata = {
                    "group": GROUP,
                    "article_number": art_num,
                    **clean_fields,
                }
                rows.append((
                    GROUP,
                    art_num,
                    # Use the sanitized value: a raw Message-ID with undecodable
                    # bytes would fail to encode when sent to PostgreSQL.
                    clean_fields.get("message-id"),
                    Json(metadata),  # wrap the dict so it is sent as JSON
                    fetched_at,
                ))

            if rows:
                cur.executemany(
                    """
                    INSERT INTO articles
                    (newsgroup, article_number, message_id, metadata, fetched_at)
                    VALUES (%s, %s, %s, %s, %s)
                    ON CONFLICT (message_id) DO NOTHING
                    """,
                    rows,
                )

# Report the range that was actually walked; this script starts from the
# group's first article, so a fixed "last 100k" message would be wrong.
print(f"🎉 DONE articles {start_global}-{end_global} ingested")

View File

@@ -0,0 +1,71 @@
import nntplib
import os
from dotenv import load_dotenv
from db import get_conn
# Look up every article in GROUP whose subject contains SUBJECT_KEY and ask the
# NNTP server (STAT) whether each one is still available.

# ================== CONFIG ==================
GROUP = "alt.binaries.e-book.magazines"
SUBJECT_KEY = "PC Pro 2011-07.pdf"
# ============================================

load_dotenv()
EWEKA_USER = os.getenv("EWEKA_USER")
EWEKA_PASS = os.getenv("EWEKA_PASS")

print("🔌 Connecting to PostgreSQL...")
# The DB is only needed for this one query; closing it here avoids holding an
# open transaction for the whole duration of the NNTP scan.
with get_conn() as conn:
    cur = conn.cursor()
    cur.execute(
        """
        SELECT article_number
        FROM articles
        WHERE newsgroup = %s
        AND metadata->>'subject' LIKE %s
        ORDER BY article_number
        """,
        (GROUP, f"%{SUBJECT_KEY}%"),
    )
    article_numbers = [row[0] for row in cur.fetchall()]

total = len(article_numbers)
print(f"📦 Found {total} parts in DB")
if total == 0:
    print("❌ No articles found, aborting.")
    # SystemExit works everywhere; the bare exit() helper is only injected by
    # the `site` module and is meant for interactive use.
    raise SystemExit(1)

print("🔌 Connecting to Eweka NNTP...")
with nntplib.NNTP_SSL(
    "news.eweka.nl",
    563,
    EWEKA_USER,
    EWEKA_PASS,
    readermode=True,
) as nntp:
    nntp.group(GROUP)
    existing = []
    missing = []
    for idx, art in enumerate(article_numbers, start=1):
        try:
            nntp.stat(art)
        except nntplib.NNTPTemporaryError:
            # 4xx reply: the server has no such article. Other failures (a lost
            # connection, protocol errors) propagate instead of being
            # miscounted as missing parts.
            missing.append(art)
            print(f"❌ [{idx}/{total}] MISSING article {art}")
        else:
            existing.append(art)
            print(f"✅ [{idx}/{total}] EXISTS article {art}")

print("\n================ RESULT ================")
print(f"Total parts : {total}")
print(f"Existing : {len(existing)}")
print(f"Missing : {len(missing)}")
if existing:
    print("\nExisting article_numbers:")
    print(existing)
if missing:
    print("\nMissing article_numbers (first 20):")
    print(missing[:20])

202
23 ulozeni a slepeni.py Normal file
View File

@@ -0,0 +1,202 @@
import os
import re
import nntplib
from dotenv import load_dotenv
from db import get_conn
def yenc_decode_lines(lines: list[bytes], debug=False, dot_stuffed=False) -> bytes:
    """Decode the yEnc payload contained in NNTP BODY lines.

    Lines starting with ``=ybegin``, ``=ypart`` or ``=yend`` are treated as
    control lines and skipped; every other line is decoded as data.

    Args:
        lines: Body lines. Every byte of a data line is decoded, so lines are
            expected without trailing CR/LF.
        debug: Print a trace of control lines and final counters.
        dot_stuffed: Pass True only for raw wire lines that still carry NNTP
            dot-stuffing (a doubled leading "."). ``nntplib`` removes the
            stuffing itself, so by default leading dots are left alone:
            stripping them a second time would drop real data bytes.

    Returns:
        The decoded bytes of all data lines, concatenated in order.
    """
    control_markers = (b"=ybegin", b"=ypart", b"=yend")
    out = bytearray()
    saw_ybegin = False
    data_lines = 0

    for idx, line in enumerate(lines):
        # --- NNTP dot-stuffing (only when the caller hands us raw lines) ---
        if dot_stuffed and line.startswith(b".."):
            if debug:
                print(f" [dot] line {idx}: '..' -> '.'")
            line = line[1:]

        # --- yEnc control lines ---
        marker = next((m for m in control_markers if line.startswith(m)), None)
        if marker is not None:
            if marker == b"=ybegin":
                saw_ybegin = True
            if debug:
                print(f" [yEnc] {marker.decode('ascii')} detected")
            continue

        # --- actual yEnc data ---
        data_lines += 1
        encoded = iter(line)
        for byte in encoded:
            if byte == 0x3D:  # "=" escapes the following byte
                byte = next(encoded, None)
                if byte is None:
                    break  # dangling escape at end of line: nothing to decode
                byte = (byte - 64) & 0xFF
            out.append((byte - 42) & 0xFF)

    if debug:
        print(f" [yEnc] saw_ybegin={saw_ybegin}, decoded_data_lines={data_lines}")
        print(f" [yEnc] decoded_bytes={len(out)}")
    if not saw_ybegin:
        print("⚠️ WARNING: yEnc decoder used but =ybegin was NOT seen")
    return bytes(out)
# def yenc_decode_lines(lines: list[bytes]) -> bytes:
# """
# Decode yEnc from NNTP BODY lines.
# Handles NNTP dot-stuffing correctly.
# """
# out = bytearray()
#
# for line in lines:
# # --- undo NNTP dot-stuffing ---
# if line.startswith(b".."):
# line = line[1:]
# elif line.startswith(b"."):
# line = line[1:]
#
# # --- skip yEnc control lines ---
# if line.startswith(b"=ybegin"):
# continue
# if line.startswith(b"=ypart"):
# continue
# if line.startswith(b"=yend"):
# continue
#
# i = 0
# length = len(line)
#
# while i < length:
# c = line[i]
#
# if c == ord('='): # yEnc escape
# i += 1
# if i >= length:
# break
# c = (line[i] - 64) & 0xFF
#
# out.append((c - 42) & 0xFF)
# i += 1
#
# return bytes(out)
# Download every part of the post whose subject contains SUBJECT_KEY, yEnc-decode
# each part into OUT_DIR, then concatenate the parts (by part number) into FINAL_PDF.

# ================== CONFIG ==================
GROUP = "alt.binaries.e-book.magazines"
SUBJECT_KEY = "PC Pro 2011-07.pdf"
OUT_DIR = r"downloads/PC_Pro_2011-07"
FINAL_PDF = r"downloads/PC_Pro_2011-07.pdf"
# ============================================

load_dotenv()
EWEKA_USER = os.getenv("EWEKA_USER")
EWEKA_PASS = os.getenv("EWEKA_PASS")
os.makedirs(OUT_DIR, exist_ok=True)

print("🔌 Connecting to PostgreSQL...")
# The DB is only needed for this one query; closing it here avoids holding an
# open transaction while the parts are downloaded.
with get_conn() as conn:
    cur = conn.cursor()
    # --- load article numbers + subject ---
    cur.execute(
        """
        SELECT article_number, metadata->>'subject'
        FROM articles
        WHERE newsgroup = %s
        AND metadata->>'subject' LIKE %s
        ORDER BY article_number
        """,
        (GROUP, f"%{SUBJECT_KEY}%"),
    )
    rows = cur.fetchall()
print(f"📦 Found {len(rows)} parts")

# --- parse part number from subject, e.g. "(12/345)" ---
part_re = re.compile(r"\((\d+)\s*/\s*\d+\)")
parts = []
for art_num, subject in rows:
    m = part_re.search(subject or "")
    if not m:
        raise RuntimeError(f"Cannot parse part number from subject: {subject}")
    part_no = int(m.group(1))
    parts.append((part_no, art_num))

# sort by part number (1..N)
parts.sort(key=lambda x: x[0])

print("🔌 Connecting to Eweka NNTP...")
with nntplib.NNTP_SSL(
    "news.eweka.nl",
    563,
    EWEKA_USER,
    EWEKA_PASS,
    readermode=True,
) as nntp:
    nntp.group(GROUP)
    for idx, (part_no, art_num) in enumerate(parts, start=1):
        out_path = os.path.join(OUT_DIR, f"part_{part_no:03d}.bin")
        if os.path.exists(out_path):
            print(f"⏭️ [{idx}/{len(parts)}] part {part_no} already exists, skipping")
            continue

        print(f"⬇️ [{idx}/{len(parts)}] Downloading part {part_no} (article {art_num})")
        resp, info = nntp.body(art_num)
        print(f" BODY lines received: {len(info.lines)}")
        # quick look at the first few lines
        for ln in info.lines[:3]:
            print(" RAW:", ln[:80])

        decoded = yenc_decode_lines(info.lines, debug=True)
        print(f" RESULT bytes: {len(decoded)}")

        # Write to a temp name and rename: an interrupted run must not leave a
        # truncated part_NNN.bin that the "already exists" check would skip.
        tmp_path = out_path + ".tmp"
        with open(tmp_path, "wb") as f:
            f.write(decoded)
        os.replace(tmp_path, out_path)

print("🧩 Assembling final PDF...")
with open(FINAL_PDF, "wb") as out:
    for part_no, _ in parts:
        part_path = os.path.join(OUT_DIR, f"part_{part_no:03d}.bin")
        with open(part_path, "rb") as pf:
            out.write(pf.read())

print("🎉 DONE")
print(f"📄 Final PDF: {FINAL_PDF}")

14
db.py Normal file
View File

@@ -0,0 +1,14 @@
from dotenv import load_dotenv
import os, psycopg
# Runs at import time, before get_conn() reads the PG_* variables from the environment.
load_dotenv()
def get_conn():
    """Open and return a new psycopg connection configured from PG_* variables.

    Reads PG_HOST, PG_PORT (5432 when unset), PG_DB, PG_USER and PG_PASS from
    the environment and uses a 5-second connect timeout.
    """
    env = os.getenv
    settings = {
        "host": env("PG_HOST"),
        "port": int(env("PG_PORT", 5432)),
        "dbname": env("PG_DB"),
        "user": env("PG_USER"),
        "password": env("PG_PASS"),
        "connect_timeout": 5,
    }
    return psycopg.connect(**settings)