z230
.env (Normal file, 10 lines)
@@ -0,0 +1,10 @@
# ===== EWEKA =====
EWEKA_USER=d6ef27c2d6496b22
EWEKA_PASS=Vlado7309208104

# ===== POSTGRES =====
PG_HOST=192.168.1.76
PG_PORT=5432
PG_DB=newsgroups
PG_USER=vladimir.buzalka
PG_PASS=Vlado7309208104++
10 list of newsgroups.py (Normal file, 58 lines)
@@ -0,0 +1,58 @@
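# Fetches the full LIST ACTIVE from the Eweka NNTP server and upserts every
# newsgroup (name, article range, posting flag) into the `newsgroups` table.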
from datetime import datetime, UTC
import nntplib
from db import get_conn
from dotenv import load_dotenv
import os

load_dotenv()  # load .env
EWEKA_USER = os.getenv("EWEKA_USER")
EWEKA_PASS = os.getenv("EWEKA_PASS")

PROVIDER = "eweka"

print("🔌 Connecting to PostgreSQL...")
conn = get_conn()

conn.autocommit = True
cur = conn.cursor()

print("🔌 Connecting to Eweka NNTP...")
with nntplib.NNTP_SSL(
    host="news.eweka.nl",
    port=563,
    user=EWEKA_USER,
    password=EWEKA_PASS,
    readermode=True,
) as nntp:

    print("📜 Fetching LIST ACTIVE...")
    resp, groups = nntp.list()
    print(f"📦 Received {len(groups)} groups")

    rows = [
        (
            name,
            int(first),
            int(last),
            flag,
            PROVIDER,
            datetime.now(UTC),
        )
        for name, last, first, flag in groups
    ]

    cur.executemany(
        """
        INSERT INTO newsgroups
            (name, first_article, last_article, posting_flag, provider, fetched_at)
        VALUES (%s, %s, %s, %s, %s, %s)
        ON CONFLICT (name) DO UPDATE SET
            first_article = EXCLUDED.first_article,
            last_article = EXCLUDED.last_article,
            posting_flag = EXCLUDED.posting_flag,
            fetched_at = EXCLUDED.fetched_at
        """,
        rows,
    )

print("🎉 DONE")
20 Alt binaries ebook magazines.py (Normal file, 93 lines)
@@ -0,0 +1,93 @@
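# Fetches XOVER overview data for the newest TOTAL_ARTICLES articles of the
# configured group in BATCH_SIZE chunks and upserts them into the `articles` table.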
import nntplib
import os
from dotenv import load_dotenv
from datetime import datetime, UTC
from db import get_conn
from psycopg.types.json import Json


def sanitize(value):
    if isinstance(value, str):
        return value.encode("utf-8", errors="surrogatepass") \
                    .decode("utf-8", errors="replace")
    return value


# ================= CONFIG =================
GROUP = "alt.binaries.e-book.magazines"
TOTAL_ARTICLES = 100_000
BATCH_SIZE = 1_000
# =========================================

load_dotenv()

EWEKA_USER = os.getenv("EWEKA_USER")
EWEKA_PASS = os.getenv("EWEKA_PASS")

print("🔌 Connecting to PostgreSQL...")
conn = get_conn()
conn.autocommit = True
cur = conn.cursor()

print("🔌 Connecting to Eweka NNTP...")
with nntplib.NNTP_SSL(
    host="news.eweka.nl",
    port=563,
    user=EWEKA_USER,
    password=EWEKA_PASS,
    readermode=True,
) as nntp:

    # --- GROUP ---
    resp, count, first, last, name = nntp.group(GROUP)
    first = int(first)
    last = int(last)

    print(f"📂 Group: {name}")
    print(f"📐 Range: {first} – {last}")

    start_global = max(first, last - TOTAL_ARTICLES + 1)
    print(f"🎯 Target range: {start_global} – {last}")

    for batch_start in range(start_global, last + 1, BATCH_SIZE):
        batch_end = min(batch_start + BATCH_SIZE - 1, last)

        print(f"📜 XOVER {batch_start}-{batch_end}")

        try:
            resp, overviews = nntp.xover(batch_start, batch_end)
        except Exception as e:
            print(f"⚠️ XOVER failed for {batch_start}-{batch_end}: {e}")
            continue

        rows = []
        for art_num, fields in overviews:
            clean_fields = {k: sanitize(v) for k, v in fields.items()}

            metadata = {
                "group": GROUP,
                "article_number": art_num,
                **clean_fields,
            }

            rows.append((
                GROUP,
                art_num,
                fields.get("message-id"),
                Json(metadata),  # 👈 HERE – wrap the dict for the JSONB column
                datetime.now(UTC),
            ))

        if rows:
            cur.executemany(
                """
                INSERT INTO articles
                    (newsgroup, article_number, message_id, metadata, fetched_at)
                VALUES (%s, %s, %s, %s, %s)
                ON CONFLICT (message_id) DO NOTHING
                """,
                rows,
            )

print("🎉 DONE – last 100k articles ingested")
21 poslednich 100k.py (Normal file, 94 lines)
@@ -0,0 +1,94 @@
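# Same ingest as the previous script, but walks forward from the oldest article:
# fetches XOVER data for up to TOTAL_ARTICLES articles starting at `first`,
# in BATCH_SIZE chunks, and upserts them into `articles`.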
import nntplib
import os
from dotenv import load_dotenv
from datetime import datetime, UTC
from db import get_conn
from psycopg.types.json import Json


def sanitize(value):
    if isinstance(value, str):
        return value.encode("utf-8", errors="surrogatepass") \
                    .decode("utf-8", errors="replace")
    return value


# ================= CONFIG =================
GROUP = "alt.binaries.e-book.magazines"
TOTAL_ARTICLES = 50_000_000
BATCH_SIZE = 10_000
# =========================================

load_dotenv()

EWEKA_USER = os.getenv("EWEKA_USER")
EWEKA_PASS = os.getenv("EWEKA_PASS")

print("🔌 Connecting to PostgreSQL...")
conn = get_conn()
conn.autocommit = True
cur = conn.cursor()

print("🔌 Connecting to Eweka NNTP...")
with nntplib.NNTP_SSL(
    host="news.eweka.nl",
    port=563,
    user=EWEKA_USER,
    password=EWEKA_PASS,
    readermode=True,
) as nntp:

    # --- GROUP ---
    resp, count, first, last, name = nntp.group(GROUP)
    first = int(first)
    last = int(last)

    start_global = first
    end_global = min(first + TOTAL_ARTICLES - 1, last)

    print(f"🎯 Target range: {start_global} – {end_global}")

    print(f"📂 Group: {name}")
    print(f"📐 Range: {first} – {last}")

    for batch_start in range(start_global, end_global + 1, BATCH_SIZE):
        batch_end = min(batch_start + BATCH_SIZE - 1, end_global)

        print(f"📜 XOVER {batch_start}-{batch_end}")

        try:
            resp, overviews = nntp.xover(batch_start, batch_end)
        except Exception as e:
            print(f"⚠️ XOVER failed for {batch_start}-{batch_end}: {e}")
            continue

        rows = []
        for art_num, fields in overviews:
            clean_fields = {k: sanitize(v) for k, v in fields.items()}

            metadata = {
                "group": GROUP,
                "article_number": art_num,
                **clean_fields,
            }

            rows.append((
                GROUP,
                art_num,
                fields.get("message-id"),
                Json(metadata),  # 👈 HERE – wrap the dict for the JSONB column
                datetime.now(UTC),
            ))

        if rows:
            cur.executemany(
                """
                INSERT INTO articles
                    (newsgroup, article_number, message_id, metadata, fetched_at)
                VALUES (%s, %s, %s, %s, %s)
                ON CONFLICT (message_id) DO NOTHING
                """,
                rows,
            )

print(f"🎉 DONE – articles {start_global}–{end_global} ingested")
22 stat test posledniho clanku.py (Normal file, 71 lines)
@@ -0,0 +1,71 @@
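# Looks up the article numbers of all parts of one post (matched by SUBJECT_KEY)
# in the local DB, then STATs each one on the server to see which parts still exist.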
import nntplib
import os
from dotenv import load_dotenv
from db import get_conn

# ================== CONFIG ==================
GROUP = "alt.binaries.e-book.magazines"
SUBJECT_KEY = "PC Pro 2011-07.pdf"
# ============================================

load_dotenv()

EWEKA_USER = os.getenv("EWEKA_USER")
EWEKA_PASS = os.getenv("EWEKA_PASS")

print("🔌 Connecting to PostgreSQL...")
conn = get_conn()
cur = conn.cursor()

cur.execute("""
    SELECT article_number
    FROM articles
    WHERE newsgroup = %s
      AND metadata->>'subject' LIKE %s
    ORDER BY article_number
""", (GROUP, f"%{SUBJECT_KEY}%"))

article_numbers = [row[0] for row in cur.fetchall()]
total = len(article_numbers)

print(f"📦 Found {total} parts in DB")

if total == 0:
    print("❌ No articles found, aborting.")
    exit(1)

print("🔌 Connecting to Eweka NNTP...")
with nntplib.NNTP_SSL(
    "news.eweka.nl",
    563,
    EWEKA_USER,
    EWEKA_PASS,
    readermode=True,
) as nntp:

    nntp.group(GROUP)

    existing = []
    missing = []

    for idx, art in enumerate(article_numbers, start=1):
        try:
            nntp.stat(art)
            existing.append(art)
            print(f"✅ [{idx}/{total}] EXISTS article {art}")
        except Exception:
            missing.append(art)
            print(f"❌ [{idx}/{total}] MISSING article {art}")

print("\n================ RESULT ================")
print(f"Total parts : {total}")
print(f"Existing    : {len(existing)}")
print(f"Missing     : {len(missing)}")

if existing:
    print("\nExisting article_numbers:")
    print(existing)

if missing:
    print("\nMissing article_numbers (first 20):")
    print(missing[:20])
23 ulozeni a slepeni.py (Normal file, 202 lines)
@@ -0,0 +1,202 @@
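# Downloads every part of one post (matched by SUBJECT_KEY) via NNTP BODY,
# yEnc-decodes each part into OUT_DIR, then concatenates the parts in order
# into the final PDF.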
import os
import re
import nntplib
from dotenv import load_dotenv
from db import get_conn


def yenc_decode_lines(lines: list[bytes], debug=False) -> bytes:
    """
    Decode yEnc from NNTP BODY lines.
    Handles NNTP dot-stuffing and logs what happens.
    """
    out = bytearray()
    saw_ybegin = False
    data_lines = 0

    for idx, orig_line in enumerate(lines):
        line = orig_line

        # --- NNTP dot-stuffing ---
        if line.startswith(b".."):
            if debug:
                print(f"   [dot] line {idx}: '..' -> '.'")
            line = line[1:]
        elif line.startswith(b"."):
            if debug:
                print(f"   [dot] line {idx}: '.' removed")
            line = line[1:]

        # --- yEnc control lines ---
        if line.startswith(b"=ybegin"):
            saw_ybegin = True
            if debug:
                print("   [yEnc] =ybegin detected")
            continue

        if line.startswith(b"=ypart"):
            if debug:
                print("   [yEnc] =ypart detected")
            continue

        if line.startswith(b"=yend"):
            if debug:
                print("   [yEnc] =yend detected")
            continue

        # --- actual yEnc data ---
        data_lines += 1
        i = 0
        length = len(line)

        while i < length:
            c = line[i]

            if c == ord('='):
                i += 1
                if i >= length:
                    break
                c = (line[i] - 64) & 0xFF

            out.append((c - 42) & 0xFF)
            i += 1

    if debug:
        print(f"   [yEnc] saw_ybegin={saw_ybegin}, decoded_data_lines={data_lines}")
        print(f"   [yEnc] decoded_bytes={len(out)}")

    if not saw_ybegin:
        print("⚠️ WARNING: yEnc decoder used but =ybegin was NOT seen")

    return bytes(out)
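
# Illustrative note on the escape arithmetic above (not part of the original
# commit): yEnc encodes each byte as (byte + 42) mod 256; if the result is a
# critical character (NUL, LF, CR, '='), it is written as '=' followed by
# (result + 64) mod 256. For example, 0xD6 encodes to '=' '@' (0x3D 0x40), and
# decoding reverses it: 0x40 - 64 = 0x00, then (0x00 - 42) mod 256 = 0xD6.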


# def yenc_decode_lines(lines: list[bytes]) -> bytes:
#     """
#     Decode yEnc from NNTP BODY lines.
#     Handles NNTP dot-stuffing correctly.
#     """
#     out = bytearray()
#
#     for line in lines:
#         # --- undo NNTP dot-stuffing ---
#         if line.startswith(b".."):
#             line = line[1:]
#         elif line.startswith(b"."):
#             line = line[1:]
#
#         # --- skip yEnc control lines ---
#         if line.startswith(b"=ybegin"):
#             continue
#         if line.startswith(b"=ypart"):
#             continue
#         if line.startswith(b"=yend"):
#             continue
#
#         i = 0
#         length = len(line)
#
#         while i < length:
#             c = line[i]
#
#             if c == ord('='):  # yEnc escape
#                 i += 1
#                 if i >= length:
#                     break
#                 c = (line[i] - 64) & 0xFF
#
#             out.append((c - 42) & 0xFF)
#             i += 1
#
#     return bytes(out)


# ================== CONFIG ==================
GROUP = "alt.binaries.e-book.magazines"
SUBJECT_KEY = "PC Pro 2011-07.pdf"
OUT_DIR = r"downloads/PC_Pro_2011-07"
FINAL_PDF = r"downloads/PC_Pro_2011-07.pdf"
# ============================================

load_dotenv()
EWEKA_USER = os.getenv("EWEKA_USER")
EWEKA_PASS = os.getenv("EWEKA_PASS")

os.makedirs(OUT_DIR, exist_ok=True)

print("🔌 Connecting to PostgreSQL...")
conn = get_conn()
cur = conn.cursor()

# --- load article numbers + subject ---
cur.execute("""
    SELECT article_number, metadata->>'subject'
    FROM articles
    WHERE newsgroup = %s
      AND metadata->>'subject' LIKE %s
    ORDER BY article_number
""", (GROUP, f"%{SUBJECT_KEY}%"))

rows = cur.fetchall()
print(f"📦 Found {len(rows)} parts")

# --- parse part number from subject ---
part_re = re.compile(r"\((\d+)\s*/\s*\d+\)")

parts = []
for art_num, subject in rows:
    m = part_re.search(subject or "")
    if not m:
        raise RuntimeError(f"Cannot parse part number from subject: {subject}")
    part_no = int(m.group(1))
    parts.append((part_no, art_num))

# sort by part number (1..N)
parts.sort(key=lambda x: x[0])

print("🔌 Connecting to Eweka NNTP...")
with nntplib.NNTP_SSL(
    "news.eweka.nl",
    563,
    EWEKA_USER,
    EWEKA_PASS,
    readermode=True
) as nntp:

    nntp.group(GROUP)

    for idx, (part_no, art_num) in enumerate(parts, start=1):
        out_path = os.path.join(OUT_DIR, f"part_{part_no:03d}.bin")

        if os.path.exists(out_path):
            print(f"⏭️ [{idx}/{len(parts)}] part {part_no} already exists, skipping")
            continue

        print(f"⬇️ [{idx}/{len(parts)}] Downloading part {part_no} (article {art_num})")

        resp, info = nntp.body(art_num)

        print(f"   BODY lines received: {len(info.lines)}")

        # quick check of the first few raw lines
        for ln in info.lines[:3]:
            print("   RAW:", ln[:80])

        decoded = yenc_decode_lines(info.lines, debug=True)

        print(f"   RESULT bytes: {len(decoded)}")

        with open(out_path, "wb") as f:
            f.write(decoded)


print("🧩 Assembling final PDF...")

with open(FINAL_PDF, "wb") as out:
    for part_no, _ in parts:
        part_path = os.path.join(OUT_DIR, f"part_{part_no:03d}.bin")
        with open(part_path, "rb") as pf:
            out.write(pf.read())

print("🎉 DONE")
print(f"📄 Final PDF: {FINAL_PDF}")
db.py (Normal file, 14 lines)
@@ -0,0 +1,14 @@
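# Small helper shared by all scripts: builds a psycopg connection from the
# PG_* settings in .env.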
from dotenv import load_dotenv
import os, psycopg

load_dotenv()

def get_conn():
    return psycopg.connect(
        host=os.getenv("PG_HOST"),
        port=int(os.getenv("PG_PORT", 5432)),
        dbname=os.getenv("PG_DB"),
        user=os.getenv("PG_USER"),
        password=os.getenv("PG_PASS"),
        connect_timeout=5,
    )