This commit is contained in:
2025-12-28 11:34:15 +01:00
parent f8dc6566bc
commit 64472e59ba
6 changed files with 488 additions and 143 deletions

View File

@@ -3,119 +3,11 @@ import re
import nntplib
from dotenv import load_dotenv
from db import get_conn
def yenc_decode_lines(lines: list[bytes], debug=False, *, dot_stuffed=False) -> bytes:
    """
    Decode the yEnc payload carried in the BODY lines of an NNTP article.

    Lines starting with ``=ybegin``, ``=ypart`` or ``=yend`` are treated as
    yEnc control lines and skipped; every other line (including lines that
    appear before ``=ybegin``) is decoded as payload.

    Args:
        lines: Body lines without their line terminators.
        debug: When true, print a trace of what the decoder saw.
        dot_stuffed: Set to True only when ``lines`` are raw wire lines that
            still carry NNTP dot-stuffing. ``nntplib`` already removes the
            stuffed dot from the lines it returns, so stripping it again
            would drop a real payload byte whenever an encoded line starts
            with "." -- hence the default is False.

    Returns:
        The decoded bytes of all payload lines, concatenated in order.
    """
    markers = (b"=ybegin", b"=ypart", b"=yend")
    out = bytearray()
    saw_ybegin = False
    data_lines = 0

    for idx, line in enumerate(lines):
        # --- NNTP dot-stuffing (raw wire lines only) ---
        if dot_stuffed and line.startswith(b"."):
            if debug:
                if line.startswith(b".."):
                    print(f" [dot] line {idx}: '..' -> '.'")
                else:
                    print(f" [dot] line {idx}: '.' removed")
            line = line[1:]

        # --- yEnc control lines ---
        marker = next((m for m in markers if line.startswith(m)), None)
        if marker is not None:
            if marker == b"=ybegin":
                saw_ybegin = True
            if debug:
                print(f" [yEnc] {marker.decode('ascii')} detected")
            continue

        # --- actual yEnc data ---
        data_lines += 1
        octets = iter(line)
        for c in octets:
            if c == 0x3D:  # '=' escapes the next octet (stored with +64)
                c = next(octets, None)
                if c is None:
                    # Dangling escape at end of line: nothing left to decode.
                    break
                c = (c - 64) & 0xFF
            # Every payload octet is stored with an offset of 42 (mod 256).
            out.append((c - 42) & 0xFF)

    if debug:
        print(f" [yEnc] saw_ybegin={saw_ybegin}, decoded_data_lines={data_lines}")
        print(f" [yEnc] decoded_bytes={len(out)}")

    # NOTE(review): the original indentation was lost in this view; the warning
    # is assumed to be emitted regardless of ``debug`` -- confirm.
    if not saw_ybegin:
        print("⚠️ WARNING: yEnc decoder used but =ybegin was NOT seen")

    return bytes(out)
# def yenc_decode_lines(lines: list[bytes]) -> bytes:
# """
# Decode yEnc from NNTP BODY lines.
# Handles NNTP dot-stuffing correctly.
# """
# out = bytearray()
#
# for line in lines:
# # --- undo NNTP dot-stuffing ---
# if line.startswith(b".."):
# line = line[1:]
# elif line.startswith(b"."):
# line = line[1:]
#
# # --- skip yEnc control lines ---
# if line.startswith(b"=ybegin"):
# continue
# if line.startswith(b"=ypart"):
# continue
# if line.startswith(b"=yend"):
# continue
#
# i = 0
# length = len(line)
#
# while i < length:
# c = line[i]
#
# if c == ord('='): # yEnc escape
# i += 1
# if i >= length:
# break
# c = (line[i] - 64) & 0xFF
#
# out.append((c - 42) & 0xFF)
# i += 1
#
# return bytes(out)
# ================== CONFIG ==================
# Newsgroup that is selected on the NNTP server and used in the DB query.
GROUP = "alt.binaries.e-book.magazines"

# Substring searched for in the article subject (wrapped in %...% for the query).
SUBJECT_KEY = "PC Pro 2011-07.pdf"

# Directory that receives the per-part output files.
OUT_DIR = "downloads/PC_Pro_2011-07"

# Path of the assembled PDF.
FINAL_PDF = "downloads/PC_Pro_2011-07.pdf"
# ============================================
load_dotenv()
@@ -124,11 +16,47 @@ EWEKA_PASS = os.getenv("EWEKA_PASS")
os.makedirs(OUT_DIR, exist_ok=True)
print("🔌 Connecting to PostgreSQL...")
def yenc_decode_and_extract_headers(
    lines: list[bytes], *, dot_stuffed: bool = False
) -> tuple[bytes, list[str]]:
    """
    Decode yEnc BODY lines and collect the yEnc control lines.

    Lines starting with ``=ybegin``, ``=ypart`` or ``=yend`` are not decoded;
    they are returned as text (latin-1, so every byte maps to one character).
    All other lines are decoded as yEnc payload.

    Args:
        lines: Body lines without their line terminators.
        dot_stuffed: Set to True only when ``lines`` are raw wire lines that
            still carry NNTP dot-stuffing. ``nntplib`` already removes the
            stuffed dot from the lines it returns, so stripping it again
            would drop a real payload byte whenever an encoded line starts
            with "." -- hence the default is False.

    Returns:
        (decoded_bytes, yenc_header_lines)
    """
    control_prefixes = (b"=ybegin", b"=ypart", b"=yend")
    out = bytearray()
    yenc_headers: list[str] = []

    for line in lines:
        # --- undo NNTP dot-stuffing (raw wire lines only) ---
        if dot_stuffed and line.startswith(b"."):
            line = line[1:]

        # --- capture yEnc control lines ---
        if line.startswith(control_prefixes):
            yenc_headers.append(line.decode("latin-1"))
            continue

        # --- yEnc decode ---
        octets = iter(line)
        for c in octets:
            if c == 0x3D:  # '=' escapes the next octet (stored with +64)
                c = next(octets, None)
                if c is None:
                    # Dangling escape at end of line: nothing left to decode.
                    break
                c = (c - 64) & 0xFF
            # Every payload octet is stored with an offset of 42 (mod 256).
            out.append((c - 42) & 0xFF)

    return bytes(out), yenc_headers
# ------------------ DB ------------------
conn = get_conn()
cur = conn.cursor()
# --- load article numbers + subject ---
cur.execute("""
SELECT article_number, metadata->>'subject'
FROM articles
@@ -138,12 +66,11 @@ cur.execute("""
""", (GROUP, f"%{SUBJECT_KEY}%"))
rows = cur.fetchall()
print(f"📦 Found {len(rows)} parts")
# --- parse part number from subject ---
# parse part number from subject
part_re = re.compile(r"\((\d+)\s*/\s*\d+\)")
parts = []
for art_num, subject in rows:
m = part_re.search(subject or "")
if not m:
@@ -151,10 +78,9 @@ for art_num, subject in rows:
part_no = int(m.group(1))
parts.append((part_no, art_num))
# sort by part number (1..N)
parts.sort(key=lambda x: x[0])
print("🔌 Connecting to Eweka NNTP...")
# ------------------ NNTP ------------------
with nntplib.NNTP_SSL(
"news.eweka.nl",
563,
@@ -165,38 +91,19 @@ with nntplib.NNTP_SSL(
nntp.group(GROUP)
for idx, (part_no, art_num) in enumerate(parts, start=1):
out_path = os.path.join(OUT_DIR, f"part_{part_no:03d}.bin")
for part_no, art_num in parts:
bin_path = os.path.join(OUT_DIR, f"part_{part_no:03d}.bin")
hdr_path = os.path.join(OUT_DIR, f"part_{part_no:03d}.yEncHeader")
if os.path.exists(out_path):
print(f"⏭️ [{idx}/{len(parts)}] part {part_no} already exists, skipping")
if os.path.exists(bin_path) and os.path.exists(hdr_path):
continue
print(f"⬇️ [{idx}/{len(parts)}] Downloading part {part_no} (article {art_num})")
_, info = nntp.body(art_num)
resp, info = nntp.body(art_num)
decoded, headers = yenc_decode_and_extract_headers(info.lines)
print(f" BODY lines received: {len(info.lines)}")
# quick check of the first few lines
for ln in info.lines[:3]:
print(f" RAW:", ln[:80])
decoded = yenc_decode_lines(info.lines, debug=True)
print(f" RESULT bytes: {len(decoded)}")
with open(out_path, "wb") as f:
with open(bin_path, "wb") as f:
f.write(decoded)
print("🧩 Assembling final PDF...")
with open(FINAL_PDF, "wb") as out:
for part_no, _ in parts:
part_path = os.path.join(OUT_DIR, f"part_{part_no:03d}.bin")
with open(part_path, "rb") as pf:
out.write(pf.read())
print("🎉 DONE")
print(f"📄 Final PDF: {FINAL_PDF}")
with open(hdr_path, "w", encoding="utf-8") as f:
f.write("\n".join(headers))