z230
This commit is contained in:
@@ -3,119 +3,11 @@ import re
|
||||
import nntplib
|
||||
from dotenv import load_dotenv
|
||||
from db import get_conn
|
||||
def yenc_decode_lines(lines: list[bytes], debug=False, *, dot_stuffed=True) -> bytes:
    """
    Decode the yEnc payload carried by NNTP BODY lines.

    Only lines between a ``=ybegin`` control line and the following
    ``=yend`` are treated as encoded data, so text before or after the
    yEnc envelope does not end up in the output.  When no ``=ybegin``
    line is present at all, every non-control line is decoded (the
    historical behaviour) and a warning is printed.

    Args:
        lines: Article body lines as ``bytes``.  Trailing CR/LF bytes are
            ignored; yEnc always escapes CR and LF inside the payload.
        debug: When true, print a trace of dot-unstuffing, control lines
            and decode statistics.
        dot_stuffed: When true (default), remove one leading ``.`` from
            every line that starts with one, undoing NNTP dot-stuffing.
            Pass ``False`` for lines that were already un-stuffed.

    Returns:
        The decoded bytes of all data lines, concatenated in order.
    """
    # NOTE(review): nntplib appears to undo dot-stuffing itself before it
    # hands lines back; if so, callers feeding nntplib output should pass
    # dot_stuffed=False to avoid losing a genuine leading '.' -- confirm.

    # yEnc adds 42 (mod 256) to every byte; this table reverses that for a
    # whole run of unescaped bytes in one C-level pass.
    plain_table = bytes((value - 42) & 0xFF for value in range(256))

    # --- pass 1: undo NNTP dot-stuffing and drop line terminators ---
    cleaned = []
    for idx, raw in enumerate(lines):
        line = raw
        if dot_stuffed and line.startswith(b"."):
            if debug:
                if line.startswith(b".."):
                    print(f" [dot] line {idx}: '..' -> '.'")
                else:
                    print(f" [dot] line {idx}: '.' removed")
            line = line[1:]
        cleaned.append(line.rstrip(b"\r\n"))

    saw_ybegin = any(line.startswith(b"=ybegin") for line in cleaned)

    # Without any =ybegin we cannot locate the envelope, so fall back to
    # decoding every non-control line.
    in_payload = not saw_ybegin
    out = bytearray()
    data_lines = 0

    # --- pass 2: skip control lines, decode the payload ---
    for line in cleaned:
        if line.startswith(b"=ybegin"):
            in_payload = True
            if debug:
                print(" [yEnc] =ybegin detected")
            continue

        if line.startswith(b"=ypart"):
            if debug:
                print(" [yEnc] =ypart detected")
            continue

        if line.startswith(b"=yend"):
            # Close the envelope (unless we are in the no-=ybegin fallback).
            in_payload = not saw_ybegin
            if debug:
                print(" [yEnc] =yend detected")
            continue

        if not in_payload:
            continue

        data_lines += 1
        pos = 0
        end = len(line)
        while pos < end:
            esc = line.find(b"=", pos)
            if esc < 0:
                out += line[pos:].translate(plain_table)
                break
            out += line[pos:esc].translate(plain_table)
            if esc + 1 >= end:
                # Dangling escape character at end of line: nothing to decode.
                break
            # Escaped byte: undo the extra +64 and then the regular +42.
            out.append((line[esc + 1] - 64 - 42) & 0xFF)
            pos = esc + 2

    if debug:
        print(f" [yEnc] saw_ybegin={saw_ybegin}, decoded_data_lines={data_lines}")
        print(f" [yEnc] decoded_bytes={len(out)}")

    if not saw_ybegin:
        print("⚠️ WARNING: yEnc decoder used but =ybegin was NOT seen")

    return bytes(out)
|
||||
|
||||
|
||||
# def yenc_decode_lines(lines: list[bytes]) -> bytes:
|
||||
# """
|
||||
# Decode yEnc from NNTP BODY lines.
|
||||
# Handles NNTP dot-stuffing correctly.
|
||||
# """
|
||||
# out = bytearray()
|
||||
#
|
||||
# for line in lines:
|
||||
# # --- undo NNTP dot-stuffing ---
|
||||
# if line.startswith(b".."):
|
||||
# line = line[1:]
|
||||
# elif line.startswith(b"."):
|
||||
# line = line[1:]
|
||||
#
|
||||
# # --- skip yEnc control lines ---
|
||||
# if line.startswith(b"=ybegin"):
|
||||
# continue
|
||||
# if line.startswith(b"=ypart"):
|
||||
# continue
|
||||
# if line.startswith(b"=yend"):
|
||||
# continue
|
||||
#
|
||||
# i = 0
|
||||
# length = len(line)
|
||||
#
|
||||
# while i < length:
|
||||
# c = line[i]
|
||||
#
|
||||
# if c == ord('='): # yEnc escape
|
||||
# i += 1
|
||||
# if i >= length:
|
||||
# break
|
||||
# c = (line[i] - 64) & 0xFF
|
||||
#
|
||||
# out.append((c - 42) & 0xFF)
|
||||
# i += 1
|
||||
#
|
||||
# return bytes(out)
|
||||
|
||||
|
||||
|
||||
|
||||
# ================== CONFIG ==================
|
||||
GROUP = "alt.binaries.e-book.magazines"
|
||||
SUBJECT_KEY = "PC Pro 2011-07.pdf"
|
||||
OUT_DIR = r"downloads/PC_Pro_2011-07"
|
||||
FINAL_PDF = r"downloads/PC_Pro_2011-07.pdf"
|
||||
# ============================================
|
||||
|
||||
load_dotenv()
|
||||
@@ -124,11 +16,47 @@ EWEKA_PASS = os.getenv("EWEKA_PASS")
|
||||
|
||||
os.makedirs(OUT_DIR, exist_ok=True)
|
||||
|
||||
print("🔌 Connecting to PostgreSQL...")
|
||||
|
||||
def yenc_decode_and_extract_headers(
    lines: list[bytes], *, dot_stuffed=True
) -> tuple[bytes, list[str]]:
    """
    Decode yEnc BODY lines and collect the yEnc control lines.

    Only lines between a ``=ybegin`` control line and the following
    ``=yend`` are treated as encoded data, so text before or after the
    yEnc envelope does not end up in the output.  When no ``=ybegin``
    line is present at all, every non-control line is decoded (the
    historical behaviour).

    Args:
        lines: Article body lines as ``bytes``.  Trailing CR/LF bytes are
            ignored; yEnc always escapes CR and LF inside the payload.
        dot_stuffed: When true (default), remove one leading ``.`` from
            every line that starts with one, undoing NNTP dot-stuffing.
            Pass ``False`` for lines that were already un-stuffed.

    Returns:
        ``(decoded_bytes, yenc_header_lines)`` where the second item holds
        every ``=ybegin`` / ``=ypart`` / ``=yend`` line, in order, decoded
        as latin-1 text.
    """
    # NOTE(review): nntplib appears to undo dot-stuffing itself before it
    # hands lines back; if so, callers feeding nntplib output should pass
    # dot_stuffed=False to avoid losing a genuine leading '.' -- confirm.

    control_prefixes = (b"=ybegin", b"=ypart", b"=yend")

    # yEnc adds 42 (mod 256) to every byte; this table reverses that for a
    # whole run of unescaped bytes in one C-level pass.
    plain_table = bytes((value - 42) & 0xFF for value in range(256))

    # --- undo NNTP dot-stuffing and drop line terminators ---
    cleaned = []
    for raw in lines:
        line = raw[1:] if dot_stuffed and raw.startswith(b".") else raw
        cleaned.append(line.rstrip(b"\r\n"))

    has_envelope = any(line.startswith(b"=ybegin") for line in cleaned)

    # Without any =ybegin we cannot locate the envelope, so fall back to
    # decoding every non-control line.
    in_payload = not has_envelope
    out = bytearray()
    yenc_headers = []

    for line in cleaned:
        # --- capture yEnc control lines ---
        if line.startswith(control_prefixes):
            yenc_headers.append(line.decode("latin-1"))
            if line.startswith(b"=ybegin"):
                in_payload = True
            elif line.startswith(b"=yend"):
                # Close the envelope (unless in the no-=ybegin fallback).
                in_payload = not has_envelope
            continue

        if not in_payload:
            continue

        # --- yEnc decode ---
        pos = 0
        end = len(line)
        while pos < end:
            esc = line.find(b"=", pos)
            if esc < 0:
                out += line[pos:].translate(plain_table)
                break
            out += line[pos:esc].translate(plain_table)
            if esc + 1 >= end:
                # Dangling escape character at end of line: nothing to decode.
                break
            # Escaped byte: undo the extra +64 and then the regular +42.
            out.append((line[esc + 1] - 64 - 42) & 0xFF)
            pos = esc + 2

    return bytes(out), yenc_headers
|
||||
|
||||
|
||||
# ------------------ DB ------------------
|
||||
conn = get_conn()
|
||||
cur = conn.cursor()
|
||||
|
||||
# --- load article numbers + subject ---
|
||||
cur.execute("""
|
||||
SELECT article_number, metadata->>'subject'
|
||||
FROM articles
|
||||
@@ -138,12 +66,11 @@ cur.execute("""
|
||||
""", (GROUP, f"%{SUBJECT_KEY}%"))
|
||||
|
||||
rows = cur.fetchall()
|
||||
print(f"📦 Found {len(rows)} parts")
|
||||
|
||||
# --- parse part number from subject ---
|
||||
# parse part number from subject
|
||||
part_re = re.compile(r"\((\d+)\s*/\s*\d+\)")
|
||||
|
||||
parts = []
|
||||
|
||||
for art_num, subject in rows:
|
||||
m = part_re.search(subject or "")
|
||||
if not m:
|
||||
@@ -151,10 +78,9 @@ for art_num, subject in rows:
|
||||
part_no = int(m.group(1))
|
||||
parts.append((part_no, art_num))
|
||||
|
||||
# sort by part number (1..N)
|
||||
parts.sort(key=lambda x: x[0])
|
||||
|
||||
print("🔌 Connecting to Eweka NNTP...")
|
||||
# ------------------ NNTP ------------------
|
||||
with nntplib.NNTP_SSL(
|
||||
"news.eweka.nl",
|
||||
563,
|
||||
@@ -165,38 +91,19 @@ with nntplib.NNTP_SSL(
|
||||
|
||||
nntp.group(GROUP)
|
||||
|
||||
for idx, (part_no, art_num) in enumerate(parts, start=1):
|
||||
out_path = os.path.join(OUT_DIR, f"part_{part_no:03d}.bin")
|
||||
for part_no, art_num in parts:
|
||||
bin_path = os.path.join(OUT_DIR, f"part_{part_no:03d}.bin")
|
||||
hdr_path = os.path.join(OUT_DIR, f"part_{part_no:03d}.yEncHeader")
|
||||
|
||||
if os.path.exists(out_path):
|
||||
print(f"⏭️ [{idx}/{len(parts)}] part {part_no} already exists, skipping")
|
||||
if os.path.exists(bin_path) and os.path.exists(hdr_path):
|
||||
continue
|
||||
|
||||
print(f"⬇️ [{idx}/{len(parts)}] Downloading part {part_no} (article {art_num})")
|
||||
_, info = nntp.body(art_num)
|
||||
|
||||
resp, info = nntp.body(art_num)
|
||||
decoded, headers = yenc_decode_and_extract_headers(info.lines)
|
||||
|
||||
print(f" BODY lines received: {len(info.lines)}")
|
||||
|
||||
# quick check of the first few lines
|
||||
for ln in info.lines[:3]:
|
||||
print(f" RAW:", ln[:80])
|
||||
|
||||
decoded = yenc_decode_lines(info.lines, debug=True)
|
||||
|
||||
print(f" RESULT bytes: {len(decoded)}")
|
||||
|
||||
with open(out_path, "wb") as f:
|
||||
with open(bin_path, "wb") as f:
|
||||
f.write(decoded)
|
||||
|
||||
|
||||
print("🧩 Assembling final PDF...")
|
||||
|
||||
with open(FINAL_PDF, "wb") as out:
|
||||
for part_no, _ in parts:
|
||||
part_path = os.path.join(OUT_DIR, f"part_{part_no:03d}.bin")
|
||||
with open(part_path, "rb") as pf:
|
||||
out.write(pf.read())
|
||||
|
||||
print("🎉 DONE")
|
||||
print(f"📄 Final PDF: {FINAL_PDF}")
|
||||
with open(hdr_path, "w", encoding="utf-8") as f:
|
||||
f.write("\n".join(headers))
|
||||
|
||||
Reference in New Issue
Block a user