This commit is contained in:
2025-12-27 17:24:30 +01:00
parent ea485bad29
commit f8dc6566bc
7 changed files with 542 additions and 0 deletions

202
23 ulozeni a slepeni.py Normal file
View File

@@ -0,0 +1,202 @@
import os
import re
import nntplib
from dotenv import load_dotenv
from db import get_conn
# Lookup table for ordinary (non-escaped) yEnc bytes: decoded = (encoded - 42) mod 256.
_YENC_PLAIN_TABLE = bytes((value - 42) & 0xFF for value in range(256))


def yenc_decode_lines(lines: list[bytes], debug=False, *, dot_unstuff: bool = False) -> bytes:
    """
    Decode a yEnc payload from the body lines of an NNTP article.

    Args:
        lines: Body lines without their line terminators.
        debug: When true, print a trace of control lines and final counters.
        dot_unstuff: When true, strip one leading "." from every line that
            starts with a dot (NNTP dot-unstuffing). Leave this off for lines
            returned by ``nntplib`` (``NNTP.body()`` etc.): nntplib has already
            undone dot-stuffing, so stripping again would drop a real payload
            byte whenever an encoded line legitimately starts with ".".
            Turn it on only for raw wire lines.

    Returns:
        The decoded bytes of all payload lines, concatenated in order.

    Notes:
        * ``=ybegin`` / ``=ypart`` / ``=yend`` control lines are never decoded.
        * If a ``=ybegin`` line is present, only lines between ``=ybegin`` and
          ``=yend`` count as payload; anything before the first header or after
          a trailer is ignored.
        * If no ``=ybegin`` line is seen at all, every non-control line is
          decoded (best effort) and a warning is printed.
        * A dangling "=" at the very end of a line is dropped.
        * No size or CRC verification is performed.
    """
    out = bytearray()
    saw_ybegin = False
    in_payload = False
    data_lines = 0

    for idx, line in enumerate(lines):
        # --- optional NNTP dot-unstuffing (raw wire lines only) ---
        if dot_unstuff and line.startswith(b"."):
            if debug:
                if line.startswith(b".."):
                    print(f" [dot] line {idx}: '..' -> '.'")
                else:
                    print(f" [dot] line {idx}: '.' removed")
            line = line[1:]

        # --- yEnc control lines ---
        if line.startswith(b"=ybegin"):
            if not saw_ybegin:
                # Whatever was decoded before the first header was preamble
                # text, not payload.
                out.clear()
                data_lines = 0
            saw_ybegin = True
            in_payload = True
            if debug:
                print(" [yEnc] =ybegin detected")
            continue
        if line.startswith(b"=ypart"):
            if debug:
                print(" [yEnc] =ypart detected")
            continue
        if line.startswith(b"=yend"):
            in_payload = False
            if debug:
                print(" [yEnc] =yend detected")
            continue

        # Text that follows a =yend trailer is not part of the payload.
        if saw_ybegin and not in_payload:
            continue

        # --- actual yEnc data ---
        data_lines += 1
        pos = 0
        length = len(line)
        while pos < length:
            esc = line.find(b"=", pos)
            if esc < 0:
                # No further escapes: translate the rest of the line in one go.
                out += line[pos:].translate(_YENC_PLAIN_TABLE)
                break
            out += line[pos:esc].translate(_YENC_PLAIN_TABLE)
            if esc + 1 >= length:
                # Dangling escape character at end of line: nothing to decode.
                break
            # Escaped byte: undo the extra +64 as well as the regular +42.
            out.append((line[esc + 1] - 64 - 42) & 0xFF)
            pos = esc + 2

    if debug:
        print(f" [yEnc] saw_ybegin={saw_ybegin}, decoded_data_lines={data_lines}")
        print(f" [yEnc] decoded_bytes={len(out)}")
    if not saw_ybegin:
        print("⚠️ WARNING: yEnc decoder used but =ybegin was NOT seen")
    return bytes(out)
# def yenc_decode_lines(lines: list[bytes]) -> bytes:
# """
# Decode yEnc from NNTP BODY lines.
# Handles NNTP dot-stuffing correctly.
# """
# out = bytearray()
#
# for line in lines:
# # --- undo NNTP dot-stuffing ---
# if line.startswith(b".."):
# line = line[1:]
# elif line.startswith(b"."):
# line = line[1:]
#
# # --- skip yEnc control lines ---
# if line.startswith(b"=ybegin"):
# continue
# if line.startswith(b"=ypart"):
# continue
# if line.startswith(b"=yend"):
# continue
#
# i = 0
# length = len(line)
#
# while i < length:
# c = line[i]
#
# if c == ord('='): # yEnc escape
# i += 1
# if i >= length:
# break
# c = (line[i] - 64) & 0xFF
#
# out.append((c - 42) & 0xFF)
# i += 1
#
# return bytes(out)
# ================== CONFIG ==================
# Newsgroup that is queried in the database and selected on the NNTP server.
GROUP = "alt.binaries.e-book.magazines"
# Text the article subject has to contain (wrapped in % for a SQL LIKE match).
SUBJECT_KEY = "PC Pro 2011-07.pdf"
# Folder for the individually decoded parts, and path of the joined result.
OUT_DIR = r"downloads/PC_Pro_2011-07"
FINAL_PDF = r"downloads/PC_Pro_2011-07.pdf"
# ============================================

# Load environment variables via python-dotenv, then read the NNTP credentials.
# Either value is None when the variable is not set.
load_dotenv()
EWEKA_USER, EWEKA_PASS = os.getenv("EWEKA_USER"), os.getenv("EWEKA_PASS")

# Make sure the parts folder exists before anything is written into it.
os.makedirs(OUT_DIR, exist_ok=True)
print("🔌 Connecting to PostgreSQL...")
conn = get_conn()
try:
    cur = conn.cursor()
    try:
        # --- load article numbers + subject ---
        cur.execute("""
SELECT article_number, metadata->>'subject'
FROM articles
WHERE newsgroup = %s
AND metadata->>'subject' LIKE %s
ORDER BY article_number
""", (GROUP, f"%{SUBJECT_KEY}%"))
        rows = cur.fetchall()
    finally:
        cur.close()
finally:
    # The database is only needed for this one query; release it before the
    # (potentially long) download starts.
    conn.close()

print(f"📦 Found {len(rows)} parts")

# --- parse part number from subject ---
# Matches the "(part/total)" marker in the subject, e.g. "(12/57)".
part_re = re.compile(r"\((\d+)\s*/\s*\d+\)")

# part number -> article number. Rows arrive ordered by article number, so for
# a part that was posted more than once the lowest article number wins.
# Without this, a repeated part number would be appended twice when the parts
# are joined.
article_by_part = {}
for art_num, subject in rows:
    m = part_re.search(subject or "")
    if not m:
        raise RuntimeError(f"Cannot parse part number from subject: {subject}")
    part_no = int(m.group(1))
    if part_no in article_by_part:
        print(f"⚠️ Duplicate part {part_no} (article {art_num}) ignored")
        continue
    article_by_part[part_no] = art_num

# list of (part_no, art_num) tuples sorted by part number (1..N)
parts = sorted(article_by_part.items())
print("🔌 Connecting to Eweka NNTP...")
with nntplib.NNTP_SSL(
    "news.eweka.nl",
    563,
    EWEKA_USER,
    EWEKA_PASS,
    readermode=True
) as nntp:
    nntp.group(GROUP)

    for idx, (part_no, art_num) in enumerate(parts, start=1):
        out_path = os.path.join(OUT_DIR, f"part_{part_no:03d}.bin")

        # Parts that are already on disk are not fetched again, which makes
        # the script resumable.
        if os.path.exists(out_path):
            print(f"⏭️ [{idx}/{len(parts)}] part {part_no} already exists, skipping")
            continue

        print(f"⬇️ [{idx}/{len(parts)}] Downloading part {part_no} (article {art_num})")
        resp, info = nntp.body(art_num)
        print(f" BODY lines received: {len(info.lines)}")

        # quick look at the first few raw lines
        for ln in info.lines[:3]:
            print(" RAW:", ln[:80])

        decoded = yenc_decode_lines(info.lines, debug=True)
        print(f" RESULT bytes: {len(decoded)}")

        # Write to a temporary name and rename afterwards. An interrupted run
        # then never leaves a truncated part_XXX.bin behind that the
        # "already exists" check above would skip on the next run.
        tmp_path = out_path + ".tmp"
        with open(tmp_path, "wb") as f:
            f.write(decoded)
        os.replace(tmp_path, out_path)
print("🧩 Assembling final PDF...")

# Paths of the decoded part files, in the order given by `parts`.
segment_paths = [
    os.path.join(OUT_DIR, f"part_{number:03d}.bin")
    for number, _article in parts
]

# Concatenate the part files into the final document.
with open(FINAL_PDF, "wb") as pdf:
    for segment_path in segment_paths:
        with open(segment_path, "rb") as segment:
            pdf.write(segment.read())

print("🎉 DONE")
print(f"📄 Final PDF: {FINAL_PDF}")