diff --git a/23 ulozeni a slepeni.py b/23 ulozeni a slepeni.py index 382e63a..0d170bc 100644 --- a/23 ulozeni a slepeni.py +++ b/23 ulozeni a slepeni.py @@ -3,119 +3,11 @@ import re import nntplib from dotenv import load_dotenv from db import get_conn -def yenc_decode_lines(lines: list[bytes], debug=False) -> bytes: - """ - Decode yEnc from NNTP BODY lines. - Handles NNTP dot-stuffing and logs what happens. - """ - out = bytearray() - saw_ybegin = False - data_lines = 0 - - for idx, orig_line in enumerate(lines): - line = orig_line - - # --- NNTP dot-stuffing --- - if line.startswith(b".."): - if debug: - print(f" [dot] line {idx}: '..' -> '.'") - line = line[1:] - elif line.startswith(b"."): - if debug: - print(f" [dot] line {idx}: '.' removed") - line = line[1:] - - # --- yEnc control lines --- - if line.startswith(b"=ybegin"): - saw_ybegin = True - if debug: - print(f" [yEnc] =ybegin detected") - continue - - if line.startswith(b"=ypart"): - if debug: - print(f" [yEnc] =ypart detected") - continue - - if line.startswith(b"=yend"): - if debug: - print(f" [yEnc] =yend detected") - continue - - # --- actual yEnc data --- - data_lines += 1 - i = 0 - length = len(line) - - while i < length: - c = line[i] - - if c == ord('='): - i += 1 - if i >= length: - break - c = (line[i] - 64) & 0xFF - - out.append((c - 42) & 0xFF) - i += 1 - - if debug: - print(f" [yEnc] saw_ybegin={saw_ybegin}, decoded_data_lines={data_lines}") - print(f" [yEnc] decoded_bytes={len(out)}") - - if not saw_ybegin: - print("⚠️ WARNING: yEnc decoder used but =ybegin was NOT seen") - - return bytes(out) - - -# def yenc_decode_lines(lines: list[bytes]) -> bytes: -# """ -# Decode yEnc from NNTP BODY lines. -# Handles NNTP dot-stuffing correctly. 
def yenc_decode_and_extract_headers(
    lines: list[bytes],
    *,
    dot_stuffed: bool = False,
) -> tuple[bytes, list[str]]:
    """Decode yEnc article body lines and collect the yEnc control lines.

    Args:
        lines: Body lines without their line terminators, in article order.
        dot_stuffed: Set to True only when ``lines`` come straight off the
            wire and still carry NNTP dot-stuffing; one dot is then removed
            from every line that starts with ``..``.  Leave it False for
            lines returned by ``nntplib`` (``NNTP.body``), which has already
            removed the stuffing -- stripping again would delete a real data
            byte from every line whose payload begins with ``.``.

    Returns:
        ``(decoded_bytes, yenc_header_lines)`` where ``yenc_header_lines``
        holds every ``=ybegin`` / ``=ypart`` / ``=yend`` line, decoded as
        latin-1, in the order encountered.
    """
    control_prefixes = (b"=ybegin", b"=ypart", b"=yend")
    escape = ord("=")

    out = bytearray()
    yenc_headers: list[str] = []

    for line in lines:
        if dot_stuffed and line.startswith(b".."):
            line = line[1:]

        # Control lines carry metadata (name, size, offsets, CRC), not payload.
        if line.startswith(control_prefixes):
            yenc_headers.append(line.decode("latin-1"))
            continue

        # yEnc: every byte is shifted by 42; a byte following '=' is shifted
        # by a further 64.  The escape state never spans lines, so a dangling
        # '=' at the end of a line is dropped.
        escaped = False
        for byte in line:
            if escaped:
                out.append((byte - 64 - 42) & 0xFF)
                escaped = False
            elif byte == escape:
                escaped = True
            else:
                out.append((byte - 42) & 0xFF)

    return bytes(out), yenc_headers
os.path.exists(bin_path) and os.path.exists(hdr_path): continue - print(f"⬇️ [{idx}/{len(parts)}] Downloading part {part_no} (article {art_num})") + _, info = nntp.body(art_num) - resp, info = nntp.body(art_num) + decoded, headers = yenc_decode_and_extract_headers(info.lines) - print(f" BODY lines received: {len(info.lines)}") - - # rychlá kontrola prvních řádků - for ln in info.lines[:3]: - print(f" RAW:", ln[:80]) - - decoded = yenc_decode_lines(info.lines, debug=True) - - print(f" RESULT bytes: {len(decoded)}") - - with open(out_path, "wb") as f: + with open(bin_path, "wb") as f: f.write(decoded) - -print("🧩 Assembling final PDF...") - -with open(FINAL_PDF, "wb") as out: - for part_no, _ in parts: - part_path = os.path.join(OUT_DIR, f"part_{part_no:03d}.bin") - with open(part_path, "rb") as pf: - out.write(pf.read()) - -print("🎉 DONE") -print(f"📄 Final PDF: {FINAL_PDF}") + with open(hdr_path, "w", encoding="utf-8") as f: + f.write("\n".join(headers)) diff --git a/24 ULOŽENÍ A slepení2.PY b/24 ULOŽENÍ A slepení2.PY new file mode 100644 index 0000000..122ff29 --- /dev/null +++ b/24 ULOŽENÍ A slepení2.PY @@ -0,0 +1,93 @@ +import os +import re + +# ================= CONFIG ================= +PARTS_DIR = r"downloads/PC_Pro_2011-07" +OUTPUT_DIR = r"downloads" +# ========================================== + +ybegin_re = re.compile(r"size=(\d+)") +name_re = re.compile(r"name=(.+)") +ypart_re = re.compile(r"begin=(\d+)\s+end=(\d+)") + +parts = [] +total_size = None +final_name = None + +# --- 1. 
načti všechny yEnc hlavičky --- +for fname in os.listdir(PARTS_DIR): + if not fname.endswith(".yEncHeader"): + continue + + part_id = fname.replace(".yEncHeader", "") + hdr_path = os.path.join(PARTS_DIR, fname) + bin_path = os.path.join(PARTS_DIR, f"{part_id}.bin") + + if not os.path.exists(bin_path): + raise RuntimeError(f"Missing bin file for {fname}") + + with open(hdr_path, "r", encoding="utf-8", errors="replace") as f: + lines = f.read().splitlines() + + begin = end = None + + for line in lines: + if line.startswith("=ybegin"): + if total_size is None: + m = ybegin_re.search(line) + if m: + total_size = int(m.group(1)) + m = name_re.search(line) + if m: + final_name = m.group(1) + + if line.startswith("=ypart"): + m = ypart_re.search(line) + if not m: + raise RuntimeError(f"Cannot parse ypart in {fname}") + begin = int(m.group(1)) + end = int(m.group(2)) + + if begin is None or end is None: + raise RuntimeError(f"Missing begin/end in {fname}") + + parts.append({ + "bin": bin_path, + "begin": begin, + "end": end + }) + +# --- sanity checks --- +if total_size is None or final_name is None: + raise RuntimeError("Missing ybegin info (size/name)") + +output_path = os.path.join(OUTPUT_DIR, final_name) + +print(f"📄 Final file: {output_path}") +print(f"📦 Total size: {total_size} bytes") +print(f"🧩 Parts: {len(parts)}") + +# --- 2. alokuj cílový soubor --- +with open(output_path, "wb") as f: + f.truncate(total_size) + +# --- 3. 
def download_part(part_no: int, art_num: int) -> str:
    """Download one article body and store it as ``part_NNN.raw`` in RAW_DIR.

    Each call opens its own NNTP connection, so the function can be run from
    several worker threads at once.

    Args:
        part_no: Part number parsed from the subject; used for the file name.
        art_num: Article number to fetch from GROUP.

    Returns:
        A short status message saying whether the part was skipped or fetched.

    Raises:
        Whatever ``nntplib`` or the file system raises; nothing is caught
        here so the caller can count failures.
    """
    out_path = os.path.join(RAW_DIR, f"part_{part_no:03d}.raw")

    if os.path.exists(out_path):
        return f"⏭️ part {part_no} exists"

    with nntplib.NNTP_SSL(
        "news.eweka.nl",
        563,
        EWEKA_USER,
        EWEKA_PASS,
        readermode=True,
        timeout=120,
    ) as nntp:
        nntp.group(GROUP)
        _, info = nntp.body(art_num)

    # Write to a temporary name and rename when complete: a crash or a full
    # disk mid-write must not leave a truncated part_NNN.raw behind, because
    # the existence check above would then skip that part on every later run.
    # The article number keeps the temporary name unique even if two articles
    # claim the same part number.
    tmp_path = f"{out_path}.{art_num}.tmp"
    with open(tmp_path, "wb") as f:
        f.writelines(line + b"\n" for line in info.lines)
    os.replace(tmp_path, out_path)

    return f"⬇️ part {part_no} done"
def yenc_decode(lines: list[bytes], *, dot_stuffed: bool = False) -> bytes:
    """Decode yEnc data lines (control lines already removed) to raw bytes.

    Args:
        lines: yEnc data lines without line terminators.
        dot_stuffed: Set to True only for lines taken straight off the wire
            that still carry NNTP dot-stuffing; one dot is then removed from
            each line starting with ``..``.  Lines that were fetched through
            ``nntplib`` are already un-stuffed, and stripping a leading dot
            from them would delete a real data byte, so the default is False.

    Returns:
        The decoded payload.
    """
    escape = ord("=")
    out = bytearray()

    for line in lines:
        if dot_stuffed and line.startswith(b".."):
            line = line[1:]

        # Every byte is shifted by 42; the byte after '=' by a further 64.
        # A dangling '=' at the end of a line has nothing to escape and is
        # dropped instead of indexing past the end of the line.
        escaped = False
        for byte in line:
            if escaped:
                out.append((byte - 64 - 42) & 0xFF)
                escaped = False
            elif byte == escape:
                escaped = True
            else:
                out.append((byte - 42) & 0xFF)

    return bytes(out)
split parts +# -------------------------------------------------- +ybegin = None +ypart = None +yend = None +data_lines = [] + +for line in raw_lines: + if line.startswith(b"=ybegin"): + ybegin = line.decode(errors="replace") + continue + + if line.startswith(b"=ypart"): + ypart = line.decode(errors="replace") + continue + + if line.startswith(b"=yend"): + yend = line.decode(errors="replace") + continue + + data_lines.append(line) + +print("🧾 ybegin:", ybegin) +print("🧾 ypart :", ypart) +print("🧾 yend :", yend) +print(f"📦 DATA lines: {len(data_lines)}") + +# -------------------------------------------------- +# 3. decode yEnc DATA +# -------------------------------------------------- +decoded = yenc_decode(data_lines) + +print(f"📦 Decoded bytes: {len(decoded)}") + +# -------------------------------------------------- +# 4. extract pcrc32 +# -------------------------------------------------- +pcrc = None +if "pcrc32=" in yend: + pcrc = yend.split("pcrc32=")[1].strip() + +if not pcrc: + raise RuntimeError("No pcrc32 found") + +# -------------------------------------------------- +# 5. 
def _wrap_as_nntp_body(raw_data: bytes) -> bytes:
    """Rebuild the wire form of a ``222`` BODY response from stored body lines."""
    # Trim surrounding whitespace and work on bare LF while editing lines.
    normalized = raw_data.strip().replace(b"\r\n", b"\n")
    # NOTE(review): assumes the stored lines have NNTP dot-stuffing already
    # removed (as lines returned by nntplib do) -- confirm against the script
    # that wrote the file.  On the wire every line that starts with '.' must
    # be sent with that dot doubled; otherwise an NNTP-aware decoder drops the
    # dot or treats a lone '.' line as the end of the article.
    stuffed = b"\n".join(
        b"." + line if line.startswith(b".") else line
        for line in normalized.split(b"\n")
    )
    return b"222 0 \r\n" + stuffed.replace(b"\n", b"\r\n") + b"\r\n.\r\n"


def _safe_output_name(file_name) -> str:
    """Reduce a file name taken from the article to a bare, safe base name."""
    # The name comes from the yEnc header of a downloaded article, so it must
    # not be able to escape OUTPUT_DIR via separators or '..'.
    candidate = os.path.basename((file_name or "").replace("\\", "/"))
    if candidate in ("", ".", ".."):
        return "decoded_part.bin"
    return candidate


def ultimate_bomba_decoder():
    """Decode the yEnc part stored in INPUT_FILE with sabctools and save it.

    Reads the raw article body, extracts the expected size and part CRC from
    its ``=yend`` line, feeds the body to ``sabctools.Decoder`` wrapped as an
    NNTP response, prints an integrity report, and writes the decoded data
    into OUTPUT_DIR.  Returns None; all results are reported via ``print``.
    """
    if not os.path.exists(INPUT_FILE):
        print(f"❌ Soubor nenalezen: {INPUT_FILE}")
        return

    print(f"📖 Načítám soubor...")
    with open(INPUT_FILE, "rb") as f:
        raw_data = f.read()

    # 1. Pull the expected size and part CRC out of the =yend line.
    #    [0-9] is used instead of \d so the non-raw bytes pattern contains no
    #    backslash escape.
    yend_match = re.search(b"=yend size=([0-9]+).*pcrc32=([0-9a-fA-F]+)", raw_data)
    expected_size = 0
    expected_crc_str = ""
    if yend_match:
        expected_size = int(yend_match.group(1))
        expected_crc_str = yend_match.group(2).decode().lower()
        print(f"🎯 Metadata nalezena: Očekávaná velikost={expected_size}, Očekávané CRC={expected_crc_str}")

    # 2.-3. Normalise line endings to CRLF and wrap the body in an NNTP envelope.
    wrapped = _wrap_as_nntp_body(raw_data)

    # 4. Decode through the sabctools streaming decoder.
    decoder = sabctools.Decoder(len(wrapped))
    buf = io.BytesIO(wrapped)
    n = buf.readinto(decoder)
    decoder.process(n)

    response = next(decoder, None)

    if response and response.data:
        # 5. Integrity check with our own CRC32 of the decoded data.
        vypoctene_crc_int = binascii.crc32(response.data)
        vypoctene_crc_str = f"{vypoctene_crc_int:08x}".lower()

        real_size = len(response.data)

        print("-" * 40)
        print(f"📊 Kontrola integrity:")
        print(f" Skutečná velikost: {real_size} (Očekáváno: {expected_size})")
        print(f" Vypočítané CRC: {vypoctene_crc_str}")
        print(f" Očekávané CRC: {expected_crc_str}")

        # Compare numerically: some posters write the CRC without leading
        # zeros, which a plain string comparison would report as a mismatch.
        crc_ok = bool(expected_crc_str) and int(expected_crc_str, 16) == vypoctene_crc_int
        if crc_ok:
            print("✅ BINGO! Soubor je 100% v pořádku.")
        else:
            print("⚠️ POZOR: CRC nesouhlasí, data mohou být poškozena.")

        # 6. Save the decoded data under the (sanitised) name from the header.
        os.makedirs(OUTPUT_DIR, exist_ok=True)

        out_name = _safe_output_name(response.file_name)
        out_path = os.path.join(OUTPUT_DIR, out_name)

        with open(out_path, "wb") as f_out:
            f_out.write(response.data)

        print(f"💾 Uloženo do: {out_path}")
        print("-" * 40)
    else:
        print("❌ Chyba: Dekodér nevrátil žádná data. Zkontrolujte, zda je soubor kompletní.")


if __name__ == "__main__":
    ultimate_bomba_decoder()