# File-viewer header (not code): Python, 103 lines, 3.4 KiB
import binascii
import io
import nntplib
import os
import threading
from concurrent.futures import ThreadPoolExecutor

import sabctools
from dotenv import load_dotenv

from db import get_conn

# --- Configuration (kept as is) ---
# load_dotenv() runs before the credentials below are read from the
# environment.
load_dotenv()

# Usenet credentials; each is None when the variable is not set.
EWEKA_USER = os.environ.get("EWEKA_USER")
EWEKA_PASS = os.environ.get("EWEKA_PASS")
EWEKA_HOST = "news.eweka.nl"

NEWSGROUP = "alt.binaries.e-book.magazines"
# SQL LIKE pattern matched against the stored article subject.
SUBJECT_FILTER = "%PC Pro 2011-07.pdf%"
# NOTE(review): the subject filter names a "PC Pro" issue while the output
# file is named after a different magazine -- confirm this is intentional.
FINAL_PATH = os.path.join("FastLane/output", "Fast_Lane_Biker_Dec_2011.pdf")

# Thread-pool size; also the divisor used to split the article list.
MAX_WORKERS = 50
# Byte stride between parts: part `index` is written at index * PART_FIXED_SIZE.
PART_FIXED_SIZE = 384000
# Shared lock taken around each seek+write on the output file.
file_lock = threading.Lock()
def _fetch_decoded_part(server, art_num):
    """Fetch one article body over ``server`` and return its decoded payload.

    Args:
        server: an open ``nntplib.NNTP`` connection with the group selected.
        art_num: article number to request (sent as a string).

    Returns:
        The ``data`` of the first result produced by the sabctools decoder,
        or ``None`` when the decoder yields nothing or the data is empty.
    """
    _resp, info = server.body(str(art_num))

    # Rebuild a complete wire-format response for the decoder: double any
    # leading "." (NNTP dot-stuffing), join the lines with CRLF, and add a
    # status line plus the ".\r\n" terminator.
    stuffed = [
        b"." + line if line.startswith(b".") else line
        for line in info.lines
    ]
    wrapped = b"222 0 <id>\r\n" + b"\r\n".join(stuffed) + b"\r\n.\r\n"

    decoder = sabctools.Decoder(len(wrapped))
    # readinto() fills the decoder's buffer and returns the byte count, which
    # is what process() is given.
    decoder.process(io.BytesIO(wrapped).readinto(decoder))
    res = next(decoder, None)
    if res and res.data:
        return res.data
    return None


def download_chunk(articles_subset):
    """Download a subset of articles over ONE persistent NNTP connection.

    Intended to run inside a single worker thread: it logs in once, then
    fetches every article of the subset over that same connection and writes
    each decoded part into ``FINAL_PATH`` at ``index * PART_FIXED_SIZE``.

    Args:
        articles_subset: list of ``(index, art_num)`` tuples. An empty (or
            otherwise falsy) subset is a no-op.

    Returns:
        None. Failures are printed rather than raised: a per-article error
        skips only that article, while a connection/setup error aborts the
        whole subset.
    """
    if not articles_subset:
        return

    try:
        # Log in only once, up front. The context manager sends QUIT and
        # closes the socket even if selecting the group or opening the output
        # file fails; previously quit() was skipped on those paths and the
        # connection leaked.
        with nntplib.NNTP(EWEKA_HOST, user=EWEKA_USER, password=EWEKA_PASS) as server:
            server.group(NEWSGROUP)

            # Opened for in-place update ("r+b"), so the file must already
            # exist. Each call has its own handle on it.
            with open(FINAL_PATH, "r+b") as f_out:
                for index, art_num in articles_subset:
                    try:
                        data = _fetch_decoded_part(server, art_num)
                        if data:
                            # Write at this part's fixed offset; the shared
                            # lock serializes seek+write pairs across threads.
                            with file_lock:
                                f_out.seek(index * PART_FIXED_SIZE)
                                f_out.write(data)
                            print(f" [OK] Part {art_num} (Vlákno {threading.current_thread().name})")
                    except Exception as e:
                        # Best effort: report and carry on with the next article.
                        print(f" [!] Chyba u článku {art_num}: {e}")
    except Exception as e:
        print(f" [!!!] Vlákno se nemohlo připojit: {e}")
def final_precision_downloader():
    """Look up the matching articles and download them in parallel.

    Steps:
      1. Query the ``articles`` table for article numbers in ``NEWSGROUP``
         whose subject matches ``SUBJECT_FILTER``, ordered by article number.
      2. Create ``FINAL_PATH`` (and its directory) and pre-size the file.
      3. Split the ``(index, article_number)`` tasks into at most
         ``MAX_WORKERS`` contiguous subsets and hand each one to
         ``download_chunk`` on a thread pool.

    Returns:
        None. Returns early, without touching the output file, when the
        query yields no rows.
    """
    conn = get_conn()
    try:
        cur = conn.cursor()
        try:
            cur.execute(
                """
                SELECT article_number FROM articles
                WHERE newsgroup = %s AND metadata->>'subject' LIKE %s
                ORDER BY article_number;
                """,
                (NEWSGROUP, SUBJECT_FILTER),
            )
            articles = cur.fetchall()
        finally:
            # Close the cursor and connection even when the query fails.
            cur.close()
    finally:
        conn.close()

    if not articles:
        return

    # Prepare the output file (space allocation). The directory is created
    # first so that open(..., "wb") does not fail on a fresh checkout.
    out_dir = os.path.dirname(FINAL_PATH)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    # Approximate: assumes every part is exactly PART_FIXED_SIZE bytes.
    # TODO: replace with the exact total-size calculation.
    total_size = len(articles) * PART_FIXED_SIZE
    with open(FINAL_PATH, "wb") as f:
        f.truncate(total_size)

    # Split the work: ceil-divide the task list into at most MAX_WORKERS
    # contiguous subsets, one per persistent connection.
    all_tasks = [(i, art[0]) for i, art in enumerate(articles)]
    chunk_size = (len(all_tasks) + MAX_WORKERS - 1) // MAX_WORKERS
    subsets = [
        all_tasks[start:start + chunk_size]
        for start in range(0, len(all_tasks), chunk_size)
    ]

    print(f"🚀 Startuji stahování s {len(subsets)} trvalými spoji...")

    # Leaving the `with` block waits for every submitted subset to finish.
    with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
        executor.map(download_chunk, subsets)

    print("🏁 Hotovo.")
# Script entry point: start the download only when this file is run directly,
# not when it is imported.
if __name__ == "__main__":
    final_precision_downloader()