#!/usr/bin/env python3
"""
Web scraper for splcr.cz — finds all downloadable documents
(PDF, DOC, XLS, PPT, etc.)

The crawler walks every same-domain HTML page reachable from the base URL
(plus a few seed pages that are not linked from the homepage), records every
link that points at a document, and writes the result to CSV and JSON.
"""

import csv
import json
import logging
import os
import sys
from collections import deque
from pathlib import Path
from urllib.parse import urljoin, urlparse

from dotenv import load_dotenv
import requests
from bs4 import BeautifulSoup

# Setup logging with UTF-8 encoding for Czech characters
file_handler = logging.FileHandler('log.txt', encoding='utf-8')
file_handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))
stream_handler = logging.StreamHandler(sys.stdout)
stream_handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))

logger = logging.getLogger()
logger.setLevel(logging.INFO)
logger.addHandler(file_handler)
logger.addHandler(stream_handler)

# Load environment variables
load_dotenv('.env')

USERNAME = os.getenv('SPLCR_USERNAME')
PASSWORD = os.getenv('SPLCR_PASSWORD')
BASE_URL = os.getenv('BASE_URL', 'https://splcr.cz/')

# Document extensions we're looking for
DOC_EXTENSIONS = {'.pdf', '.doc', '.docx', '.xls', '.xlsx', '.ppt', '.pptx'}


class SplcrScraper:
    """Breadth-first crawler that collects document links from one site.

    Attributes:
        base_url: Site root without a trailing slash.
        session: ``requests.Session`` shared by the login and all page fetches.
        visited: URLs that have already been fetched (or attempted).
        documents: One dict per unique document found, with the keys
            ``url``, ``title``, ``type`` and ``found_on``.
        queue: URLs still waiting to be fetched.
    """

    def __init__(self, base_url, username=None, password=None):
        """Prepare the crawl queue and optionally log in.

        Args:
            base_url: Root URL of the site; a trailing slash is stripped.
            username: Login name; login is attempted only when both
                ``username`` and ``password`` are truthy.
            password: Login password.
        """
        self.base_url = base_url.rstrip('/')
        self.session = requests.Session()
        self.visited = set()
        self.documents = []
        # Mirror of the URLs in self.documents for O(1) duplicate checks.
        self._document_urls = set()

        # Seed URLs — orphaned pages that don't link from homepage
        seed_urls = [
            f"{self.base_url}/rubriky/pro-cleny-spl-cr/appel/",  # Archive of all appel years
        ]
        # Dynamically add appel year URLs (2015-2025)
        seed_urls.extend(
            f"{self.base_url}/appel-rocnik-{year}/" for year in range(2015, 2026)
        )

        self.queue = deque([self.base_url] + seed_urls)
        # Every URL ever placed on the queue, so a page linked from many
        # places is queued only once.
        self._enqueued = set(self.queue)

        # Login if credentials provided
        if username and password:
            self._login(username, password)

    def _login(self, username, password):
        """Attempt to login with provided credentials.

        The outcome is only logged: a failed or unrecognised login does not
        raise, so the crawl continues unauthenticated.
        """
        logger.info(f"Attempting login as {username}...")

        login_url = self.base_url

        # Login data matching the form
        login_data = {
            'log': username,
            'pwd': password,
            # Built from base_url so a non-default BASE_URL is honoured.
            'redirect_to': f"{self.base_url}/rubriky/pro-cleny-spl-cr/",
            'a': 'login',
            'rememberme': 'forever',
            'Submit': 'Přihlásit se'
        }

        try:
            resp = self.session.post(login_url, data=login_data, timeout=10, allow_redirects=True)
        except requests.RequestException as e:
            logger.error(f"Login failed: {e}")
            return

        # Check if login was successful by looking for logout link or checking redirect
        body = resp.text.lower()
        if 'odhlášení' in body or 'logout' in body or 'pro-cleny' in resp.url:
            logger.info("Login successful!")
        else:
            logger.warning("Login response received - checking if authenticated...")
            logger.info(f"Redirected to: {resp.url}")

    def _get_domain(self, url):
        """Extract domain from URL"""
        return urlparse(url).netloc

    def _is_valid_url(self, url):
        """Return True when ``url`` is http(s) and on the same domain as base_url."""
        try:
            parsed = urlparse(url)
        except ValueError:
            # urlparse rejects some malformed URLs (e.g. a broken IPv6 host).
            return False
        return (parsed.scheme in ('http', 'https')
                and parsed.netloc == self._get_domain(self.base_url))

    def _is_document_path(self, path):
        """Return True when a lower-cased URL path ends in a document extension."""
        return path.endswith(tuple(DOC_EXTENSIONS))

    def _resolve_href(self, page_url, href):
        """Return ``href`` as an absolute URL, or None when it is malformed."""
        try:
            return urljoin(page_url, href)
        except ValueError as e:
            logger.warning(f"Skipping malformed link {href!r} on {page_url}: {e}")
            return None

    def _extract_documents(self, url, html):
        """Record every not-yet-seen document link found in ``html``.

        Args:
            url: Address of the page; used to resolve relative links and
                stored as ``found_on``.
            html: Page markup.
        """
        try:
            soup = BeautifulSoup(html, 'html.parser')

            for link in soup.find_all('a', href=True):
                href = link.get('href', '').strip()
                if not href:
                    continue

                # Make absolute URL; one bad href must not drop the rest
                full_url = self._resolve_href(url, href)
                if full_url is None:
                    continue

                # Check if it's a document
                path = urlparse(full_url).path.lower()
                if not self._is_document_path(path):
                    continue

                # Check if already found
                if full_url in self._document_urls:
                    continue
                self._document_urls.add(full_url)

                title = link.get_text(strip=True) or path.split('/')[-1]
                self.documents.append({
                    'url': full_url,
                    'title': title,
                    'type': path.split('.')[-1].upper(),
                    'found_on': url
                })
                logger.info(f"Found document: {path}")
        except Exception as e:
            # Best effort: a page that cannot be processed is logged and skipped.
            logger.error(f"Error extracting documents from {url}: {e}")

    def _extract_links(self, url, html):
        """Return the set of unvisited same-domain page links found in ``html``.

        Fragments are stripped, and links that point at documents are left
        out: they are recorded by ``_extract_documents`` and must not be
        fetched and parsed as if they were pages.
        """
        links = set()
        try:
            soup = BeautifulSoup(html, 'html.parser')

            for link in soup.find_all('a', href=True):
                href = link.get('href', '').strip()
                if not href or href.startswith('#'):
                    continue

                full_url = self._resolve_href(url, href)
                if full_url is None:
                    continue
                full_url = full_url.split('#')[0]  # Remove fragments

                if not self._is_valid_url(full_url) or full_url in self.visited:
                    continue
                if self._is_document_path(urlparse(full_url).path.lower()):
                    continue
                links.add(full_url)
        except Exception as e:
            # Best effort: return whatever was collected before the failure.
            logger.error(f"Error extracting links from {url}: {e}")

        return links

    def _fetch_html(self, url):
        """Fetch ``url`` and return its HTML, or None for non-HTML responses.

        The request is streamed so that the body of a binary response
        (image, archive, ...) is never downloaded.

        Raises:
            requests.RequestException: On network errors or HTTP error status.
        """
        resp = self.session.get(url, timeout=10, stream=True)
        try:
            resp.raise_for_status()
            content_type = resp.headers.get('Content-Type', '')
            # A missing header is given the benefit of the doubt.
            if content_type and 'html' not in content_type.lower():
                logger.info(f"Skipping non-HTML content ({content_type}): {url}")
                return None
            return resp.text
        finally:
            resp.close()

    def scrape(self, max_pages=None):
        """Crawl breadth-first from the queued URLs.

        Args:
            max_pages: Stop after this many fetch attempts; ``None`` means
                crawl until the queue is empty.
        """
        logger.info(f"Starting scrape of {self.base_url}")
        page_count = 0

        while self.queue and (max_pages is None or page_count < max_pages):
            url = self.queue.popleft()

            if url in self.visited:
                continue

            self.visited.add(url)
            page_count += 1

            logger.info(f"[{page_count}] Fetching: {url}")

            try:
                html = self._fetch_html(url)
            except requests.RequestException as e:
                logger.error(f"Failed to fetch {url}: {e}")
                continue

            if html is None:
                continue

            # Extract documents
            self._extract_documents(url, html)

            # Extract new links to visit; sorted so the crawl order is
            # reproducible, filtered so each URL is queued only once
            new_links = [
                link for link in sorted(self._extract_links(url, html))
                if link not in self._enqueued
            ]
            self._enqueued.update(new_links)
            self.queue.extend(new_links)

            logger.info(f"Found {len(new_links)} new links, queue size: {len(self.queue)}")

        logger.info(f"Scraping complete! Visited {page_count} pages, found {len(self.documents)} documents")

    def save_results(self, output_dir='output'):
        """Save results to CSV and JSON.

        Writes ``documents.csv`` and ``documents.json`` into ``output_dir``,
        creating the directory (and any missing parents) first.
        """
        out_dir = Path(output_dir)
        out_dir.mkdir(parents=True, exist_ok=True)

        # Save CSV
        csv_path = out_dir / 'documents.csv'
        with open(csv_path, 'w', newline='', encoding='utf-8') as f:
            writer = csv.DictWriter(f, fieldnames=['url', 'title', 'type', 'found_on'])
            writer.writeheader()
            writer.writerows(self.documents)
        logger.info(f"Saved CSV: {csv_path}")

        # Save JSON
        json_path = out_dir / 'documents.json'
        with open(json_path, 'w', encoding='utf-8') as f:
            json.dump(self.documents, f, indent=2, ensure_ascii=False)
        logger.info(f"Saved JSON: {json_path}")


def main():
    """Crawl the configured site and write the document list to ./output."""
    scraper = SplcrScraper(BASE_URL, USERNAME, PASSWORD)
    scraper.scrape(max_pages=None)  # No limit, scrape entire site
    scraper.save_results()


if __name__ == '__main__':
    main()