Files
Vladimir Buzalka c6360a8c99 notebookvb
2026-05-20 05:28:41 +02:00

208 lines
7.4 KiB
Python

#!/usr/bin/env python3
"""
Web scraper for splcr.cz — finds all downloadable documents (PDF, DOC, XLS, PPT, etc.)
"""
import os
import csv
import json
import logging
from pathlib import Path
from urllib.parse import urljoin, urlparse
from collections import deque
from dotenv import load_dotenv
import requests
from bs4 import BeautifulSoup
# Setup logging with UTF-8 encoding for Czech characters
import sys
# Log to both log.txt and stdout. The log file is opened as UTF-8 so Czech
# characters are written correctly.
_LOG_FORMAT = '%(asctime)s - %(levelname)s - %(message)s'
file_handler = logging.FileHandler('log.txt', encoding='utf-8')
stream_handler = logging.StreamHandler(sys.stdout)
logger = logging.getLogger()
logger.setLevel(logging.INFO)
for _handler in (file_handler, stream_handler):
    # Each handler gets its own Formatter instance with the shared layout.
    _handler.setFormatter(logging.Formatter(_LOG_FORMAT))
    logger.addHandler(_handler)

# Read credentials and the crawl root from .env / the process environment.
load_dotenv('.env')
USERNAME = os.environ.get('SPLCR_USERNAME')
PASSWORD = os.environ.get('SPLCR_PASSWORD')
BASE_URL = os.environ.get('BASE_URL', 'https://splcr.cz/')

# URL path suffixes that mark a link as a downloadable document.
DOC_EXTENSIONS = {
    '.pdf',
    '.doc', '.docx',
    '.xls', '.xlsx',
    '.ppt', '.pptx',
}
class SplcrScraper:
    """Breadth-first crawler that collects links to downloadable documents.

    Starting from ``base_url`` plus a fixed list of seed pages, every
    same-domain page is fetched once and scanned for ``<a href>`` links whose
    path ends in one of ``DOC_EXTENSIONS``.  Hits accumulate in
    ``self.documents`` as dicts with the keys ``url``, ``title``, ``type``
    and ``found_on``.
    """

    def __init__(self, base_url, username=None, password=None):
        """Create the HTTP session and crawl queue; log in when possible.

        Args:
            base_url: Root URL of the site; trailing slashes are stripped.
            username: Optional login name.
            password: Optional password.  A login is attempted only when
                both credentials are non-empty.
        """
        self.base_url = base_url.rstrip('/')
        self.session = requests.Session()
        self.visited = set()
        self.documents = []
        # Seed URLs: per the original author these pages are not linked from
        # the homepage, so the crawl would otherwise never reach them.
        seed_urls = [
            f"{self.base_url}/rubriky/pro-cleny-spl-cr/appel/",  # Archive of all appel years
        ]
        # One page per appel year, 2015 through 2025 inclusive.
        seed_urls.extend(
            f"{self.base_url}/appel-rocnik-{year}/" for year in range(2015, 2026)
        )
        self.queue = deque([self.base_url, *seed_urls])
        if username and password:
            self._login(username, password)

    def _login(self, username, password):
        """Post the credentials to the site's login form.

        Args:
            username: Login name sent as the ``log`` form field.
            password: Password sent as the ``pwd`` form field.

        Returns:
            True when the response looks authenticated (a logout link is in
            the body or the final URL contains ``pro-cleny``), otherwise
            False.  Request errors are logged and reported as False.
        """
        logger.info("Attempting login as %s...", username)
        login_url = self.base_url
        login_data = {
            'log': username,
            'pwd': password,
            # Derived from base_url so a non-default BASE_URL is not sent
            # back to the production host after login.
            'redirect_to': f"{self.base_url}/rubriky/pro-cleny-spl-cr/",
            'a': 'login',
            'rememberme': 'forever',
            'Submit': 'Přihlásit se',
        }
        try:
            resp = self.session.post(
                login_url, data=login_data, timeout=10, allow_redirects=True
            )
        except requests.RequestException as e:
            logger.error("Login failed: %s", e)
            return False

        # Heuristic success check: logout link in the page or a redirect
        # into the members' area.
        body = resp.text.lower()
        if 'odhlášení' in body or 'logout' in body or 'pro-cleny' in resp.url:
            logger.info("Login successful!")
            return True
        logger.warning("Login response received - checking if authenticated...")
        logger.info("Redirected to: %s", resp.url)
        return False

    def _get_domain(self, url):
        """Return the network location (host[:port]) part of ``url``."""
        return urlparse(url).netloc

    def _is_valid_url(self, url):
        """Return True when ``url`` is http(s) and on the same host as base_url.

        Unparseable input yields False instead of raising.
        """
        try:
            parsed = urlparse(url)
            return (parsed.scheme in ('http', 'https')
                    and parsed.netloc == self._get_domain(self.base_url))
        except (ValueError, TypeError, AttributeError):
            # urlparse raises ValueError for malformed hosts (e.g. an
            # unterminated IPv6 literal); non-string input can raise the others.
            return False

    @staticmethod
    def _is_document_path(path):
        """Return True when the URL path ends with one of DOC_EXTENSIONS."""
        return path.lower().endswith(tuple(DOC_EXTENSIONS))

    def _extract_documents(self, url, html):
        """Record every not-yet-known document link found in ``html``.

        Args:
            url: Address of the page; used to resolve relative links and
                stored as ``found_on``.
            html: Markup of the page.

        New entries are appended to ``self.documents``.  Errors are logged,
        never raised, so one bad page cannot stop the crawl.
        """
        try:
            soup = BeautifulSoup(html, 'html.parser')
            # Built once per page instead of once per link.
            known_urls = {doc['url'] for doc in self.documents}
            for link in soup.find_all('a', href=True):
                href = link.get('href', '').strip()
                if not href:
                    continue
                try:
                    full_url = urljoin(url, href)
                    path = urlparse(full_url).path
                except ValueError as e:
                    # A single malformed href must not discard the remaining
                    # links on the page.
                    logger.warning("Skipping malformed link %r on %s: %s", href, url, e)
                    continue
                if not self._is_document_path(path) or full_url in known_urls:
                    continue
                known_urls.add(full_url)
                # Fall back to the file name (original case) when the anchor
                # has no visible text.
                title = link.get_text(strip=True) or path.split('/')[-1]
                self.documents.append({
                    'url': full_url,
                    'title': title,
                    'type': path.rsplit('.', 1)[-1].upper(),
                    'found_on': url,
                })
                logger.info("Found document: %s", path)
        except Exception as e:
            logger.error("Error extracting documents from %s: %s", url, e)

    def _extract_links(self, url, html):
        """Return the set of unvisited same-domain page URLs linked from ``html``.

        Fragments are stripped.  Links that point at documents are left out:
        they are recorded by ``_extract_documents`` and must not be fetched
        and parsed as if they were HTML pages.
        """
        links = set()
        try:
            soup = BeautifulSoup(html, 'html.parser')
            for link in soup.find_all('a', href=True):
                href = link.get('href', '').strip()
                if not href or href.startswith('#'):
                    continue
                try:
                    full_url = urljoin(url, href).split('#')[0]  # Remove fragments
                    path = urlparse(full_url).path
                except ValueError as e:
                    logger.warning("Skipping malformed link %r on %s: %s", href, url, e)
                    continue
                if self._is_document_path(path):
                    continue
                if self._is_valid_url(full_url) and full_url not in self.visited:
                    links.add(full_url)
        except Exception as e:
            logger.error("Error extracting links from %s: %s", url, e)
        return links

    def scrape(self, max_pages=None):
        """Crawl breadth-first from the queued URLs.

        Args:
            max_pages: Maximum number of pages to fetch; ``None`` means no
                limit.

        Failed requests are logged and skipped.
        """
        logger.info("Starting scrape of %s", self.base_url)
        page_count = 0
        # Every URL that has ever been enqueued, so the same link found on
        # many pages is not queued again and again.
        queued = set(self.queue)
        while self.queue and (max_pages is None or page_count < max_pages):
            url = self.queue.popleft()
            if url in self.visited:
                continue
            self.visited.add(url)
            page_count += 1
            logger.info("[%s] Fetching: %s", page_count, url)
            try:
                resp = self.session.get(url, timeout=10)
                resp.raise_for_status()
                self._extract_documents(url, resp.text)
                # Sorted so the crawl order is reproducible between runs.
                new_links = sorted(self._extract_links(url, resp.text) - queued)
                queued.update(new_links)
                self.queue.extend(new_links)
                logger.info(
                    "Found %s new links, queue size: %s",
                    len(new_links), len(self.queue),
                )
            except requests.RequestException as e:
                logger.error("Failed to fetch %s: %s", url, e)
        logger.info(
            "Scraping complete! Visited %s pages, found %s documents",
            page_count, len(self.documents),
        )

    def save_results(self, output_dir='output'):
        """Write ``self.documents`` to documents.csv and documents.json.

        Args:
            output_dir: Target directory; created (including missing
                parents) when it does not exist.
        """
        out_dir = Path(output_dir)
        out_dir.mkdir(parents=True, exist_ok=True)

        csv_path = out_dir / 'documents.csv'
        with open(csv_path, 'w', newline='', encoding='utf-8') as f:
            writer = csv.DictWriter(f, fieldnames=['url', 'title', 'type', 'found_on'])
            writer.writeheader()
            writer.writerows(self.documents)
        logger.info("Saved CSV: %s", csv_path)

        json_path = out_dir / 'documents.json'
        with open(json_path, 'w', encoding='utf-8') as f:
            # ensure_ascii=False keeps Czech characters readable in the file.
            json.dump(self.documents, f, indent=2, ensure_ascii=False)
        logger.info("Saved JSON: %s", json_path)
def main():
    """Crawl the configured site and write the document list to disk."""
    crawler = SplcrScraper(base_url=BASE_URL, username=USERNAME, password=PASSWORD)
    # max_pages=None disables the page cap, so the whole site is crawled.
    crawler.scrape(max_pages=None)
    crawler.save_results()
# Start the crawl only when the file is executed as a script, not on import.
if __name__ == "__main__":
    main()