196 lines
6.7 KiB
Python
196 lines
6.7 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Web scraper for splcr.cz — finds all downloadable documents (PDF, DOC, XLS, PPT, etc.)
|
|
"""
|
|
import os
|
|
import csv
|
|
import json
|
|
import logging
|
|
from pathlib import Path
|
|
from urllib.parse import urljoin, urlparse
|
|
from collections import deque
|
|
from dotenv import load_dotenv
|
|
import requests
|
|
from bs4 import BeautifulSoup
|
|
|
|
# Setup logging: every record goes both to log.txt and to the console.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        # Explicit UTF-8: logged URL paths may contain non-ASCII characters,
        # and the platform default encoding is not guaranteed to represent them.
        logging.FileHandler('log.txt', encoding='utf-8'),
        logging.StreamHandler(),
    ],
)
logger = logging.getLogger(__name__)
|
|
|
|
# Load environment variables.
# override=True makes values from .env win over variables that already exist
# in the process environment. With the default (override=False) python-dotenv
# keeps pre-existing values, and USERNAME is commonly predefined by the
# operating system (always on Windows), so the credentials stored in .env
# would be silently ignored.
load_dotenv('.env', override=True)
USERNAME = os.getenv('USERNAME')
PASSWORD = os.getenv('PASSWORD')
# `or` (rather than a getenv default) also covers an empty "BASE_URL=" entry.
BASE_URL = os.getenv('BASE_URL') or 'https://splcr.cz/'
|
|
|
|
# Lower-case file suffixes (leading dot included) that mark a link as a
# downloadable document.
DOC_EXTENSIONS = {
    '.pdf',
    '.doc', '.docx',
    '.xls', '.xlsx',
    '.ppt', '.pptx',
}
|
|
|
|
class SplcrScraper:
    """Breadth-first crawler that lists downloadable documents on one site.

    Starting from ``base_url`` the scraper follows links that stay on the
    same domain, records every link whose path ends in one of
    ``DOC_EXTENSIONS`` and can write the collected list to CSV and JSON.

    State is kept in plain attributes:
        base_url:  start URL with any trailing ``/`` removed.
        session:   ``requests.Session`` shared by the login and all fetches.
        visited:   set of URLs that have already been fetched.
        queue:     deque of URLs still waiting to be fetched.
        documents: list of dicts with the keys ``url``, ``title``, ``type``
                   and ``found_on``.
    """

    def __init__(self, base_url, username=None, password=None):
        """Create the crawler and optionally log in.

        Args:
            base_url: URL the crawl starts from; also defines the domain the
                crawl is restricted to.
            username: Login name; a login is attempted only when both
                ``username`` and ``password`` are truthy.
            password: Login password.
        """
        self.base_url = base_url.rstrip('/')
        self.session = requests.Session()
        self.visited = set()
        self.documents = []
        self.queue = deque([self.base_url])

        # Login if credentials provided
        if username and password:
            self._login(username, password)

    def _login(self, username, password):
        """Attempt to log in by posting the login form to ``base_url``.

        The outcome is only logged. Network errors are caught and logged as
        well, so a failed login never prevents the (anonymous) crawl.
        """
        logger.info(f"Attempting login as {username}...")
        login_url = self.base_url

        # Login data matching the form
        login_data = {
            'log': username,
            'pwd': password,
            # Built from base_url so the redirect target stays on the host
            # being crawled; for the default base URL this yields exactly
            # 'https://splcr.cz/rubriky/pro-cleny-spl-cr/'.
            'redirect_to': urljoin(self.base_url + '/', '/rubriky/pro-cleny-spl-cr/'),
            'a': 'login',
            'rememberme': 'forever',
            'Submit': 'Přihlásit se',
        }

        try:
            resp = self.session.post(login_url, data=login_data, timeout=10, allow_redirects=True)
        except requests.RequestException as e:
            logger.error(f"Login failed: {e}")
            return

        # Heuristic success check: a logout link in the page, or a final URL
        # inside the members-only section.
        body = resp.text.lower()
        if 'odhlášení' in body or 'logout' in body or 'pro-cleny' in resp.url:
            logger.info("Login successful!")
        else:
            logger.warning("Login response received - checking if authenticated...")
            logger.info(f"Redirected to: {resp.url}")

    def _get_domain(self, url):
        """Return the network location (host[:port]) part of ``url``."""
        return urlparse(url).netloc

    def _is_valid_url(self, url):
        """Return True if ``url`` is http(s) and on the same domain as ``base_url``.

        Anything that cannot be parsed as a URL is reported as invalid
        instead of raising.
        """
        try:
            parsed = urlparse(url)
            base_netloc = urlparse(self.base_url).netloc
        except (ValueError, TypeError, AttributeError):
            # ValueError: malformed URL (e.g. a broken IPv6 literal);
            # TypeError/AttributeError: input that is not a string at all.
            return False
        return parsed.scheme in ('http', 'https') and parsed.netloc == base_netloc

    def _is_document_url(self, url):
        """Return True if the path of ``url`` ends with one of ``DOC_EXTENSIONS``."""
        try:
            path = urlparse(url).path.lower()
        except ValueError:
            return False
        return path.endswith(tuple(DOC_EXTENSIONS))

    def _extract_documents(self, url, html):
        """Append every not-yet-known document link found in ``html``.

        Args:
            url: Address of the page; used to resolve relative links and
                stored as ``found_on``.
            html: Markup of the page.

        Errors are logged, never raised, so one bad page cannot stop the
        crawl.
        """
        try:
            soup = BeautifulSoup(html, 'html.parser')
            # Built once per page; the membership test per link is then O(1)
            # instead of rebuilding a list of all known URLs for every link.
            known_urls = {doc['url'] for doc in self.documents}
            extensions = tuple(DOC_EXTENSIONS)

            for link in soup.find_all('a', href=True):
                href = link.get('href', '').strip()
                if not href:
                    continue

                try:
                    # Make absolute URL
                    full_url = urljoin(url, href)
                    path = urlparse(full_url).path.lower()
                except ValueError:
                    # One malformed href must not discard the remaining
                    # links of the page.
                    continue

                # Check if it's a document we have not recorded yet
                if not path.endswith(extensions) or full_url in known_urls:
                    continue

                known_urls.add(full_url)
                title = link.get_text(strip=True) or path.split('/')[-1]
                self.documents.append({
                    'url': full_url,
                    'title': title,
                    'type': path.split('.')[-1].upper(),
                    'found_on': url,
                })
                logger.info(f"Found document: {path}")
        except Exception as e:
            logger.error(f"Error extracting documents from {url}: {e}")

    def _extract_links(self, url, html):
        """Return the set of crawlable page URLs linked from ``html``.

        A URL qualifies when it is http(s) on the crawl domain, has not been
        visited yet and does not point at a document file. Document links are
        left out because fetching them as pages would download every file
        only to parse it as HTML. Fragments are stripped.
        """
        links = set()
        try:
            soup = BeautifulSoup(html, 'html.parser')
            for link in soup.find_all('a', href=True):
                href = link.get('href', '').strip()
                if not href or href.startswith('#'):
                    continue

                try:
                    full_url = urljoin(url, href).split('#')[0]  # Remove fragments
                except ValueError:
                    continue

                if (self._is_valid_url(full_url)
                        and full_url not in self.visited
                        and not self._is_document_url(full_url)):
                    links.add(full_url)
        except Exception as e:
            logger.error(f"Error extracting links from {url}: {e}")

        return links

    def scrape(self, max_pages=None):
        """Crawl breadth-first from the queued URLs.

        Args:
            max_pages: Maximum number of pages to fetch in this call, or
                ``None`` for no limit. Failed fetches count as pages.
        """
        logger.info(f"Starting scrape of {self.base_url}")
        page_count = 0
        # Everything already fetched or already waiting; used to keep the
        # queue free of duplicates.
        scheduled = self.visited | set(self.queue)

        while self.queue and (max_pages is None or page_count < max_pages):
            url = self.queue.popleft()
            if url in self.visited:
                continue

            self.visited.add(url)
            page_count += 1
            logger.info(f"[{page_count}] Fetching: {url}")

            try:
                # stream=True defers the body download until the headers
                # have been inspected, so binary responses are never read.
                with self.session.get(url, timeout=10, stream=True) as resp:
                    resp.raise_for_status()
                    content_type = resp.headers.get('Content-Type', '')
                    if content_type and 'html' not in content_type.lower():
                        logger.info(f"Skipping non-HTML response ({content_type}): {url}")
                        continue
                    html = resp.text
            except requests.RequestException as e:
                logger.error(f"Failed to fetch {url}: {e}")
                continue

            # Extract documents
            self._extract_documents(url, html)

            # Extract new links to visit
            new_links = self._extract_links(url, html) - scheduled
            scheduled |= new_links
            self.queue.extend(new_links)

            logger.info(f"Found {len(new_links)} new links, queue size: {len(self.queue)}")

        logger.info(f"Scraping complete! Visited {page_count} pages, found {len(self.documents)} documents")

    def save_results(self, output_dir='output'):
        """Write the collected documents to ``documents.csv`` and ``documents.json``.

        Args:
            output_dir: Target directory; created (including missing parent
                directories) when it does not exist.
        """
        out_dir = Path(output_dir)
        out_dir.mkdir(parents=True, exist_ok=True)

        # Save CSV
        csv_path = out_dir / 'documents.csv'
        with open(csv_path, 'w', newline='', encoding='utf-8') as f:
            writer = csv.DictWriter(f, fieldnames=['url', 'title', 'type', 'found_on'])
            writer.writeheader()
            writer.writerows(self.documents)
        logger.info(f"Saved CSV: {csv_path}")

        # Save JSON
        json_path = out_dir / 'documents.json'
        with open(json_path, 'w', encoding='utf-8') as f:
            json.dump(self.documents, f, indent=2, ensure_ascii=False)
        logger.info(f"Saved JSON: {json_path}")
|
|
|
|
def main():
    """Crawl the configured site without a page limit and save the results.

    Results are written even when the crawl is interrupted (Ctrl-C) or ends
    with an unexpected error, so a long-running crawl does not lose what it
    has already found.
    """
    scraper = SplcrScraper(BASE_URL, USERNAME, PASSWORD)
    try:
        scraper.scrape(max_pages=None)  # No limit, scrape entire site
    except KeyboardInterrupt:
        logger.warning("Interrupted by user - saving partial results")
    finally:
        scraper.save_results()
|
|
|
|
# Start the crawl only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|