notebookvb

2026-05-19 20:17:20 +02:00
parent bdb3ce9599
commit 19b9c6a6b4
3 changed files with 249 additions and 0 deletions
@@ -0,0 +1,12 @@
+.env
+.env.local
+__pycache__/
+*.pyc
+*.pyo
+*.egg-info/
+dist/
+build/
+.DS_Store
+output/
+*.csv
+*.json
@@ -0,0 +1,42 @@
+# splcr.cz Web Scraper
+
+## Cíl
+Procrawlovat celý web `https://splcr.cz/` (starší WordPress) a najít všechny dokumenty k stažení:
+- PDF
+- Word (DOCX, DOC)
+- Excel (XLSX, XLS)
+- PowerPoint (PPTX, PPT)
+
+## Setup
+
+### Přihlašovací údaje
+- Uloženy v `.env` (vždy mimo git)
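+- Username: proměnná `USERNAME` v `.env` (hodnotu do README ani do gitu nezapisovat)
+- Password: proměnná `PASSWORD` v `.env` (hodnotu do README ani do gitu nezapisovat)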
+
+### Potřebné knihovny
+```bash
+pip install requests beautifulsoup4 python-dotenv
+```
+
+### Struktura
+- `main.py` — hlavní skript pro scraping
+- `.env` — přihlašovací údaje (GITIGNORE)
+- `output/` — výstupní soubory (CSV, JSON)
+
+## Strategie
+
+1. **Přihlášení**: Login pomocí session (pokud je vyžadován)
+2. **Crawling**: Procházet stránky od homepage (BFS/DFS)
+3. **Extrakce**: Hledat všechny `<a href="...">` s příslušnými příponami
+4. **Deduplikace**: Unikátní linky
+5. **Export**: CSV/JSON se všemi dokumenty
+
+## Output
+- `documents.csv` — seznam všech dokumentů (url, title, type, found_on; size zatím není)
+- `documents.json` — stejné v JSON formátu
+- `log.txt` — průběh crawlingu
+
+## Poznámky
+- WordPress: odkazy na dokumenty bývají rozeseté po celém webu, proto je potřeba projít všechny stránky
+- Zpočátku bez limitu na počet stránek (později omezit, pokud jich bude moc)
@@ -0,0 +1,195 @@
+#!/usr/bin/env python3
+"""
+Web scraper for splcr.cz — finds all downloadable documents (PDF, DOC, XLS, PPT, etc.)
+"""
+import os
+import csv
+import json
+import logging
+from pathlib import Path
+from urllib.parse import urljoin, urlparse
+from collections import deque
+from dotenv import load_dotenv
+import requests
+from bs4 import BeautifulSoup
+
+# Setup logging
+logging.basicConfig(
+    level=logging.INFO,
+    format='%(asctime)s - %(levelname)s - %(message)s',
+    handlers=[
+        logging.FileHandler('log.txt'),
+        logging.StreamHandler()
+    ]
+)
+logger = logging.getLogger(__name__)
+
+# Load environment variables
+load_dotenv('.env')
+USERNAME = os.getenv('USERNAME')
+PASSWORD = os.getenv('PASSWORD')
+BASE_URL = os.getenv('BASE_URL', 'https://splcr.cz/')
+
+# Document extensions we're looking for
+DOC_EXTENSIONS = {'.pdf', '.doc', '.docx', '.xls', '.xlsx', '.ppt', '.pptx'}
+
+class SplcrScraper:
+    def __init__(self, base_url, username=None, password=None):
+        self.base_url = base_url.rstrip('/')
+        self.session = requests.Session()
+        self.visited = set()
+        self.documents = []
+        self.queue = deque([self.base_url])
+
+        # Login if credentials provided
+        if username and password:
+            self._login(username, password)
+
+    def _login(self, username, password):
+        """Attempt to login with provided credentials"""
+        logger.info(f"Attempting login as {username}...")
+        login_url = self.base_url
+
+        # Login data matching the form
+        login_data = {
+            'log': username,
+            'pwd': password,
+            'redirect_to': 'https://splcr.cz/rubriky/pro-cleny-spl-cr/',
+            'a': 'login',
+            'rememberme': 'forever',
+            'Submit': 'Přihlásit se'
+        }
+
+        try:
+            resp = self.session.post(login_url, data=login_data, timeout=10, allow_redirects=True)
+            # Check if login was successful by looking for logout link or checking redirect
+            if 'odhlášení' in resp.text.lower() or 'logout' in resp.text.lower() or 'pro-cleny' in resp.url:
+                logger.info("Login successful!")
+            else:
+                logger.warning("Login response received - checking if authenticated...")
+                logger.info(f"Redirected to: {resp.url}")
+        except requests.RequestException as e:
+            logger.error(f"Login failed: {e}")
+
+    def _get_domain(self, url):
+        """Extract domain from URL"""
+        return urlparse(url).netloc
+
+    def _is_valid_url(self, url):
+        """Check if URL is valid and same domain"""
+        try:
+            parsed = urlparse(url)
+            # Must be same domain and http(s)
+            return (parsed.netloc == urlparse(self.base_url).netloc and
+                    parsed.scheme in ['http', 'https'])
+        except:
+            return False
+
+    def _extract_documents(self, url, html):
+        """Extract document links from HTML"""
+        try:
+            soup = BeautifulSoup(html, 'html.parser')
+
+            for link in soup.find_all('a', href=True):
+                href = link.get('href', '').strip()
+                if not href:
+                    continue
+
+                # Make absolute URL
+                full_url = urljoin(url, href)
+
+                # Check if it's a document
+                path = urlparse(full_url).path.lower()
+                if any(path.endswith(ext) for ext in DOC_EXTENSIONS):
+                    # Check if already found
+                    if full_url not in [doc['url'] for doc in self.documents]:
+                        title = link.get_text(strip=True) or path.split('/')[-1]
+                        self.documents.append({
+                            'url': full_url,
+                            'title': title,
+                            'type': path.split('.')[-1].upper(),
+                            'found_on': url
+                        })
+                        logger.info(f"Found document: {path}")
+        except Exception as e:
+            logger.error(f"Error extracting documents from {url}: {e}")
+
+    def _extract_links(self, url, html):
+        """Extract all page links from HTML"""
+        links = set()
+        try:
+            soup = BeautifulSoup(html, 'html.parser')
+            for link in soup.find_all('a', href=True):
+                href = link.get('href', '').strip()
+                if not href or href.startswith('#'):
+                    continue
+
+                full_url = urljoin(url, href)
+                full_url = full_url.split('#')[0]  # Remove fragments
+
+                if self._is_valid_url(full_url) and full_url not in self.visited:
+                    links.add(full_url)
+        except Exception as e:
+            logger.error(f"Error extracting links from {url}: {e}")
+
+        return links
+
+    def scrape(self, max_pages=None):
+        """Start crawling from base URL"""
+        logger.info(f"Starting scrape of {self.base_url}")
+        page_count = 0
+
+        while self.queue and (max_pages is None or page_count < max_pages):
+            url = self.queue.popleft()
+
+            if url in self.visited:
+                continue
+
+            self.visited.add(url)
+            page_count += 1
+
+            logger.info(f"[{page_count}] Fetching: {url}")
+
+            try:
+                resp = self.session.get(url, timeout=10)
+                resp.raise_for_status()
+
+                # Extract documents
+                self._extract_documents(url, resp.text)
+
+                # Extract new links to visit
+                new_links = self._extract_links(url, resp.text)
+                self.queue.extend(new_links)
+
+                logger.info(f"Found {len(new_links)} new links, queue size: {len(self.queue)}")
+
+            except requests.RequestException as e:
+                logger.error(f"Failed to fetch {url}: {e}")
+
+        logger.info(f"Scraping complete! Visited {page_count} pages, found {len(self.documents)} documents")
+
+    def save_results(self, output_dir='output'):
+        """Save results to CSV and JSON"""
+        Path(output_dir).mkdir(exist_ok=True)
+
+        # Save CSV
+        csv_path = Path(output_dir) / 'documents.csv'
+        with open(csv_path, 'w', newline='', encoding='utf-8') as f:
+            writer = csv.DictWriter(f, fieldnames=['url', 'title', 'type', 'found_on'])
+            writer.writeheader()
+            writer.writerows(self.documents)
+        logger.info(f"Saved CSV: {csv_path}")
+
+        # Save JSON
+        json_path = Path(output_dir) / 'documents.json'
+        with open(json_path, 'w', encoding='utf-8') as f:
+            json.dump(self.documents, f, indent=2, ensure_ascii=False)
+        logger.info(f"Saved JSON: {json_path}")
+
+def main():
+    scraper = SplcrScraper(BASE_URL, USERNAME, PASSWORD)
+    scraper.scrape(max_pages=None)  # No limit, scrape entire site
+    scraper.save_results()
+
+if __name__ == '__main__':
+    main()