From 19b9c6a6b47c2cb377ca31be721bf88fc30e4603 Mon Sep 17 00:00:00 2001 From: Vladimir Buzalka Date: Tue, 19 May 2026 20:17:20 +0200 Subject: [PATCH] notebookvb --- Webpagescraping/splcr.cz/.gitignore | 12 ++ Webpagescraping/splcr.cz/NOTES.md | 42 ++++++ Webpagescraping/splcr.cz/main.py | 195 ++++++++++++++++++++++++++++ 3 files changed, 249 insertions(+) create mode 100644 Webpagescraping/splcr.cz/.gitignore create mode 100644 Webpagescraping/splcr.cz/NOTES.md create mode 100644 Webpagescraping/splcr.cz/main.py diff --git a/Webpagescraping/splcr.cz/.gitignore b/Webpagescraping/splcr.cz/.gitignore new file mode 100644 index 0000000..bf0bcb4 --- /dev/null +++ b/Webpagescraping/splcr.cz/.gitignore @@ -0,0 +1,12 @@ +.env +.env.local +__pycache__/ +*.pyc +*.pyo +*.egg-info/ +dist/ +build/ +.DS_Store +output/ +*.csv +*.json diff --git a/Webpagescraping/splcr.cz/NOTES.md b/Webpagescraping/splcr.cz/NOTES.md new file mode 100644 index 0000000..58e53be --- /dev/null +++ b/Webpagescraping/splcr.cz/NOTES.md @@ -0,0 +1,42 @@ +# splcr.cz Web Scraper + +## Cíl +Procrawlovat celý web `https://splcr.cz/` (starší WordPress) a najít všechny dokumenty k stažení: +- PDF +- Word (DOCX, DOC) +- Excel (XLSX, XLS) +- PowerPoint (PPTX, PPT) + +## Setup + +### Přihlašovací údaje +- Uloženy v `.env` (vždy mimo git) +- Username: viz `.env` (proměnná `USERNAME`) +- Password: viz `.env` (proměnná `PASSWORD`) + +### Potřebné knihovny +```bash +pip install requests beautifulsoup4 python-dotenv +``` + +### Struktura +- `main.py` — hlavní skript pro scraping +- `.env` — přihlašovací údaje (GITIGNORE) +- `output/` — výstupní soubory (CSV, JSON) + +## Strategie + +1. **Login**: Přihlášení pomocí session (pokud je vyžadováno) +2. **Crawling**: Procházet stránky od homepage (BFS/DFS) +3. **Extrakce**: Hledat všechny odkazy `<a href>` s příslušnými příponami +4. **Deduplikace**: Unikátní linky +5. 
#!/usr/bin/env python3
"""
Web scraper for splcr.cz — finds all downloadable documents (PDF, DOC, XLS, PPT, etc.)

The crawler starts at ``BASE_URL`` (logging in first when credentials are
configured), walks every same-domain page breadth-first and records each link
whose path ends in one of ``DOC_EXTENSIONS``.

Outputs:
    output/documents.csv   one row per document (url, title, type, found_on)
    output/documents.json  the same records as JSON
    log.txt                crawl progress log (written to the working directory)

Configuration is read from ``.env``: ``USERNAME``, ``PASSWORD`` and, optionally,
``BASE_URL``. By default the whole site is crawled with no page limit; pass
``max_pages`` to ``SplcrScraper.scrape`` to cap the crawl.
"""
import os
import csv
import json
import logging
from pathlib import Path
from urllib.parse import urldefrag, urljoin, urlparse
from collections import deque
from dotenv import load_dotenv
import requests
from bs4 import BeautifulSoup

# Setup logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        # Explicit UTF-8: URLs and titles on this site contain Czech characters,
        # which the platform default encoding may not be able to write.
        logging.FileHandler('log.txt', encoding='utf-8'),
        logging.StreamHandler()
    ]
)
logger = logging.getLogger(__name__)

# Load environment variables.
# override=True lets the values in .env win over variables that already exist
# in the process environment. Without it, the operating system's own USERNAME
# variable (always set on Windows, and by some shells elsewhere) would shadow
# the login stored in .env.
load_dotenv('.env', override=True)
USERNAME = os.getenv('USERNAME')
PASSWORD = os.getenv('PASSWORD')
BASE_URL = os.getenv('BASE_URL', 'https://splcr.cz/')

# Document extensions we're looking for
DOC_EXTENSIONS = {'.pdf', '.doc', '.docx', '.xls', '.xlsx', '.ppt', '.pptx'}

# str.endswith() accepts a tuple, so one call tests every extension.
_DOC_SUFFIXES = tuple(sorted(DOC_EXTENSIONS))


class SplcrScraper:
    """Breadth-first crawler that records links to downloadable documents.

    After ``scrape()`` the results are available on the instance:

    * ``documents`` — list of dicts with keys ``url``, ``title``, ``type``
      and ``found_on``
    * ``visited`` — set of page URLs that were fetched
    * ``queue`` — pages still waiting to be fetched (non-empty only when the
      crawl stopped at ``max_pages``)
    """

    def __init__(self, base_url, username=None, password=None):
        """Create the crawler and log in when both credentials are given.

        Args:
            base_url: Site root; a trailing slash is ignored.
            username: Login name, or None to crawl anonymously.
            password: Login password, or None to crawl anonymously.
        """
        self.base_url = base_url.rstrip('/')
        self.session = requests.Session()
        self.visited = set()
        self.documents = []
        # URLs already present in self.documents, for O(1) de-duplication.
        self._document_urls = set()
        # Seed with the normalized form so "https://host" and "https://host/"
        # are not fetched as two different pages.
        self.queue = deque([self._normalize_url(self.base_url)])
        # Every URL that has ever been queued; keeps the queue free of repeats.
        self._queued = set(self.queue)

        # Login if credentials provided
        if username and password:
            self._login(username, password)

    def _login(self, username, password):
        """Attempt to log in by posting the site's login form.

        The form is posted to the base URL. Login is treated as successful
        when the response mentions a logout link or the final URL is inside
        the members area.

        Returns:
            True when the login could be confirmed, False otherwise
            (including network errors, which are logged, not raised).
        """
        logger.info("Attempting login as %s...", username)
        login_url = self.base_url

        # Login data matching the form
        login_data = {
            'log': username,
            'pwd': password,
            # Built from base_url so a non-default BASE_URL is redirected to
            # its own host; identical to the old literal for the default site.
            'redirect_to': f'{self.base_url}/rubriky/pro-cleny-spl-cr/',
            'a': 'login',
            'rememberme': 'forever',
            'Submit': 'Přihlásit se',
        }

        try:
            resp = self.session.post(login_url, data=login_data, timeout=10, allow_redirects=True)
        except requests.RequestException as e:
            logger.error("Login failed: %s", e)
            return False

        page_text = resp.text.lower()
        if 'odhlášení' in page_text or 'logout' in page_text or 'pro-cleny' in resp.url:
            logger.info("Login successful!")
            return True

        # Nothing further is checked after this point, so say so plainly.
        logger.warning("Login could not be confirmed; protected pages may be missing from the results")
        logger.info("Redirected to: %s", resp.url)
        return False

    def _get_domain(self, url):
        """Extract domain from URL"""
        return urlparse(url).netloc

    @staticmethod
    def _normalize_url(url):
        """Return ``url`` without its fragment and with '/' as the path of a bare host.

        Raises:
            ValueError: if ``url`` cannot be parsed.
        """
        url = urldefrag(url).url
        parsed = urlparse(url)
        if parsed.netloc and not parsed.path:
            url = parsed._replace(path='/').geturl()
        return url

    @staticmethod
    def _is_document(url):
        """Return True when ``url`` is an http(s) link ending in a document extension."""
        parsed = urlparse(url)
        # The scheme check keeps e.g. "mailto:report.pdf" out of the results.
        return parsed.scheme in ('http', 'https') and parsed.path.lower().endswith(_DOC_SUFFIXES)

    @staticmethod
    def _is_logout(url):
        """Return True for links that would end the session (e.g. ``?a=logout``)."""
        return 'logout' in urlparse(url).query.lower()

    def _is_valid_url(self, url):
        """Check if URL is valid and same domain"""
        try:
            parsed = urlparse(url)
        except (ValueError, TypeError, AttributeError):
            # urlparse rejects malformed hosts and non-string input.
            return False
        # Must be same domain and http(s)
        return (parsed.scheme in ('http', 'https') and
                parsed.netloc == self._get_domain(self.base_url))

    def _iter_anchors(self, url, html):
        """Yield ``(tag, absolute_url)`` for every usable ``<a href>`` in ``html``.

        Empty hrefs and pure fragment links are skipped. A malformed href is
        skipped on its own, so it cannot hide the remaining links on the page.
        """
        soup = BeautifulSoup(html, 'html.parser')
        for link in soup.find_all('a', href=True):
            href = link.get('href', '').strip()
            if not href or href.startswith('#'):
                continue
            try:
                full_url = self._normalize_url(urljoin(url, href))
            except ValueError:
                logger.debug("Skipping malformed href %r on %s", href, url)
                continue
            yield link, full_url

    def _extract_documents(self, url, html):
        """Record every not-yet-seen document link found in ``html``.

        Args:
            url: Address of the page; used to resolve relative links and
                stored as ``found_on``.
            html: Page markup.
        """
        try:
            for link, full_url in self._iter_anchors(url, html):
                if not self._is_document(full_url) or full_url in self._document_urls:
                    continue
                self._document_urls.add(full_url)

                path = urlparse(full_url).path
                # Fall back to the file name (original case) when the link has no text.
                title = link.get_text(strip=True) or path.rsplit('/', 1)[-1]
                self.documents.append({
                    'url': full_url,
                    'title': title,
                    'type': path.rsplit('.', 1)[-1].upper(),
                    'found_on': url
                })
                logger.info("Found document: %s", path)
        except Exception as e:
            # Best effort: a page the parser cannot handle must not stop the crawl.
            logger.error("Error extracting documents from %s: %s", url, e)

    def _extract_links(self, url, html):
        """Return the set of same-domain page URLs in ``html`` still worth visiting.

        Document files and logout links are left out: the former are recorded
        by ``_extract_documents`` and must not be downloaded as pages, the
        latter would end the authenticated session.
        """
        links = set()
        try:
            for _link, full_url in self._iter_anchors(url, html):
                if (self._is_valid_url(full_url)
                        and full_url not in self.visited
                        and not self._is_document(full_url)
                        and not self._is_logout(full_url)):
                    links.add(full_url)
        except Exception as e:
            # Best effort: keep whatever was collected before the failure.
            logger.error("Error extracting links from %s: %s", url, e)

        return links

    def scrape(self, max_pages=None):
        """Crawl breadth-first from the base URL, filling ``self.documents``.

        Args:
            max_pages: Maximum number of pages to fetch, or None for no limit.
        """
        logger.info("Starting scrape of %s", self.base_url)
        page_count = 0

        while self.queue and (max_pages is None or page_count < max_pages):
            url = self.queue.popleft()

            if url in self.visited:
                continue

            self.visited.add(url)
            page_count += 1

            logger.info("[%d] Fetching: %s", page_count, url)

            try:
                # stream=True defers the body download until .text is read,
                # so non-HTML responses can be dropped after the headers.
                with self.session.get(url, timeout=10, stream=True) as resp:
                    resp.raise_for_status()
                    content_type = resp.headers.get('Content-Type', 'text/html')
                    if 'html' not in content_type.lower():
                        logger.info("Skipping non-HTML content (%s): %s", content_type, url)
                        continue
                    # After a redirect, relative links belong to the final URL.
                    page_url = resp.url
                    html = resp.text
            except requests.RequestException as e:
                logger.error("Failed to fetch %s: %s", url, e)
                continue

            self.visited.add(page_url)

            # Extract documents
            self._extract_documents(page_url, html)

            # Extract new links to visit; sorted so the crawl order is reproducible.
            new_links = self._extract_links(page_url, html) - self._queued
            self._queued |= new_links
            self.queue.extend(sorted(new_links))

            logger.info("Found %d new links, queue size: %d", len(new_links), len(self.queue))

        logger.info("Scraping complete! Visited %d pages, found %d documents",
                    page_count, len(self.documents))

    def save_results(self, output_dir='output'):
        """Write ``documents.csv`` and ``documents.json`` into ``output_dir``.

        The directory (including missing parents) is created when needed.
        """
        out_dir = Path(output_dir)
        out_dir.mkdir(parents=True, exist_ok=True)

        # Save CSV
        csv_path = out_dir / 'documents.csv'
        with open(csv_path, 'w', newline='', encoding='utf-8') as f:
            writer = csv.DictWriter(f, fieldnames=['url', 'title', 'type', 'found_on'])
            writer.writeheader()
            writer.writerows(self.documents)
        logger.info("Saved CSV: %s", csv_path)

        # Save JSON
        json_path = out_dir / 'documents.json'
        with open(json_path, 'w', encoding='utf-8') as f:
            json.dump(self.documents, f, indent=2, ensure_ascii=False)
        logger.info("Saved JSON: %s", json_path)


def main():
    """Crawl the configured site without a page limit and save the results."""
    scraper = SplcrScraper(BASE_URL, USERNAME, PASSWORD)
    scraper.scrape(max_pages=None)  # No limit, scrape entire site
    scraper.save_results()


if __name__ == '__main__':
    main()