notebookvb
This commit is contained in:
@@ -0,0 +1,12 @@
|
|||||||
|
.env
|
||||||
|
.env.local
|
||||||
|
__pycache__/
|
||||||
|
*.pyc
|
||||||
|
*.pyo
|
||||||
|
*.egg-info/
|
||||||
|
dist/
|
||||||
|
build/
|
||||||
|
.DS_Store
|
||||||
|
output/
|
||||||
|
*.csv
|
||||||
|
*.json
|
||||||
@@ -0,0 +1,42 @@
|
|||||||
|
# splcr.cz Web Scraper
|
||||||
|
|
||||||
|
## Cíl
|
||||||
|
Procrawlovat celý web `https://splcr.cz/` (starší WordPress) a najít všechny dokumenty k stažení:
|
||||||
|
- PDF
|
||||||
|
- Word (DOCX, DOC)
|
||||||
|
- Excel (XLSX, XLS)
|
||||||
|
- PowerPoint (PPTX, PPT)
|
||||||
|
|
||||||
|
## Setup
|
||||||
|
|
||||||
|
### Přihlašovací údaje
|
||||||
|
- Uloženy v `.env` (vždy mimo git)
|
||||||
|
- Username: proměnná `USERNAME` v souboru `.env`
|
||||||
|
- Password: proměnná `PASSWORD` v souboru `.env` (skutečnou hodnotu nikdy neuvádět v README ani v gitu)
|
||||||
|
|
||||||
|
### Potřebné knihovny
|
||||||
|
```bash
|
||||||
|
pip install requests beautifulsoup4 python-dotenv
|
||||||
|
```
|
||||||
|
|
||||||
|
### Struktura
|
||||||
|
- `main.py` — hlavní skript pro scraping
|
||||||
|
- `.env` — přihlašovací údaje (GITIGNORE)
|
||||||
|
- `output/` — výstupní soubory (CSV, JSON)
|
||||||
|
|
||||||
|
## Strategie
|
||||||
|
|
||||||
|
1. **Přihlášení**: Autentizace pomocí session (pokud je vyžadována)
|
||||||
|
2. **Crawling**: Procházet stránky od homepage (BFS/DFS)
|
||||||
|
3. **Extrakce**: Hledat všechny `<a href="...">` s příslušnými příponami
|
||||||
|
4. **Deduplikace**: Unikátní linky
|
||||||
|
5. **Export**: CSV/JSON se všemi dokumenty
|
||||||
|
|
||||||
|
## Output
|
||||||
|
- `documents.csv` — seznam všech dokumentů (url, title, type, found_on)
|
||||||
|
- `documents.json` — stejné v JSON formátu
|
||||||
|
- `log.txt` — průběh crawlingu
|
||||||
|
|
||||||
|
## Poznámky
|
||||||
|
- WordPress = často všechny linky na webu
|
||||||
|
- Zpočátku bez limitu na počet stránek (později omezit, pokud jich bude příliš mnoho)
|
||||||
@@ -0,0 +1,195 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Web scraper for splcr.cz — finds all downloadable documents (PDF, DOC, XLS, PPT, etc.)
|
||||||
|
"""
|
||||||
|
import os
|
||||||
|
import csv
|
||||||
|
import json
|
||||||
|
import logging
|
||||||
|
from pathlib import Path
|
||||||
|
from urllib.parse import urljoin, urlparse
|
||||||
|
from collections import deque
|
||||||
|
from dotenv import load_dotenv
|
||||||
|
import requests
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
|
||||||
|
# Setup logging: every record goes both to log.txt and to the console.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        # Explicit UTF-8 so non-ASCII characters in logged URLs and error
        # messages are written correctly whatever the platform default is.
        logging.FileHandler('log.txt', encoding='utf-8'),
        logging.StreamHandler(),
    ],
)
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
# Load environment variables.
# override=True makes values from .env win over variables that already exist
# in the process environment. Without it, load_dotenv leaves existing
# variables untouched, so an OS-provided USERNAME (Windows sets it to the
# logged-in account name) would silently replace the configured login.
load_dotenv('.env', override=True)
USERNAME = os.getenv('USERNAME')
PASSWORD = os.getenv('PASSWORD')
# Falls back to the production site when BASE_URL is not configured.
BASE_URL = os.getenv('BASE_URL', 'https://splcr.cz/')
|
||||||
|
|
||||||
|
# Lower-case file suffixes (including the leading dot) that mark a link as a
# downloadable document.
DOC_EXTENSIONS = {
    f'.{suffix}'
    for suffix in ('pdf', 'doc', 'docx', 'xls', 'xlsx', 'ppt', 'pptx')
}
|
||||||
|
|
||||||
|
class SplcrScraper:
    """Breadth-first crawler that collects links to downloadable documents.

    Starting at ``base_url``, each same-domain page reachable through
    ``<a href>`` links is fetched once. Every link whose path ends with one of
    ``DOC_EXTENSIONS`` is recorded in ``self.documents`` as a dict with the
    keys ``url``, ``title``, ``type`` and ``found_on``.
    """

    def __init__(self, base_url, username=None, password=None):
        """Set up crawl state and log in when credentials are given.

        Args:
            base_url: Root URL of the site; a trailing slash is stripped.
            username: Login name. Login is attempted only when both
                ``username`` and ``password`` are truthy.
            password: Login password.
        """
        self.base_url = base_url.rstrip('/')
        self.session = requests.Session()
        self.visited = set()
        self.documents = []
        self.queue = deque([self.base_url])

        # Login if credentials provided
        if username and password:
            self._login(username, password)

    def _login(self, username, password):
        """POST the login form to the base URL using the shared session.

        Success is judged heuristically from the response (a logout link in
        the body, or a redirect into the members section). Failures are
        logged and never raised, so the crawl continues either way.
        """
        logger.info(f"Attempting login as {username}...")
        login_url = self.base_url

        # Login data matching the form
        login_data = {
            'log': username,
            'pwd': password,
            # Derived from base_url so a non-default BASE_URL is not sent
            # back to the production host after login.
            'redirect_to': f'{self.base_url}/rubriky/pro-cleny-spl-cr/',
            'a': 'login',
            'rememberme': 'forever',
            'Submit': 'Přihlásit se',
        }

        try:
            resp = self.session.post(
                login_url, data=login_data, timeout=10, allow_redirects=True
            )
        except requests.RequestException as e:
            logger.error(f"Login failed: {e}")
            return

        body = resp.text.lower()
        if 'odhlášení' in body or 'logout' in body or 'pro-cleny' in resp.url:
            logger.info("Login successful!")
        else:
            # No further check is performed, so say exactly that.
            logger.warning("Login could not be confirmed from the response")
            logger.info(f"Redirected to: {resp.url}")

    def _get_domain(self, url):
        """Return the network location (host[:port]) of ``url``."""
        return urlparse(url).netloc

    def _is_valid_url(self, url):
        """Return True if ``url`` is http(s) and on the same domain as the base URL."""
        try:
            parsed = urlparse(url)
        except ValueError:
            # urlparse rejects malformed network locations (e.g. broken IPv6).
            return False
        return (parsed.scheme in ('http', 'https')
                and parsed.netloc == self._get_domain(self.base_url))

    @staticmethod
    def _absolute_url(page_url, href):
        """Resolve ``href`` against ``page_url`` and drop any fragment.

        Returns None for an empty href or one that cannot be parsed, so a
        single bad link does not abort processing of the whole page.
        """
        href = (href or '').strip()
        if not href:
            return None
        try:
            return urljoin(page_url, href).split('#')[0]
        except ValueError:
            return None

    @staticmethod
    def _document_path(url):
        """Return the lower-cased path of ``url`` if it names a document, else None."""
        try:
            path = urlparse(url).path.lower()
        except ValueError:
            return None
        return path if path.endswith(tuple(DOC_EXTENSIONS)) else None

    def _extract_documents(self, url, html):
        """Append every not-yet-recorded document link in ``html`` to ``self.documents``.

        Args:
            url: URL of the page; used to resolve relative links and stored
                as ``found_on``.
            html: Markup of the page.
        """
        try:
            soup = BeautifulSoup(html, 'html.parser')
            # Build the lookup once per page instead of rescanning the
            # whole list for every link.
            known = {doc['url'] for doc in self.documents}

            for link in soup.find_all('a', href=True):
                full_url = self._absolute_url(url, link.get('href', ''))
                if full_url is None or full_url in known:
                    continue

                path = self._document_path(full_url)
                if path is None:
                    continue

                known.add(full_url)
                self.documents.append({
                    'url': full_url,
                    # Fall back to the file name when the link has no text.
                    'title': link.get_text(strip=True) or path.split('/')[-1],
                    'type': path.split('.')[-1].upper(),
                    'found_on': url,
                })
                logger.info(f"Found document: {path}")
        except Exception as e:
            # Best effort: a page that cannot be parsed must not stop the crawl.
            logger.error(f"Error extracting documents from {url}: {e}")

    def _extract_links(self, url, html):
        """Return the set of unvisited same-domain page URLs linked from ``html``.

        Pure fragment links and links to documents are excluded; documents
        are recorded by ``_extract_documents`` and must not be fetched as
        pages.
        """
        links = set()
        try:
            soup = BeautifulSoup(html, 'html.parser')
            for link in soup.find_all('a', href=True):
                href = link.get('href', '').strip()
                if not href or href.startswith('#'):
                    continue

                full_url = self._absolute_url(url, href)
                if full_url is None or full_url in self.visited:
                    continue

                if (self._is_valid_url(full_url)
                        and self._document_path(full_url) is None):
                    links.add(full_url)
        except Exception as e:
            # Best effort: keep whatever was collected before the failure.
            logger.error(f"Error extracting links from {url}: {e}")

        return links

    def scrape(self, max_pages=None):
        """Crawl breadth-first from the base URL.

        Args:
            max_pages: Maximum number of URLs to fetch, or None for no limit.
        """
        logger.info(f"Starting scrape of {self.base_url}")
        page_count = 0

        while self.queue and (max_pages is None or page_count < max_pages):
            url = self.queue.popleft()
            if url in self.visited:
                continue

            self.visited.add(url)
            page_count += 1
            logger.info(f"[{page_count}] Fetching: {url}")

            try:
                # stream=True defers the body download until .text is read,
                # so non-HTML responses are skipped without fetching them.
                with self.session.get(url, timeout=10, stream=True) as resp:
                    resp.raise_for_status()
                    content_type = resp.headers.get('Content-Type', '')
                    if content_type and 'html' not in content_type.lower():
                        logger.info(
                            f"Skipping non-HTML response ({content_type}): {url}"
                        )
                        continue
                    # After redirects the final URL is the correct base for
                    # relative links; mark it visited to avoid a re-fetch.
                    page_url = resp.url
                    html = resp.text
            except requests.RequestException as e:
                logger.error(f"Failed to fetch {url}: {e}")
                continue

            self.visited.add(page_url)

            # Extract documents
            self._extract_documents(page_url, html)

            # Extract new links to visit
            new_links = self._extract_links(page_url, html)
            self.queue.extend(new_links)

            logger.info(f"Found {len(new_links)} new links, queue size: {len(self.queue)}")

        logger.info(f"Scraping complete! Visited {page_count} pages, found {len(self.documents)} documents")

    def save_results(self, output_dir='output'):
        """Write ``self.documents`` to ``documents.csv`` and ``documents.json``.

        Args:
            output_dir: Target directory; created (including parents) if
                it does not exist.
        """
        out_dir = Path(output_dir)
        # parents=True so nested output paths also work.
        out_dir.mkdir(parents=True, exist_ok=True)

        # Save CSV
        csv_path = out_dir / 'documents.csv'
        with open(csv_path, 'w', newline='', encoding='utf-8') as f:
            writer = csv.DictWriter(f, fieldnames=['url', 'title', 'type', 'found_on'])
            writer.writeheader()
            writer.writerows(self.documents)
        logger.info(f"Saved CSV: {csv_path}")

        # Save JSON
        json_path = out_dir / 'documents.json'
        with open(json_path, 'w', encoding='utf-8') as f:
            json.dump(self.documents, f, indent=2, ensure_ascii=False)
        logger.info(f"Saved JSON: {json_path}")
|
||||||
|
|
||||||
|
def main():
    """Crawl the configured site without a page limit and save the results."""
    crawler = SplcrScraper(
        base_url=BASE_URL,
        username=USERNAME,
        password=PASSWORD,
    )
    # No limit, scrape entire site
    crawler.scrape(max_pages=None)
    crawler.save_results()


if __name__ == '__main__':
    main()
|
||||||
Reference in New Issue
Block a user