notebookvb
This commit is contained in:
@@ -0,0 +1,12 @@
|
||||
.env
|
||||
.env.local
|
||||
__pycache__/
|
||||
*.pyc
|
||||
*.pyo
|
||||
*.egg-info/
|
||||
dist/
|
||||
build/
|
||||
.DS_Store
|
||||
output/
|
||||
*.csv
|
||||
*.json
|
||||
@@ -0,0 +1,42 @@
|
||||
# splcr.cz Web Scraper
|
||||
|
||||
## Cíl
|
||||
Procrawlovat celý web `https://splcr.cz/` (starší WordPress) a najít všechny dokumenty k stažení:
|
||||
- PDF
|
||||
- Word (DOCX, DOC)
|
||||
- Excel (XLSX, XLS)
|
||||
- PowerPoint (PPTX, PPT)
|
||||
|
||||
## Setup
|
||||
|
||||
### Přihlašovací údaje
|
||||
- Uloženy v `.env` (vždy mimo git)
|
||||
- Username: proměnná `USERNAME` v `.env` (skutečnou hodnotu do repozitáře neukládat)
|
||||
- Password: proměnná `PASSWORD` v `.env` (skutečnou hodnotu do repozitáře neukládat)
|
||||
|
||||
### Potřebné knihovny
|
||||
```bash
|
||||
pip install requests beautifulsoup4 python-dotenv
|
||||
```
|
||||
|
||||
### Struktura
|
||||
- `main.py` — hlavní skript pro scraping
|
||||
- `.env` — přihlašovací údaje (GITIGNORE)
|
||||
- `output/` — výstupní soubory (CSV, JSON)
|
||||
|
||||
## Strategie
|
||||
|
||||
1. **Přihlášení**: Přihlášení pomocí session (pokud je vyžadováno)
|
||||
2. **Crawling**: Procházet stránky od homepage (BFS/DFS)
|
||||
3. **Extrakce**: Hledat všechny `<a href="...">` s příslušnými příponami
|
||||
4. **Deduplikace**: Unikátní linky
|
||||
5. **Export**: CSV/JSON se všemi dokumenty
|
||||
|
||||
## Output
|
||||
- `documents.csv` — seznam všech dokumentů (url, title, type, found_on; size zatím není)
|
||||
- `documents.json` — stejné v JSON formátu
|
||||
- `log.txt` — průběh crawlingu
|
||||
|
||||
## Poznámky
|
||||
- WordPress = často všechny linky na webu
|
||||
- Zpočátku bez limitu na počet stránek (pak omezit, pokud jich bude moc)
|
||||
@@ -0,0 +1,195 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Web scraper for splcr.cz — finds all downloadable documents (PDF, DOC, XLS, PPT, etc.)
|
||||
"""
|
||||
import os
|
||||
import csv
|
||||
import json
|
||||
import logging
|
||||
from pathlib import Path
|
||||
from urllib.parse import urljoin, urlparse
|
||||
from collections import deque
|
||||
from dotenv import load_dotenv
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
# Setup logging: every record goes both to log.txt and to the console.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        # Explicit UTF-8 so non-ASCII text in log records (URLs, usernames)
        # is written the same way regardless of the platform's default
        # encoding.
        logging.FileHandler('log.txt', encoding='utf-8'),
        logging.StreamHandler(),
    ],
)
logger = logging.getLogger(__name__)

# Load environment variables.
# override=True: load_dotenv() leaves already-defined variables untouched by
# default, and many systems (Windows in particular) export USERNAME for the
# logged-in OS account. Without the override that OS value would shadow the
# site login stored in .env.
load_dotenv('.env', override=True)
USERNAME = os.getenv('USERNAME')
PASSWORD = os.getenv('PASSWORD')
BASE_URL = os.getenv('BASE_URL', 'https://splcr.cz/')

# Document extensions we're looking for (compared against the lower-cased
# URL path, so the entries must stay lower-case and include the dot).
DOC_EXTENSIONS = {'.pdf', '.doc', '.docx', '.xls', '.xlsx', '.ppt', '.pptx'}
|
||||
|
||||
class SplcrScraper:
    """Breadth-first crawler that collects links to downloadable documents.

    Starting from ``base_url`` it follows links on the same host, records
    every ``<a href>`` whose path ends in one of ``DOC_EXTENSIONS`` and can
    write the collected list to CSV and JSON.

    Attributes:
        base_url: Start URL with any trailing slash removed.
        session: ``requests.Session`` shared by the login and all fetches.
        visited: URLs that have already been taken off the queue.
        documents: Found documents as dicts with the keys ``url``,
            ``title``, ``type`` and ``found_on``.
        queue: URLs waiting to be fetched, processed first-in first-out.
    """

    def __init__(self, base_url, username=None, password=None):
        """Set up the crawl state and log in when both credentials are given.

        Args:
            base_url: URL the crawl starts from.
            username: Optional login name.
            password: Optional password; login is attempted only when both
                ``username`` and ``password`` are truthy.
        """
        self.base_url = base_url.rstrip('/')
        self.session = requests.Session()
        self.visited = set()
        self.documents = []
        self.queue = deque([self.base_url])
        # Every URL ever placed on the queue. Pages of one site share most of
        # their navigation links, so without this the queue would collect the
        # same URL once per page that links to it.
        self._enqueued = {self.base_url}
        # URLs already present in self.documents, for O(1) duplicate checks.
        self._document_urls = set()

        # Login if credentials provided
        if username and password:
            self._login(username, password)

    def _login(self, username, password):
        """POST the login form to the base URL and log the apparent outcome.

        Success is only inferred from the response (a logout link in the body
        or a redirect to the members' section). Nothing is returned or
        raised, so the crawl simply continues unauthenticated if the login
        did not work.
        """
        logger.info("Attempting login as %s...", username)
        login_url = self.base_url

        # Login data matching the form
        login_data = {
            'log': username,
            'pwd': password,
            'redirect_to': 'https://splcr.cz/rubriky/pro-cleny-spl-cr/',
            'a': 'login',
            'rememberme': 'forever',
            'Submit': 'Přihlásit se',
        }

        try:
            resp = self.session.post(
                login_url, data=login_data, timeout=10, allow_redirects=True
            )
        except requests.RequestException as e:
            logger.error("Login failed: %s", e)
            return

        # Check if login was successful by looking for a logout link or
        # checking where we were redirected to.
        body = resp.text.lower()
        if 'odhlášení' in body or 'logout' in body or 'pro-cleny' in resp.url:
            logger.info("Login successful!")
        else:
            logger.warning("Login response received - checking if authenticated...")
            logger.info("Redirected to: %s", resp.url)

    def _get_domain(self, url):
        """Return the network location (host[:port]) part of ``url``."""
        return urlparse(url).netloc

    def _is_valid_url(self, url):
        """Return True when ``url`` is http(s) and on the same host as the base URL.

        A URL that ``urlparse`` rejects with ``ValueError`` counts as invalid.
        """
        try:
            parsed = urlparse(url)
        except ValueError:
            return False
        return (parsed.scheme in ('http', 'https')
                and parsed.netloc == self._get_domain(self.base_url))

    @staticmethod
    def _has_document_extension(path):
        """Return True when the lower-cased URL ``path`` ends in a document extension."""
        return path.endswith(tuple(DOC_EXTENSIONS))

    def _extract_documents(self, url, html):
        """Record every not-yet-seen document link found in ``html``.

        Args:
            url: Address of the page; used to resolve relative links and
                stored as ``found_on``.
            html: Markup of that page.

        New entries are appended to ``self.documents``. A malformed link is
        skipped with a warning so it cannot hide the other links on the page.
        """
        try:
            soup = BeautifulSoup(html, 'html.parser')
        except Exception as e:  # best effort: one unparsable page must not stop the crawl
            logger.error("Error extracting documents from %s: %s", url, e)
            return

        for link in soup.find_all('a', href=True):
            href = link.get('href', '').strip()
            if not href:
                continue

            try:
                # Make absolute URL
                full_url = urljoin(url, href)
                path = urlparse(full_url).path.lower()
            except ValueError as e:
                logger.warning("Skipping malformed link %r on %s: %s", href, url, e)
                continue

            if not self._has_document_extension(path):
                continue
            if full_url in self._document_urls:
                continue

            self._document_urls.add(full_url)
            # Fall back to the file name when the anchor has no text.
            title = link.get_text(strip=True) or path.split('/')[-1]
            self.documents.append({
                'url': full_url,
                'title': title,
                'type': path.split('.')[-1].upper(),
                'found_on': url,
            })
            logger.info("Found document: %s", path)

    def _extract_links(self, url, html):
        """Return the set of crawlable page URLs linked from ``html``.

        Fragments are stripped. A link is kept only if it is http(s), on the
        same host as the base URL, not yet visited and not a document: files
        matching ``DOC_EXTENSIONS`` are recorded by ``_extract_documents``
        and must not be downloaded and parsed as if they were pages.
        """
        links = set()
        try:
            soup = BeautifulSoup(html, 'html.parser')
        except Exception as e:  # best effort: one unparsable page must not stop the crawl
            logger.error("Error extracting links from %s: %s", url, e)
            return links

        for link in soup.find_all('a', href=True):
            href = link.get('href', '').strip()
            if not href or href.startswith('#'):
                continue

            try:
                full_url = urljoin(url, href).split('#')[0]  # Remove fragments
                path = urlparse(full_url).path.lower()
            except ValueError as e:
                logger.warning("Skipping malformed link %r on %s: %s", href, url, e)
                continue

            if self._has_document_extension(path):
                continue
            if self._is_valid_url(full_url) and full_url not in self.visited:
                links.add(full_url)

        return links

    def scrape(self, max_pages=None):
        """Crawl breadth-first from the base URL until the queue is empty.

        Args:
            max_pages: Stop after this many fetch attempts; ``None`` means no
                limit. Failed fetches count towards the limit.
        """
        logger.info("Starting scrape of %s", self.base_url)
        page_count = 0

        while self.queue and (max_pages is None or page_count < max_pages):
            url = self.queue.popleft()
            if url in self.visited:
                continue

            self.visited.add(url)
            page_count += 1
            logger.info("[%s] Fetching: %s", page_count, url)

            try:
                resp = self.session.get(url, timeout=10)
                resp.raise_for_status()
            except requests.RequestException as e:
                logger.error("Failed to fetch %s: %s", url, e)
                continue

            # Images, archives and similar responses contain no links; skip
            # them instead of decoding binary data as HTML. A response
            # without a Content-Type header is still treated as a page.
            content_type = resp.headers.get('Content-Type', '')
            if content_type and 'html' not in content_type.lower():
                logger.info("Skipping non-HTML response (%s): %s", content_type, url)
                continue

            # Extract documents
            self._extract_documents(url, resp.text)

            # Extract new links to visit, dropping those already queued.
            new_links = self._extract_links(url, resp.text) - self._enqueued
            self._enqueued.update(new_links)
            self.queue.extend(new_links)

            logger.info(
                "Found %s new links, queue size: %s", len(new_links), len(self.queue)
            )

        logger.info(
            "Scraping complete! Visited %s pages, found %s documents",
            page_count, len(self.documents),
        )

    def save_results(self, output_dir='output'):
        """Write ``self.documents`` to ``documents.csv`` and ``documents.json``.

        Args:
            output_dir: Target directory; created, including missing parent
                directories, when it does not exist.
        """
        out_dir = Path(output_dir)
        out_dir.mkdir(parents=True, exist_ok=True)

        # Save CSV
        csv_path = out_dir / 'documents.csv'
        with open(csv_path, 'w', newline='', encoding='utf-8') as f:
            writer = csv.DictWriter(f, fieldnames=['url', 'title', 'type', 'found_on'])
            writer.writeheader()
            writer.writerows(self.documents)
        logger.info("Saved CSV: %s", csv_path)

        # Save JSON
        json_path = out_dir / 'documents.json'
        with open(json_path, 'w', encoding='utf-8') as f:
            json.dump(self.documents, f, indent=2, ensure_ascii=False)
        logger.info("Saved JSON: %s", json_path)
|
||||
|
||||
def main():
    """Crawl the configured site and write the collected document list."""
    crawler = SplcrScraper(BASE_URL, USERNAME, PASSWORD)
    # max_pages=None means no page limit, so the entire site is crawled.
    crawler.scrape(max_pages=None)
    crawler.save_results()


# Run the crawl only when the file is executed as a script, not on import.
if __name__ == '__main__':
    main()
|
||||
Reference in New Issue
Block a user