notebookvb
This commit is contained in:
@@ -39,4 +39,10 @@ pip install requests beautifulsoup4 python-dotenv
|
||||
|
||||
## Poznámky
|
||||
- WordPress = často všechny linky na webu
|
||||
- Zpočátku bez limitu na počet stránek (pak omezit, pokud je jich moc)
|
||||
- Zpočátku bez limitu na počet stránek (pak omezit, pokud je jich moc)
|
||||
|
||||
## Seed URLs
|
||||
Některé stránky nejsou linkovány z homepage (orphaned). Jsou zabudovány přímo:
|
||||
- `/rubriky/pro-cleny-spl-cr/appel/` (archiv všech ročníků) a `/appel-rocnik-<rok>/` pro roky 2015–2025 (apelace)
|
||||
|
||||
Přidej další podle potřeby do `seed_urls` v `SplcrScraper.__init__`
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -13,21 +13,23 @@ from dotenv import load_dotenv
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
# Setup logging
|
||||
logging.basicConfig(
|
||||
level=logging.INFO,
|
||||
format='%(asctime)s - %(levelname)s - %(message)s',
|
||||
handlers=[
|
||||
logging.FileHandler('log.txt'),
|
||||
logging.StreamHandler()
|
||||
]
|
||||
)
|
||||
logger = logging.getLogger(__name__)
|
||||
# Setup logging with UTF-8 encoding for Czech characters
|
||||
import sys
|
||||
file_handler = logging.FileHandler('log.txt', encoding='utf-8')
|
||||
file_handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))
|
||||
|
||||
stream_handler = logging.StreamHandler(sys.stdout)
|
||||
stream_handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))
|
||||
|
||||
logger = logging.getLogger()
|
||||
logger.setLevel(logging.INFO)
|
||||
logger.addHandler(file_handler)
|
||||
logger.addHandler(stream_handler)
|
||||
|
||||
# Load environment variables
|
||||
load_dotenv('.env')
|
||||
USERNAME = os.getenv('USERNAME')
|
||||
PASSWORD = os.getenv('PASSWORD')
|
||||
USERNAME = os.getenv('SPLCR_USERNAME')
|
||||
PASSWORD = os.getenv('SPLCR_PASSWORD')
|
||||
BASE_URL = os.getenv('BASE_URL', 'https://splcr.cz/')
|
||||
|
||||
# Document extensions we're looking for
|
||||
@@ -39,7 +41,17 @@ class SplcrScraper:
|
||||
self.session = requests.Session()
|
||||
self.visited = set()
|
||||
self.documents = []
|
||||
self.queue = deque([self.base_url])
|
||||
|
||||
# Seed URLs — orphaned pages that aren't linked from the homepage
|
||||
seed_urls = [
|
||||
f"{self.base_url}/rubriky/pro-cleny-spl-cr/appel/", # Archive of all appel years
|
||||
]
|
||||
|
||||
# Dynamically add appel year URLs (2015-2025)
|
||||
for year in range(2015, 2026):
|
||||
seed_urls.append(f"{self.base_url}/appel-rocnik-{year}/")
|
||||
|
||||
self.queue = deque([self.base_url] + seed_urls)
|
||||
|
||||
# Login if credentials provided
|
||||
if username and password:
|
||||
|
||||
Reference in New Issue
Block a user