notebookvb

This commit is contained in:
Vladimir Buzalka
2026-05-20 05:28:41 +02:00
parent 19b9c6a6b4
commit c6360a8c99
3 changed files with 2751 additions and 14 deletions
+7 -1
View File
@@ -39,4 +39,10 @@ pip install requests beautifulsoup4 python-dotenv
## Poznámky
- WordPress = často všechny linky na webu
- Zpočátku bez limit na počet stránek (pak omezit, pokud je jich moc)
- Zpočátku bez limitu na počet stránek (pak omezit, pokud je jich moc)
## Seed URLs
Některé stránky nejsou linkovány z homepage (orphaned). Jsou zabudovány přímo:
- `/rubriky/pro-cleny-spl-cr/appel/` (archiv všech ročníků) a `/appel-rocnik-2015/` až `/appel-rocnik-2025/` (apelace)
Přidej další podle potřeby do `seed_urls` v `SplcrScraper.__init__`
File diff suppressed because it is too large Load Diff
+25 -13
View File
@@ -13,21 +13,23 @@ from dotenv import load_dotenv
import requests
from bs4 import BeautifulSoup
# Setup logging
logging.basicConfig(
level=logging.INFO,
format='%(asctime)s - %(levelname)s - %(message)s',
handlers=[
logging.FileHandler('log.txt'),
logging.StreamHandler()
]
)
logger = logging.getLogger(__name__)
# Setup logging with UTF-8 encoding for Czech characters
import sys
file_handler = logging.FileHandler('log.txt', encoding='utf-8')
file_handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))
stream_handler = logging.StreamHandler(sys.stdout)
stream_handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))
logger = logging.getLogger()
logger.setLevel(logging.INFO)
logger.addHandler(file_handler)
logger.addHandler(stream_handler)
# Load environment variables
load_dotenv('.env')
USERNAME = os.getenv('USERNAME')
PASSWORD = os.getenv('PASSWORD')
USERNAME = os.getenv('SPLCR_USERNAME')
PASSWORD = os.getenv('SPLCR_PASSWORD')
BASE_URL = os.getenv('BASE_URL', 'https://splcr.cz/')
# Document extensions we're looking for
@@ -39,7 +41,17 @@ class SplcrScraper:
self.session = requests.Session()
self.visited = set()
self.documents = []
self.queue = deque([self.base_url])
# Seed URLs — orphaned pages that are not linked from the homepage
seed_urls = [
f"{self.base_url}/rubriky/pro-cleny-spl-cr/appel/", # Archive of all appel years
]
# Dynamically add appel year URLs (2015-2025)
for year in range(2015, 2026):
seed_urls.append(f"{self.base_url}/appel-rocnik-{year}/")
self.queue = deque([self.base_url] + seed_urls)
# Login if credentials provided
if username and password: