notebookvb
This commit is contained in:
@@ -39,4 +39,10 @@ pip install requests beautifulsoup4 python-dotenv
|
|||||||
|
|
||||||
## Poznámky
|
## Poznámky
|
||||||
- WordPress = často všechny linky na webu
|
- WordPress = často všechny linky na webu
|
||||||
- Zpočátku bez limit na počet stránek (pak omezit, pokud je jich moc)
|
- Zpočátku bez limitu na počet stránek (pak omezit, pokud je jich moc)
|
||||||
|
|
||||||
|
## Seed URLs
|
||||||
|
Některé stránky nejsou linkovány z homepage (orphaned). Jsou zabudovány přímo:
|
||||||
|
- `/appel-rocnik-2015/` až `/appel-rocnik-2025/` (apelace, tedy i `/appel-rocnik-2023/` a `/appel-rocnik-2024/`) a archiv `/rubriky/pro-cleny-spl-cr/appel/`
|
||||||
|
|
||||||
|
Přidej další podle potřeby do `seed_urls` v `SplcrScraper.__init__`.
|
||||||
|
|||||||
File diff suppressed because it is too large
Load Diff
@@ -13,21 +13,23 @@ from dotenv import load_dotenv
|
|||||||
import requests
|
import requests
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
|
|
||||||
# Setup logging
|
# Setup logging with UTF-8 encoding for Czech characters
|
||||||
logging.basicConfig(
|
import sys
|
||||||
level=logging.INFO,
|
file_handler = logging.FileHandler('log.txt', encoding='utf-8')
|
||||||
format='%(asctime)s - %(levelname)s - %(message)s',
|
file_handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))
|
||||||
handlers=[
|
|
||||||
logging.FileHandler('log.txt'),
|
stream_handler = logging.StreamHandler(sys.stdout)
|
||||||
logging.StreamHandler()
|
stream_handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))
|
||||||
]
|
|
||||||
)
|
logger = logging.getLogger()
|
||||||
logger = logging.getLogger(__name__)
|
logger.setLevel(logging.INFO)
|
||||||
|
logger.addHandler(file_handler)
|
||||||
|
logger.addHandler(stream_handler)
|
||||||
|
|
||||||
# Load environment variables
|
# Load environment variables
|
||||||
load_dotenv('.env')
|
load_dotenv('.env')
|
||||||
USERNAME = os.getenv('USERNAME')
|
USERNAME = os.getenv('SPLCR_USERNAME')
|
||||||
PASSWORD = os.getenv('PASSWORD')
|
PASSWORD = os.getenv('SPLCR_PASSWORD')
|
||||||
BASE_URL = os.getenv('BASE_URL', 'https://splcr.cz/')
|
BASE_URL = os.getenv('BASE_URL', 'https://splcr.cz/')
|
||||||
|
|
||||||
# Document extensions we're looking for
|
# Document extensions we're looking for
|
||||||
@@ -39,7 +41,17 @@ class SplcrScraper:
|
|||||||
self.session = requests.Session()
|
self.session = requests.Session()
|
||||||
self.visited = set()
|
self.visited = set()
|
||||||
self.documents = []
|
self.documents = []
|
||||||
self.queue = deque([self.base_url])
|
|
||||||
|
# Seed URLs — orphaned pages that are not linked from the homepage
|
||||||
|
seed_urls = [
|
||||||
|
f"{self.base_url}/rubriky/pro-cleny-spl-cr/appel/", # Archive of all appel years
|
||||||
|
]
|
||||||
|
|
||||||
|
# Dynamically add appel year URLs (2015-2025)
|
||||||
|
for year in range(2015, 2026):
|
||||||
|
seed_urls.append(f"{self.base_url}/appel-rocnik-{year}/")
|
||||||
|
|
||||||
|
self.queue = deque([self.base_url] + seed_urls)
|
||||||
|
|
||||||
# Login if credentials provided
|
# Login if credentials provided
|
||||||
if username and password:
|
if username and password:
|
||||||
|
|||||||
Reference in New Issue
Block a user