notebookVB
This commit is contained in:
@@ -1,49 +1,99 @@
|
||||
"""
|
||||
Reusable login flow for Rohlik.cz.

Reuse saved browser state (cookies + localStorage) so the Usercentrics cookie
banner never appears and we stay logged in — the same situation as a returning
user.

1. Load the saved session (auth_state.json) if it exists.
2. Open the site and check whether we're already logged in.
3. If yes -> continue (no banner, nothing else to do).
4. If no  -> log in via the JSON API, accept cookies, save the session,
   continue.
|
||||
"""
|
||||
|
||||
import json
|
||||
from pathlib import Path
|
||||
from playwright.sync_api import sync_playwright
|
||||
from config import BASE_URL, AUTH_STATE_PATH
|
||||
from playwright.sync_api import sync_playwright, BrowserContext, Page
|
||||
from config import BASE_URL, AUTH_STATE_PATH, ROHLIK_EMAIL, ROHLIK_PASSWORD
|
||||
|
||||
# Where the Playwright storage state is persisted, and whether that file
# already exists at the moment this module is loaded.
auth_path = Path(AUTH_STATE_PATH)
have_state = auth_path.exists()

# JSON login endpoint on the site's frontend service.
LOGIN_URL = f"{BASE_URL}/services/frontend-service/login"
|
||||
|
||||
|
||||
def is_logged_in(page: Page) -> bool:
    """Report whether the page currently looks logged in.

    The page counts as logged in when no element with the exact text
    "Přihlásit se" (the site's log-in prompt) is present. The count is taken
    immediately, so callers must let the page render before calling this.
    """
    login_prompts = page.locator('text="Přihlásit se"')
    return not login_prompts.count()
|
||||
|
||||
|
||||
def accept_cookies(page: Page):
    """Accept the Usercentrics consent banner via its official JS API.

    A script evaluated in the page polls up to 20 times, 250 ms apart, for
    ``window.UC_UI`` to report that it is initialized. If ``UC_UI`` exposes
    ``acceptAllConsents``, the script calls it followed by ``closeCMP``.

    Returns:
        ``"accepted"`` when the consent API was called, otherwise
        ``"UC_UI not available"``.
    """
    consent_script = '''async () => {
        for (let i = 0; i < 20; i++) {
            if (window.UC_UI && window.UC_UI.isInitialized && window.UC_UI.isInitialized()) break;
            await new Promise(r => setTimeout(r, 250));
        }
        if (window.UC_UI && typeof window.UC_UI.acceptAllConsents === 'function') {
            await window.UC_UI.acceptAllConsents();
            await window.UC_UI.closeCMP();
            return "accepted";
        }
        return "UC_UI not available";
    }'''
    outcome = page.evaluate(consent_script)

    # Give the banner up to 5 s to leave the DOM (its close animation takes
    # about a second). This wait is best-effort: any failure is ignored and
    # the script's outcome is returned regardless.
    try:
        page.wait_for_selector('#usercentrics-cmp-ui', state='detached', timeout=5000)
    except Exception:
        pass

    return outcome
|
||||
|
||||
|
||||
def api_login(context: BrowserContext) -> int:
    """POST the configured credentials to ``LOGIN_URL`` and return the HTTP status.

    The request is sent through ``context.request``, i.e. the request client
    attached to the given browser context. The response body is not inspected;
    callers decide what the status code means.
    """
    credentials = {"email": ROHLIK_EMAIL, "password": ROHLIK_PASSWORD}
    json_headers = {"Content-Type": "application/json", "Accept": "application/json"}

    response = context.request.post(
        LOGIN_URL,
        data=json.dumps(credentials),
        headers=json_headers,
    )
    return response.status
|
||||
|
||||
|
||||
def ensure_logged_in(pw) -> tuple[BrowserContext, Page]:
    """Open the site in a fresh browser and make sure the session is logged in.

    Flow:
      1. Launch Chromium and create a context, loading the saved storage
         state from ``AUTH_STATE_PATH`` when that file exists.
      2. Open ``BASE_URL`` and check whether the session is already logged in;
         if so, return immediately.
      3. Otherwise log in through the JSON API, reload the site and accept
         the cookie banner.
      4. If the page now looks logged in, persist the storage state to
         ``AUTH_STATE_PATH``; otherwise report the failure.

    Args:
        pw: A running Playwright instance (the object yielded by
            ``sync_playwright()``). It is used as given and never re-created
            here, so the returned objects stay valid for the caller.

    Returns:
        The ``(context, page)`` pair. The browser is left open on purpose:
        the caller owns it and closes it via ``context.browser.close()``.
        The pair is returned even when the login attempt failed; the failure
        is only printed.
    """
    auth_path = Path(AUTH_STATE_PATH)
    have_state = auth_path.exists()

    browser = pw.chromium.launch(headless=False, args=["--start-maximized"])

    ctx_args = {"no_viewport": True}
    if have_state:
        ctx_args["storage_state"] = AUTH_STATE_PATH
    context = browser.new_context(**ctx_args)
    page = context.new_page()

    print(f"1) Opening site (saved session: {have_state}) ...")
    page.goto(BASE_URL, wait_until="domcontentloaded", timeout=60000)
    # Fixed pause so client-side rendering can finish before the login check,
    # which only counts elements at the moment it runs.
    page.wait_for_timeout(3000)

    if is_logged_in(page):
        print("2) Already logged in from saved session — continuing.")
        return context, page

    print("2) Not logged in — logging in via API ...")
    status = api_login(context)
    print(f" Login API status: {status}")

    # Reload so the page reflects whatever session the API login established.
    page.goto(BASE_URL, wait_until="domcontentloaded", timeout=60000)
    page.wait_for_timeout(3000)

    print(f"3) Accepting cookies: {accept_cookies(page)}")

    if is_logged_in(page):
        context.storage_state(path=AUTH_STATE_PATH)
        print(f"Saved state to {AUTH_STATE_PATH}")
        print("4) Logged in and session saved.")
    else:
        print("4) Login FAILED — check API status above.")

    return context, page
|
||||
|
||||
|
||||
def main():
    """Run the login flow once, print the resulting state, then wait for Enter."""
    with sync_playwright() as pw:
        context, page = ensure_logged_in(pw)

        print(f"\n -> logged in: {is_logged_in(page)}")
        print(f" -> cookie banner present: {page.locator('#usercentrics-cmp-ui').count() > 0}")

        print("\nReady to scrape. Press Enter to close browser...")
        input()
        # ensure_logged_in() leaves the browser open; close it here.
        context.browser.close()


if __name__ == "__main__":
    main()
|
||||
|
||||
Reference in New Issue
Block a user