Initial commit — clean history (removed large test files, browser profiles, Medidata/Clario downloads)

This commit is contained in:
2026-06-01 15:36:31 +02:00
commit bb604e593e
1304 changed files with 116480 additions and 0 deletions
+204
View File
@@ -0,0 +1,204 @@
"""
AE Import — scrapes Adverse Events from EvaMed DRY study and upserts into MongoDB.
Run repeatedly; only stores field-level changes (delta) in history[].
Unique key: patient_code + event_number.
"""
import asyncio
import re
from datetime import datetime
from pathlib import Path
from playwright.async_api import async_playwright
from pymongo import MongoClient
# --- EvaMed endpoints and local configuration for the AE import ---------------
import os  # used only for the credential overrides below

BASE_URL = "https://prod.evamed.com/etude/soft/index.php"
LOGIN_URL = f"{BASE_URL}?module=authentification&class=login&client=myopowers-dry"
# Direct filtered URL: CZ1 (center_id=2), Adverse Event (formtype=120), all records
LIST_URL = f"{BASE_URL}?module=monitoring&class=formslisting&center_id=2&formtype=120&l=ALL"

# NOTE(security): these credentials were committed to version control. Set
# EVAMED_LOGIN / EVAMED_PASSWORD in the environment instead; the literals remain
# only as a backward-compatible fallback and should be rotated and removed.
LOGIN = os.environ.get("EVAMED_LOGIN", "vbuzalka")
PASSWORD = os.environ.get("EVAMED_PASSWORD", "Vlado9674+")

# MongoDB target: host, database and collection receiving the AE records.
MONGO_HOST = "192.168.1.76"
DB_NAME = "Dry"
COLLECTION = "AE"

# Playwright storage state (cookies etc.) saved next to this script.
SESSION_FILE = Path(__file__).parent / "session.json"
DATE_RE = re.compile(r"^(\d{2})/(\d{2})/(\d{4})$")


def parse_value(value):
    """Convert a scraped field string into a typed Python value.

    Args:
        value: Raw text of a form field, or None.

    Returns:
        None for None/empty/whitespace-only input; a ``datetime`` for a valid
        DD/MM/YYYY date; an ``int`` for a digit-only string; otherwise the
        stripped string. A string that has the DD/MM/YYYY shape but is not a
        real calendar date (e.g. ``31/02/2024``) is returned as the stripped
        string instead of raising.
    """
    if not value or not value.strip():
        return None
    text = value.strip()

    date_match = DATE_RE.fullmatch(text)
    if date_match:
        day, month, year = (int(part) for part in date_match.groups())
        try:
            return datetime(year, month, day)
        except ValueError:
            # Impossible date entered in the CRF: keep the raw text so one bad
            # field does not abort the whole import run.
            return text

    if re.fullmatch(r"\d+", text):
        return int(text)
    return text
async def do_login(page):
    """Submit the EvaMed login form using the module-level credentials.

    Opens LOGIN_URL, fills the ``#login`` input and the first password input,
    clicks the "Connection" button and waits for the network to go idle.
    """
    await page.goto(LOGIN_URL)
    await page.wait_for_load_state("networkidle")

    username_input = page.locator("#login")
    await username_input.fill(LOGIN)

    password_input = page.locator('input[type="password"]').first
    await password_input.fill(PASSWORD)

    await page.click('input[value="Connection"]')
    await page.wait_for_load_state("networkidle")
async def get_form_ids(page):
    """Collect the forms shown on the filtered listing page.

    Navigates to LIST_URL and returns a list of dicts with the keys
    ``formId`` (digits taken from the ``id=`` part of each "Open form" link,
    as a string) and ``patientCode`` (text of the "Open directory" link in the
    same table row, or '' when there is none).
    """
    await page.goto(LIST_URL)
    await page.wait_for_load_state("networkidle")

    # The script runs in the browser; links without an id= parameter are skipped.
    listing_script = """() => {
const results = [];
document.querySelectorAll('a[title="Open form"]').forEach(a => {
const href = a.getAttribute('href') || '';
const m = href.match(/id=(\\d+)/);
if (!m) return;
// Patient code: "Open directory" link in the same row, text of the anchor
const row = a.closest('tr');
const dirLink = row ? row.querySelector('a[title="Open directory"]') : null;
const patientCode = dirLink ? dirLink.innerText.trim() : '';
results.push({ formId: m[1], patientCode: patientCode });
});
return results;
}"""
    form_infos = await page.evaluate(listing_script)
    return form_infos
async def extract_form_fields(page, form_id):
    """Open one AE form and return its fields as typed Python values.

    Each ``span.label`` inside ``.tableauFormulaire`` becomes a key; the text
    of its next sibling element (or None when missing/empty) is the value and
    is passed through ``parse_value``. The returned dict also carries
    ``_form_id`` as an int.
    """
    await page.goto(f"{BASE_URL}?module=dossier&class=file&event=show&id={form_id}#fiche")
    await page.wait_for_load_state("networkidle")

    scraped = await page.evaluate("""() => {
const fields = {};
document.querySelectorAll('.tableauFormulaire span.label').forEach(label => {
const key = label.innerText.trim();
const valEl = label.nextElementSibling;
fields[key] = valEl ? valEl.innerText.trim() || null : null;
});
return fields;
}""")

    # A key named '_patient_code' is kept verbatim; everything else is typed.
    parsed = {
        label: (text if label == '_patient_code' else parse_value(text))
        for label, text in scraped.items()
    }
    parsed['_form_id'] = int(form_id)
    return parsed
def upsert_ae(collection, doc, now):
    """Insert or update one Adverse Event record, recording field-level deltas.

    Records are keyed by ``patient_code`` + ``event_number`` (taken from
    ``doc['_patient_code']`` and ``doc['Event Number']``).

    Args:
        collection: Object offering ``find_one``, ``insert_one`` and
            ``update_one`` (a pymongo collection in production).
        doc: Scraped form fields plus the internal ``_patient_code`` and
            ``_form_id`` entries.
        now: Timestamp stamped on ``first_seen_at`` / ``last_seen_at`` and on
            history entries.

    On update, ``_form_id`` is refreshed as well: the deletion sweep in
    ``main`` compares stored ``_form_id`` values with the current listing, so
    a stale id would get a live record flagged as deleted on every run.
    """
    patient_code = doc.get('_patient_code') or ''
    event_number = doc.get('Event Number')
    key = {"patient_code": patient_code, "event_number": event_number}

    # Internal bookkeeping fields are not part of the tracked form data.
    internal_fields = {'_patient_code', '_form_id'}
    data = {k: v for k, v in doc.items() if k not in internal_fields}

    existing = collection.find_one(key)
    if existing is None:
        collection.insert_one({
            **key,
            "_form_id": doc['_form_id'],
            "data": data,
            "history": [],
            "first_seen_at": now,
            "last_seen_at": now,
            "deleted_at": None,
        })
        print(f" NEW {patient_code} AE#{event_number}")
        return

    # Delta: a field counts as changed when it was added, removed or modified.
    old_data = existing.get("data", {})
    changes = {}
    for k in set(data) | set(old_data):
        old_v = old_data.get(k)
        new_v = data.get(k)
        if old_v != new_v:
            changes[k] = {"old": old_v, "new": new_v}

    update = {"$set": {
        "last_seen_at": now,
        "deleted_at": None,  # the record is present again / still present
        "_form_id": doc['_form_id'],  # keep in sync with the listing
    }}
    if changes:
        update["$set"]["data"] = data
        update["$push"] = {"history": {"timestamp": now, "changes": changes}}
        print(f" CHANGED {patient_code} AE#{event_number} -> {list(changes.keys())}")
    else:
        print(f" ok {patient_code} AE#{event_number}")
    collection.update_one(key, update)
async def main():
    """Run one full AE import pass: scrape EvaMed, upsert into MongoDB.

    Restores a saved browser session when SESSION_FILE exists, logs in when the
    listing URL lands on the authentication module, scrapes every listed AE
    form, upserts each one, and finally stamps ``deleted_at`` on stored records
    whose ``_form_id`` is no longer in the listing.

    The browser and the MongoDB client are closed even if scraping raises, and
    the deletion sweep is skipped when the listing came back empty so a failed
    login or a changed page layout cannot flag every record as deleted.
    """
    mongo = MongoClient(MONGO_HOST)
    try:
        col = mongo[DB_NAME][COLLECTION]
        # One timestamp for the whole run, shared by every record touched.
        # NOTE(review): naive local time; pymongo presumably stores naive
        # datetimes as if they were UTC — confirm this is intended.
        now = datetime.now()

        async with async_playwright() as p:
            browser = await p.chromium.launch(headless=False)
            try:
                # Reuse saved session if available
                if SESSION_FILE.exists():
                    context = await browser.new_context(storage_state=str(SESSION_FILE))
                    print("Loaded saved session")
                else:
                    context = await browser.new_context(viewport={"width": 1400, "height": 900})
                page = await context.new_page()

                # Check if we need to log in: an expired session ends up on the
                # authentication module instead of the listing.
                await page.goto(LIST_URL)
                await page.wait_for_load_state("networkidle")
                if "authentification" in page.url:
                    print("Logging in...")
                    await do_login(page)
                    await context.storage_state(path=str(SESSION_FILE))
                    print("Session saved")
                else:
                    print("Session valid")

                # Get all AE form IDs from filtered list
                form_infos = await get_form_ids(page)
                current_ids = {info['formId'] for info in form_infos}
                print(f"Found {len(form_infos)} AE forms")

                # Scrape and upsert each form
                for info in form_infos:
                    fid = info['formId']
                    print(f"Scraping form {fid} ({info['patientCode']})...")
                    doc = await extract_form_fields(page, fid)
                    # Patient code comes from the list (more reliable than form page heading)
                    doc['_patient_code'] = info['patientCode']
                    upsert_ae(col, doc, now)

                # Mark as deleted any forms that disappeared from the list
                if not form_infos:
                    print("WARNING: listing returned no forms - skipping deletion sweep")
                else:
                    projection = {"_form_id": 1, "patient_code": 1, "event_number": 1}
                    for rec in col.find({"deleted_at": None}, projection):
                        # Listing ids are strings, stored ids are ints.
                        if str(rec.get('_form_id', '')) not in current_ids:
                            col.update_one({"_id": rec["_id"]}, {"$set": {"deleted_at": now}})
                            print(f" DELETED form_id={rec['_form_id']} ({rec.get('patient_code')} AE#{rec.get('event_number')})")
            finally:
                await browser.close()
    finally:
        mongo.close()

    print(f"\nDone — {len(form_infos)} forms processed at {now.isoformat()}")


if __name__ == "__main__":
    asyncio.run(main())
+184
View File
@@ -0,0 +1,184 @@
"""
Device Deficiency Import — scrapes DD forms from EvaMed DRY study and upserts into MongoDB.
Run repeatedly; only stores field-level changes (delta) in history[].
Unique key: _form_id (each DD form has a unique ID in EvaMed).
"""
import asyncio
import re
from datetime import datetime
from pathlib import Path
from playwright.async_api import async_playwright
from pymongo import MongoClient
# --- EvaMed endpoints and local configuration for the DD import ---------------
import os  # used only for the credential overrides below

BASE_URL = "https://prod.evamed.com/etude/soft/index.php"
LOGIN_URL = f"{BASE_URL}?module=authentification&class=login&client=myopowers-dry"
# Filtered listing: center_id=2, formtype=121, all records (l=ALL).
LIST_URL = f"{BASE_URL}?module=monitoring&class=formslisting&center_id=2&formtype=121&l=ALL"

# NOTE(security): these credentials were committed to version control. Set
# EVAMED_LOGIN / EVAMED_PASSWORD in the environment instead; the literals remain
# only as a backward-compatible fallback and should be rotated and removed.
LOGIN = os.environ.get("EVAMED_LOGIN", "vbuzalka")
PASSWORD = os.environ.get("EVAMED_PASSWORD", "Vlado9674+")

# MongoDB target: host, database and collection receiving the DD records.
MONGO_HOST = "192.168.1.76"
DB_NAME = "Dry"
COLLECTION = "DevDeficiency"

# Playwright storage state (cookies etc.) saved next to this script.
SESSION_FILE = Path(__file__).parent / "session.json"
DATE_RE = re.compile(r"^(\d{2})/(\d{2})/(\d{4})$")


def parse_value(value):
    """Turn the raw text of a form field into a typed Python value.

    Args:
        value: Field text as scraped from the page, or None.

    Returns:
        None when the input is None, empty or whitespace-only; a ``datetime``
        for a valid DD/MM/YYYY date; an ``int`` for a digit-only string; the
        stripped string otherwise. Text shaped like a date that is not a real
        calendar date (e.g. ``31/02/2024``) comes back as the stripped string
        rather than raising.
    """
    if not value or not value.strip():
        return None
    text = value.strip()

    date_match = DATE_RE.fullmatch(text)
    if date_match:
        day, month, year = (int(part) for part in date_match.groups())
        try:
            return datetime(year, month, day)
        except ValueError:
            # Keep impossible dates as text so a single bad field cannot
            # abort the whole import run.
            return text

    if re.fullmatch(r"\d+", text):
        return int(text)
    return text
async def do_login(page):
    """Authenticate against EvaMed with the module-level LOGIN / PASSWORD.

    Loads LOGIN_URL, types the user name into ``#login`` and the password into
    the first password input, clicks "Connection" and waits for network idle.
    """
    await page.goto(LOGIN_URL)
    await page.wait_for_load_state("networkidle")

    user_box = page.locator("#login")
    await user_box.fill(LOGIN)

    secret_box = page.locator('input[type="password"]').first
    await secret_box.fill(PASSWORD)

    await page.click('input[value="Connection"]')
    await page.wait_for_load_state("networkidle")
async def get_form_ids(page):
    """List the forms shown on the filtered DD listing page.

    Goes to LIST_URL and returns one dict per "Open form" link whose href
    contains ``id=<digits>``: ``formId`` holds those digits as a string and
    ``patientCode`` the text of the row's "Open directory" link ('' if absent).
    """
    await page.goto(LIST_URL)
    await page.wait_for_load_state("networkidle")

    # Evaluated inside the page; returns plain JSON-serialisable objects.
    collect_links_js = """() => {
const results = [];
document.querySelectorAll('a[title="Open form"]').forEach(a => {
const href = a.getAttribute('href') || '';
const m = href.match(/id=(\\d+)/);
if (!m) return;
const row = a.closest('tr');
const dirLink = row ? row.querySelector('a[title="Open directory"]') : null;
const patientCode = dirLink ? dirLink.innerText.trim() : '';
results.push({ formId: m[1], patientCode: patientCode });
});
return results;
}"""
    listed = await page.evaluate(collect_links_js)
    return listed
async def extract_form_fields(page, form_id):
    """Open one DD form and return its fields as typed Python values.

    Every ``span.label`` inside ``.tableauFormulaire`` supplies a key; the text
    of the element right after it (None when missing or empty) supplies the
    value, converted by ``parse_value``. ``_form_id`` is added as an int.
    """
    form_url = f"{BASE_URL}?module=dossier&class=file&event=show&id={form_id}#fiche"
    await page.goto(form_url)
    await page.wait_for_load_state("networkidle")

    scraped = await page.evaluate("""() => {
const fields = {};
document.querySelectorAll('.tableauFormulaire span.label').forEach(label => {
const key = label.innerText.trim();
const valEl = label.nextElementSibling;
fields[key] = valEl ? valEl.innerText.trim() || null : null;
});
return fields;
}""")

    typed = {label: parse_value(text) for label, text in scraped.items()}
    typed['_form_id'] = int(form_id)
    return typed
def upsert_dd(collection, doc, patient_code, now):
    """Insert or update one Device Deficiency record keyed by ``_form_id``.

    A new record is stored with an empty ``history``. For an existing record
    the stored ``data`` is compared field by field with the scraped one; when
    anything differs, ``data`` is replaced and a ``{timestamp, changes}`` entry
    is pushed to ``history``. ``last_seen_at``, ``deleted_at`` (reset to None)
    and ``patient_code`` are written on every update.
    """
    form_id = doc['_form_id']
    selector = {"_form_id": form_id}
    # Everything except the internal id is tracked form data.
    fresh = {name: value for name, value in doc.items() if name != '_form_id'}
    stored = collection.find_one(selector)

    if stored is None:
        new_record = dict(selector)
        new_record.update({
            "patient_code": patient_code,
            "data": fresh,
            "history": [],
            "first_seen_at": now,
            "last_seen_at": now,
            "deleted_at": None,
        })
        collection.insert_one(new_record)
        print(f" NEW {patient_code} DD form_id={form_id}")
        return

    previous = stored.get("data", {})
    changes = {}
    for name in set(fresh) | set(previous):
        before, after = previous.get(name), fresh.get(name)
        if before != after:
            changes[name] = {"old": before, "new": after}

    set_fields = {"last_seen_at": now, "deleted_at": None, "patient_code": patient_code}
    update = {"$set": set_fields}
    if not changes:
        print(f" ok {patient_code} DD form_id={form_id}")
    else:
        set_fields["data"] = fresh
        update["$push"] = {"history": {"timestamp": now, "changes": changes}}
        print(f" CHANGED {patient_code} DD form_id={form_id} -> {list(changes.keys())}")
    collection.update_one(selector, update)
async def main():
    """Run one full Device Deficiency import pass: scrape EvaMed, upsert into MongoDB.

    Restores a saved browser session when SESSION_FILE exists, logs in when the
    listing URL lands on the authentication module, scrapes every listed DD
    form, upserts each one, and finally stamps ``deleted_at`` on stored records
    whose ``_form_id`` is no longer in the listing.

    The browser and the MongoDB client are closed even if scraping raises, and
    the deletion sweep is skipped when the listing came back empty so a failed
    login or a changed page layout cannot flag every record as deleted.
    """
    mongo = MongoClient(MONGO_HOST)
    try:
        col = mongo[DB_NAME][COLLECTION]
        # One timestamp for the whole run, shared by every record touched.
        # NOTE(review): naive local time; pymongo presumably stores naive
        # datetimes as if they were UTC — confirm this is intended.
        now = datetime.now()

        async with async_playwright() as p:
            browser = await p.chromium.launch(headless=False)
            try:
                if SESSION_FILE.exists():
                    context = await browser.new_context(storage_state=str(SESSION_FILE))
                    print("Loaded saved session")
                else:
                    context = await browser.new_context(viewport={"width": 1400, "height": 900})
                page = await context.new_page()

                # An expired session ends up on the authentication module
                # instead of the listing.
                await page.goto(LIST_URL)
                await page.wait_for_load_state("networkidle")
                if "authentification" in page.url:
                    print("Logging in...")
                    await do_login(page)
                    await context.storage_state(path=str(SESSION_FILE))
                    print("Session saved")
                else:
                    print("Session valid")

                form_infos = await get_form_ids(page)
                current_ids = {info['formId'] for info in form_infos}
                print(f"Found {len(form_infos)} DD forms")

                for info in form_infos:
                    fid = info['formId']
                    print(f"Scraping form {fid} ({info['patientCode']})...")
                    doc = await extract_form_fields(page, fid)
                    upsert_dd(col, doc, info['patientCode'], now)

                # Flag records whose form vanished from the listing.
                if not form_infos:
                    print("WARNING: listing returned no forms - skipping deletion sweep")
                else:
                    for rec in col.find({"deleted_at": None}, {"_form_id": 1, "patient_code": 1}):
                        # Listing ids are strings, stored ids are ints.
                        if str(rec.get('_form_id', '')) not in current_ids:
                            col.update_one({"_id": rec["_id"]}, {"$set": {"deleted_at": now}})
                            print(f" DELETED form_id={rec['_form_id']} ({rec.get('patient_code')})")
            finally:
                await browser.close()
    finally:
        mongo.close()

    print(f"\nDone -- {len(form_infos)} forms processed at {now.isoformat()}")


if __name__ == "__main__":
    asyncio.run(main())
+169
View File
@@ -0,0 +1,169 @@
"""Exploration script — log in to the EvaMed CRF, load all forms, locate the AE ones."""
import asyncio
from playwright.async_api import async_playwright
# --- EvaMed endpoints and local configuration for the AE exploration ----------
import os  # used only for the credential overrides below

BASE_URL = "https://prod.evamed.com/etude/soft/index.php"
LOGIN_URL = "https://prod.evamed.com/etude/soft/index.php?module=authentification&class=login&client=myopowers-dry"

# NOTE(security): these credentials were committed to version control. Set
# EVAMED_LOGIN / EVAMED_PASSWORD in the environment instead; the literals remain
# only as a backward-compatible fallback and should be rotated and removed.
LOGIN = os.environ.get("EVAMED_LOGIN", "vbuzalka")
PASSWORD = os.environ.get("EVAMED_PASSWORD", "Vlado9674+")

# Output directory (relative to the working directory) for screenshots/HTML dumps.
SCREENSHOTS_DIR = "screenshots"
async def main():
    """Explore the EvaMed forms listing and dump the first Adverse Event form.

    Logs in, opens the forms listing with all records, counts rows per form
    code, and — when rows with form code 'AE' exist — opens the first one,
    prints several views of its fields and saves a screenshot plus the page
    HTML under SCREENSHOTS_DIR. Output is diagnostic only; nothing is returned.
    """
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=False)
        context = await browser.new_context(viewport={"width": 1400, "height": 900})
        page = await context.new_page()
        # 1. Login
        await page.goto(LOGIN_URL)
        await page.wait_for_load_state("networkidle")
        await page.locator('#login').fill(LOGIN)
        await page.locator('input[type="password"]').first.fill(PASSWORD)
        await page.click('text=Connection')
        await page.wait_for_load_state("networkidle")
        print("Logged in")
        # 2. Go to Forms list
        await page.goto(f"{BASE_URL}?module=monitoring&class=formslisting")
        await page.wait_for_load_state("networkidle")
        print("Forms list loaded (page 1)")
        # 3. Switch to ALL records
        await page.select_option('select[name="l"]', 'ALL')
        print("Switched to ALL, waiting for all rows to load...")
        # Wait (up to 120 s) until the page holds more than 3200 <tr> elements.
        # NOTE(review): 3200 is presumably the row count observed when this was
        # written — confirm before reusing on a listing of a different size.
        await page.wait_for_function(
            "() => document.querySelectorAll('tr').length > 3200",
            timeout=120000
        )
        print(f"All rows loaded")
        # 4. Count total rows and find AE rows
        # The script returns: totalRows, aeCount, the first 5 AE rows (aeRows)
        # and a per-form-code row count (formCodes).
        stats = await page.evaluate("""() => {
const rows = document.querySelectorAll('tr');
let totalRows = rows.length;
let aeRows = [];
let formCodes = {};
rows.forEach((row, idx) => {
const cells = Array.from(row.querySelectorAll('td'));
cells.forEach(cell => {
// collect all unique form codes from the Formcode column
});
// Look for AE in any cell
const cellTexts = cells.map(c => c.innerText.trim());
// Formcode is typically column index 8 based on the header
if (cells.length > 8) {
const formcode = cells[8]?.innerText?.trim();
formCodes[formcode] = (formCodes[formcode] || 0) + 1;
if (formcode === 'AE') {
aeRows.push({
rowIndex: idx,
subject: cells[0]?.innerText?.trim(),
formcode: formcode,
formName: cells[9]?.innerText?.trim(),
allCells: cellTexts,
links: Array.from(row.querySelectorAll('a')).map(a => ({
href: a.getAttribute('href'),
title: a.title || '',
text: a.innerText.trim().substring(0, 50)
}))
});
}
}
});
return { totalRows, aeCount: aeRows.length, aeRows: aeRows.slice(0, 5), formCodes };
}""")
        print(f"Total rows: {stats['totalRows']}")
        print(f"AE rows found: {stats['aeCount']}")
        print(f"Form codes: {stats['formCodes']}")
        if stats['aeRows']:
            print(f"\nFirst AE row sample:")
            ae = stats['aeRows'][0]
            print(f" Subject: {ae['subject']}")
            print(f" All cells: {ae['allCells']}")
            print(f" Links: {ae['links']}")
            # 5. Open first AE form: follow the first link in the row whose
            # href contains 'id=', then stop.
            for link in ae['links']:
                if link.get('href') and 'id=' in link['href']:
                    ae_url = link['href']
                    # Relative hrefs are resolved against the application directory.
                    if not ae_url.startswith('http'):
                        ae_url = f"https://prod.evamed.com/etude/soft/{ae_url}"
                    print(f"\nOpening AE form: {ae_url}")
                    await page.goto(ae_url)
                    await page.wait_for_load_state("networkidle")
                    await page.screenshot(path=f"{SCREENSHOTS_DIR}/05_ae_form.png", full_page=True)
                    print("Screenshot: AE form")
                    # Extract all fields from the form, trying three different
                    # DOM patterns; tableFields and inputs are capped at 50.
                    fields = await page.evaluate("""() => {
// Try input-group pattern
const groups = document.querySelectorAll('.input-group');
let inputGroupFields = Array.from(groups).map(g => ({
html: g.outerHTML.substring(0, 800),
text: g.innerText.trim().substring(0, 300)
}));
// Try label/value pattern in tableauFormulaire
const tableFields = [];
document.querySelectorAll('.tableauFormulaire td, .tableauFormulaire th').forEach(el => {
tableFields.push({
tag: el.tagName,
className: el.className,
text: el.innerText.trim().substring(0, 200)
});
});
// Try all form inputs
const inputs = [];
document.querySelectorAll('input, select, textarea').forEach(el => {
inputs.push({
type: el.type,
name: el.name,
id: el.id,
value: el.value?.substring(0, 200),
className: el.className
});
});
return { inputGroupFields, tableFields: tableFields.slice(0, 50), inputs: inputs.slice(0, 50) };
}""")
                    print(f"\nInput groups: {len(fields['inputGroupFields'])}")
                    for i, f in enumerate(fields['inputGroupFields'][:10]):
                        print(f" [{i}] {f['text'][:120]}")
                    print(f"\nTable fields: {len(fields['tableFields'])}")
                    for i, f in enumerate(fields['tableFields'][:20]):
                        print(f" [{i}] <{f['tag']} class='{f['className']}'> {f['text'][:100]}")
                    print(f"\nForm inputs: {len(fields['inputs'])}")
                    for i, f in enumerate(fields['inputs'][:20]):
                        print(f" [{i}] {f['type']} name={f['name']} id={f['id']} val={f['value'][:60] if f['value'] else ''}")
                    # Save full form HTML for analysis
                    form_html = await page.content()
                    with open(f"{SCREENSHOTS_DIR}/05_ae_form_full.html", "w", encoding="utf-8") as f:
                        f.write(form_html)
                    print("Saved: full AE form HTML")
                    break
        else:
            print("NO AE ROWS FOUND!")
            await page.screenshot(path=f"{SCREENSHOTS_DIR}/04_all_forms.png", full_page=False)
            # Dump all unique form codes for debugging
            print("Available form codes:", list(stats['formCodes'].keys())[:30])
        await browser.close()
    print("Done")


if __name__ == "__main__":
    import os
    # The output directory must exist before main() writes screenshots into it.
    os.makedirs(SCREENSHOTS_DIR, exist_ok=True)
    asyncio.run(main())
+165
View File
@@ -0,0 +1,165 @@
"""Exploration script — Device Deficiency forms in EvaMed DRY study."""
import asyncio
import json
from pathlib import Path
from playwright.async_api import async_playwright
# --- EvaMed endpoints and local configuration for the DD exploration ----------
import os  # used only for the credential overrides below

BASE_URL = "https://prod.evamed.com/etude/soft/index.php"
LOGIN_URL = f"{BASE_URL}?module=authentification&class=login&client=myopowers-dry"
# Filtered listing: center_id=2, formtype=121, all records (l=ALL).
LIST_URL = f"{BASE_URL}?module=monitoring&class=formslisting&center_id=2&formtype=121&l=ALL"

# NOTE(security): these credentials were committed to version control. Set
# EVAMED_LOGIN / EVAMED_PASSWORD in the environment instead; the literals remain
# only as a backward-compatible fallback and should be rotated and removed.
LOGIN = os.environ.get("EVAMED_LOGIN", "vbuzalka")
PASSWORD = os.environ.get("EVAMED_PASSWORD", "Vlado9674+")

# Output directory and saved Playwright session, both next to this script.
SCREENSHOTS_DIR = Path(__file__).parent / "screenshots_dd"
SESSION_FILE = Path(__file__).parent / "session.json"
async def main():
    """Explore the Device Deficiency listing and dump the first DD form.

    Restores or creates a browser session, logs in when redirected to the
    authentication module, lists the DD forms, then opens the first one and
    saves a screenshot, the page HTML and a JSON summary under SCREENSHOTS_DIR.
    Returns early (after saving the listing HTML) when no form is found.
    """
    SCREENSHOTS_DIR.mkdir(exist_ok=True)
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=False)
        # Reuse the stored session state when one has been saved before.
        if SESSION_FILE.exists():
            context = await browser.new_context(storage_state=str(SESSION_FILE))
            print("Loaded saved session")
        else:
            context = await browser.new_context(viewport={"width": 1400, "height": 900})
        page = await context.new_page()
        # Login if needed
        await page.goto(LIST_URL)
        await page.wait_for_load_state("networkidle")
        if "authentification" in page.url:
            print("Logging in...")
            await page.goto(LOGIN_URL)
            await page.wait_for_load_state("networkidle")
            await page.locator("#login").fill(LOGIN)
            await page.locator('input[type="password"]').first.fill(PASSWORD)
            await page.click('input[value="Connection"]')
            await page.wait_for_load_state("networkidle")
            await context.storage_state(path=str(SESSION_FILE))
            print("Session saved")
            # Return to the listing now that the session is authenticated.
            await page.goto(LIST_URL)
            await page.wait_for_load_state("networkidle")
        else:
            print("Session valid")
        await page.screenshot(path=str(SCREENSHOTS_DIR / "01_dd_listing.png"), full_page=False)
        print("Screenshot: DD listing")
        # Get all DD form links from the listing: one entry per "Open form"
        # link, with the form id, the patient code and the row's cell texts.
        form_infos = await page.evaluate("""() => {
const results = [];
document.querySelectorAll('a[title="Open form"]').forEach(a => {
const href = a.getAttribute('href') || '';
const m = href.match(/id=(\\d+)/);
if (!m) return;
const row = a.closest('tr');
const dirLink = row ? row.querySelector('a[title="Open directory"]') : null;
const patientCode = dirLink ? dirLink.innerText.trim() : '';
const cells = row ? Array.from(row.querySelectorAll('td')).map(c => c.innerText.trim()) : [];
results.push({ formId: m[1], patientCode, cells });
});
return results;
}""")
        print(f"\nFound {len(form_infos)} Device Deficiency forms")
        for i, info in enumerate(form_infos[:5]):
            print(f" [{i}] form_id={info['formId']} patient={info['patientCode']} cells={info['cells']}")
        if not form_infos:
            print("NO DD FORMS FOUND!")
            # Save HTML for debugging
            html = await page.content()
            (SCREENSHOTS_DIR / "01_dd_listing.html").write_text(html, encoding="utf-8")
            await browser.close()
            return
        # Open the first DD form
        first = form_infos[0]
        form_url = f"{BASE_URL}?module=dossier&class=file&event=show&id={first['formId']}#fiche"
        print(f"\nOpening DD form: {form_url}")
        await page.goto(form_url)
        await page.wait_for_load_state("networkidle")
        await page.screenshot(path=str(SCREENSHOTS_DIR / "02_dd_form.png"), full_page=True)
        print("Screenshot: DD form")
        # Extract fields using the same pattern as AE: each span.label paired
        # with its next sibling element (presumably a span.valeur — the value
        # element's class is captured in valueClass so this can be checked).
        fields_label_value = await page.evaluate("""() => {
const fields = [];
document.querySelectorAll('.tableauFormulaire span.label').forEach(label => {
const key = label.innerText.trim();
const valEl = label.nextElementSibling;
const value = valEl ? valEl.innerText.trim() : null;
const valClass = valEl ? valEl.className : '';
fields.push({ key, value, valueClass: valClass });
});
return fields;
}""")
        print(f"\n=== Fields (span.label -> span.valeur) : {len(fields_label_value)} ===")
        for f in fields_label_value:
            print(f" {f['key']:40s} = {f['value']}")
        # Also explore table structure for any additional patterns: per
        # .tableauFormulaire block, up to 30 non-empty rows with their cells
        # and the spans inside each cell.
        table_structure = await page.evaluate("""() => {
const sections = [];
document.querySelectorAll('.tableauFormulaire').forEach((table, ti) => {
const rows = [];
table.querySelectorAll('tr').forEach((tr, ri) => {
const cells = Array.from(tr.querySelectorAll('td, th')).map(c => ({
tag: c.tagName,
class: c.className,
colspan: c.colSpan,
text: c.innerText.trim().substring(0, 200),
childSpans: Array.from(c.querySelectorAll('span')).map(s => ({
class: s.className,
text: s.innerText.trim().substring(0, 200)
}))
}));
if (cells.length > 0) rows.push({ rowIndex: ri, cells });
});
sections.push({ tableIndex: ti, rowCount: rows.length, rows: rows.slice(0, 30) });
});
return sections;
}""")
        print(f"\n=== Table structure: {len(table_structure)} tableauFormulaire blocks ===")
        for sec in table_structure:
            print(f"\n Table #{sec['tableIndex']} ({sec['rowCount']} rows):")
            # Only the first 15 rows of each table are printed.
            for row in sec['rows'][:15]:
                for cell in row['cells']:
                    spans_info = " | ".join(f"[{s['class']}]{s['text'][:60]}" for s in cell['childSpans'])
                    print(f" row{row['rowIndex']} <{cell['tag']} class='{cell['class']}' colspan={cell['colspan']}> "
                          f"{cell['text'][:80]}")
                    if spans_info:
                        print(f" spans: {spans_info}")
        # Save full form HTML
        html = await page.content()
        (SCREENSHOTS_DIR / "02_dd_form.html").write_text(html, encoding="utf-8")
        print("\nSaved: full DD form HTML")
        # Save extracted data as JSON for easy review
        result = {
            "form_id": first['formId'],
            "patient_code": first['patientCode'],
            "listing_cells": first['cells'],
            "fields": fields_label_value,
            "table_structure": table_structure,
        }
        (SCREENSHOTS_DIR / "dd_form_data.json").write_text(
            json.dumps(result, indent=2, ensure_ascii=False), encoding="utf-8"
        )
        print("Saved: dd_form_data.json")
        await browser.close()
    print("\nDone")


if __name__ == "__main__":
    asyncio.run(main())
+131
View File
@@ -0,0 +1,131 @@
"""Exploration script — V0 Implantation Visit forms (formtype=10) in EvaMed DRY study."""
import asyncio
import json
from pathlib import Path
from playwright.async_api import async_playwright
# --- EvaMed endpoints and local configuration for the V0 exploration ----------
import os  # used only for the credential overrides below

BASE_URL = "https://prod.evamed.com/etude/soft/index.php"
LOGIN_URL = f"{BASE_URL}?module=authentification&class=login&client=myopowers-dry"
# Filtered listing: center_id=2, formtype=10, all records (l=ALL).
LIST_URL = f"{BASE_URL}?module=monitoring&class=formslisting&center_id=2&formtype=10&l=ALL"

# NOTE(security): these credentials were committed to version control. Set
# EVAMED_LOGIN / EVAMED_PASSWORD in the environment instead; the literals remain
# only as a backward-compatible fallback and should be rotated and removed.
LOGIN = os.environ.get("EVAMED_LOGIN", "vbuzalka")
PASSWORD = os.environ.get("EVAMED_PASSWORD", "Vlado9674+")

# Output directory and saved Playwright session, both next to this script.
SCREENSHOTS_DIR = Path(__file__).parent / "screenshots_surgery"
SESSION_FILE = Path(__file__).parent / "session.json"
async def main():
    """Explore the V0 Implantation Visit listing and dump the first form.

    Restores or creates a browser session, logs in when redirected to the
    authentication module, lists the forms, then opens the first one and saves
    a screenshot, the page HTML and a JSON summary under SCREENSHOTS_DIR.
    Returns early (after saving the listing HTML) when no form is found.
    """
    SCREENSHOTS_DIR.mkdir(exist_ok=True)
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=False)
        # Reuse the stored session state when one has been saved before.
        if SESSION_FILE.exists():
            context = await browser.new_context(storage_state=str(SESSION_FILE))
            print("Loaded saved session")
        else:
            context = await browser.new_context(viewport={"width": 1400, "height": 900})
        page = await context.new_page()
        # Login if needed
        await page.goto(LIST_URL)
        await page.wait_for_load_state("networkidle")
        if "authentification" in page.url:
            print("Logging in...")
            await page.goto(LOGIN_URL)
            await page.wait_for_load_state("networkidle")
            await page.locator("#login").fill(LOGIN)
            await page.locator('input[type="password"]').first.fill(PASSWORD)
            await page.click('input[value="Connection"]')
            await page.wait_for_load_state("networkidle")
            await context.storage_state(path=str(SESSION_FILE))
            print("Session saved")
            # Return to the listing now that the session is authenticated.
            await page.goto(LIST_URL)
            await page.wait_for_load_state("networkidle")
        else:
            print("Session valid")
        await page.screenshot(path=str(SCREENSHOTS_DIR / "01_listing.png"), full_page=False)
        print("Screenshot: listing")
        # Get all form links from the listing: one entry per "Open form" link,
        # with the form id, the patient code and the row's cell texts.
        form_infos = await page.evaluate("""() => {
const results = [];
document.querySelectorAll('a[title="Open form"]').forEach(a => {
const href = a.getAttribute('href') || '';
const m = href.match(/id=(\\d+)/);
if (!m) return;
const row = a.closest('tr');
const dirLink = row ? row.querySelector('a[title="Open directory"]') : null;
const patientCode = dirLink ? dirLink.innerText.trim() : '';
const cells = row ? Array.from(row.querySelectorAll('td')).map(c => c.innerText.trim()) : [];
results.push({ formId: m[1], patientCode, cells });
});
return results;
}""")
        print(f"\nFound {len(form_infos)} Implantation Visit forms")
        for i, info in enumerate(form_infos[:5]):
            print(f" [{i}] form_id={info['formId']} patient={info['patientCode']} cells={info['cells']}")
        if not form_infos:
            print("NO FORMS FOUND!")
            # Keep the listing HTML so the empty result can be investigated.
            html = await page.content()
            (SCREENSHOTS_DIR / "01_listing.html").write_text(html, encoding="utf-8")
            await browser.close()
            return
        # Open the first form
        first = form_infos[0]
        form_url = f"{BASE_URL}?module=dossier&class=file&event=show&id={first['formId']}#fiche"
        print(f"\nOpening form: {form_url}")
        await page.goto(form_url)
        await page.wait_for_load_state("networkidle")
        await page.screenshot(path=str(SCREENSHOTS_DIR / "02_form.png"), full_page=True)
        print("Screenshot: form")
        # Extract fields: each span.label paired with its next sibling element
        # (presumably a span.valeur — its class is captured in valueClass).
        fields_label_value = await page.evaluate("""() => {
const fields = [];
document.querySelectorAll('.tableauFormulaire span.label').forEach(label => {
const key = label.innerText.trim();
const valEl = label.nextElementSibling;
const value = valEl ? valEl.innerText.trim() : null;
const valClass = valEl ? valEl.className : '';
fields.push({ key, value, valueClass: valClass });
});
return fields;
}""")
        print(f"\n=== Fields (span.label -> span.valeur) : {len(fields_label_value)} ===")
        for f in fields_label_value:
            # Non-ASCII characters are replaced with '?' for printing only
            # (presumably to avoid console encoding errors); the saved JSON
            # below keeps the original text.
            key = f['key'].encode('ascii', 'replace').decode()
            val = (f['value'] or '').encode('ascii', 'replace').decode()
            print(f" {key:50s} = {val}")
        # Save full form HTML
        html = await page.content()
        (SCREENSHOTS_DIR / "02_form.html").write_text(html, encoding="utf-8")
        print("\nSaved: full form HTML")
        # Save extracted data as JSON
        result = {
            "form_id": first['formId'],
            "patient_code": first['patientCode'],
            "listing_cells": first['cells'],
            "fields": fields_label_value,
        }
        (SCREENSHOTS_DIR / "form_data.json").write_text(
            json.dumps(result, indent=2, ensure_ascii=False), encoding="utf-8"
        )
        print("Saved: form_data.json")
        await browser.close()
    print("\nDone")


if __name__ == "__main__":
    asyncio.run(main())
+108
View File
@@ -0,0 +1,108 @@
"""
EvaMed DRY study — form type selectors.
Usage:
Filter URL: formtype={value}
Checkbox: #formtype_{value}
Example:
await page.check('#formtype_120') # Adverse Event
await page.check('#formtype_121') # Device Deficiency
"""
# Maps each form type value (used as ``formtype={value}`` in the listing URL
# and as the ``#formtype_{value}`` checkbox id) to its display label.
# Entries are grouped by visit; the values themselves are not in numeric order.
FORMTYPES = {
    # Eligibility and V-3 baseline
    1: "Eligibility Criteria",
    8: "V-3 : Baseline Within 3 weeks before the implant procedure",
    122: "V-3 : Day 1 - Voiding diary",
    123: "V-3 : Day 2 - Voiding diary",
    124: "V-3 : Day 3 - Voiding diary",
    2: "V-3 : ICIQ-MLUTS",
    3: "V-3 : ICIQ-LUTSQol",
    4: "V-3 : EQ-5D-5L",
    5: "V-3 : MSHQ",
    # V-1, V0 and discharge
    9: "V-1 : Phone Call (1 Week before Implantation)",
    10: "V0 : Implantation Visit",
    11: "DV : Discharge Visit",
    # V1 to V4
    12: "V1 : 6 Weeks Post-Operative Period (Device Activation)",
    129: "V1 : Device Activation",
    125: "V1 : Physician Usability Questionnaire",
    13: "V2 : 8 Weeks Post-Operative Period",
    130: "V2 : Device Adjustment",
    127: "V2 : Physician Usability Questionnaire",
    14: "V3 : 3 Months Post-Operative Period",
    131: "V3 : Device Adjustment",
    128: "V3 : Physician Usability Questionnaire",
    15: "V4 - Phone Call (10 Weeks after device activation)",
    # V5
    16: "V5 : 3 Months Post-Device Activation",
    138: "V5 : Day 1 - Voiding diary",
    139: "V5 : Day 2 - Voiding diary",
    140: "V5 : Day 3 - Voiding diary",
    132: "V5 : Device Adjustment",
    135: "V5 : Unlocking additional modes",
    17: "V5 : ICIQ-MLUTS",
    18: "V5 : ICIQ-LUTSQol",
    19: "V5 : EQ-5D-5L",
    20: "V5 : MSHQ",
    21: "V5 : PGI-I",
    22: "V5 : Subject Usability Questionnaire",
    23: "V5 : Physician Usability Questionnaire",
    # V6 and V7
    24: "V6 : Phone Call (22 Weeks after Device activation)",
    25: "V7 : 6 Months Post-Device Activation",
    142: "V7 : Day 1 - Voiding diary",
    143: "V7 : Day 2 - Voiding diary",
    144: "V7 : Day 3 - Voiding diary",
    150: "V7 : Device Adjustment",
    180: "V7 - Unlocking additional mode",
    26: "V7 : ICIQ-MLUTS",
    27: "V7 : ICIQ-LUTSQol",
    29: "V7 : EQ-5D-5L",
    31: "V7 : MSHQ",
    32: "V7 : PGI-I",
    33: "V7 : Subject Usability Questionnaire",
    34: "V7 : Physician Usability Questionnaire",
    # V8 and V9
    35: "V8 : Phone Call (46 Weeks after Device activation)",
    36: "V9 : 12 Months Post-Device Activation",
    146: "V9 : Day 1 - Voiding diary",
    147: "V9 : Day 2 - Voiding diary",
    148: "V9 : Day 3 - Voiding diary",
    151: "V9 : Device Adjustment",
    181: "V9 - Unlocking additional mode",
    37: "V9 : ICIQ-MLUTS",
    38: "V9 : ICIQ-LUTSQol",
    39: "V9 : EQ-5D-5L",
    40: "V9 : MSHQ",
    41: "V9 : PGI-I",
    42: "V9 : Subject Usability Questionnaire",
    43: "V9 : Physician Usability Questionnaire",
    # V10
    44: "V10 : Long-term annual Follow-up",
    153: "V10 : Device Adjustment",
    162: "V10 : Unlocking additional modes",
    45: "V10 : ICIQ-MLUTS",
    47: "V10 : ICIQ-LUTSQol",
    48: "V10 : EQ-5D-5L",
    49: "V10 : MSHQ",
    50: "V10 : PGI-I",
    51: "V10 : Subject Usability Questionnaire",
    52: "V10 : Physician Usability Questionnaire",
    # V11
    53: "V11 : Long-term annual Follow-up",
    154: "V11 : Device Adjustment",
    163: "V11 : Unlocking additional modes",
    54: "V11 : ICIQ-MLUTS",
    55: "V11 : ICIQ-LUTSQol",
    56: "V11 : EQ-5D-5L",
    57: "V11 : MSHQ",
    58: "V11 : PGI-I",
    59: "V11 : Subject Usability Questionnaire",
    60: "V11 : Physician Usability Questionnaire",
    # Unscheduled visits
    119: "UV : Unscheduled Visit",
    183: "UV : Voiding diary - Day 1 to Day 3 (if applicable)",
    173: "UV : Device Adjustment",
    174: "UV : Unlocking additional modes",
    175: "UV : Physician Usability Questionnaire",
    # Forms not tied to a visit label
    177: "Concomitant Medication",
    120: "Adverse Event",
    121: "Device Deficiency",
    178: "Deviation",
    182: "Subsequent surgery",
    176: "Study Termination",
}
Binary file not shown.

After

Width:  |  Height:  |  Size: 29 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 29 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 34 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 61 KiB

+24
View File
@@ -0,0 +1,24 @@
<table style="margin: auto;">
<tbody><tr>
<td>
<a href="?module=monitoring&amp;class=formslisting&amp;center_code=&amp;patientfile_code=&amp;center_id=&amp;filetype=&amp;formtype=&amp;dateinc_inf=&amp;dateinc_sup=&amp;dateint_inf=&amp;dateint_sup=&amp;datet0_inf=&amp;datet0_sup=&amp;datefiche_inf=&amp;datefiche_sup=&amp;delai_inf=&amp;delai_sup=&amp;visits=&amp;status=&amp;pff_exists=&amp;nb_open_query_fields=&amp;tx_remplissage_inf=&amp;tx_remplissage_sup=&amp;l=0&amp;DOWNLOAD=Excel2007">
<img src="img/dl/ms-excel2007-128x128.png" alt="Download"><br>
MS Excel 2007 (.xlsx)</a>
</td>
<td>
<a href="?module=monitoring&amp;class=formslisting&amp;center_code=&amp;patientfile_code=&amp;center_id=&amp;filetype=&amp;formtype=&amp;dateinc_inf=&amp;dateinc_sup=&amp;dateint_inf=&amp;dateint_sup=&amp;datet0_inf=&amp;datet0_sup=&amp;datefiche_inf=&amp;datefiche_sup=&amp;delai_inf=&amp;delai_sup=&amp;visits=&amp;status=&amp;pff_exists=&amp;nb_open_query_fields=&amp;tx_remplissage_inf=&amp;tx_remplissage_sup=&amp;l=0&amp;DOWNLOAD=Excel5">
<img src="img/dl/ms-excel5-128x128.png" alt="Download"><br>
MS Excel 5 (.xls)</a>
</td>
<td>
<a href="?module=monitoring&amp;class=formslisting&amp;center_code=&amp;patientfile_code=&amp;center_id=&amp;filetype=&amp;formtype=&amp;dateinc_inf=&amp;dateinc_sup=&amp;dateint_inf=&amp;dateint_sup=&amp;datet0_inf=&amp;datet0_sup=&amp;datefiche_inf=&amp;datefiche_sup=&amp;delai_inf=&amp;delai_sup=&amp;visits=&amp;status=&amp;pff_exists=&amp;nb_open_query_fields=&amp;tx_remplissage_inf=&amp;tx_remplissage_sup=&amp;l=0&amp;DOWNLOAD=csv">
<img src="img/dl/csv-128x128.png" alt="Download"><br>
CSV (.csv)</a>
</td>
<td>
<a href="?module=monitoring&amp;class=formslisting&amp;center_code=&amp;patientfile_code=&amp;center_id=&amp;filetype=&amp;formtype=&amp;dateinc_inf=&amp;dateinc_sup=&amp;dateint_inf=&amp;dateint_sup=&amp;datet0_inf=&amp;datet0_sup=&amp;datefiche_inf=&amp;datefiche_sup=&amp;delai_inf=&amp;delai_sup=&amp;visits=&amp;status=&amp;pff_exists=&amp;nb_open_query_fields=&amp;tx_remplissage_inf=&amp;tx_remplissage_sup=&amp;l=0&amp;DOWNLOAD=XML">
<img src="img/dl/xml-128x128.png" alt="Download"><br>
XML (.xml)</a>
</td>
</tr>
</tbody></table>
Binary file not shown.

After

Width:  |  Height:  |  Size: 42 KiB

File diff suppressed because one or more lines are too long
Binary file not shown.

After

Width:  |  Height:  |  Size: 161 KiB

+107
View File
@@ -0,0 +1,107 @@
{
"form_id": "461",
"patient_code": "CZ1-01",
"listing_cells": [
"CZ1-01",
"Male",
"Subject",
"16/02/2024",
"28/02/2024",
"17/04/2024",
"CZ1 - Fakultní Thomayerova nemocnice",
"DD",
"Device Deficiency",
"0",
"0",
"30",
"",
"",
"",
"23/02/2025",
"Accepted",
"",
"",
"0",
"100",
"0",
"",
"",
"1",
"Open",
""
],
"fields": [
{
"key": "Date of Onset",
"value": "23/02/2025",
"valueClass": "valeur DTEONSET"
},
{
"key": "Date device deficiency discovered by site",
"value": "04/03/2025",
"valueClass": "valeur DTEDEVIC"
},
{
"key": "Title of device deficiency",
"value": null,
"valueClass": ""
},
{
"key": "1 - Type of Device Deficiency",
"value": "Malfunction (failure of device to operate as intended when used per IFU and protocol)",
"valueClass": "valeur "
},
{
"key": "Specify other type of device deficiency",
"value": null,
"valueClass": ""
},
{
"key": "2 - Description of Event",
"value": "Patient reported to site device malfunction. Device switched to emergency regime and kept being permanently open. It is not possible to control device. The sponsor informed abou the deficiency immediately. Patient has no pain, no urine retention, as per X-ray, positioning of control unit and cuff is correct.",
"valueClass": "valeur DESC"
},
{
"key": "3 - Information about Device(s) (name, lot and serial number)",
"value": "Remote control 052300185, control unit 210011, cuff 22-0224",
"valueClass": "valeur INFO"
},
{
"key": "4 - Consequence of Device Deficiency",
"value": "Led to adverse event",
"valueClass": "valeur "
},
{
"key": "Lead to AE number",
"value": null,
"valueClass": ""
},
{
"key": "Action(s) taken",
"value": "Other",
"valueClass": "valeur "
},
{
"key": "If other action(s), specify",
"value": "Implementation of the new sofware leading to continence, time to activation emergency regime increased from 8 to 12hours",
"valueClass": "valeur ACTIONP"
},
{
"key": "Event Outcome",
"value": "Resolved without Sequelae",
"valueClass": "valeur "
},
{
"key": "Event End Date",
"value": "07/04/2025",
"valueClass": "valeur DTEEND"
}
],
"table_structure": [
{
"tableIndex": 0,
"rowCount": 0,
"rows": []
}
]
}
Binary file not shown.

After

Width:  |  Height:  |  Size: 32 KiB

File diff suppressed because one or more lines are too long
Binary file not shown.

After

Width:  |  Height:  |  Size: 208 KiB

+385
View File
@@ -0,0 +1,385 @@
{
"form_id": "10",
"patient_code": "CZ1-01",
"listing_cells": [
"CZ1-01",
"Male",
"Subject",
"16/02/2024",
"28/02/2024",
"17/04/2024",
"CZ1 - Fakultní Thomayerova nemocnice",
"VISIT0",
"V0 : Implantation Visit",
"0",
"1",
"1",
"",
"",
"",
"28/02/2024",
"Accepted",
"",
"",
"0",
"",
"0",
"",
"",
"1",
"Open",
""
],
"fields": [
{
"key": "Date of surgery (or date of the attempt)",
"value": "28/02/2024",
"valueClass": "valeur DTET0"
},
{
"key": "Has the patient been implanted with ARTUS®?",
"value": "Yes",
"valueClass": "valeur "
},
{
"key": "How many implantation attempts have there been?",
"value": null,
"valueClass": ""
},
{
"key": "Please explain why and describe the difficulty(ies) experienced:",
"value": null,
"valueClass": ""
},
{
"key": "The results of the 24-hour Pad Weight Test have been entered in the Baseline visit to validate the inclusion of the subject",
"value": "Yes",
"valueClass": "valeur "
},
{
"key": "Was the clinical examination performed",
"value": "Yes",
"valueClass": "valeur "
},
{
"key": "Exam date",
"value": "27/02/2024",
"valueClass": "valeur DTEEXAM"
},
{
"key": "Weight",
"value": "114   kg",
"valueClass": "valeur WGT"
},
{
"key": "Body Mass Index",
"value": "32.3   kg/cm²",
"valueClass": "valeur BMI"
},
{
"key": "Explain why clinical examination was not performed",
"value": null,
"valueClass": ""
},
{
"key": "The charge of the TWO Remote Controls was done correctly before the surgery (for 5 hours each)",
"value": "Yes",
"valueClass": "valeur "
},
{
"key": "Describe the difficulty(ies) experienced and alternative or solution provided",
"value": null,
"valueClass": ""
},
{
"key": "All steps for the \"Start procedure\" of the two Remote Controls have been completed",
"value": "Yes",
"valueClass": "valeur "
},
{
"key": "Describe the difficulty(ies) experienced and alternative or solution provided",
"value": null,
"valueClass": ""
},
{
"key": "Comments",
"value": null,
"valueClass": ""
},
{
"key": "All steps for \"Pairing procedure\" of the Remote Control and the Control Unit have been completed",
"value": "Yes",
"valueClass": "valeur "
},
{
"key": "Describe the difficulty(ies) experienced and alternative or solution provided",
"value": null,
"valueClass": ""
},
{
"key": "Comments",
"value": null,
"valueClass": ""
},
{
"key": "All the steps of the 'Calibration procedure' have been completed",
"value": "Yes",
"valueClass": "valeur "
},
{
"key": "Describe the difficulty(ies) experienced and alternative or solution provided",
"value": null,
"valueClass": ""
},
{
"key": "Comments",
"value": null,
"valueClass": ""
},
{
"key": "LOT of the ARTUS® Cuff S",
"value": "22-0224",
"valueClass": "valeur CUFF"
},
{
"key": "SN of the ARTUS® Control Unit",
"value": "210011",
"valueClass": "valeur CONTSN"
},
{
"key": "SN of the ARTUS® Remote Control #1",
"value": "052300185",
"valueClass": "valeur REMOSN"
},
{
"key": "SN of the ARTUS® Remote Control #2",
"value": null,
"valueClass": ""
},
{
"key": "The Back-up Material was used",
"value": "No",
"valueClass": "valeur "
},
{
"key": "LOT of the ARTUS® Cuff S",
"value": null,
"valueClass": ""
},
{
"key": "SN of the ARTUS® Control Unit",
"value": null,
"valueClass": ""
},
{
"key": "SN of the ARTUS® Remote Control",
"value": null,
"valueClass": ""
},
{
"key": "Beginning of the surgery, start of the incision (hh:mm)",
"value": "10:00",
"valueClass": "valeur START"
},
{
"key": "End of the surgery, closure of the incision (hh:mm)",
"value": "11:15",
"valueClass": "valeur STOP"
},
{
"key": "What type of anesthesia performed ?",
"value": "General",
"valueClass": "valeur "
},
{
"key": "Size of Foley catheter used",
"value": "14 CH",
"valueClass": "valeur "
},
{
"key": "Did a surgical dissection of the bulbospongiosus muscle has been performed?",
"value": null,
"valueClass": ""
},
{
"key": "Locking Position of the cuff around the urethra",
"value": "1",
"valueClass": "valeur "
},
{
"key": "Tightening of the urethra",
"value": null,
"valueClass": ""
},
{
"key": "If you have encountered difficulties, please describe the difficulty(ies) experienced and alternative or solution provided",
"value": "no difficulties",
"valueClass": "valeur ENCOUNT"
},
{
"key": "Comment",
"value": null,
"valueClass": ""
},
{
"key": "All steps for the \"Surgery Test procedure\" have been completed",
"value": "Yes",
"valueClass": "valeur "
},
{
"key": "Describe the difficulty(ies) experienced and alternative or solution provided",
"value": null,
"valueClass": ""
},
{
"key": "Comments",
"value": null,
"valueClass": ""
},
{
"key": "All steps for the \"Implantation of the control unit\" have been completed",
"value": "Yes",
"valueClass": "valeur "
},
{
"key": "Describe the difficulty(ies) experienced and alternative or solution provided",
"value": null,
"valueClass": ""
},
{
"key": "Comments",
"value": null,
"valueClass": ""
},
{
"key": "All steps for the \"Completion of implantation\" have been completed",
"value": "Yes",
"valueClass": "valeur "
},
{
"key": "Describe the difficulty(ies) experienced and alternative or solution provided",
"value": null,
"valueClass": ""
},
{
"key": "Comments",
"value": null,
"valueClass": ""
},
{
"key": "Was a Picture of the Cuff final position around the urethra taken intra-operatively",
"value": "Yes",
"valueClass": "valeur "
},
{
"key": "Was a Picture of the Control Unit final position taken intra-operatively",
"value": "Yes",
"valueClass": "valeur "
},
{
"key": "Was the Pelvis radiography performed?",
"value": "Yes",
"valueClass": "valeur "
},
{
"key": "Date of most recent pelvis radiography",
"value": "04/03/2024",
"valueClass": "valeur DTEPEL"
},
{
"key": "Was the Usability Questionnaire Surgeon (intra-operative) performed",
"value": "Yes",
"valueClass": "valeur "
},
{
"key": "Explain why Usability Questionnaire Surgeon was not performed",
"value": null,
"valueClass": ""
},
{
"key": "1. The implantation of Artus® is technically simple",
"value": "2 : Agree",
"valueClass": "valeur "
},
{
"key": "2. The handling of the shell screwed on the end of the transmission cable is easy",
"value": "2 : Agree",
"valueClass": "valeur "
},
{
"key": "3. The cuff is easy to insert around the urethra",
"value": "2 : Agree",
"valueClass": "valeur "
},
{
"key": "4. Positioning the Cuff around the urethra is easy to perform",
"value": "3 : Not sure",
"valueClass": "valeur "
},
{
"key": "5. The Cuff loop around the urethra is easy to adjust",
"value": "2 : Agree",
"valueClass": "valeur "
},
{
"key": "6. The implantation technique of the Control Unit in the abdominal wall is easy to perform",
"value": "2 : Agree",
"valueClass": "valeur "
},
{
"key": "7. The fixing technique for the Control Unit in the abdominal wall is easy to perform",
"value": "2 : Agree",
"valueClass": "valeur "
},
{
"key": "8. The connection of the transmission cable and the Control Unit is easy to perform",
"value": "2 : Agree",
"valueClass": "valeur "
},
{
"key": "9. The remote control is easy to manipulate in the sterile bag",
"value": "2 : Agree",
"valueClass": "valeur "
},
{
"key": "10. The sequence to be carried out to reach the screen proposing the password to access the Physician interface is simple",
"value": "2 : Agree",
"valueClass": "valeur "
},
{
"key": "11. Matching between the Remote Control and the implanted Control Unit is easy to perform",
"value": "2 : Agree",
"valueClass": "valeur "
},
{
"key": "12. Visual evaluation of the correct operation of the device (Surgery mode) is easy to perform",
"value": "2 : Agree",
"valueClass": "valeur "
},
{
"key": "Do you have any comments regarding the intra-operative ARTUS implant and its use",
"value": "No",
"valueClass": "valeur "
},
{
"key": "Please complete your comment below",
"value": null,
"valueClass": ""
},
{
"key": "Device deficiency(ies) occurred during the procedure",
"value": "No",
"valueClass": "valeur "
},
{
"key": "Is the subject receiving any concomitant medication",
"value": "Yes",
"valueClass": "valeur "
},
{
"key": "Did new adverse event(s) occur since the informed consent signature",
"value": "No",
"valueClass": "valeur "
}
]
}
+185
View File
@@ -0,0 +1,185 @@
"""
Surgery (V0 Implantation Visit) Import — scrapes from EvaMed DRY study and upserts into MongoDB.
Run repeatedly; only stores field-level changes (delta) in history[].
Unique key: _form_id (each form has a unique ID in EvaMed).
MongoDB: db=Dry, collection=Surgery
"""
import asyncio
import os
import re
from datetime import datetime
from pathlib import Path

from playwright.async_api import async_playwright
from pymongo import MongoClient
# --- EvaMed endpoints ---------------------------------------------------
BASE_URL = "https://prod.evamed.com/etude/soft/index.php"
LOGIN_URL = f"{BASE_URL}?module=authentification&class=login&client=myopowers-dry"
# Pre-filtered forms listing: center_id=2, formtype=10, l=ALL.
# NOTE(review): formtype=10 is presumably the "V0 : Implantation Visit" form
# type described in the module docstring -- confirm against the form-type map.
LIST_URL = f"{BASE_URL}?module=monitoring&class=formslisting&center_id=2&formtype=10&l=ALL"

# --- Credentials / infrastructure ---------------------------------------
# SECURITY: secrets must not live in source control. Environment variables
# take precedence; the literals remain only as fallbacks so existing runs keep
# working. TODO: rotate the password and drop the fallbacks.
LOGIN = os.environ.get("EVAMED_LOGIN", "vbuzalka")
PASSWORD = os.environ.get("EVAMED_PASSWORD", "Vlado9674+")
MONGO_HOST = os.environ.get("EVAMED_MONGO_HOST", "192.168.1.76")

# --- MongoDB target -----------------------------------------------------
DB_NAME = "Dry"
COLLECTION = "Surgery"

# Playwright storage state (cookies etc.) persisted next to this script so a
# later run can reuse the authenticated session.
SESSION_FILE = Path(__file__).parent / "session.json"

# Matches exactly DD/MM/YYYY; groups are (day, month, year).
DATE_RE = re.compile(r"^(\d{2})/(\d{2})/(\d{4})$")
def parse_value(value):
    """Convert one scraped field string into a typed Python value.

    Rules, applied in order to the stripped text:

    * empty / whitespace-only / ``None``  -> ``None``
    * ``DD/MM/YYYY`` naming a real calendar date -> ``datetime`` (midnight)
    * digit-only text -> ``int``, except identifier-like values
    * anything else -> the stripped ``str``

    Digit-only text is kept as ``str`` when converting it would lose or
    break information:

    * it has a leading zero (e.g. a serial number such as ``"052300185"``),
      which ``int()`` would silently drop;
    * it exceeds the signed 64-bit range, which cannot be stored as a BSON
      integer.

    Text that looks like a date but is not a valid one (e.g. ``31/02/2024``)
    is returned unchanged as ``str`` instead of raising ``ValueError`` and
    aborting the whole import run.
    """
    if not value or not value.strip():
        return None
    text = value.strip()

    date_match = DATE_RE.fullmatch(text)
    if date_match:
        day, month, year = (int(part) for part in date_match.groups())
        try:
            return datetime(year, month, day)
        except ValueError:
            # Right shape, impossible date: keep the raw text.
            return text

    if re.fullmatch(r"\d+", text):
        # A leading zero marks an identifier, not a quantity.
        if len(text) > 1 and text.startswith("0"):
            return text
        number = int(text)
        int64_max = 2 ** 63 - 1  # largest integer BSON can hold
        return number if number <= int64_max else text

    return text
async def do_login(page):
    """Open the EvaMed login page and submit the module-level credentials.

    Fills the ``#login`` input with ``LOGIN``, the first password input with
    ``PASSWORD``, clicks the "Connection" button and waits for the network to
    go idle. No check is made here that authentication actually succeeded.
    """
    await page.goto(LOGIN_URL)
    await page.wait_for_load_state("networkidle")

    username_input = page.locator("#login")
    await username_input.fill(LOGIN)

    password_input = page.locator('input[type="password"]').first
    await password_input.fill(PASSWORD)

    submit_selector = 'input[value="Connection"]'
    await page.click(submit_selector)
    await page.wait_for_load_state("networkidle")
async def get_form_ids(page):
    """Load the filtered forms listing and collect one entry per form link.

    Returns a list of ``{"formId": str, "patientCode": str}`` dicts. For
    every ``a[title="Open form"]`` link whose href contains ``id=<digits>``,
    the digits become ``formId``; ``patientCode`` is the text of the
    ``a[title="Open directory"]`` link in the same table row, or ``""`` when
    the row or that link is missing.
    """
    # Runs inside the browser page; the result is returned to Python.
    collect_form_links_js = """() => {
const results = [];
document.querySelectorAll('a[title="Open form"]').forEach(a => {
const href = a.getAttribute('href') || '';
const m = href.match(/id=(\\d+)/);
if (!m) return;
const row = a.closest('tr');
const dirLink = row ? row.querySelector('a[title="Open directory"]') : null;
const patientCode = dirLink ? dirLink.innerText.trim() : '';
results.push({ formId: m[1], patientCode: patientCode });
});
return results;
}"""
    await page.goto(LIST_URL)
    await page.wait_for_load_state("networkidle")
    form_refs = await page.evaluate(collect_form_links_js)
    return form_refs
async def extract_form_fields(page, form_id):
    """Open one form and return its fields as a ``{label: typed value}`` dict.

    Every ``span.label`` inside ``.tableauFormulaire`` is read together with
    the element that follows it. The following element's trimmed text is the
    raw value, or ``None`` when it is empty or absent. Raw values are typed
    with ``parse_value``. The result also carries ``'_form_id'`` set to
    ``int(form_id)``.

    Labels are not unique within a form (e.g. "Comments", or the primary and
    back-up "LOT of the ARTUS® Cuff S"). Collecting them into a plain mapping
    lets a later, often empty, field overwrite an earlier filled one. The
    fields are therefore gathered as an ordered list of pairs: the first
    occurrence keeps the bare label and repeats are stored as
    ``"<label> (2)"``, ``"<label> (3)"``, and so on.
    """
    url = f"{BASE_URL}?module=dossier&class=file&event=show&id={form_id}#fiche"
    await page.goto(url)
    await page.wait_for_load_state("networkidle")

    # An array of [label, value] pairs keeps duplicates and document order.
    pairs = await page.evaluate("""() => {
        const pairs = [];
        document.querySelectorAll('.tableauFormulaire span.label').forEach(label => {
            const key = label.innerText.trim();
            const valEl = label.nextElementSibling;
            pairs.push([key, valEl ? valEl.innerText.trim() || null : null]);
        });
        return pairs;
    }""")

    parsed = {}
    for label, raw_value in pairs:
        key = label
        occurrence = 1
        # Find the first free key; also guards against a genuine label that
        # already looks like "<label> (2)".
        while key in parsed:
            occurrence += 1
            key = f"{label} ({occurrence})"
        parsed[key] = parse_value(raw_value)

    parsed['_form_id'] = int(form_id)
    return parsed
def upsert(collection, doc, patient_code, now):
    """Insert a scraped form, or update it and log field-level changes.

    The record is looked up by ``_form_id``. When none exists, a new document
    is inserted with the scraped fields under ``data``, an empty ``history``,
    ``first_seen_at`` / ``last_seen_at`` set to ``now`` and ``deleted_at``
    set to ``None``.

    Otherwise ``last_seen_at``, ``deleted_at`` (reset to ``None``) and
    ``patient_code`` are always refreshed. If any field differs from the
    stored ``data``, ``data`` is replaced and a ``{timestamp, changes}`` entry
    mapping each changed field to ``{"old", "new"}`` is pushed onto
    ``history``. A field missing on either side is reported as ``None``.
    """
    form_id = doc['_form_id']
    key = {"_form_id": form_id}

    # Everything except the identifier is payload.
    data = dict(doc)
    data.pop('_form_id', None)

    existing = collection.find_one(key)
    if existing is None:
        new_record = dict(key)
        new_record["patient_code"] = patient_code
        new_record["data"] = data
        new_record["history"] = []
        new_record["first_seen_at"] = now
        new_record["last_seen_at"] = now
        new_record["deleted_at"] = None
        collection.insert_one(new_record)
        print(f" NEW {patient_code} Surgery form_id={form_id}")
        return

    old_data = existing.get("data", {})
    changes = {
        field: {"old": old_data.get(field), "new": data.get(field)}
        for field in set(data) | set(old_data)
        if old_data.get(field) != data.get(field)
    }

    fields_to_set = {"last_seen_at": now, "deleted_at": None, "patient_code": patient_code}
    update = {"$set": fields_to_set}
    if not changes:
        print(f" ok {patient_code} Surgery form_id={form_id}")
    else:
        fields_to_set["data"] = data
        update["$push"] = {"history": {"timestamp": now, "changes": changes}}
        print(f" CHANGED {patient_code} Surgery form_id={form_id} -> {list(changes.keys())}")
    collection.update_one(key, update)
async def main():
    """Scrape every listed Surgery form and sync it into MongoDB.

    Steps:

    1. Start Chromium, reusing the saved session state when ``SESSION_FILE``
       exists, and log in if the listing URL lands on the authentication page.
    2. Collect the listed forms, scrape each one and ``upsert`` it.
    3. Soft-delete (set ``deleted_at``) stored records whose form is no
       longer listed.

    Safeguards:

    * The browser and the Mongo client are closed even when scraping raises.
    * If the listing still lands on the authentication page after the login
      attempt, the run aborts. The session is only saved once the listing
      has loaded.
    * An empty listing skips the soft-delete sweep, so a failed page load or
      changed markup cannot flag every stored record as deleted.

    Raises:
        RuntimeError: the listing still shows the authentication page after
            the login attempt.
    """
    mongo = MongoClient(MONGO_HOST)
    try:
        col = mongo[DB_NAME][COLLECTION]
        now = datetime.now()
        async with async_playwright() as p:
            browser = await p.chromium.launch(headless=False)
            try:
                if SESSION_FILE.exists():
                    context = await browser.new_context(storage_state=str(SESSION_FILE))
                    print("Loaded saved session")
                else:
                    context = await browser.new_context(viewport={"width": 1400, "height": 900})
                page = await context.new_page()

                # An unauthenticated request for the listing ends up on the
                # authentication module, which is how a stale session shows.
                await page.goto(LIST_URL)
                await page.wait_for_load_state("networkidle")
                logged_in_now = "authentification" in page.url
                if logged_in_now:
                    print("Logging in...")
                    await do_login(page)
                else:
                    print("Session valid")

                form_infos = await get_form_ids(page)
                # get_form_ids navigated to the listing again; landing on the
                # authentication page now means the login did not work.
                if "authentification" in page.url:
                    raise RuntimeError(
                        "EvaMed login failed: forms listing still redirects to the authentication page"
                    )
                if logged_in_now:
                    await context.storage_state(path=str(SESSION_FILE))
                    print("Session saved")

                current_ids = {info['formId'] for info in form_infos}
                print(f"Found {len(form_infos)} Surgery forms")
                for info in form_infos:
                    fid = info['formId']
                    print(f"Scraping form {fid} ({info['patientCode']})...")
                    doc = await extract_form_fields(page, fid)
                    upsert(col, doc, info['patientCode'], now)

                if form_infos:
                    # Stored ids are ints, listing ids are strings.
                    for rec in col.find({"deleted_at": None}, {"_form_id": 1, "patient_code": 1}):
                        if str(rec.get('_form_id', '')) not in current_ids:
                            col.update_one({"_id": rec["_id"]}, {"$set": {"deleted_at": now}})
                            print(f" DELETED form_id={rec['_form_id']} ({rec.get('patient_code')})")
                else:
                    print("WARNING: listing returned no forms -- skipping deletion sweep")
            finally:
                await browser.close()
    finally:
        mongo.close()
    print(f"\nDone -- {len(form_infos)} forms processed at {now.isoformat()}")


if __name__ == "__main__":
    asyncio.run(main())