205 lines
6.8 KiB
Python
205 lines
6.8 KiB
Python
"""
|
|
AE Import — scrapes Adverse Events from EvaMed DRY study and upserts into MongoDB.
|
|
|
|
Run repeatedly; only stores field-level changes (delta) in history[].
|
|
Unique key: patient_code + event_number.
|
|
"""
|
|
|
|
import asyncio
|
|
import re
|
|
from datetime import datetime
|
|
from pathlib import Path
|
|
|
|
from playwright.async_api import async_playwright
|
|
from pymongo import MongoClient
|
|
|
|
# Single EvaMed entry script; the individual pages are selected via query parameters.
BASE_URL = "https://prod.evamed.com/etude/soft/index.php"
LOGIN_URL = f"{BASE_URL}?module=authentification&class=login&client=myopowers-dry"
# Direct filtered URL: CZ1 (center_id=2), Adverse Event (formtype=120), all records.
# The separator before center_id must be a literal "&": "&cent" is easily mangled
# into the cent sign when the text passes through an HTML entity decoder.
LIST_URL = f"{BASE_URL}?module=monitoring&class=formslisting&center_id=2&formtype=120&l=ALL"

# NOTE(review): credentials are hard-coded in source — consider moving them to
# environment variables or a secret store.
LOGIN = "vbuzalka"
PASSWORD = "Vlado9674+"

MONGO_HOST = "192.168.1.76"
DB_NAME = "Dry"
COLLECTION = "AE"
# Browser storage state (cookies etc.) is cached next to this script between runs.
SESSION_FILE = Path(__file__).parent / "session.json"

# Whole-string DD/MM/YYYY matcher; groups are (day, month, year).
DATE_RE = re.compile(r"^(\d{2})/(\d{2})/(\d{4})$")
|
|
|
|
|
|
def parse_value(value):
    """Convert a scraped field string into a typed Python value.

    Args:
        value: raw text of a form field, or None.

    Returns:
        None if *value* is None, empty or whitespace-only;
        a datetime if the stripped text is DD/MM/YYYY and names a real calendar date;
        an int if the stripped text consists only of digits;
        otherwise the stripped string.
    """
    if not value or not value.strip():
        return None
    v = value.strip()

    m = DATE_RE.fullmatch(v)
    if m:
        day, month, year = (int(part) for part in m.groups())
        try:
            return datetime(year, month, day)
        except ValueError:
            # Date-shaped but not a real date (e.g. 00/00/0000 or 31/02/2024):
            # keep the raw text instead of aborting the whole import run.
            return v

    if re.fullmatch(r"\d+", v):
        return int(v)
    return v
|
|
|
|
|
|
async def do_login(page):
    """Open the login page on *page*, fill in LOGIN / PASSWORD and submit the form."""
    await page.goto(LOGIN_URL)
    await page.wait_for_load_state("networkidle")

    # Resolve both inputs up front; locators are lazy, so nothing is queried until fill().
    username_box = page.locator("#login")
    password_box = page.locator('input[type="password"]').first
    await username_box.fill(LOGIN)
    await password_box.fill(PASSWORD)

    await page.click('input[value="Connection"]')
    await page.wait_for_load_state("networkidle")
|
|
|
|
|
|
async def get_form_ids(page):
    """Load the filtered forms listing and collect one entry per 'Open form' link.

    Returns:
        A list of dicts with keys ``formId`` (the digits captured from the
        link's href, as a string) and ``patientCode`` (text of the
        'Open directory' link in the same table row, or '' if there is none).
        Links whose href has no ``id=<digits>`` part are skipped.
    """
    # Runs in the page: walks every "Open form" anchor and pairs it with its row.
    collect_links_js = """() => {
        const results = [];
        document.querySelectorAll('a[title="Open form"]').forEach(a => {
            const href = a.getAttribute('href') || '';
            const m = href.match(/id=(\\d+)/);
            if (!m) return;
            // Patient code: "Open directory" link in the same row, text of the anchor
            const row = a.closest('tr');
            const dirLink = row ? row.querySelector('a[title="Open directory"]') : null;
            const patientCode = dirLink ? dirLink.innerText.trim() : '';
            results.push({ formId: m[1], patientCode: patientCode });
        });
        return results;
    }"""

    await page.goto(LIST_URL)
    await page.wait_for_load_state("networkidle")

    link_rows = await page.evaluate(collect_links_js)
    return link_rows
|
|
|
|
|
|
async def extract_form_fields(page, form_id):
    """Open the AE form *form_id* and return its labelled fields as a dict.

    Each ``span.label`` inside ``.tableauFormulaire`` becomes a key; the value is
    the text of the element that follows it, typed via parse_value().
    The integer form id is added under ``_form_id``.
    """
    await page.goto(f"{BASE_URL}?module=dossier&class=file&event=show&id={form_id}#fiche")
    await page.wait_for_load_state("networkidle")

    label_texts = await page.evaluate("""() => {
        const fields = {};
        document.querySelectorAll('.tableauFormulaire span.label').forEach(label => {
            const key = label.innerText.trim();
            const valEl = label.nextElementSibling;
            fields[key] = valEl ? valEl.innerText.trim() || null : null;
        });
        return fields;
    }""")

    # '_patient_code' is passed through verbatim; everything else gets a Python type.
    parsed = {
        label: (text if label == '_patient_code' else parse_value(text))
        for label, text in label_texts.items()
    }
    parsed['_form_id'] = int(form_id)
    return parsed
|
|
|
|
|
|
|
|
|
|
|
|
def upsert_ae(collection, doc, now):
    """Insert or update one Adverse Event record, keeping a field-level change log.

    Args:
        collection: object offering pymongo-style ``find_one``, ``insert_one``
            and ``update_one``.
        doc: scraped form dict. ``_patient_code`` and ``Event Number`` form the
            unique key; ``_form_id`` is required when the record is new. All
            keys other than ``_patient_code`` / ``_form_id`` are tracked data.
        now: timestamp recorded as first/last seen and on history entries.

    Behaviour:
        * No record for the key: insert it with an empty ``history``.
        * Record exists: refresh ``last_seen_at``, clear ``deleted_at`` and bring
          ``_form_id`` up to date; if any tracked field differs, replace ``data``
          and push ``{"timestamp", "changes"}`` onto ``history``.
    """
    patient_code = doc.get('_patient_code') or ''
    event_number = doc.get('Event Number')
    key = {"patient_code": patient_code, "event_number": event_number}

    # Fields we track changes for (exclude internal fields)
    skip = {'_patient_code', '_form_id'}
    data = {k: v for k, v in doc.items() if k not in skip}

    existing = collection.find_one(key)
    if existing is None:
        collection.insert_one({
            **key,
            "_form_id": doc['_form_id'],
            "data": data,
            "history": [],
            "first_seen_at": now,
            "last_seen_at": now,
            "deleted_at": None,
        })
        print(f" NEW {patient_code} AE#{event_number}")
        return

    # Delta: compare data with stored data (sorted so log/history order is stable)
    old_data = existing.get("data", {})
    changes = {
        k: {"old": old_data.get(k), "new": data.get(k)}
        for k in sorted(set(data) | set(old_data))
        if old_data.get(k) != data.get(k)
    }

    update = {"$set": {"last_seen_at": now, "deleted_at": None}}
    if '_form_id' in doc:
        # Keep the stored form id current: the deletion sweep matches records by
        # _form_id, so a stale id would get a live record flagged as deleted.
        update["$set"]["_form_id"] = doc['_form_id']

    if changes:
        update["$set"]["data"] = data
        update["$push"] = {"history": {"timestamp": now, "changes": changes}}
        print(f" CHANGED {patient_code} AE#{event_number} -> {list(changes.keys())}")
    else:
        print(f" ok {patient_code} AE#{event_number}")

    collection.update_one(key, update)
|
|
|
|
|
|
async def main():
    """Run one import pass: scrape every listed AE form, upsert it, flag vanished ones.

    Steps:
        1. Open a Chromium context, reusing SESSION_FILE when it exists.
        2. Log in if the listing URL redirects to the authentication module,
           then save the session state.
        3. Scrape each listed form and upsert it into the AE collection.
        4. Set ``deleted_at`` on stored records whose form id is no longer listed.

    Raises:
        RuntimeError: if the listing still redirects to the login page after the
            login attempt. Without this check an empty listing would soft-delete
            every stored record.
    """
    mongo = MongoClient(MONGO_HOST)
    try:
        col = mongo[DB_NAME][COLLECTION]
        # NOTE(review): naive local timestamp; PyMongo treats naive datetimes as
        # UTC when storing — confirm this is intended.
        now = datetime.now()

        async with async_playwright() as p:
            browser = await p.chromium.launch(headless=False)
            try:
                # Same viewport whether or not a saved session is reused.
                context_options = {"viewport": {"width": 1400, "height": 900}}
                if SESSION_FILE.exists():
                    context_options["storage_state"] = str(SESSION_FILE)
                context = await browser.new_context(**context_options)
                if "storage_state" in context_options:
                    print("Loaded saved session")

                page = await context.new_page()

                # Check if we need to log in
                await page.goto(LIST_URL)
                await page.wait_for_load_state("networkidle")
                if "authentification" in page.url:
                    print("Logging in...")
                    await do_login(page)
                    await context.storage_state(path=str(SESSION_FILE))
                    print("Session saved")
                else:
                    print("Session valid")

                # Get all AE form IDs from filtered list
                form_infos = await get_form_ids(page)
                if "authentification" in page.url:
                    # Still bounced to the login page: stop before the deletion
                    # sweep below mistakes "no access" for "no forms".
                    raise RuntimeError("Login failed: forms listing is not accessible")
                current_ids = {info['formId'] for info in form_infos}
                print(f"Found {len(form_infos)} AE forms")

                # Scrape and upsert each form
                for info in form_infos:
                    fid = info['formId']
                    print(f"Scraping form {fid} ({info['patientCode']})...")
                    doc = await extract_form_fields(page, fid)
                    # Patient code comes from the list (more reliable than form page heading)
                    doc['_patient_code'] = info['patientCode']
                    upsert_ae(col, doc, now)

                # Mark as deleted any forms that disappeared from the list
                live_records = col.find(
                    {"deleted_at": None},
                    {"_form_id": 1, "patient_code": 1, "event_number": 1},
                )
                for rec in live_records:
                    # Listing ids are strings, stored ids are ints: compare as text.
                    if str(rec.get('_form_id', '')) not in current_ids:
                        col.update_one({"_id": rec["_id"]}, {"$set": {"deleted_at": now}})
                        print(f" DELETED form_id={rec['_form_id']} ({rec.get('patient_code')} AE#{rec.get('event_number')})")
            finally:
                # Close the browser even when scraping fails part-way through.
                await browser.close()
    finally:
        mongo.close()

    print(f"\nDone — {len(form_infos)} forms processed at {now.isoformat()}")
|
|
|
|
|
|
# Run the import only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    asyncio.run(main())
|