# Listing header (not part of the program):
# janssen/DRY/surgery_import.py -- 186 lines, 6.1 KiB, Python
"""
Surgery (V0 Implantation Visit) Import — scrapes from EvaMed DRY study and upserts into MongoDB.
Run repeatedly; only stores field-level changes (delta) in history[].
Unique key: _form_id (each form has a unique ID in EvaMed).
MongoDB: db=Dry, collection=Surgery
"""
import asyncio
import os
import re
from datetime import datetime
from pathlib import Path

from playwright.async_api import async_playwright
from pymongo import MongoClient
# EvaMed endpoints: every page is addressed through query parameters on BASE_URL.
BASE_URL = "https://prod.evamed.com/etude/soft/index.php"
LOGIN_URL = f"{BASE_URL}?module=authentification&class=login&client=myopowers-dry"
# Filtered forms listing that get_form_ids() scrapes.
LIST_URL = f"{BASE_URL}?module=monitoring&class=formslisting&center_id=2&formtype=10&l=ALL"

# SECURITY: credentials should not live in source control. The environment
# variables EVAMED_LOGIN / EVAMED_PASSWORD take precedence; the literal
# fallbacks are kept only so existing invocations keep working and should be
# rotated and removed.
LOGIN = os.environ.get("EVAMED_LOGIN", "vbuzalka")
PASSWORD = os.environ.get("EVAMED_PASSWORD", "Vlado9674+")

# MongoDB target: database "Dry", collection "Surgery".
MONGO_HOST = "192.168.1.76"
DB_NAME = "Dry"
COLLECTION = "Surgery"

# Playwright storage state (cookies etc.) is cached next to this script so
# later runs can reuse the authenticated session.
SESSION_FILE = Path(__file__).parent / "session.json"
# Matches a whole DD/MM/YYYY string; groups are (day, month, year).
DATE_RE = re.compile(r"^(\d{2})/(\d{2})/(\d{4})$")

# Largest integer BSON can store (signed 64-bit); bigger values stay strings.
_INT64_MAX = 2**63 - 1


def parse_value(value):
    """Convert a scraped field string into a typed Python value.

    Rules, applied to the stripped text in this order:

    * empty / whitespace-only / ``None``  -> ``None``
    * ``DD/MM/YYYY`` naming a real calendar date -> naive ``datetime``
    * digits only -> ``int`` (leading zeros are not preserved)
    * anything else -> the stripped string

    Values that look like a date but are not one (e.g. ``31/02/2020`` or
    ``00/00/0000``) and digit strings too large for a 64-bit integer are
    returned as the stripped string instead of raising, so one odd field
    cannot abort the whole import.
    """
    if not value or not value.strip():
        return None
    text = value.strip()

    date_match = DATE_RE.fullmatch(text)
    if date_match:
        day, month, year = (int(part) for part in date_match.groups())
        try:
            return datetime(year, month, day)
        except ValueError:
            # Date-shaped but impossible (bad day/month, year 0): keep raw text.
            return text

    if re.fullmatch(r"\d+", text):
        number = int(text)
        # Out-of-range integers cannot be BSON-encoded; keep them as text.
        return number if number <= _INT64_MAX else text

    return text
async def do_login(page):
    """Sign in through the EvaMed login form with the module-level credentials.

    Opens LOGIN_URL, fills the ``#login`` input and the first password input,
    clicks the "Connection" button and waits for the network to go idle.
    """
    await page.goto(LOGIN_URL)
    await page.wait_for_load_state("networkidle")

    username_input = page.locator("#login")
    await username_input.fill(LOGIN)

    # Only the first password-type input on the page is filled.
    password_input = page.locator('input[type="password"]').first
    await password_input.fill(PASSWORD)

    await page.click('input[value="Connection"]')
    # Let the post-submit requests settle before the caller continues.
    await page.wait_for_load_state("networkidle")
async def get_form_ids(page):
    """Return list of {formId, patientCode} from the filtered forms list.

    Loads LIST_URL and inspects every ``a[title="Open form"]`` link:

    * ``formId`` is the digit string of the ``id`` parameter in the link's
      href; links without one are skipped.
    * ``patientCode`` is the trimmed text of the ``a[title="Open directory"]``
      link in the same table row, or ``''`` when there is none.
    """
    await page.goto(LIST_URL)
    await page.wait_for_load_state("networkidle")
    # The `\b` anchor stops the pattern from matching the tail of another
    # parameter (e.g. "center_id=2"), which a bare /id=(\d+)/ would capture
    # if such a parameter precedes the real id in the href.
    return await page.evaluate("""() => {
const results = [];
document.querySelectorAll('a[title="Open form"]').forEach(a => {
const href = a.getAttribute('href') || '';
const m = href.match(/\\bid=(\\d+)/);
if (!m) return;
const row = a.closest('tr');
const dirLink = row ? row.querySelector('a[title="Open directory"]') : null;
const patientCode = dirLink ? dirLink.innerText.trim() : '';
results.push({ formId: m[1], patientCode: patientCode });
});
return results;
}""")
async def extract_form_fields(page, form_id):
    """Open one form page and return its fields as a typed dict.

    Every ``span.label`` inside ``.tableauFormulaire`` becomes a key (its
    trimmed text); the value is the trimmed text of the label's next sibling
    element, passed through parse_value(). The numeric form id is added under
    ``_form_id``.
    """
    form_url = f"{BASE_URL}?module=dossier&class=file&event=show&id={form_id}#fiche"
    await page.goto(form_url)
    await page.wait_for_load_state("networkidle")

    # In-page script: label text -> sibling text (null when missing or empty).
    scraped = await page.evaluate("""() => {
const fields = {};
document.querySelectorAll('.tableauFormulaire span.label').forEach(label => {
const key = label.innerText.trim();
const valEl = label.nextElementSibling;
fields[key] = valEl ? valEl.innerText.trim() || null : null;
});
return fields;
}""")

    typed_fields = {label: parse_value(text) for label, text in scraped.items()}
    typed_fields['_form_id'] = int(form_id)
    return typed_fields
def upsert(collection, doc, patient_code, now):
    """Insert or refresh the record for one scraped form, keyed by ``_form_id``.

    * Unknown form: inserts a new record with an empty ``history``.
    * Known form: always refreshes ``last_seen_at`` / ``patient_code`` and
      clears ``deleted_at``; when any field differs from the stored ``data``,
      replaces ``data`` and appends the field-level delta to ``history``.

    Prints one status line (NEW / CHANGED / ok) per call.
    """
    form_id = doc['_form_id']
    selector = {"_form_id": form_id}
    # Everything except the key itself is payload.
    fields = {name: value for name, value in doc.items() if name != '_form_id'}

    stored = collection.find_one(selector)
    if stored is None:
        new_record = {
            "_form_id": form_id,
            "patient_code": patient_code,
            "data": fields,
            "history": [],
            "first_seen_at": now,
            "last_seen_at": now,
            "deleted_at": None,
        }
        collection.insert_one(new_record)
        print(f" NEW {patient_code} Surgery form_id={form_id}")
        return

    previous = stored.get("data", {})
    # A field missing on either side is compared as None.
    changes = {
        name: {"old": previous.get(name), "new": fields.get(name)}
        for name in set(fields) | set(previous)
        if previous.get(name) != fields.get(name)
    }

    set_fields = {"last_seen_at": now, "deleted_at": None, "patient_code": patient_code}
    update = {"$set": set_fields}
    if not changes:
        print(f" ok {patient_code} Surgery form_id={form_id}")
    else:
        set_fields["data"] = fields
        update["$push"] = {"history": {"timestamp": now, "changes": changes}}
        print(f" CHANGED {patient_code} Surgery form_id={form_id} -> {list(changes.keys())}")
    collection.update_one(selector, update)
def _mark_missing_as_deleted(col, current_ids, now):
    """Stamp deleted_at on live records whose form id is not in current_ids."""
    for rec in col.find({"deleted_at": None}, {"_form_id": 1, "patient_code": 1}):
        if str(rec.get('_form_id', '')) not in current_ids:
            col.update_one({"_id": rec["_id"]}, {"$set": {"deleted_at": now}})
            # .get(): a record without _form_id must not crash the sweep.
            print(f" DELETED form_id={rec.get('_form_id')} ({rec.get('patient_code')})")


async def main():
    """Run one import pass: scrape every listed Surgery form and sync MongoDB.

    Steps:

    1. Open a Chromium context, reusing SESSION_FILE when it exists; log in
       and save the session when the listing redirects to the login page.
    2. Scrape each form found in the listing and upsert() it.
    3. Mark records that are no longer listed as deleted. This sweep is
       skipped when the listing came back empty, so a failed login or a
       changed page cannot flag every stored record as deleted.

    The browser and the Mongo client are closed even when scraping raises.
    """
    mongo = MongoClient(MONGO_HOST)
    try:
        col = mongo[DB_NAME][COLLECTION]
        # NOTE(review): naive local time; PyMongo encodes naive datetimes as
        # if they were UTC -- confirm this is the intended timestamp basis.
        now = datetime.now()
        async with async_playwright() as p:
            browser = await p.chromium.launch(headless=False)
            try:
                # Same viewport with or without a saved session, so the page
                # is rendered identically on every run.
                context_options = {"viewport": {"width": 1400, "height": 900}}
                if SESSION_FILE.exists():
                    context_options["storage_state"] = str(SESSION_FILE)
                context = await browser.new_context(**context_options)
                if "storage_state" in context_options:
                    print("Loaded saved session")

                page = await context.new_page()
                await page.goto(LIST_URL)
                await page.wait_for_load_state("networkidle")
                # Landing on the login module means the saved session
                # (if any) is no longer valid.
                if "authentification" in page.url:
                    print("Logging in...")
                    await do_login(page)
                    await context.storage_state(path=str(SESSION_FILE))
                    print("Session saved")
                else:
                    print("Session valid")

                form_infos = await get_form_ids(page)
                # Normalised through int() so the ids compare equal to the
                # str(int) form of the stored _form_id values.
                current_ids = {str(int(info['formId'])) for info in form_infos}
                print(f"Found {len(form_infos)} Surgery forms")

                for info in form_infos:
                    fid = info['formId']
                    print(f"Scraping form {fid} ({info['patientCode']})...")
                    doc = await extract_form_fields(page, fid)
                    upsert(col, doc, info['patientCode'], now)

                if form_infos:
                    _mark_missing_as_deleted(col, current_ids, now)
                else:
                    print(" WARNING: forms listing is empty -- skipping deletion sweep")
            finally:
                await browser.close()
    finally:
        mongo.close()
    print(f"\nDone -- {len(form_infos)} forms processed at {now.isoformat()}")
# Script entry point: when executed directly (not imported), run the async
# import pass to completion on a fresh event loop.
if __name__ == "__main__":
    asyncio.run(main())