This commit is contained in:
2026-06-15 16:10:47 +02:00
parent 36aa84aa02
commit 495cf8da21
34 changed files with 8012 additions and 8 deletions
@@ -0,0 +1,55 @@
"Protocol","Study Population","Country","Site","Principal Investigator","Participant ID","Baseline Stool Frequency","Visit","Visit Date","Endoscopy Completed?","Endoscopy Date","Bowel Preparation Start Date 1","Bowel Preparation End Date 1","Bowel Preparation Start Date 2","Bowel Preparation End Date 2","Central Endoscopy Score","Local Endoscopy Score","PGA Score","Eligible Day (-1)","Day (-1) Excluded Reason(s)","Eligible Day (-2)","Day (-2) Excluded Reason(s)","Eligible Day (-3)","Day (-3) Excluded Reason(s)","Eligible Day (-4)","Day (-4) Excluded Reason(s)","Eligible Day (-5)","Day (-5) Excluded Reason(s)","Eligible Day (-6)","Day (-6) Excluded Reason(s)","Eligible Day (-7)","Day (-7) Excluded Reason(s)","Eligible Day (-8)","Day (-8) Excluded Reason(s)","Eligible Day (-9)","Day (-9) Excluded Reason(s)","Eligible Day (-10)","Day (-10) Excluded Reason(s)","Eligible Day (-1) Stool Count","Eligible Day (-2) Stool Count","Eligible Day (-3) Stool Count","Eligible Day (-4) Stool Count","Eligible Day (-5) Stool Count","Eligible Day (-6) Stool Count","Eligible Day (-7) Stool Count","Eligible Day (-8) Stool Count","Eligible Day (-9) Stool Count","Eligible Day (-10) Stool Count","Stool Frequency Sub-score","Eligible Day (-1) Rectal Bleeding Score","Eligible Day (-2) Rectal Bleeding Score","Eligible Day (-3) Rectal Bleeding Score","Eligible Day (-4) Rectal Bleeding Score","Eligible Day (-5) Rectal Bleeding Score","Eligible Day (-6) Rectal Bleeding Score","Eligible Day (-7) Rectal Bleeding Score","Eligible Day (-8) Rectal Bleeding Score","Eligible Day (-9) Rectal Bleeding Score","Eligible Day (-10) Rectal Bleeding Score","Rectal Bleeding Sub-score","Partial Mayo Score","Modified Mayo Score","Full Mayo Score","Site Action","Last Mayo Score Submission","Week I-12 Clinical Responder","Week I-12 Clinical Remission","Clinical Flare","Loss of Response","Partial Mayo Response Post Loss of Response","Partial Mayo Response for Clinical Non-Responders"
"77242113UCO3001","Adult","Czech Republic","DD5-CZ10001","Matej Falc","CZ100012001","1","I-0","19 Feb 2026","Yes","05 Feb 2026","04 Feb 2026","04 Feb 2026","-","-","2","-","3","18 Feb 2026","-","17 Feb 2026","-","16 Feb 2026","-","15 Feb 2026","-","14 Feb 2026","-","13 Feb 2026","-","12 Feb 2026","-","11 Feb 2026","Day Not Applicable for Calculation","10 Feb 2026","Day Not Applicable for Calculation","09 Feb 2026","Day Not Applicable for Calculation","10","8","7","5","7","8","8","-","-","-","3","1","1","1","0","1","1","1","-","-","-","1","7","6","9","-","08 Apr 2026 07:11:25","N/A","N/A","N/A","N/A","N/A","N/A"
"77242113UCO3001","Adult","Czech Republic","DD5-CZ10001","Matej Falc","CZ100012001","1","I-2","04 Mar 2026","-","-","-","-","-","-","-","-","3","03 Mar 2026","-","02 Mar 2026","-","01 Mar 2026","-","28 Feb 2026","-","27 Feb 2026","-","26 Feb 2026","-","25 Feb 2026","-","24 Feb 2026","Day Not Applicable for Calculation","23 Feb 2026","Day Not Applicable for Calculation","22 Feb 2026","Day Not Applicable for Calculation","5","4","5","4","5","6","6","-","-","-","2","1","0","1","0","1","0","1","-","-","-","1","6","","","-","28 May 2026 10:04:05","N/A","N/A","N/A","N/A","N/A","N/A"
"77242113UCO3001","Adult","Czech Republic","DD5-CZ10001","Matej Falc","CZ100012001","1","I-4","18 Mar 2026","-","-","-","-","-","-","-","-","2","17 Mar 2026","-","16 Mar 2026","-","15 Mar 2026","-","14 Mar 2026","-","13 Mar 2026","-","12 Mar 2026","-","11 Mar 2026","-","10 Mar 2026","Day Not Applicable for Calculation","09 Mar 2026","Day Not Applicable for Calculation","08 Mar 2026","Day Not Applicable for Calculation","5","5","5","4","5","4","5","-","-","-","2","1","0","0","1","1","1","0","-","-","-","1","5","","","-","08 Apr 2026 11:04:49","N/A","N/A","N/A","N/A","N/A","N/A"
"77242113UCO3001","Adult","Czech Republic","DD5-CZ10001","Matej Falc","CZ100012001","1","I-8","05 May 2026","-","-","-","-","-","-","-","-","1","04 May 2026","-","03 May 2026","-","02 May 2026","-","01 May 2026","-","30 Apr 2026","-","29 Apr 2026","-","28 Apr 2026","-","27 Apr 2026","Day Not Applicable for Calculation","26 Apr 2026","Day Not Applicable for Calculation","25 Apr 2026","Day Not Applicable for Calculation","3","3","4","4","5","4","4","-","-","-","2","1","1","1","1","1","1","1","-","-","-","1","4","","","-","28 May 2026 14:42:53","N/A","N/A","N/A","N/A","N/A","N/A"
"77242113UCO3001","Adult","Czech Republic","DD5-CZ10001","Matej Falc","CZ100012001","1","I-12","13 May 2026","Yes","06 May 2026","05 May 2026","05 May 2026","-","-","1","-","1","12 May 2026","-","11 May 2026","-","10 May 2026","-","09 May 2026","-","08 May 2026","-","07 May 2026","-","06 May 2026","Endoscopy","05 May 2026","Bowel Preparation for Procedure;Day Not Applicable for Calculation","04 May 2026","-","03 May 2026","Day Not Applicable for Calculation","5","4","6","5","5","5","-","-","3","-","2","1","0","1","1","1","1","-","-","1","-","1","4","4","5","-","28 May 2026 14:43:11","Clinical Responder","No","N/A","N/A","N/A","N/A"
"77242113UCO3001","Adult","Czech Republic","DD5-CZ10001","Matej Falc","CZ100012001","1","M-4","10 Jun 2026","-","-","-","-","-","-","-","-","1","09 Jun 2026","-","08 Jun 2026","-","07 Jun 2026","-","06 Jun 2026","-","05 Jun 2026","-","04 Jun 2026","-","03 Jun 2026","-","02 Jun 2026","Day Not Applicable for Calculation","01 Jun 2026","Day Not Applicable for Calculation","31 May 2026","Day Not Applicable for Calculation","4","5","3","4","5","4","5","-","-","-","2","0","0","0","0","1","0","1","-","-","-","0","3","","","-","10 Jun 2026 07:15:50","N/A","N/A","No","N/A","N/A","N/A"
"77242113UCO3001","Adult","Czech Republic","DD5-CZ10001","Matej Falc","CZ100012002","1","I-0","08 Apr 2026","Yes","18 Mar 2026","17 Mar 2026","18 Mar 2026","-","-","2","-","2","07 Apr 2026","-","06 Apr 2026","-","05 Apr 2026","-","04 Apr 2026","Missing Diary","03 Apr 2026","-","02 Apr 2026","-","01 Apr 2026","-","31 Mar 2026","Day Not Applicable for Calculation","30 Mar 2026","Day Not Applicable for Calculation","29 Mar 2026","Day Not Applicable for Calculation","3","3","4","-","3","3","4","-","-","-","1","0","0","0","-","0","0","1","-","-","-","0","3","3","5","-","10 Jun 2026 08:42:08","N/A","N/A","N/A","N/A","N/A","N/A"
"77242113UCO3001","Adult","Czech Republic","DD5-CZ10001","Matej Falc","CZ100012002","1","I-2","23 Apr 2026","-","-","-","-","-","-","-","-","2","22 Apr 2026","Missing Diary","21 Apr 2026","-","20 Apr 2026","-","19 Apr 2026","-","18 Apr 2026","-","17 Apr 2026","-","16 Apr 2026","-","15 Apr 2026","Day Not Applicable for Calculation","14 Apr 2026","Day Not Applicable for Calculation","13 Apr 2026","Day Not Applicable for Calculation","-","3","3","6","5","5","4","-","-","-","2","-","0","0","1","1","1","1","-","-","-","1","5","","","-","10 Jun 2026 08:42:33","N/A","N/A","N/A","N/A","N/A","N/A"
"77242113UCO3001","Adult","Czech Republic","DD5-CZ10001","Matej Falc","CZ100012002","1","I-4","06 May 2026","-","-","-","-","-","-","-","-","1","05 May 2026","-","04 May 2026","-","03 May 2026","-","02 May 2026","-","01 May 2026","-","30 Apr 2026","-","29 Apr 2026","-","28 Apr 2026","Day Not Applicable for Calculation","27 Apr 2026","Day Not Applicable for Calculation","26 Apr 2026","Day Not Applicable for Calculation","6","3","2","3","3","3","3","-","-","-","1","1","0","0","0","1","1","0","-","-","-","0","2","","","-","04 Jun 2026 07:39:06","N/A","N/A","N/A","N/A","N/A","N/A"
"77242113UCO3001","Adult","Czech Republic","DD5-CZ10001","Matej Falc","CZ100012002","1","I-8","04 Jun 2026","-","-","-","-","-","-","-","-","1","03 Jun 2026","-","02 Jun 2026","-","01 Jun 2026","-","31 May 2026","-","30 May 2026","-","29 May 2026","-","28 May 2026","-","27 May 2026","Day Not Applicable for Calculation","26 May 2026","Day Not Applicable for Calculation","25 May 2026","Day Not Applicable for Calculation","3","4","3","3","3","3","4","-","-","-","1","0","0","0","0","0","0","1","-","-","-","0","2","","","-","-","N/A","N/A","N/A","N/A","N/A","N/A"
"77242113UCO3001","Adult","Czech Republic","DD5-CZ10001","Matej Falc","CZ100012003","1","I-0","27 May 2026","Yes","13 May 2026","12 May 2026","12 May 2026","-","-","3","-","2","26 May 2026","-","25 May 2026","-","24 May 2026","-","23 May 2026","-","22 May 2026","-","21 May 2026","-","20 May 2026","-","19 May 2026","Day Not Applicable for Calculation","18 May 2026","Day Not Applicable for Calculation","17 May 2026","Day Not Applicable for Calculation","6","9","7","8","9","7","8","-","-","-","3","2","2","2","2","1","1","1","-","-","-","2","7","8","10","-","27 May 2026 07:24:39","N/A","N/A","N/A","N/A","N/A","N/A"
"77242113UCO3001","Adult","Czech Republic","DD5-CZ10001","Matej Falc","CZ100012003","1","I-2","10 Jun 2026","-","-","-","-","-","-","-","-","2","09 Jun 2026","-","08 Jun 2026","-","07 Jun 2026","-","06 Jun 2026","-","05 Jun 2026","-","04 Jun 2026","-","03 Jun 2026","-","02 Jun 2026","Day Not Applicable for Calculation","01 Jun 2026","Day Not Applicable for Calculation","31 May 2026","Day Not Applicable for Calculation","7","8","8","7","6","8","6","-","-","-","3","2","2","1","2","2","2","1","-","-","-","2","7","","","-","10 Jun 2026 07:30:18","N/A","N/A","N/A","N/A","N/A","N/A"
"77242113UCO3001","Adult","Czech Republic","DD5-CZ10003","Leksa Vaclav","CZ100032001","2","I-0","10 Jun 2026","Yes","27 May 2026","26 May 2026","26 May 2026","-","-","2","-","2","09 Jun 2026","-","08 Jun 2026","-","07 Jun 2026","-","06 Jun 2026","-","05 Jun 2026","-","04 Jun 2026","-","03 Jun 2026","-","02 Jun 2026","Day Not Applicable for Calculation","01 Jun 2026","Day Not Applicable for Calculation","31 May 2026","Day Not Applicable for Calculation","4","4","4","4","5","4","5","-","-","-","1","2","2","2","2","2","2","2","-","-","-","2","5","5","7","-","10 Jun 2026 08:48:09","N/A","N/A","N/A","N/A","N/A","N/A"
"77242113UCO3001","Adult","Czech Republic","DD5-CZ10006","Michal Konecny","CZ100062001","1","I-0","20 Mar 2026","Yes","19 Feb 2026","-","-","-","-","3","-","3","19 Mar 2026","-","18 Mar 2026","-","17 Mar 2026","-","16 Mar 2026","-","15 Mar 2026","-","14 Mar 2026","-","13 Mar 2026","-","12 Mar 2026","Day Not Applicable for Calculation","11 Mar 2026","Day Not Applicable for Calculation","10 Mar 2026","Day Not Applicable for Calculation","7","7","8","8","7","8","5","-","-","-","3","2","1","1","1","1","1","0","-","-","-","1","7","7","10","-","20 Mar 2026 07:02:44","N/A","N/A","N/A","N/A","N/A","N/A"
"77242113UCO3001","Adult","Czech Republic","DD5-CZ10006","Michal Konecny","CZ100062001","1","I-2","08 Apr 2026","-","-","-","-","-","-","-","-","2","07 Apr 2026","Medication For Diarrhea","06 Apr 2026","Medication For Diarrhea","05 Apr 2026","Medication For Diarrhea","04 Apr 2026","Medication For Diarrhea","03 Apr 2026","Medication For Diarrhea","02 Apr 2026","Medication For Diarrhea","01 Apr 2026","Medication For Diarrhea","31 Mar 2026","Medication For Diarrhea;Day Not Applicable for Calculation","30 Mar 2026","Medication For Diarrhea;Day Not Applicable for Calculation","29 Mar 2026","Day Not Applicable for Calculation","-","-","-","-","-","-","-","-","-","-","Non-Evaluable","-","-","-","-","-","-","-","-","-","-","Non-Evaluable","Non-Evaluable","Non-Evaluable","Non-Evaluable","-","-","N/A","N/A","N/A","N/A","N/A","N/A"
"77242113UCO3001","Adult","Czech Republic","DD5-CZ10006","Michal Konecny","CZ100062001","1","I-4","15 Apr 2026","-","-","-","-","-","-","-","-","3","14 Apr 2026","-","13 Apr 2026","-","12 Apr 2026","-","11 Apr 2026","-","10 Apr 2026","-","09 Apr 2026","-","08 Apr 2026","-","07 Apr 2026","Medication For Diarrhea;Day Not Applicable for Calculation","06 Apr 2026","Medication For Diarrhea;Day Not Applicable for Calculation","05 Apr 2026","Medication For Diarrhea;Day Not Applicable for Calculation","9","22","20","19","17","18","18","-","-","-","3","1","3","2","2","2","2","2","-","-","-","2","8","","","-","04 May 2026 22:06:03","N/A","N/A","N/A","N/A","N/A","N/A"
"77242113UCO3001","Adult","Czech Republic","DD5-CZ10006","Michal Konecny","CZ100062001","1","I-8","18 May 2026","-","-","-","-","-","-","-","-","2","17 May 2026","-","16 May 2026","-","15 May 2026","-","14 May 2026","-","13 May 2026","-","12 May 2026","-","11 May 2026","-","10 May 2026","Day Not Applicable for Calculation","09 May 2026","Day Not Applicable for Calculation","08 May 2026","Day Not Applicable for Calculation","7","5","9","7","7","8","8","-","-","-","3","1","1","1","1","1","1","1","-","-","-","1","6","","","-","04 Jun 2026 21:46:30","N/A","N/A","N/A","N/A","N/A","N/A"
"77242113UCO3001","Adult","Czech Republic","DD5-CZ10006","Michal Konecny","CZ100062001","1","I-12","08 Jun 2026","Yes","28 May 2026","-","-","-","-","3","-","3","07 Jun 2026","-","06 Jun 2026","-","05 Jun 2026","-","04 Jun 2026","-","03 Jun 2026","-","02 Jun 2026","-","01 Jun 2026","Missing Diary","31 May 2026","Day Not Applicable for Calculation","30 May 2026","Day Not Applicable for Calculation","29 May 2026","Day Not Applicable for Calculation","6","5","5","5","7","6","-","-","-","-","3","1","1","0","0","1","0","-","-","-","-","1","7","7","10","-","11 Jun 2026 22:12:05","Clinical Nonresponder","No","N/A","N/A","N/A","N/A"
"77242113UCO3001","Adult","Czech Republic","DD5-CZ10006","Michal Konecny","CZ100062002","1","I-0","26 May 2026","Yes","14 May 2026","13 May 2026","13 May 2026","-","-","2","-","2","25 May 2026","-","24 May 2026","-","23 May 2026","-","22 May 2026","-","21 May 2026","-","20 May 2026","-","19 May 2026","-","18 May 2026","Day Not Applicable for Calculation","17 May 2026","Day Not Applicable for Calculation","16 May 2026","Day Not Applicable for Calculation","8","8","6","7","7","6","7","-","-","-","3","2","2","2","2","2","2","2","-","-","-","2","7","7","9","-","29 May 2026 15:45:00","N/A","N/A","N/A","N/A","N/A","N/A"
"77242113UCO3001","Adult","Czech Republic","DD5-CZ10006","Michal Konecny","CZ100062002","1","I-2","09 Jun 2026","-","-","-","-","-","-","-","-","2","08 Jun 2026","-","07 Jun 2026","-","06 Jun 2026","-","05 Jun 2026","-","04 Jun 2026","-","03 Jun 2026","-","02 Jun 2026","-","01 Jun 2026","Day Not Applicable for Calculation","31 May 2026","Day Not Applicable for Calculation","30 May 2026","Day Not Applicable for Calculation","7","8","7","7","7","5","7","-","-","-","3","2","1","1","1","2","2","2","-","-","-","2","7","","","-","11 Jun 2026 22:12:40","N/A","N/A","N/A","N/A","N/A","N/A"
"77242113UCO3001","Adult","Czech Republic","DD5-CZ10009","Jiri Pumprla","CZ100092001","1","I-0","05 May 2026","Yes","24 Apr 2026","23 Apr 2026","23 Apr 2026","-","-","2","-","2","04 May 2026","-","03 May 2026","-","02 May 2026","-","01 May 2026","-","30 Apr 2026","-","29 Apr 2026","-","28 Apr 2026","-","27 Apr 2026","Day Not Applicable for Calculation","26 Apr 2026","Day Not Applicable for Calculation","25 Apr 2026","Day Not Applicable for Calculation","5","5","5","5","5","5","5","-","-","-","2","1","1","1","1","1","1","1","-","-","-","1","5","5","7","-","05 May 2026 11:19:40","N/A","N/A","N/A","N/A","N/A","N/A"
"77242113UCO3001","Adult","Czech Republic","DD5-CZ10009","Jiri Pumprla","CZ100092001","1","I-2","19 May 2026","-","-","-","-","-","-","-","-","1","18 May 2026","-","17 May 2026","-","16 May 2026","-","15 May 2026","-","14 May 2026","-","13 May 2026","-","12 May 2026","-","11 May 2026","Day Not Applicable for Calculation","10 May 2026","Day Not Applicable for Calculation","09 May 2026","Day Not Applicable for Calculation","5","4","5","5","5","4","6","-","-","-","2","1","1","1","1","1","1","1","-","-","-","1","4","","","-","19 May 2026 10:38:25","N/A","N/A","N/A","N/A","N/A","N/A"
"77242113UCO3001","Adult","Czech Republic","DD5-CZ10009","Jiri Pumprla","CZ100092001","1","I-4","04 Jun 2026","-","-","-","-","-","-","-","-","1","03 Jun 2026","-","02 Jun 2026","-","01 Jun 2026","-","31 May 2026","-","30 May 2026","-","29 May 2026","-","28 May 2026","-","27 May 2026","Day Not Applicable for Calculation","26 May 2026","Day Not Applicable for Calculation","25 May 2026","Day Not Applicable for Calculation","2","3","2","3","3","2","3","-","-","-","1","0","0","0","0","0","0","0","-","-","-","0","2","","","-","04 Jun 2026 09:24:54","N/A","N/A","N/A","N/A","N/A","N/A"
"77242113UCO3001","Adult","Czech Republic","DD5-CZ10012","Stefan Konecny","CZ100122001","5","I-0","07 Apr 2026","Yes","24 Mar 2026","22 Mar 2026","22 Mar 2026","-","-","2","-","2","06 Apr 2026","-","05 Apr 2026","-","04 Apr 2026","-","03 Apr 2026","-","02 Apr 2026","-","01 Apr 2026","-","31 Mar 2026","-","30 Mar 2026","Day Not Applicable for Calculation","29 Mar 2026","Day Not Applicable for Calculation","28 Mar 2026","Day Not Applicable for Calculation","8","11","5","9","11","10","13","-","-","-","3","1","2","2","2","2","2","2","-","-","-","2","7","7","9","-","04 May 2026 08:44:52","N/A","N/A","N/A","N/A","N/A","N/A"
"77242113UCO3001","Adult","Czech Republic","DD5-CZ10012","Stefan Konecny","CZ100122001","5","I-2","22 Apr 2026","-","-","-","-","-","-","-","-","2","21 Apr 2026","-","20 Apr 2026","-","19 Apr 2026","-","18 Apr 2026","-","17 Apr 2026","-","16 Apr 2026","-","15 Apr 2026","-","14 Apr 2026","Day Not Applicable for Calculation","13 Apr 2026","Day Not Applicable for Calculation","12 Apr 2026","Day Not Applicable for Calculation","7","5","6","6","7","8","2","-","-","-","1","1","0","1","1","1","2","0","-","-","-","1","4","","","-","04 May 2026 08:45:07","N/A","N/A","N/A","N/A","N/A","N/A"
"77242113UCO3001","Adult","Czech Republic","DD5-CZ10012","Stefan Konecny","CZ100122001","5","I-4","07 May 2026","-","-","-","-","-","-","-","-","1","06 May 2026","-","05 May 2026","-","04 May 2026","-","03 May 2026","-","02 May 2026","-","01 May 2026","-","30 Apr 2026","-","29 Apr 2026","Day Not Applicable for Calculation","28 Apr 2026","Day Not Applicable for Calculation","27 Apr 2026","Day Not Applicable for Calculation","8","7","7","8","4","11","7","-","-","-","1","2","1","1","1","0","1","1","-","-","-","1","3","","","-","01 Jun 2026 00:57:35","N/A","N/A","N/A","N/A","N/A","N/A"
"77242113UCO3001","Adult","Czech Republic","DD5-CZ10012","Stefan Konecny","CZ100122001","5","I-8","03 Jun 2026","-","-","-","-","-","-","-","-","2","02 Jun 2026","-","01 Jun 2026","-","31 May 2026","-","30 May 2026","-","29 May 2026","-","28 May 2026","-","27 May 2026","-","26 May 2026","Day Not Applicable for Calculation","25 May 2026","Day Not Applicable for Calculation","24 May 2026","Day Not Applicable for Calculation","5","9","7","5","5","9","7","-","-","-","1","1","1","1","0","3","0","1","-","-","-","1","4","","","-","03 Jun 2026 17:47:25","N/A","N/A","N/A","N/A","N/A","N/A"
"77242113UCO3001","Adult","Czech Republic","DD5-CZ10013","David Stepek","CZ100132001","1","I-0","24 Mar 2026","Yes","12 Mar 2026","11 Mar 2026","11 Mar 2026","-","-","2","-","2","23 Mar 2026","-","22 Mar 2026","-","21 Mar 2026","-","20 Mar 2026","-","19 Mar 2026","-","18 Mar 2026","-","17 Mar 2026","-","16 Mar 2026","Day Not Applicable for Calculation","15 Mar 2026","Day Not Applicable for Calculation","14 Mar 2026","Day Not Applicable for Calculation","8","6","5","7","6","7","6","-","-","-","3","1","1","1","0","1","1","1","-","-","-","1","6","6","8","-","05 Apr 2026 22:41:27","N/A","N/A","N/A","N/A","N/A","N/A"
"77242113UCO3001","Adult","Czech Republic","DD5-CZ10013","David Stepek","CZ100132001","1","I-2","08 Apr 2026","-","-","-","-","-","-","-","-","2","07 Apr 2026","-","06 Apr 2026","-","05 Apr 2026","-","04 Apr 2026","-","03 Apr 2026","-","02 Apr 2026","-","01 Apr 2026","-","31 Mar 2026","Day Not Applicable for Calculation","30 Mar 2026","Day Not Applicable for Calculation","29 Mar 2026","Day Not Applicable for Calculation","5","2","3","6","5","5","5","-","-","-","2","0","0","0","0","1","1","0","-","-","-","0","4","","","-","28 May 2026 23:19:03","N/A","N/A","N/A","N/A","N/A","N/A"
"77242113UCO3001","Adult","Czech Republic","DD5-CZ10013","David Stepek","CZ100132001","1","I-4","21 Apr 2026","-","-","-","-","-","-","-","-","0","20 Apr 2026","-","19 Apr 2026","-","18 Apr 2026","-","17 Apr 2026","-","16 Apr 2026","-","15 Apr 2026","-","14 Apr 2026","-","13 Apr 2026","Day Not Applicable for Calculation","12 Apr 2026","Day Not Applicable for Calculation","11 Apr 2026","Day Not Applicable for Calculation","4","3","4","3","3","4","4","-","-","-","2","0","0","0","0","0","0","0","-","-","-","0","2","","","-","27 May 2026 12:54:41","N/A","N/A","N/A","N/A","N/A","N/A"
"77242113UCO3001","Adult","Czech Republic","DD5-CZ10013","David Stepek","CZ100132002","1","I-0","12 May 2026","Yes","21 Apr 2026","20 Apr 2026","21 Apr 2026","-","-","2","-","2","11 May 2026","-","10 May 2026","-","09 May 2026","-","08 May 2026","-","07 May 2026","-","06 May 2026","-","05 May 2026","Missing Diary","04 May 2026","Day Not Applicable for Calculation","03 May 2026","Day Not Applicable for Calculation","02 May 2026","Day Not Applicable for Calculation","2","1","1","1","1","2","-","-","-","-","0","0","0","0","0","0","0","-","-","-","-","0","2","2","4","-","28 May 2026 23:19:30","N/A","N/A","N/A","N/A","N/A","N/A"
"77242113UCO3001","Adult","Czech Republic","DD5-CZ10013","David Stepek","CZ100132002","1","I-2","26 May 2026","-","-","-","-","-","-","-","-","1","25 May 2026","-","24 May 2026","Missing Diary","23 May 2026","-","22 May 2026","-","21 May 2026","-","20 May 2026","-","19 May 2026","-","18 May 2026","Missing Diary;Day Not Applicable for Calculation","17 May 2026","Day Not Applicable for Calculation","16 May 2026","Day Not Applicable for Calculation","1","-","1","2","1","2","2","-","-","-","1","0","-","0","0","0","0","0","-","-","-","0","2","","","-","28 May 2026 23:19:51","N/A","N/A","N/A","N/A","N/A","N/A"
"77242113UCO3001","Adult","Czech Republic","DD5-CZ10013","David Stepek","CZ100132002","1","I-4","10 Jun 2026","-","-","-","-","-","-","-","-","2","09 Jun 2026","-","08 Jun 2026","Missing Diary","07 Jun 2026","-","06 Jun 2026","-","05 Jun 2026","-","04 Jun 2026","-","03 Jun 2026","-","02 Jun 2026","Missing Diary;Day Not Applicable for Calculation","01 Jun 2026","Day Not Applicable for Calculation","31 May 2026","Day Not Applicable for Calculation","4","-","1","1","2","2","1","-","-","-","1","0","-","0","0","0","0","0","-","-","-","0","3","","","-","-","N/A","N/A","N/A","N/A","N/A","N/A"
"77242113UCO3001","Adult","Czech Republic","DD5-CZ10013","David Stepek","CZ100132003","1","I-0","02 Jun 2026","Yes","25 May 2026","24 May 2026","24 May 2026","-","-","2","-","2","01 Jun 2026","-","31 May 2026","-","30 May 2026","-","29 May 2026","-","28 May 2026","-","27 May 2026","-","26 May 2026","-","25 May 2026","Endoscopy;Missing Diary;Day Not Applicable for Calculation","24 May 2026","Bowel Preparation for Procedure;Missing Diary;Day Not Applicable for Calculation","23 May 2026","Missing Diary;Day Not Applicable for Calculation","8","8","11","10","10","11","6","-","-","-","3","2","2","1","2","1","2","2","-","-","-","2","7","7","9","-","02 Jun 2026 08:17:40","N/A","N/A","N/A","N/A","N/A","N/A"
"77242113UCO3001","Adult","Czech Republic","DD5-CZ10013","David Stepek","CZ100132003","1","I-2","10 Jun 2026","-","-","-","-","-","-","-","-","2","09 Jun 2026","-","08 Jun 2026","-","07 Jun 2026","-","06 Jun 2026","-","05 Jun 2026","-","04 Jun 2026","-","03 Jun 2026","-","02 Jun 2026","Day Not Applicable for Calculation","01 Jun 2026","Day Not Applicable for Calculation","31 May 2026","Day Not Applicable for Calculation","9","2","1","4","2","4","2","-","-","-","1","1","1","0","1","1","1","0","-","-","-","1","4","","","-","-","N/A","N/A","N/A","N/A","N/A","N/A"
"77242113UCO3001","Adult","Czech Republic","DD5-CZ10016","Robert Mudr","CZ100162001","1","I-0","28 May 2026","Yes","19 May 2026","18 May 2026","19 May 2026","-","-","3","-","3","27 May 2026","-","26 May 2026","-","25 May 2026","-","24 May 2026","-","23 May 2026","-","22 May 2026","-","21 May 2026","-","20 May 2026","Day Not Applicable for Calculation","19 May 2026","Endoscopy;Bowel Preparation for Procedure;Day Not Applicable for Calculation","18 May 2026","Bowel Preparation for Procedure;Day Not Applicable for Calculation","14","15","15","15","15","15","15","-","-","-","3","2","3","3","2","2","3","3","-","-","-","3","9","9","12","-","28 May 2026 10:22:48","N/A","N/A","N/A","N/A","N/A","N/A"
"77242113UCO3001","Adult","Czech Republic","DD5-CZ10016","Robert Mudr","CZ100162001","1","I-2","11 Jun 2026","-","-","-","-","-","-","-","-","3","10 Jun 2026","-","09 Jun 2026","-","08 Jun 2026","-","07 Jun 2026","-","06 Jun 2026","-","05 Jun 2026","-","04 Jun 2026","-","03 Jun 2026","Day Not Applicable for Calculation","02 Jun 2026","Day Not Applicable for Calculation","01 Jun 2026","Day Not Applicable for Calculation","10","9","9","8","13","9","8","-","-","-","3","2","1","1","1","2","1","1","-","-","-","1","7","","","-","-","N/A","N/A","N/A","N/A","N/A","N/A"
"77242113UCO3001","Adolescent","Czech Republic","DD5-CZ10020","Lucie Gonsorcikova","CZ100201001","1","Unscheduled 1","04 May 2026","Yes","20 Apr 2026","12 Apr 2026","15 Apr 2026","-","-","2","-","3","03 May 2026","-","02 May 2026","-","01 May 2026","-","30 Apr 2026","-","29 Apr 2026","-","28 Apr 2026","-","27 Apr 2026","-","26 Apr 2026","Day Not Applicable for Calculation","25 Apr 2026","Day Not Applicable for Calculation","24 Apr 2026","Day Not Applicable for Calculation","5","6","6","7","6","3","3","-","-","-","2","0","0","0","0","0","0","0","-","-","-","0","5","4","7","-","-","N/A","N/A","N/A","N/A","N/A","N/A"
"77242113UCO3001","Adolescent","Czech Republic","DD5-CZ10020","Lucie Gonsorcikova","CZ100201001","1","I-0","18 May 2026","Yes","01 May 2026","01 May 2026","01 May 2026","-","-","2","-","3","17 May 2026","-","16 May 2026","-","15 May 2026","-","14 May 2026","-","13 May 2026","-","12 May 2026","-","11 May 2026","-","10 May 2026","Day Not Applicable for Calculation","09 May 2026","Day Not Applicable for Calculation","08 May 2026","Day Not Applicable for Calculation","6","6","6","6","6","6","6","-","-","-","3","0","0","0","0","0","0","0","-","-","-","0","6","5","8","-","18 May 2026 08:39:27","N/A","N/A","N/A","N/A","N/A","N/A"
"77242113UCO3001","Adolescent","Czech Republic","DD5-CZ10020","Lucie Gonsorcikova","CZ100201001","1","I-2","01 Jun 2026","-","-","-","-","-","-","-","-","3","31 May 2026","-","30 May 2026","Missing Diary","29 May 2026","Missing Diary","28 May 2026","Missing Diary","27 May 2026","-","26 May 2026","-","25 May 2026","-","24 May 2026","Day Not Applicable for Calculation","23 May 2026","Day Not Applicable for Calculation","22 May 2026","Day Not Applicable for Calculation","6","-","-","-","6","6","6","-","-","-","3","0","-","-","-","0","0","0","-","-","-","0","6","","","-","-","N/A","N/A","N/A","N/A","N/A","N/A"
"77242113UCO3001","Adult","Czech Republic","DD5-CZ10021","Martin Bortlik","CZ100212001","1","I-0","07 Apr 2026","Yes","16 Mar 2026","15 Mar 2026","16 Mar 2026","-","-","3","-","3","06 Apr 2026","-","05 Apr 2026","-","04 Apr 2026","-","03 Apr 2026","-","02 Apr 2026","-","01 Apr 2026","-","31 Mar 2026","-","30 Mar 2026","Day Not Applicable for Calculation","29 Mar 2026","Day Not Applicable for Calculation","28 Mar 2026","Day Not Applicable for Calculation","11","11","10","11","11","10","9","-","-","-","3","2","2","2","2","2","2","2","-","-","-","2","8","8","11","-","20 Apr 2026 09:27:58","N/A","N/A","N/A","N/A","N/A","N/A"
"77242113UCO3001","Adult","Czech Republic","DD5-CZ10021","Martin Bortlik","CZ100212001","1","I-2","20 Apr 2026","-","-","-","-","-","-","-","-","3","19 Apr 2026","-","18 Apr 2026","-","17 Apr 2026","-","16 Apr 2026","-","15 Apr 2026","-","14 Apr 2026","-","13 Apr 2026","-","12 Apr 2026","Day Not Applicable for Calculation","11 Apr 2026","Day Not Applicable for Calculation","10 Apr 2026","Day Not Applicable for Calculation","8","7","9","8","8","7","8","-","-","-","3","2","2","1","1","1","2","1","-","-","-","1","7","","","-","20 Apr 2026 09:29:01","N/A","N/A","N/A","N/A","N/A","N/A"
"77242113UCO3001","Adult","Czech Republic","DD5-CZ10021","Martin Bortlik","CZ100212001","1","I-4","05 May 2026","-","-","-","-","-","-","-","-","1","04 May 2026","-","03 May 2026","-","02 May 2026","-","01 May 2026","-","30 Apr 2026","-","29 Apr 2026","-","28 Apr 2026","-","27 Apr 2026","Day Not Applicable for Calculation","26 Apr 2026","Day Not Applicable for Calculation","25 Apr 2026","Day Not Applicable for Calculation","6","6","6","6","7","7","6","-","-","-","3","0","0","1","1","1","1","1","-","-","-","1","5","","","-","-","N/A","N/A","N/A","N/A","N/A","N/A"
"77242113UCO3001","Adult","Czech Republic","DD5-CZ10021","Martin Bortlik","CZ100212001","1","I-8","02 Jun 2026","-","-","-","-","-","-","-","-","1","01 Jun 2026","-","31 May 2026","-","30 May 2026","-","29 May 2026","-","28 May 2026","-","27 May 2026","-","26 May 2026","-","25 May 2026","Day Not Applicable for Calculation","24 May 2026","Day Not Applicable for Calculation","23 May 2026","Day Not Applicable for Calculation","3","4","4","4","5","5","5","-","-","-","2","0","0","0","0","0","1","1","-","-","-","0","3","","","-","02 Jun 2026 14:44:34","N/A","N/A","N/A","N/A","N/A","N/A"
"77242113UCO3001","Adult","Czech Republic","DD5-CZ10022","Petr Hrabak","CZ100222002","1","I-0","19 Feb 2026","Yes","11 Feb 2026","10 Feb 2026","11 Feb 2026","-","-","2","-","2","18 Feb 2026","-","17 Feb 2026","-","16 Feb 2026","-","15 Feb 2026","-","14 Feb 2026","-","13 Feb 2026","-","12 Feb 2026","-","11 Feb 2026","Endoscopy;Bowel Preparation for Procedure;Day Not Applicable for Calculation","10 Feb 2026","Bowel Preparation for Procedure;Day Not Applicable for Calculation","09 Feb 2026","Day Not Applicable for Calculation","3","2","2","3","4","3","2","-","-","-","1","1","1","0","0","0","2","2","-","-","-","1","4","4","6","-","19 Feb 2026 15:24:43","N/A","N/A","N/A","N/A","N/A","N/A"
"77242113UCO3001","Adult","Czech Republic","DD5-CZ10022","Petr Hrabak","CZ100222003","1","I-0","09 Mar 2026","Yes","11 Feb 2026","10 Feb 2026","11 Feb 2026","-","-","2","-","2","08 Mar 2026","-","07 Mar 2026","-","06 Mar 2026","-","05 Mar 2026","-","04 Mar 2026","-","03 Mar 2026","Missing Diary","02 Mar 2026","Missing Diary","01 Mar 2026","Missing Diary;Day Not Applicable for Calculation","28 Feb 2026","Missing Diary;Day Not Applicable for Calculation","27 Feb 2026","Missing Diary;Day Not Applicable for Calculation","7","7","6","6","7","-","-","-","-","-","3","2","2","2","2","2","-","-","-","-","-","2","7","7","9","-","22 Mar 2026 18:34:58","N/A","N/A","N/A","N/A","N/A","N/A"
"77242113UCO3001","Adult","Czech Republic","DD5-CZ10022","Petr Hrabak","CZ100222003","1","I-2","27 Mar 2026","-","-","-","-","-","-","-","-","2","26 Mar 2026","-","25 Mar 2026","-","24 Mar 2026","-","23 Mar 2026","-","22 Mar 2026","-","21 Mar 2026","-","20 Mar 2026","-","19 Mar 2026","Day Not Applicable for Calculation","18 Mar 2026","Day Not Applicable for Calculation","17 Mar 2026","Day Not Applicable for Calculation","7","3","3","3","5","5","5","-","-","-","2","0","0","1","1","1","1","2","-","-","-","1","5","","","-","08 Apr 2026 07:36:56","N/A","N/A","N/A","N/A","N/A","N/A"
"77242113UCO3001","Adult","Czech Republic","DD5-CZ10022","Petr Hrabak","CZ100222003","1","I-4","08 Apr 2026","-","-","-","-","-","-","-","-","2","07 Apr 2026","-","06 Apr 2026","-","05 Apr 2026","-","04 Apr 2026","-","03 Apr 2026","-","02 Apr 2026","-","01 Apr 2026","-","31 Mar 2026","Day Not Applicable for Calculation","30 Mar 2026","Day Not Applicable for Calculation","29 Mar 2026","Day Not Applicable for Calculation","3","3","4","4","5","4","3","-","-","-","2","1","0","0","2","1","1","2","-","-","-","1","5","","","-","08 Apr 2026 07:59:35","N/A","N/A","N/A","N/A","N/A","N/A"
"77242113UCO3001","Adult","Czech Republic","DD5-CZ10022","Petr Hrabak","CZ100222003","1","I-8","04 May 2026","-","-","-","-","-","-","-","-","2","03 May 2026","-","02 May 2026","-","01 May 2026","-","30 Apr 2026","-","29 Apr 2026","-","28 Apr 2026","-","27 Apr 2026","-","26 Apr 2026","Day Not Applicable for Calculation","25 Apr 2026","Day Not Applicable for Calculation","24 Apr 2026","Missing Diary;Day Not Applicable for Calculation","3","5","3","3","3","2","3","-","-","-","1","0","0","0","0","0","0","0","-","-","-","0","3","","","-","04 May 2026 07:52:47","N/A","N/A","N/A","N/A","N/A","N/A"
"77242113UCO3001","Adult","Czech Republic","DD5-CZ10022","Petr Hrabak","CZ100222003","1","I-12","01 Jun 2026","Yes","20 May 2026","19 May 2026","20 May 2026","-","-","3","-","2","31 May 2026","-","30 May 2026","-","29 May 2026","-","28 May 2026","-","27 May 2026","-","26 May 2026","-","25 May 2026","-","24 May 2026","Day Not Applicable for Calculation","23 May 2026","Day Not Applicable for Calculation","22 May 2026","Day Not Applicable for Calculation","4","4","6","3","3","3","3","-","-","-","2","1","1","2","1","1","1","2","-","-","-","1","5","6","8","-","01 Jun 2026 14:25:57","Clinical Nonresponder","No","N/A","N/A","N/A","N/A"
"77242113UCO3001","Adult","Czech Republic","DD5-CZ10022","Petr Hrabak","CZ100222005","1","I-0","09 Apr 2026","Yes","08 Apr 2026","31 Mar 2026","01 Apr 2026","-","-","2","-","2","08 Apr 2026","Endoscopy","07 Apr 2026","-","06 Apr 2026","-","05 Apr 2026","-","04 Apr 2026","-","03 Apr 2026","-","02 Apr 2026","-","01 Apr 2026","Bowel Preparation for Procedure;Day Not Applicable for Calculation","31 Mar 2026","Bowel Preparation for Procedure;Day Not Applicable for Calculation","30 Mar 2026","-","-","3","3","4","3","4","3","-","-","3","1","-","2","2","2","2","2","2","-","-","2","2","5","5","7","-","29 May 2026 11:07:08","N/A","N/A","N/A","N/A","N/A","N/A"
"77242113UCO3001","Adult","Czech Republic","DD5-CZ10022","Petr Hrabak","CZ100222005","1","I-2","22 Apr 2026","-","-","-","-","-","-","-","-","2","21 Apr 2026","-","20 Apr 2026","-","19 Apr 2026","-","18 Apr 2026","-","17 Apr 2026","-","16 Apr 2026","-","15 Apr 2026","-","14 Apr 2026","Day Not Applicable for Calculation","13 Apr 2026","Day Not Applicable for Calculation","12 Apr 2026","Day Not Applicable for Calculation","3","3","5","3","2","3","2","-","-","-","1","1","2","2","1","1","1","2","-","-","-","1","4","","","-","05 May 2026 07:29:35","N/A","N/A","N/A","N/A","N/A","N/A"
"77242113UCO3001","Adult","Czech Republic","DD5-CZ10022","Petr Hrabak","CZ100222005","1","I-4","05 May 2026","-","-","-","-","-","-","-","-","2","04 May 2026","-","03 May 2026","-","02 May 2026","-","01 May 2026","-","30 Apr 2026","-","29 Apr 2026","-","28 Apr 2026","-","27 Apr 2026","Day Not Applicable for Calculation","26 Apr 2026","Day Not Applicable for Calculation","25 Apr 2026","Day Not Applicable for Calculation","4","2","2","2","2","2","2","-","-","-","1","1","1","1","1","2","1","1","-","-","-","1","4","","","-","05 May 2026 07:28:55","N/A","N/A","N/A","N/A","N/A","N/A"
"77242113UCO3001","Adult","Czech Republic","DD5-CZ10022","Petr Hrabak","CZ100222005","1","I-8","02 Jun 2026","-","-","-","-","-","-","-","-","2","01 Jun 2026","-","31 May 2026","-","30 May 2026","-","29 May 2026","-","28 May 2026","-","27 May 2026","-","26 May 2026","-","25 May 2026","Day Not Applicable for Calculation","24 May 2026","Day Not Applicable for Calculation","23 May 2026","Day Not Applicable for Calculation","2","2","2","2","2","4","10","-","-","-","1","2","1","2","1","2","2","2","-","-","-","2","5","","","-","02 Jun 2026 08:18:08","N/A","N/A","N/A","N/A","N/A","N/A"
1 Protocol Study Population Country Site Principal Investigator Participant ID Baseline Stool Frequency Visit Visit Date Endoscopy Completed? Endoscopy Date Bowel Preparation Start Date 1 Bowel Preparation End Date 1 Bowel Preparation Start Date 2 Bowel Preparation End Date 2 Central Endoscopy Score Local Endoscopy Score PGA Score Eligible Day (-1) Day (-1) Excluded Reason(s) Eligible Day (-2) Day (-2) Excluded Reason(s) Eligible Day (-3) Day (-3) Excluded Reason(s) Eligible Day (-4) Day (-4) Excluded Reason(s) Eligible Day (-5) Day (-5) Excluded Reason(s) Eligible Day (-6) Day (-6) Excluded Reason(s) Eligible Day (-7) Day (-7) Excluded Reason(s) Eligible Day (-8) Day (-8) Excluded Reason(s) Eligible Day (-9) Day (-9) Excluded Reason(s) Eligible Day (-10) Day (-10) Excluded Reason(s) Eligible Day (-1) Stool Count Eligible Day (-2) Stool Count Eligible Day (-3) Stool Count Eligible Day (-4) Stool Count Eligible Day (-5) Stool Count Eligible Day (-6) Stool Count Eligible Day (-7) Stool Count Eligible Day (-8) Stool Count Eligible Day (-9) Stool Count Eligible Day (-10) Stool Count Stool Frequency Sub-score Eligible Day (-1) Rectal Bleeding Score Eligible Day (-2) Rectal Bleeding Score Eligible Day (-3) Rectal Bleeding Score Eligible Day (-4) Rectal Bleeding Score Eligible Day (-5) Rectal Bleeding Score Eligible Day (-6) Rectal Bleeding Score Eligible Day (-7) Rectal Bleeding Score Eligible Day (-8) Rectal Bleeding Score Eligible Day (-9) Rectal Bleeding Score Eligible Day (-10) Rectal Bleeding Score Rectal Bleeding Sub-score Partial Mayo Score Modified Mayo Score Full Mayo Score Site Action Last Mayo Score Submission Week I-12 Clinical Responder Week I-12 Clinical Remission Clinical Flare Loss of Response Partial Mayo Response Post Loss of Response Partial Mayo Response for Clinical Non-Responders
2 77242113UCO3001 Adult Czech Republic DD5-CZ10001 Matej Falc CZ100012001 1 I-0 19 Feb 2026 Yes 05 Feb 2026 04 Feb 2026 04 Feb 2026 - - 2 - 3 18 Feb 2026 - 17 Feb 2026 - 16 Feb 2026 - 15 Feb 2026 - 14 Feb 2026 - 13 Feb 2026 - 12 Feb 2026 - 11 Feb 2026 Day Not Applicable for Calculation 10 Feb 2026 Day Not Applicable for Calculation 09 Feb 2026 Day Not Applicable for Calculation 10 8 7 5 7 8 8 - - - 3 1 1 1 0 1 1 1 - - - 1 7 6 9 - 08 Apr 2026 07:11:25 N/A N/A N/A N/A N/A N/A
3 77242113UCO3001 Adult Czech Republic DD5-CZ10001 Matej Falc CZ100012001 1 I-2 04 Mar 2026 - - - - - - - - 3 03 Mar 2026 - 02 Mar 2026 - 01 Mar 2026 - 28 Feb 2026 - 27 Feb 2026 - 26 Feb 2026 - 25 Feb 2026 - 24 Feb 2026 Day Not Applicable for Calculation 23 Feb 2026 Day Not Applicable for Calculation 22 Feb 2026 Day Not Applicable for Calculation 5 4 5 4 5 6 6 - - - 2 1 0 1 0 1 0 1 - - - 1 6 - 28 May 2026 10:04:05 N/A N/A N/A N/A N/A N/A
4 77242113UCO3001 Adult Czech Republic DD5-CZ10001 Matej Falc CZ100012001 1 I-4 18 Mar 2026 - - - - - - - - 2 17 Mar 2026 - 16 Mar 2026 - 15 Mar 2026 - 14 Mar 2026 - 13 Mar 2026 - 12 Mar 2026 - 11 Mar 2026 - 10 Mar 2026 Day Not Applicable for Calculation 09 Mar 2026 Day Not Applicable for Calculation 08 Mar 2026 Day Not Applicable for Calculation 5 5 5 4 5 4 5 - - - 2 1 0 0 1 1 1 0 - - - 1 5 - 08 Apr 2026 11:04:49 N/A N/A N/A N/A N/A N/A
5 77242113UCO3001 Adult Czech Republic DD5-CZ10001 Matej Falc CZ100012001 1 I-8 05 May 2026 - - - - - - - - 1 04 May 2026 - 03 May 2026 - 02 May 2026 - 01 May 2026 - 30 Apr 2026 - 29 Apr 2026 - 28 Apr 2026 - 27 Apr 2026 Day Not Applicable for Calculation 26 Apr 2026 Day Not Applicable for Calculation 25 Apr 2026 Day Not Applicable for Calculation 3 3 4 4 5 4 4 - - - 2 1 1 1 1 1 1 1 - - - 1 4 - 28 May 2026 14:42:53 N/A N/A N/A N/A N/A N/A
6 77242113UCO3001 Adult Czech Republic DD5-CZ10001 Matej Falc CZ100012001 1 I-12 13 May 2026 Yes 06 May 2026 05 May 2026 05 May 2026 - - 1 - 1 12 May 2026 - 11 May 2026 - 10 May 2026 - 09 May 2026 - 08 May 2026 - 07 May 2026 - 06 May 2026 Endoscopy 05 May 2026 Bowel Preparation for Procedure;Day Not Applicable for Calculation 04 May 2026 - 03 May 2026 Day Not Applicable for Calculation 5 4 6 5 5 5 - - 3 - 2 1 0 1 1 1 1 - - 1 - 1 4 4 5 - 28 May 2026 14:43:11 Clinical Responder No N/A N/A N/A N/A
7 77242113UCO3001 Adult Czech Republic DD5-CZ10001 Matej Falc CZ100012001 1 M-4 10 Jun 2026 - - - - - - - - 1 09 Jun 2026 - 08 Jun 2026 - 07 Jun 2026 - 06 Jun 2026 - 05 Jun 2026 - 04 Jun 2026 - 03 Jun 2026 - 02 Jun 2026 Day Not Applicable for Calculation 01 Jun 2026 Day Not Applicable for Calculation 31 May 2026 Day Not Applicable for Calculation 4 5 3 4 5 4 5 - - - 2 0 0 0 0 1 0 1 - - - 0 3 - 10 Jun 2026 07:15:50 N/A N/A No N/A N/A N/A
8 77242113UCO3001 Adult Czech Republic DD5-CZ10001 Matej Falc CZ100012002 1 I-0 08 Apr 2026 Yes 18 Mar 2026 17 Mar 2026 18 Mar 2026 - - 2 - 2 07 Apr 2026 - 06 Apr 2026 - 05 Apr 2026 - 04 Apr 2026 Missing Diary 03 Apr 2026 - 02 Apr 2026 - 01 Apr 2026 - 31 Mar 2026 Day Not Applicable for Calculation 30 Mar 2026 Day Not Applicable for Calculation 29 Mar 2026 Day Not Applicable for Calculation 3 3 4 - 3 3 4 - - - 1 0 0 0 - 0 0 1 - - - 0 3 3 5 - 10 Jun 2026 08:42:08 N/A N/A N/A N/A N/A N/A
9 77242113UCO3001 Adult Czech Republic DD5-CZ10001 Matej Falc CZ100012002 1 I-2 23 Apr 2026 - - - - - - - - 2 22 Apr 2026 Missing Diary 21 Apr 2026 - 20 Apr 2026 - 19 Apr 2026 - 18 Apr 2026 - 17 Apr 2026 - 16 Apr 2026 - 15 Apr 2026 Day Not Applicable for Calculation 14 Apr 2026 Day Not Applicable for Calculation 13 Apr 2026 Day Not Applicable for Calculation - 3 3 6 5 5 4 - - - 2 - 0 0 1 1 1 1 - - - 1 5 - 10 Jun 2026 08:42:33 N/A N/A N/A N/A N/A N/A
10 77242113UCO3001 Adult Czech Republic DD5-CZ10001 Matej Falc CZ100012002 1 I-4 06 May 2026 - - - - - - - - 1 05 May 2026 - 04 May 2026 - 03 May 2026 - 02 May 2026 - 01 May 2026 - 30 Apr 2026 - 29 Apr 2026 - 28 Apr 2026 Day Not Applicable for Calculation 27 Apr 2026 Day Not Applicable for Calculation 26 Apr 2026 Day Not Applicable for Calculation 6 3 2 3 3 3 3 - - - 1 1 0 0 0 1 1 0 - - - 0 2 - 04 Jun 2026 07:39:06 N/A N/A N/A N/A N/A N/A
11 77242113UCO3001 Adult Czech Republic DD5-CZ10001 Matej Falc CZ100012002 1 I-8 04 Jun 2026 - - - - - - - - 1 03 Jun 2026 - 02 Jun 2026 - 01 Jun 2026 - 31 May 2026 - 30 May 2026 - 29 May 2026 - 28 May 2026 - 27 May 2026 Day Not Applicable for Calculation 26 May 2026 Day Not Applicable for Calculation 25 May 2026 Day Not Applicable for Calculation 3 4 3 3 3 3 4 - - - 1 0 0 0 0 0 0 1 - - - 0 2 - - N/A N/A N/A N/A N/A N/A
12 77242113UCO3001 Adult Czech Republic DD5-CZ10001 Matej Falc CZ100012003 1 I-0 27 May 2026 Yes 13 May 2026 12 May 2026 12 May 2026 - - 3 - 2 26 May 2026 - 25 May 2026 - 24 May 2026 - 23 May 2026 - 22 May 2026 - 21 May 2026 - 20 May 2026 - 19 May 2026 Day Not Applicable for Calculation 18 May 2026 Day Not Applicable for Calculation 17 May 2026 Day Not Applicable for Calculation 6 9 7 8 9 7 8 - - - 3 2 2 2 2 1 1 1 - - - 2 7 8 10 - 27 May 2026 07:24:39 N/A N/A N/A N/A N/A N/A
13 77242113UCO3001 Adult Czech Republic DD5-CZ10001 Matej Falc CZ100012003 1 I-2 10 Jun 2026 - - - - - - - - 2 09 Jun 2026 - 08 Jun 2026 - 07 Jun 2026 - 06 Jun 2026 - 05 Jun 2026 - 04 Jun 2026 - 03 Jun 2026 - 02 Jun 2026 Day Not Applicable for Calculation 01 Jun 2026 Day Not Applicable for Calculation 31 May 2026 Day Not Applicable for Calculation 7 8 8 7 6 8 6 - - - 3 2 2 1 2 2 2 1 - - - 2 7 - 10 Jun 2026 07:30:18 N/A N/A N/A N/A N/A N/A
14 77242113UCO3001 Adult Czech Republic DD5-CZ10003 Leksa Vaclav CZ100032001 2 I-0 10 Jun 2026 Yes 27 May 2026 26 May 2026 26 May 2026 - - 2 - 2 09 Jun 2026 - 08 Jun 2026 - 07 Jun 2026 - 06 Jun 2026 - 05 Jun 2026 - 04 Jun 2026 - 03 Jun 2026 - 02 Jun 2026 Day Not Applicable for Calculation 01 Jun 2026 Day Not Applicable for Calculation 31 May 2026 Day Not Applicable for Calculation 4 4 4 4 5 4 5 - - - 1 2 2 2 2 2 2 2 - - - 2 5 5 7 - 10 Jun 2026 08:48:09 N/A N/A N/A N/A N/A N/A
15 77242113UCO3001 Adult Czech Republic DD5-CZ10006 Michal Konecny CZ100062001 1 I-0 20 Mar 2026 Yes 19 Feb 2026 - - - - 3 - 3 19 Mar 2026 - 18 Mar 2026 - 17 Mar 2026 - 16 Mar 2026 - 15 Mar 2026 - 14 Mar 2026 - 13 Mar 2026 - 12 Mar 2026 Day Not Applicable for Calculation 11 Mar 2026 Day Not Applicable for Calculation 10 Mar 2026 Day Not Applicable for Calculation 7 7 8 8 7 8 5 - - - 3 2 1 1 1 1 1 0 - - - 1 7 7 10 - 20 Mar 2026 07:02:44 N/A N/A N/A N/A N/A N/A
16 77242113UCO3001 Adult Czech Republic DD5-CZ10006 Michal Konecny CZ100062001 1 I-2 08 Apr 2026 - - - - - - - - 2 07 Apr 2026 Medication For Diarrhea 06 Apr 2026 Medication For Diarrhea 05 Apr 2026 Medication For Diarrhea 04 Apr 2026 Medication For Diarrhea 03 Apr 2026 Medication For Diarrhea 02 Apr 2026 Medication For Diarrhea 01 Apr 2026 Medication For Diarrhea 31 Mar 2026 Medication For Diarrhea;Day Not Applicable for Calculation 30 Mar 2026 Medication For Diarrhea;Day Not Applicable for Calculation 29 Mar 2026 Day Not Applicable for Calculation - - - - - - - - - - Non-Evaluable - - - - - - - - - - Non-Evaluable Non-Evaluable Non-Evaluable Non-Evaluable - - N/A N/A N/A N/A N/A N/A
17 77242113UCO3001 Adult Czech Republic DD5-CZ10006 Michal Konecny CZ100062001 1 I-4 15 Apr 2026 - - - - - - - - 3 14 Apr 2026 - 13 Apr 2026 - 12 Apr 2026 - 11 Apr 2026 - 10 Apr 2026 - 09 Apr 2026 - 08 Apr 2026 - 07 Apr 2026 Medication For Diarrhea;Day Not Applicable for Calculation 06 Apr 2026 Medication For Diarrhea;Day Not Applicable for Calculation 05 Apr 2026 Medication For Diarrhea;Day Not Applicable for Calculation 9 22 20 19 17 18 18 - - - 3 1 3 2 2 2 2 2 - - - 2 8 - 04 May 2026 22:06:03 N/A N/A N/A N/A N/A N/A
18 77242113UCO3001 Adult Czech Republic DD5-CZ10006 Michal Konecny CZ100062001 1 I-8 18 May 2026 - - - - - - - - 2 17 May 2026 - 16 May 2026 - 15 May 2026 - 14 May 2026 - 13 May 2026 - 12 May 2026 - 11 May 2026 - 10 May 2026 Day Not Applicable for Calculation 09 May 2026 Day Not Applicable for Calculation 08 May 2026 Day Not Applicable for Calculation 7 5 9 7 7 8 8 - - - 3 1 1 1 1 1 1 1 - - - 1 6 - 04 Jun 2026 21:46:30 N/A N/A N/A N/A N/A N/A
19 77242113UCO3001 Adult Czech Republic DD5-CZ10006 Michal Konecny CZ100062001 1 I-12 08 Jun 2026 Yes 28 May 2026 - - - - 3 - 3 07 Jun 2026 - 06 Jun 2026 - 05 Jun 2026 - 04 Jun 2026 - 03 Jun 2026 - 02 Jun 2026 - 01 Jun 2026 Missing Diary 31 May 2026 Day Not Applicable for Calculation 30 May 2026 Day Not Applicable for Calculation 29 May 2026 Day Not Applicable for Calculation 6 5 5 5 7 6 - - - - 3 1 1 0 0 1 0 - - - - 1 7 7 10 - 11 Jun 2026 22:12:05 Clinical Nonresponder No N/A N/A N/A N/A
20 77242113UCO3001 Adult Czech Republic DD5-CZ10006 Michal Konecny CZ100062002 1 I-0 26 May 2026 Yes 14 May 2026 13 May 2026 13 May 2026 - - 2 - 2 25 May 2026 - 24 May 2026 - 23 May 2026 - 22 May 2026 - 21 May 2026 - 20 May 2026 - 19 May 2026 - 18 May 2026 Day Not Applicable for Calculation 17 May 2026 Day Not Applicable for Calculation 16 May 2026 Day Not Applicable for Calculation 8 8 6 7 7 6 7 - - - 3 2 2 2 2 2 2 2 - - - 2 7 7 9 - 29 May 2026 15:45:00 N/A N/A N/A N/A N/A N/A
21 77242113UCO3001 Adult Czech Republic DD5-CZ10006 Michal Konecny CZ100062002 1 I-2 09 Jun 2026 - - - - - - - - 2 08 Jun 2026 - 07 Jun 2026 - 06 Jun 2026 - 05 Jun 2026 - 04 Jun 2026 - 03 Jun 2026 - 02 Jun 2026 - 01 Jun 2026 Day Not Applicable for Calculation 31 May 2026 Day Not Applicable for Calculation 30 May 2026 Day Not Applicable for Calculation 7 8 7 7 7 5 7 - - - 3 2 1 1 1 2 2 2 - - - 2 7 - 11 Jun 2026 22:12:40 N/A N/A N/A N/A N/A N/A
22 77242113UCO3001 Adult Czech Republic DD5-CZ10009 Jiri Pumprla CZ100092001 1 I-0 05 May 2026 Yes 24 Apr 2026 23 Apr 2026 23 Apr 2026 - - 2 - 2 04 May 2026 - 03 May 2026 - 02 May 2026 - 01 May 2026 - 30 Apr 2026 - 29 Apr 2026 - 28 Apr 2026 - 27 Apr 2026 Day Not Applicable for Calculation 26 Apr 2026 Day Not Applicable for Calculation 25 Apr 2026 Day Not Applicable for Calculation 5 5 5 5 5 5 5 - - - 2 1 1 1 1 1 1 1 - - - 1 5 5 7 - 05 May 2026 11:19:40 N/A N/A N/A N/A N/A N/A
23 77242113UCO3001 Adult Czech Republic DD5-CZ10009 Jiri Pumprla CZ100092001 1 I-2 19 May 2026 - - - - - - - - 1 18 May 2026 - 17 May 2026 - 16 May 2026 - 15 May 2026 - 14 May 2026 - 13 May 2026 - 12 May 2026 - 11 May 2026 Day Not Applicable for Calculation 10 May 2026 Day Not Applicable for Calculation 09 May 2026 Day Not Applicable for Calculation 5 4 5 5 5 4 6 - - - 2 1 1 1 1 1 1 1 - - - 1 4 - 19 May 2026 10:38:25 N/A N/A N/A N/A N/A N/A
24 77242113UCO3001 Adult Czech Republic DD5-CZ10009 Jiri Pumprla CZ100092001 1 I-4 04 Jun 2026 - - - - - - - - 1 03 Jun 2026 - 02 Jun 2026 - 01 Jun 2026 - 31 May 2026 - 30 May 2026 - 29 May 2026 - 28 May 2026 - 27 May 2026 Day Not Applicable for Calculation 26 May 2026 Day Not Applicable for Calculation 25 May 2026 Day Not Applicable for Calculation 2 3 2 3 3 2 3 - - - 1 0 0 0 0 0 0 0 - - - 0 2 - 04 Jun 2026 09:24:54 N/A N/A N/A N/A N/A N/A
25 77242113UCO3001 Adult Czech Republic DD5-CZ10012 Stefan Konecny CZ100122001 5 I-0 07 Apr 2026 Yes 24 Mar 2026 22 Mar 2026 22 Mar 2026 - - 2 - 2 06 Apr 2026 - 05 Apr 2026 - 04 Apr 2026 - 03 Apr 2026 - 02 Apr 2026 - 01 Apr 2026 - 31 Mar 2026 - 30 Mar 2026 Day Not Applicable for Calculation 29 Mar 2026 Day Not Applicable for Calculation 28 Mar 2026 Day Not Applicable for Calculation 8 11 5 9 11 10 13 - - - 3 1 2 2 2 2 2 2 - - - 2 7 7 9 - 04 May 2026 08:44:52 N/A N/A N/A N/A N/A N/A
26 77242113UCO3001 Adult Czech Republic DD5-CZ10012 Stefan Konecny CZ100122001 5 I-2 22 Apr 2026 - - - - - - - - 2 21 Apr 2026 - 20 Apr 2026 - 19 Apr 2026 - 18 Apr 2026 - 17 Apr 2026 - 16 Apr 2026 - 15 Apr 2026 - 14 Apr 2026 Day Not Applicable for Calculation 13 Apr 2026 Day Not Applicable for Calculation 12 Apr 2026 Day Not Applicable for Calculation 7 5 6 6 7 8 2 - - - 1 1 0 1 1 1 2 0 - - - 1 4 - 04 May 2026 08:45:07 N/A N/A N/A N/A N/A N/A
27 77242113UCO3001 Adult Czech Republic DD5-CZ10012 Stefan Konecny CZ100122001 5 I-4 07 May 2026 - - - - - - - - 1 06 May 2026 - 05 May 2026 - 04 May 2026 - 03 May 2026 - 02 May 2026 - 01 May 2026 - 30 Apr 2026 - 29 Apr 2026 Day Not Applicable for Calculation 28 Apr 2026 Day Not Applicable for Calculation 27 Apr 2026 Day Not Applicable for Calculation 8 7 7 8 4 11 7 - - - 1 2 1 1 1 0 1 1 - - - 1 3 - 01 Jun 2026 00:57:35 N/A N/A N/A N/A N/A N/A
28 77242113UCO3001 Adult Czech Republic DD5-CZ10012 Stefan Konecny CZ100122001 5 I-8 03 Jun 2026 - - - - - - - - 2 02 Jun 2026 - 01 Jun 2026 - 31 May 2026 - 30 May 2026 - 29 May 2026 - 28 May 2026 - 27 May 2026 - 26 May 2026 Day Not Applicable for Calculation 25 May 2026 Day Not Applicable for Calculation 24 May 2026 Day Not Applicable for Calculation 5 9 7 5 5 9 7 - - - 1 1 1 1 0 3 0 1 - - - 1 4 - 03 Jun 2026 17:47:25 N/A N/A N/A N/A N/A N/A
29 77242113UCO3001 Adult Czech Republic DD5-CZ10013 David Stepek CZ100132001 1 I-0 24 Mar 2026 Yes 12 Mar 2026 11 Mar 2026 11 Mar 2026 - - 2 - 2 23 Mar 2026 - 22 Mar 2026 - 21 Mar 2026 - 20 Mar 2026 - 19 Mar 2026 - 18 Mar 2026 - 17 Mar 2026 - 16 Mar 2026 Day Not Applicable for Calculation 15 Mar 2026 Day Not Applicable for Calculation 14 Mar 2026 Day Not Applicable for Calculation 8 6 5 7 6 7 6 - - - 3 1 1 1 0 1 1 1 - - - 1 6 6 8 - 05 Apr 2026 22:41:27 N/A N/A N/A N/A N/A N/A
30 77242113UCO3001 Adult Czech Republic DD5-CZ10013 David Stepek CZ100132001 1 I-2 08 Apr 2026 - - - - - - - - 2 07 Apr 2026 - 06 Apr 2026 - 05 Apr 2026 - 04 Apr 2026 - 03 Apr 2026 - 02 Apr 2026 - 01 Apr 2026 - 31 Mar 2026 Day Not Applicable for Calculation 30 Mar 2026 Day Not Applicable for Calculation 29 Mar 2026 Day Not Applicable for Calculation 5 2 3 6 5 5 5 - - - 2 0 0 0 0 1 1 0 - - - 0 4 - 28 May 2026 23:19:03 N/A N/A N/A N/A N/A N/A
31 77242113UCO3001 Adult Czech Republic DD5-CZ10013 David Stepek CZ100132001 1 I-4 21 Apr 2026 - - - - - - - - 0 20 Apr 2026 - 19 Apr 2026 - 18 Apr 2026 - 17 Apr 2026 - 16 Apr 2026 - 15 Apr 2026 - 14 Apr 2026 - 13 Apr 2026 Day Not Applicable for Calculation 12 Apr 2026 Day Not Applicable for Calculation 11 Apr 2026 Day Not Applicable for Calculation 4 3 4 3 3 4 4 - - - 2 0 0 0 0 0 0 0 - - - 0 2 - 27 May 2026 12:54:41 N/A N/A N/A N/A N/A N/A
32 77242113UCO3001 Adult Czech Republic DD5-CZ10013 David Stepek CZ100132002 1 I-0 12 May 2026 Yes 21 Apr 2026 20 Apr 2026 21 Apr 2026 - - 2 - 2 11 May 2026 - 10 May 2026 - 09 May 2026 - 08 May 2026 - 07 May 2026 - 06 May 2026 - 05 May 2026 Missing Diary 04 May 2026 Day Not Applicable for Calculation 03 May 2026 Day Not Applicable for Calculation 02 May 2026 Day Not Applicable for Calculation 2 1 1 1 1 2 - - - - 0 0 0 0 0 0 0 - - - - 0 2 2 4 - 28 May 2026 23:19:30 N/A N/A N/A N/A N/A N/A
33 77242113UCO3001 Adult Czech Republic DD5-CZ10013 David Stepek CZ100132002 1 I-2 26 May 2026 - - - - - - - - 1 25 May 2026 - 24 May 2026 Missing Diary 23 May 2026 - 22 May 2026 - 21 May 2026 - 20 May 2026 - 19 May 2026 - 18 May 2026 Missing Diary;Day Not Applicable for Calculation 17 May 2026 Day Not Applicable for Calculation 16 May 2026 Day Not Applicable for Calculation 1 - 1 2 1 2 2 - - - 1 0 - 0 0 0 0 0 - - - 0 2 - 28 May 2026 23:19:51 N/A N/A N/A N/A N/A N/A
34 77242113UCO3001 Adult Czech Republic DD5-CZ10013 David Stepek CZ100132002 1 I-4 10 Jun 2026 - - - - - - - - 2 09 Jun 2026 - 08 Jun 2026 Missing Diary 07 Jun 2026 - 06 Jun 2026 - 05 Jun 2026 - 04 Jun 2026 - 03 Jun 2026 - 02 Jun 2026 Missing Diary;Day Not Applicable for Calculation 01 Jun 2026 Day Not Applicable for Calculation 31 May 2026 Day Not Applicable for Calculation 4 - 1 1 2 2 1 - - - 1 0 - 0 0 0 0 0 - - - 0 3 - - N/A N/A N/A N/A N/A N/A
35 77242113UCO3001 Adult Czech Republic DD5-CZ10013 David Stepek CZ100132003 1 I-0 02 Jun 2026 Yes 25 May 2026 24 May 2026 24 May 2026 - - 2 - 2 01 Jun 2026 - 31 May 2026 - 30 May 2026 - 29 May 2026 - 28 May 2026 - 27 May 2026 - 26 May 2026 - 25 May 2026 Endoscopy;Missing Diary;Day Not Applicable for Calculation 24 May 2026 Bowel Preparation for Procedure;Missing Diary;Day Not Applicable for Calculation 23 May 2026 Missing Diary;Day Not Applicable for Calculation 8 8 11 10 10 11 6 - - - 3 2 2 1 2 1 2 2 - - - 2 7 7 9 - 02 Jun 2026 08:17:40 N/A N/A N/A N/A N/A N/A
36 77242113UCO3001 Adult Czech Republic DD5-CZ10013 David Stepek CZ100132003 1 I-2 10 Jun 2026 - - - - - - - - 2 09 Jun 2026 - 08 Jun 2026 - 07 Jun 2026 - 06 Jun 2026 - 05 Jun 2026 - 04 Jun 2026 - 03 Jun 2026 - 02 Jun 2026 Day Not Applicable for Calculation 01 Jun 2026 Day Not Applicable for Calculation 31 May 2026 Day Not Applicable for Calculation 9 2 1 4 2 4 2 - - - 1 1 1 0 1 1 1 0 - - - 1 4 - - N/A N/A N/A N/A N/A N/A
37 77242113UCO3001 Adult Czech Republic DD5-CZ10016 Robert Mudr CZ100162001 1 I-0 28 May 2026 Yes 19 May 2026 18 May 2026 19 May 2026 - - 3 - 3 27 May 2026 - 26 May 2026 - 25 May 2026 - 24 May 2026 - 23 May 2026 - 22 May 2026 - 21 May 2026 - 20 May 2026 Day Not Applicable for Calculation 19 May 2026 Endoscopy;Bowel Preparation for Procedure;Day Not Applicable for Calculation 18 May 2026 Bowel Preparation for Procedure;Day Not Applicable for Calculation 14 15 15 15 15 15 15 - - - 3 2 3 3 2 2 3 3 - - - 3 9 9 12 - 28 May 2026 10:22:48 N/A N/A N/A N/A N/A N/A
38 77242113UCO3001 Adult Czech Republic DD5-CZ10016 Robert Mudr CZ100162001 1 I-2 11 Jun 2026 - - - - - - - - 3 10 Jun 2026 - 09 Jun 2026 - 08 Jun 2026 - 07 Jun 2026 - 06 Jun 2026 - 05 Jun 2026 - 04 Jun 2026 - 03 Jun 2026 Day Not Applicable for Calculation 02 Jun 2026 Day Not Applicable for Calculation 01 Jun 2026 Day Not Applicable for Calculation 10 9 9 8 13 9 8 - - - 3 2 1 1 1 2 1 1 - - - 1 7 - - N/A N/A N/A N/A N/A N/A
39 77242113UCO3001 Adolescent Czech Republic DD5-CZ10020 Lucie Gonsorcikova CZ100201001 1 Unscheduled 1 04 May 2026 Yes 20 Apr 2026 12 Apr 2026 15 Apr 2026 - - 2 - 3 03 May 2026 - 02 May 2026 - 01 May 2026 - 30 Apr 2026 - 29 Apr 2026 - 28 Apr 2026 - 27 Apr 2026 - 26 Apr 2026 Day Not Applicable for Calculation 25 Apr 2026 Day Not Applicable for Calculation 24 Apr 2026 Day Not Applicable for Calculation 5 6 6 7 6 3 3 - - - 2 0 0 0 0 0 0 0 - - - 0 5 4 7 - - N/A N/A N/A N/A N/A N/A
40 77242113UCO3001 Adolescent Czech Republic DD5-CZ10020 Lucie Gonsorcikova CZ100201001 1 I-0 18 May 2026 Yes 01 May 2026 01 May 2026 01 May 2026 - - 2 - 3 17 May 2026 - 16 May 2026 - 15 May 2026 - 14 May 2026 - 13 May 2026 - 12 May 2026 - 11 May 2026 - 10 May 2026 Day Not Applicable for Calculation 09 May 2026 Day Not Applicable for Calculation 08 May 2026 Day Not Applicable for Calculation 6 6 6 6 6 6 6 - - - 3 0 0 0 0 0 0 0 - - - 0 6 5 8 - 18 May 2026 08:39:27 N/A N/A N/A N/A N/A N/A
41 77242113UCO3001 Adolescent Czech Republic DD5-CZ10020 Lucie Gonsorcikova CZ100201001 1 I-2 01 Jun 2026 - - - - - - - - 3 31 May 2026 - 30 May 2026 Missing Diary 29 May 2026 Missing Diary 28 May 2026 Missing Diary 27 May 2026 - 26 May 2026 - 25 May 2026 - 24 May 2026 Day Not Applicable for Calculation 23 May 2026 Day Not Applicable for Calculation 22 May 2026 Day Not Applicable for Calculation 6 - - - 6 6 6 - - - 3 0 - - - 0 0 0 - - - 0 6 - - N/A N/A N/A N/A N/A N/A
42 77242113UCO3001 Adult Czech Republic DD5-CZ10021 Martin Bortlik CZ100212001 1 I-0 07 Apr 2026 Yes 16 Mar 2026 15 Mar 2026 16 Mar 2026 - - 3 - 3 06 Apr 2026 - 05 Apr 2026 - 04 Apr 2026 - 03 Apr 2026 - 02 Apr 2026 - 01 Apr 2026 - 31 Mar 2026 - 30 Mar 2026 Day Not Applicable for Calculation 29 Mar 2026 Day Not Applicable for Calculation 28 Mar 2026 Day Not Applicable for Calculation 11 11 10 11 11 10 9 - - - 3 2 2 2 2 2 2 2 - - - 2 8 8 11 - 20 Apr 2026 09:27:58 N/A N/A N/A N/A N/A N/A
43 77242113UCO3001 Adult Czech Republic DD5-CZ10021 Martin Bortlik CZ100212001 1 I-2 20 Apr 2026 - - - - - - - - 3 19 Apr 2026 - 18 Apr 2026 - 17 Apr 2026 - 16 Apr 2026 - 15 Apr 2026 - 14 Apr 2026 - 13 Apr 2026 - 12 Apr 2026 Day Not Applicable for Calculation 11 Apr 2026 Day Not Applicable for Calculation 10 Apr 2026 Day Not Applicable for Calculation 8 7 9 8 8 7 8 - - - 3 2 2 1 1 1 2 1 - - - 1 7 - 20 Apr 2026 09:29:01 N/A N/A N/A N/A N/A N/A
44 77242113UCO3001 Adult Czech Republic DD5-CZ10021 Martin Bortlik CZ100212001 1 I-4 05 May 2026 - - - - - - - - 1 04 May 2026 - 03 May 2026 - 02 May 2026 - 01 May 2026 - 30 Apr 2026 - 29 Apr 2026 - 28 Apr 2026 - 27 Apr 2026 Day Not Applicable for Calculation 26 Apr 2026 Day Not Applicable for Calculation 25 Apr 2026 Day Not Applicable for Calculation 6 6 6 6 7 7 6 - - - 3 0 0 1 1 1 1 1 - - - 1 5 - - N/A N/A N/A N/A N/A N/A
45 77242113UCO3001 Adult Czech Republic DD5-CZ10021 Martin Bortlik CZ100212001 1 I-8 02 Jun 2026 - - - - - - - - 1 01 Jun 2026 - 31 May 2026 - 30 May 2026 - 29 May 2026 - 28 May 2026 - 27 May 2026 - 26 May 2026 - 25 May 2026 Day Not Applicable for Calculation 24 May 2026 Day Not Applicable for Calculation 23 May 2026 Day Not Applicable for Calculation 3 4 4 4 5 5 5 - - - 2 0 0 0 0 0 1 1 - - - 0 3 - 02 Jun 2026 14:44:34 N/A N/A N/A N/A N/A N/A
46 77242113UCO3001 Adult Czech Republic DD5-CZ10022 Petr Hrabak CZ100222002 1 I-0 19 Feb 2026 Yes 11 Feb 2026 10 Feb 2026 11 Feb 2026 - - 2 - 2 18 Feb 2026 - 17 Feb 2026 - 16 Feb 2026 - 15 Feb 2026 - 14 Feb 2026 - 13 Feb 2026 - 12 Feb 2026 - 11 Feb 2026 Endoscopy;Bowel Preparation for Procedure;Day Not Applicable for Calculation 10 Feb 2026 Bowel Preparation for Procedure;Day Not Applicable for Calculation 09 Feb 2026 Day Not Applicable for Calculation 3 2 2 3 4 3 2 - - - 1 1 1 0 0 0 2 2 - - - 1 4 4 6 - 19 Feb 2026 15:24:43 N/A N/A N/A N/A N/A N/A
47 77242113UCO3001 Adult Czech Republic DD5-CZ10022 Petr Hrabak CZ100222003 1 I-0 09 Mar 2026 Yes 11 Feb 2026 10 Feb 2026 11 Feb 2026 - - 2 - 2 08 Mar 2026 - 07 Mar 2026 - 06 Mar 2026 - 05 Mar 2026 - 04 Mar 2026 - 03 Mar 2026 Missing Diary 02 Mar 2026 Missing Diary 01 Mar 2026 Missing Diary;Day Not Applicable for Calculation 28 Feb 2026 Missing Diary;Day Not Applicable for Calculation 27 Feb 2026 Missing Diary;Day Not Applicable for Calculation 7 7 6 6 7 - - - - - 3 2 2 2 2 2 - - - - - 2 7 7 9 - 22 Mar 2026 18:34:58 N/A N/A N/A N/A N/A N/A
48 77242113UCO3001 Adult Czech Republic DD5-CZ10022 Petr Hrabak CZ100222003 1 I-2 27 Mar 2026 - - - - - - - - 2 26 Mar 2026 - 25 Mar 2026 - 24 Mar 2026 - 23 Mar 2026 - 22 Mar 2026 - 21 Mar 2026 - 20 Mar 2026 - 19 Mar 2026 Day Not Applicable for Calculation 18 Mar 2026 Day Not Applicable for Calculation 17 Mar 2026 Day Not Applicable for Calculation 7 3 3 3 5 5 5 - - - 2 0 0 1 1 1 1 2 - - - 1 5 - 08 Apr 2026 07:36:56 N/A N/A N/A N/A N/A N/A
49 77242113UCO3001 Adult Czech Republic DD5-CZ10022 Petr Hrabak CZ100222003 1 I-4 08 Apr 2026 - - - - - - - - 2 07 Apr 2026 - 06 Apr 2026 - 05 Apr 2026 - 04 Apr 2026 - 03 Apr 2026 - 02 Apr 2026 - 01 Apr 2026 - 31 Mar 2026 Day Not Applicable for Calculation 30 Mar 2026 Day Not Applicable for Calculation 29 Mar 2026 Day Not Applicable for Calculation 3 3 4 4 5 4 3 - - - 2 1 0 0 2 1 1 2 - - - 1 5 - 08 Apr 2026 07:59:35 N/A N/A N/A N/A N/A N/A
50 77242113UCO3001 Adult Czech Republic DD5-CZ10022 Petr Hrabak CZ100222003 1 I-8 04 May 2026 - - - - - - - - 2 03 May 2026 - 02 May 2026 - 01 May 2026 - 30 Apr 2026 - 29 Apr 2026 - 28 Apr 2026 - 27 Apr 2026 - 26 Apr 2026 Day Not Applicable for Calculation 25 Apr 2026 Day Not Applicable for Calculation 24 Apr 2026 Missing Diary;Day Not Applicable for Calculation 3 5 3 3 3 2 3 - - - 1 0 0 0 0 0 0 0 - - - 0 3 - 04 May 2026 07:52:47 N/A N/A N/A N/A N/A N/A
51 77242113UCO3001 Adult Czech Republic DD5-CZ10022 Petr Hrabak CZ100222003 1 I-12 01 Jun 2026 Yes 20 May 2026 19 May 2026 20 May 2026 - - 3 - 2 31 May 2026 - 30 May 2026 - 29 May 2026 - 28 May 2026 - 27 May 2026 - 26 May 2026 - 25 May 2026 - 24 May 2026 Day Not Applicable for Calculation 23 May 2026 Day Not Applicable for Calculation 22 May 2026 Day Not Applicable for Calculation 4 4 6 3 3 3 3 - - - 2 1 1 2 1 1 1 2 - - - 1 5 6 8 - 01 Jun 2026 14:25:57 Clinical Nonresponder No N/A N/A N/A N/A
52 77242113UCO3001 Adult Czech Republic DD5-CZ10022 Petr Hrabak CZ100222005 1 I-0 09 Apr 2026 Yes 08 Apr 2026 31 Mar 2026 01 Apr 2026 - - 2 - 2 08 Apr 2026 Endoscopy 07 Apr 2026 - 06 Apr 2026 - 05 Apr 2026 - 04 Apr 2026 - 03 Apr 2026 - 02 Apr 2026 - 01 Apr 2026 Bowel Preparation for Procedure;Day Not Applicable for Calculation 31 Mar 2026 Bowel Preparation for Procedure;Day Not Applicable for Calculation 30 Mar 2026 - - 3 3 4 3 4 3 - - 3 1 - 2 2 2 2 2 2 - - 2 2 5 5 7 - 29 May 2026 11:07:08 N/A N/A N/A N/A N/A N/A
53 77242113UCO3001 Adult Czech Republic DD5-CZ10022 Petr Hrabak CZ100222005 1 I-2 22 Apr 2026 - - - - - - - - 2 21 Apr 2026 - 20 Apr 2026 - 19 Apr 2026 - 18 Apr 2026 - 17 Apr 2026 - 16 Apr 2026 - 15 Apr 2026 - 14 Apr 2026 Day Not Applicable for Calculation 13 Apr 2026 Day Not Applicable for Calculation 12 Apr 2026 Day Not Applicable for Calculation 3 3 5 3 2 3 2 - - - 1 1 2 2 1 1 1 2 - - - 1 4 - 05 May 2026 07:29:35 N/A N/A N/A N/A N/A N/A
54 77242113UCO3001 Adult Czech Republic DD5-CZ10022 Petr Hrabak CZ100222005 1 I-4 05 May 2026 - - - - - - - - 2 04 May 2026 - 03 May 2026 - 02 May 2026 - 01 May 2026 - 30 Apr 2026 - 29 Apr 2026 - 28 Apr 2026 - 27 Apr 2026 Day Not Applicable for Calculation 26 Apr 2026 Day Not Applicable for Calculation 25 Apr 2026 Day Not Applicable for Calculation 4 2 2 2 2 2 2 - - - 1 1 1 1 1 2 1 1 - - - 1 4 - 05 May 2026 07:28:55 N/A N/A N/A N/A N/A N/A
55 77242113UCO3001 Adult Czech Republic DD5-CZ10022 Petr Hrabak CZ100222005 1 I-8 02 Jun 2026 - - - - - - - - 2 01 Jun 2026 - 31 May 2026 - 30 May 2026 - 29 May 2026 - 28 May 2026 - 27 May 2026 - 26 May 2026 - 25 May 2026 Day Not Applicable for Calculation 24 May 2026 Day Not Applicable for Calculation 23 May 2026 Day Not Applicable for Calculation 2 2 2 2 2 4 10 - - - 1 2 1 2 1 2 2 2 - - - 2 5 - 02 Jun 2026 08:18:08 N/A N/A N/A N/A N/A N/A
@@ -0,0 +1,219 @@
"Protocol","Country","Site","PI Name","Subject ID","Age at Informed Consent","Baseline Stool Count","Confirm Baseline Stool Count","Data Correction ID","Creation Date UTC","Status","Description","Date of Last Action UTC","Total Open Period","Total Open Time (Days)","Current Status Time (Days)","Type","Next Action Required","Category","Query History","Reason for Change","Resolution"
"77242113UCO3001","Czech Republic","DD5-CZ10001","Matej Falc","CZ100012001","48","1","","SW00703544","13-May-2026","Submitted","Please change answer to clinical remision from no to YES (week 12). Entry erros ","20-May-2026","15-21 Days","21","16","Query Active ","Site","New","(1) 20 May 2026 msullivan (Clario): Please confirm your request
Dear Site. Thank you for submitting this Data Clarification Request.
For us to process your request, please let us know the name of the form (with date) with question.
Thank you. ERT/CLARIO Data Coordination Team
","Entry Error",""
"77242113UCO3001","Czech Republic","DD5-CZ10001","Matej Falc","CZ100012002","79","1","","SW00696586","09-Apr-2026","ReadyForQC","Please correct date of endoscopy to date: 18 March 2026 (from 25 March 2026)","15-Apr-2026","Over 28 Days","43","40","Query Active ","Site","Site-Entered Data","","Entry Error","CLARIO RESOLUTION:
Part 1: In Mayo Subscore (1) dated 08 Apr 2026 for I-0 visit, CLARIO to make the following changes:
- What was the date of endoscopy? (ENDODT1D): from 25 Mar 2026 to 18 Mar 2026
- Data Flag (QSDFLG1B): from blank to check
"
"77242113UCO3001","Czech Republic","DD5-CZ10006","Michal Konecny","CZ100062001","19","1","","SW00704536","19-May-2026","ReadyForQC","Please change the endoscopy date to 19-FEB-2026. 06-MAR-2026 was entered in error. ","26-May-2026","15-21 Days","18","13","Query Active ","Site","Site-Entered Data","","Entry Error","CLARIO RESOLUTION:
Part 1: In Mayo Subscore (1) dated 20 Mar 2026 for I-0 visit, CLARIO to make the following changes:
-What was the date of endoscopy? (ENDODT1D): from 06 Mar 2026 to 19 Feb 2026
- Data Flag (QSDFLG1B): from blank to check
"
"77242113UCO3001","Czech Republic","DD5-CZ10012","Stefan Konecny","CZ100122001","22","5","Yes, I confirm this is the correct stool count.","SW00706684","01-Jun-2026","Submitted","The right endoscopy date is 23MAR2026, please change the date","15-Jun-2026","8-14 Days","9","","","Clario DM","New","(1) 05 Jun 2026 msullivan (Clario): Please confirm your request
Dear Site. Thank you for submitting this Data Clarification.
Please confirm that if you are requesting following.
Mayo Subscore (1) dated 07 Apr 2026 for I-0
What was the date of endoscopy? (ENDODT1D): from 24 Mar 2026 to 23 Mar 2026
Thank you. ERT/CLARIO Data Coordination Team.
(2) 15 Jun 2026 hosova.kristyna@fnbrno.cz (Site User): The endoscopy was performed 23MAR2026
","Entry Error",""
"77242113UCO3001","Czech Republic","DD5-CZ10013","David Stepek","CZ100132002","29","1","","SW00705646","26-May-2026","ReadyForQC","Correct visit date I-O is 12-May-2026. All questionaries were filled on paper and entered in tablet later.
Log-in issue. ","09-Jun-2026","8-14 Days","13","3","","Clario DM","Visit Data","(1) 01 Jun 2026 msullivan (Clario): Please confirm your request
Dear Site. Thank you for submitting this Data Clarification.
Please provide the timestamps for each of the assessments if you used paper forms and transcribed into the device.
If unknown, ERT will use a dummy timestamp.
Thank you. ERT/CLARIO Data Coordination Team.
(2) 01 Jun 2026 dstepek@vnbrno.cz (Site User): time is unknown
","Changed Information","CLARIO RESOLUTION:
Part 1: In the following forms for I-0, CLARIO to make the following changes:
-Report Date: from 26May 2026 to 12 May 2026
-Report Start Date and time: from 26 May 2026 to 12 May 2026 23:59:59
-Event End Date: from 26 May 2026 08:27:57 to 12 May 2026 23:59:59
+Tablet Training Module (1)
+Participant Start Instructions (1)
+IBDQ (1)
+PROMIS Fatigue Short Form 7a (1)
+BASDAI (1)
+Participant End Instructions (1)
+Visit End (122)
"
"77242113UCO3001","Czech Republic","DD5-CZ10013","David Stepek","CZ100132003","49","1","","SW00708623","10-Jun-2026","Cancelled","Correct date of I-2 is 26.5.2026. all questionaries were entered on paper at 07,45 and transmited later. ","10-Jun-2026","1 Day","1","","","","New","","yes, subject mishmasch",""
"77242113UCO3001","Czech Republic","DD5-CZ10013","David Stepek","CZ100132003","49","1","","SW00706581","29-May-2026","Completed","baseline stool count reported by subject is 0, please change to 1 as per CRA request (subject has 1 stool in 2-3 days if in remission)","10-Jun-2026","4-7 Days","7","","","","Demographic","","Changed Information","CLARIO RESOLUTION:
Part 1: In System Variables form, CLARIO to make the following changes:
- Baseline Stool Count (PT.Custom4): from 0 to 1
"
"77242113UCO3001","Czech Republic","DD5-CZ10016","Robert Mudr","CZ100162001","48","1","","SW00705916","27-May-2026","Completed","As per ATS investigation (ATS26040111), please remove the below form which was entered as a duplicate
- MAYO Diary (5) 24 Apr 2026","10-Jun-2026","8-14 Days","9","","","","Technical Revision","","Technical Revision - Other","CLARIO RESOLUTION:
Part 1: CLARIO to delete MAYO Diary (5) dated 24 Apr 2026
"
"77242113UCO3001","Czech Republic","DD5-CZ10020","Lucie Gonsorcikova","CZ100201001","15","1","","SW00701729","06-May-2026","Completed","Dears, please delete data from visit I-0 (reported as 4th of May 2026) as this visit had to be postponed - see the previous DCR of this patient and change data request that was corrected. Patient has left the site before it was resolved and and new date of I-0 was planned. Patient continues to fill in his diary and patient is coming to I=0 visit within allowed window. We need the system and tablet to be ready to run new Mayo Score Report with updated and recent data (e.g. reflect new I-0 visit date, new eligible days -1 to -7.).
thank you, Jiri Skopek","19-May-2026","8-14 Days","8","","","","Visit Data","(1) 11 May 2026 msullivan (Clario): Please confirm your request
Dear Site. Thank you for submitting this Data Clarification.
Please note that the delete forms are allowed if the reason is one of the following.
If not, forms will move to unscheduled visit.
Data collected by the wrong patient.
Data collected by someone other than the patient.
Data collected prior to informed consent, or after withdrawal from the study.
Duplicate data erroneously entered at an Unscheduled visit via paper transcription.
Data collected that is not expected per protocol.
Also, I-0 visit is still ongoing. Please close the visit.
Once the visit was closed, we will process accoridngly.
Thank you. ERT/CLARIO Data Coordination Team
(2) 11 May 2026 jskopek (Site User): Dears,
I do not see any option that is adequate -from the list. Data are not needed to be deleted fully, they reflect the situation at May4th. Please mark it as unscheduled visit - as exactly that is the case. We need the system to be ready for I-0 visit planned for next week.
I will close the visit tomorrow - do you mean in tablet/ipad?
Thank you very much for your help! Jiri
(3) 12 May 2026 venkata.ramana (Clario): Thank you for your response.
Please note that the visit I-0 was still ongoing but not closed yet.
So please close the visit.
Kind Regards, Clario Data Coordination Team.
(4) 12 May 2026 jskopek (Site User): If I try to close the I-O visit in TABLET, it asks me if patient fulfils eligibility criteria to proceed to next visit based on these old data if I answer NO, it asks me to DEACTIVATE patient. I do not want to DEACTIVATE patient can you help WHERE and HOW to close this visit for you to change it to UNSCHEDULED and not to de-activate patient?
Thank you Jiri
","Other-delete visit I-0","CLARIO RESOLUTION:
Part 1: In the following forms dated 04 May 2026, CLARIO to make the following changes:
-Event ID: from I-0 to Unscheduled Visit 1
-Event At Entry: from I-0 to Unscheduled Visit 1
+Visit Start (49)
+ePRO Availability (1)
+Mayo Subscore (1)
+PGA (1)
Part 2: CLARIO to delete the following forms dated 04 May 2026 for I-0 visit.
+C-SSRS Since Last Visit (1)
+C-SSRS Since Last Visit Findings Report (1)
Part 3: CLARIO to manually enter Visit End form for Unscheduled visit 1 with the following information:
-Protocol: 77242113UCO3001
-Report Date: 04 May 2026
-Report Start Date and Time: 04 May 2026 23:59:59
-Event ID: Unscheduled Visit 1
-Event End Date: 04 May 2026 23:59:59
-Visit Status: Incomplete
-Phase At Entry: Screening
-Phase At Entry Timestamp: 13 Apr 2026 12:32:20
-Event At Entry: Unscheduled visit 1
-Event Start Date: 04 May 2026 23:59:59
-Event Time Zone Offset in Milliseconds: 7200000
-Session Repeat Number (SESREP1N): 0
-Session Instance Id (SESINST1S): 3f1214f0-4788-11f1-a0cf-bb403212adce
"
"77242113UCO3001","Czech Republic","DD5-CZ10020","Lucie Gonsorcikova","CZ100201001","15","1","","SW00701226","04-May-2026","Completed","Dears, we would like ask you to change the information I read on assignment form given by patient on April 13, 2026 (Visit 1), Baseline Stool Count (PT.Custom4) as 3 that should be reported as 1.
Patient has entered wrong number as he did not understood it should be number of stools when illness is in remission or absent. He is a child and did not reflected this question correctly. Therefore, please change Baseline Stool Count = 1.
Thank you, Jiri Skopek ","04-May-2026","1 Day","1","","","","Demographic","","Changed Information","(Clario instructions)
1. Please make below changes in the assignment form:
Baseline Stool Count (PT. Custom4): 03 to 01."
"77242113UCO3001","Czech Republic","DD5-CZ10021","Martin Bortlik","CZ100212001","61","1","","SW00699492","23-Apr-2026","ReadyForQC","Please correct the date of endoscopy done during screening visit of patient CZ100212001 to correct date 16-MAR-2026.","29-Apr-2026","Over 28 Days","34","30","Query Active ","Site","Site-Entered Data","","Changed Information","CLARIO RESOLUTION:
Part 1: In the Mayo Subscore (1) dated 07 Apr 2026 for I-0 visit, CLARIO to make the following changes:
-What was the date of endoscopy? (ENDODT1D): from 24 Mar 2026 to 16 Mar 2026
- Data Flag (QSDFLG1B): from blank to check
"
"77242113UCO3001","Czech Republic","DD5-CZ10022","Petr Hrabak","CZ100222003","39","1","","SW00703322","12-May-2026","Completed","As per ATS investigation (ATS26040111), please remove the below form that's been entered as a duplicate
- MAYO Diary (16) - 18 Mar 2026
","20-May-2026","4-7 Days","6","","","","Technical Revision","","Technical Revision - Other","CLARIO RESOLUTION:
Part 1: CLARIO to delete the MAYO Diary (16) dated 18 Mar 2026.
"
"77242113UCO3001","Czech Republic","DD5-CZ10022","Petr Hrabak","CZ100222003","39","1","","SW00689748","09-Mar-2026","Completed","Dear all,
Patient CZ 100222003 was randomized on 9 Mar 2026. Kindly correct the colonoscopy date to 11 Feb 2025.
The date was initially entered as 21 Feb 2025 because the earlier date could not be entered in the system. The patient was rescreened.","02-Apr-2026","15-21 Days","17","","","","Site-Entered Data","(1) 13 Mar 2026 msullivan (Clario): Please confirm your request
Dear Site. Thank you for submitting this Data Clarification.
Could you please conform that if you are requesting following?
Mayo Subscore (1) dated 09 Mar 2026 for I-0 visit
-What was the date of endoscopy? (ENDODT1D): from 23 Feb 2026 to 11 Feb 2025
Could you please confirm the year? This subject was assigned on 02 Mar 2026, you are providing that correct date is 11 Feb 2025 which a year ago.
If you are not requesting above, please provide us the name of the form with question.
Thank you. ERT/CLARIO Data Coordination Team
(2) 13 Mar 2026 katerina.havlikova@clinoxus.com (Site User): confirm date of colonoscopy 11Feb2026
(3) 21 Mar 2026 msullivan (Clario): Dear Site,
The requested changes to the Mayo data have been updated. Please navigate to the Mayo Score Report and resubmit the form for visit to log the updated Mayo Score form. Once done, please respond to this query confirming that the Mayo Score has been resubmitted.
Thank you. ERT/CLARIO Data Coordination Team
(4) 24 Mar 2026 jana.pomahacova@clinoxus.com (Site User): Thank you and sent
","New Information","CLARIO RESOLUTION:
Part 1: In the Mayo Subscore (1) dated 09 Mar 2026 for I-0 visit, CLARIO to make the following changes:
-What was the date of endoscopy? (ENDODT1D): from 23 Feb 2026 to 11 Feb 2025
-Data Flag (QSDFLG1B): from blank to check"
"77242113UCO3001","Czech Republic","DD5-CZ10022","Petr Hrabak","CZ100222005","33","1","","SW00705372","22-May-2026","Submitted","Dear all, please change Colonoscopz date from 8April2026 to date 01Apr2026 Thank you in advance","12-Jun-2026","8-14 Days","14","","Query Active ","Site","New","(1) 29 May 2026 msullivan (Clario): Please confirm your request
Dear Site. Thank you for submitting this Data Clarification.
Please provide us the name of the form for this request.
Thank you. ERT/CLARIO Data Coordination Team
(2) 02 Jun 2026 katerina.havlikova@clinoxus.com (Site User): Dear all, please change Colonoscopy for Week I-12 date from 8April2026 to date 01Apr2026 Thank you in advance
(3) 12 Jun 2026 msullivan (Clario): Dear Site,
Please note that there is no I-12 visit in StudyWorks.
If you completed visit and stored, please submit all stored reports.
Until we see the data in StudyWorks, we are unable to confirm your request.
Also, please provide us the name of the form for this request.
Thank you. ERT/CLARIO Data Coordination Team
","Changed Information",""
"77242113UCO3001","Czech Republic","DD5-CZ10022","Petr Hrabak","CZ100222005","33","1","","SW00702538","08-May-2026","Completed","This TRR is to document the correction to the Mayo Subscore (1) form, where the following variables were populated with NULL values, due to a known core defect:
Event At Entry, Event Start Date, Event Time Zone Offset in Milliseconds.","12-May-2026","2-3 Days","2","","","","Technical Revision","","Technical Revision - Other","Please make the below changes in Mayo Subscore (1) dated 22 Apr 2026:
-Event At Entry: I-0
-Event Start Date: 09 Apr 2026 08:09:19
-Event Time Zone Offset in Milliseconds: 7200000"
1 Protocol Country Site PI Name Subject ID Age at Informed Consent Baseline Stool Count Confirm Baseline Stool Count Data Correction ID Creation Date UTC Status Description Date of Last Action UTC Total Open Period Total Open Time (Days) Current Status Time (Days) Type Next Action Required Category Query History Reason for Change Resolution
2 77242113UCO3001 Czech Republic DD5-CZ10001 Matej Falc CZ100012001 48 1 SW00703544 13-May-2026 Submitted Please change answer to clinical remision from no to YES (week 12). Entry erros 20-May-2026 15-21 Days 21 16 Query Active Site New (1) 20 May 2026 msullivan (Clario): Please confirm your request Dear Site. Thank you for submitting this Data Clarification Request. For us to process your request, please let us know the name of the form (with date) with question. Thank you. ERT/CLARIO Data Coordination Team Entry Error
3 77242113UCO3001 Czech Republic DD5-CZ10001 Matej Falc CZ100012002 79 1 SW00696586 09-Apr-2026 ReadyForQC Please correct date of endoscopy to date: 18 March 2026 (from 25 March 2026) 15-Apr-2026 Over 28 Days 43 40 Query Active Site Site-Entered Data Entry Error CLARIO RESOLUTION: Part 1: In Mayo Subscore (1) dated 08 Apr 2026 for I-0 visit, CLARIO to make the following changes: - What was the date of endoscopy? (ENDODT1D): from 25 Mar 2026 to 18 Mar 2026 - Data Flag (QSDFLG1B): from blank to check
4 77242113UCO3001 Czech Republic DD5-CZ10006 Michal Konecny CZ100062001 19 1 SW00704536 19-May-2026 ReadyForQC Please change the endoscopy date to 19-FEB-2026. 06-MAR-2026 was entered in error. 26-May-2026 15-21 Days 18 13 Query Active Site Site-Entered Data Entry Error CLARIO RESOLUTION: Part 1: In Mayo Subscore (1) dated 20 Mar 2026 for I-0 visit, CLARIO to make the following changes: -What was the date of endoscopy? (ENDODT1D): from 06 Mar 2026 to 19 Feb 2026 - Data Flag (QSDFLG1B): from blank to check
5 77242113UCO3001 Czech Republic DD5-CZ10012 Stefan Konecny CZ100122001 22 5 Yes, I confirm this is the correct stool count. SW00706684 01-Jun-2026 Submitted The right endoscopy date is 23MAR2026, please change the date 15-Jun-2026 8-14 Days 9 Clario DM New (1) 05 Jun 2026 msullivan (Clario): Please confirm your request Dear Site. Thank you for submitting this Data Clarification. Please confirm that if you are requesting following. Mayo Subscore (1) dated 07 Apr 2026 for I-0 What was the date of endoscopy? (ENDODT1D): from 24 Mar 2026 to 23 Mar 2026 Thank you. ERT/CLARIO Data Coordination Team. (2) 15 Jun 2026 hosova.kristyna@fnbrno.cz (Site User): The endoscopy was performed 23MAR2026 Entry Error
6 77242113UCO3001 Czech Republic DD5-CZ10013 David Stepek CZ100132002 29 1 SW00705646 26-May-2026 ReadyForQC Correct visit date I-O is 12-May-2026. All questionaries were filled on paper and entered in tablet later. Log-in issue. 09-Jun-2026 8-14 Days 13 3 Clario DM Visit Data (1) 01 Jun 2026 msullivan (Clario): Please confirm your request Dear Site. Thank you for submitting this Data Clarification. Please provide the timestamps for each of the assessments if you used paper forms and transcribed into the device. If unknown, ERT will use a dummy timestamp. Thank you. ERT/CLARIO Data Coordination Team. (2) 01 Jun 2026 dstepek@vnbrno.cz (Site User): time is unknown Changed Information CLARIO RESOLUTION: Part 1: In the following forms for I-0, CLARIO to make the following changes: -Report Date: from 26May 2026 to 12 May 2026 -Report Start Date and time: from 26 May 2026 to 12 May 2026 23:59:59 -Event End Date: from 26 May 2026 08:27:57 to 12 May 2026 23:59:59 +Tablet Training Module (1) +Participant Start Instructions (1) +IBDQ (1) +PROMIS Fatigue – Short Form 7a (1) +BASDAI (1) +Participant End Instructions (1) +Visit End (122)
7 77242113UCO3001 Czech Republic DD5-CZ10013 David Stepek CZ100132003 49 1 SW00708623 10-Jun-2026 Cancelled Correct date of I-2 is 26.5.2026. all questionaries were entered on paper at 07,45 and transmited later. 10-Jun-2026 1 Day 1 New yes, subject mishmasch
8 77242113UCO3001 Czech Republic DD5-CZ10013 David Stepek CZ100132003 49 1 SW00706581 29-May-2026 Completed baseline stool count reported by subject is 0, please change to 1 as per CRA request (subject has 1 stool in 2-3 days if in remission) 10-Jun-2026 4-7 Days 7 Demographic Changed Information CLARIO RESOLUTION: Part 1: In System Variables form, CLARIO to make the following changes: - Baseline Stool Count (PT.Custom4): from 0 to 1
9 77242113UCO3001 Czech Republic DD5-CZ10016 Robert Mudr CZ100162001 48 1 SW00705916 27-May-2026 Completed As per ATS investigation (ATS26040111), please remove the below form which was entered as a duplicate - MAYO Diary (5) 24 Apr 2026 10-Jun-2026 8-14 Days 9 Technical Revision Technical Revision - Other CLARIO RESOLUTION: Part 1: CLARIO to delete MAYO Diary (5) dated 24 Apr 2026
10 77242113UCO3001 Czech Republic DD5-CZ10020 Lucie Gonsorcikova CZ100201001 15 1 SW00701729 06-May-2026 Completed Dears, please delete data from visit I-0 (reported as 4th of May 2026) as this visit had to be postponed - see the previous DCR of this patient and change data request that was corrected. Patient has left the site before it was resolved and and new date of I-0 was planned. Patient continues to fill in his diary and patient is coming to I=0 visit within allowed window. We need the system and tablet to be ready to run new Mayo Score Report with updated and recent data (e.g. reflect new I-0 visit date, new eligible days -1 to -7.). thank you, Jiri Skopek 19-May-2026 8-14 Days 8 Visit Data (1) 11 May 2026 msullivan (Clario): Please confirm your request Dear Site. Thank you for submitting this Data Clarification. Please note that the delete forms are allowed if the reason is one of the following. If not, forms will move to unscheduled visit. Data collected by the wrong patient. Data collected by someone other than the patient. Data collected prior to informed consent, or after withdrawal from the study. Duplicate data erroneously entered at an Unscheduled visit via paper transcription. Data collected that is not expected per protocol. Also, I-0 visit is still ongoing. Please close the visit. Once the visit was closed, we will process accoridngly. Thank you. ERT/CLARIO Data Coordination Team (2) 11 May 2026 jskopek (Site User): Dears, I do not see any option that is adequate -from the list. Data are not needed to be deleted fully, they reflect the situation at May4th. Please mark it as unscheduled visit - as exactly that is the case. We need the system to be ready for I-0 visit planned for next week. I will close the visit tomorrow - do you mean in tablet/ipad? Thank you very much for your help! Jiri (3) 12 May 2026 venkata.ramana (Clario): Thank you for your response. Please note that the visit I-0 was still ongoing but not closed yet. 
So please close the visit. Kind Regards, Clario Data Coordination Team. (4) 12 May 2026 jskopek (Site User): If I try to close the I-O visit in TABLET, it asks me if patient fulfils eligibility criteria to proceed to next visit based on these old data – if I answer NO, it asks me to DEACTIVATE patient. I do not want to DEACTIVATE patient – can you help WHERE and HOW to close this visit for you to change it to UNSCHEDULED and not to de-activate patient? Thank you Jiri Other-delete visit I-0 CLARIO RESOLUTION: Part 1: In the following forms dated 04 May 2026, CLARIO to make the following changes: -Event ID: from I-0 to Unscheduled Visit 1 -Event At Entry: from I-0 to Unscheduled Visit 1 +Visit Start (49) +ePRO Availability (1) +Mayo Subscore (1) +PGA (1) Part 2: CLARIO to delete the following forms dated 04 May 2026 for I-0 visit. +C-SSRS Since Last Visit (1) +C-SSRS Since Last Visit Findings Report (1) Part 3: CLARIO to manually enter Visit End form for Unscheduled visit 1 with the following information: -Protocol: 77242113UCO3001 -Report Date: 04 May 2026 -Report Start Date and Time: 04 May 2026 23:59:59 -Event ID: Unscheduled Visit 1 -Event End Date: 04 May 2026 23:59:59 -Visit Status: Incomplete -Phase At Entry: Screening -Phase At Entry Timestamp: 13 Apr 2026 12:32:20 -Event At Entry: Unscheduled visit 1 -Event Start Date: 04 May 2026 23:59:59 -Event Time Zone Offset in Milliseconds: 7200000 -Session Repeat Number (SESREP1N): 0 -Session Instance Id (SESINST1S): 3f1214f0-4788-11f1-a0cf-bb403212adce
11 77242113UCO3001 Czech Republic DD5-CZ10020 Lucie Gonsorcikova CZ100201001 15 1 SW00701226 04-May-2026 Completed Dears, we would like ask you to change the information I read on assignment form given by patient on April 13, 2026 (Visit 1), Baseline Stool Count (PT.Custom4) as 3 that should be reported as 1. Patient has entered wrong number as he did not understood it should be number of stools when illness is in remission or absent. He is a child and did not reflected this question correctly. Therefore, please change Baseline Stool Count = 1. Thank you, Jiri Skopek 04-May-2026 1 Day 1 Demographic Changed Information (Clario instructions) 1. Please make below changes in the assignment form: Baseline Stool Count (PT. Custom4): 03 to 01.
12 77242113UCO3001 Czech Republic DD5-CZ10021 Martin Bortlik CZ100212001 61 1 SW00699492 23-Apr-2026 ReadyForQC Please correct the date of endoscopy done during screening visit of patient CZ100212001 to correct date 16-MAR-2026. 29-Apr-2026 Over 28 Days 34 30 Query Active Site Site-Entered Data Changed Information CLARIO RESOLUTION: Part 1: In the Mayo Subscore (1) dated 07 Apr 2026 for I-0 visit, CLARIO to make the following changes: -What was the date of endoscopy? (ENDODT1D): from 24 Mar 2026 to 16 Mar 2026 - Data Flag (QSDFLG1B): from blank to check
13 77242113UCO3001 Czech Republic DD5-CZ10022 Petr Hrabak CZ100222003 39 1 SW00703322 12-May-2026 Completed As per ATS investigation (ATS26040111), please remove the below form that's been entered as a duplicate - MAYO Diary (16) - 18 Mar 2026 20-May-2026 4-7 Days 6 Technical Revision Technical Revision - Other CLARIO RESOLUTION: Part 1: CLARIO to delete the MAYO Diary (16) dated 18 Mar 2026.
14 77242113UCO3001 Czech Republic DD5-CZ10022 Petr Hrabak CZ100222003 39 1 SW00689748 09-Mar-2026 Completed Dear all, Patient CZ 100222003 was randomized on 9 Mar 2026. Kindly correct the colonoscopy date to 11 Feb 2025. The date was initially entered as 21 Feb 2025 because the earlier date could not be entered in the system. The patient was rescreened. 02-Apr-2026 15-21 Days 17 Site-Entered Data (1) 13 Mar 2026 msullivan (Clario): Please confirm your request Dear Site. Thank you for submitting this Data Clarification. Could you please conform that if you are requesting following? Mayo Subscore (1) dated 09 Mar 2026 for I-0 visit -What was the date of endoscopy? (ENDODT1D): from 23 Feb 2026 to 11 Feb 2025 Could you please confirm the year? This subject was assigned on 02 Mar 2026, you are providing that correct date is 11 Feb 2025 which a year ago. If you are not requesting above, please provide us the name of the form with question. Thank you. ERT/CLARIO Data Coordination Team (2) 13 Mar 2026 katerina.havlikova@clinoxus.com (Site User): confirm date of colonoscopy 11Feb2026 (3) 21 Mar 2026 msullivan (Clario): Dear Site, The requested changes to the Mayo data have been updated. Please navigate to the Mayo Score Report and resubmit the form for visit to log the updated Mayo Score form. Once done, please respond to this query confirming that the Mayo Score has been resubmitted. Thank you. ERT/CLARIO Data Coordination Team (4) 24 Mar 2026 jana.pomahacova@clinoxus.com (Site User): Thank you and sent New Information CLARIO RESOLUTION: Part 1: In the Mayo Subscore (1) dated 09 Mar 2026 for I-0 visit, CLARIO to make the following changes: -What was the date of endoscopy? (ENDODT1D): from 23 Feb 2026 to 11 Feb 2025 -Data Flag (QSDFLG1B): from blank to check
15 77242113UCO3001 Czech Republic DD5-CZ10022 Petr Hrabak CZ100222005 33 1 SW00705372 22-May-2026 Submitted Dear all, please change Colonoscopz date from 8April2026 to date 01Apr2026 Thank you in advance 12-Jun-2026 8-14 Days 14 Query Active Site New (1) 29 May 2026 msullivan (Clario): Please confirm your request Dear Site. Thank you for submitting this Data Clarification. Please provide us the name of the form for this request. Thank you. ERT/CLARIO Data Coordination Team (2) 02 Jun 2026 katerina.havlikova@clinoxus.com (Site User): Dear all, please change Colonoscopy for Week I-12 date from 8April2026 to date 01Apr2026 Thank you in advance (3) 12 Jun 2026 msullivan (Clario): Dear Site, Please note that there is no I-12 visit in StudyWorks. If you completed visit and stored, please submit all stored reports. Until we see the data in StudyWorks, we are unable to confirm your request. Also, please provide us the name of the form for this request. Thank you. ERT/CLARIO Data Coordination Team Changed Information
16 77242113UCO3001 Czech Republic DD5-CZ10022 Petr Hrabak CZ100222005 33 1 SW00702538 08-May-2026 Completed This TRR is to document the correction to the Mayo Subscore (1) form, where the following variables were populated with NULL values, due to a known core defect: Event At Entry, Event Start Date, Event Time Zone Offset in Milliseconds. 12-May-2026 2-3 Days 2 Technical Revision Technical Revision - Other Please make the below changes in Mayo Subscore (1) dated 22 Apr 2026: -Event At Entry: I-0 -Event Start Date: 09 Apr 2026 08:09:19 -Event Time Zone Offset in Milliseconds: 7200000
@@ -0,0 +1,11 @@
"Protocol","Country","Site ID","PI_NAME","Subject Number","Age","Data Correction ID","Creation Date UTC","Status","Date of Last Action UTC","Total Open Period","Total Open Time (Days)","Current Status Time (Days)","Type","Next Action Required","Category","Query History","Reason for Change"
"77242113UCO3001_ANALYSIS","Czech Republic The","CZ10001","Falc, Matej","CZ100012001","48 Years","16923867","14-May-2026","Escalated","14-Jun-2026","15-21 Days","20","","QUERY","Clario DM","Patient","(8) 14 Jun 2026 Clario: what should I do now? I have send you 1 ecg by normal way, 2 by pdf.","Data Checks"
"77242113UCO3001_ANALYSIS","Czech Republic The","CZ10001","Falc, Matej","CZ100012001","48 Years","16567067","22-Jan-2026","Resolved","28-Jan-2026","4-7 Days","4","","QUERY","","Patient","MD Falc","Data Checks"
"77242113UCO3001_ANALYSIS","Czech Republic The","CZ10009","Pumprla, Jiri","CZ100092001","49 Years","16776685","31-Mar-2026","Resolved","13-May-2026","Over 28 Days","29","","QUERY","","Patient","(2) 13 May 2026 Clario: I confirm, that only ONE ECG was collected by mistake.","Data Checks"
"77242113UCO3001_ANALYSIS","Czech Republic The","CZ10013","Stepek, David","CZ100132001","29 Years","16990554","04-Jun-2026","Resolved","08-Jun-2026","2-3 Days","2","","QUERY","","Patient","(2) 07 Jun 2026 Clario: by mistake only one strip was taken","Data Checks"
"77242113UCO3001_ANALYSIS","Czech Republic The","CZ10013","Stepek, David","CZ100132001","29 Years","16981256","02-Jun-2026","Resolved","04-Jun-2026","2-3 Days","2","","QUERY","","Transmittal","Visit: SCREENING/","Data Checks"
"77242113UCO3001_ANALYSIS","Czech Republic The","CZ10013","Stepek, David","CZ100132002","29 Years","16985014","03-Jun-2026","Resolved","04-Jun-2026","1 Day","1","","QUERY","","Patient","(2) 04 Jun 2026 Clario: by mistake only one strip was expected","Data Checks"
"77242113UCO3001_ANALYSIS","Czech Republic The","CZ10013","Stepek, David","CZ100132003","49 Years","16988974","04-Jun-2026","Resolved","05-Jun-2026","1 Day","1","","DCR","","Transmittal","Affected Event: 'SCREENING'",""
"77242113UCO3001_ANALYSIS","Czech Republic The","CZ10013","Stepek, David","CZ100132003","49 Years","16985006","03-Jun-2026","Resolved","04-Jun-2026","1 Day","1","","QUERY","","Patient","(2) 04 Jun 2026 Clario: by mistake only one strip was taken","Data Checks"
"77242113UCO3001_ANALYSIS","Czech Republic The","CZ10021","Bortlik, Martin","CZ100212001","61 Years","16717619","11-Mar-2026","Resolved","28-Apr-2026","Over 28 Days","32","","QUERY","","Patient","(2) 28 Apr 2026 Clario: I confirmed that due to technical problems, the ECG was done only twice","Data Checks"
"77242113UCO3001_ANALYSIS","Czech Republic The","CZ10022","Hrabak, Petr","CZ100222003","39 Years","16945114","21-May-2026","Resolved","04-Jun-2026","8-14 Days","10","","DCR","","Patient","(7) 04 Jun 2026 Portal, EXPeRT: It was mistake NO ECG for this date 20May2026 was done",""
1 Protocol Country Site ID PI_NAME Subject Number Age Data Correction ID Creation Date UTC Status Date of Last Action UTC Total Open Period Total Open Time (Days) Current Status Time (Days) Type Next Action Required Category Query History Reason for Change
2 77242113UCO3001_ANALYSIS Czech Republic The CZ10001 Falc, Matej CZ100012001 48 Years 16923867 14-May-2026 Escalated 14-Jun-2026 15-21 Days 20 QUERY Clario DM Patient (8) 14 Jun 2026 Clario: what should I do now? I have send you 1 ecg by normal way, 2 by pdf. Data Checks
3 77242113UCO3001_ANALYSIS Czech Republic The CZ10001 Falc, Matej CZ100012001 48 Years 16567067 22-Jan-2026 Resolved 28-Jan-2026 4-7 Days 4 QUERY Patient MD Falc Data Checks
4 77242113UCO3001_ANALYSIS Czech Republic The CZ10009 Pumprla, Jiri CZ100092001 49 Years 16776685 31-Mar-2026 Resolved 13-May-2026 Over 28 Days 29 QUERY Patient (2) 13 May 2026 Clario: I confirm, that only ONE ECG was collected by mistake. Data Checks
5 77242113UCO3001_ANALYSIS Czech Republic The CZ10013 Stepek, David CZ100132001 29 Years 16990554 04-Jun-2026 Resolved 08-Jun-2026 2-3 Days 2 QUERY Patient (2) 07 Jun 2026 Clario: by mistake only one strip was taken Data Checks
6 77242113UCO3001_ANALYSIS Czech Republic The CZ10013 Stepek, David CZ100132001 29 Years 16981256 02-Jun-2026 Resolved 04-Jun-2026 2-3 Days 2 QUERY Transmittal Visit: SCREENING/ Data Checks
7 77242113UCO3001_ANALYSIS Czech Republic The CZ10013 Stepek, David CZ100132002 29 Years 16985014 03-Jun-2026 Resolved 04-Jun-2026 1 Day 1 QUERY Patient (2) 04 Jun 2026 Clario: by mistake only one strip was expected Data Checks
8 77242113UCO3001_ANALYSIS Czech Republic The CZ10013 Stepek, David CZ100132003 49 Years 16988974 04-Jun-2026 Resolved 05-Jun-2026 1 Day 1 DCR Transmittal Affected Event: 'SCREENING'
9 77242113UCO3001_ANALYSIS Czech Republic The CZ10013 Stepek, David CZ100132003 49 Years 16985006 03-Jun-2026 Resolved 04-Jun-2026 1 Day 1 QUERY Patient (2) 04 Jun 2026 Clario: by mistake only one strip was taken Data Checks
10 77242113UCO3001_ANALYSIS Czech Republic The CZ10021 Bortlik, Martin CZ100212001 61 Years 16717619 11-Mar-2026 Resolved 28-Apr-2026 Over 28 Days 32 QUERY Patient (2) 28 Apr 2026 Clario: I confirmed that due to technical problems, the ECG was done only twice Data Checks
11 77242113UCO3001_ANALYSIS Czech Republic The CZ10022 Hrabak, Petr CZ100222003 39 Years 16945114 21-May-2026 Resolved 04-Jun-2026 8-14 Days 10 DCR Patient (7) 04 Jun 2026 Portal, EXPeRT: It was mistake NO ECG for this date 20May2026 was done
+302
View File
@@ -0,0 +1,302 @@
"""
import_to_mongo.py
Verze: 1.2
Datum: 2026-06-02
Import Clario CSV do MongoDB (databáze: Clario).
Kolekce: Clario.MayoDiary / Clario.MayoScore / Clario.eCOA_DCRs / Clario.ECG_DCRs
Filtr: pouze řádky s Country == "Czech Republic"
Klíč: MayoDiary → Subject ID + Form Number
MayoScore → Participant ID + Visit
eCOA_DCRs → Data Correction ID
ECG_DCRs → Data Correction ID
Historie: při změně fields se stará verze uloží do pole history[]
Po importu přesune zpracované CSV do downloads/Zpracovano/
Použití:
python import_to_mongo.py # importuje všechny CSV z downloads/
python import_to_mongo.py downloads/konkretni.csv # jeden soubor
"""
import csv
import re
import shutil
import sys
from datetime import datetime, timezone
from pathlib import Path
from pymongo import MongoClient, ASCENDING
# Connection string of the MongoDB server that receives the import.
MONGO_URI = "mongodb://192.168.1.76:27017"
# Database that holds all target collections.
DB_NAME = "Clario"
# Directory scanned for *.csv files when no paths are given on the command line.
DOWNLOADS_DIR = Path(__file__).parent / "downloads"
# Imported CSV files are moved here by main().
PROCESSED_DIR = DOWNLOADS_DIR / "Zpracovano"
# Rows whose "Country" value does not contain this text are skipped.
COUNTRY_FILTER = "Czech Republic"
# ---------------------------------------------------------------------------
# Konfigurace kolekcí
# ---------------------------------------------------------------------------
# Import settings per export type.  The dictionary key is the marker text that
# detect_collection_type() searches for in the CSV file name.
#   collection   - name of the target MongoDB collection
#   subject_col  - CSV column that holds the subject / participant identifier
#   key_cols     - CSV columns whose values map_row() joins into "recordKey"
#   outcome_cols - optional; CSV columns that map_row() stores at the top level
#                  of the document instead of inside "fields"
COLLECTION_CONFIG = {
    "MayoDiary": {
        "collection": "Clario.MayoDiary",
        "subject_col": "Subject ID",
        "key_cols": ("Subject ID", "Form Number"),
    },
    "MayoScore": {
        "collection": "Clario.MayoScore",
        "subject_col": "Participant ID",
        "key_cols": ("Participant ID", "Visit"),
        "outcome_cols": (
            "Site Action",
            "Last Mayo Score Submission",
            "Week I-12 Clinical Responder",
            "Week I-12 Clinical Remission",
            "Clinical Flare",
            "Loss of Response",
            "Partial Mayo Response Post Loss of Response",
            "Partial Mayo Response for Clinical Non-Responders",
        ),
    },
    "eCOA DCRs": {
        "collection": "Clario.eCOA_DCRs",
        "subject_col": "Subject ID",
        "key_cols": ("Data Correction ID",),
    },
    "ECG DCRs": {
        "collection": "Clario.ECG_DCRs",
        "subject_col": "Subject Number",
        "key_cols": ("Data Correction ID",),
    },
}
# strptime patterns tried in this order by parse_date(); the first one that
# matches wins.  parse_date() strips both the value and the pattern before
# matching, so the trailing space in the first entry makes it equivalent to
# the second one.
DATE_FORMATS = [
    "%d-%b-%Y ",
    "%d-%b-%Y",
    "%d-%b-%Y %H:%M:%S",
    "%d %b %Y %H:%M:%S",
    "%d %b %Y %H:%M:%S:%f",
    "%d %b %Y",
    "%d %B %Y",
    "%Y%m%d %H:%M:%S.%f",
    "%Y-%m-%d %H:%M:%S",
    "%m/%d/%Y %I:%M:%S %p",
]
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
def clean_colname(name: str) -> str:
    """Return a CSV column name without BOM, surrounding whitespace or quotes.

    The byte-order mark (U+FEFF) is spelled as an explicit escape sequence so
    that it cannot be lost or overlooked when the source file is edited: the
    character is invisible, and ``str.strip()`` does not treat it as
    whitespace, so a header such as BOM + '"Protocol"' would otherwise keep
    both the BOM and its leading quote.
    """
    return name.lstrip("\ufeff").strip().strip('"')
def parse_date(value: str) -> str | None:
    """Convert a date string to ISO-8601 text, or return None if it is no date.

    The stripped value is matched against every pattern in DATE_FORMATS, in
    order.  The first successful parse is tagged with the UTC timezone and
    returned as an ISO string; None means that no pattern matched.
    """
    text = value.strip()
    for pattern in DATE_FORMATS:
        try:
            parsed = datetime.strptime(text, pattern.strip())
        except ValueError:
            # Not this pattern; try the next one.
            continue
        return parsed.replace(tzinfo=timezone.utc).isoformat()
    return None
def extract_snapshot_date(filename: str) -> str:
    """Return the YYYY-MM-DD prefix of a file name, or today's date.

    Only the final path component is inspected, and the date must be at its
    very beginning.  When there is no such prefix the current local date is
    returned in the same format.
    """
    found = re.match(r"(\d{4}-\d{2}-\d{2})", Path(filename).name)
    if found is None:
        return datetime.now().strftime("%Y-%m-%d")
    return found.group(1)
def detect_collection_type(filename: str) -> str | None:
    """Return the COLLECTION_CONFIG key contained in the file stem, or None.

    Keys are checked in the dictionary's definition order and the first one
    that occurs as a substring of the stem is returned.
    """
    stem = Path(filename).stem
    return next((key for key in COLLECTION_CONFIG if key in stem), None)
# ---------------------------------------------------------------------------
# CSV → dokument
# ---------------------------------------------------------------------------
def map_row(row: dict, col_type: str) -> dict:
    """Build the MongoDB document for one CSV row.

    Args:
        row: One record produced by csv.DictReader (column name -> raw text).
        col_type: Key into COLLECTION_CONFIG that selects the export type.

    Returns:
        A dict with "subject", "site", "country", "study" and "recordKey",
        one top-level entry per configured outcome column (None when the cell
        is empty or "-"), and "fields" holding every other non-empty column.
        Values that parse_date() recognises are stored as ISO strings.
        "recordKey" is "" when every key column is empty, so that callers can
        detect and skip rows without a usable key.

    The key columns (and "Site ID") are deliberately left inside "fields";
    only "Protocol", "Country", "Site", the subject column and the outcome
    columns are lifted out of it.
    """
    cfg = COLLECTION_CONFIG[col_type]
    subject_col = cfg["subject_col"]
    # Keep the configured tuple order so the document layout is deterministic
    # (iterating a set of strings varies between interpreter runs).
    outcome_cols = tuple(cfg.get("outcome_cols", ()))

    # DictReader files surplus cells of an over-long row under the key None
    # (as a list) and fills missing cells with None; neither is a column value.
    cleaned = {
        clean_colname(k): v.strip() if isinstance(v, str) else ""
        for k, v in row.items()
        if k is not None
    }

    doc: dict = {
        "subject": {"id": cleaned.get(subject_col, "")},
        # ECG DCR exports name the column "Site ID" instead of "Site".
        "site": {"name": cleaned.get("Site") or cleaned.get("Site ID", "")},
        "country": cleaned.get("Country", ""),
        "study": cleaned.get("Protocol", ""),
    }

    key_parts = [cleaned.get(c, "") for c in cfg["key_cols"]]
    # Joining only-empty parts would yield "_" (truthy), and every key-less
    # row would then be merged into one and the same record.
    doc["recordKey"] = "_".join(key_parts) if any(key_parts) else ""

    for col in outcome_cols:
        value = cleaned.get(col, "")
        if value and value != "-":
            doc[col] = parse_date(value) or value
        else:
            doc[col] = None

    skip_top = {"Protocol", "Country", "Site", subject_col, *outcome_cols}
    doc["fields"] = {
        col: parse_date(value) or value
        for col, value in cleaned.items()
        if col not in skip_top and value and value != "-"
    }
    return doc
# ---------------------------------------------------------------------------
# Import jednoho souboru
# ---------------------------------------------------------------------------
def import_file(csv_path: str, db) -> dict:
    """Import one Clario CSV export into its MongoDB collection.

    The export type is derived from the file name.  Rows whose "Country" does
    not contain COUNTRY_FILTER and rows without a record key are skipped.
    Every remaining row is inserted, or compared with the stored document of
    the same "recordKey": when "fields" or any configured outcome column
    differs, the previous state is pushed to "history" and the document is
    replaced; otherwise only "lastSeen" and "sourceFile" are refreshed.

    Args:
        csv_path: Path of the CSV file to read.
        db: Database object; collections are obtained with ``db[name]``.

    Returns:
        {"skipped": True} when the file name matches no known export type,
        otherwise a dict with "collection", "snapshot", "inserted", "changed",
        "unchanged" and "filtered_out".
    """
    filename = Path(csv_path).name
    col_type = detect_collection_type(filename)
    if col_type is None:
        print(f" Preskakuji (neznamy typ): {filename}")
        return {"skipped": True}

    cfg = COLLECTION_CONFIG[col_type]
    col_name = cfg["collection"]
    outcome_cols = tuple(cfg.get("outcome_cols", ()))
    # Outcome columns live outside "fields", so they must be compared as well;
    # otherwise a changed "Site Action" etc. would never reach the database.
    tracked = ("fields", *outcome_cols)
    snapshot_date = extract_snapshot_date(filename)
    collection = db[col_name]

    # Create the indexes up front (create_index is a no-op when they already
    # exist) so the per-row recordKey lookup is indexed on the first import too.
    collection.create_index([("recordKey", ASCENDING)], unique=True)
    collection.create_index([("subject.id", ASCENDING)])
    collection.create_index([("site.name", ASCENDING)])
    if col_type == "MayoScore":
        collection.create_index([("Site Action", ASCENDING)])
    if col_type in ("eCOA DCRs", "ECG DCRs"):
        collection.create_index([("fields.Status", ASCENDING)])
        collection.create_index([("fields.Type", ASCENDING)])

    inserted = changed = unchanged = filtered_out = 0
    with open(csv_path, encoding="utf-8-sig", newline="") as f:
        reader = csv.DictReader(f, delimiter=",", quotechar='"')
        for row in reader:
            # Cheap country check before the full (date-parsing) mapping.
            # DictReader uses the key None for surplus cells and the value
            # None for cells missing from a short row.
            cleaned_row = {
                clean_colname(k): v for k, v in row.items() if k is not None
            }
            country = (cleaned_row.get("Country") or "").strip()
            if COUNTRY_FILTER not in country:
                filtered_out += 1
                continue

            doc = map_row(row, col_type)
            record_key = doc.get("recordKey")
            if not record_key:
                continue
            doc["sourceFile"] = filename

            existing = collection.find_one({"recordKey": record_key})
            if existing is None:
                doc["firstSeen"] = snapshot_date
                doc["lastSeen"] = snapshot_date
                doc["history"] = []
                collection.insert_one(doc)
                inserted += 1
            elif any(existing.get(k) != doc.get(k) for k in tracked):
                old_entry = {
                    "date": existing.get("lastSeen", snapshot_date),
                    "fields": existing.get("fields", {}),
                }
                if outcome_cols:
                    # Keep the superseded outcome values next to the old fields.
                    old_entry["outcomes"] = {
                        c: existing.get(c) for c in outcome_cols
                    }
                update_doc = dict(doc)
                update_doc["lastSeen"] = snapshot_date
                collection.update_one(
                    {"_id": existing["_id"]},
                    {
                        "$push": {"history": old_entry},
                        "$set": update_doc,
                    },
                )
                changed += 1
            else:
                collection.update_one(
                    {"_id": existing["_id"]},
                    {"$set": {"lastSeen": snapshot_date, "sourceFile": filename}},
                )
                unchanged += 1

    stats = {
        "collection": col_name,
        "snapshot": snapshot_date,
        "inserted": inserted,
        "changed": changed,
        "unchanged": unchanged,
        "filtered_out": filtered_out,
    }
    print(f" {col_name} [{snapshot_date}]: +{inserted} new, ~{changed} changed, ={unchanged} same, -{filtered_out} non-CZ")
    return stats
# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------
def main():
    """Import the CSV files named on the command line, or all of downloads/.

    Command-line arguments that are not existing files are reported and
    ignored.  Every file that import_file() does not skip is moved to
    PROCESSED_DIR afterwards, and a grand total is printed at the end.
    """
    paths: list[Path] = []
    if len(sys.argv) > 1:
        for arg in sys.argv[1:]:
            p = Path(arg)
            if p.is_file():
                paths.append(p)
            else:
                print(f"Soubor nenalezen: {arg}")
    else:
        paths = sorted(DOWNLOADS_DIR.glob("*.csv"))

    if not paths:
        print("Zadne CSV soubory k importu.")
        return
    print(f"Nalezeno {len(paths)} souboru.\n")

    total = {"inserted": 0, "changed": 0, "unchanged": 0}
    client = MongoClient(MONGO_URI, serverSelectionTimeoutMS=5000)
    try:
        # Fail fast when the server is unreachable.
        client.admin.command("ping")
        db = client[DB_NAME]
        # parents=True: files may be given explicitly while downloads/ itself
        # does not exist yet.
        PROCESSED_DIR.mkdir(parents=True, exist_ok=True)
        for csv_path in paths:
            print(f"Import: {csv_path.name}")
            stats = import_file(str(csv_path), db)
            if not stats.get("skipped"):
                for k in total:
                    total[k] += stats.get(k, 0)
                dest = PROCESSED_DIR / csv_path.name
                shutil.move(str(csv_path), str(dest))
                print(" -> presunut do Zpracovano/")
    finally:
        # Release the connection even when an import raises.
        client.close()
    print(f"\nCelkem: +{total['inserted']} new, ~{total['changed']} changed, ={total['unchanged']} same")


if __name__ == "__main__":
    main()
+776
View File
@@ -0,0 +1,776 @@
"""
create_report.py
Verze: 1.7
Datum: 2026-06-02
Generuje Excel report (.xlsm) pro studii 77242113UCO3001 z MongoDB databáze Clario.
Výstup: U:/Dropbox/!!!Days/Downloads Z230/YYYY-MM-DD 77242113UCO3001 Clario Reports.xlsm
Zdroj dat:
MongoDB 192.168.1.76, databáze Clario
Kolekce Clario.MayoScore — skóre Mayo per pacient × visit
Kolekce Clario.MayoDiary — denní záznamy deníku pacienta
Kolekce Clario.eCOA_DCRs — data correction requests eCOA
Kolekce Clario.ECG_DCRs — data correction requests ECG
Listy:
MayoScore — jeden řádek = pacient × visit
sloupec „KLIKNI SEM" naviguje na filtrovaný EligibleDays
řádky I-0 s Modified Mayo < 5 červeně tučně
MayoDiary — jeden řádek = denní záznam deníku pacienta
Compliance — jeden řádek = pacient × visit; kolik dní v okně mezi návštěvami
mělo být vyplněno v MayoDiary a kolik jich pacient skutečně
vyplnil + procento. Okno I-0 = od první diary po I-0; ostatní
= od (předchozí visit +1) po aktuální visit. Unscheduled se
ignorují. Řádky s compliance ≥ 100 % zeleně.
EligibleDays — jeden řádek = jeden eligible day z MayoScore obohacený o data z MayoDiary;
included/excluded flag, excluded dny šedě na žlutém pozadí
eCOA_DCRs — všechna pole z kolekce Clario.eCOA_DCRs
ECG_DCRs — všechna pole z kolekce Clario.ECG_DCRs
VBA makro (Worksheet_SelectionChange na listu MayoScore):
Klik na sloupec „KLIKNI SEM" → přepne na EligibleDays a vyfiltruje záznamy
pro daného pacienta a visit. Vyžaduje povolení maker při otevření souboru.
"""
VERSION = "1.7"
from datetime import datetime, timedelta
from pathlib import Path
import time
from pymongo import MongoClient
from openpyxl import Workbook
from openpyxl.styles import Font, PatternFill, Alignment, Border, Side
from openpyxl.utils import get_column_letter
import xlwings as xw
# ---------------------------------------------------------------------------
# Konfigurace
# ---------------------------------------------------------------------------
# Connection string of the MongoDB server that holds the imported Clario data.
MONGO_URI = "mongodb://192.168.1.76:27017"
# Database that contains the Clario.* collections.
DB_NAME = "Clario"
# Directory the report is written to (see "Výstup" in the module docstring).
OUTPUT_DIR = Path(r"U:\Dropbox\!!!Days\Downloads Z230")
# Visit codes in the order used for sorting; _visit_sort_key() places any
# visit that is not listed here after all listed ones.
VISIT_ORDER = ["I-0", "I-2", "I-4", "I-8", "I-12", "M-4"]
# Column layout of the MayoScore sheet as (header text, getter) pairs.
# _build_sheet() calls each getter with one MayoScore document to obtain the
# cell value.  Outcome columns are read from the top level of the document,
# everything else from its "fields" sub-document.
COLUMNS_SCORE = [
    ("KLIKNI SEM", lambda d: "▶ klikni sem"),
    ("Site", lambda d: d.get("site", {}).get("name", "")),
    ("Subject ID", lambda d: d.get("subject", {}).get("id", "")),
    ("Visit", lambda d: d["fields"].get("Visit", "")),
    ("Visit Date", lambda d: d["fields"].get("Visit Date", "")),
    ("Baseline Stool Frequency", lambda d: _num(d["fields"].get("Baseline Stool Frequency", ""))),
    ("Central Endoscopy Score", lambda d: _num(d["fields"].get("Central Endoscopy Score", ""))),
    ("PGA Score", lambda d: _num(d["fields"].get("PGA Score", ""))),
    ("Stool Frequency Sub-score", lambda d: _num(d["fields"].get("Stool Frequency Sub-score", ""))),
    ("Rectal Bleeding Sub-score", lambda d: _num(d["fields"].get("Rectal Bleeding Sub-score", ""))),
    ("Partial Mayo Score", lambda d: _num(d["fields"].get("Partial Mayo Score", ""))),
    ("Modified Mayo Score", lambda d: _num(d["fields"].get("Modified Mayo Score", ""))),
    ("Full Mayo Score", lambda d: _num(d["fields"].get("Full Mayo Score", ""))),
    ("Site Action", lambda d: d.get("Site Action") or ""),
    ("Last Mayo Score Submission", lambda d: d.get("Last Mayo Score Submission") or ""),
    ("Wk I-12 Responder", lambda d: d.get("Week I-12 Clinical Responder") or ""),
    ("Wk I-12 Remission", lambda d: d.get("Week I-12 Clinical Remission") or ""),
    ("Clinical Flare", lambda d: d.get("Clinical Flare") or ""),
    ("Loss of Response", lambda d: d.get("Loss of Response") or ""),
    ("Partial Mayo Post LoR", lambda d: d.get("Partial Mayo Response Post Loss of Response") or ""),
    ("Partial Mayo Non-Resp", lambda d: d.get("Partial Mayo Response for Clinical Non-Responders") or ""),
]
# Column layout of the MayoDiary sheet as (header text, getter) pairs; each
# getter receives one MayoDiary document and returns the cell value.
COLUMNS_DIARY = [
    ("Subject ID", lambda d: d.get("subject", {}).get("id", "")),
    ("Report Date", lambda d: d["fields"].get("Report Date", "")),
    ("Baseline Stool Count", lambda d: _num(d["fields"].get("Baseline Stool Count", ""))),
    ("Stool Frequency", lambda d: _num(d["fields"].get("Stool Frequency", ""))),
    ("MAYO050", lambda d: d["fields"].get("MAYO050", "")),
    ("Not Applicable", lambda d: d["fields"].get("Not Applicable", "")),
    ("Constipation", lambda d: d["fields"].get("Constipation", "")),
    ("Diarrhea", lambda d: d["fields"].get("Diarrhea", "")),
    ("Irregularity", lambda d: d["fields"].get("Irregularity", "")),
]
# Column layout of the eCOA_DCRs sheet as (header text, getter) pairs; each
# getter receives one eCOA DCR document and returns the cell value.
# "firstSeen" / "lastSeen" are read from the top level of the document.
COLUMNS_ECOA_DCRS = [
    ("Site", lambda d: d.get("site", {}).get("name", "")),
    ("Subject ID", lambda d: d.get("subject", {}).get("id", "")),
    ("Data Correction ID", lambda d: d["fields"].get("Data Correction ID", "")),
    ("PI Name", lambda d: d["fields"].get("PI Name", "")),
    ("Creation Date UTC", lambda d: d["fields"].get("Creation Date UTC", "")),
    ("Date of Last Action UTC", lambda d: d["fields"].get("Date of Last Action UTC", "")),
    ("Status", lambda d: d["fields"].get("Status", "")),
    ("Type", lambda d: d["fields"].get("Type", "")),
    ("Next Action Required", lambda d: d["fields"].get("Next Action Required", "")),
    ("Category", lambda d: d["fields"].get("Category", "")),
    ("Total Open Period", lambda d: d["fields"].get("Total Open Period", "")),
    ("Total Open Time (Days)", lambda d: _num(d["fields"].get("Total Open Time (Days)", ""))),
    ("Current Status Time (Days)", lambda d: _num(d["fields"].get("Current Status Time (Days)", ""))),
    ("Reason for Change", lambda d: d["fields"].get("Reason for Change", "")),
    ("Description", lambda d: d["fields"].get("Description", "")),
    ("Resolution", lambda d: d["fields"].get("Resolution", "")),
    ("Query History", lambda d: d["fields"].get("Query History", "")),
    ("Age at Informed Consent", lambda d: d["fields"].get("Age at Informed Consent", "")),
    ("Baseline Stool Count", lambda d: _num(d["fields"].get("Baseline Stool Count", ""))),
    ("firstSeen", lambda d: d.get("firstSeen", "")),
    ("lastSeen", lambda d: d.get("lastSeen", "")),
]
# Column layout of the ECG_DCRs sheet as (header text, getter) pairs; each
# getter receives one ECG DCR document and returns the cell value.
# NOTE(review): "PI Name" is read from the field "PI_NAME" here, whereas
# COLUMNS_ECOA_DCRS reads "PI Name" — confirm the spelling used in the ECG
# export header.
COLUMNS_ECG_DCRS = [
    ("Site ID", lambda d: d.get("site", {}).get("name", "")),
    ("Subject Number", lambda d: d.get("subject", {}).get("id", "")),
    ("Data Correction ID", lambda d: d["fields"].get("Data Correction ID", "")),
    ("PI Name", lambda d: d["fields"].get("PI_NAME", "")),
    ("Age", lambda d: d["fields"].get("Age", "")),
    ("Creation Date UTC", lambda d: d["fields"].get("Creation Date UTC", "")),
    ("Date of Last Action UTC", lambda d: d["fields"].get("Date of Last Action UTC", "")),
    ("Status", lambda d: d["fields"].get("Status", "")),
    ("Type", lambda d: d["fields"].get("Type", "")),
    ("Next Action Required", lambda d: d["fields"].get("Next Action Required", "")),
    ("Category", lambda d: d["fields"].get("Category", "")),
    ("Total Open Period", lambda d: d["fields"].get("Total Open Period", "")),
    ("Total Open Time (Days)", lambda d: _num(d["fields"].get("Total Open Time (Days)", ""))),
    ("Current Status Time (Days)", lambda d: _num(d["fields"].get("Current Status Time (Days)", ""))),
    ("Reason for Change", lambda d: d["fields"].get("Reason for Change", "")),
    ("Query History", lambda d: d["fields"].get("Query History", "")),
    ("firstSeen", lambda d: d.get("firstSeen", "")),
    ("lastSeen", lambda d: d.get("lastSeen", "")),
]
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
def _num(value):
"""Převede číselný string na int, jinak vrátí původní hodnotu nebo None."""
if value == "" or value is None:
return None
try:
return int(value)
except (ValueError, TypeError):
try:
return float(value)
except (ValueError, TypeError):
return value
def _visit_sort_key(doc):
    """Sort key for MayoScore documents: site, subject, visit rank, visit.

    The visit rank is the position of the visit in VISIT_ORDER; visits that
    are not listed there share the rank just past the end of the list and are
    then ordered by their visit text.
    """
    visit = doc["fields"].get("Visit", "")
    if visit in VISIT_ORDER:
        rank = VISIT_ORDER.index(visit)
    else:
        rank = len(VISIT_ORDER)
    site_name = doc.get("site", {}).get("name", "")
    subject_id = doc.get("subject", {}).get("id", "")
    return (site_name, subject_id, rank, visit)
def _iso_to_date(value):
"""ISO string → Python date pro Excel."""
if not isinstance(value, str):
return value
try:
return datetime.fromisoformat(value).date()
except ValueError:
return value
# ---------------------------------------------------------------------------
# Styly
# ---------------------------------------------------------------------------
# Header row: white bold text on a dark blue background.
HEADER_FILL = PatternFill("solid", fgColor="1F497D")
HEADER_FONT = Font(bold=True, color="FFFFFF", size=10)
# Default font and alignments for data cells.
CELL_FONT = Font(size=10)
ALIGN_CTR = Alignment(horizontal="center", vertical="center", wrap_text=False)
ALIGN_LEFT = Alignment(horizontal="left", vertical="center")
# Thin grey border applied to all four sides of a cell.
THIN = Side(style="thin", color="BFBFBF")
BORDER = Border(left=THIN, right=THIN, top=THIN, bottom=THIN)
# zebra striping
FILL_ODD = PatternFill("solid", fgColor="FFFFFF")
FILL_EVEN = PatternFill("solid", fgColor="EBF1DE")
# DCR status colours
FILL_DCR_SITE = PatternFill("solid", fgColor="FFFF00") # yellow — waiting for the physician
FILL_DCR_CLARIO = PatternFill("solid", fgColor="BDD7EE") # blue — waiting for Clario
FILL_DCR_QC = PatternFill("solid", fgColor="F4B942") # orange — ReadyForQC
FILL_DCR_DONE = PatternFill("solid", fgColor="FFFFFF") # white — Completed
SCORE_COLS = {"Partial Mayo Score", "Modified Mayo Score", "Full Mayo Score"}
SCORE_FILL = PatternFill("solid", fgColor="FFC7CE") # red for scores ≥ 5 (placeholder — conditional formatting is not used)
# ---------------------------------------------------------------------------
# Sestavení sheetu
# ---------------------------------------------------------------------------
def _build_sheet(ws, docs, columns, date_cols, center_cols, col_widths, row_font_fn=None, wrap_cols=None, header_row=1):
    """Write a styled header row and one styled row per document.

    Args:
        ws: Worksheet to fill.
        docs: Sized sequence of documents, one per data row.
        columns: (header text, getter) pairs; getter(doc) yields the cell value.
        date_cols: Headers whose string values are converted with
            _iso_to_date() and whose cells get the "DD-MMM-YYYY" format.
        center_cols: Headers whose cells are centred (others are left-aligned).
        col_widths: Header -> column width; columns not listed get 14.
        row_font_fn: Optional callable(doc) returning the font for that row.
        wrap_cols: Optional headers whose cells wrap text, aligned top-left.
        header_row: Row number of the header; data starts on the next row.

    The pane is frozen below the header and an auto-filter is set on it.
    """
    headers = [column[0] for column in columns]

    # Header row.
    for col_no, title in enumerate(headers, start=1):
        head = ws.cell(row=header_row, column=col_no, value=title)
        head.font = HEADER_FONT
        head.fill = HEADER_FILL
        head.alignment = ALIGN_CTR
        head.border = BORDER
    ws.row_dimensions[header_row].height = 28

    # Data rows with zebra striping counted from the header row.
    first_data_row = header_row + 1
    for offset, doc in enumerate(docs):
        row_no = first_data_row + offset
        stripe = FILL_EVEN if (row_no - header_row) % 2 == 0 else FILL_ODD
        row_font = row_font_fn(doc) if row_font_fn else CELL_FONT
        for col_no, (col_name, getter) in enumerate(columns, start=1):
            value = getter(doc)
            if col_name in date_cols and isinstance(value, str):
                value = _iso_to_date(value)
            cell = ws.cell(row=row_no, column=col_no, value=value)
            cell.font = row_font
            cell.fill = stripe
            cell.border = BORDER
            if wrap_cols and col_name in wrap_cols:
                cell.alignment = Alignment(horizontal="left", vertical="top", wrap_text=True)
            elif col_name in center_cols:
                cell.alignment = ALIGN_CTR
            else:
                cell.alignment = ALIGN_LEFT

    # Column widths.
    for col_no, title in enumerate(headers, start=1):
        ws.column_dimensions[get_column_letter(col_no)].width = col_widths.get(title, 14)

    # Date display format, applied after the values have been written.
    end_row = first_data_row + len(docs)
    for title in date_cols:
        if title not in headers:
            continue
        letter = get_column_letter(headers.index(title) + 1)
        for row_no in range(first_data_row, end_row):
            ws[f"{letter}{row_no}"].number_format = "DD-MMM-YYYY"

    ws.freeze_panes = f"A{first_data_row}"
    ws.auto_filter.ref = f"A{header_row}:{get_column_letter(len(headers))}{header_row}"
def _score_row_font(doc):
    """Return the row font for a MayoScore document.

    Rows of visit "I-0" whose "Modified Mayo Score" is an integer below 5 get
    a bold red font; every other row gets CELL_FONT.
    """
    fields = doc["fields"]
    if fields.get("Visit", "") != "I-0":
        return CELL_FONT
    try:
        modified_mayo = int(fields.get("Modified Mayo Score", ""))
    except (ValueError, TypeError):
        # Missing or non-integer score: no highlighting.
        return CELL_FONT
    if modified_mayo < 5:
        return Font(size=10, bold=True, color="FF0000")
    return CELL_FONT
def build_mayo_score_sheet(ws, docs):
    """Fill the MayoScore sheet: one row per document, laid out by COLUMNS_SCORE.

    After the generic build, the first column ("KLIKNI SEM") of every data row
    is restyled as white bold text on blue so that it looks like a button.
    """
    date_columns = {"Visit Date", "Last Mayo Score Submission"}
    centered = {
        "KLIKNI SEM", "Visit", "Central Endoscopy Score", "PGA Score",
        "Stool Frequency Sub-score", "Rectal Bleeding Sub-score",
        "Partial Mayo Score", "Modified Mayo Score", "Full Mayo Score",
        "Baseline Stool Frequency",
        "Wk I-12 Responder", "Wk I-12 Remission", "Clinical Flare",
        "Loss of Response", "Partial Mayo Post LoR", "Partial Mayo Non-Resp",
        "Last Mayo Score Submission",
    }
    widths = {
        "KLIKNI SEM": 14,
        "Site": 18, "Subject ID": 16, "Visit": 12, "Visit Date": 14,
        "Baseline Stool Frequency": 14, "Central Endoscopy Score": 14,
        "PGA Score": 10, "Stool Frequency Sub-score": 14,
        "Rectal Bleeding Sub-score": 14, "Partial Mayo Score": 14,
        "Modified Mayo Score": 14, "Full Mayo Score": 13,
        "Site Action": 22, "Last Mayo Score Submission": 16,
        "Wk I-12 Responder": 14, "Wk I-12 Remission": 14,
        "Clinical Flare": 14, "Loss of Response": 14,
        "Partial Mayo Post LoR": 20, "Partial Mayo Non-Resp": 20,
    }
    _build_sheet(
        ws, docs, COLUMNS_SCORE,
        date_cols=date_columns,
        center_cols=centered,
        col_widths=widths,
        row_font_fn=_score_row_font,
    )

    # Button-like look for the navigation column (data rows start at row 2).
    button_font = Font(size=10, bold=True, color="FFFFFF")
    button_fill = PatternFill("solid", fgColor="2E75B6")
    for offset in range(len(docs)):
        button = ws.cell(row=offset + 2, column=1)
        button.font = button_font
        button.fill = button_fill
        button.alignment = ALIGN_CTR
def build_mayo_diary_sheet(ws, docs):
    """Fill the MayoDiary sheet: one row per document, laid out by COLUMNS_DIARY."""
    centered = {
        "Baseline Stool Count", "Stool Frequency", "Not Applicable",
        "Constipation", "Diarrhea", "Irregularity",
    }
    widths = {
        "Subject ID": 16, "Report Date": 14, "Baseline Stool Count": 14,
        "Stool Frequency": 14, "MAYO050": 48, "Not Applicable": 14,
        "Constipation": 14, "Diarrhea": 12, "Irregularity": 14,
    }
    _build_sheet(
        ws, docs, COLUMNS_DIARY,
        date_cols={"Report Date"},
        center_cols=centered,
        col_widths=widths,
    )
def build_eligible_days_sheet(ws, score_docs, diary_docs):
    """Fill the EligibleDays sheet: one row per eligible day of every score doc.

    For each MayoScore document the fields "Eligible Day (-1)" to
    "Eligible Day (-10)" are read; empty or "-" days are skipped.  A day counts
    as excluded ("Included" = "No") when its "Day (-n) Excluded Reason(s)"
    field is non-empty and not "-".  Diary values are taken from the MayoDiary
    document of the same subject whose "Report Date" starts with the same
    YYYY-MM-DD text.  Excluded rows are grey on a yellow fill; all other rows
    are zebra-striped.
    """
    # Diary entries by (subject id, YYYY-MM-DD); for duplicate keys the entry
    # that comes later in diary_docs wins.
    diary_by_day: dict[tuple, dict] = {}
    for entry in diary_docs:
        subject_id = entry.get("subject", {}).get("id", "")
        report_iso = entry["fields"].get("Report Date", "")
        day_key = report_iso[:10] if report_iso else ""
        if subject_id and day_key:
            diary_by_day[(subject_id, day_key)] = entry

    headers = [
        "Included", "Subject ID", "Visit", "Visit Date", "Day",
        "Report Date", "Baseline Stool Count", "Stool Frequency",
        "MAYO050", "Not Applicable", "Constipation", "Diarrhea", "Irregularity",
    ]
    widths = {
        "Included": 10, "Subject ID": 16, "Visit": 10, "Visit Date": 14, "Day": 8,
        "Report Date": 14, "Baseline Stool Count": 14, "Stool Frequency": 14,
        "MAYO050": 48, "Not Applicable": 14, "Constipation": 14,
        "Diarrhea": 12, "Irregularity": 14,
    }
    centered = {
        "Included", "Visit", "Day", "Baseline Stool Count", "Stool Frequency",
        "Not Applicable", "Constipation", "Diarrhea", "Irregularity",
    }
    dated = {"Visit Date", "Report Date"}
    excluded_fill = PatternFill("solid", fgColor="FFF2CC")  # yellow for excluded days

    # Header row.
    for col_no, title in enumerate(headers, start=1):
        head = ws.cell(row=1, column=col_no, value=title)
        head.font = HEADER_FONT
        head.fill = HEADER_FILL
        head.alignment = ALIGN_CTR
        head.border = BORDER
    ws.row_dimensions[1].height = 28

    next_row = 2
    for score_doc in score_docs:
        score_fields = score_doc["fields"]
        subject_id = score_doc.get("subject", {}).get("id", "")
        visit = score_fields.get("Visit", "")
        visit_date = score_fields.get("Visit Date", "")
        if isinstance(visit_date, str):
            visit_date = _iso_to_date(visit_date)

        for day_no in range(1, 11):
            day_iso = score_fields.get(f"Eligible Day (-{day_no})")
            if not day_iso or day_iso == "-":
                continue
            day_key = day_iso[:10]
            reason = score_fields.get(f"Day (-{day_no}) Excluded Reason(s)", "")
            is_excluded = bool(reason) and reason != "-"
            diary_fields = diary_by_day.get((subject_id, day_key), {}).get("fields", {})

            if is_excluded:
                row_fill = excluded_fill
                row_font = Font(size=10, color="808080")
            else:
                row_fill = FILL_EVEN if next_row % 2 == 0 else FILL_ODD
                row_font = CELL_FONT

            row_values = [
                "No" if is_excluded else "Yes",
                subject_id,
                visit,
                visit_date,
                f"-{day_no}",
                _iso_to_date(day_iso),
                _num(diary_fields.get("Baseline Stool Count", "")),
                _num(diary_fields.get("Stool Frequency", "")),
                diary_fields.get("MAYO050", ""),
                diary_fields.get("Not Applicable", ""),
                diary_fields.get("Constipation", ""),
                diary_fields.get("Diarrhea", ""),
                diary_fields.get("Irregularity", ""),
            ]
            for col_no, (title, value) in enumerate(zip(headers, row_values), start=1):
                cell = ws.cell(row=next_row, column=col_no, value=value)
                cell.font = row_font
                cell.fill = row_fill
                cell.border = BORDER
                if title in dated:
                    cell.number_format = "DD-MMM-YYYY"
                cell.alignment = ALIGN_CTR if title in centered else ALIGN_LEFT
            next_row += 1

    for col_no, title in enumerate(headers, start=1):
        ws.column_dimensions[get_column_letter(col_no)].width = widths.get(title, 14)
    ws.freeze_panes = "A2"
    ws.auto_filter.ref = f"A1:{get_column_letter(len(headers))}1"
def _build_dcr_legend(ws):
    """Write the four-row colour legend of a DCR sheet into rows 1-4.

    Column A shows the colour swatch and column B the explanation.  Row 5 is
    not written, so it stays blank.
    """
    legend_rows = (
        (FILL_DCR_SITE, "Čeká lékař — Next Action Required = Site (lékař musí odpovědět nebo potvrdit)"),
        (FILL_DCR_CLARIO, "Čeká Clario — Next Action Required = Clario DM (Clario dostalo podklady, provede změnu)"),
        (FILL_DCR_QC, "ReadyForQC — Clario provedlo změny, čeká na finální QC kontrolu"),
        (FILL_DCR_DONE, "Completed / Resolved — DCR je uzavřen"),
    )
    row_no = 0
    for swatch_fill, caption in legend_rows:
        row_no += 1
        swatch = ws.cell(row=row_no, column=1, value="")
        swatch.fill = swatch_fill
        swatch.border = BORDER
        label = ws.cell(row=row_no, column=2, value=caption)
        label.font = Font(size=10, bold=True)
        label.alignment = ALIGN_LEFT
def _dcr_row_fill(doc):
    """Return the row fill that matches the state of a DCR document.

    Checked in this order: status "Completed"/"Resolved", status "ReadyForQC",
    "Site" in "Next Action Required", then "Clario" in it or an empty value.
    Anything else gets the plain FILL_ODD.
    """
    fields = doc["fields"]
    status = fields.get("Status", "")
    next_action = fields.get("Next Action Required", "")

    if status in ("Completed", "Resolved"):
        chosen = FILL_DCR_DONE
    elif status == "ReadyForQC":
        chosen = FILL_DCR_QC
    elif "Site" in next_action:
        chosen = FILL_DCR_SITE
    elif next_action == "" or "Clario" in next_action:
        chosen = FILL_DCR_CLARIO
    else:
        chosen = FILL_ODD
    return chosen
def build_ecoa_dcrs_sheet(ws, docs):
    """Fill the eCOA_DCRs sheet: legend on top, table header on row 6.

    Documents are sorted by site, subject and "Creation Date UTC".  After the
    generic build every data row is recoloured according to its DCR state,
    which replaces the zebra striping.
    """
    _build_dcr_legend(ws)

    def _order(doc):
        return (
            doc.get("site", {}).get("name", ""),
            doc.get("subject", {}).get("id", ""),
            doc["fields"].get("Creation Date UTC", ""),
        )

    ordered = sorted(docs, key=_order)
    centered = {
        "Status", "Type", "Next Action Required", "Category",
        "Total Open Time (Days)", "Current Status Time (Days)",
        "Baseline Stool Count", "firstSeen", "lastSeen",
    }
    widths = {
        "Site": 16, "Subject ID": 16, "Data Correction ID": 18,
        "PI Name": 18, "Creation Date UTC": 14, "Date of Last Action UTC": 14,
        "Status": 14, "Type": 16, "Next Action Required": 16, "Category": 20,
        "Total Open Period": 14, "Total Open Time (Days)": 14,
        "Current Status Time (Days)": 16, "Reason for Change": 20,
        "Description": 50, "Resolution": 50, "Query History": 60,
        "Age at Informed Consent": 14, "Baseline Stool Count": 14,
        "firstSeen": 12, "lastSeen": 12,
    }
    _build_sheet(
        ws, ordered, COLUMNS_ECOA_DCRS,
        date_cols={"Creation Date UTC", "Date of Last Action UTC"},
        center_cols=centered,
        col_widths=widths,
        wrap_cols={"Reason for Change", "Description", "Resolution", "Query History"},
        header_row=6,
        row_font_fn=lambda doc: CELL_FONT,
    )

    # Status colours override the zebra fill; data rows start below row 6.
    first_data_row = 7
    last_col = len(COLUMNS_ECOA_DCRS)
    for offset, doc in enumerate(ordered):
        status_fill = _dcr_row_fill(doc)
        for col_no in range(1, last_col + 1):
            ws.cell(row=first_data_row + offset, column=col_no).fill = status_fill
def build_ecg_dcrs_sheet(ws, docs):
    """Fill the ECG_DCRs sheet: legend on top, table header on row 6.

    Documents are sorted by site, subject and "Creation Date UTC".  After the
    generic build every data row is recoloured according to its DCR state,
    which replaces the zebra striping.
    """
    _build_dcr_legend(ws)

    def _order(doc):
        return (
            doc.get("site", {}).get("name", ""),
            doc.get("subject", {}).get("id", ""),
            doc["fields"].get("Creation Date UTC", ""),
        )

    ordered = sorted(docs, key=_order)
    centered = {
        "Status", "Type", "Next Action Required", "Category",
        "Total Open Time (Days)", "Current Status Time (Days)",
        "firstSeen", "lastSeen",
    }
    widths = {
        "Site ID": 14, "Subject Number": 16, "Data Correction ID": 16,
        "PI Name": 18, "Age": 10, "Creation Date UTC": 14,
        "Date of Last Action UTC": 14, "Status": 14, "Type": 12,
        "Next Action Required": 16, "Category": 14,
        "Total Open Period": 14, "Total Open Time (Days)": 14,
        "Current Status Time (Days)": 16, "Reason for Change": 20,
        "Query History": 60, "firstSeen": 12, "lastSeen": 12,
    }
    _build_sheet(
        ws, ordered, COLUMNS_ECG_DCRS,
        date_cols={"Creation Date UTC", "Date of Last Action UTC"},
        center_cols=centered,
        col_widths=widths,
        wrap_cols={"Query History"},
        header_row=6,
        row_font_fn=lambda doc: CELL_FONT,
    )

    # Status colours override the zebra fill; data rows start below row 6.
    first_data_row = 7
    last_col = len(COLUMNS_ECG_DCRS)
    for offset, doc in enumerate(ordered):
        status_fill = _dcr_row_fill(doc)
        for col_no in range(1, last_col + 1):
            ws.cell(row=first_data_row + offset, column=col_no).fill = status_fill
# ---------------------------------------------------------------------------
# List Compliance
# ---------------------------------------------------------------------------
# Visit order used to compute the windows (Unscheduled etc. are ignored)
COMPLIANCE_VISIT_ORDER = ["I-0", "I-2", "I-4", "I-8", "I-12", "M-4"]
FILL_COMPLIANCE_OK = PatternFill("solid", fgColor="C6EFCE") # green — compliance ≥ 100 %
FONT_COMPLIANCE_OK = Font(size=10, color="006100")
def _compliance_rows(score_docs, diary_docs):
    """Compute one compliance record per subject and scheduled visit.

    Returns a list of dicts with "site", "subj", "visit", "start", "end",
    "days", "filled", "pct" (rounded percentage) and "ok" (True when at least
    as many diary records as window days exist), ordered by subject and then
    by the position of the visit in COMPLIANCE_VISIT_ORDER.
    """
    # Diary dates per subject (single pass); unparsable dates are dropped.
    diary_by_subj: dict[str, list] = {}
    for d in diary_docs:
        subj = d.get("subject", {}).get("id", "")
        dt = _iso_to_date(d["fields"].get("Report Date", ""))
        if subj and hasattr(dt, "year"):
            diary_by_subj.setdefault(subj, []).append(dt)
    first_diary = {s: min(dts) for s, dts in diary_by_subj.items() if dts}

    # Visits per subject, restricted to the known scheduled visits.
    by_subj: dict[str, list] = {}
    for sd in score_docs:
        if sd["fields"].get("Visit", "") not in COMPLIANCE_VISIT_ORDER:
            continue
        by_subj.setdefault(sd.get("subject", {}).get("id", ""), []).append(sd)

    rows = []
    for subj in sorted(by_subj):
        visits = sorted(
            by_subj[subj],
            key=lambda d: COMPLIANCE_VISIT_ORDER.index(d["fields"].get("Visit", "")),
        )
        prev_end = None
        for sd in visits:
            visit = sd["fields"].get("Visit", "")
            vdate = _iso_to_date(sd["fields"].get("Visit Date", ""))
            if not hasattr(vdate, "year"):
                # No usable visit date: the next window keeps the old start.
                continue
            if visit == "I-0" or prev_end is None:
                start = first_diary.get(subj)
            else:
                start = prev_end + timedelta(days=1)
            prev_end = vdate
            if start is None:
                continue
            days = (vdate - start).days + 1
            if days <= 0:
                continue
            # NOTE(review): this counts diary records, not distinct days, so
            # several records on one day can push the value above 100 %.
            filled = sum(1 for dt in diary_by_subj.get(subj, []) if start <= dt <= vdate)
            rows.append({
                "site": sd.get("site", {}).get("name", ""),
                "subj": subj,
                "visit": visit,
                "start": start,
                "end": vdate,
                "days": days,
                "filled": filled,
                "pct": round(filled / days * 100),
                # Decide on the exact counts: the rounded percentage already
                # reads 100 for e.g. 199 of 200 days.
                "ok": filled >= days,
            })
    return rows


def build_compliance_sheet(ws, score_docs, diary_docs):
    """Fill the Compliance sheet: MayoDiary completion between visits.

    Window for I-0     = from the subject's first MayoDiary record to the I-0 date.
    Window for others  = from (previous visit date + 1 day) to the visit date;
                         when there is no previous visit, from the first
                         MayoDiary record.
    Filled             = number of the subject's MayoDiary records whose
                         Report Date lies inside the window.
    Days in window     = number of calendar days of the window (both ends
                         included).

    Rows whose window is completely filled (filled >= days) are shown in
    green; all other rows are zebra-striped.
    """
    rows = _compliance_rows(score_docs, diary_docs)

    headers = ["Site", "Subject ID", "Visit", "Okno od", "Okno do",
               "Dní v okně", "Vyplněno", "Compliance %"]
    col_widths = {"Site": 18, "Subject ID": 16, "Visit": 10, "Okno od": 14,
                  "Okno do": 14, "Dní v okně": 12, "Vyplněno": 12, "Compliance %": 14}
    center_cols = {"Visit", "Dní v okně", "Vyplněno", "Compliance %"}
    date_cols = {"Okno od", "Okno do"}

    for col_idx, header in enumerate(headers, 1):
        cell = ws.cell(row=1, column=col_idx, value=header)
        cell.font = HEADER_FONT
        cell.fill = HEADER_FILL
        cell.alignment = ALIGN_CTR
        cell.border = BORDER
    ws.row_dimensions[1].height = 28

    for row_idx, r in enumerate(rows, 2):
        if r["ok"]:
            fill = FILL_COMPLIANCE_OK
            font = FONT_COMPLIANCE_OK
        else:
            fill = FILL_EVEN if row_idx % 2 == 0 else FILL_ODD
            font = CELL_FONT
        values = [r["site"], r["subj"], r["visit"], r["start"], r["end"],
                  r["days"], r["filled"], r["pct"]]
        for col_idx, (header, value) in enumerate(zip(headers, values), 1):
            cell = ws.cell(row=row_idx, column=col_idx, value=value)
            cell.font = font
            cell.fill = fill
            cell.border = BORDER
            if header in date_cols:
                cell.number_format = "DD-MMM-YYYY"
            if header == "Compliance %":
                cell.number_format = '0"%"'
            cell.alignment = ALIGN_CTR if header in center_cols else ALIGN_LEFT

    for col_idx, header in enumerate(headers, 1):
        ws.column_dimensions[get_column_letter(col_idx)].width = col_widths.get(header, 14)
    ws.freeze_panes = "A2"
    ws.auto_filter.ref = f"A1:{get_column_letter(len(headers))}1"
# ---------------------------------------------------------------------------
# Helpers: výstupní cesta
# ---------------------------------------------------------------------------
def _unique_path(directory: Path, stem: str, suffix: str) -> Path:
candidate = directory / f"{stem}{suffix}"
if not candidate.exists():
return candidate
n = 2
while True:
candidate = directory / f"{stem} ({n}){suffix}"
if not candidate.exists():
return candidate
n += 1
# ---------------------------------------------------------------------------
# Timing helper
# ---------------------------------------------------------------------------
def _tick(label: str, t0: float) -> float:
"""Vypíše dobu od t0 a vrátí aktuální čas jako nový t0."""
elapsed = time.perf_counter() - t0
print(f" {label:<30} {elapsed:6.2f} s")
return time.perf_counter()
# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------
def main():
    """Build the Clario report workbook end to end and print per-step timings.

    Steps: read four collections from MongoDB, sort the score and diary
    documents, build one worksheet per data set, save the workbook as a
    temporary .xlsx, then let ``inject_vba`` turn it into the final .xlsm
    and delete the temporary .xlsx.
    """
    t_total = time.perf_counter()
    print("Spouštím generování reportu...")
    print()

    # -- 1. MongoDB: connect + fetch + sort ----------------------------------
    t = time.perf_counter()
    client = MongoClient(MONGO_URI, serverSelectionTimeoutMS=5000)
    try:
        client.admin.command("ping")
        db = client[DB_NAME]
        score_docs = list(db["Clario.MayoScore"].find({}))
        diary_docs = list(db["Clario.MayoDiary"].find({}))
        ecoa_dcr_docs = list(db["Clario.eCOA_DCRs"].find({}))
        ecg_dcr_docs = list(db["Clario.ECG_DCRs"].find({}))
    finally:
        # Release the connection even when the ping or one of the queries fails.
        client.close()

    score_docs.sort(key=_visit_sort_key)
    # Both lookups go through .get() so a diary document without "subject" or
    # "fields" sorts with an empty key instead of raising KeyError.
    diary_docs.sort(key=lambda d: (
        d.get("subject", {}).get("id", ""),
        d.get("fields", {}).get("Report Date", ""),
    ))
    t = _tick(f"MongoDB (ping, fetch, sort → {len(score_docs)} + {len(diary_docs)} + {len(ecoa_dcr_docs)} + {len(ecg_dcr_docs)} záznamů)", t)

    # -- 2–4. Build the worksheets -------------------------------------------
    wb = Workbook()
    ws_score = wb.active
    ws_score.title = "MayoScore"
    build_mayo_score_sheet(ws_score, score_docs)
    t = _tick("List MayoScore (KLIKNI SEM, zebra, červené I-0, autofilter)", t)

    ws_diary = wb.create_sheet("MayoDiary")
    build_mayo_diary_sheet(ws_diary, diary_docs)
    t = _tick("List MayoDiary (zebra, formátování dat, autofilter)", t)

    ws_comp = wb.create_sheet("Compliance")
    build_compliance_sheet(ws_comp, score_docs, diary_docs)
    t = _tick("List Compliance (okna mezi visitami, % vyplnění, zelená ≥100 %)", t)

    ws_days = wb.create_sheet("EligibleDays")
    build_eligible_days_sheet(ws_days, score_docs, diary_docs)
    t = _tick("List EligibleDays (diary lookup, included/excluded flag, autofilter)", t)

    ws_ecoa = wb.create_sheet("eCOA_DCRs")
    build_ecoa_dcrs_sheet(ws_ecoa, ecoa_dcr_docs)
    t = _tick(f"List eCOA_DCRs ({len(ecoa_dcr_docs)} záznamů)", t)

    ws_ecg = wb.create_sheet("ECG_DCRs")
    build_ecg_dcrs_sheet(ws_ecg, ecg_dcr_docs)
    t = _tick(f"List ECG_DCRs ({len(ecg_dcr_docs)} záznamů)", t)

    # -- 5. Save the XLSX -----------------------------------------------------
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
    today = datetime.now().strftime("%Y-%m-%d")
    base_stem = f"{today} 77242113UCO3001 Clario Reports"
    # The final name is chosen for the .xlsm; the temporary .xlsx shares its stem.
    xlsm_path = _unique_path(OUTPUT_DIR, base_stem, ".xlsm")
    xlsx_path = xlsm_path.with_suffix(".xlsx")
    wb.save(str(xlsx_path))
    t = _tick("Uložení XLSX (openpyxl, dočasný soubor)", t)

    # -- 6. Inject the VBA ----------------------------------------------------
    inject_vba(xlsx_path, xlsm_path)
    # Only reached when the injection succeeded; on failure the .xlsx stays on disk.
    xlsx_path.unlink(missing_ok=True)
    _tick("Injektování VBA (xlwings: open → AddFromString → SaveAs .xlsm)", t)

    # -- Summary --------------------------------------------------------------
    total = time.perf_counter() - t_total
    print()
    print(f" {'Celkem':<30} {total:6.2f} s")
    print()
    print(f"Uloženo: {xlsm_path}")
def inject_vba(xlsx_path: Path, xlsm_path: Path) -> None:
    """Open *xlsx_path* in a hidden Excel instance, attach the sheet macro and save it as *xlsm_path*.

    The macro is added to the code module of the worksheet whose tab name is
    "MayoScore"; when no such component is found, the component named
    "Sheet1" is used instead. The Excel instance is always quit, even when a
    step fails.
    """
    vba_code = '''\
Private Sub Worksheet_SelectionChange(ByVal Target As Range)
If Target.Row < 2 Then Exit Sub
If Target.Rows.Count > 1 Then Exit Sub
If Target.Column <> 1 Then Exit Sub
Dim subjectId As String
Dim visit As String
subjectId = CStr(Me.Cells(Target.Row, 3).Value)
visit = CStr(Me.Cells(Target.Row, 4).Value)
If subjectId = "" Or visit = "" Then Exit Sub
Dim ws As Worksheet
On Error Resume Next
Set ws = ThisWorkbook.Sheets("EligibleDays")
On Error GoTo 0
If ws Is Nothing Then Exit Sub
Application.ScreenUpdating = False
ws.AutoFilterMode = False
ws.Range("A1").AutoFilter
ws.Range("A1").AutoFilter Field:=2, Criteria1:=subjectId
ws.Range("A1").AutoFilter Field:=3, Criteria1:=visit
ws.Activate
ws.Range("A2").Select
Application.ScreenUpdating = True
End Sub
'''
    excel = xw.App(visible=False)
    try:
        book = excel.books.open(str(xlsx_path))
        components = book.api.VBProject.VBComponents

        # Locate the VBComponent that belongs to the "MayoScore" tab.
        target = None
        for component in components:
            if component.Type != 100:  # 100 = document module (sheet/workbook)
                continue
            try:
                is_score_sheet = component.Properties("Name").Value == "MayoScore"
            except Exception:
                # Best effort: a component whose properties cannot be read is skipped.
                continue
            if is_score_sheet:
                target = component
                break

        if target is None:
            # Fallback: the first sheet's default code name.
            target = components("Sheet1")

        target.CodeModule.AddFromString(vba_code)
        book.api.SaveAs(str(xlsm_path), FileFormat=52)  # 52 = xlOpenXMLWorkbookMacroEnabled
        book.close()
    finally:
        excel.quit()
if __name__ == "__main__":
main()
+20 -8
View File
@@ -1,7 +1,7 @@
"""
import_to_mongo.py
Verze: 1.2
Datum: 2026-06-02
Verze: 1.3
Datum: 2026-06-15
Import Clario CSV do MongoDB (databáze: Clario).
@@ -11,7 +11,8 @@ Klíč: MayoDiary → Subject ID + Form Number
MayoScore → Participant ID + Visit
eCOA_DCRs → Data Correction ID
ECG_DCRs → Data Correction ID
Historie: při změně fields se stará verze uloží do pole history[]
Historie: při změně jakéhokoliv datového sloupce (fields + outcome cols) se stará
verze uloží do pole history[] spolu s outcome poli
Po importu přesune zpracované CSV do downloads/Zpracovano/
Použití:
@@ -119,6 +120,14 @@ def detect_collection_type(filename: str) -> str | None:
return None
def data_snapshot(doc: dict, outcome_cols: tuple) -> dict:
    """Return a comparable snapshot of a document's data: ``fields`` plus every outcome column.

    A missing ``fields`` entry becomes ``{}``; a missing outcome column
    becomes ``None``.
    """
    return {
        "fields": doc.get("fields", {}),
        **{name: doc.get(name) for name in outcome_cols},
    }
# ---------------------------------------------------------------------------
# CSV → dokument
# ---------------------------------------------------------------------------
@@ -176,6 +185,7 @@ def import_file(csv_path: str, db) -> dict:
cfg = COLLECTION_CONFIG[col_type]
col_name = cfg["collection"]
outcome_cols = tuple(cfg.get("outcome_cols", ()))
snapshot_date = extract_snapshot_date(filename)
collection = db[col_name]
@@ -207,11 +217,13 @@ def import_file(csv_path: str, db) -> dict:
collection.insert_one(doc)
inserted += 1
elif existing.get("fields") != doc["fields"]:
old_entry = {
"date": existing.get("lastSeen", snapshot_date),
"fields": existing["fields"],
}
elif data_snapshot(existing, outcome_cols) != data_snapshot(doc, outcome_cols):
# Uložíme kompletní snapshot starého stavu (fields + outcome cols)
old_entry = {"date": existing.get("lastSeen", snapshot_date)}
for col in outcome_cols:
old_entry[col] = existing.get(col)
old_entry["fields"] = existing.get("fields", {})
update_doc = {k: v for k, v in doc.items()}
update_doc["lastSeen"] = snapshot_date
collection.update_one(
+36
View File
@@ -0,0 +1,36 @@
# scan_cleanup_v1.0.ps1
**Verze:** 1.0 · **Datum:** 2026-06-15
## Účel
READ-ONLY skener volného místa na disku pro účet **bez admin práv** (např. JNJ
počítač). Skript **nic nemaže** — jen proskenuje typická user-space místa, kde
lze uklízet bez administrátora, a vypíše přehled seřazený podle velikosti.
## Spuštění
Na JNJ počítači v PowerShellu:
```powershell
powershell -ExecutionPolicy Bypass -File .\scan_cleanup_v1.0.ps1
```
Uložení reportu do souboru:
```powershell
powershell -ExecutionPolicy Bypass -File .\scan_cleanup_v1.0.ps1 *> report.txt
```
## Co skript dělá
1. Vypíše volné/obsazené místo na systémovém disku.
2. **Bezpečně smazatelné** — cache/temp (uživatelský TEMP, INetCache, WER,
thumbnail cache, Chrome/Edge/Firefox cache, Teams, Office cache, pip/npm/
NuGet/Playwright cache, Spotify storage, CrashDumps) s velikostí a počtem souborů.
3. **K ruční kontrole** — Downloads, Plocha, Dokumenty, Koš (nemazat automaticky).
4. **20 největších souborů v profilu** nad 100 MB.
5. Vypíše hotové příkazy pro **ruční** smazání (pozor: aplikace musí být zavřené).
## Poznámky
- Vše skenuje pouze v rámci uživatelského profilu → nepotřebuje admina.
- Zamčené soubory (běžící prohlížeč apod.) se při pozdějším mazání přeskočí —
před úklidem cache zavřít příslušnou aplikaci.
- Reálné smazání si pouští uživatel ručně, nikdy ne skript sám.
+164
View File
@@ -0,0 +1,164 @@
# =============================================================================
# scan_cleanup_v1.0.ps1
# Verze: 1.0
# Datum: 2026-06-15
# Autor: Vladimír Buzalka (s asistencí Claude)
# Popis: READ-ONLY skener volného místa na disku pro účet bez admin práv.
# NIC NEMAŽE. Pouze proskenuje typická "user-space" místa, kde lze
# uklízet bez administrátorských oprávnění, a vypíše přehled
# seřazený podle velikosti + návrhy co smazat. Na konci ukáže
# přesné příkazy pro skutečné smazání (musíš je spustit ručně).
#
# Spuštění (z PowerShellu, NEpotřebuje admina):
# powershell -ExecutionPolicy Bypass -File .\scan_cleanup_v1.0.ps1
# Volitelně uložení reportu do souboru:
# powershell -ExecutionPolicy Bypass -File .\scan_cleanup_v1.0.ps1 *> report.txt
# =============================================================================
$ErrorActionPreference = 'SilentlyContinue'
$ProgressPreference = 'SilentlyContinue'
function Format-Size {
    <#
    .SYNOPSIS
    Formats a byte count as a short human-readable string (GB, MB, KB or B).
    #>
    param([long]$Bytes)

    # Largest unit first; the first unit the value reaches wins.
    $units = @(
        @{ Limit = 1GB; Format = '{0:N2} GB' },
        @{ Limit = 1MB; Format = '{0:N1} MB' },
        @{ Limit = 1KB; Format = '{0:N0} KB' }
    )
    foreach ($unit in $units) {
        if ($Bytes -ge $unit.Limit) {
            return ($unit.Format -f ($Bytes / $unit.Limit))
        }
    }
    return "$Bytes B"
}
function Get-FolderSize {
    <#
    .SYNOPSIS
    Returns total size and file count of all files below a folder.
    .DESCRIPTION
    Returns $null when the path does not exist, otherwise an object with
    Bytes and Count (both 0 when no files were found). Files that cannot be
    enumerated are silently left out.
    #>
    param([string]$Path)

    if (Test-Path -LiteralPath $Path) {
        $found = Get-ChildItem -LiteralPath $Path -Recurse -Force -File -ErrorAction SilentlyContinue
        if ($found) {
            $stats = $found | Measure-Object -Property Length -Sum
            return [pscustomobject]@{
                Bytes = [long]($stats.Sum)
                Count = [int]$stats.Count
            }
        }
        return [pscustomobject]@{ Bytes = 0; Count = 0 }
    }
    return $null
}
# --- Hlavička / info o disku -------------------------------------------------
Write-Host ""
Write-Host "===========================================================" -ForegroundColor Cyan
Write-Host " SKEN MOZNOSTI UKLIDU DISKU (read-only, bez admina)" -ForegroundColor Cyan
Write-Host " Pocitac: $env:COMPUTERNAME Uzivatel: $env:USERNAME" -ForegroundColor Cyan
Write-Host " Cas: $(Get-Date -Format 'yyyy-MM-dd HH:mm:ss')" -ForegroundColor Cyan
Write-Host "===========================================================" -ForegroundColor Cyan
Write-Host ""
# Free space on the system drive.
# $env:SystemDrive looks like "C:", while Get-PSDrive wants the bare name "C".
# The name is derived directly from the variable: Get-Item "C:" is a
# drive-relative path that resolves to the current directory on that drive,
# so its .Name is not the drive letter and the lookup silently found nothing.
$drive = Get-PSDrive -Name ($env:SystemDrive.TrimEnd(':')) -ErrorAction SilentlyContinue
if ($drive) {
    $free = $drive.Free
    $used = $drive.Used
    $total = $free + $used
    Write-Host ("Disk {0} celkem: {1} volne: {2} obsazeno: {3}" -f `
        $env:SystemDrive, (Format-Size $total), (Format-Size $free), (Format-Size $used)) -ForegroundColor Yellow
    Write-Host ""
}
# --- Candidate locations (user-writable places; no admin rights needed) ------
# Treated as safe to delete: caches, temp folders, crash dumps and error reports.
# Each entry is a hashtable with a display Name and the folder Path to measure.
# NOTE(review): the Safe key is set on every entry but is not read anywhere in this script.
# NOTE(review): $env:TEMP usually points at the same folder as LOCALAPPDATA\Temp - confirm.
$candidates = @(
@{ Name = 'Uzivatelsky TEMP'; Path = $env:TEMP; Safe = $true }
@{ Name = 'Windows Temp (user)'; Path = (Join-Path $env:LOCALAPPDATA 'Temp'); Safe = $true }
@{ Name = 'INetCache (IE/Win)'; Path = (Join-Path $env:LOCALAPPDATA 'Microsoft\Windows\INetCache'); Safe = $true }
@{ Name = 'WER - chybove reporty'; Path = (Join-Path $env:LOCALAPPDATA 'Microsoft\Windows\WER'); Safe = $true }
@{ Name = 'Explorer thumbnail cache'; Path = (Join-Path $env:LOCALAPPDATA 'Microsoft\Windows\Explorer'); Safe = $true }
@{ Name = 'Chrome - Cache'; Path = (Join-Path $env:LOCALAPPDATA 'Google\Chrome\User Data\Default\Cache'); Safe = $true }
@{ Name = 'Chrome - Code Cache'; Path = (Join-Path $env:LOCALAPPDATA 'Google\Chrome\User Data\Default\Code Cache'); Safe = $true }
@{ Name = 'Chrome - GPUCache'; Path = (Join-Path $env:LOCALAPPDATA 'Google\Chrome\User Data\Default\GPUCache'); Safe = $true }
@{ Name = 'Edge - Cache'; Path = (Join-Path $env:LOCALAPPDATA 'Microsoft\Edge\User Data\Default\Cache'); Safe = $true }
@{ Name = 'Edge - Code Cache'; Path = (Join-Path $env:LOCALAPPDATA 'Microsoft\Edge\User Data\Default\Code Cache'); Safe = $true }
@{ Name = 'Firefox - cache2'; Path = (Join-Path $env:LOCALAPPDATA 'Mozilla\Firefox\Profiles'); Safe = $true }
@{ Name = 'Teams - cache (classic)'; Path = (Join-Path $env:APPDATA 'Microsoft\Teams'); Safe = $true }
@{ Name = 'Teams - cache (new)'; Path = (Join-Path $env:LOCALAPPDATA 'Packages\MSTeams_8wekyb3d8bbwe\LocalCache'); Safe = $true }
@{ Name = 'Office - dokumentova cache'; Path = (Join-Path $env:LOCALAPPDATA 'Microsoft\Office\16.0\OfficeFileCache'); Safe = $true }
@{ Name = 'pip cache (Python)'; Path = (Join-Path $env:LOCALAPPDATA 'pip\Cache'); Safe = $true }
@{ Name = 'Playwright browsers cache'; Path = (Join-Path $env:LOCALAPPDATA 'ms-playwright'); Safe = $true }
@{ Name = 'npm cache'; Path = (Join-Path $env:LOCALAPPDATA 'npm-cache'); Safe = $true }
@{ Name = 'NuGet cache'; Path = (Join-Path $env:USERPROFILE '.nuget\packages'); Safe = $true }
@{ Name = 'Spotify - Storage'; Path = (Join-Path $env:LOCALAPPDATA 'Spotify\Storage'); Safe = $true }
@{ Name = 'CrashDumps'; Path = (Join-Path $env:LOCALAPPDATA 'CrashDumps'); Safe = $true }
)
# Locations to review by hand, NOT to delete automatically (the user has to judge the content).
# The recycle bin path is built on the system drive, outside the user profile.
$review = @(
@{ Name = 'Slozka Downloads (stahovani)'; Path = (Join-Path $env:USERPROFILE 'Downloads') }
@{ Name = 'Plocha (Desktop)'; Path = (Join-Path $env:USERPROFILE 'Desktop') }
@{ Name = 'Dokumenty'; Path = (Join-Path $env:USERPROFILE 'Documents') }
@{ Name = 'Kos (Recycle Bin)'; Path = (Join-Path $env:SystemDrive '\$Recycle.Bin') }
)
Write-Host "--- BEZPECNE SMAZATELNE (cache / temp) --------------------" -ForegroundColor Green
# Measure every candidate and keep the non-empty ones.
# Two entries can point at the same folder (e.g. $env:TEMP normally equals
# %LOCALAPPDATA%\Temp); such a folder is listed and counted only once, otherwise
# the total below would be inflated. Hashtable keys compare case-insensitively.
# NOTE(review): GetFullPath is expected to also unify 8.3 short names on
# Windows PowerShell 5.1 - confirm on the target machine.
$seenPaths = @{}
$results = @()
foreach ($c in $candidates) {
    $info = Get-FolderSize -Path $c.Path
    if (-not $info -or $info.Bytes -le 0) { continue }

    $pathKey = [System.IO.Path]::GetFullPath($c.Path).TrimEnd('\')
    if ($seenPaths.ContainsKey($pathKey)) { continue }
    $seenPaths[$pathKey] = $true

    $results += [pscustomobject]@{
        Name  = $c.Name
        Path  = $c.Path
        Bytes = $info.Bytes
        Count = $info.Count
    }
}
# Print largest first and add up the reclaimable total.
$totalSafe = 0
foreach ($r in ($results | Sort-Object Bytes -Descending)) {
    $totalSafe += $r.Bytes
    Write-Host (" {0,10} {1,7} souboru {2}" -f (Format-Size $r.Bytes), $r.Count, $r.Name)
    Write-Host (" -> {0}" -f $r.Path) -ForegroundColor DarkGray
}
if ($results.Count -eq 0) { Write-Host " (nic vyznamneho nenalezeno)" -ForegroundColor DarkGray }
Write-Host ""
Write-Host (" >>> POTENCIAL CACHE/TEMP CELKEM: {0}" -f (Format-Size $totalSafe)) -ForegroundColor Green
Write-Host ""
Write-Host "--- K RUCNI KONTROLE (NEMAZAT automaticky) ----------------" -ForegroundColor Yellow
# Print size and file count of every manual-review location, in list order.
foreach ($c in ($review)) {
    $info = Get-FolderSize -Path $c.Path
    # Get-FolderSize returns $null for a path that does not exist; those entries are skipped.
    # Existing but empty locations are still printed (with 0 files).
    if ($info) {
        Write-Host (" {0,10} {1,7} souboru {2}" -f (Format-Size $info.Bytes), $info.Count, $c.Name)
        Write-Host (" -> {0}" -f $c.Path) -ForegroundColor DarkGray
    }
}
Write-Host ""
# --- Largest files in the profile (>100 MB) ----------------------------------
Write-Host "--- 20 NEJVETSICH SOUBORU V PROFILU (>100 MB) -------------" -ForegroundColor Magenta
# Walk the whole user profile (hidden files included, unreadable ones skipped),
# keep files larger than 100 MB and take the 20 biggest.
$big = Get-ChildItem -LiteralPath $env:USERPROFILE -Recurse -Force -File -ErrorAction SilentlyContinue |
    Where-Object { $_.Length -gt 100MB } |
    Sort-Object Length -Descending |
    Select-Object -First 20
if ($big) {
    foreach ($f in $big) {
        Write-Host (" {0,10} {1}" -f (Format-Size $f.Length), $f.FullName)
    }
} else {
    Write-Host " (zadne soubory nad 100 MB)" -ForegroundColor DarkGray
}
Write-Host ""
# --- How to actually delete (instructions are only printed, never executed) --
Write-Host "===========================================================" -ForegroundColor Cyan
Write-Host " JAK SKUTECNE SMAZAT (spustit RUCNE, az po kontrole):" -ForegroundColor Cyan
Write-Host "===========================================================" -ForegroundColor Cyan
# Everything between @" and "@ is one here-string that is printed verbatim.
# The "#" lines inside it are part of the printed text, not script comments,
# and the backtick before $env keeps the variable from being expanded.
Write-Host @"
# Vyprazdneni kose:
Clear-RecycleBin -Force
# Smazani obsahu uzivatelskeho TEMP (zavri aplikace; nektere zamcene soubory zustanou):
Get-ChildItem -LiteralPath `$env:TEMP -Recurse -Force -ErrorAction SilentlyContinue |
Remove-Item -Recurse -Force -ErrorAction SilentlyContinue
# Smazani konkretni cache slozky (priklad Chrome) - prohlizec MUSI byt zavreny:
Remove-Item -LiteralPath "`$env:LOCALAPPDATA\Google\Chrome\User Data\Default\Cache\*" -Recurse -Force -ErrorAction SilentlyContinue
# Spravce mista ve Windows (bez admina): Nastaveni > System > Uloziste
"@ -ForegroundColor Gray
Write-Host ""
Write-Host "Hotovo. Skript NIC nesmazal - jen vypsal prehled." -ForegroundColor Green
@@ -0,0 +1,21 @@
# store_cda_batch_v1.4.py
**Verze:** 1.4 · **Datum:** 2026-06-15
Dávkové uložení binárek CDA (PDF) do Mongo `feasibility.investigators` →
`cda.data_*`. Zdroj = `.msg` na Toweru (`/mnt/user/JNJEMAILS`), SFTP + extract_msg.
## Spuštění
```
python store_cda_batch_v1.4.py # dry-run
python store_cda_batch_v1.4.py --apply # zápis
```
## Historie
- v1.4 — DÁVKA 7 (15JUN2026): Molnár Martin (GASTROMART s.r.o., krok 4→5),
Dzuriková Michaela (IBDcentrum s.r.o., krok 4→5).
- v1.3 — DÁVKA 6 (12JUN2026): Gregušová Katarína, Drastich Pavel.
- v1.2 — DÁVKA 5 (11JUN2026): Mudr Robert.
- v1.1 — DÁVKA 4 (11JUN2026): Konečný Michal, Baláž Jozef.
- v1.0 — DÁVKY 1–3 (09–10JUN2026): Hlavatý, Fedurco, Tichý, Falc, Pešta,
Jungwirthová, Matouš, Mihálkanin, Krížová, Gregar, Ďurina, Horváth.
@@ -0,0 +1,139 @@
# -*- coding: utf-8 -*-
# =============================================================================
# Nazev: store_cda_batch_v1.4.py
# Verze: 1.4
# Datum: 2026-06-15
# Popis: Davkove ulozi binarky CDA (PDF) do Mongo k investigatorum
# (feasibility.investigators -> cda.data_*). Zdroj = .msg soubory na
# Toweru (/mnt/user/JNJEMAILS), stazene pres SFTP, priloha vytazena
# extract_msg. Mapovani investigator -> (.msg, attachment) je
# explicitni. Zapise cda.data_* + doplni cda.soubor.
# Pouziti: python store_cda_batch_v1.4.py (dry-run / nahled)
# python store_cda_batch_v1.4.py --apply (zapise do Mongo)
# Zmeny v1.4: DAVKA 7 (15JUN2026) - Molnar Martin (GASTROMART s.r.o., krok 4->5),
# Dzurikova Michaela (IBDcentrum s.r.o., krok 4->5).
# =============================================================================
import os
import sys
import base64
import hashlib
import unicodedata
import paramiko
import extract_msg
from pymongo import MongoClient
from bson import ObjectId
MONGO_URI = os.environ.get("MONGO_URI", "mongodb://192.168.1.76:27017")

# Tower (SFTP source) connection settings. Like MONGO_URI, each value can be
# overridden through an environment variable of the same name; the literals
# are kept only as backward-compatible defaults.
TOWER_HOST = os.environ.get("TOWER_HOST", "192.168.1.76")
TOWER_USER = os.environ.get("TOWER_USER", "root")
# SECURITY NOTE(review): a root password is committed in source. Provide
# TOWER_PASS through the environment, then rotate it and drop this fallback.
TOWER_PASS = os.environ.get("TOWER_PASS", "7309208104")
REMOTE_DIR = "/mnt/user/JNJEMAILS"
TMPDIR = r"u:\Dropbox\!!!Days\Downloads Z230\_cda_tmp"
STORED_AT = "2026-06-15"

# One row per investigator:
# (investigator _id, .msg filename, attachment filename, human-readable label)
# BATCH 7 (15JUN2026)
MAPPING = [
    ("6a19832b5fc221351825797f", "FC130007F372CFD10000.msg",
     "SK_CDA_Institution_GASTROMART s.r.o._fully signed 15Jun2026.pdf",
     "Molnar Martin (GASTROMART s.r.o.)"),
    ("6a19832b5fc2213518257964", "FC130007F17E55100000.msg",
     "SK_CDA PI_MUDr. Michaela Dzurikova_IBDcentrum s.r.o_13Jun2026.pdf",
     "Dzurikova Michaela (IBDcentrum s.r.o.)"),
]
# HISTORIE drivejsich davek (jiz ulozeno):
# DAVKA 6 (12JUN2026): Gregusova Katarina FC130007E9D30EB3, Drastich Pavel FC130007E9D30EB1.
# DAVKA 5 (11JUN2026): Mudr Robert FC130007DE92C232.
# DAVKA 4 (11JUN2026): Konecny Michal FC130007DE92C231, Balaz Jozef FC130007DE92C20F.
# DAVKA 3 (10JUN2026): Gregar, Durina, Horvath.
# DAVKA 1+2 (09JUN2026): Hlavaty, Fedurco, Tichy, Falc, Pesta, Jungwirthova, Lukac,
# Matous, Mihalkanin, Krizova.
def norm(s):
    """Return *s* lower-cased, without diacritics, with whitespace runs collapsed to one space.

    A falsy value (``None``, ``""``) is treated as the empty string.
    """
    decomposed = unicodedata.normalize("NFKD", s or "")
    # NFKD splits accented letters into base letter + combining mark; drop the marks.
    base_chars = [ch for ch in decomposed if not unicodedata.combining(ch)]
    return " ".join("".join(base_chars).lower().split())
def main():
    """Extract the mapped CDA attachments and, with ``--apply``, store them in Mongo.

    For every MAPPING row the .msg file is taken from TMPDIR (downloaded over
    SFTP when it is not there yet) and the matching attachment is read. A
    preview of the batch is always printed; without ``--apply`` nothing is
    written. With ``--apply`` the attachment is stored base64-encoded in the
    investigator document under ``cda.data_*`` and ``cda.soubor`` is set.
    All connections are closed even when a step fails.
    """
    apply = "--apply" in sys.argv
    os.makedirs(TMPDIR, exist_ok=True)

    # -- Phase 1: fetch the .msg files and pull out the attachments ----------
    plan = []
    ssh = paramiko.SSHClient()
    # NOTE(review): AutoAddPolicy accepts unknown host keys without verification.
    ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy())
    try:
        ssh.connect(TOWER_HOST, username=TOWER_USER, password=TOWER_PASS, timeout=30)
        sftp = ssh.open_sftp()
        try:
            for inv_id, msg_name, att_name, label in MAPPING:
                local_msg = os.path.join(TMPDIR, msg_name)
                if not os.path.exists(local_msg):
                    # Download under a temporary name so an interrupted transfer
                    # never leaves a truncated .msg that a later run would reuse.
                    partial = local_msg + ".part"
                    sftp.get(f"{REMOTE_DIR}/{msg_name}", partial)
                    os.replace(partial, local_msg)

                target = norm(att_name)
                chosen = None
                m = extract_msg.Message(local_msg)
                try:
                    for att in m.attachments:
                        name = att.longFilename or att.shortFilename or ""
                        candidate = norm(name)
                        # Match on containment in either direction; the looser
                        # "attachment name inside target" case must be a PDF.
                        if target in candidate or (
                                candidate in target and name.lower().endswith(".pdf")):
                            chosen = (name, att.data)
                            break
                finally:
                    m.close()

                if not chosen:
                    plan.append((inv_id, label, msg_name, att_name, None, "!!! PRILOHA NENALEZENA"))
                    continue
                raw = chosen[1]
                sha = hashlib.sha256(raw).hexdigest()
                plan.append((inv_id, label, msg_name, chosen[0], (len(raw), sha, raw), "OK"))
        finally:
            sftp.close()
    finally:
        ssh.close()

    # -- Phase 2: preview and, optionally, write to Mongo --------------------
    client = MongoClient(MONGO_URI)
    try:
        col = client["feasibility"]["investigators"]

        print("=== NAHLED DAVKY (CDA -> Mongo cda.data) ===\n")
        for inv_id, label, msg_name, att_name, info, status in plan:
            doc = col.find_one({"_id": ObjectId(inv_id)}, {"prijmeni": 1, "jmeno": 1, "cda.data_base64": 1})
            # Tolerates a missing document as well as a missing or null "cda".
            cda = (doc or {}).get("cda") or {}
            has = bool(cda.get("data_base64"))
            print(f"[{status}] {label} (_id {inv_id})")
            print(f" .msg: {msg_name}")
            print(f" priloha: {att_name}")
            if info:
                print(f" velikost: {info[0]} B sha256: {info[1]}")
            print(f" data_base64 jiz existuje: {has}")
            print()

        if not apply:
            print(">>> DRY-RUN. Pro zapis spust s --apply")
            return

        n = 0
        for inv_id, label, msg_name, att_name, info, status in plan:
            if status != "OK" or not info:
                print(f"PRESKAKUJI {label}: {status}")
                continue
            size, sha, raw = info
            b64 = base64.b64encode(raw).decode("ascii")
            res = col.update_one(
                {"_id": ObjectId(inv_id)},
                {"$set": {
                    "cda.data_base64": b64,
                    "cda.data_sha256": sha,
                    "cda.data_filename": att_name,
                    "cda.data_mime": "application/pdf",
                    "cda.data_size": size,
                    "cda.data_stored_at": STORED_AT,
                    "cda.data_source_msg": msg_name,
                    "cda.soubor": att_name,
                }},
            )
            if res.matched_count == 0:
                # No document has this _id, so nothing was stored; say so
                # instead of reporting the row as written.
                print(f"!!! INVESTIGATOR NENALEZEN: {label} (_id {inv_id})")
                continue
            n += res.modified_count
            print(f"ZAPSANO: {label} (modified={res.modified_count})")
        print(f"\n>>> CELKEM ZAPSANO: {n}")
    finally:
        client.close()
if __name__ == "__main__":
main()
@@ -0,0 +1,121 @@
# ============================================================
# seaweed_backfill_v1.0.py
# Verze: 1.0
# Datum: 2026-06-15
# Popis: Jednorázový backfill — nahraje do SeaweedFS Filer
# všechny dokumenty z VTMF.documents, které jsou na disku
# (downloaded=True, file!=null) ale ještě nemají seaweed_path.
# Placeholdery a záznamy bez souboru přeskočí.
# Lze spustit opakovaně — HEAD check zajistí dedup,
# přerušení kdykoli naváže příště.
# ============================================================
import hashlib
import mimetypes
import sys
import urllib.error
import urllib.request
from datetime import datetime
from pathlib import Path
from pymongo import MongoClient, ASCENDING
MONGO_URI = "mongodb://192.168.1.76:27017"
MONGO_DB = "VTMF"
MONGO_COLL = "documents"
SEAWEED_FILER = "http://192.168.1.50:8888"
SEAWEED_PREFIX = "/vtmf-documents"
def log(msg):
    """Print *msg* on its own line and flush stdout right away."""
    print(msg, end="\n", flush=True)
def sw_path(sha256):
    """Return the Filer path for a content hash: prefix / first two hex chars / next two / full hash."""
    shards = f"{sha256[:2]}/{sha256[2:4]}"
    return f"{SEAWEED_PREFIX}/{shards}/{sha256}"
def seaweed_store(data, mime="application/octet-stream"):
    """Store *data* in the SeaweedFS Filer under its content hash, unless it is already there.

    A HEAD request checks whether the object exists; only a 404 answer leads
    to a PUT upload. Any other HTTP error is re-raised.

    Args:
        data: Raw bytes to store.
        mime: Value sent as the ``Content-Type`` header of the upload.

    Returns:
        ``(path, url, uploaded)`` where *uploaded* is ``False`` when the object
        already existed (dedup hit) and ``True`` when it was uploaded now.
    """
    sha256 = hashlib.sha256(data).hexdigest()
    path = sw_path(sha256)
    url = SEAWEED_FILER + path

    head = urllib.request.Request(url, method="HEAD")
    try:
        # The response is used as a context manager so it is closed right away.
        with urllib.request.urlopen(head, timeout=10):
            pass
    except urllib.error.HTTPError as e:
        if e.code != 404:
            raise
    else:
        return path, url, False  # dedup hit

    put = urllib.request.Request(url, data=data, method="PUT",
                                 headers={"Content-Type": mime})
    with urllib.request.urlopen(put, timeout=120):
        pass
    return path, url, True
def main():
    """Upload every downloaded, not-yet-synced VTMF document to SeaweedFS and record it in Mongo.

    Selects documents that are downloaded, are not placeholders, have a file
    path and no ``seaweed_path`` yet. Each one is read from disk, stored via
    ``seaweed_store`` and its Mongo record updated. A failure on one document
    is logged and counted and the loop continues. The process exits with
    status 1 when at least one document failed, otherwise 0.
    """
    client = MongoClient(MONGO_URI, serverSelectionTimeoutMS=5000)
    client.admin.command("ping")
    coll = client[MONGO_DB][MONGO_COLL]
    log(f"[ok] Mongo připojeno: {MONGO_URI} / {MONGO_DB}.{MONGO_COLL}")

    pending_filter = {
        "downloaded": True,
        "placeholder": {"$ne": True},
        "seaweed_path": None,
        "file": {"$ne": None},
    }
    sort_order = [("vtmf", ASCENDING), ("version", ASCENDING)]
    todo = list(coll.find(pending_filter).sort(sort_order))
    total = len(todo)
    log(f"[i] Ke zpracování: {total} dokumentů\n")

    uploaded = 0
    dedup = 0
    skipped = 0
    failed = 0
    for n, doc in enumerate(todo, 1):
        key = doc["_id"]
        path = doc.get("file")
        log(f"[{n}/{total}] {key}")

        # Guard: the record names a file that is not on disk.
        file_present = bool(path) and Path(path).exists()
        if not file_present:
            log(f" [!] Soubor nenalezen na disku: {path} — přeskočeno.")
            skipped += 1
            continue

        try:
            data = Path(path).read_bytes()
            guessed_mime = mimetypes.guess_type(path)[0]
            mime = guessed_mime or "application/octet-stream"
            sha256_hex = hashlib.sha256(data).hexdigest()
            sw_p, sw_url, was_new = seaweed_store(data, mime)
            sync_fields = {
                "sha256": sha256_hex,
                "seaweed_path": sw_p,
                "seaweed_url": sw_url,
                "seaweed_synced_at": datetime.now(),
            }
            coll.update_one({"_id": key}, {"$set": sync_fields})
            if not was_new:
                dedup += 1
                log(f" [i] Dedup hit → {sw_p}")
            else:
                uploaded += 1
                log(f" [ok] Nahráno → {sw_p}")
        except Exception as e:
            # Best effort: one bad document must not stop the backfill.
            failed += 1
            log(f" [!] Chyba: {e}")

    separator = f"{'='*60}"
    log(f"\n{'='*60}")
    log(f" Hotovo: {uploaded} nahráno, {dedup} dedup, "
        f"{skipped} bez souboru, {failed} chyb.")
    log(separator)
    sys.exit(1 if failed else 0)
if __name__ == "__main__":
main()
@@ -0,0 +1,102 @@
# vtmf_pipeline_v1.3 — Kompletní V-TMF workflow (report → Mongo → download)
**Verze:** 1.3 · **Datum:** 2026-06-12
**Změny v1.1:** oprava tichého selhání — výjimka kteréhokoli kroku se
vypíše jako „PIPELINE SELHALA" + exit kód 2 (v1.0 končila zavádějícím
souhrnem „0 staženo, 0 chyb"). Export reportu robustnější: menu ⋯,
položka Export to Excel i tlačítko Export se hledají přes víc selektorů
a ve všech frames; při nenalezení se automaticky uloží diagnostika
stránky do debug/<čas>_report_* (screenshot, HTML všech frames, výpis
title/aria-label atributů) — z ní se dá určit přesný selektor.
**Změny v1.2:** selektory exportu ověřené na živém DOM (Claude in
Chrome; žádný iframe na celé stránce): menu ⋯ =
`.actionMenuContainer .dropDown.vv_dropdown_toggle button.vv-icon-button`
(button má prázdný title!); menu se načítá asynchronně (AJAX) →
po kliknutí se čeká na položku `a.ReportAction[data-action-name='ExcelExport']`;
„Data Only" = radio `name=requiredRadioField value=STANDARD`, defaultně
checked (pojistka přes .check()); tlačítko Export = React `<button>`
s emotion class hash → selektovat jen přes roli+text.
**Změny v1.3:** na konci běhu se prohlížeč i konzole zavřou
automaticky (žádné čekání na ENTER); interaktivní vstup zůstává jen
u 2FA a u ručně nezavřitelného dialogu.
Jeden běh skriptu udělá celé workflow pro studii 77242113UCO3001:
1. **Login** do vtmf.veevavault.com (persistentní profil
`vault_profile/`, J&J SSO, případné 2FA potvrdíte na telefonu
+ ENTER; údaje z `.env` v rootu projektu).
2. **Export reportu** „Document Inventory Report - Study Level"
(přímá URL s ID reportu `0RP000000000182` a filtrem studie
`0ST000000137008`) → menu ⋯ → Export to Excel → Data Only →
uloží se s timestampem do `WhatToDownload/`, po zpracování se
přesune do `WhatToDownload/Zpracovano/`.
3. **Parse + sync do MongoDB** — Tower `mongodb://192.168.1.76:27017`,
db **VTMF**, kolekce **documents**, klíč `_id = "VTMF-xxx|vY.Z"`
(VTMF číslo + verze, unikátní index na dvojici):
- nový dokument → založí se (first_seen, deleted=False,
downloaded=False),
- změna sledovaných polí (name, status, type, subtype, desc,
date, url, studies) → promítne se + záznam do `history[]`
(timestamp + old/new),
- dokument chybí v reportu → `deleted=True, deleted_at` a stažený
soubor se přejmenuje s ` [D]` před příponou,
- dokument se vrátí do reportu → `deleted=False` a ` [D]`
se ze souboru zase odebere.
Výsledná sada = záznamy s `deleted=False`.
4. **Stažení chybějících** — všechny `deleted=False, downloaded≠True`:
doc URL → Source File → uložení do
`U:\Dropbox\!!!Days\Downloads Z230\VTMF-77242113UCO3001\<Type>\<Subtype>\`
jako `YYYY-MM-DD Description [VTMF-xxx] [vY.Z].<skutečná přípona>`.
Výsledek (cesta, čas, případně chyba) se ihned zapisuje do Mongo —
běh jde kdykoli přerušit a příště naváže.
## Mongo schéma (kolekce documents)
```
_id: "VTMF-19077748|v1.0"
vtmf, version, url, name, status, type, subtype, desc, date, studies
first_seen, last_seen # kdy poprvé/naposledy v reportu
deleted, deleted_at # není ve výsledné sadě reportu
downloaded, file, downloaded_at
last_error, error_at # poslední chyba stahování
history: [{ts, changes: {pole: {old, new}}}]
```
## Migrace starého stavu
Při prvním běhu se `download_state.csv` (z download_vault v2.x)
jednorázově namigruje: záznamy `ok` se k odpovídajícímu VTMF zapíší
jako `downloaded=True` + cesta. CSV se přejmenuje na
`download_state.csv.imported`.
## Konfigurace (konstanty nahoře)
- `REPORT_URL` — ID reportu + filtr studie (pro jinou studii se mění
jen tato dvě ID)
- `LIMIT` — None = stáhnout vše zbývající; číslo = dávka na běh
- `MONGO_URI/DB/COLL`, `DOWNLOAD_ROOT`, `EXCEL_DIR`
- `TRACKED_FIELDS`, `MAX_ATTEMPTS`, `RETRY_PAUSE_MS`, `BETWEEN_DOCS_MS`
## Ověřené technické detaily (nesahat bez ověření)
- Maintenance dialog: zavírat POUZE přes `.ui-dialog a.ok.vv_button`
(křížek `.ui-dialog-titlebar-close` je display:none); objevuje se
se zpožděním → wait_for visible 8 s (home) / 2-4 s (jinde).
- Report Excel má rozbité deklarované rozměry → přímá iterace řádků.
- Document Name/Number/Status jsou =HYPERLINK vzorce → regex.
- Export kliknout právě jednou; 503/redirecty v network logu
ignorovat, rozhoduje expect_download.
## Spuštění
```powershell
& "U:\PythonProject\Janssen\.venv\Scripts\python.exe" "U:\PythonProject\Janssen\VTMFDownloadFiles\vtmf_pipeline_v1.3.py"
```
Předchůdce: download_vault v1.x–v2.1 (TRASH/).
@@ -0,0 +1,112 @@
# vtmf_pipeline_v1.4 — Kompletní V-TMF workflow (report → Mongo → download)
**Verze:** 1.4 · **Datum:** 2026-06-15
**Změny v1.1:** oprava tichého selhání — výjimka kteréhokoli kroku se
vypíše jako „PIPELINE SELHALA" + exit kód 2 (v1.0 končila zavádějícím
souhrnem „0 staženo, 0 chyb"). Export reportu robustnější: menu ⋯,
položka Export to Excel i tlačítko Export se hledají přes víc selektorů
a ve všech frames; při nenalezení se automaticky uloží diagnostika
stránky do debug/<čas>_report_* (screenshot, HTML všech frames, výpis
title/aria-label atributů) — z ní se dá určit přesný selektor.
**Změny v1.2:** selektory exportu ověřené na živém DOM (Claude in
Chrome; žádný iframe na celé stránce): menu ⋯ =
`.actionMenuContainer .dropDown.vv_dropdown_toggle button.vv-icon-button`
(button má prázdný title!); menu se načítá asynchronně (AJAX) →
po kliknutí se čeká na položku `a.ReportAction[data-action-name='ExcelExport']`;
„Data Only" = radio `name=requiredRadioField value=STANDARD`, defaultně
checked (pojistka přes .check()); tlačítko Export = React `<button>`
s emotion class hash → selektovat jen přes roli+text.
**Změny v1.3:** na konci běhu se prohlížeč i konzole zavřou
automaticky (žádné čekání na ENTER); interaktivní vstup zůstává jen
u 2FA a u ručně nezavřitelného dialogu.
**Změny v1.4:** detekce placeholder dokumentů — Vault zobrazuje text
„This placeholder has no content", dokument nemá žádný Source File ke
stažení. Při detekci se zapíše `placeholder=True, downloaded=True` do
Mongo a dokument se přeskočí bez chyby. Souhrn na konci běhu uvádí
počet placeholderů zvlášť.
Jeden běh skriptu udělá celé workflow pro studii 77242113UCO3001:
1. **Login** do vtmf.veevavault.com (persistentní profil
`vault_profile/`, J&J SSO, případné 2FA potvrdíte na telefonu
+ ENTER; údaje z `.env` v rootu projektu).
2. **Export reportu** „Document Inventory Report - Study Level"
(přímá URL s ID reportu `0RP000000000182` a filtrem studie
`0ST000000137008`) → menu ⋯ → Export to Excel → Data Only →
uloží se s timestampem do `WhatToDownload/`, po zpracování se
přesune do `WhatToDownload/Zpracovano/`.
3. **Parse + sync do MongoDB** — Tower `mongodb://192.168.1.76:27017`,
db **VTMF**, kolekce **documents**, klíč `_id = "VTMF-xxx|vY.Z"`
(VTMF číslo + verze, unikátní index na dvojici):
- nový dokument → založí se (first_seen, deleted=False,
downloaded=False),
- změna sledovaných polí (name, status, type, subtype, desc,
date, url, studies) → promítne se + záznam do `history[]`
(timestamp + old/new),
- dokument chybí v reportu → `deleted=True, deleted_at` a stažený
soubor se přejmenuje s ` [D]` před příponou,
- dokument se vrátí do reportu → `deleted=False` a ` [D]`
se ze souboru zase odebere.
Výsledná sada = záznamy s `deleted=False`.
4. **Stažení chybějících** — všechny `deleted=False, downloaded≠True`:
doc URL → Source File → uložení do
`U:\Dropbox\!!!Days\Downloads Z230\VTMF-77242113UCO3001\<Type>\<Subtype>\`
jako `YYYY-MM-DD Description [VTMF-xxx] [vY.Z].<skutečná přípona>`.
Výsledek (cesta, čas, případně chyba) se ihned zapisuje do Mongo —
běh jde kdykoli přerušit a příště naváže.
Placeholder dokumenty (stránka s textem „This placeholder has no
content") se přeskočí a označí `placeholder=True, downloaded=True`.
## Mongo schéma (kolekce documents)
```
_id: "VTMF-19077748|v1.0"
vtmf, version, url, name, status, type, subtype, desc, date, studies
first_seen, last_seen # kdy poprvé/naposledy v reportu
deleted, deleted_at # není ve výsledné sadě reportu
downloaded, file, downloaded_at
placeholder # True = Vault placeholder bez obsahu
last_error, error_at # poslední chyba stahování
history: [{ts, changes: {pole: {old, new}}}]
```
## Migrace starého stavu
Při prvním běhu se `download_state.csv` (z download_vault v2.x)
jednorázově namigruje: záznamy `ok` se k odpovídajícímu VTMF zapíší
jako `downloaded=True` + cesta. CSV se přejmenuje na
`download_state.csv.imported`.
## Konfigurace (konstanty nahoře)
- `REPORT_URL` — ID reportu + filtr studie (pro jinou studii se mění
jen tato dvě ID)
- `LIMIT` — None = stáhnout vše zbývající; číslo = dávka na běh
- `MONGO_URI/DB/COLL`, `DOWNLOAD_ROOT`, `EXCEL_DIR`
- `TRACKED_FIELDS`, `MAX_ATTEMPTS`, `RETRY_PAUSE_MS`, `BETWEEN_DOCS_MS`
## Ověřené technické detaily (nesahat bez ověření)
- Maintenance dialog: zavírat POUZE přes `.ui-dialog a.ok.vv_button`
(křížek `.ui-dialog-titlebar-close` je display:none); objevuje se
se zpožděním → wait_for visible 8 s (home) / 2-4 s (jinde).
- Report Excel má rozbité deklarované rozměry → přímá iterace řádků.
- Document Name/Number/Status jsou =HYPERLINK vzorce → regex.
- Export kliknout právě jednou; 503/redirecty v network logu
ignorovat, rozhoduje expect_download.
- Placeholder detekce: `page.locator("div.vv_placeholder_text")` (uvnitř
`div.vv_placeholder_pane > div.vv_placeholder_container > div.vv-placeholder-drag-and-drop-container`)
se testuje před hledáním Source File ikony — CSS selektor je spolehlivější
než text match.
## Spuštění
```powershell
& "U:\PythonProject\Janssen\.venv\Scripts\python.exe" "U:\PythonProject\Janssen\VTMFDownloadFiles\vtmf_pipeline_v1.4.py"
```
Předchůdce: vtmf_pipeline_v1.3 (TRASH/).
@@ -0,0 +1,864 @@
# ============================================================
# vtmf_pipeline_v1.4.py
# Verze: 1.4
# Datum: 2026-06-15
# Popis: Kompletní workflow V-TMF (J&J Veeva Vault), studie
# 77242113UCO3001. Jeden běh udělá:
# 1) login do Vaultu (persistentní session + ruční 2FA),
# 2) export reportu "Document Inventory Report - Study
# Level" do Excelu (Data Only) do WhatToDownload/,
# 3) parse reportu a synchronizaci do MongoDB
# (Tower, db VTMF, kolekce documents,
# klíč = VTMF číslo + verze):
# - nové dokumenty se založí,
# - změny polí se promítnou (+ history[]),
# - dokumenty chybějící v reportu se označí
# deleted=True a stažený soubor dostane ' [D]',
# - znovuobjevené se vzkřísí a ' [D]' se odebere,
# 4) stažení všech dosud nestažených dokumentů do
# U:\Dropbox\!!!Days\Downloads Z230\VTMF-77242113UCO3001\
# <Type>\<Subtype>\"YYYY-MM-DD Description
# [VTMF-x] [v1.0].<přípona>" + zápis stavu do Mongo.
#
# Tracking stahování je KOMPLETNĚ v Mongo; starý
# download_state.csv se při prvním běhu jednorázově
# namigruje a přejmenuje na .imported.
#
# Vychází z download_vault_v2.1 (v TRASH/) — login, dialogy
# a stahování beze změny; nové jsou kroky 2 a 3.
#
# v1.1: oprava tichého selhání — chyba kteréhokoli kroku se teď
# hlasitě vypíše (a exit kód 2), místo aby běh skončil
# souhrnem "0 staženo, 0 chyb". Export reportu: více
# selektorů pro menu ⋯ i položku Export to Excel (včetně
# hledání ve všech frames) a při selhání automatický záchyt
# diagnostiky stránky do debug/ (screenshot + HTML frames).
# v1.2: selektory exportu OVĚŘENÉ na živém DOM (žádný iframe):
# menu ⋯ = .actionMenuContainer .dropDown.vv_dropdown_toggle
# button.vv-icon-button (title prázdný!); menu se načítá
# asynchronně -> čekat na položku; položka =
# a.ReportAction[data-action-name='ExcelExport']; Data Only =
# radio name=requiredRadioField value=STANDARD (default
# checked); Export = <button> role+text (emotion class hash,
# neselektovat podle tříd).
# v1.3: na konci běhu se prohlížeč i okno zavře automaticky
# (žádné čekání na ENTER) — vhodné pro bezobslužné běhy.
# Interaktivní vstupy zůstávají jen tam, kde jsou nutné
# (2FA, ručně nezavřitelný dialog).
# v1.4: detekce placeholder dokumentů — stránka s textem
# "This placeholder has no content" se přeskočí
# (placeholder=True, downloaded=True v Mongo), žádná chyba.
#
# Heslo se NIKDY nedává natvrdo do skriptu — čte se z .env
# v rootu projektu Janssen (VAULT_USER / VAULT_PASS).
# ============================================================
import csv
import os
import re
import sys
from datetime import datetime
from pathlib import Path
from playwright.sync_api import sync_playwright, TimeoutError as PWTimeout
from pymongo import MongoClient, ASCENDING
# --- Configuration -----------------------------------------------------
# SSO login page opened by login_if_needed().
LOGIN_URL = ("https://fedlogin.jnj.com/idp/eyJ2c2lkIjoiam5qX3ZlZXZhIn0/"
             "startSSO.ping?PartnerSpId=janssenetmf.veevavault.com"
             "&IdpAdapterId=CompIWALDAPEXTFORM"
             "&TargetResource=https%3A%2F%2Fvtmf.veevavault.com%2F")
# "Document Inventory Report - Study Level" report, filtered to the study
REPORT_URL = ("https://vtmf.veevavault.com/ui/#reporting/viewer/"
              "0RP000000000182?study__v%2C%2C%2CIN=0ST000000137008")
VAULT_UI_PATTERN = "**vtmf.veevavault.com/ui**"  # URL glob: we are inside the Vault
SCRIPT_DIR = Path(__file__).resolve().parent
PROFILE_DIR = SCRIPT_DIR / "vault_profile"  # persistent browser session
ENV_FILE = SCRIPT_DIR.parent / ".env"  # root of the Janssen project
DEBUG_DIR = SCRIPT_DIR / "debug"  # diagnostic output
EXCEL_DIR = SCRIPT_DIR / "WhatToDownload"  # downloaded reports
PROCESSED_DIR = EXCEL_DIR / "Zpracovano"  # archive of processed reports
OLD_STATE_FILE = SCRIPT_DIR / "download_state.csv"  # legacy CSV (migration)
DOWNLOAD_ROOT = Path(r"U:\Dropbox\!!!Days\Downloads Z230\VTMF-77242113UCO3001")
MONGO_URI = "mongodb://192.168.1.76:27017"
MONGO_DB = "VTMF"
MONGO_COLL = "documents"
# How many documents to download in this run. Any falsy value (None or 0)
# means "all remaining": download_missing() only slices when LIMIT is truthy.
LIMIT = 0
# Report fields whose changes are applied and versioned into history[]
TRACKED_FIELDS = ("name", "status", "type", "subtype", "desc",
                  "date", "url", "studies")
MAX_ATTEMPTS = 2  # attempts per document
RETRY_PAUSE_MS = 5000  # pause before a retry
BETWEEN_DOCS_MS = 500  # pause between documents
class PlaceholderDocument(Exception):
    """Raised when a Vault document page is only a placeholder.

    download_source_file() raises it when the placeholder text element is
    visible; download_missing() catches it and records the document as a
    skipped placeholder instead of a failure.
    """
def log(msg):
    """Print *msg* on stdout and flush immediately so output is not buffered."""
    print(msg, end="\n", flush=True)
def load_env_file(path):
    """Load KEY=VALUE lines from a .env file into os.environ.

    Blank lines, lines starting with '#', and lines without '=' are
    skipped. Keys and values are stripped of surrounding whitespace; values
    additionally lose surrounding double and single quotes. Variables that
    already exist in the environment are never overwritten, and empty
    values are not set. A missing file is only logged.
    """
    if not path.exists():
        log(f"[!] .env nenalezen: {path}")
        return

    for raw_line in path.read_text(encoding="utf-8").splitlines():
        entry = raw_line.strip()
        is_comment_or_blank = not entry or entry.startswith("#")
        if is_comment_or_blank or "=" not in entry:
            continue
        name, _, raw_value = entry.partition("=")
        name = name.strip()
        content = raw_value.strip().strip('"').strip("'")
        if content:
            # setdefault keeps a value that is already in the environment
            os.environ.setdefault(name, content)
# Comment line written above the KEY= template lines in .env
ENV_SECTION_HEADER = "# --- Veeva Vault (J&J V-TMF) — VTMFDownloadFiles/download_vault ---"
# Environment variables that must be non-empty before the pipeline can log in
ENV_KEYS = ("VAULT_USER", "VAULT_PASS")


def ensure_credentials():
    """Make sure VAULT_USER and VAULT_PASS are available, or stop the script.

    Loads ENV_FILE first and returns normally when every key in ENV_KEYS is
    set to a non-empty value. Otherwise a template .env is created (no file
    yet) or the KEY= lines not present in the file are appended (existing
    file), the user is told where to fill in the values, and the process
    exits with status 1.
    """
    load_env_file(ENV_FILE)
    if not any(not os.environ.get(name) for name in ENV_KEYS):
        return

    env_present = ENV_FILE.exists()
    current_text = ENV_FILE.read_text(encoding="utf-8") if env_present else ""

    # Only keys with no "KEY =" line at all get a template line; a key that
    # is present but empty is left for the user to fill in.
    template_lines = []
    for name in ENV_KEYS:
        if re.search(rf"^\s*{name}\s*=", current_text, re.M) is None:
            template_lines.append(f"{name}=")
    section = ENV_SECTION_HEADER + "\n" + "\n".join(template_lines) + "\n"

    if not env_present:
        ENV_FILE.write_text(
            "# .env — lokální přihlašovací údaje (NEVERZOVAT, je v .gitignore)\n\n"
            + section,
            encoding="utf-8")
        log(f"[i] Založil jsem nový .env: {ENV_FILE}")
    elif template_lines:
        with open(ENV_FILE, "a", encoding="utf-8") as handle:
            handle.write("\n" + section)
        log(f"[i] Doplnil jsem chybějící řádky do .env: {ENV_FILE}")

    rule = "=" * 60
    banner = (
        "\n" + rule,
        " CHYBÍ PŘIHLAŠOVACÍ ÚDAJE.",
        " Doplň VAULT_USER a VAULT_PASS do souboru:",
        f" {ENV_FILE}",
        " a spusť skript znovu.",
        rule,
    )
    for text in banner:
        print(text)
    sys.exit(1)
# --- Excel parsing -----------------------------------------------------
# Captures (url, display text) from an Excel HYPERLINK("url", "text") formula.
HYPERLINK_RE = re.compile(r'HYPERLINK\("([^"]+)"\s*,\s*"([^"]+)"\)')
# Captures a trailing parenthesised version such as "(v1.0)" at the end of a string.
VERSION_RE = re.compile(r"\((v[^)]+)\)\s*$")
# Characters not allowed in Windows file names, plus ASCII control characters
BAD_CHARS_RE = re.compile(r"[<>:\"/\\|?*\x00-\x1f]")


def clean_filename(s):
    """Turn *s* into a name usable as a Windows file or folder name.

    Forbidden and control characters become '_', whitespace runs collapse
    to a single space, underscore runs collapse to a single underscore, and
    leading/trailing spaces, dots and underscores are removed.
    """
    text = BAD_CHARS_RE.sub("_", str(s))
    collapse_rules = (
        (r"\s+", " "),     # whitespace runs -> one space
        (r"_{2,}", "_"),   # underscore runs -> one underscore
    )
    for pattern, replacement in collapse_rules:
        text = re.sub(pattern, replacement, text)
    return text.strip(" ._")
def display_text(cell):
    """Return the text a cell displays.

    For a HYPERLINK formula this is the formula's second argument
    (stripped); otherwise it is the stripped string form of the cell value,
    with an empty/None value giving "".
    """
    content = str(cell.value or "").strip()
    formula = HYPERLINK_RE.search(content)
    if formula is None:
        return content
    return formula.group(2).strip()
def extract_doc_url(raw):
    """Pull the clean document URL out of *raw*.

    *raw* may be a HYPERLINK formula or a URL with extra text around it;
    the first occurrence of https://<host>/ui/#doc_info/<id>/<major>/<minor>
    is returned.

    Raises:
        ValueError: when no such URL is present.
    """
    doc_url_pattern = r"(https://[^/\"]+/ui/#doc_info/\d+/\d+/\d+)"
    hit = re.search(doc_url_pattern, str(raw))
    if hit is None:
        raise ValueError(f"Nenašel jsem doc URL v: {raw!r}")
    return hit.group(1)
def read_documents_from_excel(path):
    """Parse the exported .xlsx report into a list of document dicts.

    Each dict has the keys: vtmf, version, url, name, status, type,
    subtype, desc, date, studies. Document Name/Number/Status cells are
    HYPERLINK formulas, so both URL and display text are taken by regex.
    Rows are read by direct iteration (the file header notes the report's
    declared dimensions are broken).

    Rows with an empty Document Number are skipped silently; rows without a
    usable document URL are skipped and counted in the log line.

    Raises:
        RuntimeError: when the sheet has no header row, or an expected
            column is missing from the header.
    """
    from openpyxl import load_workbook

    log(f"[i] Parsování reportu: {path.name}")
    wb = load_workbook(path, data_only=False)  # formulas are needed, not cached values
    ws = wb[wb.sheetnames[0]]
    rows = ws.iter_rows()

    # An empty worksheet yields no rows at all; fail with a readable message
    # instead of letting a bare StopIteration escape.
    header_row = next(rows, None)
    if header_row is None:
        raise RuntimeError("Report je prázdný — chybí řádek se záhlavím.")
    header = [c.value for c in header_row]

    try:
        i_num = header.index("Document Number")
        i_name = header.index("Document Name")
        i_status = header.index("Document Status")
        i_type = header.index("Type")
        i_sub = header.index("Subtype")
        i_desc = header.index("Description")
        i_date = header.index("Document Date")
        i_study = header.index("Study")
    except ValueError as e:
        # keep the original ValueError as the cause for debugging
        raise RuntimeError(f"V reportu chybí očekávaný sloupec: {e}") from e

    docs, bad = [], []
    for row in rows:
        cell = row[i_num]
        if cell.value is None:
            continue
        raw = str(cell.value)
        m = HYPERLINK_RE.search(raw)
        if m:
            url_raw, vtmf = m.group(1), m.group(2)
        elif cell.hyperlink:  # a real hyperlink instead of a formula
            url_raw, vtmf = cell.hyperlink.target, raw
        else:
            bad.append(raw)
            continue
        try:
            url = extract_doc_url(url_raw)
        except ValueError:
            bad.append(raw)
            continue

        name = display_text(row[i_name])
        vm = VERSION_RE.search(name)
        version = vm.group(1) if vm else "v?"
        desc = clean_filename(display_text(row[i_desc]))
        if not desc:
            # fallback: Document Name without the trailing version
            # (the version is appended separately in the file name)
            desc = clean_filename(VERSION_RE.sub("", name))
        date = row[i_date].value  # datetime or None
        docs.append({
            "vtmf": vtmf.strip(),
            "version": version,
            "url": url,
            "name": name,
            "status": display_text(row[i_status]),
            "type": clean_filename(display_text(row[i_type])),
            "subtype": clean_filename(display_text(row[i_sub])),
            "desc": desc,
            # anything that cannot be formatted as a date is stored as None
            "date": date if hasattr(date, "strftime") else None,
            "studies": display_text(row[i_study]),
        })

    log(f"[i] Načteno {len(docs)} dokumentů"
        + (f", {len(bad)} řádků bez použitelné URL (přeskočeno)" if bad else ""))
    return docs
def build_target_path(doc, suggested_filename):
    """Compose the destination path for a downloaded document.

    Layout: DOWNLOAD_ROOT / type / subtype /
    "YYYY-MM-DD Description [VTMF-xxx] [v1.0].<ext>", where <ext> is the
    suffix of *suggested_filename*. The date prefix and the version tag are
    left out when the document has no date / no version.
    """
    extension = Path(suggested_filename).suffix  # real extension, dot included
    doc_date = doc["date"]

    parts = []
    if doc_date:
        parts.append(doc_date.strftime("%Y-%m-%d") + " ")
    version_tag = f" [{doc['version']}]" if doc.get("version") else ""
    parts.append(f"{doc['desc']} [{doc['vtmf']}]")
    parts.append(version_tag)
    parts.append(extension)

    return DOWNLOAD_ROOT / doc["type"] / doc["subtype"] / "".join(parts)
def deleted_marker_path(path):
    """Return *path* with the deletion marker before the extension.

    Example: 'x.pdf' -> 'x [D].pdf'. Only the name changes; the parent
    directory is kept.
    """
    original = Path(path)
    marked_name = "{} [D]{}".format(original.stem, original.suffix)
    return original.with_name(marked_name)
# --- MongoDB synchronizace ---------------------------------------------
def doc_key(vtmf, version):
    """Build the Mongo _id of a document: "<vtmf>|<version>"."""
    return "{}|{}".format(vtmf, version)
def get_collection():
    """Connect to MongoDB and return the documents collection.

    The connection is verified with a ping (server selection timeout 5 s),
    and two indexes are ensured: a unique one on (vtmf, version) and a
    plain one on (deleted, downloaded).
    """
    mongo = MongoClient(MONGO_URI, serverSelectionTimeoutMS=5000)
    mongo.admin.command("ping")  # fail early if the server is unreachable
    documents = mongo[MONGO_DB][MONGO_COLL]

    identity_index = [("vtmf", ASCENDING), ("version", ASCENDING)]
    documents.create_index(identity_index, unique=True)

    pending_index = [("deleted", ASCENDING), ("downloaded", ASCENDING)]
    documents.create_index(pending_index)
    return documents
def migrate_old_csv(coll):
    """One-off import of the legacy download_state.csv into Mongo.

    Every CSV row whose result is 'ok' marks one matching record (same
    vtmf, not deleted, not yet downloaded) as downloaded, copying the file
    path and timestamp from the CSV. Afterwards the CSV is renamed to
    download_state.csv.imported. Does nothing when the CSV does not exist.
    """
    if not OLD_STATE_FILE.exists():
        return

    imported = 0
    with open(OLD_STATE_FILE, newline="", encoding="utf-8") as handle:
        for entry in csv.DictReader(handle):
            if entry["result"] == "ok":
                query = {"vtmf": entry["vtmf"], "deleted": False,
                         "downloaded": {"$ne": True}}
                change = {"$set": {"downloaded": True, "file": entry["file"],
                                   "downloaded_at": entry["timestamp"]}}
                outcome = coll.update_one(query, change)
                imported += outcome.modified_count

    OLD_STATE_FILE.rename(OLD_STATE_FILE.with_suffix(".csv.imported"))
    log(f"[i] Migrace download_state.csv -> Mongo: {imported} záznamů; "
        f"CSV přejmenováno na .imported")
def undeleted_path(path):
    """Inverse of the ' [D]' deletion marker: 'x [D].pdf' -> 'x.pdf'.

    A path whose stem does not end with the marker (or consists of the
    marker only) is returned unchanged, as a Path.
    """
    p = Path(path)
    marker = " [D]"
    stem = p.stem
    if stem.endswith(marker) and len(stem) > len(marker):
        return p.with_name(stem[:-len(marker)] + p.suffix)
    return p


def sync_report_to_mongo(coll, docs):
    """Apply the current report to the documents collection.

    Key = (vtmf, version). For every report row:
      - unknown key  -> insert a new record,
      - known key    -> write the report fields; differences in
                        TRACKED_FIELDS are appended to history[],
      - record was deleted -> it is resurrected and the ' [D]' marker is
                        removed from the downloaded file again.
    Records that are not deleted but are missing from the report are marked
    deleted=True and their downloaded file is renamed with ' [D]'.

    Args:
        coll: pymongo collection holding the document records.
        docs: list of dicts produced by read_documents_from_excel().

    Returns:
        dict with the counters new, updated, unchanged, resurrected and
        marked_deleted. A resurrected record is also counted as updated or
        unchanged.
    """
    now = datetime.now()
    stats = {"new": 0, "updated": 0, "unchanged": 0,
             "resurrected": 0, "marked_deleted": 0}
    current_keys = set()

    for d in docs:
        key = doc_key(d["vtmf"], d["version"])
        current_keys.add(key)
        existing = coll.find_one({"_id": key})
        if existing is None:
            coll.insert_one({
                "_id": key, **d,
                "first_seen": now, "last_seen": now,
                "deleted": False, "downloaded": False,
                "file": None, "history": [],
            })
            stats["new"] += 1
            continue

        changes = {fld: {"old": existing.get(fld), "new": d.get(fld)}
                   for fld in TRACKED_FIELDS
                   if existing.get(fld) != d.get(fld)}
        update = {"$set": {**d, "last_seen": now, "deleted": False}}
        if changes:
            update["$push"] = {"history": {"ts": now, "changes": changes}}
            stats["updated"] += 1
        else:
            stats["unchanged"] += 1

        if existing.get("deleted"):
            # The document is back in the report -> take ' [D]' off the file.
            stats["resurrected"] += 1
            old_file = existing.get("file")
            if old_file:
                # The deletion branch below stores the *marked* path in
                # "file", so the unmarked name has to be derived from it;
                # marking the stored path again would look for
                # 'x [D] [D].pdf' and never find it.
                restored = undeleted_path(old_file)
                marked = deleted_marker_path(restored)
                if marked.exists() and not restored.exists():
                    try:
                        marked.rename(restored)
                        log(f"[i] {key}: soubor vrácen z ' [D]' zpět.")
                    except OSError as e:
                        log(f"[!] {key}: odebrání ' [D]' selhalo: {e}")
                # Point the record at the unmarked file only when it is
                # really there; otherwise keep what was stored.
                current_file = restored if restored.exists() else old_file
                update["$set"]["file"] = str(current_file)
        coll.update_one({"_id": key}, update)

    # Records that are not in the current report -> deleted + ' [D]'
    for rec in coll.find({"deleted": False}):
        if rec["_id"] in current_keys:
            continue
        upd = {"deleted": True, "deleted_at": now}
        f = rec.get("file")
        if f and Path(f).exists():
            marked = deleted_marker_path(f)
            try:
                Path(f).rename(marked)
                upd["file"] = str(marked)
                log(f"[i] {rec['_id']}: soubor označen ' [D]'.")
            except OSError as e:
                log(f"[!] {rec['_id']}: přejmenování na [D] selhalo: {e}")
        coll.update_one({"_id": rec["_id"]},
                        {"$set": upd,
                         "$push": {"history": {"ts": now,
                                               "changes": {"deleted": {
                                                   "old": False,
                                                   "new": True}}}}})
        stats["marked_deleted"] += 1

    log(f"[ok] Mongo sync: {stats['new']} nových, {stats['updated']} změněných, "
        f"{stats['unchanged']} beze změny, {stats['resurrected']} obnovených, "
        f"{stats['marked_deleted']} označených deleted.")
    return stats
# --- Přihlášení --------------------------------------------------------
def submit_login_form(page, password_box):
    """Submit the login form.

    Tries, in order, a "Sign On" button, a "Log in"/"Sign in" button, a
    submit input, a submit button and an "OK" button; the first one that
    exists and is visible is clicked. If none can be used (or using it
    raises), Enter is pressed in the password field instead.
    """
    submit_locators = (
        page.get_by_role("button", name=re.compile("sign\\s*on", re.I)),
        page.get_by_role("button", name=re.compile("log\\s*in|sign\\s*in", re.I)),
        page.locator("input[type='submit']"),
        page.locator("button[type='submit']"),
        page.get_by_role("button", name=re.compile("^ok$", re.I)),
    )
    for candidate in submit_locators:
        try:
            if not (candidate.count() and candidate.first.is_visible()):
                continue
            caption = (candidate.first.inner_text() or
                       candidate.first.get_attribute("value") or "submit").strip()
            log(f"[i] Odesílám formulář tlačítkem '{caption}'...")
            candidate.first.click()
            return
        except Exception:
            # best effort: any problem with this candidate -> try the next
            continue

    log("[i] Tlačítko nenalezeno, odesílám Enterem v poli hesla...")
    password_box.press("Enter")
def login_if_needed(page):
    """Log in to the Vault unless the persistent session is still valid.

    Opens LOGIN_URL; if the browser already lands on the Vault UI, nothing
    else happens. Otherwise the username and password from the environment
    (VAULT_USER / VAULT_PASS) are filled in and submitted. If the Vault UI
    does not appear within 15 s, the page is checked for a visible error
    text and, failing that, a 2FA prompt is assumed: the user confirms on
    the phone, presses ENTER, and the Vault UI is awaited for up to 120 s.

    Raises:
        RuntimeError: when neither a login form nor the Vault is found, or
            when the page shows a visible invalid/incorrect/failed message.
    """
    log(f"[i] Otevírám přihlašovací URL...")
    page.goto(LOGIN_URL, wait_until="domcontentloaded")
    if "vtmf.veevavault.com/ui" in page.url:
        log("[i] Už přihlášen (perzistentní session).")
        return
    user_box = page.locator("input[type='text']").first
    try:
        user_box.wait_for(timeout=8000)
    except PWTimeout:
        # No form appeared: either a late session redirect took us into the
        # Vault, or something is wrong.
        if "vtmf.veevavault.com/ui" in page.url:
            log("[i] Přihlášen bez formuláře (session redirect).")
            return
        raise RuntimeError(
            f"Nenašel jsem login formulář ani Vault. Aktuální URL: {page.url}")
    username = os.environ["VAULT_USER"]
    password = os.environ["VAULT_PASS"]
    log("[i] Vyplňuji přihlašovací údaje...")
    user_box.fill(username)
    password_box = page.locator("input[type='password']").first
    password_box.fill(password)
    submit_login_form(page, password_box)
    log("[i] Odeslán login, čekám na výsledek...")
    try:
        page.wait_for_url(VAULT_UI_PATTERN, timeout=15000)
        log("[ok] Přihlášen rovnou (bez 2FA).")
        return
    except PWTimeout:
        pass  # not in the Vault -> probably a 2FA prompt
    # Distinguish a rejected login from a pending 2FA prompt.
    err = page.locator("text=/invalid|incorrect|failed/i")
    try:
        if err.count() and err.first.is_visible():
            raise RuntimeError(f"Login selhal: {err.first.inner_text().strip()}")
    except PWTimeout:
        pass
    print("\n" + "=" * 60)
    print(" VYŽADOVÁNO OVĚŘENÍ NA TELEFONU (2FA).")
    print(" Potvrď přihlášení v mobilní aplikaci.")
    print("=" * 60)
    input(" Až to potvrdíš, stiskni ENTER pro pokračování... ")
    page.wait_for_url(VAULT_UI_PATTERN, timeout=120000)
    log("[ok] Přihlášení dokončeno.")
def verify_inside(page):
    """Wait (up to 30 s) until the page URL matches the Vault UI pattern."""
    page.wait_for_url(VAULT_UI_PATTERN, timeout=30000)
    current_url = page.url
    log(f"[ok] Uvnitř Vaultu: {current_url}")
def dialog_visible(page):
    """Return True when a jQuery UI dialog (.ui-dialog) is visible.

    Any error while querying the page counts as "not visible".
    """
    try:
        dialogs = page.locator(".ui-dialog")
        if not dialogs.count():
            return False
        return bool(dialogs.first.is_visible())
    except Exception:
        return False
def save_page_debug(page, tag):
    """Save page diagnostics and return the folder they were written to.

    Creates DEBUG_DIR/<timestamp>_<tag>/ containing a screenshot (or
    screenshot_error.txt when the screenshot fails), the HTML of every
    frame, and frames_report.txt with counts of candidate elements plus
    the distinct title / aria-label values found in each frame.

    Args:
        page: Playwright page to inspect.
        tag: short label appended to the folder name.
    """
    # Format the timestamp first and append the tag afterwards, so a '%' in
    # the tag can never be interpreted as a strftime directive.
    stamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    out = DEBUG_DIR / f"{stamp}_{tag}"
    out.mkdir(parents=True, exist_ok=True)

    try:
        page.screenshot(path=str(out / "screenshot.png"), full_page=False)
    except Exception as e:
        # diagnostics are best effort; record why the screenshot is missing
        (out / "screenshot_error.txt").write_text(str(e), encoding="utf-8")

    report = []
    for i, frame in enumerate(page.frames):
        report.append(f"=== frame[{i}] url={frame.url}")
        try:
            (out / f"frame_{i}.html").write_text(frame.content(),
                                                 encoding="utf-8")
            for sel in (".ui-dialog", "a.ok.vv_button",
                        ".ui-dialog-titlebar-close",
                        "button", "input[type='button']",
                        "[title]", "[aria-label]"):
                n = frame.locator(sel).count()
                if n:
                    report.append(f" {sel}: {n}x")
            # list title/aria-label values - helps locating the actions menu
            for attr in ("title", "aria-label"):
                vals = frame.locator(f"[{attr}]").evaluate_all(
                    f"els => els.map(e => e.getAttribute('{attr}'))")
                uniq = sorted({v for v in vals if v})[:80]
                report.append(f" {attr}: {uniq}")
        except Exception as e:
            report.append(f" [chyba čtení framu: {e}]")

    (out / "frames_report.txt").write_text("\n".join(report),
                                           encoding="utf-8")
    log(f"[!] Diagnostika stránky uložena do: {out}")
    return out
# The visible OK button of the dialog - it is an <a>, not a <button>!
# The close cross .ui-dialog-titlebar-close is display:none -> DO NOT USE.
DIALOG_OK_SELECTOR = (".ui-dialog a.ok.vv_button, "
                      ".vv_login_msg_dialog .vv_button.ok")
def dismiss_maintenance_popup(page, timeout=8000):
    """Close the Veeva login/maintenance dialog by clicking its visible OK.

    The dialog appears WITH A DELAY, so the OK element is awaited for up to
    *timeout* milliseconds first. Safe to call at any time.

    Returns:
        True when at least one OK click was made, otherwise False
        (including the case where no dialog appeared within the timeout).
    """
    ok = page.locator(DIALOG_OK_SELECTOR)
    try:
        ok.first.wait_for(state="visible", timeout=timeout)
    except PWTimeout:
        return False  # the dialog did not show up - carry on
    except Exception:
        return False
    closed = 0
    for _ in range(5):  # dialogs can be queued one after another
        try:
            if ok.count() and ok.first.is_visible():
                ok.first.click()
                page.wait_for_timeout(300)
                closed += 1
                log("[i] Maintenance/login dialog zavřen (OK).")
                continue
        except Exception:
            pass
        # no visible OK button left (or the click failed) -> stop trying
        break
    if not dialog_visible(page):
        return bool(closed)
    # A dialog is still open: try Escape, then fall back to the user.
    page.keyboard.press("Escape")
    page.wait_for_timeout(500)
    log("[i] Zkusil jsem dialog zavřít klávesou Escape.")
    if dialog_visible(page):
        save_page_debug(page, "dialog")
        print("\n" + "=" * 60)
        print(" DIALOG SE NEPODAŘILO ZAVŘÍT AUTOMATICKY.")
        print(" Zavři ho prosím ručně v prohlížeči.")
        print("=" * 60)
        input(" Po ručním zavření stiskni ENTER... ")
    return bool(closed)
# --- Export reportu ----------------------------------------------------
def _first_visible(page, builders):
    """Find the first visible candidate across all frames of the page.

    Args:
        page: Playwright page; every entry of page.frames is searched.
        builders: sequence of (build, description) pairs, where build(frame)
            returns a locator.

    Returns:
        (locator, description) of the first candidate that exists and is
        visible, or (None, None). Frames are tried in order, and within a
        frame the builders are tried in order; a builder that raises is
        skipped.
    """
    attempts = ((frame, entry) for frame in page.frames for entry in builders)
    for frame, (make_locator, label) in attempts:
        try:
            candidate = make_locator(frame)
            if candidate.count() and candidate.first.is_visible():
                return candidate.first, label
        except Exception:
            continue
    return None, None
def download_report(page):
    """Export the report to Excel (Data Only) and return the saved path.

    Opens REPORT_URL, waits for the report to render, opens the actions
    menu, chooses "Export to Excel", makes sure "Data Only" is selected,
    clicks Export once and saves the download into EXCEL_DIR under a
    timestamp-prefixed name.

    Raises:
        RuntimeError: when the report does not load, or the actions menu,
            the export menu item or the Export button cannot be found. Page
            diagnostics are saved to debug/ before raising.
    """
    log("[i] Otevírám report Document Inventory Report - Study Level...")
    page.goto(REPORT_URL, wait_until="domcontentloaded")
    dismiss_maintenance_popup(page, timeout=4000)
    # the report is ready once the record count / status text shows up
    try:
        page.wait_for_selector("text=Returned", timeout=30000)
    except PWTimeout:
        try:
            page.wait_for_selector("text=Document Status:", timeout=30000)
        except PWTimeout:
            save_page_debug(page, "report_load")
            raise RuntimeError(
                "Report se nenačetl (nenašel jsem 'Returned' ani "
                "'Document Status:'). Diagnostika v debug/.")
    log("[i] Report načten, otevírám menu akcí (⋯)...")
    # Actions menu: a button without title/aria-label inside
    # .actionMenuContainer (verified on the live DOM, no iframe).
    # Candidates go from the verified selector to looser fallbacks.
    actions, desc = _first_visible(page, [
        (lambda f: f.locator(
            ".actionMenuContainer .dropDown.vv_dropdown_toggle "
            "button.vv-icon-button"), ".actionMenuContainer button (ověřený)"),
        (lambda f: f.locator(".actionMenuContainer button"), ".actionMenuContainer button (volnější)"),
        (lambda f: f.locator("button[title='Actions'], [aria-label='Actions']"), "title/aria-label Actions"),
    ])
    if actions is None:
        save_page_debug(page, "report_menu")
        raise RuntimeError("Nenašel jsem menu akcí (⋯) na reportu. "
                           "Diagnostika v debug/.")
    log(f"[i] Menu nalezeno přes: {desc}")
    actions.click()
    # The menu loads ASYNCHRONOUSLY (data-loaded=false -> AJAX):
    # wait for the item, do not read it right after the click.
    item = page.locator("a.ReportAction[data-action-name='ExcelExport']")
    try:
        item.first.wait_for(state="visible", timeout=15000)
    except PWTimeout:
        # fallback by text (in case the data attribute changes)
        item = page.get_by_text("Export to Excel", exact=True)
        try:
            item.first.wait_for(state="visible", timeout=5000)
        except PWTimeout:
            save_page_debug(page, "report_export_item")
            raise RuntimeError("Menu se otevřelo, ale položku 'Export to "
                               "Excel' jsem nenašel. Diagnostika v debug/.")
    log("[i] Klikám 'Export to Excel'...")
    item.first.click()
    log("[i] Dialog Excel Export Options...")
    # 'Data Only' = radio value=STANDARD, checked by default; this is a
    # safety net in case it is not.
    radio = page.locator("input[name='requiredRadioField'][value='STANDARD']")
    try:
        radio.first.wait_for(state="visible", timeout=10000)
        if not radio.first.is_checked():
            radio.first.check()
            log("[i] Přepnuto na 'Data Only'.")
    except PWTimeout:
        log("[!] Radio 'Data Only' nenalezeno — spoléhám na default dialogu.")
    # Export = <button> with the text Export (React dialog with generated
    # class names - do NOT select by class hash, only by role + text).
    export_btn = page.get_by_role("button", name="Export", exact=True)
    try:
        export_btn.first.wait_for(state="visible", timeout=10000)
    except PWTimeout:
        save_page_debug(page, "report_export_btn")
        raise RuntimeError("Dialog exportu bez tlačítka Export. "
                           "Diagnostika v debug/.")
    export_btn = export_btn.first
    # Click Export EXACTLY once (repeated clicks produce duplicates);
    # 503s/redirects in the network log do not matter - expect_download
    # decides whether the export succeeded.
    with page.expect_download(timeout=120000) as dl_info:
        export_btn.click()
    download = dl_info.value
    EXCEL_DIR.mkdir(parents=True, exist_ok=True)
    ts = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    dest = EXCEL_DIR / f"{ts} {download.suggested_filename}"
    download.save_as(str(dest))
    log(f"[ok] Report uložen: {dest}")
    return dest
def archive_report(path):
    """Move a processed report file into PROCESSED_DIR, keeping its name."""
    PROCESSED_DIR.mkdir(parents=True, exist_ok=True)
    archived = PROCESSED_DIR / path.name
    path.rename(archived)
    log(f"[i] Report archivován: {archived}")
# --- Stažení dokumentů -------------------------------------------------
def find_source_file_button(page):
    """Locate the "Source File" control on a document page.

    Tries an element with title 'Source File', then one with aria-label
    'Source File', then a button whose accessible name contains
    "Source File" (case-insensitive). Returns the first match of the first
    selector that matches anything, or None.
    """
    attribute_selectors = ("[title='Source File']", "[aria-label='Source File']")
    for selector in attribute_selectors:
        found = page.locator(selector)
        if found.count():
            return found.first

    by_role = page.get_by_role("button", name=re.compile("Source File", re.I))
    return by_role.first if by_role.count() else None
def download_source_file(page, doc):
    """Download the source file of one document and return the saved path.

    Opens doc["url"], dismisses a possible maintenance dialog, and checks
    for the placeholder element before looking for the Source File control.
    The file is saved to the path built by build_target_path().

    Raises:
        PlaceholderDocument: when the placeholder text element is visible.
        RuntimeError: when the Source File control cannot be found.
    """
    vtmf = doc["vtmf"]
    log(f"[i] Otevírám dokument {vtmf} ({doc.get('version', '')}) ...")
    page.goto(doc["url"], wait_until="domcontentloaded")
    try:
        page.wait_for_load_state("networkidle", timeout=30000)
    except PWTimeout:
        log("[!] networkidle nenastal do 30 s, zkouším pokračovat...")
    dismiss_maintenance_popup(page, timeout=2000)
    # Placeholder check comes before the Source File lookup, so a
    # placeholder is reported as such rather than as a missing control.
    ph = page.locator("div.vv_placeholder_text")
    if ph.count() and ph.first.is_visible():
        log(f"[i] {vtmf}: placeholder bez obsahu — přeskakuji.")
        raise PlaceholderDocument(vtmf)
    target = find_source_file_button(page)
    if target is None:
        raise RuntimeError(
            f"Nenašel jsem ikonu 'Source File' na stránce dokumentu {vtmf}.")
    log("[i] Klikám na Source File a čekám na download...")
    # Both clicks must happen inside expect_download so the download
    # triggered by either of them is captured.
    with page.expect_download(timeout=60000) as dl_info:
        target.click()
        # Variant with a dropdown (Source File + Viewable Rendition)
        try:
            item = page.get_by_role("menuitem",
                                    name=re.compile("Source File", re.I))
            if item.count() and item.first.is_visible():
                log("[i] Otevřel se dropdown, vybírám 'Source File'...")
                item.first.click()
        except Exception:
            pass
    download = dl_info.value
    dest = build_target_path(doc, download.suggested_filename)
    dest.parent.mkdir(parents=True, exist_ok=True)
    download.save_as(str(dest))
    log(f"[ok] Uloženo: {dest}")
    return dest
def download_missing(page, coll):
    """Download every non-deleted document that is not marked downloaded.

    Documents are processed in (vtmf, version) order, at most LIMIT of them
    when LIMIT is truthy. Each document gets up to MAX_ATTEMPTS attempts,
    and its outcome is written to Mongo immediately.

    Returns:
        Tuple (ok_count, fail_count, placeholder_count).
    """
    todo = list(coll.find({"deleted": False, "downloaded": {"$ne": True}})
                .sort([("vtmf", ASCENDING), ("version", ASCENDING)]))
    if LIMIT:
        todo = todo[:LIMIT]
    log(f"\n[i] Ke stažení: {len(todo)} dokumentů"
        + (f" (LIMIT={LIMIT})" if LIMIT else ""))
    ok_count, fail_count, placeholder_count = 0, 0, 0
    for n, doc in enumerate(todo, 1):
        key = doc["_id"]
        log(f"\n--- [{n}/{len(todo)}] {key} | {doc['desc'][:70]}")
        # last_err stays set only when every attempt failed
        last_err = None
        for attempt in range(1, MAX_ATTEMPTS + 1):
            try:
                dest = download_source_file(page, doc)
                coll.update_one({"_id": key}, {"$set": {
                    "downloaded": True, "file": str(dest),
                    "downloaded_at": datetime.now(),
                    "last_error": None}})
                ok_count += 1
                last_err = None
                break
            except PlaceholderDocument:
                # A placeholder is not an error: mark it as handled so it
                # is not picked up again, and do not retry.
                coll.update_one({"_id": key}, {"$set": {
                    "downloaded": True, "placeholder": True,
                    "file": None, "downloaded_at": datetime.now(),
                    "last_error": None}})
                placeholder_count += 1
                last_err = None
                break
            except Exception as e:
                last_err = e
                log(f"[!] Pokus {attempt}/{MAX_ATTEMPTS} selhal: {e}")
                if attempt < MAX_ATTEMPTS:
                    page.wait_for_timeout(RETRY_PAUSE_MS)
        if last_err is not None:
            coll.update_one({"_id": key}, {"$set": {
                "last_error": str(last_err),
                "error_at": datetime.now()}})
            fail_count += 1
        page.wait_for_timeout(BETWEEN_DOCS_MS)
    return ok_count, fail_count, placeholder_count
# --- Main --------------------------------------------------------------
def main():
    """Run the whole pipeline: login, report export, Mongo sync, downloads.

    Exits with status 2 when a pipeline step raised, 1 when at least one
    document download failed, and 0 otherwise. A KeyboardInterrupt is
    logged and does not count as a pipeline error. The summary and the
    browser shutdown run in every case.
    """
    ensure_credentials()
    coll = get_collection()
    log(f"[ok] Mongo připojeno: {MONGO_URI} / {MONGO_DB}.{MONGO_COLL}")
    with sync_playwright() as p:
        ctx = p.chromium.launch_persistent_context(
            user_data_dir=str(PROFILE_DIR),
            headless=False,
            accept_downloads=True,
            no_viewport=True,  # let the window size itself natively
            args=["--start-maximized"],
        )
        page = ctx.pages[0] if ctx.pages else ctx.new_page()
        # Counters are initialised up front so the summary in `finally`
        # works even when the pipeline fails before the download step.
        ok_count = fail_count = placeholder_count = 0
        pipeline_error = None
        try:
            # 1) login
            login_if_needed(page)
            verify_inside(page)
            dismiss_maintenance_popup(page)
            # 2) report export
            report_path = download_report(page)
            # 3) parse + sync to Mongo
            docs = read_documents_from_excel(report_path)
            if not docs:
                # An empty report must not reach the sync, which would mark
                # every existing record as deleted.
                raise RuntimeError("Report neobsahuje žádné dokumenty — "
                                   "sync přeskočen, nic se nemaže.")
            sync_report_to_mongo(coll, docs)
            migrate_old_csv(coll)
            archive_report(report_path)
            # 4) download what is missing
            DOWNLOAD_ROOT.mkdir(parents=True, exist_ok=True)
            ok_count, fail_count, placeholder_count = download_missing(page, coll)
        except KeyboardInterrupt:
            log("\n[!] Přerušeno uživatelem — stav je v Mongo, příští běh naváže.")
        except Exception as e:
            pipeline_error = e
            print("\n" + "=" * 60)
            print(" PIPELINE SELHALA!")
            print(f" {type(e).__name__}: {e}")
            print("=" * 60)
        finally:
            total = coll.count_documents({})
            have = coll.count_documents({"deleted": False, "downloaded": True})
            active = coll.count_documents({"deleted": False})
            log(f"\n[i] Výsledek běhu: {ok_count} staženo, "
                f"{placeholder_count} placeholderů přeskočeno, {fail_count} chyb"
                + (f", PIPELINE SELHALA ({pipeline_error})"
                   if pipeline_error else "."))
            log(f"[i] Mongo: {total} záznamů celkem, {active} aktivních, "
                f"z toho staženo {have} ({active - have} zbývá).")
            log("[i] Zavírám prohlížeč.")
            ctx.close()
    sys.exit(2 if pipeline_error else (1 if fail_count else 0))
if __name__ == "__main__":
    main()
@@ -0,0 +1,96 @@
# vtmf_pipeline_v1.5 — Kompletní V-TMF workflow (report → Mongo → download → SeaweedFS)
**Verze:** 1.5 · **Datum:** 2026-06-15
**Změny v1.5:** upload každého staženého dokumentu do SeaweedFS Filer
(`192.168.1.50:8888`, cesta `/vtmf-documents/ab/cd/<sha256>`).
SHA-256 content-addressed dedup — identický soubor se uloží jen jednou
(HEAD check → 404 → PUT; při 200 dedup hit). Chyba uploadu neblokuje
download ani zápis do Mongo — soubor zůstane na disku a pole
`sha256/seaweed_path/seaweed_url/seaweed_synced_at` zůstanou `null`
(lze doplnit backfillem). Souhrn na konci uvádí počet nově nahraných,
dedup hitů a případných chyb uploadu zvlášť.
_(Předchozí změny viz TRASH/vtmf_pipeline_v1.4.md)_
Jeden běh skriptu udělá celé workflow pro studii 77242113UCO3001:
1. **Login** do vtmf.veevavault.com (persistentní profil
`vault_profile/`, J&J SSO, případné 2FA potvrdíte na telefonu
+ ENTER; údaje z `.env` v rootu projektu).
2. **Export reportu** „Document Inventory Report - Study Level"
(přímá URL s ID reportu `0RP000000000182` a filtrem studie
`0ST000000137008`) → menu ⋯ → Export to Excel → Data Only →
uloží se s timestampem do `WhatToDownload/`, po zpracování se
přesune do `WhatToDownload/Zpracovano/`.
3. **Parse + sync do MongoDB** — Tower `mongodb://192.168.1.76:27017`,
db **VTMF**, kolekce **documents**, klíč `_id = "VTMF-xxx|vY.Z"`:
- nové dokumenty se založí,
- změny sledovaných polí se promítnou (+ `history[]`),
- dokumenty chybějící v reportu se označí `deleted=True`
a stažený soubor dostane ` [D]` před příponou,
- znovuobjevené se vzkřísí a ` [D]` se odebere.
4. **Stažení + SeaweedFS upload** — všechny `deleted=False, downloaded≠True`:
- Source File se uloží do
`U:\Dropbox\!!!Days\Downloads Z230\VTMF-77242113UCO3001\<Type>\<Subtype>\`
jako `YYYY-MM-DD Description [VTMF-xxx] [vY.Z].<přípona>`,
- soubor se přečte z disku, vypočítá se SHA-256, obsah se nahraje
do SeaweedFS na `/vtmf-documents/{sha256[:2]}/{sha256[2:4]}/{sha256}`,
- do Mongo se zapíše `downloaded=True, file, sha256, seaweed_path,
seaweed_url, seaweed_synced_at`; chyba SeaweedFS nechá tato pole
`null`, ale `downloaded=True` se zapíše i tak (soubor je na disku).
- Placeholder dokumenty (`div.vv_placeholder_text` viditelný) se
přeskočí s `placeholder=True, downloaded=True`.
## Mongo schéma (kolekce documents)
```
_id: "VTMF-19077748|v1.0"
vtmf, version, url, name, status, type, subtype, desc, date, studies
first_seen, last_seen # kdy poprvé/naposledy v reportu
deleted, deleted_at # není ve výsledné sadě reportu
downloaded, file, downloaded_at
placeholder # True = Vault placeholder bez obsahu
sha256 # hex SHA-256 staženého souboru
seaweed_path # /vtmf-documents/ab/cd/<sha256>
seaweed_url # http://192.168.1.50:8888/vtmf-documents/...
seaweed_synced_at # kdy nahráno / null při chybě
last_error, error_at # poslední chyba stahování
history: [{ts, changes: {pole: {old, new}}}]
```
## SeaweedFS detaily
- **Filer**: `http://192.168.1.50:8888` (přímý PUT, žádný master assign)
- **Dedup**: HEAD → 404 → PUT; HEAD → 200 → dedup hit (vrátí `uploaded=False`)
- **Timeout**: HEAD 10 s, PUT 120 s (velké soubory)
- **MIME**: `mimetypes.guess_type()`, fallback `application/octet-stream`
- **Backfill**: dokumenty s `downloaded=True, seaweed_path=null` lze
dohnat samostatným skriptem (čte `file` z Mongo, nahraje, zapíše pole).
## Konfigurace (konstanty nahoře)
- `SEAWEED_FILER` — URL Filer serveru
- `SEAWEED_PREFIX` — prefix cesty (`/vtmf-documents`)
- `REPORT_URL` — ID reportu + filtr studie
- `LIMIT` — None = vše; číslo = dávka
- `MONGO_URI/DB/COLL`, `DOWNLOAD_ROOT`, `EXCEL_DIR`
- `TRACKED_FIELDS`, `MAX_ATTEMPTS`, `RETRY_PAUSE_MS`, `BETWEEN_DOCS_MS`
## Ověřené technické detaily (nesahat bez ověření)
- Maintenance dialog: zavírat POUZE přes `.ui-dialog a.ok.vv_button`
(křížek `.ui-dialog-titlebar-close` je display:none).
- Report Excel má rozbité deklarované rozměry → přímá iterace řádků.
- Document Name/Number/Status jsou =HYPERLINK vzorce → regex.
- Export kliknout právě jednou; rozhoduje `expect_download`.
- Placeholder detekce: `div.vv_placeholder_text` (uvnitř
`div.vv_placeholder_pane > div.vv_placeholder_container`).
## Spuštění
```powershell
& "U:\PythonProject\Janssen\.venv\Scripts\python.exe" "U:\PythonProject\Janssen\VTMFDownloadFiles\vtmf_pipeline_v1.5.py"
```
Předchůdce: vtmf_pipeline_v1.4 (TRASH/).
@@ -0,0 +1,937 @@
# ============================================================
# vtmf_pipeline_v1.5.py
# Verze: 1.5
# Datum: 2026-06-15
# Popis: Kompletní workflow V-TMF (J&J Veeva Vault), studie
# 77242113UCO3001. Jeden běh udělá:
# 1) login do Vaultu (persistentní session + ruční 2FA),
# 2) export reportu "Document Inventory Report - Study
# Level" do Excelu (Data Only) do WhatToDownload/,
# 3) parse reportu a synchronizaci do MongoDB
# (Tower, db VTMF, kolekce documents,
# klíč = VTMF číslo + verze):
# - nové dokumenty se založí,
# - změny polí se promítnou (+ history[]),
# - dokumenty chybějící v reportu se označí
# deleted=True a stažený soubor dostane ' [D]',
# - znovuobjevené se vzkřísí a ' [D]' se odebere,
# 4) stažení všech dosud nestažených dokumentů do
# U:\Dropbox\!!!Days\Downloads Z230\VTMF-77242113UCO3001\
# <Type>\<Subtype>\"YYYY-MM-DD Description
# [VTMF-x] [v1.0].<přípona>" + zápis stavu do Mongo.
#
# Tracking stahování je KOMPLETNĚ v Mongo; starý
# download_state.csv se při prvním běhu jednorázově
# namigruje a přejmenuje na .imported.
#
# Vychází z download_vault_v2.1 (v TRASH/) — login, dialogy
# a stahování beze změny; nové jsou kroky 2 a 3.
#
# v1.1: oprava tichého selhání — chyba kteréhokoli kroku se teď
# hlasitě vypíše (a exit kód 2), místo aby běh skončil
# souhrnem "0 staženo, 0 chyb". Export reportu: více
# selektorů pro menu ⋯ i položku Export to Excel (včetně
# hledání ve všech frames) a při selhání automatický záchyt
# diagnostiky stránky do debug/ (screenshot + HTML frames).
# v1.2: selektory exportu OVĚŘENÉ na živém DOM (žádný iframe):
# menu ⋯ = .actionMenuContainer .dropDown.vv_dropdown_toggle
# button.vv-icon-button (title prázdný!); menu se načítá
# asynchronně -> čekat na položku; položka =
# a.ReportAction[data-action-name='ExcelExport']; Data Only =
# radio name=requiredRadioField value=STANDARD (default
# checked); Export = <button> role+text (emotion class hash,
# neselektovat podle tříd).
# v1.3: na konci běhu se prohlížeč i okno zavře automaticky
# (žádné čekání na ENTER) — vhodné pro bezobslužné běhy.
# Interaktivní vstupy zůstávají jen tam, kde jsou nutné
# (2FA, ručně nezavřitelný dialog).
# v1.4: detekce placeholder dokumentů — stránka s textem
# "This placeholder has no content" se přeskočí
# (placeholder=True, downloaded=True v Mongo), žádná chyba.
# v1.5: upload stažených dokumentů do SeaweedFS Filer
# (192.168.1.50:8888, cesta /vtmf-documents/ab/cd/<sha256>).
# SHA-256 content-addressed dedup — identický soubor se uloží
# jen jednou. Chyba uploadu neblokuje download; chybějící
# sha256/seaweed_path lze doplnit backfillem. Mongo nově ukládá:
# sha256, seaweed_path, seaweed_url, seaweed_synced_at.
# Souhrn běhu uvádí počet nově nahraných vs. dedup hitů.
#
# Heslo se NIKDY nedává natvrdo do skriptu — čte se z .env
# v rootu projektu Janssen (VAULT_USER / VAULT_PASS).
# ============================================================
import csv
import hashlib
import mimetypes
import os
import re
import sys
import urllib.error
import urllib.request
from datetime import datetime
from pathlib import Path
from playwright.sync_api import sync_playwright, TimeoutError as PWTimeout
from pymongo import MongoClient, ASCENDING
# --- Configuration -----------------------------------------------------
# SSO login URL; its TargetResource parameter is the Vault address.
LOGIN_URL = ("https://fedlogin.jnj.com/idp/eyJ2c2lkIjoiam5qX3ZlZXZhIn0/"
             "startSSO.ping?PartnerSpId=janssenetmf.veevavault.com"
             "&IdpAdapterId=CompIWALDAPEXTFORM"
             "&TargetResource=https%3A%2F%2Fvtmf.veevavault.com%2F")
# "Document Inventory Report - Study Level", filtered to the study
REPORT_URL = ("https://vtmf.veevavault.com/ui/#reporting/viewer/"
              "0RP000000000182?study__v%2C%2C%2CIN=0ST000000137008")
VAULT_UI_PATTERN = "**vtmf.veevavault.com/ui**"  # URL glob meaning "we are inside Vault"
SCRIPT_DIR = Path(__file__).resolve().parent
PROFILE_DIR = SCRIPT_DIR / "vault_profile"  # persistent browser session
ENV_FILE = SCRIPT_DIR.parent / ".env"  # root of the Janssen project
DEBUG_DIR = SCRIPT_DIR / "debug"  # diagnostic output
EXCEL_DIR = SCRIPT_DIR / "WhatToDownload"  # downloaded reports
PROCESSED_DIR = EXCEL_DIR / "Zpracovano"  # archive of processed reports
OLD_STATE_FILE = SCRIPT_DIR / "download_state.csv"  # legacy CSV (migrated once)
DOWNLOAD_ROOT = Path(r"U:\Dropbox\!!!Days\Downloads Z230\VTMF-77242113UCO3001")
MONGO_URI = "mongodb://192.168.1.76:27017"
MONGO_DB = "VTMF"
MONGO_COLL = "documents"
# How many documents to download in this run. download_missing() only
# truncates the work list when LIMIT is truthy, so both None and 0 mean
# "all remaining documents".
LIMIT = 0
# Report fields whose changes are applied and versioned into history[]
TRACKED_FIELDS = ("name", "status", "type", "subtype", "desc",
                  "date", "url", "studies")
MAX_ATTEMPTS = 2  # attempts per document
RETRY_PAUSE_MS = 5000  # pause before a retry
BETWEEN_DOCS_MS = 500  # pause between documents
SEAWEED_FILER = "http://192.168.1.50:8888"
SEAWEED_PREFIX = "/vtmf-documents"
class PlaceholderDocument(Exception):
    """Raised when a document page is only a Vault placeholder without content.

    download_source_file() raises it when the placeholder text element is
    visible; download_missing() treats it as a skip, not as a failure.
    """
def log(msg):
    """Write *msg* as one line to stdout and flush right away."""
    print(msg, end="\n", flush=True)
def load_env_file(path):
    """Copy KEY=VALUE pairs from the .env file at *path* into os.environ.

    Blank lines, '#' comments and lines without '=' are ignored. Values
    are stripped of surrounding whitespace and quotes; empty values are
    skipped. A variable that is already set in the environment wins over
    the file. A missing file is only logged.
    """
    if not path.exists():
        log(f"[!] .env nenalezen: {path}")
        return
    for raw_line in path.read_text(encoding="utf-8").splitlines():
        entry = raw_line.strip()
        is_assignment = bool(entry) and not entry.startswith("#") and "=" in entry
        if not is_assignment:
            continue
        name, _sep, raw_value = entry.partition("=")
        name = name.strip()
        cleaned = raw_value.strip().strip('"').strip("'")
        if cleaned:
            # setdefault keeps a value that is already present in the environment
            os.environ.setdefault(name, cleaned)
# Header line written above the credential template in .env
ENV_SECTION_HEADER = "# --- Veeva Vault (J&J V-TMF) — VTMFDownloadFiles/download_vault ---"
# Environment variables that must be non-empty before the pipeline starts
ENV_KEYS = ("VAULT_USER", "VAULT_PASS")


def ensure_credentials():
    """Make sure VAULT_USER and VAULT_PASS are available, or stop the script.

    Loads ENV_FILE first. When any key is still missing or empty, a
    template is created (new file) or appended (existing file lacking the
    key lines), the user is told what to fill in, and the process exits
    with status 1.
    """
    load_env_file(ENV_FILE)
    if all(os.environ.get(name) for name in ENV_KEYS):
        return

    env_exists = ENV_FILE.exists()
    current_text = ENV_FILE.read_text(encoding="utf-8") if env_exists else ""
    # only keys that have no "KEY=" line at all get a template line
    template_lines = []
    for name in ENV_KEYS:
        if re.search(rf"^\s*{name}\s*=", current_text, re.M) is None:
            template_lines.append(f"{name}=")
    template = ENV_SECTION_HEADER + "\n" + "\n".join(template_lines) + "\n"

    if not env_exists:
        ENV_FILE.write_text(
            "# .env — lokální přihlašovací údaje (NEVERZOVAT, je v .gitignore)\n\n"
            + template,
            encoding="utf-8")
        log(f"[i] Založil jsem nový .env: {ENV_FILE}")
    elif template_lines:
        with open(ENV_FILE, "a", encoding="utf-8") as env_handle:
            env_handle.write("\n" + template)
        log(f"[i] Doplnil jsem chybějící řádky do .env: {ENV_FILE}")

    banner = (
        "\n" + "=" * 60,
        " CHYBÍ PŘIHLAŠOVACÍ ÚDAJE.",
        " Doplň VAULT_USER a VAULT_PASS do souboru:",
        f" {ENV_FILE}",
        " a spusť skript znovu.",
        "=" * 60,
    )
    for banner_line in banner:
        print(banner_line)
    sys.exit(1)
# --- Excel parsing -----------------------------------------------------
# =HYPERLINK("url", "text") formula: group 1 = URL, group 2 = shown text
HYPERLINK_RE = re.compile(r'HYPERLINK\("([^"]+)"\s*,\s*"([^"]+)"\)')
# trailing "(v1.0)"-style version at the end of a document name
VERSION_RE = re.compile(r"\((v[^)]+)\)\s*$")
# characters not allowed in Windows file names, plus control characters
BAD_CHARS_RE = re.compile(r"[<>:\"/\\|?*\x00-\x1f]")


def clean_filename(s):
    """Turn *s* into a string usable as a Windows file or folder name.

    Forbidden and control characters become '_', whitespace runs collapse
    to one space, underscore runs collapse to one underscore, and leading
    or trailing spaces, dots and underscores are removed.
    """
    text = BAD_CHARS_RE.sub("_", str(s))
    for pattern, replacement in ((r"\s+", " "), (r"_{2,}", "_")):
        text = re.sub(pattern, replacement, text)
    return text.strip(" ._")
def display_text(cell):
    """Return the text a cell displays.

    For a =HYPERLINK formula this is the formula's second argument;
    otherwise it is the stripped cell value ('' for an empty cell).
    """
    content = str(cell.value or "").strip()
    match = HYPERLINK_RE.search(content)
    if match is None:
        return content
    return match.group(2).strip()
def extract_doc_url(raw):
    """Extract the clean document URL from a HYPERLINK value or a messy URL.

    The result has the form https://<host>/ui/#doc_info/<id>/<major>/<minor>.

    Raises:
        ValueError: *raw* contains no such URL.
    """
    found = re.search(r"(https://[^/\"]+/ui/#doc_info/\d+/\d+/\d+)", str(raw))
    if found:
        return found.group(1)
    raise ValueError(f"Nenašel jsem doc URL v: {raw!r}")
def read_documents_from_excel(path):
    """Parse the exported .xlsx report at *path* into a list of document dicts.

    Each dict has the keys vtmf, version, url, name, status, type,
    subtype, desc, date, studies. Document Name/Number/Status cells hold
    =HYPERLINK formulas, so the workbook is loaded with formulas intact
    and both URL and text are taken by regex. Rows are read by direct
    iteration rather than via the sheet's declared dimensions.

    Rows without a usable document URL are skipped and only counted in
    the log.

    Raises:
        RuntimeError: the sheet has no header row, or an expected column
            is missing.
    """
    from openpyxl import load_workbook
    log(f"[i] Parsování reportu: {path.name}")
    wb = load_workbook(path, data_only=False)  # formulas are needed, not cached values
    ws = wb[wb.sheetnames[0]]
    rows = ws.iter_rows()
    # An empty sheet yields no rows; report that explicitly instead of
    # letting a bare StopIteration escape.
    header_row = next(rows, None)
    if header_row is None:
        raise RuntimeError(
            f"Report {path.name} je prázdný (chybí řádek záhlaví).")
    header = [c.value for c in header_row]
    try:
        i_num = header.index("Document Number")
        i_name = header.index("Document Name")
        i_status = header.index("Document Status")
        i_type = header.index("Type")
        i_sub = header.index("Subtype")
        i_desc = header.index("Description")
        i_date = header.index("Document Date")
        i_study = header.index("Study")
    except ValueError as e:
        raise RuntimeError(f"V reportu chybí očekávaný sloupec: {e}") from e
    docs, bad = [], []
    for row in rows:
        cell = row[i_num]
        if cell.value is None:
            continue
        raw = str(cell.value)
        m = HYPERLINK_RE.search(raw)
        if m:
            url_raw, vtmf = m.group(1), m.group(2)
        elif cell.hyperlink:  # a real hyperlink instead of a formula
            url_raw, vtmf = cell.hyperlink.target, raw
        else:
            bad.append(raw)
            continue
        try:
            url = extract_doc_url(url_raw)
        except ValueError:
            bad.append(raw)
            continue
        name = display_text(row[i_name])
        vm = VERSION_RE.search(name)
        version = vm.group(1) if vm else "v?"
        desc = clean_filename(display_text(row[i_desc]))
        if not desc:
            # fallback: Document Name without the trailing version
            # (the version is appended separately to the file name)
            desc = clean_filename(VERSION_RE.sub("", name))
        date = row[i_date].value  # datetime or None
        docs.append({
            "vtmf": vtmf.strip(),
            "version": version,
            "url": url,
            "name": name,
            "status": display_text(row[i_status]),
            "type": clean_filename(display_text(row[i_type])),
            "subtype": clean_filename(display_text(row[i_sub])),
            "desc": desc,
            # keep only real date/datetime values; anything else becomes None
            "date": date if hasattr(date, "strftime") else None,
            "studies": display_text(row[i_study]),
        })
    log(f"[i] Načteno {len(docs)} dokumentů"
        + (f", {len(bad)} řádků bez použitelné URL (přeskočeno)" if bad else ""))
    return docs
def build_target_path(doc, suggested_filename):
    """Build the destination path for a downloaded document.

    Layout: DOWNLOAD_ROOT / Type / Subtype /
    'YYYY-MM-DD Description [VTMF-xxx] [v1.0].<real extension>'.
    The extension comes from the name the browser suggested. The date
    prefix and the version tag are left out when not available.
    """
    extension = Path(suggested_filename).suffix  # includes the leading dot
    date_part = ""
    if doc["date"]:
        date_part = doc["date"].strftime("%Y-%m-%d") + " "
    version_part = ""
    if doc.get("version"):
        version_part = f" [{doc['version']}]"
    file_name = "".join(
        (date_part, f"{doc['desc']} [{doc['vtmf']}]", version_part, extension))
    return DOWNLOAD_ROOT / doc["type"] / doc["subtype"] / file_name
def deleted_marker_path(path):
    """Return *path* with ' [D]' inserted before the extension ('x.pdf' -> 'x [D].pdf')."""
    original = Path(path)
    flagged_name = original.stem + " [D]" + original.suffix
    return original.with_name(flagged_name)
# --- MongoDB synchronizace ---------------------------------------------
def doc_key(vtmf, version):
    """Return the Mongo _id for a document: '<vtmf>|<version>'."""
    return "{}|{}".format(vtmf, version)
def get_collection():
    """Connect to MongoDB and return the documents collection.

    Pings the server first, so an unreachable database fails here, and
    makes sure both indexes exist: a unique one on (vtmf, version) and
    one on (deleted, downloaded).
    """
    mongo = MongoClient(MONGO_URI, serverSelectionTimeoutMS=5000)
    mongo.admin.command("ping")
    documents = mongo[MONGO_DB][MONGO_COLL]
    identity_index = [("vtmf", ASCENDING), ("version", ASCENDING)]
    pending_index = [("deleted", ASCENDING), ("downloaded", ASCENDING)]
    documents.create_index(identity_index, unique=True)
    documents.create_index(pending_index)
    return documents
def migrate_old_csv(coll):
    """Import the legacy download_state.csv into Mongo, once.

    Every CSV row with result 'ok' marks one matching, non-deleted and
    not yet downloaded record of the same VTMF number as downloaded.
    Afterwards the CSV is renamed to '.csv.imported'. Does nothing when
    the CSV does not exist.
    """
    if not OLD_STATE_FILE.exists():
        return
    with open(OLD_STATE_FILE, newline="", encoding="utf-8") as handle:
        ok_rows = (rec for rec in csv.DictReader(handle)
                   if rec["result"] == "ok")
        # rows are consumed lazily, one update per 'ok' row
        migrated = sum(
            coll.update_one(
                {"vtmf": rec["vtmf"], "deleted": False,
                 "downloaded": {"$ne": True}},
                {"$set": {"downloaded": True, "file": rec["file"],
                          "downloaded_at": rec["timestamp"]}},
            ).modified_count
            for rec in ok_rows)
    imported_name = OLD_STATE_FILE.with_suffix(".csv.imported")
    OLD_STATE_FILE.rename(imported_name)
    log(f"[i] Migrace download_state.csv -> Mongo: {migrated} záznamů; "
        f"CSV přejmenováno na .imported")
def _strip_deleted_marker(path):
    """Return *path* without a trailing ' [D]' in its stem, or None if it has none."""
    marker = " [D]"
    p = Path(path)
    if p.stem.endswith(marker) and len(p.stem) > len(marker):
        return p.with_name(p.stem[:-len(marker)] + p.suffix)
    return None


def _restore_resurrected_file(stored, key):
    """Rename a ' [D]'-marked file back; return the new path string or None.

    The deletion pass stores the marked name in Mongo, so the stored path
    normally already carries ' [D]'. A stored path without the marker is
    also handled: then the marked sibling on disk is looked for.
    """
    if not stored:
        return None
    stored_path = Path(stored)
    unmarked = _strip_deleted_marker(stored_path)
    if unmarked is not None:
        marked, clean = stored_path, unmarked
    else:
        marked, clean = deleted_marker_path(stored_path), stored_path
    if not marked.exists() or clean.exists():
        return None
    try:
        marked.rename(clean)
    except OSError as e:
        # same policy as the deletion pass: a failed rename is logged only
        log(f"[!] {key}: odebrání [D] selhalo: {e}")
        return None
    log(f"[i] {key}: soubor vrácen z ' [D]' zpět.")
    return str(clean)


def sync_report_to_mongo(coll, docs):
    """Apply the current report to the documents collection.

    Key = (vtmf, version). New documents are inserted; changes of
    TRACKED_FIELDS are applied and appended to history[]; documents that
    are missing from the report are flagged deleted and their downloaded
    file gets a ' [D]' marker; documents that reappear are un-deleted and
    the marker is removed again.

    Returns:
        dict with the counters new, updated, unchanged, resurrected and
        marked_deleted. A resurrected document is also counted as
        updated or unchanged.
    """
    now = datetime.now()
    stats = {"new": 0, "updated": 0, "unchanged": 0,
             "resurrected": 0, "marked_deleted": 0}
    current_keys = set()
    for d in docs:
        key = doc_key(d["vtmf"], d["version"])
        current_keys.add(key)
        existing = coll.find_one({"_id": key})
        if existing is None:
            coll.insert_one({
                "_id": key, **d,
                "first_seen": now, "last_seen": now,
                "deleted": False, "downloaded": False,
                "file": None, "history": [],
            })
            stats["new"] += 1
            continue
        changes = {
            fld: {"old": existing.get(fld), "new": d.get(fld)}
            for fld in TRACKED_FIELDS
            if existing.get(fld) != d.get(fld)
        }
        update = {"$set": {**d, "last_seen": now, "deleted": False}}
        if changes:
            update["$push"] = {"history": {"ts": now, "changes": changes}}
            stats["updated"] += 1
        else:
            stats["unchanged"] += 1
        if existing.get("deleted"):
            # the document is back in the report -> remove ' [D]' from the file
            stats["resurrected"] += 1
            restored = _restore_resurrected_file(existing.get("file"), key)
            if restored is not None:
                update["$set"]["file"] = restored
        coll.update_one({"_id": key}, update)
    # documents absent from the current report -> deleted + ' [D]'
    for rec in coll.find({"deleted": False}):
        if rec["_id"] in current_keys:
            continue
        upd = {"deleted": True, "deleted_at": now}
        f = rec.get("file")
        if f and Path(f).exists():
            marked = deleted_marker_path(f)
            try:
                Path(f).rename(marked)
                upd["file"] = str(marked)
                log(f"[i] {rec['_id']}: soubor označen ' [D]'.")
            except OSError as e:
                log(f"[!] {rec['_id']}: přejmenování na [D] selhalo: {e}")
        coll.update_one({"_id": rec["_id"]},
                        {"$set": upd,
                         "$push": {"history": {"ts": now,
                                               "changes": {"deleted": {
                                                   "old": False,
                                                   "new": True}}}}})
        stats["marked_deleted"] += 1
    log(f"[ok] Mongo sync: {stats['new']} nových, {stats['updated']} změněných, "
        f"{stats['unchanged']} beze změny, {stats['resurrected']} obnovených, "
        f"{stats['marked_deleted']} označených deleted.")
    return stats
# --- Přihlášení --------------------------------------------------------
def submit_login_form(page, password_box):
    """Submit the login form.

    Tries, in order, a Sign On button, a Log in / Sign in button, a
    submit input, a submit button and an OK button; the first one that is
    present and visible is clicked. A candidate that raises is skipped.
    When none works, Enter is pressed in the password field.
    """
    button_candidates = [
        page.get_by_role("button", name=re.compile("sign\\s*on", re.I)),
        page.get_by_role("button", name=re.compile("log\\s*in|sign\\s*in", re.I)),
        page.locator("input[type='submit']"),
        page.locator("button[type='submit']"),
        page.get_by_role("button", name=re.compile("^ok$", re.I)),
    ]
    for candidate in button_candidates:
        try:
            if not (candidate.count() and candidate.first.is_visible()):
                continue
            button = candidate.first
            caption = (button.inner_text()
                       or button.get_attribute("value")
                       or "submit").strip()
            log(f"[i] Odesílám formulář tlačítkem '{caption}'...")
            button.click()
            return
        except Exception:
            continue
    log("[i] Tlačítko nenalezeno, odesílám Enterem v poli hesla...")
    password_box.press("Enter")
def login_if_needed(page):
    """Log in to Vault unless the persistent session is still alive.

    Opens LOGIN_URL. If the browser is already on the Vault UI, returns
    at once. Otherwise fills the user name and password from the
    VAULT_USER / VAULT_PASS environment variables and submits the form.
    When the Vault UI is not reached within 15 s, a 2FA prompt is
    assumed: the operator is asked to confirm on the phone and press
    ENTER, then the function waits up to 120 s for the Vault UI.

    Raises:
        RuntimeError: neither a login form nor the Vault UI was found, or
            the page shows a visible invalid/incorrect/failed message.
    """
    log(f"[i] Otevírám přihlašovací URL...")
    page.goto(LOGIN_URL, wait_until="domcontentloaded")
    if "vtmf.veevavault.com/ui" in page.url:
        log("[i] Už přihlášen (perzistentní session).")
        return
    user_box = page.locator("input[type='text']").first
    try:
        user_box.wait_for(timeout=8000)
    except PWTimeout:
        # no form within 8 s: the session may have redirected us into Vault meanwhile
        if "vtmf.veevavault.com/ui" in page.url:
            log("[i] Přihlášen bez formuláře (session redirect).")
            return
        raise RuntimeError(
            f"Nenašel jsem login formulář ani Vault. Aktuální URL: {page.url}")
    username = os.environ["VAULT_USER"]
    password = os.environ["VAULT_PASS"]
    log("[i] Vyplňuji přihlašovací údaje...")
    user_box.fill(username)
    password_box = page.locator("input[type='password']").first
    password_box.fill(password)
    submit_login_form(page, password_box)
    log("[i] Odeslán login, čekám na výsledek...")
    try:
        page.wait_for_url(VAULT_UI_PATTERN, timeout=15000)
        log("[ok] Přihlášen rovnou (bez 2FA).")
        return
    except PWTimeout:
        pass  # not in Vault -> most likely a 2FA prompt
    # a visible error text means the credentials were rejected
    err = page.locator("text=/invalid|incorrect|failed/i")
    try:
        if err.count() and err.first.is_visible():
            raise RuntimeError(f"Login selhal: {err.first.inner_text().strip()}")
    except PWTimeout:
        pass
    print("\n" + "=" * 60)
    print(" VYŽADOVÁNO OVĚŘENÍ NA TELEFONU (2FA).")
    print(" Potvrď přihlášení v mobilní aplikaci.")
    print("=" * 60)
    input(" Až to potvrdíš, stiskni ENTER pro pokračování... ")
    page.wait_for_url(VAULT_UI_PATTERN, timeout=120000)
    log("[ok] Přihlášení dokončeno.")
def verify_inside(page):
    """Wait (up to 30 s) until the page URL matches the Vault UI pattern, then log it."""
    page.wait_for_url(VAULT_UI_PATTERN, timeout=30000)
    current_url = page.url
    log(f"[ok] Uvnitř Vaultu: {current_url}")
def dialog_visible(page):
    """Return True when a jQuery UI dialog is visible on the page.

    Any error while querying the page counts as "not visible".
    """
    try:
        dialogs = page.locator(".ui-dialog")
        if not dialogs.count():
            return False
        return bool(dialogs.first.is_visible())
    except Exception:
        return False
def save_page_debug(page, tag):
    """Dump page diagnostics into a fresh timestamped folder under DEBUG_DIR.

    Saves a screenshot (or the screenshot error text), the HTML of every
    frame, and frames_report.txt with hit counts for dialog/button
    selectors plus the distinct title / aria-label values per frame.

    Returns:
        Path of the folder that was written.
    """
    stamp = datetime.now().strftime(f"%Y-%m-%d_%H-%M-%S_{tag}")
    target_dir = DEBUG_DIR / stamp
    target_dir.mkdir(parents=True, exist_ok=True)
    try:
        page.screenshot(path=str(target_dir / "screenshot.png"), full_page=False)
    except Exception as shot_err:
        (target_dir / "screenshot_error.txt").write_text(str(shot_err),
                                                         encoding="utf-8")
    probe_selectors = (".ui-dialog", "a.ok.vv_button",
                       ".ui-dialog-titlebar-close",
                       "button", "input[type='button']",
                       "[title]", "[aria-label]")
    lines = []
    for index, frame in enumerate(page.frames):
        lines.append(f"=== frame[{index}] url={frame.url}")
        try:
            html_file = target_dir / f"frame_{index}.html"
            html_file.write_text(frame.content(), encoding="utf-8")
            for selector in probe_selectors:
                hits = frame.locator(selector).count()
                if hits:
                    lines.append(f" {selector}: {hits}x")
            # list title/aria-label values — helps to locate the ⋯ menu
            for attr in ("title", "aria-label"):
                script = f"els => els.map(e => e.getAttribute('{attr}'))"
                values = frame.locator(f"[{attr}]").evaluate_all(script)
                distinct = sorted(set(filter(None, values)))[:80]
                lines.append(f" {attr}: {distinct}")
        except Exception as frame_err:
            lines.append(f" [chyba čtení framu: {frame_err}]")
    (target_dir / "frames_report.txt").write_text("\n".join(lines),
                                                  encoding="utf-8")
    log(f"[!] Diagnostika stránky uložena do: {target_dir}")
    return target_dir
# The visible OK control of the dialog — it is an <a>, not a <button>!
# The close cross .ui-dialog-titlebar-close is display:none → DO NOT USE.
DIALOG_OK_SELECTOR = (".ui-dialog a.ok.vv_button, "
                      ".vv_login_msg_dialog .vv_button.ok")
def dismiss_maintenance_popup(page, timeout=8000):
    """Close the Veeva login/maintenance dialog by clicking its visible OK.

    The dialog shows up WITH A DELAY, so the function first waits up to
    *timeout* ms for the OK control. Safe to call at any time.

    Flow: click OK up to five times (dialogs can be queued); if a dialog
    is still visible, press Escape; if it is still there, save page
    diagnostics and ask the operator to close it by hand.

    Returns:
        True if at least one OK click was made, otherwise False (also
        when the dialog never appeared).
    """
    ok = page.locator(DIALOG_OK_SELECTOR)
    try:
        ok.first.wait_for(state="visible", timeout=timeout)
    except PWTimeout:
        return False  # the dialog did not appear — carry on
    except Exception:
        return False
    closed = 0
    for _ in range(5):  # dialogs may be queued one after another
        try:
            if ok.count() and ok.first.is_visible():
                ok.first.click()
                page.wait_for_timeout(300)
                closed += 1
                log("[i] Maintenance/login dialog zavřen (OK).")
                continue
        except Exception:
            pass
        # no visible OK left (or the click failed) -> stop clicking
        break
    if not dialog_visible(page):
        return bool(closed)
    page.keyboard.press("Escape")
    page.wait_for_timeout(500)
    log("[i] Zkusil jsem dialog zavřít klávesou Escape.")
    if dialog_visible(page):
        # last resort: keep evidence and hand over to the operator
        save_page_debug(page, "dialog")
        print("\n" + "=" * 60)
        print(" DIALOG SE NEPODAŘILO ZAVŘÍT AUTOMATICKY.")
        print(" Zavři ho prosím ručně v prohlížeči.")
        print("=" * 60)
        input(" Po ručním zavření stiskni ENTER... ")
    return bool(closed)
# --- Export reportu ----------------------------------------------------
def _first_visible(page, builders):
"""Vrátí (locator, popis) prvního viditelného kandidáta. Hledá na
hlavní stránce i ve všech frames."""
for frame in page.frames:
for build, desc in builders:
try:
loc = build(frame)
if loc.count() and loc.first.is_visible():
return loc.first, desc
except Exception:
continue
return None, None
def download_report(page):
    """Export the report to Excel (Data Only) and save it into EXCEL_DIR.

    The file is stored under a timestamp-prefixed name.

    Returns:
        Path of the saved report file.

    Raises:
        RuntimeError: the report did not load, or the actions menu, the
            'Export to Excel' item or the Export button was not found.
            Page diagnostics are saved to the debug folder first.
    """
    log("[i] Otevírám report Document Inventory Report - Study Level...")
    page.goto(REPORT_URL, wait_until="domcontentloaded")
    dismiss_maintenance_popup(page, timeout=4000)
    # the report is ready once the record count / status labels appear
    try:
        page.wait_for_selector("text=Returned", timeout=30000)
    except PWTimeout:
        try:
            page.wait_for_selector("text=Document Status:", timeout=30000)
        except PWTimeout:
            save_page_debug(page, "report_load")
            raise RuntimeError(
                "Report se nenačetl (nenašel jsem 'Returned' ani "
                "'Document Status:'). Diagnostika v debug/.")
    log("[i] Report načten, otevírám menu akcí (⋯)...")
    # Actions menu (⋯): a button without title/aria-label inside
    # .actionMenuContainer (verified on the live DOM, no iframe).
    # Candidates go from the verified selector to looser fallbacks.
    actions, desc = _first_visible(page, [
        (lambda f: f.locator(
            ".actionMenuContainer .dropDown.vv_dropdown_toggle "
            "button.vv-icon-button"), ".actionMenuContainer button (ověřený)"),
        (lambda f: f.locator(".actionMenuContainer button"), ".actionMenuContainer button (volnější)"),
        (lambda f: f.locator("button[title='Actions'], [aria-label='Actions']"), "title/aria-label Actions"),
    ])
    if actions is None:
        save_page_debug(page, "report_menu")
        raise RuntimeError("Nenašel jsem menu akcí (⋯) na reportu. "
                           "Diagnostika v debug/.")
    log(f"[i] Menu nalezeno přes: {desc}")
    actions.click()
    # The menu loads ASYNCHRONOUSLY (data-loaded=false -> AJAX),
    # so wait for the item instead of reading right after the click.
    item = page.locator("a.ReportAction[data-action-name='ExcelExport']")
    try:
        item.first.wait_for(state="visible", timeout=15000)
    except PWTimeout:
        # fallback by text (in case the data attribute changes)
        item = page.get_by_text("Export to Excel", exact=True)
        try:
            item.first.wait_for(state="visible", timeout=5000)
        except PWTimeout:
            save_page_debug(page, "report_export_item")
            raise RuntimeError("Menu se otevřelo, ale položku 'Export to "
                               "Excel' jsem nenašel. Diagnostika v debug/.")
    log("[i] Klikám 'Export to Excel'...")
    item.first.click()
    log("[i] Dialog Excel Export Options...")
    # 'Data Only' = radio value=STANDARD, checked by default; this is a safeguard.
    radio = page.locator("input[name='requiredRadioField'][value='STANDARD']")
    try:
        radio.first.wait_for(state="visible", timeout=10000)
        if not radio.first.is_checked():
            radio.first.check()
            log("[i] Přepnuto na 'Data Only'.")
    except PWTimeout:
        log("[!] Radio 'Data Only' nenalezeno — spoléhám na default dialogu.")
    # Export = <button> with the text Export (React dialog, emotion
    # classes — do NOT select by class hash, only by role + text).
    export_btn = page.get_by_role("button", name="Export", exact=True)
    try:
        export_btn.first.wait_for(state="visible", timeout=10000)
    except PWTimeout:
        save_page_debug(page, "report_export_btn")
        raise RuntimeError("Dialog exportu bez tlačítka Export. "
                           "Diagnostika v debug/.")
    export_btn = export_btn.first
    # Click Export EXACTLY once (repeated clicks = duplicates); ignore
    # 503s/redirects in the network log — expect_download decides.
    with page.expect_download(timeout=120000) as dl_info:
        export_btn.click()
    download = dl_info.value
    EXCEL_DIR.mkdir(parents=True, exist_ok=True)
    ts = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    dest = EXCEL_DIR / f"{ts} {download.suggested_filename}"
    download.save_as(str(dest))
    log(f"[ok] Report uložen: {dest}")
    return dest
def archive_report(path):
    """Move a processed report file into PROCESSED_DIR, creating the folder if needed."""
    PROCESSED_DIR.mkdir(parents=True, exist_ok=True)
    destination = PROCESSED_DIR / path.name
    path.rename(destination)
    log(f"[i] Report archivován: {destination}")
# --- SeaweedFS ---------------------------------------------------------
def _sw_path(sha256):
    """Return the Filer path for a hex digest: <prefix>/<first 2>/<next 2>/<digest>."""
    return "{}/{}/{}/{}".format(SEAWEED_PREFIX, sha256[:2], sha256[2:4], sha256)
def seaweed_store(data, mime="application/octet-stream"):
    """Upload *data* to the SeaweedFS Filer under its SHA-256 path, at most once.

    A HEAD request checks whether the content-addressed path already
    exists; only a 404 leads to a PUT.

    Args:
        data: file content as bytes.
        mime: value sent as Content-Type on upload.

    Returns:
        (path, url, uploaded) — uploaded is False when the file was
        already present (dedup hit).

    Raises:
        urllib.error.HTTPError: HEAD answered with an error other than
            404, or the PUT was rejected.
        urllib.error.URLError: the Filer could not be reached.
    """
    sha256 = hashlib.sha256(data).hexdigest()
    path = _sw_path(sha256)
    url = SEAWEED_FILER + path
    head = urllib.request.Request(url, method="HEAD")
    try:
        # 'with' closes the response so the connection is released
        with urllib.request.urlopen(head, timeout=10):
            return path, url, False  # the file already exists
    except urllib.error.HTTPError as e:
        if e.code != 404:
            raise
    put = urllib.request.Request(
        url, data=data, method="PUT",
        headers={"Content-Type": mime})
    with urllib.request.urlopen(put, timeout=120):
        pass
    return path, url, True
# --- Stažení dokumentů -------------------------------------------------
def find_source_file_button(page):
    """Locate the Source File control on a document page.

    Lookups are tried in order — title attribute, aria-label attribute,
    then a button whose accessible name contains 'Source File' — and the
    first one with a match wins. Returns None when none matches.
    """
    def lookups():
        # generated lazily so later lookups run only when earlier ones miss
        yield page.locator("[title='Source File']")
        yield page.locator("[aria-label='Source File']")
        yield page.get_by_role("button", name=re.compile("Source File", re.I))

    for found in lookups():
        if found.count():
            return found.first
    return None
def download_source_file(page, doc):
    """Open a document page in Vault and download its source file.

    Args:
        page: Playwright page inside a logged-in Vault session.
        doc: document record; this function reads "vtmf", "url" and
            "version", and passes the record on to build_target_path().

    Returns:
        Path of the saved file.

    Raises:
        PlaceholderDocument: the page shows the placeholder text element.
        RuntimeError: the Source File control was not found.
    """
    vtmf = doc["vtmf"]
    log(f"[i] Otevírám dokument {vtmf} ({doc.get('version', '')}) ...")
    page.goto(doc["url"], wait_until="domcontentloaded")
    try:
        page.wait_for_load_state("networkidle", timeout=30000)
    except PWTimeout:
        log("[!] networkidle nenastal do 30 s, zkouším pokračovat...")
    dismiss_maintenance_popup(page, timeout=2000)
    # a placeholder has no file to download -> signal it to the caller
    ph = page.locator("div.vv_placeholder_text")
    if ph.count() and ph.first.is_visible():
        log(f"[i] {vtmf}: placeholder bez obsahu — přeskakuji.")
        raise PlaceholderDocument(vtmf)
    target = find_source_file_button(page)
    if target is None:
        raise RuntimeError(
            f"Nenašel jsem ikonu 'Source File' na stránce dokumentu {vtmf}.")
    log("[i] Klikám na Source File a čekám na download...")
    with page.expect_download(timeout=60000) as dl_info:
        target.click()
        # Variant with a dropdown (Source File + Viewable Rendition):
        # pick the menu item if one shows up; errors here are ignored
        # because the download may already have started with the click.
        try:
            item = page.get_by_role("menuitem",
                                    name=re.compile("Source File", re.I))
            if item.count() and item.first.is_visible():
                log("[i] Otevřel se dropdown, vybírám 'Source File'...")
                item.first.click()
        except Exception:
            pass
    download = dl_info.value
    dest = build_target_path(doc, download.suggested_filename)
    dest.parent.mkdir(parents=True, exist_ok=True)
    download.save_as(str(dest))
    return dest
def download_missing(page, coll):
    """Download every non-deleted document that is not marked downloaded.

    The outcome of each document is written to Mongo immediately. Each
    document gets up to MAX_ATTEMPTS tries. After a successful download
    the file is uploaded to SeaweedFS; an upload failure is logged and
    counted but does not fail the download. The SHA-256 of the file is
    computed before the upload, so it is stored even when the upload
    fails.

    Returns:
        (ok_count, fail_count, placeholder_count,
         sw_uploaded, sw_dedup, sw_failed)
    """
    todo = list(coll.find({"deleted": False, "downloaded": {"$ne": True}})
                .sort([("vtmf", ASCENDING), ("version", ASCENDING)]))
    if LIMIT:
        todo = todo[:LIMIT]
    log(f"\n[i] Ke stažení: {len(todo)} dokumentů"
        + (f" (LIMIT={LIMIT})" if LIMIT else ""))
    ok_count, fail_count, placeholder_count = 0, 0, 0
    sw_uploaded = sw_dedup = sw_failed = 0
    for n, doc in enumerate(todo, 1):
        key = doc["_id"]
        log(f"\n--- [{n}/{len(todo)}] {key} | {doc['desc'][:70]}")
        last_err = None
        for attempt in range(1, MAX_ATTEMPTS + 1):
            try:
                dest = download_source_file(page, doc)
                # SeaweedFS upload (does not block on failure)
                sw_path = sw_url = sw_ts = sha256_hex = None
                try:
                    data = dest.read_bytes()
                    # hash first: the digest stays known if the upload fails
                    sha256_hex = hashlib.sha256(data).hexdigest()
                    size_kb = len(data) / 1024
                    size_str = (f"{size_kb:.0f} KB" if size_kb < 1024
                                else f"{size_kb / 1024:.1f} MB")
                    ext = dest.suffix.lstrip('.').upper()
                    log(f"[ok] Stazeno: {dest.name} ({size_str} {ext})")
                    mime = (mimetypes.guess_type(dest.name)[0]
                            or "application/octet-stream")
                    sw_path, sw_url, uploaded = seaweed_store(data, mime)
                    sw_ts = datetime.now()
                    if uploaded:
                        sw_uploaded += 1
                        log(f"[ok] SeaweedFS: nahrano ({size_str}) -> {sw_path}")
                    else:
                        sw_dedup += 1
                        log(f"[i] SeaweedFS: dedup hit ({size_str}) -> {sw_path}")
                except Exception as sw_err:
                    sw_failed += 1
                    log(f"[!] SeaweedFS upload selhal (soubor je na disku): {sw_err}")
                coll.update_one({"_id": key}, {"$set": {
                    "downloaded": True, "file": str(dest),
                    "downloaded_at": datetime.now(),
                    "sha256": sha256_hex,
                    "seaweed_path": sw_path,
                    "seaweed_url": sw_url,
                    "seaweed_synced_at": sw_ts,
                    "last_error": None}})
                ok_count += 1
                last_err = None
                break
            except PlaceholderDocument:
                # nothing to download; mark as done so it is not retried
                coll.update_one({"_id": key}, {"$set": {
                    "downloaded": True, "placeholder": True,
                    "file": None, "downloaded_at": datetime.now(),
                    "last_error": None}})
                placeholder_count += 1
                last_err = None
                break
            except Exception as e:
                last_err = e
                log(f"[!] Pokus {attempt}/{MAX_ATTEMPTS} selhal: {e}")
                if attempt < MAX_ATTEMPTS:
                    page.wait_for_timeout(RETRY_PAUSE_MS)
        if last_err is not None:
            coll.update_one({"_id": key}, {"$set": {
                "last_error": str(last_err),
                "error_at": datetime.now()}})
            fail_count += 1
        page.wait_for_timeout(BETWEEN_DOCS_MS)
    return ok_count, fail_count, placeholder_count, sw_uploaded, sw_dedup, sw_failed
# --- Main --------------------------------------------------------------
def main():
    """Run the whole pipeline: login, report export, Mongo sync, downloads.

    Exit codes: 0 = all fine, 1 = some documents failed (or credentials
    are missing, via ensure_credentials), 2 = the pipeline itself failed.
    The browser context is always closed at the end, even when the final
    Mongo summary cannot be read.
    """
    ensure_credentials()
    coll = get_collection()
    log(f"[ok] Mongo připojeno: {MONGO_URI} / {MONGO_DB}.{MONGO_COLL}")
    with sync_playwright() as p:
        ctx = p.chromium.launch_persistent_context(
            user_data_dir=str(PROFILE_DIR),
            headless=False,
            accept_downloads=True,
            no_viewport=True,  # the window behaves natively
            args=["--start-maximized"],
        )
        page = ctx.pages[0] if ctx.pages else ctx.new_page()
        ok_count = fail_count = placeholder_count = 0
        sw_uploaded = sw_dedup = sw_failed = 0
        pipeline_error = None
        try:
            # 1) login
            login_if_needed(page)
            verify_inside(page)
            dismiss_maintenance_popup(page)
            # 2) report export
            report_path = download_report(page)
            # 3) parse + sync into Mongo
            docs = read_documents_from_excel(report_path)
            if not docs:
                raise RuntimeError("Report neobsahuje žádné dokumenty — "
                                   "sync přeskočen, nic se nemaže.")
            sync_report_to_mongo(coll, docs)
            migrate_old_csv(coll)
            archive_report(report_path)
            # 4) download what is missing
            DOWNLOAD_ROOT.mkdir(parents=True, exist_ok=True)
            (ok_count, fail_count, placeholder_count,
             sw_uploaded, sw_dedup, sw_failed) = download_missing(page, coll)
        except KeyboardInterrupt:
            log("\n[!] Přerušeno uživatelem — stav je v Mongo, příští běh naváže.")
        except Exception as e:
            pipeline_error = e
            print("\n" + "=" * 60)
            print(" PIPELINE SELHALA!")
            print(f" {type(e).__name__}: {e}")
            print("=" * 60)
        finally:
            try:
                # the run summary needs no database access
                sw_info = (f"SeaweedFS: {sw_uploaded} nových, {sw_dedup} dedup"
                           + (f", {sw_failed} chyb uploadu" if sw_failed else ""))
                log(f"\n[i] Výsledek běhu: {ok_count} staženo, "
                    f"{placeholder_count} placeholderů přeskočeno, {fail_count} chyb"
                    + (f", PIPELINE SELHALA ({pipeline_error})" if pipeline_error else ".")
                    + (f"\n[i] {sw_info}" if ok_count else ""))
                try:
                    total = coll.count_documents({})
                    have = coll.count_documents({"deleted": False, "downloaded": True})
                    active = coll.count_documents({"deleted": False})
                    log(f"[i] Mongo: {total} záznamů celkem, {active} aktivních, "
                        f"z toho staženo {have} ({active - have} zbývá).")
                except Exception as stats_err:
                    # a failing summary query must not prevent the browser
                    # from closing nor hide the real exit code
                    log(f"[!] Souhrn z Mongo se nepodařilo načíst: {stats_err}")
            finally:
                log("[i] Zavírám prohlížeč.")
                ctx.close()
    sys.exit(2 if pipeline_error else (1 if fail_count else 0))
if __name__ == "__main__":
main()
+215
View File
@@ -0,0 +1,215 @@
# ============================================================
# migrate_to_v16.py
# Verze: 1.1
# Datum: 2026-06-15
# Popis: Jednorázová migrace stávajících STUDY-level dat
# (nasbíraných pipeline v1.3–v1.5) na schéma v1.6.
#
# v1.6 ukládá dokumenty JEN do SeaweedFS (žádný Dropbox),
# klíč = číslo dokumentu + verze. Dvě fáze:
#
# [mongo] Re-parse NEJNOVĚJŠÍHO archivovaného study reportu
# (WhatToDownload/Zpracovano/*Study Level*.xlsx)
# v1.6 parserem a obohacení existujících dokumentů
# o nová pole (level, levels[], scopes[], studies[],
# countries=[], sites=[], classification,
# process_name, external_system_name, created_by,
# last_modified_by, version_created_by).
# NESAHÁ na download stav (downloaded, sha256,
# seaweed_*, history, first_seen).
#
# [seaweed] Překlíčování SeaweedFS ze starých SHA cest na nové
# /vtmf-documents/<vtmf>/<verze>.<přípona>. Zdroj
# bajtů = stávající soubor na disku (pole file), jako
# fallback GET ze staré SHA cesty. Po úspěchu: oprava
# seaweed_path/url + sha256 v Mongo, smazání staré SHA
# cesty a ODEBRÁNÍ pole file z Mongo (Dropbox se už
# nepoužívá; fyzické soubory v Dropboxu pak můžeš
# smazat ručně).
#
# DEFAULT je DRY-RUN. Ostře až s --apply. Idempotentní.
#
# Použití:
# python migrate_to_v16.py # dry-run, vše
# python migrate_to_v16.py --apply # ostře, vše
# python migrate_to_v16.py --phase mongo --apply
# python migrate_to_v16.py --phase seaweed --apply
# ============================================================
import argparse
import hashlib
import importlib.util
import mimetypes
import re
import urllib.error
import urllib.request
from datetime import datetime
from pathlib import Path
# Directory containing this script; other paths below are resolved relative to it.
SCRIPT_DIR = Path(__file__).resolve().parent
# Source file of the v1.6 pipeline, expected next to this script.
PIPE_FILE = SCRIPT_DIR / "vtmf_pipeline_v1.6.py"
# Old SHA-256 content-addressed path shape (to be removed from SeaweedFS).
OLD_SHA_PATH_RE = re.compile(r"^/vtmf-documents/[0-9a-f]{2}/[0-9a-f]{2}/[0-9a-f]{64}$")
def load_pipeline():
    """Import the pipeline source file PIPE_FILE as a module and return it.

    The file name contains dots, so it cannot be imported with a plain
    ``import`` statement; it is loaded from its path via importlib instead.
    """
    module_spec = importlib.util.spec_from_file_location("vtmf_pipeline_v16", PIPE_FILE)
    pipeline = importlib.util.module_from_spec(module_spec)
    # Executes the module body so its functions and constants become available.
    module_spec.loader.exec_module(pipeline)
    return pipeline
def log(msg):
    """Print *msg* to stdout and flush immediately so output appears in real time."""
    print(msg, flush=True)
def http_get(url):
    """Fetch *url* with a 120 s timeout and return the whole response body as bytes.

    Errors raised by ``urllib.request.urlopen`` propagate to the caller.
    """
    with urllib.request.urlopen(url, timeout=120) as response:
        payload = response.read()
    return payload
def seaweed_delete(url):
    """Send an HTTP DELETE to *url*; return True when the object is gone.

    Best-effort helper: it never raises. An HTTP error response counts as
    success only for status 404 (already absent) or 200/204; any other
    failure (network error, malformed URL, ...) yields False.
    """
    try:
        request = urllib.request.Request(url, method="DELETE")
        # Close the response explicitly so the connection is not leaked
        # when many objects are deleted in one run.
        with urllib.request.urlopen(request, timeout=30):
            pass
        return True
    except urllib.error.HTTPError as e:
        return e.code in (404, 204, 200)
    except Exception:
        # Deliberate best-effort: the caller only counts successful deletions.
        return False
# --- Fáze MONGO --------------------------------------------------------
def phase_mongo(mod, coll, apply):
    """Mongo phase: enrich existing records from the newest archived study report.

    The last file (in sorted name order) matching ``*Study Level*.xlsx`` in
    WhatToDownload/Zpracovano is re-parsed with the pipeline's parser, the
    rows are filtered to ``mod.TARGET_STUDY`` and the report fields are
    written onto records that already exist in *coll*. Records missing from
    Mongo are only counted (the first 10 are logged). Nothing is written
    unless *apply* is true; the counters are reported either way.
    """
    archive_dir = SCRIPT_DIR / "WhatToDownload" / "Zpracovano"
    candidates = sorted(archive_dir.glob("*Study Level*.xlsx"))
    if not candidates:
        log("[!] Nenašel jsem žádný archivovaný study report — fáze mongo přeskočena.")
        return

    latest = candidates[-1]
    log(f"[i] [mongo] Re-parse: {latest.name}")
    parsed = mod.read_documents_from_excel(latest, "study")
    docs = [entry for entry in parsed if mod.TARGET_STUDY in entry["studies"]]
    log(f"[i] [mongo] {len(docs)} dokumentů study-level {mod.TARGET_STUDY}.")

    scope_key = f"study|{mod.TARGET_STUDY}|"
    # Report fields copied verbatim onto the Mongo record.
    copied = ("url", "name", "status", "type", "subtype", "classification",
              "desc", "process_name", "external_system_name", "created_by",
              "last_modified_by", "version_created_by", "date", "studies")
    enriched = 0
    missing = 0
    for entry in docs:
        key = mod.doc_key(entry["vtmf"], entry["version"])
        if coll.find_one({"_id": key}, {"_id": 1}):
            set_fields = {"level": "study"}
            for field in copied:
                set_fields[field] = entry[field]
            # Study-level records carry no country/site scope.
            set_fields["countries"] = []
            set_fields["sites"] = []
            if apply:
                coll.update_one({"_id": key}, {
                    "$set": set_fields,
                    "$addToSet": {"scopes": scope_key, "levels": "study"},
                })
            enriched += 1
        else:
            missing += 1
            if missing <= 10:
                log(f" [!] V Mongo chybí {key} (přeskočeno).")

    tag = "APPLY" if apply else "DRY"
    tail = f", {missing} v Mongo chybělo." if missing else "."
    log(f"[{tag}] [mongo] Obohaceno {enriched} dokumentů" + tail)
# --- Fáze SEAWEED ------------------------------------------------------
def phase_seaweed(mod, coll, apply):
    """SeaweedFS phase: re-key stored objects from old SHA paths to v1.6 paths.

    Candidates are downloaded, non-placeholder records that still have a
    ``file`` field. For each one the bytes are taken from the file on disk,
    falling back to a GET from the old SHA path, and stored through
    ``mod.seaweed_store``. The Mongo record then gets the new
    seaweed_path/url, a recomputed sha256 and a sync timestamp, and its
    ``file`` field is removed.

    The old SHA path is deleted only when no Mongo record references it any
    more. Old paths are content-addressed, so documents with identical bytes
    share one path; deleting it after the first migration would remove the
    fallback source for the others.

    With *apply* false nothing is changed; the planned PUTs are only logged.
    Per-document failures are caught, counted and logged.
    """
    q = {"downloaded": True, "placeholder": {"$ne": True}, "file": {"$ne": None}}
    docs = list(coll.find(q))
    log(f"[i] [seaweed] Kandidátů (s polem file): {len(docs)}")
    uploaded = old_deleted = unset = missing = err = already = 0
    for doc in docs:
        key = doc["_id"]
        src = Path(doc["file"])
        ext = src.suffix
        new_path = mod.seaweed_path(doc["vtmf"], doc["version"], ext)
        old_path = doc.get("seaweed_path")
        old_is_sha = bool(old_path and OLD_SHA_PATH_RE.match(old_path))

        if old_path == new_path:
            # Already on the new path: only drop the leftover ``file`` field.
            already += 1
            if apply:
                coll.update_one({"_id": key}, {"$unset": {"file": ""}})
                unset += 1
            continue

        if not apply:
            note = f" (smazat starou {old_path})" if old_is_sha else ""
            log(f" PUT {new_path}{note} (+ unset file)")
            continue

        try:
            # Byte source: disk first, fallback GET from the old SHA path.
            if src.exists():
                data = src.read_bytes()
            elif old_is_sha:
                data = http_get(mod.SEAWEED_FILER + old_path)
            else:
                missing += 1
                if missing <= 10:
                    log(f" [!] {key}: zdroj nedostupný (soubor i SHA chybí).")
                continue

            mime = mimetypes.guess_type("f" + ext)[0] or "application/octet-stream"
            sw_path, sw_url = mod.seaweed_store(doc["vtmf"], doc["version"], ext, data, mime)
            coll.update_one({"_id": key}, {
                "$set": {"seaweed_path": sw_path, "seaweed_url": sw_url,
                         "sha256": hashlib.sha256(data).hexdigest(),
                         "seaweed_synced_at": datetime.now()},
                "$unset": {"file": ""}})
            uploaded += 1
            unset += 1

            if old_is_sha and old_path != sw_path:
                # This record no longer points at old_path (updated above);
                # keep the blob while any other record still does.
                still_referenced = coll.count_documents(
                    {"seaweed_path": old_path}, limit=1)
                if not still_referenced and seaweed_delete(mod.SEAWEED_FILER + old_path):
                    old_deleted += 1
        except Exception as e:
            err += 1
            log(f" [!] {key}: SeaweedFS selhal: {e}")

    log(f"[{'APPLY' if apply else 'DRY'}] [seaweed] Překlíčováno {uploaded}, "
        f"už na nové cestě {already}, starých SHA smazáno {old_deleted}, "
        f"pole file odebráno {unset}, chybí zdroj {missing}, chyb {err}.")
# --- Main --------------------------------------------------------------
def main():
    """Command-line entry point: parse arguments and run the selected phases.

    ``--phase`` selects mongo, seaweed or all (default); without ``--apply``
    the phases run in dry-run mode.
    """
    parser = argparse.ArgumentParser(description="Migrace VTMF dat na schéma v1.6")
    parser.add_argument("--phase", choices=["mongo", "seaweed", "all"], default="all")
    parser.add_argument("--apply", action="store_true",
                        help="ostrý běh (bez něj jen DRY-RUN)")
    args = parser.parse_args()

    if args.apply:
        mode = "APPLY (ostře)"
    else:
        mode = "DRY-RUN (nic se nemění)"
    log(f"=== Migrace na v1.6 — fáze: {args.phase} — režim: {mode} ===\n")

    mod = load_pipeline()
    _, coll, _ = mod.get_db()
    log(f"[ok] Mongo: {mod.MONGO_URI} / {mod.MONGO_DB}.{mod.MONGO_COLL}\n")

    # Phases always run in this order: mongo first, then seaweed.
    for phase_name, run_phase in (("mongo", phase_mongo), ("seaweed", phase_seaweed)):
        if args.phase in (phase_name, "all"):
            run_phase(mod, coll, args.apply)
            log("")

    if args.apply:
        log("=== Migrace dokončena. ===")
    else:
        log("=== DRY-RUN hotov. Pro ostrý běh přidej --apply. ===")


if __name__ == "__main__":
    main()
+140
View File
@@ -0,0 +1,140 @@
# ============================================================
# seaweed_backfill_v1.1.py
# Verze: 1.1
# Datum: 2026-06-15
# v1.1: retry 3x s 5s pauzou při HTTP 5xx (přechodná chyba serveru)
# Popis: Jednorázový backfill — nahraje do SeaweedFS Filer
# všechny dokumenty z VTMF.documents, které jsou na disku
# (downloaded=True, file!=null) ale ještě nemají seaweed_path.
# Placeholdery a záznamy bez souboru přeskočí.
# Lze spustit opakovaně — HEAD check zajistí dedup,
# přerušení kdykoli naváže příště.
# ============================================================
import hashlib
import mimetypes
import sys
import time
import urllib.error
import urllib.request
from datetime import datetime
from pathlib import Path
from pymongo import MongoClient, ASCENDING
# MongoDB connection: server URI, database name and collection name.
MONGO_URI = "mongodb://192.168.1.76:27017"
MONGO_DB = "VTMF"
MONGO_COLL = "documents"
# SeaweedFS Filer base URL and the path prefix under which objects are stored.
SEAWEED_FILER = "http://192.168.1.50:8888"
SEAWEED_PREFIX = "/vtmf-documents"
def log(msg):
    """Print *msg* to stdout and flush immediately so progress is visible live."""
    print(msg, flush=True)
def sw_path(sha256):
    """Return the content-addressed Filer path for a hex SHA-256 digest.

    The first two and the next two hex characters become directory levels:
    ``<SEAWEED_PREFIX>/ab/cd/<sha256>``.
    """
    level_one = sha256[:2]
    level_two = sha256[2:4]
    return f"{SEAWEED_PREFIX}/{level_one}/{level_two}/{sha256}"
# Retry policy for uploads: number of PUT attempts per document.
MAX_ATTEMPTS = 3
RETRY_PAUSE = 5 # seconds between attempts on HTTP 5xx
def seaweed_store(data, mime="application/octet-stream"):
    """Store *data* in SeaweedFS under its SHA-256 path, skipping existing objects.

    A HEAD request checks whether the object already exists (dedup hit). If
    it returns 404, the data is uploaded with PUT, retrying up to
    MAX_ATTEMPTS times with a RETRY_PAUSE pause on HTTP 5xx responses.

    Returns:
        (path, url, uploaded) — ``uploaded`` is False on a dedup hit.

    Raises:
        urllib.error.HTTPError: HEAD failed with a status other than 404,
            PUT failed with a status below 500, or every PUT attempt
            returned 5xx (the last error is raised).
        Other errors from ``urlopen`` (e.g. connection failures) propagate
        without a retry.
    """
    sha256 = hashlib.sha256(data).hexdigest()
    path = sw_path(sha256)
    url = SEAWEED_FILER + path

    try:
        # Responses are closed explicitly so connections are not leaked
        # across the many documents processed in one run.
        with urllib.request.urlopen(
                urllib.request.Request(url, method="HEAD"), timeout=10):
            pass
        return path, url, False  # dedup hit
    except urllib.error.HTTPError as e:
        if e.code != 404:
            raise

    last_err = None
    for attempt in range(1, MAX_ATTEMPTS + 1):
        try:
            with urllib.request.urlopen(
                    urllib.request.Request(url, data=data, method="PUT",
                                           headers={"Content-Type": mime}),
                    timeout=120):
                pass
            return path, url, True
        except urllib.error.HTTPError as e:
            if e.code < 500:
                raise  # 4xx — retrying makes no sense
            last_err = e
            if attempt < MAX_ATTEMPTS:
                log(f" [!] HTTP {e.code} (pokus {attempt}/{MAX_ATTEMPTS}), čekám {RETRY_PAUSE}s...")
                time.sleep(RETRY_PAUSE)
    raise last_err
def main():
    """Upload every downloaded document that has no seaweed_path yet.

    Selects records with downloaded=True, a non-null ``file``, no
    ``seaweed_path`` and not flagged as placeholder, ordered by vtmf and
    version. Each file is read from disk, stored via seaweed_store() and
    the sha256/seaweed_* fields are written back to Mongo. Records whose
    file is missing on disk are skipped; per-document errors are counted.
    Exits with status 1 when any document failed, otherwise 0.
    """
    client = MongoClient(MONGO_URI, serverSelectionTimeoutMS=5000)
    client.admin.command("ping")
    coll = client[MONGO_DB][MONGO_COLL]
    log(f"[ok] Mongo připojeno: {MONGO_URI} / {MONGO_DB}.{MONGO_COLL}")

    pending_filter = {
        "downloaded": True,
        "placeholder": {"$ne": True},
        "seaweed_path": None,
        "file": {"$ne": None},
    }
    ordering = [("vtmf", ASCENDING), ("version", ASCENDING)]
    todo = list(coll.find(pending_filter).sort(ordering))
    total = len(todo)
    log(f"[i] Ke zpracování: {total} dokumentů\n")

    uploaded = 0
    dedup = 0
    skipped = 0
    failed = 0
    for n, doc in enumerate(todo, 1):
        key = doc["_id"]
        path = doc.get("file")
        if not (path and Path(path).exists()):
            log(f"[{n}/{total}] {key} [!] Soubor nenalezen na disku — přeskočeno.")
            skipped += 1
            continue
        try:
            local = Path(path)
            data = local.read_bytes()
            size_kb = len(data) / 1024
            if size_kb < 1024:
                size_str = f"{size_kb:.0f} KB"
            else:
                size_str = f"{size_kb / 1024:.1f} MB"
            kind = local.suffix.lstrip('.').upper()
            log(f"[{n}/{total}] {key} ({size_str} {kind}) {doc.get('desc', '')[:60]}")

            mime = mimetypes.guess_type(path)[0] or "application/octet-stream"
            sha256_hex = hashlib.sha256(data).hexdigest()
            sw_p, sw_url, was_new = seaweed_store(data, mime)
            fields = {
                "sha256": sha256_hex,
                "seaweed_path": sw_p,
                "seaweed_url": sw_url,
                "seaweed_synced_at": datetime.now(),
            }
            coll.update_one({"_id": key}, {"$set": fields})
            if not was_new:
                dedup += 1
                log(f" [i] Dedup hit ({size_str}) → {sw_p}")
            else:
                uploaded += 1
                log(f" [ok] Nahráno ({size_str}) → {sw_p}")
        except Exception as e:
            failed += 1
            log(f" [!] Chyba: {e}")

    rule = '=' * 60
    log(f"\n{rule}")
    log(f" Hotovo: {uploaded} nahráno, {dedup} dedup, "
        f"{skipped} bez souboru, {failed} chyb.")
    log(f"{rule}")
    sys.exit(1 if failed else 0)


if __name__ == "__main__":
    main()
+46
View File
@@ -0,0 +1,46 @@
"""Rychlý test SeaweedFS Filer (port 8888) — PUT / HEAD / GET / DELETE."""
import hashlib
import urllib.error
import urllib.request
# Base URL of the SeaweedFS Filer under test.
FILER = "http://192.168.1.50:8888"
# Test payload; its SHA-256 prefix names the temporary object.
PAYLOAD = b"SeaweedFS VTMF test " + b"x" * 1000
SHA256 = hashlib.sha256(PAYLOAD).hexdigest()
# The object lives under a dedicated _test directory of the documents prefix.
PATH = f"/vtmf-documents/_test/{SHA256[:8]}"
URL = FILER + PATH
def req(method, data=None):
    """Send one HTTP request with *method* to URL; return ``(status, body)``.

    A Content-Type header (text/plain) is sent only when *data* is truthy.
    An HTTP error response is returned as ``(code, b"")`` instead of raising.
    """
    if data:
        headers = {"Content-Type": "text/plain"}
    else:
        headers = {}
    request = urllib.request.Request(URL, method=method, data=data, headers=headers)
    try:
        with urllib.request.urlopen(request, timeout=10) as resp:
            return resp.status, resp.read()
    except urllib.error.HTTPError as http_err:
        return http_err.code, b""
# Smoke-test sequence: PUT -> HEAD -> GET -> DELETE -> HEAD against one object.
# Each step asserts the expected status, so the script stops at the first failure.
print(f"Filer: {FILER}")
print(f"Path: {PATH}\n")
# 1) Upload the payload.
status, _ = req("PUT", PAYLOAD)
assert status in (200, 201), f"PUT selhal: {status}"
print(f"[ok] PUT → {status}")
# 2) The object must now exist.
status, _ = req("HEAD")
assert status == 200, f"HEAD selhal: {status}"
print(f"[ok] HEAD → {status}")
# 3) Read it back and compare byte-for-byte with what was uploaded.
status, body = req("GET")
assert status == 200 and body == PAYLOAD, f"GET selhal: {status}, délka={len(body)}"
print(f"[ok] GET → {status}, {len(body)} B")
# 4) Remove the test object.
status, _ = req("DELETE")
assert status in (200, 204), f"DELETE selhal: {status}"
print(f"[ok] DELETE → {status}")
# 5) A HEAD after the delete must report 404.
status, _ = req("HEAD")
assert status == 404, f"Po DELETE HEAD vrátil {status}, čekal 404"
print(f"[ok] HEAD po DELETE → 404 (soubor odstraněn)\n")
print("SeaweedFS Filer OK.")
+112
View File
@@ -0,0 +1,112 @@
# vtmf_pipeline_v1.4 — Kompletní V-TMF workflow (report → Mongo → download)
**Verze:** 1.4 · **Datum:** 2026-06-15
**Změny v1.1:** oprava tichého selhání — výjimka kteréhokoli kroku se
vypíše jako „PIPELINE SELHALA" + exit kód 2 (v1.0 končila zavádějícím
souhrnem „0 staženo, 0 chyb"). Export reportu robustnější: menu ⋯,
položka Export to Excel i tlačítko Export se hledají přes víc selektorů
a ve všech frames; při nenalezení se automaticky uloží diagnostika
stránky do debug/<čas>_report_* (screenshot, HTML všech frames, výpis
title/aria-label atributů) — z ní se dá určit přesný selektor.
**Změny v1.2:** selektory exportu ověřené na živém DOM (Claude in
Chrome; žádný iframe na celé stránce): menu ⋯ =
`.actionMenuContainer .dropDown.vv_dropdown_toggle button.vv-icon-button`
(button má prázdný title!); menu se načítá asynchronně (AJAX) →
po kliknutí se čeká na položku `a.ReportAction[data-action-name='ExcelExport']`;
„Data Only" = radio `name=requiredRadioField value=STANDARD`, defaultně
checked (pojistka přes .check()); tlačítko Export = React `<button>`
s emotion class hash → selektovat jen přes roli+text.
**Změny v1.3:** na konci běhu se prohlížeč i konzole zavřou
automaticky (žádné čekání na ENTER); interaktivní vstup zůstává jen
u 2FA a u ručně nezavřitelného dialogu.
**Změny v1.4:** detekce placeholder dokumentů — Vault zobrazuje text
„This placeholder has no content", dokument nemá žádný Source File ke
stažení. Při detekci se zapíše `placeholder=True, downloaded=True` do
Mongo a dokument se přeskočí bez chyby. Souhrn na konci běhu uvádí
počet placeholderů zvlášť.
Jeden běh skriptu udělá celé workflow pro studii 77242113UCO3001:
1. **Login** do vtmf.veevavault.com (persistentní profil
`vault_profile/`, J&J SSO, případné 2FA potvrdíte na telefonu
+ ENTER; údaje z `.env` v rootu projektu).
2. **Export reportu** „Document Inventory Report - Study Level"
(přímá URL s ID reportu `0RP000000000182` a filtrem studie
`0ST000000137008`) → menu ⋯ → Export to Excel → Data Only →
uloží se s timestampem do `WhatToDownload/`, po zpracování se
přesune do `WhatToDownload/Zpracovano/`.
3. **Parse + sync do MongoDB** — Tower `mongodb://192.168.1.76:27017`,
db **VTMF**, kolekce **documents**, klíč `_id = "VTMF-xxx|vY.Z"`
(VTMF číslo + verze, unikátní index na dvojici):
- nový dokument → založí se (first_seen, deleted=False,
downloaded=False),
- změna sledovaných polí (name, status, type, subtype, desc,
date, url, studies) → promítne se + záznam do `history[]`
(timestamp + old/new),
- dokument chybí v reportu → `deleted=True, deleted_at` a stažený
soubor se přejmenuje s ` [D]` před příponou,
- dokument se vrátí do reportu → `deleted=False` a ` [D]`
se ze souboru zase odebere.
Výsledná sada = záznamy s `deleted=False`.
4. **Stažení chybějících** — všechny `deleted=False, downloaded≠True`:
doc URL → Source File → uložení do
`U:\Dropbox\!!!Days\Downloads Z230\VTMF-77242113UCO3001\<Type>\<Subtype>\`
jako `YYYY-MM-DD Description [VTMF-xxx] [vY.Z].<skutečná přípona>`.
Výsledek (cesta, čas, případně chyba) se ihned zapisuje do Mongo —
běh jde kdykoli přerušit a příště naváže.
Placeholder dokumenty (stránka s textem „This placeholder has no
content") se přeskočí a označí `placeholder=True, downloaded=True`.
## Mongo schéma (kolekce documents)
```
_id: "VTMF-19077748|v1.0"
vtmf, version, url, name, status, type, subtype, desc, date, studies
first_seen, last_seen # kdy poprvé/naposledy v reportu
deleted, deleted_at # není ve výsledné sadě reportu
downloaded, file, downloaded_at
placeholder # True = Vault placeholder bez obsahu
last_error, error_at # poslední chyba stahování
history: [{ts, changes: {pole: {old, new}}}]
```
## Migrace starého stavu
Při prvním běhu se `download_state.csv` (z download_vault v2.x)
jednorázově namigruje: záznamy `ok` se k odpovídajícímu VTMF zapíší
jako `downloaded=True` + cesta. CSV se přejmenuje na
`download_state.csv.imported`.
## Konfigurace (konstanty nahoře)
- `REPORT_URL` — ID reportu + filtr studie (pro jinou studii se mění
jen tato dvě ID)
- `LIMIT` — None = stáhnout vše zbývající; číslo = dávka na běh
- `MONGO_URI/DB/COLL`, `DOWNLOAD_ROOT`, `EXCEL_DIR`
- `TRACKED_FIELDS`, `MAX_ATTEMPTS`, `RETRY_PAUSE_MS`, `BETWEEN_DOCS_MS`
## Ověřené technické detaily (nesahat bez ověření)
- Maintenance dialog: zavírat POUZE přes `.ui-dialog a.ok.vv_button`
(křížek `.ui-dialog-titlebar-close` je display:none); objevuje se
se zpožděním → wait_for visible 8 s (home) / 2-4 s (jinde).
- Report Excel má rozbité deklarované rozměry → přímá iterace řádků.
- Document Name/Number/Status jsou =HYPERLINK vzorce → regex.
- Export kliknout právě jednou; 503/redirecty v network logu
ignorovat, rozhoduje expect_download.
- Placeholder detekce: `page.locator("div.vv_placeholder_text")` (uvnitř
`div.vv_placeholder_pane > div.vv_placeholder_container > div.vv-placeholder-drag-and-drop-container`)
se testuje před hledáním Source File ikony — CSS selektor je spolehlivější
než text match.
## Spuštění
```powershell
& "U:\PythonProject\Janssen\.venv\Scripts\python.exe" "U:\PythonProject\Janssen\VTMFDownloadFiles\vtmf_pipeline_v1.4.py"
```
Předchůdce: vtmf_pipeline_v1.3 (TRASH/).
+96
View File
@@ -0,0 +1,96 @@
# vtmf_pipeline_v1.5 — Kompletní V-TMF workflow (report → Mongo → download → SeaweedFS)
**Verze:** 1.5 · **Datum:** 2026-06-15
**Změny v1.5:** upload každého staženého dokumentu do SeaweedFS Filer
(`192.168.1.50:8888`, cesta `/vtmf-documents/ab/cd/<sha256>`).
SHA-256 content-addressed dedup — identický soubor se uloží jen jednou
(HEAD check → 404 → PUT; při 200 dedup hit). Chyba uploadu neblokuje
download ani zápis do Mongo — soubor zůstane na disku a pole
`sha256/seaweed_path/seaweed_url/seaweed_synced_at` zůstanou `null`
(lze doplnit backfillem). Souhrn na konci uvádí počet nově nahraných,
dedup hitů a případných chyb uploadu zvlášť.
_(Předchozí změny viz TRASH/vtmf_pipeline_v1.4.md)_
Jeden běh skriptu udělá celé workflow pro studii 77242113UCO3001:
1. **Login** do vtmf.veevavault.com (persistentní profil
`vault_profile/`, J&J SSO, případné 2FA potvrdíte na telefonu
+ ENTER; údaje z `.env` v rootu projektu).
2. **Export reportu** „Document Inventory Report - Study Level"
(přímá URL s ID reportu `0RP000000000182` a filtrem studie
`0ST000000137008`) → menu ⋯ → Export to Excel → Data Only →
uloží se s timestampem do `WhatToDownload/`, po zpracování se
přesune do `WhatToDownload/Zpracovano/`.
3. **Parse + sync do MongoDB** — Tower `mongodb://192.168.1.76:27017`,
db **VTMF**, kolekce **documents**, klíč `_id = "VTMF-xxx|vY.Z"`:
- nové dokumenty se založí,
- změny sledovaných polí se promítnou (+ `history[]`),
- dokumenty chybějící v reportu se označí `deleted=True`
a stažený soubor dostane ` [D]` před příponou,
- znovuobjevené se vzkřísí a ` [D]` se odebere.
4. **Stažení + SeaweedFS upload** — všechny `deleted=False, downloaded≠True`:
- Source File se uloží do
`U:\Dropbox\!!!Days\Downloads Z230\VTMF-77242113UCO3001\<Type>\<Subtype>\`
jako `YYYY-MM-DD Description [VTMF-xxx] [vY.Z].<přípona>`,
- soubor se přečte z disku, vypočítá se SHA-256, obsah se nahraje
do SeaweedFS na `/vtmf-documents/{sha256[:2]}/{sha256[2:4]}/{sha256}`,
- do Mongo se zapíše `downloaded=True, file, sha256, seaweed_path,
seaweed_url, seaweed_synced_at`; chyba SeaweedFS tato pole
nechá `null`, ale `downloaded=True` se zapíše (soubor je na disku).
- Placeholder dokumenty (`div.vv_placeholder_text` viditelný) se
přeskočí s `placeholder=True, downloaded=True`.
## Mongo schéma (kolekce documents)
```
_id: "VTMF-19077748|v1.0"
vtmf, version, url, name, status, type, subtype, desc, date, studies
first_seen, last_seen # kdy poprvé/naposledy v reportu
deleted, deleted_at # není ve výsledné sadě reportu
downloaded, file, downloaded_at
placeholder # True = Vault placeholder bez obsahu
sha256 # hex SHA-256 staženého souboru
seaweed_path # /vtmf-documents/ab/cd/<sha256>
seaweed_url # http://192.168.1.50:8888/vtmf-documents/...
seaweed_synced_at # kdy nahráno / null při chybě
last_error, error_at # poslední chyba stahování
history: [{ts, changes: {pole: {old, new}}}]
```
## SeaweedFS detaily
- **Filer**: `http://192.168.1.50:8888` (přímý PUT, žádný master assign)
- **Dedup**: HEAD → 404 → PUT; HEAD → 200 → dedup hit (vrátí `uploaded=False`)
- **Timeout**: HEAD 10 s, PUT 120 s (velké soubory)
- **MIME**: `mimetypes.guess_type()`, fallback `application/octet-stream`
- **Backfill**: dokumenty s `downloaded=True, seaweed_path=null` lze
dohnat samostatným skriptem (čte `file` z Mongo, nahraje, zapíše pole)
## Konfigurace (konstanty nahoře)
- `SEAWEED_FILER` — URL Filer serveru
- `SEAWEED_PREFIX` — prefix cesty (`/vtmf-documents`)
- `REPORT_URL` — ID reportu + filtr studie
- `LIMIT` — None = vše; číslo = dávka
- `MONGO_URI/DB/COLL`, `DOWNLOAD_ROOT`, `EXCEL_DIR`
- `TRACKED_FIELDS`, `MAX_ATTEMPTS`, `RETRY_PAUSE_MS`, `BETWEEN_DOCS_MS`
## Ověřené technické detaily (nesahat bez ověření)
- Maintenance dialog: zavírat POUZE přes `.ui-dialog a.ok.vv_button`
(křížek `.ui-dialog-titlebar-close` je display:none).
- Report Excel má rozbité deklarované rozměry → přímá iterace řádků.
- Document Name/Number/Status jsou =HYPERLINK vzorce → regex.
- Export kliknout právě jednou; rozhoduje `expect_download`.
- Placeholder detekce: `div.vv_placeholder_text` (uvnitř
`div.vv_placeholder_pane > div.vv_placeholder_container`).
## Spuštění
```powershell
& "U:\PythonProject\Janssen\.venv\Scripts\python.exe" "U:\PythonProject\Janssen\VTMFDownloadFiles\vtmf_pipeline_v1.5.py"
```
Předchůdce: vtmf_pipeline_v1.4 (TRASH/).
+937
View File
@@ -0,0 +1,937 @@
# ============================================================
# vtmf_pipeline_v1.5.py
# Verze: 1.5
# Datum: 2026-06-15
# Popis: Kompletní workflow V-TMF (J&J Veeva Vault), studie
# 77242113UCO3001. Jeden běh udělá:
# 1) login do Vaultu (persistentní session + ruční 2FA),
# 2) export reportu "Document Inventory Report - Study
# Level" do Excelu (Data Only) do WhatToDownload/,
# 3) parse reportu a synchronizaci do MongoDB
# (Tower, db VTMF, kolekce documents,
# klíč = VTMF číslo + verze):
# - nové dokumenty se založí,
# - změny polí se promítnou (+ history[]),
# - dokumenty chybějící v reportu se označí
# deleted=True a stažený soubor dostane ' [D]',
# - znovuobjevené se vzkřísí a ' [D]' se odebere,
# 4) stažení všech dosud nestažených dokumentů do
# U:\Dropbox\!!!Days\Downloads Z230\VTMF-77242113UCO3001\
# <Type>\<Subtype>\"YYYY-MM-DD Description
# [VTMF-x] [v1.0].<přípona>" + zápis stavu do Mongo.
#
# Tracking stahování je KOMPLETNĚ v Mongo; starý
# download_state.csv se při prvním běhu jednorázově
# namigruje a přejmenuje na .imported.
#
# Vychází z download_vault_v2.1 (v TRASH/) — login, dialogy
# a stahování beze změny; nové jsou kroky 2 a 3.
#
# v1.1: oprava tichého selhání — chyba kteréhokoli kroku se teď
# hlasitě vypíše (a exit kód 2), místo aby běh skončil
# souhrnem "0 staženo, 0 chyb". Export reportu: více
# selektorů pro menu ⋯ i položku Export to Excel (včetně
# hledání ve všech frames) a při selhání automatický záchyt
# diagnostiky stránky do debug/ (screenshot + HTML frames).
# v1.2: selektory exportu OVĚŘENÉ na živém DOM (žádný iframe):
# menu ⋯ = .actionMenuContainer .dropDown.vv_dropdown_toggle
# button.vv-icon-button (title prázdný!); menu se načítá
# asynchronně -> čekat na položku; položka =
# a.ReportAction[data-action-name='ExcelExport']; Data Only =
# radio name=requiredRadioField value=STANDARD (default
# checked); Export = <button> role+text (emotion class hash,
# neselektovat podle tříd).
# v1.3: na konci běhu se prohlížeč i okno zavře automaticky
# (žádné čekání na ENTER) — vhodné pro bezobslužné běhy.
# Interaktivní vstupy zůstávají jen tam, kde jsou nutné
# (2FA, ručně nezavřitelný dialog).
# v1.4: detekce placeholder dokumentů — stránka s textem
# "This placeholder has no content" se přeskočí
# (placeholder=True, downloaded=True v Mongo), žádná chyba.
# v1.5: upload stažených dokumentů do SeaweedFS Filer
# (192.168.1.50:8888, cesta /vtmf-documents/ab/cd/<sha256>).
# SHA-256 content-addressed dedup — identický soubor se uloží
# jen jednou. Chyba uploadu neblokuje download; chybějící
# sha256/seaweed_path lze doplnit backfillem. Mongo nově ukládá:
# sha256, seaweed_path, seaweed_url, seaweed_synced_at.
# Souhrn běhu uvádí počet nově nahraných vs. dedup hitů.
#
# Heslo se NIKDY nedává natvrdo do skriptu — čte se z .env
# v rootu projektu Janssen (VAULT_USER / VAULT_PASS).
# ============================================================
import csv
import hashlib
import mimetypes
import os
import re
import sys
import urllib.error
import urllib.request
from datetime import datetime
from pathlib import Path
from playwright.sync_api import sync_playwright, TimeoutError as PWTimeout
from pymongo import MongoClient, ASCENDING
# --- Configuration -----------------------------------------------------
# SSO entry point that redirects into the Vault after login.
LOGIN_URL = ("https://fedlogin.jnj.com/idp/eyJ2c2lkIjoiam5qX3ZlZXZhIn0/"
"startSSO.ping?PartnerSpId=janssenetmf.veevavault.com"
"&IdpAdapterId=CompIWALDAPEXTFORM"
"&TargetResource=https%3A%2F%2Fvtmf.veevavault.com%2F")
# Report "Document Inventory Report - Study Level", filtered to the study
REPORT_URL = ("https://vtmf.veevavault.com/ui/#reporting/viewer/"
"0RP000000000182?study__v%2C%2C%2CIN=0ST000000137008")
VAULT_UI_PATTERN = "**vtmf.veevavault.com/ui**" # successful entry into the Vault
SCRIPT_DIR = Path(__file__).resolve().parent
PROFILE_DIR = SCRIPT_DIR / "vault_profile" # persistent browser session
ENV_FILE = SCRIPT_DIR.parent / ".env" # root of the Janssen project
DEBUG_DIR = SCRIPT_DIR / "debug" # diagnostic output
EXCEL_DIR = SCRIPT_DIR / "WhatToDownload" # downloaded reports
PROCESSED_DIR = EXCEL_DIR / "Zpracovano" # archive of processed reports
OLD_STATE_FILE = SCRIPT_DIR / "download_state.csv" # legacy CSV (migration)
# Root folder for downloaded documents.
DOWNLOAD_ROOT = Path(r"U:\Dropbox\!!!Days\Downloads Z230\VTMF-77242113UCO3001")
MONGO_URI = "mongodb://192.168.1.76:27017"
MONGO_DB = "VTMF"
MONGO_COLL = "documents"
# How many documents to download in this run (None = all remaining)
# NOTE(review): the value here is 0, not None — confirm how the download
# step interprets 0 (it is not visible in this part of the file).
LIMIT = 0
# Report fields whose changes are applied and versioned into history[]
TRACKED_FIELDS = ("name", "status", "type", "subtype", "desc",
"date", "url", "studies")
MAX_ATTEMPTS = 2 # attempts per document
RETRY_PAUSE_MS = 5000 # pause before a retry
BETWEEN_DOCS_MS = 500 # pause between documents
# SeaweedFS Filer base URL and path prefix for stored documents.
SEAWEED_FILER = "http://192.168.1.50:8888"
SEAWEED_PREFIX = "/vtmf-documents"
class PlaceholderDocument(Exception):
    """The document exists only as a placeholder ("This placeholder has no content")."""
def log(msg):
    """Print *msg* to stdout and flush immediately so progress is visible live."""
    print(msg, flush=True)
def load_env_file(path):
    """Load KEY=VALUE lines from a .env file into ``os.environ``.

    Blank lines, ``#`` comments and lines without ``=`` are ignored.
    Surrounding whitespace and quotes are stripped from the value. Entries
    with an empty value are skipped, and variables already present in the
    environment take precedence — the file never overrides them. A missing
    file is only logged.
    """
    if not path.exists():
        log(f"[!] .env nenalezen: {path}")
        return
    content = path.read_text(encoding="utf-8")
    for raw_line in content.splitlines():
        entry = raw_line.strip()
        if entry == "" or entry.startswith("#") or "=" not in entry:
            continue
        name, _, raw_value = entry.partition("=")
        name = name.strip()
        value = raw_value.strip().strip('"').strip("'")
        if not value:
            continue
        if name in os.environ:
            continue
        os.environ[name] = value
# Comment line written into .env above the credential template lines.
ENV_SECTION_HEADER = "# --- Veeva Vault (J&J V-TMF) — VTMFDownloadFiles/download_vault ---"
# Environment variables that must be set for the Vault login.
ENV_KEYS = ("VAULT_USER", "VAULT_PASS")
def ensure_credentials():
    """Make sure VAULT_USER and VAULT_PASS are available, or stop the script.

    Loads the .env file first. If either variable is still missing, a
    template is created in (or appended to) .env with the missing ``KEY=``
    lines, the user is told to fill them in, and the script exits with
    status 1.
    """
    load_env_file(ENV_FILE)
    if all(os.environ.get(name) for name in ENV_KEYS):
        return

    if ENV_FILE.exists():
        existing = ENV_FILE.read_text(encoding="utf-8")
    else:
        existing = ""
    # Only keys that have no "KEY=" line in the file yet get a template line.
    missing_lines = []
    for k in ENV_KEYS:
        if not re.search(rf"^\s*{k}\s*=", existing, re.M):
            missing_lines.append(f"{k}=")
    template_body = "\n".join(missing_lines) + "\n"

    if not ENV_FILE.exists():
        ENV_FILE.write_text(
            "# .env — lokální přihlašovací údaje (NEVERZOVAT, je v .gitignore)\n\n"
            + ENV_SECTION_HEADER + "\n"
            + template_body,
            encoding="utf-8")
        log(f"[i] Založil jsem nový .env: {ENV_FILE}")
    elif missing_lines:
        with open(ENV_FILE, "a", encoding="utf-8") as env_out:
            env_out.write("\n" + ENV_SECTION_HEADER + "\n" + template_body)
        log(f"[i] Doplnil jsem chybějící řádky do .env: {ENV_FILE}")

    banner = (
        "\n" + "=" * 60,
        " CHYBÍ PŘIHLAŠOVACÍ ÚDAJE.",
        " Doplň VAULT_USER a VAULT_PASS do souboru:",
        f" {ENV_FILE}",
        " a spusť skript znovu.",
        "=" * 60,
    )
    for banner_line in banner:
        print(banner_line)
    sys.exit(1)
# --- Parsování Excelu --------------------------------------------------
# Matches HYPERLINK("<url>", "<text>") and captures the URL and the display text.
HYPERLINK_RE = re.compile(r'HYPERLINK\("([^"]+)"\s*,\s*"([^"]+)"\)')
# Matches a trailing "(v...)" version marker at the end of a string.
VERSION_RE = re.compile(r"\((v[^)]+)\)\s*$")
# Characters not allowed in Windows file names, plus control characters.
# (The original note also mentions a "unicode artifact"; none is visible in this pattern.)
BAD_CHARS_RE = re.compile(r"[<>:\"/\\|?*\x00-\x1f]")
def clean_filename(s):
    """Sanitize *s* into a valid Windows file or folder name.

    Forbidden characters become underscores, runs of whitespace collapse to
    one space, runs of underscores collapse to one, and leading/trailing
    spaces, dots and underscores are removed.
    """
    text = BAD_CHARS_RE.sub("_", str(s))
    collapse_rules = (
        (r"\s+", " "),    # multiple whitespace -> single space
        (r"_{2,}", "_"),  # multiple underscores -> single underscore
    )
    for pattern, replacement in collapse_rules:
        text = re.sub(pattern, replacement, text)
    return text.strip(" ._")
def display_text(cell):
    """Return the text a cell displays.

    For a =HYPERLINK formula this is its second argument; otherwise the
    stripped cell value (an empty string for an empty cell).
    """
    raw = str(cell.value or "").strip()
    formula = HYPERLINK_RE.search(raw)
    if formula is None:
        return raw
    return formula.group(2).strip()
def extract_doc_url(raw):
    """Extract the clean document URL from a HYPERLINK value or a noisy URL.

    The result has the form ``https://<host>/ui/#doc_info/<id>/<major>/<minor>``.

    Raises:
        ValueError: no such URL is present in *raw*.
    """
    found = re.search(r"(https://[^/\"]+/ui/#doc_info/\d+/\d+/\d+)", str(raw))
    if found:
        return found.group(1)
    raise ValueError(f"Nenašel jsem doc URL v: {raw!r}")
def read_documents_from_excel(path):
    """Parse an exported .xlsx report into a list of document dicts.

    Each dict has the keys vtmf, version, url, name, status, type, subtype,
    desc, date and studies. Document Name/Number/Status cells are
    =HYPERLINK formulas, so URL and text are pulled out with regexes. The
    report declares broken dimensions, so rows are read by direct iteration.
    Rows without a usable document URL are skipped and only counted.

    Raises:
        RuntimeError: an expected column is missing from the header row.
    """
    from openpyxl import load_workbook

    log(f"[i] Parsování reportu: {path.name}")
    workbook = load_workbook(path, data_only=False)  # formulas are needed, not cached values
    sheet = workbook[workbook.sheetnames[0]]
    row_iter = sheet.iter_rows()
    header = [c.value for c in next(row_iter)]

    wanted = ("Document Number", "Document Name", "Document Status", "Type",
              "Subtype", "Description", "Document Date", "Study")
    try:
        col = {title: header.index(title) for title in wanted}
    except ValueError as e:
        raise RuntimeError(f"V reportu chybí očekávaný sloupec: {e}")

    docs = []
    bad = []
    for row in row_iter:
        number_cell = row[col["Document Number"]]
        if number_cell.value is None:
            continue
        raw = str(number_cell.value)

        formula = HYPERLINK_RE.search(raw)
        if formula:
            url_raw, vtmf = formula.group(1), formula.group(2)
        elif number_cell.hyperlink:
            # A real hyperlink on the cell instead of a formula.
            url_raw, vtmf = number_cell.hyperlink.target, raw
        else:
            bad.append(raw)
            continue

        try:
            url = extract_doc_url(url_raw)
        except ValueError:
            bad.append(raw)
            continue

        name = display_text(row[col["Document Name"]])
        version_match = VERSION_RE.search(name)
        version = version_match.group(1) if version_match else "v?"

        # Fallback for an empty description: the Document Name without its
        # trailing version (the version is appended separately).
        desc = (clean_filename(display_text(row[col["Description"]]))
                or clean_filename(VERSION_RE.sub("", name)))

        date = row[col["Document Date"]].value  # datetime or None
        record = {
            "vtmf": vtmf.strip(),
            "version": version,
            "url": url,
            "name": name,
            "status": display_text(row[col["Document Status"]]),
            "type": clean_filename(display_text(row[col["Type"]])),
            "subtype": clean_filename(display_text(row[col["Subtype"]])),
            "desc": desc,
            "date": date if hasattr(date, "strftime") else None,
            "studies": display_text(row[col["Study"]]),
        }
        docs.append(record)

    skipped_note = f", {len(bad)} řádků bez použitelné URL (přeskočeno)" if bad else ""
    log(f"[i] Načteno {len(docs)} dokumentů" + skipped_note)
    return docs
def build_target_path(doc, suggested_filename):
    """Build the destination path for a downloaded document.

    Layout: DOWNLOAD_ROOT/<Type>/<Subtype>/
    'YYYY-MM-DD Description [VTMF-xxx] [v1.0].<real extension>'.
    The extension comes from *suggested_filename*; the date prefix and the
    version tag are left out when not available.
    """
    ext = Path(suggested_filename).suffix  # real extension including the dot
    when = doc["date"]
    date_prefix = f"{when.strftime('%Y-%m-%d')} " if when else ""
    ver = doc.get("version")
    version_tag = f" [{ver}]" if ver else ""
    pieces = (date_prefix, f"{doc['desc']} [{doc['vtmf']}]", version_tag, ext)
    filename = "".join(pieces)
    folder = DOWNLOAD_ROOT / doc["type"] / doc["subtype"]
    return folder / filename
def deleted_marker_path(path):
    """Return *path* with the deletion marker before the extension: 'x.pdf' -> 'x [D].pdf'."""
    original = Path(path)
    marked_name = original.stem + " [D]" + original.suffix
    return original.with_name(marked_name)
# --- MongoDB synchronizace ---------------------------------------------
def doc_key(vtmf, version):
    """Return the record key for a document: '<vtmf>|<version>'."""
    return "{}|{}".format(vtmf, version)
def get_collection():
    """Connect to MongoDB and return the documents collection.

    The connection is verified with a ping (5 s server-selection timeout).
    Two indexes are ensured: a unique one on (vtmf, version) and one on
    (deleted, downloaded).
    """
    client = MongoClient(MONGO_URI, serverSelectionTimeoutMS=5000)
    client.admin.command("ping")
    collection = client[MONGO_DB][MONGO_COLL]
    identity_index = [("vtmf", ASCENDING), ("version", ASCENDING)]
    state_index = [("deleted", ASCENDING), ("downloaded", ASCENDING)]
    collection.create_index(identity_index, unique=True)
    collection.create_index(state_index)
    return collection
def migrate_old_csv(coll):
    """One-off migration of the legacy download_state.csv into Mongo.

    Every CSV row with result 'ok' marks one matching record (same vtmf,
    not deleted, not yet downloaded) as downloaded=True together with the
    file path and timestamp from the CSV. The CSV is then renamed to
    ``.csv.imported``. Does nothing when the CSV does not exist.
    """
    if not OLD_STATE_FILE.exists():
        return

    migrated = 0
    with open(OLD_STATE_FILE, newline="", encoding="utf-8") as handle:
        ok_rows = (row for row in csv.DictReader(handle) if row["result"] == "ok")
        for row in ok_rows:
            selector = {"vtmf": row["vtmf"], "deleted": False,
                        "downloaded": {"$ne": True}}
            change = {"$set": {"downloaded": True, "file": row["file"],
                               "downloaded_at": row["timestamp"]}}
            outcome = coll.update_one(selector, change)
            migrated += outcome.modified_count

    imported_name = OLD_STATE_FILE.with_suffix(".csv.imported")
    OLD_STATE_FILE.rename(imported_name)
    log(f"[i] Migrace download_state.csv -> Mongo: {migrated} záznamů; "
        f"CSV přejmenováno na .imported")
def _restore_deleted_file(key, stored_file):
    """Rename a ' [D]'-flagged file back to its plain name.

    The deletion pass stores the already-flagged path in Mongo, so the
    stored value may be either the flagged name (normal case) or the
    plain name (when the flagging rename had failed). Both are handled.

    Returns the restored path as a string when a rename happened,
    otherwise None (nothing to do, or the rename failed).
    """
    if not stored_file:
        return None
    tag = " [D]"
    stored = Path(stored_file)
    if stored.stem.endswith(tag) and len(stored.stem) > len(tag):
        marked = stored
        restored = stored.with_name(stored.stem[:-len(tag)] + stored.suffix)
    else:
        marked = deleted_marker_path(stored)
        restored = stored
    if not marked.exists() or restored.exists():
        return None
    try:
        marked.rename(restored)
    except OSError as e:
        # same best-effort policy as the deletion pass: log, keep syncing
        log(f"[!] {key}: odebrání [D] selhalo: {e}")
        return None
    log(f"[i] {key}: soubor vrácen z ' [D]' zpět.")
    return str(restored)


def sync_report_to_mongo(coll, docs):
    """Project the current report onto the documents collection.

    Key = (vtmf, version). Unknown documents are inserted; changes of
    tracked fields are applied and journaled in history[]; records that
    are missing from the report are flagged deleted and their file is
    renamed with ' [D]'; records that reappear are revived and the
    ' [D]' marker is removed from their file again.

    Args:
        coll: collection holding one record per document.
        docs: parsed report rows (dicts with at least 'vtmf', 'version').

    Returns:
        dict of counters: new, updated, unchanged, resurrected,
        marked_deleted. A resurrected record is also counted as
        updated or unchanged.
    """
    now = datetime.now()
    stats = {"new": 0, "updated": 0, "unchanged": 0,
             "resurrected": 0, "marked_deleted": 0}
    current_keys = set()
    for d in docs:
        key = doc_key(d["vtmf"], d["version"])
        current_keys.add(key)
        existing = coll.find_one({"_id": key})
        if existing is None:
            coll.insert_one({
                "_id": key, **d,
                "first_seen": now, "last_seen": now,
                "deleted": False, "downloaded": False,
                "file": None, "history": [],
            })
            stats["new"] += 1
            continue
        changes = {fld: {"old": existing.get(fld), "new": d.get(fld)}
                   for fld in TRACKED_FIELDS
                   if existing.get(fld) != d.get(fld)}
        update = {"$set": {**d, "last_seen": now, "deleted": False}}
        if changes:
            update["$push"] = {"history": {"ts": now, "changes": changes}}
            stats["updated"] += 1
        else:
            stats["unchanged"] += 1
        if existing.get("deleted"):
            # the document is back in the report -> strip ' [D]' from its file
            stats["resurrected"] += 1
            restored = _restore_deleted_file(key, existing.get("file"))
            if restored is not None:
                update["$set"]["file"] = restored
        coll.update_one({"_id": key}, update)
    # documents absent from the current report -> deleted + ' [D]'
    for rec in coll.find({"deleted": False}):
        if rec["_id"] in current_keys:
            continue
        upd = {"deleted": True, "deleted_at": now}
        f = rec.get("file")
        if f and Path(f).exists():
            marked = deleted_marker_path(f)
            try:
                Path(f).rename(marked)
                upd["file"] = str(marked)
                log(f"[i] {rec['_id']}: soubor označen ' [D]'.")
            except OSError as e:
                log(f"[!] {rec['_id']}: přejmenování na [D] selhalo: {e}")
        coll.update_one({"_id": rec["_id"]},
                        {"$set": upd,
                         "$push": {"history": {"ts": now,
                                               "changes": {"deleted": {
                                                   "old": False,
                                                   "new": True}}}}})
        stats["marked_deleted"] += 1
    log(f"[ok] Mongo sync: {stats['new']} nových, {stats['updated']} změněných, "
        f"{stats['unchanged']} beze změny, {stats['resurrected']} obnovených, "
        f"{stats['marked_deleted']} označených deleted.")
    return stats
# --- Přihlášení --------------------------------------------------------
def submit_login_form(page, password_box):
    """Submit the login form.

    Candidates are tried in this order: a 'Sign On' button, a
    'Log in'/'Sign in' button, a submit input, a submit button and an
    'OK' button. The first one that is present and visible is clicked.
    If none can be used, Enter is pressed in the password field.
    """
    by_role = page.get_by_role
    strategies = (
        by_role("button", name=re.compile("sign\\s*on", re.I)),
        by_role("button", name=re.compile("log\\s*in|sign\\s*in", re.I)),
        page.locator("input[type='submit']"),
        page.locator("button[type='submit']"),
        by_role("button", name=re.compile("^ok$", re.I)),
    )
    for candidate in strategies:
        try:
            if not (candidate.count() and candidate.first.is_visible()):
                continue
            button = candidate.first
            caption = (button.inner_text()
                       or button.get_attribute("value")
                       or "submit")
            label = caption.strip()
            log(f"[i] Odesílám formulář tlačítkem '{label}'...")
            button.click()
            return
        except Exception:
            # a broken candidate must not stop the search
            continue
    log("[i] Tlačítko nenalezeno, odesílám Enterem v poli hesla...")
    password_box.press("Enter")
def login_if_needed(page):
    """Open the login URL, fill in user name + password, detect 2FA and
    wait for manual confirmation. Skips the login when the persistent
    session is still alive.

    Credentials are read from the VAULT_USER / VAULT_PASS environment
    variables. Raises RuntimeError when neither the login form nor the
    Vault UI shows up, or when the page displays a visible
    invalid/incorrect/failed message after submitting.
    """
    log(f"[i] Otevírám přihlašovací URL...")
    page.goto(LOGIN_URL, wait_until="domcontentloaded")
    # already redirected into the Vault UI -> nothing to do
    if "vtmf.veevavault.com/ui" in page.url:
        log("[i] Už přihlášen (perzistentní session).")
        return
    user_box = page.locator("input[type='text']").first
    try:
        user_box.wait_for(timeout=8000)
    except PWTimeout:
        # no form within 8 s: either a late session redirect, or a dead end
        if "vtmf.veevavault.com/ui" in page.url:
            log("[i] Přihlášen bez formuláře (session redirect).")
            return
        raise RuntimeError(
            f"Nenašel jsem login formulář ani Vault. Aktuální URL: {page.url}")
    username = os.environ["VAULT_USER"]
    password = os.environ["VAULT_PASS"]
    log("[i] Vyplňuji přihlašovací údaje...")
    user_box.fill(username)
    password_box = page.locator("input[type='password']").first
    password_box.fill(password)
    submit_login_form(page, password_box)
    log("[i] Odeslán login, čekám na výsledek...")
    try:
        page.wait_for_url(VAULT_UI_PATTERN, timeout=15000)
        log("[ok] Přihlášen rovnou (bez 2FA).")
        return
    except PWTimeout:
        pass  # not inside the Vault -> most likely a 2FA challenge
    # a visible error text means the credentials were rejected
    err = page.locator("text=/invalid|incorrect|failed/i")
    try:
        if err.count() and err.first.is_visible():
            raise RuntimeError(f"Login selhal: {err.first.inner_text().strip()}")
    except PWTimeout:
        pass
    # interactive step: the operator confirms on the phone, then presses ENTER
    print("\n" + "=" * 60)
    print(" VYŽADOVÁNO OVĚŘENÍ NA TELEFONU (2FA).")
    print(" Potvrď přihlášení v mobilní aplikaci.")
    print("=" * 60)
    input(" Až to potvrdíš, stiskni ENTER pro pokračování... ")
    page.wait_for_url(VAULT_UI_PATTERN, timeout=120000)
    log("[ok] Přihlášení dokončeno.")
def verify_inside(page):
    """Wait (up to 30 s) until the page URL matches the Vault UI pattern, then log it."""
    page.wait_for_url(VAULT_UI_PATTERN, timeout=30000)
    current_url = page.url
    log(f"[ok] Uvnitř Vaultu: {current_url}")
def dialog_visible(page):
    """Tell whether a jQuery UI dialog ('.ui-dialog') is visible on the page.

    Any error raised while querying the page counts as "not visible".
    """
    try:
        dialogs = page.locator(".ui-dialog")
        if not dialogs.count():
            return False
        return bool(dialogs.first.is_visible())
    except Exception:
        return False
def save_page_debug(page, tag):
    """Save page diagnostics and return the folder they were written to.

    Written into DEBUG_DIR/<timestamp>_<tag>/: a viewport screenshot
    (or screenshot_error.txt when it fails), the HTML of every frame,
    and frames_report.txt listing per frame the hit counts of
    button-like selectors plus the distinct title / aria-label values.

    Args:
        page: the browser page to inspect.
        tag: short label appended to the folder name.
    """
    # Format the timestamp first and append the tag afterwards: the tag
    # must not pass through strftime, where a '%' would be interpreted.
    stamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    out = DEBUG_DIR / f"{stamp}_{tag}"
    out.mkdir(parents=True, exist_ok=True)
    try:
        page.screenshot(path=str(out / "screenshot.png"), full_page=False)
    except Exception as e:
        (out / "screenshot_error.txt").write_text(str(e), encoding="utf-8")
    report = []
    for i, frame in enumerate(page.frames):
        report.append(f"=== frame[{i}] url={frame.url}")
        try:
            (out / f"frame_{i}.html").write_text(frame.content(),
                                                 encoding="utf-8")
            for sel in (".ui-dialog", "a.ok.vv_button",
                        ".ui-dialog-titlebar-close",
                        "button", "input[type='button']",
                        "[title]", "[aria-label]"):
                n = frame.locator(sel).count()
                if n:
                    report.append(f" {sel}: {n}x")
            # dump of title/aria-label values — helps to locate the ⋯ menu
            for attr in ("title", "aria-label"):
                vals = frame.locator(f"[{attr}]").evaluate_all(
                    f"els => els.map(e => e.getAttribute('{attr}'))")
                uniq = sorted({v for v in vals if v})[:80]
                report.append(f" {attr}: {uniq}")
        except Exception as e:
            # one unreadable frame must not lose the rest of the report
            report.append(f" [chyba čtení framu: {e}]")
    (out / "frames_report.txt").write_text("\n".join(report),
                                           encoding="utf-8")
    log(f"[!] Diagnostika stránky uložena do: {out}")
    return out
# Visible OK button of the dialog — it is an <a>, not a <button>!
# The close cross .ui-dialog-titlebar-close is display:none → DO NOT USE it.
DIALOG_OK_SELECTOR = (".ui-dialog a.ok.vv_button, "
                      ".vv_login_msg_dialog .vv_button.ok")
def dismiss_maintenance_popup(page, timeout=8000):
    """Close the Veeva login/maintenance dialog via its visible OK link.

    The dialog shows up with a delay, so the function first waits up to
    *timeout* ms for the OK element. Up to five queued dialogs are
    clicked away. If a dialog is still visible afterwards, Escape is
    tried; if that fails too, diagnostics are saved and the operator is
    asked to close it by hand. Safe to call unconditionally.

    Returns True when at least one dialog was closed by clicking OK.
    """
    ok_button = page.locator(DIALOG_OK_SELECTOR)
    try:
        ok_button.first.wait_for(state="visible", timeout=timeout)
    except Exception:
        # includes the Playwright timeout: no dialog appeared — carry on
        return False
    dismissed = 0
    for _attempt in range(5):  # dialogs can be queued
        try:
            if not (ok_button.count() and ok_button.first.is_visible()):
                break
            ok_button.first.click()
            page.wait_for_timeout(300)
            dismissed += 1
            log("[i] Maintenance/login dialog zavřen (OK).")
        except Exception:
            break
    if dialog_visible(page):
        page.keyboard.press("Escape")
        page.wait_for_timeout(500)
        log("[i] Zkusil jsem dialog zavřít klávesou Escape.")
        if dialog_visible(page):
            save_page_debug(page, "dialog")
            rule = "=" * 60
            print("\n" + rule)
            print(" DIALOG SE NEPODAŘILO ZAVŘÍT AUTOMATICKY.")
            print(" Zavři ho prosím ručně v prohlížeči.")
            print(rule)
            input(" Po ručním zavření stiskni ENTER... ")
    return bool(dismissed)
# --- Export reportu ----------------------------------------------------
def _first_visible(page, builders):
"""Vrátí (locator, popis) prvního viditelného kandidáta. Hledá na
hlavní stránce i ve všech frames."""
for frame in page.frames:
for build, desc in builders:
try:
loc = build(frame)
if loc.count() and loc.first.is_visible():
return loc.first, desc
except Exception:
continue
return None, None
def download_report(page):
    """Download the report (Export to Excel, Data Only) into EXCEL_DIR
    under a timestamped name and return the path of the saved file.

    On failure the page diagnostics are saved via save_page_debug() and
    a RuntimeError is raised (report not loaded, actions menu missing,
    export item missing, or export button missing).
    """
    log("[i] Otevírám report Document Inventory Report - Study Level...")
    page.goto(REPORT_URL, wait_until="domcontentloaded")
    dismiss_maintenance_popup(page, timeout=4000)
    # the report is ready once the record count / statuses show up
    try:
        page.wait_for_selector("text=Returned", timeout=30000)
    except PWTimeout:
        try:
            page.wait_for_selector("text=Document Status:", timeout=30000)
        except PWTimeout:
            save_page_debug(page, "report_load")
            raise RuntimeError(
                "Report se nenačetl (nenašel jsem 'Returned' ani "
                "'Document Status:'). Diagnostika v debug/.")
    log("[i] Report načten, otevírám menu akcí (⋯)...")
    # ⋯ menu (Actions): a button without title/aria-label inside
    # .actionMenuContainer (verified on the live DOM, no iframe).
    # Candidates go from the most specific selector to the loosest one.
    actions, desc = _first_visible(page, [
        (lambda f: f.locator(
            ".actionMenuContainer .dropDown.vv_dropdown_toggle "
            "button.vv-icon-button"), ".actionMenuContainer button (ověřený)"),
        (lambda f: f.locator(".actionMenuContainer button"), ".actionMenuContainer button (volnější)"),
        (lambda f: f.locator("button[title='Actions'], [aria-label='Actions']"), "title/aria-label Actions"),
    ])
    if actions is None:
        save_page_debug(page, "report_menu")
        raise RuntimeError("Nenašel jsem menu akcí (⋯) na reportu. "
                           "Diagnostika v debug/.")
    log(f"[i] Menu nalezeno přes: {desc}")
    actions.click()
    # The menu loads ASYNCHRONOUSLY (data-loaded=false -> AJAX):
    # wait for the item, do not read it right after the click.
    item = page.locator("a.ReportAction[data-action-name='ExcelExport']")
    try:
        item.first.wait_for(state="visible", timeout=15000)
    except PWTimeout:
        # fallback by text (in case the data attribute changes)
        item = page.get_by_text("Export to Excel", exact=True)
        try:
            item.first.wait_for(state="visible", timeout=5000)
        except PWTimeout:
            save_page_debug(page, "report_export_item")
            raise RuntimeError("Menu se otevřelo, ale položku 'Export to "
                               "Excel' jsem nenašel. Diagnostika v debug/.")
    log("[i] Klikám 'Export to Excel'...")
    item.first.click()
    log("[i] Dialog Excel Export Options...")
    # 'Data Only' = radio value=STANDARD, checked by default; safety net.
    radio = page.locator("input[name='requiredRadioField'][value='STANDARD']")
    try:
        radio.first.wait_for(state="visible", timeout=10000)
        if not radio.first.is_checked():
            radio.first.check()
            log("[i] Přepnuto na 'Data Only'.")
    except PWTimeout:
        log("[!] Radio 'Data Only' nenalezeno — spoléhám na default dialogu.")
    # Export = <button> with the text Export (React dialog, emotion classes —
    # do NOT select by class hash, only by role + text).
    export_btn = page.get_by_role("button", name="Export", exact=True)
    try:
        export_btn.first.wait_for(state="visible", timeout=10000)
    except PWTimeout:
        save_page_debug(page, "report_export_btn")
        raise RuntimeError("Dialog exportu bez tlačítka Export. "
                           "Diagnostika v debug/.")
    export_btn = export_btn.first
    # Click Export EXACTLY once (multiple clicks = duplicates);
    # ignore 503s/redirects in the network log — expect_download decides.
    with page.expect_download(timeout=120000) as dl_info:
        export_btn.click()
    download = dl_info.value
    EXCEL_DIR.mkdir(parents=True, exist_ok=True)
    # prefix the suggested name with a timestamp before saving
    ts = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    dest = EXCEL_DIR / f"{ts} {download.suggested_filename}"
    download.save_as(str(dest))
    log(f"[ok] Report uložen: {dest}")
    return dest
def archive_report(path):
    """Move a successfully processed report into PROCESSED_DIR (created on demand)."""
    PROCESSED_DIR.mkdir(parents=True, exist_ok=True)
    archived = PROCESSED_DIR.joinpath(path.name)
    path.rename(archived)
    log(f"[i] Report archivován: {archived}")
# --- SeaweedFS ---------------------------------------------------------
def _sw_path(sha256):
    """Filer path for a digest: <prefix>/<chars 0-1>/<chars 2-3>/<digest>."""
    outer, inner = sha256[:2], sha256[2:4]
    return f"{SEAWEED_PREFIX}/{outer}/{inner}/{sha256}"
def seaweed_store(data, mime="application/octet-stream"):
    """Idempotent upload into the SeaweedFS Filer.

    The object path is derived from the SHA-256 of *data*. A HEAD
    request checks whether the object already exists; only on a 404 is
    the content uploaded with PUT.

    Args:
        data: file content as bytes.
        mime: Content-Type sent with the upload.

    Returns:
        (path, url, uploaded) — uploaded=False means a dedup hit.

    Raises:
        urllib.error.HTTPError: HEAD answered with an error other than
            404, or the PUT failed.
        urllib.error.URLError: the Filer could not be reached.
    """
    sha256 = hashlib.sha256(data).hexdigest()
    path = _sw_path(sha256)
    url = SEAWEED_FILER + path
    probe = urllib.request.Request(url, method="HEAD")
    try:
        # close the response explicitly so the connection is released
        with urllib.request.urlopen(probe, timeout=10):
            pass
        return path, url, False  # object already exists
    except urllib.error.HTTPError as e:
        if e.code != 404:
            raise
        e.close()  # the 404 response holds a connection as well
    req = urllib.request.Request(
        url, data=data, method="PUT",
        headers={"Content-Type": mime})
    with urllib.request.urlopen(req, timeout=120):
        pass
    return path, url, True
# --- Stažení dokumentů -------------------------------------------------
def find_source_file_button(page):
    """Locate the 'Source File' control on a document page.

    Tried in order: an element with title 'Source File', an element with
    aria-label 'Source File', then a button whose accessible name
    contains 'Source File' (case-insensitive). Returns the first match,
    or None when nothing matches.
    """
    for selector in ("[title='Source File']", "[aria-label='Source File']"):
        matches = page.locator(selector)
        if matches.count():
            return matches.first
    by_role = page.get_by_role("button", name=re.compile("Source File", re.I))
    return by_role.first if by_role.count() else None
def download_source_file(page, doc):
    """Open the document page, trigger the 'Source File' download and
    save it under the path given by build_target_path().

    Returns the destination path. Raises PlaceholderDocument when the
    page shows the placeholder text element, and RuntimeError when the
    'Source File' control cannot be found.
    """
    vtmf = doc["vtmf"]
    log(f"[i] Otevírám dokument {vtmf} ({doc.get('version', '')}) ...")
    page.goto(doc["url"], wait_until="domcontentloaded")
    try:
        page.wait_for_load_state("networkidle", timeout=30000)
    except PWTimeout:
        # best effort: continue even without a network-idle state
        log("[!] networkidle nenastal do 30 s, zkouším pokračovat...")
    dismiss_maintenance_popup(page, timeout=2000)
    # a visible placeholder element means there is no file to download
    ph = page.locator("div.vv_placeholder_text")
    if ph.count() and ph.first.is_visible():
        log(f"[i] {vtmf}: placeholder bez obsahu — přeskakuji.")
        raise PlaceholderDocument(vtmf)
    target = find_source_file_button(page)
    if target is None:
        raise RuntimeError(
            f"Nenašel jsem ikonu 'Source File' na stránce dokumentu {vtmf}.")
    log("[i] Klikám na Source File a čekám na download...")
    with page.expect_download(timeout=60000) as dl_info:
        target.click()
        # Dropdown variant (Source File + Viewable Rendition): the click
        # opens a menu instead of downloading, so pick the menu item.
        try:
            item = page.get_by_role("menuitem",
                                    name=re.compile("Source File", re.I))
            if item.count() and item.first.is_visible():
                log("[i] Otevřel se dropdown, vybírám 'Source File'...")
                item.first.click()
        except Exception:
            pass
    download = dl_info.value
    dest = build_target_path(doc, download.suggested_filename)
    dest.parent.mkdir(parents=True, exist_ok=True)
    download.save_as(str(dest))
    return dest
def download_missing(page, coll):
    """Download every non-deleted document that lacks downloaded=True.

    The outcome of each document is written to Mongo immediately.
    Each document gets up to MAX_ATTEMPTS tries; placeholders are
    recorded as downloaded with placeholder=True and no file.

    Returns:
        (ok_count, fail_count, placeholder_count,
         sw_uploaded, sw_dedup, sw_failed)
    """
    todo = list(coll.find({"deleted": False, "downloaded": {"$ne": True}})
                .sort([("vtmf", ASCENDING), ("version", ASCENDING)]))
    if LIMIT:
        todo = todo[:LIMIT]
    log(f"\n[i] Ke stažení: {len(todo)} dokumentů"
        + (f" (LIMIT={LIMIT})" if LIMIT else ""))
    ok_count, fail_count, placeholder_count = 0, 0, 0
    sw_uploaded = sw_dedup = sw_failed = 0
    for n, doc in enumerate(todo, 1):
        key = doc["_id"]
        log(f"\n--- [{n}/{len(todo)}] {key} | {doc['desc'][:70]}")
        last_err = None  # stays None when an attempt succeeded
        for attempt in range(1, MAX_ATTEMPTS + 1):
            try:
                dest = download_source_file(page, doc)
                # SeaweedFS upload (a failure here does not block the run)
                sw_path = sw_url = sw_ts = sha256_hex = None
                try:
                    data = dest.read_bytes()
                    size_kb = len(data) / 1024
                    size_str = f"{size_kb:.0f} KB" if size_kb < 1024 else f"{size_kb / 1024:.1f} MB"
                    ext = dest.suffix.lstrip('.').upper()
                    log(f"[ok] Stazeno: {dest.name} ({size_str} {ext})")
                    mime = mimetypes.guess_type(dest.name)[0] or "application/octet-stream"
                    sw_path, sw_url, uploaded = seaweed_store(data, mime)
                    sha256_hex = hashlib.sha256(data).hexdigest()
                    sw_ts = datetime.now()
                    if uploaded:
                        sw_uploaded += 1
                        log(f"[ok] SeaweedFS: nahrano ({size_str}) -> {sw_path}")
                    else:
                        sw_dedup += 1
                        log(f"[i] SeaweedFS: dedup hit ({size_str}) -> {sw_path}")
                except Exception as sw_err:
                    sw_failed += 1
                    log(f"[!] SeaweedFS upload selhal (soubor je na disku): {sw_err}")
                # The record is marked downloaded even when the upload
                # failed; the seaweed_* and sha256 fields are None then.
                coll.update_one({"_id": key}, {"$set": {
                    "downloaded": True, "file": str(dest),
                    "downloaded_at": datetime.now(),
                    "sha256": sha256_hex,
                    "seaweed_path": sw_path,
                    "seaweed_url": sw_url,
                    "seaweed_synced_at": sw_ts,
                    "last_error": None}})
                ok_count += 1
                last_err = None
                break
            except PlaceholderDocument:
                # nothing to download; flag it so later runs skip it
                coll.update_one({"_id": key}, {"$set": {
                    "downloaded": True, "placeholder": True,
                    "file": None, "downloaded_at": datetime.now(),
                    "last_error": None}})
                placeholder_count += 1
                last_err = None
                break
            except Exception as e:
                last_err = e
                log(f"[!] Pokus {attempt}/{MAX_ATTEMPTS} selhal: {e}")
                if attempt < MAX_ATTEMPTS:
                    page.wait_for_timeout(RETRY_PAUSE_MS)
        if last_err is not None:
            # every attempt failed: record the last error
            coll.update_one({"_id": key}, {"$set": {
                "last_error": str(last_err),
                "error_at": datetime.now()}})
            fail_count += 1
        page.wait_for_timeout(BETWEEN_DOCS_MS)
    return ok_count, fail_count, placeholder_count, sw_uploaded, sw_dedup, sw_failed
# --- Main --------------------------------------------------------------
def main():
    """Run the whole pipeline: login, report export, parse + Mongo sync,
    legacy CSV migration, report archiving, download of missing files.

    A summary is always logged and the browser closed. Exit code: 2 when
    the pipeline raised, 1 when some downloads failed, 0 otherwise.
    """
    ensure_credentials()
    coll = get_collection()
    log(f"[ok] Mongo připojeno: {MONGO_URI} / {MONGO_DB}.{MONGO_COLL}")
    with sync_playwright() as p:
        ctx = p.chromium.launch_persistent_context(
            user_data_dir=str(PROFILE_DIR),
            headless=False,
            accept_downloads=True,
            no_viewport=True,  # the window behaves natively
            args=["--start-maximized"],
        )
        page = ctx.pages[0] if ctx.pages else ctx.new_page()
        # counters are pre-set so the summary in `finally` always works
        ok_count = fail_count = placeholder_count = 0
        sw_uploaded = sw_dedup = sw_failed = 0
        pipeline_error = None
        try:
            # 1) login
            login_if_needed(page)
            verify_inside(page)
            dismiss_maintenance_popup(page)
            # 2) report export
            report_path = download_report(page)
            # 3) parse + sync into Mongo
            docs = read_documents_from_excel(report_path)
            if not docs:
                # an empty report would flag every record as deleted
                raise RuntimeError("Report neobsahuje žádné dokumenty — "
                                   "sync přeskočen, nic se nemaže.")
            sync_report_to_mongo(coll, docs)
            migrate_old_csv(coll)
            archive_report(report_path)
            # 4) download of the missing documents
            DOWNLOAD_ROOT.mkdir(parents=True, exist_ok=True)
            (ok_count, fail_count, placeholder_count,
             sw_uploaded, sw_dedup, sw_failed) = download_missing(page, coll)
        except KeyboardInterrupt:
            log("\n[!] Přerušeno uživatelem — stav je v Mongo, příští běh naváže.")
        except Exception as e:
            pipeline_error = e
            print("\n" + "=" * 60)
            print(" PIPELINE SELHALA!")
            print(f" {type(e).__name__}: {e}")
            print("=" * 60)
        finally:
            total = coll.count_documents({})
            have = coll.count_documents({"deleted": False, "downloaded": True})
            active = coll.count_documents({"deleted": False})
            sw_info = (f"SeaweedFS: {sw_uploaded} nových, {sw_dedup} dedup"
                       + (f", {sw_failed} chyb uploadu" if sw_failed else ""))
            log(f"\n[i] Výsledek běhu: {ok_count} staženo, "
                f"{placeholder_count} placeholderů přeskočeno, {fail_count} chyb"
                + (f", PIPELINE SELHALA ({pipeline_error})" if pipeline_error else ".")
                + (f"\n[i] {sw_info}" if ok_count else ""))
            log(f"[i] Mongo: {total} záznamů celkem, {active} aktivních, "
                f"z toho staženo {have} ({active - have} zbývá).")
            log("[i] Zavírám prohlížeč.")
            ctx.close()
    sys.exit(2 if pipeline_error else (1 if fail_count else 0))


if __name__ == "__main__":
    main()
+134
View File
@@ -0,0 +1,134 @@
# vtmf_pipeline_v1.6 — V-TMF workflow přes 3 úrovně (STUDY / COUNTRY / SITE)
**Verze:** 1.6 · **Datum:** 2026-06-15
## Co je nové proti v1.5
v1.5 stahovala jen **study-level** dokumenty jedné studie do ploché
`<Type>\<Subtype>` struktury. v1.6 řeší celou hierarchii VTMF
**STUDY → COUNTRY → SITE** a sdílený (M:N) charakter dokumentů.
**Klíčové poznatky z reportů:**
- Dokument je do studií/zemí/center jen **referencovaný** (M:N) — např.
Master Confidentiality Agreement v nemocnici je jeden dokument
referencovaný do všech studií i center té nemocnice. Reference ≠ kopie.
- Sloupce `Study`, `Study Country`, `Site` jsou **comma-separated seznamy**.
- Tři reporty = tři **úrovně** dokumentu. Aby byl TMF kompletní, musí se
stáhnout všechny tři.
- Country i site report filtrují **jen na zemi** (CZ), ne na studii →
empiricky vrací 100 % dokumentů navázaných na UCO3001, ořez na studii je
pojistka (no-op).
- Study report má 15 sloupců (+ `Document Date`), country/site 17
(+ `Created By`, `Study Country`, `Site`; bez `Document Date`).
## Konfigurace REPORTS
```python
TARGET_STUDY = "77242113UCO3001"
REPORTS = [
{"level":"study", "study":TARGET_STUDY, "country":None,
"url":".../0RP000000000182?study__v...IN=0ST000000137008"},
{"level":"country", "study":TARGET_STUDY, "country":"Czech Republic",
"url":".../0RP000000000319?study_country__v...IN=0SC00000017T056"},
{"level":"site", "study":TARGET_STUDY, "country":"Czech Republic",
"url":".../0RP000000000762?study_country__v...EQ=0SC00000017T056"},
]
```
Jiná studie / země = jen úprava ID v URL + TARGET_STUDY.
## Tok jednoho běhu
1. **Login** (persistentní profil, J&J SSO, 2FA na telefonu).
2. Pro **každý report** v `REPORTS`:
- export do Excelu (Data Only) → `WhatToDownload/<ts> <level> ...xlsx`,
- parse (zobecněný parser, sloupce podle názvu),
- ořez na `TARGET_STUDY` (řádek se bere jen pokud má studii v `studies`),
- **scoped sync** do Mongo,
- archiv reportu do `Zpracovano/`.
3. **Jeden průchod stažení** všech `deleted=False, downloaded≠True`
na disk i do SeaweedFS.
## Mongo schéma (kolekce documents)
```
_id: "VTMF-9108777|v2.0" # číslo dokumentu | verze
vtmf, version, url, level # level = study|country|site (pro cestu)
levels: ["site"] # všechny úrovně, kde se objevil
scopes: ["site|77242113UCO3001|Czech Republic", ...] # pro scoped mazání
name, status, type, subtype, classification, desc
process_name, external_system_name
created_by, last_modified_by, version_created_by
date # YYYY-MM-DD (Document/Approval/Version date)
studies: ["77242113UCO3001", ...] # comma-split sloupce reportu
countries: ["Czech Republic", ...]
sites: ["BH5-CZ10001", ...]
first_seen, last_seen, deleted, deleted_at
downloaded, downloaded_at, placeholder # žádné pole file (Dropbox zrušen)
sha256 # kontrolní součet (NE cesta)
seaweed_path, seaweed_url, seaweed_synced_at # jediné umístění souboru
history: [{ts, changes:{pole:{old,new}}}]
```
## Scoped sync (řeší mazací háček)
Mazání už **nekouká na celou kolekci** (to by sync country reportu označil
study/site dokumenty jako smazané). Každý report má
`scope = "<level>|<study>|<country>"`; dokument nese pole `scopes[]`.
- dokument v reportu → `$addToSet` scope,
- dokument, který z **tohoto** scope zmizel → scope se odebere; teprve když
nemá **žádný** scope → `deleted=True` + soubor ` [D]`.
## Evidence reportů — kolekce report_runs
```
level, study, country, url, scope, exported_at, file, row_count, doc_keys[]
```
Umožní ukázat „co přesně bylo v reportu" a slouží jako audit.
## Úložiště = JEN SeaweedFS (žádný Dropbox/disk)
Dokumenty se stahují z Vaultu přes **dočasný soubor Playwrightu** rovnou do
SeaweedFS Fileru — na disk/Dropbox se nic neukládá. Klíč = číslo dokumentu
+ verze:
```
/vtmf-documents/<vtmf>/<verze>.<přípona>
např. /vtmf-documents/VTMF-9108777/v2.0.pdf
```
Žádné SHA cesty, žádný content dedup, žádné hardlinky. SHA-256 se počítá a
ukládá do Mongo jen jako kontrolní součet. Která úroveň / země / centra =
pole `level` / `countries[]` / `sites[]` v Mongo.
Aktuální verzi čehokoli do Dropboxu (nebo kamkoli jinam) zařídí samostatný
export skript ze SeaweedFS — pipeline se tím nezdržuje.
## Migrace stávajících dat → migrate_to_v16.py
Stávající study-level data (v1.3–v1.5) převede na schéma v1.6. Dvě fáze,
**default DRY-RUN**, ostře s `--apply`:
- `--phase mongo` — re-parse nejnovějšího archivu study reportu v1.6
parserem → obohatí ~1692 dokumentů o nová pole (level, scopes[],
studies[], countries=[], sites=[], classification, …). Nesahá na
download stav.
- `--phase seaweed` — překlíčuje SeaweedFS ze starých SHA cest na nové
`<vtmf>/<verze>` (~1637 souborů; zdroj bajtů = stávající soubor na disku,
fallback GET ze SHA cesty), opraví `seaweed_path/url` + `sha256`, smaže
staré SHA objekty a odebere pole `file` z Mongo. Fyzické soubory
v Dropboxu pak můžeš smazat ručně.
```powershell
# náhled
& "...\.venv\Scripts\python.exe" "...\migrate_to_v16.py"
# ostře
& "...\.venv\Scripts\python.exe" "...\migrate_to_v16.py" --apply
```
## Spuštění pipeline
```powershell
& "U:\PythonProject\Janssen\.venv\Scripts\python.exe" "U:\PythonProject\Janssen\VTMFDownloadFiles\vtmf_pipeline_v1.6.py"
```
Předchůdce: vtmf_pipeline_v1.5 (TRASH/).
```
+937
View File
@@ -0,0 +1,937 @@
# ============================================================
# vtmf_pipeline_v1.6.py
# Verze: 1.6
# Datum: 2026-06-15
# Popis: Kompletní workflow V-TMF (J&J Veeva Vault) pro studii
# 77242113UCO3001 přes VŠECHNY TŘI ÚROVNĚ dokumentů
# (STUDY / COUNTRY / SITE). Jeden běh udělá pro každý
# report ze seznamu REPORTS:
# 1) login do Vaultu (persistentní session + ruční 2FA),
# 2) export reportu do Excelu (Data Only) do WhatToDownload/,
# 3) parse + scoped sync do MongoDB (db VTMF, kolekce
# documents; klíč _id = "číslo|verze"),
# a nakonec jeden průchod stažení všech dosud nestažených
# dokumentů PŘÍMO do SeaweedFS (žádný Dropbox/disk).
#
# ZÁSADNÍ ZMĚNY proti v1.5:
#
# • Hierarchie dokumentů ve VTMF je STUDY -> COUNTRY -> SITE.
# Dokument je do studií/zemí/center jen REFERENCOVANÝ (M:N) —
# např. Master Confidentiality Agreement v nemocnici je jeden
# dokument referencovaný do všech studií i center té nemocnice.
# Proto: jeden dokument = jeden záznam = jeden SeaweedFS objekt;
# příslušnost je jen metadatová pole studies[]/countries[]/sites[].
#
# • REPORTS = seznam (level, study, country, url). Country i site
# report filtrují jen na zemi (CZ), ne na studii -> při ukládání
# se row bere jen pokud cílová studie je v jeho Study sloupci
# (prakticky no-op, vše vrácené UCO3001 obsahuje).
#
# • Zobecněný parser: study report má 15 sloupců (+ Document Date),
# country/site mají 17 (+ Created By, Study Country, Site; bez
# Document Date). Sloupce se hledají podle NÁZVU, datum má
# fallback Document Date -> Approval Complete Date -> Version
# Creation Date. Study/Study Country/Site se parsují na pole.
#
# • Scoped sync: mazání už NEkouká na celou kolekci. Každý report
# má scope = (level|study|country); dokument nese pole scopes[].
# Když z reportu daného scope zmizí, scope se odebere; teprve
# když nemá žádný scope -> deleted=True.
#
# • Evidence reportů: kolekce report_runs (level, study, country,
# url, exported_at, file, row_count, doc_keys).
#
# • ÚLOŽIŠTĚ = JEN SeaweedFS, klíč číslo dokumentu + verze:
# /vtmf-documents/<vtmf>/<verze>.<přípona>
# Žádné ukládání dokumentů na disk/Dropbox — stahují se přes
# dočasný soubor Playwrightu rovnou do Fileru. SHA-256 se počítá
# a ukládá do Mongo jen jako kontrolní součet. (Aktuální verzi
# čehokoli do Dropboxu zařídí samostatný export skript ze SeaweedFS.)
#
# Heslo se NIKDY nedává natvrdo do skriptu — čte se z .env
# v rootu projektu Janssen (VAULT_USER / VAULT_PASS).
#
# Migrace stávajících study-level dat na toto schéma: migrate_to_v16.py
# Předchůdce: vtmf_pipeline_v1.5 (v TRASH/).
# ============================================================
import hashlib
import mimetypes
import os
import re
import sys
import urllib.error
import urllib.request
from datetime import datetime
from pathlib import Path
from playwright.sync_api import sync_playwright, TimeoutError as PWTimeout
from pymongo import MongoClient, ASCENDING
# --- Configuration -----------------------------------------------------
LOGIN_URL = ("https://fedlogin.jnj.com/idp/eyJ2c2lkIjoiam5qX3ZlZXZhIn0/"
             "startSSO.ping?PartnerSpId=janssenetmf.veevavault.com"
             "&IdpAdapterId=CompIWALDAPEXTFORM"
             "&TargetResource=https%3A%2F%2Fvtmf.veevavault.com%2F")
# Study whose TMF is being assembled (target for trimming the country/site reports).
TARGET_STUDY = "77242113UCO3001"
# ====================================================================
# LIST OF REPORTS TO PROCESS
# --------------------------------------------------------------------
# Each entry = one report. Fields:
# enabled = True/False -> set to False and the report is NOT loaded in
#           the next run (it stays in the list as documentation)
# name    = label for the log (what the report is)
# level   = "study" | "country" | "site" (level + scope)
# study   = code of the target study (scope + trimming to this study)
# country = scope country (None at study level)
# url     = direct link to the report viewer in the Vault
#
# Adding another study = simply append three more entries with its code
# and URLs; the run processes them alongside the existing ones.
# ====================================================================
REPORTS = [
    {"enabled": True, "name": "UCO3001 — STUDY level",
     "level": "study", "study": TARGET_STUDY, "country": None,
     "url": "https://vtmf.veevavault.com/ui/#reporting/viewer/"
            "0RP000000000182?study__v%2C%2C%2CIN=0ST000000137008"},
    {"enabled": True, "name": "UCO3001 — COUNTRY level (Czech Republic)",
     "level": "country", "study": TARGET_STUDY, "country": "Czech Republic",
     "url": "https://vtmf.veevavault.com/ui/#reporting/viewer/"
            "0RP000000000319?study_country__v%2C%2C%2CIN=0SC00000017T056"},
    {"enabled": False, "name": "UCO3001 — SITE level (all sites in Czech Republic)",
     "level": "site", "study": TARGET_STUDY, "country": "Czech Republic",
     "url": "https://vtmf.veevavault.com/ui/#reporting/viewer/"
            "0RP000000000762?study_country__v%2C%2C%2CEQ=0SC00000017T056"},
]
VAULT_UI_PATTERN = "**vtmf.veevavault.com/ui**"  # successful entry into the Vault
SCRIPT_DIR = Path(__file__).resolve().parent
PROFILE_DIR = SCRIPT_DIR / "vault_profile"  # persistent session
ENV_FILE = SCRIPT_DIR.parent / ".env"  # root of the Janssen project
DEBUG_DIR = SCRIPT_DIR / "debug"  # diagnostic output
EXCEL_DIR = SCRIPT_DIR / "WhatToDownload"  # downloaded reports (Excel only)
PROCESSED_DIR = EXCEL_DIR / "Zpracovano"  # archive of processed reports
MONGO_URI = "mongodb://192.168.1.76:27017"
MONGO_DB = "VTMF"
MONGO_COLL = "documents"
RUNS_COLL = "report_runs"
# How many documents to download in this run (None = all remaining)
LIMIT = None
# Fields whose changes are versioned into history[]
TRACKED_FIELDS = ("name", "status", "type", "subtype", "classification",
                  "desc", "date", "url", "studies", "countries", "sites",
                  "level")
MAX_ATTEMPTS = 2  # attempts per document
RETRY_PAUSE_MS = 5000  # pause before a retry
BETWEEN_DOCS_MS = 500  # pause between documents
SEAWEED_FILER = "http://192.168.1.50:8888"
SEAWEED_PREFIX = "/vtmf-documents"
class PlaceholderDocument(Exception):
    """Signals a document that is only a placeholder ("This placeholder has no content")."""
def log(msg):
    """Print *msg* to standard output and flush immediately."""
    print(msg, end="\n", flush=True)
def load_env_file(path):
    """Load KEY=VALUE lines from a .env file into os.environ.

    Blank lines, '#' comments and lines without '=' are ignored.
    Surrounding quotes are stripped from values; empty values are
    skipped. Variables that are already set win — the file never
    overwrites them. A missing file is only logged.
    """
    if not path.exists():
        log(f"[!] .env nenalezen: {path}")
        return
    for raw_line in path.read_text(encoding="utf-8").splitlines():
        entry = raw_line.strip()
        if not entry or entry.startswith("#") or "=" not in entry:
            continue
        name, _sep, rest = entry.partition("=")
        name = name.strip()
        rest = rest.strip().strip('"').strip("'")
        if rest:
            os.environ.setdefault(name, rest)
# Comment line written into .env ahead of the credential placeholders.
ENV_SECTION_HEADER = "# --- Veeva Vault (J&J V-TMF) — VTMFDownloadFiles/download_vault ---"
# Environment variables that ensure_credentials() requires to be set.
ENV_KEYS = ("VAULT_USER", "VAULT_PASS")
def ensure_credentials():
    """Make sure the Vault credentials are available in the environment.

    Loads .env first. When any of ENV_KEYS is still missing or empty,
    a template is created in .env (or the missing 'KEY=' lines are
    appended to the existing file), the user is told to fill them in,
    and the script exits with status 1.
    """
    load_env_file(ENV_FILE)
    if all(os.environ.get(k) for k in ENV_KEYS):
        return
    existing = ENV_FILE.read_text(encoding="utf-8") if ENV_FILE.exists() else ""
    missing_lines = []
    for k in ENV_KEYS:
        if not re.search(rf"^\s*{k}\s*=", existing, re.M):
            missing_lines.append(f"{k}=")
    # section that is written either into a fresh file or appended
    section = ENV_SECTION_HEADER + "\n" + "\n".join(missing_lines) + "\n"
    if not ENV_FILE.exists():
        ENV_FILE.write_text(
            "# .env — lokální přihlašovací údaje (NEVERZOVAT, je v .gitignore)\n\n"
            + section,
            encoding="utf-8")
        log(f"[i] Založil jsem nový .env: {ENV_FILE}")
    elif missing_lines:
        with open(ENV_FILE, "a", encoding="utf-8") as env_handle:
            env_handle.write("\n" + section)
        log(f"[i] Doplnil jsem chybějící řádky do .env: {ENV_FILE}")
    rule = "=" * 60
    banner = (
        "\n" + rule,
        " CHYBÍ PŘIHLAŠOVACÍ ÚDAJE.",
        f" Doplň VAULT_USER a VAULT_PASS do souboru:",
        f" {ENV_FILE}",
        " a spusť skript znovu.",
        rule,
    )
    for banner_line in banner:
        print(banner_line)
    sys.exit(1)
# --- Excel parsing -----------------------------------------------------
# HYPERLINK("<arg 1>", "<arg 2>") formula: group 1 = first quoted argument,
# group 2 = second quoted argument.
HYPERLINK_RE = re.compile(r'HYPERLINK\("([^"]+)"\s*,\s*"([^"]+)"\)')
# Trailing "(v...)" at the end of a string; group 1 = the part in parentheses.
VERSION_RE = re.compile(r"\((v[^)]+)\)\s*$")
# First YYYY-MM-DD shaped run of digits in a string.
DATE_RE = re.compile(r"(\d{4}-\d{2}-\d{2})")
# nepovolené znaky názvů + řídicí znaky + unicode artefakt
BAD_CHARS_RE = re.compile(r"[<>:\"/\\|?*\x00-\x1f]")
def clean_text(s):
    """Turn any value into a tidy name without forbidden characters.

    Forbidden characters become "_", whitespace runs collapse to one space,
    underscore runs collapse to one underscore, and leading/trailing spaces,
    dots and underscores are trimmed.
    """
    replaced = BAD_CHARS_RE.sub("_", str(s))
    collapsed = re.sub(r"\s+", " ", replaced)
    return re.sub(r"_{2,}", "_", collapsed).strip(" ._")
def display_text(cell):
    """Return the text a cell displays.

    For a =HYPERLINK formula this is the formula's second argument;
    otherwise it is the stripped string form of the cell value (an empty
    string when the value is falsy).
    """
    shown = str(cell.value or "").strip()
    formula = HYPERLINK_RE.search(shown)
    if formula is None:
        return shown
    return formula.group(2).strip()
def split_multi(text):
    """Split a comma-separated value into a list.

    Items are stripped, empty items are dropped and duplicates are removed
    while keeping the order of first occurrence.  Falsy input yields [].
    """
    pieces = (chunk.strip() for chunk in str(text or "").split(","))
    # dict keys keep insertion order, which gives an order-preserving dedup.
    return list(dict.fromkeys(piece for piece in pieces if piece))
def cell_date(cell):
    """Extract a date from a cell as 'YYYY-MM-DD', or '' when none is found.

    Values exposing ``strftime`` (date/datetime objects) are formatted
    directly; anything else is converted to a string and searched for a
    YYYY-MM-DD pattern.  ``cell`` may be None.
    """
    value = None if cell is None else cell.value
    if hasattr(value, "strftime"):
        return value.strftime("%Y-%m-%d")
    found = DATE_RE.search(str(value or ""))
    if found:
        return found.group(1)
    return ""
def extract_doc_url(raw):
    """Pull the clean document URL out of a HYPERLINK value or a broken URL.

    The result has the form https://<host>/ui/#doc_info/<id>/<major>/<minor>.

    Raises:
        ValueError: no such URL is present in ``raw``.
    """
    found = re.search(r"(https://[^/\"]+/ui/#doc_info/\d+/\d+/\d+)", str(raw))
    if found is not None:
        return found.group(1)
    raise ValueError(f"Nenašel jsem doc URL v: {raw!r}")
def read_documents_from_excel(path, level):
    """Load document records from an exported .xlsx report.

    Columns are located by header NAME rather than by position (per the
    original note, study reports have 15 columns and country/site reports
    17).  "Document Number" / "Document Name" cells hold =HYPERLINK formulas,
    so the workbook is opened with formulas preserved and both URL and text
    are extracted with regexes.  Rows are iterated directly (the original
    note says the report's declared dimensions are unreliable).

    Args:
        path: Path object of the .xlsx file (``path.name`` is logged).
        level: Report level label (study/country/site) stored on each record.

    Returns:
        A list of dicts, one per row that carries a usable document URL.
        Rows without one are skipped and only counted in the log.

    Raises:
        RuntimeError: the first sheet is empty, or a required column is
            missing from the header row.
    """
    from openpyxl import load_workbook

    log(f"[i] Parsování reportu ({level}): {path.name}")
    wb = load_workbook(path, data_only=False)  # formulas are needed, not cached values
    ws = wb[wb.sheetnames[0]]
    rows = ws.iter_rows()

    # An empty sheet yields no rows at all; fail with a clear message instead
    # of leaking a bare StopIteration from next().
    header_row = next(rows, None)
    if header_row is None:
        raise RuntimeError(f"Report je prázdný (chybí hlavička): {path.name}")
    header = [c.value for c in header_row]
    idx = {h: i for i, h in enumerate(header) if h is not None}

    required = ("Document Number", "Document Name", "Document Status",
                "Type", "Subtype", "Description", "Study")
    missing = [c for c in required if c not in idx]
    if missing:
        raise RuntimeError(f"V reportu chybí očekávané sloupce: {missing}")

    i_num, i_name = idx["Document Number"], idx["Document Name"]
    i_status, i_type, i_sub = idx["Document Status"], idx["Type"], idx["Subtype"]
    i_desc, i_study = idx["Description"], idx["Study"]
    # Optional columns: the index is None when the column is absent.
    i_class = idx.get("Classification")
    i_proc = idx.get("Process Name")
    i_extsys = idx.get("External System Name")
    i_created = idx.get("Created By")
    i_modby = idx.get("Last Modified By")
    i_verby = idx.get("Version Created By")
    i_country = idx.get("Study Country")
    i_site = idx.get("Site")
    # Date columns in priority order; the first one with a date wins.
    i_date_cols = [idx[c] for c in
                   ("Document Date", "Approval Complete Date", "Version Creation Date")
                   if c in idx]

    def g(row, i):
        """Displayed text of column ``i``, or '' when the column is absent."""
        return display_text(row[i]) if i is not None else ""

    docs, bad = [], []
    for row in rows:
        cell = row[i_num]
        if cell.value is None:
            continue
        raw = str(cell.value)

        # The URL comes either from the HYPERLINK formula or from a real
        # cell hyperlink; rows with neither are unusable.
        m = HYPERLINK_RE.search(raw)
        if m:
            url_raw, vtmf = m.group(1), m.group(2)
        elif cell.hyperlink:
            url_raw, vtmf = cell.hyperlink.target, raw
        else:
            bad.append(raw)
            continue
        try:
            url = extract_doc_url(url_raw)
        except ValueError:
            bad.append(raw)
            continue

        name = display_text(row[i_name])
        vm = VERSION_RE.search(name)
        version = vm.group(1) if vm else "v?"

        # Fall back to the document name (without the version tag) when the
        # description is empty.
        desc = clean_text(g(row, i_desc))
        if not desc:
            desc = clean_text(VERSION_RE.sub("", name))

        date = next((found for found in (cell_date(row[i_d]) for i_d in i_date_cols)
                     if found), "")

        docs.append({
            "vtmf": vtmf.strip(),
            "version": version,
            "url": url,
            "level": level,
            "name": name,
            "status": g(row, i_status),
            "type": clean_text(g(row, i_type)),
            "subtype": clean_text(g(row, i_sub)),
            "classification": g(row, i_class),
            "desc": desc,
            "process_name": g(row, i_proc),
            "external_system_name": g(row, i_extsys),
            "created_by": g(row, i_created),
            "last_modified_by": g(row, i_modby),
            "version_created_by": g(row, i_verby),
            "date": date,
            "studies": split_multi(g(row, i_study)),
            # g() returns "" for an absent column, which splits to [].
            "countries": split_multi(g(row, i_country)),
            "sites": split_multi(g(row, i_site)),
        })

    log(f"[i] Načteno {len(docs)} dokumentů"
        + (f", {len(bad)} řádků bez použitelné URL (přeskočeno)" if bad else ""))
    return docs
# --- MongoDB synchronizace ---------------------------------------------
def doc_key(vtmf, version):
    """Build the document identity key in the form '<vtmf>|<version>'."""
    return "|".join((f"{vtmf}", f"{version}"))
def scope_key(report):
    """Build the scope key '<level>|<study>|<country>' for a report dict.

    A missing or falsy 'country' contributes an empty last segment.
    """
    parts = (report["level"], report["study"], report.get("country") or "")
    return "|".join(f"{part}" for part in parts)
def get_db():
    """Connect to MongoDB and make sure the expected indexes exist.

    Returns:
        Tuple ``(db, documents_collection, runs_collection)``.
    """
    client = MongoClient(MONGO_URI, serverSelectionTimeoutMS=5000)
    # A ping forces a round trip, so a connection problem surfaces here.
    client.admin.command("ping")
    database = client[MONGO_DB]

    documents = database[MONGO_COLL]
    documents.create_index([("vtmf", ASCENDING), ("version", ASCENDING)], unique=True)
    documents.create_index([("deleted", ASCENDING), ("downloaded", ASCENDING)])
    for field in ("scopes", "studies", "sites", "level"):
        documents.create_index([(field, ASCENDING)])

    run_log = database[RUNS_COLL]
    run_log.create_index(
        [(field, ASCENDING) for field in ("level", "study", "country", "exported_at")])
    return database, documents, run_log
def sync_report_to_mongo(coll, runs, docs, report, report_file):
    """Project one report (one scope) into the documents collection.

    - New documents are inserted; changes in tracked fields are written and
      recorded in ``history``.
    - Every document in the report gets this scope added to ``scopes`` and
      its level added to ``levels``.
    - A document that was in THIS scope before but is missing from the
      report loses the scope; once it has no scope left it is flagged
      ``deleted=True``.

    Deletion is scoped: syncing one report never marks documents that belong
    only to other scopes as deleted.  No file operations happen here.

    Args:
        coll: documents collection.
        runs: collection that receives one record per sync run.
        docs: parsed document dicts of the report.
        report: report descriptor (level, study, optional country, url).
        report_file: path of the exported file, stored as a string.

    Returns:
        Dict of counters: new, updated, unchanged, resurrected,
        scope_removed, marked_deleted.
    """
    now = datetime.now()
    scope = scope_key(report)
    stats = dict.fromkeys(
        ("new", "updated", "unchanged", "resurrected", "scope_removed", "marked_deleted"),
        0)
    seen_keys = set()

    for entry in docs:
        key = doc_key(entry["vtmf"], entry["version"])
        seen_keys.add(key)
        stored = coll.find_one({"_id": key})

        if stored is None:
            coll.insert_one({
                "_id": key, **entry,
                "levels": [entry["level"]], "scopes": [scope],
                "first_seen": now, "last_seen": now,
                "deleted": False, "downloaded": False,
                "seaweed_path": None, "history": [],
            })
            stats["new"] += 1
            continue

        # Diff only the tracked fields; the full record is overwritten anyway.
        changes = {
            fld: {"old": stored.get(fld), "new": entry.get(fld)}
            for fld in TRACKED_FIELDS
            if stored.get(fld) != entry.get(fld)
        }
        update = {
            "$set": {**entry, "last_seen": now, "deleted": False},
            "$addToSet": {"scopes": scope, "levels": entry["level"]},
        }
        if changes:
            update["$push"] = {"history": {"ts": now, "changes": changes}}
        stats["updated" if changes else "unchanged"] += 1
        if stored.get("deleted"):
            stats["resurrected"] += 1
        coll.update_one({"_id": key}, update)

    # Documents previously in THIS scope that the report no longer lists
    # lose the scope; with no scope left they are marked as deleted.
    vanished = coll.find({"scopes": scope, "_id": {"$nin": list(seen_keys)}})
    for rec in vanished:
        remaining = [s for s in rec.get("scopes", []) if s != scope]
        stats["scope_removed"] += 1
        if remaining:
            op = {"$set": {"scopes": remaining}}
        else:
            op = {
                "$set": {"scopes": remaining, "deleted": True, "deleted_at": now},
                "$push": {"history": {"ts": now,
                                      "changes": {"deleted": {"old": False, "new": True}}}},
            }
            stats["marked_deleted"] += 1
        coll.update_one({"_id": rec["_id"]}, op)

    runs.insert_one({
        "level": report["level"], "study": report["study"],
        "country": report.get("country"), "url": report["url"],
        "scope": scope, "exported_at": now,
        "file": str(report_file), "row_count": len(docs),
        "doc_keys": sorted(seen_keys),
    })
    log(f"[ok] Mongo sync [{scope}]: {stats['new']} nových, {stats['updated']} změněných, "
        f"{stats['unchanged']} beze změny, {stats['resurrected']} obnovených, "
        f"{stats['scope_removed']} odebrán scope ({stats['marked_deleted']} úplně smazáno).")
    return stats
# --- Přihlášení --------------------------------------------------------
def submit_login_form(page, password_box):
    """Submit the login form.

    Tries, in order, a Sign On button, a Log in / Sign in button, a submit
    input, a submit button and an OK button.  The first visible candidate is
    clicked; if none is usable, Enter is pressed in the password field.
    """
    buttons = (
        page.get_by_role("button", name=re.compile(r"sign\s*on", re.I)),
        page.get_by_role("button", name=re.compile(r"log\s*in|sign\s*in", re.I)),
        page.locator("input[type='submit']"),
        page.locator("button[type='submit']"),
        page.get_by_role("button", name=re.compile(r"^ok$", re.I)),
    )
    for candidate in buttons:
        try:
            if not (candidate.count() and candidate.first.is_visible()):
                continue
            caption = (candidate.first.inner_text()
                       or candidate.first.get_attribute("value")
                       or "submit").strip()
            log(f"[i] Odesílám formulář tlačítkem '{caption}'...")
            candidate.first.click()
            return
        except Exception:
            # Best effort: a failing candidate just means "try the next one".
            continue
    log("[i] Tlačítko nenalezeno, odesílám Enterem v poli hesla...")
    password_box.press("Enter")
def login_if_needed(page):
    """Log in to Vault unless the persistent session is still valid.

    Opens the login URL, fills in user name and password from the
    VAULT_USER / VAULT_PASS environment variables and submits the form.  If
    the Vault UI does not appear within 15 s, a visible error message aborts
    the login; otherwise a 2FA prompt is assumed and the user is asked to
    confirm it manually.

    Raises:
        RuntimeError: neither a login form nor the Vault UI was found, or
            the page shows a login error.
    """
    vault_marker = "vtmf.veevavault.com/ui"

    log("[i] Otevírám přihlašovací URL...")
    page.goto(LOGIN_URL, wait_until="domcontentloaded")
    if vault_marker in page.url:
        log("[i] Už přihlášen (perzistentní session).")
        return

    login_field = page.locator("input[type='text']").first
    try:
        login_field.wait_for(timeout=8000)
    except PWTimeout:
        # No form appeared; a late redirect may have landed us inside Vault.
        if vault_marker in page.url:
            log("[i] Přihlášen bez formuláře (session redirect).")
            return
        raise RuntimeError(
            f"Nenašel jsem login formulář ani Vault. Aktuální URL: {page.url}")

    username = os.environ["VAULT_USER"]
    password = os.environ["VAULT_PASS"]
    log("[i] Vyplňuji přihlašovací údaje...")
    login_field.fill(username)
    password_box = page.locator("input[type='password']").first
    password_box.fill(password)
    submit_login_form(page, password_box)
    log("[i] Odeslán login, čekám na výsledek...")

    try:
        page.wait_for_url(VAULT_UI_PATTERN, timeout=15000)
    except PWTimeout:
        pass  # not inside Vault yet -> probably a 2FA prompt
    else:
        log("[ok] Přihlášen rovnou (bez 2FA).")
        return

    failure = page.locator("text=/invalid|incorrect|failed/i")
    try:
        if failure.count() and failure.first.is_visible():
            raise RuntimeError(f"Login selhal: {failure.first.inner_text().strip()}")
    except PWTimeout:
        pass

    banner = "=" * 60
    print("\n" + banner)
    print(" VYŽADOVÁNO OVĚŘENÍ NA TELEFONU (2FA).")
    print(" Potvrď přihlášení v mobilní aplikaci.")
    print(banner)
    input(" Až to potvrdíš, stiskni ENTER pro pokračování... ")
    page.wait_for_url(VAULT_UI_PATTERN, timeout=120000)
    log("[ok] Přihlášení dokončeno.")
def verify_inside(page):
    """Wait (up to 30 s) until the page URL matches the Vault UI pattern."""
    page.wait_for_url(VAULT_UI_PATTERN, timeout=30000)
    current_url = page.url
    log(f"[ok] Uvnitř Vaultu: {current_url}")
def dialog_visible(page):
    """Return True when a jQuery UI dialog (.ui-dialog) is visible on the page.

    Any error while querying the page is treated as "no dialog".
    """
    try:
        dialogs = page.locator(".ui-dialog")
        if not dialogs.count():
            return False
        return bool(dialogs.first.is_visible())
    except Exception:
        return False
def save_page_debug(page, tag):
    """Save page diagnostics: a screenshot, the HTML of every frame and a
    listing of button-like candidates.

    Args:
        page: the browser page to inspect.
        tag: short label appended to the timestamped folder name.

    Returns:
        Path of the folder the diagnostics were written to.
    """
    # Format the timestamp first and append the tag afterwards, so a '%' in
    # the tag is never interpreted as a strftime directive.
    stamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    out = DEBUG_DIR / f"{stamp}_{tag}"
    out.mkdir(parents=True, exist_ok=True)

    try:
        page.screenshot(path=str(out / "screenshot.png"), full_page=False)
    except Exception as e:
        # Diagnostics must not fail on their own; record the error instead.
        (out / "screenshot_error.txt").write_text(str(e), encoding="utf-8")

    report = []
    for i, frame in enumerate(page.frames):
        report.append(f"=== frame[{i}] url={frame.url}")
        try:
            (out / f"frame_{i}.html").write_text(frame.content(), encoding="utf-8")
            # How many elements match each selector of interest.
            for sel in (".ui-dialog", "a.ok.vv_button",
                        ".ui-dialog-titlebar-close",
                        "button", "input[type='button']",
                        "[title]", "[aria-label]"):
                n = frame.locator(sel).count()
                if n:
                    report.append(f" {sel}: {n}x")
            # Distinct title / aria-label values (first 80, sorted).
            for attr in ("title", "aria-label"):
                vals = frame.locator(f"[{attr}]").evaluate_all(
                    f"els => els.map(e => e.getAttribute('{attr}'))")
                uniq = sorted({v for v in vals if v})[:80]
                report.append(f" {attr}: {uniq}")
        except Exception as e:
            report.append(f" [chyba čtení framu: {e}]")

    (out / "frames_report.txt").write_text("\n".join(report), encoding="utf-8")
    log(f"[!] Diagnostika stránky uložena do: {out}")
    return out
# Visible OK button of the dialog — it is an <a>, not a <button>!
# The close cross .ui-dialog-titlebar-close is display:none → DO NOT USE it.
DIALOG_OK_SELECTOR = (".ui-dialog a.ok.vv_button, "
".vv_login_msg_dialog .vv_button.ok")
def dismiss_maintenance_popup(page, timeout=8000):
    """Close the Veeva login/maintenance dialog by clicking its visible OK.

    The dialog shows up WITH A DELAY, so the function waits up to
    ``timeout`` ms for the OK element.  Safe to call at any time.

    Returns:
        True when at least one dialog was closed via OK, otherwise False.
    """
    ok_button = page.locator(DIALOG_OK_SELECTOR)
    try:
        ok_button.first.wait_for(state="visible", timeout=timeout)
    except (PWTimeout, Exception):
        return False

    closed = 0
    attempts_left = 5  # dialogs can be queued one after another
    while attempts_left:
        attempts_left -= 1
        try:
            if not (ok_button.count() and ok_button.first.is_visible()):
                break
            ok_button.first.click()
            page.wait_for_timeout(300)
            closed += 1
            log("[i] Maintenance/login dialog zavřen (OK).")
        except Exception:
            break

    if dialog_visible(page):
        # Clicking OK was not enough: try Escape, then ask the user.
        page.keyboard.press("Escape")
        page.wait_for_timeout(500)
        log("[i] Zkusil jsem dialog zavřít klávesou Escape.")
        if dialog_visible(page):
            save_page_debug(page, "dialog")
            banner = "=" * 60
            print("\n" + banner)
            print(" DIALOG SE NEPODAŘILO ZAVŘÍT AUTOMATICKY.")
            print(" Zavři ho prosím ručně v prohlížeči.")
            print(banner)
            input(" Po ručním zavření stiskni ENTER... ")
    return closed > 0
# --- Export reportu ----------------------------------------------------
def _first_visible(page, builders):
"""Vrátí (locator, popis) prvního viditelného kandidáta. Hledá na
hlavní stránce i ve všech frames."""
for frame in page.frames:
for build, desc in builders:
try:
loc = build(frame)
if loc.count() and loc.first.is_visible():
return loc.first, desc
except Exception:
continue
return None, None
def download_report(page, report):
    """Export one report to Excel ("Data Only") and save it to disk.

    The file lands in EXCEL_DIR under a timestamped name.  On failure the
    page diagnostics are saved via save_page_debug() and an exception is
    raised.

    Args:
        page: logged-in browser page.
        report: report descriptor with 'level', 'study', 'url' and an
            optional 'country'.

    Returns:
        Path of the saved file.

    Raises:
        RuntimeError: the report did not load, or the actions menu, the
            export item or the Export button could not be found.
    """
    level = report["level"]
    log(f"\n[i] === Report {level.upper()} "
        f"({report.get('country') or report['study']}) ===")
    log("[i] Otevírám report...")
    page.goto(report["url"], wait_until="domcontentloaded")
    dismiss_maintenance_popup(page, timeout=4000)

    # The report counts as loaded once either marker text is present.
    try:
        page.wait_for_selector("text=Returned", timeout=30000)
    except PWTimeout:
        try:
            page.wait_for_selector("text=Document Status:", timeout=30000)
        except PWTimeout:
            save_page_debug(page, f"report_load_{level}")
            raise RuntimeError(
                "Report se nenačetl (nenašel jsem 'Returned' ani "
                "'Document Status:'). Diagnostika v debug/.")

    log("[i] Report načten, otevírám menu akcí (⋯)...")
    menu_candidates = [
        (lambda f: f.locator(
            ".actionMenuContainer .dropDown.vv_dropdown_toggle "
            "button.vv-icon-button"),
         ".actionMenuContainer button (ověřený)"),
        (lambda f: f.locator(".actionMenuContainer button"),
         ".actionMenuContainer button (volnější)"),
        (lambda f: f.locator("button[title='Actions'], [aria-label='Actions']"),
         "title/aria-label Actions"),
    ]
    menu_button, found_via = _first_visible(page, menu_candidates)
    if menu_button is None:
        save_page_debug(page, f"report_menu_{level}")
        raise RuntimeError("Nenašel jsem menu akcí (⋯) na reportu. Diagnostika v debug/.")
    log(f"[i] Menu nalezeno přes: {found_via}")
    menu_button.click()

    # Prefer the data-attribute selector, fall back to the visible text.
    export_item = page.locator("a.ReportAction[data-action-name='ExcelExport']")
    try:
        export_item.first.wait_for(state="visible", timeout=15000)
    except PWTimeout:
        export_item = page.get_by_text("Export to Excel", exact=True)
        try:
            export_item.first.wait_for(state="visible", timeout=5000)
        except PWTimeout:
            save_page_debug(page, f"report_export_item_{level}")
            raise RuntimeError("Menu se otevřelo, ale položku 'Export to "
                               "Excel' jsem nenašel. Diagnostika v debug/.")
    log("[i] Klikám 'Export to Excel'...")
    export_item.first.click()

    log("[i] Dialog Excel Export Options...")
    data_only = page.locator("input[name='requiredRadioField'][value='STANDARD']")
    try:
        data_only.first.wait_for(state="visible", timeout=10000)
        if not data_only.first.is_checked():
            data_only.first.check()
            log("[i] Přepnuto na 'Data Only'.")
    except PWTimeout:
        log("[!] Radio 'Data Only' nenalezeno — spoléhám na default dialogu.")

    export_buttons = page.get_by_role("button", name="Export", exact=True)
    try:
        export_buttons.first.wait_for(state="visible", timeout=10000)
    except PWTimeout:
        save_page_debug(page, f"report_export_btn_{level}")
        raise RuntimeError("Dialog exportu bez tlačítka Export. Diagnostika v debug/.")
    confirm = export_buttons.first

    with page.expect_download(timeout=120000) as pending:
        confirm.click()
    download = pending.value

    EXCEL_DIR.mkdir(parents=True, exist_ok=True)
    stamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    dest = EXCEL_DIR / f"{stamp} {level} {download.suggested_filename}"
    download.save_as(str(dest))
    log(f"[ok] Report uložen: {dest}")
    return dest
def archive_report(path):
    """Move a processed report file into PROCESSED_DIR, keeping its name."""
    PROCESSED_DIR.mkdir(parents=True, exist_ok=True)
    destination = PROCESSED_DIR.joinpath(path.name)
    path.rename(destination)
    log(f"[i] Report archivován: {destination}")
# --- SeaweedFS ---------------------------------------------------------
def seaweed_path(vtmf, version, ext):
    """Build the storage path from the document identity.

    The result is '<SEAWEED_PREFIX>/<vtmf>/<version><ext>'; a falsy version
    is replaced by 'vunknown'.
    """
    ver = version if version else "vunknown"
    return "/".join((f"{SEAWEED_PREFIX}", f"{vtmf}", f"{ver}{ext}"))
def seaweed_store(vtmf, version, ext, data, mime="application/octet-stream"):
    """Upload ``data`` to the SeaweedFS Filer at the path derived from the
    document identity (see seaweed_path()).

    Args:
        vtmf: document number.
        version: document version label.
        ext: file extension including the leading dot (may be empty).
        data: file content as bytes.
        mime: value for the Content-Type header.

    Returns:
        Tuple ``(path, url)``: the unencoded storage path and the
        percent-encoded URL the data was PUT to.

    Raises:
        urllib.error.URLError: the request failed (HTTPError for HTTP error
            responses).
    """
    from urllib.parse import quote

    path = seaweed_path(vtmf, version, ext)
    # Percent-encode the path: the parser's fallback version is "v?", and an
    # unescaped '?' would start a query string and truncate the stored path;
    # spaces or non-ASCII characters would make the URL invalid.
    url = SEAWEED_FILER + quote(path)
    req = urllib.request.Request(url, data=data, method="PUT",
                                 headers={"Content-Type": mime})
    # Close the response explicitly so the connection is not left open.
    with urllib.request.urlopen(req, timeout=120):
        pass
    return path, url
# --- Stažení dokumentů -------------------------------------------------
def find_source_file_button(page):
    """Locate the "Source File" control on a document page.

    Tries the title attribute, then the aria-label attribute, then a button
    whose accessible name contains "Source File" (case-insensitive).

    Returns:
        The first matching locator, or None when nothing matches.
    """
    by_attribute = (page.locator(selector)
                    for selector in ("[title='Source File']",
                                     "[aria-label='Source File']"))
    for candidate in by_attribute:
        if candidate.count():
            return candidate.first
    by_role = page.get_by_role("button", name=re.compile("Source File", re.I))
    return by_role.first if by_role.count() else None
def download_source_bytes(page, doc):
    """Open a document page and fetch its Source File.

    The download goes to Playwright's temporary file, which is read into
    memory; nothing is stored permanently on disk.

    Args:
        page: logged-in browser page.
        doc: document record with 'vtmf', 'url' and optionally 'version'.

    Returns:
        Tuple ``(data, ext)``: file content as bytes and the extension of
        the suggested file name.

    Raises:
        PlaceholderDocument: the document is a placeholder without content.
        RuntimeError: the Source File control was not found.
    """
    vtmf = doc["vtmf"]
    log(f"[i] Otevírám dokument {vtmf} ({doc.get('version', '')}) ...")
    page.goto(doc["url"], wait_until="domcontentloaded")
    try:
        page.wait_for_load_state("networkidle", timeout=30000)
    except PWTimeout:
        log("[!] networkidle nenastal do 30 s, zkouším pokračovat...")
    dismiss_maintenance_popup(page, timeout=2000)

    placeholder = page.locator("div.vv_placeholder_text")
    if placeholder.count() and placeholder.first.is_visible():
        log(f"[i] {vtmf}: placeholder bez obsahu — přeskakuji.")
        raise PlaceholderDocument(vtmf)

    source_button = find_source_file_button(page)
    if source_button is None:
        raise RuntimeError(
            f"Nenašel jsem ikonu 'Source File' na stránce dokumentu {vtmf}.")

    log("[i] Klikám na Source File a čekám na download...")
    with page.expect_download(timeout=60000) as pending:
        source_button.click()
        # The click may open a dropdown instead of starting the download;
        # in that case pick its "Source File" entry (best effort).
        try:
            menu_entry = page.get_by_role("menuitem", name=re.compile("Source File", re.I))
            if menu_entry.count() and menu_entry.first.is_visible():
                log("[i] Otevřel se dropdown, vybírám 'Source File'...")
                menu_entry.first.click()
        except Exception:
            pass
    download = pending.value

    ext = Path(download.suggested_filename).suffix
    temp_file = download.path()  # Playwright's temporary file
    return Path(temp_file).read_bytes(), ext
def download_missing(page, coll):
    """Download every non-deleted document that is not yet downloaded and
    store it straight in SeaweedFS.

    The outcome of each document is written to Mongo immediately.  Each
    document is attempted up to MAX_ATTEMPTS times.

    Returns:
        Tuple ``(ok_count, fail_count, placeholder_count)``.
    """
    pending_query = {"deleted": False, "downloaded": {"$ne": True}}
    ordering = [("level", ASCENDING), ("vtmf", ASCENDING), ("version", ASCENDING)]
    todo = list(coll.find(pending_query).sort(ordering))
    if LIMIT:
        todo = todo[:LIMIT]
    log(f"\n[i] Ke stažení: {len(todo)} dokumentů"
        + (f" (LIMIT={LIMIT})" if LIMIT else ""))

    stored = failed = placeholders = 0
    total = len(todo)
    for position, doc in enumerate(todo, 1):
        key = doc["_id"]
        log(f"\n--- [{position}/{total}] {key} | {doc.get('level', '?')} | {doc['desc'][:60]}")

        last_err = None
        for attempt in range(1, MAX_ATTEMPTS + 1):
            try:
                data, ext = download_source_bytes(page, doc)
                size_kb = len(data) / 1024
                if size_kb < 1024:
                    size_str = f"{size_kb:.0f} KB"
                else:
                    size_str = f"{size_kb / 1024:.1f} MB"
                digest = hashlib.sha256(data).hexdigest()
                mime = mimetypes.guess_type("f" + ext)[0] or "application/octet-stream"
                sw_path, sw_url = seaweed_store(doc["vtmf"], doc["version"], ext, data, mime)
                log(f"[ok] {size_str} -> SeaweedFS {sw_path}")
                coll.update_one({"_id": key}, {"$set": {
                    "downloaded": True,
                    "downloaded_at": datetime.now(),
                    "sha256": digest,
                    "seaweed_path": sw_path, "seaweed_url": sw_url,
                    "seaweed_synced_at": datetime.now(),
                    "last_error": None}})
                stored += 1
                last_err = None
                break
            except PlaceholderDocument:
                # Nothing to download; mark as handled so it is not retried.
                coll.update_one({"_id": key}, {"$set": {
                    "downloaded": True, "placeholder": True,
                    "downloaded_at": datetime.now(), "last_error": None}})
                placeholders += 1
                last_err = None
                break
            except Exception as exc:
                last_err = exc
                log(f"[!] Pokus {attempt}/{MAX_ATTEMPTS} selhal: {exc}")
                if attempt < MAX_ATTEMPTS:
                    page.wait_for_timeout(RETRY_PAUSE_MS)

        if last_err is not None:
            # All attempts failed: remember the last error for the next run.
            coll.update_one({"_id": key}, {"$set": {
                "last_error": str(last_err), "error_at": datetime.now()}})
            failed += 1
        page.wait_for_timeout(BETWEEN_DOCS_MS)

    return stored, failed, placeholders
# --- Main --------------------------------------------------------------
def main():
    """Run the whole pipeline.

    Steps: check credentials, connect to Mongo, log in to Vault, export and
    sync every enabled report, then download everything not yet downloaded.
    A summary is always logged and the browser is always closed.

    Exit codes: 2 when the pipeline failed with an exception, 1 when some
    downloads failed, 0 otherwise.
    """
    ensure_credentials()
    _, coll, runs = get_db()
    log(f"[ok] Mongo připojeno: {MONGO_URI} / {MONGO_DB}.{MONGO_COLL}")

    with sync_playwright() as playwright:
        context = playwright.chromium.launch_persistent_context(
            user_data_dir=str(PROFILE_DIR),
            headless=False,
            accept_downloads=True,
            no_viewport=True,
            args=["--start-maximized"],
        )
        page = context.pages[0] if context.pages else context.new_page()
        ok_count = fail_count = placeholder_count = 0
        pipeline_error = None
        try:
            # 1) login
            login_if_needed(page)
            verify_inside(page)
            dismiss_maintenance_popup(page)

            # 2+3) for every ENABLED report: export -> parse -> scoped sync
            log("\n[i] Plán reportů:")
            for planned in REPORTS:
                flag = "ZAP" if planned.get("enabled", True) else "VYP"
                log(f" [{flag}] {planned.get('name', planned['level'])}")

            for report in REPORTS:
                if not report.get("enabled", True):
                    log(f"\n[i] Přeskakuji (enabled=False): {report.get('name', report['level'])}")
                    continue
                report_path = download_report(page, report)
                parsed = read_documents_from_excel(report_path, report["level"])
                # Keep only rows that belong to the report's study.
                docs = [d for d in parsed if report["study"] in d["studies"]]
                if len(parsed) != len(docs):
                    log(f"[i] Ořez na {report['study']}: {len(docs)}/{len(parsed)} řádků.")
                if docs:
                    sync_report_to_mongo(coll, runs, docs, report, report_path)
                else:
                    # An empty report must not wipe the scope.
                    log(f"[!] Report {report['level']} prázdný (po ořezu) — "
                        f"sync přeskočen, nic se nemaže.")
                archive_report(report_path)

            # 4) a single pass downloading everything not yet downloaded
            ok_count, fail_count, placeholder_count = download_missing(page, coll)
        except KeyboardInterrupt:
            log("\n[!] Přerušeno uživatelem — stav je v Mongo, příští běh naváže.")
        except Exception as exc:
            pipeline_error = exc
            banner = "=" * 60
            print("\n" + banner)
            print(" PIPELINE SELHALA!")
            print(f" {type(exc).__name__}: {exc}")
            print(banner)
        finally:
            total = coll.count_documents({})
            active = coll.count_documents({"deleted": False})
            have = coll.count_documents({"deleted": False, "downloaded": True})
            log(f"\n[i] Výsledek běhu: {ok_count} staženo, "
                f"{placeholder_count} placeholderů, {fail_count} chyb"
                + (f", PIPELINE SELHALA ({pipeline_error})" if pipeline_error else "."))
            log(f"[i] Mongo: {total} záznamů celkem, {active} aktivních, "
                f"z toho v SeaweedFS {have} ({active - have} zbývá).")
            log("[i] Zavírám prohlížeč.")
            context.close()

    if pipeline_error:
        exit_code = 2
    elif fail_count:
        exit_code = 1
    else:
        exit_code = 0
    sys.exit(exit_code)
# Script entry point: run the pipeline only when the file is executed directly.
if __name__ == "__main__":
    main()