Add Outlook/Soubory/Clario/Feasibility scripts and reports; ignore Incoming, Outlook downloads & profile

This commit is contained in:
2026-06-03 16:15:19 +02:00
parent 61c6aeea23
commit 6c57ab3ae6
36 changed files with 4949 additions and 0 deletions
+3
View File
@@ -4,3 +4,6 @@ __pycache__/
.idea/
.claude/
EmailsImport/SouboryRůznéVelikosti/
IWRS/Patients/Incoming/
Outlook/downloads/
Outlook/outlook_profile/
@@ -0,0 +1,44 @@
"Protocol","Study Population","Country","Site","Principal Investigator","Participant ID","Baseline Stool Frequency","Visit","Visit Date","Endoscopy Completed?","Endoscopy Date","Bowel Preparation Start Date 1","Bowel Preparation End Date 1","Bowel Preparation Start Date 2","Bowel Preparation End Date 2","Central Endoscopy Score","Local Endoscopy Score","PGA Score","Eligible Day (-1)","Day (-1) Excluded Reason(s)","Eligible Day (-2)","Day (-2) Excluded Reason(s)","Eligible Day (-3)","Day (-3) Excluded Reason(s)","Eligible Day (-4)","Day (-4) Excluded Reason(s)","Eligible Day (-5)","Day (-5) Excluded Reason(s)","Eligible Day (-6)","Day (-6) Excluded Reason(s)","Eligible Day (-7)","Day (-7) Excluded Reason(s)","Eligible Day (-8)","Day (-8) Excluded Reason(s)","Eligible Day (-9)","Day (-9) Excluded Reason(s)","Eligible Day (-10)","Day (-10) Excluded Reason(s)","Eligible Day (-1) Stool Count","Eligible Day (-2) Stool Count","Eligible Day (-3) Stool Count","Eligible Day (-4) Stool Count","Eligible Day (-5) Stool Count","Eligible Day (-6) Stool Count","Eligible Day (-7) Stool Count","Eligible Day (-8) Stool Count","Eligible Day (-9) Stool Count","Eligible Day (-10) Stool Count","Stool Frequency Sub-score","Eligible Day (-1) Rectal Bleeding Score","Eligible Day (-2) Rectal Bleeding Score","Eligible Day (-3) Rectal Bleeding Score","Eligible Day (-4) Rectal Bleeding Score","Eligible Day (-5) Rectal Bleeding Score","Eligible Day (-6) Rectal Bleeding Score","Eligible Day (-7) Rectal Bleeding Score","Eligible Day (-8) Rectal Bleeding Score","Eligible Day (-9) Rectal Bleeding Score","Eligible Day (-10) Rectal Bleeding Score","Rectal Bleeding Sub-score","Partial Mayo Score","Modified Mayo Score","Full Mayo Score","Site Action","Last Mayo Score Submission","Week I-12 Clinical Responder","Week I-12 Clinical Remission","Clinical Flare","Loss of Response","Partial Mayo Response Post Loss of Response","Partial Mayo Response for Clinical Non-Responders"
"77242113UCO3001","Adult","Czech Republic","DD5-CZ10001","Matej Falc","CZ100012001","1","I-0","19 Feb 2026","Yes","05 Feb 2026","04 Feb 2026","04 Feb 2026","-","-","2","-","3","18 Feb 2026","-","17 Feb 2026","-","16 Feb 2026","-","15 Feb 2026","-","14 Feb 2026","-","13 Feb 2026","-","12 Feb 2026","-","11 Feb 2026","Day Not Applicable for Calculation","10 Feb 2026","Day Not Applicable for Calculation","09 Feb 2026","Day Not Applicable for Calculation","10","8","7","5","7","8","8","-","-","-","3","1","1","1","0","1","1","1","-","-","-","1","7","6","9","-","08 Apr 2026 07:11:25","N/A","N/A","N/A","N/A","N/A","N/A"
"77242113UCO3001","Adult","Czech Republic","DD5-CZ10001","Matej Falc","CZ100012001","1","I-2","04 Mar 2026","-","-","-","-","-","-","-","-","3","03 Mar 2026","-","02 Mar 2026","-","01 Mar 2026","-","28 Feb 2026","-","27 Feb 2026","-","26 Feb 2026","-","25 Feb 2026","-","24 Feb 2026","Day Not Applicable for Calculation","23 Feb 2026","Day Not Applicable for Calculation","22 Feb 2026","Day Not Applicable for Calculation","5","4","5","4","5","6","6","-","-","-","2","1","0","1","0","1","0","1","-","-","-","1","6","","","-","28 May 2026 10:04:05","N/A","N/A","N/A","N/A","N/A","N/A"
"77242113UCO3001","Adult","Czech Republic","DD5-CZ10001","Matej Falc","CZ100012001","1","I-4","18 Mar 2026","-","-","-","-","-","-","-","-","2","17 Mar 2026","-","16 Mar 2026","-","15 Mar 2026","-","14 Mar 2026","-","13 Mar 2026","-","12 Mar 2026","-","11 Mar 2026","-","10 Mar 2026","Day Not Applicable for Calculation","09 Mar 2026","Day Not Applicable for Calculation","08 Mar 2026","Day Not Applicable for Calculation","5","5","5","4","5","4","5","-","-","-","2","1","0","0","1","1","1","0","-","-","-","1","5","","","-","08 Apr 2026 11:04:49","N/A","N/A","N/A","N/A","N/A","N/A"
"77242113UCO3001","Adult","Czech Republic","DD5-CZ10001","Matej Falc","CZ100012001","1","I-8","05 May 2026","-","-","-","-","-","-","-","-","1","04 May 2026","-","03 May 2026","-","02 May 2026","-","01 May 2026","-","30 Apr 2026","-","29 Apr 2026","-","28 Apr 2026","-","27 Apr 2026","Day Not Applicable for Calculation","26 Apr 2026","Day Not Applicable for Calculation","25 Apr 2026","Day Not Applicable for Calculation","3","3","4","4","5","4","4","-","-","-","2","1","1","1","1","1","1","1","-","-","-","1","4","","","-","28 May 2026 14:42:53","N/A","N/A","N/A","N/A","N/A","N/A"
"77242113UCO3001","Adult","Czech Republic","DD5-CZ10001","Matej Falc","CZ100012001","1","I-12","13 May 2026","Yes","06 May 2026","05 May 2026","05 May 2026","-","-","1","-","1","12 May 2026","-","11 May 2026","-","10 May 2026","-","09 May 2026","-","08 May 2026","-","07 May 2026","-","06 May 2026","Endoscopy","05 May 2026","Bowel Preparation for Procedure;Day Not Applicable for Calculation","04 May 2026","-","03 May 2026","Day Not Applicable for Calculation","5","4","6","5","5","5","-","-","3","-","2","1","0","1","1","1","1","-","-","1","-","1","4","4","5","-","28 May 2026 14:43:11","Clinical Responder","No","N/A","N/A","N/A","N/A"
"77242113UCO3001","Adult","Czech Republic","DD5-CZ10001","Matej Falc","CZ100012002","1","I-0","08 Apr 2026","Yes","18 Mar 2026","17 Mar 2026","18 Mar 2026","-","-","2","-","2","07 Apr 2026","-","06 Apr 2026","-","05 Apr 2026","-","04 Apr 2026","Missing Diary","03 Apr 2026","-","02 Apr 2026","-","01 Apr 2026","-","31 Mar 2026","Day Not Applicable for Calculation","30 Mar 2026","Day Not Applicable for Calculation","29 Mar 2026","Day Not Applicable for Calculation","3","3","4","-","3","3","4","-","-","-","1","0","0","0","-","0","0","1","-","-","-","0","3","3","5","-","-","N/A","N/A","N/A","N/A","N/A","N/A"
"77242113UCO3001","Adult","Czech Republic","DD5-CZ10001","Matej Falc","CZ100012002","1","I-2","23 Apr 2026","-","-","-","-","-","-","-","-","2","22 Apr 2026","Missing Diary","21 Apr 2026","-","20 Apr 2026","-","19 Apr 2026","-","18 Apr 2026","-","17 Apr 2026","-","16 Apr 2026","-","15 Apr 2026","Day Not Applicable for Calculation","14 Apr 2026","Day Not Applicable for Calculation","13 Apr 2026","Day Not Applicable for Calculation","-","3","3","6","5","5","4","-","-","-","2","-","0","0","1","1","1","1","-","-","-","1","5","","","-","-","N/A","N/A","N/A","N/A","N/A","N/A"
"77242113UCO3001","Adult","Czech Republic","DD5-CZ10001","Matej Falc","CZ100012002","1","I-4","06 May 2026","-","-","-","-","-","-","-","-","1","05 May 2026","-","04 May 2026","-","03 May 2026","-","02 May 2026","-","01 May 2026","-","30 Apr 2026","-","29 Apr 2026","-","28 Apr 2026","Day Not Applicable for Calculation","27 Apr 2026","Day Not Applicable for Calculation","26 Apr 2026","Day Not Applicable for Calculation","6","3","2","3","3","3","3","-","-","-","1","1","0","0","0","1","1","0","-","-","-","0","2","","","-","28 May 2026 14:43:38","N/A","N/A","N/A","N/A","N/A","N/A"
"77242113UCO3001","Adult","Czech Republic","DD5-CZ10001","Matej Falc","CZ100012003","1","I-0","27 May 2026","Yes","13 May 2026","12 May 2026","12 May 2026","-","-","3","-","2","26 May 2026","-","25 May 2026","-","24 May 2026","-","23 May 2026","-","22 May 2026","-","21 May 2026","-","20 May 2026","-","19 May 2026","Day Not Applicable for Calculation","18 May 2026","Day Not Applicable for Calculation","17 May 2026","Day Not Applicable for Calculation","6","9","7","8","9","7","8","-","-","-","3","2","2","2","2","1","1","1","-","-","-","2","7","8","10","-","27 May 2026 07:24:39","N/A","N/A","N/A","N/A","N/A","N/A"
"77242113UCO3001","Adult","Czech Republic","DD5-CZ10006","Michal Konecny","CZ100062001","1","I-0","20 Mar 2026","Yes","19 Feb 2026","-","-","-","-","3","-","3","19 Mar 2026","-","18 Mar 2026","-","17 Mar 2026","-","16 Mar 2026","-","15 Mar 2026","-","14 Mar 2026","-","13 Mar 2026","-","12 Mar 2026","Day Not Applicable for Calculation","11 Mar 2026","Day Not Applicable for Calculation","10 Mar 2026","Day Not Applicable for Calculation","7","7","8","8","7","8","5","-","-","-","3","2","1","1","1","1","1","0","-","-","-","1","7","7","10","-","20 Mar 2026 07:02:44","N/A","N/A","N/A","N/A","N/A","N/A"
"77242113UCO3001","Adult","Czech Republic","DD5-CZ10006","Michal Konecny","CZ100062001","1","I-2","08 Apr 2026","-","-","-","-","-","-","-","-","2","07 Apr 2026","Medication For Diarrhea","06 Apr 2026","Medication For Diarrhea","05 Apr 2026","Medication For Diarrhea","04 Apr 2026","Medication For Diarrhea","03 Apr 2026","Medication For Diarrhea","02 Apr 2026","Medication For Diarrhea","01 Apr 2026","Medication For Diarrhea","31 Mar 2026","Medication For Diarrhea;Day Not Applicable for Calculation","30 Mar 2026","Medication For Diarrhea;Day Not Applicable for Calculation","29 Mar 2026","Day Not Applicable for Calculation","-","-","-","-","-","-","-","-","-","-","Non-Evaluable","-","-","-","-","-","-","-","-","-","-","Non-Evaluable","Non-Evaluable","Non-Evaluable","Non-Evaluable","-","-","N/A","N/A","N/A","N/A","N/A","N/A"
"77242113UCO3001","Adult","Czech Republic","DD5-CZ10006","Michal Konecny","CZ100062001","1","I-4","15 Apr 2026","-","-","-","-","-","-","-","-","3","14 Apr 2026","-","13 Apr 2026","-","12 Apr 2026","-","11 Apr 2026","-","10 Apr 2026","-","09 Apr 2026","-","08 Apr 2026","-","07 Apr 2026","Medication For Diarrhea;Day Not Applicable for Calculation","06 Apr 2026","Medication For Diarrhea;Day Not Applicable for Calculation","05 Apr 2026","Medication For Diarrhea;Day Not Applicable for Calculation","9","22","20","19","17","18","18","-","-","-","3","1","3","2","2","2","2","2","-","-","-","2","8","","","-","04 May 2026 22:05:32","N/A","N/A","N/A","N/A","N/A","N/A"
"77242113UCO3001","Adult","Czech Republic","DD5-CZ10006","Michal Konecny","CZ100062001","1","I-8","18 May 2026","-","-","-","-","-","-","-","-","2","17 May 2026","-","16 May 2026","-","15 May 2026","-","14 May 2026","-","13 May 2026","-","12 May 2026","-","11 May 2026","-","10 May 2026","Day Not Applicable for Calculation","09 May 2026","Day Not Applicable for Calculation","08 May 2026","Day Not Applicable for Calculation","7","5","9","7","7","8","8","-","-","-","3","1","1","1","1","1","1","1","-","-","-","1","6","","","-","29 May 2026 15:43:30","N/A","N/A","N/A","N/A","N/A","N/A"
"77242113UCO3001","Adult","Czech Republic","DD5-CZ10006","Michal Konecny","CZ100062002","1","I-0","26 May 2026","Yes","14 May 2026","13 May 2026","13 May 2026","-","-","2","-","2","25 May 2026","-","24 May 2026","-","23 May 2026","-","22 May 2026","-","21 May 2026","-","20 May 2026","-","19 May 2026","-","18 May 2026","Day Not Applicable for Calculation","17 May 2026","Day Not Applicable for Calculation","16 May 2026","Day Not Applicable for Calculation","8","8","6","7","7","6","7","-","-","-","3","2","2","2","2","2","2","2","-","-","-","2","7","7","9","-","29 May 2026 15:45:00","N/A","N/A","N/A","N/A","N/A","N/A"
"77242113UCO3001","Adult","Czech Republic","DD5-CZ10009","Jiri Pumprla","CZ100092001","1","I-0","05 May 2026","Yes","24 Apr 2026","23 Apr 2026","23 Apr 2026","-","-","2","-","2","04 May 2026","-","03 May 2026","-","02 May 2026","-","01 May 2026","-","30 Apr 2026","-","29 Apr 2026","-","28 Apr 2026","-","27 Apr 2026","Day Not Applicable for Calculation","26 Apr 2026","Day Not Applicable for Calculation","25 Apr 2026","Day Not Applicable for Calculation","5","5","5","5","5","5","5","-","-","-","2","1","1","1","1","1","1","1","-","-","-","1","5","5","7","-","05 May 2026 11:19:40","N/A","N/A","N/A","N/A","N/A","N/A"
"77242113UCO3001","Adult","Czech Republic","DD5-CZ10009","Jiri Pumprla","CZ100092001","1","I-2","19 May 2026","-","-","-","-","-","-","-","-","1","18 May 2026","-","17 May 2026","-","16 May 2026","-","15 May 2026","-","14 May 2026","-","13 May 2026","-","12 May 2026","-","11 May 2026","Day Not Applicable for Calculation","10 May 2026","Day Not Applicable for Calculation","09 May 2026","Day Not Applicable for Calculation","5","4","5","5","5","4","6","-","-","-","2","1","1","1","1","1","1","1","-","-","-","1","4","","","-","19 May 2026 10:38:25","N/A","N/A","N/A","N/A","N/A","N/A"
"77242113UCO3001","Adult","Czech Republic","DD5-CZ10012","Stefan Konecny","CZ100122001","5","I-0","07 Apr 2026","Yes","24 Mar 2026","22 Mar 2026","22 Mar 2026","-","-","2","-","2","06 Apr 2026","-","05 Apr 2026","-","04 Apr 2026","-","03 Apr 2026","-","02 Apr 2026","-","01 Apr 2026","-","31 Mar 2026","-","30 Mar 2026","Day Not Applicable for Calculation","29 Mar 2026","Day Not Applicable for Calculation","28 Mar 2026","Day Not Applicable for Calculation","8","11","5","9","11","10","13","-","-","-","3","1","2","2","2","2","2","2","-","-","-","2","7","7","9","-","04 May 2026 08:44:52","N/A","N/A","N/A","N/A","N/A","N/A"
"77242113UCO3001","Adult","Czech Republic","DD5-CZ10012","Stefan Konecny","CZ100122001","5","I-2","22 Apr 2026","-","-","-","-","-","-","-","-","2","21 Apr 2026","-","20 Apr 2026","-","19 Apr 2026","-","18 Apr 2026","-","17 Apr 2026","-","16 Apr 2026","-","15 Apr 2026","-","14 Apr 2026","Day Not Applicable for Calculation","13 Apr 2026","Day Not Applicable for Calculation","12 Apr 2026","Day Not Applicable for Calculation","7","5","6","6","7","8","2","-","-","-","1","1","0","1","1","1","2","0","-","-","-","1","4","","","-","04 May 2026 08:45:07","N/A","N/A","N/A","N/A","N/A","N/A"
"77242113UCO3001","Adult","Czech Republic","DD5-CZ10012","Stefan Konecny","CZ100122001","5","I-4","07 May 2026","-","-","-","-","-","-","-","-","1","06 May 2026","-","05 May 2026","-","04 May 2026","-","03 May 2026","-","02 May 2026","-","01 May 2026","-","30 Apr 2026","-","29 Apr 2026","Day Not Applicable for Calculation","28 Apr 2026","Day Not Applicable for Calculation","27 Apr 2026","Day Not Applicable for Calculation","8","7","7","8","4","11","7","-","-","-","1","2","1","1","1","0","1","1","-","-","-","1","3","","","-","01 Jun 2026 00:57:35","N/A","N/A","N/A","N/A","N/A","N/A"
"77242113UCO3001","Adult","Czech Republic","DD5-CZ10013","David Stepek","CZ100132001","1","I-0","24 Mar 2026","Yes","12 Mar 2026","11 Mar 2026","11 Mar 2026","-","-","2","-","2","23 Mar 2026","-","22 Mar 2026","-","21 Mar 2026","-","20 Mar 2026","-","19 Mar 2026","-","18 Mar 2026","-","17 Mar 2026","-","16 Mar 2026","Day Not Applicable for Calculation","15 Mar 2026","Day Not Applicable for Calculation","14 Mar 2026","Day Not Applicable for Calculation","8","6","5","7","6","7","6","-","-","-","3","1","1","1","0","1","1","1","-","-","-","1","6","6","8","-","05 Apr 2026 22:41:27","N/A","N/A","N/A","N/A","N/A","N/A"
"77242113UCO3001","Adult","Czech Republic","DD5-CZ10013","David Stepek","CZ100132001","1","I-2","08 Apr 2026","-","-","-","-","-","-","-","-","2","07 Apr 2026","-","06 Apr 2026","-","05 Apr 2026","-","04 Apr 2026","-","03 Apr 2026","-","02 Apr 2026","-","01 Apr 2026","-","31 Mar 2026","Day Not Applicable for Calculation","30 Mar 2026","Day Not Applicable for Calculation","29 Mar 2026","Day Not Applicable for Calculation","5","2","3","6","5","5","5","-","-","-","2","0","0","0","0","1","1","0","-","-","-","0","4","","","-","27 May 2026 12:53:52","N/A","N/A","N/A","N/A","N/A","N/A"
"77242113UCO3001","Adult","Czech Republic","DD5-CZ10013","David Stepek","CZ100132001","1","I-4","21 Apr 2026","-","-","-","-","-","-","-","-","0","20 Apr 2026","-","19 Apr 2026","-","18 Apr 2026","-","17 Apr 2026","-","16 Apr 2026","-","15 Apr 2026","-","14 Apr 2026","-","13 Apr 2026","Day Not Applicable for Calculation","12 Apr 2026","Day Not Applicable for Calculation","11 Apr 2026","Day Not Applicable for Calculation","4","3","4","3","3","4","4","-","-","-","2","0","0","0","0","0","0","0","-","-","-","0","2","","","-","27 May 2026 12:54:41","N/A","N/A","N/A","N/A","N/A","N/A"
"77242113UCO3001","Adult","Czech Republic","DD5-CZ10013","David Stepek","CZ100132002","1","I-0","12 May 2026","Yes","21 Apr 2026","20 Apr 2026","21 Apr 2026","-","-","2","-","2","11 May 2026","-","10 May 2026","-","09 May 2026","-","08 May 2026","-","07 May 2026","-","06 May 2026","-","05 May 2026","Missing Diary","04 May 2026","Day Not Applicable for Calculation","03 May 2026","Day Not Applicable for Calculation","02 May 2026","Day Not Applicable for Calculation","2","1","1","1","1","2","-","-","-","-","0","0","0","0","0","0","0","-","-","-","-","0","2","2","4","-","28 May 2026 23:19:30","N/A","N/A","N/A","N/A","N/A","N/A"
"77242113UCO3001","Adult","Czech Republic","DD5-CZ10013","David Stepek","CZ100132002","1","I-2","26 May 2026","-","-","-","-","-","-","-","-","1","25 May 2026","-","24 May 2026","Missing Diary","23 May 2026","-","22 May 2026","-","21 May 2026","-","20 May 2026","-","19 May 2026","-","18 May 2026","Missing Diary;Day Not Applicable for Calculation","17 May 2026","Day Not Applicable for Calculation","16 May 2026","Day Not Applicable for Calculation","1","-","1","2","1","2","2","-","-","-","1","0","-","0","0","0","0","0","-","-","-","0","2","","","-","28 May 2026 23:19:51","N/A","N/A","N/A","N/A","N/A","N/A"
"77242113UCO3001","Adult","Czech Republic","DD5-CZ10013","David Stepek","CZ100132003","0","I-0","02 Jun 2026","Yes","25 May 2026","24 May 2026","24 May 2026","-","-","2","-","2","01 Jun 2026","-","31 May 2026","-","30 May 2026","-","29 May 2026","-","28 May 2026","-","27 May 2026","-","26 May 2026","-","25 May 2026","Endoscopy;Missing Diary;Day Not Applicable for Calculation","24 May 2026","Bowel Preparation for Procedure;Missing Diary;Day Not Applicable for Calculation","23 May 2026","Missing Diary;Day Not Applicable for Calculation","8","8","11","10","10","11","6","-","-","-","3","2","2","1","2","1","2","2","-","-","-","2","7","7","9","-","02 Jun 2026 08:17:40","N/A","N/A","N/A","N/A","N/A","N/A"
"77242113UCO3001","Adult","Czech Republic","DD5-CZ10016","Robert Mudr","CZ100162001","1","I-0","28 May 2026","Yes","19 May 2026","18 May 2026","19 May 2026","-","-","3","-","3","27 May 2026","-","26 May 2026","-","25 May 2026","-","24 May 2026","-","23 May 2026","-","22 May 2026","-","21 May 2026","-","20 May 2026","Day Not Applicable for Calculation","19 May 2026","Endoscopy;Bowel Preparation for Procedure;Day Not Applicable for Calculation","18 May 2026","Bowel Preparation for Procedure;Day Not Applicable for Calculation","14","15","15","15","15","15","15","-","-","-","3","2","3","3","2","2","3","3","-","-","-","3","9","9","12","-","28 May 2026 10:17:25","N/A","N/A","N/A","N/A","N/A","N/A"
"77242113UCO3001","Adolescent","Czech Republic","DD5-CZ10020","Lucie Gonsorcikova","CZ100201001","1","Unscheduled 1","04 May 2026","Yes","20 Apr 2026","12 Apr 2026","15 Apr 2026","-","-","2","-","3","03 May 2026","-","02 May 2026","-","01 May 2026","-","30 Apr 2026","-","29 Apr 2026","-","28 Apr 2026","-","27 Apr 2026","-","26 Apr 2026","Day Not Applicable for Calculation","25 Apr 2026","Day Not Applicable for Calculation","24 Apr 2026","Day Not Applicable for Calculation","5","6","6","7","6","3","3","-","-","-","2","0","0","0","0","0","0","0","-","-","-","0","5","4","7","-","-","N/A","N/A","N/A","N/A","N/A","N/A"
"77242113UCO3001","Adolescent","Czech Republic","DD5-CZ10020","Lucie Gonsorcikova","CZ100201001","1","I-0","18 May 2026","Yes","01 May 2026","01 May 2026","01 May 2026","-","-","2","-","3","17 May 2026","-","16 May 2026","-","15 May 2026","-","14 May 2026","-","13 May 2026","-","12 May 2026","-","11 May 2026","-","10 May 2026","Day Not Applicable for Calculation","09 May 2026","Day Not Applicable for Calculation","08 May 2026","Day Not Applicable for Calculation","6","6","6","6","6","6","6","-","-","-","3","0","0","0","0","0","0","0","-","-","-","0","6","5","8","-","18 May 2026 08:38:55","N/A","N/A","N/A","N/A","N/A","N/A"
"77242113UCO3001","Adolescent","Czech Republic","DD5-CZ10020","Lucie Gonsorcikova","CZ100201001","1","I-2","01 Jun 2026","-","-","-","-","-","-","-","-","3","31 May 2026","-","30 May 2026","Missing Diary","29 May 2026","Missing Diary","28 May 2026","Missing Diary","27 May 2026","-","26 May 2026","-","25 May 2026","-","24 May 2026","Day Not Applicable for Calculation","23 May 2026","Day Not Applicable for Calculation","22 May 2026","Day Not Applicable for Calculation","6","-","-","-","6","6","6","-","-","-","3","0","-","-","-","0","0","0","-","-","-","0","6","","","-","-","N/A","N/A","N/A","N/A","N/A","N/A"
"77242113UCO3001","Adult","Czech Republic","DD5-CZ10021","Martin Bortlik","CZ100212001","1","I-0","07 Apr 2026","Yes","16 Mar 2026","15 Mar 2026","16 Mar 2026","-","-","3","-","3","06 Apr 2026","-","05 Apr 2026","-","04 Apr 2026","-","03 Apr 2026","-","02 Apr 2026","-","01 Apr 2026","-","31 Mar 2026","-","30 Mar 2026","Day Not Applicable for Calculation","29 Mar 2026","Day Not Applicable for Calculation","28 Mar 2026","Day Not Applicable for Calculation","11","11","10","11","11","10","9","-","-","-","3","2","2","2","2","2","2","2","-","-","-","2","8","8","11","-","20 Apr 2026 09:27:58","N/A","N/A","N/A","N/A","N/A","N/A"
"77242113UCO3001","Adult","Czech Republic","DD5-CZ10021","Martin Bortlik","CZ100212001","1","I-2","20 Apr 2026","-","-","-","-","-","-","-","-","3","19 Apr 2026","-","18 Apr 2026","-","17 Apr 2026","-","16 Apr 2026","-","15 Apr 2026","-","14 Apr 2026","-","13 Apr 2026","-","12 Apr 2026","Day Not Applicable for Calculation","11 Apr 2026","Day Not Applicable for Calculation","10 Apr 2026","Day Not Applicable for Calculation","8","7","9","8","8","7","8","-","-","-","3","2","2","1","1","1","2","1","-","-","-","1","7","","","-","20 Apr 2026 09:29:01","N/A","N/A","N/A","N/A","N/A","N/A"
"77242113UCO3001","Adult","Czech Republic","DD5-CZ10021","Martin Bortlik","CZ100212001","1","I-4","05 May 2026","-","-","-","-","-","-","-","-","1","04 May 2026","-","03 May 2026","-","02 May 2026","-","01 May 2026","-","30 Apr 2026","-","29 Apr 2026","-","28 Apr 2026","-","27 Apr 2026","Day Not Applicable for Calculation","26 Apr 2026","Day Not Applicable for Calculation","25 Apr 2026","Day Not Applicable for Calculation","6","6","6","6","7","7","6","-","-","-","3","0","0","1","1","1","1","1","-","-","-","1","5","","","-","-","N/A","N/A","N/A","N/A","N/A","N/A"
"77242113UCO3001","Adult","Czech Republic","DD5-CZ10021","Martin Bortlik","CZ100212001","1","I-8","02 Jun 2026","-","-","-","-","-","-","-","-","1","01 Jun 2026","-","31 May 2026","-","30 May 2026","-","29 May 2026","-","28 May 2026","-","27 May 2026","-","26 May 2026","-","25 May 2026","Day Not Applicable for Calculation","24 May 2026","Day Not Applicable for Calculation","23 May 2026","Day Not Applicable for Calculation","3","4","4","4","5","5","5","-","-","-","2","0","0","0","0","0","1","1","-","-","-","0","3","","","-","02 Jun 2026 14:44:34","N/A","N/A","N/A","N/A","N/A","N/A"
"77242113UCO3001","Adult","Czech Republic","DD5-CZ10022","Petr Hrabak","CZ100222002","1","I-0","19 Feb 2026","Yes","11 Feb 2026","10 Feb 2026","11 Feb 2026","-","-","2","-","2","18 Feb 2026","-","17 Feb 2026","-","16 Feb 2026","-","15 Feb 2026","-","14 Feb 2026","-","13 Feb 2026","-","12 Feb 2026","-","11 Feb 2026","Endoscopy;Bowel Preparation for Procedure;Day Not Applicable for Calculation","10 Feb 2026","Bowel Preparation for Procedure;Day Not Applicable for Calculation","09 Feb 2026","Day Not Applicable for Calculation","3","2","2","3","4","3","2","-","-","-","1","1","1","0","0","0","2","2","-","-","-","1","4","4","6","-","19 Feb 2026 15:41:35","N/A","N/A","N/A","N/A","N/A","N/A"
"77242113UCO3001","Adult","Czech Republic","DD5-CZ10022","Petr Hrabak","CZ100222003","1","I-0","09 Mar 2026","Yes","11 Feb 2026","10 Feb 2026","11 Feb 2026","-","-","2","-","2","08 Mar 2026","-","07 Mar 2026","-","06 Mar 2026","-","05 Mar 2026","-","04 Mar 2026","-","03 Mar 2026","Missing Diary","02 Mar 2026","Missing Diary","01 Mar 2026","Missing Diary;Day Not Applicable for Calculation","28 Feb 2026","Missing Diary;Day Not Applicable for Calculation","27 Feb 2026","Missing Diary;Day Not Applicable for Calculation","7","7","6","6","7","-","-","-","-","-","3","2","2","2","2","2","-","-","-","-","-","2","7","7","9","-","22 Mar 2026 18:34:58","N/A","N/A","N/A","N/A","N/A","N/A"
"77242113UCO3001","Adult","Czech Republic","DD5-CZ10022","Petr Hrabak","CZ100222003","1","I-2","27 Mar 2026","-","-","-","-","-","-","-","-","2","26 Mar 2026","-","25 Mar 2026","-","24 Mar 2026","-","23 Mar 2026","-","22 Mar 2026","-","21 Mar 2026","-","20 Mar 2026","-","19 Mar 2026","Day Not Applicable for Calculation","18 Mar 2026","Day Not Applicable for Calculation","17 Mar 2026","Day Not Applicable for Calculation","7","3","3","3","5","5","5","-","-","-","2","0","0","1","1","1","1","2","-","-","-","1","5","","","-","08 Apr 2026 07:36:56","N/A","N/A","N/A","N/A","N/A","N/A"
"77242113UCO3001","Adult","Czech Republic","DD5-CZ10022","Petr Hrabak","CZ100222003","1","I-4","08 Apr 2026","-","-","-","-","-","-","-","-","2","07 Apr 2026","-","06 Apr 2026","-","05 Apr 2026","-","04 Apr 2026","-","03 Apr 2026","-","02 Apr 2026","-","01 Apr 2026","-","31 Mar 2026","Day Not Applicable for Calculation","30 Mar 2026","Day Not Applicable for Calculation","29 Mar 2026","Day Not Applicable for Calculation","3","3","4","4","5","4","3","-","-","-","2","1","0","0","2","1","1","2","-","-","-","1","5","","","-","08 Apr 2026 07:59:35","N/A","N/A","N/A","N/A","N/A","N/A"
"77242113UCO3001","Adult","Czech Republic","DD5-CZ10022","Petr Hrabak","CZ100222003","1","I-8","04 May 2026","-","-","-","-","-","-","-","-","2","03 May 2026","-","02 May 2026","-","01 May 2026","-","30 Apr 2026","-","29 Apr 2026","-","28 Apr 2026","-","27 Apr 2026","-","26 Apr 2026","Day Not Applicable for Calculation","25 Apr 2026","Day Not Applicable for Calculation","24 Apr 2026","Missing Diary;Day Not Applicable for Calculation","3","5","3","3","3","2","3","-","-","-","1","0","0","0","0","0","0","0","-","-","-","0","3","","","-","04 May 2026 08:08:40","N/A","N/A","N/A","N/A","N/A","N/A"
"77242113UCO3001","Adult","Czech Republic","DD5-CZ10022","Petr Hrabak","CZ100222003","1","I-12","01 Jun 2026","Yes","20 May 2026","19 May 2026","20 May 2026","-","-","3","-","2","31 May 2026","-","30 May 2026","-","29 May 2026","-","28 May 2026","-","27 May 2026","-","26 May 2026","-","25 May 2026","-","24 May 2026","Day Not Applicable for Calculation","23 May 2026","Day Not Applicable for Calculation","22 May 2026","Day Not Applicable for Calculation","4","4","6","3","3","3","3","-","-","-","2","1","1","2","1","1","1","2","-","-","-","1","5","6","8","-","01 Jun 2026 14:25:57","Clinical Nonresponder","No","N/A","N/A","N/A","N/A"
"77242113UCO3001","Adult","Czech Republic","DD5-CZ10022","Petr Hrabak","CZ100222005","1","I-0","09 Apr 2026","Yes","08 Apr 2026","31 Mar 2026","01 Apr 2026","-","-","2","-","2","08 Apr 2026","Endoscopy","07 Apr 2026","-","06 Apr 2026","-","05 Apr 2026","-","04 Apr 2026","-","03 Apr 2026","-","02 Apr 2026","-","01 Apr 2026","Bowel Preparation for Procedure;Day Not Applicable for Calculation","31 Mar 2026","Bowel Preparation for Procedure;Day Not Applicable for Calculation","30 Mar 2026","-","-","3","3","4","3","4","3","-","-","3","1","-","2","2","2","2","2","2","-","-","2","2","5","5","7","-","29 May 2026 11:07:08","N/A","N/A","N/A","N/A","N/A","N/A"
"77242113UCO3001","Adult","Czech Republic","DD5-CZ10022","Petr Hrabak","CZ100222005","1","I-2","22 Apr 2026","-","-","-","-","-","-","-","-","2","21 Apr 2026","-","20 Apr 2026","-","19 Apr 2026","-","18 Apr 2026","-","17 Apr 2026","-","16 Apr 2026","-","15 Apr 2026","-","14 Apr 2026","Day Not Applicable for Calculation","13 Apr 2026","Day Not Applicable for Calculation","12 Apr 2026","Day Not Applicable for Calculation","3","3","5","3","2","3","2","-","-","-","1","1","2","2","1","1","1","2","-","-","-","1","4","","","-","05 May 2026 15:00:39","N/A","N/A","N/A","N/A","N/A","N/A"
"77242113UCO3001","Adult","Czech Republic","DD5-CZ10022","Petr Hrabak","CZ100222005","1","I-4","05 May 2026","-","-","-","-","-","-","-","-","2","04 May 2026","-","03 May 2026","-","02 May 2026","-","01 May 2026","-","30 Apr 2026","-","29 Apr 2026","-","28 Apr 2026","-","27 Apr 2026","Day Not Applicable for Calculation","26 Apr 2026","Day Not Applicable for Calculation","25 Apr 2026","Day Not Applicable for Calculation","4","2","2","2","2","2","2","-","-","-","1","1","1","1","1","2","1","1","-","-","-","1","4","","","-","05 May 2026 07:30:02","N/A","N/A","N/A","N/A","N/A","N/A"
"77242113UCO3001","Adult","Czech Republic","DD5-CZ10022","Petr Hrabak","CZ100222005","1","I-8","02 Jun 2026","-","-","-","-","-","-","-","-","2","01 Jun 2026","-","31 May 2026","-","30 May 2026","-","29 May 2026","-","28 May 2026","-","27 May 2026","-","26 May 2026","-","25 May 2026","Day Not Applicable for Calculation","24 May 2026","Day Not Applicable for Calculation","23 May 2026","Day Not Applicable for Calculation","2","2","2","2","2","4","10","-","-","-","1","2","1","2","1","2","2","2","-","-","-","2","5","","","-","02 Jun 2026 08:19:16","N/A","N/A","N/A","N/A","N/A","N/A"
1 Protocol Study Population Country Site Principal Investigator Participant ID Baseline Stool Frequency Visit Visit Date Endoscopy Completed? Endoscopy Date Bowel Preparation Start Date 1 Bowel Preparation End Date 1 Bowel Preparation Start Date 2 Bowel Preparation End Date 2 Central Endoscopy Score Local Endoscopy Score PGA Score Eligible Day (-1) Day (-1) Excluded Reason(s) Eligible Day (-2) Day (-2) Excluded Reason(s) Eligible Day (-3) Day (-3) Excluded Reason(s) Eligible Day (-4) Day (-4) Excluded Reason(s) Eligible Day (-5) Day (-5) Excluded Reason(s) Eligible Day (-6) Day (-6) Excluded Reason(s) Eligible Day (-7) Day (-7) Excluded Reason(s) Eligible Day (-8) Day (-8) Excluded Reason(s) Eligible Day (-9) Day (-9) Excluded Reason(s) Eligible Day (-10) Day (-10) Excluded Reason(s) Eligible Day (-1) Stool Count Eligible Day (-2) Stool Count Eligible Day (-3) Stool Count Eligible Day (-4) Stool Count Eligible Day (-5) Stool Count Eligible Day (-6) Stool Count Eligible Day (-7) Stool Count Eligible Day (-8) Stool Count Eligible Day (-9) Stool Count Eligible Day (-10) Stool Count Stool Frequency Sub-score Eligible Day (-1) Rectal Bleeding Score Eligible Day (-2) Rectal Bleeding Score Eligible Day (-3) Rectal Bleeding Score Eligible Day (-4) Rectal Bleeding Score Eligible Day (-5) Rectal Bleeding Score Eligible Day (-6) Rectal Bleeding Score Eligible Day (-7) Rectal Bleeding Score Eligible Day (-8) Rectal Bleeding Score Eligible Day (-9) Rectal Bleeding Score Eligible Day (-10) Rectal Bleeding Score Rectal Bleeding Sub-score Partial Mayo Score Modified Mayo Score Full Mayo Score Site Action Last Mayo Score Submission Week I-12 Clinical Responder Week I-12 Clinical Remission Clinical Flare Loss of Response Partial Mayo Response Post Loss of Response Partial Mayo Response for Clinical Non-Responders
2 77242113UCO3001 Adult Czech Republic DD5-CZ10001 Matej Falc CZ100012001 1 I-0 19 Feb 2026 Yes 05 Feb 2026 04 Feb 2026 04 Feb 2026 - - 2 - 3 18 Feb 2026 - 17 Feb 2026 - 16 Feb 2026 - 15 Feb 2026 - 14 Feb 2026 - 13 Feb 2026 - 12 Feb 2026 - 11 Feb 2026 Day Not Applicable for Calculation 10 Feb 2026 Day Not Applicable for Calculation 09 Feb 2026 Day Not Applicable for Calculation 10 8 7 5 7 8 8 - - - 3 1 1 1 0 1 1 1 - - - 1 7 6 9 - 08 Apr 2026 07:11:25 N/A N/A N/A N/A N/A N/A
3 77242113UCO3001 Adult Czech Republic DD5-CZ10001 Matej Falc CZ100012001 1 I-2 04 Mar 2026 - - - - - - - - 3 03 Mar 2026 - 02 Mar 2026 - 01 Mar 2026 - 28 Feb 2026 - 27 Feb 2026 - 26 Feb 2026 - 25 Feb 2026 - 24 Feb 2026 Day Not Applicable for Calculation 23 Feb 2026 Day Not Applicable for Calculation 22 Feb 2026 Day Not Applicable for Calculation 5 4 5 4 5 6 6 - - - 2 1 0 1 0 1 0 1 - - - 1 6 - 28 May 2026 10:04:05 N/A N/A N/A N/A N/A N/A
4 77242113UCO3001 Adult Czech Republic DD5-CZ10001 Matej Falc CZ100012001 1 I-4 18 Mar 2026 - - - - - - - - 2 17 Mar 2026 - 16 Mar 2026 - 15 Mar 2026 - 14 Mar 2026 - 13 Mar 2026 - 12 Mar 2026 - 11 Mar 2026 - 10 Mar 2026 Day Not Applicable for Calculation 09 Mar 2026 Day Not Applicable for Calculation 08 Mar 2026 Day Not Applicable for Calculation 5 5 5 4 5 4 5 - - - 2 1 0 0 1 1 1 0 - - - 1 5 - 08 Apr 2026 11:04:49 N/A N/A N/A N/A N/A N/A
5 77242113UCO3001 Adult Czech Republic DD5-CZ10001 Matej Falc CZ100012001 1 I-8 05 May 2026 - - - - - - - - 1 04 May 2026 - 03 May 2026 - 02 May 2026 - 01 May 2026 - 30 Apr 2026 - 29 Apr 2026 - 28 Apr 2026 - 27 Apr 2026 Day Not Applicable for Calculation 26 Apr 2026 Day Not Applicable for Calculation 25 Apr 2026 Day Not Applicable for Calculation 3 3 4 4 5 4 4 - - - 2 1 1 1 1 1 1 1 - - - 1 4 - 28 May 2026 14:42:53 N/A N/A N/A N/A N/A N/A
6 77242113UCO3001 Adult Czech Republic DD5-CZ10001 Matej Falc CZ100012001 1 I-12 13 May 2026 Yes 06 May 2026 05 May 2026 05 May 2026 - - 1 - 1 12 May 2026 - 11 May 2026 - 10 May 2026 - 09 May 2026 - 08 May 2026 - 07 May 2026 - 06 May 2026 Endoscopy 05 May 2026 Bowel Preparation for Procedure;Day Not Applicable for Calculation 04 May 2026 - 03 May 2026 Day Not Applicable for Calculation 5 4 6 5 5 5 - - 3 - 2 1 0 1 1 1 1 - - 1 - 1 4 4 5 - 28 May 2026 14:43:11 Clinical Responder No N/A N/A N/A N/A
7 77242113UCO3001 Adult Czech Republic DD5-CZ10001 Matej Falc CZ100012002 1 I-0 08 Apr 2026 Yes 18 Mar 2026 17 Mar 2026 18 Mar 2026 - - 2 - 2 07 Apr 2026 - 06 Apr 2026 - 05 Apr 2026 - 04 Apr 2026 Missing Diary 03 Apr 2026 - 02 Apr 2026 - 01 Apr 2026 - 31 Mar 2026 Day Not Applicable for Calculation 30 Mar 2026 Day Not Applicable for Calculation 29 Mar 2026 Day Not Applicable for Calculation 3 3 4 - 3 3 4 - - - 1 0 0 0 - 0 0 1 - - - 0 3 3 5 - - N/A N/A N/A N/A N/A N/A
8 77242113UCO3001 Adult Czech Republic DD5-CZ10001 Matej Falc CZ100012002 1 I-2 23 Apr 2026 - - - - - - - - 2 22 Apr 2026 Missing Diary 21 Apr 2026 - 20 Apr 2026 - 19 Apr 2026 - 18 Apr 2026 - 17 Apr 2026 - 16 Apr 2026 - 15 Apr 2026 Day Not Applicable for Calculation 14 Apr 2026 Day Not Applicable for Calculation 13 Apr 2026 Day Not Applicable for Calculation - 3 3 6 5 5 4 - - - 2 - 0 0 1 1 1 1 - - - 1 5 - - N/A N/A N/A N/A N/A N/A
9 77242113UCO3001 Adult Czech Republic DD5-CZ10001 Matej Falc CZ100012002 1 I-4 06 May 2026 - - - - - - - - 1 05 May 2026 - 04 May 2026 - 03 May 2026 - 02 May 2026 - 01 May 2026 - 30 Apr 2026 - 29 Apr 2026 - 28 Apr 2026 Day Not Applicable for Calculation 27 Apr 2026 Day Not Applicable for Calculation 26 Apr 2026 Day Not Applicable for Calculation 6 3 2 3 3 3 3 - - - 1 1 0 0 0 1 1 0 - - - 0 2 - 28 May 2026 14:43:38 N/A N/A N/A N/A N/A N/A
10 77242113UCO3001 Adult Czech Republic DD5-CZ10001 Matej Falc CZ100012003 1 I-0 27 May 2026 Yes 13 May 2026 12 May 2026 12 May 2026 - - 3 - 2 26 May 2026 - 25 May 2026 - 24 May 2026 - 23 May 2026 - 22 May 2026 - 21 May 2026 - 20 May 2026 - 19 May 2026 Day Not Applicable for Calculation 18 May 2026 Day Not Applicable for Calculation 17 May 2026 Day Not Applicable for Calculation 6 9 7 8 9 7 8 - - - 3 2 2 2 2 1 1 1 - - - 2 7 8 10 - 27 May 2026 07:24:39 N/A N/A N/A N/A N/A N/A
11 77242113UCO3001 Adult Czech Republic DD5-CZ10006 Michal Konecny CZ100062001 1 I-0 20 Mar 2026 Yes 19 Feb 2026 - - - - 3 - 3 19 Mar 2026 - 18 Mar 2026 - 17 Mar 2026 - 16 Mar 2026 - 15 Mar 2026 - 14 Mar 2026 - 13 Mar 2026 - 12 Mar 2026 Day Not Applicable for Calculation 11 Mar 2026 Day Not Applicable for Calculation 10 Mar 2026 Day Not Applicable for Calculation 7 7 8 8 7 8 5 - - - 3 2 1 1 1 1 1 0 - - - 1 7 7 10 - 20 Mar 2026 07:02:44 N/A N/A N/A N/A N/A N/A
12 77242113UCO3001 Adult Czech Republic DD5-CZ10006 Michal Konecny CZ100062001 1 I-2 08 Apr 2026 - - - - - - - - 2 07 Apr 2026 Medication For Diarrhea 06 Apr 2026 Medication For Diarrhea 05 Apr 2026 Medication For Diarrhea 04 Apr 2026 Medication For Diarrhea 03 Apr 2026 Medication For Diarrhea 02 Apr 2026 Medication For Diarrhea 01 Apr 2026 Medication For Diarrhea 31 Mar 2026 Medication For Diarrhea;Day Not Applicable for Calculation 30 Mar 2026 Medication For Diarrhea;Day Not Applicable for Calculation 29 Mar 2026 Day Not Applicable for Calculation - - - - - - - - - - Non-Evaluable - - - - - - - - - - Non-Evaluable Non-Evaluable Non-Evaluable Non-Evaluable - - N/A N/A N/A N/A N/A N/A
13 77242113UCO3001 Adult Czech Republic DD5-CZ10006 Michal Konecny CZ100062001 1 I-4 15 Apr 2026 - - - - - - - - 3 14 Apr 2026 - 13 Apr 2026 - 12 Apr 2026 - 11 Apr 2026 - 10 Apr 2026 - 09 Apr 2026 - 08 Apr 2026 - 07 Apr 2026 Medication For Diarrhea;Day Not Applicable for Calculation 06 Apr 2026 Medication For Diarrhea;Day Not Applicable for Calculation 05 Apr 2026 Medication For Diarrhea;Day Not Applicable for Calculation 9 22 20 19 17 18 18 - - - 3 1 3 2 2 2 2 2 - - - 2 8 - 04 May 2026 22:05:32 N/A N/A N/A N/A N/A N/A
14 77242113UCO3001 Adult Czech Republic DD5-CZ10006 Michal Konecny CZ100062001 1 I-8 18 May 2026 - - - - - - - - 2 17 May 2026 - 16 May 2026 - 15 May 2026 - 14 May 2026 - 13 May 2026 - 12 May 2026 - 11 May 2026 - 10 May 2026 Day Not Applicable for Calculation 09 May 2026 Day Not Applicable for Calculation 08 May 2026 Day Not Applicable for Calculation 7 5 9 7 7 8 8 - - - 3 1 1 1 1 1 1 1 - - - 1 6 - 29 May 2026 15:43:30 N/A N/A N/A N/A N/A N/A
15 77242113UCO3001 Adult Czech Republic DD5-CZ10006 Michal Konecny CZ100062002 1 I-0 26 May 2026 Yes 14 May 2026 13 May 2026 13 May 2026 - - 2 - 2 25 May 2026 - 24 May 2026 - 23 May 2026 - 22 May 2026 - 21 May 2026 - 20 May 2026 - 19 May 2026 - 18 May 2026 Day Not Applicable for Calculation 17 May 2026 Day Not Applicable for Calculation 16 May 2026 Day Not Applicable for Calculation 8 8 6 7 7 6 7 - - - 3 2 2 2 2 2 2 2 - - - 2 7 7 9 - 29 May 2026 15:45:00 N/A N/A N/A N/A N/A N/A
16 77242113UCO3001 Adult Czech Republic DD5-CZ10009 Jiri Pumprla CZ100092001 1 I-0 05 May 2026 Yes 24 Apr 2026 23 Apr 2026 23 Apr 2026 - - 2 - 2 04 May 2026 - 03 May 2026 - 02 May 2026 - 01 May 2026 - 30 Apr 2026 - 29 Apr 2026 - 28 Apr 2026 - 27 Apr 2026 Day Not Applicable for Calculation 26 Apr 2026 Day Not Applicable for Calculation 25 Apr 2026 Day Not Applicable for Calculation 5 5 5 5 5 5 5 - - - 2 1 1 1 1 1 1 1 - - - 1 5 5 7 - 05 May 2026 11:19:40 N/A N/A N/A N/A N/A N/A
17 77242113UCO3001 Adult Czech Republic DD5-CZ10009 Jiri Pumprla CZ100092001 1 I-2 19 May 2026 - - - - - - - - 1 18 May 2026 - 17 May 2026 - 16 May 2026 - 15 May 2026 - 14 May 2026 - 13 May 2026 - 12 May 2026 - 11 May 2026 Day Not Applicable for Calculation 10 May 2026 Day Not Applicable for Calculation 09 May 2026 Day Not Applicable for Calculation 5 4 5 5 5 4 6 - - - 2 1 1 1 1 1 1 1 - - - 1 4 - 19 May 2026 10:38:25 N/A N/A N/A N/A N/A N/A
18 77242113UCO3001 Adult Czech Republic DD5-CZ10012 Stefan Konecny CZ100122001 5 I-0 07 Apr 2026 Yes 24 Mar 2026 22 Mar 2026 22 Mar 2026 - - 2 - 2 06 Apr 2026 - 05 Apr 2026 - 04 Apr 2026 - 03 Apr 2026 - 02 Apr 2026 - 01 Apr 2026 - 31 Mar 2026 - 30 Mar 2026 Day Not Applicable for Calculation 29 Mar 2026 Day Not Applicable for Calculation 28 Mar 2026 Day Not Applicable for Calculation 8 11 5 9 11 10 13 - - - 3 1 2 2 2 2 2 2 - - - 2 7 7 9 - 04 May 2026 08:44:52 N/A N/A N/A N/A N/A N/A
19 77242113UCO3001 Adult Czech Republic DD5-CZ10012 Stefan Konecny CZ100122001 5 I-2 22 Apr 2026 - - - - - - - - 2 21 Apr 2026 - 20 Apr 2026 - 19 Apr 2026 - 18 Apr 2026 - 17 Apr 2026 - 16 Apr 2026 - 15 Apr 2026 - 14 Apr 2026 Day Not Applicable for Calculation 13 Apr 2026 Day Not Applicable for Calculation 12 Apr 2026 Day Not Applicable for Calculation 7 5 6 6 7 8 2 - - - 1 1 0 1 1 1 2 0 - - - 1 4 - 04 May 2026 08:45:07 N/A N/A N/A N/A N/A N/A
20 77242113UCO3001 Adult Czech Republic DD5-CZ10012 Stefan Konecny CZ100122001 5 I-4 07 May 2026 - - - - - - - - 1 06 May 2026 - 05 May 2026 - 04 May 2026 - 03 May 2026 - 02 May 2026 - 01 May 2026 - 30 Apr 2026 - 29 Apr 2026 Day Not Applicable for Calculation 28 Apr 2026 Day Not Applicable for Calculation 27 Apr 2026 Day Not Applicable for Calculation 8 7 7 8 4 11 7 - - - 1 2 1 1 1 0 1 1 - - - 1 3 - 01 Jun 2026 00:57:35 N/A N/A N/A N/A N/A N/A
21 77242113UCO3001 Adult Czech Republic DD5-CZ10013 David Stepek CZ100132001 1 I-0 24 Mar 2026 Yes 12 Mar 2026 11 Mar 2026 11 Mar 2026 - - 2 - 2 23 Mar 2026 - 22 Mar 2026 - 21 Mar 2026 - 20 Mar 2026 - 19 Mar 2026 - 18 Mar 2026 - 17 Mar 2026 - 16 Mar 2026 Day Not Applicable for Calculation 15 Mar 2026 Day Not Applicable for Calculation 14 Mar 2026 Day Not Applicable for Calculation 8 6 5 7 6 7 6 - - - 3 1 1 1 0 1 1 1 - - - 1 6 6 8 - 05 Apr 2026 22:41:27 N/A N/A N/A N/A N/A N/A
22 77242113UCO3001 Adult Czech Republic DD5-CZ10013 David Stepek CZ100132001 1 I-2 08 Apr 2026 - - - - - - - - 2 07 Apr 2026 - 06 Apr 2026 - 05 Apr 2026 - 04 Apr 2026 - 03 Apr 2026 - 02 Apr 2026 - 01 Apr 2026 - 31 Mar 2026 Day Not Applicable for Calculation 30 Mar 2026 Day Not Applicable for Calculation 29 Mar 2026 Day Not Applicable for Calculation 5 2 3 6 5 5 5 - - - 2 0 0 0 0 1 1 0 - - - 0 4 - 27 May 2026 12:53:52 N/A N/A N/A N/A N/A N/A
23 77242113UCO3001 Adult Czech Republic DD5-CZ10013 David Stepek CZ100132001 1 I-4 21 Apr 2026 - - - - - - - - 0 20 Apr 2026 - 19 Apr 2026 - 18 Apr 2026 - 17 Apr 2026 - 16 Apr 2026 - 15 Apr 2026 - 14 Apr 2026 - 13 Apr 2026 Day Not Applicable for Calculation 12 Apr 2026 Day Not Applicable for Calculation 11 Apr 2026 Day Not Applicable for Calculation 4 3 4 3 3 4 4 - - - 2 0 0 0 0 0 0 0 - - - 0 2 - 27 May 2026 12:54:41 N/A N/A N/A N/A N/A N/A
24 77242113UCO3001 Adult Czech Republic DD5-CZ10013 David Stepek CZ100132002 1 I-0 12 May 2026 Yes 21 Apr 2026 20 Apr 2026 21 Apr 2026 - - 2 - 2 11 May 2026 - 10 May 2026 - 09 May 2026 - 08 May 2026 - 07 May 2026 - 06 May 2026 - 05 May 2026 Missing Diary 04 May 2026 Day Not Applicable for Calculation 03 May 2026 Day Not Applicable for Calculation 02 May 2026 Day Not Applicable for Calculation 2 1 1 1 1 2 - - - - 0 0 0 0 0 0 0 - - - - 0 2 2 4 - 28 May 2026 23:19:30 N/A N/A N/A N/A N/A N/A
25 77242113UCO3001 Adult Czech Republic DD5-CZ10013 David Stepek CZ100132002 1 I-2 26 May 2026 - - - - - - - - 1 25 May 2026 - 24 May 2026 Missing Diary 23 May 2026 - 22 May 2026 - 21 May 2026 - 20 May 2026 - 19 May 2026 - 18 May 2026 Missing Diary;Day Not Applicable for Calculation 17 May 2026 Day Not Applicable for Calculation 16 May 2026 Day Not Applicable for Calculation 1 - 1 2 1 2 2 - - - 1 0 - 0 0 0 0 0 - - - 0 2 - 28 May 2026 23:19:51 N/A N/A N/A N/A N/A N/A
26 77242113UCO3001 Adult Czech Republic DD5-CZ10013 David Stepek CZ100132003 0 I-0 02 Jun 2026 Yes 25 May 2026 24 May 2026 24 May 2026 - - 2 - 2 01 Jun 2026 - 31 May 2026 - 30 May 2026 - 29 May 2026 - 28 May 2026 - 27 May 2026 - 26 May 2026 - 25 May 2026 Endoscopy;Missing Diary;Day Not Applicable for Calculation 24 May 2026 Bowel Preparation for Procedure;Missing Diary;Day Not Applicable for Calculation 23 May 2026 Missing Diary;Day Not Applicable for Calculation 8 8 11 10 10 11 6 - - - 3 2 2 1 2 1 2 2 - - - 2 7 7 9 - 02 Jun 2026 08:17:40 N/A N/A N/A N/A N/A N/A
27 77242113UCO3001 Adult Czech Republic DD5-CZ10016 Robert Mudr CZ100162001 1 I-0 28 May 2026 Yes 19 May 2026 18 May 2026 19 May 2026 - - 3 - 3 27 May 2026 - 26 May 2026 - 25 May 2026 - 24 May 2026 - 23 May 2026 - 22 May 2026 - 21 May 2026 - 20 May 2026 Day Not Applicable for Calculation 19 May 2026 Endoscopy;Bowel Preparation for Procedure;Day Not Applicable for Calculation 18 May 2026 Bowel Preparation for Procedure;Day Not Applicable for Calculation 14 15 15 15 15 15 15 - - - 3 2 3 3 2 2 3 3 - - - 3 9 9 12 - 28 May 2026 10:17:25 N/A N/A N/A N/A N/A N/A
28 77242113UCO3001 Adolescent Czech Republic DD5-CZ10020 Lucie Gonsorcikova CZ100201001 1 Unscheduled 1 04 May 2026 Yes 20 Apr 2026 12 Apr 2026 15 Apr 2026 - - 2 - 3 03 May 2026 - 02 May 2026 - 01 May 2026 - 30 Apr 2026 - 29 Apr 2026 - 28 Apr 2026 - 27 Apr 2026 - 26 Apr 2026 Day Not Applicable for Calculation 25 Apr 2026 Day Not Applicable for Calculation 24 Apr 2026 Day Not Applicable for Calculation 5 6 6 7 6 3 3 - - - 2 0 0 0 0 0 0 0 - - - 0 5 4 7 - - N/A N/A N/A N/A N/A N/A
29 77242113UCO3001 Adolescent Czech Republic DD5-CZ10020 Lucie Gonsorcikova CZ100201001 1 I-0 18 May 2026 Yes 01 May 2026 01 May 2026 01 May 2026 - - 2 - 3 17 May 2026 - 16 May 2026 - 15 May 2026 - 14 May 2026 - 13 May 2026 - 12 May 2026 - 11 May 2026 - 10 May 2026 Day Not Applicable for Calculation 09 May 2026 Day Not Applicable for Calculation 08 May 2026 Day Not Applicable for Calculation 6 6 6 6 6 6 6 - - - 3 0 0 0 0 0 0 0 - - - 0 6 5 8 - 18 May 2026 08:38:55 N/A N/A N/A N/A N/A N/A
30 77242113UCO3001 Adolescent Czech Republic DD5-CZ10020 Lucie Gonsorcikova CZ100201001 1 I-2 01 Jun 2026 - - - - - - - - 3 31 May 2026 - 30 May 2026 Missing Diary 29 May 2026 Missing Diary 28 May 2026 Missing Diary 27 May 2026 - 26 May 2026 - 25 May 2026 - 24 May 2026 Day Not Applicable for Calculation 23 May 2026 Day Not Applicable for Calculation 22 May 2026 Day Not Applicable for Calculation 6 - - - 6 6 6 - - - 3 0 - - - 0 0 0 - - - 0 6 - - N/A N/A N/A N/A N/A N/A
31 77242113UCO3001 Adult Czech Republic DD5-CZ10021 Martin Bortlik CZ100212001 1 I-0 07 Apr 2026 Yes 16 Mar 2026 15 Mar 2026 16 Mar 2026 - - 3 - 3 06 Apr 2026 - 05 Apr 2026 - 04 Apr 2026 - 03 Apr 2026 - 02 Apr 2026 - 01 Apr 2026 - 31 Mar 2026 - 30 Mar 2026 Day Not Applicable for Calculation 29 Mar 2026 Day Not Applicable for Calculation 28 Mar 2026 Day Not Applicable for Calculation 11 11 10 11 11 10 9 - - - 3 2 2 2 2 2 2 2 - - - 2 8 8 11 - 20 Apr 2026 09:27:58 N/A N/A N/A N/A N/A N/A
32 77242113UCO3001 Adult Czech Republic DD5-CZ10021 Martin Bortlik CZ100212001 1 I-2 20 Apr 2026 - - - - - - - - 3 19 Apr 2026 - 18 Apr 2026 - 17 Apr 2026 - 16 Apr 2026 - 15 Apr 2026 - 14 Apr 2026 - 13 Apr 2026 - 12 Apr 2026 Day Not Applicable for Calculation 11 Apr 2026 Day Not Applicable for Calculation 10 Apr 2026 Day Not Applicable for Calculation 8 7 9 8 8 7 8 - - - 3 2 2 1 1 1 2 1 - - - 1 7 - 20 Apr 2026 09:29:01 N/A N/A N/A N/A N/A N/A
33 77242113UCO3001 Adult Czech Republic DD5-CZ10021 Martin Bortlik CZ100212001 1 I-4 05 May 2026 - - - - - - - - 1 04 May 2026 - 03 May 2026 - 02 May 2026 - 01 May 2026 - 30 Apr 2026 - 29 Apr 2026 - 28 Apr 2026 - 27 Apr 2026 Day Not Applicable for Calculation 26 Apr 2026 Day Not Applicable for Calculation 25 Apr 2026 Day Not Applicable for Calculation 6 6 6 6 7 7 6 - - - 3 0 0 1 1 1 1 1 - - - 1 5 - - N/A N/A N/A N/A N/A N/A
34 77242113UCO3001 Adult Czech Republic DD5-CZ10021 Martin Bortlik CZ100212001 1 I-8 02 Jun 2026 - - - - - - - - 1 01 Jun 2026 - 31 May 2026 - 30 May 2026 - 29 May 2026 - 28 May 2026 - 27 May 2026 - 26 May 2026 - 25 May 2026 Day Not Applicable for Calculation 24 May 2026 Day Not Applicable for Calculation 23 May 2026 Day Not Applicable for Calculation 3 4 4 4 5 5 5 - - - 2 0 0 0 0 0 1 1 - - - 0 3 - 02 Jun 2026 14:44:34 N/A N/A N/A N/A N/A N/A
35 77242113UCO3001 Adult Czech Republic DD5-CZ10022 Petr Hrabak CZ100222002 1 I-0 19 Feb 2026 Yes 11 Feb 2026 10 Feb 2026 11 Feb 2026 - - 2 - 2 18 Feb 2026 - 17 Feb 2026 - 16 Feb 2026 - 15 Feb 2026 - 14 Feb 2026 - 13 Feb 2026 - 12 Feb 2026 - 11 Feb 2026 Endoscopy;Bowel Preparation for Procedure;Day Not Applicable for Calculation 10 Feb 2026 Bowel Preparation for Procedure;Day Not Applicable for Calculation 09 Feb 2026 Day Not Applicable for Calculation 3 2 2 3 4 3 2 - - - 1 1 1 0 0 0 2 2 - - - 1 4 4 6 - 19 Feb 2026 15:41:35 N/A N/A N/A N/A N/A N/A
36 77242113UCO3001 Adult Czech Republic DD5-CZ10022 Petr Hrabak CZ100222003 1 I-0 09 Mar 2026 Yes 11 Feb 2026 10 Feb 2026 11 Feb 2026 - - 2 - 2 08 Mar 2026 - 07 Mar 2026 - 06 Mar 2026 - 05 Mar 2026 - 04 Mar 2026 - 03 Mar 2026 Missing Diary 02 Mar 2026 Missing Diary 01 Mar 2026 Missing Diary;Day Not Applicable for Calculation 28 Feb 2026 Missing Diary;Day Not Applicable for Calculation 27 Feb 2026 Missing Diary;Day Not Applicable for Calculation 7 7 6 6 7 - - - - - 3 2 2 2 2 2 - - - - - 2 7 7 9 - 22 Mar 2026 18:34:58 N/A N/A N/A N/A N/A N/A
37 77242113UCO3001 Adult Czech Republic DD5-CZ10022 Petr Hrabak CZ100222003 1 I-2 27 Mar 2026 - - - - - - - - 2 26 Mar 2026 - 25 Mar 2026 - 24 Mar 2026 - 23 Mar 2026 - 22 Mar 2026 - 21 Mar 2026 - 20 Mar 2026 - 19 Mar 2026 Day Not Applicable for Calculation 18 Mar 2026 Day Not Applicable for Calculation 17 Mar 2026 Day Not Applicable for Calculation 7 3 3 3 5 5 5 - - - 2 0 0 1 1 1 1 2 - - - 1 5 - 08 Apr 2026 07:36:56 N/A N/A N/A N/A N/A N/A
38 77242113UCO3001 Adult Czech Republic DD5-CZ10022 Petr Hrabak CZ100222003 1 I-4 08 Apr 2026 - - - - - - - - 2 07 Apr 2026 - 06 Apr 2026 - 05 Apr 2026 - 04 Apr 2026 - 03 Apr 2026 - 02 Apr 2026 - 01 Apr 2026 - 31 Mar 2026 Day Not Applicable for Calculation 30 Mar 2026 Day Not Applicable for Calculation 29 Mar 2026 Day Not Applicable for Calculation 3 3 4 4 5 4 3 - - - 2 1 0 0 2 1 1 2 - - - 1 5 - 08 Apr 2026 07:59:35 N/A N/A N/A N/A N/A N/A
39 77242113UCO3001 Adult Czech Republic DD5-CZ10022 Petr Hrabak CZ100222003 1 I-8 04 May 2026 - - - - - - - - 2 03 May 2026 - 02 May 2026 - 01 May 2026 - 30 Apr 2026 - 29 Apr 2026 - 28 Apr 2026 - 27 Apr 2026 - 26 Apr 2026 Day Not Applicable for Calculation 25 Apr 2026 Day Not Applicable for Calculation 24 Apr 2026 Missing Diary;Day Not Applicable for Calculation 3 5 3 3 3 2 3 - - - 1 0 0 0 0 0 0 0 - - - 0 3 - 04 May 2026 08:08:40 N/A N/A N/A N/A N/A N/A
40 77242113UCO3001 Adult Czech Republic DD5-CZ10022 Petr Hrabak CZ100222003 1 I-12 01 Jun 2026 Yes 20 May 2026 19 May 2026 20 May 2026 - - 3 - 2 31 May 2026 - 30 May 2026 - 29 May 2026 - 28 May 2026 - 27 May 2026 - 26 May 2026 - 25 May 2026 - 24 May 2026 Day Not Applicable for Calculation 23 May 2026 Day Not Applicable for Calculation 22 May 2026 Day Not Applicable for Calculation 4 4 6 3 3 3 3 - - - 2 1 1 2 1 1 1 2 - - - 1 5 6 8 - 01 Jun 2026 14:25:57 Clinical Nonresponder No N/A N/A N/A N/A
41 77242113UCO3001 Adult Czech Republic DD5-CZ10022 Petr Hrabak CZ100222005 1 I-0 09 Apr 2026 Yes 08 Apr 2026 31 Mar 2026 01 Apr 2026 - - 2 - 2 08 Apr 2026 Endoscopy 07 Apr 2026 - 06 Apr 2026 - 05 Apr 2026 - 04 Apr 2026 - 03 Apr 2026 - 02 Apr 2026 - 01 Apr 2026 Bowel Preparation for Procedure;Day Not Applicable for Calculation 31 Mar 2026 Bowel Preparation for Procedure;Day Not Applicable for Calculation 30 Mar 2026 - - 3 3 4 3 4 3 - - 3 1 - 2 2 2 2 2 2 - - 2 2 5 5 7 - 29 May 2026 11:07:08 N/A N/A N/A N/A N/A N/A
42 77242113UCO3001 Adult Czech Republic DD5-CZ10022 Petr Hrabak CZ100222005 1 I-2 22 Apr 2026 - - - - - - - - 2 21 Apr 2026 - 20 Apr 2026 - 19 Apr 2026 - 18 Apr 2026 - 17 Apr 2026 - 16 Apr 2026 - 15 Apr 2026 - 14 Apr 2026 Day Not Applicable for Calculation 13 Apr 2026 Day Not Applicable for Calculation 12 Apr 2026 Day Not Applicable for Calculation 3 3 5 3 2 3 2 - - - 1 1 2 2 1 1 1 2 - - - 1 4 - 05 May 2026 15:00:39 N/A N/A N/A N/A N/A N/A
43 77242113UCO3001 Adult Czech Republic DD5-CZ10022 Petr Hrabak CZ100222005 1 I-4 05 May 2026 - - - - - - - - 2 04 May 2026 - 03 May 2026 - 02 May 2026 - 01 May 2026 - 30 Apr 2026 - 29 Apr 2026 - 28 Apr 2026 - 27 Apr 2026 Day Not Applicable for Calculation 26 Apr 2026 Day Not Applicable for Calculation 25 Apr 2026 Day Not Applicable for Calculation 4 2 2 2 2 2 2 - - - 1 1 1 1 1 2 1 1 - - - 1 4 - 05 May 2026 07:30:02 N/A N/A N/A N/A N/A N/A
44 77242113UCO3001 Adult Czech Republic DD5-CZ10022 Petr Hrabak CZ100222005 1 I-8 02 Jun 2026 - - - - - - - - 2 01 Jun 2026 - 31 May 2026 - 30 May 2026 - 29 May 2026 - 28 May 2026 - 27 May 2026 - 26 May 2026 - 25 May 2026 Day Not Applicable for Calculation 24 May 2026 Day Not Applicable for Calculation 23 May 2026 Day Not Applicable for Calculation 2 2 2 2 2 4 10 - - - 1 2 1 2 1 2 2 2 - - - 2 5 - 02 Jun 2026 08:19:16 N/A N/A N/A N/A N/A N/A
@@ -0,0 +1,178 @@
"Protocol","Country","Site","PI Name","Subject ID","Age at Informed Consent","Baseline Stool Count","Confirm Baseline Stool Count","Data Correction ID","Creation Date UTC","Status","Description","Date of Last Action UTC","Total Open Period","Total Open Time (Days)","Current Status Time (Days)","Type","Next Action Required","Category","Query History","Reason for Change","Resolution"
"77242113UCO3001","Czech Republic","DD5-CZ10001","Matej Falc","CZ100012001","48","1","","SW00703544","13-May-2026","Submitted","Please change answer to clinical remision from no to YES (week 12). Entry erros ","20-May-2026","8-14 Days","14","9","Query Active ","Site","New","(1) 20 May 2026 msullivan (Clario): Please confirm your request
Dear Site. Thank you for submitting this Data Clarification Request.
For us to process your request, please let us know the name of the form (with date) with question.
Thank you. ERT/CLARIO Data Coordination Team
","Entry Error",""
"77242113UCO3001","Czech Republic","DD5-CZ10001","Matej Falc","CZ100012002","79","1","","SW00696586","09-Apr-2026","ReadyForQC","Please correct date of endoscopy to date: 18 March 2026 (from 25 March 2026)","15-Apr-2026","Over 28 Days","36","32","Query Active ","Site","Site-Entered Data","","Entry Error","CLARIO RESOLUTION:
Part 1: In Mayo Subscore (1) dated 08 Apr 2026 for I-0 visit, CLARIO to make the following changes:
- What was the date of endoscopy? (ENDODT1D): from 25 Mar 2026 to 18 Mar 2026
- Data Flag (QSDFLG1B): from blank to check
"
"77242113UCO3001","Czech Republic","DD5-CZ10006","Michal Konecny","CZ100062001","19","1","","SW00704536","19-May-2026","ReadyForQC","Please change the endoscopy date to 19-FEB-2026. 06-MAR-2026 was entered in error. ","26-May-2026","8-14 Days","10","5","Query Active ","Site","Site-Entered Data","","Entry Error","CLARIO RESOLUTION:
Part 1: In Mayo Subscore (1) dated 20 Mar 2026 for I-0 visit, CLARIO to make the following changes:
-What was the date of endoscopy? (ENDODT1D): from 06 Mar 2026 to 19 Feb 2026
- Data Flag (QSDFLG1B): from blank to check
"
"77242113UCO3001","Czech Republic","DD5-CZ10012","Stefan Konecny","CZ100122001","22","5","Yes, I confirm this is the correct stool count.","SW00706684","01-Jun-2026","Submitted","The right endoscopy date is 23MAR2026, please change the date","01-Jun-2026","2-3 Days","2","2","","Clario DM","New","","Entry Error",""
"77242113UCO3001","Czech Republic","DD5-CZ10013","David Stepek","CZ100132002","29","1","","SW00705646","26-May-2026","Submitted","Correct visit date I-O is 12-May-2026. All questionaries were filled on paper and entered in tablet later.
Log-in issue. ","01-Jun-2026","4-7 Days","6","2","","Clario DM","New","(1) 01 Jun 2026 msullivan (Clario): Please confirm your request
Dear Site. Thank you for submitting this Data Clarification.
Please provide the timestamps for each of the assessments if you used paper forms and transcribed into the device.
If unknown, ERT will use a dummy timestamp.
Thank you. ERT/CLARIO Data Coordination Team.
(2) 01 Jun 2026 dstepek@vnbrno.cz (Site User): time is unknown
","Changed Information",""
"77242113UCO3001","Czech Republic","DD5-CZ10013","David Stepek","CZ100132003","49","0","","SW00706581","29-May-2026","Submitted","baseline stool count reported by subject is 0, please change to 1 as per CRA request (subject has 1 stool in 2-3 days if in remission)","29-May-2026","2-3 Days","2","2","","Clario DM","New","","Changed Information",""
"77242113UCO3001","Czech Republic","DD5-CZ10016","Robert Mudr","CZ100162001","48","1","","SW00705916","27-May-2026","ReadyForEntry","As per ATS investigation (ATS26040111), please remove the below form which was entered as a duplicate
- MAYO Diary (5) 24 Apr 2026","02-Jun-2026","4-7 Days","5","1","","Clario DM","Technical Revision","","Technical Revision - Other","CLARIO RESOLUTION:
Part 1: CLARIO to delete MAYO Diary (5) dated 24 Apr 2026
"
"77242113UCO3001","Czech Republic","DD5-CZ10020","Lucie Gonsorcikova","CZ100201001","15","1","","SW00701729","06-May-2026","Completed","Dears, please delete data from visit I-0 (reported as 4th of May 2026) as this visit had to be postponed - see the previous DCR of this patient and change data request that was corrected. Patient has left the site before it was resolved and and new date of I-0 was planned. Patient continues to fill in his diary and patient is coming to I=0 visit within allowed window. We need the system and tablet to be ready to run new Mayo Score Report with updated and recent data (e.g. reflect new I-0 visit date, new eligible days -1 to -7.).
thank you, Jiri Skopek","19-May-2026","8-14 Days","8","","","","Visit Data","(1) 11 May 2026 msullivan (Clario): Please confirm your request
Dear Site. Thank you for submitting this Data Clarification.
Please note that the delete forms are allowed if the reason is one of the following.
If not, forms will move to unscheduled visit.
Data collected by the wrong patient.
Data collected by someone other than the patient.
Data collected prior to informed consent, or after withdrawal from the study.
Duplicate data erroneously entered at an Unscheduled visit via paper transcription.
Data collected that is not expected per protocol.
Also, I-0 visit is still ongoing. Please close the visit.
Once the visit was closed, we will process accoridngly.
Thank you. ERT/CLARIO Data Coordination Team
(2) 11 May 2026 jskopek (Site User): Dears,
I do not see any option that is adequate -from the list. Data are not needed to be deleted fully, they reflect the situation at May4th. Please mark it as unscheduled visit - as exactly that is the case. We need the system to be ready for I-0 visit planned for next week.
I will close the visit tomorrow - do you mean in tablet/ipad?
Thank you very much for your help! Jiri
(3) 12 May 2026 venkata.ramana (Clario): Thank you for your response.
Please note that the visit I-0 was still ongoing but not closed yet.
So please close the visit.
Kind Regards, Clario Data Coordination Team.
(4) 12 May 2026 jskopek (Site User): If I try to close the I-O visit in TABLET, it asks me if patient fulfils eligibility criteria to proceed to next visit based on these old data if I answer NO, it asks me to DEACTIVATE patient. I do not want to DEACTIVATE patient can you help WHERE and HOW to close this visit for you to change it to UNSCHEDULED and not to de-activate patient?
Thank you Jiri
","Other-delete visit I-0","CLARIO RESOLUTION:
Part 1: In the following forms dated 04 May 2026, CLARIO to make the following changes:
-Event ID: from I-0 to Unscheduled Visit 1
-Event At Entry: from I-0 to Unscheduled Visit 1
+Visit Start (49)
+ePRO Availability (1)
+Mayo Subscore (1)
+PGA (1)
Part 2: CLARIO to delete the following forms dated 04 May 2026 for I-0 visit.
+C-SSRS Since Last Visit (1)
+C-SSRS Since Last Visit Findings Report (1)
Part 3: CLARIO to manually enter Visit End form for Unscheduled visit 1 with the following information:
-Protocol: 77242113UCO3001
-Report Date: 04 May 2026
-Report Start Date and Time: 04 May 2026 23:59:59
-Event ID: Unscheduled Visit 1
-Event End Date: 04 May 2026 23:59:59
-Visit Status: Incomplete
-Phase At Entry: Screening
-Phase At Entry Timestamp: 13 Apr 2026 12:32:20
-Event At Entry: Unscheduled visit 1
-Event Start Date: 04 May 2026 23:59:59
-Event Time Zone Offset in Milliseconds: 7200000
-Session Repeat Number (SESREP1N): 0
-Session Instance Id (SESINST1S): 3f1214f0-4788-11f1-a0cf-bb403212adce
"
"77242113UCO3001","Czech Republic","DD5-CZ10020","Lucie Gonsorcikova","CZ100201001","15","1","","SW00701226","04-May-2026","Completed","Dears, we would like ask you to change the information I read on assignment form given by patient on April 13, 2026 (Visit 1), Baseline Stool Count (PT.Custom4) as 3 that should be reported as 1.
Patient has entered wrong number as he did not understood it should be number of stools when illness is in remission or absent. He is a child and did not reflected this question correctly. Therefore, please change Baseline Stool Count = 1.
Thank you, Jiri Skopek ","04-May-2026","1 Day","1","","","","Demographic","","Changed Information","(Clario instructions)
1. Please make below changes in the assignment form:
Baseline Stool Count (PT. Custom4): 03 to 01."
"77242113UCO3001","Czech Republic","DD5-CZ10021","Martin Bortlik","CZ100212001","61","1","","SW00699492","23-Apr-2026","ReadyForQC","Please correct the date of endoscopy done during screening visit of patient CZ100212001 to correct date 16-MAR-2026.","29-Apr-2026","22-28 Days","27","23","Query Active ","Site","Site-Entered Data","","Changed Information","CLARIO RESOLUTION:
Part 1: In the Mayo Subscore (1) dated 07 Apr 2026 for I-0 visit, CLARIO to make the following changes:
-What was the date of endoscopy? (ENDODT1D): from 24 Mar 2026 to 16 Mar 2026
- Data Flag (QSDFLG1B): from blank to check
"
"77242113UCO3001","Czech Republic","DD5-CZ10022","Petr Hrabak","CZ100222003","39","1","","SW00703322","12-May-2026","Completed","As per ATS investigation (ATS26040111), please remove the below form that's been entered as a duplicate
- MAYO Diary (16) - 18 Mar 2026
","20-May-2026","4-7 Days","6","","","","Technical Revision","","Technical Revision - Other","CLARIO RESOLUTION:
Part 1: CLARIO to delete the MAYO Diary (16) dated 18 Mar 2026.
"
"77242113UCO3001","Czech Republic","DD5-CZ10022","Petr Hrabak","CZ100222003","39","1","","SW00689748","09-Mar-2026","Completed","Dear all,
Patient CZ 100222003 was randomized on 9 Mar 2026. Kindly correct the colonoscopy date to 11 Feb 2025.
The date was initially entered as 21 Feb 2025 because the earlier date could not be entered in the system. The patient was rescreened.","02-Apr-2026","15-21 Days","17","","","","Site-Entered Data","(1) 13 Mar 2026 msullivan (Clario): Please confirm your request
Dear Site. Thank you for submitting this Data Clarification.
Could you please conform that if you are requesting following?
Mayo Subscore (1) dated 09 Mar 2026 for I-0 visit
-What was the date of endoscopy? (ENDODT1D): from 23 Feb 2026 to 11 Feb 2025
Could you please confirm the year? This subject was assigned on 02 Mar 2026, you are providing that correct date is 11 Feb 2025 which a year ago.
If you are not requesting above, please provide us the name of the form with question.
Thank you. ERT/CLARIO Data Coordination Team
(2) 13 Mar 2026 katerina.havlikova@clinoxus.com (Site User): confirm date of colonoscopy 11Feb2026
(3) 21 Mar 2026 msullivan (Clario): Dear Site,
The requested changes to the Mayo data have been updated. Please navigate to the Mayo Score Report and resubmit the form for visit to log the updated Mayo Score form. Once done, please respond to this query confirming that the Mayo Score has been resubmitted.
Thank you. ERT/CLARIO Data Coordination Team
(4) 24 Mar 2026 jana.pomahacova@clinoxus.com (Site User): Thank you and sent
","New Information","CLARIO RESOLUTION:
Part 1: In the Mayo Subscore (1) dated 09 Mar 2026 for I-0 visit, CLARIO to make the following changes:
-What was the date of endoscopy? (ENDODT1D): from 23 Feb 2026 to 11 Feb 2025
-Data Flag (QSDFLG1B): from blank to check"
"77242113UCO3001","Czech Republic","DD5-CZ10022","Petr Hrabak","CZ100222005","33","1","","SW00705372","22-May-2026","Submitted","Dear all, please change Colonoscopz date from 8April2026 to date 01Apr2026 Thank you in advance","02-Jun-2026","4-7 Days","7","1","","Clario DM","New","(1) 29 May 2026 msullivan (Clario): Please confirm your request
Dear Site. Thank you for submitting this Data Clarification.
Please provide us the name of the form for this request.
Thank you. ERT/CLARIO Data Coordination Team
(2) 02 Jun 2026 katerina.havlikova@clinoxus.com (Site User): Dear all, please change Colonoscopy for Week I-12 date from 8April2026 to date 01Apr2026 Thank you in advance
","Changed Information",""
"77242113UCO3001","Czech Republic","DD5-CZ10022","Petr Hrabak","CZ100222005","33","1","","SW00702538","08-May-2026","Completed","This TRR is to document the correction to the Mayo Subscore (1) form, where the following variables were populated with NULL values, due to a known core defect:
Event At Entry, Event Start Date, Event Time Zone Offset in Milliseconds.","12-May-2026","2-3 Days","2","","","","Technical Revision","","Technical Revision - Other","Please make the below changes in Mayo Subscore (1) dated 22 Apr 2026:
-Event At Entry: I-0
-Event Start Date: 09 Apr 2026 08:09:19
-Event Time Zone Offset in Milliseconds: 7200000"
1 Protocol Country Site PI Name Subject ID Age at Informed Consent Baseline Stool Count Confirm Baseline Stool Count Data Correction ID Creation Date UTC Status Description Date of Last Action UTC Total Open Period Total Open Time (Days) Current Status Time (Days) Type Next Action Required Category Query History Reason for Change Resolution
2 77242113UCO3001 Czech Republic DD5-CZ10001 Matej Falc CZ100012001 48 1 SW00703544 13-May-2026 Submitted Please change answer to clinical remision from no to YES (week 12). Entry erros 20-May-2026 8-14 Days 14 9 Query Active Site New (1) 20 May 2026 msullivan (Clario): Please confirm your request Dear Site. Thank you for submitting this Data Clarification Request. For us to process your request, please let us know the name of the form (with date) with question. Thank you. ERT/CLARIO Data Coordination Team Entry Error
3 77242113UCO3001 Czech Republic DD5-CZ10001 Matej Falc CZ100012002 79 1 SW00696586 09-Apr-2026 ReadyForQC Please correct date of endoscopy to date: 18 March 2026 (from 25 March 2026) 15-Apr-2026 Over 28 Days 36 32 Query Active Site Site-Entered Data Entry Error CLARIO RESOLUTION: Part 1: In Mayo Subscore (1) dated 08 Apr 2026 for I-0 visit, CLARIO to make the following changes: - What was the date of endoscopy? (ENDODT1D): from 25 Mar 2026 to 18 Mar 2026 - Data Flag (QSDFLG1B): from blank to check
4 77242113UCO3001 Czech Republic DD5-CZ10006 Michal Konecny CZ100062001 19 1 SW00704536 19-May-2026 ReadyForQC Please change the endoscopy date to 19-FEB-2026. 06-MAR-2026 was entered in error. 26-May-2026 8-14 Days 10 5 Query Active Site Site-Entered Data Entry Error CLARIO RESOLUTION: Part 1: In Mayo Subscore (1) dated 20 Mar 2026 for I-0 visit, CLARIO to make the following changes: -What was the date of endoscopy? (ENDODT1D): from 06 Mar 2026 to 19 Feb 2026 - Data Flag (QSDFLG1B): from blank to check
5 77242113UCO3001 Czech Republic DD5-CZ10012 Stefan Konecny CZ100122001 22 5 Yes, I confirm this is the correct stool count. SW00706684 01-Jun-2026 Submitted The right endoscopy date is 23MAR2026, please change the date 01-Jun-2026 2-3 Days 2 2 Clario DM New Entry Error
6 77242113UCO3001 Czech Republic DD5-CZ10013 David Stepek CZ100132002 29 1 SW00705646 26-May-2026 Submitted Correct visit date I-O is 12-May-2026. All questionaries were filled on paper and entered in tablet later. Log-in issue. 01-Jun-2026 4-7 Days 6 2 Clario DM New (1) 01 Jun 2026 msullivan (Clario): Please confirm your request Dear Site. Thank you for submitting this Data Clarification. Please provide the timestamps for each of the assessments if you used paper forms and transcribed into the device. If unknown, ERT will use a dummy timestamp. Thank you. ERT/CLARIO Data Coordination Team. (2) 01 Jun 2026 dstepek@vnbrno.cz (Site User): time is unknown Changed Information
7 77242113UCO3001 Czech Republic DD5-CZ10013 David Stepek CZ100132003 49 0 SW00706581 29-May-2026 Submitted baseline stool count reported by subject is 0, please change to 1 as per CRA request (subject has 1 stool in 2-3 days if in remission) 29-May-2026 2-3 Days 2 2 Clario DM New Changed Information
8 77242113UCO3001 Czech Republic DD5-CZ10016 Robert Mudr CZ100162001 48 1 SW00705916 27-May-2026 ReadyForEntry As per ATS investigation (ATS26040111), please remove the below form which was entered as a duplicate - MAYO Diary (5) 24 Apr 2026 02-Jun-2026 4-7 Days 5 1 Clario DM Technical Revision Technical Revision - Other CLARIO RESOLUTION: Part 1: CLARIO to delete MAYO Diary (5) dated 24 Apr 2026
9 77242113UCO3001 Czech Republic DD5-CZ10020 Lucie Gonsorcikova CZ100201001 15 1 SW00701729 06-May-2026 Completed Dears, please delete data from visit I-0 (reported as 4th of May 2026) as this visit had to be postponed - see the previous DCR of this patient and change data request that was corrected. Patient has left the site before it was resolved and and new date of I-0 was planned. Patient continues to fill in his diary and patient is coming to I=0 visit within allowed window. We need the system and tablet to be ready to run new Mayo Score Report with updated and recent data (e.g. reflect new I-0 visit date, new eligible days -1 to -7.). thank you, Jiri Skopek 19-May-2026 8-14 Days 8 Visit Data (1) 11 May 2026 msullivan (Clario): Please confirm your request Dear Site. Thank you for submitting this Data Clarification. Please note that the delete forms are allowed if the reason is one of the following. If not, forms will move to unscheduled visit. Data collected by the wrong patient. Data collected by someone other than the patient. Data collected prior to informed consent, or after withdrawal from the study. Duplicate data erroneously entered at an Unscheduled visit via paper transcription. Data collected that is not expected per protocol. Also, I-0 visit is still ongoing. Please close the visit. Once the visit was closed, we will process accoridngly. Thank you. ERT/CLARIO Data Coordination Team (2) 11 May 2026 jskopek (Site User): Dears, I do not see any option that is adequate -from the list. Data are not needed to be deleted fully, they reflect the situation at May4th. Please mark it as unscheduled visit - as exactly that is the case. We need the system to be ready for I-0 visit planned for next week. I will close the visit tomorrow - do you mean in tablet/ipad? Thank you very much for your help! Jiri (3) 12 May 2026 venkata.ramana (Clario): Thank you for your response. Please note that the visit I-0 was still ongoing but not closed yet. 
So please close the visit. Kind Regards, Clario Data Coordination Team. (4) 12 May 2026 jskopek (Site User): If I try to close the I-O visit in TABLET, it asks me if patient fulfils eligibility criteria to proceed to next visit based on these old data – if I answer NO, it asks me to DEACTIVATE patient. I do not want to DEACTIVATE patient – can you help WHERE and HOW to close this visit for you to change it to UNSCHEDULED and not to de-activate patient? Thank you Jiri Other-delete visit I-0 CLARIO RESOLUTION: Part 1: In the following forms dated 04 May 2026, CLARIO to make the following changes: -Event ID: from I-0 to Unscheduled Visit 1 -Event At Entry: from I-0 to Unscheduled Visit 1 +Visit Start (49) +ePRO Availability (1) +Mayo Subscore (1) +PGA (1) Part 2: CLARIO to delete the following forms dated 04 May 2026 for I-0 visit. +C-SSRS Since Last Visit (1) +C-SSRS Since Last Visit Findings Report (1) Part 3: CLARIO to manually enter Visit End form for Unscheduled visit 1 with the following information: -Protocol: 77242113UCO3001 -Report Date: 04 May 2026 -Report Start Date and Time: 04 May 2026 23:59:59 -Event ID: Unscheduled Visit 1 -Event End Date: 04 May 2026 23:59:59 -Visit Status: Incomplete -Phase At Entry: Screening -Phase At Entry Timestamp: 13 Apr 2026 12:32:20 -Event At Entry: Unscheduled visit 1 -Event Start Date: 04 May 2026 23:59:59 -Event Time Zone Offset in Milliseconds: 7200000 -Session Repeat Number (SESREP1N): 0 -Session Instance Id (SESINST1S): 3f1214f0-4788-11f1-a0cf-bb403212adce
10 77242113UCO3001 Czech Republic DD5-CZ10020 Lucie Gonsorcikova CZ100201001 15 1 SW00701226 04-May-2026 Completed Dears, we would like ask you to change the information I read on assignment form given by patient on April 13, 2026 (Visit 1), Baseline Stool Count (PT.Custom4) as 3 that should be reported as 1. Patient has entered wrong number as he did not understood it should be number of stools when illness is in remission or absent. He is a child and did not reflected this question correctly. Therefore, please change Baseline Stool Count = 1. Thank you, Jiri Skopek 04-May-2026 1 Day 1 Demographic Changed Information (Clario instructions) 1. Please make below changes in the assignment form: Baseline Stool Count (PT. Custom4): 03 to 01.
11 77242113UCO3001 Czech Republic DD5-CZ10021 Martin Bortlik CZ100212001 61 1 SW00699492 23-Apr-2026 ReadyForQC Please correct the date of endoscopy done during screening visit of patient CZ100212001 to correct date 16-MAR-2026. 29-Apr-2026 22-28 Days 27 23 Query Active Site Site-Entered Data Changed Information CLARIO RESOLUTION: Part 1: In the Mayo Subscore (1) dated 07 Apr 2026 for I-0 visit, CLARIO to make the following changes: -What was the date of endoscopy? (ENDODT1D): from 24 Mar 2026 to 16 Mar 2026 - Data Flag (QSDFLG1B): from blank to check
12 77242113UCO3001 Czech Republic DD5-CZ10022 Petr Hrabak CZ100222003 39 1 SW00703322 12-May-2026 Completed As per ATS investigation (ATS26040111), please remove the below form that's been entered as a duplicate - MAYO Diary (16) - 18 Mar 2026 20-May-2026 4-7 Days 6 Technical Revision Technical Revision - Other CLARIO RESOLUTION: Part 1: CLARIO to delete the MAYO Diary (16) dated 18 Mar 2026.
13 77242113UCO3001 Czech Republic DD5-CZ10022 Petr Hrabak CZ100222003 39 1 SW00689748 09-Mar-2026 Completed Dear all, Patient CZ 100222003 was randomized on 9 Mar 2026. Kindly correct the colonoscopy date to 11 Feb 2025. The date was initially entered as 21 Feb 2025 because the earlier date could not be entered in the system. The patient was rescreened. 02-Apr-2026 15-21 Days 17 Site-Entered Data (1) 13 Mar 2026 msullivan (Clario): Please confirm your request Dear Site. Thank you for submitting this Data Clarification. Could you please conform that if you are requesting following? Mayo Subscore (1) dated 09 Mar 2026 for I-0 visit -What was the date of endoscopy? (ENDODT1D): from 23 Feb 2026 to 11 Feb 2025 Could you please confirm the year? This subject was assigned on 02 Mar 2026, you are providing that correct date is 11 Feb 2025 which a year ago. If you are not requesting above, please provide us the name of the form with question. Thank you. ERT/CLARIO Data Coordination Team (2) 13 Mar 2026 katerina.havlikova@clinoxus.com (Site User): confirm date of colonoscopy 11Feb2026 (3) 21 Mar 2026 msullivan (Clario): Dear Site, The requested changes to the Mayo data have been updated. Please navigate to the Mayo Score Report and resubmit the form for visit to log the updated Mayo Score form. Once done, please respond to this query confirming that the Mayo Score has been resubmitted. Thank you. ERT/CLARIO Data Coordination Team (4) 24 Mar 2026 jana.pomahacova@clinoxus.com (Site User): Thank you and sent New Information CLARIO RESOLUTION: Part 1: In the Mayo Subscore (1) dated 09 Mar 2026 for I-0 visit, CLARIO to make the following changes: -What was the date of endoscopy? (ENDODT1D): from 23 Feb 2026 to 11 Feb 2025 -Data Flag (QSDFLG1B): from blank to check
14 77242113UCO3001 Czech Republic DD5-CZ10022 Petr Hrabak CZ100222005 33 1 SW00705372 22-May-2026 Submitted Dear all, please change Colonoscopz date from 8April2026 to date 01Apr2026 Thank you in advance 02-Jun-2026 4-7 Days 7 1 Clario DM New (1) 29 May 2026 msullivan (Clario): Please confirm your request Dear Site. Thank you for submitting this Data Clarification. Please provide us the name of the form for this request. Thank you. ERT/CLARIO Data Coordination Team (2) 02 Jun 2026 katerina.havlikova@clinoxus.com (Site User): Dear all, please change Colonoscopy for Week I-12 date from 8April2026 to date 01Apr2026 Thank you in advance Changed Information
15 77242113UCO3001 Czech Republic DD5-CZ10022 Petr Hrabak CZ100222005 33 1 SW00702538 08-May-2026 Completed This TRR is to document the correction to the Mayo Subscore (1) form, where the following variables were populated with NULL values, due to a known core defect: Event At Entry, Event Start Date, Event Time Zone Offset in Milliseconds. 12-May-2026 2-3 Days 2 Technical Revision Technical Revision - Other Please make the below changes in Mayo Subscore (1) dated 22 Apr 2026: -Event At Entry: I-0 -Event Start Date: 09 Apr 2026 08:09:19 -Event Time Zone Offset in Milliseconds: 7200000
@@ -0,0 +1,6 @@
"Protocol","Country","Site ID","PI_NAME","Subject Number","Age","Data Correction ID","Creation Date UTC","Status","Date of Last Action UTC","Total Open Period","Total Open Time (Days)","Current Status Time (Days)","Type","Next Action Required","Category","Query History","Reason for Change"
"77242113UCO3001_ANALYSIS","Czech Republic The","CZ10001","Falc, Matej","CZ100012001","48 Years","16923867","14-May-2026","Escalated","26-May-2026","8-14 Days","13","5","QUERY","Site","Patient","(3) 15 May 2026 Clario: You can upload scans of your paper ECGs using the Site Upload Tool. ---- Instructions can be found in the ""Reference Materials"" tab of the study portal. Please contact Customer Care for assistance if needed!","Data Checks"
"77242113UCO3001_ANALYSIS","Czech Republic The","CZ10001","Falc, Matej","CZ100012001","48 Years","16567067","22-Jan-2026","Resolved","28-Jan-2026","4-7 Days","4","","QUERY","","Patient","MD Falc","Data Checks"
"77242113UCO3001_ANALYSIS","Czech Republic The","CZ10009","Pumprla, Jiri","CZ100092001","49 Years","16776685","31-Mar-2026","Resolved","13-May-2026","Over 28 Days","29","","QUERY","","Patient","(2) 13 May 2026 Clario: I confirm, that only ONE ECG was collected by mistake.","Data Checks"
"77242113UCO3001_ANALYSIS","Czech Republic The","CZ10021","Bortlik, Martin","CZ100212001","61 Years","16717619","11-Mar-2026","Resolved","28-Apr-2026","Over 28 Days","32","","QUERY","","Patient","(2) 28 Apr 2026 Clario: I confirmed that due to technical problems, the ECG was done only twice","Data Checks"
"77242113UCO3001_ANALYSIS","Czech Republic The","CZ10022","Hrabak, Petr","CZ100222003","39 Years","16945114","21-May-2026","Escalated","27-May-2026","8-14 Days","8","4","DCR","Site","Patient","(6) 27 May 2026 Botdorf, Paul-Daniel: We still do not have any ECGs for any patients at your site with a collection Date/Time of 20-May-2026 at 14:19:34, 14:20:32, 14:21:15. Please review the records in the portal and let us know if anything more is needed. If you see these ECGs, please double check that this is actually the study they are currently in(77242113UCO3001_ANALYSIS).Thank you",""
1 Protocol Country Site ID PI_NAME Subject Number Age Data Correction ID Creation Date UTC Status Date of Last Action UTC Total Open Period Total Open Time (Days) Current Status Time (Days) Type Next Action Required Category Query History Reason for Change
2 77242113UCO3001_ANALYSIS Czech Republic The CZ10001 Falc, Matej CZ100012001 48 Years 16923867 14-May-2026 Escalated 26-May-2026 8-14 Days 13 5 QUERY Site Patient (3) 15 May 2026 Clario: You can upload scans of your paper ECGs using the Site Upload Tool. ---- Instructions can be found in the "Reference Materials" tab of the study portal. Please contact Customer Care for assistance if needed! Data Checks
3 77242113UCO3001_ANALYSIS Czech Republic The CZ10001 Falc, Matej CZ100012001 48 Years 16567067 22-Jan-2026 Resolved 28-Jan-2026 4-7 Days 4 QUERY Patient MD Falc Data Checks
4 77242113UCO3001_ANALYSIS Czech Republic The CZ10009 Pumprla, Jiri CZ100092001 49 Years 16776685 31-Mar-2026 Resolved 13-May-2026 Over 28 Days 29 QUERY Patient (2) 13 May 2026 Clario: I confirm, that only ONE ECG was collected by mistake. Data Checks
5 77242113UCO3001_ANALYSIS Czech Republic The CZ10021 Bortlik, Martin CZ100212001 61 Years 16717619 11-Mar-2026 Resolved 28-Apr-2026 Over 28 Days 32 QUERY Patient (2) 28 Apr 2026 Clario: I confirmed that due to technical problems, the ECG was done only twice Data Checks
6 77242113UCO3001_ANALYSIS Czech Republic The CZ10022 Hrabak, Petr CZ100222003 39 Years 16945114 21-May-2026 Escalated 27-May-2026 8-14 Days 8 4 DCR Site Patient (6) 27 May 2026 Botdorf, Paul-Daniel: We still do not have any ECGs for any patients at your site with a collection Date/Time of 20-May-2026 at 14:19:34, 14:20:32, 14:21:15. Please review the records in the portal and let us know if anything more is needed. If you see these ECGs, please double check that this is actually the study they are currently in(77242113UCO3001_ANALYSIS).Thank you
@@ -0,0 +1,138 @@
# Report generator: feasibility/investigators -> Excel
# Projekt: 77242113UCO2001
# Output folder: u:\Dropbox\!!!Days\Downloads Z230
import os
import sys
from datetime import datetime
from pymongo import MongoClient
import openpyxl
from openpyxl.styles import Font, PatternFill, Alignment, Border, Side
from openpyxl.utils import get_column_letter
# --- MongoDB connection ---
# The URI may be overridden through the MONGO_URI environment variable.
MONGO_URI = os.environ.get("MONGO_URI", "mongodb://192.168.1.76:27017")
client = MongoClient(MONGO_URI)
db = client["feasibility"]
col = db["investigators"]

# --- Load all documents into memory ---
docs = list(col.find({}))
print(f"Načteno {len(docs)} záznamů.")

# --- Target folder and timestamped output file ---
OUTPUT_DIR = r"u:\Dropbox\!!!Days\Downloads Z230"
os.makedirs(OUTPUT_DIR, exist_ok=True)
datum = datetime.now().strftime("%Y%m%d_%H%M")
filename = f"77242113UCO2001_investigators_{datum}.xlsx"
filepath = os.path.join(OUTPUT_DIR, filename)

# --- Column definitions ---
# Fixed leading columns as (document key, header label); every other
# field found in the documents follows in alphabetical order.
FIXED_COLS = [
    ("prijmeni", "Příjmení"),
    ("jmeno", "Jméno"),
    ("email", "Email"),
    ("STATUS", "STATUS"),
    ("kriticka_poznamka", "Kritická poznámka"),
    ("zeme", "Země"),
    ("pracoviste", "Pracoviště"),
    ("internet_summary", "Internet summary"),
]

# Keys that are never exported (complex nested objects).
SKIP_KEYS = {"_id", "excel", "sites_illuminator", "maf", "zdroje", "studie", "Viper_Performance", "Viper_Contacts"}

# Remaining keys: anything present in at least one document that is
# neither a fixed column nor skipped.
fixed_keys = {key for key, _label in FIXED_COLS}
extra_keys = sorted(
    {k for doc in docs for k in doc if k not in fixed_keys and k not in SKIP_KEYS}
)
ALL_COLS = FIXED_COLS + [(k, k) for k in extra_keys]
# --- Row colours derived from STATUS ---
def status_color(status):
    """Return the ARGB fill colour for a STATUS text, or None when no rule matches.

    Matching is a case-insensitive substring search. Rules are tried in
    order, so "nezájem" is tested before "zájem", which it contains.
    """
    if not status:
        return None
    lowered = status.lower()
    rules = (
        (("nezájem", "nezajem", "nechceme"), "FFFFC7CE"),  # red
        (("zájem", "zajem"), "FFC6EFCE"),  # green
        (("nedoručen", "nedorucen"), "FFFFEB9C"),  # yellow
        (("email odeslán", "email odeslan"), "FFDCE6F1"),  # blue
    )
    for needles, colour in rules:
        if any(needle in lowered for needle in needles):
            return colour
    return None
# --- Build the workbook ---
wb = openpyxl.Workbook()
ws = wb.active
ws.title = "Investigators"

# Shared style objects
header_font = Font(bold=True, color="FFFFFFFF")
header_fill = PatternFill("solid", fgColor="FF1F4E79")
header_align = Alignment(horizontal="center", vertical="center", wrap_text=True)
cell_align = Alignment(vertical="top", wrap_text=True)
thin = Side(style="thin", color="FFB0B0B0")
border = Border(left=thin, right=thin, top=thin, bottom=thin)

# Header row
for col_idx, (key, label) in enumerate(ALL_COLS, 1):
    cell = ws.cell(row=1, column=col_idx, value=label)
    cell.font = header_font
    cell.fill = header_fill
    cell.alignment = header_align
    cell.border = border
ws.row_dimensions[1].height = 30

# Data rows (row 1 is the header, so documents start at row 2)
for row_idx, doc in enumerate(docs, 2):
    status_val = str(doc.get("STATUS", "") or "")
    bg = status_color(status_val)
    # The colour depends only on the row's STATUS, so build the fill once
    # per row instead of once per cell.
    row_fill = PatternFill("solid", fgColor=bg) if bg else None
    for col_idx, (key, label) in enumerate(ALL_COLS, 1):
        val = doc.get(key, "")
        # Every value is written as text: lists are comma-joined, None
        # becomes an empty string, everything else (dicts included) goes
        # through str().
        if isinstance(val, list):
            val = ", ".join(str(v) for v in val)
        elif val is None:
            val = ""
        else:
            val = str(val)
        cell = ws.cell(row=row_idx, column=col_idx, value=val)
        cell.alignment = cell_align
        cell.border = border
        if row_fill is not None:
            cell.fill = row_fill

# Column widths (columns without an explicit width get 20)
col_widths = {
    "prijmeni": 18, "jmeno": 15, "email": 35,
    "STATUS": 45, "kriticka_poznamka": 60,
    "zeme": 12, "pracoviste": 35, "internet_summary": 60,
}
for col_idx, (key, label) in enumerate(ALL_COLS, 1):
    w = col_widths.get(key, 20)
    ws.column_dimensions[get_column_letter(col_idx)].width = w

# Freeze the header row
ws.freeze_panes = "A2"

# Autofilter over the used range
ws.auto_filter.ref = ws.dimensions

# Save
wb.save(filepath)
print(f"Ulozeno: {filepath}")
@@ -0,0 +1,56 @@
# Test: najit posledni odeslany email na klucho@gastroenterolog.com,
# preposlat na vladimir.buzalka@buzalka.cz, predmet "Ahoj", prvni radek "Ahoj"
from playwright.sync_api import sync_playwright
with sync_playwright() as p:
    browser = p.chromium.launch(headless=False)
    try:
        # Reuse the saved Outlook session instead of logging in interactively.
        context = browser.new_context(storage_state="outlook_auth.json")
        page = context.new_page()

        # 1. Open Outlook
        page.goto("https://outlook.cloud.microsoft/mail/")
        page.wait_for_selector('[placeholder="Search or ask Copilot"]')

        # 2. Switch to Sent Items
        page.click('text=Sent Items')
        page.wait_for_url("**/sentitems")

        # 3. Search for e-mails sent to klucho@gastroenterolog.com
        search = page.locator('[placeholder="Search or ask Copilot"]')
        search.click()
        search.fill("to:klucho@gastroenterolog.com")
        search.press("Enter")
        page.wait_for_selector("text=All results")
        page.wait_for_timeout(1000)

        # 4. Open the first (most recent) result
        page.locator('[role="option"]').first.click()
        page.wait_for_selector('button:has-text("Forward"), [aria-label="Forward"]')

        # 5. Click Forward
        page.locator('button[aria-label="Forward"]').first.click()
        page.wait_for_selector('[aria-label="To"]', timeout=5000)

        # 6. Fill in the recipient
        page.locator('[aria-label="To"]').fill("vladimir.buzalka@buzalka.cz")
        page.keyboard.press("Tab")

        # 7. Replace the subject with "Ahoj".
        # Locator has no triple_click() method; fill() clears the field and
        # sets the new value in one step.
        subject = page.locator('[aria-label="Subject"]')
        subject.fill("Ahoj")

        # 8. Type "Ahoj" as the first line of the message body
        body = page.locator('[aria-label="Message body"]')
        body.click()
        page.keyboard.press("Control+Home")
        page.keyboard.type("Ahoj")
        page.keyboard.press("Enter")

        # 9. Send
        page.click('button[aria-label="Send"]')
        page.wait_for_timeout(2000)

        print("Email uspesne odeslan!")
    finally:
        # Close the browser even when a step above fails.
        browser.close()
+39
View File
@@ -0,0 +1,39 @@
"""
Jednorázový skript — vytvoří/aktualizuje tabulky v MySQL.
Spusť jednou: python create_iwrs_tables.py
"""
import os
import mysql.connector
import db_config
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
SQL_FILE = os.path.join(BASE_DIR, "create_iwrs_tables.sql")

# Read the whole script; the context manager closes the file handle.
with open(SQL_FILE, encoding="utf-8") as sql_file:
    sql = sql_file.read()

conn = mysql.connector.connect(
    host=db_config.DB_HOST,
    port=db_config.DB_PORT,
    user=db_config.DB_USER,
    password=db_config.DB_PASSWORD,
    database=db_config.DB_NAME,
)
try:
    cursor = conn.cursor()
    try:
        # Naive split on ";" -- the SQL file must not contain semicolons
        # inside comments or string literals.
        stmts = [s.strip() for s in sql.split(";")]
        for stmt in stmts:
            # Drop full-line "--" comments.
            lines = [line for line in stmt.splitlines() if not line.strip().startswith("--")]
            stmt = "\n".join(lines).strip()
            # Skip empty chunks and USE statements (the database is already
            # selected by the connection).
            if not stmt or stmt.upper().startswith("USE"):
                continue
            try:
                cursor.execute(stmt)
                print(f"OK: {stmt[:80]}")
            except mysql.connector.Error as e:
                # Best effort: report the failing statement and carry on.
                print(f"SKIP: {e}")
        conn.commit()
    finally:
        cursor.close()
finally:
    conn.close()

print("\nHotovo.")
+128
View File
@@ -0,0 +1,128 @@
-- IWRS tables for the "studie" database
-- Run once: mysql -h 192.168.1.76 -u root -p studie < create_iwrs_tables.sql
USE studie;
-- ── Import log ───────────────────────────────────────────────────────────────
-- Records each import with its study, timestamp (defaults to the insert time)
-- and source file. Other tables in this script reference it through import_id.
CREATE TABLE IF NOT EXISTS iwrs_import (
import_id INT AUTO_INCREMENT PRIMARY KEY,
study VARCHAR(20) NOT NULL,
imported_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
source_file VARCHAR(500) NOT NULL,
INDEX idx_study (study)
);
-- ── UCO3001 subject summary ───────────────────────────────────────────────────
-- Subject summary rows for study UCO3001. Every row is tied to one import
-- through the import_id foreign key and is indexed by import and by subject.
-- Flag-like fields are stored as short VARCHARs rather than booleans.
CREATE TABLE IF NOT EXISTS iwrs_uco3001_subject_summary (
id INT AUTO_INCREMENT PRIMARY KEY,
import_id INT NOT NULL,
subject VARCHAR(20) NOT NULL,
prior_subject_identifier VARCHAR(20),
site VARCHAR(50),
investigator VARCHAR(100),
location VARCHAR(50),
cohort_per_irt VARCHAR(100),
informed_consent_date DATE,
adolescent_assent_date DATE,
age SMALLINT,
weight DECIMAL(5,1),
rescreened_subject VARCHAR(10),
adt_ir VARCHAR(10),
three_or_more_advanced_therapies VARCHAR(10),
only_oral_5asa_compounds VARCHAR(10),
ustekinumab VARCHAR(10),
isolated_proctitis VARCHAR(10),
clinical_responder_status_i12_m0 VARCHAR(100),
irt_subject_status VARCHAR(50),
i0_rand_date_local DATE,
last_irt_transaction VARCHAR(100),
last_irt_transaction_date_local DATE,
last_irt_transaction_date_utc DATE,
next_irt_transaction VARCHAR(100),
next_irt_transaction_date_local DATE,
most_recent_med_assignment_date DATE,
days_since_last_med_assignment SMALLINT,
patient_forecast_status VARCHAR(50),
patient_forecast_status_changed_date DATE,
FOREIGN KEY (import_id) REFERENCES iwrs_import(import_id),
INDEX idx_import (import_id),
INDEX idx_subject (subject)
);
-- ── MDD3003 subject summary ───────────────────────────────────────────────────
-- Subject summary rows for study MDD3003. Every row is tied to one import
-- through the import_id foreign key and is indexed by import and by subject.
-- The date_* columns hold one date per subject milestone.
CREATE TABLE IF NOT EXISTS iwrs_mdd3003_subject_summary (
id INT AUTO_INCREMENT PRIMARY KEY,
import_id INT NOT NULL,
subject VARCHAR(20) NOT NULL,
prior_subject_identifier VARCHAR(20),
site VARCHAR(50),
investigator VARCHAR(100),
location VARCHAR(50),
cohort_per_irt VARCHAR(50),
madrs_criteria_integrated VARCHAR(50),
informed_consent_date DATE,
age SMALLINT,
madrs_criteria_v15 VARCHAR(10),
madrs_criteria_v16 VARCHAR(10),
madrs_criteria_v17 VARCHAR(10),
stratification_country VARCHAR(10),
age_group VARCHAR(20),
stable_remitters VARCHAR(50),
irt_subject_status VARCHAR(100),
last_irt_transaction VARCHAR(100),
last_irt_transaction_date_local DATE,
last_irt_transaction_date_utc DATE,
next_irt_transaction VARCHAR(100),
next_irt_transaction_date_local DATE,
date_screened DATE,
date_screen_failed DATE,
date_randomized_part1 DATE,
date_early_withdraw_randomized_part1 DATE,
date_open_label_induction DATE,
date_early_withdraw_open_label_induction DATE,
date_randomized_part2 DATE,
date_early_withdraw_randomized_part2 DATE,
date_completed DATE,
date_unblinded DATE,
FOREIGN KEY (import_id) REFERENCES iwrs_import(import_id),
INDEX idx_import (import_id),
INDEX idx_subject (subject)
);
-- ── Notifications ────────────────────────────────────────────────────────────
-- Notifications per study and subject. pk must be unique (uq_pk), so the same
-- notification cannot be stored twice. The PDF content is kept in the row as a
-- MEDIUMBLOB next to its text. This table has no import_id link and carries
-- its own source_file and imported_at columns instead.
CREATE TABLE IF NOT EXISTS iwrs_notifications (
id INT AUTO_INCREMENT PRIMARY KEY,
study VARCHAR(20) NOT NULL,
subject VARCHAR(20) NOT NULL,
pk INT NOT NULL,
title VARCHAR(100),
label VARCHAR(500),
event VARCHAR(50),
actual_date DATE,
text TEXT,
pdf MEDIUMBLOB,
source_file VARCHAR(500),
imported_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
UNIQUE KEY uq_pk (pk),
INDEX idx_study_subject (study, subject)
);
-- ── Subject visits / transactions (both studies) ─────────────────────────────
-- Visit and transaction rows shared by both studies, distinguished by the
-- study column. visit_type accepts only 'Past' or 'Upcoming'. Every row is
-- tied to one import through the import_id foreign key.
CREATE TABLE IF NOT EXISTS iwrs_subject_visits (
id INT AUTO_INCREMENT PRIMARY KEY,
import_id INT NOT NULL,
study VARCHAR(20) NOT NULL,
subject VARCHAR(20) NOT NULL,
visit_type ENUM('Past','Upcoming') NOT NULL,
scheduled_date DATE,
window_days VARCHAR(20),
actual_date DATE,
irt_transaction_no SMALLINT,
irt_transaction_description VARCHAR(200),
medication_assignment VARCHAR(200),
quantity_assigned SMALLINT,
medication_id VARCHAR(20),
FOREIGN KEY (import_id) REFERENCES iwrs_import(import_id),
INDEX idx_import (import_id),
INDEX idx_study_subject (study, subject)
);
@@ -0,0 +1,201 @@
from playwright.sync_api import sync_playwright
import os
import glob
import datetime
import requests
import pandas as pd
# ── CONFIG ──────────────────────────────────────────────────────────────────
BASE_URL = "https://janssen.4gclinical.com"
# NOTE(review): these credentials were committed in plain text. Rotate the
# password and supply IWRS_EMAIL / IWRS_PASSWORD through the environment; the
# literals below remain only as a backward-compatible fallback and should be
# removed once the environment variables are in place.
EMAIL = os.environ.get("IWRS_EMAIL", "vbuzalka@its.jnj.com")
PASSWORD = os.environ.get("IWRS_PASSWORD", "Vlado123++-+")
STUDIES = ["77242113UCO3001", "42847922MDD3003"]
# All working folders live next to this script.
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
INCOMING_DIR = os.path.join(BASE_DIR, "IncomingSourceReports")
DETAILS_DIR = os.path.join(BASE_DIR, "IncomingSourceReportsDetails")
# ────────────────────────────────────────────────────────────────────────────
def get_subjects(study):
    """Return the subject IDs listed in today's Subject Summary Report for *study*.

    The newest matching workbook (by modification time) in INCOMING_DIR is
    used; Excel lock files (``~$...``) are ignored. The header row is
    located by scanning for a cell whose stripped text is "Subject".

    Raises:
        FileNotFoundError: no report exists, or the newest one is not
            dated today (file name must start with YYYY-MM-DD).
        ValueError: no row containing a "Subject" cell was found.
    """
    # NOTE(review): the pattern only matches names ending in
    # "Subject Summary Report.xlsx"; a file saved with an extra suffix after
    # that text would be ignored -- confirm this is intended.
    pattern = os.path.join(INCOMING_DIR, f"* {study} Subject Summary Report.xlsx")
    files = sorted(
        [f for f in glob.glob(pattern) if not os.path.basename(f).startswith("~$")],
        key=os.path.getmtime,
        reverse=True,
    )
    if not files:
        raise FileNotFoundError(f"Nenalezen Subject Summary Report pro {study}")

    today = datetime.date.today().strftime("%Y-%m-%d")
    if not os.path.basename(files[0]).startswith(today):
        raise FileNotFoundError(
            f"Dnešní Subject Summary Report pro {study} neexistuje — spusť nejdříve download_subject_summary.py"
        )

    path = files[0]
    print(f" Čtu subjekty z: {os.path.basename(path)}")

    # First pass without a header to find the row that holds the column names.
    raw = pd.read_excel(path, header=None)
    header_row = None
    for i, row in raw.iterrows():
        if "Subject" in [str(v).strip() for v in row]:
            header_row = i
            break
    if header_row is None:
        raise ValueError("Hlavičkový řádek nenalezen")

    df = pd.read_excel(path, header=header_row)
    # The header search above strips whitespace, so strip the column names
    # too; otherwise a padded " Subject " header is found but then fails the
    # lookup below with KeyError.
    df.columns = [str(c).strip() for c in df.columns]
    subjects = df["Subject"].dropna().astype(str).str.strip().tolist()
    return subjects
def get_jwt_and_api_base(page, study):
    """Return ``(jwt, api_base_url)`` for *study* using the logged-in page.

    The JWT is read from the page's localStorage; the list of app instances
    is then fetched inside the page, and the first instance whose label
    contains *study* supplies the API base URL.

    Raises:
        ValueError: the token is missing, or no instance label contains *study*.
    """
    token = page.evaluate("localStorage.getItem('JWT.access')")
    if not token:
        raise ValueError("JWT token nenalezen v localStorage")

    instances = page.evaluate("""async (jwt) => {
const res = await fetch('/_/api/dispatch/app_instances/', {
headers: { 'Authorization': `Bearer ${jwt}` }
});
return res.json();
}""", token)

    # First instance whose label mentions the study wins.
    match = None
    for candidate in instances:
        if study in candidate.get("label", ""):
            match = candidate
            break
    if not match:
        raise ValueError(f"app_instance pro studii {study} nenalezena")
    return token, match["api_base_url"]
def get_notifications(jwt, api_base, study, subject, timeout=60):
    """Fetch the notification list for one subject through the report_data API.

    Args:
        jwt: Bearer token sent in the Authorization header.
        api_base: Study-specific API prefix appended to BASE_URL.
        study: Study identifier sent in the request payload.
        subject: Subject identifier used as the report ``id``.
        timeout: Seconds to wait for the server; without it a stalled
            connection would block the whole run indefinitely.

    Returns:
        A list of dicts with keys ``pk``, ``title`` and ``event``. Entries
        without a pk or a title are dropped.

    Raises:
        requests.HTTPError: the server answered with an error status.
        requests.Timeout: the server did not answer within *timeout*.
    """
    url = f"{BASE_URL}{api_base}/api/v1/reports_api/report_data"
    params = {
        "path": "patient_detail_report",
        "id": subject,
        "key": "table_1",
        "unblinded": "false",
    }
    payload = {
        "path": "patient_detail_report",
        "study": study,
        "id": subject,
        "key": "table_1",
        "fields": {},
        "filters": [{"tableId": "table_1", "tableFilters": {}}],
        "pagination_details": {"order": "type", "reverseOrder": False, "page": 1, "limit": 500},
        # Timestamped key so every call gets a fresh cache entry.
        "cache_key": f"py_{subject}_{datetime.datetime.now().timestamp()}",
    }
    headers = {
        "Authorization": f"Bearer {jwt}",
        "Content-Type": "application/json",
        "lang": "en",
    }
    resp = requests.post(url, params=params, json=payload, headers=headers, timeout=timeout)
    resp.raise_for_status()
    data = resp.json()

    notifications = []
    # "or" guards cover keys that are present but null in the JSON.
    for row in data.get("data") or []:
        for notif in row.get("notification") or []:
            item = notif.get("item") or {}
            pk = item.get("pk")
            title = item.get("et_title")
            if pk and title:
                notifications.append({"pk": pk, "title": title, "event": row.get("event_event_id", "")})
    return notifications
def download_pdf(jwt, api_base, pk, title, out_path, timeout=120):
    """Download one notification PDF and write it to *out_path*.

    Args:
        jwt: Bearer token sent in the Authorization header.
        api_base: Study-specific API prefix appended to BASE_URL.
        pk: Notification primary key passed to the endpoint.
        title: Notification title passed to the endpoint.
        out_path: Destination file; overwritten if it exists.
        timeout: Seconds to wait for the server; without it a stalled
            connection would block the whole run indefinitely.

    Raises:
        requests.HTTPError: the server answered with an error status
            (nothing is written in that case).
        requests.Timeout: the server did not answer within *timeout*.
    """
    url = f"{BASE_URL}{api_base}/api/v1/meta_api/pdfnotification"
    params = {"pk": pk, "title": title, "html": "true"}
    headers = {
        "Authorization": f"Bearer {jwt}",
        "lang": "en",
        "Accept": "*/*",
    }
    resp = requests.get(url, params=params, headers=headers, timeout=timeout)
    resp.raise_for_status()
    with open(out_path, "wb") as f:
        f.write(resp.content)
# Characters that cannot appear in a Windows file name (path separators
# included); each is replaced with "_".
_FILENAME_UNSAFE = str.maketrans({ch: "_" for ch in '<>:"/\\|?*'})


def _safe_filename_part(text):
    """Return *text* with path separators and reserved characters replaced by "_"."""
    return str(text).translate(_FILENAME_UNSAFE)


def run(page, study):
    """Download every notification PDF for every subject of *study*.

    Subjects come from today's Subject Summary Report; PDFs are written to
    ``DETAILS_DIR/<study>``. A file that already exists is skipped, and a
    failure for one subject is reported without stopping the others.
    """
    out_dir = os.path.join(DETAILS_DIR, study)
    os.makedirs(out_dir, exist_ok=True)

    subjects = get_subjects(study)
    print(f" Nalezeno {len(subjects)} subjektů")

    today = datetime.date.today().strftime("%Y-%m-%d")

    # Load the report page first so the session context (and the JWT in
    # localStorage) is valid.
    page.goto(f"{BASE_URL}/report/patient_detail_report")
    page.wait_for_load_state("networkidle", timeout=120000)

    jwt, api_base = get_jwt_and_api_base(page, study)
    print(f" API base: {api_base}")

    for subject in subjects:
        print(f" [{subject}] Stahuji notifikace...")
        try:
            notifications = get_notifications(jwt, api_base, study, subject)
            if not notifications:
                print(f" [{subject}] Žádné notifikace")
                continue

            for notif in notifications:
                pk = notif["pk"]
                title = notif["title"]
                # The title comes from the API; strip characters that would
                # break the path before using it in a file name.
                safe_title = _safe_filename_part(title)
                filename = os.path.join(
                    out_dir, f"{today} {study} {subject} Notification {safe_title} pk{pk}.pdf"
                )
                if os.path.exists(filename):
                    print(f" [{subject}] {title} (pk={pk}) — již existuje, přeskakuji")
                    continue
                download_pdf(jwt, api_base, pk, title, filename)
                print(f" [{subject}] {title} (pk={pk}) OK")
        except Exception as e:
            # Best effort per subject: report and move on to the next one.
            print(f" [{subject}] CHYBA při notifikacích: {e}")

    print(f" [{study}] Notifikace hotovo.")
def main():
    """Log in once per study in a fresh browser and download its notifications.

    A failure in one study (login included) is reported and does not stop
    the remaining studies; the browser is always closed.
    """
    os.makedirs(DETAILS_DIR, exist_ok=True)

    with sync_playwright() as p:
        for study in STUDIES:
            print(f"\n[{study}] Přihlášení...")
            browser = p.chromium.launch(headless=False)
            try:
                context = browser.new_context(accept_downloads=True)
                page = context.new_page()

                # Log in
                page.goto(BASE_URL)
                page.wait_for_load_state("networkidle")
                page.get_by_label("Email *").fill(EMAIL)
                page.get_by_label("Password *").fill(PASSWORD)
                page.locator("#login__submit").click()
                page.wait_for_load_state("networkidle")

                # Select the study
                page.get_by_label("Study *").click()
                page.get_by_role("option", name=study).click()
                page.get_by_role("button", name="SELECT").click()
                page.wait_for_load_state("networkidle")

                run(page, study)
            except Exception as e:
                print(f" [{study}] CHYBA: {e}")
            finally:
                # Close the browser even when login or the download fails.
                browser.close()

    print("\nVše hotovo.")


main()
@@ -0,0 +1,76 @@
from playwright.sync_api import sync_playwright
import os
import datetime
# ── CONFIG ──────────────────────────────────────────────────────────────────
BASE_URL = "https://janssen.4gclinical.com"
# SECURITY: credentials do not belong in version control. Set IWRS_EMAIL and
# IWRS_PASSWORD in the environment; the literals below are only a
# backward-compatible fallback and should be removed (and the password
# rotated) once the environment variables are in place.
EMAIL = os.environ.get("IWRS_EMAIL", "vbuzalka@its.jnj.com")
PASSWORD = os.environ.get("IWRS_PASSWORD", "Vlado123++-+")
STUDIES = ["77242113UCO3001", "42847922MDD3003"]
# Report folders live next to this script.
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
INCOMING_DIR = os.path.join(BASE_DIR, "IncomingSourceReports")
CREATED_DIR = os.path.join(BASE_DIR, "CreatedReports")
# ────────────────────────────────────────────────────────────────────────────
def unique_path(directory, stem):
    """Return a path for ``<stem>.xlsx`` in ``directory`` that does not exist yet.

    The plain name is preferred. If it is taken, an ``HHMM`` time tag is
    appended; if that is taken as well (second re-run within the same
    minute), a numeric suffix is added so an existing file is never
    overwritten.
    """
    path = os.path.join(directory, f"{stem}.xlsx")
    if not os.path.exists(path):
        return path
    time_tag = datetime.datetime.now().strftime("%H%M")
    candidate = os.path.join(directory, f"{stem} {time_tag}.xlsx")
    counter = 2
    while os.path.exists(candidate):
        candidate = os.path.join(directory, f"{stem} {time_tag} ({counter}).xlsx")
        counter += 1
    return candidate
def download_study(page, study, today):
    """Log in, select ``study`` and save its Subject Summary Report.

    Args:
        page: Fresh Playwright page.
        study: Study identifier as shown in the study picker.
        today: Date string used as the file-name prefix.

    Returns:
        Path of the saved ``.xlsx`` file inside ``INCOMING_DIR``.
    """
    idle = "networkidle"
    long_timeout = 120000

    print(f"\n[{study}] Prihlaseni...")
    page.goto(BASE_URL)
    page.wait_for_load_state(idle)
    for field_label, value in (("Email *", EMAIL), ("Password *", PASSWORD)):
        page.get_by_label(field_label).fill(value)
    page.locator("#login__submit").click()
    page.wait_for_load_state(idle)

    print(f"[{study}] Vyber studie...")
    page.get_by_label("Study *").click()
    page.get_by_role("option", name=study).click()
    page.get_by_role("button", name="SELECT").click()
    page.wait_for_load_state(idle)

    print(f"[{study}] Stahuji Subject Summary Report...")
    page.goto(f"{BASE_URL}/report/patient_summary_report")
    page.wait_for_load_state(idle, timeout=long_timeout)
    target = unique_path(INCOMING_DIR, f"{today} {study} Subject Summary Report")
    with page.expect_download(timeout=long_timeout) as pending:
        page.get_by_role("button", name="Download XLS").click()
    # The download object is only available once the context has exited.
    pending.value.save_as(target)
    print(f"[{study}] OK -> {target}")
    return target
def main():
    """Download the Subject Summary Report of every study in ``STUDIES``.

    A separate browser is started per study and closed again even when the
    download raises; the exception itself still propagates. At the end the
    saved paths are listed.
    """
    today = datetime.date.today().strftime("%Y-%m-%d")
    os.makedirs(INCOMING_DIR, exist_ok=True)
    os.makedirs(CREATED_DIR, exist_ok=True)
    downloaded = []
    with sync_playwright() as p:
        for study in STUDIES:
            browser = p.chromium.launch(headless=False)
            try:
                context = browser.new_context(accept_downloads=True)
                page = context.new_page()
                filename = download_study(page, study, today)
                downloaded.append((study, filename))
            finally:
                browser.close()
    print("\nVse stazeno:")
    for study, path in downloaded:
        print(f" {study}: {path}")


# Guarded so that importing the module does not launch a browser session.
if __name__ == "__main__":
    main()
+453
View File
@@ -0,0 +1,453 @@
"""
Importuje data z IWRS Excel reportů do MySQL (databáze studie).
Pořadí spuštění:
1. download_subject_summary.py
2. download_subject_details.py
3. tento skript
Každé spuštění vytvoří nový import_id v iwrs_import.
Reportovací skripty pracují vždy s MAX(import_id) pro danou studii.
"""
import os
import glob
import datetime
import re
import numpy as np
import pandas as pd
import mysql.connector
import db_config
# Studies handled by this importer.
STUDIES = ["77242113UCO3001", "42847922MDD3003"]

# Input folders are resolved relative to the location of this script.
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
DETAILS_DIR = os.path.join(BASE_DIR, "IncomingSourceReportsDetails")
INCOMING_DIR = os.path.join(BASE_DIR, "IncomingSourceReports")
# ── helpers ──────────────────────────────────────────────────────────────────
def get_conn():
    """Open and return a new MySQL connection configured from ``db_config``."""
    settings = {
        "host": db_config.DB_HOST,
        "port": db_config.DB_PORT,
        "user": db_config.DB_USER,
        "password": db_config.DB_PASSWORD,
        "database": db_config.DB_NAME,
    }
    return mysql.connector.connect(**settings)
def _py(val):
    """Return ``val`` as a native Python object if it is a NumPy scalar."""
    return val.item() if isinstance(val, np.generic) else val


def to_date(val):
    """Convert a Timestamp / datetime / date / string to ``datetime.date``.

    Missing values (``None``, NaN, NaT, empty or "nan"/"nat"/"none" text) and
    strings in an unrecognized format yield ``None``.
    """
    val = _py(val)
    try:
        if pd.isna(val):
            return None
    except (TypeError, ValueError):
        # pd.isna on a non-scalar has no single truth value; fall through
        # and let the string parsing below decide.
        pass
    # pd.Timestamp is a datetime subclass, so this branch covers it too.
    if isinstance(val, datetime.datetime):
        return val.date()
    if isinstance(val, datetime.date):
        return val
    s = str(val).strip()
    if s.lower() in ("nat", "nan", "none", ""):
        return None
    for fmt in ("%Y-%m-%d", "%d-%b-%Y", "%d-%m-%Y", "%Y-%m-%d %H:%M:%S"):
        try:
            return datetime.datetime.strptime(s, fmt).date()
        except ValueError:
            continue
    return None


def to_int(val):
    """Convert ``val`` to ``int`` (truncating floats); ``None`` if impossible.

    Integers are returned exactly instead of going through ``float``. NaN and
    +/-infinity yield ``None`` rather than raising.
    """
    val = _py(val)
    if isinstance(val, int):
        return int(val)
    try:
        v = float(val)
        return None if (v != v) else int(v)  # v != v is True only for NaN
    except (TypeError, ValueError, OverflowError):
        # OverflowError: int(inf) cannot be represented.
        return None


def to_float(val):
    """Convert ``val`` to ``float``; ``None`` for NaN or unconvertible input."""
    val = _py(val)
    try:
        v = float(val)
        return None if (v != v) else float(v)
    except (TypeError, ValueError, OverflowError):
        # OverflowError: an int too large for a float.
        return None


def to_str(val):
    """Return ``val`` as stripped text; ``None`` for missing-value markers."""
    val = _py(val)
    if val is None:
        return None
    if isinstance(val, float) and (val != val):  # NaN
        return None
    s = str(val).strip()
    return None if s.lower() in ("nan", "nat", "none", "") else s
def find_summary_file(study):
    """Return the most recently modified Subject Summary Report for ``study``.

    The glob also matches time-tagged re-downloads
    (``... Subject Summary Report HHMM.xlsx``), so the newest file of the day
    is used. Excel lock files (``~$...``) are ignored. A warning is printed
    when the newest report does not carry today's date prefix.

    Raises:
        FileNotFoundError: No report for ``study`` exists in ``INCOMING_DIR``.
    """
    today = datetime.date.today().strftime("%Y-%m-%d")
    pattern = os.path.join(INCOMING_DIR, f"* {study} Subject Summary Report*.xlsx")
    candidates = [
        f for f in glob.glob(pattern)
        if not os.path.basename(f).startswith("~$")
    ]
    if not candidates:
        raise FileNotFoundError(f"Nenalezen Subject Summary Report pro {study}")
    newest = max(candidates, key=os.path.getmtime)
    newest_name = os.path.basename(newest)
    if not newest_name.startswith(today):
        print(f" UPOZORNĚNÍ: nejnovější Summary Report pro {study} není z dnešního dne ({newest_name[:10]})")
    return newest
def read_summary_df(path):
    """Read a Summary xlsx and return the table below its header row.

    The header row is the first row containing a cell equal to "Subject"
    (after stripping whitespace).

    Raises:
        ValueError: No such row exists in the workbook.
    """
    preview = pd.read_excel(path, header=None)
    header_row = next(
        (
            idx
            for idx, cells in preview.iterrows()
            if "Subject" in [str(cell).strip() for cell in cells]
        ),
        None,
    )
    if header_row is None:
        raise ValueError(f"Hlavičkový řádek nenalezen v {path}")
    # Second pass so pandas infers dtypes from the data rows only.
    return pd.read_excel(path, header=header_row)
def find_detail_files(study):
    """Return the sorted Subject Detail files matching the newest summary's date.

    The date is taken from the first ten characters of the summary report's
    file name; Excel lock files (``~$...``) are skipped.
    """
    study_dir = os.path.join(DETAILS_DIR, study)
    # Use the files from the same day as the newest Summary Report.
    summary_name = os.path.basename(find_summary_file(study))
    file_date = summary_name[:10]  # "YYYY-MM-DD"
    matches = glob.glob(
        os.path.join(study_dir, f"{file_date} {study} * Subject Detail.xlsx")
    )
    kept = [m for m in matches if not os.path.basename(m).startswith("~$")]
    kept.sort()
    return kept
def parse_detail_visits(path):
    """Return the visit rows of a Subject Detail xlsx as a list of dicts.

    Rows are read from sheet "patient_detail_report", starting below the row
    that contains a "Visit Type" cell. Only rows whose first column is "Past"
    or "Upcoming" are kept; columns are read by position. An empty list is
    returned when no header row is found.
    """
    sheet = pd.read_excel(path, sheet_name="patient_detail_report", header=None)
    header_row = next(
        (
            idx
            for idx, cells in sheet.iterrows()
            if "Visit Type" in [str(cell).strip() for cell in cells]
        ),
        None,
    )
    if header_row is None:
        return []

    table = sheet.iloc[header_row + 1:].copy()
    table.columns = range(table.shape[1])

    # (output key, column position, converter) for everything after visit_type.
    layout = (
        ("scheduled_date", 1, to_date),
        ("window_days", 2, to_str),
        ("actual_date", 3, to_date),
        ("irt_transaction_no", 4, to_int),
        ("irt_transaction_description", 5, to_str),
        ("medication_assignment", 6, to_str),
        ("quantity_assigned", 7, to_int),
        ("medication_id", 8, to_str),
    )
    visits = []
    for _, record in table.iterrows():
        visit_type = to_str(record.get(0))
        if visit_type in ("Past", "Upcoming"):
            visit = {"visit_type": visit_type}
            for key, position, convert in layout:
                visit[key] = convert(record.get(position))
            visits.append(visit)
    return visits
# ── insert helpers ────────────────────────────────────────────────────────────
def insert_import(cursor, study, source_file):
    """Insert an ``iwrs_import`` row for this run and return its new id.

    Only the base name of ``source_file`` is stored; the timestamp is the
    current local time.
    """
    statement = "INSERT INTO iwrs_import (study, imported_at, source_file) VALUES (%s, %s, %s)"
    imported_at = datetime.datetime.now()
    file_name = os.path.basename(source_file)
    cursor.execute(statement, (study, imported_at, file_name))
    return cursor.lastrowid
def insert_uco3001_summary(cursor, import_id, df):
    """Insert every row of the UCO3001 summary DataFrame.

    Each Excel column is converted with the listed converter and written in
    the order of the INSERT column list. Columns marked optional yield
    ``None`` when the report does not contain them; a missing required
    column raises ``KeyError``.

    Args:
        cursor: Open DB cursor.
        import_id: Id of the ``iwrs_import`` row this data belongs to.
        df: Summary DataFrame with the report's column headers.
    """
    sql = """
        INSERT INTO iwrs_uco3001_subject_summary (
            import_id, subject, prior_subject_identifier, site, investigator, location,
            cohort_per_irt, informed_consent_date, adolescent_assent_date, age, weight,
            rescreened_subject, adt_ir, three_or_more_advanced_therapies,
            only_oral_5asa_compounds, ustekinumab, isolated_proctitis,
            clinical_responder_status_i12_m0, irt_subject_status,
            i0_rand_date_local, last_irt_transaction,
            last_irt_transaction_date_local, last_irt_transaction_date_utc,
            next_irt_transaction, next_irt_transaction_date_local,
            most_recent_med_assignment_date, days_since_last_med_assignment,
            patient_forecast_status, patient_forecast_status_changed_date
        ) VALUES (
            %s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s
        )
    """
    required, optional = True, False
    # (Excel column, converter, required) — same order as the SQL column list
    # after import_id.
    fields = (
        ("Subject", to_str, required),
        ("Prior Subject Identifier", to_str, optional),
        ("Site", to_str, required),
        ("Investigator", to_str, required),
        ("Location", to_str, required),
        ("Cohort per IRT", to_str, required),
        ("Informed Consent Date", to_date, required),
        ("Adolescent Assent Date", to_date, optional),
        ("Subject's age collection", to_int, required),
        ("Subject's weight collection", to_float, optional),
        ("Rescreened Subject", to_str, optional),
        ("ADT-IR", to_str, optional),
        ("3 or More Advanced Therapies", to_str, optional),
        ("Only Oral 5-ASA Compounds", to_str, optional),
        ("Ustekinumab", to_str, optional),
        ("Isolated Proctitis", to_str, optional),
        ("Clinical Responder Status at I-12 / M-0", to_str, optional),
        ("IRT Subject Status", to_str, required),
        ("I0_RAND_TIMESTAMP_LOCAL [Local]", to_date, optional),
        ("Last Recorded IRT Transaction", to_str, required),
        ("Last Recorded IRT Transaction Date [Local]", to_date, required),
        ("Last Recorded IRT Transaction Date (UTC)", to_date, required),
        ("Next Expected IRT Transaction", to_str, required),
        ("Next Expected IRT Transaction Date [Local]", to_date, required),
        ("Most Recent Medication Assignment Transaction [Local]", to_date, optional),
        ("Days Since Last Medication Assignment Transaction", to_int, optional),
        ("Patient Forecast Status", to_str, optional),
        ("Patient Forecast Status Changed Date (UTC)", to_date, optional),
    )
    present = set(df.columns)
    for _, r in df.iterrows():
        values = [import_id]
        for column, convert, is_required in fields:
            if is_required or column in present:
                values.append(convert(r[column]))
            else:
                values.append(None)
        cursor.execute(sql, tuple(values))
def insert_mdd3003_summary(cursor, import_id, df):
    """Insert every row of the MDD3003 summary DataFrame.

    Values are converted column by column in the order of the INSERT column
    list. Optional columns yield ``None`` when absent from the report; a
    missing mandatory column raises ``KeyError``.
    """
    sql = """
        INSERT INTO iwrs_mdd3003_subject_summary (
            import_id, subject, prior_subject_identifier, site, investigator, location,
            cohort_per_irt, madrs_criteria_integrated, informed_consent_date, age,
            madrs_criteria_v15, madrs_criteria_v16, madrs_criteria_v17,
            stratification_country, age_group, stable_remitters, irt_subject_status,
            last_irt_transaction, last_irt_transaction_date_local,
            last_irt_transaction_date_utc, next_irt_transaction,
            next_irt_transaction_date_local, date_screened, date_screen_failed,
            date_randomized_part1, date_early_withdraw_randomized_part1,
            date_open_label_induction, date_early_withdraw_open_label_induction,
            date_randomized_part2, date_early_withdraw_randomized_part2,
            date_completed, date_unblinded
        ) VALUES (
            %s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s
        )
    """
    # (Excel column, converter, mandatory) in SQL column order after import_id.
    spec = [
        ("Subject", to_str, True),
        ("Prior Subject Identifier", to_str, False),
        ("Site", to_str, True),
        ("Investigator", to_str, True),
        ("Location", to_str, True),
        ("Cohort per IRT", to_str, True),
        ("MADRS response criteria integrated or manually entered", to_str, False),
        ("Informed Consent Date", to_date, True),
        ("Subject's age collection", to_int, True),
        ("MADRS response criteria v1.5 from RAVE", to_str, False),
        ("MADRS response criteria v1.6 from RAVE", to_str, False),
        ("MADRS response criteria v1.7 from RAVE", to_str, False),
        ("Stratification Country", to_str, False),
        ("Age Group", to_str, False),
        ("Stable Remitters vs. Non Stable Remitters", to_str, False),
        ("IRT Subject Status", to_str, True),
        ("Last Recorded IRT Transaction", to_str, True),
        ("Last Recorded IRT Transaction Date [Local]", to_date, True),
        ("Last Recorded IRT Transaction Date (UTC)", to_date, True),
        ("Next Expected IRT Transaction", to_str, True),
        ("Next Expected IRT Transaction Date [Local]", to_date, True),
        ("Date Screened [Local]", to_date, False),
        ("Date Screen Failed [Local]", to_date, False),
        ("Date Randomized Part 1 [Local]", to_date, False),
        ("Date Early Withdraw Randomized Part 1 [Local]", to_date, False),
        ("Date Open Label Induction [Local]", to_date, False),
        ("Date Early Withdraw Open Label Induction [Local]", to_date, False),
        ("Date Randomized Part 2 [Local]", to_date, False),
        ("Date Early Withdraw Randomized Part 2 [Local]", to_date, False),
        ("Date Completed [Local]", to_date, False),
        ("Date Unblinded [Local]", to_date, False),
    ]
    headers = df.columns.tolist()
    for _, row in df.iterrows():
        params = [import_id]
        for header, convert, mandatory in spec:
            if mandatory or header in headers:
                params.append(convert(row[header]))
            else:
                params.append(None)
        cursor.execute(sql, tuple(params))
def insert_visits(cursor, import_id, study, subject, visits):
    """Insert one ``iwrs_subject_visits`` row per entry of ``visits``.

    Args:
        cursor: Open DB cursor.
        import_id: Id of the owning ``iwrs_import`` row.
        study: Study identifier.
        subject: Subject identifier.
        visits: Dicts carrying the visit keys listed below; an empty or
            missing list is a no-op.
    """
    if not visits:
        return
    sql = """
        INSERT INTO iwrs_subject_visits (
            import_id, study, subject, visit_type, scheduled_date, window_days,
            actual_date, irt_transaction_no, irt_transaction_description,
            medication_assignment, quantity_assigned, medication_id
        ) VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)
    """
    # Visit keys in the order of the SQL column list.
    visit_keys = (
        "visit_type",
        "scheduled_date",
        "window_days",
        "actual_date",
        "irt_transaction_no",
        "irt_transaction_description",
        "medication_assignment",
        "quantity_assigned",
        "medication_id",
    )
    prefix = (import_id, study, subject)
    for visit in visits:
        cursor.execute(sql, prefix + tuple(visit[key] for key in visit_keys))
# ── notifications ─────────────────────────────────────────────────────────────
def find_notification_json_files(study):
    """Return the sorted ``.json`` notification files in the study's details folder."""
    pattern = os.path.join(DETAILS_DIR, study, "*.json")
    matches = glob.glob(pattern)
    matches.sort()
    return matches
def import_notifications(conn, study):
    """Upsert the notification JSON/PDF pairs of ``study`` into MySQL.

    Each ``.json`` file (plus the ``.pdf`` with the same stem, if present) is
    written to ``iwrs_notifications``, committed, and only then moved to the
    "Zpracováno" sub-folder — so a file is never archived without its row
    being persisted. A failure on one file is reported and the file stays in
    place for the next run.

    Args:
        conn: Open MySQL connection.
        study: Study identifier; also the fallback for a missing "study" key.

    Returns:
        Number of notifications stored.
    """
    import json as json_lib
    import shutil

    json_files = find_notification_json_files(study)
    if not json_files:
        print(f" Žádné notifikace k importu pro {study}")
        return 0
    sql = """
        INSERT INTO iwrs_notifications
        (study, subject, pk, title, label, event, actual_date, text, pdf, source_file)
        VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
        ON DUPLICATE KEY UPDATE
        label = VALUES(label),
        text = VALUES(text),
        pdf = VALUES(pdf),
        source_file = VALUES(source_file)
    """
    done_dir = os.path.join(DETAILS_DIR, study, "Zpracováno")
    os.makedirs(done_dir, exist_ok=True)
    cursor = conn.cursor()
    count = 0
    try:
        for json_path in json_files:
            try:
                with open(json_path, "r", encoding="utf-8") as f:
                    meta = json_lib.load(f)
                # splitext only touches the extension; str.replace would also
                # rewrite ".json" inside a directory name and miss ".JSON".
                pdf_path = os.path.splitext(json_path)[0] + ".pdf"
                pdf_data = None
                if os.path.exists(pdf_path):
                    with open(pdf_path, "rb") as f:
                        pdf_data = f.read()
                cursor.execute(sql, (
                    meta.get("study", study),
                    meta.get("subject"),
                    meta.get("pk"),
                    meta.get("title"),
                    meta.get("label"),
                    meta.get("event"),
                    to_date(meta.get("actual_date")),
                    meta.get("text"),
                    pdf_data,
                    os.path.basename(json_path),
                ))
                # Persist before archiving the source files.
                conn.commit()
                count += 1
                # Move to "Zpracováno".
                shutil.move(json_path, os.path.join(done_dir, os.path.basename(json_path)))
                if os.path.exists(pdf_path):
                    shutil.move(pdf_path, os.path.join(done_dir, os.path.basename(pdf_path)))
            except Exception as e:
                print(f" CHYBA při importu {os.path.basename(json_path)}: {e}")
    finally:
        cursor.close()
    print(f" Notifikací uloženo/přesunuto: {count}")
    return count
# ── main ──────────────────────────────────────────────────────────────────────
def import_study(conn, study):
    """Import the newest summary and its detail files for ``study``.

    Creates a new ``iwrs_import`` row, inserts the summary rows (UCO3001 or
    MDD3003 layout) and the visits of every detail file, then commits. On any
    error the transaction is rolled back, so a later commit on the same
    connection cannot persist a half-written import; the exception is
    re-raised.

    Returns:
        The new ``import_id``.
    """
    summary_path = find_summary_file(study)
    print(f" Summary: {os.path.basename(summary_path)}")
    df_summary = read_summary_df(summary_path)
    df_summary = df_summary.dropna(how="all")
    detail_files = find_detail_files(study)
    print(f" Detail souborů: {len(detail_files)}")
    # File name: "2026-05-04 77242113UCO3001 CZ100012001 Subject Detail.xlsx"
    name_pattern = re.compile(r"\d{4}-\d{2}-\d{2} \S+ (\S+) Subject Detail\.xlsx")
    cursor = conn.cursor()
    try:
        import_id = insert_import(cursor, study, summary_path)
        print(f" import_id = {import_id}")
        if study == "77242113UCO3001":
            insert_uco3001_summary(cursor, import_id, df_summary)
        else:
            insert_mdd3003_summary(cursor, import_id, df_summary)
        print(f" Summary řádků: {len(df_summary)}")
        visited = 0
        for path in detail_files:
            m = name_pattern.search(os.path.basename(path))
            subject = m.group(1) if m else "UNKNOWN"
            visits = parse_detail_visits(path)
            insert_visits(cursor, import_id, study, subject, visits)
            visited += len(visits)
        conn.commit()
    except Exception:
        # Discard the partial import instead of leaving it pending.
        conn.rollback()
        raise
    finally:
        cursor.close()
    print(f" Transakce uloženo: {visited}")
    return import_id
def main():
    """Import summary, visits and notifications for every study.

    Failures are reported per study and per step and do not stop the
    remaining work; the connection is always closed.
    """
    conn = get_conn()
    print("Připojeno k MySQL.\n")
    try:
        for study in STUDIES:
            print(f"[{study}]")
            try:
                import_id = import_study(conn, study)
                print(f" OK — import_id {import_id}")
            except Exception as e:
                print(f" CHYBA: {e}")
            try:
                import_notifications(conn, study)
            except Exception as e:
                print(f" CHYBA notifikace: {e}")
            print()
    finally:
        conn.close()
    print("Hotovo.")


# Guarded so that importing this module does not start a DB import.
if __name__ == "__main__":
    main()
+175
View File
@@ -0,0 +1,175 @@
"""
Kompletní pipeline:
1. Stažení Subject Summary Reportů (obě studie)
2. Stažení Subject Detail Reportů + notifikací (obě studie)
3. Import do MongoDB (subject_summary + visits + notifications)
Spusť tento skript místo samostatných skriptů.
"""
import os
import sys
import datetime
import glob
from playwright.sync_api import sync_playwright
import download_subject_details as dsd
import import_to_mongo
import import_notifications_to_mongo
# ── CONFIG ───────────────────────────────────────────────────────────────────
BASE_URL = "https://janssen.4gclinical.com"
# SECURITY: credentials do not belong in version control. Set IWRS_EMAIL and
# IWRS_PASSWORD in the environment; the literals below are only a
# backward-compatible fallback and should be removed (and the password
# rotated) once the environment variables are in place.
EMAIL = os.environ.get("IWRS_EMAIL", "vbuzalka@its.jnj.com")
PASSWORD = os.environ.get("IWRS_PASSWORD", "Vlado123++-+")
STUDIES = ["77242113UCO3001", "42847922MDD3003"]
# Download folders live next to this script.
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
INCOMING_DIR = os.path.join(BASE_DIR, "IncomingSourceReports")
DETAILS_DIR = os.path.join(BASE_DIR, "IncomingSourceReportsDetails")
# ── helpers ───────────────────────────────────────────────────────────────────
def unique_path(directory, stem):
    """Return a path for ``<stem>.xlsx`` in ``directory`` that does not exist yet.

    The plain name is preferred. If it is taken, an ``HHMM`` time tag is
    appended; if that is taken as well (second re-run within the same
    minute), a numeric suffix is added so an existing file is never
    overwritten.
    """
    path = os.path.join(directory, f"{stem}.xlsx")
    if not os.path.exists(path):
        return path
    time_tag = datetime.datetime.now().strftime("%H%M")
    candidate = os.path.join(directory, f"{stem} {time_tag}.xlsx")
    counter = 2
    while os.path.exists(candidate):
        candidate = os.path.join(directory, f"{stem} {time_tag} ({counter}).xlsx")
        counter += 1
    return candidate
def login(page, study):
    """Sign in at ``BASE_URL`` and select ``study`` in the study picker.

    After each navigation step the function waits for the network to go idle.
    """
    def settle():
        page.wait_for_load_state("networkidle")

    page.goto(BASE_URL)
    settle()

    # Credentials form.
    email_box = page.get_by_label("Email *")
    email_box.fill(EMAIL)
    password_box = page.get_by_label("Password *")
    password_box.fill(PASSWORD)
    page.locator("#login__submit").click()
    settle()

    # Study picker.
    page.get_by_label("Study *").click()
    page.get_by_role("option", name=study).click()
    page.get_by_role("button", name="SELECT").click()
    settle()
# ── KROK 1: Subject Summary ───────────────────────────────────────────────────
def download_summary(page, study, today):
    """Download the Subject Summary Report of the selected study.

    Args:
        page: Logged-in Playwright page.
        study: Study identifier used in the file name and log output.
        today: Date string used as the file-name prefix.

    Returns:
        Path of the saved ``.xlsx`` file inside ``INCOMING_DIR``.
    """
    long_timeout = 120000
    print(f" [{study}] Stahuji Subject Summary Report...")
    report_url = f"{BASE_URL}/report/patient_summary_report"
    page.goto(report_url)
    page.wait_for_load_state("networkidle", timeout=long_timeout)
    target = unique_path(INCOMING_DIR, f"{today} {study} Subject Summary Report")
    with page.expect_download(timeout=long_timeout) as pending:
        page.get_by_role("button", name="Download XLS").click()
    # The download object is only available once the context has exited.
    pending.value.save_as(target)
    print(f" [{study}] Summary OK -> {os.path.basename(target)}")
    return target
# ── KROK 2: Subject Details ───────────────────────────────────────────────────
def get_subjects_from_summary(summary_path):
    """Return the stripped, non-empty "Subject" values of a Summary xlsx.

    The header row is the first row containing a cell equal to "Subject".

    Raises:
        ValueError: No such row exists in the workbook.
    """
    import pandas as pd

    preview = pd.read_excel(summary_path, header=None)
    header_row = next(
        (
            idx
            for idx, cells in preview.iterrows()
            if "Subject" in [str(cell).strip() for cell in cells]
        ),
        None,
    )
    if header_row is None:
        raise ValueError("Hlavičkový řádek nenalezen")
    table = pd.read_excel(summary_path, header=header_row)
    subjects = table["Subject"].dropna()
    subjects = subjects.astype(str).str.strip()
    return subjects.tolist()
def download_details(page, study, summary_path, today):
    """Download one Subject Detail xlsx per subject listed in the summary.

    For each subject the search box is filled, the first suggestion is
    chosen, the XLS export is saved to
    ``DETAILS_DIR/<study>/<today> <study> <subject> Subject Detail.xlsx``
    and the selection is cleared again.
    """
    long_timeout = 120000
    study_dir = os.path.join(DETAILS_DIR, study)
    os.makedirs(study_dir, exist_ok=True)
    subjects = get_subjects_from_summary(summary_path)
    print(f" [{study}] Subjektů k stažení: {len(subjects)}")

    page.goto(f"{BASE_URL}/report/patient_detail_report")
    page.wait_for_load_state("networkidle", timeout=long_timeout)

    for subject in subjects:
        target = os.path.join(study_dir, f"{today} {study} {subject} Subject Detail.xlsx")

        # Pick the subject from the autocomplete.
        search_box = page.locator('input[placeholder="search"], input[type="text"]').first
        search_box.click()
        search_box.fill(subject)
        page.wait_for_timeout(500)
        page.locator("mat-option").first.dispatch_event("click")
        page.wait_for_load_state("networkidle", timeout=long_timeout)

        # Export and save.
        with page.expect_download(timeout=long_timeout) as pending:
            page.get_by_role("button", name="Download XLS").click()
        pending.value.save_as(target)
        print(f" [{study}] Detail {subject} OK")

        # Reset the form for the next subject.
        page.get_by_role("button", name="Clear").click()
        page.wait_for_load_state("networkidle", timeout=long_timeout)
# ── KROK 3: Import do MongoDB ────────────────────────────────────────────────
def main():
    """Run the whole pipeline: download reports, then import them.

    Steps 1 and 2 use a separate browser per study (separate session). A
    study whose download fails is recorded as ``None`` and skipped in step 3.
    Notifications are imported for all studies at the end.
    """
    today = datetime.date.today().strftime("%Y-%m-%d")
    os.makedirs(INCOMING_DIR, exist_ok=True)
    os.makedirs(DETAILS_DIR, exist_ok=True)
    summary_paths = {}
    banner = "=" * 60
    # Steps 1 + 2: downloads (Playwright, one browser per study).
    with sync_playwright() as p:
        for study in STUDIES:
            print("\n" + banner)
            print(f"[{study}] KROK 1: Subject Summary Report")
            print(banner)
            browser = p.chromium.launch(headless=False)
            try:
                # Context/page creation is inside the try so the browser is
                # closed even when it fails.
                context = browser.new_context(accept_downloads=True)
                page = context.new_page()
                login(page, study)
                summary_path = download_summary(page, study, today)
                summary_paths[study] = summary_path
                print(f"\n[{study}] KROK 2: Subject Detail Reports + notifikace")
                dsd.run(page, study)
            except Exception as e:
                print(f" [{study}] CHYBA při stahování: {e}")
                summary_paths[study] = None
            finally:
                browser.close()
    # Step 3: import into MongoDB.
    print("\n" + banner)
    print("KROK 3: Import do MongoDB")
    print(banner)
    for study in STUDIES:
        summary_path = summary_paths.get(study)
        if not summary_path:
            print(f" [{study}] PŘESKOČENO — stahování selhalo")
            continue
        try:
            import_to_mongo.run(study, summary_path, DETAILS_DIR, today)
        except Exception as e:
            print(f" [{study}] CHYBA při importu summary/visits: {e}")
    # Notifications: PDF/JSON from disk straight into Mongo iwrs_notifications.
    print("\n [notifikace] import PDF/JSON do Mongo...")
    try:
        import_notifications_to_mongo.main(STUDIES)
    except Exception as e:
        print(f" CHYBA při importu notifikací: {e}")
    print("\n" + banner)
    print("Vše hotovo.")
    print(banner)


# Guarded so that importing this module does not start the pipeline.
if __name__ == "__main__":
    main()
+172
View File
@@ -0,0 +1,172 @@
from playwright.sync_api import sync_playwright
import re
import os
import datetime
import mysql.connector
import db_config
def get_existing_pks(study):
    """Return the set of notification pks already stored for ``study``.

    Best effort: when the database cannot be queried, a warning is printed
    and an empty set is returned, so the caller downloads everything. The
    cursor and connection are closed even when the query fails.
    """
    try:
        conn = mysql.connector.connect(
            host=db_config.DB_HOST, port=db_config.DB_PORT,
            user=db_config.DB_USER, password=db_config.DB_PASSWORD,
            database=db_config.DB_NAME,
        )
        try:
            cursor = conn.cursor()
            try:
                cursor.execute("SELECT pk FROM iwrs_notifications WHERE study = %s", (study,))
                pks = {row[0] for row in cursor.fetchall()}
            finally:
                cursor.close()
        finally:
            conn.close()
        return pks
    except Exception as e:
        print(f" UPOZORNĚNÍ: nelze načíst existující pk z DB ({e}), stahuji vše")
        return set()
BASE_URL = "https://janssen.4gclinical.com"
# SECURITY: credentials do not belong in version control. Set IWRS_EMAIL and
# IWRS_PASSWORD in the environment; the literals below are only a
# backward-compatible fallback and should be removed (and the password
# rotated) once the environment variables are in place.
EMAIL = os.environ.get("IWRS_EMAIL", "vbuzalka@its.jnj.com")
PASSWORD = os.environ.get("IWRS_PASSWORD", "Vlado123++-+")
# Study and subject whose notifications this script downloads.
STUDY = "77242113UCO3001"
SUBJECT = "CZ100222003"
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
DETAILS_DIR = os.path.join(BASE_DIR, "IncomingSourceReportsDetails")
def strip_html(html):
    """Return ``html`` as plain text.

    ``<br>`` tags become newlines, every other tag is removed, runs of three
    or more newlines collapse to two, and the result is stripped. ``None`` or
    an empty string yields ``""`` (a notification may have no body).
    """
    if not html:
        return ""
    text = re.sub(r"<br\s*/?>", "\n", html, flags=re.IGNORECASE)
    text = re.sub(r"<[^>]+>", "", text)
    text = re.sub(r"\n{3,}", "\n\n", text)
    return text.strip()
def _fetch_api_base(page, jwt):
    """Return the API base URL of the app instance whose label contains STUDY."""
    instances = page.evaluate("""async (jwt) => {
        const res = await fetch('/_/api/dispatch/app_instances/', {
            headers: { 'Authorization': `Bearer ${jwt}` }
        });
        return res.json();
    }""", jwt)
    instance = next((i for i in instances if STUDY in i.get("label", "")), None)
    if not instance:
        raise ValueError(f"Instance pro {STUDY} nenalezena")
    return instance["api_base_url"]


def _download_notification(page, jwt, api_base, pdf_filename, meta):
    """Fetch one notification PDF and write it plus a JSON sidecar to disk."""
    import json

    # Query values are passed as params so that titles containing spaces,
    # "&" or "#" are URL-encoded instead of corrupting the query string.
    pdf_resp = page.request.get(
        f"{BASE_URL}{api_base}/api/v1/meta_api/pdfnotification",
        params={"pk": str(meta["pk"]), "title": str(meta["title"]), "html": "true"},
        headers={
            "Authorization": f"Bearer {jwt}",
            "lang": "en",
            "prancer_study": STUDY,
            "Accept": "application/json, text/plain, */*",
        },
    )
    if not pdf_resp.ok:
        print(f" → PDF chyba: {pdf_resp.status}")
        return
    with open(pdf_filename, "wb") as f:
        f.write(pdf_resp.body())
    print(f" → PDF uloženo: {os.path.basename(pdf_filename)}")
    json_filename = os.path.splitext(pdf_filename)[0] + ".json"
    with open(json_filename, "w", encoding="utf-8") as f:
        json.dump(meta, f, ensure_ascii=False, indent=2)
    print(f" → JSON uloženo: {os.path.basename(json_filename)}")


def main():
    """Download the notifications of SUBJECT in STUDY that are not yet in the DB.

    Logs in, opens the patient detail report, selects the subject and reads
    the ``table_1`` report response. Every notification is printed; those
    whose pk is not already stored are saved as a PDF plus a JSON sidecar in
    ``DETAILS_DIR/<STUDY>``. The browser is closed even when a step fails.
    """
    existing_pks = get_existing_pks(STUDY)
    print(f"V DB již existuje {len(existing_pks)} notifikací pro {STUDY}")
    with sync_playwright() as p:
        browser = p.chromium.launch(headless=False, args=["--start-maximized"])
        try:
            context = browser.new_context(no_viewport=True)
            page = context.new_page()
            print("Přihlašuji se...")
            page.goto(BASE_URL)
            page.wait_for_load_state("networkidle")
            page.get_by_label("Email *").fill(EMAIL)
            page.get_by_label("Password *").fill(PASSWORD)
            page.locator("#login__submit").click()
            page.wait_for_load_state("networkidle")
            page.get_by_label("Study *").click()
            page.get_by_role("option", name=STUDY).click()
            page.get_by_role("button", name="SELECT").click()
            page.wait_for_load_state("networkidle")
            page.goto(f"{BASE_URL}/report/patient_detail_report")
            page.wait_for_load_state("networkidle", timeout=60000)

            # JWT + api_base
            jwt = page.evaluate("localStorage.getItem('JWT.access')")
            if not jwt:
                raise RuntimeError("JWT.access nenalezen v localStorage — přihlášení selhalo?")
            print(f"JWT: {jwt[:30]}...")
            api_base = _fetch_api_base(page, jwt)
            print(f"API base: {api_base}")

            # Select the subject and capture the table_1 response directly.
            print(f"Vybírám subjekt {SUBJECT}...")
            input_field = page.locator('input[placeholder="search"], input[type="text"]').first
            input_field.click()
            input_field.fill(SUBJECT)
            page.wait_for_timeout(1000)
            with page.expect_response(
                lambda r: "report_data" in r.url and "table_1" in r.url,
                timeout=60000,
            ) as resp_info:
                page.locator("mat-option").first.dispatch_event("click")
            data = resp_info.value.json()

            out_dir = os.path.join(DETAILS_DIR, STUDY)
            os.makedirs(out_dir, exist_ok=True)
            print(f"\n{'='*60}")
            print(f"Subjekt: {SUBJECT} | Studie: {STUDY}")
            print(f"{'='*60}")
            count = 0
            for row in data.get("data", []):
                for notif in (row.get("notification") or []):
                    item = notif.get("item") or {}
                    pk = item.get("pk")
                    title = item.get("et_title")
                    label = (notif.get("label") or title or "").strip()
                    # Full label, spaces -> underscores, illegal characters removed.
                    safe_label = re.sub(r'[\\/*?:"<>|]', "", label).replace(" ", "_")
                    text = strip_html(item.get("body") or "")
                    count += 1
                    print(f"\n--- Notifikace #{count}: {safe_label} (pk={pk}) | event: {row.get('event_event_id')} ---")
                    print(text)
                    if pk in existing_pks:
                        print(f" → pk={pk} již v DB, přeskakuji")
                        continue
                    # "or" also covers a key that is present but null.
                    actual_date = row.get("actual_date_raw") or "0000-00-00"
                    pdf_filename = os.path.join(out_dir, f"{actual_date}_{safe_label}.pdf")
                    if os.path.exists(pdf_filename):
                        pdf_filename = os.path.join(out_dir, f"{actual_date}_{safe_label}_pk{pk}.pdf")
                    _download_notification(page, jwt, api_base, pdf_filename, {
                        "pk": pk,
                        "title": title,
                        "label": label,
                        "event": row.get("event_event_id"),
                        "actual_date": actual_date,
                        "subject": SUBJECT,
                        "study": STUDY,
                        "text": text,
                    })
                    page.wait_for_timeout(300)
            if count == 0:
                print("Žádné notifikace nalezeny.")
            else:
                print(f"\n{'='*60}")
                print(f"Celkem notifikací: {count}")
        finally:
            browser.close()


# Guarded so that importing this module does not launch a browser session.
if __name__ == "__main__":
    main()
+90
View File
@@ -0,0 +1,90 @@
"""
Stažení reportů z IWRS portálu — vše do jednoho adresáře `Incoming/`.
1. Subject Summary Report (per studie)
2. Subject Detail Reports + notifikace (per subjekt)
Import se spouští samostatně skriptem `import_all.py`.
"""
import os
import datetime
from playwright.sync_api import sync_playwright
import download_subject_details as dsd
# ── CONFIG ───────────────────────────────────────────────────────────────────
BASE_URL = "https://janssen.4gclinical.com"
# SECURITY: credentials do not belong in version control. Set IWRS_EMAIL and
# IWRS_PASSWORD in the environment; the literals below are only a
# backward-compatible fallback and should be removed (and the password
# rotated) once the environment variables are in place.
EMAIL = os.environ.get("IWRS_EMAIL", "vbuzalka@its.jnj.com")
PASSWORD = os.environ.get("IWRS_PASSWORD", "Vlado123++-+")
STUDIES = ["77242113UCO3001", "42847922MDD3003"]
# Everything is downloaded into a single folder next to this script.
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
INCOMING_DIR = os.path.join(BASE_DIR, "Incoming")
def unique_path(directory, stem, ext=".xlsx"):
    """Return a path for ``<stem><ext>`` in ``directory`` that does not exist yet.

    The plain name is preferred. If it is taken, an ``HHMM`` time tag is
    appended; if that is taken as well (second re-run within the same
    minute), a numeric suffix is added so an existing file is never
    overwritten.
    """
    path = os.path.join(directory, f"{stem}{ext}")
    if not os.path.exists(path):
        return path
    time_tag = datetime.datetime.now().strftime("%H%M")
    candidate = os.path.join(directory, f"{stem} {time_tag}{ext}")
    counter = 2
    while os.path.exists(candidate):
        candidate = os.path.join(directory, f"{stem} {time_tag} ({counter}){ext}")
        counter += 1
    return candidate
def login(page, study):
    """Sign in at ``BASE_URL`` and select ``study`` in the study picker.

    After each navigation step the function waits for the network to go idle.
    """
    def settle():
        page.wait_for_load_state("networkidle")

    page.goto(BASE_URL)
    settle()

    # Credentials form.
    email_box = page.get_by_label("Email *")
    email_box.fill(EMAIL)
    password_box = page.get_by_label("Password *")
    password_box.fill(PASSWORD)
    page.locator("#login__submit").click()
    settle()

    # Study picker.
    page.get_by_label("Study *").click()
    page.get_by_role("option", name=study).click()
    page.get_by_role("button", name="SELECT").click()
    settle()
def download_summary(page, study, today):
    """Download the Subject Summary Report of the selected study.

    Args:
        page: Logged-in Playwright page.
        study: Study identifier used in the file name and log output.
        today: Date string used as the file-name prefix.

    Returns:
        Path of the saved ``.xlsx`` file inside ``INCOMING_DIR``.
    """
    long_timeout = 120000
    print(f" [{study}] Stahuji Subject Summary Report...")
    report_url = f"{BASE_URL}/report/patient_summary_report"
    page.goto(report_url)
    page.wait_for_load_state("networkidle", timeout=long_timeout)
    target = unique_path(INCOMING_DIR, f"{today} {study} Subject Summary Report")
    with page.expect_download(timeout=long_timeout) as pending:
        page.get_by_role("button", name="Download XLS").click()
    # The download object is only available once the context has exited.
    pending.value.save_as(target)
    print(f" [{study}] Summary OK -> {os.path.basename(target)}")
    return target
def main():
    """Download the summary, detail reports and notifications of every study.

    All files go to ``INCOMING_DIR``. Each study runs in its own browser; an
    error is reported and the next study continues. The import is a separate
    step (``import_all.py``).
    """
    rule = "=" * 60
    today = datetime.date.today().strftime("%Y-%m-%d")
    os.makedirs(INCOMING_DIR, exist_ok=True)

    with sync_playwright() as p:
        for study in STUDIES:
            print("\n" + rule)
            print(f"[{study}] Stažení reportů")
            print(rule)

            browser = p.chromium.launch(headless=False)
            page = browser.new_context(accept_downloads=True).new_page()
            try:
                login(page, study)
                download_summary(page, study, today)
                # Detail XLSX files + notifications go straight into Incoming/.
                dsd.run(page, study, out_dir=INCOMING_DIR, subjects_source_dir=INCOMING_DIR)
            except Exception as e:
                print(f" [{study}] CHYBA: {e}")
            finally:
                browser.close()

    print("\n" + rule)
    print(f"Stahování hotovo. Soubory v: {INCOMING_DIR}")
    print("Pro import spusť: python import_all.py")
    print(rule)


if __name__ == "__main__":
    main()
+107
View File
@@ -0,0 +1,107 @@
"""
Import všech čekajících reportů z `Incoming/` do MongoDB.
Pořadí zpracování per typ + studie: nejstarší soubor podle mtime první
(důležité pro chronologickou správnost snapshotů).
Po úspěšném importu se soubor přesune do `Incoming/Zpracováno/`.
Při chybě zůstane soubor v `Incoming/`.
"""
import os
import sys
import glob
import shutil
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from common.mongo_writer import ensure_indexes
import import_to_mongo
import import_notifications_to_mongo
# Studies whose reports are imported.
STUDIES = ["77242113UCO3001", "42847922MDD3003"]

# Pending files sit in Incoming/; imported ones are archived below it.
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
INCOMING_DIR = os.path.join(BASE_DIR, "Incoming")
DONE_DIR = os.path.join(INCOMING_DIR, "Zpracováno")
def _move_done(path):
    """Move ``path`` into ``DONE_DIR``, replacing a file of the same name."""
    os.makedirs(DONE_DIR, exist_ok=True)
    archived = os.path.join(DONE_DIR, os.path.basename(path))
    # Name collision -> overwrite (Mongo already holds the current data; the
    # file is only an archive copy).
    already_archived = os.path.exists(archived)
    if already_archived:
        os.remove(archived)
    shutil.move(path, archived)
def _sorted_by_mtime(paths):
    """Return ``paths`` ordered oldest first, without Excel lock files (``~$...``)."""
    real_files = [
        candidate
        for candidate in paths
        if not os.path.basename(candidate).startswith("~$")
    ]
    real_files.sort(key=os.path.getmtime)
    return real_files
def import_summaries(study):
    """Import every pending Subject Summary Report of ``study``, oldest first.

    A file is moved to the archive only after its import succeeded; on error
    it stays in ``INCOMING_DIR`` and the next file is tried.
    """
    pending = _sorted_by_mtime(
        glob.glob(os.path.join(INCOMING_DIR, f"* {study} Subject Summary Report*.xlsx"))
    )
    if len(pending) == 0:
        print(f" [{study}] summary: nic ke zpracování")
        return
    print(f" [{study}] summary: {len(pending)} soubor(ů) (oldest first)")
    for report in pending:
        try:
            import_to_mongo.import_subject_summary(study, report)
            _move_done(report)
        except Exception as e:
            print(f" [{study}] CHYBA summary {os.path.basename(report)}: {e}")
def import_details(study):
    """Import every pending Subject Detail workbook of *study*, oldest first.

    The file name is parsed to obtain the study and the subject. Files whose
    name cannot be parsed are reported and skipped; files whose parsed study
    differs from *study* are skipped silently. A file that imports without
    error is moved to the archive; a failure is printed and the file stays.
    """
    mask = os.path.join(INCOMING_DIR, f"* {study} * Subject Detail.xlsx")
    pending = _sorted_by_mtime(glob.glob(mask))
    if not pending:
        print(f" [{study}] detail: nic ke zpracování")
        return
    print(f" [{study}] detail: {len(pending)} soubor(ů) (oldest first)")
    for detail_file in pending:
        info = import_to_mongo.parse_detail_filename(detail_file)
        if not info:
            print(f" [{study}] PŘESKAKUJI (nelze parsovat název): {os.path.basename(detail_file)}")
            continue
        _prefix, file_study, subject_id = info
        if file_study == study:
            try:
                import_to_mongo.import_visits_single_file(study, subject_id, detail_file)
                _move_done(detail_file)
            except Exception as exc:
                print(f" [{study}] CHYBA detail {os.path.basename(detail_file)}: {exc}")
        # any other study: the file belongs to a different run of this loop
def main():
    """Run the full import: summaries and details per study, then notifications.

    Does nothing except print a message when ``INCOMING_DIR`` does not exist.
    """
    if not os.path.isdir(INCOMING_DIR):
        print(f"Adresář neexistuje: {INCOMING_DIR}")
        return

    rule = "=" * 60

    def banner(title, lead=""):
        # Three-line section header; *lead* is prepended to the first rule.
        print(lead + rule)
        print(title)
        print(rule)

    ensure_indexes()
    banner("Import Subject Summary + Visits")
    for study in STUDIES:
        import_summaries(study)
        import_details(study)

    banner("Import notifikací", lead="\n")
    import_notifications_to_mongo.import_from_dir(INCOMING_DIR, DONE_DIR, STUDIES)

    banner(f"Hotovo. Zpracované soubory: {DONE_DIR}", lead="\n")
if __name__ == "__main__":
main()
+23
View File
@@ -0,0 +1,23 @@
# JustOpenOutlook_v1.0
**Verze:** 1.0
**Datum:** 2026-06-03
## Cíl
Jen otevře Outlook OWA v Playwrightu pomocí už uloženého persistent profilu —
žádný login, žádné ukládání.
## Co dělá
1. Načte profil `outlook_profile/` (vytvořený `outlook_login_v1.0.py`).
2. Otevře `https://outlook.cloud.microsoft/mail/`.
3. Čeká na Enter v konzoli.
4. Zavře prohlížeč.
## Spuštění
```
python JustOpenOutlook_v1.0.py
```
## Předpoklad
Existuje `outlook_profile/` ve stejném adresáři.
Pokud ne — nejprve spustit `outlook_login_v1.0.py`.
+50
View File
@@ -0,0 +1,50 @@
"""
=======================================================================
Název: JustOpenOutlook_v1.0.py
Verze: 1.0
Datum: 2026-06-03
Popis: Otevře Outlook OWA v persistent Chromium profilu vytvořeném
skriptem outlook_login_v1.0.py. Žádný login — pouze otevře
okno, počká, až uživatel stiskne Enter, a zavře.
=======================================================================
"""
from pathlib import Path
from playwright.sync_api import sync_playwright
# Folder that contains this script.
BASE_DIR = Path(__file__).resolve().parent
# Persistent Chromium profile; per the module docstring it is created by outlook_login_v1.0.py.
PROFILE_DIR = BASE_DIR / "outlook_profile"
# Outlook web mailbox URL opened by main().
START_URL = "https://outlook.cloud.microsoft/mail/"
def main() -> None:
    """Open Outlook in the saved browser profile and wait for Enter.

    Prints a hint and returns immediately when the profile folder is missing.
    """
    if not PROFILE_DIR.exists():
        print(f" Profil nenalezen: {PROFILE_DIR}")
        print(" Nejprve spusť outlook_login_v1.0.py a přihlas se.")
        return

    launch_options = {
        "user_data_dir": str(PROFILE_DIR),
        "headless": False,
        "no_viewport": True,
        "args": [
            "--disable-blink-features=AutomationControlled",
            "--start-maximized",
        ],
    }
    separator = "=" * 70
    with sync_playwright() as p:
        context = p.chromium.launch_persistent_context(**launch_options)
        # Reuse the tab the persistent context opens by itself, if there is one.
        if context.pages:
            page = context.pages[0]
        else:
            page = context.new_page()
        page.goto(START_URL)
        print()
        print(separator)
        print(" Outlook otevřen. Stiskni Enter pro zavření.")
        print(separator)
        input()
        context.close()
if __name__ == "__main__":
main()
+54
View File
@@ -0,0 +1,54 @@
# download_all_inbox_eml_v1.0
**Verze:** 1.0
**Datum:** 2026-06-03
## Cíl
Stáhnout zprávy z Outlook Inboxu jako `.eml` soubory.
## Klíčový princip — virtualizovaný seznam
OWA drží v DOM jen ~16 viditelných řádků. `nth(19)` proto nefunguje.
Řešení: **navigace klávesnicí** — vybrat první zprávu a opakovaně mačkat
`ArrowDown`. Outlook sám scrolluje a dorenderovává. Aktuálně vybraná zpráva
je vždy `[role="option"][aria-selected="true"]`.
### Oddělovače sekcí (Today / Yesterday / This week)
Jsou to `role="button"` `aria-expanded` prvky, ne zprávy. Když na nich kurzor
po `ArrowDown` zastaví, **žádná** zpráva nemá `aria-selected`
(`selected.count() == 0`). Takový krok se musí jen přeskočit (`ArrowDown` dál),
NEpočítat jako zprávu a NEukončovat smyčku. Konec seznamu se pozná až podle
toho, že se `aria-label` vybrané zprávy přestane měnit (`no_progress`).
Alternativa: v OWA přepnout řazení na "Show as Messages" (bez seskupení podle
data) — pak seznam žádné oddělovače nemá.
## Postup
1. Otevře OWA z persistent profilu (`outlook_profile/`).
2. Přejde do Inboxu.
3. Vybere první zprávu.
4. Smyčka: stáhne vybranou (pravý klik → Download → Download as EML) →
`ArrowDown` → opakuje, dokud se výběr nepřestane hýbat (= konec seznamu).
## Nastavení (v hlavičce skriptu)
- `LIMIT` — max počet **uložených zpráv** (`None` = celý Inbox). Aktuálně `30`.
- `SKIP_EXISTING` — `True` = soubor stejného jména v `downloads/` znovu neuloží;
`False` (aktuální) = existující soubor **smaže a uloží nový** (přepis).
## Výstup
`downloads/<název_z_OWA>.eml`. Při kolizi jmen:
- `SKIP_EXISTING=False` → starý soubor se smaže a přepíše novým,
- `SKIP_EXISTING=True` → soubor se ponechá, nový se neuloží.
## Spuštění
```
python download_all_inbox_eml_v1.0.py
```
## Poznámky / omezení
- Celý Inbox (tisíce zpráv) přes UI je pomalý a křehký — pro velký objem
nejdřív zúžit hledáním/filtrem v OWA. `LIMIT=30` je rozumný test.
- `SKIP_EXISTING` nešetří čas: identitu zprávy známe až z názvu **po** stažení,
takže pravý klik + download proběhne pro každou zprávu; jen se nepřepíše soubor.
- Konec seznamu se pozná tak, že se `aria-label` vybrané zprávy přestane měnit
(počítadlo `no_progress`, práh `NO_PROGRESS_MAX = 4`).
- Okno se po doběhnutí nezavře, čeká na Enter.
+216
View File
@@ -0,0 +1,216 @@
"""
=======================================================================
Název: download_all_inbox_eml_v1.0.py
Verze: 1.0
Datum: 2026-06-03
Popis: Stáhne zprávy z Outlook Inboxu jako .eml. Virtualizovaný seznam
řeší navigací klávesnicí (ArrowDown) — Outlook sám scrolluje
a dorenderovává. Postup:
1. vybrat první zprávu
2. stáhnout vybranou (pravý klik → Download → Download as EML)
3. ArrowDown na další
4. opakovat, dokud se výběr (aria-selected) nepřestane hýbat
Používá persistent profil z outlook_login_v1.0.py.
Nastavení:
LIMIT max počet zpráv (None = celý Inbox)
SKIP_EXISTING přeskočit zprávy, jejichž EML už v downloads/ existuje
=======================================================================
"""
import re
from pathlib import Path
from playwright.sync_api import sync_playwright
# Folder that contains this script; the profile and output folders live next to it.
BASE_DIR = Path(__file__).resolve().parent
# Persistent Chromium profile; per the module docstring it comes from outlook_login_v1.0.py.
PROFILE_DIR = BASE_DIR / "outlook_profile"
# Destination folder for the downloaded .eml files.
OUT_DIR = BASE_DIR / "downloads"
# Outlook web mailbox URL opened by main().
START_URL = "https://outlook.cloud.microsoft/mail/"
LIMIT = 30  # maximum number of messages; None = the whole Inbox
SKIP_EXISTING = False  # False = overwrite an existing file of the same name (delete + save anew)
def safe_name(name: str) -> str:
    """Sanitize *name* so it can be used as a file name on Windows.

    Characters that Windows forbids in file names (``< > : " / \\ | ? *`` and
    all ASCII control characters) are replaced with ``_``. Surrounding
    whitespace and dots are stripped, and the result is limited to 150
    characters. Trailing dots/spaces are stripped again *after* truncation,
    because the cut can expose one and Windows does not accept file names that
    end with a dot or a space.

    Returns ``"message"`` when nothing usable is left.
    """
    cleaned = re.sub(r'[<>:"/\\|?*\x00-\x1f]', "_", name).strip().strip(".")
    cleaned = cleaned[:150].rstrip(" .")
    return cleaned or "message"
def download_selected(page, out_dir: Path) -> Path | None:
    """Download the currently selected message as EML via its context menu.

    Right-clicks the selected list row, hovers the "Download" menu entry and
    clicks "Download as EML" (English or Czech labels). When no EML sub-entry
    is visible, the "Download" entry itself is clicked instead.

    Returns the target path in *out_dir*, or ``None`` when no row is selected,
    the menu entry is missing, or the download does not start. With
    ``SKIP_EXISTING`` set, an already existing target is returned untouched;
    otherwise it is deleted and replaced by the new download.
    """

    def first_visible_menuitem(labels):
        # First menu item (in label order) that exists and is visible, else None.
        for label in labels:
            candidate = page.get_by_role("menuitem", name=label).first
            if candidate.count() and candidate.is_visible():
                return candidate
        return None

    row = page.locator('[role="option"][aria-selected="true"]').first
    if row.count() == 0:
        return None
    row.click(button="right")
    page.wait_for_timeout(600)

    # Parent entry that owns the download submenu.
    download_parent = first_visible_menuitem(("Download", "Stáhnout"))
    if download_parent is None:
        page.keyboard.press("Escape")
        return None
    download_parent.hover()
    page.wait_for_timeout(500)

    # Submenu entry; fall back to clicking the parent entry directly.
    eml_item = first_visible_menuitem(
        ("Download as EML", "Stáhnout jako EML", "Stáhnout jako .eml")
    )
    trigger = download_parent if eml_item is None else eml_item
    try:
        with page.expect_download(timeout=15_000) as pending:
            trigger.click()
        download = pending.value
    except Exception:
        # Best effort: close whatever menu is still open and report failure.
        page.keyboard.press("Escape")
        return None

    file_name = safe_name(download.suggested_filename or "message.eml")
    if not file_name.lower().endswith(".eml"):
        file_name = file_name + ".eml"
    target = out_dir / file_name
    already_there = target.exists()
    if already_there and SKIP_EXISTING:
        return target  # keep the existing file, do not save again
    if already_there:
        target.unlink()  # overwrite: drop the old copy before saving the new one
    download.save_as(str(target))
    return target
def main() -> None:
    """Download Inbox messages as .eml files by walking the list with ArrowDown.

    Opens Outlook in the persistent profile, opens the Inbox, selects the
    first message and then repeats: download the selected message, press
    ArrowDown. The loop ends when ``LIMIT`` messages were saved or when the
    selection stopped changing ``NO_PROGRESS_MAX`` times in a row. Finally a
    summary is printed and the window stays open until Enter is pressed.

    Prints a hint and returns immediately when the profile folder is missing.
    """
    if not PROFILE_DIR.exists():
        print(f" Profil nenalezen: {PROFILE_DIR}")
        print(" Nejprve spusť outlook_login_v1.0.py.")
        return
    OUT_DIR.mkdir(exist_ok=True)
    with sync_playwright() as p:
        context = p.chromium.launch_persistent_context(
            user_data_dir=str(PROFILE_DIR),
            headless=False,
            no_viewport=True,
            accept_downloads=True,
            args=[
                "--disable-blink-features=AutomationControlled",
                "--start-maximized",
            ],
        )
        page = context.pages[0] if context.pages else context.new_page()

        # 1) Open Outlook and wait until the search box is present.
        print(" 1/4 Otevírám Outlook...")
        page.goto(START_URL)
        page.wait_for_load_state("domcontentloaded")
        search_selector = (
            '[placeholder*="Search"], [aria-label*="Search"], '
            '[placeholder*="Hledat"], [aria-label*="Hledat"]'
        )
        page.wait_for_selector(search_selector, timeout=30_000)

        # 2) Inbox (English or Czech folder name); first visible candidate wins.
        print(" 2/4 Otevírám Inbox...")
        inbox_candidates = [
            'div[role="treeitem"]:has-text("Inbox")',
            'div[role="treeitem"]:has-text("Doručená pošta")',
            'text=Inbox',
            'text=Doručená pošta',
        ]
        for sel in inbox_candidates:
            loc = page.locator(sel).first
            if loc.count() and loc.is_visible():
                loc.click()
                break
        page.wait_for_selector('div[role="option"]', timeout=15_000)
        page.wait_for_timeout(1000)

        # 3) Select the first message.
        print(" 3/4 Vybírám první zprávu...")
        page.locator('div[role="option"]').first.click()
        page.wait_for_timeout(800)

        # 4) Loop: download the selected message -> ArrowDown -> while the
        #    selection keeps moving. When no option is selected (per the
        #    project notes this happens on Today/Yesterday/... section
        #    dividers), the step is only skipped with another ArrowDown: it is
        #    not counted as a message and does not end the loop by itself.
        print(" 4/4 Stahuji zprávy...\n")
        saved = 0
        dividers = 0
        failed = 0
        prev_label = None
        no_progress = 0       # consecutive steps without a newly selected message
        NO_PROGRESS_MAX = 4   # that many in a row = end of list / stuck
        while LIMIT is None or saved < LIMIT:
            selected = page.locator('[role="option"][aria-selected="true"]').first

            # (a) nothing selected -> step over it
            if selected.count() == 0:
                dividers += 1
                no_progress += 1
                if no_progress >= NO_PROGRESS_MAX:
                    print(" Konec seznamu / zaseknutí — končím.")
                    break
                page.keyboard.press("ArrowDown")
                page.wait_for_timeout(250)
                continue

            label = selected.get_attribute("aria-label") or ""

            # (b) the selection did not move (end of list)
            if label == prev_label:
                no_progress += 1
                if no_progress >= NO_PROGRESS_MAX:
                    print(" Konec seznamu (výběr se nehýbe).")
                    break
                page.keyboard.press("ArrowDown")
                page.wait_for_timeout(250)
                continue

            # (c) a new message -> download it
            no_progress = 0
            prev_label = label
            target = download_selected(page, OUT_DIR)
            if target is None:
                failed += 1
                print(f" [!] selhalo: {label[:70]}")
            else:
                saved += 1
                print(f" [{saved:>4}] {target.name}")

            # Give keyboard focus back to the list by clicking the message
            # row, then move on. The click is best effort on purpose: a failed
            # refocus must not abort the run.
            try:
                selected.click()
            except Exception:
                pass
            page.wait_for_timeout(200)
            page.keyboard.press("ArrowDown")
            page.wait_for_timeout(300)

        # Fixed: the failure count and the output folder used to be printed
        # without any separator between them.
        print(f"\n Hotovo. Uloženo {saved}, oddělovačů přeskočeno {dividers}, "
              f"selhalo {failed} → {OUT_DIR}")
        input(" Stiskni Enter pro zavření okna... ")
        context.close()
if __name__ == "__main__":
main()
+30
View File
@@ -0,0 +1,30 @@
# download_first_inbox_eml_v1.0
**Verze:** 1.0
**Datum:** 2026-06-03
## Cíl
Otevřít Outlook OWA, vybrat první zprávu v Inboxu a stáhnout ji jako `.eml`.
## Kroky
1. Otevře OWA z persistent profilu (`outlook_profile/`).
2. Přejde do Inboxu / Doručené pošty.
3. Klikne na první zprávu v seznamu.
4. **Pravý klik** na řádek zprávy → kontextové menu (patří celé zprávě, ne příloze)
→ hover na **Download** → klik **Download as EML**, soubor uloží do `downloads/`.
## Výstup
`downloads/<původní_název_z_OWA>.eml`
## Spuštění
```
python download_first_inbox_eml_v1.0.py
```
## Poznámky
- **Pravý klik na řádek zprávy** je spolehlivější než "..." v toolbaru — kontextové
menu je vždy svázané s celou zprávou, takže odpadá riziko trefení "..." přílohy.
- Na **Download** se najíždí `hover()` (otevře submenu), ne klikem.
- Selektory mají EN i CZ varianty.
- `accept_downloads=True` + `page.expect_download()` — bez toho Playwright stažení nezachytí.
- Okno se po stažení nezavře, čeká na Enter.
+142
View File
@@ -0,0 +1,142 @@
"""
=======================================================================
Název: download_first_inbox_eml_v1.0.py
Verze: 1.0
Datum: 2026-06-03
Popis: Pokusný skript: otevře Outlook OWA, přejde do Inboxu, klikne
na první zprávu a stáhne ji jako .eml přes kontextové menu
(pravý klik na řádek zprávy) → Download → Download as EML.
Používá persistent profil z outlook_login_v1.0.py.
=======================================================================
"""
from pathlib import Path
from playwright.sync_api import sync_playwright
# Folder that contains this script; the profile and output folders live next to it.
BASE_DIR = Path(__file__).resolve().parent
# Persistent Chromium profile; per the module docstring it comes from outlook_login_v1.0.py.
PROFILE_DIR = BASE_DIR / "outlook_profile"
# Destination folder for the downloaded .eml file and debug screenshots.
OUT_DIR = BASE_DIR / "downloads"
# Outlook web mailbox URL opened by main().
START_URL = "https://outlook.cloud.microsoft/mail/"
def main() -> None:
    """Download the first Inbox message as an .eml file.

    Opens Outlook in the persistent profile, opens the Inbox, selects the
    first message, right-clicks it and goes through Download -> Download as
    EML. The file is saved into ``OUT_DIR`` under the name suggested by the
    browser. On failure a screenshot ``debug_menu.png`` is written to
    ``OUT_DIR``. The window stays open until Enter is pressed.

    Prints a hint and returns immediately when the profile folder is missing.
    """
    if not PROFILE_DIR.exists():
        print(f" Profil nenalezen: {PROFILE_DIR}")
        print(" Nejprve spusť outlook_login_v1.0.py.")
        return
    OUT_DIR.mkdir(exist_ok=True)
    with sync_playwright() as p:
        context = p.chromium.launch_persistent_context(
            user_data_dir=str(PROFILE_DIR),
            headless=False,
            no_viewport=True,
            accept_downloads=True,
            args=[
                "--disable-blink-features=AutomationControlled",
                "--start-maximized",
            ],
        )
        page = context.pages[0] if context.pages else context.new_page()

        # 1) Open Outlook and wait until the search box is present.
        # Fixed: steps 1 and 2 were labelled "x/6" while the script has four steps.
        print(" 1/4 Otevírám Outlook...")
        page.goto(START_URL)
        page.wait_for_load_state("domcontentloaded")
        search_selector = (
            '[placeholder*="Search"], [aria-label*="Search"], '
            '[placeholder*="Hledat"], [aria-label*="Hledat"]'
        )
        page.wait_for_selector(search_selector, timeout=30_000)

        # 2) Inbox (English or Czech folder name); first visible candidate wins.
        print(" 2/4 Otevírám Inbox...")
        inbox_candidates = [
            'div[role="treeitem"]:has-text("Inbox")',
            'div[role="treeitem"]:has-text("Doručená pošta")',
            'text=Inbox',
            'text=Doručená pošta',
        ]
        for sel in inbox_candidates:
            loc = page.locator(sel).first
            if loc.count() and loc.is_visible():
                loc.click()
                break
        page.wait_for_selector('div[role="option"]', timeout=15_000)
        page.wait_for_timeout(1000)

        # 3) First message in the list.
        print(" 3/4 Vybírám první zprávu...")
        first_msg = page.locator('div[role="option"]').first
        first_msg.click()
        page.wait_for_timeout(1000)

        # 4) RIGHT click on the message row: the context menu belongs to the
        #    whole message (not to an attachment). Hover "Download" to open
        #    the submenu, then click "Download as EML".
        print(" 4/4 Pravý klik → Download → Download as EML...")
        first_msg.click(button="right")
        page.wait_for_timeout(700)
        download_parent = None
        for name in ("Download", "Stáhnout"):
            loc = page.get_by_role("menuitem", name=name).first
            if loc.count() and loc.is_visible():
                download_parent = loc
                break
        if download_parent is None:
            # Diagnostics: list what the menu actually contains and take a screenshot.
            items = page.get_by_role("menuitem").all()
            print(" ! Download položka v menu nenalezena. Obsah menu:")
            for it in items:
                try:
                    txt = it.inner_text(timeout=500).strip().replace("\n", " | ")
                    print(f" - {txt[:100]}")
                except Exception:
                    # An unreadable item is simply left out of the listing.
                    pass
            page.screenshot(path=str(OUT_DIR / "debug_menu.png"))
            print(f" screenshot: {OUT_DIR / 'debug_menu.png'}")
            input(" Enter pro zavření... ")
            context.close()
            return
        download_parent.hover()
        page.wait_for_timeout(600)
        eml_item = None
        for name in ("Download as EML", "Stáhnout jako EML", "Stáhnout jako .eml"):
            loc = page.get_by_role("menuitem", name=name).first
            if loc.count() and loc.is_visible():
                eml_item = loc
                break
        try:
            if eml_item is not None:
                with page.expect_download(timeout=15_000) as download_info:
                    eml_item.click()
            else:
                # some OWA builds download the EML directly, without a submenu
                with page.expect_download(timeout=15_000) as download_info:
                    download_parent.click()
            download = download_info.value
        except Exception as e:
            page.screenshot(path=str(OUT_DIR / "debug_menu.png"))
            print(f" ! Stažení selhalo: {e}")
            print(f" screenshot: {OUT_DIR / 'debug_menu.png'}")
            input(" Enter pro zavření... ")
            context.close()
            return
        target = OUT_DIR / (download.suggested_filename or "first_inbox.eml")
        download.save_as(str(target))
        print(f" Hotovo → {target}")
        if not target.name.lower().endswith(".eml"):
            # A different extension suggests something other than the message was downloaded.
            print(f" ! POZOR: {target.name} nevypadá jako EML — možná stažena příloha!")
        input(" Stiskni Enter pro zavření okna... ")
        context.close()
if __name__ == "__main__":
main()
+30
View File
@@ -0,0 +1,30 @@
# forward_last_to_klucho_v1.0
**Verze:** 1.0
**Datum:** 2026-06-03
## Cíl
Pokusný skript: přepošle poslední odeslaný e-mail na `klucho@gastroenterolog.com`
na adresu `vladimir.buzalka@buzalka.cz` s předmětem `Ahoj` a slovem `Ahoj`
na prvním řádku těla.
## Kroky
1. Otevře OWA (persistent profil z `outlook_login_v1.0.py`).
2. Přejde do Odeslané pošty.
3. Vyhledá `to:klucho@gastroenterolog.com`.
4. Otevře nejnovější výsledek.
5. Klikne Forward / Přeposlat.
6. Vyplní příjemce.
7. Změní předmět na `Ahoj`.
8. Vloží `Ahoj` na první řádek těla.
9. Odešle (a počká na potvrzení Enterem před zavřením okna).
## Poznámky
- Selektory mají EN i CZ varianty (`Forward` / `Přeposlat`, `To` / `Komu`, …).
- `headless=False` — schválně viditelné, aby šlo sledovat průběh.
- POZOR: krok 9 reálně odešle e-mail. Pro suchý běh zakomentuj `send_btn.click()`.
## Spuštění
```
python forward_last_to_klucho_v1.0.py
```
+155
View File
@@ -0,0 +1,155 @@
"""
=======================================================================
Název: forward_last_to_klucho_v1.0.py
Verze: 1.0
Datum: 2026-06-03
Popis: Pokusný skript: v Outlook OWA najde poslední odeslaný e-mail
na adresu klucho@gastroenterolog.com, otevře Forward, vyplní
příjemce vladimir.buzalka@buzalka.cz, předmět "Ahoj", na
první řádek těla "Ahoj" a odešle.
Používá persistent profil z outlook_login_v1.0.py.
headless=False kvůli sledování průběhu.
=======================================================================
"""
from pathlib import Path
from playwright.sync_api import sync_playwright
# Folder that contains this script.
BASE_DIR = Path(__file__).resolve().parent
# Persistent Chromium profile; per the module docstring it comes from outlook_login_v1.0.py.
PROFILE_DIR = BASE_DIR / "outlook_profile"
# Outlook web mailbox URL opened by main().
START_URL = "https://outlook.cloud.microsoft/mail/"
# Address used in the "to:" search that locates the message to forward.
TARGET_RECIPIENT = "klucho@gastroenterolog.com"
# Address the located message is forwarded to.
FORWARD_TO = "vladimir.buzalka@buzalka.cz"
# Text used both as the new subject and as the first line of the body.
GREETING = "Ahoj"
def main() -> None:
    """Forward the first message found by a ``to:TARGET_RECIPIENT`` search.

    Opens Outlook in the persistent profile, opens Sent Items, searches for
    messages addressed to ``TARGET_RECIPIENT``, opens the first result, clicks
    Forward, fills in ``FORWARD_TO``, replaces the subject with ``GREETING``,
    types ``GREETING`` at the top of the body and clicks Send.

    WARNING: step 9 really sends the e-mail.

    Prints a hint and returns immediately when the profile folder is missing.
    """
    if not PROFILE_DIR.exists():
        print(f" Profil nenalezen: {PROFILE_DIR}")
        print(" Nejprve spusť outlook_login_v1.0.py.")
        return
    with sync_playwright() as p:
        context = p.chromium.launch_persistent_context(
            user_data_dir=str(PROFILE_DIR),
            headless=False,
            no_viewport=True,
            args=[
                "--disable-blink-features=AutomationControlled",
                "--start-maximized",
            ],
        )
        page = context.pages[0] if context.pages else context.new_page()

        # 1) Open Outlook
        print(" 1/9 Otevírám Outlook...")
        page.goto(START_URL)
        page.wait_for_load_state("domcontentloaded")
        # The search box has a varying placeholder; try several variants.
        search_selector = (
            '[placeholder*="Search"], [aria-label*="Search"], '
            '[placeholder*="Hledat"], [aria-label*="Hledat"]'
        )
        page.wait_for_selector(search_selector, timeout=30_000)

        # 2) Go to Sent Items (English or Czech folder name). If no candidate
        #    is visible, nothing is clicked and the script continues anyway.
        print(" 2/9 Otevírám Odeslanou poštu...")
        sent_candidates = [
            'div[role="treeitem"]:has-text("Sent Items")',
            'div[role="treeitem"]:has-text("Odeslaná pošta")',
            'text=Sent Items',
            'text=Odeslaná pošta',
        ]
        for sel in sent_candidates:
            loc = page.locator(sel).first
            if loc.count() and loc.is_visible():
                loc.click()
                break
        page.wait_for_timeout(1500)

        # 3) Search for e-mails sent to the recipient
        print(f" 3/9 Hledám e-maily na {TARGET_RECIPIENT}...")
        search = page.locator(search_selector).first
        search.click()
        search.fill(f"to:{TARGET_RECIPIENT}")
        search.press("Enter")
        page.wait_for_timeout(2500)

        # 4) Click the first result (assumed to be the newest one; this relies
        #    on the list ordering in Outlook, which is not checked here).
        print(" 4/9 Otevírám nejnovější výsledek...")
        first_msg = page.locator('div[role="option"]').first
        first_msg.wait_for(state="visible", timeout=15_000)
        first_msg.click()
        page.wait_for_timeout(2000)

        # 5) Forward (English or Czech label); first visible candidate wins.
        print(" 5/9 Klikám Forward...")
        forward_candidates = [
            'button[aria-label="Forward"]',
            'button[aria-label="Přeposlat"]',
            'button:has-text("Forward")',
            'button:has-text("Přeposlat")',
        ]
        clicked = False
        for sel in forward_candidates:
            btn = page.locator(sel).first
            if btn.count() and btn.is_visible():
                btn.click()
                clicked = True
                break
        if not clicked:
            print(" ! Tlačítko Forward nenalezeno — končím.")
            input(" Stiskni Enter pro zavření... ")
            context.close()
            return

        # 6) Recipient
        print(f" 6/9 Vyplňuji příjemce {FORWARD_TO}...")
        to_field = page.locator(
            '[aria-label="To"], [aria-label="Komu"], '
            '[placeholder*="To"], [placeholder*="Komu"]'
        ).first
        to_field.wait_for(state="visible", timeout=10_000)
        to_field.click()
        to_field.fill(FORWARD_TO)
        page.keyboard.press("Tab")
        page.wait_for_timeout(500)

        # 7) Subject
        print(f" 7/9 Měním předmět na '{GREETING}'...")
        subject = page.locator(
            '[aria-label="Subject"], [aria-label="Předmět"]'
        ).first
        subject.click()
        # select everything and overwrite it
        page.keyboard.press("Control+A")
        page.keyboard.type(GREETING)

        # 8) Body: the greeting goes on the first line
        print(f" 8/9 Vkládám '{GREETING}' na první řádek těla...")
        body = page.locator(
            '[aria-label="Message body"], [aria-label="Tělo zprávy"], '
            'div[role="textbox"][contenteditable="true"]'
        ).first
        body.click()
        page.keyboard.press("Control+Home")
        page.keyboard.type(GREETING)
        page.keyboard.press("Enter")

        # 9) Send. WARNING: this really sends the e-mail.
        print(" 9/9 Odesílám...")
        send_btn = page.locator(
            'button[aria-label="Send"], button[aria-label="Odeslat"]'
        ).first
        send_btn.click()
        page.wait_for_timeout(3000)
        print(" Hotovo — e-mail odeslán.")
        input(" Stiskni Enter pro zavření okna... ")
        context.close()
if __name__ == "__main__":
main()
File diff suppressed because one or more lines are too long
+44
View File
@@ -0,0 +1,44 @@
# outlook_login_v1.0
**Verze:** 1.0
**Datum:** 2026-06-03
## Cíl
Jednorázové ruční přihlášení do Outlook OWA (`https://outlook.cloud.microsoft/mail/`)
a uložení session pro pozdější neinteraktivní skripty.
## Co dělá
1. Spustí Chromium v **persistent contextu** (adresář `outlook_profile/` vedle skriptu).
2. Otevře OWA.
3. Čeká, až se uživatel ručně přihlásí (účet, heslo, MFA, "Stay signed in").
4. V konzoli se zeptá `Hotovo? Napiš 'OK' pro uložení session:`.
5. Po zadání `OK` uloží:
- `outlook_profile/` — persistent profil (cookies, IndexedDB, service workers)
- `outlook_auth.json` — `storage_state` (cookies + localStorage)
6. Zavře prohlížeč.
## Spuštění
```
python outlook_login_v1.0.py
```
## Závislosti
- `playwright` (`pip install playwright && playwright install chromium`)
## Použití session v dalším skriptu
Persistent profil (doporučeno pro OWA):
```python
context = p.chromium.launch_persistent_context(
user_data_dir="./outlook_profile",
headless=False,
)
```
Nebo `storage_state` (pokud chceš jen cookies):
```python
context = browser.new_context(storage_state="outlook_auth.json")
```
## Poznámky
- Při prvním přihlášení zaškrtnout **"Zůstat přihlášen"** — MFA cookie u J&J typicky vydrží ~30 dní.
- Pokud session vyprší, stačí znovu spustit tento skript.
+62
View File
@@ -0,0 +1,62 @@
"""
=======================================================================
Název: outlook_login_v1.0.py
Verze: 1.0
Datum: 2026-06-03
Popis: Otevře Outlook OWA (https://outlook.cloud.microsoft/mail/)
v persistent Chromium profilu, počká na ruční přihlášení
uživatele (včetně MFA), po potvrzení v konzoli uloží
session (profile + storage_state) a zavře prohlížeč.
Další skripty mohou stejný profil znovu otevřít bez loginu.
=======================================================================
"""
from pathlib import Path
from playwright.sync_api import sync_playwright
# Folder that contains this script.
BASE_DIR = Path(__file__).resolve().parent
# Persistent Chromium profile folder; main() creates it when missing.
PROFILE_DIR = BASE_DIR / "outlook_profile"
# File that the browser storage state is exported to after login.
STORAGE_STATE = BASE_DIR / "outlook_auth.json"
# Outlook web mailbox URL opened by main().
START_URL = "https://outlook.cloud.microsoft/mail/"
def main() -> None:
    """Open Outlook for a manual login and optionally export the session.

    Launches the persistent profile (creating its folder when needed), opens
    Outlook and waits for a console answer. When the answer is "ok" (case and
    surrounding whitespace are ignored) the storage state is written to
    ``STORAGE_STATE``; a failure of that export is only reported. The browser
    is closed in both cases.
    """
    PROFILE_DIR.mkdir(exist_ok=True)
    launch_options = {
        "user_data_dir": str(PROFILE_DIR),
        "headless": False,
        "no_viewport": True,
        "args": [
            "--disable-blink-features=AutomationControlled",
            "--start-maximized",
        ],
    }
    separator = "=" * 70
    with sync_playwright() as p:
        context = p.chromium.launch_persistent_context(**launch_options)
        # Reuse the tab the persistent context opens by itself, if there is one.
        if context.pages:
            page = context.pages[0]
        else:
            page = context.new_page()
        page.goto(START_URL)

        print()
        print(separator)
        print(" Přihlas se v otevřeném okně do Outlooku.")
        print(" Až budeš v inboxu (vidíš seznam e-mailů), vrať se sem.")
        print(separator)

        reply = input(" Hotovo? Napiš 'OK' pro uložení session: ")
        confirmed = reply.strip().lower() == "ok"
        if not confirmed:
            print(" Zrušeno — session se neuloží (profil ale zůstává).")
        else:
            try:
                context.storage_state(path=str(STORAGE_STATE))
                print(f" Uloženo: {STORAGE_STATE}")
            except Exception as exc:
                print(f" storage_state se neuložil: {exc}")
            print(f" Persistent profil: {PROFILE_DIR}")
        context.close()
if __name__ == "__main__":
main()
+80
View File
@@ -0,0 +1,80 @@
# enrich_fulltext_v1.0
**Verze:** 1.0
**Datum:** 2026-06-03
**Skript:** `enrich_fulltext_v1.0.py`
## Účel
Pro každý dokument odkazovaný v MongoDB (`soubory.*`) vytáhne **plný text** a uloží do PostgreSQL s GIN `tsvector` indexem pro fulltext vyhledávání.
## Cíl: PostgreSQL `MongoSoubory`
- **host:** 192.168.1.76:5432
- **db:** `MongoSoubory`
- **user:** vladimir.buzalka
- **extension:** `unaccent`, `pg_trgm`
- **text search config:** `soubory` (= simple + unaccent → case- a diakritika-insensitivní)
## Tabulka `documents`
| sloupec | typ | popis |
|---|---|---|
| id | BIGSERIAL | PK |
| mongo_id | TEXT | ObjectId z Mongo |
| study | TEXT | kolekce v Mongo (`42847922MDD3003` / `77242113UCO3001`) |
| path | TEXT | absolutní cesta (UNIQUE s study) |
| rel_path, name, ext | TEXT | doplňková metadata |
| sha256 | TEXT | pro inkrementální kontrolu |
| size_bytes, mtime | | |
| **body** | TEXT | plný extrahovaný text (max 5 MB) |
| body_length | INT | délka v znacích |
| **tsv** | tsvector GENERATED STORED | `to_tsvector('soubory', body)` |
| extracted_at | TIMESTAMPTZ | čas extrakce |
| extractor_version | TEXT | verze tohoto skriptu |
| ok | BOOLEAN | true pokud extrakce proběhla |
| error | TEXT | chybové hlášení |
**Indexy:** GIN nad `tsv`, GIN trigram nad `name`, btree `sha256`, btree `(study, ext)`.
## Podporované přípony
`pdf`, `docx`, `xlsx`, `xlsm`, `pptx`, `eml`, `msg`, `txt`, `csv`
## Inkrementální chování
Soubor se přeskočí pokud v PG už existuje záznam s:
- shodným `sha256`
- shodnou `extractor_version`
- `ok = true`
Jinak se přeparsuje a UPSERT.
## Limity (skip s `error=too_big_...`)
- PDF nad 500 MB
- XLSX nad 200 MB
- ostatní nad 300 MB
- `body` se vždy ořízne na 5 MB UTF-8
## Příklady dotazů (psql)
```sql
-- fulltext (case+diakritika insensitivní)
SELECT study, name, ts_rank_cd(tsv, q) AS rank,
ts_headline('soubory', body, q, 'MaxFragments=2,MinWords=5,MaxWords=15') AS snippet
FROM documents, plainto_tsquery('soubory', 'amendment 3') q
WHERE tsv @@ q
ORDER BY rank DESC
LIMIT 20;
-- jméno obsahuje (trigram, fuzzy)
SELECT study, name FROM documents
WHERE name ILIKE '%protokol%';
-- nejdelsi dokumenty per studie
SELECT study, name, body_length
FROM documents
WHERE ok = true
ORDER BY body_length DESC LIMIT 10;
```
## Spuštění
```
python U:\PythonProject\Janssen\Soubory\enrich_fulltext_v1.0.py
```
Průběh tiskne řádek na soubor: `[n/total] OK pdf 2.3MB protokol.pdf | 12340 znaku 'Protocol amendment ...'`
+416
View File
@@ -0,0 +1,416 @@
"""
==============================================================================
Skript: enrich_fulltext_v1.0.py
Verze: 1.0
Datum: 2026-06-03
Autor: vladimir.buzalka
Popis: Vytahne PLNY TEXT z dokumentu odkazovanych v MongoDB (db: soubory)
a ulozi ho do PostgreSQL (db: MongoSoubory) s GIN tsvector
fulltext indexem.
Zdroje:
- MongoDB 192.168.1.76 db=soubory kolekce=42847922MDD3003, 77242113UCO3001
- PostgreSQL 192.168.1.76 db=MongoSoubory tabulka=documents
Podporovane pripony: pdf, docx, xlsx, xlsm, pptx, eml, msg, txt, csv
Inkrementalne: preskoci soubor, kde v PG existuje radek se shodnym
sha256 a extractor_version a ok=true.
Pri prvnim behu sam vytvori tabulku, indexy a textovou konfiguraci
'soubory' (unaccent + simple) - vyhleda case- a diakritika-insensitivni.
==============================================================================
"""
from __future__ import annotations
import email
import email.policy
import sys
import time
import traceback
from datetime import datetime, timezone
from pathlib import Path
import psycopg
from pymongo import MongoClient
# --- konfigurace ------------------------------------------------------------
# MongoDB source: server, database and the collections (one per study) to read.
MONGO_URI = "mongodb://192.168.1.76:27017"
MONGO_DB = "soubory"
MONGO_COLLECTIONS = ["42847922MDD3003", "77242113UCO3001"]

# PostgreSQL target.
# SECURITY: the password must NOT be stored in this file. It used to be
# hard-coded here and was committed to the repository; treat that password as
# leaked and rotate it. libpq picks the password up by itself from the
# PGPASSWORD environment variable or from the password file
# (~/.pgpass, on Windows %APPDATA%\postgresql\pgpass.conf), so configure one
# of those before running the script.
PG_DSN = ("host=192.168.1.76 port=5432 dbname=MongoSoubory "
          "user=vladimir.buzalka")

# Stored with every extracted row and compared on the next run (see the
# "Inkrementalne" note in the module docstring).
EXTRACTOR_VERSION = "1.0"

MAX_TEXT_BYTES = 5 * 1024 * 1024  # at most 5 MB of extracted text per document
# Per-type input size limits in bytes (paired with the extractors in EXTRACTORS).
MAX_PDF_BYTES = 500 * 1024 * 1024
MAX_XLSX_BYTES = 200 * 1024 * 1024
MAX_GENERIC_BYTES = 300 * 1024 * 1024

# File extensions (lowercase, without the dot) that have an extractor.
SUPPORTED = ("pdf", "docx", "xlsx", "xlsm", "pptx", "eml", "msg", "txt", "csv")
# --- SCHEMA -----------------------------------------------------------------
# DDL for the target database. It enables the unaccent and pg_trgm extensions,
# creates the 'soubory' text search configuration (a copy of 'simple' whose
# word mappings go through unaccent first) when it does not exist yet, and
# creates the documents table with a generated tsvector column plus its
# indexes. Every statement is guarded (IF NOT EXISTS / pg_ts_config lookup).
# Per the module docstring this is applied on the first run; the code that
# executes it is outside this excerpt.
SCHEMA_SQL = """
CREATE EXTENSION IF NOT EXISTS unaccent;
CREATE EXTENSION IF NOT EXISTS pg_trgm;
DO $$
BEGIN
IF NOT EXISTS (SELECT 1 FROM pg_ts_config WHERE cfgname = 'soubory') THEN
CREATE TEXT SEARCH CONFIGURATION soubory ( COPY = simple );
ALTER TEXT SEARCH CONFIGURATION soubory
ALTER MAPPING FOR hword, hword_part, word
WITH unaccent, simple;
END IF;
END$$;
CREATE TABLE IF NOT EXISTS documents (
id BIGSERIAL PRIMARY KEY,
mongo_id TEXT NOT NULL,
study TEXT NOT NULL,
path TEXT NOT NULL,
rel_path TEXT,
name TEXT,
ext TEXT,
sha256 TEXT NOT NULL,
size_bytes BIGINT,
mtime TIMESTAMPTZ,
body TEXT,
body_length INT,
tsv tsvector GENERATED ALWAYS AS (
to_tsvector('soubory'::regconfig, coalesce(body, ''))
) STORED,
extracted_at TIMESTAMPTZ DEFAULT now(),
extractor_version TEXT,
ok BOOLEAN,
error TEXT,
UNIQUE (study, path)
);
CREATE INDEX IF NOT EXISTS documents_tsv_gin ON documents USING gin(tsv);
CREATE INDEX IF NOT EXISTS documents_name_trgm ON documents USING gin(name gin_trgm_ops);
CREATE INDEX IF NOT EXISTS documents_sha256_idx ON documents(sha256);
CREATE INDEX IF NOT EXISTS documents_study_ext_idx ON documents(study, ext);
"""
# --- EXTRAKTORY (vraci string, max MAX_TEXT_BYTES) --------------------------
def _truncate(s: str) -> str:
    """Limit *s* to ``MAX_TEXT_BYTES`` bytes of UTF-8.

    A falsy input yields ``""``. Text that fits is returned unchanged. Longer
    text is cut at the byte limit; a multi-byte character split by the cut is
    dropped.
    """
    if not s:
        return ""
    encoded = s.encode("utf-8", errors="replace")
    fits = len(encoded) <= MAX_TEXT_BYTES
    if fits:
        return s
    clipped = encoded[:MAX_TEXT_BYTES]
    return clipped.decode("utf-8", errors="ignore")
def extract_pdf(path: Path) -> str:
    """Return the text of a PDF, page texts joined by newlines.

    An encrypted file is tried with the empty password; if that call raises,
    ``""`` is returned. Pages whose text extraction raises are skipped.
    Reading stops once the collected character count exceeds
    ``MAX_TEXT_BYTES``; the final size limit is enforced by ``_truncate``.
    """
    from pypdf import PdfReader

    pdf = PdfReader(str(path))
    if pdf.is_encrypted:
        try:
            pdf.decrypt("")
        except Exception:
            return ""

    page_texts = []
    char_count = 0
    for pdf_page in pdf.pages:
        try:
            page_text = pdf_page.extract_text() or ""
        except Exception:
            # one unreadable page must not lose the rest of the document
            continue
        page_texts.append(page_text)
        char_count += len(page_text)
        if char_count > MAX_TEXT_BYTES:
            break
    return _truncate("\n".join(page_texts))
def extract_docx(path: Path) -> str:
    """Return the text of a DOCX file.

    Non-empty paragraphs come first, one per line, followed by every table
    row with its cell texts joined by ``" | "``.
    """
    from docx import Document

    document = Document(str(path))
    lines = []
    for paragraph in document.paragraphs:
        if paragraph.text:
            lines.append(paragraph.text)
    for table in document.tables:
        lines.extend(
            " | ".join(cell.text for cell in row.cells) for row in table.rows
        )
    return _truncate("\n".join(lines))
def extract_xlsx(path: Path) -> str:
    """Return the cell values of an XLSX/XLSM workbook as text.

    Every worksheet contributes a ``# <title>`` line followed by one
    tab-separated line per non-blank row (empty cells become empty strings).
    The workbook is opened read-only with ``data_only=True``. Reading stops
    once the collected character count exceeds ``MAX_TEXT_BYTES``; the final
    size limit is enforced by ``_truncate``.

    The workbook is always closed, also when reading a sheet raises, so the
    file handle held by the read-only workbook is not leaked.
    """
    from openpyxl import load_workbook

    wb = load_workbook(str(path), read_only=True, data_only=True)
    parts = []
    total = 0
    try:
        for ws in wb.worksheets:
            parts.append(f"# {ws.title}")
            for row in ws.iter_rows(values_only=True):
                line = "\t".join("" if v is None else str(v) for v in row)
                if line.strip():
                    parts.append(line)
                    total += len(line)
                    if total > MAX_TEXT_BYTES:
                        break
            if total > MAX_TEXT_BYTES:
                break
    finally:
        wb.close()
    return _truncate("\n".join(parts))
def extract_pptx(path: Path) -> str:
    """Return the text of a PPTX presentation.

    Every slide contributes a ``# slide <n>`` line, one line per non-blank
    paragraph of each shape that has a text frame, and a ``[notes] ...`` line
    when the slide has non-empty speaker notes.

    A notes slide without a notes placeholder has no notes text frame; that
    case is treated as "no notes" instead of failing the whole extraction.
    """
    from pptx import Presentation

    prs = Presentation(str(path))
    parts = []
    for i, slide in enumerate(prs.slides, 1):
        parts.append(f"# slide {i}")
        for shape in slide.shapes:
            if shape.has_text_frame:
                for para in shape.text_frame.paragraphs:
                    line = "".join(run.text for run in para.runs)
                    if line.strip():
                        parts.append(line)
        if slide.has_notes_slide:
            notes_frame = slide.notes_slide.notes_text_frame
            notes = notes_frame.text if notes_frame is not None else ""
            if notes:
                parts.append(f"[notes] {notes}")
    return _truncate("\n".join(parts))
def extract_eml(path: Path) -> str:
    """Return the main headers and plain-text body of an .eml message.

    Headers From/To/Cc/Subject/Date come first, then every text/plain part
    that is not an attachment (multipart) or the single body (non-multipart).
    Parts whose content cannot be decoded, or is not text, are skipped.
    """
    with path.open("rb") as f:
        msg = email.message_from_binary_file(f, policy=email.policy.default)

    head = []
    for k in ("From", "To", "Cc", "Subject", "Date"):
        v = msg.get(k)
        if v:
            head.append(f"{k}: {v}")
    parts = ["\n".join(head)]

    if msg.is_multipart():
        candidates = [
            part for part in msg.walk()
            if part.get_content_type() == "text/plain" and not part.get_filename()
        ]
    else:
        candidates = [msg]

    for part in candidates:
        try:
            content = part.get_content()
        except Exception:
            # best effort: an undecodable part must not lose the rest of the mail
            continue
        # get_content() returns bytes for non-text payloads; joining those
        # with str would raise TypeError
        if isinstance(content, str):
            parts.append(content)
    return _truncate("\n\n".join(parts))
def extract_msg(path: Path) -> str:
    """Return the headers and body of an Outlook .msg file as plain text.

    Header lines (Subject, From, To, Cc, Date) are emitted only when the
    corresponding value is truthy, then a blank line, then the body.
    """
    import extract_msg

    header_fields = (
        ("Subject", "subject"),
        ("From", "sender"),
        ("To", "to"),
        ("Cc", "cc"),
        ("Date", "date"),
    )
    with extract_msg.openMsg(str(path)) as message:
        header_lines = []
        for label, attr in header_fields:
            value = getattr(message, attr)
            if value:
                header_lines.append(f"{label}: {value}")
        return _truncate("\n".join(header_lines) + "\n\n" + (message.body or ""))
def extract_text(path: Path) -> str:
    """Return the first MAX_TEXT_BYTES bytes of a text file, decoded.

    Encodings are tried in order: UTF-8 (with optional BOM), cp1250, latin-1.
    Only the needed prefix is read from disk, and a multi-byte UTF-8 character
    cut by the size limit no longer forces the cp1250 fallback.
    """
    import codecs

    # read one extra byte so we can tell whether the file was actually cut
    with path.open("rb") as f:
        data = f.read(MAX_TEXT_BYTES + 1)
    was_cut = len(data) > MAX_TEXT_BYTES
    data = data[:MAX_TEXT_BYTES]

    try:
        # final=False tolerates an incomplete trailing sequence (from the cut)
        # while still rejecting genuinely invalid UTF-8
        decoder = codecs.getincrementaldecoder("utf-8-sig")()
        return decoder.decode(data, final=not was_cut)
    except UnicodeDecodeError:
        pass
    try:
        return data.decode("cp1250")
    except UnicodeDecodeError:
        # latin-1 maps every byte, so this cannot fail
        return data.decode("latin-1")
# Dispatch table: file extension -> (extractor function, size limit in bytes).
# process_collection records files larger than the limit as "too_big" without
# calling the extractor.
EXTRACTORS = {
"pdf": (extract_pdf, MAX_PDF_BYTES),
"docx": (extract_docx, MAX_GENERIC_BYTES),
"xlsx": (extract_xlsx, MAX_XLSX_BYTES),
"xlsm": (extract_xlsx, MAX_XLSX_BYTES),
"pptx": (extract_pptx, MAX_GENERIC_BYTES),
"eml": (extract_eml, MAX_GENERIC_BYTES),
"msg": (extract_msg, MAX_GENERIC_BYTES),
"txt": (extract_text, MAX_GENERIC_BYTES),
"csv": (extract_text, MAX_GENERIC_BYTES),
}
def _short(s, n=40):
if not s:
return ""
s = str(s).replace("\n", " ").replace("\r", " ").strip()
return s if len(s) <= n else s[:n] + "..."
def _now() -> datetime:
return datetime.now(tz=timezone.utc)
# --- HLAVNI SMYCKA ----------------------------------------------------------
def process_collection(pg: psycopg.Connection, mongo_coll, study: str) -> dict:
    """Extract text for one Mongo collection and upsert it into PostgreSQL.

    A document is skipped when PostgreSQL already holds a row for its path
    with the same sha256, the current EXTRACTOR_VERSION and ok = true.
    Everything else is (re)extracted and written in batches of 50 rows.

    Returns counters: study, processed, ok, errors, skipped, too_big.
    """
    # what PG already knows for this study: path -> (sha256, version, ok)
    with pg.cursor() as cur:
        cur.execute(
            "SELECT path, sha256, extractor_version, ok FROM documents WHERE study = %s",
            (study,),
        )
        known = {rec[0]: (rec[1], rec[2], rec[3]) for rec in cur.fetchall()}

    criteria = {"ext": {"$in": list(EXTRACTORS.keys())}, "deleted_at": {"$exists": False}}
    wanted_fields = {"_id": 1, "path": 1, "rel_path": 1, "name": 1, "ext": 1,
                     "sha256": 1, "size_bytes": 1, "mtime": 1}
    cursor = mongo_coll.find(criteria, wanted_fields, no_cursor_timeout=True)

    stats = {"processed": 0, "ok": 0, "errors": 0, "skipped": 0, "too_big": 0}
    pending = []
    total_pending = mongo_coll.count_documents(criteria)
    print(f"[{study}] kandidatu v Mongo: {total_pending}")

    try:
        for seq, doc in enumerate(cursor, 1):
            stored = known.get(doc["path"])
            if stored and stored[0] == doc.get("sha256") and stored[1] == EXTRACTOR_VERSION and stored[2]:
                stats["skipped"] += 1
                continue

            ext = doc["ext"]
            extractor, max_bytes = EXTRACTORS[ext]
            path = Path(doc["path"])
            size = doc.get("size_bytes") or 0
            row = {
                "mongo_id": str(doc["_id"]),
                "study": study,
                "path": doc["path"],
                "rel_path": doc.get("rel_path"),
                "name": doc.get("name"),
                "ext": ext,
                "sha256": doc.get("sha256"),
                "size_bytes": doc.get("size_bytes"),
                "mtime": doc.get("mtime"),
                "body": None,
                "body_length": 0,
                "extracted_at": _now(),
                "extractor_version": EXTRACTOR_VERSION,
                "ok": False,
                "error": None,
            }

            if not path.exists():
                status = "ERR"
                row["error"] = detail = "file_missing"
                stats["errors"] += 1
            elif size > max_bytes:
                status = "BIG"
                row["error"] = f"too_big_>{max_bytes}"
                detail = f"too_big_>{max_bytes//1024//1024}MB"
                stats["too_big"] += 1
            else:
                status = "OK "
                try:
                    body = extractor(path) or ""
                    row["body"] = body or None
                    row["body_length"] = len(body)
                    row["ok"] = True
                    stats["ok"] += 1
                    detail = f"{len(body)} znaku {_short(body, 60)!r}"
                except Exception as e:
                    # a failing file is recorded as an error row, not fatal
                    status = "ERR"
                    row["error"] = f"{type(e).__name__}: {e}"[:500]
                    detail = row["error"][:80]
                    stats["errors"] += 1

            pending.append(row)
            stats["processed"] += 1
            print(f" [{seq:>4}/{total_pending}] {status} {ext:<4} {size / 1024 / 1024:6.1f}MB "
                  f"{path.name} | {detail}", flush=True)
            if len(pending) >= 50:
                _flush(pg, pending)
                pending.clear()
    finally:
        cursor.close()

    if pending:
        _flush(pg, pending)
    return {"study": study, **stats}
# Upsert keyed on (study, path): inserts a new row or overwrites every
# extracted column of the existing one. Uses named placeholders, so each
# parameter set must be a dict with exactly these keys.
UPSERT_SQL = """
INSERT INTO documents
(mongo_id, study, path, rel_path, name, ext, sha256, size_bytes, mtime,
body, body_length, extracted_at, extractor_version, ok, error)
VALUES
(%(mongo_id)s, %(study)s, %(path)s, %(rel_path)s, %(name)s, %(ext)s, %(sha256)s,
%(size_bytes)s, %(mtime)s, %(body)s, %(body_length)s, %(extracted_at)s,
%(extractor_version)s, %(ok)s, %(error)s)
ON CONFLICT (study, path) DO UPDATE SET
mongo_id = EXCLUDED.mongo_id,
rel_path = EXCLUDED.rel_path,
name = EXCLUDED.name,
ext = EXCLUDED.ext,
sha256 = EXCLUDED.sha256,
size_bytes = EXCLUDED.size_bytes,
mtime = EXCLUDED.mtime,
body = EXCLUDED.body,
body_length = EXCLUDED.body_length,
extracted_at = EXCLUDED.extracted_at,
extractor_version = EXCLUDED.extractor_version,
ok = EXCLUDED.ok,
error = EXCLUDED.error
"""
def _flush(pg: psycopg.Connection, rows: list[dict]) -> None:
    """Upsert *rows* into ``documents`` and commit them as one transaction.

    An empty batch is a no-op. If the batch fails, the transaction is rolled
    back so the connection is not left in an aborted state, and the original
    exception is re-raised.
    """
    if not rows:
        return
    try:
        with pg.cursor() as cur:
            cur.executemany(UPSERT_SQL, rows)
        pg.commit()
    except Exception:
        try:
            pg.rollback()
        except psycopg.Error:
            # connection already unusable; the original error is what matters
            pass
        raise
def main() -> int:
    """Create the schema, process every configured collection, print a summary.

    Both database connections are closed even when processing fails.
    Returns 0 on success; errors propagate to the caller.
    """
    t0 = time.time()
    print("Pripojuji se k PostgreSQL...")
    pg = psycopg.connect(PG_DSN, connect_timeout=10)
    mongo = None
    try:
        with pg.cursor() as cur:
            cur.execute(SCHEMA_SQL)
        pg.commit()
        print("Schema OK.")

        print("Pripojuji se k MongoDB...")
        mongo = MongoClient(MONGO_URI, serverSelectionTimeoutMS=5000)
        mongo.admin.command("ping")
        db = mongo[MONGO_DB]
        print("Mongo OK.")

        results = []
        for name in MONGO_COLLECTIONS:
            results.append(process_collection(pg, db[name], name))
    finally:
        # release both connections on success and on failure alike
        if mongo is not None:
            mongo.close()
        pg.close()

    print("\n=== SHRNUTI ===")
    for r in results:
        print(f" {r['study']}: processed={r['processed']} ok={r['ok']} "
              f"errors={r['errors']} skipped={r['skipped']} too_big={r['too_big']}")
    print(f"\nCelkem trvalo: {time.time() - t0:.1f} s")
    return 0
if __name__ == "__main__":
    # Script entry point: exit with main()'s return code, print a short
    # notice on Ctrl+C, and print the traceback and exit 1 on any other error.
    try:
        exit_code = main()
    except KeyboardInterrupt:
        print("\nPreruseno uzivatelem")
    except Exception:
        traceback.print_exc()
        sys.exit(1)
    else:
        raise SystemExit(exit_code)
+22
View File
@@ -0,0 +1,22 @@
# enrich_fulltext_v1.1
**Verze:** 1.1
**Datum:** 2026-06-03
**Skript:** `enrich_fulltext_v1.1.py`
## Změny proti v1.0
- **NUL bajty (0x00) v textu** — PG TEXT je odmítá. v1.1 odstraní všechny `\x00` a ostatní controly (kromě `\n \r \t`) ve společné funkci `_clean_for_pg`, navíc bezpečnostní strip i v `_flush` před UPSERT.
- **DOCX fallback** — pokud python-docx hodí výjimku (typicky `"no tr above topmost tr in w:tbl"` u VTMF formulářů s rozbitými tabulkami), v1.1 sáhne přímo do `word/document.xml` v ZIPu a regexem vytáhne text z `<w:t>` elementů. Přijde o strukturu tabulek, ale text zachrání.
- `extractor_version` zvýšena na `1.1` → všechny řádky z v1.0 se přeparsují (původní řádky stejně pravděpodobně chyběly kvůli pádu).
## Vše ostatní
Beze změny proti [v1.0](Trash/enrich_fulltext_v1.0.md):
- Tabulka `documents` v PG `MongoSoubory` (192.168.1.76:5432)
- Text search config `soubory` (simple + unaccent)
- Limity: PDF 500 MB, XLSX 200 MB, ostatní 300 MB; text max 5 MB
- Inkrementálně podle `sha256` + `extractor_version`
## Spuštění
```
python U:\PythonProject\Janssen\Soubory\enrich_fulltext_v1.1.py
```
+457
View File
@@ -0,0 +1,457 @@
"""
==============================================================================
Skript: enrich_fulltext_v1.1.py
Verze: 1.1
Datum: 2026-06-03
Autor: vladimir.buzalka
Popis: Vytahne PLNY TEXT z dokumentu odkazovanych v MongoDB (db: soubory)
a ulozi ho do PostgreSQL (db: MongoSoubory) s GIN tsvector indexem.
Zmeny proti v1.0:
- PG odmita NUL (0x00) bajty v TEXT -> v _truncate se vsechny NULy odstrani
(i jine controly krome \\n \\r \\t)
- DOCX fallback: pokud python-docx selze (typicky "no tr above topmost tr
in w:tbl" u rozbitych tabulek), pokusi se primy raw extract z word/document.xml
pres regex - prijde o strukturu tabulek, ale zachrani text
- drobnost: posunul jsem extractor_version na "1.1" -> stare radky se preparsuji
Cilove ulozeni:
- MongoDB 192.168.1.76 db=soubory kolekce=42847922MDD3003, 77242113UCO3001
- PostgreSQL 192.168.1.76 db=MongoSoubory tabulka=documents
Podporovane pripony: pdf, docx, xlsx, xlsm, pptx, eml, msg, txt, csv
==============================================================================
"""
from __future__ import annotations
import email
import email.policy
import re
import sys
import time
import traceback
import zipfile
from datetime import datetime, timezone
from pathlib import Path
import psycopg
from pymongo import MongoClient
# --- konfigurace ------------------------------------------------------------
import os  # needed only for the DSN override below

MONGO_URI = "mongodb://192.168.1.76:27017"
MONGO_DB = "soubory"
MONGO_COLLECTIONS = ["42847922MDD3003", "77242113UCO3001"]
# SECURITY: the fallback DSN below embeds a plaintext password that is
# committed to the repository. Set ENRICH_PG_DSN in the environment to
# override it; the password should be rotated and the literal removed.
PG_DSN = os.environ.get("ENRICH_PG_DSN") or (
    "host=192.168.1.76 port=5432 dbname=MongoSoubory "
    "user=vladimir.buzalka password=Vlado7309208104++"
)
# stored with every row; rows written with a different version are re-extracted
EXTRACTOR_VERSION = "1.1"
# cap on extracted text per document
MAX_TEXT_BYTES = 5 * 1024 * 1024
# per-type limits on the source file size
MAX_PDF_BYTES = 500 * 1024 * 1024
MAX_XLSX_BYTES = 200 * 1024 * 1024
MAX_GENERIC_BYTES = 300 * 1024 * 1024
SUPPORTED = ("pdf", "docx", "xlsx", "xlsm", "pptx", "eml", "msg", "txt", "csv")
# --- SCHEMA -----------------------------------------------------------------
# Idempotent schema bootstrap, executed on every start: extensions, the
# 'soubory' text search configuration (simple + unaccent), the documents
# table with a generated tsvector column, and its indexes.
# NOTE(review): tsv is built from the whole body; the v1.2 notes report that
# a ~5 MB body exceeds PostgreSQL's tsvector size limit and fails the upsert.
SCHEMA_SQL = """
CREATE EXTENSION IF NOT EXISTS unaccent;
CREATE EXTENSION IF NOT EXISTS pg_trgm;
DO $$
BEGIN
IF NOT EXISTS (SELECT 1 FROM pg_ts_config WHERE cfgname = 'soubory') THEN
CREATE TEXT SEARCH CONFIGURATION soubory ( COPY = simple );
ALTER TEXT SEARCH CONFIGURATION soubory
ALTER MAPPING FOR hword, hword_part, word
WITH unaccent, simple;
END IF;
END$$;
CREATE TABLE IF NOT EXISTS documents (
id BIGSERIAL PRIMARY KEY,
mongo_id TEXT NOT NULL,
study TEXT NOT NULL,
path TEXT NOT NULL,
rel_path TEXT,
name TEXT,
ext TEXT,
sha256 TEXT NOT NULL,
size_bytes BIGINT,
mtime TIMESTAMPTZ,
body TEXT,
body_length INT,
tsv tsvector GENERATED ALWAYS AS (
to_tsvector('soubory'::regconfig, coalesce(body, ''))
) STORED,
extracted_at TIMESTAMPTZ DEFAULT now(),
extractor_version TEXT,
ok BOOLEAN,
error TEXT,
UNIQUE (study, path)
);
CREATE INDEX IF NOT EXISTS documents_tsv_gin ON documents USING gin(tsv);
CREATE INDEX IF NOT EXISTS documents_name_trgm ON documents USING gin(name gin_trgm_ops);
CREATE INDEX IF NOT EXISTS documents_sha256_idx ON documents(sha256);
CREATE INDEX IF NOT EXISTS documents_study_ext_idx ON documents(study, ext);
"""
# --- HELPERY ----------------------------------------------------------------
# odstrani 0x00 a ostatni controly krome whitespace
_CTRL_RX = re.compile(r"[\x00-\x08\x0b\x0c\x0e-\x1f]")
def _clean_for_pg(s: str) -> str:
if not s:
return ""
return _CTRL_RX.sub("", s)
def _truncate(s: str) -> str:
    """Return *s* cleaned for PostgreSQL and limited to MAX_TEXT_BYTES of UTF-8.

    Control characters are stripped first via ``_clean_for_pg``. A multi-byte
    character split by the size cut is dropped.
    """
    cleaned = _clean_for_pg(s or "")
    if not cleaned:
        return ""
    encoded = cleaned.encode("utf-8", errors="replace")
    if len(encoded) > MAX_TEXT_BYTES:
        return encoded[:MAX_TEXT_BYTES].decode("utf-8", errors="ignore")
    return cleaned
# --- EXTRAKTORY -------------------------------------------------------------
def extract_pdf(path: Path) -> str:
    """Return the concatenated page text of a PDF, capped by ``_truncate``.

    Encrypted files are tried with an empty password; if the decrypt call
    raises, "" is returned. Pages whose text extraction raises are skipped.
    """
    from pypdf import PdfReader

    reader = PdfReader(str(path))
    if reader.is_encrypted:
        try:
            reader.decrypt("")
        except Exception:
            return ""

    chunks = []
    collected = 0
    for page in reader.pages:
        try:
            page_text = page.extract_text() or ""
        except Exception:
            continue
        chunks.append(page_text)
        collected += len(page_text)
        # stop early: anything beyond the cap would be cut off by _truncate anyway
        if collected > MAX_TEXT_BYTES:
            break
    return _truncate("\n".join(chunks))
# regex pro DOCX fallback - vytahne <w:t>...</w:t>
_DOCX_WT_RX = re.compile(r"<w:t[^>]*>([^<]*)</w:t>", re.DOTALL)
_DOCX_WP_END_RX = re.compile(r"</w:p>")
def _docx_raw_text(path: Path) -> str:
"""Fallback - cte primo word/document.xml ze ZIPu."""
with zipfile.ZipFile(str(path)) as z:
try:
xml = z.read("word/document.xml").decode("utf-8", errors="replace")
except KeyError:
return ""
xml = _DOCX_WP_END_RX.sub("\n", xml)
return "\n".join(m.group(1) for m in _DOCX_WT_RX.finditer(xml))
def extract_docx(path: Path) -> str:
    """Return the text of a DOCX: non-empty paragraphs, then table rows.

    Each table row becomes one line with cell texts joined by " | ". If
    python-docx raises for any reason, the raw XML fallback
    ``_docx_raw_text`` is used instead.
    """
    from docx import Document

    try:
        document = Document(str(path))
        lines = [para.text for para in document.paragraphs if para.text]
        lines.extend(
            " | ".join(cell.text for cell in row.cells)
            for table in document.tables
            for row in table.rows
        )
        return _truncate("\n".join(lines))
    except Exception:
        # python-docx could not handle the file: salvage text from the raw XML
        return _truncate(_docx_raw_text(path))
def extract_xlsx(path: Path) -> str:
    """Return the cell values of an XLSX/XLSM workbook as tab-separated lines.

    Every sheet starts with a "# <title>" line; blank rows are dropped.
    Reading stops once more than MAX_TEXT_BYTES characters were collected and
    the result is passed through ``_truncate``.
    """
    from openpyxl import load_workbook

    wb = load_workbook(str(path), read_only=True, data_only=True)
    parts = []
    total = 0
    try:
        for ws in wb.worksheets:
            parts.append(f"# {ws.title}")
            for row in ws.iter_rows(values_only=True):
                line = "\t".join("" if v is None else str(v) for v in row)
                if line.strip():
                    parts.append(line)
                    total += len(line)
                    if total > MAX_TEXT_BYTES:
                        break
            if total > MAX_TEXT_BYTES:
                break
    finally:
        # read-only workbooks keep the file open until close(); release it
        # even when a sheet fails to parse
        wb.close()
    return _truncate("\n".join(parts))
def extract_pptx(path: Path) -> str:
    """Return slide text and speaker notes of a PPTX presentation.

    Every slide starts with a "# slide <n>" line, followed by one line per
    non-blank paragraph of each text-frame shape and an optional
    "[notes] ..." line.
    """
    from pptx import Presentation

    prs = Presentation(str(path))
    parts = []
    for i, slide in enumerate(prs.slides, 1):
        parts.append(f"# slide {i}")
        for shape in slide.shapes:
            if not shape.has_text_frame:
                continue
            for para in shape.text_frame.paragraphs:
                line = "".join(run.text for run in para.runs)
                if line.strip():
                    parts.append(line)
        if slide.has_notes_slide:
            # notes_text_frame is None when the notes slide has no notes
            # placeholder; that must not fail the whole presentation
            notes_frame = slide.notes_slide.notes_text_frame
            notes = notes_frame.text if notes_frame is not None else ""
            if notes:
                parts.append(f"[notes] {notes}")
    return _truncate("\n".join(parts))
def extract_eml(path: Path) -> str:
    """Return the main headers and plain-text body of an .eml message.

    Headers From/To/Cc/Subject/Date come first, then every text/plain part
    that is not an attachment (multipart) or the single body (non-multipart).
    Parts whose content cannot be decoded, or is not text, are skipped.
    """
    with path.open("rb") as f:
        msg = email.message_from_binary_file(f, policy=email.policy.default)

    head = []
    for k in ("From", "To", "Cc", "Subject", "Date"):
        v = msg.get(k)
        if v:
            head.append(f"{k}: {v}")
    parts = ["\n".join(head)]

    if msg.is_multipart():
        candidates = [
            part for part in msg.walk()
            if part.get_content_type() == "text/plain" and not part.get_filename()
        ]
    else:
        candidates = [msg]

    for part in candidates:
        try:
            content = part.get_content()
        except Exception:
            # best effort: an undecodable part must not lose the rest of the mail
            continue
        # get_content() returns bytes for non-text payloads; joining those
        # with str would raise TypeError
        if isinstance(content, str):
            parts.append(content)
    return _truncate("\n\n".join(parts))
def extract_msg(path: Path) -> str:
    """Return the headers and body of an Outlook .msg file as plain text.

    Header lines (Subject, From, To, Cc, Date) are emitted only when the
    corresponding value is truthy, then a blank line, then the body.
    """
    import extract_msg

    header_fields = (
        ("Subject", "subject"),
        ("From", "sender"),
        ("To", "to"),
        ("Cc", "cc"),
        ("Date", "date"),
    )
    with extract_msg.openMsg(str(path)) as message:
        header_lines = []
        for label, attr in header_fields:
            value = getattr(message, attr)
            if value:
                header_lines.append(f"{label}: {value}")
        return _truncate("\n".join(header_lines) + "\n\n" + (message.body or ""))
def extract_text(path: Path) -> str:
    """Return the first MAX_TEXT_BYTES bytes of a text file, decoded and cleaned.

    Encodings are tried in order: UTF-8 (with optional BOM), cp1250, latin-1.
    Only the needed prefix is read from disk, and a multi-byte UTF-8 character
    cut by the size limit no longer forces the cp1250 fallback.
    """
    import codecs

    # read one extra byte so we can tell whether the file was actually cut
    with path.open("rb") as f:
        data = f.read(MAX_TEXT_BYTES + 1)
    was_cut = len(data) > MAX_TEXT_BYTES
    data = data[:MAX_TEXT_BYTES]

    try:
        # final=False tolerates an incomplete trailing sequence (from the cut)
        # while still rejecting genuinely invalid UTF-8
        decoder = codecs.getincrementaldecoder("utf-8-sig")()
        return _truncate(decoder.decode(data, final=not was_cut))
    except UnicodeDecodeError:
        pass
    try:
        return _truncate(data.decode("cp1250"))
    except UnicodeDecodeError:
        # latin-1 maps every byte, so this cannot fail
        return _truncate(data.decode("latin-1"))
# Dispatch table: file extension -> (extractor function, size limit in bytes).
# process_collection records files larger than the limit as "too_big" without
# calling the extractor.
EXTRACTORS = {
"pdf": (extract_pdf, MAX_PDF_BYTES),
"docx": (extract_docx, MAX_GENERIC_BYTES),
"xlsx": (extract_xlsx, MAX_XLSX_BYTES),
"xlsm": (extract_xlsx, MAX_XLSX_BYTES),
"pptx": (extract_pptx, MAX_GENERIC_BYTES),
"eml": (extract_eml, MAX_GENERIC_BYTES),
"msg": (extract_msg, MAX_GENERIC_BYTES),
"txt": (extract_text, MAX_GENERIC_BYTES),
"csv": (extract_text, MAX_GENERIC_BYTES),
}
def _short(s, n=40):
if not s:
return ""
s = str(s).replace("\n", " ").replace("\r", " ").strip()
return s if len(s) <= n else s[:n] + "..."
def _now() -> datetime:
return datetime.now(tz=timezone.utc)
# --- HLAVNI SMYCKA ----------------------------------------------------------
def process_collection(pg: psycopg.Connection, mongo_coll, study: str) -> dict:
    """Extract text for one Mongo collection and upsert it into PostgreSQL.

    A document is skipped when PostgreSQL already holds a row for its path
    with the same sha256, the current EXTRACTOR_VERSION and ok = true.
    Everything else is (re)extracted and written in batches of 50 rows.

    Returns counters: study, processed, ok, errors, skipped, too_big.
    """
    # what PG already knows for this study: path -> (sha256, version, ok)
    with pg.cursor() as cur:
        cur.execute(
            "SELECT path, sha256, extractor_version, ok FROM documents WHERE study = %s",
            (study,),
        )
        known = {rec[0]: (rec[1], rec[2], rec[3]) for rec in cur.fetchall()}

    criteria = {"ext": {"$in": list(EXTRACTORS.keys())}, "deleted_at": {"$exists": False}}
    wanted_fields = {"_id": 1, "path": 1, "rel_path": 1, "name": 1, "ext": 1,
                     "sha256": 1, "size_bytes": 1, "mtime": 1}
    cursor = mongo_coll.find(criteria, wanted_fields, no_cursor_timeout=True)

    stats = {"processed": 0, "ok": 0, "errors": 0, "skipped": 0, "too_big": 0}
    pending = []
    total_pending = mongo_coll.count_documents(criteria)
    print(f"[{study}] kandidatu v Mongo: {total_pending}")

    try:
        for seq, doc in enumerate(cursor, 1):
            stored = known.get(doc["path"])
            if stored and stored[0] == doc.get("sha256") and stored[1] == EXTRACTOR_VERSION and stored[2]:
                stats["skipped"] += 1
                continue

            ext = doc["ext"]
            extractor, max_bytes = EXTRACTORS[ext]
            path = Path(doc["path"])
            size = doc.get("size_bytes") or 0
            row = {
                "mongo_id": str(doc["_id"]),
                "study": study,
                "path": doc["path"],
                "rel_path": doc.get("rel_path"),
                "name": doc.get("name"),
                "ext": ext,
                "sha256": doc.get("sha256"),
                "size_bytes": doc.get("size_bytes"),
                "mtime": doc.get("mtime"),
                "body": None,
                "body_length": 0,
                "extracted_at": _now(),
                "extractor_version": EXTRACTOR_VERSION,
                "ok": False,
                "error": None,
            }

            if not path.exists():
                status = "ERR"
                row["error"] = detail = "file_missing"
                stats["errors"] += 1
            elif size > max_bytes:
                status = "BIG"
                row["error"] = f"too_big_>{max_bytes}"
                detail = f"too_big_>{max_bytes//1024//1024}MB"
                stats["too_big"] += 1
            else:
                status = "OK "
                try:
                    body = extractor(path) or ""
                    row["body"] = body or None
                    row["body_length"] = len(body)
                    row["ok"] = True
                    stats["ok"] += 1
                    detail = f"{len(body)} znaku {_short(body, 60)!r}"
                except Exception as e:
                    # a failing file is recorded as an error row, not fatal
                    status = "ERR"
                    row["error"] = f"{type(e).__name__}: {e}"[:500]
                    detail = row["error"][:80]
                    stats["errors"] += 1

            pending.append(row)
            stats["processed"] += 1
            print(f" [{seq:>4}/{total_pending}] {status} {ext:<4} {size / 1024 / 1024:6.1f}MB "
                  f"{path.name} | {detail}", flush=True)
            if len(pending) >= 50:
                _flush(pg, pending)
                pending.clear()
    finally:
        cursor.close()

    if pending:
        _flush(pg, pending)
    return {"study": study, **stats}
# Upsert keyed on (study, path): inserts a new row or overwrites every
# extracted column of the existing one. Uses named placeholders, so each
# parameter set must be a dict with exactly these keys.
UPSERT_SQL = """
INSERT INTO documents
(mongo_id, study, path, rel_path, name, ext, sha256, size_bytes, mtime,
body, body_length, extracted_at, extractor_version, ok, error)
VALUES
(%(mongo_id)s, %(study)s, %(path)s, %(rel_path)s, %(name)s, %(ext)s, %(sha256)s,
%(size_bytes)s, %(mtime)s, %(body)s, %(body_length)s, %(extracted_at)s,
%(extractor_version)s, %(ok)s, %(error)s)
ON CONFLICT (study, path) DO UPDATE SET
mongo_id = EXCLUDED.mongo_id,
rel_path = EXCLUDED.rel_path,
name = EXCLUDED.name,
ext = EXCLUDED.ext,
sha256 = EXCLUDED.sha256,
size_bytes = EXCLUDED.size_bytes,
mtime = EXCLUDED.mtime,
body = EXCLUDED.body,
body_length = EXCLUDED.body_length,
extracted_at = EXCLUDED.extracted_at,
extractor_version = EXCLUDED.extractor_version,
ok = EXCLUDED.ok,
error = EXCLUDED.error
"""
def _flush(pg: psycopg.Connection, rows: list[dict]) -> None:
    """Upsert *rows* into ``documents`` and commit them as one transaction.

    As a last safety net, ``body`` and ``error`` are cleaned of NUL/control
    characters in place before sending. An empty batch is a no-op. If the
    batch fails, the transaction is rolled back so the connection is not left
    in an aborted state, and the original exception is re-raised.
    """
    if not rows:
        return
    for r in rows:
        for key in ("body", "error"):
            if r.get(key):
                r[key] = _clean_for_pg(r[key])
    try:
        with pg.cursor() as cur:
            cur.executemany(UPSERT_SQL, rows)
        pg.commit()
    except Exception:
        try:
            pg.rollback()
        except psycopg.Error:
            # connection already unusable; the original error is what matters
            pass
        raise
def main() -> int:
    """Create the schema, process every configured collection, print a summary.

    Both database connections are closed even when processing fails.
    Returns 0 on success; errors propagate to the caller.
    """
    t0 = time.time()
    print("Pripojuji se k PostgreSQL...")
    pg = psycopg.connect(PG_DSN, connect_timeout=10)
    mongo = None
    try:
        with pg.cursor() as cur:
            cur.execute(SCHEMA_SQL)
        pg.commit()
        print("Schema OK.")

        print("Pripojuji se k MongoDB...")
        mongo = MongoClient(MONGO_URI, serverSelectionTimeoutMS=5000)
        mongo.admin.command("ping")
        db = mongo[MONGO_DB]
        print("Mongo OK.")

        results = []
        for name in MONGO_COLLECTIONS:
            results.append(process_collection(pg, db[name], name))
    finally:
        # release both connections on success and on failure alike
        if mongo is not None:
            mongo.close()
        pg.close()

    print("\n=== SHRNUTI ===")
    for r in results:
        print(f" {r['study']}: processed={r['processed']} ok={r['ok']} "
              f"errors={r['errors']} skipped={r['skipped']} too_big={r['too_big']}")
    print(f"\nCelkem trvalo: {time.time() - t0:.1f} s")
    return 0
if __name__ == "__main__":
    # Script entry point: exit with main()'s return code, print a short
    # notice on Ctrl+C, and print the traceback and exit 1 on any other error.
    try:
        exit_code = main()
    except KeyboardInterrupt:
        print("\nPreruseno uzivatelem")
    except Exception:
        traceback.print_exc()
        sys.exit(1)
    else:
        raise SystemExit(exit_code)
+46
View File
@@ -0,0 +1,46 @@
# enrich_files_v1.0
**Verze:** 1.0
**Datum:** 2026-06-03
**Skript:** `enrich_files_v1.0.py`
## Účel
Doplnit do existujících záznamů v MongoDB `soubory.*` pole `content.*` parsovaná z obsahu souborů.
Spouští se **až po** [scan_files_v1.0.py](scan_files_v1.0.md).
## Podporované přípony a pole
| ext | knihovna | pole v `content` |
|---|---|---|
| pdf | pypdf | pages, encrypted, author, title, subject, creator, producer, created, modified, text_head |
| docx | python-docx | author, title, subject, last_modified_by, paragraphs, words, created, modified, text_head |
| xlsx, xlsm | openpyxl | total_sheets, sheets[{name,rows,cols}], author, title, subject, last_modified_by, created, modified |
| pptx | python-pptx | slides, author, title, subject, last_modified_by, created, modified, text_head (z prvních 3 snímků) |
| eml | stdlib email | subject, from, to, cc, date, has_attachments, attachments[], body_head |
| msg | extract_msg | totéž co eml |
Společná pole vždy: `ok` (bool), `parsed_at`, `parser_version`, `sha256_at_parse`. Při chybě `error` (název výjimky + zpráva).
## Inkrementální chování
Zpracují se jen dokumenty kde:
- `content` chybí, NEBO
- `content.parser_version` != aktuální verze (1.0), NEBO
- `content.sha256_at_parse` != aktuální `sha256` (soubor se změnil)
Při dalším spuštění **přidá** jen nové/změněné. Při zvýšení verze parseru přeparsuje vše.
## Limity (skip)
- PDF nad 500 MB → ok=False, error="too_big_..."
- XLSX nad 200 MB → ok=False
- ostatní nad 300 MB → ok=False
`text_head` max 2000 znaků.
## Spuštění
```
python U:\PythonProject\Janssen\Soubory\enrich_files_v1.0.py
```
## Plán
Po doběhnutí ověřit `content.ok` rate, případně doladit (chybové vzory) a teprve pak stavět `MCP_SOUBORY` server.
+388
View File
@@ -0,0 +1,388 @@
"""
==============================================================================
Skript: enrich_files_v1.0.py
Verze: 1.0
Datum: 2026-06-03
Autor: vladimir.buzalka
Popis: Doplni metadata z obsahu souboru (PDF/DOCX/XLSX/PPTX/EML/MSG)
do existujicich zaznamu v MongoDB (db: soubory).
Pole se uklada do podobjektu `content`:
- common: ok (bool), error (str|None), parsed_at, parser_version, sha256_at_parse
- pdf: pages, author, title, subject, creator, producer,
created, modified, encrypted, text_head (prvni stranka, max 2000 znaku)
- docx: author, title, subject, last_modified_by, paragraphs,
words, created, modified, text_head
- xlsx: sheets [{name, rows, cols}], total_sheets,
author, title, subject, last_modified_by, created, modified
- pptx: slides, author, title, subject, last_modified_by,
created, modified, text_head (text z prvnich 3 snimku)
- eml: subject, from, to, cc, date, has_attachments,
attachments [filenames], body_head
- msg: same as eml
Inkrementalni:
- preskaci soubor, kde content.sha256_at_parse == aktualni sha256
a content.parser_version == aktualni verze
- pri zmene obsahu (jiny sha256) prepocita
- pri chybe ulozi content.error a content.ok=False
MongoDB: 192.168.1.76:27017
DB: soubory
==============================================================================
"""
from __future__ import annotations
import email
import email.policy
import sys
import time
import traceback
from datetime import datetime, timezone
from pathlib import Path
from pymongo import MongoClient, UpdateOne
# MongoDB connection and the collections to enrich
MONGO_URI = "mongodb://192.168.1.76:27017"
DB_NAME = "soubory"
COLLECTIONS = ["42847922MDD3003", "77242113UCO3001"]
# stored as content.parser_version; documents parsed by another version are re-parsed
PARSER_VERSION = "1.0"
# default maximum length (characters) of text_head / body_head
TEXT_HEAD_LIMIT = 2000
# size limits for large files - so the script does not get stuck on a 1 GB PDF
MAX_PDF_BYTES = 500 * 1024 * 1024 # 500 MB
MAX_XLSX_BYTES = 200 * 1024 * 1024
MAX_GENERIC_BYTES = 300 * 1024 * 1024
def _now() -> datetime:
return datetime.now(tz=timezone.utc)
def _truncate(s: str | None, n: int = TEXT_HEAD_LIMIT) -> str | None:
    """Return *s* stripped of surrounding whitespace and cut to *n* characters.

    ``None`` is passed through unchanged.
    """
    if s is None:
        return None
    # slicing a string no longer than n returns it unchanged
    return s.strip()[:n]
def _to_dt(value):
if isinstance(value, datetime):
return value if value.tzinfo else value.replace(tzinfo=timezone.utc)
if isinstance(value, str) and value:
try:
return datetime.fromisoformat(value.replace("Z", "+00:00"))
except ValueError:
return None
return None
# --- PARSERY ----------------------------------------------------------------
def parse_pdf(path: Path) -> dict:
    """Return page count, encryption flag, document info and a text preview.

    ``text_head`` is the truncated text of the first page; it is None for
    encrypted or empty documents and when text extraction raises.
    """
    from pypdf import PdfReader

    reader = PdfReader(str(path))
    meta = reader.metadata or {}
    result = {"pages": len(reader.pages), "encrypted": reader.is_encrypted}
    for field in ("author", "title", "subject", "creator", "producer"):
        result[field] = getattr(meta, field, None)
    result["created"] = _to_dt(getattr(meta, "creation_date", None))
    result["modified"] = _to_dt(getattr(meta, "modification_date", None))

    first_page_text = None
    try:
        if not reader.is_encrypted and reader.pages:
            first_page_text = reader.pages[0].extract_text()
    except Exception:
        # preview is optional; metadata is still worth storing
        first_page_text = None
    result["text_head"] = _truncate(first_page_text)
    return result
def parse_docx(path: Path) -> dict:
    """Return core properties, paragraph/word counts and a text preview of a DOCX.

    ``paragraphs`` counts all body paragraphs (including empty ones), while
    ``words`` and ``text_head`` are based on the non-empty paragraphs only.
    """
    from docx import Document

    document = Document(str(path))
    props = document.core_properties
    paras = document.paragraphs
    full_text = "\n".join(para.text for para in paras if para.text)
    word_count = len(full_text.split())
    return {
        "author": props.author,
        "title": props.title,
        "subject": props.subject,
        "last_modified_by": props.last_modified_by,
        "paragraphs": len(paras),
        "words": word_count,
        "created": _to_dt(props.created),
        "modified": _to_dt(props.modified),
        "text_head": _truncate(full_text),
    }
def parse_xlsx(path: Path) -> dict:
    """Return the sheet list (name, rows, cols) and workbook properties of an XLSX/XLSM.

    The workbook is opened read-only and always closed, also when reading a
    sheet or a property raises.
    """
    from openpyxl import load_workbook

    wb = load_workbook(str(path), read_only=True, data_only=False)
    try:
        sheets = [
            {"name": ws.title, "rows": ws.max_row, "cols": ws.max_column}
            for ws in wb.worksheets
        ]
        props = wb.properties
        return {
            "total_sheets": len(sheets),
            "sheets": sheets,
            "author": props.creator,
            "title": props.title,
            "subject": props.subject,
            "last_modified_by": props.lastModifiedBy,
            "created": _to_dt(props.created),
            "modified": _to_dt(props.modified),
        }
    finally:
        # read-only workbooks keep the file open until close()
        wb.close()
def parse_pptx(path: Path) -> dict:
    """Return slide count, core properties and a text preview of a PPTX.

    ``text_head`` is built from the non-empty text runs of the first three
    slides, joined by single spaces and truncated.
    """
    from pptx import Presentation

    prs = Presentation(str(path))
    props = prs.core_properties

    run_texts = []
    for slide in list(prs.slides)[:3]:
        for shape in slide.shapes:
            if not shape.has_text_frame:
                continue
            for para in shape.text_frame.paragraphs:
                run_texts.extend(run.text for run in para.runs if run.text)

    return {
        "slides": len(prs.slides),
        "author": props.author,
        "title": props.title,
        "subject": props.subject,
        "last_modified_by": props.last_modified_by,
        "created": _to_dt(props.created),
        "modified": _to_dt(props.modified),
        "text_head": _truncate(" ".join(run_texts)),
    }
def parse_eml(path: Path) -> dict:
    """Return headers, attachment names and a body preview of an .eml message.

    In multipart messages, parts that are attachments or carry a filename are
    listed by filename; the remaining text/plain parts form the body. Content
    that cannot be decoded, or is not text, is skipped.
    """
    with path.open("rb") as f:
        msg = email.message_from_binary_file(f, policy=email.policy.default)

    attachments = []
    body_parts = []

    def _add_body(part) -> None:
        """Append the part's content to body_parts when it is decodable text."""
        try:
            content = part.get_content()
        except Exception:
            # best effort: an undecodable part must not lose the metadata
            return
        # get_content() returns bytes for non-text payloads; joining those
        # with str would raise TypeError
        if isinstance(content, str):
            body_parts.append(content)

    if msg.is_multipart():
        for part in msg.walk():
            disp = (part.get("Content-Disposition") or "").lower()
            if "attachment" in disp or part.get_filename():
                fname = part.get_filename()
                if fname:
                    attachments.append(fname)
            elif part.get_content_type() == "text/plain":
                _add_body(part)
    else:
        _add_body(msg)

    def _addrs(field):
        v = msg.get(field)
        return v if v else None

    return {
        "subject": msg.get("Subject"),
        "from": _addrs("From"),
        "to": _addrs("To"),
        "cc": _addrs("Cc"),
        "date": msg.get("Date"),
        "has_attachments": bool(attachments),
        "attachments": attachments,
        "body_head": _truncate("\n".join(body_parts)),
    }
def parse_msg(path: Path) -> dict:
    """Return headers, attachment names and a body preview of an Outlook .msg file.

    The result has the same keys as ``parse_eml``. Attachments whose name
    cannot be read are skipped.
    """
    import extract_msg

    with extract_msg.openMsg(str(path)) as msg:
        names = []
        for attachment in msg.attachments or []:
            try:
                filename = attachment.longFilename or attachment.shortFilename
            except Exception:
                continue
            if filename:
                names.append(filename)

        sent = msg.date
        return {
            "subject": msg.subject,
            "from": msg.sender,
            "to": msg.to,
            "cc": msg.cc,
            "date": str(sent) if sent else None,
            "has_attachments": bool(names),
            "attachments": names,
            "body_head": _truncate(msg.body or ""),
        }
# Dispatch table: file extension -> (parser function, size limit in bytes).
# enrich_collection records files larger than the limit as "too_big" without
# calling the parser.
PARSERS = {
"pdf": (parse_pdf, MAX_PDF_BYTES),
"docx": (parse_docx, MAX_GENERIC_BYTES),
"xlsx": (parse_xlsx, MAX_XLSX_BYTES),
"xlsm": (parse_xlsx, MAX_XLSX_BYTES),
"pptx": (parse_pptx, MAX_GENERIC_BYTES),
"eml": (parse_eml, MAX_GENERIC_BYTES),
"msg": (parse_msg, MAX_GENERIC_BYTES),
}
# --- SUMMARY PRO KONZOLI ----------------------------------------------------
def _short(s, n=40):
if not s:
return ""
s = str(s).replace("\n", " ").replace("\r", " ").strip()
return s if len(s) <= n else s[:n] + "..."
def _summary(content: dict, ext: str) -> str:
    """Return a one-line console summary of a parsed ``content`` dict.

    Failed parses give "chyba: <error>". Otherwise the summary is a
    space-separated list of type-specific facts, or "ok" when there are none.
    """
    if not content.get("ok"):
        return f"chyba: {_short(content.get('error'), 80)}"

    bits = []

    def tag(label, key, width):
        """Append "<label>=<shortened value>" when content[key] is truthy."""
        value = content.get(key)
        if value:
            bits.append(f"{label}={_short(value, width)}")

    if ext == "pdf":
        bits.append(f"{content.get('pages')}p")
        if content.get("encrypted"):
            bits.append("enc")
        tag("by", "author", 25)
        tag("t", "title", 30)
    elif ext == "docx":
        bits.append(f"{content.get('paragraphs')}para")
        bits.append(f"{content.get('words')}w")
        tag("by", "author", 25)
    elif ext in ("xlsx", "xlsm"):
        n = content.get("total_sheets", 0)
        sheets = content.get("sheets") or []
        # show at most three sheet names, then the number of remaining sheets
        names = ",".join(_short(s["name"], 12) for s in sheets[:3])
        if n > 3:
            names += f",+{n-3}"
        bits.append(f"{n}sh[{names}]")
        tag("by", "author", 20)
    elif ext == "pptx":
        bits.append(f"{content.get('slides')}slides")
        tag("by", "author", 25)
        tag("t", "title", 25)
    elif ext in ("eml", "msg"):
        tag("from", "from", 25)
        tag("subj", "subject", 40)
        if content.get("has_attachments"):
            bits.append(f"att={len(content.get('attachments') or [])}")

    if not bits:
        return "ok"
    return " ".join(bits)
# --- HLAVNI SMYCKA ----------------------------------------------------------
def enrich_collection(coll, study: str) -> dict:
    """Parse every pending file of one collection and store the result in ``content``.

    A document is pending when it has a supported extension, is not marked
    deleted, and either has no ``content``, was parsed by a different
    PARSER_VERSION, or its sha256 changed since the last parse. Updates are
    written in unordered batches of 50.

    Returns counters: study, processed, ok, errors, too_big.
    """
    supported = list(PARSERS.keys())
    query = {
        "ext": {"$in": supported},
        "deleted_at": {"$exists": False},
        "$or": [
            {"content": {"$exists": False}},
            {"content.parser_version": {"$ne": PARSER_VERSION}},
            {"$expr": {"$ne": ["$content.sha256_at_parse", "$sha256"]}},
        ],
    }
    total_pending = coll.count_documents(query)
    print(f"[{study}] k zpracovani: {total_pending} souboru")

    ops: list[UpdateOne] = []
    processed = ok = errors = too_big = 0
    cursor = coll.find(query, {"path": 1, "ext": 1, "size_bytes": 1, "sha256": 1}, no_cursor_timeout=True)
    try:
        for doc in cursor:
            ext = doc["ext"]
            parser, max_bytes = PARSERS[ext]
            path = Path(doc["path"])
            # size_bytes may be missing or stored as null; treat both as 0 so
            # the comparison below cannot raise TypeError and abort the run
            size_bytes = doc.get("size_bytes") or 0
            content: dict = {
                "parser_version": PARSER_VERSION,
                "parsed_at": _now(),
                "sha256_at_parse": doc.get("sha256"),
            }
            if not path.exists():
                content.update(ok=False, error="file_missing")
                errors += 1
            elif size_bytes > max_bytes:
                content.update(ok=False, error=f"too_big_>{max_bytes}")
                too_big += 1
            else:
                try:
                    payload = parser(path)
                    content["ok"] = True
                    content.update(payload)
                    ok += 1
                except Exception as e:
                    # a failing file is recorded in content.error, not fatal
                    content["ok"] = False
                    content["error"] = f"{type(e).__name__}: {e}"[:500]
                    errors += 1

            ops.append(UpdateOne({"_id": doc["_id"]}, {"$set": {"content": content}}))
            processed += 1

            if content.get("ok"):
                status = "OK "
            elif "too_big" in (content.get("error") or ""):
                status = "BIG"
            else:
                status = "ERR"
            size_mb = size_bytes / 1024 / 1024
            detail = _summary(content, ext)
            print(f" [{processed:>4}/{total_pending}] {status} {ext:<4} {size_mb:6.1f}MB {path.name} | {detail}", flush=True)

            if len(ops) >= 50:
                coll.bulk_write(ops, ordered=False)
                ops.clear()
    finally:
        cursor.close()

    if ops:
        coll.bulk_write(ops, ordered=False)
    return {"study": study, "processed": processed, "ok": ok, "errors": errors, "too_big": too_big}
def main() -> int:
    """Enrich every configured collection and print a summary.

    The MongoDB client is closed even when processing fails.
    Returns 0 on success; errors propagate to the caller.
    """
    t0 = time.time()
    client = MongoClient(MONGO_URI, serverSelectionTimeoutMS=5000)
    try:
        client.admin.command("ping")
        db = client[DB_NAME]
        results = []
        for name in COLLECTIONS:
            results.append(enrich_collection(db[name], name))
    finally:
        # release the connection pool on success and on failure alike
        client.close()

    print("\n=== SHRNUTI ===")
    for r in results:
        print(f" {r['study']}: processed={r['processed']} ok={r['ok']} "
              f"errors={r['errors']} too_big={r['too_big']}")
    print(f"\nCelkem trvalo: {time.time() - t0:.1f} s")
    return 0
if __name__ == "__main__":
    # Script entry point: exit with main()'s return code, print a short
    # notice on Ctrl+C, and print the traceback and exit 1 on any other error.
    try:
        exit_code = main()
    except KeyboardInterrupt:
        print("\nPreruseno uzivatelem")
    except Exception:
        traceback.print_exc()
        sys.exit(1)
    else:
        raise SystemExit(exit_code)
+51
View File
@@ -0,0 +1,51 @@
# enrich_fulltext_v1.2
**Verze:** 1.2
**Datum:** 2026-06-03
**Skript:** `enrich_fulltext_v1.2.py`
## Změna proti v1.1
Velký XLSX (`#400 MDD3003_EAT detail report_30jun25.xlsx`, 5 242 128 znaků textu) způsobil pád:
```
psycopg.errors.ProgramLimitExceeded:
string is too long for tsvector (1114090 bytes, max 1048575 bytes)
```
PostgreSQL `tsvector` má **tvrdý limit ~1 MB** binární velikosti — nelze obejít.
**Řešení:** `tsv` se generuje z prvních **800 000 znaků** sloupce `body`:
```sql
tsv tsvector GENERATED ALWAYS AS (
to_tsvector('soubory'::regconfig, left(coalesce(body, ''), 800000))
) STORED
```
- sloupec `body` zůstává **plný** (až 5 MB) — pro náhledy, snippet, `ts_headline`
- vyhledávání (`tsv @@ q`) ignoruje obsah za 800 000. znakem
- u rozsáhlých XLSX/PDF (např. datové exporty) je 800 000 znaků stále víc než 100 000 slov — pro fulltext bohatě stačí
## Migrace
`SCHEMA_SQL` při startu zkontroluje, zda současný výraz `tsv` obsahuje `left(`. Pokud ne (starý sloupec z v1.0/v1.1):
1. dropne `documents_tsv_gin` index
2. dropne sloupec `tsv`
3. přidá nový s `left(body, 800000)`
4. index se vytvoří znovu na konci `SCHEMA_SQL`
Bezpečné spustit opakovaně.
## extractor_version
Posunuto na `1.2` → všechny řádky z v1.0/v1.1 se přeparsují (potřebné už proto, že migrace `tsv` změnila, co je v indexu).
## Vše ostatní
Beze změny proti [v1.1](Trash/enrich_fulltext_v1.1.md):
- DOCX fallback přes raw `word/document.xml`
- NUL byte strip
- Limity souborů (PDF 500 MB, XLSX 200 MB, ostatní 300 MB), text max 5 MB
- Inkrementálně podle `sha256` + `extractor_version`
## Spuštění
```
python U:\PythonProject\Janssen\Soubory\enrich_fulltext_v1.2.py
```
+481
View File
@@ -0,0 +1,481 @@
"""
==============================================================================
Skript: enrich_fulltext_v1.2.py
Verze: 1.2
Datum: 2026-06-03
Autor: vladimir.buzalka
Popis: Vytahne PLNY TEXT z dokumentu odkazovanych v MongoDB (db: soubory)
a ulozi ho do PostgreSQL (db: MongoSoubory) s GIN tsvector indexem.
Zmeny proti v1.1:
- PG tsvector ma tvrdy limit ~1 MB binarne -> velky XLSX (5 MB textu) ho prekrocil.
v1.2 generuje tsv z prvnich 800 000 znaku body: left(body, 800000).
Sloupec body zustava plny (max 5 MB pro nahled / snippet).
- SCHEMA_SQL provadi migraci sloupce tsv: pokud uz existuje stara verze
(bez `left`), dropne index+sloupec a vytvori znovu s truncated vyrazem.
- extractor_version = "1.2" -> preparsuji se vsechny radky z v1.0/v1.1.
Zachovano z v1.1:
- NUL bajty (0x00) se strippuji z body i error
- DOCX fallback na raw XML pres regex pri padu python-docx
Cilove ulozeni:
- MongoDB 192.168.1.76 db=soubory kolekce=42847922MDD3003, 77242113UCO3001
- PostgreSQL 192.168.1.76 db=MongoSoubory tabulka=documents
Podporovane pripony: pdf, docx, xlsx, xlsm, pptx, eml, msg, txt, csv
==============================================================================
"""
from __future__ import annotations
import email
import email.policy
import re
import sys
import time
import traceback
import zipfile
from datetime import datetime, timezone
from pathlib import Path
import psycopg
from pymongo import MongoClient
# --- configuration ----------------------------------------------------------
# MongoDB instance and database holding the file-metadata collections.
MONGO_URI = "mongodb://192.168.1.76:27017"
MONGO_DB = "soubory"
# Collections handled by main(); each name is also passed on as the study code.
MONGO_COLLECTIONS = ["42847922MDD3003", "77242113UCO3001"]
# NOTE(review): this DSN embeds a plaintext password that is committed to the
# repository - consider reading it from the environment or ~/.pgpass instead.
PG_DSN = ("host=192.168.1.76 port=5432 dbname=MongoSoubory "
          "user=vladimir.buzalka password=Vlado7309208104++")
# Written to every row; process_collection() re-extracts rows stored with a
# different version.
EXTRACTOR_VERSION = "1.2"
# Cap applied by _truncate() to the UTF-8 size of the extracted text.
MAX_TEXT_BYTES = 5 * 1024 * 1024
# Input file size limits per type; larger files are recorded as too_big.
MAX_PDF_BYTES = 500 * 1024 * 1024
MAX_XLSX_BYTES = 200 * 1024 * 1024
MAX_GENERIC_BYTES = 300 * 1024 * 1024
# Supported extensions. Not referenced elsewhere in this script - the Mongo
# query is built from the keys of EXTRACTORS.
SUPPORTED = ("pdf", "docx", "xlsx", "xlsm", "pptx", "eml", "msg", "txt", "csv")
# --- SCHEMA -----------------------------------------------------------------
# DDL executed by main() at startup. Every statement is guarded
# (IF NOT EXISTS / conditional DO blocks), so it can be run repeatedly:
#   1. create the unaccent and pg_trgm extensions,
#   2. create the 'soubory' text search configuration (copy of 'simple' with
#      unaccent mapped in front of it) when it is missing,
#   3. create the documents table; tsv is a stored generated column built
#      from the first 800000 characters of body,
#   4. migrate an existing tsv column whose expression does not contain
#      'left' by dropping its index and the column and adding it again,
#   5. create the indexes.
SCHEMA_SQL = """
CREATE EXTENSION IF NOT EXISTS unaccent;
CREATE EXTENSION IF NOT EXISTS pg_trgm;
DO $$
BEGIN
IF NOT EXISTS (SELECT 1 FROM pg_ts_config WHERE cfgname = 'soubory') THEN
CREATE TEXT SEARCH CONFIGURATION soubory ( COPY = simple );
ALTER TEXT SEARCH CONFIGURATION soubory
ALTER MAPPING FOR hword, hword_part, word
WITH unaccent, simple;
END IF;
END$$;
CREATE TABLE IF NOT EXISTS documents (
id BIGSERIAL PRIMARY KEY,
mongo_id TEXT NOT NULL,
study TEXT NOT NULL,
path TEXT NOT NULL,
rel_path TEXT,
name TEXT,
ext TEXT,
sha256 TEXT NOT NULL,
size_bytes BIGINT,
mtime TIMESTAMPTZ,
body TEXT,
body_length INT,
tsv tsvector GENERATED ALWAYS AS (
to_tsvector('soubory'::regconfig, left(coalesce(body, ''), 800000))
) STORED,
extracted_at TIMESTAMPTZ DEFAULT now(),
extractor_version TEXT,
ok BOOLEAN,
error TEXT,
UNIQUE (study, path)
);
-- migrace tsv sloupce ze stareho vyrazu (bez `left`) na novy (s `left(..,800000)`)
DO $$
DECLARE
cur_expr TEXT;
BEGIN
SELECT pg_get_expr(d.adbin, d.adrelid)
INTO cur_expr
FROM pg_attribute a
JOIN pg_class c ON c.oid = a.attrelid
JOIN pg_attrdef d ON d.adrelid = a.attrelid AND d.adnum = a.attnum
WHERE c.relname = 'documents' AND a.attname = 'tsv';
IF cur_expr IS NOT NULL AND position('left' in cur_expr) = 0 THEN
EXECUTE 'DROP INDEX IF EXISTS documents_tsv_gin';
EXECUTE 'ALTER TABLE documents DROP COLUMN tsv';
EXECUTE 'ALTER TABLE documents ADD COLUMN tsv tsvector GENERATED ALWAYS AS '
|| '(to_tsvector(''soubory''::regconfig, left(coalesce(body, ''''), 800000))) STORED';
END IF;
END$$;
CREATE INDEX IF NOT EXISTS documents_tsv_gin ON documents USING gin(tsv);
CREATE INDEX IF NOT EXISTS documents_name_trgm ON documents USING gin(name gin_trgm_ops);
CREATE INDEX IF NOT EXISTS documents_sha256_idx ON documents(sha256);
CREATE INDEX IF NOT EXISTS documents_study_ext_idx ON documents(study, ext);
"""
# --- HELPERY ----------------------------------------------------------------
# Matches NUL and the other C0 control characters; tab, LF and CR are kept.
_CTRL_RX = re.compile(r"[\x00-\x08\x0b\x0c\x0e-\x1f]")


def _clean_for_pg(s: str) -> str:
    """Return *s* without the control characters matched by ``_CTRL_RX``.

    Falsy input (``None`` or an empty string) yields ``""``.
    """
    return _CTRL_RX.sub("", s) if s else ""
def _truncate(s: str) -> str:
    """Clean *s* with ``_clean_for_pg`` and cap it at ``MAX_TEXT_BYTES`` UTF-8 bytes.

    Returns ``""`` for falsy input or when nothing is left after cleaning.
    """
    cleaned = _clean_for_pg(s or "")
    if not cleaned:
        return ""
    encoded = cleaned.encode("utf-8", errors="replace")
    if len(encoded) > MAX_TEXT_BYTES:
        # errors="ignore" drops a multi-byte character cut in half by the slice.
        return encoded[:MAX_TEXT_BYTES].decode("utf-8", errors="ignore")
    return cleaned
# --- EXTRAKTORY -------------------------------------------------------------
def extract_pdf(path: Path) -> str:
    """Return the text of a PDF, page by page, joined with newlines.

    Encrypted files are tried with an empty password; if ``decrypt`` raises,
    an empty string is returned. Pages whose text extraction raises are
    skipped. Collection stops once the gathered text exceeds
    ``MAX_TEXT_BYTES`` characters; the result goes through ``_truncate``.
    """
    from pypdf import PdfReader

    pdf = PdfReader(str(path))
    if pdf.is_encrypted:
        try:
            pdf.decrypt("")
        except Exception:
            return ""
    chunks = []
    collected = 0
    for pdf_page in pdf.pages:
        try:
            page_text = pdf_page.extract_text() or ""
        except Exception:
            # Best effort: one unreadable page must not lose the whole file.
            continue
        chunks.append(page_text)
        collected += len(page_text)
        if collected > MAX_TEXT_BYTES:
            break
    return _truncate("\n".join(chunks))
# Regexes for the DOCX fallback: text runs (<w:t>...</w:t>) and paragraph ends.
_DOCX_WT_RX = re.compile(r"<w:t[^>]*>([^<]*)</w:t>", re.DOTALL)
_DOCX_WP_END_RX = re.compile(r"</w:p>")


def _docx_raw_text(path: Path) -> str:
    """Fallback extractor that reads ``word/document.xml`` straight from the ZIP.

    The XML is split at paragraph ends. The text runs of one paragraph are
    concatenated without a separator, so a word split across several runs
    stays whole; paragraphs are joined with newlines. XML entities such as
    ``&amp;`` are decoded. Paragraphs without text are dropped.

    Returns:
        The extracted text, or ``""`` when the archive has no
        ``word/document.xml``.

    Raises:
        zipfile.BadZipFile: if *path* is not a ZIP archive.
    """
    from html import unescape

    with zipfile.ZipFile(str(path)) as z:
        try:
            xml = z.read("word/document.xml").decode("utf-8", errors="replace")
        except KeyError:
            return ""
    lines = []
    for para_xml in _DOCX_WP_END_RX.split(xml):
        text = "".join(m.group(1) for m in _DOCX_WT_RX.finditer(para_xml))
        if text:
            lines.append(unescape(text))
    return "\n".join(lines)
def extract_docx(path: Path) -> str:
    """Return the text of a DOCX file: non-empty paragraphs, then table rows.

    Cells of a table row are joined with ``" | "``. If python-docx fails for
    any reason, the raw-XML fallback ``_docx_raw_text`` is used instead.
    """
    from docx import Document

    try:
        document = Document(str(path))
        lines = []
        for paragraph in document.paragraphs:
            if paragraph.text:
                lines.append(paragraph.text)
        for table in document.tables:
            for table_row in table.rows:
                cells = [cell.text for cell in table_row.cells]
                lines.append(" | ".join(cells))
        return _truncate("\n".join(lines))
    except Exception:
        # Fallback: pull the text directly out of the raw XML.
        return _truncate(_docx_raw_text(path))
def extract_xlsx(path: Path) -> str:
    """Return the cell values of a workbook as tab-separated lines.

    Each worksheet starts with a ``# <title>`` line; ``None`` cells become
    empty strings and rows with no visible content are skipped. Reading stops
    once the collected rows exceed ``MAX_TEXT_BYTES`` characters; the result
    goes through ``_truncate``.

    The workbook is opened read-only with cached formula values
    (``data_only=True``) and is closed even when reading fails.
    """
    from openpyxl import load_workbook

    wb = load_workbook(str(path), read_only=True, data_only=True)
    parts = []
    total = 0
    try:
        for ws in wb.worksheets:
            parts.append(f"# {ws.title}")
            for row in ws.iter_rows(values_only=True):
                line = "\t".join("" if v is None else str(v) for v in row)
                if line.strip():
                    parts.append(line)
                    total += len(line)
                    if total > MAX_TEXT_BYTES:
                        break
            if total > MAX_TEXT_BYTES:
                break
    finally:
        # Close on every path so a failing sheet does not leave the workbook open.
        wb.close()
    return _truncate("\n".join(parts))
def extract_pptx(path: Path) -> str:
    """Return the text of a presentation, slide by slide.

    Each slide starts with a ``# slide <n>`` line, followed by the non-blank
    paragraphs of every shape that has a text frame and, when present, the
    speaker notes prefixed with ``[notes]``. The result goes through
    ``_truncate``.
    """
    from pptx import Presentation

    prs = Presentation(str(path))
    parts = []
    for i, slide in enumerate(prs.slides, 1):
        parts.append(f"# slide {i}")
        for shape in slide.shapes:
            if shape.has_text_frame:
                for para in shape.text_frame.paragraphs:
                    line = "".join(run.text for run in para.runs)
                    if line.strip():
                        parts.append(line)
        if slide.has_notes_slide:
            # A notes slide without a notes placeholder has no text frame
            # (None); reading .text from it would fail the whole file.
            notes_frame = slide.notes_slide.notes_text_frame
            notes = notes_frame.text if notes_frame is not None else ""
            if notes:
                parts.append(f"[notes] {notes}")
    return _truncate("\n".join(parts))
def extract_eml(path: Path) -> str:
    """Return selected headers and the plain-text body of an .eml message.

    The From/To/Cc/Subject/Date headers that are present come first. For a
    multipart message every ``text/plain`` part without a filename is
    appended; otherwise the message's own content is appended. Content that
    cannot be decoded, or that is not text (a binary body comes back as
    bytes), is skipped so the headers are still returned. The result goes
    through ``_truncate``.
    """
    with path.open("rb") as f:
        msg = email.message_from_binary_file(f, policy=email.policy.default)
    head = []
    for k in ("From", "To", "Cc", "Subject", "Date"):
        v = msg.get(k)
        if v:
            head.append(f"{k}: {v}")
    parts = ["\n".join(head)]
    if msg.is_multipart():
        candidates = (
            part for part in msg.walk()
            if part.get_content_type() == "text/plain" and not part.get_filename()
        )
    else:
        candidates = (msg,)
    for part in candidates:
        try:
            content = part.get_content()
        except Exception:
            # Best effort: an undecodable part must not lose the rest of the mail.
            continue
        # Bytes content would make the str.join below raise TypeError.
        if isinstance(content, str):
            parts.append(content)
    return _truncate("\n\n".join(parts))
def extract_msg(path: Path) -> str:
    """Return the headers and body of an Outlook .msg file.

    The Subject/From/To/Cc/Date lines are emitted for the fields that are
    set, followed by a blank line and the message body. The result goes
    through ``_truncate``.
    """
    # Inside this function the name refers to the third-party module.
    import extract_msg

    with extract_msg.openMsg(str(path)) as m:
        labelled = (
            ("Subject", m.subject),
            ("From", m.sender),
            ("To", m.to),
            ("Cc", m.cc),
            ("Date", m.date),
        )
        head = [f"{label}: {value}" for label, value in labelled if value]
        return _truncate("\n".join(head) + "\n\n" + (m.body or ""))
def extract_text(path: Path) -> str:
    """Return the decoded content of a text/CSV file, capped at ``MAX_TEXT_BYTES``.

    Only the first ``MAX_TEXT_BYTES`` bytes are read from disk. Decoding is
    tried as UTF-8 (with optional BOM), then cp1250, and finally latin-1,
    which accepts every byte sequence. The result goes through ``_truncate``.
    """
    import codecs

    with path.open("rb") as f:
        # One extra byte tells us whether the file continues past the cap.
        data = f.read(MAX_TEXT_BYTES + 1)
    truncated = len(data) > MAX_TEXT_BYTES
    data = data[:MAX_TEXT_BYTES]
    try:
        # When the read was cut short, a multi-byte character may be split at
        # the end; final=False holds that incomplete tail back instead of
        # failing, so a valid UTF-8 file is not misread as cp1250.
        decoder = codecs.getincrementaldecoder("utf-8-sig")()
        return _truncate(decoder.decode(data, final=not truncated))
    except UnicodeDecodeError:
        pass
    try:
        return _truncate(data.decode("cp1250"))
    except UnicodeDecodeError:
        # latin-1 maps every byte, so this cannot fail.
        return _truncate(data.decode("latin-1"))
# Extension -> (extractor function, maximum input file size in bytes).
# process_collection() builds its Mongo query from the keys and records files
# above the size limit as too_big without calling the extractor.
EXTRACTORS = {
    "pdf": (extract_pdf, MAX_PDF_BYTES),
    "docx": (extract_docx, MAX_GENERIC_BYTES),
    "xlsx": (extract_xlsx, MAX_XLSX_BYTES),
    "xlsm": (extract_xlsx, MAX_XLSX_BYTES),
    "pptx": (extract_pptx, MAX_GENERIC_BYTES),
    "eml": (extract_eml, MAX_GENERIC_BYTES),
    "msg": (extract_msg, MAX_GENERIC_BYTES),
    "txt": (extract_text, MAX_GENERIC_BYTES),
    "csv": (extract_text, MAX_GENERIC_BYTES),
}
def _short(s, n=40):
    """Return a one-line preview of *s*, at most *n* characters plus ``"..."``.

    Newlines and carriage returns become spaces and surrounding whitespace is
    stripped. Falsy input yields ``""``.
    """
    if not s:
        return ""
    flat = str(s).replace("\n", " ").replace("\r", " ").strip()
    if len(flat) > n:
        return flat[:n] + "..."
    return flat
def _now() -> datetime:
    """Return the current time as a timezone-aware UTC datetime."""
    utc = timezone.utc
    return datetime.now(tz=utc)
# --- HLAVNI SMYCKA ----------------------------------------------------------
def process_collection(pg: psycopg.Connection, mongo_coll, study: str) -> dict:
    """Extract text for one study's Mongo documents and upsert it into PostgreSQL.

    A document is skipped when its stored row has the same sha256, the
    current ``EXTRACTOR_VERSION`` and ``ok`` set. Everything else is
    (re)processed: missing files and files above the per-type size limit are
    recorded as errors, the rest is passed to the matching extractor. Rows
    are written through ``_flush`` in batches of 50.

    Args:
        pg: open PostgreSQL connection.
        mongo_coll: Mongo collection holding the file metadata of the study.
        study: study code stored in ``documents.study``.

    Returns:
        Dict with ``study`` and the counters ``processed``, ``ok``,
        ``errors``, ``skipped`` and ``too_big``.
    """
    with pg.cursor() as cur:
        cur.execute(
            "SELECT path, sha256, extractor_version, ok FROM documents WHERE study = %s",
            (study,),
        )
        known = {rec[0]: (rec[1], rec[2], rec[3]) for rec in cur.fetchall()}

    wanted = {"ext": {"$in": list(EXTRACTORS.keys())}, "deleted_at": {"$exists": False}}
    fields = {"_id": 1, "path": 1, "rel_path": 1, "name": 1, "ext": 1,
              "sha256": 1, "size_bytes": 1, "mtime": 1}
    cursor = mongo_coll.find(wanted, fields, no_cursor_timeout=True)

    tally = {"processed": 0, "ok": 0, "errors": 0, "skipped": 0, "too_big": 0}
    batch: list[dict] = []
    total_pending = mongo_coll.count_documents(dict(wanted))
    print(f"[{study}] kandidatu v Mongo: {total_pending}")

    try:
        for n, doc in enumerate(cursor, 1):
            previous = known.get(doc["path"])
            if previous:
                old_sha, old_version, old_ok = previous
                if old_sha == doc.get("sha256") and old_version == EXTRACTOR_VERSION and old_ok:
                    tally["skipped"] += 1
                    continue

            ext = doc["ext"]
            extractor, max_bytes = EXTRACTORS[ext]
            path = Path(doc["path"])
            row = {
                "mongo_id": str(doc["_id"]),
                "study": study,
                "path": doc["path"],
                "rel_path": doc.get("rel_path"),
                "name": doc.get("name"),
                "ext": ext,
                "sha256": doc.get("sha256"),
                "size_bytes": doc.get("size_bytes"),
                "mtime": doc.get("mtime"),
                "body": None,
                "body_length": 0,
                "extracted_at": _now(),
                "extractor_version": EXTRACTOR_VERSION,
                "ok": False,
                "error": None,
            }
            size_bytes = doc.get("size_bytes") or 0
            size_mb = size_bytes / 1024 / 1024
            status, detail = "OK ", ""

            if not path.exists():
                row["error"] = "file_missing"
                status, detail = "ERR", "file_missing"
                tally["errors"] += 1
            elif size_bytes > max_bytes:
                row["error"] = f"too_big_>{max_bytes}"
                status, detail = "BIG", f"too_big_>{max_bytes//1024//1024}MB"
                tally["too_big"] += 1
            else:
                try:
                    body = extractor(path) or ""
                    row["body"] = body or None
                    row["body_length"] = len(body)
                    row["ok"] = True
                    tally["ok"] += 1
                    detail = f"{len(body)} znaku {_short(body, 60)!r}"
                except Exception as e:
                    # Any extractor failure is stored on the row, not raised.
                    row["error"] = f"{type(e).__name__}: {e}"[:500]
                    status, detail = "ERR", row["error"][:80]
                    tally["errors"] += 1

            batch.append(row)
            tally["processed"] += 1
            print(f" [{n:>4}/{total_pending}] {status} {ext:<4} {size_mb:6.1f}MB "
                  f"{path.name} | {detail}", flush=True)
            if len(batch) >= 50:
                _flush(pg, batch)
                batch.clear()
    finally:
        cursor.close()

    if batch:
        _flush(pg, batch)
    return {"study": study, **tally}
# Upsert keyed on the UNIQUE (study, path) constraint: a new row is inserted,
# an existing one has every other listed column overwritten with the incoming
# values. Parameters are named placeholders filled from the row dicts built
# in process_collection().
UPSERT_SQL = """
INSERT INTO documents
(mongo_id, study, path, rel_path, name, ext, sha256, size_bytes, mtime,
body, body_length, extracted_at, extractor_version, ok, error)
VALUES
(%(mongo_id)s, %(study)s, %(path)s, %(rel_path)s, %(name)s, %(ext)s, %(sha256)s,
%(size_bytes)s, %(mtime)s, %(body)s, %(body_length)s, %(extracted_at)s,
%(extractor_version)s, %(ok)s, %(error)s)
ON CONFLICT (study, path) DO UPDATE SET
mongo_id = EXCLUDED.mongo_id,
rel_path = EXCLUDED.rel_path,
name = EXCLUDED.name,
ext = EXCLUDED.ext,
sha256 = EXCLUDED.sha256,
size_bytes = EXCLUDED.size_bytes,
mtime = EXCLUDED.mtime,
body = EXCLUDED.body,
body_length = EXCLUDED.body_length,
extracted_at = EXCLUDED.extracted_at,
extractor_version = EXCLUDED.extractor_version,
ok = EXCLUDED.ok,
error = EXCLUDED.error
"""
def _flush(pg: psycopg.Connection, rows: list[dict]) -> None:
    """Upsert *rows* into ``documents`` and commit.

    As a last safeguard, ``body`` and ``error`` are cleaned once more with
    ``_clean_for_pg``; the row dicts are modified in place.
    """
    for row in rows:
        for field in ("body", "error"):
            if row.get(field):
                row[field] = _clean_for_pg(row[field])
    with pg.cursor() as cur:
        cur.executemany(UPSERT_SQL, rows)
    pg.commit()
def main() -> int:
    """Apply the schema, process every configured collection, print a summary.

    Connects to PostgreSQL, runs ``SCHEMA_SQL``, then connects to MongoDB and
    calls ``process_collection`` for each name in ``MONGO_COLLECTIONS``. Both
    connections are closed on every path, including failures.

    Returns:
        0 on completion; failures propagate as exceptions.
    """
    t0 = time.time()
    print("Pripojuji se k PostgreSQL...")
    pg = psycopg.connect(PG_DSN, connect_timeout=10)
    mongo = None
    try:
        with pg.cursor() as cur:
            cur.execute(SCHEMA_SQL)
        pg.commit()
        print("Schema OK.")
        print("Pripojuji se k MongoDB...")
        mongo = MongoClient(MONGO_URI, serverSelectionTimeoutMS=5000)
        mongo.admin.command("ping")
        db = mongo[MONGO_DB]
        print("Mongo OK.")
        results = [process_collection(pg, db[name], name) for name in MONGO_COLLECTIONS]
    finally:
        # Do not leave server-side sessions behind when a step above fails.
        if mongo is not None:
            mongo.close()
        pg.close()
    print("\n=== SHRNUTI ===")
    for r in results:
        print(f" {r['study']}: processed={r['processed']} ok={r['ok']} "
              f"errors={r['errors']} skipped={r['skipped']} too_big={r['too_big']}")
    print(f"\nCelkem trvalo: {time.time() - t0:.1f} s")
    return 0
# Script entry point: map the outcome of main() to a process exit status.
if __name__ == "__main__":
    try:
        raise SystemExit(main())
    except KeyboardInterrupt:
        print("\nPreruseno uzivatelem")
        # Exit non-zero (128 + SIGINT) so callers can tell an interrupted run
        # from a completed one; falling through would exit with status 0.
        sys.exit(130)
    except Exception:
        traceback.print_exc()
        sys.exit(1)
+63
View File
@@ -0,0 +1,63 @@
# scan_files_v1.0
**Verze:** 1.0
**Datum:** 2026-06-03
**Skript:** `scan_files_v1.0.py`
## Účel
Rekurzivní sken dvou Dropbox složek studií (`!!42847922MDD3003`, `!77242113UCO3001`) a zápis metadat všech souborů do MongoDB.
## Konfigurace
- **MongoDB:** `mongodb://192.168.1.76:27017` (bez autentizace)
- **DB:** `soubory`
- **Kolekce:** `42847922MDD3003`, `77242113UCO3001` (jedna kolekce na studii)
- **Cesta k Dropboxu:** zjištěna pomocí `Knihovny/najdi_dropbox.py` (přenositelné mezi PC)
## Struktura dokumentu v MongoDB
| pole | popis |
|---|---|
| `path` | absolutní cesta (unikátní klíč) |
| `study` | kód studie (= název kolekce) |
| `rel_path` | relativní cesta od kořene studie |
| `dir`, `rel_dir` | nadřazený adresář (absolutní/relativní) |
| `parent_folders` | pole názvů složek (pro filtrování) |
| `name`, `stem`, `ext` | jméno, jméno bez přípony, přípona (lower-case) |
| `size_bytes` | velikost |
| `mtime`, `ctime`, `atime` | časové údaje (UTC) |
| `sha256` | hash obsahu |
| `mime` | mimetype dle přípony |
| `tokens` | jméno rozparsované na slova/čísla (lower-case) |
| `dates_in_name` | datumy nalezené v názvu, formát `YYYY-MM-DD` |
| `first_seen_at` | první sken, kdy byl soubor viděn |
| `last_seen_at` | poslední sken, kdy byl viděn |
| `deleted_at` | nastaveno, pokud soubor v posledním skenu už nebyl nalezen |
## Datumy v názvu
Skript hledá tři varianty:
- `12JAN2026`, `12Jan2026` (den + 3-písm. zkratka měsíce + rok)
- `2026-01-12`, `2026_01_12`, `2026.01.12`
- `12-01-2026`, `12_01_2026`, `12.01.2026`
Všechny se normalizují do ISO `YYYY-MM-DD` v poli `dates_in_name`.
## Inkrementální chování
- `size_bytes` + `mtime` souhlasí se záznamem v DB → SHA256 se nepřepočítává, jen se aktualizuje `last_seen_at`
- nový soubor → vloží se s `first_seen_at`
- chybějící v aktuálním běhu → `deleted_at` se nastaví na čas běhu
## Co se ignoruje
- `.dropbox*`, `Thumbs.db`, `desktop.ini`, `~$*.*` (Office locky), `.DS_Store`
- adresář `.dropbox.cache`
## Spuštění
```
python U:\PythonProject\Janssen\Soubory\scan_files_v1.0.py
```
## Indexovaná pole pro rychlé dotazy
`path` (unique), `ext`, `dates_in_name`, `tokens`, `sha256`
## Plán pokračování
1. Spustit první sken → zjistit profil dat (přípony, hloubku stromů)
2. Doplnit dle potřeby (např. počet stran PDF, autor DOCX, listy XLSX)
3. Postavit `MCP_SOUBORY` server nad touto kolekcí
+272
View File
@@ -0,0 +1,272 @@
"""
==============================================================================
Skript: scan_files_v1.0.py
Verze: 1.0
Datum: 2026-06-03
Autor: vladimir.buzalka
Popis: Rekurzivni sken Dropbox slozek dvou studii a zapis metadat
vsech souboru do MongoDB (db: soubory, kolekce = nazev studie).
- cesty k Dropboxu se zjisti pres Knihovny.najdi_dropbox
- pro kazdy soubor: stat, sha256, mime (podle pripony),
parsing data v nazvu (12JAN2026, 2026-01-12, 12-01-2026 ...)
- inkrementalni: pokud size+mtime souhlasi se zaznamem v DB,
sha256 se nepocita znovu (jen se aktualizuje last_seen_at)
- smazane soubory dostanou deleted_at pri behu, ve kterem
uz nebyly videny
- vynechavaji se: .dropbox*, Thumbs.db, desktop.ini,
~$*.* (Office lock), .DS_Store a adresar .dropbox.cache
MongoDB: 192.168.1.76:27017, bez autentizace
DB: soubory
Kolekce: 42847922MDD3003, 77242113UCO3001 (extrahovano z rootu cesty)
==============================================================================
"""
from __future__ import annotations
import hashlib
import mimetypes
import os
import re
import sys
import time
from datetime import datetime, timezone
from pathlib import Path
from pymongo import MongoClient, UpdateOne, ASCENDING
# --- prida Knihovny do path -------------------------------------------------
HERE = Path(__file__).resolve().parent
sys.path.insert(0, str(HERE.parent))
from Knihovny.najdi_dropbox import get_dropbox_root # noqa: E402
# --- configuration ----------------------------------------------------------
MONGO_URI = "mongodb://192.168.1.76:27017"
DB_NAME = "soubory"
# Study code (also the MongoDB collection name) -> folder name under the
# Dropbox root.
STUDIES = {
    "42847922MDD3003": "!!42847922MDD3003",
    "77242113UCO3001": "!77242113UCO3001",
}
# File names matching any of these patterns are not scanned (see should_skip).
SKIP_NAME_PATTERNS = [
    re.compile(r"^\.dropbox.*", re.IGNORECASE),
    re.compile(r"^Thumbs\.db$", re.IGNORECASE),
    re.compile(r"^desktop\.ini$", re.IGNORECASE),
    re.compile(r"^~\$.*", re.IGNORECASE),
    re.compile(r"^\.DS_Store$", re.IGNORECASE),
]
# Directory names pruned from the os.walk() traversal in scan_study().
SKIP_DIR_NAMES = {".dropbox.cache"}
# Read size used by sha256_of() when hashing file content.
HASH_CHUNK = 1024 * 1024  # 1 MiB
# --- parsing dates in file names --------------------------------------------
# Three-letter English month abbreviation (upper case) -> month number.
MONTHS = {
    "JAN": 1, "FEB": 2, "MAR": 3, "APR": 4, "MAY": 5, "JUN": 6,
    "JUL": 7, "AUG": 8, "SEP": 9, "OCT": 10, "NOV": 11, "DEC": 12,
}

# (pattern, layout) pairs; the layout tells extract_dates how to read the groups.
DATE_PATTERNS = [
    # 12JAN2026 / 12Jan2026
    (re.compile(r"(\d{1,2})([A-Za-z]{3})(\d{4})"), "dmonth"),
    # 2026-01-12 / 2026_01_12 / 2026.01.12
    (re.compile(r"(20\d{2})[-_.](\d{1,2})[-_.](\d{1,2})"), "ymd"),
    # 12-01-2026 / 12_01_2026 / 12.01.2026
    (re.compile(r"(\d{1,2})[-_.](\d{1,2})[-_.](20\d{2})"), "dmy"),
]


def extract_dates(name: str) -> list[str]:
    """Return the unique ISO dates (YYYY-MM-DD) found in *name*, sorted.

    Matches that do not form a real calendar date, or whose month
    abbreviation is unknown, are ignored.
    """
    iso_dates: set[str] = set()
    for pattern, layout in DATE_PATTERNS:
        for match in pattern.finditer(name):
            first, second, third = match.groups()
            try:
                if layout == "dmonth":
                    day = int(first)
                    month = MONTHS.get(second.upper())
                    year = int(third)
                    if not month:
                        continue
                elif layout == "ymd":
                    year, month, day = int(first), int(second), int(third)
                else:  # dmy
                    day, month, year = int(first), int(second), int(third)
                # Constructing the datetime validates the calendar date.
                datetime(year, month, day)
            except ValueError:
                continue
            iso_dates.add(f"{year:04d}-{month:02d}-{day:02d}")
    return sorted(iso_dates)
# Maximal runs of letters and digits. [^\W_] is Unicode-aware, so accented
# letters stay inside their word instead of splitting it.
TOKEN_RX = re.compile(r"[^\W_]+")


def tokenize(name: str) -> list[str]:
    """Split *name* into lower-cased alphanumeric tokens.

    Every other character, including ``_``, acts as a separator. Pure-ASCII
    names give the same tokens as an ASCII-only ``[A-Za-z0-9]+`` split.
    """
    return [t.lower() for t in TOKEN_RX.findall(name)]
def should_skip(name: str) -> bool:
    """Return True when *name* matches one of ``SKIP_NAME_PATTERNS``."""
    for pattern in SKIP_NAME_PATTERNS:
        if pattern.match(name):
            return True
    return False
def sha256_of(path: Path) -> str:
    """Return the hex SHA-256 digest of the file at *path*.

    The file is read in ``HASH_CHUNK``-sized pieces.
    """
    digest = hashlib.sha256()
    with path.open("rb") as stream:
        # iter() with a sentinel stops at the first empty read (end of file).
        for piece in iter(lambda: stream.read(HASH_CHUNK), b""):
            digest.update(piece)
    return digest.hexdigest()
def to_dt(ts: float) -> datetime:
    """Convert a POSIX timestamp to a timezone-aware UTC datetime."""
    utc = timezone.utc
    return datetime.fromtimestamp(ts, utc)
def _mtime_key(value):
    """Normalise a datetime so an mtime compares equal to its MongoDB round-trip.

    MongoDB stores datetimes with millisecond precision, and the client
    created in main() (no ``tz_aware`` option) returns them as naive UTC
    values. A freshly read mtime is timezone-aware with microseconds, so the
    two never compare equal unless both are brought to the same form:
    timezone-aware UTC, truncated to whole milliseconds.
    """
    if value is None:
        return None
    if value.tzinfo is None:
        value = value.replace(tzinfo=timezone.utc)
    return value.replace(microsecond=value.microsecond // 1000 * 1000)


def scan_study(study_code: str, study_root: Path, db, scan_started_at: datetime) -> dict:
    """Scan one study folder and synchronise its file metadata into MongoDB.

    Files whose size and mtime match the stored record (and that already have
    a sha256) only get ``last_seen_at`` refreshed. All other files are hashed
    and upserted with their full metadata. Afterwards every record not seen
    in this run is marked with ``deleted_at``.

    Args:
        study_code: study code, used as the collection name.
        study_root: root folder of the study.
        db: MongoDB database object.
        scan_started_at: timestamp of this run, written to ``last_seen_at``.

    Returns:
        Dict with ``study``, ``seen``, ``rehashed``, ``unchanged``,
        ``skipped``, ``marked_deleted`` and ``errors`` (list of
        ``(path, message)`` tuples).
    """
    coll = db[study_code]
    coll.create_index([("path", ASCENDING)], unique=True)
    coll.create_index([("ext", ASCENDING)])
    coll.create_index([("dates_in_name", ASCENDING)])
    coll.create_index([("tokens", ASCENDING)])
    coll.create_index([("sha256", ASCENDING)])

    # Existing records: path -> (size, normalised mtime, sha256).
    existing = {
        d["path"]: (d.get("size_bytes"), _mtime_key(d.get("mtime")), d.get("sha256"))
        for d in coll.find({}, {"path": 1, "size_bytes": 1, "mtime": 1, "sha256": 1})
    }

    ops: list[UpdateOne] = []
    seen = 0
    rehashed = 0
    skipped = 0
    errors: list[tuple[str, str]] = []

    print(f"[{study_code}] sken: {study_root}")
    for root, dirs, files in os.walk(study_root):
        # Prune skipped directories in place so os.walk does not descend into them.
        dirs[:] = [d for d in dirs if d not in SKIP_DIR_NAMES]
        for fname in files:
            if should_skip(fname):
                skipped += 1
                continue
            fpath = Path(root) / fname
            try:
                st = fpath.stat()
            except OSError as e:
                errors.append((str(fpath), f"stat: {e}"))
                continue

            path_str = str(fpath)
            size = st.st_size
            mtime = to_dt(st.st_mtime)
            prev = existing.get(path_str)

            if prev and prev[0] == size and prev[1] == _mtime_key(mtime) and prev[2]:
                # Unchanged: only refresh last_seen_at and clear deleted_at.
                ops.append(UpdateOne(
                    {"path": path_str},
                    {"$set": {"last_seen_at": scan_started_at},
                     "$unset": {"deleted_at": ""}},
                ))
            else:
                try:
                    digest = sha256_of(fpath)
                except OSError as e:
                    errors.append((path_str, f"hash: {e}"))
                    continue
                rehashed += 1
                rel = fpath.relative_to(study_root)
                doc = {
                    "path": path_str,
                    "study": study_code,
                    "rel_path": str(rel),
                    "dir": str(fpath.parent),
                    "rel_dir": str(rel.parent) if str(rel.parent) != "." else "",
                    "parent_folders": list(rel.parts[:-1]),
                    "name": fname,
                    "stem": fpath.stem,
                    "ext": fpath.suffix.lower().lstrip("."),
                    "size_bytes": size,
                    "mtime": mtime,
                    "ctime": to_dt(st.st_ctime),
                    "atime": to_dt(st.st_atime),
                    "sha256": digest,
                    "mime": mimetypes.guess_type(fname)[0],
                    "tokens": tokenize(fpath.stem),
                    "dates_in_name": extract_dates(fname),
                    "last_seen_at": scan_started_at,
                }
                ops.append(UpdateOne(
                    {"path": path_str},
                    {"$set": doc, "$unset": {"deleted_at": ""},
                     "$setOnInsert": {"first_seen_at": scan_started_at}},
                    upsert=True,
                ))
            seen += 1

            if len(ops) >= 500:
                coll.bulk_write(ops, ordered=False)
                ops.clear()
                print(f" ... {seen} souboru zpracovano")

    if ops:
        coll.bulk_write(ops, ordered=False)

    # Mark records that were not seen in this run as deleted.
    res = coll.update_many(
        {"last_seen_at": {"$lt": scan_started_at}, "deleted_at": {"$exists": False}},
        {"$set": {"deleted_at": scan_started_at}},
    )

    return {
        "study": study_code,
        "seen": seen,
        "rehashed": rehashed,
        "unchanged": seen - rehashed,
        "skipped": skipped,
        "marked_deleted": res.modified_count,
        "errors": errors,
    }
def main() -> int:
    """Scan every configured study folder and print a per-study summary.

    Study folders missing under the Dropbox root are reported and skipped.
    All studies share one ``scan_started_at`` timestamp. The MongoDB client
    is closed on every path, including failures.

    Returns:
        0 on completion; failures propagate as exceptions.
    """
    t0 = time.time()
    dropbox_root = Path(get_dropbox_root())
    print(f"Dropbox root: {dropbox_root}")

    client = MongoClient(MONGO_URI, serverSelectionTimeoutMS=5000)
    try:
        client.admin.command("ping")
        db = client[DB_NAME]

        scan_started_at = datetime.now(tz=timezone.utc)
        results = []
        for study_code, folder in STUDIES.items():
            study_root = dropbox_root / folder
            if not study_root.is_dir():
                print(f"[!] {study_root} neexistuje, preskakuji")
                continue
            results.append(scan_study(study_code, study_root, db, scan_started_at))
    finally:
        # Release the client's connections even when a scan fails midway.
        client.close()

    print("\n=== SHRNUTI ===")
    for r in results:
        print(f" {r['study']}: seen={r['seen']} rehashed={r['rehashed']} "
              f"unchanged={r['unchanged']} skipped={r['skipped']} "
              f"deleted={r['marked_deleted']} errors={len(r['errors'])}")
        # Show at most the first five errors per study.
        for path, err in r["errors"][:5]:
            print(f" ! {err} ({path})")
        if len(r["errors"]) > 5:
            print(f" ... +{len(r['errors']) - 5} dalsich chyb")
    print(f"\nCelkem trvalo: {time.time() - t0:.1f} s")
    return 0
# Script entry point: the return value of main() becomes the exit status.
if __name__ == "__main__":
    sys.exit(main())