z230
This commit is contained in:
@@ -0,0 +1,55 @@
|
||||
"Protocol","Study Population","Country","Site","Principal Investigator","Participant ID","Baseline Stool Frequency","Visit","Visit Date","Endoscopy Completed?","Endoscopy Date","Bowel Preparation Start Date 1","Bowel Preparation End Date 1","Bowel Preparation Start Date 2","Bowel Preparation End Date 2","Central Endoscopy Score","Local Endoscopy Score","PGA Score","Eligible Day (-1)","Day (-1) Excluded Reason(s)","Eligible Day (-2)","Day (-2) Excluded Reason(s)","Eligible Day (-3)","Day (-3) Excluded Reason(s)","Eligible Day (-4)","Day (-4) Excluded Reason(s)","Eligible Day (-5)","Day (-5) Excluded Reason(s)","Eligible Day (-6)","Day (-6) Excluded Reason(s)","Eligible Day (-7)","Day (-7) Excluded Reason(s)","Eligible Day (-8)","Day (-8) Excluded Reason(s)","Eligible Day (-9)","Day (-9) Excluded Reason(s)","Eligible Day (-10)","Day (-10) Excluded Reason(s)","Eligible Day (-1) Stool Count","Eligible Day (-2) Stool Count","Eligible Day (-3) Stool Count","Eligible Day (-4) Stool Count","Eligible Day (-5) Stool Count","Eligible Day (-6) Stool Count","Eligible Day (-7) Stool Count","Eligible Day (-8) Stool Count","Eligible Day (-9) Stool Count","Eligible Day (-10) Stool Count","Stool Frequency Sub-score","Eligible Day (-1) Rectal Bleeding Score","Eligible Day (-2) Rectal Bleeding Score","Eligible Day (-3) Rectal Bleeding Score","Eligible Day (-4) Rectal Bleeding Score","Eligible Day (-5) Rectal Bleeding Score","Eligible Day (-6) Rectal Bleeding Score","Eligible Day (-7) Rectal Bleeding Score","Eligible Day (-8) Rectal Bleeding Score","Eligible Day (-9) Rectal Bleeding Score","Eligible Day (-10) Rectal Bleeding Score","Rectal Bleeding Sub-score","Partial Mayo Score","Modified Mayo Score","Full Mayo Score","Site Action","Last Mayo Score Submission","Week I-12 Clinical Responder","Week I-12 Clinical Remission","Clinical Flare","Loss of Response","Partial Mayo Response Post Loss of Response","Partial Mayo Response for Clinical Non-Responders"
|
||||
"77242113UCO3001","Adult","Czech Republic","DD5-CZ10001","Matej Falc","CZ100012001","1","I-0","19 Feb 2026","Yes","05 Feb 2026","04 Feb 2026","04 Feb 2026","-","-","2","-","3","18 Feb 2026","-","17 Feb 2026","-","16 Feb 2026","-","15 Feb 2026","-","14 Feb 2026","-","13 Feb 2026","-","12 Feb 2026","-","11 Feb 2026","Day Not Applicable for Calculation","10 Feb 2026","Day Not Applicable for Calculation","09 Feb 2026","Day Not Applicable for Calculation","10","8","7","5","7","8","8","-","-","-","3","1","1","1","0","1","1","1","-","-","-","1","7","6","9","-","08 Apr 2026 07:11:25","N/A","N/A","N/A","N/A","N/A","N/A"
|
||||
"77242113UCO3001","Adult","Czech Republic","DD5-CZ10001","Matej Falc","CZ100012001","1","I-2","04 Mar 2026","-","-","-","-","-","-","-","-","3","03 Mar 2026","-","02 Mar 2026","-","01 Mar 2026","-","28 Feb 2026","-","27 Feb 2026","-","26 Feb 2026","-","25 Feb 2026","-","24 Feb 2026","Day Not Applicable for Calculation","23 Feb 2026","Day Not Applicable for Calculation","22 Feb 2026","Day Not Applicable for Calculation","5","4","5","4","5","6","6","-","-","-","2","1","0","1","0","1","0","1","-","-","-","1","6","","","-","28 May 2026 10:04:05","N/A","N/A","N/A","N/A","N/A","N/A"
|
||||
"77242113UCO3001","Adult","Czech Republic","DD5-CZ10001","Matej Falc","CZ100012001","1","I-4","18 Mar 2026","-","-","-","-","-","-","-","-","2","17 Mar 2026","-","16 Mar 2026","-","15 Mar 2026","-","14 Mar 2026","-","13 Mar 2026","-","12 Mar 2026","-","11 Mar 2026","-","10 Mar 2026","Day Not Applicable for Calculation","09 Mar 2026","Day Not Applicable for Calculation","08 Mar 2026","Day Not Applicable for Calculation","5","5","5","4","5","4","5","-","-","-","2","1","0","0","1","1","1","0","-","-","-","1","5","","","-","08 Apr 2026 11:04:49","N/A","N/A","N/A","N/A","N/A","N/A"
|
||||
"77242113UCO3001","Adult","Czech Republic","DD5-CZ10001","Matej Falc","CZ100012001","1","I-8","05 May 2026","-","-","-","-","-","-","-","-","1","04 May 2026","-","03 May 2026","-","02 May 2026","-","01 May 2026","-","30 Apr 2026","-","29 Apr 2026","-","28 Apr 2026","-","27 Apr 2026","Day Not Applicable for Calculation","26 Apr 2026","Day Not Applicable for Calculation","25 Apr 2026","Day Not Applicable for Calculation","3","3","4","4","5","4","4","-","-","-","2","1","1","1","1","1","1","1","-","-","-","1","4","","","-","28 May 2026 14:42:53","N/A","N/A","N/A","N/A","N/A","N/A"
|
||||
"77242113UCO3001","Adult","Czech Republic","DD5-CZ10001","Matej Falc","CZ100012001","1","I-12","13 May 2026","Yes","06 May 2026","05 May 2026","05 May 2026","-","-","1","-","1","12 May 2026","-","11 May 2026","-","10 May 2026","-","09 May 2026","-","08 May 2026","-","07 May 2026","-","06 May 2026","Endoscopy","05 May 2026","Bowel Preparation for Procedure;Day Not Applicable for Calculation","04 May 2026","-","03 May 2026","Day Not Applicable for Calculation","5","4","6","5","5","5","-","-","3","-","2","1","0","1","1","1","1","-","-","1","-","1","4","4","5","-","28 May 2026 14:43:11","Clinical Responder","No","N/A","N/A","N/A","N/A"
|
||||
"77242113UCO3001","Adult","Czech Republic","DD5-CZ10001","Matej Falc","CZ100012001","1","M-4","10 Jun 2026","-","-","-","-","-","-","-","-","1","09 Jun 2026","-","08 Jun 2026","-","07 Jun 2026","-","06 Jun 2026","-","05 Jun 2026","-","04 Jun 2026","-","03 Jun 2026","-","02 Jun 2026","Day Not Applicable for Calculation","01 Jun 2026","Day Not Applicable for Calculation","31 May 2026","Day Not Applicable for Calculation","4","5","3","4","5","4","5","-","-","-","2","0","0","0","0","1","0","1","-","-","-","0","3","","","-","10 Jun 2026 07:15:50","N/A","N/A","No","N/A","N/A","N/A"
|
||||
"77242113UCO3001","Adult","Czech Republic","DD5-CZ10001","Matej Falc","CZ100012002","1","I-0","08 Apr 2026","Yes","18 Mar 2026","17 Mar 2026","18 Mar 2026","-","-","2","-","2","07 Apr 2026","-","06 Apr 2026","-","05 Apr 2026","-","04 Apr 2026","Missing Diary","03 Apr 2026","-","02 Apr 2026","-","01 Apr 2026","-","31 Mar 2026","Day Not Applicable for Calculation","30 Mar 2026","Day Not Applicable for Calculation","29 Mar 2026","Day Not Applicable for Calculation","3","3","4","-","3","3","4","-","-","-","1","0","0","0","-","0","0","1","-","-","-","0","3","3","5","-","10 Jun 2026 08:42:08","N/A","N/A","N/A","N/A","N/A","N/A"
|
||||
"77242113UCO3001","Adult","Czech Republic","DD5-CZ10001","Matej Falc","CZ100012002","1","I-2","23 Apr 2026","-","-","-","-","-","-","-","-","2","22 Apr 2026","Missing Diary","21 Apr 2026","-","20 Apr 2026","-","19 Apr 2026","-","18 Apr 2026","-","17 Apr 2026","-","16 Apr 2026","-","15 Apr 2026","Day Not Applicable for Calculation","14 Apr 2026","Day Not Applicable for Calculation","13 Apr 2026","Day Not Applicable for Calculation","-","3","3","6","5","5","4","-","-","-","2","-","0","0","1","1","1","1","-","-","-","1","5","","","-","10 Jun 2026 08:42:33","N/A","N/A","N/A","N/A","N/A","N/A"
|
||||
"77242113UCO3001","Adult","Czech Republic","DD5-CZ10001","Matej Falc","CZ100012002","1","I-4","06 May 2026","-","-","-","-","-","-","-","-","1","05 May 2026","-","04 May 2026","-","03 May 2026","-","02 May 2026","-","01 May 2026","-","30 Apr 2026","-","29 Apr 2026","-","28 Apr 2026","Day Not Applicable for Calculation","27 Apr 2026","Day Not Applicable for Calculation","26 Apr 2026","Day Not Applicable for Calculation","6","3","2","3","3","3","3","-","-","-","1","1","0","0","0","1","1","0","-","-","-","0","2","","","-","04 Jun 2026 07:39:06","N/A","N/A","N/A","N/A","N/A","N/A"
|
||||
"77242113UCO3001","Adult","Czech Republic","DD5-CZ10001","Matej Falc","CZ100012002","1","I-8","04 Jun 2026","-","-","-","-","-","-","-","-","1","03 Jun 2026","-","02 Jun 2026","-","01 Jun 2026","-","31 May 2026","-","30 May 2026","-","29 May 2026","-","28 May 2026","-","27 May 2026","Day Not Applicable for Calculation","26 May 2026","Day Not Applicable for Calculation","25 May 2026","Day Not Applicable for Calculation","3","4","3","3","3","3","4","-","-","-","1","0","0","0","0","0","0","1","-","-","-","0","2","","","-","-","N/A","N/A","N/A","N/A","N/A","N/A"
|
||||
"77242113UCO3001","Adult","Czech Republic","DD5-CZ10001","Matej Falc","CZ100012003","1","I-0","27 May 2026","Yes","13 May 2026","12 May 2026","12 May 2026","-","-","3","-","2","26 May 2026","-","25 May 2026","-","24 May 2026","-","23 May 2026","-","22 May 2026","-","21 May 2026","-","20 May 2026","-","19 May 2026","Day Not Applicable for Calculation","18 May 2026","Day Not Applicable for Calculation","17 May 2026","Day Not Applicable for Calculation","6","9","7","8","9","7","8","-","-","-","3","2","2","2","2","1","1","1","-","-","-","2","7","8","10","-","27 May 2026 07:24:39","N/A","N/A","N/A","N/A","N/A","N/A"
|
||||
"77242113UCO3001","Adult","Czech Republic","DD5-CZ10001","Matej Falc","CZ100012003","1","I-2","10 Jun 2026","-","-","-","-","-","-","-","-","2","09 Jun 2026","-","08 Jun 2026","-","07 Jun 2026","-","06 Jun 2026","-","05 Jun 2026","-","04 Jun 2026","-","03 Jun 2026","-","02 Jun 2026","Day Not Applicable for Calculation","01 Jun 2026","Day Not Applicable for Calculation","31 May 2026","Day Not Applicable for Calculation","7","8","8","7","6","8","6","-","-","-","3","2","2","1","2","2","2","1","-","-","-","2","7","","","-","10 Jun 2026 07:30:18","N/A","N/A","N/A","N/A","N/A","N/A"
|
||||
"77242113UCO3001","Adult","Czech Republic","DD5-CZ10003","Leksa Vaclav","CZ100032001","2","I-0","10 Jun 2026","Yes","27 May 2026","26 May 2026","26 May 2026","-","-","2","-","2","09 Jun 2026","-","08 Jun 2026","-","07 Jun 2026","-","06 Jun 2026","-","05 Jun 2026","-","04 Jun 2026","-","03 Jun 2026","-","02 Jun 2026","Day Not Applicable for Calculation","01 Jun 2026","Day Not Applicable for Calculation","31 May 2026","Day Not Applicable for Calculation","4","4","4","4","5","4","5","-","-","-","1","2","2","2","2","2","2","2","-","-","-","2","5","5","7","-","10 Jun 2026 08:48:09","N/A","N/A","N/A","N/A","N/A","N/A"
|
||||
"77242113UCO3001","Adult","Czech Republic","DD5-CZ10006","Michal Konecny","CZ100062001","1","I-0","20 Mar 2026","Yes","19 Feb 2026","-","-","-","-","3","-","3","19 Mar 2026","-","18 Mar 2026","-","17 Mar 2026","-","16 Mar 2026","-","15 Mar 2026","-","14 Mar 2026","-","13 Mar 2026","-","12 Mar 2026","Day Not Applicable for Calculation","11 Mar 2026","Day Not Applicable for Calculation","10 Mar 2026","Day Not Applicable for Calculation","7","7","8","8","7","8","5","-","-","-","3","2","1","1","1","1","1","0","-","-","-","1","7","7","10","-","20 Mar 2026 07:02:44","N/A","N/A","N/A","N/A","N/A","N/A"
|
||||
"77242113UCO3001","Adult","Czech Republic","DD5-CZ10006","Michal Konecny","CZ100062001","1","I-2","08 Apr 2026","-","-","-","-","-","-","-","-","2","07 Apr 2026","Medication For Diarrhea","06 Apr 2026","Medication For Diarrhea","05 Apr 2026","Medication For Diarrhea","04 Apr 2026","Medication For Diarrhea","03 Apr 2026","Medication For Diarrhea","02 Apr 2026","Medication For Diarrhea","01 Apr 2026","Medication For Diarrhea","31 Mar 2026","Medication For Diarrhea;Day Not Applicable for Calculation","30 Mar 2026","Medication For Diarrhea;Day Not Applicable for Calculation","29 Mar 2026","Day Not Applicable for Calculation","-","-","-","-","-","-","-","-","-","-","Non-Evaluable","-","-","-","-","-","-","-","-","-","-","Non-Evaluable","Non-Evaluable","Non-Evaluable","Non-Evaluable","-","-","N/A","N/A","N/A","N/A","N/A","N/A"
|
||||
"77242113UCO3001","Adult","Czech Republic","DD5-CZ10006","Michal Konecny","CZ100062001","1","I-4","15 Apr 2026","-","-","-","-","-","-","-","-","3","14 Apr 2026","-","13 Apr 2026","-","12 Apr 2026","-","11 Apr 2026","-","10 Apr 2026","-","09 Apr 2026","-","08 Apr 2026","-","07 Apr 2026","Medication For Diarrhea;Day Not Applicable for Calculation","06 Apr 2026","Medication For Diarrhea;Day Not Applicable for Calculation","05 Apr 2026","Medication For Diarrhea;Day Not Applicable for Calculation","9","22","20","19","17","18","18","-","-","-","3","1","3","2","2","2","2","2","-","-","-","2","8","","","-","04 May 2026 22:06:03","N/A","N/A","N/A","N/A","N/A","N/A"
|
||||
"77242113UCO3001","Adult","Czech Republic","DD5-CZ10006","Michal Konecny","CZ100062001","1","I-8","18 May 2026","-","-","-","-","-","-","-","-","2","17 May 2026","-","16 May 2026","-","15 May 2026","-","14 May 2026","-","13 May 2026","-","12 May 2026","-","11 May 2026","-","10 May 2026","Day Not Applicable for Calculation","09 May 2026","Day Not Applicable for Calculation","08 May 2026","Day Not Applicable for Calculation","7","5","9","7","7","8","8","-","-","-","3","1","1","1","1","1","1","1","-","-","-","1","6","","","-","04 Jun 2026 21:46:30","N/A","N/A","N/A","N/A","N/A","N/A"
|
||||
"77242113UCO3001","Adult","Czech Republic","DD5-CZ10006","Michal Konecny","CZ100062001","1","I-12","08 Jun 2026","Yes","28 May 2026","-","-","-","-","3","-","3","07 Jun 2026","-","06 Jun 2026","-","05 Jun 2026","-","04 Jun 2026","-","03 Jun 2026","-","02 Jun 2026","-","01 Jun 2026","Missing Diary","31 May 2026","Day Not Applicable for Calculation","30 May 2026","Day Not Applicable for Calculation","29 May 2026","Day Not Applicable for Calculation","6","5","5","5","7","6","-","-","-","-","3","1","1","0","0","1","0","-","-","-","-","1","7","7","10","-","11 Jun 2026 22:12:05","Clinical Nonresponder","No","N/A","N/A","N/A","N/A"
|
||||
"77242113UCO3001","Adult","Czech Republic","DD5-CZ10006","Michal Konecny","CZ100062002","1","I-0","26 May 2026","Yes","14 May 2026","13 May 2026","13 May 2026","-","-","2","-","2","25 May 2026","-","24 May 2026","-","23 May 2026","-","22 May 2026","-","21 May 2026","-","20 May 2026","-","19 May 2026","-","18 May 2026","Day Not Applicable for Calculation","17 May 2026","Day Not Applicable for Calculation","16 May 2026","Day Not Applicable for Calculation","8","8","6","7","7","6","7","-","-","-","3","2","2","2","2","2","2","2","-","-","-","2","7","7","9","-","29 May 2026 15:45:00","N/A","N/A","N/A","N/A","N/A","N/A"
|
||||
"77242113UCO3001","Adult","Czech Republic","DD5-CZ10006","Michal Konecny","CZ100062002","1","I-2","09 Jun 2026","-","-","-","-","-","-","-","-","2","08 Jun 2026","-","07 Jun 2026","-","06 Jun 2026","-","05 Jun 2026","-","04 Jun 2026","-","03 Jun 2026","-","02 Jun 2026","-","01 Jun 2026","Day Not Applicable for Calculation","31 May 2026","Day Not Applicable for Calculation","30 May 2026","Day Not Applicable for Calculation","7","8","7","7","7","5","7","-","-","-","3","2","1","1","1","2","2","2","-","-","-","2","7","","","-","11 Jun 2026 22:12:40","N/A","N/A","N/A","N/A","N/A","N/A"
|
||||
"77242113UCO3001","Adult","Czech Republic","DD5-CZ10009","Jiri Pumprla","CZ100092001","1","I-0","05 May 2026","Yes","24 Apr 2026","23 Apr 2026","23 Apr 2026","-","-","2","-","2","04 May 2026","-","03 May 2026","-","02 May 2026","-","01 May 2026","-","30 Apr 2026","-","29 Apr 2026","-","28 Apr 2026","-","27 Apr 2026","Day Not Applicable for Calculation","26 Apr 2026","Day Not Applicable for Calculation","25 Apr 2026","Day Not Applicable for Calculation","5","5","5","5","5","5","5","-","-","-","2","1","1","1","1","1","1","1","-","-","-","1","5","5","7","-","05 May 2026 11:19:40","N/A","N/A","N/A","N/A","N/A","N/A"
|
||||
"77242113UCO3001","Adult","Czech Republic","DD5-CZ10009","Jiri Pumprla","CZ100092001","1","I-2","19 May 2026","-","-","-","-","-","-","-","-","1","18 May 2026","-","17 May 2026","-","16 May 2026","-","15 May 2026","-","14 May 2026","-","13 May 2026","-","12 May 2026","-","11 May 2026","Day Not Applicable for Calculation","10 May 2026","Day Not Applicable for Calculation","09 May 2026","Day Not Applicable for Calculation","5","4","5","5","5","4","6","-","-","-","2","1","1","1","1","1","1","1","-","-","-","1","4","","","-","19 May 2026 10:38:25","N/A","N/A","N/A","N/A","N/A","N/A"
|
||||
"77242113UCO3001","Adult","Czech Republic","DD5-CZ10009","Jiri Pumprla","CZ100092001","1","I-4","04 Jun 2026","-","-","-","-","-","-","-","-","1","03 Jun 2026","-","02 Jun 2026","-","01 Jun 2026","-","31 May 2026","-","30 May 2026","-","29 May 2026","-","28 May 2026","-","27 May 2026","Day Not Applicable for Calculation","26 May 2026","Day Not Applicable for Calculation","25 May 2026","Day Not Applicable for Calculation","2","3","2","3","3","2","3","-","-","-","1","0","0","0","0","0","0","0","-","-","-","0","2","","","-","04 Jun 2026 09:24:54","N/A","N/A","N/A","N/A","N/A","N/A"
|
||||
"77242113UCO3001","Adult","Czech Republic","DD5-CZ10012","Stefan Konecny","CZ100122001","5","I-0","07 Apr 2026","Yes","24 Mar 2026","22 Mar 2026","22 Mar 2026","-","-","2","-","2","06 Apr 2026","-","05 Apr 2026","-","04 Apr 2026","-","03 Apr 2026","-","02 Apr 2026","-","01 Apr 2026","-","31 Mar 2026","-","30 Mar 2026","Day Not Applicable for Calculation","29 Mar 2026","Day Not Applicable for Calculation","28 Mar 2026","Day Not Applicable for Calculation","8","11","5","9","11","10","13","-","-","-","3","1","2","2","2","2","2","2","-","-","-","2","7","7","9","-","04 May 2026 08:44:52","N/A","N/A","N/A","N/A","N/A","N/A"
|
||||
"77242113UCO3001","Adult","Czech Republic","DD5-CZ10012","Stefan Konecny","CZ100122001","5","I-2","22 Apr 2026","-","-","-","-","-","-","-","-","2","21 Apr 2026","-","20 Apr 2026","-","19 Apr 2026","-","18 Apr 2026","-","17 Apr 2026","-","16 Apr 2026","-","15 Apr 2026","-","14 Apr 2026","Day Not Applicable for Calculation","13 Apr 2026","Day Not Applicable for Calculation","12 Apr 2026","Day Not Applicable for Calculation","7","5","6","6","7","8","2","-","-","-","1","1","0","1","1","1","2","0","-","-","-","1","4","","","-","04 May 2026 08:45:07","N/A","N/A","N/A","N/A","N/A","N/A"
|
||||
"77242113UCO3001","Adult","Czech Republic","DD5-CZ10012","Stefan Konecny","CZ100122001","5","I-4","07 May 2026","-","-","-","-","-","-","-","-","1","06 May 2026","-","05 May 2026","-","04 May 2026","-","03 May 2026","-","02 May 2026","-","01 May 2026","-","30 Apr 2026","-","29 Apr 2026","Day Not Applicable for Calculation","28 Apr 2026","Day Not Applicable for Calculation","27 Apr 2026","Day Not Applicable for Calculation","8","7","7","8","4","11","7","-","-","-","1","2","1","1","1","0","1","1","-","-","-","1","3","","","-","01 Jun 2026 00:57:35","N/A","N/A","N/A","N/A","N/A","N/A"
|
||||
"77242113UCO3001","Adult","Czech Republic","DD5-CZ10012","Stefan Konecny","CZ100122001","5","I-8","03 Jun 2026","-","-","-","-","-","-","-","-","2","02 Jun 2026","-","01 Jun 2026","-","31 May 2026","-","30 May 2026","-","29 May 2026","-","28 May 2026","-","27 May 2026","-","26 May 2026","Day Not Applicable for Calculation","25 May 2026","Day Not Applicable for Calculation","24 May 2026","Day Not Applicable for Calculation","5","9","7","5","5","9","7","-","-","-","1","1","1","1","0","3","0","1","-","-","-","1","4","","","-","03 Jun 2026 17:47:25","N/A","N/A","N/A","N/A","N/A","N/A"
|
||||
"77242113UCO3001","Adult","Czech Republic","DD5-CZ10013","David Stepek","CZ100132001","1","I-0","24 Mar 2026","Yes","12 Mar 2026","11 Mar 2026","11 Mar 2026","-","-","2","-","2","23 Mar 2026","-","22 Mar 2026","-","21 Mar 2026","-","20 Mar 2026","-","19 Mar 2026","-","18 Mar 2026","-","17 Mar 2026","-","16 Mar 2026","Day Not Applicable for Calculation","15 Mar 2026","Day Not Applicable for Calculation","14 Mar 2026","Day Not Applicable for Calculation","8","6","5","7","6","7","6","-","-","-","3","1","1","1","0","1","1","1","-","-","-","1","6","6","8","-","05 Apr 2026 22:41:27","N/A","N/A","N/A","N/A","N/A","N/A"
|
||||
"77242113UCO3001","Adult","Czech Republic","DD5-CZ10013","David Stepek","CZ100132001","1","I-2","08 Apr 2026","-","-","-","-","-","-","-","-","2","07 Apr 2026","-","06 Apr 2026","-","05 Apr 2026","-","04 Apr 2026","-","03 Apr 2026","-","02 Apr 2026","-","01 Apr 2026","-","31 Mar 2026","Day Not Applicable for Calculation","30 Mar 2026","Day Not Applicable for Calculation","29 Mar 2026","Day Not Applicable for Calculation","5","2","3","6","5","5","5","-","-","-","2","0","0","0","0","1","1","0","-","-","-","0","4","","","-","28 May 2026 23:19:03","N/A","N/A","N/A","N/A","N/A","N/A"
|
||||
"77242113UCO3001","Adult","Czech Republic","DD5-CZ10013","David Stepek","CZ100132001","1","I-4","21 Apr 2026","-","-","-","-","-","-","-","-","0","20 Apr 2026","-","19 Apr 2026","-","18 Apr 2026","-","17 Apr 2026","-","16 Apr 2026","-","15 Apr 2026","-","14 Apr 2026","-","13 Apr 2026","Day Not Applicable for Calculation","12 Apr 2026","Day Not Applicable for Calculation","11 Apr 2026","Day Not Applicable for Calculation","4","3","4","3","3","4","4","-","-","-","2","0","0","0","0","0","0","0","-","-","-","0","2","","","-","27 May 2026 12:54:41","N/A","N/A","N/A","N/A","N/A","N/A"
|
||||
"77242113UCO3001","Adult","Czech Republic","DD5-CZ10013","David Stepek","CZ100132002","1","I-0","12 May 2026","Yes","21 Apr 2026","20 Apr 2026","21 Apr 2026","-","-","2","-","2","11 May 2026","-","10 May 2026","-","09 May 2026","-","08 May 2026","-","07 May 2026","-","06 May 2026","-","05 May 2026","Missing Diary","04 May 2026","Day Not Applicable for Calculation","03 May 2026","Day Not Applicable for Calculation","02 May 2026","Day Not Applicable for Calculation","2","1","1","1","1","2","-","-","-","-","0","0","0","0","0","0","0","-","-","-","-","0","2","2","4","-","28 May 2026 23:19:30","N/A","N/A","N/A","N/A","N/A","N/A"
|
||||
"77242113UCO3001","Adult","Czech Republic","DD5-CZ10013","David Stepek","CZ100132002","1","I-2","26 May 2026","-","-","-","-","-","-","-","-","1","25 May 2026","-","24 May 2026","Missing Diary","23 May 2026","-","22 May 2026","-","21 May 2026","-","20 May 2026","-","19 May 2026","-","18 May 2026","Missing Diary;Day Not Applicable for Calculation","17 May 2026","Day Not Applicable for Calculation","16 May 2026","Day Not Applicable for Calculation","1","-","1","2","1","2","2","-","-","-","1","0","-","0","0","0","0","0","-","-","-","0","2","","","-","28 May 2026 23:19:51","N/A","N/A","N/A","N/A","N/A","N/A"
|
||||
"77242113UCO3001","Adult","Czech Republic","DD5-CZ10013","David Stepek","CZ100132002","1","I-4","10 Jun 2026","-","-","-","-","-","-","-","-","2","09 Jun 2026","-","08 Jun 2026","Missing Diary","07 Jun 2026","-","06 Jun 2026","-","05 Jun 2026","-","04 Jun 2026","-","03 Jun 2026","-","02 Jun 2026","Missing Diary;Day Not Applicable for Calculation","01 Jun 2026","Day Not Applicable for Calculation","31 May 2026","Day Not Applicable for Calculation","4","-","1","1","2","2","1","-","-","-","1","0","-","0","0","0","0","0","-","-","-","0","3","","","-","-","N/A","N/A","N/A","N/A","N/A","N/A"
|
||||
"77242113UCO3001","Adult","Czech Republic","DD5-CZ10013","David Stepek","CZ100132003","1","I-0","02 Jun 2026","Yes","25 May 2026","24 May 2026","24 May 2026","-","-","2","-","2","01 Jun 2026","-","31 May 2026","-","30 May 2026","-","29 May 2026","-","28 May 2026","-","27 May 2026","-","26 May 2026","-","25 May 2026","Endoscopy;Missing Diary;Day Not Applicable for Calculation","24 May 2026","Bowel Preparation for Procedure;Missing Diary;Day Not Applicable for Calculation","23 May 2026","Missing Diary;Day Not Applicable for Calculation","8","8","11","10","10","11","6","-","-","-","3","2","2","1","2","1","2","2","-","-","-","2","7","7","9","-","02 Jun 2026 08:17:40","N/A","N/A","N/A","N/A","N/A","N/A"
|
||||
"77242113UCO3001","Adult","Czech Republic","DD5-CZ10013","David Stepek","CZ100132003","1","I-2","10 Jun 2026","-","-","-","-","-","-","-","-","2","09 Jun 2026","-","08 Jun 2026","-","07 Jun 2026","-","06 Jun 2026","-","05 Jun 2026","-","04 Jun 2026","-","03 Jun 2026","-","02 Jun 2026","Day Not Applicable for Calculation","01 Jun 2026","Day Not Applicable for Calculation","31 May 2026","Day Not Applicable for Calculation","9","2","1","4","2","4","2","-","-","-","1","1","1","0","1","1","1","0","-","-","-","1","4","","","-","-","N/A","N/A","N/A","N/A","N/A","N/A"
|
||||
"77242113UCO3001","Adult","Czech Republic","DD5-CZ10016","Robert Mudr","CZ100162001","1","I-0","28 May 2026","Yes","19 May 2026","18 May 2026","19 May 2026","-","-","3","-","3","27 May 2026","-","26 May 2026","-","25 May 2026","-","24 May 2026","-","23 May 2026","-","22 May 2026","-","21 May 2026","-","20 May 2026","Day Not Applicable for Calculation","19 May 2026","Endoscopy;Bowel Preparation for Procedure;Day Not Applicable for Calculation","18 May 2026","Bowel Preparation for Procedure;Day Not Applicable for Calculation","14","15","15","15","15","15","15","-","-","-","3","2","3","3","2","2","3","3","-","-","-","3","9","9","12","-","28 May 2026 10:22:48","N/A","N/A","N/A","N/A","N/A","N/A"
|
||||
"77242113UCO3001","Adult","Czech Republic","DD5-CZ10016","Robert Mudr","CZ100162001","1","I-2","11 Jun 2026","-","-","-","-","-","-","-","-","3","10 Jun 2026","-","09 Jun 2026","-","08 Jun 2026","-","07 Jun 2026","-","06 Jun 2026","-","05 Jun 2026","-","04 Jun 2026","-","03 Jun 2026","Day Not Applicable for Calculation","02 Jun 2026","Day Not Applicable for Calculation","01 Jun 2026","Day Not Applicable for Calculation","10","9","9","8","13","9","8","-","-","-","3","2","1","1","1","2","1","1","-","-","-","1","7","","","-","-","N/A","N/A","N/A","N/A","N/A","N/A"
|
||||
"77242113UCO3001","Adolescent","Czech Republic","DD5-CZ10020","Lucie Gonsorcikova","CZ100201001","1","Unscheduled 1","04 May 2026","Yes","20 Apr 2026","12 Apr 2026","15 Apr 2026","-","-","2","-","3","03 May 2026","-","02 May 2026","-","01 May 2026","-","30 Apr 2026","-","29 Apr 2026","-","28 Apr 2026","-","27 Apr 2026","-","26 Apr 2026","Day Not Applicable for Calculation","25 Apr 2026","Day Not Applicable for Calculation","24 Apr 2026","Day Not Applicable for Calculation","5","6","6","7","6","3","3","-","-","-","2","0","0","0","0","0","0","0","-","-","-","0","5","4","7","-","-","N/A","N/A","N/A","N/A","N/A","N/A"
|
||||
"77242113UCO3001","Adolescent","Czech Republic","DD5-CZ10020","Lucie Gonsorcikova","CZ100201001","1","I-0","18 May 2026","Yes","01 May 2026","01 May 2026","01 May 2026","-","-","2","-","3","17 May 2026","-","16 May 2026","-","15 May 2026","-","14 May 2026","-","13 May 2026","-","12 May 2026","-","11 May 2026","-","10 May 2026","Day Not Applicable for Calculation","09 May 2026","Day Not Applicable for Calculation","08 May 2026","Day Not Applicable for Calculation","6","6","6","6","6","6","6","-","-","-","3","0","0","0","0","0","0","0","-","-","-","0","6","5","8","-","18 May 2026 08:39:27","N/A","N/A","N/A","N/A","N/A","N/A"
|
||||
"77242113UCO3001","Adolescent","Czech Republic","DD5-CZ10020","Lucie Gonsorcikova","CZ100201001","1","I-2","01 Jun 2026","-","-","-","-","-","-","-","-","3","31 May 2026","-","30 May 2026","Missing Diary","29 May 2026","Missing Diary","28 May 2026","Missing Diary","27 May 2026","-","26 May 2026","-","25 May 2026","-","24 May 2026","Day Not Applicable for Calculation","23 May 2026","Day Not Applicable for Calculation","22 May 2026","Day Not Applicable for Calculation","6","-","-","-","6","6","6","-","-","-","3","0","-","-","-","0","0","0","-","-","-","0","6","","","-","-","N/A","N/A","N/A","N/A","N/A","N/A"
|
||||
"77242113UCO3001","Adult","Czech Republic","DD5-CZ10021","Martin Bortlik","CZ100212001","1","I-0","07 Apr 2026","Yes","16 Mar 2026","15 Mar 2026","16 Mar 2026","-","-","3","-","3","06 Apr 2026","-","05 Apr 2026","-","04 Apr 2026","-","03 Apr 2026","-","02 Apr 2026","-","01 Apr 2026","-","31 Mar 2026","-","30 Mar 2026","Day Not Applicable for Calculation","29 Mar 2026","Day Not Applicable for Calculation","28 Mar 2026","Day Not Applicable for Calculation","11","11","10","11","11","10","9","-","-","-","3","2","2","2","2","2","2","2","-","-","-","2","8","8","11","-","20 Apr 2026 09:27:58","N/A","N/A","N/A","N/A","N/A","N/A"
|
||||
"77242113UCO3001","Adult","Czech Republic","DD5-CZ10021","Martin Bortlik","CZ100212001","1","I-2","20 Apr 2026","-","-","-","-","-","-","-","-","3","19 Apr 2026","-","18 Apr 2026","-","17 Apr 2026","-","16 Apr 2026","-","15 Apr 2026","-","14 Apr 2026","-","13 Apr 2026","-","12 Apr 2026","Day Not Applicable for Calculation","11 Apr 2026","Day Not Applicable for Calculation","10 Apr 2026","Day Not Applicable for Calculation","8","7","9","8","8","7","8","-","-","-","3","2","2","1","1","1","2","1","-","-","-","1","7","","","-","20 Apr 2026 09:29:01","N/A","N/A","N/A","N/A","N/A","N/A"
|
||||
"77242113UCO3001","Adult","Czech Republic","DD5-CZ10021","Martin Bortlik","CZ100212001","1","I-4","05 May 2026","-","-","-","-","-","-","-","-","1","04 May 2026","-","03 May 2026","-","02 May 2026","-","01 May 2026","-","30 Apr 2026","-","29 Apr 2026","-","28 Apr 2026","-","27 Apr 2026","Day Not Applicable for Calculation","26 Apr 2026","Day Not Applicable for Calculation","25 Apr 2026","Day Not Applicable for Calculation","6","6","6","6","7","7","6","-","-","-","3","0","0","1","1","1","1","1","-","-","-","1","5","","","-","-","N/A","N/A","N/A","N/A","N/A","N/A"
|
||||
"77242113UCO3001","Adult","Czech Republic","DD5-CZ10021","Martin Bortlik","CZ100212001","1","I-8","02 Jun 2026","-","-","-","-","-","-","-","-","1","01 Jun 2026","-","31 May 2026","-","30 May 2026","-","29 May 2026","-","28 May 2026","-","27 May 2026","-","26 May 2026","-","25 May 2026","Day Not Applicable for Calculation","24 May 2026","Day Not Applicable for Calculation","23 May 2026","Day Not Applicable for Calculation","3","4","4","4","5","5","5","-","-","-","2","0","0","0","0","0","1","1","-","-","-","0","3","","","-","02 Jun 2026 14:44:34","N/A","N/A","N/A","N/A","N/A","N/A"
|
||||
"77242113UCO3001","Adult","Czech Republic","DD5-CZ10022","Petr Hrabak","CZ100222002","1","I-0","19 Feb 2026","Yes","11 Feb 2026","10 Feb 2026","11 Feb 2026","-","-","2","-","2","18 Feb 2026","-","17 Feb 2026","-","16 Feb 2026","-","15 Feb 2026","-","14 Feb 2026","-","13 Feb 2026","-","12 Feb 2026","-","11 Feb 2026","Endoscopy;Bowel Preparation for Procedure;Day Not Applicable for Calculation","10 Feb 2026","Bowel Preparation for Procedure;Day Not Applicable for Calculation","09 Feb 2026","Day Not Applicable for Calculation","3","2","2","3","4","3","2","-","-","-","1","1","1","0","0","0","2","2","-","-","-","1","4","4","6","-","19 Feb 2026 15:24:43","N/A","N/A","N/A","N/A","N/A","N/A"
|
||||
"77242113UCO3001","Adult","Czech Republic","DD5-CZ10022","Petr Hrabak","CZ100222003","1","I-0","09 Mar 2026","Yes","11 Feb 2026","10 Feb 2026","11 Feb 2026","-","-","2","-","2","08 Mar 2026","-","07 Mar 2026","-","06 Mar 2026","-","05 Mar 2026","-","04 Mar 2026","-","03 Mar 2026","Missing Diary","02 Mar 2026","Missing Diary","01 Mar 2026","Missing Diary;Day Not Applicable for Calculation","28 Feb 2026","Missing Diary;Day Not Applicable for Calculation","27 Feb 2026","Missing Diary;Day Not Applicable for Calculation","7","7","6","6","7","-","-","-","-","-","3","2","2","2","2","2","-","-","-","-","-","2","7","7","9","-","22 Mar 2026 18:34:58","N/A","N/A","N/A","N/A","N/A","N/A"
|
||||
"77242113UCO3001","Adult","Czech Republic","DD5-CZ10022","Petr Hrabak","CZ100222003","1","I-2","27 Mar 2026","-","-","-","-","-","-","-","-","2","26 Mar 2026","-","25 Mar 2026","-","24 Mar 2026","-","23 Mar 2026","-","22 Mar 2026","-","21 Mar 2026","-","20 Mar 2026","-","19 Mar 2026","Day Not Applicable for Calculation","18 Mar 2026","Day Not Applicable for Calculation","17 Mar 2026","Day Not Applicable for Calculation","7","3","3","3","5","5","5","-","-","-","2","0","0","1","1","1","1","2","-","-","-","1","5","","","-","08 Apr 2026 07:36:56","N/A","N/A","N/A","N/A","N/A","N/A"
|
||||
"77242113UCO3001","Adult","Czech Republic","DD5-CZ10022","Petr Hrabak","CZ100222003","1","I-4","08 Apr 2026","-","-","-","-","-","-","-","-","2","07 Apr 2026","-","06 Apr 2026","-","05 Apr 2026","-","04 Apr 2026","-","03 Apr 2026","-","02 Apr 2026","-","01 Apr 2026","-","31 Mar 2026","Day Not Applicable for Calculation","30 Mar 2026","Day Not Applicable for Calculation","29 Mar 2026","Day Not Applicable for Calculation","3","3","4","4","5","4","3","-","-","-","2","1","0","0","2","1","1","2","-","-","-","1","5","","","-","08 Apr 2026 07:59:35","N/A","N/A","N/A","N/A","N/A","N/A"
|
||||
"77242113UCO3001","Adult","Czech Republic","DD5-CZ10022","Petr Hrabak","CZ100222003","1","I-8","04 May 2026","-","-","-","-","-","-","-","-","2","03 May 2026","-","02 May 2026","-","01 May 2026","-","30 Apr 2026","-","29 Apr 2026","-","28 Apr 2026","-","27 Apr 2026","-","26 Apr 2026","Day Not Applicable for Calculation","25 Apr 2026","Day Not Applicable for Calculation","24 Apr 2026","Missing Diary;Day Not Applicable for Calculation","3","5","3","3","3","2","3","-","-","-","1","0","0","0","0","0","0","0","-","-","-","0","3","","","-","04 May 2026 07:52:47","N/A","N/A","N/A","N/A","N/A","N/A"
|
||||
"77242113UCO3001","Adult","Czech Republic","DD5-CZ10022","Petr Hrabak","CZ100222003","1","I-12","01 Jun 2026","Yes","20 May 2026","19 May 2026","20 May 2026","-","-","3","-","2","31 May 2026","-","30 May 2026","-","29 May 2026","-","28 May 2026","-","27 May 2026","-","26 May 2026","-","25 May 2026","-","24 May 2026","Day Not Applicable for Calculation","23 May 2026","Day Not Applicable for Calculation","22 May 2026","Day Not Applicable for Calculation","4","4","6","3","3","3","3","-","-","-","2","1","1","2","1","1","1","2","-","-","-","1","5","6","8","-","01 Jun 2026 14:25:57","Clinical Nonresponder","No","N/A","N/A","N/A","N/A"
|
||||
"77242113UCO3001","Adult","Czech Republic","DD5-CZ10022","Petr Hrabak","CZ100222005","1","I-0","09 Apr 2026","Yes","08 Apr 2026","31 Mar 2026","01 Apr 2026","-","-","2","-","2","08 Apr 2026","Endoscopy","07 Apr 2026","-","06 Apr 2026","-","05 Apr 2026","-","04 Apr 2026","-","03 Apr 2026","-","02 Apr 2026","-","01 Apr 2026","Bowel Preparation for Procedure;Day Not Applicable for Calculation","31 Mar 2026","Bowel Preparation for Procedure;Day Not Applicable for Calculation","30 Mar 2026","-","-","3","3","4","3","4","3","-","-","3","1","-","2","2","2","2","2","2","-","-","2","2","5","5","7","-","29 May 2026 11:07:08","N/A","N/A","N/A","N/A","N/A","N/A"
|
||||
"77242113UCO3001","Adult","Czech Republic","DD5-CZ10022","Petr Hrabak","CZ100222005","1","I-2","22 Apr 2026","-","-","-","-","-","-","-","-","2","21 Apr 2026","-","20 Apr 2026","-","19 Apr 2026","-","18 Apr 2026","-","17 Apr 2026","-","16 Apr 2026","-","15 Apr 2026","-","14 Apr 2026","Day Not Applicable for Calculation","13 Apr 2026","Day Not Applicable for Calculation","12 Apr 2026","Day Not Applicable for Calculation","3","3","5","3","2","3","2","-","-","-","1","1","2","2","1","1","1","2","-","-","-","1","4","","","-","05 May 2026 07:29:35","N/A","N/A","N/A","N/A","N/A","N/A"
|
||||
"77242113UCO3001","Adult","Czech Republic","DD5-CZ10022","Petr Hrabak","CZ100222005","1","I-4","05 May 2026","-","-","-","-","-","-","-","-","2","04 May 2026","-","03 May 2026","-","02 May 2026","-","01 May 2026","-","30 Apr 2026","-","29 Apr 2026","-","28 Apr 2026","-","27 Apr 2026","Day Not Applicable for Calculation","26 Apr 2026","Day Not Applicable for Calculation","25 Apr 2026","Day Not Applicable for Calculation","4","2","2","2","2","2","2","-","-","-","1","1","1","1","1","2","1","1","-","-","-","1","4","","","-","05 May 2026 07:28:55","N/A","N/A","N/A","N/A","N/A","N/A"
|
||||
"77242113UCO3001","Adult","Czech Republic","DD5-CZ10022","Petr Hrabak","CZ100222005","1","I-8","02 Jun 2026","-","-","-","-","-","-","-","-","2","01 Jun 2026","-","31 May 2026","-","30 May 2026","-","29 May 2026","-","28 May 2026","-","27 May 2026","-","26 May 2026","-","25 May 2026","Day Not Applicable for Calculation","24 May 2026","Day Not Applicable for Calculation","23 May 2026","Day Not Applicable for Calculation","2","2","2","2","2","4","10","-","-","-","1","2","1","2","1","2","2","2","-","-","-","2","5","","","-","02 Jun 2026 08:18:08","N/A","N/A","N/A","N/A","N/A","N/A"
|
||||
|
@@ -0,0 +1,219 @@
|
||||
"Protocol","Country","Site","PI Name","Subject ID","Age at Informed Consent","Baseline Stool Count","Confirm Baseline Stool Count","Data Correction ID","Creation Date UTC","Status","Description","Date of Last Action UTC","Total Open Period","Total Open Time (Days)","Current Status Time (Days)","Type","Next Action Required","Category","Query History","Reason for Change","Resolution"
|
||||
"77242113UCO3001","Czech Republic","DD5-CZ10001","Matej Falc","CZ100012001","48","1","","SW00703544","13-May-2026","Submitted","Please change answer to clinical remision from no to YES (week 12). Entry erros ","20-May-2026","15-21 Days","21","16","Query Active ","Site","New","(1) 20 May 2026 msullivan (Clario): Please confirm your request
|
||||
|
||||
Dear Site. Thank you for submitting this Data Clarification Request.
|
||||
|
||||
For us to process your request, please let us know the name of the form (with date) with question.
|
||||
|
||||
Thank you. ERT/CLARIO Data Coordination Team
|
||||
|
||||
","Entry Error",""
|
||||
"77242113UCO3001","Czech Republic","DD5-CZ10001","Matej Falc","CZ100012002","79","1","","SW00696586","09-Apr-2026","ReadyForQC","Please correct date of endoscopy to date: 18 March 2026 (from 25 March 2026)","15-Apr-2026","Over 28 Days","43","40","Query Active ","Site","Site-Entered Data","","Entry Error","CLARIO RESOLUTION:
|
||||
|
||||
Part 1: In Mayo Subscore (1) dated 08 Apr 2026 for I-0 visit, CLARIO to make the following changes:
|
||||
- What was the date of endoscopy? (ENDODT1D): from 25 Mar 2026 to 18 Mar 2026
|
||||
- Data Flag (QSDFLG1B): from blank to check
|
||||
"
|
||||
"77242113UCO3001","Czech Republic","DD5-CZ10006","Michal Konecny","CZ100062001","19","1","","SW00704536","19-May-2026","ReadyForQC","Please change the endoscopy date to 19-FEB-2026. 06-MAR-2026 was entered in error. ","26-May-2026","15-21 Days","18","13","Query Active ","Site","Site-Entered Data","","Entry Error","CLARIO RESOLUTION:
|
||||
|
||||
Part 1: In Mayo Subscore (1) dated 20 Mar 2026 for I-0 visit, CLARIO to make the following changes:
|
||||
-What was the date of endoscopy? (ENDODT1D): from 06 Mar 2026 to 19 Feb 2026
|
||||
- Data Flag (QSDFLG1B): from blank to check
|
||||
"
|
||||
"77242113UCO3001","Czech Republic","DD5-CZ10012","Stefan Konecny","CZ100122001","22","5","Yes, I confirm this is the correct stool count.","SW00706684","01-Jun-2026","Submitted","The right endoscopy date is 23MAR2026, please change the date","15-Jun-2026","8-14 Days","9","","","Clario DM","New","(1) 05 Jun 2026 msullivan (Clario): Please confirm your request
|
||||
|
||||
Dear Site. Thank you for submitting this Data Clarification.
|
||||
|
||||
Please confirm that if you are requesting following.
|
||||
|
||||
Mayo Subscore (1) dated 07 Apr 2026 for I-0
|
||||
What was the date of endoscopy? (ENDODT1D): from 24 Mar 2026 to 23 Mar 2026
|
||||
|
||||
Thank you. ERT/CLARIO Data Coordination Team.
|
||||
|
||||
|
||||
(2) 15 Jun 2026 hosova.kristyna@fnbrno.cz (Site User): The endoscopy was performed 23MAR2026
|
||||
|
||||
","Entry Error",""
|
||||
"77242113UCO3001","Czech Republic","DD5-CZ10013","David Stepek","CZ100132002","29","1","","SW00705646","26-May-2026","ReadyForQC","Correct visit date I-O is 12-May-2026. All questionaries were filled on paper and entered in tablet later.
|
||||
Log-in issue. ","09-Jun-2026","8-14 Days","13","3","","Clario DM","Visit Data","(1) 01 Jun 2026 msullivan (Clario): Please confirm your request
|
||||
|
||||
Dear Site. Thank you for submitting this Data Clarification.
|
||||
|
||||
Please provide the timestamps for each of the assessments if you used paper forms and transcribed into the device.
|
||||
If unknown, ERT will use a dummy timestamp.
|
||||
|
||||
Thank you. ERT/CLARIO Data Coordination Team.
|
||||
|
||||
(2) 01 Jun 2026 dstepek@vnbrno.cz (Site User): time is unknown
|
||||
|
||||
","Changed Information","CLARIO RESOLUTION:
|
||||
|
||||
Part 1: In the following forms for I-0, CLARIO to make the following changes:
|
||||
-Report Date: from 26May 2026 to 12 May 2026
|
||||
-Report Start Date and time: from 26 May 2026 to 12 May 2026 23:59:59
|
||||
-Event End Date: from 26 May 2026 08:27:57 to 12 May 2026 23:59:59
|
||||
|
||||
+Tablet Training Module (1)
|
||||
+Participant Start Instructions (1)
|
||||
+IBDQ (1)
|
||||
+PROMIS Fatigue – Short Form 7a (1)
|
||||
+BASDAI (1)
|
||||
+Participant End Instructions (1)
|
||||
+Visit End (122)
|
||||
"
|
||||
"77242113UCO3001","Czech Republic","DD5-CZ10013","David Stepek","CZ100132003","49","1","","SW00708623","10-Jun-2026","Cancelled","Correct date of I-2 is 26.5.2026. all questionaries were entered on paper at 07,45 and transmited later. ","10-Jun-2026","1 Day","1","","","","New","","yes, subject mishmasch",""
|
||||
"77242113UCO3001","Czech Republic","DD5-CZ10013","David Stepek","CZ100132003","49","1","","SW00706581","29-May-2026","Completed","baseline stool count reported by subject is 0, please change to 1 as per CRA request (subject has 1 stool in 2-3 days if in remission)","10-Jun-2026","4-7 Days","7","","","","Demographic","","Changed Information","CLARIO RESOLUTION:
|
||||
|
||||
Part 1: In System Variables form, CLARIO to make the following changes:
|
||||
- Baseline Stool Count (PT.Custom4): from 0 to 1
|
||||
"
|
||||
"77242113UCO3001","Czech Republic","DD5-CZ10016","Robert Mudr","CZ100162001","48","1","","SW00705916","27-May-2026","Completed","As per ATS investigation (ATS26040111), please remove the below form which was entered as a duplicate
|
||||
|
||||
- MAYO Diary (5) 24 Apr 2026","10-Jun-2026","8-14 Days","9","","","","Technical Revision","","Technical Revision - Other","CLARIO RESOLUTION:
|
||||
|
||||
Part 1: CLARIO to delete MAYO Diary (5) dated 24 Apr 2026
|
||||
"
|
||||
"77242113UCO3001","Czech Republic","DD5-CZ10020","Lucie Gonsorcikova","CZ100201001","15","1","","SW00701729","06-May-2026","Completed","Dears, please delete data from visit I-0 (reported as 4th of May 2026) as this visit had to be postponed - see the previous DCR of this patient and change data request that was corrected. Patient has left the site before it was resolved and and new date of I-0 was planned. Patient continues to fill in his diary and patient is coming to I=0 visit within allowed window. We need the system and tablet to be ready to run new Mayo Score Report with updated and recent data (e.g. reflect new I-0 visit date, new eligible days -1 to -7.).
|
||||
thank you, Jiri Skopek","19-May-2026","8-14 Days","8","","","","Visit Data","(1) 11 May 2026 msullivan (Clario): Please confirm your request
|
||||
|
||||
Dear Site. Thank you for submitting this Data Clarification.
|
||||
|
||||
Please note that the delete forms are allowed if the reason is one of the following.
|
||||
If not, forms will move to unscheduled visit.
|
||||
|
||||
Data collected by the wrong patient.
|
||||
Data collected by someone other than the patient.
|
||||
Data collected prior to informed consent, or after withdrawal from the study.
|
||||
Duplicate data erroneously entered at an Unscheduled visit via paper transcription.
|
||||
Data collected that is not expected per protocol.
|
||||
|
||||
Also, I-0 visit is still ongoing. Please close the visit.
|
||||
Once the visit was closed, we will process accoridngly.
|
||||
|
||||
Thank you. ERT/CLARIO Data Coordination Team
|
||||
|
||||
(2) 11 May 2026 jskopek (Site User): Dears,
|
||||
I do not see any option that is adequate -from the list. Data are not needed to be deleted fully, they reflect the situation at May4th. Please mark it as unscheduled visit - as exactly that is the case. We need the system to be ready for I-0 visit planned for next week.
|
||||
I will close the visit tomorrow - do you mean in tablet/ipad?
|
||||
Thank you very much for your help! Jiri
|
||||
|
||||
(3) 12 May 2026 venkata.ramana (Clario): Thank you for your response.
|
||||
Please note that the visit I-0 was still ongoing but not closed yet.
|
||||
So please close the visit.
|
||||
Kind Regards, Clario Data Coordination Team.
|
||||
|
||||
(4) 12 May 2026 jskopek (Site User): If I try to close the I-O visit in TABLET, it asks me if patient fulfils eligibility criteria to proceed to next visit based on these old data – if I answer NO, it asks me to DEACTIVATE patient. I do not want to DEACTIVATE patient – can you help WHERE and HOW to close this visit for you to change it to UNSCHEDULED and not to de-activate patient?
|
||||
Thank you Jiri
|
||||
|
||||
|
||||
","Other-delete visit I-0","CLARIO RESOLUTION:
|
||||
|
||||
Part 1: In the following forms dated 04 May 2026, CLARIO to make the following changes:
|
||||
-Event ID: from I-0 to Unscheduled Visit 1
|
||||
-Event At Entry: from I-0 to Unscheduled Visit 1
|
||||
|
||||
+Visit Start (49)
|
||||
+ePRO Availability (1)
|
||||
+Mayo Subscore (1)
|
||||
+PGA (1)
|
||||
|
||||
Part 2: CLARIO to delete the following forms dated 04 May 2026 for I-0 visit.
|
||||
|
||||
+C-SSRS Since Last Visit (1)
|
||||
+C-SSRS Since Last Visit Findings Report (1)
|
||||
|
||||
Part 3: CLARIO to manually enter Visit End form for Unscheduled visit 1 with the following information:
|
||||
-Protocol: 77242113UCO3001
|
||||
-Report Date: 04 May 2026
|
||||
-Report Start Date and Time: 04 May 2026 23:59:59
|
||||
-Event ID: Unscheduled Visit 1
|
||||
-Event End Date: 04 May 2026 23:59:59
|
||||
-Visit Status: Incomplete
|
||||
-Phase At Entry: Screening
|
||||
-Phase At Entry Timestamp: 13 Apr 2026 12:32:20
|
||||
-Event At Entry: Unscheduled visit 1
|
||||
-Event Start Date: 04 May 2026 23:59:59
|
||||
-Event Time Zone Offset in Milliseconds: 7200000
|
||||
-Session Repeat Number (SESREP1N): 0
|
||||
-Session Instance Id (SESINST1S): 3f1214f0-4788-11f1-a0cf-bb403212adce
|
||||
"
|
||||
"77242113UCO3001","Czech Republic","DD5-CZ10020","Lucie Gonsorcikova","CZ100201001","15","1","","SW00701226","04-May-2026","Completed","Dears, we would like ask you to change the information I read on assignment form given by patient on April 13, 2026 (Visit 1), Baseline Stool Count (PT.Custom4) as 3 that should be reported as 1.
|
||||
Patient has entered wrong number as he did not understood it should be number of stools when illness is in remission or absent. He is a child and did not reflected this question correctly. Therefore, please change Baseline Stool Count = 1.
|
||||
Thank you, Jiri Skopek ","04-May-2026","1 Day","1","","","","Demographic","","Changed Information","(Clario instructions)
|
||||
|
||||
1. Please make below changes in the assignment form:
|
||||
|
||||
Baseline Stool Count (PT. Custom4): 03 to 01."
|
||||
"77242113UCO3001","Czech Republic","DD5-CZ10021","Martin Bortlik","CZ100212001","61","1","","SW00699492","23-Apr-2026","ReadyForQC","Please correct the date of endoscopy done during screening visit of patient CZ100212001 to correct date 16-MAR-2026.","29-Apr-2026","Over 28 Days","34","30","Query Active ","Site","Site-Entered Data","","Changed Information","CLARIO RESOLUTION:
|
||||
|
||||
Part 1: In the Mayo Subscore (1) dated 07 Apr 2026 for I-0 visit, CLARIO to make the following changes:
|
||||
-What was the date of endoscopy? (ENDODT1D): from 24 Mar 2026 to 16 Mar 2026
|
||||
- Data Flag (QSDFLG1B): from blank to check
|
||||
"
|
||||
"77242113UCO3001","Czech Republic","DD5-CZ10022","Petr Hrabak","CZ100222003","39","1","","SW00703322","12-May-2026","Completed","As per ATS investigation (ATS26040111), please remove the below form that's been entered as a duplicate
|
||||
|
||||
- MAYO Diary (16) - 18 Mar 2026
|
||||
","20-May-2026","4-7 Days","6","","","","Technical Revision","","Technical Revision - Other","CLARIO RESOLUTION:
|
||||
|
||||
Part 1: CLARIO to delete the MAYO Diary (16) dated 18 Mar 2026.
|
||||
"
|
||||
"77242113UCO3001","Czech Republic","DD5-CZ10022","Petr Hrabak","CZ100222003","39","1","","SW00689748","09-Mar-2026","Completed","Dear all,
|
||||
|
||||
Patient CZ 100222003 was randomized on 9 Mar 2026. Kindly correct the colonoscopy date to 11 Feb 2025.
|
||||
|
||||
The date was initially entered as 21 Feb 2025 because the earlier date could not be entered in the system. The patient was rescreened.","02-Apr-2026","15-21 Days","17","","","","Site-Entered Data","(1) 13 Mar 2026 msullivan (Clario): Please confirm your request
|
||||
|
||||
Dear Site. Thank you for submitting this Data Clarification.
|
||||
|
||||
Could you please conform that if you are requesting following?
|
||||
|
||||
Mayo Subscore (1) dated 09 Mar 2026 for I-0 visit
|
||||
-What was the date of endoscopy? (ENDODT1D): from 23 Feb 2026 to 11 Feb 2025
|
||||
|
||||
Could you please confirm the year? This subject was assigned on 02 Mar 2026, you are providing that correct date is 11 Feb 2025 which a year ago.
|
||||
If you are not requesting above, please provide us the name of the form with question.
|
||||
|
||||
Thank you. ERT/CLARIO Data Coordination Team
|
||||
|
||||
|
||||
(2) 13 Mar 2026 katerina.havlikova@clinoxus.com (Site User): confirm date of colonoscopy 11Feb2026
|
||||
|
||||
(3) 21 Mar 2026 msullivan (Clario): Dear Site,
|
||||
|
||||
The requested changes to the Mayo data have been updated. Please navigate to the Mayo Score Report and resubmit the form for visit to log the updated Mayo Score form. Once done, please respond to this query confirming that the Mayo Score has been resubmitted.
|
||||
|
||||
Thank you. ERT/CLARIO Data Coordination Team
|
||||
|
||||
(4) 24 Mar 2026 jana.pomahacova@clinoxus.com (Site User): Thank you and sent
|
||||
|
||||
","New Information","CLARIO RESOLUTION:
|
||||
|
||||
Part 1: In the Mayo Subscore (1) dated 09 Mar 2026 for I-0 visit, CLARIO to make the following changes:
|
||||
-What was the date of endoscopy? (ENDODT1D): from 23 Feb 2026 to 11 Feb 2025
|
||||
-Data Flag (QSDFLG1B): from blank to check"
|
||||
"77242113UCO3001","Czech Republic","DD5-CZ10022","Petr Hrabak","CZ100222005","33","1","","SW00705372","22-May-2026","Submitted","Dear all, please change Colonoscopz date from 8April2026 to date 01Apr2026 Thank you in advance","12-Jun-2026","8-14 Days","14","","Query Active ","Site","New","(1) 29 May 2026 msullivan (Clario): Please confirm your request
|
||||
|
||||
Dear Site. Thank you for submitting this Data Clarification.
|
||||
|
||||
Please provide us the name of the form for this request.
|
||||
|
||||
Thank you. ERT/CLARIO Data Coordination Team
|
||||
|
||||
(2) 02 Jun 2026 katerina.havlikova@clinoxus.com (Site User): Dear all, please change Colonoscopy for Week I-12 date from 8April2026 to date 01Apr2026 Thank you in advance
|
||||
|
||||
(3) 12 Jun 2026 msullivan (Clario): Dear Site,
|
||||
Please note that there is no I-12 visit in StudyWorks.
|
||||
If you completed visit and stored, please submit all stored reports.
|
||||
Until we see the data in StudyWorks, we are unable to confirm your request.
|
||||
Also, please provide us the name of the form for this request.
|
||||
|
||||
Thank you. ERT/CLARIO Data Coordination Team
|
||||
|
||||
","Changed Information",""
|
||||
"77242113UCO3001","Czech Republic","DD5-CZ10022","Petr Hrabak","CZ100222005","33","1","","SW00702538","08-May-2026","Completed","This TRR is to document the correction to the Mayo Subscore (1) form, where the following variables were populated with NULL values, due to a known core defect:
|
||||
Event At Entry, Event Start Date, Event Time Zone Offset in Milliseconds.","12-May-2026","2-3 Days","2","","","","Technical Revision","","Technical Revision - Other","Please make the below changes in Mayo Subscore (1) dated 22 Apr 2026:
|
||||
|
||||
-Event At Entry: I-0
|
||||
-Event Start Date: 09 Apr 2026 08:09:19
|
||||
-Event Time Zone Offset in Milliseconds: 7200000"
|
||||
|
+1420
File diff suppressed because it is too large
Load Diff
+11
@@ -0,0 +1,11 @@
|
||||
"Protocol","Country","Site ID","PI_NAME","Subject Number","Age","Data Correction ID","Creation Date UTC","Status","Date of Last Action UTC","Total Open Period","Total Open Time (Days)","Current Status Time (Days)","Type","Next Action Required","Category","Query History","Reason for Change"
|
||||
"77242113UCO3001_ANALYSIS","Czech Republic The","CZ10001","Falc, Matej","CZ100012001","48 Years","16923867","14-May-2026","Escalated","14-Jun-2026","15-21 Days","20","","QUERY","Clario DM","Patient","(8) 14 Jun 2026 Clario: what should I do now? I have send you 1 ecg by normal way, 2 by pdf.","Data Checks"
|
||||
"77242113UCO3001_ANALYSIS","Czech Republic The","CZ10001","Falc, Matej","CZ100012001","48 Years","16567067","22-Jan-2026","Resolved","28-Jan-2026","4-7 Days","4","","QUERY","","Patient","MD Falc","Data Checks"
|
||||
"77242113UCO3001_ANALYSIS","Czech Republic The","CZ10009","Pumprla, Jiri","CZ100092001","49 Years","16776685","31-Mar-2026","Resolved","13-May-2026","Over 28 Days","29","","QUERY","","Patient","(2) 13 May 2026 Clario: I confirm, that only ONE ECG was collected by mistake.","Data Checks"
|
||||
"77242113UCO3001_ANALYSIS","Czech Republic The","CZ10013","Stepek, David","CZ100132001","29 Years","16990554","04-Jun-2026","Resolved","08-Jun-2026","2-3 Days","2","","QUERY","","Patient","(2) 07 Jun 2026 Clario: by mistake only one strip was taken","Data Checks"
|
||||
"77242113UCO3001_ANALYSIS","Czech Republic The","CZ10013","Stepek, David","CZ100132001","29 Years","16981256","02-Jun-2026","Resolved","04-Jun-2026","2-3 Days","2","","QUERY","","Transmittal","Visit: SCREENING/","Data Checks"
|
||||
"77242113UCO3001_ANALYSIS","Czech Republic The","CZ10013","Stepek, David","CZ100132002","29 Years","16985014","03-Jun-2026","Resolved","04-Jun-2026","1 Day","1","","QUERY","","Patient","(2) 04 Jun 2026 Clario: by mistake only one strip was expected","Data Checks"
|
||||
"77242113UCO3001_ANALYSIS","Czech Republic The","CZ10013","Stepek, David","CZ100132003","49 Years","16988974","04-Jun-2026","Resolved","05-Jun-2026","1 Day","1","","DCR","","Transmittal","Affected Event: 'SCREENING'",""
|
||||
"77242113UCO3001_ANALYSIS","Czech Republic The","CZ10013","Stepek, David","CZ100132003","49 Years","16985006","03-Jun-2026","Resolved","04-Jun-2026","1 Day","1","","QUERY","","Patient","(2) 04 Jun 2026 Clario: by mistake only one strip was taken","Data Checks"
|
||||
"77242113UCO3001_ANALYSIS","Czech Republic The","CZ10021","Bortlik, Martin","CZ100212001","61 Years","16717619","11-Mar-2026","Resolved","28-Apr-2026","Over 28 Days","32","","QUERY","","Patient","(2) 28 Apr 2026 Clario: I confirmed that due to technical problems, the ECG was done only twice","Data Checks"
|
||||
"77242113UCO3001_ANALYSIS","Czech Republic The","CZ10022","Hrabak, Petr","CZ100222003","39 Years","16945114","21-May-2026","Resolved","04-Jun-2026","8-14 Days","10","","DCR","","Patient","(7) 04 Jun 2026 Portal, EXPeRT: It was mistake NO ECG for this date 20May2026 was done",""
|
||||
|
@@ -0,0 +1,302 @@
|
||||
"""
|
||||
import_to_mongo.py
|
||||
Verze: 1.2
|
||||
Datum: 2026-06-02
|
||||
|
||||
Import Clario CSV do MongoDB (databáze: Clario).
|
||||
|
||||
Kolekce: Clario.MayoDiary / Clario.MayoScore / Clario.eCOA_DCRs / Clario.ECG_DCRs
|
||||
Filter: only rows whose Country contains "Czech Republic" (substring match, so "Czech Republic The" is imported too)
|
||||
Klíč: MayoDiary → Subject ID + Form Number
|
||||
MayoScore → Participant ID + Visit
|
||||
eCOA_DCRs → Data Correction ID
|
||||
ECG_DCRs → Data Correction ID
|
||||
Historie: při změně fields se stará verze uloží do pole history[]
|
||||
Po importu přesune zpracované CSV do downloads/Zpracovano/
|
||||
|
||||
Použití:
|
||||
python import_to_mongo.py # importuje všechny CSV z downloads/
|
||||
python import_to_mongo.py downloads/konkretni.csv # jeden soubor
|
||||
"""
|
||||
|
||||
import csv
|
||||
import re
|
||||
import shutil
|
||||
import sys
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
|
||||
from pymongo import MongoClient, ASCENDING
|
||||
|
||||
# Connection string and database name used by main().
MONGO_URI = "mongodb://192.168.1.76:27017"
DB_NAME = "Clario"
# With no command-line arguments, main() imports every *.csv in DOWNLOADS_DIR;
# each imported file is then moved into PROCESSED_DIR.
DOWNLOADS_DIR = Path(__file__).parent / "downloads"
PROCESSED_DIR = DOWNLOADS_DIR / "Zpracovano"

# import_file() keeps a row only when its Country cell contains this text.
COUNTRY_FILTER = "Czech Republic"
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Konfigurace kolekcí
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# Per-export import settings. The top-level keys are matched as substrings of
# the CSV file stem by detect_collection_type(). For each entry:
#   collection   - name of the MongoDB collection the rows are written to
#   subject_col  - CSV column copied into doc["subject"]["id"]
#   key_cols     - CSV columns joined with "_" to form doc["recordKey"]
#   outcome_cols - (optional) CSV columns stored at the top level of the
#                  document instead of inside doc["fields"]
COLLECTION_CONFIG = {
    "MayoDiary": {
        "collection": "Clario.MayoDiary",
        "subject_col": "Subject ID",
        "key_cols": ("Subject ID", "Form Number"),
    },
    "MayoScore": {
        "collection": "Clario.MayoScore",
        "subject_col": "Participant ID",
        "key_cols": ("Participant ID", "Visit"),
        "outcome_cols": (
            "Site Action",
            "Last Mayo Score Submission",
            "Week I-12 Clinical Responder",
            "Week I-12 Clinical Remission",
            "Clinical Flare",
            "Loss of Response",
            "Partial Mayo Response Post Loss of Response",
            "Partial Mayo Response for Clinical Non-Responders",
        ),
    },
    "eCOA DCRs": {
        "collection": "Clario.eCOA_DCRs",
        "subject_col": "Subject ID",
        "key_cols": ("Data Correction ID",),
    },
    "ECG DCRs": {
        "collection": "Clario.ECG_DCRs",
        "subject_col": "Subject Number",
        "key_cols": ("Data Correction ID",),
    },
}
|
||||
|
||||
# strptime patterns tried in order by parse_date(); the first match wins.
# parse_date() strips both the value and the pattern before matching, so a
# pattern that differs from another only by surrounding whitespace is a
# duplicate and just costs an extra failed strptime per non-date value.
DATE_FORMATS = [
    "%d-%b-%Y",
    "%d-%b-%Y %H:%M:%S",
    "%d %b %Y %H:%M:%S",
    "%d %b %Y %H:%M:%S:%f",
    "%d %b %Y",
    "%d %B %Y",
    "%Y%m%d %H:%M:%S.%f",
    "%Y-%m-%d %H:%M:%S",
    "%m/%d/%Y %I:%M:%S %p",
]
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Helpers
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def clean_colname(name: str) -> str:
    """Normalize a CSV header cell.

    Removes, in this order: leading byte-order marks, surrounding
    whitespace, and surrounding double quotes.
    """
    # The BOM is spelled as an explicit escape: a literal U+FEFF inside the
    # quotes is invisible and silently degrades to lstrip(""), a no-op, when
    # the character is lost in an editor or a re-encode.
    return name.lstrip("\ufeff").strip().strip('"')
|
||||
|
||||
|
||||
def parse_date(value: str) -> str | None:
    """Try to read *value* as a date/time using DATE_FORMATS.

    The value and each pattern are stripped of surrounding whitespace before
    matching. The first pattern that matches wins; the parsed naive datetime
    is tagged as UTC and returned as an ISO-8601 string. Returns None when no
    pattern matches.
    """
    text = value.strip()
    for pattern in DATE_FORMATS:
        try:
            parsed = datetime.strptime(text, pattern.strip())
        except ValueError:
            # Not this format; fall through to the next candidate.
            continue
        return parsed.replace(tzinfo=timezone.utc).isoformat()
    return None
|
||||
|
||||
|
||||
def extract_snapshot_date(filename: str) -> str:
    """Return the snapshot date (YYYY-MM-DD) encoded in a file name.

    The date is read from a ``YYYY-MM-DD`` prefix of the base name (any
    directory part is ignored). When the name has no such prefix, today's
    local date is returned instead.
    """
    base_name = Path(filename).name
    prefix = re.match(r"(\d{4}-\d{2}-\d{2})", base_name)
    if prefix is None:
        return datetime.now().strftime("%Y-%m-%d")
    return prefix.group(1)
|
||||
|
||||
|
||||
def detect_collection_type(filename: str) -> str | None:
    """Return the COLLECTION_CONFIG key that applies to *filename*.

    Keys are checked in the dict's own order and the first one that occurs as
    a substring of the file stem (base name without extension) is returned.
    Returns None when no key matches.
    """
    stem = Path(filename).stem
    return next((key for key in COLLECTION_CONFIG if key in stem), None)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# CSV → dokument
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def map_row(row: dict, col_type: str) -> dict:
    """Convert one csv.DictReader row into a MongoDB document.

    Args:
        row: Mapping of raw CSV header -> cell text as produced by
            csv.DictReader.
        col_type: Key into COLLECTION_CONFIG selecting the column layout.

    Returns:
        A dict with ``subject``, ``site``, ``country``, ``study`` and
        ``recordKey``, one top-level entry per configured outcome column
        (None when the cell is empty or "-"), and ``fields`` holding every
        remaining non-empty cell. Values that parse_date() recognises are
        stored as ISO strings, everything else as the stripped text.
        ``recordKey`` is "" when every key column is empty.

    Raises:
        KeyError: if *col_type* is not a key of COLLECTION_CONFIG.
    """
    cfg = COLLECTION_CONFIG[col_type]

    # csv.DictReader files the surplus cells of an over-long row as a list
    # under its restkey (None by default) and pads short rows with None.
    # Only genuine header/text pairs are kept so neither case raises.
    cleaned = {
        clean_colname(k): v.strip() if isinstance(v, str) else ""
        for k, v in row.items()
        if isinstance(k, str)
    }

    subject_col = cfg["subject_col"]
    doc: dict = {
        "subject": {"id": cleaned.get(subject_col, "")},
        # ECG DCR exports name the site column "Site ID" instead of "Site".
        "site": {"name": cleaned.get("Site") or cleaned.get("Site ID", "")},
        "country": cleaned.get("Country", ""),
        "study": cleaned.get("Protocol", ""),
    }

    key_parts = [cleaned.get(c, "") for c in cfg["key_cols"]]
    # A multi-column key whose parts are all empty would otherwise become
    # "_", a truthy value that slips past the caller's empty-key check and
    # merges every keyless row into one record.
    doc["recordKey"] = "_".join(key_parts) if any(key_parts) else ""

    # Iterate the configured tuple (not a set) so the document's key order is
    # the same on every run.
    outcome_cols = cfg.get("outcome_cols", ())
    for col in outcome_cols:
        value = cleaned.get(col, "")
        if value and value != "-":
            doc[col] = parse_date(value) or value
        else:
            doc[col] = None

    # Columns already represented at the top level are not repeated in fields.
    skip_top = {"Protocol", "Country", "Site", subject_col, *outcome_cols}
    fields: dict = {}
    for col, value in cleaned.items():
        # "-" is the export's placeholder for "no value".
        if col in skip_top or not value or value == "-":
            continue
        fields[col] = parse_date(value) or value

    doc["fields"] = fields
    return doc
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Import jednoho souboru
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def import_file(csv_path: str, db) -> dict:
    """Import one Clario CSV export into its MongoDB collection.

    The target collection is chosen from the file name. Rows whose Country
    cell does not contain COUNTRY_FILTER are counted and skipped; rows with
    an empty recordKey are skipped silently. For every remaining row:

    * no document with that recordKey -> insert it (firstSeen/lastSeen set
      to the snapshot date, empty history);
    * ``fields`` differ -> push the previous fields onto ``history`` and
      overwrite the document's values;
    * only top-level outcome columns differ -> overwrite the document's
      values without a history entry (history tracks ``fields`` only);
    * nothing differs -> refresh ``lastSeen`` and ``sourceFile``.

    Args:
        csv_path: Path of the CSV file to read (UTF-8, optional BOM).
        db: Database object; ``db[name]`` must yield a collection offering
            find_one / insert_one / update_one / create_index.

    Returns:
        ``{"skipped": True}`` when the file name matches no known export
        type, otherwise a dict with ``collection``, ``snapshot`` and the
        ``inserted`` / ``changed`` / ``unchanged`` / ``filtered_out`` counts.
    """
    filename = Path(csv_path).name
    col_type = detect_collection_type(filename)
    if col_type is None:
        print(f" Preskakuji (neznamy typ): {filename}")
        return {"skipped": True}

    cfg = COLLECTION_CONFIG[col_type]
    col_name = cfg["collection"]
    outcome_cols = cfg.get("outcome_cols", ())
    snapshot_date = extract_snapshot_date(filename)
    collection = db[col_name]

    # Indexes are created before the rows are processed so the per-row
    # recordKey lookup is indexed even on the very first import.
    collection.create_index([("recordKey", ASCENDING)], unique=True)
    collection.create_index([("subject.id", ASCENDING)])
    collection.create_index([("site.name", ASCENDING)])
    if col_type == "MayoScore":
        collection.create_index([("Site Action", ASCENDING)])
    if col_type in ("eCOA DCRs", "ECG DCRs"):
        collection.create_index([("fields.Status", ASCENDING)])
        collection.create_index([("fields.Type", ASCENDING)])

    inserted = changed = unchanged = filtered_out = 0

    with open(csv_path, encoding="utf-8-sig", newline="") as f:
        reader = csv.DictReader(f, delimiter=",", quotechar='"')

        for raw_row in reader:
            # Over-long rows carry their surplus cells under the key None;
            # drop that entry so header clean-up never sees a non-string.
            row = {k: v for k, v in raw_row.items() if isinstance(k, str)}

            cleaned_row = {clean_colname(k): v for k, v in row.items()}
            # Short rows are padded with None, hence the "or".
            country = (cleaned_row.get("Country") or "").strip()
            # Substring match on purpose: some exports spell the country
            # with a suffix.
            if COUNTRY_FILTER not in country:
                filtered_out += 1
                continue

            doc = map_row(row, col_type)
            record_key = doc.get("recordKey")
            if not record_key:
                continue

            doc["sourceFile"] = filename
            existing = collection.find_one({"recordKey": record_key})

            if existing is None:
                doc["firstSeen"] = snapshot_date
                doc["lastSeen"] = snapshot_date
                doc["history"] = []
                collection.insert_one(doc)
                inserted += 1
                continue

            fields_changed = existing.get("fields") != doc["fields"]
            # Outcome columns live at the top level, outside "fields", so
            # they need their own comparison or their updates are lost.
            outcomes_changed = any(
                existing.get(col) != doc.get(col) for col in outcome_cols
            )

            if fields_changed or outcomes_changed:
                update = {"$set": {**doc, "lastSeen": snapshot_date}}
                if fields_changed:
                    update["$push"] = {
                        "history": {
                            "date": existing.get("lastSeen", snapshot_date),
                            "fields": existing.get("fields", {}),
                        }
                    }
                collection.update_one({"_id": existing["_id"]}, update)
                changed += 1
            else:
                collection.update_one(
                    {"_id": existing["_id"]},
                    {"$set": {"lastSeen": snapshot_date, "sourceFile": filename}},
                )
                unchanged += 1

    stats = {
        "collection": col_name,
        "snapshot": snapshot_date,
        "inserted": inserted,
        "changed": changed,
        "unchanged": unchanged,
        "filtered_out": filtered_out,
    }
    print(f" {col_name} [{snapshot_date}]: +{inserted} new, ~{changed} changed, ={unchanged} same, -{filtered_out} non-CZ")
    return stats
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Main
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def main():
    """Command-line entry point: import CSV files and archive them.

    With arguments, each argument is treated as a CSV path (missing files are
    reported and ignored). Without arguments, every ``*.csv`` in
    DOWNLOADS_DIR is imported in sorted order. Each processed file is then
    moved into PROCESSED_DIR and a summary of the counts is printed.
    """
    paths: list[Path] = []

    if len(sys.argv) > 1:
        for arg in sys.argv[1:]:
            p = Path(arg)
            if p.is_file():
                paths.append(p)
            else:
                print(f"Soubor nenalezen: {arg}")
    else:
        paths = sorted(DOWNLOADS_DIR.glob("*.csv"))

    if not paths:
        print("Zadne CSV soubory k importu.")
        return

    print(f"Nalezeno {len(paths)} souboru.\n")

    client = MongoClient(MONGO_URI, serverSelectionTimeoutMS=5000)
    try:
        # Fail fast when the server is unreachable, before any file is touched.
        client.admin.command("ping")
        db = client[DB_NAME]

        # parents=True: downloads/ may not exist when files are passed
        # explicitly on the command line.
        PROCESSED_DIR.mkdir(parents=True, exist_ok=True)

        total = {"inserted": 0, "changed": 0, "unchanged": 0}

        for csv_path in paths:
            print(f"Import: {csv_path.name}")
            stats = import_file(str(csv_path), db)
            if not stats.get("skipped"):
                for k in total:
                    total[k] += stats.get(k, 0)

            # NOTE(review): files skipped as "unknown type" are archived as
            # well; confirm that is intended.
            dest = PROCESSED_DIR / csv_path.name
            shutil.move(str(csv_path), str(dest))
            print(" -> presunut do Zpracovano/")
    finally:
        # Release the connection even when the ping or an import raises.
        client.close()

    print(f"\nCelkem: +{total['inserted']} new, ~{total['changed']} changed, ={total['unchanged']} same")


if __name__ == "__main__":
    main()
|
||||
@@ -0,0 +1,776 @@
|
||||
"""
|
||||
create_report.py
|
||||
Verze: 1.7
|
||||
Datum: 2026-06-02
|
||||
|
||||
Generuje Excel report (.xlsm) pro studii 77242113UCO3001 z MongoDB databáze Clario.
|
||||
Výstup: U:/Dropbox/!!!Days/Downloads Z230/YYYY-MM-DD 77242113UCO3001 Clario Reports.xlsm
|
||||
|
||||
Zdroj dat:
|
||||
MongoDB 192.168.1.76, databáze Clario
|
||||
Kolekce Clario.MayoScore — skóre Mayo per pacient × visit
|
||||
Kolekce Clario.MayoDiary — denní záznamy deníku pacienta
|
||||
Kolekce Clario.eCOA_DCRs — data correction requests eCOA
|
||||
Kolekce Clario.ECG_DCRs — data correction requests ECG
|
||||
|
||||
Listy:
|
||||
MayoScore — jeden řádek = pacient × visit
|
||||
sloupec „KLIKNI SEM" naviguje na filtrovaný EligibleDays
|
||||
řádky I-0 s Modified Mayo < 5 červeně tučně
|
||||
MayoDiary — jeden řádek = denní záznam deníku pacienta
|
||||
Compliance — jeden řádek = pacient × visit; kolik dní v okně mezi návštěvami
|
||||
mělo být vyplněno v MayoDiary a kolik jich pacient skutečně
|
||||
vyplnil + procento. Okno I-0 = od první diary po I-0; ostatní
|
||||
= od (předchozí visit +1) po aktuální visit. Unscheduled se
|
||||
ignorují. Řádky s compliance ≥ 100 % zeleně.
|
||||
EligibleDays — jeden řádek = jeden eligible day z MayoScore obohacený o data z MayoDiary;
|
||||
included/excluded flag, excluded dny šedě na žlutém pozadí
|
||||
eCOA_DCRs — všechna pole z kolekce Clario.eCOA_DCRs
|
||||
ECG_DCRs — všechna pole z kolekce Clario.ECG_DCRs
|
||||
|
||||
VBA makro (Worksheet_SelectionChange na listu MayoScore):
|
||||
Klik na sloupec „KLIKNI SEM" → přepne na EligibleDays a vyfiltruje záznamy
|
||||
pro daného pacienta a visit. Vyžaduje povolení maker při otevření souboru.
|
||||
"""
|
||||
|
||||
VERSION = "1.7"
|
||||
|
||||
from datetime import datetime, timedelta
|
||||
from pathlib import Path
|
||||
import time
|
||||
|
||||
from pymongo import MongoClient
|
||||
from openpyxl import Workbook
|
||||
from openpyxl.styles import Font, PatternFill, Alignment, Border, Side
|
||||
from openpyxl.utils import get_column_letter
|
||||
import xlwings as xw
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Konfigurace
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# MongoDB server and the database that holds the Clario collections.
MONGO_URI = "mongodb://192.168.1.76:27017"
DB_NAME = "Clario"

# Directory the generated workbook is written to.
OUTPUT_DIR = Path(r"U:\Dropbox\!!!Days\Downloads Z230")

# Known visit names in the order used when sorting MayoScore rows;
# a visit that is not listed here sorts after all listed ones.
VISIT_ORDER = [
    "I-0",
    "I-2",
    "I-4",
    "I-8",
    "I-12",
    "M-4",
]
|
||||
|
||||
# Column layout of the "MayoScore" sheet as (header text, getter) pairs; each
# getter receives one MayoScore document. Score values are read from
# doc["fields"]; the trailing outcome columns are read from the document's top
# level, with a missing or falsy value written as "".
# The VBA click handler reads the subject from column 3 and the visit from
# column 4, so the first four entries must keep their positions.
COLUMNS_SCORE = [
    ("KLIKNI SEM", lambda doc: "▶ klikni sem"),
    ("Site", lambda doc: doc.get("site", {}).get("name", "")),
    ("Subject ID", lambda doc: doc.get("subject", {}).get("id", "")),
    ("Visit", lambda doc: doc["fields"].get("Visit", "")),
    ("Visit Date", lambda doc: doc["fields"].get("Visit Date", "")),
    # Numeric score columns (converted through _num).
    (
        "Baseline Stool Frequency",
        lambda doc: _num(doc["fields"].get("Baseline Stool Frequency", "")),
    ),
    (
        "Central Endoscopy Score",
        lambda doc: _num(doc["fields"].get("Central Endoscopy Score", "")),
    ),
    ("PGA Score", lambda doc: _num(doc["fields"].get("PGA Score", ""))),
    (
        "Stool Frequency Sub-score",
        lambda doc: _num(doc["fields"].get("Stool Frequency Sub-score", "")),
    ),
    (
        "Rectal Bleeding Sub-score",
        lambda doc: _num(doc["fields"].get("Rectal Bleeding Sub-score", "")),
    ),
    (
        "Partial Mayo Score",
        lambda doc: _num(doc["fields"].get("Partial Mayo Score", "")),
    ),
    (
        "Modified Mayo Score",
        lambda doc: _num(doc["fields"].get("Modified Mayo Score", "")),
    ),
    ("Full Mayo Score", lambda doc: _num(doc["fields"].get("Full Mayo Score", ""))),
    # Outcome columns stored at the top level of the document.
    ("Site Action", lambda doc: doc.get("Site Action") or ""),
    (
        "Last Mayo Score Submission",
        lambda doc: doc.get("Last Mayo Score Submission") or "",
    ),
    ("Wk I-12 Responder", lambda doc: doc.get("Week I-12 Clinical Responder") or ""),
    ("Wk I-12 Remission", lambda doc: doc.get("Week I-12 Clinical Remission") or ""),
    ("Clinical Flare", lambda doc: doc.get("Clinical Flare") or ""),
    ("Loss of Response", lambda doc: doc.get("Loss of Response") or ""),
    (
        "Partial Mayo Post LoR",
        lambda doc: doc.get("Partial Mayo Response Post Loss of Response") or "",
    ),
    (
        "Partial Mayo Non-Resp",
        lambda doc: doc.get("Partial Mayo Response for Clinical Non-Responders") or "",
    ),
]
|
||||
|
||||
# Column layout of the "MayoDiary" sheet as (header text, getter) pairs; each
# getter receives one MayoDiary document. Everything except the subject id is
# read from doc["fields"].
COLUMNS_DIARY = [
    ("Subject ID", lambda doc: doc.get("subject", {}).get("id", "")),
    ("Report Date", lambda doc: doc["fields"].get("Report Date", "")),
    (
        "Baseline Stool Count",
        lambda doc: _num(doc["fields"].get("Baseline Stool Count", "")),
    ),
    ("Stool Frequency", lambda doc: _num(doc["fields"].get("Stool Frequency", ""))),
    ("MAYO050", lambda doc: doc["fields"].get("MAYO050", "")),
    ("Not Applicable", lambda doc: doc["fields"].get("Not Applicable", "")),
    ("Constipation", lambda doc: doc["fields"].get("Constipation", "")),
    ("Diarrhea", lambda doc: doc["fields"].get("Diarrhea", "")),
    ("Irregularity", lambda doc: doc["fields"].get("Irregularity", "")),
]
|
||||
|
||||
# Column layout of the "eCOA_DCRs" sheet as (header text, getter) pairs; each
# getter receives one eCOA data-correction-request document. Site and subject
# come from their own sub-documents, firstSeen/lastSeen from the top level,
# everything else from doc["fields"].
COLUMNS_ECOA_DCRS = [
    ("Site", lambda doc: doc.get("site", {}).get("name", "")),
    ("Subject ID", lambda doc: doc.get("subject", {}).get("id", "")),
    ("Data Correction ID", lambda doc: doc["fields"].get("Data Correction ID", "")),
    ("PI Name", lambda doc: doc["fields"].get("PI Name", "")),
    ("Creation Date UTC", lambda doc: doc["fields"].get("Creation Date UTC", "")),
    (
        "Date of Last Action UTC",
        lambda doc: doc["fields"].get("Date of Last Action UTC", ""),
    ),
    ("Status", lambda doc: doc["fields"].get("Status", "")),
    ("Type", lambda doc: doc["fields"].get("Type", "")),
    ("Next Action Required", lambda doc: doc["fields"].get("Next Action Required", "")),
    ("Category", lambda doc: doc["fields"].get("Category", "")),
    ("Total Open Period", lambda doc: doc["fields"].get("Total Open Period", "")),
    (
        "Total Open Time (Days)",
        lambda doc: _num(doc["fields"].get("Total Open Time (Days)", "")),
    ),
    (
        "Current Status Time (Days)",
        lambda doc: _num(doc["fields"].get("Current Status Time (Days)", "")),
    ),
    ("Reason for Change", lambda doc: doc["fields"].get("Reason for Change", "")),
    ("Description", lambda doc: doc["fields"].get("Description", "")),
    ("Resolution", lambda doc: doc["fields"].get("Resolution", "")),
    ("Query History", lambda doc: doc["fields"].get("Query History", "")),
    (
        "Age at Informed Consent",
        lambda doc: doc["fields"].get("Age at Informed Consent", ""),
    ),
    (
        "Baseline Stool Count",
        lambda doc: _num(doc["fields"].get("Baseline Stool Count", "")),
    ),
    ("firstSeen", lambda doc: doc.get("firstSeen", "")),
    ("lastSeen", lambda doc: doc.get("lastSeen", "")),
]
|
||||
|
||||
# Column layout of the "ECG_DCRs" sheet as (header text, getter) pairs; each
# getter receives one ECG data-correction-request document.
# Note that the "PI Name" column is read from the field key "PI_NAME" here.
COLUMNS_ECG_DCRS = [
    ("Site ID", lambda doc: doc.get("site", {}).get("name", "")),
    ("Subject Number", lambda doc: doc.get("subject", {}).get("id", "")),
    ("Data Correction ID", lambda doc: doc["fields"].get("Data Correction ID", "")),
    ("PI Name", lambda doc: doc["fields"].get("PI_NAME", "")),
    ("Age", lambda doc: doc["fields"].get("Age", "")),
    ("Creation Date UTC", lambda doc: doc["fields"].get("Creation Date UTC", "")),
    (
        "Date of Last Action UTC",
        lambda doc: doc["fields"].get("Date of Last Action UTC", ""),
    ),
    ("Status", lambda doc: doc["fields"].get("Status", "")),
    ("Type", lambda doc: doc["fields"].get("Type", "")),
    ("Next Action Required", lambda doc: doc["fields"].get("Next Action Required", "")),
    ("Category", lambda doc: doc["fields"].get("Category", "")),
    ("Total Open Period", lambda doc: doc["fields"].get("Total Open Period", "")),
    (
        "Total Open Time (Days)",
        lambda doc: _num(doc["fields"].get("Total Open Time (Days)", "")),
    ),
    (
        "Current Status Time (Days)",
        lambda doc: _num(doc["fields"].get("Current Status Time (Days)", "")),
    ),
    ("Reason for Change", lambda doc: doc["fields"].get("Reason for Change", "")),
    ("Query History", lambda doc: doc["fields"].get("Query History", "")),
    ("firstSeen", lambda doc: doc.get("firstSeen", "")),
    ("lastSeen", lambda doc: doc.get("lastSeen", "")),
]
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Helpers
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _num(value):
|
||||
"""Převede číselný string na int, jinak vrátí původní hodnotu nebo None."""
|
||||
if value == "" or value is None:
|
||||
return None
|
||||
try:
|
||||
return int(value)
|
||||
except (ValueError, TypeError):
|
||||
try:
|
||||
return float(value)
|
||||
except (ValueError, TypeError):
|
||||
return value
|
||||
|
||||
|
||||
def _visit_sort_key(doc):
    """Return the sort key (site name, subject id, visit rank, visit name).

    The visit rank is the position of the visit in VISIT_ORDER; a visit that
    is not listed there ranks after every listed one.
    """
    visit_name = doc["fields"].get("Visit", "")
    if visit_name in VISIT_ORDER:
        rank = VISIT_ORDER.index(visit_name)
    else:
        rank = len(VISIT_ORDER)
    site_name = doc.get("site", {}).get("name", "")
    subject_id = doc.get("subject", {}).get("id", "")
    return (site_name, subject_id, rank, visit_name)
|
||||
|
||||
|
||||
def _iso_to_date(value):
|
||||
"""ISO string → Python date pro Excel."""
|
||||
if not isinstance(value, str):
|
||||
return value
|
||||
try:
|
||||
return datetime.fromisoformat(value).date()
|
||||
except ValueError:
|
||||
return value
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Styly
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# Header row: white bold text on a solid 1F497D background.
HEADER_FILL = PatternFill("solid", fgColor="1F497D")
HEADER_FONT = Font(bold=True, color="FFFFFF", size=10)

# Default font and the two alignments used for data cells.
CELL_FONT = Font(size=10)
ALIGN_CTR = Alignment(horizontal="center", vertical="center", wrap_text=False)
ALIGN_LEFT = Alignment(horizontal="left", vertical="center")

# Thin BFBFBF border drawn on all four sides of every table cell.
THIN = Side(style="thin", color="BFBFBF")
BORDER = Border(left=THIN, right=THIN, top=THIN, bottom=THIN)

# Zebra striping of data rows.
FILL_ODD = PatternFill("solid", fgColor="FFFFFF")
FILL_EVEN = PatternFill("solid", fgColor="EBF1DE")

# DCR status colours.
FILL_DCR_SITE = PatternFill("solid", fgColor="FFFF00")    # yellow: waiting for the site physician
FILL_DCR_CLARIO = PatternFill("solid", fgColor="BDD7EE")  # blue: waiting for Clario
FILL_DCR_QC = PatternFill("solid", fgColor="F4B942")      # orange: ReadyForQC
FILL_DCR_DONE = PatternFill("solid", fgColor="FFFFFF")    # white: Completed

SCORE_COLS = {"Partial Mayo Score", "Modified Mayo Score", "Full Mayo Score"}
# Red for scores >= 5 (placeholder: conditional formatting is not used).
SCORE_FILL = PatternFill("solid", fgColor="FFC7CE")
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Sestavení sheetu
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _build_sheet(ws, docs, columns, date_cols, center_cols, col_widths, row_font_fn=None, wrap_cols=None, header_row=1):
    """Write a formatted table (header row plus one row per document) to ``ws``.

    Args:
        ws: Target worksheet.
        docs: Sequence of documents, one per data row (``len()`` is used).
        columns: List of ``(header text, getter)`` pairs; ``getter(doc)``
            returns the cell value.
        date_cols: Header names whose string values are parsed with
            ``_iso_to_date`` and whose cells get the ``DD-MMM-YYYY`` format.
        center_cols: Header names whose data cells are centred (others are
            left-aligned).
        col_widths: Mapping of header name to column width (default 14).
        row_font_fn: Optional ``fn(doc) -> Font`` choosing the font of a row;
            ``CELL_FONT`` is used when it is not given.
        wrap_cols: Optional header names whose cells are top-left aligned
            with text wrapping.
        header_row: 1-based row number of the header; data starts on the
            next row.
    """
    headers = [name for name, _getter in columns]
    first_data_row = header_row + 1
    wrap_alignment = Alignment(horizontal="left", vertical="top", wrap_text=True)

    # Header row.
    for col_no, title in enumerate(headers, start=1):
        head = ws.cell(row=header_row, column=col_no, value=title)
        head.font = HEADER_FONT
        head.fill = HEADER_FILL
        head.alignment = ALIGN_CTR
        head.border = BORDER
    ws.row_dimensions[header_row].height = 28

    # Data rows with zebra striping relative to the header row.
    for row_no, doc in enumerate(docs, start=first_data_row):
        is_even = (row_no - header_row) % 2 == 0
        zebra = FILL_EVEN if is_even else FILL_ODD
        row_font = row_font_fn(doc) if row_font_fn else CELL_FONT
        for col_no, (name, getter) in enumerate(columns, start=1):
            value = getter(doc)
            if name in date_cols and isinstance(value, str):
                value = _iso_to_date(value)
            cell = ws.cell(row=row_no, column=col_no, value=value)
            cell.font = row_font
            cell.fill = zebra
            cell.border = BORDER
            if wrap_cols and name in wrap_cols:
                cell.alignment = wrap_alignment
            elif name in center_cols:
                cell.alignment = ALIGN_CTR
            else:
                cell.alignment = ALIGN_LEFT

    # Column widths.
    for col_no, name in enumerate(headers, start=1):
        ws.column_dimensions[get_column_letter(col_no)].width = col_widths.get(name, 14)

    # Date display format for every data cell of the date columns.
    end_row = len(docs) + first_data_row
    for name in date_cols:
        if name not in headers:
            continue
        date_col_no = headers.index(name) + 1
        for row_no in range(first_data_row, end_row):
            ws.cell(row=row_no, column=date_col_no).number_format = "DD-MMM-YYYY"

    # Keep the header visible while scrolling and put filter buttons on it.
    ws.freeze_panes = f"A{first_data_row}"
    last_letter = get_column_letter(len(headers))
    ws.auto_filter.ref = f"A{header_row}:{last_letter}{header_row}"
|
||||
|
||||
|
||||
def _score_row_font(doc):
    """Return the row font for a MayoScore document.

    A row of visit "I-0" whose Modified Mayo Score parses as an integer below
    5 gets a bold red font; every other row gets ``CELL_FONT``.
    """
    fields = doc["fields"]
    try:
        modified_mayo = int(fields.get("Modified Mayo Score", ""))
    except (ValueError, TypeError):
        # Missing or non-numeric score: never highlighted.
        return CELL_FONT
    if fields.get("Visit", "") != "I-0":
        return CELL_FONT
    if modified_mayo >= 5:
        return CELL_FONT
    return Font(size=10, bold=True, color="FF0000")
|
||||
|
||||
|
||||
def build_mayo_score_sheet(ws, docs):
    """Fill the "MayoScore" sheet: one row per document in ``docs``.

    The table is written by ``_build_sheet`` using ``COLUMNS_SCORE`` and
    ``_score_row_font``; afterwards the first column ("KLIKNI SEM") is
    restyled so that it looks like a button/link.
    """
    date_columns = {"Visit Date", "Last Mayo Score Submission"}
    centered_columns = {
        "KLIKNI SEM", "Visit", "Central Endoscopy Score", "PGA Score",
        "Stool Frequency Sub-score", "Rectal Bleeding Sub-score",
        "Partial Mayo Score", "Modified Mayo Score", "Full Mayo Score",
        "Baseline Stool Frequency",
        "Wk I-12 Responder", "Wk I-12 Remission", "Clinical Flare",
        "Loss of Response", "Partial Mayo Post LoR", "Partial Mayo Non-Resp",
        "Last Mayo Score Submission",
    }
    widths = {
        "KLIKNI SEM": 14,
        "Site": 18,
        "Subject ID": 16,
        "Visit": 12,
        "Visit Date": 14,
        "Baseline Stool Frequency": 14,
        "Central Endoscopy Score": 14,
        "PGA Score": 10,
        "Stool Frequency Sub-score": 14,
        "Rectal Bleeding Sub-score": 14,
        "Partial Mayo Score": 14,
        "Modified Mayo Score": 14,
        "Full Mayo Score": 13,
        "Site Action": 22,
        "Last Mayo Score Submission": 16,
        "Wk I-12 Responder": 14,
        "Wk I-12 Remission": 14,
        "Clinical Flare": 14,
        "Loss of Response": 14,
        "Partial Mayo Post LoR": 20,
        "Partial Mayo Non-Resp": 20,
    }
    _build_sheet(
        ws,
        docs,
        COLUMNS_SCORE,
        date_cols=date_columns,
        center_cols=centered_columns,
        col_widths=widths,
        row_font_fn=_score_row_font,
    )

    # Button-like look for the "KLIKNI SEM" column (data rows start at row 2).
    button_font = Font(size=10, bold=True, color="FFFFFF")
    button_fill = PatternFill("solid", fgColor="2E75B6")
    for row_no in range(2, len(docs) + 2):
        button = ws.cell(row=row_no, column=1)
        button.font = button_font
        button.fill = button_fill
        button.alignment = ALIGN_CTR
|
||||
|
||||
|
||||
def build_mayo_diary_sheet(ws, docs):
    """Fill the "MayoDiary" sheet: one row per diary document in ``docs``."""
    centered_columns = {
        "Baseline Stool Count",
        "Stool Frequency",
        "Not Applicable",
        "Constipation",
        "Diarrhea",
        "Irregularity",
    }
    widths = {
        "Subject ID": 16,
        "Report Date": 14,
        "Baseline Stool Count": 14,
        "Stool Frequency": 14,
        "MAYO050": 48,
        "Not Applicable": 14,
        "Constipation": 14,
        "Diarrhea": 12,
        "Irregularity": 14,
    }
    _build_sheet(
        ws,
        docs,
        COLUMNS_DIARY,
        date_cols={"Report Date"},
        center_cols=centered_columns,
        col_widths=widths,
    )
|
||||
|
||||
|
||||
def build_eligible_days_sheet(ws, score_docs, diary_docs):
    """Fill the "EligibleDays" sheet.

    For every MayoScore document, each populated "Eligible Day (-n)" field
    (n = 1..10) becomes one row, enriched with the diary entry of the same
    subject and calendar day when one exists. A day with a non-empty excluded
    reason other than "-" is flagged "No" in the "Included" column and shown
    in grey text on a yellow background.
    """
    # Diary documents indexed by (subject id, "YYYY-MM-DD"); when several
    # documents share a key, the last one wins.
    diary_index: dict[tuple, dict] = {}
    for diary_doc in diary_docs:
        subject_id = diary_doc.get("subject", {}).get("id", "")
        report_date = diary_doc["fields"].get("Report Date", "")
        day_key = report_date[:10] if report_date else ""
        if subject_id and day_key:
            diary_index[(subject_id, day_key)] = diary_doc

    headers = [
        "Included", "Subject ID", "Visit", "Visit Date", "Day",
        "Report Date", "Baseline Stool Count", "Stool Frequency",
        "MAYO050", "Not Applicable", "Constipation", "Diarrhea", "Irregularity",
    ]
    col_widths = {
        "Included": 10,
        "Subject ID": 16,
        "Visit": 10,
        "Visit Date": 14,
        "Day": 8,
        "Report Date": 14,
        "Baseline Stool Count": 14,
        "Stool Frequency": 14,
        "MAYO050": 48,
        "Not Applicable": 14,
        "Constipation": 14,
        "Diarrhea": 12,
        "Irregularity": 14,
    }
    center_cols = {
        "Included", "Visit", "Day", "Baseline Stool Count", "Stool Frequency",
        "Not Applicable", "Constipation", "Diarrhea", "Irregularity",
    }
    date_cols = {"Visit Date", "Report Date"}
    excluded_fill = PatternFill("solid", fgColor="FFF2CC")  # yellow for excluded days
    excluded_font = Font(size=10, color="808080")

    # Header row.
    for col_no, title in enumerate(headers, start=1):
        head = ws.cell(row=1, column=col_no, value=title)
        head.font = HEADER_FONT
        head.fill = HEADER_FILL
        head.alignment = ALIGN_CTR
        head.border = BORDER
    ws.row_dimensions[1].height = 28

    next_row = 2
    for score_doc in score_docs:
        score_fields = score_doc["fields"]
        subject_id = score_doc.get("subject", {}).get("id", "")
        visit = score_fields.get("Visit", "")
        visit_date = _iso_to_date(score_fields.get("Visit Date", ""))

        for day_no in range(1, 11):
            day_iso = score_fields.get(f"Eligible Day (-{day_no})")
            if not day_iso or day_iso == "-":
                # Day not populated for this visit.
                continue
            day_key = day_iso[:10]
            reason = score_fields.get(f"Day (-{day_no}) Excluded Reason(s)", "")
            is_excluded = bool(reason) and reason != "-"

            diary_fields = diary_index.get((subject_id, day_key), {}).get("fields", {})

            if is_excluded:
                row_fill, row_font = excluded_fill, excluded_font
            else:
                row_fill = FILL_EVEN if next_row % 2 == 0 else FILL_ODD
                row_font = CELL_FONT

            values = [
                "No" if is_excluded else "Yes",
                subject_id,
                visit,
                visit_date,
                f"-{day_no}",
                _iso_to_date(day_iso),
                _num(diary_fields.get("Baseline Stool Count", "")),
                _num(diary_fields.get("Stool Frequency", "")),
                diary_fields.get("MAYO050", ""),
                diary_fields.get("Not Applicable", ""),
                diary_fields.get("Constipation", ""),
                diary_fields.get("Diarrhea", ""),
                diary_fields.get("Irregularity", ""),
            ]
            for col_no, (title, value) in enumerate(zip(headers, values), start=1):
                cell = ws.cell(row=next_row, column=col_no, value=value)
                cell.font = row_font
                cell.fill = row_fill
                cell.border = BORDER
                if title in date_cols:
                    cell.number_format = "DD-MMM-YYYY"
                cell.alignment = ALIGN_CTR if title in center_cols else ALIGN_LEFT

            next_row += 1

    for col_no, title in enumerate(headers, start=1):
        ws.column_dimensions[get_column_letter(col_no)].width = col_widths.get(title, 14)

    ws.freeze_panes = "A2"
    ws.auto_filter.ref = f"A1:{get_column_letter(len(headers))}1"
|
||||
|
||||
|
||||
def _build_dcr_legend(ws):
    """Write the colour legend for DCR sheets into rows 1-4 of ``ws``.

    Column A shows the colour swatch, column B the explanation. Row 5 is
    left empty; the DCR sheet builders place the table header on row 6.
    """
    caption_font = Font(size=10, bold=True)
    entries = (
        (FILL_DCR_SITE, "Čeká lékař — Next Action Required = Site (lékař musí odpovědět nebo potvrdit)"),
        (FILL_DCR_CLARIO, "Čeká Clario — Next Action Required = Clario DM (Clario dostalo podklady, provede změnu)"),
        (FILL_DCR_QC, "ReadyForQC — Clario provedlo změny, čeká na finální QC kontrolu"),
        (FILL_DCR_DONE, "Completed / Resolved — DCR je uzavřen"),
    )
    for row_no, (swatch_fill, caption) in enumerate(entries, start=1):
        swatch = ws.cell(row=row_no, column=1, value="")
        swatch.fill = swatch_fill
        swatch.border = BORDER
        label = ws.cell(row=row_no, column=2, value=caption)
        label.font = caption_font
        label.alignment = ALIGN_LEFT
    # Row 5 intentionally stays empty.
|
||||
|
||||
|
||||
def _dcr_row_fill(doc):
    """Return the row fill for a DCR document according to its state.

    Precedence: status Completed/Resolved, then status ReadyForQC, then
    "Site" in Next Action Required, then "Clario" in (or empty) Next Action
    Required; anything else gets the plain ``FILL_ODD``.
    """
    fields = doc["fields"]
    status = fields.get("Status", "")
    next_action = fields.get("Next Action Required", "")

    if status == "Completed" or status == "Resolved":
        chosen = FILL_DCR_DONE
    elif status == "ReadyForQC":
        chosen = FILL_DCR_QC
    elif "Site" in next_action:
        chosen = FILL_DCR_SITE
    elif next_action == "" or "Clario" in next_action:
        chosen = FILL_DCR_CLARIO
    else:
        chosen = FILL_ODD
    return chosen
|
||||
|
||||
|
||||
def build_ecoa_dcrs_sheet(ws, docs):
    """Fill the "eCOA_DCRs" sheet: legend, then the table with its header on row 6.

    Rows are sorted by site name, subject id and creation date, and each data
    row is recoloured by DCR state, replacing the zebra fill.
    """
    header_row = 6
    _build_dcr_legend(ws)

    def _order(doc):
        return (
            doc.get("site", {}).get("name", ""),
            doc.get("subject", {}).get("id", ""),
            doc["fields"].get("Creation Date UTC", ""),
        )

    docs_sorted = sorted(docs, key=_order)

    centered_columns = {
        "Status", "Type", "Next Action Required", "Category",
        "Total Open Time (Days)", "Current Status Time (Days)",
        "Baseline Stool Count", "firstSeen", "lastSeen",
    }
    widths = {
        "Site": 16,
        "Subject ID": 16,
        "Data Correction ID": 18,
        "PI Name": 18,
        "Creation Date UTC": 14,
        "Date of Last Action UTC": 14,
        "Status": 14,
        "Type": 16,
        "Next Action Required": 16,
        "Category": 20,
        "Total Open Period": 14,
        "Total Open Time (Days)": 14,
        "Current Status Time (Days)": 16,
        "Reason for Change": 20,
        "Description": 50,
        "Resolution": 50,
        "Query History": 60,
        "Age at Informed Consent": 14,
        "Baseline Stool Count": 14,
        "firstSeen": 12,
        "lastSeen": 12,
    }
    _build_sheet(
        ws,
        docs_sorted,
        COLUMNS_ECOA_DCRS,
        date_cols={"Creation Date UTC", "Date of Last Action UTC"},
        center_cols=centered_columns,
        col_widths=widths,
        wrap_cols={"Reason for Change", "Description", "Resolution", "Query History"},
        header_row=header_row,
        row_font_fn=lambda doc: CELL_FONT,
    )

    # Recolour each data row by DCR state (overrides the zebra fill).
    column_count = len(COLUMNS_ECOA_DCRS)
    for row_no, doc in enumerate(docs_sorted, start=header_row + 1):
        state_fill = _dcr_row_fill(doc)
        for col_no in range(1, column_count + 1):
            ws.cell(row=row_no, column=col_no).fill = state_fill
|
||||
|
||||
|
||||
def build_ecg_dcrs_sheet(ws, docs):
    """Fill the "ECG_DCRs" sheet: legend, then the table with its header on row 6.

    Rows are sorted by site name, subject id and creation date, and each data
    row is recoloured by DCR state, replacing the zebra fill.
    """
    header_row = 6
    _build_dcr_legend(ws)

    def _order(doc):
        return (
            doc.get("site", {}).get("name", ""),
            doc.get("subject", {}).get("id", ""),
            doc["fields"].get("Creation Date UTC", ""),
        )

    docs_sorted = sorted(docs, key=_order)

    centered_columns = {
        "Status", "Type", "Next Action Required", "Category",
        "Total Open Time (Days)", "Current Status Time (Days)",
        "firstSeen", "lastSeen",
    }
    widths = {
        "Site ID": 14,
        "Subject Number": 16,
        "Data Correction ID": 16,
        "PI Name": 18,
        "Age": 10,
        "Creation Date UTC": 14,
        "Date of Last Action UTC": 14,
        "Status": 14,
        "Type": 12,
        "Next Action Required": 16,
        "Category": 14,
        "Total Open Period": 14,
        "Total Open Time (Days)": 14,
        "Current Status Time (Days)": 16,
        "Reason for Change": 20,
        "Query History": 60,
        "firstSeen": 12,
        "lastSeen": 12,
    }
    _build_sheet(
        ws,
        docs_sorted,
        COLUMNS_ECG_DCRS,
        date_cols={"Creation Date UTC", "Date of Last Action UTC"},
        center_cols=centered_columns,
        col_widths=widths,
        wrap_cols={"Query History"},
        header_row=header_row,
        row_font_fn=lambda doc: CELL_FONT,
    )

    # Recolour each data row by DCR state (overrides the zebra fill).
    column_count = len(COLUMNS_ECG_DCRS)
    for row_no, doc in enumerate(docs_sorted, start=header_row + 1):
        state_fill = _dcr_row_fill(doc)
        for col_no in range(1, column_count + 1):
            ws.cell(row=row_no, column=col_no).fill = state_fill
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# List Compliance
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# Visit order used to compute the compliance windows; visits not listed here
# (Unscheduled etc.) are ignored.
COMPLIANCE_VISIT_ORDER = [
    "I-0",
    "I-2",
    "I-4",
    "I-8",
    "I-12",
    "M-4",
]

# Green highlight for rows with compliance >= 100 %.
FILL_COMPLIANCE_OK = PatternFill("solid", fgColor="C6EFCE")
FONT_COMPLIANCE_OK = Font(size=10, color="006100")
|
||||
|
||||
|
||||
def build_compliance_sheet(ws, score_docs, diary_docs):
    """Fill the "Compliance" sheet: MayoDiary completion between visits.

    One row is written per subject and visit listed in
    COMPLIANCE_VISIT_ORDER that has a usable visit date:

    * Window for I-0 = from the subject's first diary entry to the I-0 date.
    * Window for other visits = from (previous visit date + 1 day) to the
      visit date; without a previous dated visit it starts at the first
      diary entry.
    * "Vyplněno" = number of the subject's MayoDiary records whose Report
      Date lies inside the window.
    * "Dní v okně" = number of calendar days in the window, both ends
      included.

    Rows where at least as many records as window days were filled are
    highlighted green. The check compares the raw counts, not the rounded
    percentage, so e.g. 200 of 201 days (which rounds to 100 %) is not
    reported as fully compliant.
    """

    def _as_date(value):
        """Parse ISO strings and reduce datetimes to plain dates.

        Mixing ``date`` and ``datetime`` raises TypeError on comparison and
        subtraction, so every date used below is normalised first.
        """
        value = _iso_to_date(value)
        if isinstance(value, datetime):
            return value.date()
        return value

    # -- Diary dates per subject (single pass) --------------------------------
    diary_by_subj: dict[str, list] = {}
    for d in diary_docs:
        subj = d.get("subject", {}).get("id", "")
        dt = _as_date(d["fields"].get("Report Date", ""))
        if subj and hasattr(dt, "year"):
            diary_by_subj.setdefault(subj, []).append(dt)
    first_diary = {s: min(dts) for s, dts in diary_by_subj.items() if dts}

    def _vidx(v):
        """Position of a visit in COMPLIANCE_VISIT_ORDER (unknown visits last)."""
        try:
            return COMPLIANCE_VISIT_ORDER.index(v)
        except ValueError:
            return len(COMPLIANCE_VISIT_ORDER)

    # -- Visits per subject (known visits only) -------------------------------
    by_subj: dict[str, list] = {}
    for sd in score_docs:
        if sd["fields"].get("Visit", "") not in COMPLIANCE_VISIT_ORDER:
            continue
        subj = sd.get("subject", {}).get("id", "")
        by_subj.setdefault(subj, []).append(sd)

    rows = []
    for subj in sorted(by_subj):
        visits = sorted(by_subj[subj], key=lambda d: _vidx(d["fields"].get("Visit", "")))
        prev_end = None
        for sd in visits:
            visit = sd["fields"].get("Visit", "")
            vdate = _as_date(sd["fields"].get("Visit Date", ""))
            if not hasattr(vdate, "year"):
                # No usable visit date: skip without moving the window.
                continue
            if visit == "I-0":
                start = first_diary.get(subj)
            else:
                start = (prev_end + timedelta(days=1)) if prev_end else first_diary.get(subj)
            # The next window starts after this visit even if this row is skipped.
            prev_end = vdate
            if not start or not hasattr(start, "year"):
                continue
            days = (vdate - start).days + 1
            if days <= 0:
                continue
            filled = sum(1 for dt in diary_by_subj.get(subj, []) if start <= dt <= vdate)
            pct = round(filled / days * 100)
            rows.append({
                "site": sd.get("site", {}).get("name", ""),
                "subj": subj,
                "visit": visit,
                "start": start,
                "end": vdate,
                "days": days,
                "filled": filled,
                "pct": pct,
            })

    # -- Writing the sheet ----------------------------------------------------
    headers = ["Site", "Subject ID", "Visit", "Okno od", "Okno do",
               "Dní v okně", "Vyplněno", "Compliance %"]
    col_widths = {"Site": 18, "Subject ID": 16, "Visit": 10, "Okno od": 14,
                  "Okno do": 14, "Dní v okně": 12, "Vyplněno": 12, "Compliance %": 14}
    center_cols = {"Visit", "Dní v okně", "Vyplněno", "Compliance %"}
    date_cols = {"Okno od", "Okno do"}

    for col_idx, header in enumerate(headers, 1):
        cell = ws.cell(row=1, column=col_idx, value=header)
        cell.font = HEADER_FONT
        cell.fill = HEADER_FILL
        cell.alignment = ALIGN_CTR
        cell.border = BORDER
    ws.row_dimensions[1].height = 28

    for row_idx, r in enumerate(rows, 2):
        # Compare raw counts: the rounded percentage can read 100 while a
        # day is still missing in a long window.
        is_ok = r["filled"] >= r["days"]
        if is_ok:
            fill = FILL_COMPLIANCE_OK
            font = FONT_COMPLIANCE_OK
        else:
            fill = FILL_EVEN if row_idx % 2 == 0 else FILL_ODD
            font = CELL_FONT
        values = [r["site"], r["subj"], r["visit"], r["start"], r["end"],
                  r["days"], r["filled"], r["pct"]]
        for col_idx, (header, value) in enumerate(zip(headers, values), 1):
            cell = ws.cell(row=row_idx, column=col_idx, value=value)
            cell.font = font
            cell.fill = fill
            cell.border = BORDER
            if header in date_cols:
                cell.number_format = "DD-MMM-YYYY"
            if header == "Compliance %":
                # The stored value is already a percentage; only append the sign.
                cell.number_format = '0"%"'
            cell.alignment = ALIGN_CTR if header in center_cols else ALIGN_LEFT

    for col_idx, header in enumerate(headers, 1):
        ws.column_dimensions[get_column_letter(col_idx)].width = col_widths.get(header, 14)

    ws.freeze_panes = "A2"
    ws.auto_filter.ref = f"A1:{get_column_letter(len(headers))}1"
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Helpers: výstupní cesta
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _unique_path(directory: Path, stem: str, suffix: str) -> Path:
|
||||
candidate = directory / f"{stem}{suffix}"
|
||||
if not candidate.exists():
|
||||
return candidate
|
||||
n = 2
|
||||
while True:
|
||||
candidate = directory / f"{stem} ({n}){suffix}"
|
||||
if not candidate.exists():
|
||||
return candidate
|
||||
n += 1
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Timing helper
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _tick(label: str, t0: float) -> float:
|
||||
"""Vypíše dobu od t0 a vrátí aktuální čas jako nový t0."""
|
||||
elapsed = time.perf_counter() - t0
|
||||
print(f" {label:<30} {elapsed:6.2f} s")
|
||||
return time.perf_counter()
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Main
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def main():
    """Generate the report workbook.

    Steps: fetch the four Clario collections from MongoDB and sort them,
    build the six worksheets with openpyxl, save a temporary .xlsx, let
    ``inject_vba`` turn it into the final .xlsm and delete the temporary
    file. The duration of every step is printed via ``_tick``.
    """
    t_total = time.perf_counter()
    print("Spouštím generování reportu...")
    print()

    # -- 1. MongoDB: connect + fetch + sort ----------------------------------
    t = time.perf_counter()
    client = MongoClient(MONGO_URI, serverSelectionTimeoutMS=5000)
    try:
        # Fail fast when the server is unreachable.
        client.admin.command("ping")
        db = client[DB_NAME]
        score_docs = list(db["Clario.MayoScore"].find({}))
        diary_docs = list(db["Clario.MayoDiary"].find({}))
        ecoa_dcr_docs = list(db["Clario.eCOA_DCRs"].find({}))
        ecg_dcr_docs = list(db["Clario.ECG_DCRs"].find({}))
    finally:
        # Release the connection even when the ping or a fetch raises.
        client.close()
    score_docs.sort(key=_visit_sort_key)
    diary_docs.sort(key=lambda d: (
        d.get("subject", {}).get("id", ""),
        d["fields"].get("Report Date", ""),
    ))
    t = _tick(f"MongoDB (ping, fetch, sort → {len(score_docs)} + {len(diary_docs)} + {len(ecoa_dcr_docs)} + {len(ecg_dcr_docs)} záznamů)", t)

    # -- 2-4. Building the sheets --------------------------------------------
    wb = Workbook()
    ws_score = wb.active
    ws_score.title = "MayoScore"
    build_mayo_score_sheet(ws_score, score_docs)
    t = _tick("List MayoScore (KLIKNI SEM, zebra, červené I-0, autofilter)", t)

    ws_diary = wb.create_sheet("MayoDiary")
    build_mayo_diary_sheet(ws_diary, diary_docs)
    t = _tick("List MayoDiary (zebra, formátování dat, autofilter)", t)

    ws_comp = wb.create_sheet("Compliance")
    build_compliance_sheet(ws_comp, score_docs, diary_docs)
    t = _tick("List Compliance (okna mezi visitami, % vyplnění, zelená ≥100 %)", t)

    ws_days = wb.create_sheet("EligibleDays")
    build_eligible_days_sheet(ws_days, score_docs, diary_docs)
    t = _tick("List EligibleDays (diary lookup, included/excluded flag, autofilter)", t)

    ws_ecoa = wb.create_sheet("eCOA_DCRs")
    build_ecoa_dcrs_sheet(ws_ecoa, ecoa_dcr_docs)
    t = _tick(f"List eCOA_DCRs ({len(ecoa_dcr_docs)} záznamů)", t)

    ws_ecg = wb.create_sheet("ECG_DCRs")
    build_ecg_dcrs_sheet(ws_ecg, ecg_dcr_docs)
    t = _tick(f"List ECG_DCRs ({len(ecg_dcr_docs)} záznamů)", t)

    # -- 5. Saving the XLSX --------------------------------------------------
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
    today = datetime.now().strftime("%Y-%m-%d")
    base_stem = f"{today} 77242113UCO3001 Clario Reports"
    xlsm_path = _unique_path(OUTPUT_DIR, base_stem, ".xlsm")
    # Temporary workbook next to the final file; removed after VBA injection.
    xlsx_path = xlsm_path.with_suffix(".xlsx")
    wb.save(str(xlsx_path))
    t = _tick("Uložení XLSX (openpyxl, dočasný soubor)", t)

    # -- 6. Injecting the VBA ------------------------------------------------
    inject_vba(xlsx_path, xlsm_path)
    xlsx_path.unlink(missing_ok=True)
    _tick("Injektování VBA (xlwings: open → AddFromString → SaveAs .xlsm)", t)

    # -- Summary -------------------------------------------------------------
    total = time.perf_counter() - t_total
    print()
    print(f" {'Celkem':<30} {total:6.2f} s")
    print()
    print(f"Uloženo: {xlsm_path}")
|
||||
|
||||
|
||||
def inject_vba(xlsx_path: Path, xlsm_path: Path) -> None:
    """Add the click-handler macro to the workbook and save it as .xlsm.

    Opens ``xlsx_path`` in a hidden Excel instance through xlwings, adds a
    ``Worksheet_SelectionChange`` handler to the code module of the sheet
    named "MayoScore" (falling back to the component "Sheet1") and saves the
    workbook to ``xlsm_path`` in the macro-enabled format. The handler reacts
    to a single-row selection in column 1 below the header: it filters the
    "EligibleDays" sheet by the subject (column 3) and visit (column 4) of
    the clicked row and activates that sheet.

    NOTE(review): access to ``VBProject`` presumably requires Excel's
    "Trust access to the VBA project object model" setting — confirm.
    """
    vba_code = '''\
Private Sub Worksheet_SelectionChange(ByVal Target As Range)
If Target.Row < 2 Then Exit Sub
If Target.Rows.Count > 1 Then Exit Sub
If Target.Column <> 1 Then Exit Sub

Dim subjectId As String
Dim visit As String
subjectId = CStr(Me.Cells(Target.Row, 3).Value)
visit = CStr(Me.Cells(Target.Row, 4).Value)

If subjectId = "" Or visit = "" Then Exit Sub

Dim ws As Worksheet
On Error Resume Next
Set ws = ThisWorkbook.Sheets("EligibleDays")
On Error GoTo 0
If ws Is Nothing Then Exit Sub

Application.ScreenUpdating = False

ws.AutoFilterMode = False
ws.Range("A1").AutoFilter
ws.Range("A1").AutoFilter Field:=2, Criteria1:=subjectId
ws.Range("A1").AutoFilter Field:=3, Criteria1:=visit

ws.Activate
ws.Range("A2").Select

Application.ScreenUpdating = True
End Sub
'''

    excel = xw.App(visible=False)
    try:
        book = excel.books.open(str(xlsx_path))

        # Find the VBComponent of the sheet whose tab name is "MayoScore".
        target = None
        for component in book.api.VBProject.VBComponents:
            if component.Type != 100:  # 100 = document module (sheet/workbook)
                continue
            try:
                is_match = component.Properties("Name").Value == "MayoScore"
            except Exception:
                # Best effort: a component without a readable Name is skipped.
                continue
            if is_match:
                target = component
                break

        if target is None:
            # Fallback: the first sheet (Sheet1).
            target = book.api.VBProject.VBComponents("Sheet1")

        target.CodeModule.AddFromString(vba_code)
        book.api.SaveAs(str(xlsm_path), FileFormat=52)  # 52 = xlOpenXMLWorkbookMacroEnabled
        book.close()
    finally:
        # Always shut the hidden Excel instance down.
        excel.quit()
|
||||
|
||||
|
||||
# Run the report generation only when executed as a script, not on import.
if __name__ == "__main__":
    main()
|
||||
@@ -1,7 +1,7 @@
|
||||
"""
|
||||
import_to_mongo.py
|
||||
Verze: 1.2
|
||||
Datum: 2026-06-02
|
||||
Verze: 1.3
|
||||
Datum: 2026-06-15
|
||||
|
||||
Import Clario CSV do MongoDB (databáze: Clario).
|
||||
|
||||
@@ -11,7 +11,8 @@ Klíč: MayoDiary → Subject ID + Form Number
|
||||
MayoScore → Participant ID + Visit
|
||||
eCOA_DCRs → Data Correction ID
|
||||
ECG_DCRs → Data Correction ID
|
||||
Historie: při změně fields se stará verze uloží do pole history[]
|
||||
Historie: při změně jakéhokoliv datového sloupce (fields + outcome cols) se stará
|
||||
verze uloží do pole history[] spolu s outcome poli
|
||||
Po importu přesune zpracované CSV do downloads/Zpracovano/
|
||||
|
||||
Použití:
|
||||
@@ -119,6 +120,14 @@ def detect_collection_type(filename: str) -> str | None:
|
||||
return None
|
||||
|
||||
|
||||
def data_snapshot(doc: dict, outcome_cols: tuple) -> dict:
    """Return a comparable snapshot of a document's data.

    The snapshot holds the ``fields`` sub-document (``{}`` when absent) plus
    one entry per name in ``outcome_cols`` (``None`` when the document lacks
    that key).
    """
    return {
        "fields": doc.get("fields", {}),
        **{name: doc.get(name) for name in outcome_cols},
    }
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# CSV → dokument
|
||||
# ---------------------------------------------------------------------------
|
||||
@@ -176,6 +185,7 @@ def import_file(csv_path: str, db) -> dict:
|
||||
|
||||
cfg = COLLECTION_CONFIG[col_type]
|
||||
col_name = cfg["collection"]
|
||||
outcome_cols = tuple(cfg.get("outcome_cols", ()))
|
||||
snapshot_date = extract_snapshot_date(filename)
|
||||
collection = db[col_name]
|
||||
|
||||
@@ -207,11 +217,13 @@ def import_file(csv_path: str, db) -> dict:
|
||||
collection.insert_one(doc)
|
||||
inserted += 1
|
||||
|
||||
elif existing.get("fields") != doc["fields"]:
|
||||
old_entry = {
|
||||
"date": existing.get("lastSeen", snapshot_date),
|
||||
"fields": existing["fields"],
|
||||
}
|
||||
elif data_snapshot(existing, outcome_cols) != data_snapshot(doc, outcome_cols):
|
||||
# Uložíme kompletní snapshot starého stavu (fields + outcome cols)
|
||||
old_entry = {"date": existing.get("lastSeen", snapshot_date)}
|
||||
for col in outcome_cols:
|
||||
old_entry[col] = existing.get(col)
|
||||
old_entry["fields"] = existing.get("fields", {})
|
||||
|
||||
update_doc = {k: v for k, v in doc.items()}
|
||||
update_doc["lastSeen"] = snapshot_date
|
||||
collection.update_one(
|
||||
|
||||
@@ -0,0 +1,36 @@
|
||||
# scan_cleanup_v1.0.ps1
|
||||
|
||||
**Verze:** 1.0 · **Datum:** 2026-06-15
|
||||
|
||||
## Účel
|
||||
READ-ONLY skener volného místa na disku pro účet **bez admin práv** (např. JNJ
|
||||
počítač). Skript **nic nemaže** — jen proskenuje typická user-space místa, kde
|
||||
lze uklízet bez administrátora, a vypíše přehled seřazený podle velikosti.
|
||||
|
||||
## Spuštění
|
||||
Na JNJ počítači v PowerShellu:
|
||||
|
||||
```powershell
|
||||
powershell -ExecutionPolicy Bypass -File .\scan_cleanup_v1.0.ps1
|
||||
```
|
||||
|
||||
Uložení reportu do souboru:
|
||||
|
||||
```powershell
|
||||
powershell -ExecutionPolicy Bypass -File .\scan_cleanup_v1.0.ps1 *> report.txt
|
||||
```
|
||||
|
||||
## Co skript dělá
|
||||
1. Vypíše volné/obsazené místo na systémovém disku.
|
||||
2. **Bezpečně smazatelné** — cache/temp (uživatelský TEMP, INetCache, WER,
|
||||
thumbnail cache, Chrome/Edge/Firefox cache, Teams, Office cache, pip/npm/
|
||||
NuGet/Playwright cache, Spotify storage, CrashDumps) s velikostí a počtem souborů.
|
||||
3. **K ruční kontrole** — Downloads, Plocha, Dokumenty, Koš (nemazat automaticky).
|
||||
4. **20 největších souborů v profilu** nad 100 MB.
|
||||
5. Vypíše hotové příkazy pro **ruční** smazání (pozor: aplikace musí být zavřené).
|
||||
|
||||
## Poznámky
|
||||
- Vše skenuje pouze v rámci uživatelského profilu → nepotřebuje admina.
|
||||
- Zamčené soubory (běžící prohlížeč apod.) se při pozdějším mazání přeskočí —
|
||||
před úklidem cache zavřít příslušnou aplikaci.
|
||||
- Reálné smazání si pouští uživatel ručně, nikdy ne skript sám.
|
||||
@@ -0,0 +1,164 @@
|
||||
# =============================================================================
|
||||
# scan_cleanup_v1.0.ps1
|
||||
# Verze: 1.0
|
||||
# Datum: 2026-06-15
|
||||
# Autor: Vladimír Buzalka (s asistencí Claude)
|
||||
# Popis: READ-ONLY skener volného místa na disku pro účet bez admin práv.
|
||||
# NIC NEMAŽE. Pouze proskenuje typická "user-space" místa, kde lze
|
||||
# uklízet bez administrátorských oprávnění, a vypíše přehled
|
||||
# seřazený podle velikosti + návrhy co smazat. Na konci ukáže
|
||||
# přesné příkazy pro skutečné smazání (musíš je spustit ručně).
|
||||
#
|
||||
# Spuštění (z PowerShellu, NEpotřebuje admina):
|
||||
# powershell -ExecutionPolicy Bypass -File .\scan_cleanup_v1.0.ps1
|
||||
# Volitelně uložení reportu do souboru:
|
||||
# powershell -ExecutionPolicy Bypass -File .\scan_cleanup_v1.0.ps1 *> report.txt
|
||||
# =============================================================================
|
||||
|
||||
$ErrorActionPreference = 'SilentlyContinue'
|
||||
$ProgressPreference = 'SilentlyContinue'
|
||||
|
||||
function Format-Size {
    # Render a byte count as text: GB with two decimals, MB with one,
    # KB with none, and plain bytes below 1 KB.
    param([long]$Bytes)

    # Largest unit first; the first threshold the value reaches wins.
    $units = @(
        @{ Limit = 1GB; Format = '{0:N2} GB' },
        @{ Limit = 1MB; Format = '{0:N1} MB' },
        @{ Limit = 1KB; Format = '{0:N0} KB' }
    )
    foreach ($unit in $units) {
        if ($Bytes -ge $unit.Limit) {
            return ($unit.Format -f ($Bytes / $unit.Limit))
        }
    }
    return "$Bytes B"
}
|
||||
|
||||
function Get-FolderSize {
    # Sum the size of all files under $Path (recursive, hidden files included).
    # Returns $null when the path does not exist, otherwise an object with
    # Bytes (total length) and Count (number of files).
    param([string]$Path)

    if (-not (Test-Path -LiteralPath $Path)) {
        return $null
    }

    $found = @(Get-ChildItem -LiteralPath $Path -Recurse -Force -File -ErrorAction SilentlyContinue)
    if ($found.Count -eq 0) {
        return [pscustomobject]@{ Bytes = 0; Count = 0 }
    }

    $measured = $found | Measure-Object -Property Length -Sum
    return [pscustomobject]@{
        Bytes = [long]($measured.Sum)
        Count = [int]$measured.Count
    }
}
|
||||
|
||||
# --- Hlavička / info o disku -------------------------------------------------
|
||||
Write-Host ""
|
||||
Write-Host "===========================================================" -ForegroundColor Cyan
|
||||
Write-Host " SKEN MOZNOSTI UKLIDU DISKU (read-only, bez admina)" -ForegroundColor Cyan
|
||||
Write-Host " Pocitac: $env:COMPUTERNAME Uzivatel: $env:USERNAME" -ForegroundColor Cyan
|
||||
Write-Host " Cas: $(Get-Date -Format 'yyyy-MM-dd HH:mm:ss')" -ForegroundColor Cyan
|
||||
Write-Host "===========================================================" -ForegroundColor Cyan
|
||||
Write-Host ""
|
||||
|
||||
# Free / used space on the system drive.
# $env:SystemDrive is a string such as "C:"; Get-PSDrive expects the bare
# letter. Do not go through Get-Item here: "C:" without a backslash is
# drive-relative and resolves to the current directory on that drive, so its
# Name is not the drive letter and the lookup silently finds nothing.
$sysDriveName = "$env:SystemDrive".TrimEnd(':')
$drive = Get-PSDrive -Name $sysDriveName -PSProvider FileSystem -ErrorAction SilentlyContinue
if ($drive) {
    $free = $drive.Free
    $used = $drive.Used
    $total = $free + $used
    Write-Host ("Disk {0} celkem: {1} volne: {2} obsazeno: {3}" -f `
        $env:SystemDrive, (Format-Size $total), (Format-Size $free), (Format-Size $used)) -ForegroundColor Yellow
    Write-Host ""
}
|
||||
|
||||
# --- Kandidatske lokace (vse v ramci uzivatelskeho profilu = bez admina) -----
|
||||
# Bezpecne smazatelne (cache / temp); kos je az v seznamu k rucni kontrole nize
|
||||
$candidates = @(
|
||||
@{ Name = 'Uzivatelsky TEMP'; Path = $env:TEMP; Safe = $true }
|
||||
@{ Name = 'Windows Temp (user)'; Path = (Join-Path $env:LOCALAPPDATA 'Temp'); Safe = $true }
|
||||
@{ Name = 'INetCache (IE/Win)'; Path = (Join-Path $env:LOCALAPPDATA 'Microsoft\Windows\INetCache'); Safe = $true }
|
||||
@{ Name = 'WER - chybove reporty'; Path = (Join-Path $env:LOCALAPPDATA 'Microsoft\Windows\WER'); Safe = $true }
|
||||
@{ Name = 'Explorer thumbnail cache'; Path = (Join-Path $env:LOCALAPPDATA 'Microsoft\Windows\Explorer'); Safe = $true }
|
||||
@{ Name = 'Chrome - Cache'; Path = (Join-Path $env:LOCALAPPDATA 'Google\Chrome\User Data\Default\Cache'); Safe = $true }
|
||||
@{ Name = 'Chrome - Code Cache'; Path = (Join-Path $env:LOCALAPPDATA 'Google\Chrome\User Data\Default\Code Cache'); Safe = $true }
|
||||
@{ Name = 'Chrome - GPUCache'; Path = (Join-Path $env:LOCALAPPDATA 'Google\Chrome\User Data\Default\GPUCache'); Safe = $true }
|
||||
@{ Name = 'Edge - Cache'; Path = (Join-Path $env:LOCALAPPDATA 'Microsoft\Edge\User Data\Default\Cache'); Safe = $true }
|
||||
@{ Name = 'Edge - Code Cache'; Path = (Join-Path $env:LOCALAPPDATA 'Microsoft\Edge\User Data\Default\Code Cache'); Safe = $true }
|
||||
@{ Name = 'Firefox - cache2'; Path = (Join-Path $env:LOCALAPPDATA 'Mozilla\Firefox\Profiles'); Safe = $true }
|
||||
@{ Name = 'Teams - cache (classic)'; Path = (Join-Path $env:APPDATA 'Microsoft\Teams'); Safe = $true }
|
||||
@{ Name = 'Teams - cache (new)'; Path = (Join-Path $env:LOCALAPPDATA 'Packages\MSTeams_8wekyb3d8bbwe\LocalCache'); Safe = $true }
|
||||
@{ Name = 'Office - dokumentova cache'; Path = (Join-Path $env:LOCALAPPDATA 'Microsoft\Office\16.0\OfficeFileCache'); Safe = $true }
|
||||
@{ Name = 'pip cache (Python)'; Path = (Join-Path $env:LOCALAPPDATA 'pip\Cache'); Safe = $true }
|
||||
@{ Name = 'Playwright browsers cache'; Path = (Join-Path $env:LOCALAPPDATA 'ms-playwright'); Safe = $true }
|
||||
@{ Name = 'npm cache'; Path = (Join-Path $env:LOCALAPPDATA 'npm-cache'); Safe = $true }
|
||||
@{ Name = 'NuGet cache'; Path = (Join-Path $env:USERPROFILE '.nuget\packages'); Safe = $true }
|
||||
@{ Name = 'Spotify - Storage'; Path = (Join-Path $env:LOCALAPPDATA 'Spotify\Storage'); Safe = $true }
|
||||
@{ Name = 'CrashDumps'; Path = (Join-Path $env:LOCALAPPDATA 'CrashDumps'); Safe = $true }
|
||||
)
|
||||
|
||||
# Zkontrolovat ale NEMAZAT automaticky (uzivatel musi posoudit obsah)
|
||||
$review = @(
|
||||
@{ Name = 'Slozka Downloads (stahovani)'; Path = (Join-Path $env:USERPROFILE 'Downloads') }
|
||||
@{ Name = 'Plocha (Desktop)'; Path = (Join-Path $env:USERPROFILE 'Desktop') }
|
||||
@{ Name = 'Dokumenty'; Path = (Join-Path $env:USERPROFILE 'Documents') }
|
||||
@{ Name = 'Kos (Recycle Bin)'; Path = (Join-Path $env:SystemDrive '\$Recycle.Bin') }
|
||||
)
|
||||
|
||||
Write-Host "--- BEZPECNE SMAZATELNE (cache / temp) --------------------" -ForegroundColor Green
# Measure every candidate folder and keep the non-empty ones.
# Two candidates can point at the same folder (the user TEMP variable normally
# equals %LOCALAPPDATA%\Temp); each physical folder is reported only once so
# the grand total is not inflated.
$results = @()
$seenPaths = @{}
foreach ($c in $candidates) {
    $info = Get-FolderSize -Path $c.Path
    if ((-not $info) -or ($info.Bytes -le 0)) { continue }

    # Normalize the path (full name, no trailing backslash, lower case) to use
    # it as a case-insensitive de-duplication key.
    $item = Get-Item -LiteralPath $c.Path -Force -ErrorAction SilentlyContinue
    $key = if ($item) { $item.FullName } else { [string]$c.Path }
    $key = $key.TrimEnd('\').ToLowerInvariant()
    if ($seenPaths.ContainsKey($key)) { continue }
    $seenPaths[$key] = $true

    $results += [pscustomobject]@{
        Name  = $c.Name
        Path  = $c.Path
        Bytes = $info.Bytes
        Count = $info.Count
    }
}
|
||||
|
||||
# List the safe-to-delete findings, largest first, and add up their sizes.
$ordered = @($results | Sort-Object -Property Bytes -Descending)
$totalSafe = 0
foreach ($row in $ordered) {
    $totalSafe += $row.Bytes
    $line = " {0,10} {1,7} souboru {2}" -f (Format-Size $row.Bytes), $row.Count, $row.Name
    Write-Host $line
    Write-Host (" -> {0}" -f $row.Path) -ForegroundColor DarkGray
}
if ($results.Count -eq 0) {
    Write-Host " (nic vyznamneho nenalezeno)" -ForegroundColor DarkGray
}

Write-Host ""
Write-Host (" >>> POTENCIAL CACHE/TEMP CELKEM: {0}" -f (Format-Size $totalSafe)) -ForegroundColor Green
Write-Host ""

# Folders the user has to review by hand; they are only measured and listed.
Write-Host "--- K RUCNI KONTROLE (NEMAZAT automaticky) ----------------" -ForegroundColor Yellow
foreach ($entry in $review) {
    $size = Get-FolderSize -Path $entry.Path
    if (-not $size) { continue }
    Write-Host (" {0,10} {1,7} souboru {2}" -f (Format-Size $size.Bytes), $size.Count, $entry.Name)
    Write-Host (" -> {0}" -f $entry.Path) -ForegroundColor DarkGray
}
Write-Host ""
|
||||
|
||||
# --- Largest files in the user profile (> 100 MB) -----------------------------
Write-Host "--- 20 NEJVETSICH SOUBORU V PROFILU (>100 MB) -------------" -ForegroundColor Magenta
$minLength = 100MB
$profileFiles = Get-ChildItem -LiteralPath $env:USERPROFILE -Recurse -Force -File -ErrorAction SilentlyContinue
$big = $profileFiles |
    Where-Object { $_.Length -gt $minLength } |
    Sort-Object -Property Length -Descending |
    Select-Object -First 20
if (-not $big) {
    Write-Host " (zadne soubory nad 100 MB)" -ForegroundColor DarkGray
} else {
    foreach ($file in $big) {
        Write-Host (" {0,10} {1}" -f (Format-Size $file.Length), $file.FullName)
    }
}
Write-Host ""
|
||||
|
||||
# --- How to actually delete (printed only; nothing is executed here) ----------
Write-Host "===========================================================" -ForegroundColor Cyan
Write-Host " JAK SKUTECNE SMAZAT (spustit RUCNE, az po kontrole):" -ForegroundColor Cyan
Write-Host "===========================================================" -ForegroundColor Cyan
# The here-string is shown to the user verbatim; `$ keeps the variables
# unexpanded so the printed commands can be copied and run as they are.
# The Chrome example uses -Path, not -LiteralPath: -LiteralPath takes the
# trailing * literally and would match nothing.
Write-Host @"
# Vyprazdneni kose:
Clear-RecycleBin -Force

# Smazani obsahu uzivatelskeho TEMP (zavri aplikace; nektere zamcene soubory zustanou):
Get-ChildItem -LiteralPath `$env:TEMP -Recurse -Force -ErrorAction SilentlyContinue |
Remove-Item -Recurse -Force -ErrorAction SilentlyContinue

# Smazani konkretni cache slozky (priklad Chrome) - prohlizec MUSI byt zavreny:
Remove-Item -Path "`$env:LOCALAPPDATA\Google\Chrome\User Data\Default\Cache\*" -Recurse -Force -ErrorAction SilentlyContinue

# Spravce mista ve Windows (bez admina): Nastaveni > System > Uloziste
"@ -ForegroundColor Gray
Write-Host ""
Write-Host "Hotovo. Skript NIC nesmazal - jen vypsal prehled." -ForegroundColor Green
|
||||
@@ -0,0 +1,21 @@
|
||||
# store_cda_batch_v1.4.py
|
||||
|
||||
**Verze:** 1.4 · **Datum:** 2026-06-15
|
||||
|
||||
Dávkové uložení binárek CDA (PDF) do Mongo `feasibility.investigators` →
|
||||
`cda.data_*`. Zdroj = `.msg` na Toweru (`/mnt/user/JNJEMAILS`), SFTP + extract_msg.
|
||||
|
||||
## Spuštění
|
||||
```
|
||||
python store_cda_batch_v1.4.py # dry-run
|
||||
python store_cda_batch_v1.4.py --apply # zápis
|
||||
```
|
||||
|
||||
## Historie
|
||||
- v1.4 — DÁVKA 7 (15JUN2026): Molnár Martin (GASTROMART s.r.o., krok 4→5),
|
||||
Dzuriková Michaela (IBDcentrum s.r.o., krok 4→5).
|
||||
- v1.3 — DÁVKA 6 (12JUN2026): Gregušová Katarína, Drastich Pavel.
|
||||
- v1.2 — DÁVKA 5 (11JUN2026): Mudr Robert.
|
||||
- v1.1 — DÁVKA 4 (11JUN2026): Konečný Michal, Baláž Jozef.
|
||||
- v1.0 — DÁVKY 1–3 (09–10JUN2026): Hlavatý, Fedurco, Tichý, Falc, Pešta,
|
||||
Jungwirthová, Matouš, Mihálkanin, Krížová, Gregar, Ďurina, Horváth.
|
||||
@@ -0,0 +1,139 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
# =============================================================================
|
||||
# Nazev: store_cda_batch_v1.4.py
|
||||
# Verze: 1.4
|
||||
# Datum: 2026-06-15
|
||||
# Popis: Davkove ulozi binarky CDA (PDF) do Mongo k investigatorum
|
||||
# (feasibility.investigators -> cda.data_*). Zdroj = .msg soubory na
|
||||
# Toweru (/mnt/user/JNJEMAILS), stazene pres SFTP, priloha vytazena
|
||||
# extract_msg. Mapovani investigator -> (.msg, attachment) je
|
||||
# explicitni. Zapise cda.data_* + doplni cda.soubor.
|
||||
# Pouziti: python store_cda_batch_v1.4.py (dry-run / nahled)
|
||||
# python store_cda_batch_v1.4.py --apply (zapise do Mongo)
|
||||
# Zmeny v1.4: DAVKA 7 (15JUN2026) - Molnar Martin (GASTROMART s.r.o., krok 4->5),
|
||||
# Dzurikova Michaela (IBDcentrum s.r.o., krok 4->5).
|
||||
# =============================================================================
|
||||
|
||||
import os
|
||||
import sys
|
||||
import base64
|
||||
import hashlib
|
||||
import unicodedata
|
||||
import paramiko
|
||||
import extract_msg
|
||||
from pymongo import MongoClient
|
||||
from bson import ObjectId
|
||||
|
||||
MONGO_URI = os.environ.get("MONGO_URI", "mongodb://192.168.1.76:27017")

# Tower (SFTP source of the .msg files). Each value can be overridden from the
# environment, the same way as MONGO_URI.
TOWER_HOST = os.environ.get("TOWER_HOST", "192.168.1.76")
TOWER_USER = os.environ.get("TOWER_USER", "root")
# SECURITY: the fallback below is a root password committed to source control.
# It is kept only so existing runs keep working; set TOWER_PASS in the
# environment, then remove this default and rotate the credential.
TOWER_PASS = os.environ.get("TOWER_PASS", "7309208104")
REMOTE_DIR = "/mnt/user/JNJEMAILS"
TMPDIR = r"u:\Dropbox\!!!Days\Downloads Z230\_cda_tmp"
STORED_AT = "2026-06-15"

# Rows: (investigator _id, msg_filename, attachment_filename, label)
# Batch 7 (15JUN2026)
MAPPING = [
    ("6a19832b5fc221351825797f", "FC130007F372CFD10000.msg",
     "SK_CDA_Institution_GASTROMART s.r.o._fully signed 15Jun2026.pdf",
     "Molnar Martin (GASTROMART s.r.o.)"),
    ("6a19832b5fc2213518257964", "FC130007F17E55100000.msg",
     "SK_CDA PI_MUDr. Michaela Dzurikova_IBDcentrum s.r.o_13Jun2026.pdf",
     "Dzurikova Michaela (IBDcentrum s.r.o.)"),
]
|
||||
|
||||
# HISTORIE drivejsich davek (jiz ulozeno):
|
||||
# DAVKA 6 (12JUN2026): Gregusova Katarina FC130007E9D30EB3, Drastich Pavel FC130007E9D30EB1.
|
||||
# DAVKA 5 (11JUN2026): Mudr Robert FC130007DE92C232.
|
||||
# DAVKA 4 (11JUN2026): Konecny Michal FC130007DE92C231, Balaz Jozef FC130007DE92C20F.
|
||||
# DAVKA 3 (10JUN2026): Gregar, Durina, Horvath.
|
||||
# DAVKA 1+2 (09JUN2026): Hlavaty, Fedurco, Tichy, Falc, Pesta, Jungwirthova, Lukac,
|
||||
# Matous, Mihalkanin, Krizova.
|
||||
|
||||
|
||||
def norm(s):
    """Return *s* lower-cased, without diacritics and with collapsed whitespace.

    ``None`` and other falsy values are treated as the empty string.
    """
    decomposed = unicodedata.normalize("NFKD", s or "")
    # NFKD splits accented letters into base letter + combining mark;
    # dropping the marks leaves the plain base letters.
    base_chars = [ch for ch in decomposed if not unicodedata.combining(ch)]
    words = "".join(base_chars).lower().split()
    return " ".join(words)
|
||||
|
||||
|
||||
def _pick_attachment(local_msg, att_name):
    """Return ``(filename, data)`` of the first attachment matching *att_name*.

    Names are compared after ``norm()``: an exact match, the wanted name
    contained in the attachment name, or a ``.pdf`` attachment whose name is
    contained in the wanted name. Returns ``None`` when nothing matches.
    """
    target = norm(att_name)
    m = extract_msg.Message(local_msg)
    try:
        for att in m.attachments:
            name = att.longFilename or att.shortFilename or ""
            candidate = norm(name)
            if (candidate == target
                    or target in candidate
                    or (candidate in target and name.lower().endswith(".pdf"))):
                return name, att.data
        return None
    finally:
        # Release the .msg file handle even when parsing an attachment fails.
        m.close()


def main():
    """Store the CDA PDFs listed in MAPPING into Mongo (dry-run by default).

    For every MAPPING row the .msg file is fetched from the Tower over SFTP
    (unless already cached in TMPDIR) and the matching attachment is
    extracted. A preview of the batch is always printed; the ``cda.data_*``
    fields and ``cda.soubor`` are written only when ``--apply`` is present on
    the command line.
    """
    apply = "--apply" in sys.argv
    os.makedirs(TMPDIR, exist_ok=True)

    plan = []
    ssh = paramiko.SSHClient()
    # NOTE(review): AutoAddPolicy trusts whatever host key the server presents
    # - confirm this is acceptable for the network the script runs on.
    ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy())
    try:
        ssh.connect(TOWER_HOST, username=TOWER_USER, password=TOWER_PASS, timeout=30)
        sftp = ssh.open_sftp()
        try:
            for inv_id, msg_name, att_name, label in MAPPING:
                local_msg = os.path.join(TMPDIR, msg_name)
                if not os.path.exists(local_msg):
                    # Download under a temporary name so an interrupted
                    # transfer never leaves a truncated file that later runs
                    # would mistake for a complete cached copy.
                    partial = local_msg + ".part"
                    sftp.get(f"{REMOTE_DIR}/{msg_name}", partial)
                    os.replace(partial, local_msg)
                chosen = _pick_attachment(local_msg, att_name)
                if not chosen:
                    plan.append((inv_id, label, msg_name, att_name, None, "!!! PRILOHA NENALEZENA"))
                    continue
                raw = chosen[1]
                sha = hashlib.sha256(raw).hexdigest()
                plan.append((inv_id, label, msg_name, chosen[0], (len(raw), sha, raw), "OK"))
        finally:
            sftp.close()
    finally:
        ssh.close()

    client = MongoClient(MONGO_URI)
    try:
        col = client["feasibility"]["investigators"]

        print("=== NAHLED DAVKY (CDA -> Mongo cda.data) ===\n")
        for inv_id, label, msg_name, att_name, info, status in plan:
            doc = col.find_one({"_id": ObjectId(inv_id)}, {"prijmeni": 1, "jmeno": 1, "cda.data_base64": 1})
            # "cda" may be missing or null on the document.
            has = bool(doc and (doc.get("cda") or {}).get("data_base64"))
            print(f"[{status}] {label} (_id {inv_id})")
            print(f" .msg: {msg_name}")
            print(f" priloha: {att_name}")
            if info:
                print(f" velikost: {info[0]} B sha256: {info[1]}")
            print(f" data_base64 jiz existuje: {has}")
            print()

        if not apply:
            print(">>> DRY-RUN. Pro zapis spust s --apply")
            return

        n = 0
        for inv_id, label, msg_name, att_name, info, status in plan:
            if status != "OK" or not info:
                print(f"PRESKAKUJI {label}: {status}")
                continue
            size, sha, raw = info
            b64 = base64.b64encode(raw).decode("ascii")
            res = col.update_one(
                {"_id": ObjectId(inv_id)},
                {"$set": {
                    "cda.data_base64": b64,
                    "cda.data_sha256": sha,
                    "cda.data_filename": att_name,
                    "cda.data_mime": "application/pdf",
                    "cda.data_size": size,
                    "cda.data_stored_at": STORED_AT,
                    "cda.data_source_msg": msg_name,
                    "cda.soubor": att_name,
                }},
            )
            if res.matched_count == 0:
                # No document has this _id: nothing was written, so do not
                # report the row as stored.
                print(f"!!! INVESTIGATOR NENALEZEN: {label} (_id {inv_id})")
                continue
            n += res.modified_count
            print(f"ZAPSANO: {label} (modified={res.modified_count})")
        print(f"\n>>> CELKEM ZAPSANO: {n}")
    finally:
        client.close()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -0,0 +1,121 @@
|
||||
# ============================================================
|
||||
# seaweed_backfill_v1.0.py
|
||||
# Verze: 1.0
|
||||
# Datum: 2026-06-15
|
||||
# Popis: Jednorázový backfill — nahraje do SeaweedFS Filer
|
||||
# všechny dokumenty z VTMF.documents, které jsou na disku
|
||||
# (downloaded=True, file!=null) ale ještě nemají seaweed_path.
|
||||
# Placeholdery a záznamy bez souboru přeskočí.
|
||||
# Lze spustit opakovaně — HEAD check zajistí dedup,
|
||||
# přerušení kdykoli naváže příště.
|
||||
# ============================================================
|
||||
|
||||
import hashlib
|
||||
import mimetypes
|
||||
import sys
|
||||
import urllib.error
|
||||
import urllib.request
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
|
||||
from pymongo import MongoClient, ASCENDING
|
||||
|
||||
MONGO_URI = "mongodb://192.168.1.76:27017"
|
||||
MONGO_DB = "VTMF"
|
||||
MONGO_COLL = "documents"
|
||||
|
||||
SEAWEED_FILER = "http://192.168.1.50:8888"
|
||||
SEAWEED_PREFIX = "/vtmf-documents"
|
||||
|
||||
|
||||
def log(msg):
    """Print *msg* on its own line and flush stdout right away."""
    print(msg)
    sys.stdout.flush()
|
||||
|
||||
|
||||
def sw_path(sha256):
    """Return the Filer path for a hex digest: ``<prefix>/<aa>/<bb>/<digest>``.

    ``aa`` and ``bb`` are the first and second pair of characters of the
    digest, giving a two-level directory fan-out under SEAWEED_PREFIX.
    """
    first, second = sha256[:2], sha256[2:4]
    return "/".join((SEAWEED_PREFIX, first, second, sha256))
|
||||
|
||||
|
||||
def seaweed_store(data, mime="application/octet-stream"):
    """Upload *data* to the SeaweedFS Filer unless it is already stored.

    The target path is derived from the SHA-256 of *data*, so identical
    content always maps to the same URL. A HEAD request checks whether that
    URL already exists; only on a 404 is the content uploaded with PUT.

    Args:
        data: File content as bytes.
        mime: Value sent as the Content-Type of the upload.

    Returns:
        ``(path, url, uploaded)``; ``uploaded`` is False on a dedup hit.

    Raises:
        urllib.error.HTTPError: HEAD answered with an error other than 404,
            or the PUT failed.
    """
    sha256 = hashlib.sha256(data).hexdigest()
    path = sw_path(sha256)
    url = SEAWEED_FILER + path

    head_req = urllib.request.Request(url, method="HEAD")
    try:
        # Close each response explicitly: this runs once per document and an
        # unclosed response keeps its socket open until garbage collection.
        with urllib.request.urlopen(head_req, timeout=10):
            return path, url, False  # dedup hit
    except urllib.error.HTTPError as e:
        if e.code != 404:
            raise
        e.close()  # the 404 error object is itself an open response

    put_req = urllib.request.Request(url, data=data, method="PUT",
                                     headers={"Content-Type": mime})
    with urllib.request.urlopen(put_req, timeout=120):
        pass
    return path, url, True
|
||||
|
||||
|
||||
def main():
    """Upload every downloaded, not-yet-synced document to SeaweedFS.

    Selects documents with ``downloaded=True``, a non-null ``file``, no
    ``seaweed_path`` and ``placeholder`` not True, stores each file via
    ``seaweed_store`` and records the hash, path, URL and sync time on the
    document. Exits with status 1 when any document failed, otherwise 0.
    """
    mongo = MongoClient(MONGO_URI, serverSelectionTimeoutMS=5000)
    mongo.admin.command("ping")  # fail fast when the server is unreachable
    coll = mongo[MONGO_DB][MONGO_COLL]
    log(f"[ok] Mongo připojeno: {MONGO_URI} / {MONGO_DB}.{MONGO_COLL}")

    pending = {
        "downloaded": True,
        "placeholder": {"$ne": True},
        "seaweed_path": None,
        "file": {"$ne": None},
    }
    order = [("vtmf", ASCENDING), ("version", ASCENDING)]
    todo = list(coll.find(pending).sort(order))
    total = len(todo)
    log(f"[i] Ke zpracování: {total} dokumentů\n")

    uploaded = 0
    dedup = 0
    skipped = 0
    failed = 0

    for n, doc in enumerate(todo, 1):
        key = doc["_id"]
        path = doc.get("file")
        log(f"[{n}/{total}] {key}")

        file_present = bool(path) and Path(path).exists()
        if not file_present:
            log(f" [!] Soubor nenalezen na disku: {path} — přeskočeno.")
            skipped += 1
            continue

        try:
            data = Path(path).read_bytes()
            guessed_mime = mimetypes.guess_type(path)[0]
            mime = guessed_mime if guessed_mime else "application/octet-stream"
            sha256_hex = hashlib.sha256(data).hexdigest()

            sw_p, sw_url, was_new = seaweed_store(data, mime)

            sync_fields = {
                "sha256": sha256_hex,
                "seaweed_path": sw_p,
                "seaweed_url": sw_url,
                "seaweed_synced_at": datetime.now(),
            }
            coll.update_one({"_id": key}, {"$set": sync_fields})

            if not was_new:
                dedup += 1
                log(f" [i] Dedup hit → {sw_p}")
            else:
                uploaded += 1
                log(f" [ok] Nahráno → {sw_p}")
        except Exception as e:
            # One bad document must not stop the batch; it is counted and
            # reflected in the exit status.
            failed += 1
            log(f" [!] Chyba: {e}")

    separator = "=" * 60
    log(f"\n{separator}")
    log(f" Hotovo: {uploaded} nahráno, {dedup} dedup, "
        f"{skipped} bez souboru, {failed} chyb.")
    log(f"{separator}")

    sys.exit(1 if failed else 0)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -0,0 +1,102 @@
|
||||
# vtmf_pipeline_v1.3 — Kompletní V-TMF workflow (report → Mongo → download)
|
||||
|
||||
**Verze:** 1.3 · **Datum:** 2026-06-12
|
||||
|
||||
**Změny v1.1:** oprava tichého selhání — výjimka kteréhokoli kroku se
|
||||
vypíše jako „PIPELINE SELHALA" + exit kód 2 (v1.0 končila zavádějícím
|
||||
souhrnem „0 staženo, 0 chyb"). Export reportu robustnější: menu ⋯,
|
||||
položka Export to Excel i tlačítko Export se hledají přes víc selektorů
|
||||
a ve všech frames; při nenalezení se automaticky uloží diagnostika
|
||||
stránky do debug/<čas>_report_* (screenshot, HTML všech frames, výpis
|
||||
title/aria-label atributů) — z ní se dá určit přesný selektor.
|
||||
|
||||
**Změny v1.2:** selektory exportu ověřené na živém DOM (Claude in
|
||||
Chrome; žádný iframe na celé stránce): menu ⋯ =
|
||||
`.actionMenuContainer .dropDown.vv_dropdown_toggle button.vv-icon-button`
|
||||
(button má prázdný title!); menu se načítá asynchronně (AJAX) →
|
||||
po kliknutí se čeká na položku `a.ReportAction[data-action-name='ExcelExport']`;
|
||||
„Data Only" = radio `name=requiredRadioField value=STANDARD`, defaultně
|
||||
checked (pojistka přes .check()); tlačítko Export = React `<button>`
|
||||
s emotion class hash → selektovat jen přes roli+text.
|
||||
|
||||
**Změny v1.3:** na konci běhu se prohlížeč i konzole zavřou
|
||||
automaticky (žádné čekání na ENTER); interaktivní vstup zůstává jen
|
||||
u 2FA a u ručně nezavřitelného dialogu.
|
||||
|
||||
Jeden běh skriptu udělá celé workflow pro studii 77242113UCO3001:
|
||||
|
||||
1. **Login** do vtmf.veevavault.com (persistentní profil
|
||||
`vault_profile/`, J&J SSO, případné 2FA potvrdíte na telefonu
|
||||
+ ENTER; údaje z `.env` v rootu projektu).
|
||||
2. **Export reportu** „Document Inventory Report - Study Level"
|
||||
(přímá URL s ID reportu `0RP000000000182` a filtrem studie
|
||||
`0ST000000137008`) → menu ⋯ → Export to Excel → Data Only →
|
||||
uloží se s timestampem do `WhatToDownload/`, po zpracování se
|
||||
přesune do `WhatToDownload/Zpracovano/`.
|
||||
3. **Parse + sync do MongoDB** — Tower `mongodb://192.168.1.76:27017`,
|
||||
db **VTMF**, kolekce **documents**, klíč `_id = "VTMF-xxx|vY.Z"`
|
||||
(VTMF číslo + verze, unikátní index na dvojici):
|
||||
- nový dokument → založí se (first_seen, deleted=False,
|
||||
downloaded=False),
|
||||
- změna sledovaných polí (name, status, type, subtype, desc,
|
||||
date, url, studies) → promítne se + záznam do `history[]`
|
||||
(timestamp + old/new),
|
||||
- dokument chybí v reportu → `deleted=True, deleted_at` a stažený
|
||||
soubor se přejmenuje s ` [D]` před příponou,
|
||||
- dokument se vrátí do reportu → `deleted=False` a ` [D]`
|
||||
se ze souboru zase odebere.
|
||||
Výsledná sada = záznamy s `deleted=False`.
|
||||
4. **Stažení chybějících** — všechny `deleted=False, downloaded≠True`:
|
||||
doc URL → Source File → uložení do
|
||||
`U:\Dropbox\!!!Days\Downloads Z230\VTMF-77242113UCO3001\<Type>\<Subtype>\`
|
||||
jako `YYYY-MM-DD Description [VTMF-xxx] [vY.Z].<skutečná přípona>`.
|
||||
Výsledek (cesta, čas, případně chyba) se ihned zapisuje do Mongo —
|
||||
běh jde kdykoli přerušit a příště naváže.
|
||||
|
||||
## Mongo schéma (kolekce documents)
|
||||
|
||||
```
|
||||
_id: "VTMF-19077748|v1.0"
|
||||
vtmf, version, url, name, status, type, subtype, desc, date, studies
|
||||
first_seen, last_seen # kdy poprvé/naposledy v reportu
|
||||
deleted, deleted_at # není ve výsledné sadě reportu
|
||||
downloaded, file, downloaded_at
|
||||
last_error, error_at # poslední chyba stahování
|
||||
history: [{ts, changes: {pole: {old, new}}}]
|
||||
```
|
||||
|
||||
## Migrace starého stavu
|
||||
|
||||
Při prvním běhu se `download_state.csv` (z download_vault v2.x)
|
||||
jednorázově namigruje: záznamy `ok` se k odpovídajícímu VTMF zapíší
|
||||
jako `downloaded=True` + cesta. CSV se přejmenuje na
|
||||
`download_state.csv.imported`.
|
||||
|
||||
## Konfigurace (konstanty nahoře)
|
||||
|
||||
- `REPORT_URL` — ID reportu + filtr studie (pro jinou studii se mění
|
||||
jen tato dvě ID)
|
||||
- `LIMIT` — None = stáhnout vše zbývající; číslo = dávka na běh
|
||||
- `MONGO_URI/DB/COLL`, `DOWNLOAD_ROOT`, `EXCEL_DIR`
|
||||
- `TRACKED_FIELDS`, `MAX_ATTEMPTS`, `RETRY_PAUSE_MS`, `BETWEEN_DOCS_MS`
|
||||
|
||||
## Ověřené technické detaily (nesahat bez ověření)
|
||||
|
||||
- Maintenance dialog: zavírat POUZE přes `.ui-dialog a.ok.vv_button`
|
||||
(křížek `.ui-dialog-titlebar-close` je display:none); objevuje se
|
||||
se zpožděním → wait_for visible 8 s (home) / 2-4 s (jinde).
|
||||
- Report Excel má rozbité deklarované rozměry → přímá iterace řádků.
|
||||
- Document Name/Number/Status jsou =HYPERLINK vzorce → regex.
|
||||
- Export kliknout právě jednou; 503/redirecty v network logu
|
||||
ignorovat, rozhoduje expect_download.
|
||||
|
||||
## Spuštění
|
||||
|
||||
```powershell
|
||||
& "U:\PythonProject\Janssen\.venv\Scripts\python.exe" "U:\PythonProject\Janssen\VTMFDownloadFiles\vtmf_pipeline_v1.3.py"
|
||||
```
|
||||
|
||||
Předchůdce: download_vault v1.x–v2.1 (TRASH/).
|
||||
|
||||
|
||||
|
||||
@@ -0,0 +1,112 @@
|
||||
# vtmf_pipeline_v1.4 — Kompletní V-TMF workflow (report → Mongo → download)
|
||||
|
||||
**Verze:** 1.4 · **Datum:** 2026-06-15
|
||||
|
||||
**Změny v1.1:** oprava tichého selhání — výjimka kteréhokoli kroku se
|
||||
vypíše jako „PIPELINE SELHALA" + exit kód 2 (v1.0 končila zavádějícím
|
||||
souhrnem „0 staženo, 0 chyb"). Export reportu robustnější: menu ⋯,
|
||||
položka Export to Excel i tlačítko Export se hledají přes víc selektorů
|
||||
a ve všech frames; při nenalezení se automaticky uloží diagnostika
|
||||
stránky do debug/<čas>_report_* (screenshot, HTML všech frames, výpis
|
||||
title/aria-label atributů) — z ní se dá určit přesný selektor.
|
||||
|
||||
**Změny v1.2:** selektory exportu ověřené na živém DOM (Claude in
|
||||
Chrome; žádný iframe na celé stránce): menu ⋯ =
|
||||
`.actionMenuContainer .dropDown.vv_dropdown_toggle button.vv-icon-button`
|
||||
(button má prázdný title!); menu se načítá asynchronně (AJAX) →
|
||||
po kliknutí se čeká na položku `a.ReportAction[data-action-name='ExcelExport']`;
|
||||
„Data Only" = radio `name=requiredRadioField value=STANDARD`, defaultně
|
||||
checked (pojistka přes .check()); tlačítko Export = React `<button>`
|
||||
s emotion class hash → selektovat jen přes roli+text.
|
||||
|
||||
**Změny v1.3:** na konci běhu se prohlížeč i konzole zavřou
|
||||
automaticky (žádné čekání na ENTER); interaktivní vstup zůstává jen
|
||||
u 2FA a u ručně nezavřitelného dialogu.
|
||||
|
||||
**Změny v1.4:** detekce placeholder dokumentů — Vault zobrazuje text
|
||||
„This placeholder has no content", dokument nemá žádný Source File ke
|
||||
stažení. Při detekci se zapíše `placeholder=True, downloaded=True` do
|
||||
Mongo a dokument se přeskočí bez chyby. Souhrn na konci běhu uvádí
|
||||
počet placeholderů zvlášť.
|
||||
|
||||
Jeden běh skriptu udělá celé workflow pro studii 77242113UCO3001:
|
||||
|
||||
1. **Login** do vtmf.veevavault.com (persistentní profil
|
||||
`vault_profile/`, J&J SSO, případné 2FA potvrdíte na telefonu
|
||||
+ ENTER; údaje z `.env` v rootu projektu).
|
||||
2. **Export reportu** „Document Inventory Report - Study Level"
|
||||
(přímá URL s ID reportu `0RP000000000182` a filtrem studie
|
||||
`0ST000000137008`) → menu ⋯ → Export to Excel → Data Only →
|
||||
uloží se s timestampem do `WhatToDownload/`, po zpracování se
|
||||
přesune do `WhatToDownload/Zpracovano/`.
|
||||
3. **Parse + sync do MongoDB** — Tower `mongodb://192.168.1.76:27017`,
|
||||
db **VTMF**, kolekce **documents**, klíč `_id = "VTMF-xxx|vY.Z"`
|
||||
(VTMF číslo + verze, unikátní index na dvojici):
|
||||
- nový dokument → založí se (first_seen, deleted=False,
|
||||
downloaded=False),
|
||||
- změna sledovaných polí (name, status, type, subtype, desc,
|
||||
date, url, studies) → promítne se + záznam do `history[]`
|
||||
(timestamp + old/new),
|
||||
- dokument chybí v reportu → `deleted=True, deleted_at` a stažený
|
||||
soubor se přejmenuje s ` [D]` před příponou,
|
||||
- dokument se vrátí do reportu → `deleted=False` a ` [D]`
|
||||
se ze souboru zase odebere.
|
||||
Výsledná sada = záznamy s `deleted=False`.
|
||||
4. **Stažení chybějících** — všechny `deleted=False, downloaded≠True`:
|
||||
doc URL → Source File → uložení do
|
||||
`U:\Dropbox\!!!Days\Downloads Z230\VTMF-77242113UCO3001\<Type>\<Subtype>\`
|
||||
jako `YYYY-MM-DD Description [VTMF-xxx] [vY.Z].<skutečná přípona>`.
|
||||
Výsledek (cesta, čas, případně chyba) se ihned zapisuje do Mongo —
|
||||
běh jde kdykoli přerušit a příště naváže.
|
||||
Placeholder dokumenty (stránka s textem „This placeholder has no
|
||||
content") se přeskočí a označí `placeholder=True, downloaded=True`.
|
||||
|
||||
## Mongo schéma (kolekce documents)
|
||||
|
||||
```
|
||||
_id: "VTMF-19077748|v1.0"
|
||||
vtmf, version, url, name, status, type, subtype, desc, date, studies
|
||||
first_seen, last_seen # kdy poprvé/naposledy v reportu
|
||||
deleted, deleted_at # není ve výsledné sadě reportu
|
||||
downloaded, file, downloaded_at
|
||||
placeholder # True = Vault placeholder bez obsahu
|
||||
last_error, error_at # poslední chyba stahování
|
||||
history: [{ts, changes: {pole: {old, new}}}]
|
||||
```
|
||||
|
||||
## Migrace starého stavu
|
||||
|
||||
Při prvním běhu se `download_state.csv` (z download_vault v2.x)
|
||||
jednorázově namigruje: záznamy `ok` se k odpovídajícímu VTMF zapíší
|
||||
jako `downloaded=True` + cesta. CSV se přejmenuje na
|
||||
`download_state.csv.imported`.
|
||||
|
||||
## Konfigurace (konstanty nahoře)
|
||||
|
||||
- `REPORT_URL` — ID reportu + filtr studie (pro jinou studii se mění
|
||||
jen tato dvě ID)
|
||||
- `LIMIT` — None = stáhnout vše zbývající; číslo = dávka na běh
|
||||
- `MONGO_URI/DB/COLL`, `DOWNLOAD_ROOT`, `EXCEL_DIR`
|
||||
- `TRACKED_FIELDS`, `MAX_ATTEMPTS`, `RETRY_PAUSE_MS`, `BETWEEN_DOCS_MS`
|
||||
|
||||
## Ověřené technické detaily (nesahat bez ověření)
|
||||
|
||||
- Maintenance dialog: zavírat POUZE přes `.ui-dialog a.ok.vv_button`
|
||||
(křížek `.ui-dialog-titlebar-close` je display:none); objevuje se
|
||||
se zpožděním → wait_for visible 8 s (home) / 2-4 s (jinde).
|
||||
- Report Excel má rozbité deklarované rozměry → přímá iterace řádků.
|
||||
- Document Name/Number/Status jsou =HYPERLINK vzorce → regex.
|
||||
- Export kliknout právě jednou; 503/redirecty v network logu
|
||||
ignorovat, rozhoduje expect_download.
|
||||
- Placeholder detekce: `page.locator("div.vv_placeholder_text")` (uvnitř
|
||||
`div.vv_placeholder_pane > div.vv_placeholder_container > div.vv-placeholder-drag-and-drop-container`)
|
||||
se testuje před hledáním Source File ikony — CSS selektor je spolehlivější
|
||||
než text match.
|
||||
|
||||
## Spuštění
|
||||
|
||||
```powershell
|
||||
& "U:\PythonProject\Janssen\.venv\Scripts\python.exe" "U:\PythonProject\Janssen\VTMFDownloadFiles\vtmf_pipeline_v1.4.py"
|
||||
```
|
||||
|
||||
Předchůdce: vtmf_pipeline_v1.3 (TRASH/).
|
||||
@@ -0,0 +1,864 @@
|
||||
# ============================================================
|
||||
# vtmf_pipeline_v1.4.py
|
||||
# Verze: 1.4
|
||||
# Datum: 2026-06-15
|
||||
# Popis: Kompletní workflow V-TMF (J&J Veeva Vault), studie
|
||||
# 77242113UCO3001. Jeden běh udělá:
|
||||
# 1) login do Vaultu (persistentní session + ruční 2FA),
|
||||
# 2) export reportu "Document Inventory Report - Study
|
||||
# Level" do Excelu (Data Only) do WhatToDownload/,
|
||||
# 3) parse reportu a synchronizaci do MongoDB
|
||||
# (Tower, db VTMF, kolekce documents,
|
||||
# klíč = VTMF číslo + verze):
|
||||
# - nové dokumenty se založí,
|
||||
# - změny polí se promítnou (+ history[]),
|
||||
# - dokumenty chybějící v reportu se označí
|
||||
# deleted=True a stažený soubor dostane ' [D]',
|
||||
# - znovuobjevené se vzkřísí a ' [D]' se odebere,
|
||||
# 4) stažení všech dosud nestažených dokumentů do
|
||||
# U:\Dropbox\!!!Days\Downloads Z230\VTMF-77242113UCO3001\
|
||||
# <Type>\<Subtype>\"YYYY-MM-DD Description
|
||||
# [VTMF-x] [v1.0].<přípona>" + zápis stavu do Mongo.
|
||||
#
|
||||
# Tracking stahování je KOMPLETNĚ v Mongo; starý
|
||||
# download_state.csv se při prvním běhu jednorázově
|
||||
# namigruje a přejmenuje na .imported.
|
||||
#
|
||||
# Vychází z download_vault_v2.1 (v TRASH/) — login, dialogy
|
||||
# a stahování beze změny; nové jsou kroky 2 a 3.
|
||||
#
|
||||
# v1.1: oprava tichého selhání — chyba kteréhokoli kroku se teď
|
||||
# hlasitě vypíše (a exit kód 2), místo aby běh skončil
|
||||
# souhrnem "0 staženo, 0 chyb". Export reportu: více
|
||||
# selektorů pro menu ⋯ i položku Export to Excel (včetně
|
||||
# hledání ve všech frames) a při selhání automatický záchyt
|
||||
# diagnostiky stránky do debug/ (screenshot + HTML frames).
|
||||
# v1.2: selektory exportu OVĚŘENÉ na živém DOM (žádný iframe):
|
||||
# menu ⋯ = .actionMenuContainer .dropDown.vv_dropdown_toggle
|
||||
# button.vv-icon-button (title prázdný!); menu se načítá
|
||||
# asynchronně -> čekat na položku; položka =
|
||||
# a.ReportAction[data-action-name='ExcelExport']; Data Only =
|
||||
# radio name=requiredRadioField value=STANDARD (default
|
||||
# checked); Export = <button> role+text (emotion class hash,
|
||||
# neselektovat podle tříd).
|
||||
# v1.3: na konci běhu se prohlížeč i okno zavře automaticky
|
||||
# (žádné čekání na ENTER) — vhodné pro bezobslužné běhy.
|
||||
# Interaktivní vstupy zůstávají jen tam, kde jsou nutné
|
||||
# (2FA, ručně nezavřitelný dialog).
|
||||
# v1.4: detekce placeholder dokumentů — stránka s textem
|
||||
# "This placeholder has no content" se přeskočí
|
||||
# (placeholder=True, downloaded=True v Mongo), žádná chyba.
|
||||
#
|
||||
# Heslo se NIKDY nedává natvrdo do skriptu — čte se z .env
|
||||
# v rootu projektu Janssen (VAULT_USER / VAULT_PASS).
|
||||
# ============================================================
|
||||
|
||||
import csv
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
|
||||
from playwright.sync_api import sync_playwright, TimeoutError as PWTimeout
|
||||
from pymongo import MongoClient, ASCENDING
|
||||
|
||||
# --- Konfigurace -------------------------------------------------------
|
||||
|
||||
# Login entry URL; its TargetResource parameter is the URL-encoded Vault root.
LOGIN_URL = ("https://fedlogin.jnj.com/idp/eyJ2c2lkIjoiam5qX3ZlZXZhIn0/"
             "startSSO.ping?PartnerSpId=janssenetmf.veevavault.com"
             "&IdpAdapterId=CompIWALDAPEXTFORM"
             "&TargetResource=https%3A%2F%2Fvtmf.veevavault.com%2F")

# "Document Inventory Report - Study Level" report, filtered to the study
REPORT_URL = ("https://vtmf.veevavault.com/ui/#reporting/viewer/"
              "0RP000000000182?study__v%2C%2C%2CIN=0ST000000137008")

VAULT_UI_PATTERN = "**vtmf.veevavault.com/ui**"  # URL glob: successfully inside the Vault

SCRIPT_DIR = Path(__file__).resolve().parent
PROFILE_DIR = SCRIPT_DIR / "vault_profile"  # persistent browser session
ENV_FILE = SCRIPT_DIR.parent / ".env"  # root of the Janssen project
DEBUG_DIR = SCRIPT_DIR / "debug"  # diagnostic output
EXCEL_DIR = SCRIPT_DIR / "WhatToDownload"  # downloaded reports
PROCESSED_DIR = EXCEL_DIR / "Zpracovano"  # archive of processed reports
OLD_STATE_FILE = SCRIPT_DIR / "download_state.csv"  # legacy CSV (migration)
DOWNLOAD_ROOT = Path(r"U:\Dropbox\!!!Days\Downloads Z230\VTMF-77242113UCO3001")

MONGO_URI = "mongodb://192.168.1.76:27017"
MONGO_DB = "VTMF"
MONGO_COLL = "documents"

# How many documents to download in this run. Any falsy value (None or 0)
# means "all remaining": download_missing only slices when LIMIT is truthy.
LIMIT = 0
# Report fields whose changes are applied and versioned into history[]
TRACKED_FIELDS = ("name", "status", "type", "subtype", "desc",
                  "date", "url", "studies")

MAX_ATTEMPTS = 2  # attempts per document
RETRY_PAUSE_MS = 5000  # pause before a retry
BETWEEN_DOCS_MS = 500  # pause between documents
|
||||
|
||||
|
||||
class PlaceholderDocument(Exception):
    """Signal that a document is only a placeholder without content.

    The Vault page shows "This placeholder has no content" for such
    documents, so there is no source file to download.
    """

    pass
|
||||
|
||||
|
||||
def log(msg):
    """Print *msg* to stdout and flush right away so output is not buffered."""
    print(msg, end="\n", flush=True)
|
||||
|
||||
|
||||
def load_env_file(path):
    """Load KEY=VALUE lines from a .env file into os.environ.

    Variables that are already set in the environment take precedence and
    are never overwritten. Blank lines, ``#`` comments, lines without ``=``,
    lines with an empty key and keys with an empty value are skipped.
    Surrounding single or double quotes are stripped from values.

    Args:
        path: pathlib.Path of the .env file. A missing file is only logged.
    """
    if not path.exists():
        log(f"[!] .env nenalezen: {path}")
        return
    # utf-8-sig drops a leading BOM; with plain utf-8 the BOM would become
    # part of the first key and that variable would never be recognised.
    for raw_line in path.read_text(encoding="utf-8-sig").splitlines():
        line = raw_line.strip()
        if not line or line.startswith("#") or "=" not in line:
            continue
        key, _, value = line.partition("=")
        key = key.strip()
        value = value.strip().strip('"').strip("'")
        if not key:
            # "=value" has no variable name; os.environ rejects empty keys
            continue
        if value and key not in os.environ:
            os.environ[key] = value
|
||||
|
||||
|
||||
ENV_SECTION_HEADER = "# --- Veeva Vault (J&J V-TMF) — VTMFDownloadFiles/download_vault ---"
ENV_KEYS = ("VAULT_USER", "VAULT_PASS")


def ensure_credentials():
    """Make sure VAULT_USER and VAULT_PASS are available in os.environ.

    Loads the .env file first. When a credential is still missing, creates
    the .env file (or appends the missing ``KEY=`` template lines to it),
    prints instructions for the user and exits the script with code 1.
    """
    load_env_file(ENV_FILE)
    unset = [name for name in ENV_KEYS if not os.environ.get(name)]
    if not unset:
        return

    env_exists = ENV_FILE.exists()
    current_text = ENV_FILE.read_text(encoding="utf-8") if env_exists else ""
    # template lines only for keys that have no "KEY=" line in the file yet
    template_lines = []
    for name in ENV_KEYS:
        if re.search(rf"^\s*{name}\s*=", current_text, re.M) is None:
            template_lines.append(f"{name}=")
    section = ENV_SECTION_HEADER + "\n" + "\n".join(template_lines) + "\n"

    if not env_exists:
        ENV_FILE.write_text(
            "# .env — lokální přihlašovací údaje (NEVERZOVAT, je v .gitignore)\n\n"
            + section,
            encoding="utf-8")
        log(f"[i] Založil jsem nový .env: {ENV_FILE}")
    elif template_lines:
        with open(ENV_FILE, "a", encoding="utf-8") as handle:
            handle.write("\n" + section)
        log(f"[i] Doplnil jsem chybějící řádky do .env: {ENV_FILE}")

    banner = "=" * 60
    for text in ("\n" + banner,
                 " CHYBÍ PŘIHLAŠOVACÍ ÚDAJE.",
                 " Doplň VAULT_USER a VAULT_PASS do souboru:",
                 f" {ENV_FILE}",
                 " a spusť skript znovu.",
                 banner):
        print(text)
    sys.exit(1)
|
||||
|
||||
|
||||
# --- Parsování Excelu --------------------------------------------------
|
||||
|
||||
HYPERLINK_RE = re.compile(r'HYPERLINK\("([^"]+)"\s*,\s*"([^"]+)"\)')
VERSION_RE = re.compile(r"\((v[^)]+)\)\s*$")
# characters not allowed in Windows names + control characters + the
# Unicode replacement-character artifact
BAD_CHARS_RE = re.compile(r"[<>:\"/\\|?*\x00-\x1f�]")


def clean_filename(s):
    """Turn *s* into a string usable as a Windows file or folder name.

    Forbidden characters become underscores, runs of whitespace collapse to
    one space, runs of underscores collapse to one, and spaces, dots and
    underscores are trimmed from both ends.
    """
    text = BAD_CHARS_RE.sub("_", str(s))
    # order matters: whitespace is collapsed before underscores
    for pattern, replacement in ((r"\s+", " "), (r"_{2,}", "_")):
        text = re.sub(pattern, replacement, text)
    return text.strip(" ._")
|
||||
|
||||
|
||||
def display_text(cell):
    """Return the text a cell displays.

    For a =HYPERLINK formula this is the formula's second argument;
    otherwise it is the stripped cell value (empty string for falsy values).
    """
    value = cell.value
    text = str(value if value else "").strip()
    match = HYPERLINK_RE.search(text)
    if match is None:
        return text
    return match.group(2).strip()
|
||||
|
||||
|
||||
def extract_doc_url(raw):
    """Extract the clean document URL from a HYPERLINK value or a raw URL.

    The returned URL has the form
    https://<host>/ui/#doc_info/<id>/<major>/<minor>.

    Raises:
        ValueError: no such URL is contained in *raw*.
    """
    found = re.search(r"(https://[^/\"]+/ui/#doc_info/\d+/\d+/\d+)", str(raw))
    if found is not None:
        return found.group(1)
    raise ValueError(f"Nenašel jsem doc URL v: {raw!r}")
|
||||
|
||||
|
||||
def read_documents_from_excel(path):
    """Parse the exported .xlsx report into a list of document dicts.

    Each dict has the keys vtmf, version, url, name, status, type, subtype,
    desc, date and studies. Document Name/Number/Status cells are
    =HYPERLINK formulas, so both the URL and the display text are taken
    with a regex. The report's declared dimensions are broken, so rows are
    read by direct iteration.

    Args:
        path: pathlib.Path of the .xlsx report.

    Returns:
        List of document dicts; rows without a usable document URL are
        skipped and only counted in the log line.

    Raises:
        RuntimeError: the sheet has no header row or lacks an expected column.
    """
    from openpyxl import load_workbook

    log(f"[i] Parsování reportu: {path.name}")
    wb = load_workbook(path, data_only=False)  # formulas are needed
    ws = wb[wb.sheetnames[0]]

    rows = ws.iter_rows()
    # An empty sheet yields no rows; fail with a clear message instead of
    # leaking a bare StopIteration to the caller.
    header_row = next(rows, None)
    if header_row is None:
        raise RuntimeError("Report je prázdný — chybí řádek se záhlavím.")
    header = [c.value for c in header_row]
    try:
        i_num = header.index("Document Number")
        i_name = header.index("Document Name")
        i_status = header.index("Document Status")
        i_type = header.index("Type")
        i_sub = header.index("Subtype")
        i_desc = header.index("Description")
        i_date = header.index("Document Date")
        i_study = header.index("Study")
    except ValueError as e:
        raise RuntimeError(f"V reportu chybí očekávaný sloupec: {e}") from e

    docs, bad = [], []
    for row in rows:
        cell = row[i_num]
        if cell.value is None:
            continue
        raw = str(cell.value)
        m = HYPERLINK_RE.search(raw)
        if m:
            url_raw, vtmf = m.group(1), m.group(2)
        elif cell.hyperlink:  # real hyperlink instead of a formula
            url_raw, vtmf = cell.hyperlink.target, raw
        else:
            bad.append(raw)
            continue
        try:
            url = extract_doc_url(url_raw)
        except ValueError:
            bad.append(raw)
            continue

        name = display_text(row[i_name])
        vm = VERSION_RE.search(name)
        version = vm.group(1) if vm else "v?"  # "v?" = version not found in the name

        desc = clean_filename(display_text(row[i_desc]))
        if not desc:
            # fallback: Document Name without the trailing version
            # (the version is appended separately at the end of the file name)
            desc = clean_filename(VERSION_RE.sub("", name))

        date = row[i_date].value  # datetime or None
        docs.append({
            "vtmf": vtmf.strip(),
            "version": version,
            "url": url,
            "name": name,
            "status": display_text(row[i_status]),
            "type": clean_filename(display_text(row[i_type])),
            "subtype": clean_filename(display_text(row[i_sub])),
            "desc": desc,
            # keep only values that can be formatted as a date
            "date": date if hasattr(date, "strftime") else None,
            "studies": display_text(row[i_study]),
        })

    log(f"[i] Načteno {len(docs)} dokumentů"
        + (f", {len(bad)} řádků bez použitelné URL (přeskočeno)" if bad else ""))
    return docs
|
||||
|
||||
|
||||
def build_target_path(doc, suggested_filename):
    """Build the destination path for a downloaded document.

    Layout: DOWNLOAD_ROOT\\Type\\Subtype\\
    'YYYY-MM-DD Description [VTMF-xxx] [v1.0].<real extension>'.
    The date and the version are left out when they are not available.

    Args:
        doc: document dict with date, desc, vtmf, type, subtype and
            optionally version.
        suggested_filename: file name offered by the download; only its
            extension is used.

    Returns:
        pathlib.Path of the target file.
    """
    ext = Path(suggested_filename).suffix  # real extension incl. the dot
    date_prefix = doc["date"].strftime("%Y-%m-%d") + " " if doc["date"] else ""
    raw_version = doc.get("version")
    # "v?" is the report parser's marker for "no version found"; the "?"
    # is not a valid file-name character on Windows, so treat it as missing
    # and sanitize any real version like the other name components.
    label = "" if raw_version in (None, "", "v?") else clean_filename(raw_version)
    version = f" [{label}]" if label else ""
    filename = f"{date_prefix}{doc['desc']} [{doc['vtmf']}]{version}{ext}"
    return DOWNLOAD_ROOT / doc["type"] / doc["subtype"] / filename
|
||||
|
||||
|
||||
def deleted_marker_path(path):
    """Return *path* with the deletion flag in its name: 'x.pdf' -> 'x [D].pdf'."""
    original = Path(path)
    flagged_name = original.stem + " [D]" + original.suffix
    return original.with_name(flagged_name)
|
||||
|
||||
|
||||
# --- MongoDB synchronizace ---------------------------------------------
|
||||
|
||||
def doc_key(vtmf, version):
    """Compose the Mongo _id of a document as '<vtmf>|<version>'."""
    return "{}|{}".format(vtmf, version)
|
||||
|
||||
|
||||
def get_collection():
    """Connect to MongoDB and return the documents collection.

    Pings the server (5 s server-selection timeout) so an unreachable
    server fails here, and ensures both indexes exist: a unique one on
    (vtmf, version) and one on (deleted, downloaded).
    """
    client = MongoClient(MONGO_URI, serverSelectionTimeoutMS=5000)
    client.admin.command("ping")
    documents = client[MONGO_DB][MONGO_COLL]
    index_specs = (
        ([("vtmf", ASCENDING), ("version", ASCENDING)], {"unique": True}),
        ([("deleted", ASCENDING), ("downloaded", ASCENDING)], {}),
    )
    for keys, options in index_specs:
        documents.create_index(keys, **options)
    return documents
|
||||
|
||||
|
||||
def migrate_old_csv(coll):
    """Migrate the legacy download_state.csv into Mongo, once.

    Rows with result 'ok' are written as downloaded=True (plus file path
    and download time) to a not-deleted, not-yet-downloaded record with the
    same VTMF number. Afterwards the CSV is renamed to
    download_state.csv.imported. Does nothing when the CSV is absent.

    Args:
        coll: the Mongo documents collection.
    """
    if not OLD_STATE_FILE.exists():
        return
    migrated = 0
    with open(OLD_STATE_FILE, newline="", encoding="utf-8") as f:
        for row in csv.DictReader(f):
            # .get() tolerates a CSV that lacks a column instead of raising
            if row.get("result") != "ok" or not row.get("vtmf"):
                continue
            stamp = row.get("timestamp")
            # Store a datetime like download_missing does; keep the raw
            # string when the legacy value is not in ISO format.
            try:
                downloaded_at = datetime.fromisoformat(stamp)
            except (TypeError, ValueError):
                downloaded_at = stamp
            r = coll.update_one(
                {"vtmf": row["vtmf"], "deleted": False,
                 "downloaded": {"$ne": True}},
                {"$set": {"downloaded": True, "file": row.get("file"),
                          "downloaded_at": downloaded_at}})
            migrated += r.modified_count
    OLD_STATE_FILE.rename(OLD_STATE_FILE.with_suffix(".csv.imported"))
    log(f"[i] Migrace download_state.csv -> Mongo: {migrated} záznamů; "
        f"CSV přejmenováno na .imported")
|
||||
|
||||
|
||||
def _unmark_deleted_file(key, stored):
    """Undo the ' [D]' rename of a resurrected document's file.

    Returns the path string to store in the record's ``file`` field.
    """
    stored_path = Path(stored)
    marker = " [D]"
    if stored_path.stem.endswith(marker):
        # Normal case: marking a document deleted stores the marked path.
        marked = stored_path
        clean = stored_path.with_name(
            stored_path.stem[:-len(marker)] + stored_path.suffix)
    else:
        # Record still holds the unmarked path (e.g. the rename had failed).
        clean = stored_path
        marked = deleted_marker_path(stored_path)

    if marked.exists() and not clean.exists():
        try:
            marked.rename(clean)
        except OSError as e:
            log(f"[!] {key}: odebrání [D] selhalo: {e}")
            return str(stored)
        log(f"[i] {key}: soubor vrácen z ' [D]' zpět.")
        return str(clean)
    if clean.exists() and not marked.exists():
        return str(clean)
    return str(stored)  # nothing unambiguous on disk; keep the record as is


def sync_report_to_mongo(coll, docs):
    """Apply the current report to the documents collection.

    The key is '<vtmf>|<version>'. New documents are inserted; changes in
    TRACKED_FIELDS are applied and appended to history[]; documents missing
    from the report are flagged deleted and their file is renamed with
    ' [D]'; documents that reappear are un-flagged and ' [D]' is removed.

    Args:
        coll: the Mongo documents collection.
        docs: list of document dicts parsed from the report.

    Returns:
        Dict of counters: new, updated, unchanged, resurrected,
        marked_deleted. A resurrected document is also counted as updated
        or unchanged.
    """
    now = datetime.now()
    stats = {"new": 0, "updated": 0, "unchanged": 0,
             "resurrected": 0, "marked_deleted": 0}
    current_keys = set()

    for d in docs:
        key = doc_key(d["vtmf"], d["version"])
        current_keys.add(key)
        existing = coll.find_one({"_id": key})
        if existing is None:
            coll.insert_one({
                "_id": key, **d,
                "first_seen": now, "last_seen": now,
                "deleted": False, "downloaded": False,
                "file": None, "history": [],
            })
            stats["new"] += 1
            continue

        changes = {fld: {"old": existing.get(fld), "new": d.get(fld)}
                   for fld in TRACKED_FIELDS
                   if existing.get(fld) != d.get(fld)}
        update = {"$set": {**d, "last_seen": now, "deleted": False}}
        if changes:
            update["$push"] = {"history": {"ts": now, "changes": changes}}
            stats["updated"] += 1
        else:
            stats["unchanged"] += 1

        if existing.get("deleted"):
            # the document is back in the report -> remove [D] from the file
            stats["resurrected"] += 1
            old_file = existing.get("file")
            if old_file:
                update["$set"]["file"] = _unmark_deleted_file(key, old_file)
        coll.update_one({"_id": key}, update)

    # documents that are not in the current report -> deleted + ' [D]'
    for rec in coll.find({"deleted": False}):
        if rec["_id"] in current_keys:
            continue
        upd = {"deleted": True, "deleted_at": now}
        f = rec.get("file")
        if f and Path(f).exists():
            marked = deleted_marker_path(f)
            try:
                Path(f).rename(marked)
                upd["file"] = str(marked)
                log(f"[i] {rec['_id']}: soubor označen ' [D]'.")
            except OSError as e:
                log(f"[!] {rec['_id']}: přejmenování na [D] selhalo: {e}")
        coll.update_one({"_id": rec["_id"]},
                        {"$set": upd,
                         "$push": {"history": {"ts": now,
                                               "changes": {"deleted": {
                                                   "old": False,
                                                   "new": True}}}}})
        stats["marked_deleted"] += 1

    log(f"[ok] Mongo sync: {stats['new']} nových, {stats['updated']} změněných, "
        f"{stats['unchanged']} beze změny, {stats['resurrected']} obnovených, "
        f"{stats['marked_deleted']} označených deleted.")
    return stats
|
||||
|
||||
|
||||
# --- Přihlášení --------------------------------------------------------
|
||||
|
||||
def submit_login_form(page, password_box):
    """Submit the login form.

    Tries, in order, a Sign On button, a Log in / Sign in button, a submit
    input, a submit button and an OK button; the first visible one is
    clicked. When none is found, Enter is pressed in the password field.
    """
    def button_named(pattern):
        return page.get_by_role("button", name=re.compile(pattern, re.I))

    candidates = (
        button_named("sign\\s*on"),
        button_named("log\\s*in|sign\\s*in"),
        page.locator("input[type='submit']"),
        page.locator("button[type='submit']"),
        button_named("^ok$"),
    )
    for candidate in candidates:
        try:
            if not (candidate.count() and candidate.first.is_visible()):
                continue
            button = candidate.first
            label = (button.inner_text() or
                     button.get_attribute("value") or "submit").strip()
            log(f"[i] Odesílám formulář tlačítkem '{label}'...")
            button.click()
            return
        except Exception:
            # a broken candidate must not stop the search
            continue
    log("[i] Tlačítko nenalezeno, odesílám Enterem v poli hesla...")
    password_box.press("Enter")
|
||||
|
||||
|
||||
def login_if_needed(page):
    """Log in to the Vault unless the persistent session is still alive.

    Opens LOGIN_URL, fills user name and password from the VAULT_USER /
    VAULT_PASS environment variables, submits the form and, when the Vault
    UI is not reached within 15 s, asks the user to confirm 2FA and waits
    for the Vault UI (up to 120 s after ENTER).

    Raises:
        RuntimeError: neither the login form nor the Vault was found, or a
            visible invalid/incorrect/failed message appeared after submit.
    """
    log(f"[i] Otevírám přihlašovací URL...")
    page.goto(LOGIN_URL, wait_until="domcontentloaded")

    # Already redirected into the Vault -> session is alive, nothing to do.
    if "vtmf.veevavault.com/ui" in page.url:
        log("[i] Už přihlášen (perzistentní session).")
        return

    user_box = page.locator("input[type='text']").first
    try:
        user_box.wait_for(timeout=8000)
    except PWTimeout:
        # No form within 8 s: a late redirect may have landed us in the Vault.
        if "vtmf.veevavault.com/ui" in page.url:
            log("[i] Přihlášen bez formuláře (session redirect).")
            return
        raise RuntimeError(
            f"Nenašel jsem login formulář ani Vault. Aktuální URL: {page.url}")

    username = os.environ["VAULT_USER"]
    password = os.environ["VAULT_PASS"]

    log("[i] Vyplňuji přihlašovací údaje...")
    user_box.fill(username)
    password_box = page.locator("input[type='password']").first
    password_box.fill(password)
    submit_login_form(page, password_box)

    log("[i] Odeslán login, čekám na výsledek...")
    try:
        page.wait_for_url(VAULT_UI_PATTERN, timeout=15000)
        log("[ok] Přihlášen rovnou (bez 2FA).")
        return
    except PWTimeout:
        pass  # not in the Vault -> probably a 2FA prompt

    # A visible error text means the credentials were rejected.
    err = page.locator("text=/invalid|incorrect|failed/i")
    try:
        if err.count() and err.first.is_visible():
            raise RuntimeError(f"Login selhal: {err.first.inner_text().strip()}")
    except PWTimeout:
        pass

    print("\n" + "=" * 60)
    print(" VYŽADOVÁNO OVĚŘENÍ NA TELEFONU (2FA).")
    print(" Potvrď přihlášení v mobilní aplikaci.")
    print("=" * 60)
    input(" Až to potvrdíš, stiskni ENTER pro pokračování... ")

    page.wait_for_url(VAULT_UI_PATTERN, timeout=120000)
    log("[ok] Přihlášení dokončeno.")
|
||||
|
||||
|
||||
def verify_inside(page):
    """Wait (up to 30 s) until the page URL is inside the Vault UI, then log it."""
    page.wait_for_url(VAULT_UI_PATTERN, timeout=30000)
    current_url = page.url
    log(f"[ok] Uvnitř Vaultu: {current_url}")
|
||||
|
||||
|
||||
def dialog_visible(page):
    """Return True when a jQuery UI dialog is visible on the page.

    Any error while querying the page counts as "not visible".
    """
    try:
        dialogs = page.locator(".ui-dialog")
        if not dialogs.count():
            return False
        return bool(dialogs.first.is_visible())
    except Exception:
        return False
|
||||
|
||||
|
||||
def save_page_debug(page, tag):
    """Save page diagnostics into a new timestamped folder under DEBUG_DIR.

    Writes a screenshot, the HTML of every frame and a text report with
    counts of candidate button selectors plus the title / aria-label
    values found in each frame.

    Args:
        page: the Playwright page.
        tag: short label appended to the folder name.

    Returns:
        pathlib.Path of the created folder.
    """
    # Format the timestamp first and append the tag afterwards, so a "%"
    # in the tag is never interpreted as a strftime directive.
    stamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    out = DEBUG_DIR / f"{stamp}_{tag}"
    out.mkdir(parents=True, exist_ok=True)
    try:
        page.screenshot(path=str(out / "screenshot.png"), full_page=False)
    except Exception as e:
        # diagnostics are best effort: record the failure and carry on
        (out / "screenshot_error.txt").write_text(str(e), encoding="utf-8")
    report = []
    for i, frame in enumerate(page.frames):
        report.append(f"=== frame[{i}] url={frame.url}")
        try:
            (out / f"frame_{i}.html").write_text(frame.content(),
                                                 encoding="utf-8")
            for sel in (".ui-dialog", "a.ok.vv_button",
                        ".ui-dialog-titlebar-close",
                        "button", "input[type='button']",
                        "[title]", "[aria-label]"):
                n = frame.locator(sel).count()
                if n:
                    report.append(f" {sel}: {n}x")
            # list title/aria-label attribute values (helps to find the ⋯ menu)
            for attr in ("title", "aria-label"):
                vals = frame.locator(f"[{attr}]").evaluate_all(
                    f"els => els.map(e => e.getAttribute('{attr}'))")
                uniq = sorted({v for v in vals if v})[:80]
                report.append(f" {attr}: {uniq}")
        except Exception as e:
            report.append(f" [chyba čtení framu: {e}]")
    (out / "frames_report.txt").write_text("\n".join(report),
                                           encoding="utf-8")
    log(f"[!] Diagnostika stránky uložena do: {out}")
    return out
|
||||
|
||||
|
||||
# The dialog's visible OK control is an <a>, not a <button>.
# The .ui-dialog-titlebar-close cross is display:none, so it must not be used.
DIALOG_OK_SELECTOR = (".ui-dialog a.ok.vv_button, "
                      ".vv_login_msg_dialog .vv_button.ok")


def dismiss_maintenance_popup(page, timeout=8000):
    """Close the Veeva login/maintenance dialog via its visible OK link.

    The dialog shows up with a delay, so the function first waits up to
    *timeout* ms for the OK control. It is safe to call at any time.

    Returns:
        True when at least one dialog was closed by clicking OK, else False.
    """
    ok_link = page.locator(DIALOG_OK_SELECTOR)
    try:
        ok_link.first.wait_for(state="visible", timeout=timeout)
    except Exception:
        # no dialog appeared (timeout or any other error) -> carry on
        return False

    closed = 0
    while closed < 5:  # dialogs can be queued
        try:
            if not (ok_link.count() and ok_link.first.is_visible()):
                break
            ok_link.first.click()
            page.wait_for_timeout(300)
            closed += 1
            log("[i] Maintenance/login dialog zavřen (OK).")
        except Exception:
            break

    was_closed = bool(closed)
    if not dialog_visible(page):
        return was_closed

    # some dialog is still open: try Escape, then fall back to the user
    page.keyboard.press("Escape")
    page.wait_for_timeout(500)
    log("[i] Zkusil jsem dialog zavřít klávesou Escape.")

    if dialog_visible(page):
        save_page_debug(page, "dialog")
        banner = "=" * 60
        print("\n" + banner)
        print(" DIALOG SE NEPODAŘILO ZAVŘÍT AUTOMATICKY.")
        print(" Zavři ho prosím ručně v prohlížeči.")
        print(banner)
        input(" Po ručním zavření stiskni ENTER... ")
    return was_closed
|
||||
|
||||
|
||||
# --- Export reportu ----------------------------------------------------
|
||||
|
||||
def _first_visible(page, builders):
|
||||
"""Vrátí (locator, popis) prvního viditelného kandidáta. Hledá na
|
||||
hlavní stránce i ve všech frames."""
|
||||
for frame in page.frames:
|
||||
for build, desc in builders:
|
||||
try:
|
||||
loc = build(frame)
|
||||
if loc.count() and loc.first.is_visible():
|
||||
return loc.first, desc
|
||||
except Exception:
|
||||
continue
|
||||
return None, None
|
||||
|
||||
|
||||
def download_report(page):
    """Export the report to Excel (Data Only) and save it into EXCEL_DIR.

    The file is stored under a timestamp-prefixed name.

    Returns:
        pathlib.Path of the saved report.

    Raises:
        RuntimeError: the report did not load, or the actions menu, the
            export item or the Export button was not found. Page
            diagnostics are saved to the debug folder before raising.
    """
    log("[i] Otevírám report Document Inventory Report - Study Level...")
    page.goto(REPORT_URL, wait_until="domcontentloaded")
    dismiss_maintenance_popup(page, timeout=4000)

    # the report is ready once the record count / statuses appear
    try:
        page.wait_for_selector("text=Returned", timeout=30000)
    except PWTimeout:
        try:
            page.wait_for_selector("text=Document Status:", timeout=30000)
        except PWTimeout:
            save_page_debug(page, "report_load")
            raise RuntimeError(
                "Report se nenačetl (nenašel jsem 'Returned' ani "
                "'Document Status:'). Diagnostika v debug/.")
    log("[i] Report načten, otevírám menu akcí (⋯)...")

    # Actions menu (⋯): a button without title/aria-label inside
    # .actionMenuContainer (verified on the live DOM, no iframe).
    actions, desc = _first_visible(page, [
        (lambda f: f.locator(
            ".actionMenuContainer .dropDown.vv_dropdown_toggle "
            "button.vv-icon-button"), ".actionMenuContainer button (ověřený)"),
        (lambda f: f.locator(".actionMenuContainer button"), ".actionMenuContainer button (volnější)"),
        (lambda f: f.locator("button[title='Actions'], [aria-label='Actions']"), "title/aria-label Actions"),
    ])
    if actions is None:
        save_page_debug(page, "report_menu")
        raise RuntimeError("Nenašel jsem menu akcí (⋯) na reportu. "
                           "Diagnostika v debug/.")
    log(f"[i] Menu nalezeno přes: {desc}")
    actions.click()

    # The menu loads ASYNCHRONOUSLY (data-loaded=false -> AJAX):
    # wait for the item instead of reading right after the click.
    item = page.locator("a.ReportAction[data-action-name='ExcelExport']")
    try:
        item.first.wait_for(state="visible", timeout=15000)
    except PWTimeout:
        # fallback by text (in case the data attribute changes)
        item = page.get_by_text("Export to Excel", exact=True)
        try:
            item.first.wait_for(state="visible", timeout=5000)
        except PWTimeout:
            save_page_debug(page, "report_export_item")
            raise RuntimeError("Menu se otevřelo, ale položku 'Export to "
                               "Excel' jsem nenašel. Diagnostika v debug/.")
    log("[i] Klikám 'Export to Excel'...")
    item.first.click()
    log("[i] Dialog Excel Export Options...")

    # 'Data Only' = radio value=STANDARD, checked by default; safety net.
    radio = page.locator("input[name='requiredRadioField'][value='STANDARD']")
    try:
        radio.first.wait_for(state="visible", timeout=10000)
        if not radio.first.is_checked():
            radio.first.check()
            log("[i] Přepnuto na 'Data Only'.")
    except PWTimeout:
        log("[!] Radio 'Data Only' nenalezeno — spoléhám na default dialogu.")

    # Export = <button> with the text Export (React dialog, emotion classes:
    # do NOT select by class hash, only by role + text).
    export_btn = page.get_by_role("button", name="Export", exact=True)
    try:
        export_btn.first.wait_for(state="visible", timeout=10000)
    except PWTimeout:
        save_page_debug(page, "report_export_btn")
        raise RuntimeError("Dialog exportu bez tlačítka Export. "
                           "Diagnostika v debug/.")
    export_btn = export_btn.first
    # Click Export EXACTLY once (repeated clicks = duplicates); ignore
    # 503s/redirects in the network log, expect_download is what decides.
    with page.expect_download(timeout=120000) as dl_info:
        export_btn.click()
    download = dl_info.value

    EXCEL_DIR.mkdir(parents=True, exist_ok=True)
    ts = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    dest = EXCEL_DIR / f"{ts} {download.suggested_filename}"
    download.save_as(str(dest))
    log(f"[ok] Report uložen: {dest}")
    return dest
|
||||
|
||||
|
||||
def archive_report(path):
    """Move a successfully processed report into the PROCESSED_DIR archive."""
    PROCESSED_DIR.mkdir(parents=True, exist_ok=True)
    archived = PROCESSED_DIR.joinpath(path.name)
    path.rename(archived)
    log(f"[i] Report archivován: {archived}")
|
||||
|
||||
|
||||
# --- Stažení dokumentů -------------------------------------------------
|
||||
|
||||
def find_source_file_button(page):
    """Find the Source File icon on a document page.

    Tries the title attribute, then the aria-label attribute, then a
    button whose accessible name contains "Source File" (case-insensitive).
    Returns the first match of the first lookup that finds anything, or
    None when all lookups come back empty.
    """
    # lookups are lazy so a later one only runs when the earlier ones found nothing
    lookups = (
        lambda: page.locator("[title='Source File']"),
        lambda: page.locator("[aria-label='Source File']"),
        lambda: page.get_by_role("button", name=re.compile("Source File", re.I)),
    )
    for lookup in lookups:
        found = lookup()
        if found.count():
            return found.first
    return None
|
||||
|
||||
|
||||
def download_source_file(page, doc):
    """Open a document page and download its source file.

    Args:
        page: the Playwright page.
        doc: document record; url and vtmf are read here, the remaining
            fields are used to build the target path.

    Returns:
        pathlib.Path of the saved file.

    Raises:
        PlaceholderDocument: the page shows the placeholder element.
        RuntimeError: the Source File icon was not found.
    """
    vtmf = doc["vtmf"]
    log(f"[i] Otevírám dokument {vtmf} ({doc.get('version', '')}) ...")
    page.goto(doc["url"], wait_until="domcontentloaded")
    try:
        page.wait_for_load_state("networkidle", timeout=30000)
    except PWTimeout:
        log("[!] networkidle nenastal do 30 s, zkouším pokračovat...")
    dismiss_maintenance_popup(page, timeout=2000)

    # Placeholder check comes before the Source File lookup.
    ph = page.locator("div.vv_placeholder_text")
    if ph.count() and ph.first.is_visible():
        log(f"[i] {vtmf}: placeholder bez obsahu — přeskakuji.")
        raise PlaceholderDocument(vtmf)

    target = find_source_file_button(page)
    if target is None:
        raise RuntimeError(
            f"Nenašel jsem ikonu 'Source File' na stránce dokumentu {vtmf}.")

    log("[i] Klikám na Source File a čekám na download...")
    with page.expect_download(timeout=60000) as dl_info:
        target.click()
        # Variant with a dropdown (Source File + Viewable Rendition)
        # NOTE(review): the menu item is checked once, right after the
        # click, without waiting; if the dropdown renders later this check
        # misses it and the download wait may time out. Confirm on the live UI.
        try:
            item = page.get_by_role("menuitem",
                                    name=re.compile("Source File", re.I))
            if item.count() and item.first.is_visible():
                log("[i] Otevřel se dropdown, vybírám 'Source File'...")
                item.first.click()
        except Exception:
            pass
    download = dl_info.value

    dest = build_target_path(doc, download.suggested_filename)
    dest.parent.mkdir(parents=True, exist_ok=True)
    download.save_as(str(dest))
    log(f"[ok] Uloženo: {dest}")
    return dest
|
||||
|
||||
|
||||
def download_missing(page, coll):
    """Download every not-deleted document that lacks downloaded=True.

    Documents are processed in (vtmf, version) order, at most LIMIT of them
    when LIMIT is truthy. Each document gets up to MAX_ATTEMPTS tries and
    its outcome is written to Mongo immediately.

    Returns:
        Tuple (ok_count, fail_count, placeholder_count).
    """
    pending = {"deleted": False, "downloaded": {"$ne": True}}
    ordering = [("vtmf", ASCENDING), ("version", ASCENDING)]
    todo = list(coll.find(pending).sort(ordering))
    if LIMIT:
        todo = todo[:LIMIT]
    limit_note = f" (LIMIT={LIMIT})" if LIMIT else ""
    log(f"\n[i] Ke stažení: {len(todo)} dokumentů" + limit_note)

    ok_count = fail_count = placeholder_count = 0
    total = len(todo)
    for position, doc in enumerate(todo, 1):
        key = doc["_id"]
        record = {"_id": key}
        log(f"\n--- [{position}/{total}] {key} | {doc['desc'][:70]}")
        failure = None
        for attempt in range(1, MAX_ATTEMPTS + 1):
            try:
                saved_to = download_source_file(page, doc)
                coll.update_one(record, {"$set": {
                    "downloaded": True, "file": str(saved_to),
                    "downloaded_at": datetime.now(),
                    "last_error": None}})
                ok_count += 1
                failure = None
                break
            except PlaceholderDocument:
                # nothing to download: record it as done so it is not retried
                coll.update_one(record, {"$set": {
                    "downloaded": True, "placeholder": True,
                    "file": None, "downloaded_at": datetime.now(),
                    "last_error": None}})
                placeholder_count += 1
                failure = None
                break
            except Exception as exc:
                failure = exc
                log(f"[!] Pokus {attempt}/{MAX_ATTEMPTS} selhal: {exc}")
                if attempt < MAX_ATTEMPTS:
                    page.wait_for_timeout(RETRY_PAUSE_MS)
        if failure is not None:
            # all attempts failed: keep the last error for the next run
            coll.update_one(record, {"$set": {
                "last_error": str(failure),
                "error_at": datetime.now()}})
            fail_count += 1
        page.wait_for_timeout(BETWEEN_DOCS_MS)
    return ok_count, fail_count, placeholder_count
|
||||
|
||||
|
||||
# --- Main --------------------------------------------------------------
|
||||
|
||||
def main():
    """Run the whole pipeline: login, report export, Mongo sync, downloads.

    The browser context is always closed, and the process always ends via
    ``sys.exit`` with:

    * 2 when a pipeline step raised,
    * 1 when at least one document failed to download,
    * 0 otherwise (including a run interrupted with Ctrl+C).
    """
    ensure_credentials()
    coll = get_collection()
    log(f"[ok] Mongo připojeno: {MONGO_URI} / {MONGO_DB}.{MONGO_COLL}")

    with sync_playwright() as p:
        ctx = p.chromium.launch_persistent_context(
            user_data_dir=str(PROFILE_DIR),
            headless=False,
            accept_downloads=True,
            no_viewport=True,  # the window behaves natively
            args=["--start-maximized"],
        )
        page = ctx.pages[0] if ctx.pages else ctx.new_page()
        ok_count = fail_count = placeholder_count = 0
        pipeline_error = None
        try:
            # 1) login
            login_if_needed(page)
            verify_inside(page)
            dismiss_maintenance_popup(page)

            # 2) report export
            report_path = download_report(page)

            # 3) parse + sync into Mongo
            docs = read_documents_from_excel(report_path)
            if not docs:
                # An empty report would mark every record as deleted.
                raise RuntimeError("Report neobsahuje žádné dokumenty — "
                                   "sync přeskočen, nic se nemaže.")
            sync_report_to_mongo(coll, docs)
            migrate_old_csv(coll)
            archive_report(report_path)

            # 4) download what is missing
            DOWNLOAD_ROOT.mkdir(parents=True, exist_ok=True)
            ok_count, fail_count, placeholder_count = download_missing(page, coll)
        except KeyboardInterrupt:
            log("\n[!] Přerušeno uživatelem — stav je v Mongo, příští běh naváže.")
        except Exception as e:
            pipeline_error = e
            print("\n" + "=" * 60)
            print(" PIPELINE SELHALA!")
            print(f" {type(e).__name__}: {e}")
            print("=" * 60)
        finally:
            log(f"\n[i] Výsledek běhu: {ok_count} staženo, "
                f"{placeholder_count} placeholderů přeskočeno, {fail_count} chyb"
                + (f", PIPELINE SELHALA ({pipeline_error})"
                   if pipeline_error is not None else "."))
            try:
                # These counts query Mongo and can fail themselves (e.g. the
                # server went away); that must not prevent closing the
                # browser or hide the real exit code.
                total = coll.count_documents({})
                have = coll.count_documents({"deleted": False, "downloaded": True})
                active = coll.count_documents({"deleted": False})
                log(f"[i] Mongo: {total} záznamů celkem, {active} aktivních, "
                    f"z toho staženo {have} ({active - have} zbývá).")
            except Exception as e:
                log(f"[!] Souhrn z Mongo se nepodařilo načíst: "
                    f"{type(e).__name__}: {e}")
            finally:
                log("[i] Zavírám prohlížeč.")
                ctx.close()

    sys.exit(2 if pipeline_error is not None else (1 if fail_count else 0))


if __name__ == "__main__":
    main()
|
||||
@@ -0,0 +1,96 @@
|
||||
# vtmf_pipeline_v1.5 — Kompletní V-TMF workflow (report → Mongo → download → SeaweedFS)
|
||||
|
||||
**Verze:** 1.5 · **Datum:** 2026-06-15
|
||||
|
||||
**Změny v1.5:** upload každého staženého dokumentu do SeaweedFS Filer
|
||||
(`192.168.1.50:8888`, cesta `/vtmf-documents/ab/cd/<sha256>`).
|
||||
SHA-256 content-addressed dedup — identický soubor se uloží jen jednou
|
||||
(HEAD check → 404 → PUT; při 200 dedup hit). Chyba uploadu neblokuje
|
||||
download ani zápis do Mongo — soubor zůstane na disku a pole
|
||||
`sha256/seaweed_path/seaweed_url/seaweed_synced_at` zůstanou `null`
|
||||
(lze doplnit backfillem). Souhrn na konci uvádí počet nově nahraných,
|
||||
dedup hitů a případných chyb uploadu zvlášť.
|
||||
|
||||
_(Předchozí změny viz TRASH/vtmf_pipeline_v1.4.md)_
|
||||
|
||||
Jeden běh skriptu udělá celé workflow pro studii 77242113UCO3001:
|
||||
|
||||
1. **Login** do vtmf.veevavault.com (persistentní profil
|
||||
`vault_profile/`, J&J SSO, případné 2FA potvrdíte na telefonu
|
||||
+ ENTER; údaje z `.env` v rootu projektu).
|
||||
2. **Export reportu** „Document Inventory Report - Study Level"
|
||||
(přímá URL s ID reportu `0RP000000000182` a filtrem studie
|
||||
`0ST000000137008`) → menu ⋯ → Export to Excel → Data Only →
|
||||
uloží se s timestampem do `WhatToDownload/`, po zpracování se
|
||||
přesune do `WhatToDownload/Zpracovano/`.
|
||||
3. **Parse + sync do MongoDB** — Tower `mongodb://192.168.1.76:27017`,
|
||||
db **VTMF**, kolekce **documents**, klíč `_id = "VTMF-xxx|vY.Z"`:
|
||||
- nové dokumenty se založí,
|
||||
- změny sledovaných polí se promítnou (+ `history[]`),
|
||||
- dokumenty chybějící v reportu se označí `deleted=True`
|
||||
a stažený soubor dostane ` [D]` před příponou,
|
||||
- znovuobjevené se vzkřísí a ` [D]` se odebere.
|
||||
4. **Stažení + SeaweedFS upload** — všechny `deleted=False, downloaded≠True`:
|
||||
- Source File se uloží do
|
||||
`U:\Dropbox\!!!Days\Downloads Z230\VTMF-77242113UCO3001\<Type>\<Subtype>\`
|
||||
jako `YYYY-MM-DD Description [VTMF-xxx] [vY.Z].<přípona>`,
|
||||
- soubor se přečte z disku, vypočítá se SHA-256, obsah se nahraje
|
||||
do SeaweedFS na `/vtmf-documents/{sha256[:2]}/{sha256[2:4]}/{sha256}`,
|
||||
- do Mongo se zapíše `downloaded=True, file, sha256, seaweed_path,
|
||||
seaweed_url, seaweed_synced_at`; chyba SeaweedFS tyto fieldy
|
||||
nechá `null`, ale `downloaded=True` se zapíše (soubor je na disku).
|
||||
- Placeholder dokumenty (`div.vv_placeholder_text` viditelný) se
|
||||
přeskočí s `placeholder=True, downloaded=True`.
|
||||
|
||||
## Mongo schéma (kolekce documents)
|
||||
|
||||
```
|
||||
_id: "VTMF-19077748|v1.0"
|
||||
vtmf, version, url, name, status, type, subtype, desc, date, studies
|
||||
first_seen, last_seen # kdy poprvé/naposledy v reportu
|
||||
deleted, deleted_at # není ve výsledné sadě reportu
|
||||
downloaded, file, downloaded_at
|
||||
placeholder # True = Vault placeholder bez obsahu
|
||||
sha256 # hex SHA-256 staženého souboru
|
||||
seaweed_path # /vtmf-documents/ab/cd/<sha256>
|
||||
seaweed_url # http://192.168.1.50:8888/vtmf-documents/...
|
||||
seaweed_synced_at # kdy nahráno / null při chybě
|
||||
last_error, error_at # poslední chyba stahování
|
||||
history: [{ts, changes: {pole: {old, new}}}]
|
||||
```
|
||||
|
||||
## SeaweedFS detaily
|
||||
|
||||
- **Filer**: `http://192.168.1.50:8888` (přímý PUT, žádný master assign)
|
||||
- **Dedup**: HEAD → 404 → PUT; HEAD → 200 → dedup hit (vrátí `uploaded=False`)
|
||||
- **Timeout**: HEAD 10 s, PUT 120 s (velké soubory)
|
||||
- **MIME**: `mimetypes.guess_type()`, fallback `application/octet-stream`
|
||||
- **Backfill**: dokumenty s `downloaded=True, seaweed_path=null` lze
|
||||
dohnat samostatným skriptem (čte `file` z Mongo, nahraje, zapíše pole)
|
||||
|
||||
## Konfigurace (konstanty nahoře)
|
||||
|
||||
- `SEAWEED_FILER` — URL Filer serveru
|
||||
- `SEAWEED_PREFIX` — prefix cesty (`/vtmf-documents`)
|
||||
- `REPORT_URL` — ID reportu + filtr studie
|
||||
- `LIMIT` — None = vše; číslo = dávka
|
||||
- `MONGO_URI/DB/COLL`, `DOWNLOAD_ROOT`, `EXCEL_DIR`
|
||||
- `TRACKED_FIELDS`, `MAX_ATTEMPTS`, `RETRY_PAUSE_MS`, `BETWEEN_DOCS_MS`
|
||||
|
||||
## Ověřené technické detaily (nesahat bez ověření)
|
||||
|
||||
- Maintenance dialog: zavírat POUZE přes `.ui-dialog a.ok.vv_button`
|
||||
(křížek `.ui-dialog-titlebar-close` je display:none).
|
||||
- Report Excel má rozbité deklarované rozměry → přímá iterace řádků.
|
||||
- Document Name/Number/Status jsou =HYPERLINK vzorce → regex.
|
||||
- Export kliknout právě jednou; rozhoduje `expect_download`.
|
||||
- Placeholder detekce: `div.vv_placeholder_text` (uvnitř
|
||||
`div.vv_placeholder_pane > div.vv_placeholder_container`).
|
||||
|
||||
## Spuštění
|
||||
|
||||
```powershell
|
||||
& "U:\PythonProject\Janssen\.venv\Scripts\python.exe" "U:\PythonProject\Janssen\VTMFDownloadFiles\vtmf_pipeline_v1.5.py"
|
||||
```
|
||||
|
||||
Předchůdce: vtmf_pipeline_v1.4 (TRASH/).
|
||||
@@ -0,0 +1,937 @@
|
||||
# ============================================================
|
||||
# vtmf_pipeline_v1.5.py
|
||||
# Verze: 1.5
|
||||
# Datum: 2026-06-15
|
||||
# Popis: Kompletní workflow V-TMF (J&J Veeva Vault), studie
|
||||
# 77242113UCO3001. Jeden běh udělá:
|
||||
# 1) login do Vaultu (persistentní session + ruční 2FA),
|
||||
# 2) export reportu "Document Inventory Report - Study
|
||||
# Level" do Excelu (Data Only) do WhatToDownload/,
|
||||
# 3) parse reportu a synchronizaci do MongoDB
|
||||
# (Tower, db VTMF, kolekce documents,
|
||||
# klíč = VTMF číslo + verze):
|
||||
# - nové dokumenty se založí,
|
||||
# - změny polí se promítnou (+ history[]),
|
||||
# - dokumenty chybějící v reportu se označí
|
||||
# deleted=True a stažený soubor dostane ' [D]',
|
||||
# - znovuobjevené se vzkřísí a ' [D]' se odebere,
|
||||
# 4) stažení všech dosud nestažených dokumentů do
|
||||
# U:\Dropbox\!!!Days\Downloads Z230\VTMF-77242113UCO3001\
|
||||
# <Type>\<Subtype>\"YYYY-MM-DD Description
|
||||
# [VTMF-x] [v1.0].<přípona>" + zápis stavu do Mongo.
|
||||
#
|
||||
# Tracking stahování je KOMPLETNĚ v Mongo; starý
|
||||
# download_state.csv se při prvním běhu jednorázově
|
||||
# namigruje a přejmenuje na .imported.
|
||||
#
|
||||
# Vychází z download_vault_v2.1 (v TRASH/) — login, dialogy
|
||||
# a stahování beze změny; nové jsou kroky 2 a 3.
|
||||
#
|
||||
# v1.1: oprava tichého selhání — chyba kteréhokoli kroku se teď
|
||||
# hlasitě vypíše (a exit kód 2), místo aby běh skončil
|
||||
# souhrnem "0 staženo, 0 chyb". Export reportu: více
|
||||
# selektorů pro menu ⋯ i položku Export to Excel (včetně
|
||||
# hledání ve všech frames) a při selhání automatický záchyt
|
||||
# diagnostiky stránky do debug/ (screenshot + HTML frames).
|
||||
# v1.2: selektory exportu OVĚŘENÉ na živém DOM (žádný iframe):
|
||||
# menu ⋯ = .actionMenuContainer .dropDown.vv_dropdown_toggle
|
||||
# button.vv-icon-button (title prázdný!); menu se načítá
|
||||
# asynchronně -> čekat na položku; položka =
|
||||
# a.ReportAction[data-action-name='ExcelExport']; Data Only =
|
||||
# radio name=requiredRadioField value=STANDARD (default
|
||||
# checked); Export = <button> role+text (emotion class hash,
|
||||
# neselektovat podle tříd).
|
||||
# v1.3: na konci běhu se prohlížeč i okno zavře automaticky
|
||||
# (žádné čekání na ENTER) — vhodné pro bezobslužné běhy.
|
||||
# Interaktivní vstupy zůstávají jen tam, kde jsou nutné
|
||||
# (2FA, ručně nezavřitelný dialog).
|
||||
# v1.4: detekce placeholder dokumentů — stránka s textem
|
||||
# "This placeholder has no content" se přeskočí
|
||||
# (placeholder=True, downloaded=True v Mongo), žádná chyba.
|
||||
# v1.5: upload stažených dokumentů do SeaweedFS Filer
|
||||
# (192.168.1.50:8888, cesta /vtmf-documents/ab/cd/<sha256>).
|
||||
# SHA-256 content-addressed dedup — identický soubor se uloží
|
||||
# jen jednou. Chyba uploadu neblokuje download; chybějící
|
||||
# sha256/seaweed_path lze doplnit backfillem. Mongo nově ukládá:
|
||||
# sha256, seaweed_path, seaweed_url, seaweed_synced_at.
|
||||
# Souhrn běhu uvádí počet nově nahraných vs. dedup hitů.
|
||||
#
|
||||
# Heslo se NIKDY nedává natvrdo do skriptu — čte se z .env
|
||||
# v rootu projektu Janssen (VAULT_USER / VAULT_PASS).
|
||||
# ============================================================
|
||||
|
||||
import csv
|
||||
import hashlib
|
||||
import mimetypes
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
import urllib.error
|
||||
import urllib.request
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
|
||||
from playwright.sync_api import sync_playwright, TimeoutError as PWTimeout
|
||||
from pymongo import MongoClient, ASCENDING
|
||||
|
||||
# --- Konfigurace -------------------------------------------------------
|
||||
|
||||
# J&J federated SSO entry point; its TargetResource is the Vault itself.
LOGIN_URL = ("https://fedlogin.jnj.com/idp/eyJ2c2lkIjoiam5qX3ZlZXZhIn0/"
             "startSSO.ping?PartnerSpId=janssenetmf.veevavault.com"
             "&IdpAdapterId=CompIWALDAPEXTFORM"
             "&TargetResource=https%3A%2F%2Fvtmf.veevavault.com%2F")

# "Document Inventory Report - Study Level", filtered to the study.
REPORT_URL = ("https://vtmf.veevavault.com/ui/#reporting/viewer/"
              "0RP000000000182?study__v%2C%2C%2CIN=0ST000000137008")

VAULT_UI_PATTERN = "**vtmf.veevavault.com/ui**"  # URL glob: we are inside the Vault

SCRIPT_DIR = Path(__file__).resolve().parent
PROFILE_DIR = SCRIPT_DIR / "vault_profile"           # persistent browser session
ENV_FILE = SCRIPT_DIR.parent / ".env"                # root of the Janssen project
DEBUG_DIR = SCRIPT_DIR / "debug"                     # diagnostic output
EXCEL_DIR = SCRIPT_DIR / "WhatToDownload"            # downloaded reports
PROCESSED_DIR = EXCEL_DIR / "Zpracovano"             # archive of processed reports
OLD_STATE_FILE = SCRIPT_DIR / "download_state.csv"   # legacy CSV (migration)
DOWNLOAD_ROOT = Path(r"U:\Dropbox\!!!Days\Downloads Z230\VTMF-77242113UCO3001")

MONGO_URI = "mongodb://192.168.1.76:27017"
MONGO_DB = "VTMF"
MONGO_COLL = "documents"

# How many documents to download in this run (None = all remaining).
# Was 0, which only worked because 0 is falsy; None is the documented value.
LIMIT = None

# Report fields whose changes are applied and versioned into history[].
TRACKED_FIELDS = ("name", "status", "type", "subtype", "desc",
                  "date", "url", "studies")

MAX_ATTEMPTS = 2        # attempts per document
RETRY_PAUSE_MS = 5000   # pause before a retry
BETWEEN_DOCS_MS = 500   # pause between documents

SEAWEED_FILER = "http://192.168.1.50:8888"
SEAWEED_PREFIX = "/vtmf-documents"
|
||||
|
||||
|
||||
class PlaceholderDocument(Exception):
    """Signal that a document exists only as a placeholder.

    The Vault page for such a document shows "This placeholder has no
    content", so there is no file to download.
    """
|
||||
|
||||
|
||||
def log(msg):
    """Print *msg* to standard output and flush right away.

    Flushing keeps the progress output visible in real time even when
    stdout is buffered.
    """
    print(msg, flush=True)
|
||||
|
||||
|
||||
def load_env_file(path):
    """Load ``KEY=VALUE`` lines from a .env file into ``os.environ``.

    Blank lines, ``#`` comments and lines without ``=`` are ignored.
    Variables already present in the environment take precedence and are
    never overwritten; entries with an empty key or value are skipped.

    Only one *matching* pair of surrounding quotes is removed from a value,
    so quote characters that belong to the value itself (for example a
    password ending in ``'``) are preserved.

    Args:
        path: ``pathlib.Path`` of the .env file. A missing file is logged
            and otherwise ignored.
    """
    if not path.exists():
        log(f"[!] .env nenalezen: {path}")
        return
    # utf-8-sig drops a leading BOM (written by some Windows editors) that
    # would otherwise become part of the first key; plain UTF-8 reads the same.
    for raw_line in path.read_text(encoding="utf-8-sig").splitlines():
        line = raw_line.strip()
        if not line or line.startswith("#") or "=" not in line:
            continue
        key, _, value = line.partition("=")
        key, value = key.strip(), value.strip()
        if len(value) >= 2 and value[0] == value[-1] and value[0] in "\"'":
            value = value[1:-1]
        if key and value and key not in os.environ:
            os.environ[key] = value
|
||||
|
||||
|
||||
# Comment line that introduces the credentials section written into .env.
ENV_SECTION_HEADER = "# --- Veeva Vault (J&J V-TMF) — VTMFDownloadFiles/download_vault ---"
# Environment variables that must be non-empty before the pipeline may run.
ENV_KEYS = ("VAULT_USER", "VAULT_PASS")


def ensure_credentials():
    """Make sure the Vault credentials are available in the environment.

    Loads ``ENV_FILE`` first. When every variable in ``ENV_KEYS`` is set the
    function simply returns. Otherwise it creates ``ENV_FILE`` with a
    template, or appends the key lines that are not in the file yet, prints
    instructions for the user and terminates the script with exit code 1.
    """
    load_env_file(ENV_FILE)
    unset = [name for name in ENV_KEYS if not os.environ.get(name)]
    if not unset:
        return

    env_exists = ENV_FILE.exists()
    current_text = ENV_FILE.read_text(encoding="utf-8") if env_exists else ""
    # Keys that have no "KEY =" line in the file at all (an existing line
    # with an empty value is left for the user to fill in).
    template_lines = []
    for name in ENV_KEYS:
        if re.search(rf"^\s*{name}\s*=", current_text, re.M) is None:
            template_lines.append(f"{name}=")
    section = ENV_SECTION_HEADER + "\n" + "\n".join(template_lines) + "\n"

    if not env_exists:
        ENV_FILE.write_text(
            "# .env — lokální přihlašovací údaje (NEVERZOVAT, je v .gitignore)\n\n"
            + section,
            encoding="utf-8")
        log(f"[i] Založil jsem nový .env: {ENV_FILE}")
    elif template_lines:
        with open(ENV_FILE, "a", encoding="utf-8") as handle:
            handle.write("\n" + section)
        log(f"[i] Doplnil jsem chybějící řádky do .env: {ENV_FILE}")

    rule = "=" * 60
    banner = (
        "\n" + rule,
        " CHYBÍ PŘIHLAŠOVACÍ ÚDAJE.",
        " Doplň VAULT_USER a VAULT_PASS do souboru:",
        f" {ENV_FILE}",
        " a spusť skript znovu.",
        rule,
    )
    for banner_line in banner:
        print(banner_line)
    sys.exit(1)
|
||||
|
||||
|
||||
# --- Parsování Excelu --------------------------------------------------
|
||||
|
||||
# =HYPERLINK("<url>", "<text>") formula: group 1 is the URL, group 2 the text.
HYPERLINK_RE = re.compile(r'HYPERLINK\("([^"]+)"\s*,\s*"([^"]+)"\)')
# Trailing "(v...)" version marker at the end of a document name.
VERSION_RE = re.compile(r"\((v[^)]+)\)\s*$")
# Characters not allowed in Windows names + control characters + the
# unicode artefact �.
BAD_CHARS_RE = re.compile(r"[<>:\"/\\|?*\x00-\x1f�]")


def clean_filename(s):
    """Turn *s* into a valid Windows file or folder name.

    Forbidden characters become ``_``, whitespace runs collapse to a single
    space, underscore runs collapse to a single ``_``, and spaces, dots and
    underscores are trimmed from both ends.
    """
    text = str(s)
    substitutions = (
        (BAD_CHARS_RE.pattern, "_"),   # forbidden characters -> underscore
        (r"\s+", " "),                 # whitespace runs -> one space
        (r"_{2,}", "_"),               # underscore runs -> one underscore
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text.strip(" ._")
|
||||
|
||||
|
||||
def display_text(cell):
    """Return the text a cell displays.

    For a ``=HYPERLINK("url", "text")`` formula this is the second argument;
    for any other cell it is the stripped string form of the value. An
    empty cell (``None``) gives ``""``.

    Only ``None`` is treated as empty, so a falsy value such as numeric 0
    is reported as ``"0"`` instead of being silently dropped.

    Args:
        cell: Object exposing the cell content as ``.value``.
    """
    value = cell.value
    raw = "" if value is None else str(value).strip()
    m = HYPERLINK_RE.search(raw)
    return m.group(2).strip() if m else raw
|
||||
|
||||
|
||||
def extract_doc_url(raw):
    """Extract the clean document URL from a HYPERLINK value or broken URL.

    The result has the form
    ``https://<host>/ui/#doc_info/<id>/<major>/<minor>``; anything after
    the three numeric segments is dropped.

    Raises:
        ValueError: If *raw* contains no such URL.
    """
    candidates = re.findall(
        r"(https://[^/\"]+/ui/#doc_info/\d+/\d+/\d+)", str(raw))
    if candidates:
        return candidates[0]
    raise ValueError(f"Nenašel jsem doc URL v: {raw!r}")
|
||||
|
||||
|
||||
def read_documents_from_excel(path):
    """Read the documents listed in an exported .xlsx report.

    Document Name/Number/Status are ``=HYPERLINK`` formulas, so the URL and
    the text are taken from them with a regex. The report declares broken
    dimensions, which is why the rows are read by direct iteration.

    Args:
        path: ``pathlib.Path`` of the report workbook.

    Returns:
        List of dicts with the keys ``vtmf``, ``version``, ``url``,
        ``name``, ``status``, ``type``, ``subtype``, ``desc``, ``date`` and
        ``studies``. Rows without a usable document URL are skipped and
        only counted in the log.

    Raises:
        RuntimeError: If the sheet is empty or an expected column is
            missing from the header row.
    """
    from openpyxl import load_workbook

    log(f"[i] Parsování reportu: {path.name}")
    wb = load_workbook(path, data_only=False)  # the formulas are needed
    ws = wb[wb.sheetnames[0]]

    rows = ws.iter_rows()
    header_row = next(rows, None)
    if header_row is None:
        # Without this guard an empty sheet would surface as a bare
        # StopIteration with no message.
        raise RuntimeError("Report je prázdný — chybí řádek se záhlavím.")
    header = [c.value for c in header_row]
    try:
        i_num = header.index("Document Number")
        i_name = header.index("Document Name")
        i_status = header.index("Document Status")
        i_type = header.index("Type")
        i_sub = header.index("Subtype")
        i_desc = header.index("Description")
        i_date = header.index("Document Date")
        i_study = header.index("Study")
    except ValueError as e:
        raise RuntimeError(f"V reportu chybí očekávaný sloupec: {e}") from e

    docs, bad = [], []
    for row in rows:
        cell = row[i_num]
        if cell.value is None:
            continue
        raw = str(cell.value)
        m = HYPERLINK_RE.search(raw)
        if m:
            url_raw, vtmf = m.group(1), m.group(2)
        elif cell.hyperlink:  # a real hyperlink instead of a formula
            url_raw, vtmf = cell.hyperlink.target, raw
        else:
            bad.append(raw)
            continue
        try:
            url = extract_doc_url(url_raw)
        except ValueError:
            bad.append(raw)
            continue

        name = display_text(row[i_name])
        vm = VERSION_RE.search(name)
        version = vm.group(1) if vm else "v?"

        desc = clean_filename(display_text(row[i_desc]))
        if not desc:
            # Fallback: Document Name without the trailing version (the
            # version is appended separately at the end of the file name).
            desc = clean_filename(VERSION_RE.sub("", name))

        date = row[i_date].value  # datetime or None
        docs.append({
            "vtmf": vtmf.strip(),
            "version": version,
            "url": url,
            "name": name,
            "status": display_text(row[i_status]),
            "type": clean_filename(display_text(row[i_type])),
            "subtype": clean_filename(display_text(row[i_sub])),
            "desc": desc,
            # Keep only values that can be formatted as a date later.
            "date": date if hasattr(date, "strftime") else None,
            "studies": display_text(row[i_study]),
        })

    log(f"[i] Načteno {len(docs)} dokumentů"
        + (f", {len(bad)} řádků bez použitelné URL (přeskočeno)" if bad else ""))
    return docs
|
||||
|
||||
|
||||
def build_target_path(doc, suggested_filename):
    """Build the destination path for a downloaded document.

    The result is ``DOWNLOAD_ROOT / Type / Subtype /
    'YYYY-MM-DD Description [VTMF-xxx] [v1.0].<real extension>'``. The date
    prefix and the version tag are left out when they are not available.

    Args:
        doc: Document record with ``date``, ``desc``, ``vtmf``, ``type``,
            ``subtype`` and optionally ``version``.
        suggested_filename: File name offered by the download; only its
            extension is used.
    """
    extension = Path(suggested_filename).suffix  # real extension incl. the dot
    if doc["date"]:
        stamp = doc["date"].strftime("%Y-%m-%d") + " "
    else:
        stamp = ""
    if doc.get("version"):
        version_tag = f" [{doc['version']}]"
    else:
        version_tag = ""
    file_name = f"{stamp}{doc['desc']} [{doc['vtmf']}]{version_tag}{extension}"
    folder = DOWNLOAD_ROOT / doc["type"] / doc["subtype"]
    return folder / file_name
|
||||
|
||||
|
||||
def deleted_marker_path(path):
    """Return *path* with the deletion flag in its name.

    The flag `` [D]`` is inserted right before the extension, e.g.
    ``x.pdf`` -> ``x [D].pdf``.
    """
    target = Path(path)
    flagged_name = "{} [D]{}".format(target.stem, target.suffix)
    return target.with_name(flagged_name)
|
||||
|
||||
|
||||
# --- MongoDB synchronizace ---------------------------------------------
|
||||
|
||||
def doc_key(vtmf, version):
    """Return the Mongo ``_id`` of a document: ``"<vtmf>|<version>"``."""
    return "{}|{}".format(vtmf, version)
|
||||
|
||||
|
||||
def get_collection():
    """Connect to MongoDB and return the documents collection.

    The connection is verified with a ``ping`` (server selection times out
    after 5 s), then two indexes are ensured: a unique one on
    ``(vtmf, version)`` and a plain one on ``(deleted, downloaded)``.
    """
    mongo = MongoClient(MONGO_URI, serverSelectionTimeoutMS=5000)
    mongo.admin.command("ping")  # fail early when the server is unreachable
    documents = mongo[MONGO_DB][MONGO_COLL]

    identity_index = [("vtmf", ASCENDING), ("version", ASCENDING)]
    pending_index = [("deleted", ASCENDING), ("downloaded", ASCENDING)]
    documents.create_index(identity_index, unique=True)
    documents.create_index(pending_index)
    return documents
|
||||
|
||||
|
||||
def _parse_legacy_timestamp(value):
    """Return an ISO-formatted timestamp string as ``datetime``.

    Values that cannot be parsed are returned unchanged so that no
    information from the legacy CSV is lost.
    """
    try:
        return datetime.fromisoformat(value)
    except (TypeError, ValueError):
        return value


def migrate_old_csv(coll):
    """Migrate the legacy ``download_state.csv`` into Mongo, once.

    Every CSV row with ``result == "ok"`` marks one matching record (same
    ``vtmf``, not deleted, not downloaded yet) as ``downloaded=True`` with
    the file path and timestamp from the CSV. Afterwards the CSV is renamed
    to ``.csv.imported`` so the migration does not run again. Nothing
    happens when the CSV does not exist.

    ``downloaded_at`` is stored as a ``datetime`` whenever the CSV timestamp
    parses as ISO, matching the type written by the regular download path.

    Args:
        coll: Mongo collection holding the document records.
    """
    if not OLD_STATE_FILE.exists():
        return
    migrated = 0
    with open(OLD_STATE_FILE, newline="", encoding="utf-8") as f:
        for row in csv.DictReader(f):
            if row["result"] != "ok":
                continue
            r = coll.update_one(
                {"vtmf": row["vtmf"], "deleted": False,
                 "downloaded": {"$ne": True}},
                {"$set": {"downloaded": True, "file": row["file"],
                          "downloaded_at": _parse_legacy_timestamp(
                              row["timestamp"])}})
            migrated += r.modified_count
    OLD_STATE_FILE.rename(OLD_STATE_FILE.with_suffix(".csv.imported"))
    log(f"[i] Migrace download_state.csv -> Mongo: {migrated} záznamů; "
        f"CSV přejmenováno na .imported")
|
||||
|
||||
|
||||
_DELETED_MARKER = " [D]"


def _restore_deleted_file(key, stored_file):
    """Undo the ' [D]' rename for a document that is back in the report.

    The path stored in Mongo may be either the marked name (written when
    the deletion rename succeeded) or the original name (when that rename
    failed), so both forms are handled.

    Returns:
        The unmarked path as ``str`` when the record should point to it, or
        ``None`` when the stored value should be left untouched.
    """
    stored = Path(stored_file)
    bare_name = ""
    if stored.stem.endswith(_DELETED_MARKER):
        bare_name = stored.stem[:-len(_DELETED_MARKER)] + stored.suffix
    if bare_name:
        marked, original = stored, stored.with_name(bare_name)
    else:
        marked, original = deleted_marker_path(stored), stored

    if original.exists():
        return str(original)
    if not marked.exists():
        return None
    try:
        marked.rename(original)
    except OSError as e:
        log(f"[!] {key}: odebrání [D] selhalo: {e}")
        return None
    log(f"[i] {key}: soubor vrácen z ' [D]' zpět.")
    return str(original)


def sync_report_to_mongo(coll, docs):
    """Apply the current report to the documents collection.

    The key is ``(vtmf, version)``. New documents are inserted; changes of
    ``TRACKED_FIELDS`` are applied and appended to ``history[]``; records
    missing from the report are marked ``deleted`` and their file gets the
    `` [D]`` marker; records that reappear are un-deleted and the marker is
    removed from their file again.

    Args:
        coll: Mongo collection holding the document records.
        docs: Document dicts parsed from the report.

    Returns:
        Dict with the counters ``new``, ``updated``, ``unchanged``,
        ``resurrected`` and ``marked_deleted``. A resurrected record is
        also counted as ``updated`` or ``unchanged``.
    """
    now = datetime.now()
    stats = {"new": 0, "updated": 0, "unchanged": 0,
             "resurrected": 0, "marked_deleted": 0}
    current_keys = set()

    for d in docs:
        key = doc_key(d["vtmf"], d["version"])
        current_keys.add(key)
        existing = coll.find_one({"_id": key})
        if existing is None:
            coll.insert_one({
                "_id": key, **d,
                "first_seen": now, "last_seen": now,
                "deleted": False, "downloaded": False,
                "file": None, "history": [],
            })
            stats["new"] += 1
            continue

        changes = {
            fld: {"old": existing.get(fld), "new": d.get(fld)}
            for fld in TRACKED_FIELDS
            if existing.get(fld) != d.get(fld)
        }
        update = {"$set": {**d, "last_seen": now, "deleted": False}}
        if changes:
            update["$push"] = {"history": {"ts": now, "changes": changes}}
            stats["updated"] += 1
        else:
            stats["unchanged"] += 1

        if existing.get("deleted"):
            # The document returned to the report -> drop [D] from the file.
            stats["resurrected"] += 1
            old_file = existing.get("file")
            if old_file:
                restored = _restore_deleted_file(key, old_file)
                if restored is not None:
                    update["$set"]["file"] = restored
        coll.update_one({"_id": key}, update)

    # Documents that are not in the current report -> deleted + ' [D]'.
    for rec in coll.find({"deleted": False}):
        if rec["_id"] in current_keys:
            continue
        upd = {"deleted": True, "deleted_at": now}
        f = rec.get("file")
        if f and Path(f).exists():
            marked = deleted_marker_path(f)
            try:
                Path(f).rename(marked)
                upd["file"] = str(marked)
                log(f"[i] {rec['_id']}: soubor označen ' [D]'.")
            except OSError as e:
                # Best effort: the record is still marked deleted.
                log(f"[!] {rec['_id']}: přejmenování na [D] selhalo: {e}")
        coll.update_one({"_id": rec["_id"]},
                        {"$set": upd,
                         "$push": {"history": {"ts": now,
                                               "changes": {"deleted": {
                                                   "old": False,
                                                   "new": True}}}}})
        stats["marked_deleted"] += 1

    log(f"[ok] Mongo sync: {stats['new']} nových, {stats['updated']} změněných, "
        f"{stats['unchanged']} beze změny, {stats['resurrected']} obnovených, "
        f"{stats['marked_deleted']} označených deleted.")
    return stats
|
||||
|
||||
|
||||
# --- Přihlášení --------------------------------------------------------
|
||||
|
||||
def submit_login_form(page, password_box):
    """Submit the login form.

    Tries, in order, a "Sign On" button, a "Log in"/"Sign in" button, a
    submit input, a submit button and an "OK" button, and clicks the first
    one that is visible. When none is usable it presses Enter in the
    password field instead.

    Args:
        page: Browser page showing the login form.
        password_box: Locator of the password field (Enter fallback).
    """
    def button_named(pattern):
        return page.get_by_role("button", name=re.compile(pattern, re.I))

    submit_controls = (
        button_named("sign\\s*on"),
        button_named("log\\s*in|sign\\s*in"),
        page.locator("input[type='submit']"),
        page.locator("button[type='submit']"),
        button_named("^ok$"),
    )
    for control in submit_controls:
        try:
            if not (control.count() and control.first.is_visible()):
                continue
            caption = (control.first.inner_text() or
                       control.first.get_attribute("value") or "submit").strip()
            log(f"[i] Odesílám formulář tlačítkem '{caption}'...")
            control.first.click()
            return
        except Exception:
            # Best effort: a candidate that cannot be used is just skipped.
            continue

    log("[i] Tlačítko nenalezeno, odesílám Enterem v poli hesla...")
    password_box.press("Enter")
|
||||
|
||||
|
||||
def login_if_needed(page):
    """Log into the Vault unless the persistent session is still alive.

    Opens the login URL, fills in user name and password from
    ``VAULT_USER`` / ``VAULT_PASS`` and submits the form. If the Vault UI
    is not reached within 15 s, a 2FA prompt is assumed: the user confirms
    on the phone, presses ENTER, and the function waits up to 120 s for the
    Vault UI.

    Raises:
        RuntimeError: If neither the login form nor the Vault shows up, or
            the page displays a login error.
    """
    def inside_vault():
        return "vtmf.veevavault.com/ui" in page.url

    log(f"[i] Otevírám přihlašovací URL...")
    page.goto(LOGIN_URL, wait_until="domcontentloaded")
    if inside_vault():
        log("[i] Už přihlášen (perzistentní session).")
        return

    user_box = page.locator("input[type='text']").first
    try:
        user_box.wait_for(timeout=8000)
    except PWTimeout:
        if not inside_vault():
            raise RuntimeError(
                f"Nenašel jsem login formulář ani Vault. Aktuální URL: {page.url}")
        log("[i] Přihlášen bez formuláře (session redirect).")
        return

    username = os.environ["VAULT_USER"]
    password = os.environ["VAULT_PASS"]

    log("[i] Vyplňuji přihlašovací údaje...")
    user_box.fill(username)
    password_box = page.locator("input[type='password']").first
    password_box.fill(password)
    submit_login_form(page, password_box)

    log("[i] Odeslán login, čekám na výsledek...")
    try:
        page.wait_for_url(VAULT_UI_PATTERN, timeout=15000)
    except PWTimeout:
        pass  # not in the Vault -> most likely a 2FA prompt
    else:
        log("[ok] Přihlášen rovnou (bez 2FA).")
        return

    error_text = page.locator("text=/invalid|incorrect|failed/i")
    try:
        if error_text.count() and error_text.first.is_visible():
            raise RuntimeError(
                f"Login selhal: {error_text.first.inner_text().strip()}")
    except PWTimeout:
        pass

    rule = "=" * 60
    print("\n" + rule)
    print(" VYŽADOVÁNO OVĚŘENÍ NA TELEFONU (2FA).")
    print(" Potvrď přihlášení v mobilní aplikaci.")
    print(rule)
    input(" Až to potvrdíš, stiskni ENTER pro pokračování... ")

    page.wait_for_url(VAULT_UI_PATTERN, timeout=120000)
    log("[ok] Přihlášení dokončeno.")
|
||||
|
||||
|
||||
def verify_inside(page):
    """Wait (up to 30 s) until the page URL matches the Vault UI pattern.

    Logs the URL once the Vault is reached; the wait raises on timeout.
    """
    page.wait_for_url(VAULT_UI_PATTERN, timeout=30000)
    current_url = page.url
    log(f"[ok] Uvnitř Vaultu: {current_url}")
|
||||
|
||||
|
||||
def dialog_visible(page):
    """Return True when a jQuery UI dialog (``.ui-dialog``) is visible.

    Any error while querying the page counts as "no dialog".
    """
    try:
        dialogs = page.locator(".ui-dialog")
        if not dialogs.count():
            return False
        return bool(dialogs.first.is_visible())
    except Exception:
        return False
|
||||
|
||||
|
||||
def save_page_debug(page, tag):
    """Save page diagnostics into a new timestamped folder in ``DEBUG_DIR``.

    Writes a screenshot, the HTML of every frame, and a text report with
    the counts of button-like candidates plus the ``title`` /
    ``aria-label`` values found in each frame.

    Args:
        page: Browser page to inspect.
        tag: Short label appended to the folder name. It is added after the
            timestamp is formatted, so a ``%`` in the tag is kept literally.

    Returns:
        Path of the folder with the diagnostics.
    """
    stamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    out = DEBUG_DIR / f"{stamp}_{tag}"
    out.mkdir(parents=True, exist_ok=True)
    try:
        page.screenshot(path=str(out / "screenshot.png"), full_page=False)
    except Exception as e:
        # Diagnostics are best effort: record the failure and carry on.
        (out / "screenshot_error.txt").write_text(str(e), encoding="utf-8")

    report = []
    for i, frame in enumerate(page.frames):
        report.append(f"=== frame[{i}] url={frame.url}")
        try:
            (out / f"frame_{i}.html").write_text(frame.content(),
                                                 encoding="utf-8")
            for sel in (".ui-dialog", "a.ok.vv_button",
                        ".ui-dialog-titlebar-close",
                        "button", "input[type='button']",
                        "[title]", "[aria-label]"):
                n = frame.locator(sel).count()
                if n:
                    report.append(f"  {sel}: {n}x")
            # List the title/aria-label values — helps to find the ⋯ menu.
            for attr in ("title", "aria-label"):
                vals = frame.locator(f"[{attr}]").evaluate_all(
                    f"els => els.map(e => e.getAttribute('{attr}'))")
                uniq = sorted({v for v in vals if v})[:80]
                report.append(f"  {attr}: {uniq}")
        except Exception as e:
            report.append(f"  [chyba čtení framu: {e}]")

    (out / "frames_report.txt").write_text("\n".join(report),
                                           encoding="utf-8")
    log(f"[!] Diagnostika stránky uložena do: {out}")
    return out
|
||||
|
||||
|
||||
# The visible OK button of the dialog — it is an <a>, not a <button>!
# The close cross .ui-dialog-titlebar-close is display:none → DO NOT USE.
DIALOG_OK_SELECTOR = (".ui-dialog a.ok.vv_button, "
                      ".vv_login_msg_dialog .vv_button.ok")


def dismiss_maintenance_popup(page, timeout=8000):
    """Close the Veeva login/maintenance dialog via its visible OK link.

    The dialog appears WITH A DELAY, so the function waits briefly for it.
    Up to five queued dialogs are clicked away. If a dialog is still shown
    afterwards, Escape is tried; as a last resort diagnostics are saved and
    the user is asked to close the dialog by hand. Safe to call always.

    Args:
        page: Browser page to check.
        timeout: How long to wait for the dialog to appear, in ms.

    Returns:
        True when at least one dialog was closed by clicking OK.
    """
    ok_button = page.locator(DIALOG_OK_SELECTOR)
    try:
        ok_button.first.wait_for(state="visible", timeout=timeout)
    except Exception:
        # Covers the timeout too: no dialog showed up, just carry on.
        return False

    clicks = 0
    remaining = 5  # dialogs can be queued
    while remaining:
        remaining -= 1
        try:
            if not (ok_button.count() and ok_button.first.is_visible()):
                break
            ok_button.first.click()
            page.wait_for_timeout(300)
            clicks += 1
            log("[i] Maintenance/login dialog zavřen (OK).")
        except Exception:
            break

    if dialog_visible(page):
        page.keyboard.press("Escape")
        page.wait_for_timeout(500)
        log("[i] Zkusil jsem dialog zavřít klávesou Escape.")

        if dialog_visible(page):
            save_page_debug(page, "dialog")
            rule = "=" * 60
            print("\n" + rule)
            print(" DIALOG SE NEPODAŘILO ZAVŘÍT AUTOMATICKY.")
            print(" Zavři ho prosím ručně v prohlížeči.")
            print(rule)
            input(" Po ručním zavření stiskni ENTER... ")
    return bool(clicks)
|
||||
|
||||
|
||||
# --- Export reportu ----------------------------------------------------
|
||||
|
||||
def _first_visible(page, builders):
|
||||
"""Vrátí (locator, popis) prvního viditelného kandidáta. Hledá na
|
||||
hlavní stránce i ve všech frames."""
|
||||
for frame in page.frames:
|
||||
for build, desc in builders:
|
||||
try:
|
||||
loc = build(frame)
|
||||
if loc.count() and loc.first.is_visible():
|
||||
return loc.first, desc
|
||||
except Exception:
|
||||
continue
|
||||
return None, None
|
||||
|
||||
|
||||
def download_report(page):
    """Export the report to Excel ("Data Only") and return the saved path.

    Opens ``REPORT_URL``, waits for the report to render, opens the actions
    menu, picks "Export to Excel", makes sure the "Data Only" radio is
    selected, clicks Export once and saves the download into ``EXCEL_DIR``
    under a timestamp-prefixed file name.

    When a step cannot find its element, page diagnostics are stored via
    ``save_page_debug`` and a ``RuntimeError`` is raised.
    """
    log("[i] Otevírám report Document Inventory Report - Study Level...")
    page.goto(REPORT_URL, wait_until="domcontentloaded")
    dismiss_maintenance_popup(page, timeout=4000)

    # The report counts as loaded once the record count / statuses show up.
    try:
        page.wait_for_selector("text=Returned", timeout=30000)
    except PWTimeout:
        try:
            page.wait_for_selector("text=Document Status:", timeout=30000)
        except PWTimeout:
            save_page_debug(page, "report_load")
            raise RuntimeError(
                "Report se nenačetl (nenašel jsem 'Returned' ani "
                "'Document Status:'). Diagnostika v debug/.")
    log("[i] Report načten, otevírám menu akcí (⋯)...")

    # Actions menu: a button without title/aria-label inside
    # .actionMenuContainer; looser selectors follow as fallbacks.
    menu_candidates = [
        (lambda frame: frame.locator(
            ".actionMenuContainer .dropDown.vv_dropdown_toggle "
            "button.vv-icon-button"),
         ".actionMenuContainer button (ověřený)"),
        (lambda frame: frame.locator(".actionMenuContainer button"),
         ".actionMenuContainer button (volnější)"),
        (lambda frame: frame.locator(
            "button[title='Actions'], [aria-label='Actions']"),
         "title/aria-label Actions"),
    ]
    actions, desc = _first_visible(page, menu_candidates)
    if actions is None:
        save_page_debug(page, "report_menu")
        raise RuntimeError("Nenašel jsem menu akcí (⋯) na reportu. "
                           "Diagnostika v debug/.")
    log(f"[i] Menu nalezeno přes: {desc}")
    actions.click()

    # The menu content loads asynchronously, so wait for the entry instead
    # of reading the menu right after the click.
    export_item = page.locator(
        "a.ReportAction[data-action-name='ExcelExport']").first
    try:
        export_item.wait_for(state="visible", timeout=15000)
    except PWTimeout:
        # Fall back to the visible text in case the data attribute changed.
        export_item = page.get_by_text("Export to Excel", exact=True).first
        try:
            export_item.wait_for(state="visible", timeout=5000)
        except PWTimeout:
            save_page_debug(page, "report_export_item")
            raise RuntimeError("Menu se otevřelo, ale položku 'Export to "
                               "Excel' jsem nenašel. Diagnostika v debug/.")
    log("[i] Klikám 'Export to Excel'...")
    export_item.click()
    log("[i] Dialog Excel Export Options...")

    # 'Data Only' is the radio with value=STANDARD; it is checked by
    # default, this is only a safeguard.
    data_only = page.locator(
        "input[name='requiredRadioField'][value='STANDARD']").first
    try:
        data_only.wait_for(state="visible", timeout=10000)
        if not data_only.is_checked():
            data_only.check()
            log("[i] Přepnuto na 'Data Only'.")
    except PWTimeout:
        log("[!] Radio 'Data Only' nenalezeno — spoléhám na default dialogu.")

    # The Export button is selected by role + text only (never by class).
    export_btn = page.get_by_role("button", name="Export", exact=True).first
    try:
        export_btn.wait_for(state="visible", timeout=10000)
    except PWTimeout:
        save_page_debug(page, "report_export_btn")
        raise RuntimeError("Dialog exportu bez tlačítka Export. "
                           "Diagnostika v debug/.")

    # Click Export exactly once (repeated clicks produce duplicates); the
    # outcome is decided solely by expect_download.
    with page.expect_download(timeout=120000) as dl_info:
        export_btn.click()
    download = dl_info.value

    stamp = f"{datetime.now():%Y-%m-%d_%H-%M-%S}"
    EXCEL_DIR.mkdir(parents=True, exist_ok=True)
    dest = EXCEL_DIR / f"{stamp} {download.suggested_filename}"
    download.save_as(str(dest))
    log(f"[ok] Report uložen: {dest}")
    return dest
|
||||
|
||||
|
||||
def archive_report(path):
    """Move a successfully processed report file into ``PROCESSED_DIR``.

    The directory is created when missing; the file keeps its name.
    """
    destination = PROCESSED_DIR / path.name
    PROCESSED_DIR.mkdir(parents=True, exist_ok=True)
    path.rename(destination)
    log(f"[i] Report archivován: {destination}")
|
||||
|
||||
|
||||
# --- SeaweedFS ---------------------------------------------------------
|
||||
|
||||
def _sw_path(sha256):
    """Return the storage path ``<prefix>/<aa>/<bb>/<digest>`` for a hex digest.

    The two directory levels are the first and the second pair of characters
    of ``sha256``.
    """
    first_pair = sha256[:2]
    second_pair = sha256[2:4]
    return f"{SEAWEED_PREFIX}/{first_pair}/{second_pair}/{sha256}"
|
||||
|
||||
|
||||
def seaweed_store(data, mime="application/octet-stream"):
    """Upload ``data`` to the SeaweedFS Filer unless it is already there.

    The target path is derived from the SHA-256 of ``data``, so the call is
    idempotent: a HEAD request probes the path first and the PUT is only
    sent when the probe answers 404.

    Returns ``(path, url, uploaded)``; ``uploaded`` is False on a dedup hit
    (the object already existed).

    Raises ``urllib.error.HTTPError`` for any HEAD status other than 404 and
    for a failed PUT; other ``urllib`` errors propagate unchanged.
    """
    sha256 = hashlib.sha256(data).hexdigest()
    path = _sw_path(sha256)
    url = SEAWEED_FILER + path

    probe = urllib.request.Request(url, method="HEAD")
    try:
        # Use the response as a context manager so the connection is
        # released deterministically instead of whenever the GC runs.
        with urllib.request.urlopen(probe, timeout=10):
            pass
    except urllib.error.HTTPError as e:
        if e.code != 404:
            raise
    else:
        return path, url, False  # the object already exists

    upload = urllib.request.Request(
        url, data=data, method="PUT",
        headers={"Content-Type": mime})
    with urllib.request.urlopen(upload, timeout=120):
        pass
    return path, url, True
|
||||
|
||||
|
||||
# --- Stažení dokumentů -------------------------------------------------
|
||||
|
||||
def find_source_file_button(page):
    """Locate the "Source File" icon on a document page.

    Several fallbacks are tried because the DOM can differ between document
    types: first an element with ``title='Source File'``, then one with
    ``aria-label='Source File'``, finally a button whose accessible name
    contains "Source File" (case-insensitive).

    Returns the first match of the first strategy that finds anything, or
    None when none of them does.
    """
    attribute_selectors = (
        "[title='Source File']",
        "[aria-label='Source File']",
    )
    for css in attribute_selectors:
        found = page.locator(css)
        if found.count():
            return found.first

    by_role = page.get_by_role("button", name=re.compile("Source File", re.I))
    return by_role.first if by_role.count() else None
|
||||
|
||||
|
||||
def download_source_file(page, doc):
    """Open a document page, download its Source File and return the path.

    ``doc`` must provide ``"vtmf"`` and ``"url"`` (``"version"`` is only
    used for logging).  The file is saved at the location computed by
    ``build_target_path`` from ``doc`` and the suggested download name.

    Raises ``PlaceholderDocument`` when the page shows the placeholder
    element, and ``RuntimeError`` when no Source File control is found.
    """
    vtmf = doc["vtmf"]
    log(f"[i] Otevírám dokument {vtmf} ({doc.get('version', '')}) ...")
    page.goto(doc["url"], wait_until="domcontentloaded")
    try:
        page.wait_for_load_state("networkidle", timeout=30000)
    except PWTimeout:
        log("[!] networkidle nenastal do 30 s, zkouším pokračovat...")
    dismiss_maintenance_popup(page, timeout=2000)

    # Check for the placeholder before looking for the Source File icon.
    placeholder = page.locator("div.vv_placeholder_text")
    if placeholder.count() and placeholder.first.is_visible():
        log(f"[i] {vtmf}: placeholder bez obsahu — přeskakuji.")
        raise PlaceholderDocument(vtmf)

    source_button = find_source_file_button(page)
    if source_button is None:
        raise RuntimeError(
            f"Nenašel jsem ikonu 'Source File' na stránce dokumentu {vtmf}.")

    log("[i] Klikám na Source File a čekám na download...")
    with page.expect_download(timeout=60000) as dl_info:
        source_button.click()
        # Some pages open a dropdown (Source File + Viewable Rendition)
        # instead of downloading right away; pick the entry when it shows.
        try:
            menu_entry = page.get_by_role(
                "menuitem", name=re.compile("Source File", re.I))
            if menu_entry.count() and menu_entry.first.is_visible():
                log("[i] Otevřel se dropdown, vybírám 'Source File'...")
                menu_entry.first.click()
        except Exception:
            pass
    download = dl_info.value

    saved_to = build_target_path(doc, download.suggested_filename)
    saved_to.parent.mkdir(parents=True, exist_ok=True)
    download.save_as(str(saved_to))
    return saved_to
|
||||
|
||||
|
||||
def download_missing(page, coll):
    """Download every non-deleted document that is not marked downloaded.

    The outcome of each document is written to Mongo immediately, so an
    interrupted run can be resumed later.  Every document gets up to
    ``MAX_ATTEMPTS`` tries.  After a successful download the file is also
    uploaded to SeaweedFS; an upload failure is logged and counted but does
    not fail the document (its SeaweedFS fields stay None).

    Returns the tuple ``(ok_count, fail_count, placeholder_count,
    sw_uploaded, sw_dedup, sw_failed)``.
    """
    todo = list(coll.find({"deleted": False, "downloaded": {"$ne": True}})
                .sort([("vtmf", ASCENDING), ("version", ASCENDING)]))
    if LIMIT:
        todo = todo[:LIMIT]
    log(f"\n[i] Ke stažení: {len(todo)} dokumentů"
        + (f" (LIMIT={LIMIT})" if LIMIT else ""))

    ok_count, fail_count, placeholder_count = 0, 0, 0
    sw_uploaded = sw_dedup = sw_failed = 0
    for n, doc in enumerate(todo, 1):
        key = doc["_id"]
        # The description only decorates the progress line; a record with a
        # missing or null "desc" must not abort the whole run.
        desc = str(doc.get("desc") or "")
        log(f"\n--- [{n}/{len(todo)}] {key} | {desc[:70]}")
        last_err = None
        for attempt in range(1, MAX_ATTEMPTS + 1):
            try:
                dest = download_source_file(page, doc)

                # SeaweedFS upload -- never blocks the download on failure.
                sw_path = sw_url = sw_ts = sha256_hex = None
                try:
                    data = dest.read_bytes()
                    size_kb = len(data) / 1024
                    size_str = (f"{size_kb:.0f} KB" if size_kb < 1024
                                else f"{size_kb / 1024:.1f} MB")
                    ext = dest.suffix.lstrip('.').upper()
                    log(f"[ok] Stazeno: {dest.name} ({size_str} {ext})")
                    mime = (mimetypes.guess_type(dest.name)[0]
                            or "application/octet-stream")
                    sw_path, sw_url, uploaded = seaweed_store(data, mime)
                    sha256_hex = hashlib.sha256(data).hexdigest()
                    sw_ts = datetime.now()
                    if uploaded:
                        sw_uploaded += 1
                        log(f"[ok] SeaweedFS: nahrano ({size_str}) -> {sw_path}")
                    else:
                        sw_dedup += 1
                        log(f"[i] SeaweedFS: dedup hit ({size_str}) -> {sw_path}")
                except Exception as sw_err:
                    sw_failed += 1
                    log(f"[!] SeaweedFS upload selhal (soubor je na disku): {sw_err}")

                coll.update_one({"_id": key}, {"$set": {
                    "downloaded": True, "file": str(dest),
                    "downloaded_at": datetime.now(),
                    "sha256": sha256_hex,
                    "seaweed_path": sw_path,
                    "seaweed_url": sw_url,
                    "seaweed_synced_at": sw_ts,
                    "last_error": None}})
                ok_count += 1
                last_err = None
                break
            except PlaceholderDocument:
                # A placeholder has nothing to download: mark it as done so
                # it is not retried on the next run.
                coll.update_one({"_id": key}, {"$set": {
                    "downloaded": True, "placeholder": True,
                    "file": None, "downloaded_at": datetime.now(),
                    "last_error": None}})
                placeholder_count += 1
                last_err = None
                break
            except Exception as e:
                last_err = e
                log(f"[!] Pokus {attempt}/{MAX_ATTEMPTS} selhal: {e}")
                if attempt < MAX_ATTEMPTS:
                    page.wait_for_timeout(RETRY_PAUSE_MS)
        if last_err is not None:
            # All attempts failed: remember the last error for diagnostics.
            coll.update_one({"_id": key}, {"$set": {
                "last_error": str(last_err),
                "error_at": datetime.now()}})
            fail_count += 1
        page.wait_for_timeout(BETWEEN_DOCS_MS)
    return ok_count, fail_count, placeholder_count, sw_uploaded, sw_dedup, sw_failed
|
||||
|
||||
|
||||
# --- Main --------------------------------------------------------------
|
||||
|
||||
def main():
    """Run the whole pipeline: login, report export, Mongo sync, downloads.

    Terminates the process through ``sys.exit`` with status 2 when a
    pipeline step raised, 1 when at least one document failed to download
    and 0 otherwise (a user interrupt is logged and does not change the
    status).
    """
    ensure_credentials()
    coll = get_collection()
    log(f"[ok] Mongo připojeno: {MONGO_URI} / {MONGO_DB}.{MONGO_COLL}")

    with sync_playwright() as p:
        ctx = p.chromium.launch_persistent_context(
            user_data_dir=str(PROFILE_DIR),
            headless=False,
            accept_downloads=True,
            no_viewport=True,  # let the window behave natively
            args=["--start-maximized"],
        )
        page = ctx.pages[0] if ctx.pages else ctx.new_page()
        ok_count = fail_count = placeholder_count = 0
        sw_uploaded = sw_dedup = sw_failed = 0
        pipeline_error = None
        try:
            # 1) login
            login_if_needed(page)
            verify_inside(page)
            dismiss_maintenance_popup(page)

            # 2) report export
            report_path = download_report(page)

            # 3) parse + sync into Mongo
            docs = read_documents_from_excel(report_path)
            if not docs:
                raise RuntimeError("Report neobsahuje žádné dokumenty — "
                                   "sync přeskočen, nic se nemaže.")
            sync_report_to_mongo(coll, docs)
            migrate_old_csv(coll)
            archive_report(report_path)

            # 4) download whatever is still missing
            DOWNLOAD_ROOT.mkdir(parents=True, exist_ok=True)
            (ok_count, fail_count, placeholder_count,
             sw_uploaded, sw_dedup, sw_failed) = download_missing(page, coll)
        except KeyboardInterrupt:
            log("\n[!] Přerušeno uživatelem — stav je v Mongo, příští běh naváže.")
        except Exception as e:
            pipeline_error = e
            print("\n" + "=" * 60)
            print(" PIPELINE SELHALA!")
            print(f" {type(e).__name__}: {e}")
            print("=" * 60)
        finally:
            try:
                total = coll.count_documents({})
                have = coll.count_documents({"deleted": False, "downloaded": True})
                active = coll.count_documents({"deleted": False})
                sw_info = (f"SeaweedFS: {sw_uploaded} nových, {sw_dedup} dedup"
                           + (f", {sw_failed} chyb uploadu" if sw_failed else ""))
                log(f"\n[i] Výsledek běhu: {ok_count} staženo, "
                    f"{placeholder_count} placeholderů přeskočeno, {fail_count} chyb"
                    + (f", PIPELINE SELHALA ({pipeline_error})" if pipeline_error else ".")
                    + (f"\n[i] {sw_info}" if ok_count else ""))
                log(f"[i] Mongo: {total} záznamů celkem, {active} aktivních, "
                    f"z toho staženo {have} ({active - have} zbývá).")
            except Exception as summary_err:
                # The summary is informational only.  A Mongo failure here
                # must neither leave the browser open nor replace the exit
                # status derived from pipeline_error / fail_count.
                log(f"[!] Souhrn běhu se nepodařilo sestavit: {summary_err}")
            finally:
                log("[i] Zavírám prohlížeč.")
                ctx.close()
    sys.exit(2 if pipeline_error else (1 if fail_count else 0))


if __name__ == "__main__":
    main()
|
||||
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
@@ -0,0 +1,215 @@
|
||||
# ============================================================
|
||||
# migrate_to_v16.py
|
||||
# Verze: 1.1
|
||||
# Datum: 2026-06-15
|
||||
# Popis: Jednorázová migrace stávajících STUDY-level dat
|
||||
# (nasbíraných pipeline v1.3–v1.5) na schéma v1.6.
|
||||
#
|
||||
# v1.6 ukládá dokumenty JEN do SeaweedFS (žádný Dropbox),
|
||||
# klíč = číslo dokumentu + verze. Dvě fáze:
|
||||
#
|
||||
# [mongo] Re-parse NEJNOVĚJŠÍHO archivovaného study reportu
|
||||
# (WhatToDownload/Zpracovano/*Study Level*.xlsx)
|
||||
# v1.6 parserem a obohacení existujících dokumentů
|
||||
# o nová pole (level, levels[], scopes[], studies[],
|
||||
# countries=[], sites=[], classification,
|
||||
# process_name, external_system_name, created_by,
|
||||
# last_modified_by, version_created_by).
|
||||
# NESAHÁ na download stav (downloaded, sha256,
|
||||
# seaweed_*, history, first_seen).
|
||||
#
|
||||
# [seaweed] Překlíčování SeaweedFS ze starých SHA cest na nové
|
||||
# /vtmf-documents/<vtmf>/<verze>.<přípona>. Zdroj
|
||||
# bajtů = stávající soubor na disku (pole file), jako
|
||||
# fallback GET ze staré SHA cesty. Po úspěchu: oprava
|
||||
# seaweed_path/url + sha256 v Mongo, smazání staré SHA
|
||||
# cesty a ODEBRÁNÍ pole file z Mongo (Dropbox se už
|
||||
# nepoužívá; fyzické soubory v Dropboxu pak můžeš
|
||||
# smazat ručně).
|
||||
#
|
||||
# DEFAULT je DRY-RUN. Ostře až s --apply. Idempotentní.
|
||||
#
|
||||
# Použití:
|
||||
# python migrate_to_v16.py # dry-run, vše
|
||||
# python migrate_to_v16.py --apply # ostře, vše
|
||||
# python migrate_to_v16.py --phase mongo --apply
|
||||
# python migrate_to_v16.py --phase seaweed --apply
|
||||
# ============================================================
|
||||
|
||||
import argparse
|
||||
import hashlib
|
||||
import importlib.util
|
||||
import mimetypes
|
||||
import re
|
||||
import urllib.error
|
||||
import urllib.request
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
|
||||
# Directory of this script; the pipeline script is looked up next to it.
SCRIPT_DIR = Path(__file__).resolve().parent
PIPE_FILE = SCRIPT_DIR.joinpath("vtmf_pipeline_v1.6.py")

# Shape of the old SHA-256 content-addressed path (to be removed from
# SeaweedFS).
OLD_SHA_PATH_RE = re.compile(
    r"^/vtmf-documents/[0-9a-f]{2}/[0-9a-f]{2}/[0-9a-f]{64}$"
)
|
||||
|
||||
|
||||
def load_pipeline():
    """Execute the pipeline script at ``PIPE_FILE`` and return it as a module.

    The file is loaded by path through ``importlib`` under the module name
    ``vtmf_pipeline_v16``.
    """
    module_spec = importlib.util.spec_from_file_location(
        "vtmf_pipeline_v16", PIPE_FILE)
    pipeline = importlib.util.module_from_spec(module_spec)
    module_spec.loader.exec_module(pipeline)
    return pipeline
|
||||
|
||||
|
||||
def log(msg):
    """Print ``msg`` to stdout and flush immediately."""
    print(msg, flush=True)
|
||||
|
||||
|
||||
def http_get(url):
    """Fetch ``url`` (120 s timeout) and return the whole body as bytes."""
    response = urllib.request.urlopen(url, timeout=120)
    try:
        return response.read()
    finally:
        response.close()
|
||||
|
||||
|
||||
def seaweed_delete(url):
    """Send DELETE for ``url``; return True when the object is gone afterwards.

    A 404 answer counts as success (there is nothing left to delete).  Any
    other HTTP error, and any other failure (connection problem, malformed
    URL, ...), yields False instead of raising, so callers can treat the
    cleanup as best-effort.
    """
    try:
        request = urllib.request.Request(url, method="DELETE")
        # Use the response as a context manager so it is closed right away.
        with urllib.request.urlopen(request, timeout=30):
            return True
    except urllib.error.HTTPError as e:
        # urlopen never raises HTTPError for a 2xx answer, so 404 is the
        # only error status that still means "the object does not exist".
        return e.code == 404
    except Exception:
        return False
|
||||
|
||||
|
||||
# --- Fáze MONGO --------------------------------------------------------
|
||||
|
||||
def phase_mongo(mod, coll, apply):
    """Enrich existing Mongo records from the newest archived study report.

    Picks the last ``*Study Level*.xlsx`` file (in sorted name order) from
    ``WhatToDownload/Zpracovano`` next to this script, parses it with the
    pipeline's ``read_documents_from_excel`` and keeps the rows whose
    ``studies`` contain ``mod.TARGET_STUDY``.  For every row that already
    has a Mongo record, the descriptive fields are overwritten and the
    study scope/level are added to ``scopes`` / ``levels``.

    Records missing from Mongo are reported (first 10) and skipped.  With
    ``apply`` false nothing is written; the counters are still reported.
    """
    archive_dir = SCRIPT_DIR / "WhatToDownload" / "Zpracovano"
    candidates = sorted(archive_dir.glob("*Study Level*.xlsx"))
    if not candidates:
        log("[!] Nenašel jsem žádný archivovaný study report — fáze mongo přeskočena.")
        return
    newest = candidates[-1]
    log(f"[i] [mongo] Re-parse: {newest.name}")
    parsed = mod.read_documents_from_excel(newest, "study")
    docs = [row for row in parsed if mod.TARGET_STUDY in row["studies"]]
    log(f"[i] [mongo] {len(docs)} dokumentů study-level {mod.TARGET_STUDY}.")

    scope_key = f"study|{mod.TARGET_STUDY}|"
    # Fields copied verbatim from the parsed row, in the stored order.
    copied_fields = (
        "url", "name", "status", "type", "subtype", "classification",
        "desc", "process_name", "external_system_name", "created_by",
        "last_modified_by", "version_created_by", "date", "studies",
    )
    enriched = 0
    missing = 0
    for row in docs:
        key = mod.doc_key(row["vtmf"], row["version"])
        if not coll.find_one({"_id": key}, {"_id": 1}):
            missing += 1
            if missing <= 10:  # keep the log short
                log(f" [!] V Mongo chybí {key} (přeskočeno).")
            continue

        set_fields = {"level": "study"}
        set_fields.update((field, row[field]) for field in copied_fields)
        set_fields["countries"] = []
        set_fields["sites"] = []
        if apply:
            coll.update_one({"_id": key}, {
                "$set": set_fields,
                "$addToSet": {"scopes": scope_key, "levels": "study"},
            })
        enriched += 1

    tag = "APPLY" if apply else "DRY"
    tail = f", {missing} v Mongo chybělo." if missing else "."
    log(f"[{tag}] [mongo] Obohaceno {enriched} dokumentů" + tail)
|
||||
|
||||
|
||||
# --- Fáze SEAWEED ------------------------------------------------------
|
||||
|
||||
def phase_seaweed(mod, coll, apply):
    """Re-key SeaweedFS objects from old SHA paths to the new per-document paths.

    Candidates are downloaded, non-placeholder records that still carry a
    ``file`` field.  For each one the bytes are taken from the file on disk
    or, when that is gone, fetched from the old SHA path.  They are stored
    through ``mod.seaweed_store``; the Mongo record is then repointed
    (``seaweed_path``/``seaweed_url``/``sha256``/``seaweed_synced_at``) and
    its ``file`` field removed.

    The old SHA object is deleted only once no other Mongo record still
    references it: old paths are content-addressed, so several records can
    share one object, and deleting it early would break the fallback fetch
    for the remaining records.

    With ``apply`` false only the planned actions are logged.
    """
    q = {"downloaded": True, "placeholder": {"$ne": True}, "file": {"$ne": None}}
    docs = list(coll.find(q))
    log(f"[i] [seaweed] Kandidátů (s polem file): {len(docs)}")

    uploaded = old_deleted = old_shared = unset = missing = err = already = 0
    for doc in docs:
        key = doc["_id"]
        src = Path(doc["file"])
        ext = src.suffix
        new_path = mod.seaweed_path(doc["vtmf"], doc["version"], ext)
        old_path = doc.get("seaweed_path")
        old_is_sha = bool(old_path and OLD_SHA_PATH_RE.match(old_path))

        if old_path == new_path:
            already += 1
            if apply:  # already migrated: only drop the stale file field
                coll.update_one({"_id": key}, {"$unset": {"file": ""}})
                unset += 1
            continue

        if not apply:
            note = f" (smazat starou {old_path})" if old_is_sha else ""
            log(f" PUT {new_path}{note} (+ unset file)")
            continue

        # Source of the bytes: disk first, old SHA path as the fallback.
        try:
            if src.exists():
                data = src.read_bytes()
            elif old_is_sha:
                data = http_get(mod.SEAWEED_FILER + old_path)
            else:
                missing += 1
                if missing <= 10:  # keep the log short
                    log(f" [!] {key}: zdroj nedostupný (soubor i SHA chybí).")
                continue

            mime = mimetypes.guess_type("f" + ext)[0] or "application/octet-stream"
            sw_path, sw_url = mod.seaweed_store(doc["vtmf"], doc["version"], ext, data, mime)
            coll.update_one({"_id": key}, {
                "$set": {"seaweed_path": sw_path, "seaweed_url": sw_url,
                         "sha256": hashlib.sha256(data).hexdigest(),
                         "seaweed_synced_at": datetime.now()},
                "$unset": {"file": ""}})
            uploaded += 1
            unset += 1
            if old_is_sha and old_path != sw_path:
                # This record was just repointed, so any remaining match is
                # a different record that still needs the old object.
                if coll.find_one({"seaweed_path": old_path}, {"_id": 1}):
                    old_shared += 1
                elif seaweed_delete(mod.SEAWEED_FILER + old_path):
                    old_deleted += 1
        except Exception as e:
            err += 1
            log(f" [!] {key}: SeaweedFS selhal: {e}")

    log(f"[{'APPLY' if apply else 'DRY'}] [seaweed] Překlíčováno {uploaded}, "
        f"už na nové cestě {already}, starých SHA smazáno {old_deleted}, "
        f"pole file odebráno {unset}, chybí zdroj {missing}, chyb {err}."
        + (f" Sdílených starých SHA zatím ponecháno {old_shared}."
           if old_shared else ""))
|
||||
|
||||
|
||||
# --- Main --------------------------------------------------------------
|
||||
|
||||
def main():
    """Parse the command line and run the selected migration phase(s).

    ``--phase`` selects ``mongo``, ``seaweed`` or ``all`` (default);
    without ``--apply`` both phases run as a dry run.
    """
    parser = argparse.ArgumentParser(description="Migrace VTMF dat na schéma v1.6")
    parser.add_argument("--phase", choices=["mongo", "seaweed", "all"], default="all")
    parser.add_argument("--apply", action="store_true",
                        help="ostrý běh (bez něj jen DRY-RUN)")
    args = parser.parse_args()

    if args.apply:
        mode = "APPLY (ostře)"
    else:
        mode = "DRY-RUN (nic se nemění)"
    log(f"=== Migrace na v1.6 — fáze: {args.phase} — režim: {mode} ===\n")

    mod = load_pipeline()
    _, coll, _ = mod.get_db()
    log(f"[ok] Mongo: {mod.MONGO_URI} / {mod.MONGO_DB}.{mod.MONGO_COLL}\n")

    # Phases always run in this order; "all" selects both.
    phases = (("mongo", phase_mongo), ("seaweed", phase_seaweed))
    for phase_name, run_phase in phases:
        if args.phase in (phase_name, "all"):
            run_phase(mod, coll, args.apply)
            log("")

    if args.apply:
        log("=== Migrace dokončena. ===")
    else:
        log("=== DRY-RUN hotov. Pro ostrý běh přidej --apply. ===")


if __name__ == "__main__":
    main()
|
||||
@@ -0,0 +1,140 @@
|
||||
# ============================================================
|
||||
# seaweed_backfill_v1.1.py
|
||||
# Verze: 1.1
|
||||
# Datum: 2026-06-15
|
||||
# v1.1: retry 3x s 5s pauzou při HTTP 5xx (přechodná chyba serveru)
|
||||
# Popis: Jednorázový backfill — nahraje do SeaweedFS Filer
|
||||
# všechny dokumenty z VTMF.documents, které jsou na disku
|
||||
# (downloaded=True, file!=null) ale ještě nemají seaweed_path.
|
||||
# Placeholdery a záznamy bez souboru přeskočí.
|
||||
# Lze spustit opakovaně — HEAD check zajistí dedup,
|
||||
# přerušení kdykoli naváže příště.
|
||||
# ============================================================
|
||||
|
||||
import hashlib
|
||||
import mimetypes
|
||||
import sys
|
||||
import time
|
||||
import urllib.error
|
||||
import urllib.request
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
|
||||
from pymongo import MongoClient, ASCENDING
|
||||
|
||||
# MongoDB connection: server URI, database name and collection name.
MONGO_URI = "mongodb://192.168.1.76:27017"
MONGO_DB = "VTMF"
MONGO_COLL = "documents"

# SeaweedFS Filer base URL and the path prefix used for stored documents.
SEAWEED_FILER = "http://192.168.1.50:8888"
SEAWEED_PREFIX = "/vtmf-documents"


def log(msg):
    """Print ``msg`` to stdout and flush immediately."""
    print(msg, flush=True)


def sw_path(sha256):
    """Return the storage path ``<prefix>/<aa>/<bb>/<digest>`` for a hex digest.

    The two directory levels are the first and the second pair of characters
    of ``sha256``.
    """
    first_pair = sha256[:2]
    second_pair = sha256[2:4]
    return f"{SEAWEED_PREFIX}/{first_pair}/{second_pair}/{sha256}"
|
||||
|
||||
|
||||
MAX_ATTEMPTS = 3
RETRY_PAUSE = 5  # seconds to wait between attempts after an HTTP 5xx


def seaweed_store(data, mime="application/octet-stream"):
    """Upload ``data`` to the SeaweedFS Filer unless it is already there.

    The target path is derived from the SHA-256 of ``data``.  A HEAD request
    probes it first; only a 404 answer leads to a PUT.  The PUT is retried
    up to ``MAX_ATTEMPTS`` times, pausing ``RETRY_PAUSE`` seconds, when the
    server answers with a 5xx status.

    Returns ``(path, url, uploaded)``; ``uploaded`` is False on a dedup hit.

    Raises ``urllib.error.HTTPError`` for a HEAD status other than 404, for
    a 4xx answer to the PUT (retrying would not help) and for the last 5xx
    answer once all attempts are used up.
    """
    sha256 = hashlib.sha256(data).hexdigest()
    path = sw_path(sha256)
    url = SEAWEED_FILER + path

    probe = urllib.request.Request(url, method="HEAD")
    try:
        # Use the response as a context manager so the connection is
        # released deterministically instead of whenever the GC runs.
        with urllib.request.urlopen(probe, timeout=10):
            pass
    except urllib.error.HTTPError as e:
        if e.code != 404:
            raise
    else:
        return path, url, False  # dedup hit

    last_err = None
    for attempt in range(1, MAX_ATTEMPTS + 1):
        upload = urllib.request.Request(url, data=data, method="PUT",
                                        headers={"Content-Type": mime})
        try:
            with urllib.request.urlopen(upload, timeout=120):
                pass
            return path, url, True
        except urllib.error.HTTPError as e:
            if e.code < 500:
                raise  # 4xx -- repeating the request makes no sense
            last_err = e
            if attempt < MAX_ATTEMPTS:
                log(f" [!] HTTP {e.code} (pokus {attempt}/{MAX_ATTEMPTS}), čekám {RETRY_PAUSE}s...")
                time.sleep(RETRY_PAUSE)
    raise last_err
|
||||
|
||||
|
||||
def main():
    """Upload every downloaded document that has no SeaweedFS path yet.

    Selects records with ``downloaded=True``, a non-null ``file``, no
    ``seaweed_path`` and no placeholder flag, uploads each existing file
    through ``seaweed_store`` and writes the resulting
    ``sha256``/``seaweed_*`` fields back to Mongo.  Records whose file is
    missing on disk are skipped.

    Exits with status 1 when at least one document failed, 0 otherwise.
    """
    client = MongoClient(MONGO_URI, serverSelectionTimeoutMS=5000)
    try:
        client.admin.command("ping")
        coll = client[MONGO_DB][MONGO_COLL]
        log(f"[ok] Mongo připojeno: {MONGO_URI} / {MONGO_DB}.{MONGO_COLL}")

        query = {
            "downloaded": True,
            "placeholder": {"$ne": True},
            "seaweed_path": None,
            "file": {"$ne": None},
        }
        todo = list(coll.find(query).sort([("vtmf", ASCENDING), ("version", ASCENDING)]))
        log(f"[i] Ke zpracování: {len(todo)} dokumentů\n")

        uploaded = dedup = skipped = failed = 0

        for n, doc in enumerate(todo, 1):
            key = doc["_id"]
            path = doc.get("file")

            if not path or not Path(path).exists():
                log(f"[{n}/{len(todo)}] {key} [!] Soubor nenalezen na disku — přeskočeno.")
                skipped += 1
                continue

            try:
                data = Path(path).read_bytes()
                size_kb = len(data) / 1024
                size_str = f"{size_kb:.0f} KB" if size_kb < 1024 else f"{size_kb / 1024:.1f} MB"
                # The description only decorates the progress line; a null
                # "desc" must not make the document count as failed.
                desc = str(doc.get("desc") or "")
                log(f"[{n}/{len(todo)}] {key} ({size_str} {Path(path).suffix.lstrip('.').upper()}) {desc[:60]}")
                mime = mimetypes.guess_type(path)[0] or "application/octet-stream"
                sha256_hex = hashlib.sha256(data).hexdigest()

                sw_p, sw_url, was_new = seaweed_store(data, mime)

                coll.update_one({"_id": key}, {"$set": {
                    "sha256": sha256_hex,
                    "seaweed_path": sw_p,
                    "seaweed_url": sw_url,
                    "seaweed_synced_at": datetime.now(),
                }})

                if was_new:
                    uploaded += 1
                    log(f" [ok] Nahráno ({size_str}) → {sw_p}")
                else:
                    dedup += 1
                    log(f" [i] Dedup hit ({size_str}) → {sw_p}")

            except Exception as e:
                failed += 1
                log(f" [!] Chyba: {e}")

        log(f"\n{'='*60}")
        log(f" Hotovo: {uploaded} nahráno, {dedup} dedup, "
            f"{skipped} bez souboru, {failed} chyb.")
        log(f"{'='*60}")
    finally:
        # Release the Mongo connection on every path, including a failed
        # ping or an interrupted run.
        client.close()

    sys.exit(1 if failed else 0)


if __name__ == "__main__":
    main()
|
||||
@@ -0,0 +1,46 @@
|
||||
"""Rychlý test SeaweedFS Filer (port 8888) — PUT / HEAD / GET / DELETE."""
|
||||
import hashlib
|
||||
import urllib.error
|
||||
import urllib.request
|
||||
|
||||
# Filer under test and a small payload whose digest names the test object.
FILER = "http://192.168.1.50:8888"
PAYLOAD = b"SeaweedFS VTMF test " + b"x" * 1000
SHA256 = hashlib.sha256(PAYLOAD).hexdigest()
PATH = "/vtmf-documents/_test/" + SHA256[:8]
URL = FILER + PATH


def req(method, data=None):
    """Send ``method`` to ``URL`` and return ``(status, body)``.

    A Content-Type header (text/plain) is only sent together with a body.
    An HTTP error is reported as ``(error_code, b"")`` instead of raising.
    """
    headers = {"Content-Type": "text/plain"} if data else {}
    request = urllib.request.Request(URL, method=method, data=data,
                                     headers=headers)
    try:
        with urllib.request.urlopen(request, timeout=10) as resp:
            return resp.status, resp.read()
    except urllib.error.HTTPError as e:
        return e.code, b""


print(f"Filer: {FILER}")
print(f"Path: {PATH}\n")

# 1) create the object
status, _ = req("PUT", PAYLOAD)
assert status in (200, 201), f"PUT selhal: {status}"
print(f"[ok] PUT → {status}")

# 2) it must be visible to HEAD
status, _ = req("HEAD")
assert status == 200, f"HEAD selhal: {status}"
print(f"[ok] HEAD → {status}")

# 3) and come back byte-identical
status, body = req("GET")
assert status == 200 and body == PAYLOAD, f"GET selhal: {status}, délka={len(body)}"
print(f"[ok] GET → {status}, {len(body)} B")

# 4) remove it again
status, _ = req("DELETE")
assert status in (200, 204), f"DELETE selhal: {status}"
print(f"[ok] DELETE → {status}")

# 5) afterwards the object must be gone
status, _ = req("HEAD")
assert status == 404, f"Po DELETE HEAD vrátil {status}, čekal 404"
print(f"[ok] HEAD po DELETE → 404 (soubor odstraněn)\n")

print("SeaweedFS Filer OK.")
|
||||
@@ -0,0 +1,112 @@
|
||||
# vtmf_pipeline_v1.4 — Kompletní V-TMF workflow (report → Mongo → download)
|
||||
|
||||
**Verze:** 1.4 · **Datum:** 2026-06-15
|
||||
|
||||
**Změny v1.1:** oprava tichého selhání — výjimka kteréhokoli kroku se
|
||||
vypíše jako „PIPELINE SELHALA" + exit kód 2 (v1.0 končila zavádějícím
|
||||
souhrnem „0 staženo, 0 chyb"). Export reportu robustnější: menu ⋯,
|
||||
položka Export to Excel i tlačítko Export se hledají přes víc selektorů
|
||||
a ve všech frames; při nenalezení se automaticky uloží diagnostika
|
||||
stránky do debug/<čas>_report_* (screenshot, HTML všech frames, výpis
|
||||
title/aria-label atributů) — z ní se dá určit přesný selektor.
|
||||
|
||||
**Změny v1.2:** selektory exportu ověřené na živém DOM (Claude in
|
||||
Chrome; žádný iframe na celé stránce): menu ⋯ =
|
||||
`.actionMenuContainer .dropDown.vv_dropdown_toggle button.vv-icon-button`
|
||||
(button má prázdný title!); menu se načítá asynchronně (AJAX) →
|
||||
po kliknutí se čeká na položku `a.ReportAction[data-action-name='ExcelExport']`;
|
||||
„Data Only" = radio `name=requiredRadioField value=STANDARD`, defaultně
|
||||
checked (pojistka přes .check()); tlačítko Export = React `<button>`
|
||||
s emotion class hash → selektovat jen přes roli+text.
|
||||
|
||||
**Změny v1.3:** na konci běhu se prohlížeč i konzole zavřou
|
||||
automaticky (žádné čekání na ENTER); interaktivní vstup zůstává jen
|
||||
u 2FA a u ručně nezavřitelného dialogu.
|
||||
|
||||
**Změny v1.4:** detekce placeholder dokumentů — Vault zobrazuje text
|
||||
„This placeholder has no content", dokument nemá žádný Source File ke
|
||||
stažení. Při detekci se zapíše `placeholder=True, downloaded=True` do
|
||||
Mongo a dokument se přeskočí bez chyby. Souhrn na konci běhu uvádí
|
||||
počet placeholderů zvlášť.
|
||||
|
||||
Jeden běh skriptu udělá celé workflow pro studii 77242113UCO3001:
|
||||
|
||||
1. **Login** do vtmf.veevavault.com (persistentní profil
|
||||
`vault_profile/`, J&J SSO, případné 2FA potvrdíte na telefonu
|
||||
+ ENTER; údaje z `.env` v rootu projektu).
|
||||
2. **Export reportu** „Document Inventory Report - Study Level"
|
||||
(přímá URL s ID reportu `0RP000000000182` a filtrem studie
|
||||
`0ST000000137008`) → menu ⋯ → Export to Excel → Data Only →
|
||||
uloží se s timestampem do `WhatToDownload/`, po zpracování se
|
||||
přesune do `WhatToDownload/Zpracovano/`.
|
||||
3. **Parse + sync do MongoDB** — Tower `mongodb://192.168.1.76:27017`,
|
||||
db **VTMF**, kolekce **documents**, klíč `_id = "VTMF-xxx|vY.Z"`
|
||||
(VTMF číslo + verze, unikátní index na dvojici):
|
||||
- nový dokument → založí se (first_seen, deleted=False,
|
||||
downloaded=False),
|
||||
- změna sledovaných polí (name, status, type, subtype, desc,
|
||||
date, url, studies) → promítne se + záznam do `history[]`
|
||||
(timestamp + old/new),
|
||||
- dokument chybí v reportu → `deleted=True, deleted_at` a stažený
|
||||
soubor se přejmenuje s ` [D]` před příponou,
|
||||
- dokument se vrátí do reportu → `deleted=False` a ` [D]`
|
||||
se ze souboru zase odebere.
|
||||
Výsledná sada = záznamy s `deleted=False`.
|
||||
4. **Stažení chybějících** — všechny `deleted=False, downloaded≠True`:
|
||||
doc URL → Source File → uložení do
|
||||
`U:\Dropbox\!!!Days\Downloads Z230\VTMF-77242113UCO3001\<Type>\<Subtype>\`
|
||||
jako `YYYY-MM-DD Description [VTMF-xxx] [vY.Z].<skutečná přípona>`.
|
||||
Výsledek (cesta, čas, případně chyba) se ihned zapisuje do Mongo —
|
||||
běh jde kdykoli přerušit a příště naváže.
|
||||
Placeholder dokumenty (stránka s textem „This placeholder has no
|
||||
content") se přeskočí a označí `placeholder=True, downloaded=True`.
|
||||
|
||||
## Mongo schéma (kolekce documents)
|
||||
|
||||
```
|
||||
_id: "VTMF-19077748|v1.0"
|
||||
vtmf, version, url, name, status, type, subtype, desc, date, studies
|
||||
first_seen, last_seen # kdy poprvé/naposledy v reportu
|
||||
deleted, deleted_at # není ve výsledné sadě reportu
|
||||
downloaded, file, downloaded_at
|
||||
placeholder # True = Vault placeholder bez obsahu
|
||||
last_error, error_at # poslední chyba stahování
|
||||
history: [{ts, changes: {pole: {old, new}}}]
|
||||
```
|
||||
|
||||
## Migrace starého stavu
|
||||
|
||||
Při prvním běhu se `download_state.csv` (z download_vault v2.x)
|
||||
jednorázově namigruje: záznamy `ok` se k odpovídajícímu VTMF zapíší
|
||||
jako `downloaded=True` + cesta. CSV se přejmenuje na
|
||||
`download_state.csv.imported`.
|
||||
|
||||
## Konfigurace (konstanty nahoře)
|
||||
|
||||
- `REPORT_URL` — ID reportu + filtr studie (pro jinou studii se mění
|
||||
jen tato dvě ID)
|
||||
- `LIMIT` — None = stáhnout vše zbývající; číslo = dávka na běh
|
||||
- `MONGO_URI/DB/COLL`, `DOWNLOAD_ROOT`, `EXCEL_DIR`
|
||||
- `TRACKED_FIELDS`, `MAX_ATTEMPTS`, `RETRY_PAUSE_MS`, `BETWEEN_DOCS_MS`
|
||||
|
||||
## Ověřené technické detaily (nesahat bez ověření)
|
||||
|
||||
- Maintenance dialog: zavírat POUZE přes `.ui-dialog a.ok.vv_button`
|
||||
(křížek `.ui-dialog-titlebar-close` je display:none); objevuje se
|
||||
se zpožděním → wait_for visible 8 s (home) / 2-4 s (jinde).
|
||||
- Report Excel má rozbité deklarované rozměry → přímá iterace řádků.
|
||||
- Document Name/Number/Status jsou =HYPERLINK vzorce → regex.
|
||||
- Export kliknout právě jednou; 503/redirecty v network logu
|
||||
ignorovat, rozhoduje expect_download.
|
||||
- Placeholder detekce: `page.locator("div.vv_placeholder_text")` (uvnitř
|
||||
`div.vv_placeholder_pane > div.vv_placeholder_container > div.vv-placeholder-drag-and-drop-container`)
|
||||
se testuje před hledáním Source File ikony — CSS selektor je spolehlivější
|
||||
než text match.
|
||||
|
||||
## Spuštění
|
||||
|
||||
```powershell
|
||||
& "U:\PythonProject\Janssen\.venv\Scripts\python.exe" "U:\PythonProject\Janssen\VTMFDownloadFiles\vtmf_pipeline_v1.4.py"
|
||||
```
|
||||
|
||||
Předchůdce: vtmf_pipeline_v1.3 (TRASH/).
|
||||
@@ -0,0 +1,96 @@
|
||||
# vtmf_pipeline_v1.5 — Kompletní V-TMF workflow (report → Mongo → download → SeaweedFS)
|
||||
|
||||
**Verze:** 1.5 · **Datum:** 2026-06-15
|
||||
|
||||
**Změny v1.5:** upload každého staženého dokumentu do SeaweedFS Filer
|
||||
(`192.168.1.50:8888`, cesta `/vtmf-documents/ab/cd/<sha256>`).
|
||||
SHA-256 content-addressed dedup — identický soubor se uloží jen jednou
|
||||
(HEAD check → 404 → PUT; při 200 dedup hit). Chyba uploadu neblokuje
|
||||
download ani zápis do Mongo — soubor zůstane na disku a pole
|
||||
`sha256/seaweed_path/seaweed_url/seaweed_synced_at` zůstanou `null`
|
||||
(lze doplnit backfillem). Souhrn na konci uvádí počet nově nahraných,
|
||||
dedup hitů a případných chyb uploadu zvlášť.
|
||||
|
||||
_(Předchozí změny viz TRASH/vtmf_pipeline_v1.4.md)_
|
||||
|
||||
Jeden běh skriptu udělá celé workflow pro studii 77242113UCO3001:
|
||||
|
||||
1. **Login** do vtmf.veevavault.com (persistentní profil
|
||||
`vault_profile/`, J&J SSO, případné 2FA potvrdíte na telefonu
|
||||
+ ENTER; údaje z `.env` v rootu projektu).
|
||||
2. **Export reportu** „Document Inventory Report - Study Level"
|
||||
(přímá URL s ID reportu `0RP000000000182` a filtrem studie
|
||||
`0ST000000137008`) → menu ⋯ → Export to Excel → Data Only →
|
||||
uloží se s timestampem do `WhatToDownload/`, po zpracování se
|
||||
přesune do `WhatToDownload/Zpracovano/`.
|
||||
3. **Parse + sync do MongoDB** — Tower `mongodb://192.168.1.76:27017`,
|
||||
db **VTMF**, kolekce **documents**, klíč `_id = "VTMF-xxx|vY.Z"`:
|
||||
- nové dokumenty se založí,
|
||||
- změny sledovaných polí se promítnou (+ `history[]`),
|
||||
- dokumenty chybějící v reportu se označí `deleted=True`
|
||||
a stažený soubor dostane ` [D]` před příponou,
|
||||
- znovuobjevené se vzkřísí a ` [D]` se odebere.
|
||||
4. **Stažení + SeaweedFS upload** — všechny `deleted=False, downloaded≠True`:
|
||||
- Source File se uloží do
|
||||
`U:\Dropbox\!!!Days\Downloads Z230\VTMF-77242113UCO3001\<Type>\<Subtype>\`
|
||||
jako `YYYY-MM-DD Description [VTMF-xxx] [vY.Z].<přípona>`,
|
||||
- soubor se přečte z disku, vypočítá se SHA-256, obsah se nahraje
|
||||
do SeaweedFS na `/vtmf-documents/{sha256[:2]}/{sha256[2:4]}/{sha256}`,
|
||||
- do Mongo se zapíše `downloaded=True, file, sha256, seaweed_path,
|
||||
seaweed_url, seaweed_synced_at`; chyba SeaweedFS tyto fieldy
|
||||
nechá `null`, ale `downloaded=True` se zapíše (soubor je na disku).
|
||||
- Placeholder dokumenty (`div.vv_placeholder_text` viditelný) se
|
||||
přeskočí s `placeholder=True, downloaded=True`.
|
||||
|
||||
## Mongo schéma (kolekce documents)
|
||||
|
||||
```
|
||||
_id: "VTMF-19077748|v1.0"
|
||||
vtmf, version, url, name, status, type, subtype, desc, date, studies
|
||||
first_seen, last_seen # kdy poprvé/naposledy v reportu
|
||||
deleted, deleted_at # není ve výsledné sadě reportu
|
||||
downloaded, file, downloaded_at
|
||||
placeholder # True = Vault placeholder bez obsahu
|
||||
sha256 # hex SHA-256 staženého souboru
|
||||
seaweed_path # /vtmf-documents/ab/cd/<sha256>
|
||||
seaweed_url # http://192.168.1.50:8888/vtmf-documents/...
|
||||
seaweed_synced_at # kdy nahráno / null při chybě
|
||||
last_error, error_at # poslední chyba stahování
|
||||
history: [{ts, changes: {pole: {old, new}}}]
|
||||
```
|
||||
|
||||
## SeaweedFS detaily
|
||||
|
||||
- **Filer**: `http://192.168.1.50:8888` (přímý PUT, žádný master assign)
|
||||
- **Dedup**: HEAD → 404 → PUT; HEAD → 200 → dedup hit (vrátí `uploaded=False`)
|
||||
- **Timeout**: HEAD 10 s, PUT 120 s (velké soubory)
|
||||
- **MIME**: `mimetypes.guess_type()`, fallback `application/octet-stream`
|
||||
- **Backfill**: dokumenty s `downloaded=True, seaweed_path=null` lze
|
||||
dohnat samostatným skriptem (čte `file` z Mongo, nahraje, zapíše pole)
|
||||
|
||||
## Konfigurace (konstanty nahoře)
|
||||
|
||||
- `SEAWEED_FILER` — URL Filer serveru
|
||||
- `SEAWEED_PREFIX` — prefix cesty (`/vtmf-documents`)
|
||||
- `REPORT_URL` — ID reportu + filtr studie
|
||||
- `LIMIT` — None = vše; číslo = dávka
|
||||
- `MONGO_URI/DB/COLL`, `DOWNLOAD_ROOT`, `EXCEL_DIR`
|
||||
- `TRACKED_FIELDS`, `MAX_ATTEMPTS`, `RETRY_PAUSE_MS`, `BETWEEN_DOCS_MS`
|
||||
|
||||
## Ověřené technické detaily (nesahat bez ověření)
|
||||
|
||||
- Maintenance dialog: zavírat POUZE přes `.ui-dialog a.ok.vv_button`
|
||||
(křížek `.ui-dialog-titlebar-close` je display:none).
|
||||
- Report Excel má rozbité deklarované rozměry → přímá iterace řádků.
|
||||
- Document Name/Number/Status jsou =HYPERLINK vzorce → regex.
|
||||
- Export kliknout právě jednou; rozhoduje `expect_download`.
|
||||
- Placeholder detekce: `div.vv_placeholder_text` (uvnitř
|
||||
`div.vv_placeholder_pane > div.vv_placeholder_container`).
|
||||
|
||||
## Spuštění
|
||||
|
||||
```powershell
|
||||
& "U:\PythonProject\Janssen\.venv\Scripts\python.exe" "U:\PythonProject\Janssen\VTMFDownloadFiles\vtmf_pipeline_v1.5.py"
|
||||
```
|
||||
|
||||
Předchůdce: vtmf_pipeline_v1.4 (TRASH/).
|
||||
@@ -0,0 +1,937 @@
|
||||
# ============================================================
|
||||
# vtmf_pipeline_v1.5.py
|
||||
# Verze: 1.5
|
||||
# Datum: 2026-06-15
|
||||
# Popis: Kompletní workflow V-TMF (J&J Veeva Vault), studie
|
||||
# 77242113UCO3001. Jeden běh udělá:
|
||||
# 1) login do Vaultu (persistentní session + ruční 2FA),
|
||||
# 2) export reportu "Document Inventory Report - Study
|
||||
# Level" do Excelu (Data Only) do WhatToDownload/,
|
||||
# 3) parse reportu a synchronizaci do MongoDB
|
||||
# (Tower, db VTMF, kolekce documents,
|
||||
# klíč = VTMF číslo + verze):
|
||||
# - nové dokumenty se založí,
|
||||
# - změny polí se promítnou (+ history[]),
|
||||
# - dokumenty chybějící v reportu se označí
|
||||
# deleted=True a stažený soubor dostane ' [D]',
|
||||
# - znovuobjevené se vzkřísí a ' [D]' se odebere,
|
||||
# 4) stažení všech dosud nestažených dokumentů do
|
||||
# U:\Dropbox\!!!Days\Downloads Z230\VTMF-77242113UCO3001\
|
||||
# <Type>\<Subtype>\"YYYY-MM-DD Description
|
||||
# [VTMF-x] [v1.0].<přípona>" + zápis stavu do Mongo.
|
||||
#
|
||||
# Tracking stahování je KOMPLETNĚ v Mongo; starý
|
||||
# download_state.csv se při prvním běhu jednorázově
|
||||
# namigruje a přejmenuje na .imported.
|
||||
#
|
||||
# Vychází z download_vault_v2.1 (v TRASH/) — login, dialogy
|
||||
# a stahování beze změny; nové jsou kroky 2 a 3.
|
||||
#
|
||||
# v1.1: oprava tichého selhání — chyba kteréhokoli kroku se teď
|
||||
# hlasitě vypíše (a exit kód 2), místo aby běh skončil
|
||||
# souhrnem "0 staženo, 0 chyb". Export reportu: více
|
||||
# selektorů pro menu ⋯ i položku Export to Excel (včetně
|
||||
# hledání ve všech frames) a při selhání automatický záchyt
|
||||
# diagnostiky stránky do debug/ (screenshot + HTML frames).
|
||||
# v1.2: selektory exportu OVĚŘENÉ na živém DOM (žádný iframe):
|
||||
# menu ⋯ = .actionMenuContainer .dropDown.vv_dropdown_toggle
|
||||
# button.vv-icon-button (title prázdný!); menu se načítá
|
||||
# asynchronně -> čekat na položku; položka =
|
||||
# a.ReportAction[data-action-name='ExcelExport']; Data Only =
|
||||
# radio name=requiredRadioField value=STANDARD (default
|
||||
# checked); Export = <button> role+text (emotion class hash,
|
||||
# neselektovat podle tříd).
|
||||
# v1.3: na konci běhu se prohlížeč i okno zavře automaticky
|
||||
# (žádné čekání na ENTER) — vhodné pro bezobslužné běhy.
|
||||
# Interaktivní vstupy zůstávají jen tam, kde jsou nutné
|
||||
# (2FA, ručně nezavřitelný dialog).
|
||||
# v1.4: detekce placeholder dokumentů — stránka s textem
|
||||
# "This placeholder has no content" se přeskočí
|
||||
# (placeholder=True, downloaded=True v Mongo), žádná chyba.
|
||||
# v1.5: upload stažených dokumentů do SeaweedFS Filer
|
||||
# (192.168.1.50:8888, cesta /vtmf-documents/ab/cd/<sha256>).
|
||||
# SHA-256 content-addressed dedup — identický soubor se uloží
|
||||
# jen jednou. Chyba uploadu neblokuje download; chybějící
|
||||
# sha256/seaweed_path lze doplnit backfillem. Mongo nově ukládá:
|
||||
# sha256, seaweed_path, seaweed_url, seaweed_synced_at.
|
||||
# Souhrn běhu uvádí počet nově nahraných vs. dedup hitů.
|
||||
#
|
||||
# Heslo se NIKDY nedává natvrdo do skriptu — čte se z .env
|
||||
# v rootu projektu Janssen (VAULT_USER / VAULT_PASS).
|
||||
# ============================================================
|
||||
|
||||
import csv
|
||||
import hashlib
|
||||
import mimetypes
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
import urllib.error
|
||||
import urllib.request
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
|
||||
from playwright.sync_api import sync_playwright, TimeoutError as PWTimeout
|
||||
from pymongo import MongoClient, ASCENDING
|
||||
|
||||
# --- Konfigurace -------------------------------------------------------
|
||||
|
||||
# J&J federated-login (SSO) entry point whose target resource is the Vault.
LOGIN_URL = (
    "https://fedlogin.jnj.com/idp/eyJ2c2lkIjoiam5qX3ZlZXZhIn0/"
    "startSSO.ping?PartnerSpId=janssenetmf.veevavault.com"
    "&IdpAdapterId=CompIWALDAPEXTFORM"
    "&TargetResource=https%3A%2F%2Fvtmf.veevavault.com%2F"
)

# "Document Inventory Report - Study Level", filtered to one study.
REPORT_URL = (
    "https://vtmf.veevavault.com/ui/#reporting/viewer/"
    "0RP000000000182?study__v%2C%2C%2CIN=0ST000000137008"
)

# URL glob that means we are inside the Vault UI.
VAULT_UI_PATTERN = "**vtmf.veevavault.com/ui**"

SCRIPT_DIR = Path(__file__).resolve().parent
PROFILE_DIR = SCRIPT_DIR / "vault_profile"           # persistent browser session
ENV_FILE = SCRIPT_DIR.parent / ".env"                # root of the Janssen project
DEBUG_DIR = SCRIPT_DIR / "debug"                     # diagnostic output
EXCEL_DIR = SCRIPT_DIR / "WhatToDownload"            # downloaded reports
PROCESSED_DIR = EXCEL_DIR / "Zpracovano"             # archive of processed reports
OLD_STATE_FILE = SCRIPT_DIR / "download_state.csv"   # legacy CSV (migration)
DOWNLOAD_ROOT = Path(r"U:\Dropbox\!!!Days\Downloads Z230\VTMF-77242113UCO3001")

MONGO_URI = "mongodb://192.168.1.76:27017"
MONGO_DB = "VTMF"
MONGO_COLL = "documents"

# How many documents to download in this run (None = all remaining).
# NOTE(review): the value is 0 although the documented options are None or a
# batch size; how 0 is interpreted depends on code outside this section —
# confirm it is intentional and not a testing leftover.
LIMIT = 0

# Report fields whose changes are applied and versioned into history[].
TRACKED_FIELDS = (
    "name", "status", "type", "subtype", "desc",
    "date", "url", "studies",
)

MAX_ATTEMPTS = 2         # attempts per document
RETRY_PAUSE_MS = 5000    # pause before a retry
BETWEEN_DOCS_MS = 500    # pause between documents

SEAWEED_FILER = "http://192.168.1.50:8888"
SEAWEED_PREFIX = "/vtmf-documents"
|
||||
|
||||
|
||||
class PlaceholderDocument(Exception):
    """The document exists only as a placeholder.

    Corresponds to the Vault message "This placeholder has no content".
    """
|
||||
|
||||
|
||||
def log(msg):
    """Print ``msg`` to stdout and flush right away."""
    print(msg, flush=True)
|
||||
|
||||
|
||||
def load_env_file(path):
    """Load ``KEY=VALUE`` lines from a .env file into ``os.environ``.

    Blank lines, lines starting with ``#`` and lines without ``=`` are
    skipped.  Surrounding double/single quotes are stripped from values.
    Variables already present in the environment take precedence, and
    empty values are ignored.  A missing file is only reported via ``log``.
    """
    if not path.exists():
        log(f"[!] .env nenalezen: {path}")
        return

    content = path.read_text(encoding="utf-8")
    for raw_line in content.splitlines():
        entry = raw_line.strip()
        is_assignment = bool(entry) and not entry.startswith("#") and "=" in entry
        if not is_assignment:
            continue
        # split on the first "=" only, so values may contain "=" themselves
        name, _, raw_value = entry.partition("=")
        name = name.strip()
        value = raw_value.strip().strip('"').strip("'")
        if not value or name in os.environ:
            continue
        os.environ[name] = value
|
||||
|
||||
|
||||
# Comment line written above the credential keys when the .env template
# is created or extended.
ENV_SECTION_HEADER = "# --- Veeva Vault (J&J V-TMF) — VTMFDownloadFiles/download_vault ---"

# Environment variables that must be non-empty before logging in.
ENV_KEYS = ("VAULT_USER", "VAULT_PASS")
|
||||
|
||||
|
||||
def ensure_credentials():
    """Make sure the Vault credentials are present in the environment.

    Loads the .env file first.  If every key in ``ENV_KEYS`` is then set,
    returns.  Otherwise creates the .env file from a template (or appends
    the keys that have no ``KEY=`` line in it yet), asks the user to fill
    them in and terminates the script with exit status 1.
    """
    load_env_file(ENV_FILE)
    have_all = all(os.environ.get(k) for k in ENV_KEYS)
    if have_all:
        return

    env_exists = ENV_FILE.exists()
    current = ENV_FILE.read_text(encoding="utf-8") if env_exists else ""
    missing_lines = []
    for key in ENV_KEYS:
        # a key counts as present when the file has any "KEY =" line,
        # even one with an empty value
        if re.search(rf"^\s*{key}\s*=", current, re.M) is None:
            missing_lines.append(f"{key}=")
    section = ENV_SECTION_HEADER + "\n" + "\n".join(missing_lines) + "\n"

    if not env_exists:
        ENV_FILE.write_text(
            "# .env — lokální přihlašovací údaje (NEVERZOVAT, je v .gitignore)\n\n"
            + section,
            encoding="utf-8")
        log(f"[i] Založil jsem nový .env: {ENV_FILE}")
    elif missing_lines:
        with open(ENV_FILE, "a", encoding="utf-8") as f:
            f.write("\n" + section)
        log(f"[i] Doplnil jsem chybějící řádky do .env: {ENV_FILE}")

    banner = "=" * 60
    for line in (
        "\n" + banner,
        " CHYBÍ PŘIHLAŠOVACÍ ÚDAJE.",
        " Doplň VAULT_USER a VAULT_PASS do souboru:",
        f" {ENV_FILE}",
        " a spusť skript znovu.",
        banner,
    ):
        print(line)
    sys.exit(1)
|
||||
|
||||
|
||||
# --- Parsování Excelu --------------------------------------------------
|
||||
|
||||
# Excel HYPERLINK("url", "text") formula: group 1 = URL, group 2 = shown text.
HYPERLINK_RE = re.compile(r'HYPERLINK\("([^"]+)"\s*,\s*"([^"]+)"\)')

# Trailing "(v...)" suffix of a document name: group 1 = the version.
VERSION_RE = re.compile(r"\((v[^)]+)\)\s*$")

# Characters not allowed in Windows names, control characters and the
# Unicode replacement artefact.
BAD_CHARS_RE = re.compile(r"[<>:\"/\\|?*\x00-\x1f�]")
|
||||
|
||||
|
||||
def clean_filename(s):
    """Sanitise ``s`` into a name usable for a Windows file or folder.

    Characters matched by ``BAD_CHARS_RE`` become ``_``; whitespace runs
    collapse to one space and underscore runs to one underscore; spaces,
    dots and underscores are trimmed from both ends.
    """
    text = BAD_CHARS_RE.sub("_", str(s))
    for pattern, replacement in ((r"\s+", " "), (r"_{2,}", "_")):
        text = re.sub(pattern, replacement, text)
    return text.strip(" ._")
|
||||
|
||||
|
||||
def display_text(cell):
    """Return the text a cell displays.

    For a ``HYPERLINK(...)`` formula that is its second argument;
    otherwise the stripped string form of ``cell.value`` (an empty string
    for any falsy value).
    """
    raw = str(cell.value or "").strip()
    match = HYPERLINK_RE.search(raw)
    if match is None:
        return raw
    return match.group(2).strip()
|
||||
|
||||
|
||||
def extract_doc_url(raw):
    """Pull the clean document URL out of a HYPERLINK value or a noisy URL.

    The result has the form
    ``https://<host>/ui/#doc_info/<id>/<major>/<minor>``; anything after
    the third number is dropped.

    Raises:
        ValueError: if ``raw`` contains no such URL.
    """
    found = re.search(r"(https://[^/\"]+/ui/#doc_info/\d+/\d+/\d+)", str(raw))
    if found is not None:
        return found.group(1)
    raise ValueError(f"Nenašel jsem doc URL v: {raw!r}")
|
||||
|
||||
|
||||
def read_documents_from_excel(path):
    """Parse an exported .xlsx report into a list of document dicts.

    Each dict has the keys ``vtmf, version, url, name, status, type,
    subtype, desc, date, studies``.  Document Name/Number/Status are
    ``=HYPERLINK`` formulas, so both the URL and the text are taken with
    a regex.  The report's declared dimensions are broken, therefore the
    rows are read by direct iteration.

    Rows with an empty Document Number are skipped; rows without a
    usable document URL are counted and skipped.

    Raises:
        RuntimeError: if the first worksheet is empty or an expected
            column is missing from the header row.
    """
    from openpyxl import load_workbook

    log(f"[i] Parsování reportu: {path.name}")
    wb = load_workbook(path, data_only=False)  # formulas are needed, not values
    ws = wb[wb.sheetnames[0]]

    rows = ws.iter_rows()
    # An empty worksheet yields no rows at all; report that explicitly
    # instead of leaking a bare StopIteration to the caller.
    header_row = next(rows, None)
    if header_row is None:
        raise RuntimeError("Report je prázdný — chybí řádek se záhlavím.")
    header = [c.value for c in header_row]
    try:
        i_num = header.index("Document Number")
        i_name = header.index("Document Name")
        i_status = header.index("Document Status")
        i_type = header.index("Type")
        i_sub = header.index("Subtype")
        i_desc = header.index("Description")
        i_date = header.index("Document Date")
        i_study = header.index("Study")
    except ValueError as e:
        raise RuntimeError(f"V reportu chybí očekávaný sloupec: {e}") from e

    docs, bad = [], []
    for row in rows:
        cell = row[i_num]
        if cell.value is None:
            continue
        raw = str(cell.value)
        m = HYPERLINK_RE.search(raw)
        if m:
            url_raw, vtmf = m.group(1), m.group(2)
        elif cell.hyperlink:  # a real hyperlink instead of a formula
            url_raw, vtmf = cell.hyperlink.target, raw
        else:
            bad.append(raw)
            continue
        try:
            url = extract_doc_url(url_raw)
        except ValueError:
            bad.append(raw)
            continue

        name = display_text(row[i_name])
        vm = VERSION_RE.search(name)
        version = vm.group(1) if vm else "v?"  # "v?" = version not found in the name

        desc = clean_filename(display_text(row[i_desc]))
        if not desc:
            # fallback: Document Name without the trailing version
            # (the version is appended separately at the end of the file name)
            desc = clean_filename(VERSION_RE.sub("", name))

        date = row[i_date].value  # datetime or None
        docs.append({
            "vtmf": vtmf.strip(),
            "version": version,
            "url": url,
            "name": name,
            "status": display_text(row[i_status]),
            "type": clean_filename(display_text(row[i_type])),
            "subtype": clean_filename(display_text(row[i_sub])),
            "desc": desc,
            # keep only values that can be formatted as a date later
            "date": date if hasattr(date, "strftime") else None,
            "studies": display_text(row[i_study]),
        })

    log(f"[i] Načteno {len(docs)} dokumentů"
        + (f", {len(bad)} řádků bez použitelné URL (přeskočeno)" if bad else ""))
    return docs
|
||||
|
||||
|
||||
def build_target_path(doc, suggested_filename):
    """Build the target path for a downloaded source file.

    The result is ``DOWNLOAD_ROOT/<type>/<subtype>/
    'YYYY-MM-DD Description [VTMF-xxx] [v1.0].<real extension>'``.
    The date and the version are left out when they are not available.

    Args:
        doc: document dict with ``type``, ``subtype``, ``desc``, ``vtmf``
            and optionally ``date`` (object with ``strftime``) and
            ``version``.
        suggested_filename: file name offered by the download; only its
            extension is used.
    """
    ext = Path(suggested_filename).suffix  # real extension, including the dot
    date = doc.get("date")
    date_prefix = date.strftime("%Y-%m-%d") + " " if date else ""
    version = doc.get("version")
    # "v?" is the parser's placeholder for an unknown version.  It must be
    # treated as "not available": "?" is not allowed in Windows file names,
    # so keeping it would make the target path unusable.
    version_part = f" [{version}]" if version and version != "v?" else ""
    filename = f"{date_prefix}{doc['desc']} [{doc['vtmf']}]{version_part}{ext}"
    return DOWNLOAD_ROOT / doc["type"] / doc["subtype"] / filename
|
||||
|
||||
|
||||
def deleted_marker_path(path):
    """Return ``path`` with the deletion marker: 'x.pdf' -> 'x [D].pdf'."""
    original = Path(path)
    marked_name = original.stem + " [D]" + original.suffix
    return original.with_name(marked_name)
|
||||
|
||||
|
||||
# --- MongoDB synchronizace ---------------------------------------------
|
||||
|
||||
def doc_key(vtmf, version):
    """Build the document key '<vtmf>|<version>'."""
    return "{}|{}".format(vtmf, version)
|
||||
|
||||
|
||||
def get_collection():
    """Connect to MongoDB and return the documents collection.

    Sends a ``ping`` command before touching the collection (server
    selection timeout 5000 ms) and requests two indexes: a unique one on
    (vtmf, version) and a plain one on (deleted, downloaded).
    """
    client = MongoClient(MONGO_URI, serverSelectionTimeoutMS=5000)
    client.admin.command("ping")
    collection = client[MONGO_DB][MONGO_COLL]
    index_specs = (
        ([("vtmf", ASCENDING), ("version", ASCENDING)], {"unique": True}),
        ([("deleted", ASCENDING), ("downloaded", ASCENDING)], {}),
    )
    for keys, options in index_specs:
        collection.create_index(keys, **options)
    return collection
|
||||
|
||||
|
||||
def migrate_old_csv(coll):
    """One-off migration of download_state.csv into Mongo.

    Every CSV row whose ``result`` is 'ok' marks one matching record
    (same ``vtmf``, not deleted, not yet downloaded) as
    ``downloaded=True`` with the row's file and timestamp.  Afterwards
    the CSV is renamed to ``.csv.imported``.  Does nothing when the CSV
    does not exist.
    """
    if not OLD_STATE_FILE.exists():
        return

    migrated = 0
    with open(OLD_STATE_FILE, newline="", encoding="utf-8") as handle:
        for entry in csv.DictReader(handle):
            if entry["result"] == "ok":
                selector = {
                    "vtmf": entry["vtmf"],
                    "deleted": False,
                    "downloaded": {"$ne": True},
                }
                change = {"$set": {
                    "downloaded": True,
                    "file": entry["file"],
                    "downloaded_at": entry["timestamp"],
                }}
                outcome = coll.update_one(selector, change)
                migrated += outcome.modified_count

    imported_name = OLD_STATE_FILE.with_suffix(".csv.imported")
    OLD_STATE_FILE.rename(imported_name)
    log(f"[i] Migrace download_state.csv -> Mongo: {migrated} záznamů; "
        f"CSV přejmenováno na .imported")
|
||||
|
||||
|
||||
def _unmarked_path(path):
    """Return ``path`` with a trailing ' [D]' removed from its stem, if present."""
    p = Path(path)
    marker = " [D]"
    if len(p.stem) > len(marker) and p.stem.endswith(marker):
        return p.with_name(p.stem[:-len(marker)] + p.suffix)
    return p


def _restore_deleted_file(key, stored):
    """Rename a ' [D]'-marked file back and return the path to record in Mongo."""
    stored_path = Path(stored)
    clean = _unmarked_path(stored_path)
    # The deletion pass records the *marked* name in Mongo.  If the record
    # still holds an unmarked name (the marking rename had failed), fall
    # back to deriving the marked name from it.
    marked = stored_path if clean != stored_path else deleted_marker_path(stored_path)
    if marked.exists() and not clean.exists():
        try:
            marked.rename(clean)
        except OSError as e:
            log(f"[!] {key}: odebrání [D] selhalo: {e}")
            return str(marked)  # the file is still under its marked name
        log(f"[i] {key}: soubor vrácen z ' [D]' zpět.")
        return str(clean)
    if clean.exists():
        return str(clean)
    return str(stored)  # nothing on disk to go by; keep the recorded value


def sync_report_to_mongo(coll, docs):
    """Apply the current report to the documents collection.

    The key is ``doc_key(vtmf, version)``.  New documents are inserted;
    changes of ``TRACKED_FIELDS`` are applied and appended to
    ``history[]``; records missing from the report are flagged
    ``deleted`` and their downloaded file is renamed with ' [D]';
    records that reappear are un-deleted and the ' [D]' is removed from
    the file name again.

    Args:
        coll: the Mongo collection to update.
        docs: document dicts as produced by the report parser.

    Returns:
        dict with the counters ``new, updated, unchanged, resurrected,
        marked_deleted``.  A resurrected record is also counted as
        updated or unchanged.
    """
    now = datetime.now()
    stats = {"new": 0, "updated": 0, "unchanged": 0,
             "resurrected": 0, "marked_deleted": 0}
    current_keys = set()

    for d in docs:
        key = doc_key(d["vtmf"], d["version"])
        current_keys.add(key)
        existing = coll.find_one({"_id": key})
        if existing is None:
            coll.insert_one({
                "_id": key, **d,
                "first_seen": now, "last_seen": now,
                "deleted": False, "downloaded": False,
                "file": None, "history": [],
            })
            stats["new"] += 1
            continue

        changes = {
            fld: {"old": existing.get(fld), "new": d.get(fld)}
            for fld in TRACKED_FIELDS
            if existing.get(fld) != d.get(fld)
        }
        update = {"$set": {**d, "last_seen": now, "deleted": False}}
        if changes:
            update["$push"] = {"history": {"ts": now, "changes": changes}}
            stats["updated"] += 1
        else:
            stats["unchanged"] += 1

        if existing.get("deleted"):
            # the document is back in the report -> take ' [D]' off the file
            stats["resurrected"] += 1
            stored = existing.get("file")
            if stored:
                update["$set"]["file"] = _restore_deleted_file(key, stored)
        coll.update_one({"_id": key}, update)

    # records that are not in the current report -> deleted + ' [D]'
    for rec in coll.find({"deleted": False}):
        if rec["_id"] in current_keys:
            continue
        upd = {"deleted": True, "deleted_at": now}
        f = rec.get("file")
        if f and Path(f).exists():
            marked = deleted_marker_path(f)
            try:
                Path(f).rename(marked)
                upd["file"] = str(marked)
                log(f"[i] {rec['_id']}: soubor označen ' [D]'.")
            except OSError as e:
                log(f"[!] {rec['_id']}: přejmenování na [D] selhalo: {e}")
        coll.update_one({"_id": rec["_id"]},
                        {"$set": upd,
                         "$push": {"history": {"ts": now,
                                               "changes": {"deleted": {
                                                   "old": False,
                                                   "new": True}}}}})
        stats["marked_deleted"] += 1

    log(f"[ok] Mongo sync: {stats['new']} nových, {stats['updated']} změněných, "
        f"{stats['unchanged']} beze změny, {stats['resurrected']} obnovených, "
        f"{stats['marked_deleted']} označených deleted.")
    return stats
|
||||
|
||||
|
||||
# --- Přihlášení --------------------------------------------------------
|
||||
|
||||
def submit_login_form(page, password_box):
    """Submit the login form.

    Tries, in order, a "Sign On" button, a "Log in"/"Sign in" button,
    a submit input, a submit button and an "OK" button, and clicks the
    first one that is present and visible.  When none is found, presses
    Enter in the password field instead.
    """
    sign_on = re.compile("sign\\s*on", re.I)
    log_in = re.compile("log\\s*in|sign\\s*in", re.I)
    ok_only = re.compile("^ok$", re.I)
    candidates = [
        page.get_by_role("button", name=sign_on),
        page.get_by_role("button", name=log_in),
        page.locator("input[type='submit']"),
        page.locator("button[type='submit']"),
        page.get_by_role("button", name=ok_only),
    ]
    for candidate in candidates:
        try:
            if not (candidate.count() and candidate.first.is_visible()):
                continue
            button = candidate.first
            label = (button.inner_text()
                     or button.get_attribute("value")
                     or "submit").strip()
            log(f"[i] Odesílám formulář tlačítkem '{label}'...")
            button.click()
            return
        except Exception:
            # best effort: a candidate that fails is skipped in favour of the next
            continue
    log("[i] Tlačítko nenalezeno, odesílám Enterem v poli hesla...")
    password_box.press("Enter")
|
||||
|
||||
|
||||
def login_if_needed(page):
    """Log in to the Vault unless the persistent session is still alive.

    Opens the login URL, fills in the user name and password from the
    ``VAULT_USER`` / ``VAULT_PASS`` environment variables and submits the
    form.  If the Vault UI is not reached within 15 s, a visible
    invalid/incorrect/failed message is treated as a failed login;
    otherwise the user is asked to confirm 2FA and press ENTER, and the
    Vault UI is awaited for up to 120 s.

    Raises:
        RuntimeError: if neither a login form nor the Vault is found, or
            the page shows a login error.
    """
    inside_marker = "vtmf.veevavault.com/ui"

    log("[i] Otevírám přihlašovací URL...")
    page.goto(LOGIN_URL, wait_until="domcontentloaded")

    if inside_marker in page.url:
        log("[i] Už přihlášen (perzistentní session).")
        return

    user_box = page.locator("input[type='text']").first
    try:
        user_box.wait_for(timeout=8000)
    except PWTimeout:
        # no form appeared - maybe the session redirected us in the meantime
        if inside_marker not in page.url:
            raise RuntimeError(
                f"Nenašel jsem login formulář ani Vault. Aktuální URL: {page.url}")
        log("[i] Přihlášen bez formuláře (session redirect).")
        return

    username = os.environ["VAULT_USER"]
    password = os.environ["VAULT_PASS"]

    log("[i] Vyplňuji přihlašovací údaje...")
    user_box.fill(username)
    password_box = page.locator("input[type='password']").first
    password_box.fill(password)
    submit_login_form(page, password_box)

    log("[i] Odeslán login, čekám na výsledek...")
    try:
        page.wait_for_url(VAULT_UI_PATTERN, timeout=15000)
    except PWTimeout:
        pass  # not in the Vault yet -> most likely a 2FA prompt
    else:
        log("[ok] Přihlášen rovnou (bez 2FA).")
        return

    err = page.locator("text=/invalid|incorrect|failed/i")
    try:
        if err.count() and err.first.is_visible():
            raise RuntimeError(f"Login selhal: {err.first.inner_text().strip()}")
    except PWTimeout:
        pass

    banner = "=" * 60
    print("\n" + banner)
    print(" VYŽADOVÁNO OVĚŘENÍ NA TELEFONU (2FA).")
    print(" Potvrď přihlášení v mobilní aplikaci.")
    print(banner)
    input(" Až to potvrdíš, stiskni ENTER pro pokračování... ")

    page.wait_for_url(VAULT_UI_PATTERN, timeout=120000)
    log("[ok] Přihlášení dokončeno.")
|
||||
|
||||
|
||||
def verify_inside(page):
    """Wait (up to 30 s) until the page URL matches the Vault UI pattern."""
    page.wait_for_url(VAULT_UI_PATTERN, timeout=30000)
    current_url = page.url
    log(f"[ok] Uvnitř Vaultu: {current_url}")
|
||||
|
||||
|
||||
def dialog_visible(page):
    """Return True when a jQuery UI dialog is visible on the page.

    Any error while querying the page counts as "not visible".
    """
    try:
        dialogs = page.locator(".ui-dialog")
        if not dialogs.count():
            return False
        return bool(dialogs.first.is_visible())
    except Exception:
        return False
|
||||
|
||||
|
||||
def save_page_debug(page, tag):
    """Save page diagnostics and return the folder they were written to.

    Writes a screenshot (or the screenshot error), the HTML of every
    frame and ``frames_report.txt`` with counts of button-like elements
    plus up to 80 distinct ``title`` / ``aria-label`` values per frame.

    Args:
        page: the browser page to inspect.
        tag: short label appended to the timestamped folder name.
    """
    # Format the timestamp first and append the tag afterwards, so a "%"
    # inside ``tag`` is never interpreted as a strftime directive.
    stamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    out = DEBUG_DIR / f"{stamp}_{tag}"
    out.mkdir(parents=True, exist_ok=True)
    try:
        page.screenshot(path=str(out / "screenshot.png"), full_page=False)
    except Exception as e:
        (out / "screenshot_error.txt").write_text(str(e), encoding="utf-8")

    counted_selectors = (
        ".ui-dialog", "a.ok.vv_button",
        ".ui-dialog-titlebar-close",
        "button", "input[type='button']",
        "[title]", "[aria-label]",
    )
    report = []
    for i, frame in enumerate(page.frames):
        report.append(f"=== frame[{i}] url={frame.url}")
        try:
            (out / f"frame_{i}.html").write_text(frame.content(),
                                                 encoding="utf-8")
            for sel in counted_selectors:
                n = frame.locator(sel).count()
                if n:
                    report.append(f" {sel}: {n}x")
            # list title/aria-label values - helps to locate the "..." menu
            for attr in ("title", "aria-label"):
                vals = frame.locator(f"[{attr}]").evaluate_all(
                    f"els => els.map(e => e.getAttribute('{attr}'))")
                uniq = sorted({v for v in vals if v})[:80]
                report.append(f" {attr}: {uniq}")
        except Exception as e:
            # diagnostics are best effort; record the failure and go on
            report.append(f" [chyba čtení framu: {e}]")
    (out / "frames_report.txt").write_text("\n".join(report),
                                           encoding="utf-8")
    log(f"[!] Diagnostika stránky uložena do: {out}")
    return out
|
||||
|
||||
|
||||
# The visible OK button of the dialog - it is an <a>, not a <button>!
# The close cross .ui-dialog-titlebar-close is display:none -> DO NOT USE it.
DIALOG_OK_SELECTOR = (
    ".ui-dialog a.ok.vv_button, "
    ".vv_login_msg_dialog .vv_button.ok"
)
|
||||
|
||||
|
||||
def dismiss_maintenance_popup(page, timeout=8000):
    """Close the Veeva login/maintenance dialog via its visible OK link.

    The dialog shows up with a delay, so the OK element is awaited for
    ``timeout`` ms first.  Up to five dialogs are clicked away.  If a
    dialog is still visible afterwards, Escape is tried; if that fails
    too, diagnostics are saved and the user is asked to close it by hand.
    Safe to call at any time.

    Returns:
        True if at least one dialog was closed by clicking OK, else False.
    """
    ok = page.locator(DIALOG_OK_SELECTOR)
    try:
        ok.first.wait_for(state="visible", timeout=timeout)
    except Exception:
        # includes the timeout: no dialog appeared - carry on
        return False

    closed = 0
    for _ in range(5):  # dialogs can be queued up
        try:
            if not (ok.count() and ok.first.is_visible()):
                break
            ok.first.click()
            page.wait_for_timeout(300)
            closed += 1
            log("[i] Maintenance/login dialog zavřen (OK).")
        except Exception:
            break

    if dialog_visible(page):
        page.keyboard.press("Escape")
        page.wait_for_timeout(500)
        log("[i] Zkusil jsem dialog zavřít klávesou Escape.")

        if dialog_visible(page):
            save_page_debug(page, "dialog")
            banner = "=" * 60
            print("\n" + banner)
            print(" DIALOG SE NEPODAŘILO ZAVŘÍT AUTOMATICKY.")
            print(" Zavři ho prosím ručně v prohlížeči.")
            print(banner)
            input(" Po ručním zavření stiskni ENTER... ")
    return bool(closed)
|
||||
|
||||
|
||||
# --- Export reportu ----------------------------------------------------
|
||||
|
||||
def _first_visible(page, builders):
    """Return ``(locator, description)`` of the first visible candidate.

    ``builders`` is a sequence of ``(build, description)`` pairs where
    ``build(frame)`` returns a locator.  Every frame in ``page.frames``
    is searched; a builder that raises is skipped.  Returns
    ``(None, None)`` when nothing visible is found.
    """
    for frame in page.frames:
        for make_locator, label in builders:
            try:
                candidate = make_locator(frame)
                if not candidate.count():
                    continue
                if candidate.first.is_visible():
                    return candidate.first, label
            except Exception:
                continue
    return None, None
|
||||
|
||||
|
||||
def download_report(page):
    """Export the report to Excel (Data Only) and return the saved path.

    The file is stored in ``EXCEL_DIR`` under a timestamp-prefixed name.
    On failure the page diagnostics are saved to debug/ and an exception
    is raised.

    Raises:
        RuntimeError: if the report does not load, or the actions menu,
            the "Export to Excel" item or the Export button is not found.
    """
    def by_css(selector):
        # builder for _first_visible: locate ``selector`` inside a frame
        return lambda f: f.locator(selector)

    log("[i] Otevírám report Document Inventory Report - Study Level...")
    page.goto(REPORT_URL, wait_until="domcontentloaded")
    dismiss_maintenance_popup(page, timeout=4000)

    # the report is ready once the record count / status labels show up
    for ready_marker in ("text=Returned", "text=Document Status:"):
        try:
            page.wait_for_selector(ready_marker, timeout=30000)
            break
        except PWTimeout:
            continue
    else:
        save_page_debug(page, "report_load")
        raise RuntimeError(
            "Report se nenačetl (nenašel jsem 'Returned' ani "
            "'Document Status:'). Diagnostika v debug/.")
    log("[i] Report načten, otevírám menu akcí (⋯)...")

    # Actions menu: a button without title/aria-label inside
    # .actionMenuContainer (verified on the live DOM, no iframe).
    actions, desc = _first_visible(page, [
        (by_css(".actionMenuContainer .dropDown.vv_dropdown_toggle "
                "button.vv-icon-button"),
         ".actionMenuContainer button (ověřený)"),
        (by_css(".actionMenuContainer button"),
         ".actionMenuContainer button (volnější)"),
        (by_css("button[title='Actions'], [aria-label='Actions']"),
         "title/aria-label Actions"),
    ])
    if actions is None:
        save_page_debug(page, "report_menu")
        raise RuntimeError("Nenašel jsem menu akcí (⋯) na reportu. "
                           "Diagnostika v debug/.")
    log(f"[i] Menu nalezeno přes: {desc}")
    actions.click()

    # The menu loads ASYNCHRONOUSLY (data-loaded=false -> AJAX):
    # wait for the item instead of reading right after the click.
    item = page.locator("a.ReportAction[data-action-name='ExcelExport']")
    try:
        item.first.wait_for(state="visible", timeout=15000)
    except PWTimeout:
        # fallback by text (in case the data attribute changes)
        item = page.get_by_text("Export to Excel", exact=True)
        try:
            item.first.wait_for(state="visible", timeout=5000)
        except PWTimeout:
            save_page_debug(page, "report_export_item")
            raise RuntimeError("Menu se otevřelo, ale položku 'Export to "
                               "Excel' jsem nenašel. Diagnostika v debug/.")
    log("[i] Klikám 'Export to Excel'...")
    item.first.click()
    log("[i] Dialog Excel Export Options...")

    # 'Data Only' = radio value=STANDARD, checked by default; safety net.
    data_only = page.locator(
        "input[name='requiredRadioField'][value='STANDARD']").first
    try:
        data_only.wait_for(state="visible", timeout=10000)
        if not data_only.is_checked():
            data_only.check()
            log("[i] Přepnuto na 'Data Only'.")
    except PWTimeout:
        log("[!] Radio 'Data Only' nenalezeno — spoléhám na default dialogu.")

    # Export = <button> with the text Export (React dialog, emotion
    # classes - do NOT select by class hash, only by role + text).
    export_btn = page.get_by_role("button", name="Export", exact=True).first
    try:
        export_btn.wait_for(state="visible", timeout=10000)
    except PWTimeout:
        save_page_debug(page, "report_export_btn")
        raise RuntimeError("Dialog exportu bez tlačítka Export. "
                           "Diagnostika v debug/.")

    # Click Export EXACTLY once (multiple clicks = duplicates); ignore
    # 503/redirects in the network log - expect_download decides.
    with page.expect_download(timeout=120000) as dl_info:
        export_btn.click()
    download = dl_info.value

    EXCEL_DIR.mkdir(parents=True, exist_ok=True)
    ts = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    dest = EXCEL_DIR / f"{ts} {download.suggested_filename}"
    download.save_as(str(dest))
    log(f"[ok] Report uložen: {dest}")
    return dest
|
||||
|
||||
|
||||
def archive_report(path):
    """Move a successfully processed report into the archive folder.

    The file keeps its name and is placed directly under ``PROCESSED_DIR``,
    which is created on demand.
    """
    archive_dir = PROCESSED_DIR
    archive_dir.mkdir(parents=True, exist_ok=True)
    archived = archive_dir / path.name
    path.rename(archived)
    log(f"[i] Report archivován: {archived}")
|
||||
|
||||
|
||||
# --- SeaweedFS ---------------------------------------------------------
|
||||
|
||||
def _sw_path(sha256):
    """Build the content-addressed Filer path for a SHA-256 hex digest.

    The first two character pairs of the digest become two directory levels
    below ``SEAWEED_PREFIX``.
    """
    shard_a, shard_b = sha256[:2], sha256[2:4]
    return f"{SEAWEED_PREFIX}/{shard_a}/{shard_b}/{sha256}"
|
||||
|
||||
|
||||
def seaweed_store(data, mime="application/octet-stream"):
    """Idempotently upload ``data`` to the SeaweedFS Filer.

    The target path is derived from the SHA-256 of the content, so identical
    bytes always map to the same object. A HEAD request checks whether the
    object already exists; only a 404 answer triggers the PUT upload.

    Args:
        data: Raw bytes of the file.
        mime: Value sent as the ``Content-Type`` header of the upload.

    Returns:
        Tuple ``(path, url, uploaded)``; ``uploaded`` is False on a dedup hit.

    Raises:
        urllib.error.HTTPError: HEAD answered with an error status other
            than 404, or the PUT was rejected.
    """
    sha256 = hashlib.sha256(data).hexdigest()
    path = _sw_path(sha256)
    url = SEAWEED_FILER + path

    head_req = urllib.request.Request(url, method="HEAD")
    try:
        # Use the response as a context manager so it is always closed.
        with urllib.request.urlopen(head_req, timeout=10):
            pass
    except urllib.error.HTTPError as e:
        if e.code != 404:
            raise
    else:
        return path, url, False  # object already exists

    put_req = urllib.request.Request(
        url, data=data, method="PUT",
        headers={"Content-Type": mime})
    with urllib.request.urlopen(put_req, timeout=120):
        pass
    return path, url, True
|
||||
|
||||
|
||||
# --- Stažení dokumentů -------------------------------------------------
|
||||
|
||||
def find_source_file_button(page):
    """Locate the "Source File" control on a document page.

    Tries, in order, an element with ``title='Source File'``, an element with
    ``aria-label='Source File'``, and finally a button whose accessible name
    matches "Source File" case-insensitively.

    Returns:
        ``.first`` of the first locator that has any match, or None.
    """
    attribute_selectors = ("[title='Source File']", "[aria-label='Source File']")
    for selector in attribute_selectors:
        matches = page.locator(selector)
        if matches.count():
            return matches.first

    by_role = page.get_by_role("button", name=re.compile("Source File", re.I))
    if not by_role.count():
        return None
    return by_role.first
|
||||
|
||||
|
||||
def download_source_file(page, doc):
    """Open a document page and download its source file.

    Args:
        page: Playwright page used for navigation and the download.
        doc: Document record; reads ``vtmf``, ``url`` and, if present,
            ``version``.

    Returns:
        The destination path computed by ``build_target_path`` to which the
        download was saved.

    Raises:
        PlaceholderDocument: The page shows a visible placeholder text
            element instead of content.
        RuntimeError: No "Source File" control was found on the page.
    """
    vtmf = doc["vtmf"]
    log(f"[i] Otevírám dokument {vtmf} ({doc.get('version', '')}) ...")
    page.goto(doc["url"], wait_until="domcontentloaded")
    try:
        page.wait_for_load_state("networkidle", timeout=30000)
    except PWTimeout:
        log("[!] networkidle nenastal do 30 s, zkouším pokračovat...")
    dismiss_maintenance_popup(page, timeout=2000)

    ph = page.locator("div.vv_placeholder_text")
    if ph.count() and ph.first.is_visible():
        log(f"[i] {vtmf}: placeholder bez obsahu — přeskakuji.")
        raise PlaceholderDocument(vtmf)

    target = find_source_file_button(page)
    if target is None:
        raise RuntimeError(
            f"Nenašel jsem ikonu 'Source File' na stránce dokumentu {vtmf}.")

    log("[i] Klikám na Source File a čekám na download...")
    with page.expect_download(timeout=60000) as dl_info:
        target.click()
        # Variant with a dropdown (Source File + Viewable Rendition).
        try:
            item = page.get_by_role("menuitem",
                                    name=re.compile("Source File", re.I))
            if item.count() and item.first.is_visible():
                log("[i] Otevřel se dropdown, vybírám 'Source File'...")
                item.first.click()
        except Exception as e:
            # Best effort: the direct-download variant has no menu. Log the
            # reason instead of hiding it, so a later download timeout can
            # be diagnosed.
            log(f"[!] Kontrola dropdownu 'Source File' selhala (pokračuji): {e}")
    download = dl_info.value

    dest = build_target_path(doc, download.suggested_filename)
    dest.parent.mkdir(parents=True, exist_ok=True)
    download.save_as(str(dest))
    return dest
|
||||
|
||||
|
||||
def download_missing(page, coll):
    """Download every non-deleted document not yet marked ``downloaded``.

    Documents are processed in (vtmf, version) order, optionally capped by
    ``LIMIT``. Each document gets up to ``MAX_ATTEMPTS`` tries and its
    outcome is written to Mongo immediately. The SeaweedFS upload is best
    effort: a failure is counted and logged, but the document is still
    recorded as downloaded.

    Args:
        page: Playwright page passed on to ``download_source_file``.
        coll: Mongo collection holding the document records.

    Returns:
        Tuple ``(ok_count, fail_count, placeholder_count, sw_uploaded,
        sw_dedup, sw_failed)``.
    """
    todo = list(coll.find({"deleted": False, "downloaded": {"$ne": True}})
                .sort([("vtmf", ASCENDING), ("version", ASCENDING)]))
    if LIMIT:
        todo = todo[:LIMIT]
    log(f"\n[i] Ke stažení: {len(todo)} dokumentů"
        + (f" (LIMIT={LIMIT})" if LIMIT else ""))

    ok_count, fail_count, placeholder_count = 0, 0, 0
    sw_uploaded = sw_dedup = sw_failed = 0
    for n, doc in enumerate(todo, 1):
        key = doc["_id"]
        log(f"\n--- [{n}/{len(todo)}] {key} | {doc['desc'][:70]}")
        last_err = None
        for attempt in range(1, MAX_ATTEMPTS + 1):
            try:
                dest = download_source_file(page, doc)

                # SeaweedFS upload (a failure here does not block the run).
                sw_path = sw_url = sw_ts = sha256_hex = None
                try:
                    data = dest.read_bytes()
                    # Hash BEFORE the upload so the checksum is recorded
                    # even when the upload itself fails.
                    sha256_hex = hashlib.sha256(data).hexdigest()
                    size_kb = len(data) / 1024
                    size_str = (f"{size_kb:.0f} KB" if size_kb < 1024
                                else f"{size_kb / 1024:.1f} MB")
                    ext = dest.suffix.lstrip('.').upper()
                    log(f"[ok] Stazeno: {dest.name} ({size_str} {ext})")
                    mime = (mimetypes.guess_type(dest.name)[0]
                            or "application/octet-stream")
                    sw_path, sw_url, uploaded = seaweed_store(data, mime)
                    sw_ts = datetime.now()
                    if uploaded:
                        sw_uploaded += 1
                        log(f"[ok] SeaweedFS: nahrano ({size_str}) -> {sw_path}")
                    else:
                        sw_dedup += 1
                        log(f"[i] SeaweedFS: dedup hit ({size_str}) -> {sw_path}")
                except Exception as sw_err:
                    sw_failed += 1
                    log(f"[!] SeaweedFS upload selhal (soubor je na disku): {sw_err}")

                coll.update_one({"_id": key}, {"$set": {
                    "downloaded": True, "file": str(dest),
                    "downloaded_at": datetime.now(),
                    "sha256": sha256_hex,
                    "seaweed_path": sw_path,
                    "seaweed_url": sw_url,
                    "seaweed_synced_at": sw_ts,
                    "last_error": None}})
                ok_count += 1
                last_err = None
                break
            except PlaceholderDocument:
                # A placeholder has nothing to download; mark it as handled
                # so later runs do not retry it.
                coll.update_one({"_id": key}, {"$set": {
                    "downloaded": True, "placeholder": True,
                    "file": None, "downloaded_at": datetime.now(),
                    "last_error": None}})
                placeholder_count += 1
                last_err = None
                break
            except Exception as e:
                last_err = e
                log(f"[!] Pokus {attempt}/{MAX_ATTEMPTS} selhal: {e}")
                if attempt < MAX_ATTEMPTS:
                    page.wait_for_timeout(RETRY_PAUSE_MS)
        if last_err is not None:
            # All attempts failed: remember the last error on the record.
            coll.update_one({"_id": key}, {"$set": {
                "last_error": str(last_err),
                "error_at": datetime.now()}})
            fail_count += 1
        page.wait_for_timeout(BETWEEN_DOCS_MS)
    return ok_count, fail_count, placeholder_count, sw_uploaded, sw_dedup, sw_failed
|
||||
|
||||
|
||||
# --- Main --------------------------------------------------------------
|
||||
|
||||
def main():
    """Run the whole pipeline: login, report export, Mongo sync, downloads.

    A run summary is always logged and the browser context is always closed,
    even when producing the summary itself fails. The process then exits
    with status 2 after a pipeline failure, 1 when at least one document
    failed to download, and 0 otherwise.
    """
    ensure_credentials()
    coll = get_collection()
    log(f"[ok] Mongo připojeno: {MONGO_URI} / {MONGO_DB}.{MONGO_COLL}")

    with sync_playwright() as p:
        ctx = p.chromium.launch_persistent_context(
            user_data_dir=str(PROFILE_DIR),
            headless=False,
            accept_downloads=True,
            no_viewport=True,  # let the window behave natively
            args=["--start-maximized"],
        )
        page = ctx.pages[0] if ctx.pages else ctx.new_page()
        ok_count = fail_count = placeholder_count = 0
        sw_uploaded = sw_dedup = sw_failed = 0
        pipeline_error = None
        try:
            # 1) login
            login_if_needed(page)
            verify_inside(page)
            dismiss_maintenance_popup(page)

            # 2) report export
            report_path = download_report(page)

            # 3) parse + sync to Mongo
            docs = read_documents_from_excel(report_path)
            if not docs:
                raise RuntimeError("Report neobsahuje žádné dokumenty — "
                                   "sync přeskočen, nic se nemaže.")
            sync_report_to_mongo(coll, docs)
            migrate_old_csv(coll)
            archive_report(report_path)

            # 4) download the missing documents
            DOWNLOAD_ROOT.mkdir(parents=True, exist_ok=True)
            (ok_count, fail_count, placeholder_count,
             sw_uploaded, sw_dedup, sw_failed) = download_missing(page, coll)
        except KeyboardInterrupt:
            log("\n[!] Přerušeno uživatelem — stav je v Mongo, příští běh naváže.")
        except Exception as e:
            pipeline_error = e
            print("\n" + "=" * 60)
            print(" PIPELINE SELHALA!")
            print(f" {type(e).__name__}: {e}")
            print("=" * 60)
        finally:
            try:
                total = coll.count_documents({})
                have = coll.count_documents({"deleted": False, "downloaded": True})
                active = coll.count_documents({"deleted": False})
                sw_info = (f"SeaweedFS: {sw_uploaded} nových, {sw_dedup} dedup"
                           + (f", {sw_failed} chyb uploadu" if sw_failed else ""))
                log(f"\n[i] Výsledek běhu: {ok_count} staženo, "
                    f"{placeholder_count} placeholderů přeskočeno, {fail_count} chyb"
                    + (f", PIPELINE SELHALA ({pipeline_error})" if pipeline_error else ".")
                    + (f"\n[i] {sw_info}" if ok_count else ""))
                log(f"[i] Mongo: {total} záznamů celkem, {active} aktivních, "
                    f"z toho staženo {have} ({active - have} zbývá).")
            finally:
                # Close the browser even if the Mongo summary queries fail.
                log("[i] Zavírám prohlížeč.")
                ctx.close()
    sys.exit(2 if pipeline_error else (1 if fail_count else 0))


if __name__ == "__main__":
    main()
|
||||
@@ -0,0 +1,134 @@
|
||||
# vtmf_pipeline_v1.6 — V-TMF workflow přes 3 úrovně (STUDY / COUNTRY / SITE)
|
||||
|
||||
**Verze:** 1.6 · **Datum:** 2026-06-15
|
||||
|
||||
## Co je nové proti v1.5
|
||||
|
||||
v1.5 stahovala jen **study-level** dokumenty jedné studie do ploché
|
||||
`<Type>\<Subtype>` struktury. v1.6 řeší celou hierarchii VTMF
|
||||
**STUDY → COUNTRY → SITE** a sdílený (M:N) charakter dokumentů.
|
||||
|
||||
**Klíčové poznatky z reportů:**
|
||||
- Dokument je do studií/zemí/center jen **referencovaný** (M:N) — např.
|
||||
Master Confidentiality Agreement v nemocnici je jeden dokument
|
||||
referencovaný do všech studií i center té nemocnice. Reference ≠ kopie.
|
||||
- Sloupce `Study`, `Study Country`, `Site` jsou **comma-separated seznamy**.
|
||||
- Tři reporty = tři **úrovně** dokumentu. Aby byl TMF kompletní, musí se
|
||||
stáhnout všechny tři.
|
||||
- Country i site report filtrují **jen na zemi** (CZ), ne na studii →
|
||||
empiricky vrací 100 % dokumentů navázaných na UCO3001, ořez na studii je
|
||||
pojistka (no-op).
|
||||
- Study report má 15 sloupců (+ `Document Date`), country/site 17
|
||||
(+ `Created By`, `Study Country`, `Site`; bez `Document Date`).
|
||||
|
||||
## Konfigurace REPORTS
|
||||
|
||||
```python
|
||||
TARGET_STUDY = "77242113UCO3001"
|
||||
REPORTS = [
|
||||
{"level":"study", "study":TARGET_STUDY, "country":None,
|
||||
"url":".../0RP000000000182?study__v...IN=0ST000000137008"},
|
||||
{"level":"country", "study":TARGET_STUDY, "country":"Czech Republic",
|
||||
"url":".../0RP000000000319?study_country__v...IN=0SC00000017T056"},
|
||||
{"level":"site", "study":TARGET_STUDY, "country":"Czech Republic",
|
||||
"url":".../0RP000000000762?study_country__v...EQ=0SC00000017T056"},
|
||||
]
|
||||
```
|
||||
Jiná studie / země = jen úprava ID v URL + TARGET_STUDY.
|
||||
|
||||
## Tok jednoho běhu
|
||||
|
||||
1. **Login** (persistentní profil, J&J SSO, 2FA na telefonu).
|
||||
2. Pro **každý report** v `REPORTS`:
|
||||
- export do Excelu (Data Only) → `WhatToDownload/<ts> <level> ...xlsx`,
|
||||
- parse (zobecněný parser, sloupce podle názvu),
|
||||
- ořez na `TARGET_STUDY` (řádek se bere jen pokud má studii v `studies`),
|
||||
- **scoped sync** do Mongo,
|
||||
- archiv reportu do `Zpracovano/`.
|
||||
3. **Jeden průchod stažení** všech `deleted=False, downloaded≠True`
|
||||
na disk i do SeaweedFS.
|
||||
|
||||
## Mongo schéma (kolekce documents)
|
||||
|
||||
```
|
||||
_id: "VTMF-9108777|v2.0" # číslo dokumentu | verze
|
||||
vtmf, version, url, level # level = study|country|site (pro cestu)
|
||||
levels: ["site"] # všechny úrovně, kde se objevil
|
||||
scopes: ["site|77242113UCO3001|Czech Republic", ...] # pro scoped mazání
|
||||
name, status, type, subtype, classification, desc
|
||||
process_name, external_system_name
|
||||
created_by, last_modified_by, version_created_by
|
||||
date # YYYY-MM-DD (Document/Approval/Version date)
|
||||
studies: ["77242113UCO3001", ...] # comma-split sloupce reportu
|
||||
countries: ["Czech Republic", ...]
|
||||
sites: ["BH5-CZ10001", ...]
|
||||
first_seen, last_seen, deleted, deleted_at
|
||||
downloaded, downloaded_at, placeholder # žádné pole file (Dropbox zrušen)
|
||||
sha256 # kontrolní součet (NE cesta)
|
||||
seaweed_path, seaweed_url, seaweed_synced_at # jediné umístění souboru
|
||||
history: [{ts, changes:{pole:{old,new}}}]
|
||||
```
|
||||
|
||||
## Scoped sync (řeší mazací háček)
|
||||
|
||||
Mazání už **nekouká na celou kolekci** (to by sync country reportu označil
|
||||
study/site dokumenty jako smazané). Každý report má
|
||||
`scope = "<level>|<study>|<country>"`; dokument nese pole `scopes[]`.
|
||||
- dokument v reportu → `$addToSet` scope,
|
||||
- dokument, který z **tohoto** scope zmizel → scope se odebere; teprve když
|
||||
nemá **žádný** scope → `deleted=True` + soubor ` [D]`.
|
||||
|
||||
## Evidence reportů — kolekce report_runs
|
||||
|
||||
```
|
||||
level, study, country, url, scope, exported_at, file, row_count, doc_keys[]
|
||||
```
|
||||
Umožní ukázat „co přesně bylo v reportu" a slouží jako audit.
|
||||
|
||||
## Úložiště = JEN SeaweedFS (žádný Dropbox/disk)
|
||||
|
||||
Dokumenty se stahují z Vaultu přes **dočasný soubor Playwrightu** rovnou do
|
||||
SeaweedFS Fileru — na disk/Dropbox se nic neukládá. Klíč = číslo dokumentu
|
||||
+ verze:
|
||||
|
||||
```
|
||||
/vtmf-documents/<vtmf>/<verze>.<přípona>
|
||||
např. /vtmf-documents/VTMF-9108777/v2.0.pdf
|
||||
```
|
||||
Žádné SHA cesty, žádný content dedup, žádné hardlinky. SHA-256 se počítá a
|
||||
ukládá do Mongo jen jako kontrolní součet. Která úroveň / země / centra =
|
||||
pole `level` / `countries[]` / `sites[]` v Mongo.
|
||||
|
||||
Aktuální verzi čehokoli do Dropboxu (nebo kamkoli jinam) zařídí samostatný
|
||||
export skript ze SeaweedFS — pipeline se tím nezdržuje.
|
||||
|
||||
## Migrace stávajících dat → migrate_to_v16.py
|
||||
|
||||
Stávající study-level data (v1.3–v1.5) převede na schéma v1.6. Dvě fáze,
|
||||
**default DRY-RUN**, ostře s `--apply`:
|
||||
|
||||
- `--phase mongo` — re-parse nejnovějšího archivu study reportu v1.6
|
||||
parserem → obohatí ~1692 dokumentů o nová pole (level, scopes[],
|
||||
studies[], countries=[], sites=[], classification, …). Nesahá na
|
||||
download stav.
|
||||
- `--phase seaweed` — překlíčuje SeaweedFS ze starých SHA cest na nové
|
||||
`<vtmf>/<verze>` (~1637 souborů; zdroj bajtů = stávající soubor na disku,
|
||||
fallback GET ze SHA cesty), opraví `seaweed_path/url` + `sha256`, smaže
|
||||
staré SHA objekty a odebere pole `file` z Mongo. Fyzické soubory
|
||||
v Dropboxu pak můžeš smazat ručně.
|
||||
|
||||
```powershell
|
||||
# náhled
|
||||
& "...\.venv\Scripts\python.exe" "...\migrate_to_v16.py"
|
||||
# ostře
|
||||
& "...\.venv\Scripts\python.exe" "...\migrate_to_v16.py" --apply
|
||||
```
|
||||
|
||||
## Spuštění pipeline
|
||||
|
||||
```powershell
|
||||
& "U:\PythonProject\Janssen\.venv\Scripts\python.exe" "U:\PythonProject\Janssen\VTMFDownloadFiles\vtmf_pipeline_v1.6.py"
|
||||
```
|
||||
|
||||
Předchůdce: vtmf_pipeline_v1.5 (TRASH/).
|
||||
@@ -0,0 +1,937 @@
|
||||
# ============================================================
|
||||
# vtmf_pipeline_v1.6.py
|
||||
# Verze: 1.6
|
||||
# Datum: 2026-06-15
|
||||
# Popis: Kompletní workflow V-TMF (J&J Veeva Vault) pro studii
|
||||
# 77242113UCO3001 přes VŠECHNY TŘI ÚROVNĚ dokumentů
|
||||
# (STUDY / COUNTRY / SITE). Jeden běh udělá pro každý
|
||||
# report ze seznamu REPORTS:
|
||||
# 1) login do Vaultu (persistentní session + ruční 2FA),
|
||||
# 2) export reportu do Excelu (Data Only) do WhatToDownload/,
|
||||
# 3) parse + scoped sync do MongoDB (db VTMF, kolekce
|
||||
# documents; klíč _id = "číslo|verze"),
|
||||
# a nakonec jeden průchod stažení všech dosud nestažených
|
||||
# dokumentů PŘÍMO do SeaweedFS (žádný Dropbox/disk).
|
||||
#
|
||||
# ZÁSADNÍ ZMĚNY proti v1.5:
|
||||
#
|
||||
# • Hierarchie dokumentů ve VTMF je STUDY -> COUNTRY -> SITE.
|
||||
# Dokument je do studií/zemí/center jen REFERENCOVANÝ (M:N) —
|
||||
# např. Master Confidentiality Agreement v nemocnici je jeden
|
||||
# dokument referencovaný do všech studií i center té nemocnice.
|
||||
# Proto: jeden dokument = jeden záznam = jeden SeaweedFS objekt;
|
||||
# příslušnost je jen metadatová pole studies[]/countries[]/sites[].
|
||||
#
|
||||
# • REPORTS = seznam (level, study, country, url). Country i site
|
||||
# report filtrují jen na zemi (CZ), ne na studii -> při ukládání
|
||||
# se row bere jen pokud cílová studie je v jeho Study sloupci
|
||||
# (prakticky no-op, vše vrácené UCO3001 obsahuje).
|
||||
#
|
||||
# • Zobecněný parser: study report má 15 sloupců (+ Document Date),
|
||||
# country/site mají 17 (+ Created By, Study Country, Site; bez
|
||||
# Document Date). Sloupce se hledají podle NÁZVU, datum má
|
||||
# fallback Document Date -> Approval Complete Date -> Version
|
||||
# Creation Date. Study/Study Country/Site se parsují na pole.
|
||||
#
|
||||
# • Scoped sync: mazání už NEkouká na celou kolekci. Každý report
|
||||
# má scope = (level|study|country); dokument nese pole scopes[].
|
||||
# Když z reportu daného scope zmizí, scope se odebere; teprve
|
||||
# když nemá žádný scope -> deleted=True.
|
||||
#
|
||||
# • Evidence reportů: kolekce report_runs (level, study, country,
|
||||
# url, exported_at, file, row_count, doc_keys).
|
||||
#
|
||||
# • ÚLOŽIŠTĚ = JEN SeaweedFS, klíč číslo dokumentu + verze:
|
||||
# /vtmf-documents/<vtmf>/<verze>.<přípona>
|
||||
# Žádné ukládání dokumentů na disk/Dropbox — stahují se přes
|
||||
# dočasný soubor Playwrightu rovnou do Fileru. SHA-256 se počítá
|
||||
# a ukládá do Mongo jen jako kontrolní součet. (Aktuální verzi
|
||||
# čehokoli do Dropboxu zařídí samostatný export skript ze SeaweedFS.)
|
||||
#
|
||||
# Heslo se NIKDY nedává natvrdo do skriptu — čte se z .env
|
||||
# v rootu projektu Janssen (VAULT_USER / VAULT_PASS).
|
||||
#
|
||||
# Migrace stávajících study-level dat na toto schéma: migrate_to_v16.py
|
||||
# Předchůdce: vtmf_pipeline_v1.5 (v TRASH/).
|
||||
# ============================================================
|
||||
|
||||
import hashlib
|
||||
import mimetypes
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
import urllib.error
|
||||
import urllib.request
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
|
||||
from playwright.sync_api import sync_playwright, TimeoutError as PWTimeout
|
||||
from pymongo import MongoClient, ASCENDING
|
||||
|
||||
# --- Konfigurace -------------------------------------------------------
|
||||
|
||||
LOGIN_URL = ("https://fedlogin.jnj.com/idp/eyJ2c2lkIjoiam5qX3ZlZXZhIn0/"
|
||||
"startSSO.ping?PartnerSpId=janssenetmf.veevavault.com"
|
||||
"&IdpAdapterId=CompIWALDAPEXTFORM"
|
||||
"&TargetResource=https%3A%2F%2Fvtmf.veevavault.com%2F")
|
||||
|
||||
# Studie, jejíž TMF stavíme (cíl ořezu country/site reportů).
|
||||
TARGET_STUDY = "77242113UCO3001"
|
||||
|
||||
# ====================================================================
|
||||
# SEZNAM REPORTŮ KE ZPRACOVÁNÍ
|
||||
# --------------------------------------------------------------------
|
||||
# Každý řádek = jeden report. Pole:
|
||||
# enabled = True/False -> přepni na False a report se v dalším běhu
|
||||
# NEnačte (zůstane v seznamu jako dokumentace)
|
||||
# name = popisek do logu (co to je za report)
|
||||
# level = "study" | "country" | "site" (úroveň + scope)
|
||||
# study = kód cílové studie (scope + ořez na tuto studii)
|
||||
# country = země scope (None u study-level)
|
||||
# url = přímý odkaz na report viewer ve Vaultu
|
||||
#
|
||||
# Přidání jiné studie = prostě dopiš další 3 řádky s jejím kódem
|
||||
# a URL; běh je zpracuje vedle stávajících.
|
||||
# ====================================================================
|
||||
REPORTS = [
|
||||
{"enabled": True, "name": "UCO3001 — STUDY level",
|
||||
"level": "study", "study": TARGET_STUDY, "country": None,
|
||||
"url": "https://vtmf.veevavault.com/ui/#reporting/viewer/"
|
||||
"0RP000000000182?study__v%2C%2C%2CIN=0ST000000137008"},
|
||||
|
||||
{"enabled": True, "name": "UCO3001 — COUNTRY level (Czech Republic)",
|
||||
"level": "country", "study": TARGET_STUDY, "country": "Czech Republic",
|
||||
"url": "https://vtmf.veevavault.com/ui/#reporting/viewer/"
|
||||
"0RP000000000319?study_country__v%2C%2C%2CIN=0SC00000017T056"},
|
||||
|
||||
{"enabled": False, "name": "UCO3001 — SITE level (all sites in Czech Republic)",
|
||||
"level": "site", "study": TARGET_STUDY, "country": "Czech Republic",
|
||||
"url": "https://vtmf.veevavault.com/ui/#reporting/viewer/"
|
||||
"0RP000000000762?study_country__v%2C%2C%2CEQ=0SC00000017T056"},
|
||||
]
|
||||
|
||||
VAULT_UI_PATTERN = "**vtmf.veevavault.com/ui**" # úspěšný vstup do Vaultu
|
||||
|
||||
SCRIPT_DIR = Path(__file__).resolve().parent
|
||||
PROFILE_DIR = SCRIPT_DIR / "vault_profile" # perzistentní session
|
||||
ENV_FILE = SCRIPT_DIR.parent / ".env" # root projektu Janssen
|
||||
DEBUG_DIR = SCRIPT_DIR / "debug" # diagnostické výstupy
|
||||
EXCEL_DIR = SCRIPT_DIR / "WhatToDownload" # stažené reporty (jen Excel)
|
||||
PROCESSED_DIR = EXCEL_DIR / "Zpracovano" # archiv zpracovaných
|
||||
|
||||
MONGO_URI = "mongodb://192.168.1.76:27017"
|
||||
MONGO_DB = "VTMF"
|
||||
MONGO_COLL = "documents"
|
||||
RUNS_COLL = "report_runs"
|
||||
|
||||
# Kolik dokumentů stáhnout v tomto běhu (None = všechny zbývající)
|
||||
LIMIT = None
|
||||
# Pole, jejichž změny se verzují do history[]
|
||||
TRACKED_FIELDS = ("name", "status", "type", "subtype", "classification",
|
||||
"desc", "date", "url", "studies", "countries", "sites",
|
||||
"level")
|
||||
|
||||
MAX_ATTEMPTS = 2 # pokusy na jeden dokument
|
||||
RETRY_PAUSE_MS = 5000 # pauza před opakováním
|
||||
BETWEEN_DOCS_MS = 500 # pauza mezi dokumenty
|
||||
|
||||
SEAWEED_FILER = "http://192.168.1.50:8888"
|
||||
SEAWEED_PREFIX = "/vtmf-documents"
|
||||
|
||||
|
||||
class PlaceholderDocument(Exception):
    """Raised for a document that exists only as a placeholder.

    Such a document page reads "This placeholder has no content".
    """
|
||||
|
||||
|
||||
def log(msg):
    """Print ``msg`` to standard output and flush right away."""
    print(msg, end="\n", flush=True)
|
||||
|
||||
|
||||
def load_env_file(path):
    """Load ``KEY=VALUE`` lines from a .env file into ``os.environ``.

    Blank lines, lines starting with ``#`` and lines without ``=`` are
    ignored. Surrounding double and single quotes are stripped from the
    value. Variables already present in the environment win; the file never
    overrides them, and empty values are not set at all.
    """
    if not path.exists():
        log(f"[!] .env nenalezen: {path}")
        return

    for raw_line in path.read_text(encoding="utf-8").splitlines():
        entry = raw_line.strip()
        is_assignment = bool(entry) and not entry.startswith("#") and "=" in entry
        if not is_assignment:
            continue
        name, _, raw_value = entry.partition("=")
        name = name.strip()
        value = raw_value.strip().strip('"').strip("'")
        if value:
            # setdefault keeps a value that is already set in the environment.
            os.environ.setdefault(name, value)
|
||||
|
||||
|
||||
ENV_SECTION_HEADER = "# --- Veeva Vault (J&J V-TMF) — VTMFDownloadFiles/download_vault ---"
|
||||
ENV_KEYS = ("VAULT_USER", "VAULT_PASS")
|
||||
|
||||
|
||||
def ensure_credentials():
    """Make sure the Vault credentials are available, or stop the script.

    Loads the .env file. When every key in ``ENV_KEYS`` has a non-empty value
    in the environment, returns normally. Otherwise creates the .env file
    from a template, or appends the key lines that are missing from an
    existing file, prints instructions for the user and exits with status 1.
    """
    load_env_file(ENV_FILE)
    if all(os.environ.get(name) for name in ENV_KEYS):
        return

    env_exists = ENV_FILE.exists()
    current = ENV_FILE.read_text(encoding="utf-8") if env_exists else ""
    missing_lines = []
    for name in ENV_KEYS:
        # A key counts as present when the file has a "KEY =" line for it.
        if re.search(rf"^\s*{name}\s*=", current, re.M) is None:
            missing_lines.append(f"{name}=")
    section = ENV_SECTION_HEADER + "\n" + "\n".join(missing_lines) + "\n"

    if not env_exists:
        ENV_FILE.write_text(
            "# .env — lokální přihlašovací údaje (NEVERZOVAT, je v .gitignore)\n\n"
            + section,
            encoding="utf-8")
        log(f"[i] Založil jsem nový .env: {ENV_FILE}")
    elif missing_lines:
        with ENV_FILE.open("a", encoding="utf-8") as env_out:
            env_out.write("\n" + section)
        log(f"[i] Doplnil jsem chybějící řádky do .env: {ENV_FILE}")

    rule = "=" * 60
    banner = (
        "\n" + rule,
        " CHYBÍ PŘIHLAŠOVACÍ ÚDAJE.",
        " Doplň VAULT_USER a VAULT_PASS do souboru:",
        f" {ENV_FILE}",
        " a spusť skript znovu.",
        rule,
    )
    print("\n".join(banner))
    sys.exit(1)
|
||||
|
||||
|
||||
# --- Parsování Excelu --------------------------------------------------
|
||||
|
||||
HYPERLINK_RE = re.compile(r'HYPERLINK\("([^"]+)"\s*,\s*"([^"]+)"\)')
|
||||
VERSION_RE = re.compile(r"\((v[^)]+)\)\s*$")
|
||||
DATE_RE = re.compile(r"(\d{4}-\d{2}-\d{2})")
|
||||
# nepovolené znaky názvů + řídicí znaky + unicode artefakt �
|
||||
BAD_CHARS_RE = re.compile(r"[<>:\"/\\|?*\x00-\x1f�]")
|
||||
|
||||
|
||||
def clean_text(s):
    """Sanitize a value into a reasonable name.

    Characters matched by ``BAD_CHARS_RE`` become underscores, whitespace
    runs collapse to one space, underscore runs collapse to one underscore,
    and leading/trailing spaces, dots and underscores are trimmed.
    """
    text = BAD_CHARS_RE.sub("_", str(s))
    for pattern, replacement in ((r"\s+", " "), (r"_{2,}", "_")):
        text = re.sub(pattern, replacement, text)
    return text.strip(" ._")
|
||||
|
||||
|
||||
def display_text(cell):
    """Return the text a cell displays.

    For a value matching ``HYPERLINK_RE`` this is the second captured
    argument (the label); otherwise it is the stripped cell value.
    """
    shown = str(cell.value or "").strip()
    formula = HYPERLINK_RE.search(shown)
    if formula is None:
        return shown
    return formula.group(2).strip()
|
||||
|
||||
|
||||
def split_multi(text):
    """Split a comma-separated list into unique, stripped items.

    Empty items are dropped and the order of first occurrence is kept.
    """
    stripped = (piece.strip() for piece in str(text or "").split(","))
    # dict.fromkeys de-duplicates while preserving insertion order.
    return list(dict.fromkeys(item for item in stripped if item))
|
||||
|
||||
|
||||
def cell_date(cell):
    """Extract a 'YYYY-MM-DD' date from a cell, or '' when there is none.

    Values with a ``strftime`` method are formatted directly; anything else
    is converted to a string and searched with ``DATE_RE``. A ``None`` cell
    yields ''.
    """
    value = None if cell is None else cell.value
    if hasattr(value, "strftime"):
        return value.strftime("%Y-%m-%d")
    found = DATE_RE.search(str(value or ""))
    if found is None:
        return ""
    return found.group(1)
|
||||
|
||||
|
||||
def extract_doc_url(raw):
    """Pull the clean document URL out of a HYPERLINK value or a raw URL.

    The result has the shape
    ``https://<host>/ui/#doc_info/<id>/<major>/<minor>``; anything after
    the third number is cut off.

    Raises:
        ValueError: No such URL is contained in ``raw``.
    """
    found = re.search(r"(https://[^/\"]+/ui/#doc_info/\d+/\d+/\d+)", str(raw))
    if found is None:
        raise ValueError(f"Nenašel jsem doc URL v: {raw!r}")
    return found.group(1)
|
||||
|
||||
|
||||
def read_documents_from_excel(path, level):
    """Parse an exported .xlsx report of one level into document dicts.

    Columns are located by header NAME because the column set differs
    between reports. The "Document Number" cell supplies the document URL,
    either through a =HYPERLINK formula or through a cell hyperlink; rows
    without a usable URL are skipped and only counted. Rows are iterated
    directly (the original author notes that the report's declared
    dimensions are broken).

    Args:
        path: Path to the .xlsx file.
        level: Value stored in the ``level`` field of every record.

    Returns:
        List of dicts, one per row with a usable document URL.

    Raises:
        RuntimeError: The sheet has no header row, or a required column is
            missing.
    """
    from openpyxl import load_workbook

    log(f"[i] Parsování reportu ({level}): {path.name}")
    wb = load_workbook(path, data_only=False)  # formulas are needed
    ws = wb[wb.sheetnames[0]]

    rows = ws.iter_rows()
    header_row = next(rows, None)
    if header_row is None:
        # An empty sheet would otherwise surface as a bare StopIteration.
        raise RuntimeError("Report je prázdný — chybí řádek se záhlavím.")

    # Map header name -> column index. Names are stripped so that stray
    # whitespace does not make a required column look missing.
    idx = {}
    for i, c in enumerate(header_row):
        h = c.value
        if h is None:
            continue
        idx[h.strip() if isinstance(h, str) else h] = i

    required = ("Document Number", "Document Name", "Document Status",
                "Type", "Subtype", "Description", "Study")
    missing = [c for c in required if c not in idx]
    if missing:
        raise RuntimeError(f"V reportu chybí očekávané sloupce: {missing}")

    i_num, i_name = idx["Document Number"], idx["Document Name"]
    i_status, i_type, i_sub = idx["Document Status"], idx["Type"], idx["Subtype"]
    i_desc, i_study = idx["Description"], idx["Study"]
    # Optional columns: None when the report does not have them.
    i_class = idx.get("Classification")
    i_proc = idx.get("Process Name")
    i_extsys = idx.get("External System Name")
    i_created = idx.get("Created By")
    i_modby = idx.get("Last Modified By")
    i_verby = idx.get("Version Created By")
    i_country = idx.get("Study Country")
    i_site = idx.get("Site")
    # Date fallback order: the first column yielding a date wins.
    i_date_cols = [idx[c] for c in
                   ("Document Date", "Approval Complete Date", "Version Creation Date")
                   if c in idx]

    def g(row, i):
        """Displayed text of column ``i``, or '' when the column is absent."""
        return display_text(row[i]) if i is not None else ""

    docs, bad = [], []
    for row in rows:
        cell = row[i_num]
        if cell.value is None:
            continue
        raw = str(cell.value)
        m = HYPERLINK_RE.search(raw)
        if m:
            url_raw, vtmf = m.group(1), m.group(2)
        elif cell.hyperlink:
            url_raw, vtmf = cell.hyperlink.target, raw
        else:
            bad.append(raw)
            continue
        try:
            url = extract_doc_url(url_raw)
        except ValueError:
            bad.append(raw)
            continue

        name = display_text(row[i_name])
        vm = VERSION_RE.search(name)
        version = vm.group(1) if vm else "v?"

        # Fall back to the document name (without its version suffix) when
        # the description is empty.
        desc = clean_text(g(row, i_desc))
        if not desc:
            desc = clean_text(VERSION_RE.sub("", name))

        date = ""
        for i_d in i_date_cols:
            date = cell_date(row[i_d])
            if date:
                break

        docs.append({
            "vtmf": vtmf.strip(),
            "version": version,
            "url": url,
            "level": level,
            "name": name,
            "status": g(row, i_status),
            "type": clean_text(g(row, i_type)),
            "subtype": clean_text(g(row, i_sub)),
            "classification": g(row, i_class),
            "desc": desc,
            "process_name": g(row, i_proc),
            "external_system_name": g(row, i_extsys),
            "created_by": g(row, i_created),
            "last_modified_by": g(row, i_modby),
            "version_created_by": g(row, i_verby),
            "date": date,
            "studies": split_multi(g(row, i_study)),
            # g() yields '' for an absent column, which splits to [].
            "countries": split_multi(g(row, i_country)),
            "sites": split_multi(g(row, i_site)),
        })

    log(f"[i] Načteno {len(docs)} dokumentů"
        + (f", {len(bad)} řádků bez použitelné URL (přeskočeno)" if bad else ""))
    return docs
|
||||
|
||||
|
||||
# --- MongoDB synchronizace ---------------------------------------------
|
||||
|
||||
def doc_key(vtmf, version):
    """Compose a document key of the form "<vtmf>|<version>"."""
    return "{}|{}".format(vtmf, version)
|
||||
|
||||
|
||||
def scope_key(report):
    """Compose the scope string "<level>|<study>|<country>" of a report.

    A missing or falsy ``country`` contributes an empty last segment.
    """
    country = report.get("country") or ""
    return "{}|{}|{}".format(report["level"], report["study"], country)
|
||||
|
||||
|
||||
def get_db():
    """Connect to MongoDB, ensure the indexes exist and return the handles.

    Returns:
        Tuple ``(db, coll, runs)``: the database, the documents collection
        and the report-runs collection.
    """
    client = MongoClient(MONGO_URI, serverSelectionTimeoutMS=5000)
    client.admin.command("ping")
    db = client[MONGO_DB]

    coll = db[MONGO_COLL]
    coll.create_index([("vtmf", ASCENDING), ("version", ASCENDING)], unique=True)
    coll.create_index([("deleted", ASCENDING), ("downloaded", ASCENDING)])
    # Single-field indexes, created in the same order as before.
    for field in ("scopes", "studies", "sites", "level"):
        coll.create_index([(field, ASCENDING)])

    runs = db[RUNS_COLL]
    run_index = [(field, ASCENDING)
                 for field in ("level", "study", "country", "exported_at")]
    runs.create_index(run_index)
    return db, coll, runs
|
||||
|
||||
|
||||
def sync_report_to_mongo(coll, runs, docs, report, report_file):
    """Project one report (one scope) into the documents collection.

    - New documents are inserted; for existing ones the fields are
      overwritten and any difference in ``TRACKED_FIELDS`` is appended to
      ``history[]``.
    - Every document in the report gets this scope added to ``scopes[]``
      (and its level to ``levels[]``) and is marked ``deleted=False``.
    - A document that used to carry THIS scope but is absent from the report
      loses the scope; once it has no scope left it is marked
      ``deleted=True`` (with ``deleted_at`` and a history entry).

    Deletion is scoped: only records whose ``scopes`` contain this report's
    scope key are ever touched by the removal step. No file operations are
    performed here.

    Args:
        coll: Documents collection.
        runs: Collection receiving one record per synced report.
        docs: Parsed report rows (dicts with at least ``vtmf``, ``version``
            and ``level``).
        report: Report definition (``level``, ``study``, ``url`` and
            optionally ``country``).
        report_file: Path of the exported file; stored as a string.

    Returns:
        Dict of counters: ``new``, ``updated``, ``unchanged``,
        ``resurrected``, ``scope_removed``, ``marked_deleted``.
    """
    stamp = datetime.now()
    scope = scope_key(report)
    counters = dict.fromkeys(
        ("new", "updated", "unchanged",
         "resurrected", "scope_removed", "marked_deleted"), 0)
    seen = set()

    for row in docs:
        ident = doc_key(row["vtmf"], row["version"])
        seen.add(ident)
        stored = coll.find_one({"_id": ident})

        if stored is None:
            # First sighting: report fields plus bookkeeping defaults.
            fresh = {"_id": ident}
            fresh.update(row)
            fresh.update({
                "levels": [row["level"]], "scopes": [scope],
                "first_seen": stamp, "last_seen": stamp,
                "deleted": False, "downloaded": False,
                "seaweed_path": None, "history": [],
            })
            coll.insert_one(fresh)
            counters["new"] += 1
            continue

        diff = {
            name: {"old": stored.get(name), "new": row.get(name)}
            for name in TRACKED_FIELDS
            if stored.get(name) != row.get(name)
        }

        operation = {
            "$set": {**row, "last_seen": stamp, "deleted": False},
            "$addToSet": {"scopes": scope, "levels": row["level"]},
        }
        if diff:
            operation["$push"] = {"history": {"ts": stamp, "changes": diff}}
        counters["updated" if diff else "unchanged"] += 1
        if stored.get("deleted"):
            counters["resurrected"] += 1
        coll.update_one({"_id": ident}, operation)

    # Documents previously in THIS scope that the report no longer lists:
    # drop the scope, and flag as deleted when no scope remains.
    stale_query = {"scopes": scope, "_id": {"$nin": list(seen)}}
    for stale in coll.find(stale_query):
        left = [s for s in stale.get("scopes", []) if s != scope]
        counters["scope_removed"] += 1
        if left:
            operation = {"$set": {"scopes": left}}
        else:
            operation = {
                "$set": {"scopes": left, "deleted": True, "deleted_at": stamp},
                "$push": {"history": {
                    "ts": stamp,
                    "changes": {"deleted": {"old": False, "new": True}}}},
            }
            counters["marked_deleted"] += 1
        coll.update_one({"_id": stale["_id"]}, operation)

    runs.insert_one({
        "level": report["level"], "study": report["study"],
        "country": report.get("country"), "url": report["url"],
        "scope": scope, "exported_at": stamp,
        "file": str(report_file), "row_count": len(docs),
        "doc_keys": sorted(seen),
    })

    log(f"[ok] Mongo sync [{scope}]: {counters['new']} nových, {counters['updated']} změněných, "
        f"{counters['unchanged']} beze změny, {counters['resurrected']} obnovených, "
        f"{counters['scope_removed']} odebrán scope ({counters['marked_deleted']} úplně smazáno).")
    return counters
|
||||
|
||||
|
||||
# --- Přihlášení --------------------------------------------------------
|
||||
|
||||
def submit_login_form(page, password_box):
    """Submit the login form.

    Candidates are tried in this order: a "Sign On" button, a
    "Log in"/"Sign in" button, ``input[type='submit']``,
    ``button[type='submit']`` and an "OK" button. The first visible one is
    clicked. When none can be used, Enter is pressed in the password field.
    """
    def _button_named(pattern):
        return page.get_by_role("button", name=re.compile(pattern, re.I))

    submitters = (
        _button_named("sign\\s*on"),
        _button_named("log\\s*in|sign\\s*in"),
        page.locator("input[type='submit']"),
        page.locator("button[type='submit']"),
        _button_named("^ok$"),
    )
    for submitter in submitters:
        try:
            if not (submitter.count() and submitter.first.is_visible()):
                continue
            button = submitter.first
            caption = (button.inner_text() or
                       button.get_attribute("value") or "submit").strip()
            log(f"[i] Odesílám formulář tlačítkem '{caption}'...")
            button.click()
            return
        except Exception:
            # Best effort: a candidate that fails is skipped, the next is tried.
            continue

    log("[i] Tlačítko nenalezeno, odesílám Enterem v poli hesla...")
    password_box.press("Enter")
|
||||
|
||||
|
||||
def login_if_needed(page):
    """Open the login URL and sign in unless a session is already active.

    Flow:
      1. Navigate to ``LOGIN_URL``; if the resulting URL is already the Vault
         UI, return immediately.
      2. Wait up to 8 s for a text input; if it never appears but the URL is
         the Vault UI, return, otherwise raise.
      3. Fill ``VAULT_USER`` / ``VAULT_PASS`` from the environment and submit.
      4. Wait up to 15 s for the Vault UI URL. If that times out, check for a
         visible error message, then ask the operator to confirm 2FA on the
         phone and press ENTER, and wait up to 120 s for the Vault UI URL.

    Raises:
        RuntimeError: Neither the login form nor Vault was found, or a
            visible message matching invalid/incorrect/failed is shown
            after submitting.
        KeyError: ``VAULT_USER`` or ``VAULT_PASS`` is not set.
    """
    vault_marker = "vtmf.veevavault.com/ui"

    log("[i] Otevírám přihlašovací URL...")
    page.goto(LOGIN_URL, wait_until="domcontentloaded")

    if vault_marker in page.url:
        log("[i] Už přihlášen (perzistentní session).")
        return

    login_field = page.locator("input[type='text']").first
    try:
        login_field.wait_for(timeout=8000)
    except PWTimeout:
        if vault_marker in page.url:
            log("[i] Přihlášen bez formuláře (session redirect).")
            return
        raise RuntimeError(
            f"Nenašel jsem login formulář ani Vault. Aktuální URL: {page.url}")

    username = os.environ["VAULT_USER"]
    password = os.environ["VAULT_PASS"]

    log("[i] Vyplňuji přihlašovací údaje...")
    login_field.fill(username)
    password_box = page.locator("input[type='password']").first
    password_box.fill(password)
    submit_login_form(page, password_box)

    log("[i] Odeslán login, čekám na výsledek...")
    try:
        page.wait_for_url(VAULT_UI_PATTERN, timeout=15000)
    except PWTimeout:
        pass  # not in Vault yet -> most likely a 2FA prompt
    else:
        log("[ok] Přihlášen rovnou (bez 2FA).")
        return

    failure = page.locator("text=/invalid|incorrect|failed/i")
    try:
        if failure.count() and failure.first.is_visible():
            raise RuntimeError(f"Login selhal: {failure.first.inner_text().strip()}")
    except PWTimeout:
        pass

    banner = "=" * 60
    print("\n" + banner)
    print(" VYŽADOVÁNO OVĚŘENÍ NA TELEFONU (2FA).")
    print(" Potvrď přihlášení v mobilní aplikaci.")
    print(banner)
    input(" Až to potvrdíš, stiskni ENTER pro pokračování... ")

    page.wait_for_url(VAULT_UI_PATTERN, timeout=120000)
    log("[ok] Přihlášení dokončeno.")
|
||||
|
||||
|
||||
def verify_inside(page):
    """Wait (up to 30 s) until the page URL matches ``VAULT_UI_PATTERN``,
    then log the current URL."""
    page.wait_for_url(VAULT_UI_PATTERN, timeout=30000)
    current_url = page.url
    log(f"[ok] Uvnitř Vaultu: {current_url}")
|
||||
|
||||
|
||||
def dialog_visible(page):
    """Return True when a ``.ui-dialog`` element exists and the first one is
    visible. Any error while querying the page is treated as "not visible"."""
    try:
        dialogs = page.locator(".ui-dialog")
        if not dialogs.count():
            return False
        return bool(dialogs.first.is_visible())
    except Exception:
        return False
|
||||
|
||||
|
||||
def save_page_debug(page, tag):
    """Save page diagnostics and return the folder they were written to.

    Written into ``DEBUG_DIR/<timestamp>_<tag>/``:
      - ``screenshot.png`` (or ``screenshot_error.txt`` when it fails),
      - ``frame_<i>.html`` with the HTML of every frame,
      - ``frames_report.txt`` listing, per frame, how many elements match a
        set of button/dialog selectors and up to 80 distinct ``title`` and
        ``aria-label`` values.

    Args:
        page: Page to inspect.
        tag: Label appended to the folder name.

    Returns:
        Path of the created folder.
    """
    # Format the timestamp first and append the tag afterwards: passing the
    # tag through strftime would interpret any '%' in it as a directive.
    stamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    out = DEBUG_DIR / f"{stamp}_{tag}"
    out.mkdir(parents=True, exist_ok=True)
    try:
        page.screenshot(path=str(out / "screenshot.png"), full_page=False)
    except Exception as e:
        # Diagnostics are best effort; record why the screenshot is missing.
        (out / "screenshot_error.txt").write_text(str(e), encoding="utf-8")

    report = []
    for i, frame in enumerate(page.frames):
        report.append(f"=== frame[{i}] url={frame.url}")
        try:
            (out / f"frame_{i}.html").write_text(frame.content(), encoding="utf-8")
            for sel in (".ui-dialog", "a.ok.vv_button",
                        ".ui-dialog-titlebar-close",
                        "button", "input[type='button']",
                        "[title]", "[aria-label]"):
                n = frame.locator(sel).count()
                if n:
                    report.append(f" {sel}: {n}x")
            for attr in ("title", "aria-label"):
                vals = frame.locator(f"[{attr}]").evaluate_all(
                    f"els => els.map(e => e.getAttribute('{attr}'))")
                uniq = sorted({v for v in vals if v})[:80]
                report.append(f" {attr}: {uniq}")
        except Exception as e:
            # One unreadable frame must not stop the dump of the others.
            report.append(f" [chyba čtení framu: {e}]")
    (out / "frames_report.txt").write_text("\n".join(report), encoding="utf-8")
    log(f"[!] Diagnostika stránky uložena do: {out}")
    return out
|
||||
|
||||
|
||||
# Visible OK button of the dialog — it is an <a>, not a <button>!
# The close cross .ui-dialog-titlebar-close is display:none → DO NOT USE it.
DIALOG_OK_SELECTOR = ", ".join((
    ".ui-dialog a.ok.vv_button",
    ".vv_login_msg_dialog .vv_button.ok",
))
|
||||
|
||||
|
||||
def dismiss_maintenance_popup(page, timeout=8000):
    """Close the Veeva login/maintenance dialog by clicking its visible OK
    (``<a class='ok vv_button'>``).

    The dialog shows up WITH A DELAY, so the OK button is awaited for up to
    ``timeout`` ms first. If it never becomes visible nothing is done, so
    the function is safe to call unconditionally.

    After clicking (up to 5 queued dialogs), a still-visible dialog is
    answered with Escape; if that does not help either, diagnostics are
    saved and the operator is asked to close it manually.

    Returns:
        True when at least one OK click succeeded, otherwise False.
    """
    ok_button = page.locator(DIALOG_OK_SELECTOR)
    try:
        ok_button.first.wait_for(state="visible", timeout=timeout)
    except (PWTimeout, Exception):
        # Timeout or any other failure: treat as "no dialog".
        return False

    clicks = 0
    attempts_left = 5  # dialogs can be queued one behind another
    while attempts_left:
        attempts_left -= 1
        try:
            if not (ok_button.count() and ok_button.first.is_visible()):
                break
            ok_button.first.click()
            page.wait_for_timeout(300)
            clicks += 1
            log("[i] Maintenance/login dialog zavřen (OK).")
        except Exception:
            break

    was_closed = bool(clicks)
    if not dialog_visible(page):
        return was_closed

    page.keyboard.press("Escape")
    page.wait_for_timeout(500)
    log("[i] Zkusil jsem dialog zavřít klávesou Escape.")

    if dialog_visible(page):
        save_page_debug(page, "dialog")
        banner = "=" * 60
        print("\n" + banner)
        print(" DIALOG SE NEPODAŘILO ZAVŘÍT AUTOMATICKY.")
        print(" Zavři ho prosím ručně v prohlížeči.")
        print(banner)
        input(" Po ručním zavření stiskni ENTER... ")
    return was_closed
|
||||
|
||||
|
||||
# --- Export reportu ----------------------------------------------------
|
||||
|
||||
def _first_visible(page, builders):
    """Return ``(locator, description)`` of the first visible candidate.

    Every frame in ``page.frames`` is searched in order; within a frame the
    ``builders`` — pairs of ``(callable(frame) -> locator, description)`` —
    are tried in order. A builder that raises is skipped.

    Returns:
        ``(first_match_locator, description)``, or ``(None, None)`` when no
        candidate is visible in any frame.
    """
    for frame in page.frames:
        for make_locator, label in builders:
            try:
                candidate = make_locator(frame)
                if not candidate.count():
                    continue
                head = candidate.first
                if head.is_visible():
                    return head, label
            except Exception:
                continue
    return None, None
|
||||
|
||||
|
||||
def download_report(page, report):
    """Export one report to Excel ("Data Only") and save it under
    ``EXCEL_DIR`` with a timestamped file name.

    Steps: open the report URL, dismiss a possible maintenance dialog, wait
    for the report to render, open the actions menu, choose
    "Export to Excel", select the STANDARD ("Data Only") radio when it is
    present, press Export and store the download.

    On failure page diagnostics are saved (``save_page_debug``) and an
    exception is raised.

    Args:
        page: Page used for navigation.
        report: Report definition (``level``, ``study``, ``url`` and
            optionally ``country``).

    Returns:
        Path of the saved file.

    Raises:
        RuntimeError: The report did not load, or the actions menu, the
            export menu item or the Export button was not found.
    """
    level = report["level"]
    log(f"\n[i] === Report {level.upper()} "
        f"({report.get('country') or report['study']}) ===")
    log("[i] Otevírám report...")
    page.goto(report["url"], wait_until="domcontentloaded")
    dismiss_maintenance_popup(page, timeout=4000)

    # The report counts as loaded once either marker text shows up.
    try:
        page.wait_for_selector("text=Returned", timeout=30000)
    except PWTimeout:
        try:
            page.wait_for_selector("text=Document Status:", timeout=30000)
        except PWTimeout:
            save_page_debug(page, f"report_load_{level}")
            raise RuntimeError(
                "Report se nenačetl (nenašel jsem 'Returned' ani "
                "'Document Status:'). Diagnostika v debug/.")
    log("[i] Report načten, otevírám menu akcí (⋯)...")

    # Selectors for the actions menu, from most to least specific.
    menu_selectors = (
        (".actionMenuContainer .dropDown.vv_dropdown_toggle "
         "button.vv-icon-button", ".actionMenuContainer button (ověřený)"),
        (".actionMenuContainer button", ".actionMenuContainer button (volnější)"),
        ("button[title='Actions'], [aria-label='Actions']", "title/aria-label Actions"),
    )
    menu_builders = [
        (lambda f, selector=selector: f.locator(selector), label)
        for selector, label in menu_selectors
    ]
    menu_button, found_via = _first_visible(page, menu_builders)
    if menu_button is None:
        save_page_debug(page, f"report_menu_{level}")
        raise RuntimeError("Nenašel jsem menu akcí (⋯) na reportu. Diagnostika v debug/.")
    log(f"[i] Menu nalezeno přes: {found_via}")
    menu_button.click()

    export_item = page.locator("a.ReportAction[data-action-name='ExcelExport']")
    try:
        export_item.first.wait_for(state="visible", timeout=15000)
    except PWTimeout:
        # Fall back to matching the menu item by its exact text.
        export_item = page.get_by_text("Export to Excel", exact=True)
        try:
            export_item.first.wait_for(state="visible", timeout=5000)
        except PWTimeout:
            save_page_debug(page, f"report_export_item_{level}")
            raise RuntimeError("Menu se otevřelo, ale položku 'Export to "
                               "Excel' jsem nenašel. Diagnostika v debug/.")
    log("[i] Klikám 'Export to Excel'...")
    export_item.first.click()
    log("[i] Dialog Excel Export Options...")

    data_only = page.locator("input[name='requiredRadioField'][value='STANDARD']")
    try:
        data_only.first.wait_for(state="visible", timeout=10000)
        if not data_only.first.is_checked():
            data_only.first.check()
            log("[i] Přepnuto na 'Data Only'.")
    except PWTimeout:
        log("[!] Radio 'Data Only' nenalezeno — spoléhám na default dialogu.")

    export_buttons = page.get_by_role("button", name="Export", exact=True)
    try:
        export_buttons.first.wait_for(state="visible", timeout=10000)
    except PWTimeout:
        save_page_debug(page, f"report_export_btn_{level}")
        raise RuntimeError("Dialog exportu bez tlačítka Export. Diagnostika v debug/.")
    export_button = export_buttons.first
    with page.expect_download(timeout=120000) as pending:
        export_button.click()
    download = pending.value

    EXCEL_DIR.mkdir(parents=True, exist_ok=True)
    stamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    dest = EXCEL_DIR / f"{stamp} {level} {download.suggested_filename}"
    download.save_as(str(dest))
    log(f"[ok] Report uložen: {dest}")
    return dest
|
||||
|
||||
|
||||
def archive_report(path):
    """Move a processed report file into ``PROCESSED_DIR`` (created when
    missing), keeping its file name."""
    PROCESSED_DIR.mkdir(parents=True, exist_ok=True)
    destination = PROCESSED_DIR.joinpath(path.name)
    path.rename(destination)
    log(f"[i] Report archivován: {destination}")
|
||||
|
||||
|
||||
# --- SeaweedFS ---------------------------------------------------------
|
||||
|
||||
def seaweed_path(vtmf, version, ext):
    """Build the storage path from the document identity:
    ``<SEAWEED_PREFIX>/<vtmf>/<version><ext>``.

    A falsy ``version`` is replaced by ``"vunknown"``.
    """
    version_part = version if version else "vunknown"
    return "{}/{}/{}{}".format(SEAWEED_PREFIX, vtmf, version_part, ext)
|
||||
|
||||
|
||||
def seaweed_store(vtmf, version, ext, data, mime="application/octet-stream"):
    """Upload ``data`` to the SeaweedFS Filer with an HTTP PUT.

    The target is ``SEAWEED_FILER`` + ``seaweed_path(vtmf, version, ext)``.

    Args:
        vtmf: Document identifier (first path component).
        version: Document version (file stem).
        ext: File extension including the dot.
        data: Request body (bytes).
        mime: Value of the ``Content-Type`` header.

    Returns:
        Tuple ``(path, url)`` of the stored object.

    Raises:
        urllib.error.URLError: The request failed (``HTTPError`` for an
            HTTP error status).
    """
    path = seaweed_path(vtmf, version, ext)
    url = SEAWEED_FILER + path
    req = urllib.request.Request(url, data=data, method="PUT",
                                 headers={"Content-Type": mime})
    # Close the response explicitly; otherwise every upload leaves its HTTP
    # connection open until garbage collection.
    with urllib.request.urlopen(req, timeout=120):
        pass
    return path, url
|
||||
|
||||
|
||||
# --- Stažení dokumentů -------------------------------------------------
|
||||
|
||||
def find_source_file_button(page):
    """Locate the "Source File" control on a document page.

    Lookups are tried in order and the first one with at least one match
    wins: ``[title='Source File']``, ``[aria-label='Source File']``, then a
    button whose accessible name matches "Source File" (case-insensitive).

    Returns:
        The first matching locator, or None when nothing matches.
    """
    lookups = (
        lambda: page.locator("[title='Source File']"),
        lambda: page.locator("[aria-label='Source File']"),
        lambda: page.get_by_role("button", name=re.compile("Source File", re.I)),
    )
    for lookup in lookups:
        matches = lookup()
        if matches.count():
            return matches.first
    return None
|
||||
|
||||
|
||||
def download_source_bytes(page, doc):
    """Open a document page, download its Source File and return the bytes.

    The download lands in the temporary file reported by the download
    object; this function only reads it and stores nothing elsewhere.

    Args:
        page: Page used for navigation.
        doc: Document record with ``vtmf`` and ``url`` (``version`` is used
            for logging only).

    Returns:
        Tuple ``(data, ext)``: the file content and the extension of the
        suggested file name (including the dot).

    Raises:
        PlaceholderDocument: The page shows a visible placeholder
            (``div.vv_placeholder_text``), i.e. the document has no content.
        RuntimeError: The "Source File" control was not found.
    """
    vtmf = doc["vtmf"]
    log(f"[i] Otevírám dokument {vtmf} ({doc.get('version', '')}) ...")
    page.goto(doc["url"], wait_until="domcontentloaded")
    try:
        page.wait_for_load_state("networkidle", timeout=30000)
    except PWTimeout:
        log("[!] networkidle nenastal do 30 s, zkouším pokračovat...")
    dismiss_maintenance_popup(page, timeout=2000)

    placeholder = page.locator("div.vv_placeholder_text")
    if placeholder.count() and placeholder.first.is_visible():
        log(f"[i] {vtmf}: placeholder bez obsahu — přeskakuji.")
        raise PlaceholderDocument(vtmf)

    source_button = find_source_file_button(page)
    if source_button is None:
        raise RuntimeError(
            f"Nenašel jsem ikonu 'Source File' na stránce dokumentu {vtmf}.")

    log("[i] Klikám na Source File a čekám na download...")
    with page.expect_download(timeout=60000) as pending:
        source_button.click()
        # The click may open a dropdown instead of starting the download;
        # in that case pick its "Source File" entry. Best effort only.
        try:
            entry = page.get_by_role("menuitem", name=re.compile("Source File", re.I))
            if entry.count() and entry.first.is_visible():
                log("[i] Otevřel se dropdown, vybírám 'Source File'...")
                entry.first.click()
        except Exception:
            pass
    download = pending.value

    suffix = Path(download.suggested_filename).suffix
    payload = Path(download.path()).read_bytes()  # temporary download file
    return payload, suffix
|
||||
|
||||
|
||||
def download_missing(page, coll):
    """Download every non-deleted document that is not yet ``downloaded``
    and upload it straight to SeaweedFS.

    Records are processed ordered by level, vtmf and version, optionally
    capped by ``LIMIT``. Each document gets up to ``MAX_ATTEMPTS`` attempts
    with ``RETRY_PAUSE_MS`` between them; the outcome of every document is
    written to Mongo immediately, so an interrupted run can be resumed.

    Args:
        page: Page used for navigation.
        coll: Documents collection.

    Returns:
        Tuple ``(ok_count, fail_count, placeholder_count)``.
    """
    todo = list(coll.find({"deleted": False, "downloaded": {"$ne": True}})
                .sort([("level", ASCENDING), ("vtmf", ASCENDING),
                       ("version", ASCENDING)]))
    if LIMIT:
        todo = todo[:LIMIT]
    log(f"\n[i] Ke stažení: {len(todo)} dokumentů"
        + (f" (LIMIT={LIMIT})" if LIMIT else ""))

    ok_count = fail_count = placeholder_count = 0
    for n, doc in enumerate(todo, 1):
        key = doc["_id"]
        # This log line runs outside the retry handling below, so a record
        # with a missing or None description must not raise here and abort
        # the whole pass.
        desc = str(doc.get("desc") or "")[:60]
        log(f"\n--- [{n}/{len(todo)}] {key} | {doc.get('level', '?')} | {desc}")
        last_err = None
        for attempt in range(1, MAX_ATTEMPTS + 1):
            try:
                data, ext = download_source_bytes(page, doc)
                size_kb = len(data) / 1024
                size_str = f"{size_kb:.0f} KB" if size_kb < 1024 else f"{size_kb / 1024:.1f} MB"
                sha256_hex = hashlib.sha256(data).hexdigest()
                # guess_type needs a file name, so prefix the extension.
                mime = mimetypes.guess_type("f" + ext)[0] or "application/octet-stream"
                sw_path, sw_url = seaweed_store(doc["vtmf"], doc["version"], ext, data, mime)
                log(f"[ok] {size_str} -> SeaweedFS {sw_path}")
                coll.update_one({"_id": key}, {"$set": {
                    "downloaded": True,
                    "downloaded_at": datetime.now(),
                    "sha256": sha256_hex,
                    "seaweed_path": sw_path, "seaweed_url": sw_url,
                    "seaweed_synced_at": datetime.now(),
                    "last_error": None}})
                ok_count += 1
                last_err = None
                break
            except PlaceholderDocument:
                # No content to fetch: mark as done so it is not retried.
                coll.update_one({"_id": key}, {"$set": {
                    "downloaded": True, "placeholder": True,
                    "downloaded_at": datetime.now(), "last_error": None}})
                placeholder_count += 1
                last_err = None
                break
            except Exception as e:
                last_err = e
                log(f"[!] Pokus {attempt}/{MAX_ATTEMPTS} selhal: {e}")
                if attempt < MAX_ATTEMPTS:
                    page.wait_for_timeout(RETRY_PAUSE_MS)
        if last_err is not None:
            coll.update_one({"_id": key}, {"$set": {
                "last_error": str(last_err), "error_at": datetime.now()}})
            fail_count += 1
        page.wait_for_timeout(BETWEEN_DOCS_MS)
    return ok_count, fail_count, placeholder_count
|
||||
|
||||
|
||||
# --- Main --------------------------------------------------------------
|
||||
|
||||
def main():
    """Run the whole pipeline and exit with a status code.

    Steps:
      1. Log in (persistent browser profile), verify we are inside Vault
         and dismiss a possible maintenance dialog.
      2. For every enabled report: export it, parse it, keep only rows
         belonging to the report's study, sync them to Mongo (scoped) and
         archive the file. An empty report is archived without syncing, so
         nothing gets marked as deleted.
      3. Download everything not yet downloaded into SeaweedFS.

    Exit code: 2 when the pipeline raised, 1 when some downloads failed,
    0 otherwise.
    """
    ensure_credentials()
    db, coll, runs = get_db()
    log(f"[ok] Mongo připojeno: {MONGO_URI} / {MONGO_DB}.{MONGO_COLL}")

    with sync_playwright() as p:
        ctx = p.chromium.launch_persistent_context(
            user_data_dir=str(PROFILE_DIR),
            headless=False,
            accept_downloads=True,
            no_viewport=True,
            args=["--start-maximized"],
        )
        page = ctx.pages[0] if ctx.pages else ctx.new_page()
        ok_count = fail_count = placeholder_count = 0
        pipeline_error = None
        try:
            # 1) login
            login_if_needed(page)
            verify_inside(page)
            dismiss_maintenance_popup(page)

            # 2+3) for every ENABLED report: export -> parse -> scoped sync
            log("\n[i] Plán reportů:")
            for r in REPORTS:
                flag = "ZAP" if r.get("enabled", True) else "VYP"
                log(f" [{flag}] {r.get('name', r['level'])}")
            for report in REPORTS:
                if not report.get("enabled", True):
                    log(f"\n[i] Přeskakuji (enabled=False): {report.get('name', report['level'])}")
                    continue
                report_path = download_report(page, report)
                docs = read_documents_from_excel(report_path, report["level"])
                before = len(docs)
                docs = [d for d in docs if report["study"] in d["studies"]]
                if before != len(docs):
                    log(f"[i] Ořez na {report['study']}: {len(docs)}/{before} řádků.")
                if not docs:
                    log(f"[!] Report {report['level']} prázdný (po ořezu) — "
                        f"sync přeskočen, nic se nemaže.")
                    archive_report(report_path)
                    continue
                sync_report_to_mongo(coll, runs, docs, report, report_path)
                archive_report(report_path)

            # 4) a single pass downloading everything missing into SeaweedFS
            ok_count, fail_count, placeholder_count = download_missing(page, coll)
        except KeyboardInterrupt:
            log("\n[!] Přerušeno uživatelem — stav je v Mongo, příští běh naváže.")
        except Exception as e:
            pipeline_error = e
            print("\n" + "=" * 60)
            print(" PIPELINE SELHALA!")
            print(f" {type(e).__name__}: {e}")
            print("=" * 60)
        finally:
            # The summary queries Mongo and may itself fail; the browser
            # context must be closed regardless, hence the nested finally.
            try:
                total = coll.count_documents({})
                active = coll.count_documents({"deleted": False})
                have = coll.count_documents({"deleted": False, "downloaded": True})
                log(f"\n[i] Výsledek běhu: {ok_count} staženo, "
                    f"{placeholder_count} placeholderů, {fail_count} chyb"
                    + (f", PIPELINE SELHALA ({pipeline_error})" if pipeline_error else "."))
                log(f"[i] Mongo: {total} záznamů celkem, {active} aktivních, "
                    f"z toho v SeaweedFS {have} ({active - have} zbývá).")
            finally:
                log("[i] Zavírám prohlížeč.")
                ctx.close()
    sys.exit(2 if pipeline_error else (1 if fail_count else 0))


if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user