"""Split a document inventory report into one worksheet per DD5-CZ site.

Every ``*.xlsx`` workbook found in ``ToProcess/`` is read from its
``Sheet0`` worksheet.  Rows are grouped by the site identifiers found in the
``Site`` column, sorted newest-first by ``Approval Complete Date`` and written
to a new workbook in ``Processed/`` with one colour-coded tab per site.
"""
import re
import copy
import datetime
from collections import Counter, defaultdict
from pathlib import Path
from openpyxl import load_workbook, Workbook
from openpyxl.utils import get_column_letter

TO_PROCESS = Path(__file__).parent / "ToProcess"
PROCESSED = Path(__file__).parent / "Processed"
SITE_PATTERN = re.compile(r'DD5-CZ\d+')

ACTIVE_SITES = {
    "DD5-CZ10001", "DD5-CZ10003", "DD5-CZ10006", "DD5-CZ10009",
    "DD5-CZ10010", "DD5-CZ10012", "DD5-CZ10013", "DD5-CZ10015",
    "DD5-CZ10016", "DD5-CZ10020", "DD5-CZ10021", "DD5-CZ10022",
}
TAB_GREEN = "00B050"
TAB_RED = "FF0000"

# Label used in the output filename when the "Study" column holds no values.
UNKNOWN_STUDY = "UnknownStudy"
# Characters that cannot appear in a filename on common filesystems.
_UNSAFE_FILENAME_CHARS = re.compile(r'[\\/:*?"<>|]')


def copy_row(ws_dst, dst_row_num, src_row):
    """Copy the values and basic styling of *src_row* into *ws_dst*.

    Args:
        ws_dst: Destination worksheet.
        dst_row_num: 1-based row number to write to in *ws_dst*.
        src_row: Iterable of source cells; each cell is written to the same
            column it occupies in the source.
    """
    for src_cell in src_row:
        dst_cell = ws_dst.cell(row=dst_row_num, column=src_cell.column)
        dst_cell.value = src_cell.value
        if src_cell.has_style:
            # Style objects are copied so the destination workbook does not
            # share them with the source workbook.
            dst_cell.font = copy.copy(src_cell.font)
            dst_cell.fill = copy.copy(src_cell.fill)
            dst_cell.border = copy.copy(src_cell.border)
            dst_cell.alignment = copy.copy(src_cell.alignment)
            dst_cell.number_format = src_cell.number_format


def _column_index(headers, name):
    """Return the 0-based position of *name* in *headers*.

    Raises:
        ValueError: If the column is missing; the message names the column.
    """
    try:
        return headers.index(name)
    except ValueError:
        raise ValueError(
            f"Required column {name!r} not found in header row"
        ) from None


def _sites_in(value):
    """Return every site identifier matched by SITE_PATTERN in a cell value."""
    return SITE_PATTERN.findall(str(value or ""))


def _date_sort_key(value):
    """Normalise a cell value to a naive ``datetime`` usable as a sort key.

    Blank or unparseable values map to ``datetime.min`` so they sort last
    when the rows are ordered newest-first.
    """
    if isinstance(value, datetime.datetime):
        return value.replace(tzinfo=None)
    if isinstance(value, datetime.date):
        return datetime.datetime.combine(value, datetime.time.min)
    if isinstance(value, str):
        try:
            parsed = datetime.datetime.fromisoformat(value.strip())
        except ValueError:
            return datetime.datetime.min
        return parsed.replace(tzinfo=None)
    return datetime.datetime.min


def _study_number(rows, study_col):
    """Return the most frequent comma-separated entry of the study column.

    Falls back to ``UNKNOWN_STUDY`` when the column holds no values.
    """
    counts = Counter(
        part.strip()
        for row in rows
        for part in str(row[study_col].value or "").split(",")
        if part.strip()
    )
    if not counts:
        return UNKNOWN_STUDY
    return counts.most_common(1)[0][0]


def process_file(src_path: Path) -> None:
    """Write a per-site workbook for *src_path* into the ``Processed`` folder.

    Args:
        src_path: Path of the source ``.xlsx`` workbook.  It must contain a
            worksheet named ``Sheet0`` whose first row holds the headers
            ``Site``, ``Approval Complete Date`` and ``Study``.

    Raises:
        KeyError: If the workbook has no ``Sheet0`` worksheet.
        ValueError: If a required header is missing.
    """
    print(f"Processing: {src_path.name}")
    wb_src = load_workbook(src_path)
    ws_src = wb_src["Sheet0"]

    header_row = next(ws_src.iter_rows(min_row=1, max_row=1))
    headers = [cell.value for cell in header_row]
    site_col = _column_index(headers, "Site")
    date_col = _column_index(headers, "Approval Complete Date")
    study_col = _column_index(headers, "Study")

    all_rows = list(ws_src.iter_rows(min_row=2))

    # Group rows by exact site identifier in a single pass.  A substring test
    # would also attach a row to any site whose ID is a prefix of another ID.
    rows_by_site = defaultdict(list)
    for row in all_rows:
        for site in set(_sites_in(row[site_col].value)):
            rows_by_site[site].append(row)

    active = sorted(s for s in rows_by_site if s in ACTIVE_SITES)
    inactive = sorted(s for s in rows_by_site if s not in ACTIVE_SITES)
    ordered_sites = active + inactive
    print(f"  Found {len(active)} active, {len(inactive)} inactive DD5-CZ sites")

    if not ordered_sites:
        # A workbook without any worksheet cannot be saved, so stop here.
        print("  No DD5-CZ sites found; nothing to save")
        return

    wb_out = Workbook()
    wb_out.remove(wb_out.active)

    for site in ordered_sites:
        site_rows = sorted(
            rows_by_site[site],
            key=lambda r: _date_sort_key(r[date_col].value),
            reverse=True,
        )

        ws = wb_out.create_sheet(title=site)
        ws.sheet_properties.tabColor = TAB_GREEN if site in ACTIVE_SITES else TAB_RED
        copy_row(ws, 1, header_row)

        for i, row in enumerate(site_rows, start=2):
            copy_row(ws, i, row)

        # Carry over the column widths defined on the source sheet.
        for col_idx in range(1, ws_src.max_column + 1):
            col_letter = get_column_letter(col_idx)
            if col_letter in ws_src.column_dimensions:
                ws.column_dimensions[col_letter].width = ws_src.column_dimensions[col_letter].width

        ws.auto_filter.ref = ws.dimensions
        print(f"  {site}: {len(site_rows)} rows")

    # The study value comes from the spreadsheet, so strip characters that
    # are not valid in a filename before using it.
    study_number = _UNSAFE_FILENAME_CHARS.sub("_", _study_number(all_rows, study_col))

    timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H_%M_%S")
    filename = f"{timestamp} {study_number} {src_path.name}"
    PROCESSED.mkdir(parents=True, exist_ok=True)
    out_path = PROCESSED / filename
    wb_out.save(out_path)
    print(f"  Saved: {out_path}")


def main() -> None:
    """Process every workbook found in the ``ToProcess`` folder."""
    # Excel creates "~$name.xlsx" owner files while a workbook is open; they
    # match the glob but are not workbooks, so skip them.
    xlsx_files = sorted(
        p for p in TO_PROCESS.glob("*.xlsx") if not p.name.startswith("~$")
    )
    if not xlsx_files:
        print("No .xlsx files found in ToProcess/")
        return

    for f in xlsx_files:
        process_file(f)


if __name__ == "__main__":
    main()