#!/usr/bin/env python3
"""Convert the scraped splcr.cz document list into a Markdown overview.

Reads ``output/documents.csv`` and writes ``output/DOKUMENTY.md``.  Each CSV
row must provide the columns ``type``, ``found_on``, ``title`` and ``url``;
a missing column raises ``KeyError``.
"""
import csv
from collections import defaultdict
from pathlib import Path

CSV_PATH = Path('output/documents.csv')
MD_PATH = Path('output/DOKUMENTY.md')

# Date printed in the report header.
REPORT_DATE = '2026-05-20'

# A document belongs to the Appel section when the page it was found on
# contains APPEL_MARKER; the year is whatever follows APPEL_YEAR_PREFIX.
APPEL_MARKER = 'appel'
APPEL_YEAR_PREFIX = 'appel-rocnik-'


def load_documents(csv_path=CSV_PATH):
    """Return every row of the CSV at *csv_path* as a list of dicts."""
    # newline='' lets the csv module handle line endings itself, so quoted
    # fields with embedded newlines are parsed correctly.
    with open(csv_path, 'r', encoding='utf-8', newline='') as f:
        return list(csv.DictReader(f))


def group_documents(docs):
    """Split *docs* into ``(by_type, appel_by_year)``.

    Rows whose ``found_on`` URL contains ``appel`` are grouped by year in
    ``appel_by_year``; all remaining rows are grouped by their ``type``
    column in ``by_type``.  Both results are ``defaultdict(list)``.
    """
    by_type = defaultdict(list)
    appel_by_year = defaultdict(list)

    for doc in docs:
        doc_type = doc['type']
        found_on = doc['found_on']

        if APPEL_MARKER in found_on:
            # NOTE(review): if the URL contains 'appel' but not
            # 'appel-rocnik-', the whole URL becomes the "year" key.
            # Confirm whether such pages occur in the scraped data.
            year = found_on.split(APPEL_YEAR_PREFIX)[-1].rstrip('/')
            appel_by_year[year].append(doc)
        else:
            by_type[doc_type].append(doc)

    return by_type, appel_by_year


def _link_text(title):
    """Return *title* made safe for the ``[...]`` part of a Markdown link."""
    # Collapse whitespace runs (including newlines) so the list item stays
    # on one line, then escape the characters that would end the link text.
    single_line = ' '.join(title.split())
    return (
        single_line
        .replace('\\', '\\\\')
        .replace('[', '\\[')
        .replace(']', '\\]')
    )


def _link_target(url):
    """Return *url* made safe for the ``(...)`` part of a Markdown link."""
    # Spaces and parentheses would terminate the destination early.
    return url.replace(' ', '%20').replace('(', '%28').replace(')', '%29')


def _table_cell(text):
    """Return *text* with pipes escaped so it stays in one table cell."""
    return text.replace('|', '\\|')


def _section(heading, section_docs):
    """Return the Markdown chunks for one ``###`` section of links."""
    chunks = [f"### {heading} ({len(section_docs)} dokumentů)\n\n"]
    for doc in sorted(section_docs, key=lambda d: d['title']):
        chunks.append(
            f"- [{_link_text(doc['title'])}]({_link_target(doc['url'])})\n"
        )
    chunks.append("\n")
    return chunks


def render_markdown(docs, report_date=REPORT_DATE):
    """Return the complete Markdown report for *docs* as one string.

    *report_date* is the text shown after ``**Datum:**`` in the header.
    """
    by_type, appel_by_year = group_documents(docs)
    appel_count = sum(len(group) for group in appel_by_year.values())
    other_count = len(docs) - appel_count

    chunks = [
        "# splcr.cz — Stažené dokumenty\n",
        "\n",
        f"**Datum:** {report_date}\n",
        f"**Celkem:** {len(docs)} dokumentů\n",
        f"**Appel:** {appel_count} dokumentů\n",
        f"**Ostatní:** {other_count} dokumentů\n",
        "\n",
        "## Přehled po typu\n",
        "\n",
        "| Typ | Počet |\n",
        "|-----|-------|\n",
    ]

    # Summary table: one row per non-Appel document type.
    for doc_type in sorted(by_type):
        chunks.append(
            f"| {_table_cell(doc_type)} | {len(by_type[doc_type])} |\n"
        )

    # Appel documents, newest year first (keys are compared as strings).
    chunks.append("\n## Apel — Všechny ročníky\n\n")
    for year in sorted(appel_by_year, reverse=True):
        chunks.extend(_section(f"Ročník {year}", appel_by_year[year]))

    # Everything else, one section per type.
    chunks.append("## Ostatní dokumenty\n\n")
    for doc_type in sorted(by_type):
        chunks.extend(_section(doc_type, by_type[doc_type]))

    return "".join(chunks)


def main():
    """Read the CSV, write the Markdown report and print a confirmation."""
    docs = load_documents()
    MD_PATH.write_text(render_markdown(docs), encoding='utf-8')
    print(f"OK - Ulozeno: {MD_PATH}")


if __name__ == "__main__":
    main()