ordinaceprojekt/Webpagescraping/splcr.cz/convert_to_md.py

#!/usr/bin/env python3
import csv
from collections import defaultdict
from pathlib import Path

# Input CSV produced by the scraper and the Markdown report written from it.
CSV_PATH = 'output/documents.csv'
OUTPUT_PATH = 'output/DOKUMENTY.md'

# Date printed in the report header (kept as the script's original value).
REPORT_DATE = '2026-05-20'

# A document counts as "Appel" when its source page URL contains APPEL_TOKEN;
# the volume year is whatever follows APPEL_YEAR_MARKER in that URL.
APPEL_TOKEN = 'appel'
APPEL_YEAR_MARKER = 'appel-rocnik-'

# Group label for Appel documents whose source URL carries no volume marker.
UNKNOWN_YEAR = 'neznámý'


def load_documents(csv_path=CSV_PATH):
    """Read the documents CSV and return its rows as a list of dicts.

    The file is opened with ``newline=''`` as the ``csv`` module requires, so
    quoted fields containing line breaks are parsed correctly.
    """
    with open(csv_path, 'r', encoding='utf-8', newline='') as f:
        return list(csv.DictReader(f))


def _appel_year(found_on):
    """Return the volume year encoded in an Appel page URL.

    The year is the text after the last ``appel-rocnik-`` marker with trailing
    slashes removed. When the marker is absent (or nothing follows it),
    ``UNKNOWN_YEAR`` is returned instead of leaking the whole URL into the
    report heading.
    """
    _, marker, tail = found_on.rpartition(APPEL_YEAR_MARKER)
    if not marker:
        return UNKNOWN_YEAR
    return tail.rstrip('/') or UNKNOWN_YEAR


def group_documents(docs):
    """Split rows into ``(by_type, appel_by_year)`` mappings of lists.

    Rows whose ``found_on`` URL contains ``appel`` are grouped by volume year;
    all other rows are grouped by their ``type`` column. Fields that
    ``csv.DictReader`` filled with ``None`` (short rows) are treated as empty
    strings; a column missing from the CSV altogether still raises
    ``KeyError``.
    """
    by_type = defaultdict(list)
    appel_by_year = defaultdict(list)
    for doc in docs:
        found_on = doc['found_on'] or ''
        if APPEL_TOKEN in found_on:
            appel_by_year[_appel_year(found_on)].append(doc)
        else:
            by_type[doc['type'] or ''].append(doc)
    return by_type, appel_by_year


def _md_text(text):
    """Escape *text* for use as Markdown link text (single line, no brackets)."""
    # Collapse whitespace/newlines so the list item stays on one line.
    text = ' '.join((text or '').split())
    # Backslash must be escaped first so the escapes added below survive.
    for ch in ('\\', '[', ']'):
        text = text.replace(ch, '\\' + ch)
    return text


def _md_url(url):
    """Percent-encode the characters that would terminate a Markdown link URL."""
    url = (url or '').strip()
    return url.replace(' ', '%20').replace('(', '%28').replace(')', '%29')


def _md_cell(text):
    """Escape *text* for use inside a Markdown table cell."""
    return (text or '').replace('|', '\\|')


def _link_items(group):
    """Return one ``- [title](url)`` list line per document, sorted by title."""
    ordered = sorted(group, key=lambda d: d['title'] or '')
    return [f"- [{_md_text(d['title'])}]({_md_url(d['url'])})" for d in ordered]


def build_markdown(docs, report_date=REPORT_DATE):
    """Render the full Markdown report for *docs* and return it as a string.

    Args:
        docs: Rows as produced by ``load_documents`` (dicts with the keys
            ``type``, ``found_on``, ``title`` and ``url``).
        report_date: Text printed after ``**Datum:**`` in the header.

    Returns:
        The report text: header with totals, a per-type count table, the Appel
        documents grouped by year (newest first, unknown year last) and the
        remaining documents grouped by type.
    """
    by_type, appel_by_year = group_documents(docs)
    appel_count = sum(len(group) for group in appel_by_year.values())
    other_count = len(docs) - appel_count

    lines = [
        "# splcr.cz — Stažené dokumenty",
        "",
        f"**Datum:** {report_date}",
        f"**Celkem:** {len(docs)} dokumentů",
        f"**Appel:** {appel_count} dokumentů",
        f"**Ostatní:** {other_count} dokumentů",
        "",
        "## Přehled po typu",
        "",
        "| Typ | Počet |",
        "|-----|-------|",
    ]
    type_names = sorted(by_type)
    for doc_type in type_names:
        lines.append(f"| {_md_cell(doc_type)} | {len(by_type[doc_type])} |")

    # Appel by year: descending, with the unknown-year group forced to the end.
    lines += ["", "## Apel — Všechny ročníky", ""]
    years = sorted(appel_by_year, key=lambda y: (y != UNKNOWN_YEAR, y), reverse=True)
    for year in years:
        year_docs = appel_by_year[year]
        lines += [f"### Ročník {year} ({len(year_docs)} dokumentů)", ""]
        lines += _link_items(year_docs)
        lines.append("")

    # Other documents, one section per type.
    lines += ["## Ostatní dokumenty", ""]
    for doc_type in type_names:
        type_docs = by_type[doc_type]
        lines += [f"### {doc_type} ({len(type_docs)} dokumentů)", ""]
        lines += _link_items(type_docs)
        lines.append("")

    return "\n".join(lines) + "\n"


def main():
    """Convert the scraped CSV into the Markdown report and report the path."""
    docs = load_documents()
    output_path = Path(OUTPUT_PATH)
    output_path.write_text(build_markdown(docs), encoding='utf-8')
    print(f"OK - Ulozeno: {output_path}")


if __name__ == "__main__":
    main()