diff --git a/20 PrůzkumFotek/analyze_all.py b/20 PrůzkumFotek/analyze_all.py
new file mode 100644
index 0000000..358f2b2
--- /dev/null
+++ b/20 PrůzkumFotek/analyze_all.py
@@ -0,0 +1,171 @@
+"""Print exploratory statistics about the `photos` table to stdout."""
+
+import io
+import sys
+
+import psycopg2
+
+# Re-wrap stdout as UTF-8: the headings below contain Czech diacritics.
+sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')
+
+# No password is passed on purpose: libpq then reads it from the PGPASSWORD
+# environment variable or from ~/.pgpass, so the secret stays out of the repo.
+conn = psycopg2.connect(host='192.168.1.76', port=5432, dbname='fotky_buzalkovi',
+                        user='vladimir.buzalka')
+cur = conn.cursor()
+
+# 1. Duplicates by sha256_pixels
+print('=== DUPLIKÁTY sha256_pixels ===')
+cur.execute('''SELECT sha256_pixels, COUNT(*) as cnt FROM photos
+    WHERE sha256_pixels IS NOT NULL GROUP BY sha256_pixels HAVING COUNT(*) > 1
+    ORDER BY cnt DESC LIMIT 10''')
+for pixel_hash, cnt in cur.fetchall():
+    print(f' {pixel_hash[:16]}... : {cnt}x')
+cur.execute('''SELECT COUNT(*), SUM(cnt) FROM (
+    SELECT COUNT(*) as cnt FROM photos WHERE sha256_pixels IS NOT NULL
+    GROUP BY sha256_pixels HAVING COUNT(*) > 1) x''')
+groups, members = cur.fetchone()
+print(f'Celkem skupin duplikátů: {groups}, fotek v duplikátech: {members}')
+
+# 2. Identical phash
+print('\n=== IDENTICKÉ phash ===')
+cur.execute('''SELECT phash, COUNT(*) as cnt FROM photos
+    WHERE phash IS NOT NULL GROUP BY phash HAVING COUNT(*) > 1
+    ORDER BY cnt DESC LIMIT 10''')
+for phash, cnt in cur.fetchall():
+    print(f' phash={phash}: {cnt}x')
+cur.execute('''SELECT COUNT(*), SUM(cnt) FROM (
+    SELECT COUNT(*) as cnt FROM photos WHERE phash IS NOT NULL
+    GROUP BY phash HAVING COUNT(*) > 1) x''')
+groups, members = cur.fetchone()
+print(f'Celkem skupin: {groups}, fotek: {members}')
+
+# 3. Screenshots
+print('\n=== SCREENSHOTY ===')
+cur.execute('SELECT COUNT(*) FROM photos WHERE is_screenshot = true')
+print(f'is_screenshot=true: {cur.fetchone()[0]}')
+
+# 4. Lenses
+print('\n=== TOP OBJEKTIVY ===')
+cur.execute('''SELECT COALESCE(lens_model, '(neuvedeno)') as lens, COUNT(*) as cnt
+    FROM photos GROUP BY lens ORDER BY cnt DESC LIMIT 15''')
+for lens, cnt in cur.fetchall():
+    print(f' {cnt:>6} {lens}')
+
+# 5. ISO distribution
+print('\n=== ISO DISTRIBUCE ===')
+cur.execute('''SELECT iso, COUNT(*) as cnt FROM photos WHERE iso IS NOT NULL
+    GROUP BY iso ORDER BY cnt DESC LIMIT 15''')
+for iso, cnt in cur.fetchall():
+    print(f' ISO {iso:>6}: {cnt}')
+
+# 6. Aperture
+print('\n=== CLONA (aperture) TOP ===')
+cur.execute('''SELECT aperture, COUNT(*) as cnt FROM photos WHERE aperture IS NOT NULL
+    GROUP BY aperture ORDER BY cnt DESC LIMIT 15''')
+for aperture, cnt in cur.fetchall():
+    print(f' f/{aperture}: {cnt}')
+
+# 7. Exposure time
+print('\n=== EXPOZIČNÍ ČAS TOP ===')
+cur.execute('''SELECT exposure_time, COUNT(*) as cnt FROM photos WHERE exposure_time IS NOT NULL
+    GROUP BY exposure_time ORDER BY cnt DESC LIMIT 15''')
+for exposure, cnt in cur.fetchall():
+    print(f' {exposure}: {cnt}')
+
+# 8. Most frequent GPS locations
+print('\n=== GPS TOP LOKACE (zaokrouhleno na 0.1 stupne) ===')
+cur.execute('''SELECT ROUND(gps_lat::numeric, 1) as lat, ROUND(gps_lon::numeric, 1) as lon, COUNT(*) as cnt
+    FROM photos WHERE gps_lat IS NOT NULL AND gps_lon IS NOT NULL
+    GROUP BY lat, lon ORDER BY cnt DESC LIMIT 20''')
+for lat, lon, cnt in cur.fetchall():
+    print(f' [{lat}, {lon}]: {cnt} fotek')
+
+cur.execute('''SELECT MIN(gps_lat), MAX(gps_lat), MIN(gps_lon), MAX(gps_lon)
+    FROM photos WHERE gps_lat IS NOT NULL''')
+lat_min, lat_max, lon_min, lon_max = cur.fetchone()
+print(f' Rozsah Lat: {lat_min} .. {lat_max}')
+print(f' Rozsah Lon: {lon_min} .. {lon_max}')
+
+# 9. Megapixels by year
+print('\n=== PRUMERNE MEGAPIXELY PO LETECH ===')
+cur.execute('''SELECT EXTRACT(YEAR FROM taken_at)::INT as rok,
+    ROUND(AVG(megapixels)::numeric, 1) as avg_mp,
+    ROUND(MAX(megapixels)::numeric, 1) as max_mp, COUNT(*) as cnt
+    FROM photos WHERE taken_at IS NOT NULL AND megapixels IS NOT NULL
+    GROUP BY rok ORDER BY rok''')
+for rok, avg_mp, max_mp, cnt in cur.fetchall():
+    print(f' {rok}: avg={avg_mp} MP, max={max_mp} MP ({cnt} fotek)')
+
+# 10. File formats
+print('\n=== FORMATY ===')
+cur.execute('''SELECT COALESCE(file_ext, '(none)') as ext, COUNT(*) as cnt
+    FROM photos GROUP BY ext ORDER BY cnt DESC''')
+for ext, cnt in cur.fetchall():
+    print(f' {ext}: {cnt}')
+
+# 11. Transparency
+print('\n=== PRUHLEDNOST ===')
+cur.execute('SELECT COUNT(*) FROM photos WHERE has_transparency = true')
+print(f' S pruhlednosti: {cur.fetchone()[0]}')
+
+# 12. Colour modes
+print('\n=== BAREVNE MODY ===')
+cur.execute('''SELECT COALESCE(mode, '(none)') as m, COUNT(*) as cnt
+    FROM photos GROUP BY m ORDER BY cnt DESC''')
+for mode, cnt in cur.fetchall():
+    print(f' {mode}: {cnt}')
+
+# 13. Photos without a camera model - file-name patterns
+print('\n=== NEZNAME FOTKY 2015-2016 (bez kamery) - vzory nazvu ===')
+cur.execute('''SELECT file_name FROM photos
+    WHERE camera_model IS NULL AND EXTRACT(YEAR FROM taken_at) BETWEEN 2015 AND 2016
+    LIMIT 30''')
+for (file_name,) in cur.fetchall():
+    print(f' {file_name}')
+
+print('\n=== NEZNAME FOTKY 2022 - vzory nazvu ===')
+cur.execute('''SELECT file_name FROM photos
+    WHERE camera_model IS NULL AND EXTRACT(YEAR FROM taken_at) = 2022
+    LIMIT 30''')
+for (file_name,) in cur.fetchall():
+    print(f' {file_name}')
+
+print('\n=== 2022 PREFIXES ===')
+cur.execute('''SELECT LEFT(file_name, 10) as prefix, COUNT(*) as cnt FROM photos
+    WHERE camera_model IS NULL AND EXTRACT(YEAR FROM taken_at) = 2022
+    GROUP BY prefix ORDER BY cnt DESC LIMIT 15''')
+for prefix, cnt in cur.fetchall():
+    print(f' {prefix}: {cnt}')
+
+# 14. Months
+print('\n=== FOTKY PO MESICICH ===')
+nazvy = ['','Leden','Unor','Brezen','Duben','Kveten','Cerven','Cervenec','Srpen','Zari','Rijen','Listopad','Prosinec']
+cur.execute('''SELECT EXTRACT(MONTH FROM taken_at)::INT as mesic, COUNT(*) as cnt
+    FROM photos WHERE taken_at IS NOT NULL GROUP BY mesic ORDER BY mesic''')
+for mesic, cnt in cur.fetchall():
+    print(f' {nazvy[mesic]}: {cnt}')
+
+# 15. 
Days of the week
+print('\n=== FOTKY PO DNECH V TYDNU ===')
+dny = ['Nedele','Pondeli','Utery','Streda','Ctvrtek','Patek','Sobota']
+cur.execute('''SELECT EXTRACT(DOW FROM taken_at)::INT as den, COUNT(*) as cnt
+    FROM photos WHERE taken_at IS NOT NULL GROUP BY den ORDER BY den''')
+for dow, cnt in cur.fetchall():
+    print(f' {dny[dow]}: {cnt}')
+
+# 16. Hours
+print('\n=== FOTKY PO HODINACH ===')
+cur.execute('''SELECT EXTRACT(HOUR FROM taken_at)::INT as hod, COUNT(*) as cnt
+    FROM photos WHERE taken_at IS NOT NULL GROUP BY hod ORDER BY hod''')
+for hod, cnt in cur.fetchall():
+    print(f' {hod:02d}:00 - {cnt}')
+
+# 17. Busiest days (events)
+print('\n=== TOP 20 DNU (nejvic fotek = udalosti) ===')
+cur.execute('''SELECT taken_at::date as den, COUNT(*) as cnt
+    FROM photos WHERE taken_at IS NOT NULL GROUP BY den ORDER BY cnt DESC LIMIT 20''')
+for day, cnt in cur.fetchall():
+    print(f' {day}: {cnt} fotek')
+
+conn.close()
diff --git a/20 PrůzkumFotek/report.py b/20 PrůzkumFotek/report.py
new file mode 100644
index 0000000..9cf27b6
--- /dev/null
+++ b/20 PrůzkumFotek/report.py
@@ -0,0 +1,375 @@
+import streamlit as st
+import psycopg2
+import pandas as pd
+import plotly.express as px
+import plotly.graph_objects as go
+
+st.set_page_config(page_title="FotkyBuzalkovi - Report", layout="wide", page_icon="📷")
+
+@st.cache_resource
+def get_conn():
+    """Open the PostgreSQL connection; st.cache_resource reuses it across reruns.
+
+    No password is passed on purpose: libpq then reads it from the PGPASSWORD
+    environment variable or from ~/.pgpass, so the secret stays out of the repo.
+    """
+    conn = psycopg2.connect(
+        host="192.168.1.76", port=5432, dbname="fotky_buzalkovi",
+        user="vladimir.buzalka",
+    )
+    # Read-only report: autocommit keeps the cached connection from idling in an
+    # open transaction and from staying stuck in "current transaction is aborted"
+    # after a single failed query.
+    conn.autocommit = True
+    return conn
+
+
+def q(sql, params=None):
+    """Run `sql` with optional `params` and return the rows as a DataFrame."""
+    return pd.read_sql(sql, get_conn(), params=params)
+
+
+st.title("📷 FotkyBuzalkovi — Průzkum dat")
+
+# --- Overall statistics ---
+st.header("Celkové statistiky")
+c1, c2, c3, c4 = st.columns(4)
+counts = q("""
+    SELECT
+        (SELECT COUNT(*) FROM photos) as photos,
+        (SELECT COUNT(*) FROM photos WHERE exif_raw IS NOT NULL AND exif_raw != '{}') as s_exif,
+        (SELECT COUNT(*) FROM photos WHERE gps_lat IS NOT NULL) as s_gps,
+        (SELECT COUNT(*) FROM photos WHERE camera_model IS NOT NULL) as s_camera
+""").iloc[0]
+c1.metric("Celkem fotek", f"{counts['photos']:,}")
+c2.metric("S EXIF daty", f"{counts['s_exif']:,}")
+c3.metric("S GPS", f"{counts['s_gps']:,}")
+c4.metric("S kamerou", f"{counts['s_camera']:,}")
+
+# --- Backup pipeline ---
+st.subheader("Zálohovací pipeline (sběr fotek)")
+z1, z2, z3 = st.columns(3)
+zcounts = q("""
+    SELECT
+        (SELECT COUNT(*) FROM zaloha_obrazku) as zalohy,
+        (SELECT COUNT(*) FROM zdrojove_soubory) as zdroje,
+        (SELECT COUNT(*) FROM zdrojove_soubory) - (SELECT COUNT(*) FROM zaloha_obrazku) as duplikaty
+""").iloc[0]
+z1.metric("Unikátních záloh", f"{zcounts['zalohy']:,}")
+z2.metric("Zdrojových souborů", f"{zcounts['zdroje']:,}")
+z3.metric("Duplikátních výskytů", f"{zcounts['duplikaty']:,}")
+
+st.divider()
+
+# --- Photos by year ---
+st.header("📅 Fotky po letech")
+df_years = q("""
+    SELECT EXTRACT(YEAR FROM taken_at)::INT as rok, COUNT(*) as pocet
+    FROM photos WHERE taken_at IS NOT NULL
+    GROUP BY rok ORDER BY rok
+""")
+fig = px.bar(df_years, x="rok", y="pocet", text="pocet",
+             labels={"rok": "Rok", "pocet": "Počet fotek"})
+fig.update_traces(textposition="outside", texttemplate="%{text:,}")
+fig.update_layout(height=450)
+st.plotly_chart(fig, use_container_width=True)
+
+# --- Cameras by year ---
+st.header("📸 Fotoaparáty po letech")
+df_cam = q("""
+    SELECT EXTRACT(YEAR FROM taken_at)::INT as rok,
+           COALESCE(camera_model, '(neznámý)') as model,
+           COUNT(*) as pocet
+    FROM photos WHERE taken_at IS NOT NULL
+    GROUP BY rok, model ORDER BY rok, pocet DESC
+""")
+
+# Preselect the fifth-newest year; clamp at 0 because st.selectbox rejects a
+# negative index, which `len(years) - 5` would be with fewer than five years.
+years = sorted(df_cam["rok"].unique())
+selected_year = st.selectbox("Vyber rok:", years, index=max(len(years) - 5, 0))
+df_year_cam = df_cam[df_cam["rok"] == selected_year].head(15)
+fig_cam = px.bar(df_year_cam, x="model", y="pocet", text="pocet",
+                 labels={"model": "Fotoaparát", "pocet": "Počet fotek"},
+                 title=f"Fotoaparáty v roce {selected_year}")
+fig_cam.update_traces(textposition="outside")
+fig_cam.update_layout(xaxis_tickangle=-45, height=500)
+st.plotly_chart(fig_cam, use_container_width=True)
+
+# Heatmap of cameras × years (the 15 most used cameras overall)
+st.subheader("Heatmapa: top kamery × roky")
+top_cameras = q("""
+    SELECT camera_model, COUNT(*) as cnt FROM photos
+    WHERE camera_model IS NOT NULL
+    GROUP BY camera_model ORDER BY cnt DESC LIMIT 15
+""")["camera_model"].tolist()
+
+df_heat = df_cam[df_cam["model"].isin(top_cameras)].pivot_table(
+    index="model", columns="rok", values="pocet", fill_value=0
+)
+fig_heat = px.imshow(df_heat, labels=dict(x="Rok", y="Fotoaparát", color="Fotek"),
+                     aspect="auto", color_continuous_scale="YlOrRd")
+fig_heat.update_layout(height=500)
+st.plotly_chart(fig_heat, use_container_width=True)
+
+st.divider()
+
+# --- Duplicates: SUM() is NUMERIC (NULL with no groups) in PostgreSQL, so COALESCE + ::BIGINT keep the metrics integers ---
+st.header("🔄 Duplikáty")
+d1, d2 = st.columns(2)
+
+with d1:
+    st.subheader("Identické pixely (sha256_pixels)")
+    df_dup_px = q("""
+        SELECT COUNT(*) as skupin, COALESCE(SUM(cnt), 0)::BIGINT as fotek FROM (
+            SELECT COUNT(*) as cnt FROM photos WHERE sha256_pixels IS NOT NULL
+            GROUP BY sha256_pixels HAVING COUNT(*) > 1
+        ) x
+    """).iloc[0]
+    st.metric("Skupin duplikátů", f"{int(df_dup_px['skupin']):,}")  # int(): never "1,234.0"
+    st.metric("Fotek v duplikátech", f"{int(df_dup_px['fotek']):,}")
+
+with d2:
+    st.subheader("Vizuálně podobné (phash)")
+    df_dup_ph = q("""
+        SELECT COUNT(*) as skupin, COALESCE(SUM(cnt), 0)::BIGINT as fotek FROM (
+            SELECT COUNT(*) as cnt FROM photos WHERE phash IS NOT NULL
+            GROUP BY phash HAVING COUNT(*) > 1
+        ) x
+    """).iloc[0]
+    st.metric("Skupin podobných", f"{int(df_dup_ph['skupin']):,}")
+    st.metric("Fotek v podobných skupinách", f"{int(df_dup_ph['fotek']):,}")
+
+st.divider()
+
+# --- GPS map ---
+st.header("🗺️ GPS lokace")
+df_gps = q("""
+    SELECT gps_lat as lat, gps_lon as lon
+    FROM photos WHERE gps_lat IS NOT NULL AND gps_lon IS NOT NULL
+""")
+if not df_gps.empty:
+    df_gps["lat"] = df_gps["lat"].astype(float)
+    df_gps["lon"] = df_gps["lon"].astype(float)
+    st.map(df_gps, size=2)
+
+    st.subheader("Top lokace (zaokrouhleno na 0.1°)")
+    df_gps_top = q("""
+        SELECT ROUND(gps_lat::numeric, 1) as lat, ROUND(gps_lon::numeric, 1) as lon,
+               COUNT(*) as pocet
+        FROM photos WHERE gps_lat IS NOT NULL AND gps_lon IS NOT NULL
+        GROUP BY lat, lon ORDER BY pocet DESC LIMIT 20
+    """)
+    st.dataframe(df_gps_top, use_container_width=True)
+
+st.divider()
+
+# --- Technical parameters ---
+st.header("⚙️ Technické parametry")
+tab_iso, tab_clona, tab_exp, tab_lens = st.tabs(["ISO", "Clona", "Expoziční čas", "Objektivy"])
+
+with tab_iso:
+    df_iso = q("""
+        SELECT iso, COUNT(*) as pocet FROM photos WHERE iso IS NOT NULL
+        GROUP BY iso ORDER BY pocet DESC LIMIT 20
+    """)
+    fig_iso = px.bar(df_iso, x="iso", y="pocet", text="pocet",
+                     labels={"iso": "ISO", "pocet": "Počet fotek"})
+    fig_iso.update_traces(textposition="outside")
+    st.plotly_chart(fig_iso, use_container_width=True)
+
+with tab_clona:
+    df_ap = q("""
+        SELECT aperture, COUNT(*) as pocet FROM photos WHERE aperture IS NOT NULL
+        GROUP BY aperture ORDER BY pocet DESC LIMIT 20
+    """)
+    df_ap["label"] = "f/" + df_ap["aperture"].astype(str)
+    fig_ap = px.bar(df_ap, x="label", y="pocet", text="pocet",
+                    labels={"label": "Clona", "pocet": "Počet fotek"})
+    fig_ap.update_traces(textposition="outside")
+    st.plotly_chart(fig_ap, use_container_width=True)
+
+with tab_exp:
+    df_exp = q("""
+        SELECT exposure_time, COUNT(*) as pocet FROM photos WHERE exposure_time IS NOT NULL
+        GROUP BY exposure_time ORDER BY pocet DESC LIMIT 20
+    """)
+    fig_exp = px.bar(df_exp, x="exposure_time", y="pocet", text="pocet",
+                     labels={"exposure_time": "Expoziční čas", "pocet": "Počet fotek"})
+    fig_exp.update_traces(textposition="outside")
+    fig_exp.update_layout(xaxis_tickangle=-45)
+    st.plotly_chart(fig_exp, use_container_width=True)
+
+with tab_lens:
+    df_lens = q("""
+        SELECT COALESCE(lens_model, '(neuvedeno)') as objektiv, COUNT(*) as pocet
+        FROM photos GROUP BY objektiv ORDER BY pocet DESC LIMIT 15
+    """)
+    st.dataframe(df_lens, use_container_width=True)
+
+st.divider()
+
+# --- 
Resolution by year ---
+st.header("📐 Megapixely po letech")
+df_mp = q("""
+    SELECT EXTRACT(YEAR FROM taken_at)::INT as rok,
+           ROUND(AVG(megapixels)::numeric, 1) as prumer,
+           ROUND(MAX(megapixels)::numeric, 1) as maximum
+    FROM photos WHERE taken_at IS NOT NULL AND megapixels IS NOT NULL
+    GROUP BY rok ORDER BY rok
+""")
+fig_mp = go.Figure()
+fig_mp.add_trace(go.Scatter(x=df_mp["rok"], y=df_mp["prumer"], mode="lines+markers", name="Průměr MP"))
+fig_mp.add_trace(go.Scatter(x=df_mp["rok"], y=df_mp["maximum"], mode="lines+markers", name="Maximum MP"))
+fig_mp.update_layout(yaxis_title="Megapixely", xaxis_title="Rok", height=400)
+st.plotly_chart(fig_mp, use_container_width=True)
+
+st.divider()
+
+# --- File formats and colour modes ---
+st.header("🎨 Formáty a barvy")
+f1, f2 = st.columns(2)
+with f1:
+    st.subheader("Přípony")
+    df_ext = q("""
+        SELECT COALESCE(file_ext, '(none)') as pripona, COUNT(*) as pocet
+        FROM photos GROUP BY pripona ORDER BY pocet DESC
+    """)
+    st.dataframe(df_ext, use_container_width=True)
+
+with f2:
+    st.subheader("Barevné módy")
+    df_mode = q("""
+        SELECT COALESCE(mode, '(none)') as mod, COUNT(*) as pocet
+        FROM photos GROUP BY mod ORDER BY pocet DESC
+    """)
+    fig_mode = px.pie(df_mode, values="pocet", names="mod")
+    st.plotly_chart(fig_mode, use_container_width=True)
+
+st.divider()
+
+# --- Photos without a camera model ---
+st.header("❓ Fotky bez kamery — analýza názvů")
+
+tab_2015, tab_2022 = st.tabs(["2015–2016", "2022"])
+
+with tab_2015:
+    df_unk15 = q("""
+        SELECT file_name, file_size, taken_at, taken_at_source
+        FROM photos
+        WHERE camera_model IS NULL AND EXTRACT(YEAR FROM taken_at) BETWEEN 2015 AND 2016
+        ORDER BY taken_at LIMIT 50
+    """)
+    st.dataframe(df_unk15, use_container_width=True)
+    st.info("Přejmenované importním skriptem — vzor: `[NO MODEL] [MD5...]`")
+
+with tab_2022:
+    df_unk22 = q("""
+        SELECT file_name, file_size, taken_at, taken_at_source
+        FROM photos
+        WHERE camera_model IS NULL AND EXTRACT(YEAR FROM taken_at) = 2022
+        ORDER BY taken_at LIMIT 50
+    """)
+    st.dataframe(df_unk22, use_container_width=True)
+
+    df_prefix = q("""
+        SELECT LEFT(file_name, 10) as prefix, COUNT(*) as pocet FROM photos
+        WHERE camera_model IS NULL AND EXTRACT(YEAR FROM taken_at) = 2022
+        GROUP BY prefix ORDER BY pocet DESC LIMIT 10
+    """)
+    st.subheader("Prefixes")
+    st.dataframe(df_prefix, use_container_width=True)
+    st.info("4 194 z 4 210 importováno najednou 25.9.2023 — pravděpodobně hromadný export z iCloudu/Google Photos")
+
+st.divider()
+
+# --- Time patterns ---
+st.header("⏰ Časové vzory")
+
+tab_month, tab_dow, tab_hour, tab_topdays = st.tabs(["Měsíce", "Dny v týdnu", "Hodiny", "Top dny (události)"])
+
+with tab_month:
+    df_month = q("""
+        SELECT EXTRACT(MONTH FROM taken_at)::INT as mesic, COUNT(*) as pocet
+        FROM photos WHERE taken_at IS NOT NULL GROUP BY mesic ORDER BY mesic
+    """)
+    nazvy = {1:'Leden',2:'Únor',3:'Březen',4:'Duben',5:'Květen',6:'Červen',
+             7:'Červenec',8:'Srpen',9:'Září',10:'Říjen',11:'Listopad',12:'Prosinec'}
+    df_month["nazev"] = df_month["mesic"].map(nazvy)
+    fig_m = px.bar(df_month, x="nazev", y="pocet", text="pocet",
+                   labels={"nazev": "Měsíc", "pocet": "Počet fotek"})
+    fig_m.update_traces(textposition="outside")
+    st.plotly_chart(fig_m, use_container_width=True)
+
+with tab_dow:
+    df_dow = q("""
+        SELECT EXTRACT(DOW FROM taken_at)::INT as den, COUNT(*) as pocet
+        FROM photos WHERE taken_at IS NOT NULL GROUP BY den ORDER BY den
+    """)
+    dny = {0:'Neděle',1:'Pondělí',2:'Úterý',3:'Středa',4:'Čtvrtek',5:'Pátek',6:'Sobota'}
+    df_dow["nazev"] = df_dow["den"].map(dny)
+    fig_d = px.bar(df_dow, x="nazev", y="pocet", text="pocet",
+                   labels={"nazev": "Den", "pocet": "Počet fotek"})
+    fig_d.update_traces(textposition="outside")
+    st.plotly_chart(fig_d, use_container_width=True)
+
+with tab_hour:
+    df_hour = q("""
+        SELECT EXTRACT(HOUR FROM taken_at)::INT as hodina, COUNT(*) as pocet
+        FROM photos WHERE taken_at IS NOT NULL GROUP BY hodina ORDER BY hodina
+    """)
+    fig_h = px.bar(df_hour, x="hodina", y="pocet", text="pocet",
+                   labels={"hodina": "Hodina", "pocet": "Počet fotek"})
+    fig_h.update_traces(textposition="outside", texttemplate="%{text:,}")
+    fig_h.update_layout(xaxis=dict(dtick=1))
+    st.plotly_chart(fig_h, use_container_width=True)
+
+with tab_topdays:
+    df_topdays = q("""
+        SELECT taken_at::date as den, COUNT(*) as pocet
+        FROM photos WHERE taken_at IS NOT NULL
+        GROUP BY den ORDER BY pocet DESC LIMIT 30
+    """)
+    fig_td = px.bar(df_topdays, x="den", y="pocet", text="pocet",
+                    labels={"den": "Datum", "pocet": "Počet fotek"})
+    fig_td.update_traces(textposition="outside")
+    fig_td.update_layout(xaxis_tickangle=-45, height=500)
+    st.plotly_chart(fig_td, use_container_width=True)
+
+st.divider()
+
+# --- EXIF coverage: share of photos that have each metadata field filled in ---
+st.header("📊 EXIF pokrytí")
+df_coverage = q("""
+    SELECT
+        COUNT(*) FILTER (WHERE exif_raw IS NOT NULL AND exif_raw != '{}') as s_exif,
+        COUNT(*) FILTER (WHERE taken_at IS NOT NULL) as s_taken_at,
+        COUNT(*) FILTER (WHERE camera_model IS NOT NULL) as s_camera,
+        COUNT(*) FILTER (WHERE iso IS NOT NULL) as s_iso,
+        COUNT(*) FILTER (WHERE gps_lat IS NOT NULL) as s_gps,
+        COUNT(*) FILTER (WHERE aperture IS NOT NULL) as s_aperture,
+        COUNT(*) FILTER (WHERE lens_model IS NOT NULL) as s_lens,
+        COUNT(*) as celkem
+    FROM photos
+""").iloc[0]
+
+categories = ["EXIF data", "Datum pořízení", "Model kamery", "ISO", "Clona", "GPS", "Objektiv"]
+values = [  # same order as `categories`
+    int(df_coverage["s_exif"]), int(df_coverage["s_taken_at"]),
+    int(df_coverage["s_camera"]), int(df_coverage["s_iso"]),
+    int(df_coverage["s_aperture"]), int(df_coverage["s_gps"]),
+    int(df_coverage["s_lens"])
+]
+total = int(df_coverage["celkem"])
+pct = [round(v / total * 100, 1) if total else 0.0 for v in values]  # empty table: 0 %, no ZeroDivisionError
+
+fig_cov = go.Figure(go.Bar(
+    x=pct, y=categories, orientation='h',
+    text=[f"{v:,} ({p}%)" for v, p in zip(values, pct)],
+    textposition="auto"
+))
+fig_cov.update_layout(xaxis_title="% fotek", height=350)
+st.plotly_chart(fig_cov, use_container_width=True)
+
+st.divider()
+st.caption("FotkyBuzalkovi — data z PostgreSQL 192.168.1.76 / fotky_buzalkovi") diff --git a/SCHEMA.md b/SCHEMA.md new file mode 100644 index 0000000..557d1ae --- /dev/null +++ b/SCHEMA.md @@ -0,0 +1,171 @@ +# Databázové schéma — fotky_buzalkovi + +PostgreSQL 192.168.1.76:5432, databáze `fotky_buzalkovi`. + +--- + +## Skupina 1: Zpracované fotky + +Tyto tabulky obsahují naparsované informace o fotkách — EXIF, hashe, metadata, tagy. +Jsou základem pro veškerou další práci (vyhledávání, deduplikace, organizace). + +### photos (85 833 záznamů) + +Hlavní tabulka. Každý řádek = jedna unikátní fotka identifikovaná hashem `sha256_file`. + +| Sloupec | Typ | Nullable | Default | Popis | +|---------|-----|----------|---------|-------| +| **id** | BIGSERIAL | NO | autoincrement | PK | +| **sha256_file** | CHAR(64) | NO | — | SHA-256 celého souboru (UNIQUE) | +| sha256_pixels | CHAR(64) | YES | — | SHA-256 pixelových dat (odhalí změnu jen v metadatech) | +| phash | BIGINT | YES | — | Perceptuální hash (vizuální podobnost) | +| dhash | BIGINT | YES | — | Difference hash (vizuální podobnost) | +| **file_path** | VARCHAR(2000) | NO | — | Absolutní cesta k souboru | +| file_path_relative | VARCHAR(2000) | YES | — | Relativní cesta | +| **file_name** | VARCHAR(500) | NO | — | Název souboru | +| file_stem | VARCHAR(500) | YES | — | Název bez přípony | +| file_ext | VARCHAR(20) | YES | — | Přípona (.jpg, .png, …) | +| file_size | BIGINT | YES | — | Velikost v bajtech | +| mime_type | VARCHAR(50) | YES | — | MIME typ (image/jpeg, …) | +| format | VARCHAR(20) | YES | — | Formát obrázku (JPEG, PNG, …) | +| mode | VARCHAR(20) | YES | — | Barevný mód (RGB, L, RGBA, …) | +| width | INT | YES | — | Šířka v pixelech | +| height | INT | YES | — | Výška v pixelech | +| megapixels | NUMERIC | YES | — | Rozlišení v megapixelech | +| has_transparency | BOOLEAN | YES | false | Má alfa kanál | +| icc_profile | BOOLEAN | YES | false | Obsahuje ICC profil | +| embedded_thumbnail | BOOLEAN | YES | 
false | Obsahuje vložený náhled | +| taken_at | TIMESTAMPTZ | YES | — | Datum pořízení fotky | +| taken_at_source | VARCHAR(20) | YES | — | Zdroj datumu (exif / mtime / …) | +| mtime | TIMESTAMPTZ | YES | — | Datum poslední modifikace souboru | +| collected_at | TIMESTAMPTZ | YES | — | Datum sběru/importu do pipeline | +| camera_make | VARCHAR(100) | YES | — | Výrobce fotoaparátu | +| camera_model | VARCHAR(255) | YES | — | Model fotoaparátu | +| lens_model | VARCHAR(255) | YES | — | Model objektivu | +| iso | INT | YES | — | ISO citlivost | +| aperture | NUMERIC | YES | — | Clona (f/2.8, …) | +| exposure_time | VARCHAR(30) | YES | — | Expoziční čas (1/250, …) | +| focal_length_mm | NUMERIC | YES | — | Ohnisková vzdálenost v mm | +| gps_lat | NUMERIC | YES | — | GPS šířka | +| gps_lon | NUMERIC | YES | — | GPS délka | +| gps_altitude | NUMERIC | YES | — | GPS nadmořská výška | +| is_screenshot | BOOLEAN | YES | false | Detekováno jako screenshot | +| face_count | INT | YES | — | Počet detekovaných obličejů | +| exif_raw | JSONB | YES | — | Kompletní surová EXIF data | +| iptc_raw | JSONB | YES | — | Kompletní surová IPTC data | +| xmp_raw | JSONB | YES | — | Kompletní surová XMP data | +| imported_at | TIMESTAMPTZ | YES | now() | Kdy byl záznam vložen do DB | +| processing_status | VARCHAR(50) | YES | 'pending' | Stav zpracování | + +**Indexy:** +- `photos_pkey` — PK (id) +- `photos_sha256_file_key` — UNIQUE (sha256_file) +- `idx_photos_sha256_pixels` — (sha256_pixels) +- `idx_photos_phash` — (phash) +- `idx_photos_taken_at` — (taken_at) +- `idx_photos_camera_model` — (camera_model) +- `idx_photos_file_name` — (file_name) +- `idx_photos_file_ext` — (file_ext) +- `idx_photos_exif_gin` — GIN (exif_raw) + +--- + +### tags + +Hierarchická tabulka tagů. Podporuje stromovou strukturu přes `parent_tag_id`. 
+ +| Sloupec | Typ | Nullable | Default | Popis | +|---------|-----|----------|---------|-------| +| **id** | SERIAL | NO | autoincrement | PK | +| **name** | VARCHAR(100) | NO | — | Název tagu | +| parent_tag_id | INT | YES | — | Rodičovský tag (FK → tags.id) | + +**Constrainty:** +- PK (id) +- UNIQUE (name, parent_tag_id) — stejný název může existovat pod různými rodiči +- FK parent_tag_id → tags(id) + +--- + +### photo_tags + +Vazební tabulka M:N mezi `photos` a `tags`. + +| Sloupec | Typ | Nullable | Default | Popis | +|---------|-----|----------|---------|-------| +| **photo_id** | BIGINT | NO | — | FK → photos(id) ON DELETE CASCADE | +| **tag_id** | INT | NO | — | FK → tags(id) ON DELETE CASCADE | +| source | VARCHAR(20) | YES | — | Zdroj tagu (manual / auto / …) | +| created_at | TIMESTAMPTZ | YES | now() | Kdy byl tag přiřazen | + +**Constrainty:** +- PK (photo_id, tag_id) +- FK photo_id → photos(id) ON DELETE CASCADE +- FK tag_id → tags(id) ON DELETE CASCADE + +--- + +## Skupina 2: Sběr a záloha fotek + +Tyto tabulky slouží **výhradně** pro proces sběru fotek ze všech počítačů na jedno centrální +úložiště (Tower1). Neobsahují žádné informace o obsahu fotek — jen evidenci, odkud byly +soubory sebrány a kam byly zálohovány. S dalším zpracováním (EXIF, tagy, organizace) nemají +nic společného. + +Skript: `00 PictureCollector/collect_pictures.py` (Linux) / `collect_pictures_windows.py` (Windows) + +### zaloha_obrazku (39 961 záznamů) + +Každý řádek = jeden unikátní soubor fyzicky uložený v záloze (identifikovaný BLAKE3 hashem). 
+ +| Sloupec | Typ | Nullable | Default | Popis | +|---------|-----|----------|---------|-------| +| **id** | SERIAL | NO | autoincrement | PK | +| **blake3_hash** | VARCHAR(64) | NO | — | BLAKE3 hash souboru (UNIQUE) | +| **cesta_zalohy** | TEXT | NO | — | Cesta k záloze na Tower1 | +| **nazev_souboru** | VARCHAR(512) | NO | — | Název souboru | +| velikost | BIGINT | YES | — | Velikost v bajtech | +| datum_kopirovani | TIMESTAMP | YES | now() | Kdy byl soubor zkopírován | + +**Indexy:** +- `zaloha_obrazku_pkey` — PK (id) +- `zaloha_obrazku_blake3_hash_key` — UNIQUE (blake3_hash) +- `idx_zaloha_hash` — (blake3_hash) + +--- + +### zdrojove_soubory (45 090 záznamů) + +Každý řádek = jeden nalezený zdrojový soubor na nějakém počítači. Stejný soubor (stejný +BLAKE3 hash) může mít více záznamů, pokud existuje na různých místech/počítačích. + +| Sloupec | Typ | Nullable | Default | Popis | +|---------|-----|----------|---------|-------| +| **id** | SERIAL | NO | autoincrement | PK | +| **hostname** | VARCHAR(255) | NO | — | Název počítače, kde byl soubor nalezen | +| **cesta_zdroje** | TEXT | NO | — | Původní cesta k souboru | +| **nazev_souboru** | VARCHAR(512) | NO | — | Název souboru | +| velikost | BIGINT | YES | — | Velikost v bajtech | +| datum_nalezeni | TIMESTAMP | YES | now() | Kdy byl soubor nalezen | +| **blake3_hash** | VARCHAR(64) | NO | — | BLAKE3 hash souboru | +| zaloha_id | INT | YES | — | FK → zaloha_obrazku(id) | + +**Constrainty:** +- PK (id) +- UNIQUE (hostname, cesta_zdroje) — každý soubor z každého PC jen jednou +- FK zaloha_id → zaloha_obrazku(id) + +**Indexy:** +- `idx_zdroj_hash` — (blake3_hash) +- `idx_zdroj_zaloha` — (zaloha_id) +- `idx_zdroj_host` — (hostname) + +--- + +## Poznámky + +- Počty záznamů jsou k datu 2026-05-24. +- Tabulka `cameras` z původního `create_schema.py` v DB neexistuje — informace o kameře + jsou přímo ve sloupcích `camera_make` / `camera_model` v tabulce `photos`. +- EXIF parser: ExifRead (Pillow má bug v GPS). 
+- Tabulky skupiny 1 a skupiny 2 zatím nejsou propojené (žádný FK mezi `photos` a `zaloha_obrazku`).