Add Outlook/Soubory/Clario/Feasibility scripts and reports; ignore Incoming, Outlook downloads & profile

This commit is contained in:
2026-06-03 16:15:19 +02:00
parent 61c6aeea23
commit 6c57ab3ae6
36 changed files with 4949 additions and 0 deletions
+80
View File
@@ -0,0 +1,80 @@
# enrich_fulltext_v1.0
**Verze:** 1.0
**Datum:** 2026-06-03
**Skript:** `enrich_fulltext_v1.0.py`
## Účel
Pro každý dokument odkazovaný v MongoDB (`soubory.*`) vytáhne **plný text** a uloží do PostgreSQL s GIN `tsvector` indexem pro fulltext vyhledávání.
## Cíl: PostgreSQL `MongoSoubory`
- **host:** 192.168.1.76:5432
- **db:** `MongoSoubory`
- **user:** vladimir.buzalka
- **extension:** `unaccent`, `pg_trgm`
- **text search config:** `soubory` (= simple + unaccent → case- a diakritika-insensitivní)
## Tabulka `documents`
| sloupec | typ | popis |
|---|---|---|
| id | BIGSERIAL | PK |
| mongo_id | TEXT | ObjectId z Mongo |
| study | TEXT | kolekce v Mongo (`42847922MDD3003` / `77242113UCO3001`) |
| path | TEXT | absolutní cesta (UNIQUE s study) |
| rel_path, name, ext | TEXT | doplňková metadata |
| sha256 | TEXT | pro inkrementální kontrolu |
| size_bytes, mtime | | |
| **body** | TEXT | plný extrahovaný text (max 5 MB) |
| body_length | INT | délka v znacích |
| **tsv** | tsvector GENERATED STORED | `to_tsvector('soubory', body)` |
| extracted_at | TIMESTAMPTZ | čas extrakce |
| extractor_version | TEXT | verze tohoto skriptu |
| ok | BOOLEAN | true pokud extrakce proběhla |
| error | TEXT | chybové hlášení |
**Indexy:** GIN nad `tsv`, GIN trigram nad `name`, btree `sha256`, btree `(study, ext)`.
## Podporované přípony
`pdf`, `docx`, `xlsx`, `xlsm`, `pptx`, `eml`, `msg`, `txt`, `csv`
## Inkrementální chování
Soubor se přeskočí pokud v PG už existuje záznam s:
- shodným `sha256`
- shodnou `extractor_version`
- `ok = true`
Jinak se přeparsuje a UPSERT.
## Limity (skip s `error=too_big_...`)
- PDF nad 500 MB
- XLSX nad 200 MB
- ostatní nad 300 MB
- `body` se vždy ořízne na 5 MB UTF-8
## Příklady dotazů (psql)
```sql
-- fulltext (case+diakritika insensitivní)
SELECT study, name, ts_rank_cd(tsv, q) AS rank,
ts_headline('soubory', body, q, 'MaxFragments=2,MinWords=5,MaxWords=15') AS snippet
FROM documents, plainto_tsquery('soubory', 'amendment 3') q
WHERE tsv @@ q
ORDER BY rank DESC
LIMIT 20;
-- jméno obsahuje podřetězec (ILIKE; urychleno trigram indexem nad `name`)
SELECT study, name FROM documents
WHERE name ILIKE '%protokol%';
-- 10 nejdelších dokumentů (napříč všemi studiemi)
SELECT study, name, body_length
FROM documents
WHERE ok = true
ORDER BY body_length DESC LIMIT 10;
```
## Spuštění
```
python U:\PythonProject\Janssen\Soubory\enrich_fulltext_v1.0.py
```
Průběh tiskne řádek na soubor: `[n/total] OK pdf 2.3MB protokol.pdf | 12340 znaku 'Protocol amendment ...'`
+416
View File
@@ -0,0 +1,416 @@
"""
==============================================================================
Skript: enrich_fulltext_v1.0.py
Verze: 1.0
Datum: 2026-06-03
Autor: vladimir.buzalka
Popis: Vytahne PLNY TEXT z dokumentu odkazovanych v MongoDB (db: soubory)
a ulozi ho do PostgreSQL (db: MongoSoubory) s GIN tsvector
fulltext indexem.
Zdroje:
- MongoDB 192.168.1.76 db=soubory kolekce=42847922MDD3003, 77242113UCO3001
- PostgreSQL 192.168.1.76 db=MongoSoubory tabulka=documents
Podporovane pripony: pdf, docx, xlsx, xlsm, pptx, eml, msg, txt, csv
Inkrementalne: preskoci soubor, kde v PG existuje radek se shodnym
sha256 a extractor_version a ok=true.
Pri prvnim behu sam vytvori tabulku, indexy a textovou konfiguraci
'soubory' (unaccent + simple) - vyhleda case- a diakritika-insensitivni.
==============================================================================
"""
from __future__ import annotations
import email
import email.policy
import sys
import time
import traceback
from datetime import datetime, timezone
from pathlib import Path
import psycopg
from pymongo import MongoClient
# --- configuration ----------------------------------------------------------
MONGO_URI = "mongodb://192.168.1.76:27017"
MONGO_DB = "soubory"
MONGO_COLLECTIONS = ["42847922MDD3003", "77242113UCO3001"]
# NOTE(review): the database password is hard-coded in this DSN and is therefore
# committed to version control. Consider loading it from the environment or a
# secret store, and rotating the password.
PG_DSN = ("host=192.168.1.76 port=5432 dbname=MongoSoubory "
          "user=vladimir.buzalka password=Vlado7309208104++")
EXTRACTOR_VERSION = "1.0"
MAX_TEXT_BYTES = 5 * 1024 * 1024  # at most 5 MB of text per document
MAX_PDF_BYTES = 500 * 1024 * 1024
MAX_XLSX_BYTES = 200 * 1024 * 1024
MAX_GENERIC_BYTES = 300 * 1024 * 1024
# NOTE(review): SUPPORTED is not referenced in the visible part of this script;
# the extension filter is built from the EXTRACTORS keys instead.
SUPPORTED = ("pdf", "docx", "xlsx", "xlsm", "pptx", "eml", "msg", "txt", "csv")
# --- SCHEMA -----------------------------------------------------------------
# DDL executed by main() on every start. Every statement is guarded
# (IF NOT EXISTS / pg_ts_config lookup), so re-running it is harmless.
# The `tsv` column is generated from `body` using the `soubory` text search
# configuration (a copy of `simple` with `unaccent` applied to word tokens).
SCHEMA_SQL = """
CREATE EXTENSION IF NOT EXISTS unaccent;
CREATE EXTENSION IF NOT EXISTS pg_trgm;
DO $$
BEGIN
IF NOT EXISTS (SELECT 1 FROM pg_ts_config WHERE cfgname = 'soubory') THEN
CREATE TEXT SEARCH CONFIGURATION soubory ( COPY = simple );
ALTER TEXT SEARCH CONFIGURATION soubory
ALTER MAPPING FOR hword, hword_part, word
WITH unaccent, simple;
END IF;
END$$;
CREATE TABLE IF NOT EXISTS documents (
id BIGSERIAL PRIMARY KEY,
mongo_id TEXT NOT NULL,
study TEXT NOT NULL,
path TEXT NOT NULL,
rel_path TEXT,
name TEXT,
ext TEXT,
sha256 TEXT NOT NULL,
size_bytes BIGINT,
mtime TIMESTAMPTZ,
body TEXT,
body_length INT,
tsv tsvector GENERATED ALWAYS AS (
to_tsvector('soubory'::regconfig, coalesce(body, ''))
) STORED,
extracted_at TIMESTAMPTZ DEFAULT now(),
extractor_version TEXT,
ok BOOLEAN,
error TEXT,
UNIQUE (study, path)
);
CREATE INDEX IF NOT EXISTS documents_tsv_gin ON documents USING gin(tsv);
CREATE INDEX IF NOT EXISTS documents_name_trgm ON documents USING gin(name gin_trgm_ops);
CREATE INDEX IF NOT EXISTS documents_sha256_idx ON documents(sha256);
CREATE INDEX IF NOT EXISTS documents_study_ext_idx ON documents(study, ext);
"""
# --- EXTRAKTORY (vraci string, max MAX_TEXT_BYTES) --------------------------
def _truncate(s: str) -> str:
if not s:
return ""
b = s.encode("utf-8", errors="replace")
if len(b) <= MAX_TEXT_BYTES:
return s
return b[:MAX_TEXT_BYTES].decode("utf-8", errors="ignore")
def extract_pdf(path: Path) -> str:
    """Return the text of the PDF at *path*, pages joined by newlines.

    An encrypted file is tried with the empty password; if ``decrypt``
    raises, "" is returned. Pages whose text extraction raises are skipped.
    Reading stops once the collected character count exceeds MAX_TEXT_BYTES;
    the final size limit is enforced by ``_truncate``.
    """
    from pypdf import PdfReader

    pdf = PdfReader(str(path))
    if pdf.is_encrypted:
        try:
            pdf.decrypt("")
        except Exception:
            return ""

    page_texts = []
    char_count = 0
    for pdf_page in pdf.pages:
        try:
            text = pdf_page.extract_text() or ""
        except Exception:
            # best effort: one unreadable page must not lose the document
            continue
        page_texts.append(text)
        char_count += len(text)
        if char_count > MAX_TEXT_BYTES:
            break
    return _truncate("\n".join(page_texts))
def extract_docx(path: Path) -> str:
    """Return the text of the DOCX at *path*.

    Non-empty body paragraphs come first, followed by one line per table
    row with the cell texts joined by " | ". The result goes through
    ``_truncate``.
    """
    from docx import Document

    document = Document(str(path))
    lines = [para.text for para in document.paragraphs if para.text]
    lines.extend(
        " | ".join(cell.text for cell in row.cells)
        for table in document.tables
        for row in table.rows
    )
    return _truncate("\n".join(lines))
def extract_xlsx(path: Path) -> str:
    """Return the cell values of the workbook at *path* as tab-separated text.

    Each worksheet contributes a "# <title>" line followed by one line per
    non-blank row (cells joined by tabs, empty cells as ""). Reading stops
    once the collected character count exceeds MAX_TEXT_BYTES; the final
    size limit is enforced by ``_truncate``.

    The workbook is opened read-only and is now closed in a ``finally``
    block, so the file handle is released even when a sheet fails to parse.
    """
    from openpyxl import load_workbook

    wb = load_workbook(str(path), read_only=True, data_only=True)
    parts = []
    total = 0
    try:
        for ws in wb.worksheets:
            parts.append(f"# {ws.title}")
            for row in ws.iter_rows(values_only=True):
                line = "\t".join("" if v is None else str(v) for v in row)
                if line.strip():
                    parts.append(line)
                    total += len(line)
                    if total > MAX_TEXT_BYTES:
                        break
            if total > MAX_TEXT_BYTES:
                break
    finally:
        wb.close()
    return _truncate("\n".join(parts))
def extract_pptx(path: Path) -> str:
    """Return the text of the presentation at *path*.

    Each slide contributes a "# slide <n>" line, one line per non-blank
    paragraph of every shape that has a text frame, and a "[notes] ..." line
    when the slide has speaker notes. The result goes through ``_truncate``.
    """
    from pptx import Presentation

    prs = Presentation(str(path))
    parts = []
    for i, slide in enumerate(prs.slides, 1):
        parts.append(f"# slide {i}")
        for shape in slide.shapes:
            if shape.has_text_frame:
                for para in shape.text_frame.paragraphs:
                    line = "".join(run.text for run in para.runs)
                    if line.strip():
                        parts.append(line)
        if slide.has_notes_slide:
            # notes_text_frame is None when the notes slide has no notes
            # placeholder; previously that raised AttributeError and the
            # whole presentation was recorded as an error.
            notes_frame = slide.notes_slide.notes_text_frame
            notes = notes_frame.text if notes_frame is not None else ""
            if notes:
                parts.append(f"[notes] {notes}")
    return _truncate("\n".join(parts))
def extract_eml(path: Path) -> str:
    """Return the headers and plain-text body of the .eml file at *path*.

    The output starts with the From/To/Cc/Subject/Date headers that are
    present. For a multipart message every ``text/plain`` part without a
    filename is appended; for a single-part message its content is appended.
    The result goes through ``_truncate``.

    Only ``str`` content is kept: a single-part message with a non-text
    content type yields ``bytes`` from ``get_content()``, which previously
    made the final ``join`` raise TypeError and lost the headers as well.
    """
    with path.open("rb") as f:
        msg = email.message_from_binary_file(f, policy=email.policy.default)

    head = []
    for k in ("From", "To", "Cc", "Subject", "Date"):
        v = msg.get(k)
        if v:
            head.append(f"{k}: {v}")
    parts = ["\n".join(head)]

    if msg.is_multipart():
        candidates = [
            part for part in msg.walk()
            if part.get_content_type() == "text/plain" and not part.get_filename()
        ]
    else:
        candidates = [msg]

    for part in candidates:
        try:
            content = part.get_content()
        except Exception:
            # deliberate best effort: an undecodable part (unknown charset,
            # unsupported content type) must not lose the rest of the mail
            continue
        if isinstance(content, str):
            parts.append(content)
    return _truncate("\n\n".join(parts))
def extract_msg(path: Path) -> str:
    """Return the headers and body of the Outlook .msg file at *path*.

    The Subject/From/To/Cc/Date lines are emitted only for values that are
    truthy, followed by a blank line and the message body ("" when absent).
    The result goes through ``_truncate``.
    """
    # the library shares this function's name, hence the local alias
    import extract_msg as msg_lib

    with msg_lib.openMsg(str(path)) as message:
        fields = (
            ("Subject", message.subject),
            ("From", message.sender),
            ("To", message.to),
            ("Cc", message.cc),
            ("Date", message.date),
        )
        head = [f"{label}: {value}" for label, value in fields if value]
        return _truncate("\n".join(head) + "\n\n" + (message.body or ""))
def extract_text(path: Path, max_bytes: int | None = None) -> str:
    """Return the decoded start of the text file at *path*.

    At most *max_bytes* bytes (default: MAX_TEXT_BYTES) are read. Decoding is
    tried as UTF-8 (with optional BOM), then cp1250, then latin-1, which
    accepts any byte sequence.

    Changes against the previous version:
    - only the needed prefix is read instead of the whole file;
    - when the byte limit cuts a multi-byte UTF-8 character in half, the
      incomplete tail is dropped instead of letting the whole file fall
      through to cp1250 and come out as mojibake.
    """
    limit = MAX_TEXT_BYTES if max_bytes is None else max_bytes
    with path.open("rb") as fh:
        data = fh.read(limit + 1)  # one extra byte reveals whether we cut
    truncated = len(data) > limit
    data = data[:limit]

    try:
        return data.decode("utf-8-sig")
    except UnicodeDecodeError as exc:
        # An incomplete UTF-8 sequence is at most 3 bytes long, so an error
        # this close to a cut end is the cut itself, not a foreign encoding.
        if truncated and exc.start >= len(data) - 3:
            return data[:exc.start].decode("utf-8-sig")

    try:
        return data.decode("cp1250")
    except UnicodeDecodeError:
        return data.decode("latin-1")
# Dispatch table: file extension -> (extractor function, size limit in bytes).
# process_collection() builds its Mongo filter from these keys and records a
# file larger than its limit as "too_big" without calling the extractor.
EXTRACTORS = {
    "pdf": (extract_pdf, MAX_PDF_BYTES),
    "docx": (extract_docx, MAX_GENERIC_BYTES),
    "xlsx": (extract_xlsx, MAX_XLSX_BYTES),
    "xlsm": (extract_xlsx, MAX_XLSX_BYTES),
    "pptx": (extract_pptx, MAX_GENERIC_BYTES),
    "eml": (extract_eml, MAX_GENERIC_BYTES),
    "msg": (extract_msg, MAX_GENERIC_BYTES),
    "txt": (extract_text, MAX_GENERIC_BYTES),
    "csv": (extract_text, MAX_GENERIC_BYTES),
}
def _short(s, n=40):
if not s:
return ""
s = str(s).replace("\n", " ").replace("\r", " ").strip()
return s if len(s) <= n else s[:n] + "..."
def _now() -> datetime:
return datetime.now(tz=timezone.utc)
# --- HLAVNI SMYCKA ----------------------------------------------------------
def _build_row(doc: dict, study: str) -> tuple[dict, str, str]:
    """Build the ``documents`` row for one Mongo record and run its extractor.

    Returns ``(row, status, detail)``. *status* is "OK " (text extracted),
    "ERR" (file missing or the extractor raised) or "BIG" (file larger than
    the limit for its extension); *detail* is the text for the progress line.
    """
    ext = doc["ext"]
    extractor, max_bytes = EXTRACTORS[ext]
    path = Path(doc["path"])
    row = {
        "mongo_id": str(doc["_id"]),
        "study": study,
        "path": doc["path"],
        "rel_path": doc.get("rel_path"),
        "name": doc.get("name"),
        "ext": ext,
        "sha256": doc.get("sha256"),
        "size_bytes": doc.get("size_bytes"),
        "mtime": doc.get("mtime"),
        "body": None,
        "body_length": 0,
        "extracted_at": _now(),
        "extractor_version": EXTRACTOR_VERSION,
        "ok": False,
        "error": None,
    }

    if not path.exists():
        row["error"] = "file_missing"
        return row, "ERR", "file_missing"
    if (doc.get("size_bytes") or 0) > max_bytes:
        row["error"] = f"too_big_>{max_bytes}"
        return row, "BIG", f"too_big_>{max_bytes//1024//1024}MB"

    try:
        body = extractor(path) or ""
    except Exception as e:
        # any parser failure is recorded on the row, never raised
        row["error"] = f"{type(e).__name__}: {e}"[:500]
        return row, "ERR", row["error"][:80]

    row["body"] = body if body else None
    row["body_length"] = len(body)
    row["ok"] = True
    return row, "OK ", f"{len(body)} znaku {_short(body, 60)!r}"


def process_collection(pg: psycopg.Connection, mongo_coll, study: str) -> dict:
    """Extract full text for one Mongo collection and upsert it into PostgreSQL.

    A document is skipped when PostgreSQL already holds a row for the same
    path with the same sha256, the current EXTRACTOR_VERSION and ok = true.
    Every other document is processed by ``_build_row`` and written in
    batches of 50 through ``_flush``. One progress line is printed per
    processed document.

    Args:
        pg: open PostgreSQL connection.
        mongo_coll: Mongo collection holding the file records of the study.
        study: study identifier stored in the ``study`` column.

    Returns:
        Counters: study, processed, ok, errors, skipped, too_big.
    """
    # load what PostgreSQL already knows: path -> (sha256, version, ok)
    with pg.cursor() as cur:
        cur.execute(
            "SELECT path, sha256, extractor_version, ok FROM documents WHERE study = %s",
            (study,),
        )
        existing = {row[0]: (row[1], row[2], row[3]) for row in cur.fetchall()}

    # a single filter shared by the count and the scan, so they cannot drift
    query = {"ext": {"$in": list(EXTRACTORS)}, "deleted_at": {"$exists": False}}
    total_pending = mongo_coll.count_documents(query)
    print(f"[{study}] kandidatu v Mongo: {total_pending}")
    cursor = mongo_coll.find(
        query,
        {"_id": 1, "path": 1, "rel_path": 1, "name": 1, "ext": 1,
         "sha256": 1, "size_bytes": 1, "mtime": 1},
        no_cursor_timeout=True,
    )

    counts = {"OK ": 0, "ERR": 0, "BIG": 0}
    processed = skipped = 0
    queue: list[dict] = []
    try:
        for n, doc in enumerate(cursor, 1):
            prev = existing.get(doc["path"])
            if prev and prev[0] == doc.get("sha256") and prev[1] == EXTRACTOR_VERSION and prev[2]:
                skipped += 1
                continue

            row, status, detail = _build_row(doc, study)
            counts[status] += 1
            queue.append(row)
            processed += 1

            size_mb = (doc.get("size_bytes") or 0) / 1024 / 1024
            print(f" [{n:>4}/{total_pending}] {status} {row['ext']:<4} {size_mb:6.1f}MB "
                  f"{Path(doc['path']).name} | {detail}", flush=True)

            if len(queue) >= 50:
                _flush(pg, queue)
                queue.clear()
    finally:
        # the cursor was opened with no_cursor_timeout, so close it explicitly
        cursor.close()
    if queue:
        _flush(pg, queue)
    return {"study": study, "processed": processed, "ok": counts["OK "],
            "errors": counts["ERR"], "skipped": skipped, "too_big": counts["BIG"]}
# Insert-or-update of one document row, keyed by the UNIQUE (study, path)
# constraint. Parameters are named and match the row dict keys built in
# process_collection(). `tsv` is a generated column and is not listed.
UPSERT_SQL = """
INSERT INTO documents
(mongo_id, study, path, rel_path, name, ext, sha256, size_bytes, mtime,
body, body_length, extracted_at, extractor_version, ok, error)
VALUES
(%(mongo_id)s, %(study)s, %(path)s, %(rel_path)s, %(name)s, %(ext)s, %(sha256)s,
%(size_bytes)s, %(mtime)s, %(body)s, %(body_length)s, %(extracted_at)s,
%(extractor_version)s, %(ok)s, %(error)s)
ON CONFLICT (study, path) DO UPDATE SET
mongo_id = EXCLUDED.mongo_id,
rel_path = EXCLUDED.rel_path,
name = EXCLUDED.name,
ext = EXCLUDED.ext,
sha256 = EXCLUDED.sha256,
size_bytes = EXCLUDED.size_bytes,
mtime = EXCLUDED.mtime,
body = EXCLUDED.body,
body_length = EXCLUDED.body_length,
extracted_at = EXCLUDED.extracted_at,
extractor_version = EXCLUDED.extractor_version,
ok = EXCLUDED.ok,
error = EXCLUDED.error
"""
def _flush(pg: psycopg.Connection, rows: list[dict]) -> None:
    """Write *rows* with UPSERT_SQL in one ``executemany`` call and commit."""
    with pg.cursor() as cursor:
        cursor.executemany(UPSERT_SQL, rows)
    pg.commit()
def main() -> int:
    """Run the extraction for every configured collection; return exit code 0.

    Connects to PostgreSQL, applies SCHEMA_SQL, connects to MongoDB, processes
    each collection in MONGO_COLLECTIONS and prints a summary.

    Both connections are now closed in a ``finally`` block; previously the
    PostgreSQL connection leaked when any step raised and the Mongo client
    was never closed at all.
    """
    t0 = time.time()
    print("Pripojuji se k PostgreSQL...")
    pg = psycopg.connect(PG_DSN, connect_timeout=10)
    mongo = None
    try:
        with pg.cursor() as cur:
            cur.execute(SCHEMA_SQL)
        pg.commit()
        print("Schema OK.")

        print("Pripojuji se k MongoDB...")
        mongo = MongoClient(MONGO_URI, serverSelectionTimeoutMS=5000)
        mongo.admin.command("ping")  # fail fast if the server is unreachable
        db = mongo[MONGO_DB]
        print("Mongo OK.")

        results = [process_collection(pg, db[name], name)
                   for name in MONGO_COLLECTIONS]
    finally:
        if mongo is not None:
            mongo.close()
        pg.close()

    print("\n=== SHRNUTI ===")
    for r in results:
        print(f" {r['study']}: processed={r['processed']} ok={r['ok']} "
              f"errors={r['errors']} skipped={r['skipped']} too_big={r['too_big']}")
    print(f"\nCelkem trvalo: {time.time() - t0:.1f} s")
    return 0
if __name__ == "__main__":
    # Script entry point: exit with main()'s return value, print a note on
    # Ctrl+C, and exit with status 1 after dumping the traceback of any
    # other exception.
    try:
        exit_code = main()
    except KeyboardInterrupt:
        print("\nPreruseno uzivatelem")
    except Exception:
        traceback.print_exc()
        sys.exit(1)
    else:
        raise SystemExit(exit_code)
+22
View File
@@ -0,0 +1,22 @@
# enrich_fulltext_v1.1
**Verze:** 1.1
**Datum:** 2026-06-03
**Skript:** `enrich_fulltext_v1.1.py`
## Změny proti v1.0
- **NUL bajty (0x00) v textu** — PG TEXT je odmítá. v1.1 odstraní všechny `\x00` a ostatní controly (kromě `\n \r \t`) ve společné funkci `_clean_for_pg`, navíc bezpečnostní strip i v `_flush` před UPSERT.
- **DOCX fallback** — pokud python-docx hodí výjimku (typicky `"no tr above topmost tr in w:tbl"` u VTMF formulářů s rozbitými tabulkami), v1.1 sáhne přímo do `word/document.xml` v ZIPu a regexem vytáhne text z `<w:t>` elementů. Přijde o strukturu tabulek, ale text zachrání.
- `extractor_version` zvýšena na `1.1` → všechny řádky z v1.0 se přeparsují (původní řádky stejně pravděpodobně chyběly kvůli pádu).
## Vše ostatní
Beze změny proti [v1.0](Trash/enrich_fulltext_v1.0.md):
- Tabulka `documents` v PG `MongoSoubory` (192.168.1.76:5432)
- Text search config `soubory` (simple + unaccent)
- Limity: PDF 500 MB, XLSX 200 MB, ostatní 300 MB; text max 5 MB
- Inkrementálně podle `sha256` + `extractor_version`
## Spuštění
```
python U:\PythonProject\Janssen\Soubory\enrich_fulltext_v1.1.py
```
+457
View File
@@ -0,0 +1,457 @@
"""
==============================================================================
Skript: enrich_fulltext_v1.1.py
Verze: 1.1
Datum: 2026-06-03
Autor: vladimir.buzalka
Popis: Vytahne PLNY TEXT z dokumentu odkazovanych v MongoDB (db: soubory)
a ulozi ho do PostgreSQL (db: MongoSoubory) s GIN tsvector indexem.
Zmeny proti v1.0:
- PG odmita NUL (0x00) bajty v TEXT -> v _truncate se vsechny NULy odstrani
(i jine controly krome \\n \\r \\t)
- DOCX fallback: pokud python-docx selze (typicky "no tr above topmost tr
in w:tbl" u rozbitych tabulek), pokusi se primy raw extract z word/document.xml
pres regex - prijde o strukturu tabulek, ale zachrani text
- drobnost: posunul jsem extractor_version na "1.1" -> stare radky se preparsuji
Cilove ulozeni:
- MongoDB 192.168.1.76 db=soubory kolekce=42847922MDD3003, 77242113UCO3001
- PostgreSQL 192.168.1.76 db=MongoSoubory tabulka=documents
Podporovane pripony: pdf, docx, xlsx, xlsm, pptx, eml, msg, txt, csv
==============================================================================
"""
from __future__ import annotations
import email
import email.policy
import re
import sys
import time
import traceback
import zipfile
from datetime import datetime, timezone
from pathlib import Path
import psycopg
from pymongo import MongoClient
# --- configuration ----------------------------------------------------------
MONGO_URI = "mongodb://192.168.1.76:27017"
MONGO_DB = "soubory"
MONGO_COLLECTIONS = ["42847922MDD3003", "77242113UCO3001"]
# NOTE(review): the database password is hard-coded in this DSN and is therefore
# committed to version control. Consider loading it from the environment or a
# secret store, and rotating the password.
PG_DSN = ("host=192.168.1.76 port=5432 dbname=MongoSoubory "
          "user=vladimir.buzalka password=Vlado7309208104++")
# Bumping this value makes process_collection() re-extract rows stored by
# older versions (the skip test compares extractor_version).
EXTRACTOR_VERSION = "1.1"
MAX_TEXT_BYTES = 5 * 1024 * 1024
MAX_PDF_BYTES = 500 * 1024 * 1024
MAX_XLSX_BYTES = 200 * 1024 * 1024
MAX_GENERIC_BYTES = 300 * 1024 * 1024
# NOTE(review): SUPPORTED is not referenced in the visible part of this script;
# the extension filter is built from the EXTRACTORS keys instead.
SUPPORTED = ("pdf", "docx", "xlsx", "xlsm", "pptx", "eml", "msg", "txt", "csv")
# --- SCHEMA -----------------------------------------------------------------
# DDL executed by main() on every start. Every statement is guarded
# (IF NOT EXISTS / pg_ts_config lookup), so re-running it is harmless.
# The `tsv` column is generated from `body` using the `soubory` text search
# configuration (a copy of `simple` with `unaccent` applied to word tokens).
SCHEMA_SQL = """
CREATE EXTENSION IF NOT EXISTS unaccent;
CREATE EXTENSION IF NOT EXISTS pg_trgm;
DO $$
BEGIN
IF NOT EXISTS (SELECT 1 FROM pg_ts_config WHERE cfgname = 'soubory') THEN
CREATE TEXT SEARCH CONFIGURATION soubory ( COPY = simple );
ALTER TEXT SEARCH CONFIGURATION soubory
ALTER MAPPING FOR hword, hword_part, word
WITH unaccent, simple;
END IF;
END$$;
CREATE TABLE IF NOT EXISTS documents (
id BIGSERIAL PRIMARY KEY,
mongo_id TEXT NOT NULL,
study TEXT NOT NULL,
path TEXT NOT NULL,
rel_path TEXT,
name TEXT,
ext TEXT,
sha256 TEXT NOT NULL,
size_bytes BIGINT,
mtime TIMESTAMPTZ,
body TEXT,
body_length INT,
tsv tsvector GENERATED ALWAYS AS (
to_tsvector('soubory'::regconfig, coalesce(body, ''))
) STORED,
extracted_at TIMESTAMPTZ DEFAULT now(),
extractor_version TEXT,
ok BOOLEAN,
error TEXT,
UNIQUE (study, path)
);
CREATE INDEX IF NOT EXISTS documents_tsv_gin ON documents USING gin(tsv);
CREATE INDEX IF NOT EXISTS documents_name_trgm ON documents USING gin(name gin_trgm_ops);
CREATE INDEX IF NOT EXISTS documents_sha256_idx ON documents(sha256);
CREATE INDEX IF NOT EXISTS documents_study_ext_idx ON documents(study, ext);
"""
# --- HELPERY ----------------------------------------------------------------
# odstrani 0x00 a ostatni controly krome whitespace
_CTRL_RX = re.compile(r"[\x00-\x08\x0b\x0c\x0e-\x1f]")
def _clean_for_pg(s: str) -> str:
if not s:
return ""
return _CTRL_RX.sub("", s)
def _truncate(s: str) -> str:
    """Clean *s* with ``_clean_for_pg`` and limit it to MAX_TEXT_BYTES of UTF-8.

    Empty or None input yields "". Text that fits is returned as cleaned;
    otherwise the encoded bytes are cut at the limit and decoded again,
    dropping any multi-byte character that the cut split in half.
    """
    cleaned = _clean_for_pg(s or "")
    if not cleaned:
        return ""
    encoded = cleaned.encode("utf-8", errors="replace")
    if len(encoded) > MAX_TEXT_BYTES:
        return encoded[:MAX_TEXT_BYTES].decode("utf-8", errors="ignore")
    return cleaned
# --- EXTRAKTORY -------------------------------------------------------------
def extract_pdf(path: Path) -> str:
    """Return the text of the PDF at *path*, pages joined by newlines.

    An encrypted file is tried with the empty password; if ``decrypt``
    raises, "" is returned. Pages whose text extraction raises are skipped.
    Reading stops once the collected character count exceeds MAX_TEXT_BYTES;
    the final size limit is enforced by ``_truncate``.
    """
    from pypdf import PdfReader

    pdf = PdfReader(str(path))
    if pdf.is_encrypted:
        try:
            pdf.decrypt("")
        except Exception:
            return ""

    page_texts = []
    char_count = 0
    for pdf_page in pdf.pages:
        try:
            text = pdf_page.extract_text() or ""
        except Exception:
            # best effort: one unreadable page must not lose the document
            continue
        page_texts.append(text)
        char_count += len(text)
        if char_count > MAX_TEXT_BYTES:
            break
    return _truncate("\n".join(page_texts))
# regex pro DOCX fallback - vytahne <w:t>...</w:t>
_DOCX_WT_RX = re.compile(r"<w:t[^>]*>([^<]*)</w:t>", re.DOTALL)
_DOCX_WP_END_RX = re.compile(r"</w:p>")
def _docx_raw_text(path: Path) -> str:
"""Fallback - cte primo word/document.xml ze ZIPu."""
with zipfile.ZipFile(str(path)) as z:
try:
xml = z.read("word/document.xml").decode("utf-8", errors="replace")
except KeyError:
return ""
xml = _DOCX_WP_END_RX.sub("\n", xml)
return "\n".join(m.group(1) for m in _DOCX_WT_RX.finditer(xml))
def extract_docx(path: Path) -> str:
    """Return the text of the DOCX at *path*.

    The normal path uses python-docx: non-empty body paragraphs, then one
    line per table row with the cell texts joined by " | ". If that raises
    for any reason, the text is taken from the raw XML by
    ``_docx_raw_text`` instead. Both results go through ``_truncate``.
    """
    from docx import Document

    try:
        document = Document(str(path))
        lines = [para.text for para in document.paragraphs if para.text]
        lines.extend(
            " | ".join(cell.text for cell in row.cells)
            for table in document.tables
            for row in table.rows
        )
        structured = _truncate("\n".join(lines))
    except Exception:
        # fallback: raw XML extraction
        return _truncate(_docx_raw_text(path))
    return structured
def extract_xlsx(path: Path) -> str:
    """Return the cell values of the workbook at *path* as tab-separated text.

    Each worksheet contributes a "# <title>" line followed by one line per
    non-blank row (cells joined by tabs, empty cells as ""). Reading stops
    once the collected character count exceeds MAX_TEXT_BYTES; the final
    size limit is enforced by ``_truncate``.

    The workbook is opened read-only and is now closed in a ``finally``
    block, so the file handle is released even when a sheet fails to parse.
    """
    from openpyxl import load_workbook

    wb = load_workbook(str(path), read_only=True, data_only=True)
    parts = []
    total = 0
    try:
        for ws in wb.worksheets:
            parts.append(f"# {ws.title}")
            for row in ws.iter_rows(values_only=True):
                line = "\t".join("" if v is None else str(v) for v in row)
                if line.strip():
                    parts.append(line)
                    total += len(line)
                    if total > MAX_TEXT_BYTES:
                        break
            if total > MAX_TEXT_BYTES:
                break
    finally:
        wb.close()
    return _truncate("\n".join(parts))
def extract_pptx(path: Path) -> str:
    """Return the text of the presentation at *path*.

    Each slide contributes a "# slide <n>" line, one line per non-blank
    paragraph of every shape that has a text frame, and a "[notes] ..." line
    when the slide has speaker notes. The result goes through ``_truncate``.
    """
    from pptx import Presentation

    prs = Presentation(str(path))
    parts = []
    for i, slide in enumerate(prs.slides, 1):
        parts.append(f"# slide {i}")
        for shape in slide.shapes:
            if shape.has_text_frame:
                for para in shape.text_frame.paragraphs:
                    line = "".join(run.text for run in para.runs)
                    if line.strip():
                        parts.append(line)
        if slide.has_notes_slide:
            # notes_text_frame is None when the notes slide has no notes
            # placeholder; previously that raised AttributeError and the
            # whole presentation was recorded as an error.
            notes_frame = slide.notes_slide.notes_text_frame
            notes = notes_frame.text if notes_frame is not None else ""
            if notes:
                parts.append(f"[notes] {notes}")
    return _truncate("\n".join(parts))
def extract_eml(path: Path) -> str:
    """Return the headers and plain-text body of the .eml file at *path*.

    The output starts with the From/To/Cc/Subject/Date headers that are
    present. For a multipart message every ``text/plain`` part without a
    filename is appended; for a single-part message its content is appended.
    The result goes through ``_truncate``.

    Only ``str`` content is kept: a single-part message with a non-text
    content type yields ``bytes`` from ``get_content()``, which previously
    made the final ``join`` raise TypeError and lost the headers as well.
    """
    with path.open("rb") as f:
        msg = email.message_from_binary_file(f, policy=email.policy.default)

    head = []
    for k in ("From", "To", "Cc", "Subject", "Date"):
        v = msg.get(k)
        if v:
            head.append(f"{k}: {v}")
    parts = ["\n".join(head)]

    if msg.is_multipart():
        candidates = [
            part for part in msg.walk()
            if part.get_content_type() == "text/plain" and not part.get_filename()
        ]
    else:
        candidates = [msg]

    for part in candidates:
        try:
            content = part.get_content()
        except Exception:
            # deliberate best effort: an undecodable part (unknown charset,
            # unsupported content type) must not lose the rest of the mail
            continue
        if isinstance(content, str):
            parts.append(content)
    return _truncate("\n\n".join(parts))
def extract_msg(path: Path) -> str:
    """Return the headers and body of the Outlook .msg file at *path*.

    The Subject/From/To/Cc/Date lines are emitted only for values that are
    truthy, followed by a blank line and the message body ("" when absent).
    The result goes through ``_truncate``.
    """
    # the library shares this function's name, hence the local alias
    import extract_msg as msg_lib

    with msg_lib.openMsg(str(path)) as message:
        fields = (
            ("Subject", message.subject),
            ("From", message.sender),
            ("To", message.to),
            ("Cc", message.cc),
            ("Date", message.date),
        )
        head = [f"{label}: {value}" for label, value in fields if value]
        return _truncate("\n".join(head) + "\n\n" + (message.body or ""))
def extract_text(path: Path, max_bytes: int | None = None) -> str:
    """Return the decoded, cleaned start of the text file at *path*.

    At most *max_bytes* bytes (default: MAX_TEXT_BYTES) are read. Decoding is
    tried as UTF-8 (with optional BOM), then cp1250, then latin-1, which
    accepts any byte sequence. The result goes through ``_truncate``.

    Changes against the previous version:
    - only the needed prefix is read instead of the whole file;
    - when the byte limit cuts a multi-byte UTF-8 character in half, the
      incomplete tail is dropped instead of letting the whole file fall
      through to cp1250 and come out as mojibake.
    """
    limit = MAX_TEXT_BYTES if max_bytes is None else max_bytes
    with path.open("rb") as fh:
        data = fh.read(limit + 1)  # one extra byte reveals whether we cut
    truncated = len(data) > limit
    data = data[:limit]

    try:
        text = data.decode("utf-8-sig")
    except UnicodeDecodeError as exc:
        # An incomplete UTF-8 sequence is at most 3 bytes long, so an error
        # this close to a cut end is the cut itself, not a foreign encoding.
        if truncated and exc.start >= len(data) - 3:
            text = data[:exc.start].decode("utf-8-sig")
        else:
            try:
                text = data.decode("cp1250")
            except UnicodeDecodeError:
                text = data.decode("latin-1")
    return _truncate(text)
# Dispatch table: file extension -> (extractor function, size limit in bytes).
# process_collection() builds its Mongo filter from these keys and records a
# file larger than its limit as "too_big" without calling the extractor.
EXTRACTORS = {
    "pdf": (extract_pdf, MAX_PDF_BYTES),
    "docx": (extract_docx, MAX_GENERIC_BYTES),
    "xlsx": (extract_xlsx, MAX_XLSX_BYTES),
    "xlsm": (extract_xlsx, MAX_XLSX_BYTES),
    "pptx": (extract_pptx, MAX_GENERIC_BYTES),
    "eml": (extract_eml, MAX_GENERIC_BYTES),
    "msg": (extract_msg, MAX_GENERIC_BYTES),
    "txt": (extract_text, MAX_GENERIC_BYTES),
    "csv": (extract_text, MAX_GENERIC_BYTES),
}
def _short(s, n=40):
if not s:
return ""
s = str(s).replace("\n", " ").replace("\r", " ").strip()
return s if len(s) <= n else s[:n] + "..."
def _now() -> datetime:
return datetime.now(tz=timezone.utc)
# --- HLAVNI SMYCKA ----------------------------------------------------------
def _build_row(doc: dict, study: str) -> tuple[dict, str, str]:
    """Build the ``documents`` row for one Mongo record and run its extractor.

    Returns ``(row, status, detail)``. *status* is "OK " (text extracted),
    "ERR" (file missing or the extractor raised) or "BIG" (file larger than
    the limit for its extension); *detail* is the text for the progress line.
    """
    ext = doc["ext"]
    extractor, max_bytes = EXTRACTORS[ext]
    path = Path(doc["path"])
    row = {
        "mongo_id": str(doc["_id"]),
        "study": study,
        "path": doc["path"],
        "rel_path": doc.get("rel_path"),
        "name": doc.get("name"),
        "ext": ext,
        "sha256": doc.get("sha256"),
        "size_bytes": doc.get("size_bytes"),
        "mtime": doc.get("mtime"),
        "body": None,
        "body_length": 0,
        "extracted_at": _now(),
        "extractor_version": EXTRACTOR_VERSION,
        "ok": False,
        "error": None,
    }

    if not path.exists():
        row["error"] = "file_missing"
        return row, "ERR", "file_missing"
    if (doc.get("size_bytes") or 0) > max_bytes:
        row["error"] = f"too_big_>{max_bytes}"
        return row, "BIG", f"too_big_>{max_bytes//1024//1024}MB"

    try:
        body = extractor(path) or ""
    except Exception as e:
        # any parser failure is recorded on the row, never raised
        row["error"] = f"{type(e).__name__}: {e}"[:500]
        return row, "ERR", row["error"][:80]

    row["body"] = body if body else None
    row["body_length"] = len(body)
    row["ok"] = True
    return row, "OK ", f"{len(body)} znaku {_short(body, 60)!r}"


def process_collection(pg: psycopg.Connection, mongo_coll, study: str) -> dict:
    """Extract full text for one Mongo collection and upsert it into PostgreSQL.

    A document is skipped when PostgreSQL already holds a row for the same
    path with the same sha256, the current EXTRACTOR_VERSION and ok = true.
    Every other document is processed by ``_build_row`` and written in
    batches of 50 through ``_flush``. One progress line is printed per
    processed document.

    Args:
        pg: open PostgreSQL connection.
        mongo_coll: Mongo collection holding the file records of the study.
        study: study identifier stored in the ``study`` column.

    Returns:
        Counters: study, processed, ok, errors, skipped, too_big.
    """
    # load what PostgreSQL already knows: path -> (sha256, version, ok)
    with pg.cursor() as cur:
        cur.execute(
            "SELECT path, sha256, extractor_version, ok FROM documents WHERE study = %s",
            (study,),
        )
        existing = {row[0]: (row[1], row[2], row[3]) for row in cur.fetchall()}

    # a single filter shared by the count and the scan, so they cannot drift
    query = {"ext": {"$in": list(EXTRACTORS)}, "deleted_at": {"$exists": False}}
    total_pending = mongo_coll.count_documents(query)
    print(f"[{study}] kandidatu v Mongo: {total_pending}")
    cursor = mongo_coll.find(
        query,
        {"_id": 1, "path": 1, "rel_path": 1, "name": 1, "ext": 1,
         "sha256": 1, "size_bytes": 1, "mtime": 1},
        no_cursor_timeout=True,
    )

    counts = {"OK ": 0, "ERR": 0, "BIG": 0}
    processed = skipped = 0
    queue: list[dict] = []
    try:
        for n, doc in enumerate(cursor, 1):
            prev = existing.get(doc["path"])
            if prev and prev[0] == doc.get("sha256") and prev[1] == EXTRACTOR_VERSION and prev[2]:
                skipped += 1
                continue

            row, status, detail = _build_row(doc, study)
            counts[status] += 1
            queue.append(row)
            processed += 1

            size_mb = (doc.get("size_bytes") or 0) / 1024 / 1024
            print(f" [{n:>4}/{total_pending}] {status} {row['ext']:<4} {size_mb:6.1f}MB "
                  f"{Path(doc['path']).name} | {detail}", flush=True)

            if len(queue) >= 50:
                _flush(pg, queue)
                queue.clear()
    finally:
        # the cursor was opened with no_cursor_timeout, so close it explicitly
        cursor.close()
    if queue:
        _flush(pg, queue)
    return {"study": study, "processed": processed, "ok": counts["OK "],
            "errors": counts["ERR"], "skipped": skipped, "too_big": counts["BIG"]}
# Insert-or-update of one document row, keyed by the UNIQUE (study, path)
# constraint. Parameters are named and match the row dict keys built in
# process_collection(). `tsv` is a generated column and is not listed.
UPSERT_SQL = """
INSERT INTO documents
(mongo_id, study, path, rel_path, name, ext, sha256, size_bytes, mtime,
body, body_length, extracted_at, extractor_version, ok, error)
VALUES
(%(mongo_id)s, %(study)s, %(path)s, %(rel_path)s, %(name)s, %(ext)s, %(sha256)s,
%(size_bytes)s, %(mtime)s, %(body)s, %(body_length)s, %(extracted_at)s,
%(extractor_version)s, %(ok)s, %(error)s)
ON CONFLICT (study, path) DO UPDATE SET
mongo_id = EXCLUDED.mongo_id,
rel_path = EXCLUDED.rel_path,
name = EXCLUDED.name,
ext = EXCLUDED.ext,
sha256 = EXCLUDED.sha256,
size_bytes = EXCLUDED.size_bytes,
mtime = EXCLUDED.mtime,
body = EXCLUDED.body,
body_length = EXCLUDED.body_length,
extracted_at = EXCLUDED.extracted_at,
extractor_version = EXCLUDED.extractor_version,
ok = EXCLUDED.ok,
error = EXCLUDED.error
"""
def _flush(pg: psycopg.Connection, rows: list[dict]) -> None:
    """Write *rows* with UPSERT_SQL in one ``executemany`` call and commit.

    As a last safeguard the ``body`` and ``error`` values of every row are
    passed through ``_clean_for_pg`` once more (the rows are modified in
    place) before they are sent to PostgreSQL.
    """
    for row in rows:
        for field in ("body", "error"):
            value = row.get(field)
            if value:
                row[field] = _clean_for_pg(value)
    with pg.cursor() as cursor:
        cursor.executemany(UPSERT_SQL, rows)
    pg.commit()
def main() -> int:
    """Run the extraction for every configured collection; return exit code 0.

    Connects to PostgreSQL, applies SCHEMA_SQL, connects to MongoDB, processes
    each collection in MONGO_COLLECTIONS and prints a summary.

    Both connections are now closed in a ``finally`` block; previously the
    PostgreSQL connection leaked when any step raised and the Mongo client
    was never closed at all.
    """
    t0 = time.time()
    print("Pripojuji se k PostgreSQL...")
    pg = psycopg.connect(PG_DSN, connect_timeout=10)
    mongo = None
    try:
        with pg.cursor() as cur:
            cur.execute(SCHEMA_SQL)
        pg.commit()
        print("Schema OK.")

        print("Pripojuji se k MongoDB...")
        mongo = MongoClient(MONGO_URI, serverSelectionTimeoutMS=5000)
        mongo.admin.command("ping")  # fail fast if the server is unreachable
        db = mongo[MONGO_DB]
        print("Mongo OK.")

        results = [process_collection(pg, db[name], name)
                   for name in MONGO_COLLECTIONS]
    finally:
        if mongo is not None:
            mongo.close()
        pg.close()

    print("\n=== SHRNUTI ===")
    for r in results:
        print(f" {r['study']}: processed={r['processed']} ok={r['ok']} "
              f"errors={r['errors']} skipped={r['skipped']} too_big={r['too_big']}")
    print(f"\nCelkem trvalo: {time.time() - t0:.1f} s")
    return 0
if __name__ == "__main__":
    # Script entry point: exit with main()'s return value, print a note on
    # Ctrl+C, and exit with status 1 after dumping the traceback of any
    # other exception.
    try:
        exit_code = main()
    except KeyboardInterrupt:
        print("\nPreruseno uzivatelem")
    except Exception:
        traceback.print_exc()
        sys.exit(1)
    else:
        raise SystemExit(exit_code)
+46
View File
@@ -0,0 +1,46 @@
# enrich_files_v1.0
**Verze:** 1.0
**Datum:** 2026-06-03
**Skript:** `enrich_files_v1.0.py`
## Účel
Doplnit do existujících záznamů v MongoDB `soubory.*` pole `content.*` parsovaná z obsahu souborů.
Spouští se **až po** [scan_files_v1.0.py](scan_files_v1.0.md).
## Podporované přípony a pole
| ext | knihovna | pole v `content` |
|---|---|---|
| pdf | pypdf | pages, encrypted, author, title, subject, creator, producer, created, modified, text_head |
| docx | python-docx | author, title, subject, last_modified_by, paragraphs, words, created, modified, text_head |
| xlsx, xlsm | openpyxl | total_sheets, sheets[{name,rows,cols}], author, title, subject, last_modified_by, created, modified |
| pptx | python-pptx | slides, author, title, subject, last_modified_by, created, modified, text_head (z prvních 3 snímků) |
| eml | stdlib email | subject, from, to, cc, date, has_attachments, attachments[], body_head |
| msg | extract_msg | totéž co eml |
Společná pole vždy: `ok` (bool), `parsed_at`, `parser_version`, `sha256_at_parse`. Při chybě `error` (název výjimky + zpráva).
## Inkrementální chování
Zpracují se jen dokumenty kde:
- `content` chybí, NEBO
- `content.parser_version` != aktuální verze (1.0), NEBO
- `content.sha256_at_parse` != aktuální `sha256` (soubor se změnil)
Při dalším spuštění **přidá** jen nové/změněné. Při zvýšení verze parseru přeparsuje vše.
## Limity (skip)
- PDF nad 500 MB → ok=False, error="too_big_..."
- XLSX nad 200 MB → ok=False
- ostatní nad 300 MB → ok=False
`text_head` max 2000 znaků.
## Spuštění
```
python U:\PythonProject\Janssen\Soubory\enrich_files_v1.0.py
```
## Plán
Po doběhnutí ověřit `content.ok` rate, případně doladit (chybové vzory) a teprve pak stavět `MCP_SOUBORY` server.
+388
View File
@@ -0,0 +1,388 @@
"""
==============================================================================
Skript: enrich_files_v1.0.py
Verze: 1.0
Datum: 2026-06-03
Autor: vladimir.buzalka
Popis: Doplni metadata z obsahu souboru (PDF/DOCX/XLSX/PPTX/EML/MSG)
do existujicich zaznamu v MongoDB (db: soubory).
Pole se uklada do podobjektu `content`:
- common: ok (bool), error (str|None), parsed_at, parser_version
- pdf: pages, author, title, subject, creator, producer,
created, modified, encrypted, text_head (prvni stranka, max 2000 znaku)
- docx: author, title, subject, last_modified_by, paragraphs,
words, created, modified, text_head
- xlsx: sheets [{name, rows, cols}], total_sheets,
author, title, last_modified_by, created, modified
- pptx: slides, author, title, subject, last_modified_by,
created, modified, text_head (text z prvnich 3 snimku)
- eml: subject, from, to, cc, date, has_attachments,
attachments [filenames], body_head
- msg: same as eml
Inkrementalni:
- preskaci soubor, kde content.sha256_at_parse == aktualni sha256
a content.parser_version == aktualni verze
- pri zmene obsahu (jiny sha256) prepocita
- pri chybe ulozi content.error a content.ok=False
MongoDB: 192.168.1.76:27017
DB: soubory
==============================================================================
"""
from __future__ import annotations
import email
import email.policy
import sys
import time
import traceback
from datetime import datetime, timezone
from pathlib import Path
from pymongo import MongoClient, UpdateOne
# Connection target and the collections (one per study) to enrich.
MONGO_URI = "mongodb://192.168.1.76:27017"
DB_NAME = "soubory"
COLLECTIONS = ["42847922MDD3003", "77242113UCO3001"]
PARSER_VERSION = "1.0"
TEXT_HEAD_LIMIT = 2000  # default maximum length of text previews (see _truncate)
# size limits for large files - so the script does not get stuck on a 1 GB PDF
MAX_PDF_BYTES = 500 * 1024 * 1024 # 500 MB
MAX_XLSX_BYTES = 200 * 1024 * 1024
MAX_GENERIC_BYTES = 300 * 1024 * 1024
def _now() -> datetime:
return datetime.now(tz=timezone.utc)
def _truncate(s: str | None, n: int = TEXT_HEAD_LIMIT) -> str | None:
    """Strip *s* and cut it to at most *n* characters; None stays None."""
    if s is None:
        return None
    # slicing is a no-op when the stripped text is already short enough
    return s.strip()[:n]
def _to_dt(value):
if isinstance(value, datetime):
return value if value.tzinfo else value.replace(tzinfo=timezone.utc)
if isinstance(value, str) and value:
try:
return datetime.fromisoformat(value.replace("Z", "+00:00"))
except ValueError:
return None
return None
# --- PARSERY ----------------------------------------------------------------
def parse_pdf(path: Path) -> dict:
    """Return metadata of the PDF at *path*.

    The result holds the page count, the encryption flag, the document
    information fields (None when absent), the creation and modification
    times converted by ``_to_dt``, and ``text_head``: the text of the first
    page shortened by ``_truncate``, or None when the file is encrypted, has
    no pages, or the extraction raises.
    """
    from pypdf import PdfReader

    reader = PdfReader(str(path))
    meta = reader.metadata or {}

    def field(name):
        # a missing attribute (or the empty-dict fallback) yields None
        return getattr(meta, name, None)

    out = {
        "pages": len(reader.pages),
        "encrypted": reader.is_encrypted,
        "author": field("author"),
        "title": field("title"),
        "subject": field("subject"),
        "creator": field("creator"),
        "producer": field("producer"),
        "created": _to_dt(field("creation_date")),
        "modified": _to_dt(field("modification_date")),
    }

    try:
        if reader.is_encrypted or not reader.pages:
            first_page_text = None
        else:
            first_page_text = reader.pages[0].extract_text()
    except Exception:
        first_page_text = None
    out["text_head"] = _truncate(first_page_text)
    return out
def parse_docx(path: Path) -> dict:
    """Return metadata of the DOCX at *path*.

    The result holds the core properties, the number of body paragraphs
    (empty ones included), the word count of the non-empty paragraphs, and
    ``text_head``: their newline-joined text shortened by ``_truncate``.
    """
    from docx import Document

    document = Document(str(path))
    props = document.core_properties
    all_paragraphs = document.paragraphs
    full_text = "\n".join(para.text for para in all_paragraphs if para.text)
    word_count = len(full_text.split())

    return {
        "author": props.author,
        "title": props.title,
        "subject": props.subject,
        "last_modified_by": props.last_modified_by,
        "paragraphs": len(all_paragraphs),
        "words": word_count,
        "created": _to_dt(props.created),
        "modified": _to_dt(props.modified),
        "text_head": _truncate(full_text),
    }
def parse_xlsx(path: Path) -> dict:
    """Describe a workbook: per-sheet name and dimensions plus document properties.

    The workbook is opened read-only and is always closed again, also when
    reading a sheet or a property raises.
    """
    from openpyxl import load_workbook

    wb = load_workbook(str(path), read_only=True, data_only=False)
    try:
        sheets = [
            {"name": ws.title, "rows": ws.max_row, "cols": ws.max_column}
            for ws in wb.worksheets
        ]
        props = wb.properties
        return {
            "total_sheets": len(sheets),
            "sheets": sheets,
            "author": props.creator,
            "title": props.title,
            "subject": props.subject,
            "last_modified_by": props.lastModifiedBy,
            "created": _to_dt(props.created),
            "modified": _to_dt(props.modified),
        }
    finally:
        # A read-only workbook keeps the file open until close() is called.
        wb.close()
def parse_pptx(path: Path) -> dict:
    """Read slide count, core properties and a text head from a PPTX file.

    The text head is built from the non-empty runs of every text frame on
    the first three slides, joined by single spaces.
    """
    from pptx import Presentation

    deck = Presentation(str(path))
    props = deck.core_properties
    leading_runs = [
        run.text
        for slide in list(deck.slides)[:3]
        for shape in slide.shapes
        if shape.has_text_frame
        for para in shape.text_frame.paragraphs
        for run in para.runs
        if run.text
    ]
    return {
        "slides": len(deck.slides),
        "author": props.author,
        "title": props.title,
        "subject": props.subject,
        "last_modified_by": props.last_modified_by,
        "created": _to_dt(props.created),
        "modified": _to_dt(props.modified),
        "text_head": _truncate(" ".join(leading_runs)),
    }
def parse_eml(path: Path) -> dict:
    """Parse an .eml file into headers, attachment names and a body head.

    For multipart messages the body is assembled from the ``text/plain``
    parts that are not attachments; for single-part messages the message
    content itself is used. Only textual content is kept.
    """
    with path.open("rb") as f:
        msg = email.message_from_binary_file(f, policy=email.policy.default)
    attachments = []
    body_parts = []

    def _collect_text(part) -> None:
        # Best effort: undecodable parts are skipped. get_content() returns
        # bytes (or a message object) for non-text content, which would make
        # the later str.join raise, so only str payloads are collected.
        try:
            payload = part.get_content()
        except Exception:
            return
        if isinstance(payload, str):
            body_parts.append(payload)

    if msg.is_multipart():
        for part in msg.walk():
            disp = (part.get("Content-Disposition") or "").lower()
            ctype = part.get_content_type()
            if "attachment" in disp or part.get_filename():
                fname = part.get_filename()
                if fname:
                    attachments.append(fname)
            elif ctype == "text/plain":
                _collect_text(part)
    else:
        _collect_text(msg)

    def _addrs(field):
        v = msg.get(field)
        return v if v else None

    return {
        "subject": msg.get("Subject"),
        "from": _addrs("From"),
        "to": _addrs("To"),
        "cc": _addrs("Cc"),
        "date": msg.get("Date"),
        "has_attachments": bool(attachments),
        "attachments": attachments,
        "body_head": _truncate("\n".join(body_parts)),
    }
def parse_msg(path: Path) -> dict:
    """Parse an Outlook .msg file into headers, attachment names and a body head."""
    import extract_msg

    with extract_msg.openMsg(str(path)) as message:
        names = []
        for attachment in message.attachments or []:
            # An attachment whose name cannot be read is skipped.
            try:
                label = attachment.longFilename or attachment.shortFilename
            except Exception:
                continue
            if label:
                names.append(label)
        sent = message.date
        return {
            "subject": message.subject,
            "from": message.sender,
            "to": message.to,
            "cc": message.cc,
            "date": str(sent) if sent else None,
            "has_attachments": bool(names),
            "attachments": names,
            "body_head": _truncate(message.body or ""),
        }
# Dispatch table: file extension -> (parser function, maximum file size in bytes).
# enrich_collection() queries only these extensions and marks files above the
# size limit as too big instead of parsing them.
PARSERS = {
    "pdf": (parse_pdf, MAX_PDF_BYTES),
    "docx": (parse_docx, MAX_GENERIC_BYTES),
    "xlsx": (parse_xlsx, MAX_XLSX_BYTES),
    "xlsm": (parse_xlsx, MAX_XLSX_BYTES),
    "pptx": (parse_pptx, MAX_GENERIC_BYTES),
    "eml": (parse_eml, MAX_GENERIC_BYTES),
    "msg": (parse_msg, MAX_GENERIC_BYTES),
}
# --- SUMMARY PRO KONZOLI ----------------------------------------------------
def _short(s, n=40):
if not s:
return ""
s = str(s).replace("\n", " ").replace("\r", " ").strip()
return s if len(s) <= n else s[:n] + "..."
def _summary(content: dict, ext: str) -> str:
    """Build the one-line console summary of a parse result for the given extension."""
    if not content.get("ok"):
        return f"chyba: {_short(content.get('error'), 80)}"

    def tagged(prefix: str, key: str, width: int) -> list:
        # One "prefix=value" fragment when the field is present and truthy, else nothing.
        value = content.get(key)
        return [f"{prefix}={_short(value, width)}"] if value else []

    parts = []
    if ext == "pdf":
        parts.append(f"{content.get('pages')}p")
        if content.get("encrypted"):
            parts.append("enc")
        parts += tagged("by", "author", 25)
        parts += tagged("t", "title", 30)
    elif ext == "docx":
        parts.append(f"{content.get('paragraphs')}para")
        parts.append(f"{content.get('words')}w")
        parts += tagged("by", "author", 25)
    elif ext in ("xlsx", "xlsm"):
        n = content.get("total_sheets", 0)
        sheets = content.get("sheets") or []
        # Show up to three sheet names, then the count of the remaining ones.
        names = ",".join(_short(sheet["name"], 12) for sheet in sheets[:3])
        if n > 3:
            names += f",+{n-3}"
        parts.append(f"{n}sh[{names}]")
        parts += tagged("by", "author", 20)
    elif ext == "pptx":
        parts.append(f"{content.get('slides')}slides")
        parts += tagged("by", "author", 25)
        parts += tagged("t", "title", 25)
    elif ext in ("eml", "msg"):
        parts += tagged("from", "from", 25)
        parts += tagged("subj", "subject", 40)
        if content.get("has_attachments"):
            parts.append(f"att={len(content.get('attachments') or [])}")
    if not parts:
        return "ok"
    return " ".join(parts)
# --- HLAVNI SMYCKA ----------------------------------------------------------
def enrich_collection(coll, study: str) -> dict:
    """Parse every pending document of one collection and store the result in ``content``.

    A document is pending when its extension has a parser, it is not marked
    deleted, and it either has no ``content`` yet, was parsed by a different
    ``PARSER_VERSION``, or its ``sha256`` differs from the one recorded at
    parse time. Results are written back in unordered bulks of 50.

    Args:
        coll: the MongoDB collection to process.
        study: label used in console output and in the returned summary.

    Returns:
        Counters: ``study``, ``processed``, ``ok``, ``errors``, ``too_big``.
    """
    supported = list(PARSERS.keys())
    query = {
        "ext": {"$in": supported},
        "deleted_at": {"$exists": False},
        "$or": [
            {"content": {"$exists": False}},
            {"content.parser_version": {"$ne": PARSER_VERSION}},
            {"$expr": {"$ne": ["$content.sha256_at_parse", "$sha256"]}},
        ],
    }
    total_pending = coll.count_documents(query)
    print(f"[{study}] k zpracovani: {total_pending} souboru")
    ops: list[UpdateOne] = []
    processed = 0
    ok = 0
    errors = 0
    too_big = 0
    cursor = coll.find(query, {"path": 1, "ext": 1, "size_bytes": 1, "sha256": 1}, no_cursor_timeout=True)
    try:
        for doc in cursor:
            ext = doc["ext"]
            parser, max_bytes = PARSERS[ext]
            path = Path(doc["path"])
            # The field may be missing or stored as None; both count as size 0
            # so the limit check cannot raise TypeError.
            size_bytes = doc.get("size_bytes") or 0
            content: dict = {
                "parser_version": PARSER_VERSION,
                "parsed_at": _now(),
                "sha256_at_parse": doc.get("sha256"),
            }
            if not path.exists():
                content.update(ok=False, error="file_missing")
                errors += 1
            elif size_bytes > max_bytes:
                content.update(ok=False, error=f"too_big_>{max_bytes}")
                too_big += 1
            else:
                try:
                    payload = parser(path)
                    content["ok"] = True
                    content.update(payload)
                    ok += 1
                except Exception as e:
                    # A failing file is recorded, not fatal: the run continues.
                    content["ok"] = False
                    content["error"] = f"{type(e).__name__}: {e}"[:500]
                    errors += 1
            ops.append(UpdateOne({"_id": doc["_id"]}, {"$set": {"content": content}}))
            processed += 1
            status = "OK " if content.get("ok") else ("BIG" if "too_big" in (content.get("error") or "") else "ERR")
            size_mb = size_bytes / 1024 / 1024
            detail = _summary(content, ext)
            print(f" [{processed:>4}/{total_pending}] {status} {ext:<4} {size_mb:6.1f}MB {path.name} | {detail}", flush=True)
            if len(ops) >= 50:
                coll.bulk_write(ops, ordered=False)
                ops.clear()
    finally:
        # no_cursor_timeout cursors are not reaped by the server, so close explicitly.
        cursor.close()
    if ops:
        coll.bulk_write(ops, ordered=False)
    return {"study": study, "processed": processed, "ok": ok, "errors": errors, "too_big": too_big}
def main() -> int:
    """Connect to MongoDB, enrich every configured collection and print a summary.

    Returns 0 on completion.
    """
    started = time.time()
    client = MongoClient(MONGO_URI, serverSelectionTimeoutMS=5000)
    # Fail fast when the server is unreachable.
    client.admin.command("ping")
    database = client[DB_NAME]
    results = [enrich_collection(database[name], name) for name in COLLECTIONS]
    print("\n=== SHRNUTI ===")
    for res in results:
        print(f" {res['study']}: processed={res['processed']} ok={res['ok']} "
              f"errors={res['errors']} too_big={res['too_big']}")
    elapsed = time.time() - started
    print(f"\nCelkem trvalo: {elapsed:.1f} s")
    return 0
if __name__ == "__main__":
    # Script entry point: main()'s return value becomes the exit status, a
    # user interrupt only prints a notice, any other error prints a traceback
    # and exits with status 1.
    try:
        exit_code = main()
    except KeyboardInterrupt:
        print("\nPreruseno uzivatelem")
    except Exception:
        traceback.print_exc()
        sys.exit(1)
    else:
        raise SystemExit(exit_code)
+51
View File
@@ -0,0 +1,51 @@
# enrich_fulltext_v1.2
**Verze:** 1.2
**Datum:** 2026-06-03
**Skript:** `enrich_fulltext_v1.2.py`
## Změna proti v1.1
Velký XLSX (`#400 MDD3003_EAT detail report_30jun25.xlsx`, 5 242 128 znaků textu) způsobil pád:
```
psycopg.errors.ProgramLimitExceeded:
string is too long for tsvector (1114090 bytes, max 1048575 bytes)
```
PostgreSQL `tsvector` má **tvrdý limit ~1 MB** binární velikosti — nelze obejít.
**Řešení:** `tsv` se generuje z prvních **800 000 znaků** sloupce `body`:
```sql
tsv tsvector GENERATED ALWAYS AS (
to_tsvector('soubory'::regconfig, left(coalesce(body, ''), 800000))
) STORED
```
- sloupec `body` zůstává **plný** (až 5 MB) — pro náhledy, snippet, `ts_headline`
- vyhledávání (`tsv @@ q`) ignoruje obsah za 800 000. znakem
- u rozsáhlých XLSX/PDF (např. data exporty) je 800 KB stále víc než 100 000 slov — pro fulltext bohatě stačí
## Migrace
`SCHEMA_SQL` při startu zkontroluje, zda současný výraz `tsv` obsahuje `left(`. Pokud ne (starý sloupec z v1.0/v1.1):
1. dropne `documents_tsv_gin` index
2. dropne sloupec `tsv`
3. přidá nový s `left(body, 800000)`
4. index se vytvoří znovu na konci `SCHEMA_SQL`
Bezpečné spustit opakovaně.
## extractor_version
Posunuto na `1.2` → všechny řádky z v1.0/v1.1 se přeparsují (potřebné už proto, že migrace tsv změnila co je v indexu).
## Vše ostatní
Beze změny proti [v1.1](Trash/enrich_fulltext_v1.1.md):
- DOCX fallback přes raw `word/document.xml`
- NUL byte strip
- Limity souborů (PDF 500 MB, XLSX 200 MB, ostatní 300 MB), text max 5 MB
- Inkrementálně podle `sha256` + `extractor_version`
## Spuštění
```
python U:\PythonProject\Janssen\Soubory\enrich_fulltext_v1.2.py
```
+481
View File
@@ -0,0 +1,481 @@
"""
==============================================================================
Skript: enrich_fulltext_v1.2.py
Verze: 1.2
Datum: 2026-06-03
Autor: vladimir.buzalka
Popis: Vytahne PLNY TEXT z dokumentu odkazovanych v MongoDB (db: soubory)
a ulozi ho do PostgreSQL (db: MongoSoubory) s GIN tsvector indexem.
Zmeny proti v1.1:
- PG tsvector ma tvrdy limit ~1 MB binarne -> velky XLSX (5 MB textu) ho prekrocil.
v1.2 generuje tsv z prvnich 800 000 znaku body: left(body, 800000).
Sloupec body zustava plny (max 5 MB pro nahled / snippet).
- SCHEMA_SQL provadi migraci sloupce tsv: pokud uz existuje stara verze
(bez `left`), dropne index+sloupec a vytvori znovu s truncated vyrazem.
- extractor_version = "1.2" -> preparsuji se vsechny radky z v1.0/v1.1.
Zachovano z v1.1:
- NUL bajty (0x00) se strippuji z body i error
- DOCX fallback na raw XML pres regex pri padu python-docx
Cilove ulozeni:
- MongoDB 192.168.1.76 db=soubory kolekce=42847922MDD3003, 77242113UCO3001
- PostgreSQL 192.168.1.76 db=MongoSoubory tabulka=documents
Podporovane pripony: pdf, docx, xlsx, xlsm, pptx, eml, msg, txt, csv
==============================================================================
"""
from __future__ import annotations
import email
import email.policy
import re
import sys
import time
import traceback
import zipfile
from datetime import datetime, timezone
from pathlib import Path
import psycopg
from pymongo import MongoClient
# --- configuration ----------------------------------------------------------
import os  # used only here, to read the optional PG_DSN override

MONGO_URI = "mongodb://192.168.1.76:27017"
MONGO_DB = "soubory"
MONGO_COLLECTIONS = ["42847922MDD3003", "77242113UCO3001"]
# No password in source control: the DSN deliberately carries no credentials.
# libpq takes the password from the PGPASSWORD environment variable or from
# the ~/.pgpass password file; a complete DSN can be supplied via PG_DSN.
# NOTE(review): a password was previously committed here - rotate it.
PG_DSN = os.environ.get(
    "PG_DSN",
    "host=192.168.1.76 port=5432 dbname=MongoSoubory user=vladimir.buzalka",
)
# Stored per row; rows written by another version are extracted again.
EXTRACTOR_VERSION = "1.2"
# Cap on the extracted text kept in documents.body, in UTF-8 bytes.
MAX_TEXT_BYTES = 5 * 1024 * 1024
# Input files above these sizes are recorded as too big and not extracted.
MAX_PDF_BYTES = 500 * 1024 * 1024
MAX_XLSX_BYTES = 200 * 1024 * 1024
MAX_GENERIC_BYTES = 300 * 1024 * 1024
SUPPORTED = ("pdf", "docx", "xlsx", "xlsm", "pptx", "eml", "msg", "txt", "csv")
# --- SCHEMA -----------------------------------------------------------------
# DDL executed by main() at startup. Every statement is guarded
# (IF NOT EXISTS / conditional DO blocks), so it can be run repeatedly.
# It creates the unaccent and pg_trgm extensions, the "soubory" text search
# configuration, the documents table with a generated tsv column built from
# the first 800000 characters of body, and the indexes. The second DO block
# replaces a tsv column whose expression does not contain "left".
# NOTE(review): the catalog lookup in the migration matches on relname only,
# not on schema - confirm there is no other table named "documents".
SCHEMA_SQL = """
CREATE EXTENSION IF NOT EXISTS unaccent;
CREATE EXTENSION IF NOT EXISTS pg_trgm;
DO $$
BEGIN
IF NOT EXISTS (SELECT 1 FROM pg_ts_config WHERE cfgname = 'soubory') THEN
CREATE TEXT SEARCH CONFIGURATION soubory ( COPY = simple );
ALTER TEXT SEARCH CONFIGURATION soubory
ALTER MAPPING FOR hword, hword_part, word
WITH unaccent, simple;
END IF;
END$$;
CREATE TABLE IF NOT EXISTS documents (
id BIGSERIAL PRIMARY KEY,
mongo_id TEXT NOT NULL,
study TEXT NOT NULL,
path TEXT NOT NULL,
rel_path TEXT,
name TEXT,
ext TEXT,
sha256 TEXT NOT NULL,
size_bytes BIGINT,
mtime TIMESTAMPTZ,
body TEXT,
body_length INT,
tsv tsvector GENERATED ALWAYS AS (
to_tsvector('soubory'::regconfig, left(coalesce(body, ''), 800000))
) STORED,
extracted_at TIMESTAMPTZ DEFAULT now(),
extractor_version TEXT,
ok BOOLEAN,
error TEXT,
UNIQUE (study, path)
);
-- migrace tsv sloupce ze stareho vyrazu (bez `left`) na novy (s `left(..,800000)`)
DO $$
DECLARE
cur_expr TEXT;
BEGIN
SELECT pg_get_expr(d.adbin, d.adrelid)
INTO cur_expr
FROM pg_attribute a
JOIN pg_class c ON c.oid = a.attrelid
JOIN pg_attrdef d ON d.adrelid = a.attrelid AND d.adnum = a.attnum
WHERE c.relname = 'documents' AND a.attname = 'tsv';
IF cur_expr IS NOT NULL AND position('left' in cur_expr) = 0 THEN
EXECUTE 'DROP INDEX IF EXISTS documents_tsv_gin';
EXECUTE 'ALTER TABLE documents DROP COLUMN tsv';
EXECUTE 'ALTER TABLE documents ADD COLUMN tsv tsvector GENERATED ALWAYS AS '
|| '(to_tsvector(''soubory''::regconfig, left(coalesce(body, ''''), 800000))) STORED';
END IF;
END$$;
CREATE INDEX IF NOT EXISTS documents_tsv_gin ON documents USING gin(tsv);
CREATE INDEX IF NOT EXISTS documents_name_trgm ON documents USING gin(name gin_trgm_ops);
CREATE INDEX IF NOT EXISTS documents_sha256_idx ON documents(sha256);
CREATE INDEX IF NOT EXISTS documents_study_ext_idx ON documents(study, ext);
"""
# --- HELPERY ----------------------------------------------------------------
# odstrani 0x00 a ostatni controly krome whitespace
_CTRL_RX = re.compile(r"[\x00-\x08\x0b\x0c\x0e-\x1f]")
def _clean_for_pg(s: str) -> str:
if not s:
return ""
return _CTRL_RX.sub("", s)
def _truncate(s: str) -> str:
    """Clean *s* with ``_clean_for_pg`` and cap it at ``MAX_TEXT_BYTES`` of UTF-8.

    When the encoded text is too long it is cut at the byte limit; a
    character split by the cut is dropped.
    """
    cleaned = _clean_for_pg(s or "")
    if not cleaned:
        return ""
    encoded = cleaned.encode("utf-8", errors="replace")
    if len(encoded) > MAX_TEXT_BYTES:
        return encoded[:MAX_TEXT_BYTES].decode("utf-8", errors="ignore")
    return cleaned
# --- EXTRAKTORY -------------------------------------------------------------
def extract_pdf(path: Path) -> str:
    """Extract the text of a PDF page by page, stopping once the size cap is exceeded.

    Encrypted files are tried with an empty password; if that attempt raises,
    an empty string is returned. Pages whose extraction fails are skipped.
    """
    from pypdf import PdfReader

    pdf = PdfReader(str(path))
    if pdf.is_encrypted:
        try:
            pdf.decrypt("")
        except Exception:
            return ""
    chunks = []
    char_count = 0
    for page in pdf.pages:
        try:
            page_text = page.extract_text() or ""
        except Exception:
            continue
        chunks.append(page_text)
        char_count += len(page_text)
        # Stop early once more than the cap has been collected; _truncate does the exact cut.
        if char_count > MAX_TEXT_BYTES:
            break
    return _truncate("\n".join(chunks))
# regex pro DOCX fallback - vytahne <w:t>...</w:t>
_DOCX_WT_RX = re.compile(r"<w:t[^>]*>([^<]*)</w:t>", re.DOTALL)
_DOCX_WP_END_RX = re.compile(r"</w:p>")
def _docx_raw_text(path: Path) -> str:
"""Fallback - cte primo word/document.xml ze ZIPu."""
with zipfile.ZipFile(str(path)) as z:
try:
xml = z.read("word/document.xml").decode("utf-8", errors="replace")
except KeyError:
return ""
xml = _DOCX_WP_END_RX.sub("\n", xml)
return "\n".join(m.group(1) for m in _DOCX_WT_RX.finditer(xml))
def extract_docx(path: Path) -> str:
    """Extract paragraph and table text from a DOCX file.

    Non-empty paragraphs come first, followed by one line per table row with
    the cell texts joined by `` | ``. If python-docx fails for any reason the
    text is taken from the raw XML instead.
    """
    from docx import Document

    try:
        document = Document(str(path))
        lines = [para.text for para in document.paragraphs if para.text]
        lines.extend(
            " | ".join(cell.text for cell in row.cells)
            for table in document.tables
            for row in table.rows
        )
        return _truncate("\n".join(lines))
    except Exception:
        # fallback - raw XML extraction
        return _truncate(_docx_raw_text(path))
def extract_xlsx(path: Path) -> str:
    """Extract cell values of all worksheets as tab-separated lines.

    Each sheet starts with a ``# <title>`` line; rows that are blank after
    joining are skipped. Reading stops once more than ``MAX_TEXT_BYTES``
    characters have been collected. The workbook is always closed, also when
    reading a row raises.
    """
    from openpyxl import load_workbook

    wb = load_workbook(str(path), read_only=True, data_only=True)
    parts = []
    total = 0
    try:
        for ws in wb.worksheets:
            parts.append(f"# {ws.title}")
            for row in ws.iter_rows(values_only=True):
                line = "\t".join("" if v is None else str(v) for v in row)
                if line.strip():
                    parts.append(line)
                    total += len(line)
                    if total > MAX_TEXT_BYTES:
                        break
            if total > MAX_TEXT_BYTES:
                break
    finally:
        # A read-only workbook keeps the file open until close() is called.
        wb.close()
    return _truncate("\n".join(parts))
def extract_pptx(path: Path) -> str:
    """Extract slide text and speaker notes from a PPTX file.

    Each slide starts with a ``# slide <n>`` line, followed by one line per
    non-blank paragraph of every text frame and, when present, a
    ``[notes] ...`` line.
    """
    from pptx import Presentation

    prs = Presentation(str(path))
    parts = []
    for i, slide in enumerate(prs.slides, 1):
        parts.append(f"# slide {i}")
        for shape in slide.shapes:
            if shape.has_text_frame:
                for para in shape.text_frame.paragraphs:
                    line = "".join(run.text for run in para.runs)
                    if line.strip():
                        parts.append(line)
        if slide.has_notes_slide:
            # A notes slide without a notes placeholder has no text frame
            # (None); that must not fail the whole presentation.
            notes_frame = slide.notes_slide.notes_text_frame
            notes = notes_frame.text if notes_frame is not None else ""
            if notes:
                parts.append(f"[notes] {notes}")
    return _truncate("\n".join(parts))
def extract_eml(path: Path) -> str:
    """Extract the main headers and the plain-text body of an .eml file.

    The result starts with the From/To/Cc/Subject/Date headers that are
    present, followed by the ``text/plain`` parts without a file name
    (multipart) or the message content itself (single part). Only textual
    content is kept.
    """
    with path.open("rb") as f:
        msg = email.message_from_binary_file(f, policy=email.policy.default)
    head = []
    for k in ("From", "To", "Cc", "Subject", "Date"):
        v = msg.get(k)
        if v:
            head.append(f"{k}: {v}")
    parts = ["\n".join(head)]
    if msg.is_multipart():
        candidates = [
            part for part in msg.walk()
            if part.get_content_type() == "text/plain" and not part.get_filename()
        ]
    else:
        candidates = [msg]
    for part in candidates:
        # Best effort: undecodable parts are skipped. get_content() returns
        # bytes (or a message object) for non-text content, which would make
        # the str.join below raise, so only str payloads are kept.
        try:
            payload = part.get_content()
        except Exception:
            continue
        if isinstance(payload, str):
            parts.append(payload)
    return _truncate("\n\n".join(parts))
def extract_msg(path: Path) -> str:
    """Extract the main headers and the body of an Outlook .msg file.

    Note: inside this function the name ``extract_msg`` refers to the
    imported module of the same name, not to this function.
    """
    import extract_msg

    with extract_msg.openMsg(str(path)) as m:
        labelled = (
            ("Subject", m.subject),
            ("From", m.sender),
            ("To", m.to),
            ("Cc", m.cc),
            ("Date", m.date),
        )
        head = [f"{label}: {value}" for label, value in labelled if value]
        return _truncate("\n".join(head) + "\n\n" + (m.body or ""))
def extract_text(path: Path) -> str:
    """Read up to ``MAX_TEXT_BYTES`` of a text file and decode it.

    UTF-8 (with optional BOM) is tried first, then cp1250, then latin-1.
    Only the needed prefix of the file is read. When the file is longer than
    the cap, a UTF-8 character cut in half at the end is dropped instead of
    making the whole file fall through to a single-byte encoding.
    """
    import codecs

    # Read one byte past the cap to learn whether the file was cut short.
    with path.open("rb") as f:
        data = f.read(MAX_TEXT_BYTES + 1)
    truncated = len(data) > MAX_TEXT_BYTES
    data = data[:MAX_TEXT_BYTES]

    try:
        # final=False tolerates an incomplete trailing byte sequence (the cut
        # point); invalid bytes elsewhere still raise.
        decoder = codecs.getincrementaldecoder("utf-8-sig")()
        return _truncate(decoder.decode(data, final=not truncated))
    except UnicodeDecodeError:
        pass
    for enc in ("cp1250", "latin-1"):
        try:
            return _truncate(data.decode(enc))
        except UnicodeDecodeError:
            continue
    return _truncate(data.decode("utf-8", errors="replace"))
# Dispatch table: file extension -> (extractor function, maximum file size in bytes).
# process_collection() selects Mongo documents by these extensions and records
# files above the size limit as too big instead of extracting them.
EXTRACTORS = {
    "pdf": (extract_pdf, MAX_PDF_BYTES),
    "docx": (extract_docx, MAX_GENERIC_BYTES),
    "xlsx": (extract_xlsx, MAX_XLSX_BYTES),
    "xlsm": (extract_xlsx, MAX_XLSX_BYTES),
    "pptx": (extract_pptx, MAX_GENERIC_BYTES),
    "eml": (extract_eml, MAX_GENERIC_BYTES),
    "msg": (extract_msg, MAX_GENERIC_BYTES),
    "txt": (extract_text, MAX_GENERIC_BYTES),
    "csv": (extract_text, MAX_GENERIC_BYTES),
}
def _short(s, n=40):
if not s:
return ""
s = str(s).replace("\n", " ").replace("\r", " ").strip()
return s if len(s) <= n else s[:n] + "..."
def _now() -> datetime:
return datetime.now(tz=timezone.utc)
# --- HLAVNI SMYCKA ----------------------------------------------------------
def process_collection(pg: psycopg.Connection, mongo_coll, study: str) -> dict:
    """Extract full text for one Mongo collection and upsert it into ``documents``.

    Documents whose row in PostgreSQL already has the same sha256, the
    current EXTRACTOR_VERSION and ok = true are skipped; everything else is
    extracted again. Rows are written in batches of 50 through _flush().

    Args:
        pg: open PostgreSQL connection.
        mongo_coll: MongoDB collection with the file metadata.
        study: study label stored in each row and used in console output.

    Returns:
        Counters: study, processed, ok, errors, skipped, too_big.
    """
    # Snapshot of what PostgreSQL already holds for this study: path -> (sha256, version, ok).
    with pg.cursor() as cur:
        cur.execute(
            "SELECT path, sha256, extractor_version, ok FROM documents WHERE study = %s",
            (study,),
        )
        existing = {row[0]: (row[1], row[2], row[3]) for row in cur.fetchall()}
    cursor = mongo_coll.find(
        {"ext": {"$in": list(EXTRACTORS.keys())}, "deleted_at": {"$exists": False}},
        {"_id": 1, "path": 1, "rel_path": 1, "name": 1, "ext": 1,
         "sha256": 1, "size_bytes": 1, "mtime": 1},
        no_cursor_timeout=True,
    )
    processed = ok = errors = skipped = too_big = 0
    queue: list[dict] = []
    # Number of candidates in Mongo (including those that will be skipped); used as the progress total.
    total_pending = mongo_coll.count_documents(
        {"ext": {"$in": list(EXTRACTORS.keys())}, "deleted_at": {"$exists": False}}
    )
    print(f"[{study}] kandidatu v Mongo: {total_pending}")
    n = 0
    try:
        for doc in cursor:
            n += 1
            prev = existing.get(doc["path"])
            # Incremental skip: unchanged content, same extractor version, previous run succeeded.
            if prev and prev[0] == doc.get("sha256") and prev[1] == EXTRACTOR_VERSION and prev[2]:
                skipped += 1
                continue
            ext = doc["ext"]
            extractor, max_bytes = EXTRACTORS[ext]
            path = Path(doc["path"])
            # Row starts out as a failure; the success branch below fills body/ok.
            # NOTE(review): documents.sha256 is NOT NULL, so a Mongo document
            # without sha256 would presumably make the batch insert fail - confirm.
            row = {
                "mongo_id": str(doc["_id"]),
                "study": study,
                "path": doc["path"],
                "rel_path": doc.get("rel_path"),
                "name": doc.get("name"),
                "ext": ext,
                "sha256": doc.get("sha256"),
                "size_bytes": doc.get("size_bytes"),
                "mtime": doc.get("mtime"),
                "body": None,
                "body_length": 0,
                "extracted_at": _now(),
                "extractor_version": EXTRACTOR_VERSION,
                "ok": False,
                "error": None,
            }
            status = "OK "
            detail = ""
            size_mb = (doc.get("size_bytes") or 0) / 1024 / 1024
            if not path.exists():
                row["error"] = "file_missing"
                status = "ERR"; detail = "file_missing"; errors += 1
            elif (doc.get("size_bytes") or 0) > max_bytes:
                row["error"] = f"too_big_>{max_bytes}"
                status = "BIG"; detail = f"too_big_>{max_bytes//1024//1024}MB"; too_big += 1
            else:
                try:
                    body = extractor(path) or ""
                    # An empty extraction is stored as NULL body but still counts as success.
                    row["body"] = body if body else None
                    row["body_length"] = len(body)
                    row["ok"] = True
                    ok += 1
                    detail = f"{len(body)} znaku {_short(body, 60)!r}"
                except Exception as e:
                    # A failing file is recorded in the row, not fatal: the run continues.
                    row["error"] = f"{type(e).__name__}: {e}"[:500]
                    status = "ERR"; detail = row["error"][:80]; errors += 1
            queue.append(row)
            processed += 1
            print(f" [{n:>4}/{total_pending}] {status} {ext:<4} {size_mb:6.1f}MB "
                  f"{path.name} | {detail}", flush=True)
            if len(queue) >= 50:
                _flush(pg, queue); queue.clear()
    finally:
        # The cursor was opened with no_cursor_timeout, so it is closed explicitly.
        cursor.close()
    # Write the last, partially filled batch.
    if queue:
        _flush(pg, queue)
    return {"study": study, "processed": processed, "ok": ok,
            "errors": errors, "skipped": skipped, "too_big": too_big}
# Insert-or-update statement used by _flush(). Rows are keyed on (study, path);
# on conflict every other listed column is overwritten with the new values.
# The named placeholders match the keys of the row dicts built in process_collection().
UPSERT_SQL = """
INSERT INTO documents
(mongo_id, study, path, rel_path, name, ext, sha256, size_bytes, mtime,
body, body_length, extracted_at, extractor_version, ok, error)
VALUES
(%(mongo_id)s, %(study)s, %(path)s, %(rel_path)s, %(name)s, %(ext)s, %(sha256)s,
%(size_bytes)s, %(mtime)s, %(body)s, %(body_length)s, %(extracted_at)s,
%(extractor_version)s, %(ok)s, %(error)s)
ON CONFLICT (study, path) DO UPDATE SET
mongo_id = EXCLUDED.mongo_id,
rel_path = EXCLUDED.rel_path,
name = EXCLUDED.name,
ext = EXCLUDED.ext,
sha256 = EXCLUDED.sha256,
size_bytes = EXCLUDED.size_bytes,
mtime = EXCLUDED.mtime,
body = EXCLUDED.body,
body_length = EXCLUDED.body_length,
extracted_at = EXCLUDED.extracted_at,
extractor_version = EXCLUDED.extractor_version,
ok = EXCLUDED.ok,
error = EXCLUDED.error
"""
def _flush(pg: psycopg.Connection, rows: list[dict]) -> None:
    """Upsert a batch of rows into ``documents`` and commit.

    As a last safeguard the ``body`` and ``error`` texts are cleaned once
    more (in place) before they are sent to PostgreSQL.
    """
    for row in rows:
        for field in ("body", "error"):
            if row.get(field):
                row[field] = _clean_for_pg(row[field])
    with pg.cursor() as cur:
        cur.executemany(UPSERT_SQL, rows)
    pg.commit()
def main() -> int:
    """Prepare the PostgreSQL schema, process every Mongo collection, print a summary.

    Both database connections are closed when processing ends, whether it
    succeeded or raised. Returns 0 on completion.
    """
    t0 = time.time()
    print("Pripojuji se k PostgreSQL...")
    pg = psycopg.connect(PG_DSN, connect_timeout=10)
    mongo = None
    try:
        with pg.cursor() as cur:
            cur.execute(SCHEMA_SQL)
        pg.commit()
        print("Schema OK.")
        print("Pripojuji se k MongoDB...")
        mongo = MongoClient(MONGO_URI, serverSelectionTimeoutMS=5000)
        # Fail fast when the server is unreachable.
        mongo.admin.command("ping")
        db = mongo[MONGO_DB]
        print("Mongo OK.")
        results = []
        for name in MONGO_COLLECTIONS:
            results.append(process_collection(pg, db[name], name))
    finally:
        # Release both connections on the error path as well.
        if mongo is not None:
            mongo.close()
        pg.close()
    print("\n=== SHRNUTI ===")
    for r in results:
        print(f" {r['study']}: processed={r['processed']} ok={r['ok']} "
              f"errors={r['errors']} skipped={r['skipped']} too_big={r['too_big']}")
    print(f"\nCelkem trvalo: {time.time() - t0:.1f} s")
    return 0
if __name__ == "__main__":
    # Script entry point: main()'s return value becomes the exit status, a
    # user interrupt only prints a notice, any other error prints a traceback
    # and exits with status 1.
    try:
        status_code = main()
    except KeyboardInterrupt:
        print("\nPreruseno uzivatelem")
    except Exception:
        traceback.print_exc()
        sys.exit(1)
    else:
        raise SystemExit(status_code)
+63
View File
@@ -0,0 +1,63 @@
# scan_files_v1.0
**Verze:** 1.0
**Datum:** 2026-06-03
**Skript:** `scan_files_v1.0.py`
## Účel
Rekurzivní sken dvou Dropbox složek studií (`!!42847922MDD3003`, `!77242113UCO3001`) a zápis metadat všech souborů do MongoDB.
## Konfigurace
- **MongoDB:** `mongodb://192.168.1.76:27017` (bez autentizace)
- **DB:** `soubory`
- **Kolekce:** `42847922MDD3003`, `77242113UCO3001` (jedna kolekce na studii)
- **Cesta k Dropboxu:** zjištěna pomocí `Knihovny/najdi_dropbox.py` (přenositelné mezi PC)
## Struktura dokumentu v MongoDB
| pole | popis |
|---|---|
| `path` | absolutní cesta (unikátní klíč) |
| `study` | kód studie (= název kolekce) |
| `rel_path` | relativní cesta od kořene studie |
| `dir`, `rel_dir` | nadřazený adresář (absolutní/relativní) |
| `parent_folders` | pole názvů složek (pro filtrování) |
| `name`, `stem`, `ext` | jméno, jméno bez přípony, přípona (lower-case) |
| `size_bytes` | velikost |
| `mtime`, `ctime`, `atime` | časové údaje (UTC) |
| `sha256` | hash obsahu |
| `mime` | mimetype dle přípony |
| `tokens` | jméno rozparsované na slova/čísla (lower-case) |
| `dates_in_name` | datumy nalezené v názvu, formát `YYYY-MM-DD` |
| `first_seen_at` | první sken, kdy byl soubor viděn |
| `last_seen_at` | poslední sken, kdy byl viděn |
| `deleted_at` | nastaveno, pokud soubor v posledním skenu už nebyl nalezen |
## Datumy v názvu
Skript hledá tři varianty:
- `12JAN2026`, `12Jan2026` (den + 3-písm. zkratka měsíce + rok)
- `2026-01-12`, `2026_01_12`, `2026.01.12`
- `12-01-2026`, `12_01_2026`, `12.01.2026`
Všechny se normalizují do ISO `YYYY-MM-DD` v poli `dates_in_name`.
## Inkrementální chování
- `size_bytes` + `mtime` souhlasí se záznamem v DB → SHA256 se nepřepočítává, jen se aktualizuje `last_seen_at`
- nový soubor → vloží se s `first_seen_at`
- chybějící v aktuálním běhu → `deleted_at` se nastaví na čas běhu
## Co se ignoruje
- `.dropbox*`, `Thumbs.db`, `desktop.ini`, `~$*.*` (Office locky), `.DS_Store`
- adresář `.dropbox.cache`
## Spuštění
```
python U:\PythonProject\Janssen\Soubory\scan_files_v1.0.py
```
## Index pole pro rychlé dotazy
`path` (unique), `ext`, `dates_in_name`, `tokens`, `sha256`
## Plán pokračování
1. Spustit první sken → zjistit profil dat (přípony, hloubku stromů)
2. Doplnit dle potřeby (např. počet stran PDF, autor DOCX, listy XLSX)
3. Postavit `MCP_SOUBORY` server nad touto kolekcí
+272
View File
@@ -0,0 +1,272 @@
"""
==============================================================================
Skript: scan_files_v1.0.py
Verze: 1.0
Datum: 2026-06-03
Autor: vladimir.buzalka
Popis: Rekurzivni sken Dropbox slozek dvou studii a zapis metadat
vsech souboru do MongoDB (db: soubory, kolekce = nazev studie).
- cesty k Dropboxu se zjisti pres Knihovny.najdi_dropbox
- pro kazdy soubor: stat, sha256, mime (podle pripony),
parsing data v nazvu (12JAN2026, 2026-01-12, 12-01-2026 ...)
- inkrementalni: pokud size+mtime souhlasi se zaznamem v DB,
sha256 se nepocita znovu (jen se aktualizuje last_seen_at)
- smazane soubory dostanou deleted_at pri behu, ve kterem
uz nebyly videny
- vynechavaji se: .dropbox*, Thumbs.db, desktop.ini,
~$*.* (Office lock), .DS_Store, *.tmp
MongoDB: 192.168.1.76:27017, bez autentizace
DB: soubory
Kolekce: 42847922MDD3003, 77242113UCO3001 (extrahovano z rootu cesty)
==============================================================================
"""
from __future__ import annotations
import hashlib
import mimetypes
import os
import re
import sys
import time
from datetime import datetime, timezone
from pathlib import Path
from pymongo import MongoClient, UpdateOne, ASCENDING
# --- prida Knihovny do path -------------------------------------------------
HERE = Path(__file__).resolve().parent
sys.path.insert(0, str(HERE.parent))
from Knihovny.najdi_dropbox import get_dropbox_root # noqa: E402
# --- configuration ----------------------------------------------------------
MONGO_URI = "mongodb://192.168.1.76:27017"
DB_NAME = "soubory"
# Study code (also used as the MongoDB collection name) -> folder name under the Dropbox root.
STUDIES = {
    "42847922MDD3003": "!!42847922MDD3003",
    "77242113UCO3001": "!77242113UCO3001",
}
# File names matching any of these patterns are ignored by should_skip().
# NOTE(review): the module header also lists *.tmp as skipped, but there is
# no pattern for it here - confirm which of the two is intended.
SKIP_NAME_PATTERNS = [
    re.compile(r"^\.dropbox.*", re.IGNORECASE),
    re.compile(r"^Thumbs\.db$", re.IGNORECASE),
    re.compile(r"^desktop\.ini$", re.IGNORECASE),
    re.compile(r"^~\$.*", re.IGNORECASE),
    re.compile(r"^\.DS_Store$", re.IGNORECASE),
]
# Directory names pruned from the os.walk() traversal in scan_study().
SKIP_DIR_NAMES = {".dropbox.cache"}
# Read size used by sha256_of() when hashing a file.
HASH_CHUNK = 1024 * 1024 # 1 MiB
# --- parsing of dates in file names -----------------------------------------
MONTHS = {
    "JAN": 1, "FEB": 2, "MAR": 3, "APR": 4, "MAY": 5, "JUN": 6,
    "JUL": 7, "AUG": 8, "SEP": 9, "OCT": 10, "NOV": 11, "DEC": 12,
}
# Tried in this order; an earlier pattern takes precedence over a later one
# for the same stretch of text (see extract_dates).
DATE_PATTERNS = [
    # 12JAN2026 / 12Jan2026
    (re.compile(r"(\d{1,2})([A-Za-z]{3})(\d{4})"), "dmonth"),
    # 2026-01-12 / 2026_01_12 / 2026.01.12
    (re.compile(r"(20\d{2})[-_.](\d{1,2})[-_.](\d{1,2})"), "ymd"),
    # 12-01-2026 / 12_01_2026 / 12.01.2026
    (re.compile(r"(\d{1,2})[-_.](\d{1,2})[-_.](20\d{2})"), "dmy"),
]


def extract_dates(name: str) -> list[str]:
    """Return the unique ISO dates (YYYY-MM-DD) found in *name*, sorted.

    Candidates that are not real calendar dates (unknown month abbreviation,
    day or month out of range) are ignored. Text already recognized as a date
    by one pattern is not offered to the following patterns, so two adjacent
    dates such as ``2026-01-12_2026-02-13`` do not produce a spurious third
    date from the digits that straddle them.
    """
    found: set[str] = set()
    remaining = name
    for rx, kind in DATE_PATTERNS:
        claimed = []
        for m in rx.finditer(remaining):
            try:
                if kind == "dmonth":
                    d = int(m.group(1))
                    mo = MONTHS.get(m.group(2).upper())
                    y = int(m.group(3))
                    if not mo:
                        continue
                elif kind == "ymd":
                    y, mo, d = int(m.group(1)), int(m.group(2)), int(m.group(3))
                else:  # dmy
                    d, mo, y = int(m.group(1)), int(m.group(2)), int(m.group(3))
                datetime(y, mo, d)  # validates the calendar date
            except ValueError:
                continue
            found.add(f"{y:04d}-{mo:02d}-{d:02d}")
            claimed.append(m.span())
        # Blank out accepted matches (same length, so positions stay valid)
        # before the next pattern runs.
        for start, end in claimed:
            remaining = remaining[:start] + " " * (end - start) + remaining[end:]
    return sorted(found)
# A token is a run of letters and digits. The class is Unicode-aware, so
# accented letters stay inside their word instead of splitting it.
TOKEN_RX = re.compile(r"[^\W_]+")


def tokenize(name: str) -> list[str]:
    """Split *name* into lower-cased alphanumeric tokens.

    Every other character (spaces, punctuation, underscores) is a separator.
    """
    return [t.lower() for t in TOKEN_RX.findall(name)]
def should_skip(name: str) -> bool:
    """Tell whether a file name matches one of ``SKIP_NAME_PATTERNS``."""
    for pattern in SKIP_NAME_PATTERNS:
        if pattern.match(name):
            return True
    return False
def sha256_of(path: Path) -> str:
    """Return the hexadecimal SHA-256 digest of a file, read in ``HASH_CHUNK`` pieces."""
    digest = hashlib.sha256()
    with path.open("rb") as stream:
        # iter() with a sentinel stops at the first empty read, i.e. end of file.
        for block in iter(lambda: stream.read(HASH_CHUNK), b""):
            digest.update(block)
    return digest.hexdigest()
def to_dt(ts: float) -> datetime:
    """Convert a POSIX timestamp to a timezone-aware datetime in UTC."""
    utc = timezone.utc
    return datetime.fromtimestamp(ts, tz=utc)
def _mtime_unchanged(stored, current: datetime) -> bool:
    """Tell whether an mtime read back from MongoDB equals the current file mtime.

    MongoDB stores datetimes with millisecond precision, and a client that is
    not configured as timezone-aware returns them naive (meaning UTC). The
    stored value is therefore made aware when needed and compared with the
    current value cut down to whole milliseconds.
    """
    if not isinstance(stored, datetime):
        return False
    if stored.tzinfo is None:
        stored = stored.replace(tzinfo=timezone.utc)
    current_ms = current.replace(microsecond=current.microsecond // 1000 * 1000)
    return stored == current_ms


def scan_study(study_code: str, study_root: Path, db, scan_started_at: datetime) -> dict:
    """Walk one study folder and upsert the metadata of every file into its collection.

    Files whose size and mtime match the stored record (and that already have
    a sha256) only get ``last_seen_at`` refreshed; all others are hashed and
    fully rewritten. Records not seen in this run are then marked with
    ``deleted_at``.

    Args:
        study_code: collection name and value of the ``study`` field.
        study_root: folder to walk.
        db: MongoDB database object.
        scan_started_at: timestamp of this run, written to ``last_seen_at``,
            ``first_seen_at`` and ``deleted_at``.

    Returns:
        Counters ``seen``, ``rehashed``, ``unchanged``, ``skipped``,
        ``marked_deleted`` plus ``study`` and the list ``errors`` of
        ``(path, message)`` pairs.
    """
    coll = db[study_code]
    coll.create_index([("path", ASCENDING)], unique=True)
    coll.create_index([("ext", ASCENDING)])
    coll.create_index([("dates_in_name", ASCENDING)])
    coll.create_index([("tokens", ASCENDING)])
    coll.create_index([("sha256", ASCENDING)])
    # existing records -> map path -> (size, mtime, sha256)
    existing = {
        d["path"]: (d.get("size_bytes"), d.get("mtime"), d.get("sha256"))
        for d in coll.find({}, {"path": 1, "size_bytes": 1, "mtime": 1, "sha256": 1})
    }
    ops: list[UpdateOne] = []
    seen = 0
    rehashed = 0
    skipped = 0
    errors: list[tuple[str, str]] = []
    print(f"[{study_code}] sken: {study_root}")
    for root, dirs, files in os.walk(study_root):
        # drop skipped directories in place so os.walk does not descend into them
        dirs[:] = [d for d in dirs if d not in SKIP_DIR_NAMES]
        for fname in files:
            if should_skip(fname):
                skipped += 1
                continue
            fpath = Path(root) / fname
            try:
                st = fpath.stat()
            except OSError as e:
                errors.append((str(fpath), f"stat: {e}"))
                continue
            path_str = str(fpath)
            size = st.st_size
            mtime = to_dt(st.st_mtime)
            prev = existing.get(path_str)
            # A plain == between the stored and the fresh mtime never matched
            # (naive, millisecond precision vs. aware, microsecond precision),
            # which forced a re-hash of every file on every run.
            if prev and prev[0] == size and prev[2] and _mtime_unchanged(prev[1], mtime):
                # unchanged - only refresh last_seen_at and clear deleted_at
                ops.append(UpdateOne(
                    {"path": path_str},
                    {"$set": {"last_seen_at": scan_started_at},
                     "$unset": {"deleted_at": ""}},
                ))
            else:
                try:
                    digest = sha256_of(fpath)
                except OSError as e:
                    errors.append((path_str, f"hash: {e}"))
                    continue
                rehashed += 1
                rel = fpath.relative_to(study_root)
                doc = {
                    "path": path_str,
                    "study": study_code,
                    "rel_path": str(rel),
                    "dir": str(fpath.parent),
                    "rel_dir": str(rel.parent) if str(rel.parent) != "." else "",
                    "parent_folders": list(rel.parts[:-1]),
                    "name": fname,
                    "stem": fpath.stem,
                    "ext": fpath.suffix.lower().lstrip("."),
                    "size_bytes": size,
                    "mtime": mtime,
                    "ctime": to_dt(st.st_ctime),
                    "atime": to_dt(st.st_atime),
                    "sha256": digest,
                    "mime": mimetypes.guess_type(fname)[0],
                    "tokens": tokenize(fpath.stem),
                    "dates_in_name": extract_dates(fname),
                    "last_seen_at": scan_started_at,
                }
                ops.append(UpdateOne(
                    {"path": path_str},
                    {"$set": doc, "$unset": {"deleted_at": ""},
                     "$setOnInsert": {"first_seen_at": scan_started_at}},
                    upsert=True,
                ))
            seen += 1
            if len(ops) >= 500:
                coll.bulk_write(ops, ordered=False)
                ops.clear()
                print(f" ... {seen} souboru zpracovano")
    if ops:
        coll.bulk_write(ops, ordered=False)
    # mark as deleted everything that was not seen in this run
    res = coll.update_many(
        {"last_seen_at": {"$lt": scan_started_at}, "deleted_at": {"$exists": False}},
        {"$set": {"deleted_at": scan_started_at}},
    )
    return {
        "study": study_code,
        "seen": seen,
        "rehashed": rehashed,
        "unchanged": seen - rehashed,
        "skipped": skipped,
        "marked_deleted": res.modified_count,
        "errors": errors,
    }
def main() -> int:
    """Scan every configured study folder under the Dropbox root and print a summary.

    Studies whose folder does not exist are reported and skipped. For each
    scanned study at most the first five errors are listed. Returns 0.
    """
    started = time.time()
    dropbox_root = Path(get_dropbox_root())
    print(f"Dropbox root: {dropbox_root}")
    client = MongoClient(MONGO_URI, serverSelectionTimeoutMS=5000)
    # Fail fast when the server is unreachable.
    client.admin.command("ping")
    db = client[DB_NAME]
    # One timestamp shared by all studies of this run.
    scan_started_at = datetime.now(tz=timezone.utc)
    results = []
    for study_code, folder in STUDIES.items():
        study_root = dropbox_root / folder
        if study_root.is_dir():
            results.append(scan_study(study_code, study_root, db, scan_started_at))
        else:
            print(f"[!] {study_root} neexistuje, preskakuji")
    print("\n=== SHRNUTI ===")
    for res in results:
        failures = res["errors"]
        print(f" {res['study']}: seen={res['seen']} rehashed={res['rehashed']} "
              f"unchanged={res['unchanged']} skipped={res['skipped']} "
              f"deleted={res['marked_deleted']} errors={len(failures)}")
        for path, err in failures[:5]:
            print(f" ! {err} ({path})")
        if len(failures) > 5:
            print(f" ... +{len(failures) - 5} dalsich chyb")
    print(f"\nCelkem trvalo: {time.time() - started:.1f} s")
    return 0
if __name__ == "__main__":
    # Script entry point: main()'s return value becomes the process exit status.
    sys.exit(main())