Add Outlook/Soubory/Clario/Feasibility scripts and reports; ignore Incoming, Outlook downloads & profile
This commit is contained in:
@@ -0,0 +1,80 @@
|
||||
# enrich_fulltext_v1.0
|
||||
|
||||
**Verze:** 1.0
|
||||
**Datum:** 2026-06-03
|
||||
**Skript:** `enrich_fulltext_v1.0.py`
|
||||
|
||||
## Účel
|
||||
Pro každý dokument odkazovaný v MongoDB (`soubory.*`) vytáhne **plný text** a uloží do PostgreSQL s GIN `tsvector` indexem pro fulltext vyhledávání.
|
||||
|
||||
## Cíl: PostgreSQL `MongoSoubory`
|
||||
- **host:** 192.168.1.76:5432
|
||||
- **db:** `MongoSoubory`
|
||||
- **user:** vladimir.buzalka
|
||||
- **extension:** `unaccent`, `pg_trgm`
|
||||
- **text search config:** `soubory` (= simple + unaccent → case- a diakritika-insensitivní)
|
||||
|
||||
## Tabulka `documents`
|
||||
| sloupec | typ | popis |
|
||||
|---|---|---|
|
||||
| id | BIGSERIAL | PK |
|
||||
| mongo_id | TEXT | ObjectId z Mongo |
|
||||
| study | TEXT | kolekce v Mongo (`42847922MDD3003` / `77242113UCO3001`) |
|
||||
| path | TEXT | absolutní cesta (UNIQUE s study) |
|
||||
| rel_path, name, ext | TEXT | doplňková metadata |
|
||||
| sha256 | TEXT | pro inkrementální kontrolu |
|
||||
| size_bytes, mtime | BIGINT, TIMESTAMPTZ | velikost souboru a `mtime` převzaté z Mongo |
|
||||
| **body** | TEXT | plný extrahovaný text (max 5 MB) |
|
||||
| body_length | INT | délka v znacích |
|
||||
| **tsv** | tsvector GENERATED STORED | `to_tsvector('soubory', coalesce(body, ''))` |
|
||||
| extracted_at | TIMESTAMPTZ | čas extrakce |
|
||||
| extractor_version | TEXT | verze tohoto skriptu |
|
||||
| ok | BOOLEAN | true pokud extrakce proběhla |
|
||||
| error | TEXT | chybové hlášení |
|
||||
|
||||
**Indexy:** GIN nad `tsv`, GIN trigram nad `name`, btree `sha256`, btree `(study, ext)`.
|
||||
|
||||
## Podporované přípony
|
||||
`pdf`, `docx`, `xlsx`, `xlsm`, `pptx`, `eml`, `msg`, `txt`, `csv`
|
||||
|
||||
## Inkrementální chování
|
||||
Soubor se přeskočí, pokud v PG už pro stejnou dvojici (`study`, `path`) existuje záznam s:
|
||||
- shodným `sha256`
|
||||
- shodnou `extractor_version`
|
||||
- `ok = true`
|
||||
|
||||
Jinak se přeparsuje a UPSERT.
|
||||
|
||||
## Limity (skip s `error=too_big_...`)
|
||||
- PDF nad 500 MB
|
||||
- XLSX nad 200 MB
|
||||
- ostatní nad 300 MB
|
||||
- `body` se vždy ořízne na 5 MB UTF-8
|
||||
|
||||
## Příklady dotazů (psql)
|
||||
```sql
|
||||
-- fulltext (case+diakritika insensitivní)
|
||||
SELECT study, name, ts_rank_cd(tsv, q) AS rank,
|
||||
ts_headline('soubory', body, q, 'MaxFragments=2,MinWords=5,MaxWords=15') AS snippet
|
||||
FROM documents, plainto_tsquery('soubory', 'amendment 3') q
|
||||
WHERE tsv @@ q
|
||||
ORDER BY rank DESC
|
||||
LIMIT 20;
|
||||
|
||||
-- jméno obsahuje podřetězec (ILIKE urychlený trigramovým GIN indexem)
|
||||
SELECT study, name FROM documents
|
||||
WHERE name ILIKE '%protokol%';
|
||||
|
||||
-- nejdelší dokumenty (top 10 přes všechny studie)
|
||||
SELECT study, name, body_length
|
||||
FROM documents
|
||||
WHERE ok = true
|
||||
ORDER BY body_length DESC LIMIT 10;
|
||||
```
|
||||
|
||||
## Spuštění
|
||||
```
|
||||
python U:\PythonProject\Janssen\Soubory\enrich_fulltext_v1.0.py
|
||||
```
|
||||
|
||||
Průběh tiskne řádek na soubor: `[n/total] OK pdf 2.3MB protokol.pdf | 12340 znaku 'Protocol amendment ...'`
|
||||
@@ -0,0 +1,416 @@
|
||||
"""
|
||||
==============================================================================
|
||||
Skript: enrich_fulltext_v1.0.py
|
||||
Verze: 1.0
|
||||
Datum: 2026-06-03
|
||||
Autor: vladimir.buzalka
|
||||
Popis: Vytahne PLNY TEXT z dokumentu odkazovanych v MongoDB (db: soubory)
|
||||
a ulozi ho do PostgreSQL (db: MongoSoubory) s GIN tsvector
|
||||
fulltext indexem.
|
||||
|
||||
Zdroje:
|
||||
- MongoDB 192.168.1.76 db=soubory kolekce=42847922MDD3003, 77242113UCO3001
|
||||
- PostgreSQL 192.168.1.76 db=MongoSoubory tabulka=documents
|
||||
|
||||
Podporovane pripony: pdf, docx, xlsx, xlsm, pptx, eml, msg, txt, csv
|
||||
|
||||
Inkrementalne: preskoci soubor, kde v PG existuje radek se shodnym
|
||||
sha256 a extractor_version a ok=true.
|
||||
|
||||
Pri prvnim behu sam vytvori tabulku, indexy a textovou konfiguraci
|
||||
'soubory' (unaccent + simple) - vyhleda case- a diakritika-insensitivni.
|
||||
==============================================================================
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import email
|
||||
import email.policy
|
||||
import sys
|
||||
import time
|
||||
import traceback
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
|
||||
import psycopg
|
||||
from pymongo import MongoClient
|
||||
|
||||
# --- konfigurace ------------------------------------------------------------
|
||||
import os  # imported here because only the PG_DSN override below needs it

MONGO_URI = "mongodb://192.168.1.76:27017"
MONGO_DB = "soubory"
MONGO_COLLECTIONS = ["42847922MDD3003", "77242113UCO3001"]

# SECURITY: the fallback DSN below contains a real password committed in plain
# text. Set ENRICH_PG_DSN in the environment to override it; the literal is kept
# only so existing deployments keep working. Rotate the password and remove the
# literal from version control.
PG_DSN = os.environ.get("ENRICH_PG_DSN") or (
    "host=192.168.1.76 port=5432 dbname=MongoSoubory "
    "user=vladimir.buzalka password=Vlado7309208104++"
)

EXTRACTOR_VERSION = "1.0"

MAX_TEXT_BYTES = 5 * 1024 * 1024  # at most 5 MB of extracted text per document
MAX_PDF_BYTES = 500 * 1024 * 1024
MAX_XLSX_BYTES = 200 * 1024 * 1024
MAX_GENERIC_BYTES = 300 * 1024 * 1024

SUPPORTED = ("pdf", "docx", "xlsx", "xlsm", "pptx", "eml", "msg", "txt", "csv")
|
||||
|
||||
|
||||
# --- SCHEMA -----------------------------------------------------------------
|
||||
|
||||
SCHEMA_SQL = """
|
||||
CREATE EXTENSION IF NOT EXISTS unaccent;
|
||||
CREATE EXTENSION IF NOT EXISTS pg_trgm;
|
||||
|
||||
DO $$
|
||||
BEGIN
|
||||
IF NOT EXISTS (SELECT 1 FROM pg_ts_config WHERE cfgname = 'soubory') THEN
|
||||
CREATE TEXT SEARCH CONFIGURATION soubory ( COPY = simple );
|
||||
ALTER TEXT SEARCH CONFIGURATION soubory
|
||||
ALTER MAPPING FOR hword, hword_part, word
|
||||
WITH unaccent, simple;
|
||||
END IF;
|
||||
END$$;
|
||||
|
||||
CREATE TABLE IF NOT EXISTS documents (
|
||||
id BIGSERIAL PRIMARY KEY,
|
||||
mongo_id TEXT NOT NULL,
|
||||
study TEXT NOT NULL,
|
||||
path TEXT NOT NULL,
|
||||
rel_path TEXT,
|
||||
name TEXT,
|
||||
ext TEXT,
|
||||
sha256 TEXT NOT NULL,
|
||||
size_bytes BIGINT,
|
||||
mtime TIMESTAMPTZ,
|
||||
body TEXT,
|
||||
body_length INT,
|
||||
tsv tsvector GENERATED ALWAYS AS (
|
||||
to_tsvector('soubory'::regconfig, coalesce(body, ''))
|
||||
) STORED,
|
||||
extracted_at TIMESTAMPTZ DEFAULT now(),
|
||||
extractor_version TEXT,
|
||||
ok BOOLEAN,
|
||||
error TEXT,
|
||||
UNIQUE (study, path)
|
||||
);
|
||||
|
||||
CREATE INDEX IF NOT EXISTS documents_tsv_gin ON documents USING gin(tsv);
|
||||
CREATE INDEX IF NOT EXISTS documents_name_trgm ON documents USING gin(name gin_trgm_ops);
|
||||
CREATE INDEX IF NOT EXISTS documents_sha256_idx ON documents(sha256);
|
||||
CREATE INDEX IF NOT EXISTS documents_study_ext_idx ON documents(study, ext);
|
||||
"""
|
||||
|
||||
|
||||
# --- EXTRAKTORY (vraci string, max MAX_TEXT_BYTES) --------------------------
|
||||
|
||||
def _truncate(s: str) -> str:
    """Cap *s* at MAX_TEXT_BYTES of UTF-8.

    Falsy input yields an empty string. Text that fits is returned untouched;
    longer text is cut on the encoded bytes and decoded again, dropping any
    partial character left at the cut.
    """
    if not s:
        return ""
    encoded = s.encode("utf-8", errors="replace")
    if len(encoded) > MAX_TEXT_BYTES:
        return encoded[:MAX_TEXT_BYTES].decode("utf-8", errors="ignore")
    return s
|
||||
|
||||
|
||||
def extract_pdf(path: Path) -> str:
    """Return the text of a PDF, page by page, joined with newlines.

    Encrypted files are tried with an empty password; if that call raises, an
    empty string is returned. Pages whose extraction raises are skipped.
    Reading stops once the collected text exceeds MAX_TEXT_BYTES characters;
    the final size cap is enforced by _truncate.
    """
    from pypdf import PdfReader

    reader = PdfReader(str(path))
    if reader.is_encrypted:
        # NOTE(review): pypdf may report a failed decrypt through its return
        # value rather than an exception - confirm this branch is reachable.
        try:
            reader.decrypt("")
        except Exception:
            return ""

    chunks = []
    collected = 0
    for page in reader.pages:
        try:
            page_text = page.extract_text() or ""
        except Exception:
            continue
        chunks.append(page_text)
        collected += len(page_text)
        if collected > MAX_TEXT_BYTES:
            break
    return _truncate("\n".join(chunks))
|
||||
|
||||
|
||||
def extract_docx(path: Path) -> str:
    """Return the text of a DOCX file.

    Non-empty paragraphs come first, followed by one line per table row with
    the cell texts joined by " | ".
    """
    from docx import Document

    document = Document(str(path))
    lines = [para.text for para in document.paragraphs if para.text]
    lines.extend(
        " | ".join(cell.text for cell in row.cells)
        for table in document.tables
        for row in table.rows
    )
    return _truncate("\n".join(lines))
|
||||
|
||||
|
||||
def extract_xlsx(path: Path) -> str:
    """Return the cell values of an XLSX/XLSM workbook as tab-separated lines.

    Each worksheet starts with a "# <title>" line; rows that are blank after
    joining are skipped. Reading stops once the collected text exceeds
    MAX_TEXT_BYTES characters; the final size cap is enforced by _truncate.
    """
    from openpyxl import load_workbook

    wb = load_workbook(str(path), read_only=True, data_only=True)
    parts = []
    total = 0
    try:
        for ws in wb.worksheets:
            parts.append(f"# {ws.title}")
            for row in ws.iter_rows(values_only=True):
                line = "\t".join("" if v is None else str(v) for v in row)
                if line.strip():
                    parts.append(line)
                    total += len(line)
                    if total > MAX_TEXT_BYTES:
                        break
            if total > MAX_TEXT_BYTES:
                break
    finally:
        # Close even when a sheet fails to parse, so the file handle opened by
        # load_workbook is not left dangling.
        wb.close()
    return _truncate("\n".join(parts))
|
||||
|
||||
|
||||
def extract_pptx(path: Path) -> str:
    """Return the text of a PPTX presentation.

    Each slide starts with a "# slide <n>" line, followed by the non-blank
    paragraphs of every shape that has a text frame, and a "[notes] ..." line
    when the slide has speaker notes.
    """
    from pptx import Presentation

    prs = Presentation(str(path))
    parts = []
    for i, slide in enumerate(prs.slides, 1):
        parts.append(f"# slide {i}")
        for shape in slide.shapes:
            if not shape.has_text_frame:
                continue
            for para in shape.text_frame.paragraphs:
                line = "".join(run.text for run in para.runs)
                if line.strip():
                    parts.append(line)
        if slide.has_notes_slide:
            # A notes slide without a notes placeholder has no text frame;
            # treat that as "no notes" instead of failing the whole file.
            notes_frame = slide.notes_slide.notes_text_frame
            notes = notes_frame.text if notes_frame is not None else ""
            if notes:
                parts.append(f"[notes] {notes}")
    return _truncate("\n".join(parts))
|
||||
|
||||
|
||||
def extract_eml(path: Path) -> str:
    """Return the selected headers and plain-text body of an .eml message.

    The From/To/Cc/Subject/Date headers come first. For multipart messages
    every text/plain part without a filename is appended; otherwise the single
    payload is appended. Payloads that cannot be decoded, or that are not text,
    are skipped so that the headers are still returned.
    """
    with path.open("rb") as f:
        msg = email.message_from_binary_file(f, policy=email.policy.default)

    head = []
    for k in ("From", "To", "Cc", "Subject", "Date"):
        v = msg.get(k)
        if v:
            head.append(f"{k}: {v}")
    parts = ["\n".join(head)]

    if msg.is_multipart():
        candidates = [
            part for part in msg.walk()
            if part.get_content_type() == "text/plain" and not part.get_filename()
        ]
    else:
        candidates = [msg]

    for part in candidates:
        try:
            content = part.get_content()
        except Exception:
            # Best effort: an undecodable part must not fail the whole message.
            continue
        # Non-text payloads come back as bytes and would break the str join.
        if isinstance(content, str):
            parts.append(content)
    return _truncate("\n\n".join(parts))
|
||||
|
||||
|
||||
def extract_msg(path: Path) -> str:
    """Return the headers and body of an Outlook .msg file.

    Non-empty Subject/From/To/Cc/Date values form the header block, followed
    by a blank line and the message body (empty when there is none).
    """
    # Alias the library so it is not confused with this function's own name.
    import extract_msg as msg_lib

    with msg_lib.openMsg(str(path)) as m:
        labelled = (
            ("Subject", m.subject),
            ("From", m.sender),
            ("To", m.to),
            ("Cc", m.cc),
            ("Date", m.date),
        )
        head = [f"{label}: {value}" for label, value in labelled if value]
        return _truncate("\n".join(head) + "\n\n" + (m.body or ""))
|
||||
|
||||
|
||||
def extract_text(path: Path) -> str:
    """Return the decoded content of a plain-text file (txt/csv).

    At most MAX_TEXT_BYTES bytes are read. Decoding tries UTF-8 (with optional
    BOM), then cp1250, then latin-1, which accepts any byte sequence. The
    result goes through _truncate so the UTF-8 size cap also holds after a
    cp1250/latin-1 decode.
    """
    import codecs

    # Read one byte past the cap to learn whether the file was clipped, without
    # loading a file of hundreds of megabytes into memory.
    with path.open("rb") as fh:
        data = fh.read(MAX_TEXT_BYTES + 1)
    clipped = len(data) > MAX_TEXT_BYTES
    if clipped:
        data = data[:MAX_TEXT_BYTES]

    try:
        # final=False tolerates a multi-byte character split by the clip;
        # otherwise a valid UTF-8 file would be misdecoded as cp1250.
        decoder = codecs.getincrementaldecoder("utf-8-sig")()
        text = decoder.decode(data, final=not clipped)
    except UnicodeDecodeError:
        try:
            text = data.decode("cp1250")
        except UnicodeDecodeError:
            text = data.decode("latin-1")
    return _truncate(text)
|
||||
|
||||
|
||||
EXTRACTORS = {
|
||||
"pdf": (extract_pdf, MAX_PDF_BYTES),
|
||||
"docx": (extract_docx, MAX_GENERIC_BYTES),
|
||||
"xlsx": (extract_xlsx, MAX_XLSX_BYTES),
|
||||
"xlsm": (extract_xlsx, MAX_XLSX_BYTES),
|
||||
"pptx": (extract_pptx, MAX_GENERIC_BYTES),
|
||||
"eml": (extract_eml, MAX_GENERIC_BYTES),
|
||||
"msg": (extract_msg, MAX_GENERIC_BYTES),
|
||||
"txt": (extract_text, MAX_GENERIC_BYTES),
|
||||
"csv": (extract_text, MAX_GENERIC_BYTES),
|
||||
}
|
||||
|
||||
|
||||
def _short(s, n=40):
|
||||
if not s:
|
||||
return ""
|
||||
s = str(s).replace("\n", " ").replace("\r", " ").strip()
|
||||
return s if len(s) <= n else s[:n] + "..."
|
||||
|
||||
|
||||
def _now() -> datetime:
|
||||
return datetime.now(tz=timezone.utc)
|
||||
|
||||
|
||||
# --- HLAVNI SMYCKA ----------------------------------------------------------
|
||||
|
||||
def process_collection(pg: psycopg.Connection, mongo_coll, study: str) -> dict:
    """Extract text for one Mongo collection and upsert it into PostgreSQL.

    Documents whose path already has a row with the same sha256, the current
    EXTRACTOR_VERSION and ok=true are skipped. Every other document produces a
    row (successful or with an error) that is written in batches of 50.

    Returns a dict with the study name and the processed / ok / errors /
    skipped / too_big counters.
    """
    # What PostgreSQL already holds for this study, keyed by document path.
    with pg.cursor() as cur:
        cur.execute(
            "SELECT path, sha256, extractor_version, ok FROM documents WHERE study = %s",
            (study,),
        )
        existing = {p: (sha, ver, done) for p, sha, ver, done in cur.fetchall()}

    wanted = {"ext": {"$in": list(EXTRACTORS.keys())}, "deleted_at": {"$exists": False}}
    projection = {"_id": 1, "path": 1, "rel_path": 1, "name": 1, "ext": 1,
                  "sha256": 1, "size_bytes": 1, "mtime": 1}
    cursor = mongo_coll.find(wanted, projection, no_cursor_timeout=True)

    tally = {"processed": 0, "ok": 0, "errors": 0, "skipped": 0, "too_big": 0}
    queue = []
    total_pending = mongo_coll.count_documents(wanted)
    print(f"[{study}] kandidatu v Mongo: {total_pending}")

    try:
        for n, doc in enumerate(cursor, 1):
            known = existing.get(doc["path"])
            if (known and known[0] == doc.get("sha256")
                    and known[1] == EXTRACTOR_VERSION and known[2]):
                tally["skipped"] += 1
                continue

            ext = doc["ext"]
            extractor, max_bytes = EXTRACTORS[ext]
            path = Path(doc["path"])
            declared_size = doc.get("size_bytes") or 0

            # Start from a failed row; the success branch fills in the rest.
            row = {
                "mongo_id": str(doc["_id"]),
                "study": study,
                "path": doc["path"],
                "rel_path": doc.get("rel_path"),
                "name": doc.get("name"),
                "ext": ext,
                "sha256": doc.get("sha256"),
                "size_bytes": doc.get("size_bytes"),
                "mtime": doc.get("mtime"),
                "body": None,
                "body_length": 0,
                "extracted_at": _now(),
                "extractor_version": EXTRACTOR_VERSION,
                "ok": False,
                "error": None,
            }

            status, detail = "OK ", ""
            if not path.exists():
                row["error"] = detail = "file_missing"
                status = "ERR"
                tally["errors"] += 1
            elif declared_size > max_bytes:
                row["error"] = f"too_big_>{max_bytes}"
                status = "BIG"
                detail = f"too_big_>{max_bytes//1024//1024}MB"
                tally["too_big"] += 1
            else:
                try:
                    body = extractor(path) or ""
                    row["body"] = body or None
                    row["body_length"] = len(body)
                    row["ok"] = True
                    tally["ok"] += 1
                    detail = f"{len(body)} znaku {_short(body, 60)!r}"
                except Exception as e:
                    row["error"] = f"{type(e).__name__}: {e}"[:500]
                    status = "ERR"
                    detail = row["error"][:80]
                    tally["errors"] += 1

            queue.append(row)
            tally["processed"] += 1
            size_mb = declared_size / 1024 / 1024
            print(f" [{n:>4}/{total_pending}] {status} {ext:<4} {size_mb:6.1f}MB "
                  f"{path.name} | {detail}", flush=True)

            if len(queue) >= 50:
                _flush(pg, queue)
                queue.clear()
    finally:
        cursor.close()

    if queue:
        _flush(pg, queue)

    return {"study": study, **tally}
|
||||
|
||||
|
||||
UPSERT_SQL = """
|
||||
INSERT INTO documents
|
||||
(mongo_id, study, path, rel_path, name, ext, sha256, size_bytes, mtime,
|
||||
body, body_length, extracted_at, extractor_version, ok, error)
|
||||
VALUES
|
||||
(%(mongo_id)s, %(study)s, %(path)s, %(rel_path)s, %(name)s, %(ext)s, %(sha256)s,
|
||||
%(size_bytes)s, %(mtime)s, %(body)s, %(body_length)s, %(extracted_at)s,
|
||||
%(extractor_version)s, %(ok)s, %(error)s)
|
||||
ON CONFLICT (study, path) DO UPDATE SET
|
||||
mongo_id = EXCLUDED.mongo_id,
|
||||
rel_path = EXCLUDED.rel_path,
|
||||
name = EXCLUDED.name,
|
||||
ext = EXCLUDED.ext,
|
||||
sha256 = EXCLUDED.sha256,
|
||||
size_bytes = EXCLUDED.size_bytes,
|
||||
mtime = EXCLUDED.mtime,
|
||||
body = EXCLUDED.body,
|
||||
body_length = EXCLUDED.body_length,
|
||||
extracted_at = EXCLUDED.extracted_at,
|
||||
extractor_version = EXCLUDED.extractor_version,
|
||||
ok = EXCLUDED.ok,
|
||||
error = EXCLUDED.error
|
||||
"""
|
||||
|
||||
|
||||
def _flush(pg: psycopg.Connection, rows: list[dict]) -> None:
    """Upsert *rows* with UPSERT_SQL in one batch and commit."""
    with pg.cursor() as batch_cur:
        batch_cur.executemany(UPSERT_SQL, rows)
    pg.commit()
|
||||
|
||||
|
||||
def main() -> int:
    """Create the schema if needed, process every configured collection, print a summary.

    Returns 0 on success; connection and processing errors propagate to the
    caller. Both database connections are closed even when processing fails.
    """
    t0 = time.time()
    print("Pripojuji se k PostgreSQL...")
    pg = psycopg.connect(PG_DSN, connect_timeout=10)
    mongo = None
    try:
        with pg.cursor() as cur:
            cur.execute(SCHEMA_SQL)
        pg.commit()
        print("Schema OK.")

        print("Pripojuji se k MongoDB...")
        mongo = MongoClient(MONGO_URI, serverSelectionTimeoutMS=5000)
        mongo.admin.command("ping")
        db = mongo[MONGO_DB]
        print("Mongo OK.")

        results = [process_collection(pg, db[name], name) for name in MONGO_COLLECTIONS]
    finally:
        # Release both connections on the error path too.
        if mongo is not None:
            mongo.close()
        pg.close()

    print("\n=== SHRNUTI ===")
    for r in results:
        print(f" {r['study']}: processed={r['processed']} ok={r['ok']} "
              f"errors={r['errors']} skipped={r['skipped']} too_big={r['too_big']}")
    print(f"\nCelkem trvalo: {time.time() - t0:.1f} s")
    return 0
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point: propagate main()'s status, report crashes with a
    # traceback, and make an interrupted run exit non-zero.
    try:
        raise SystemExit(main())
    except KeyboardInterrupt:
        print("\nPreruseno uzivatelem")
        # 130 = conventional "terminated by Ctrl+C"; previously this exited 0,
        # which looked like success to a scheduler or calling script.
        sys.exit(130)
    except Exception:
        traceback.print_exc()
        sys.exit(1)
|
||||
@@ -0,0 +1,22 @@
|
||||
# enrich_fulltext_v1.1
|
||||
|
||||
**Verze:** 1.1
|
||||
**Datum:** 2026-06-03
|
||||
**Skript:** `enrich_fulltext_v1.1.py`
|
||||
|
||||
## Změny proti v1.0
|
||||
- **NUL bajty (0x00) v textu** — PG TEXT je odmítá. v1.1 odstraní všechny `\x00` a ostatní controly (kromě `\n \r \t`) ve společné funkci `_clean_for_pg`, navíc bezpečnostní strip i v `_flush` před UPSERT.
|
||||
- **DOCX fallback** — pokud python-docx hodí výjimku (typicky `"no tr above topmost tr in w:tbl"` u VTMF formulářů s rozbitými tabulkami), v1.1 sáhne přímo do `word/document.xml` v ZIPu a regexem vytáhne text z `<w:t>` elementů. Přijde o strukturu tabulek, ale text zachrání.
|
||||
- `extractor_version` zvýšena na `1.1` → všechny řádky z v1.0 se přeparsují (původní stejně pravděpodobně chyběly kvůli pádu).
|
||||
|
||||
## Vše ostatní
|
||||
Beze změny proti [v1.0](Trash/enrich_fulltext_v1.0.md):
|
||||
- Tabulka `documents` v PG `MongoSoubory` (192.168.1.76:5432)
|
||||
- Text search config `soubory` (simple + unaccent)
|
||||
- Limity: PDF 500 MB, XLSX 200 MB, ostatní 300 MB; text max 5 MB
|
||||
- Inkrementálně podle `sha256` + `extractor_version`
|
||||
|
||||
## Spuštění
|
||||
```
|
||||
python U:\PythonProject\Janssen\Soubory\enrich_fulltext_v1.1.py
|
||||
```
|
||||
@@ -0,0 +1,457 @@
|
||||
"""
|
||||
==============================================================================
|
||||
Skript: enrich_fulltext_v1.1.py
|
||||
Verze: 1.1
|
||||
Datum: 2026-06-03
|
||||
Autor: vladimir.buzalka
|
||||
Popis: Vytahne PLNY TEXT z dokumentu odkazovanych v MongoDB (db: soubory)
|
||||
a ulozi ho do PostgreSQL (db: MongoSoubory) s GIN tsvector indexem.
|
||||
|
||||
Zmeny proti v1.0:
|
||||
- PG odmita NUL (0x00) bajty v TEXT -> v _truncate se vsechny NULy odstrani
|
||||
(i jine controly krome \\n \\r \\t)
|
||||
- DOCX fallback: pokud python-docx selze (typicky "no tr above topmost tr
|
||||
in w:tbl" u rozbitych tabulek), pokusi se primy raw extract z word/document.xml
|
||||
pres regex - prijde o strukturu tabulek, ale zachrani text
|
||||
- drobnost: posunul jsem extractor_version na "1.1" -> stare radky se preparsuji
|
||||
|
||||
Cilove ulozeni:
|
||||
- MongoDB 192.168.1.76 db=soubory kolekce=42847922MDD3003, 77242113UCO3001
|
||||
- PostgreSQL 192.168.1.76 db=MongoSoubory tabulka=documents
|
||||
|
||||
Podporovane pripony: pdf, docx, xlsx, xlsm, pptx, eml, msg, txt, csv
|
||||
==============================================================================
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import email
|
||||
import email.policy
|
||||
import re
|
||||
import sys
|
||||
import time
|
||||
import traceback
|
||||
import zipfile
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
|
||||
import psycopg
|
||||
from pymongo import MongoClient
|
||||
|
||||
# --- konfigurace ------------------------------------------------------------
|
||||
import os  # imported here because only the PG_DSN override below needs it

MONGO_URI = "mongodb://192.168.1.76:27017"
MONGO_DB = "soubory"
MONGO_COLLECTIONS = ["42847922MDD3003", "77242113UCO3001"]

# SECURITY: the fallback DSN below contains a real password committed in plain
# text. Set ENRICH_PG_DSN in the environment to override it; the literal is kept
# only so existing deployments keep working. Rotate the password and remove the
# literal from version control.
PG_DSN = os.environ.get("ENRICH_PG_DSN") or (
    "host=192.168.1.76 port=5432 dbname=MongoSoubory "
    "user=vladimir.buzalka password=Vlado7309208104++"
)

EXTRACTOR_VERSION = "1.1"

MAX_TEXT_BYTES = 5 * 1024 * 1024  # at most 5 MB of extracted text per document
MAX_PDF_BYTES = 500 * 1024 * 1024
MAX_XLSX_BYTES = 200 * 1024 * 1024
MAX_GENERIC_BYTES = 300 * 1024 * 1024

SUPPORTED = ("pdf", "docx", "xlsx", "xlsm", "pptx", "eml", "msg", "txt", "csv")
|
||||
|
||||
|
||||
# --- SCHEMA -----------------------------------------------------------------
|
||||
|
||||
SCHEMA_SQL = """
|
||||
CREATE EXTENSION IF NOT EXISTS unaccent;
|
||||
CREATE EXTENSION IF NOT EXISTS pg_trgm;
|
||||
|
||||
DO $$
|
||||
BEGIN
|
||||
IF NOT EXISTS (SELECT 1 FROM pg_ts_config WHERE cfgname = 'soubory') THEN
|
||||
CREATE TEXT SEARCH CONFIGURATION soubory ( COPY = simple );
|
||||
ALTER TEXT SEARCH CONFIGURATION soubory
|
||||
ALTER MAPPING FOR hword, hword_part, word
|
||||
WITH unaccent, simple;
|
||||
END IF;
|
||||
END$$;
|
||||
|
||||
CREATE TABLE IF NOT EXISTS documents (
|
||||
id BIGSERIAL PRIMARY KEY,
|
||||
mongo_id TEXT NOT NULL,
|
||||
study TEXT NOT NULL,
|
||||
path TEXT NOT NULL,
|
||||
rel_path TEXT,
|
||||
name TEXT,
|
||||
ext TEXT,
|
||||
sha256 TEXT NOT NULL,
|
||||
size_bytes BIGINT,
|
||||
mtime TIMESTAMPTZ,
|
||||
body TEXT,
|
||||
body_length INT,
|
||||
tsv tsvector GENERATED ALWAYS AS (
|
||||
to_tsvector('soubory'::regconfig, coalesce(body, ''))
|
||||
) STORED,
|
||||
extracted_at TIMESTAMPTZ DEFAULT now(),
|
||||
extractor_version TEXT,
|
||||
ok BOOLEAN,
|
||||
error TEXT,
|
||||
UNIQUE (study, path)
|
||||
);
|
||||
|
||||
CREATE INDEX IF NOT EXISTS documents_tsv_gin ON documents USING gin(tsv);
|
||||
CREATE INDEX IF NOT EXISTS documents_name_trgm ON documents USING gin(name gin_trgm_ops);
|
||||
CREATE INDEX IF NOT EXISTS documents_sha256_idx ON documents(sha256);
|
||||
CREATE INDEX IF NOT EXISTS documents_study_ext_idx ON documents(study, ext);
|
||||
"""
|
||||
|
||||
|
||||
# --- HELPERY ----------------------------------------------------------------
|
||||
|
||||
# odstrani 0x00 a ostatni controly krome whitespace
|
||||
_CTRL_RX = re.compile(r"[\x00-\x08\x0b\x0c\x0e-\x1f]")
|
||||
|
||||
|
||||
def _clean_for_pg(s: str) -> str:
|
||||
if not s:
|
||||
return ""
|
||||
return _CTRL_RX.sub("", s)
|
||||
|
||||
|
||||
def _truncate(s: str) -> str:
    """Clean *s* with _clean_for_pg and cap it at MAX_TEXT_BYTES of UTF-8.

    Text that fits is returned as cleaned; longer text is cut on the encoded
    bytes and decoded again, dropping any partial character left at the cut.
    """
    cleaned = _clean_for_pg(s or "")
    if not cleaned:
        return ""
    encoded = cleaned.encode("utf-8", errors="replace")
    if len(encoded) > MAX_TEXT_BYTES:
        return encoded[:MAX_TEXT_BYTES].decode("utf-8", errors="ignore")
    return cleaned
|
||||
|
||||
|
||||
# --- EXTRAKTORY -------------------------------------------------------------
|
||||
|
||||
def extract_pdf(path: Path) -> str:
    """Return the text of a PDF, page by page, joined with newlines.

    Encrypted files are tried with an empty password; if that call raises, an
    empty string is returned. Pages whose extraction raises are skipped.
    Reading stops once the collected text exceeds MAX_TEXT_BYTES characters;
    the final size cap is enforced by _truncate.
    """
    from pypdf import PdfReader

    reader = PdfReader(str(path))
    if reader.is_encrypted:
        # NOTE(review): pypdf may report a failed decrypt through its return
        # value rather than an exception - confirm this branch is reachable.
        try:
            reader.decrypt("")
        except Exception:
            return ""

    chunks = []
    collected = 0
    for page in reader.pages:
        try:
            page_text = page.extract_text() or ""
        except Exception:
            continue
        chunks.append(page_text)
        collected += len(page_text)
        if collected > MAX_TEXT_BYTES:
            break
    return _truncate("\n".join(chunks))
|
||||
|
||||
|
||||
# regex pro DOCX fallback - vytahne <w:t>...</w:t>
|
||||
_DOCX_WT_RX = re.compile(r"<w:t[^>]*>([^<]*)</w:t>", re.DOTALL)
|
||||
_DOCX_WP_END_RX = re.compile(r"</w:p>")
|
||||
|
||||
|
||||
def _docx_raw_text(path: Path) -> str:
|
||||
"""Fallback - cte primo word/document.xml ze ZIPu."""
|
||||
with zipfile.ZipFile(str(path)) as z:
|
||||
try:
|
||||
xml = z.read("word/document.xml").decode("utf-8", errors="replace")
|
||||
except KeyError:
|
||||
return ""
|
||||
xml = _DOCX_WP_END_RX.sub("\n", xml)
|
||||
return "\n".join(m.group(1) for m in _DOCX_WT_RX.finditer(xml))
|
||||
|
||||
|
||||
def extract_docx(path: Path) -> str:
    """Return the text of a DOCX file.

    With python-docx: non-empty paragraphs first, then one line per table row
    with the cell texts joined by " | ". If python-docx raises while loading or
    walking the document, the raw-XML fallback _docx_raw_text is used instead.
    """
    from docx import Document

    try:
        document = Document(str(path))
        lines = [para.text for para in document.paragraphs if para.text]
        lines.extend(
            " | ".join(cell.text for cell in row.cells)
            for table in document.tables
            for row in table.rows
        )
        return _truncate("\n".join(lines))
    except Exception:
        # Fallback: pull the text straight out of the XML.
        return _truncate(_docx_raw_text(path))
|
||||
|
||||
|
||||
def extract_xlsx(path: Path) -> str:
    """Return the cell values of an XLSX/XLSM workbook as tab-separated lines.

    Each worksheet starts with a "# <title>" line; rows that are blank after
    joining are skipped. Reading stops once the collected text exceeds
    MAX_TEXT_BYTES characters; the final size cap is enforced by _truncate.
    """
    from openpyxl import load_workbook

    wb = load_workbook(str(path), read_only=True, data_only=True)
    parts = []
    total = 0
    try:
        for ws in wb.worksheets:
            parts.append(f"# {ws.title}")
            for row in ws.iter_rows(values_only=True):
                line = "\t".join("" if v is None else str(v) for v in row)
                if line.strip():
                    parts.append(line)
                    total += len(line)
                    if total > MAX_TEXT_BYTES:
                        break
            if total > MAX_TEXT_BYTES:
                break
    finally:
        # Close even when a sheet fails to parse, so the file handle opened by
        # load_workbook is not left dangling.
        wb.close()
    return _truncate("\n".join(parts))
|
||||
|
||||
|
||||
def extract_pptx(path: Path) -> str:
    """Return the text of a PPTX presentation.

    Each slide starts with a "# slide <n>" line, followed by the non-blank
    paragraphs of every shape that has a text frame, and a "[notes] ..." line
    when the slide has speaker notes.
    """
    from pptx import Presentation

    prs = Presentation(str(path))
    parts = []
    for i, slide in enumerate(prs.slides, 1):
        parts.append(f"# slide {i}")
        for shape in slide.shapes:
            if not shape.has_text_frame:
                continue
            for para in shape.text_frame.paragraphs:
                line = "".join(run.text for run in para.runs)
                if line.strip():
                    parts.append(line)
        if slide.has_notes_slide:
            # A notes slide without a notes placeholder has no text frame;
            # treat that as "no notes" instead of failing the whole file.
            notes_frame = slide.notes_slide.notes_text_frame
            notes = notes_frame.text if notes_frame is not None else ""
            if notes:
                parts.append(f"[notes] {notes}")
    return _truncate("\n".join(parts))
|
||||
|
||||
|
||||
def extract_eml(path: Path) -> str:
    """Return the selected headers and plain-text body of an .eml message.

    The From/To/Cc/Subject/Date headers come first. For multipart messages
    every text/plain part without a filename is appended; otherwise the single
    payload is appended. Payloads that cannot be decoded, or that are not text,
    are skipped so that the headers are still returned.
    """
    with path.open("rb") as f:
        msg = email.message_from_binary_file(f, policy=email.policy.default)

    head = []
    for k in ("From", "To", "Cc", "Subject", "Date"):
        v = msg.get(k)
        if v:
            head.append(f"{k}: {v}")
    parts = ["\n".join(head)]

    if msg.is_multipart():
        candidates = [
            part for part in msg.walk()
            if part.get_content_type() == "text/plain" and not part.get_filename()
        ]
    else:
        candidates = [msg]

    for part in candidates:
        try:
            content = part.get_content()
        except Exception:
            # Best effort: an undecodable part must not fail the whole message.
            continue
        # Non-text payloads come back as bytes and would break the str join.
        if isinstance(content, str):
            parts.append(content)
    return _truncate("\n\n".join(parts))
|
||||
|
||||
|
||||
def extract_msg(path: Path) -> str:
    """Return the headers and body of an Outlook .msg file.

    Non-empty Subject/From/To/Cc/Date values form the header block, followed
    by a blank line and the message body (empty when there is none).
    """
    # Alias the library so it is not confused with this function's own name.
    import extract_msg as msg_lib

    with msg_lib.openMsg(str(path)) as m:
        labelled = (
            ("Subject", m.subject),
            ("From", m.sender),
            ("To", m.to),
            ("Cc", m.cc),
            ("Date", m.date),
        )
        head = [f"{label}: {value}" for label, value in labelled if value]
        return _truncate("\n".join(head) + "\n\n" + (m.body or ""))
|
||||
|
||||
|
||||
def extract_text(path: Path) -> str:
    """Return the decoded content of a plain-text file (txt/csv).

    At most MAX_TEXT_BYTES bytes are read. Decoding tries UTF-8 (with optional
    BOM), then cp1250, then latin-1, which accepts any byte sequence. The
    result is passed through _truncate.
    """
    import codecs

    # Read one byte past the cap to learn whether the file was clipped, without
    # loading a file of hundreds of megabytes into memory.
    with path.open("rb") as fh:
        data = fh.read(MAX_TEXT_BYTES + 1)
    clipped = len(data) > MAX_TEXT_BYTES
    if clipped:
        data = data[:MAX_TEXT_BYTES]

    try:
        # final=False tolerates a multi-byte character split by the clip;
        # otherwise a valid UTF-8 file would be misdecoded as cp1250.
        decoder = codecs.getincrementaldecoder("utf-8-sig")()
        text = decoder.decode(data, final=not clipped)
    except UnicodeDecodeError:
        try:
            text = data.decode("cp1250")
        except UnicodeDecodeError:
            text = data.decode("latin-1")
    return _truncate(text)
|
||||
|
||||
|
||||
EXTRACTORS = {
|
||||
"pdf": (extract_pdf, MAX_PDF_BYTES),
|
||||
"docx": (extract_docx, MAX_GENERIC_BYTES),
|
||||
"xlsx": (extract_xlsx, MAX_XLSX_BYTES),
|
||||
"xlsm": (extract_xlsx, MAX_XLSX_BYTES),
|
||||
"pptx": (extract_pptx, MAX_GENERIC_BYTES),
|
||||
"eml": (extract_eml, MAX_GENERIC_BYTES),
|
||||
"msg": (extract_msg, MAX_GENERIC_BYTES),
|
||||
"txt": (extract_text, MAX_GENERIC_BYTES),
|
||||
"csv": (extract_text, MAX_GENERIC_BYTES),
|
||||
}
|
||||
|
||||
|
||||
def _short(s, n=40):
|
||||
if not s:
|
||||
return ""
|
||||
s = str(s).replace("\n", " ").replace("\r", " ").strip()
|
||||
return s if len(s) <= n else s[:n] + "..."
|
||||
|
||||
|
||||
def _now() -> datetime:
|
||||
return datetime.now(tz=timezone.utc)
|
||||
|
||||
|
||||
# --- HLAVNI SMYCKA ----------------------------------------------------------
|
||||
|
||||
def process_collection(pg: psycopg.Connection, mongo_coll, study: str) -> dict:
    """Extract text for one Mongo collection and upsert it into PostgreSQL.

    Documents whose path already has a row with the same sha256, the current
    EXTRACTOR_VERSION and ok=true are skipped. Every other document produces a
    row (successful or with an error) that is written in batches of 50.

    Returns a dict with the study name and the processed / ok / errors /
    skipped / too_big counters.
    """
    # What PostgreSQL already holds for this study, keyed by document path.
    with pg.cursor() as cur:
        cur.execute(
            "SELECT path, sha256, extractor_version, ok FROM documents WHERE study = %s",
            (study,),
        )
        existing = {p: (sha, ver, done) for p, sha, ver, done in cur.fetchall()}

    wanted = {"ext": {"$in": list(EXTRACTORS.keys())}, "deleted_at": {"$exists": False}}
    projection = {"_id": 1, "path": 1, "rel_path": 1, "name": 1, "ext": 1,
                  "sha256": 1, "size_bytes": 1, "mtime": 1}
    cursor = mongo_coll.find(wanted, projection, no_cursor_timeout=True)

    tally = {"processed": 0, "ok": 0, "errors": 0, "skipped": 0, "too_big": 0}
    queue: list[dict] = []
    total_pending = mongo_coll.count_documents(wanted)
    print(f"[{study}] kandidatu v Mongo: {total_pending}")

    try:
        for n, doc in enumerate(cursor, 1):
            known = existing.get(doc["path"])
            if (known and known[0] == doc.get("sha256")
                    and known[1] == EXTRACTOR_VERSION and known[2]):
                tally["skipped"] += 1
                continue

            ext = doc["ext"]
            extractor, max_bytes = EXTRACTORS[ext]
            path = Path(doc["path"])
            declared_size = doc.get("size_bytes") or 0

            # Start from a failed row; the success branch fills in the rest.
            row = {
                "mongo_id": str(doc["_id"]),
                "study": study,
                "path": doc["path"],
                "rel_path": doc.get("rel_path"),
                "name": doc.get("name"),
                "ext": ext,
                "sha256": doc.get("sha256"),
                "size_bytes": doc.get("size_bytes"),
                "mtime": doc.get("mtime"),
                "body": None,
                "body_length": 0,
                "extracted_at": _now(),
                "extractor_version": EXTRACTOR_VERSION,
                "ok": False,
                "error": None,
            }

            status, detail = "OK ", ""
            if not path.exists():
                row["error"] = detail = "file_missing"
                status = "ERR"
                tally["errors"] += 1
            elif declared_size > max_bytes:
                row["error"] = f"too_big_>{max_bytes}"
                status = "BIG"
                detail = f"too_big_>{max_bytes//1024//1024}MB"
                tally["too_big"] += 1
            else:
                try:
                    body = extractor(path) or ""
                    row["body"] = body or None
                    row["body_length"] = len(body)
                    row["ok"] = True
                    tally["ok"] += 1
                    detail = f"{len(body)} znaku {_short(body, 60)!r}"
                except Exception as e:
                    row["error"] = f"{type(e).__name__}: {e}"[:500]
                    status = "ERR"
                    detail = row["error"][:80]
                    tally["errors"] += 1

            queue.append(row)
            tally["processed"] += 1
            size_mb = declared_size / 1024 / 1024
            print(f" [{n:>4}/{total_pending}] {status} {ext:<4} {size_mb:6.1f}MB "
                  f"{path.name} | {detail}", flush=True)

            if len(queue) >= 50:
                _flush(pg, queue)
                queue.clear()
    finally:
        cursor.close()

    if queue:
        _flush(pg, queue)

    return {"study": study, **tally}
|
||||
|
||||
|
||||
UPSERT_SQL = """
|
||||
INSERT INTO documents
|
||||
(mongo_id, study, path, rel_path, name, ext, sha256, size_bytes, mtime,
|
||||
body, body_length, extracted_at, extractor_version, ok, error)
|
||||
VALUES
|
||||
(%(mongo_id)s, %(study)s, %(path)s, %(rel_path)s, %(name)s, %(ext)s, %(sha256)s,
|
||||
%(size_bytes)s, %(mtime)s, %(body)s, %(body_length)s, %(extracted_at)s,
|
||||
%(extractor_version)s, %(ok)s, %(error)s)
|
||||
ON CONFLICT (study, path) DO UPDATE SET
|
||||
mongo_id = EXCLUDED.mongo_id,
|
||||
rel_path = EXCLUDED.rel_path,
|
||||
name = EXCLUDED.name,
|
||||
ext = EXCLUDED.ext,
|
||||
sha256 = EXCLUDED.sha256,
|
||||
size_bytes = EXCLUDED.size_bytes,
|
||||
mtime = EXCLUDED.mtime,
|
||||
body = EXCLUDED.body,
|
||||
body_length = EXCLUDED.body_length,
|
||||
extracted_at = EXCLUDED.extracted_at,
|
||||
extractor_version = EXCLUDED.extractor_version,
|
||||
ok = EXCLUDED.ok,
|
||||
error = EXCLUDED.error
|
||||
"""
|
||||
|
||||
|
||||
def _flush(pg: psycopg.Connection, rows: list[dict]) -> None:
|
||||
# posledni pojistka - jeste jednou strip NUL (kdyby se necim prokrouzil)
|
||||
for r in rows:
|
||||
if r.get("body"):
|
||||
r["body"] = _clean_for_pg(r["body"])
|
||||
if r.get("error"):
|
||||
r["error"] = _clean_for_pg(r["error"])
|
||||
with pg.cursor() as cur:
|
||||
cur.executemany(UPSERT_SQL, rows)
|
||||
pg.commit()
|
||||
|
||||
|
||||
def main() -> int:
    """Run the full-text enrichment for every configured Mongo collection.

    Connects to PostgreSQL (applying ``SCHEMA_SQL``) and MongoDB, processes
    each collection listed in ``MONGO_COLLECTIONS`` and prints a summary.

    Returns:
        0 on success; failures propagate as exceptions.
    """
    t0 = time.time()
    print("Pripojuji se k PostgreSQL...")
    pg = psycopg.connect(PG_DSN, connect_timeout=10)
    mongo = None
    try:
        with pg.cursor() as cur:
            cur.execute(SCHEMA_SQL)
        pg.commit()
        print("Schema OK.")

        print("Pripojuji se k MongoDB...")
        mongo = MongoClient(MONGO_URI, serverSelectionTimeoutMS=5000)
        mongo.admin.command("ping")
        db = mongo[MONGO_DB]
        print("Mongo OK.")

        results = []
        for name in MONGO_COLLECTIONS:
            results.append(process_collection(pg, db[name], name))
    finally:
        # Release both connections even when a collection fails half-way.
        if mongo is not None:
            mongo.close()
        pg.close()

    print("\n=== SHRNUTI ===")
    for r in results:
        print(f" {r['study']}: processed={r['processed']} ok={r['ok']} "
              f"errors={r['errors']} skipped={r['skipped']} too_big={r['too_big']}")
    print(f"\nCelkem trvalo: {time.time() - t0:.1f} s")
    return 0
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point: map the outcome of main() to a process exit status.
    try:
        raise SystemExit(main())
    except KeyboardInterrupt:
        print("\nPreruseno uzivatelem")
        # Report the interruption to the caller instead of exiting with 0,
        # which a scheduler would read as a completed run (130 = 128 + SIGINT).
        sys.exit(130)
    except Exception:
        traceback.print_exc()
        sys.exit(1)
|
||||
@@ -0,0 +1,46 @@
|
||||
# enrich_files_v1.0
|
||||
|
||||
**Verze:** 1.0
|
||||
**Datum:** 2026-06-03
|
||||
**Skript:** `enrich_files_v1.0.py`
|
||||
|
||||
## Účel
|
||||
Doplnit do existujících záznamů v MongoDB `soubory.*` pole `content.*` parsovaná z obsahu souborů.
|
||||
|
||||
Spouští se **až po** [scan_files_v1.0.py](scan_files_v1.0.md).
|
||||
|
||||
## Podporované přípony a pole
|
||||
|
||||
| ext | knihovna | pole v `content` |
|
||||
|---|---|---|
|
||||
| pdf | pypdf | pages, encrypted, author, title, subject, creator, producer, created, modified, text_head |
|
||||
| docx | python-docx | author, title, subject, last_modified_by, paragraphs, words, created, modified, text_head |
|
||||
| xlsx, xlsm | openpyxl | total_sheets, sheets[{name,rows,cols}], author, title, subject, last_modified_by, created, modified |
|
||||
| pptx | python-pptx | slides, author, title, subject, last_modified_by, created, modified, text_head (z prvních 3 snímků) |
|
||||
| eml | stdlib email | subject, from, to, cc, date, has_attachments, attachments[], body_head |
|
||||
| msg | extract_msg | totéž co eml |
|
||||
|
||||
Společná pole vždy: `ok` (bool), `parsed_at`, `parser_version`, `sha256_at_parse`. Při chybě `error` (název výjimky + zpráva).
|
||||
|
||||
## Inkrementální chování
|
||||
Zpracují se jen dokumenty kde:
|
||||
- `content` chybí, NEBO
|
||||
- `content.parser_version` != aktuální verze (1.0), NEBO
|
||||
- `content.sha256_at_parse` != aktuální `sha256` (soubor se změnil)
|
||||
|
||||
Při dalším spuštění **přidá** jen nové/změněné. Při zvýšení verze parseru přeparsuje vše.
|
||||
|
||||
## Limity (skip)
|
||||
- PDF nad 500 MB → ok=False, error="too_big_..."
|
||||
- XLSX nad 200 MB → ok=False
|
||||
- ostatní nad 300 MB → ok=False
|
||||
|
||||
`text_head` max 2000 znaků.
|
||||
|
||||
## Spuštění
|
||||
```
|
||||
python U:\PythonProject\Janssen\Soubory\enrich_files_v1.0.py
|
||||
```
|
||||
|
||||
## Plán
|
||||
Po doběhnutí ověřit `content.ok` rate, případně doladit (chybové vzory) a teprve pak stavět `MCP_SOUBORY` server.
|
||||
@@ -0,0 +1,388 @@
|
||||
"""
|
||||
==============================================================================
|
||||
Skript: enrich_files_v1.0.py
|
||||
Verze: 1.0
|
||||
Datum: 2026-06-03
|
||||
Autor: vladimir.buzalka
|
||||
Popis: Doplni metadata z obsahu souboru (PDF/DOCX/XLSX/PPTX/EML/MSG)
|
||||
do existujicich zaznamu v MongoDB (db: soubory).
|
||||
|
||||
Pole se uklada do podobjektu `content`:
|
||||
- common: ok (bool), error (str|None), parsed_at, parser_version, sha256_at_parse
|
||||
- pdf: pages, author, title, subject, creator, producer,
|
||||
created, modified, encrypted, text_head (prvni stranka, max 2000 znaku)
|
||||
- docx: author, title, subject, last_modified_by, paragraphs,
|
||||
words, created, modified, text_head
|
||||
- xlsx: sheets [{name, rows, cols}], total_sheets,
|
||||
author, title, subject, last_modified_by, created, modified
|
||||
- pptx: slides, author, title, subject, last_modified_by,
|
||||
created, modified, text_head (text z prvnich 3 snimku)
|
||||
- eml: subject, from, to, cc, date, has_attachments,
|
||||
attachments [filenames], body_head
|
||||
- msg: same as eml
|
||||
|
||||
Inkrementalni:
|
||||
- preskaci soubor, kde content.sha256_at_parse == aktualni sha256
|
||||
a content.parser_version == aktualni verze
|
||||
- pri zmene obsahu (jiny sha256) prepocita
|
||||
- pri chybe ulozi content.error a content.ok=False
|
||||
|
||||
MongoDB: 192.168.1.76:27017
|
||||
DB: soubory
|
||||
==============================================================================
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import email
|
||||
import email.policy
|
||||
import sys
|
||||
import time
|
||||
import traceback
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
|
||||
from pymongo import MongoClient, UpdateOne
|
||||
|
||||
# MongoDB connection target, database and the collections to enrich.
MONGO_URI = "mongodb://192.168.1.76:27017"
DB_NAME = "soubory"
COLLECTIONS = ["42847922MDD3003", "77242113UCO3001"]
# Written to content.parser_version; documents parsed with a different
# version are selected for re-parsing by enrich_collection().
PARSER_VERSION = "1.0"
# Default character cap applied by _truncate() to text_head / body_head.
TEXT_HEAD_LIMIT = 2000

# Size limits for large files - so the script does not hang on a 1 GB PDF.
MAX_PDF_BYTES = 500 * 1024 * 1024  # 500 MB
MAX_XLSX_BYTES = 200 * 1024 * 1024
MAX_GENERIC_BYTES = 300 * 1024 * 1024
|
||||
|
||||
|
||||
def _now() -> datetime:
|
||||
return datetime.now(tz=timezone.utc)
|
||||
|
||||
|
||||
def _truncate(s: str | None, n: int = TEXT_HEAD_LIMIT) -> str | None:
    """Strip surrounding whitespace and cap the text at *n* characters.

    ``None`` is passed through unchanged.
    """
    if s is None:
        return None
    # Slicing a string that is already short enough returns it unchanged.
    return s.strip()[:n]
|
||||
|
||||
|
||||
def _to_dt(value):
|
||||
if isinstance(value, datetime):
|
||||
return value if value.tzinfo else value.replace(tzinfo=timezone.utc)
|
||||
if isinstance(value, str) and value:
|
||||
try:
|
||||
return datetime.fromisoformat(value.replace("Z", "+00:00"))
|
||||
except ValueError:
|
||||
return None
|
||||
return None
|
||||
|
||||
|
||||
# --- PARSERY ----------------------------------------------------------------
|
||||
|
||||
def parse_pdf(path: Path) -> dict:
    """Collect page count, encryption flag, document info and a text head from a PDF.

    The text head is taken from the first page only and only for files that
    are not flagged as encrypted; a failure there leaves ``text_head`` empty
    instead of failing the whole parse.
    """
    from pypdf import PdfReader

    reader = PdfReader(str(path))
    meta = reader.metadata or {}

    def _meta(attr):
        # An empty dict stands in for missing metadata, so every lookup
        # falls back to None.
        return getattr(meta, attr, None)

    result = {
        "pages": len(reader.pages),
        "encrypted": reader.is_encrypted,
    }
    for key in ("author", "title", "subject", "creator", "producer"):
        result[key] = _meta(key)
    result["created"] = _to_dt(_meta("creation_date"))
    result["modified"] = _to_dt(_meta("modification_date"))

    first_page_text = None
    try:
        readable = not reader.is_encrypted and reader.pages
        if readable:
            first_page_text = reader.pages[0].extract_text()
    except Exception:
        # Best effort: the metadata is still worth storing.
        first_page_text = None

    result["text_head"] = _truncate(first_page_text)
    return result
|
||||
|
||||
|
||||
def parse_docx(path: Path) -> dict:
    """Collect core properties, paragraph/word counts and a text head from a DOCX."""
    from docx import Document

    document = Document(str(path))
    props = document.core_properties
    paras = document.paragraphs
    # Only non-empty paragraphs contribute to the text; the paragraph count
    # below still covers all of them.
    body = "\n".join(filter(None, (p.text for p in paras)))
    word_count = len(body.split())

    result = {}
    result["author"] = props.author
    result["title"] = props.title
    result["subject"] = props.subject
    result["last_modified_by"] = props.last_modified_by
    result["paragraphs"] = len(paras)
    result["words"] = word_count
    result["created"] = _to_dt(props.created)
    result["modified"] = _to_dt(props.modified)
    result["text_head"] = _truncate(body)
    return result
|
||||
|
||||
|
||||
def parse_xlsx(path: Path) -> dict:
    """Collect sheet dimensions and workbook properties from an XLSX/XLSM file.

    The workbook is opened read-only and always closed again, even when
    reading a sheet or a property raises.
    """
    from openpyxl import load_workbook

    wb = load_workbook(str(path), read_only=True, data_only=False)
    try:
        sheets = [
            {"name": ws.title, "rows": ws.max_row, "cols": ws.max_column}
            for ws in wb.worksheets
        ]
        props = wb.properties
        return {
            "total_sheets": len(sheets),
            "sheets": sheets,
            "author": props.creator,
            "title": props.title,
            "subject": props.subject,
            "last_modified_by": props.lastModifiedBy,
            "created": _to_dt(props.created),
            "modified": _to_dt(props.modified),
        }
    finally:
        # A read-only workbook keeps the file handle open until close().
        wb.close()
|
||||
|
||||
|
||||
def parse_pptx(path: Path) -> dict:
    """Collect slide count, core properties and a text head from a PPTX.

    The text head is built from the text runs of the first three slides.
    """
    from pptx import Presentation

    prs = Presentation(str(path))
    core = prs.core_properties
    run_texts = [
        run.text
        for slide in list(prs.slides)[:3]
        for shape in slide.shapes
        if shape.has_text_frame
        for para in shape.text_frame.paragraphs
        for run in para.runs
        if run.text
    ]
    result = {"slides": len(prs.slides)}
    result["author"] = core.author
    result["title"] = core.title
    result["subject"] = core.subject
    result["last_modified_by"] = core.last_modified_by
    result["created"] = _to_dt(core.created)
    result["modified"] = _to_dt(core.modified)
    result["text_head"] = _truncate(" ".join(run_texts))
    return result
|
||||
|
||||
|
||||
def parse_eml(path: Path) -> dict:
    """Collect headers, attachment names and a body head from an EML file.

    For multipart messages every part with a filename or an ``attachment``
    disposition counts as an attachment; remaining ``text/plain`` parts form
    the body. For single-part messages the message content itself is the
    body. Only textual content is used: ``get_content()`` returns ``bytes``
    for non-text content types, which must not reach ``str.join``.
    """
    with path.open("rb") as f:
        msg = email.message_from_binary_file(f, policy=email.policy.default)

    attachments = []
    body_parts = []

    def _add_body(part):
        # Best effort: a part that cannot be decoded is skipped.
        try:
            payload = part.get_content()
        except Exception:
            return
        if isinstance(payload, str):
            body_parts.append(payload)

    if msg.is_multipart():
        for part in msg.walk():
            disp = (part.get("Content-Disposition") or "").lower()
            fname = part.get_filename()
            if "attachment" in disp or fname:
                if fname:
                    attachments.append(fname)
            elif part.get_content_type() == "text/plain":
                _add_body(part)
    else:
        _add_body(msg)

    def _addrs(field):
        v = msg.get(field)
        return v if v else None

    return {
        "subject": msg.get("Subject"),
        "from": _addrs("From"),
        "to": _addrs("To"),
        "cc": _addrs("Cc"),
        "date": msg.get("Date"),
        "has_attachments": bool(attachments),
        "attachments": attachments,
        "body_head": _truncate("\n".join(body_parts)),
    }
|
||||
|
||||
|
||||
def parse_msg(path: Path) -> dict:
    """Collect headers, attachment names and a body head from an Outlook MSG file.

    Produces the same keys as ``parse_eml``.
    """
    import extract_msg

    with extract_msg.openMsg(str(path)) as msg:
        names = []
        for att in msg.attachments or []:
            try:
                fname = att.longFilename or att.shortFilename
            except Exception:
                # An unreadable attachment record is skipped.
                continue
            if fname:
                names.append(fname)

        sent = msg.date
        result = {
            "subject": msg.subject,
            "from": msg.sender,
            "to": msg.to,
            "cc": msg.cc,
        }
        result["date"] = str(sent) if sent else None
        result["has_attachments"] = bool(names)
        result["attachments"] = names
        result["body_head"] = _truncate(msg.body or "")
        return result
|
||||
|
||||
|
||||
# Dispatch table: file extension -> (parser function, size limit in bytes).
# enrich_collection() selects Mongo documents by these extensions, skips files
# whose recorded size exceeds the limit and otherwise calls the parser.
PARSERS = {
    "pdf": (parse_pdf, MAX_PDF_BYTES),
    "docx": (parse_docx, MAX_GENERIC_BYTES),
    "xlsx": (parse_xlsx, MAX_XLSX_BYTES),
    "xlsm": (parse_xlsx, MAX_XLSX_BYTES),
    "pptx": (parse_pptx, MAX_GENERIC_BYTES),
    "eml": (parse_eml, MAX_GENERIC_BYTES),
    "msg": (parse_msg, MAX_GENERIC_BYTES),
}
|
||||
|
||||
|
||||
# --- SUMMARY PRO KONZOLI ----------------------------------------------------
|
||||
|
||||
def _short(s, n=40):
|
||||
if not s:
|
||||
return ""
|
||||
s = str(s).replace("\n", " ").replace("\r", " ").strip()
|
||||
return s if len(s) <= n else s[:n] + "..."
|
||||
|
||||
|
||||
def _summary(content: dict, ext: str) -> str:
|
||||
if not content.get("ok"):
|
||||
return f"chyba: {_short(content.get('error'), 80)}"
|
||||
parts = []
|
||||
if ext == "pdf":
|
||||
parts.append(f"{content.get('pages')}p")
|
||||
if content.get("encrypted"): parts.append("enc")
|
||||
if content.get("author"): parts.append(f"by={_short(content['author'], 25)}")
|
||||
if content.get("title"): parts.append(f"t={_short(content['title'], 30)}")
|
||||
elif ext == "docx":
|
||||
parts.append(f"{content.get('paragraphs')}para")
|
||||
parts.append(f"{content.get('words')}w")
|
||||
if content.get("author"): parts.append(f"by={_short(content['author'], 25)}")
|
||||
elif ext in ("xlsx", "xlsm"):
|
||||
n = content.get("total_sheets", 0)
|
||||
sheets = content.get("sheets") or []
|
||||
names = ",".join(_short(s["name"], 12) for s in sheets[:3])
|
||||
if n > 3:
|
||||
names += f",+{n-3}"
|
||||
parts.append(f"{n}sh[{names}]")
|
||||
if content.get("author"): parts.append(f"by={_short(content['author'], 20)}")
|
||||
elif ext == "pptx":
|
||||
parts.append(f"{content.get('slides')}slides")
|
||||
if content.get("author"): parts.append(f"by={_short(content['author'], 25)}")
|
||||
if content.get("title"): parts.append(f"t={_short(content['title'], 25)}")
|
||||
elif ext in ("eml", "msg"):
|
||||
if content.get("from"): parts.append(f"from={_short(content['from'], 25)}")
|
||||
if content.get("subject"): parts.append(f"subj={_short(content['subject'], 40)}")
|
||||
if content.get("has_attachments"):
|
||||
parts.append(f"att={len(content.get('attachments') or [])}")
|
||||
return " ".join(parts) if parts else "ok"
|
||||
|
||||
|
||||
# --- HLAVNI SMYCKA ----------------------------------------------------------
|
||||
|
||||
def enrich_collection(coll, study: str) -> dict:
    """Parse every pending file of one study and store the result as ``content``.

    A document is pending when it has a supported extension, is not marked as
    deleted and either has no ``content``, was parsed by another parser
    version, or its ``sha256`` differs from ``content.sha256_at_parse``.
    Updates are sent to MongoDB in batches of 50.

    Args:
        coll: MongoDB collection of the study.
        study: Study name, used for console output and in the result.

    Returns:
        Counters: ``study``, ``processed``, ``ok``, ``errors``, ``too_big``.
    """
    supported = list(PARSERS.keys())
    query = {
        "ext": {"$in": supported},
        "deleted_at": {"$exists": False},
        "$or": [
            {"content": {"$exists": False}},
            {"content.parser_version": {"$ne": PARSER_VERSION}},
            {"$expr": {"$ne": ["$content.sha256_at_parse", "$sha256"]}},
        ],
    }
    total_pending = coll.count_documents(query)
    print(f"[{study}] k zpracovani: {total_pending} souboru")

    ops: list[UpdateOne] = []
    processed = 0
    ok = 0
    errors = 0
    too_big = 0

    cursor = coll.find(query, {"path": 1, "ext": 1, "size_bytes": 1, "sha256": 1}, no_cursor_timeout=True)
    try:
        for doc in cursor:
            ext = doc["ext"]
            parser, max_bytes = PARSERS[ext]
            path = Path(doc["path"])
            # `size_bytes` may be stored as null; treat that like a missing
            # value instead of failing the comparison below with a TypeError.
            size_bytes = doc.get("size_bytes") or 0
            content: dict = {
                "parser_version": PARSER_VERSION,
                "parsed_at": _now(),
                "sha256_at_parse": doc.get("sha256"),
            }
            if not path.exists():
                content.update(ok=False, error="file_missing")
                errors += 1
            elif size_bytes > max_bytes:
                content.update(ok=False, error=f"too_big_>{max_bytes}")
                too_big += 1
            else:
                try:
                    payload = parser(path)
                    content["ok"] = True
                    content.update(payload)
                    ok += 1
                except Exception as e:
                    # One unreadable file must not stop the run; the failure
                    # is recorded on the document instead.
                    content["ok"] = False
                    content["error"] = f"{type(e).__name__}: {e}"[:500]
                    errors += 1

            ops.append(UpdateOne({"_id": doc["_id"]}, {"$set": {"content": content}}))
            processed += 1

            status = "OK " if content.get("ok") else ("BIG" if "too_big" in (content.get("error") or "") else "ERR")
            size_mb = size_bytes / 1024 / 1024
            detail = _summary(content, ext)
            print(f" [{processed:>4}/{total_pending}] {status} {ext:<4} {size_mb:6.1f}MB {path.name} | {detail}", flush=True)

            if len(ops) >= 50:
                coll.bulk_write(ops, ordered=False)
                ops.clear()
    finally:
        # The cursor was opened with no_cursor_timeout, so close it explicitly.
        cursor.close()

    if ops:
        coll.bulk_write(ops, ordered=False)

    return {"study": study, "processed": processed, "ok": ok, "errors": errors, "too_big": too_big}
|
||||
|
||||
|
||||
def main() -> int:
    """Enrich every configured collection and print a summary.

    Returns:
        0 on success; failures propagate as exceptions.
    """
    t0 = time.time()
    client = MongoClient(MONGO_URI, serverSelectionTimeoutMS=5000)
    try:
        client.admin.command("ping")
        db = client[DB_NAME]

        results = []
        for name in COLLECTIONS:
            results.append(enrich_collection(db[name], name))
    finally:
        # Release the connection pool even when a collection fails.
        client.close()

    print("\n=== SHRNUTI ===")
    for r in results:
        print(f" {r['study']}: processed={r['processed']} ok={r['ok']} "
              f"errors={r['errors']} too_big={r['too_big']}")
    print(f"\nCelkem trvalo: {time.time() - t0:.1f} s")
    return 0
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point: map the outcome of main() to a process exit status.
    try:
        raise SystemExit(main())
    except KeyboardInterrupt:
        print("\nPreruseno uzivatelem")
        # Report the interruption to the caller instead of exiting with 0,
        # which a scheduler would read as a completed run (130 = 128 + SIGINT).
        sys.exit(130)
    except Exception:
        traceback.print_exc()
        sys.exit(1)
|
||||
@@ -0,0 +1,51 @@
|
||||
# enrich_fulltext_v1.2
|
||||
|
||||
**Verze:** 1.2
|
||||
**Datum:** 2026-06-03
|
||||
**Skript:** `enrich_fulltext_v1.2.py`
|
||||
|
||||
## Změna proti v1.1
|
||||
Velký XLSX (`#400 MDD3003_EAT detail report_30jun25.xlsx`, 5 242 128 znaků textu) způsobil pád:
|
||||
|
||||
```
|
||||
psycopg.errors.ProgramLimitExceeded:
|
||||
string is too long for tsvector (1114090 bytes, max 1048575 bytes)
|
||||
```
|
||||
|
||||
PostgreSQL `tsvector` má **tvrdý limit ~1 MB** binární velikosti — nelze obejít.
|
||||
|
||||
**Řešení:** `tsv` se generuje z prvních **800 000 znaků** sloupce `body`:
|
||||
|
||||
```sql
|
||||
tsv tsvector GENERATED ALWAYS AS (
|
||||
to_tsvector('soubory'::regconfig, left(coalesce(body, ''), 800000))
|
||||
) STORED
|
||||
```
|
||||
|
||||
- sloupec `body` zůstává **plný** (až 5 MB) — pro náhledy, snippet, `ts_headline`
|
||||
- vyhledávání (`tsv @@ q`) ignoruje obsah za 800 000. znakem
|
||||
- u rozsáhlých XLSX/PDF (např. datové exporty) je 800 000 znaků stále víc než 100 000 slov — pro fulltext bohatě stačí
|
||||
|
||||
## Migrace
|
||||
`SCHEMA_SQL` při startu zkontroluje, zda současný výraz sloupce `tsv` obsahuje `left`. Pokud ne (starý sloupec z v1.0/v1.1):
|
||||
1. dropne `documents_tsv_gin` index
|
||||
2. dropne sloupec `tsv`
|
||||
3. přidá nový s `left(body, 800000)`
|
||||
4. index se vytvoří znovu na konci `SCHEMA_SQL`
|
||||
|
||||
Bezpečné spustit opakovaně.
|
||||
|
||||
## extractor_version
|
||||
Posunuto na `1.2` → všechny řádky z v1.0/v1.1 se přeparsují (potřebné už proto, že migrace tsv změnila co je v indexu).
|
||||
|
||||
## Vše ostatní
|
||||
Beze změny proti [v1.1](Trash/enrich_fulltext_v1.1.md):
|
||||
- DOCX fallback přes raw `word/document.xml`
|
||||
- NUL byte strip
|
||||
- Limity souborů (PDF 500 MB, XLSX 200 MB, ostatní 300 MB), text max 5 MB
|
||||
- Inkrementálně podle `sha256` + `extractor_version`
|
||||
|
||||
## Spuštění
|
||||
```
|
||||
python U:\PythonProject\Janssen\Soubory\enrich_fulltext_v1.2.py
|
||||
```
|
||||
@@ -0,0 +1,481 @@
|
||||
"""
|
||||
==============================================================================
|
||||
Skript: enrich_fulltext_v1.2.py
|
||||
Verze: 1.2
|
||||
Datum: 2026-06-03
|
||||
Autor: vladimir.buzalka
|
||||
Popis: Vytahne PLNY TEXT z dokumentu odkazovanych v MongoDB (db: soubory)
|
||||
a ulozi ho do PostgreSQL (db: MongoSoubory) s GIN tsvector indexem.
|
||||
|
||||
Zmeny proti v1.1:
|
||||
- PG tsvector ma tvrdy limit ~1 MB binarne -> velky XLSX (5 MB textu) ho prekrocil.
|
||||
v1.2 generuje tsv z prvnich 800 000 znaku body: left(body, 800000).
|
||||
Sloupec body zustava plny (max 5 MB pro nahled / snippet).
|
||||
- SCHEMA_SQL provadi migraci sloupce tsv: pokud uz existuje stara verze
|
||||
(bez `left`), dropne index+sloupec a vytvori znovu s truncated vyrazem.
|
||||
- extractor_version = "1.2" -> preparsuji se vsechny radky z v1.0/v1.1.
|
||||
|
||||
Zachovano z v1.1:
|
||||
- NUL bajty (0x00) se strippuji z body i error
|
||||
- DOCX fallback na raw XML pres regex pri padu python-docx
|
||||
|
||||
Cilove ulozeni:
|
||||
- MongoDB 192.168.1.76 db=soubory kolekce=42847922MDD3003, 77242113UCO3001
|
||||
- PostgreSQL 192.168.1.76 db=MongoSoubory tabulka=documents
|
||||
|
||||
Podporovane pripony: pdf, docx, xlsx, xlsm, pptx, eml, msg, txt, csv
|
||||
==============================================================================
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import email
|
||||
import email.policy
|
||||
import re
|
||||
import sys
|
||||
import time
|
||||
import traceback
|
||||
import zipfile
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
|
||||
import psycopg
|
||||
from pymongo import MongoClient
|
||||
|
||||
# --- configuration ----------------------------------------------------------
import os

MONGO_URI = "mongodb://192.168.1.76:27017"
MONGO_DB = "soubory"
MONGO_COLLECTIONS = ["42847922MDD3003", "77242113UCO3001"]

# SECURITY: the built-in DSN carries a plaintext password that is committed to
# version control. Supply the connection string through the SOUBORY_PG_DSN
# environment variable; the literal is kept only so existing runs keep
# working and should be removed once the password has been rotated.
PG_DSN = os.environ.get(
    "SOUBORY_PG_DSN",
    "host=192.168.1.76 port=5432 dbname=MongoSoubory "
    "user=vladimir.buzalka password=Vlado7309208104++",
)

# Stored with every row; rows written by another version are re-extracted.
EXTRACTOR_VERSION = "1.2"

# Cap on the stored text and per-type caps on the source file size.
MAX_TEXT_BYTES = 5 * 1024 * 1024
MAX_PDF_BYTES = 500 * 1024 * 1024
MAX_XLSX_BYTES = 200 * 1024 * 1024
MAX_GENERIC_BYTES = 300 * 1024 * 1024

SUPPORTED = ("pdf", "docx", "xlsx", "xlsm", "pptx", "eml", "msg", "txt", "csv")
|
||||
|
||||
|
||||
# --- SCHEMA -----------------------------------------------------------------
|
||||
|
||||
SCHEMA_SQL = """
|
||||
CREATE EXTENSION IF NOT EXISTS unaccent;
|
||||
CREATE EXTENSION IF NOT EXISTS pg_trgm;
|
||||
|
||||
DO $$
|
||||
BEGIN
|
||||
IF NOT EXISTS (SELECT 1 FROM pg_ts_config WHERE cfgname = 'soubory') THEN
|
||||
CREATE TEXT SEARCH CONFIGURATION soubory ( COPY = simple );
|
||||
ALTER TEXT SEARCH CONFIGURATION soubory
|
||||
ALTER MAPPING FOR hword, hword_part, word
|
||||
WITH unaccent, simple;
|
||||
END IF;
|
||||
END$$;
|
||||
|
||||
CREATE TABLE IF NOT EXISTS documents (
|
||||
id BIGSERIAL PRIMARY KEY,
|
||||
mongo_id TEXT NOT NULL,
|
||||
study TEXT NOT NULL,
|
||||
path TEXT NOT NULL,
|
||||
rel_path TEXT,
|
||||
name TEXT,
|
||||
ext TEXT,
|
||||
sha256 TEXT NOT NULL,
|
||||
size_bytes BIGINT,
|
||||
mtime TIMESTAMPTZ,
|
||||
body TEXT,
|
||||
body_length INT,
|
||||
tsv tsvector GENERATED ALWAYS AS (
|
||||
to_tsvector('soubory'::regconfig, left(coalesce(body, ''), 800000))
|
||||
) STORED,
|
||||
extracted_at TIMESTAMPTZ DEFAULT now(),
|
||||
extractor_version TEXT,
|
||||
ok BOOLEAN,
|
||||
error TEXT,
|
||||
UNIQUE (study, path)
|
||||
);
|
||||
|
||||
-- migrace tsv sloupce ze stareho vyrazu (bez `left`) na novy (s `left(..,800000)`)
|
||||
DO $$
|
||||
DECLARE
|
||||
cur_expr TEXT;
|
||||
BEGIN
|
||||
SELECT pg_get_expr(d.adbin, d.adrelid)
|
||||
INTO cur_expr
|
||||
FROM pg_attribute a
|
||||
JOIN pg_class c ON c.oid = a.attrelid
|
||||
JOIN pg_attrdef d ON d.adrelid = a.attrelid AND d.adnum = a.attnum
|
||||
WHERE c.relname = 'documents' AND a.attname = 'tsv';
|
||||
|
||||
IF cur_expr IS NOT NULL AND position('left' in cur_expr) = 0 THEN
|
||||
EXECUTE 'DROP INDEX IF EXISTS documents_tsv_gin';
|
||||
EXECUTE 'ALTER TABLE documents DROP COLUMN tsv';
|
||||
EXECUTE 'ALTER TABLE documents ADD COLUMN tsv tsvector GENERATED ALWAYS AS '
|
||||
|| '(to_tsvector(''soubory''::regconfig, left(coalesce(body, ''''), 800000))) STORED';
|
||||
END IF;
|
||||
END$$;
|
||||
|
||||
CREATE INDEX IF NOT EXISTS documents_tsv_gin ON documents USING gin(tsv);
|
||||
CREATE INDEX IF NOT EXISTS documents_name_trgm ON documents USING gin(name gin_trgm_ops);
|
||||
CREATE INDEX IF NOT EXISTS documents_sha256_idx ON documents(sha256);
|
||||
CREATE INDEX IF NOT EXISTS documents_study_ext_idx ON documents(study, ext);
|
||||
"""
|
||||
|
||||
|
||||
# --- HELPERY ----------------------------------------------------------------
|
||||
|
||||
# odstrani 0x00 a ostatni controly krome whitespace
|
||||
_CTRL_RX = re.compile(r"[\x00-\x08\x0b\x0c\x0e-\x1f]")
|
||||
|
||||
|
||||
def _clean_for_pg(s: str) -> str:
|
||||
if not s:
|
||||
return ""
|
||||
return _CTRL_RX.sub("", s)
|
||||
|
||||
|
||||
def _truncate(s: str) -> str:
    """Sanitize extracted text and cap it at ``MAX_TEXT_BYTES`` of UTF-8.

    Control characters are removed first. The text is then round-tripped
    through UTF-8, so characters that cannot be encoded (e.g. lone surrogates
    coming out of an extractor) are replaced with ``?`` instead of surviving
    in the result and failing later when the row is sent to PostgreSQL.

    Returns:
        The cleaned text; ``""`` for empty or ``None`` input.
    """
    s = _clean_for_pg(s or "")
    if not s:
        return ""
    b = s.encode("utf-8", errors="replace")
    # Slicing past the end is a no-op, so one expression covers both the
    # short and the oversized case; "ignore" only drops a multi-byte
    # character that the cut split in half.
    return b[:MAX_TEXT_BYTES].decode("utf-8", errors="ignore")
|
||||
|
||||
|
||||
# --- EXTRAKTORY -------------------------------------------------------------
|
||||
|
||||
def extract_pdf(path: Path) -> str:
    """Extract the text of a PDF page by page.

    Encrypted files are tried with an empty password; if that attempt raises,
    an empty string is returned. Pages whose extraction fails are skipped and
    reading stops once more than ``MAX_TEXT_BYTES`` characters were collected.
    """
    from pypdf import PdfReader

    reader = PdfReader(str(path))
    if reader.is_encrypted:
        try:
            reader.decrypt("")
        except Exception:
            return ""

    def _page_texts():
        # Yield the text of every page that can be extracted.
        for page in reader.pages:
            try:
                yield page.extract_text() or ""
            except Exception:
                continue

    collected = []
    char_count = 0
    for page_text in _page_texts():
        collected.append(page_text)
        char_count += len(page_text)
        if char_count > MAX_TEXT_BYTES:
            break
    return _truncate("\n".join(collected))
|
||||
|
||||
|
||||
# regex pro DOCX fallback - vytahne <w:t>...</w:t>
|
||||
_DOCX_WT_RX = re.compile(r"<w:t[^>]*>([^<]*)</w:t>", re.DOTALL)
|
||||
_DOCX_WP_END_RX = re.compile(r"</w:p>")
|
||||
|
||||
|
||||
def _docx_raw_text(path: Path) -> str:
|
||||
"""Fallback - cte primo word/document.xml ze ZIPu."""
|
||||
with zipfile.ZipFile(str(path)) as z:
|
||||
try:
|
||||
xml = z.read("word/document.xml").decode("utf-8", errors="replace")
|
||||
except KeyError:
|
||||
return ""
|
||||
xml = _DOCX_WP_END_RX.sub("\n", xml)
|
||||
return "\n".join(m.group(1) for m in _DOCX_WT_RX.finditer(xml))
|
||||
|
||||
|
||||
def extract_docx(path: Path) -> str:
    """Extract paragraph and table text from a DOCX.

    Table rows are appended after the paragraphs with their cells joined by
    `` | ``. If python-docx fails on the file, the text is taken from the raw
    ``word/document.xml`` instead.
    """
    from docx import Document

    try:
        document = Document(str(path))
        lines = [para.text for para in document.paragraphs if para.text]
        lines.extend(
            " | ".join(cell.text for cell in row.cells)
            for table in document.tables
            for row in table.rows
        )
        return _truncate("\n".join(lines))
    except Exception:
        # Fallback: raw XML extraction.
        return _truncate(_docx_raw_text(path))
|
||||
|
||||
|
||||
def extract_xlsx(path: Path) -> str:
    """Extract the cell values of an XLSX/XLSM workbook as tab-separated lines.

    Every sheet starts with a ``# <title>`` line; rows without content are
    skipped. Reading stops once more than ``MAX_TEXT_BYTES`` characters were
    collected. The workbook is always closed, even when reading a sheet raises.
    """
    from openpyxl import load_workbook

    wb = load_workbook(str(path), read_only=True, data_only=True)
    parts = []
    total = 0
    try:
        for ws in wb.worksheets:
            parts.append(f"# {ws.title}")
            for row in ws.iter_rows(values_only=True):
                line = "\t".join("" if v is None else str(v) for v in row)
                if line.strip():
                    parts.append(line)
                    total += len(line)
                    if total > MAX_TEXT_BYTES:
                        break
            if total > MAX_TEXT_BYTES:
                break
    finally:
        # A read-only workbook keeps the file handle open until close().
        wb.close()
    return _truncate("\n".join(parts))
|
||||
|
||||
|
||||
def extract_pptx(path: Path) -> str:
    """Extract slide text and speaker notes from a PPTX.

    Every slide starts with a ``# slide <n>`` line followed by the non-empty
    paragraphs of its text shapes and, when present, a ``[notes] ...`` line.
    """
    from pptx import Presentation

    prs = Presentation(str(path))
    parts = []
    for i, slide in enumerate(prs.slides, 1):
        parts.append(f"# slide {i}")
        for shape in slide.shapes:
            if shape.has_text_frame:
                for para in shape.text_frame.paragraphs:
                    line = "".join(run.text for run in para.runs)
                    if line.strip():
                        parts.append(line)
        if slide.has_notes_slide:
            # A notes slide without a notes placeholder has no text frame;
            # that must not fail the extraction of the whole presentation.
            notes_frame = slide.notes_slide.notes_text_frame
            notes = notes_frame.text if notes_frame is not None else ""
            if notes:
                parts.append(f"[notes] {notes}")
    return _truncate("\n".join(parts))
|
||||
|
||||
|
||||
def extract_eml(path: Path) -> str:
    """Extract the main headers and the plain-text body of an EML file.

    The result starts with the From/To/Cc/Subject/Date headers that are
    present. For multipart messages the ``text/plain`` parts without a
    filename follow; for single-part messages the message content itself.
    Only textual content is used: ``get_content()`` returns ``bytes`` for
    non-text content types, which must not reach ``str.join``.
    """
    with path.open("rb") as f:
        msg = email.message_from_binary_file(f, policy=email.policy.default)

    head = []
    for k in ("From", "To", "Cc", "Subject", "Date"):
        v = msg.get(k)
        if v:
            head.append(f"{k}: {v}")
    parts = ["\n".join(head)]

    if msg.is_multipart():
        candidates = [
            part for part in msg.walk()
            if part.get_content_type() == "text/plain" and not part.get_filename()
        ]
    else:
        candidates = [msg]

    for part in candidates:
        # Best effort: a part that cannot be decoded is skipped.
        try:
            payload = part.get_content()
        except Exception:
            continue
        if isinstance(payload, str):
            parts.append(payload)
    return _truncate("\n\n".join(parts))
|
||||
|
||||
|
||||
def extract_msg(path: Path) -> str:
    """Extract the main headers and the body of an Outlook MSG file.

    Headers that are present are listed first (Subject, From, To, Cc, Date),
    followed by a blank line and the message body.
    """
    import extract_msg

    with extract_msg.openMsg(str(path)) as m:
        labelled = (
            ("Subject", m.subject),
            ("From", m.sender),
            ("To", m.to),
            ("Cc", m.cc),
            ("Date", m.date),
        )
        head = [f"{label}: {value}" for label, value in labelled if value]
        body = m.body or ""
        return _truncate("\n".join(head) + "\n\n" + body)
|
||||
|
||||
|
||||
def extract_text(path: Path) -> str:
    """Read a TXT/CSV file, decoding it as UTF-8, cp1250 or latin-1.

    At most ``MAX_TEXT_BYTES`` of the file are read, so a large file is not
    loaded into memory only to be cut down afterwards. When the file was cut,
    up to three trailing bytes are dropped before giving up on UTF-8: the cut
    may have split a multi-byte character, and that alone must not make the
    whole file fall through to cp1250 and be stored garbled.
    """
    with path.open("rb") as f:
        data = f.read(MAX_TEXT_BYTES + 1)
    truncated = len(data) > MAX_TEXT_BYTES
    data = data[:MAX_TEXT_BYTES]

    for trim in ((0, 1, 2, 3) if truncated else (0,)):
        try:
            return _truncate(data[: len(data) - trim].decode("utf-8-sig"))
        except UnicodeDecodeError:
            continue

    try:
        return _truncate(data.decode("cp1250"))
    except UnicodeDecodeError:
        # latin-1 maps every byte, so this last attempt cannot fail.
        return _truncate(data.decode("latin-1"))
|
||||
|
||||
|
||||
# Dispatch table: file extension -> (extractor function, size limit in bytes).
# process_collection() selects Mongo documents by these extensions, skips
# files whose recorded size exceeds the limit and otherwise calls the extractor.
EXTRACTORS = {
    "pdf": (extract_pdf, MAX_PDF_BYTES),
    "docx": (extract_docx, MAX_GENERIC_BYTES),
    "xlsx": (extract_xlsx, MAX_XLSX_BYTES),
    "xlsm": (extract_xlsx, MAX_XLSX_BYTES),
    "pptx": (extract_pptx, MAX_GENERIC_BYTES),
    "eml": (extract_eml, MAX_GENERIC_BYTES),
    "msg": (extract_msg, MAX_GENERIC_BYTES),
    "txt": (extract_text, MAX_GENERIC_BYTES),
    "csv": (extract_text, MAX_GENERIC_BYTES),
}
|
||||
|
||||
|
||||
def _short(s, n=40):
|
||||
if not s:
|
||||
return ""
|
||||
s = str(s).replace("\n", " ").replace("\r", " ").strip()
|
||||
return s if len(s) <= n else s[:n] + "..."
|
||||
|
||||
|
||||
def _now() -> datetime:
|
||||
return datetime.now(tz=timezone.utc)
|
||||
|
||||
|
||||
# --- HLAVNI SMYCKA ----------------------------------------------------------
|
||||
|
||||
def process_collection(pg: psycopg.Connection, mongo_coll, study: str) -> dict:
    """Extract the full text of one study's files and upsert it into PostgreSQL.

    A file is skipped when PostgreSQL already holds a row for its path with
    the same ``sha256``, the current ``EXTRACTOR_VERSION`` and ``ok`` set.
    Missing files, oversized files and extractor failures are stored as rows
    with ``ok = False`` and an ``error`` text. Rows are written in batches
    of 50.

    Args:
        pg: Open psycopg connection.
        mongo_coll: MongoDB collection of the study.
        study: Study name, stored in every row and used for console output.

    Returns:
        Counters: ``study``, ``processed``, ``ok``, ``errors``, ``skipped``,
        ``too_big``.
    """
    # Snapshot of what is already stored for this study, keyed by path.
    with pg.cursor() as cur:
        cur.execute(
            "SELECT path, sha256, extractor_version, ok FROM documents WHERE study = %s",
            (study,),
        )
        existing = {
            stored_path: (stored_sha, stored_version, stored_ok)
            for stored_path, stored_sha, stored_version, stored_ok in cur.fetchall()
        }

    candidates = {"ext": {"$in": list(EXTRACTORS.keys())}, "deleted_at": {"$exists": False}}
    projection = {"_id": 1, "path": 1, "rel_path": 1, "name": 1, "ext": 1,
                  "sha256": 1, "size_bytes": 1, "mtime": 1}
    cursor = mongo_coll.find(candidates, projection, no_cursor_timeout=True)

    processed = 0
    ok = 0
    errors = 0
    skipped = 0
    too_big = 0
    queue: list[dict] = []
    total_pending = mongo_coll.count_documents(candidates)
    print(f"[{study}] kandidatu v Mongo: {total_pending}")

    try:
        for n, doc in enumerate(cursor, 1):
            prev = existing.get(doc["path"])
            if prev:
                stored_sha, stored_version, stored_ok = prev
                unchanged = (
                    stored_sha == doc.get("sha256")
                    and stored_version == EXTRACTOR_VERSION
                    and stored_ok
                )
                if unchanged:
                    skipped += 1
                    continue

            ext = doc["ext"]
            extractor, max_bytes = EXTRACTORS[ext]
            path = Path(doc["path"])
            size_bytes = doc.get("size_bytes") or 0

            row = {
                "mongo_id": str(doc["_id"]),
                "study": study,
                "path": doc["path"],
                "rel_path": doc.get("rel_path"),
                "name": doc.get("name"),
                "ext": ext,
                "sha256": doc.get("sha256"),
                "size_bytes": doc.get("size_bytes"),
                "mtime": doc.get("mtime"),
                "body": None,
                "body_length": 0,
                "extracted_at": _now(),
                "extractor_version": EXTRACTOR_VERSION,
                "ok": False,
                "error": None,
            }

            status = "OK "
            detail = ""
            size_mb = size_bytes / 1024 / 1024

            if not path.exists():
                row["error"] = "file_missing"
                status = "ERR"
                detail = "file_missing"
                errors += 1
            elif size_bytes > max_bytes:
                row["error"] = f"too_big_>{max_bytes}"
                status = "BIG"
                detail = f"too_big_>{max_bytes//1024//1024}MB"
                too_big += 1
            else:
                try:
                    body = extractor(path) or ""
                    row["body"] = body if body else None
                    row["body_length"] = len(body)
                    row["ok"] = True
                    ok += 1
                    detail = f"{len(body)} znaku {_short(body, 60)!r}"
                except Exception as e:
                    # One unreadable file must not stop the run; the failure
                    # is stored on the row instead.
                    row["error"] = f"{type(e).__name__}: {e}"[:500]
                    status = "ERR"
                    detail = row["error"][:80]
                    errors += 1

            queue.append(row)
            processed += 1
            print(f" [{n:>4}/{total_pending}] {status} {ext:<4} {size_mb:6.1f}MB "
                  f"{path.name} | {detail}", flush=True)

            if len(queue) >= 50:
                _flush(pg, queue)
                queue.clear()
    finally:
        # The cursor was opened with no_cursor_timeout, so close it explicitly.
        cursor.close()

    if queue:
        _flush(pg, queue)

    return {"study": study, "processed": processed, "ok": ok,
            "errors": errors, "skipped": skipped, "too_big": too_big}
|
||||
|
||||
|
||||
UPSERT_SQL = """
|
||||
INSERT INTO documents
|
||||
(mongo_id, study, path, rel_path, name, ext, sha256, size_bytes, mtime,
|
||||
body, body_length, extracted_at, extractor_version, ok, error)
|
||||
VALUES
|
||||
(%(mongo_id)s, %(study)s, %(path)s, %(rel_path)s, %(name)s, %(ext)s, %(sha256)s,
|
||||
%(size_bytes)s, %(mtime)s, %(body)s, %(body_length)s, %(extracted_at)s,
|
||||
%(extractor_version)s, %(ok)s, %(error)s)
|
||||
ON CONFLICT (study, path) DO UPDATE SET
|
||||
mongo_id = EXCLUDED.mongo_id,
|
||||
rel_path = EXCLUDED.rel_path,
|
||||
name = EXCLUDED.name,
|
||||
ext = EXCLUDED.ext,
|
||||
sha256 = EXCLUDED.sha256,
|
||||
size_bytes = EXCLUDED.size_bytes,
|
||||
mtime = EXCLUDED.mtime,
|
||||
body = EXCLUDED.body,
|
||||
body_length = EXCLUDED.body_length,
|
||||
extracted_at = EXCLUDED.extracted_at,
|
||||
extractor_version = EXCLUDED.extractor_version,
|
||||
ok = EXCLUDED.ok,
|
||||
error = EXCLUDED.error
|
||||
"""
|
||||
|
||||
|
||||
def _flush(pg: psycopg.Connection, rows: list[dict]) -> None:
    """Write one batch of document rows to PostgreSQL and commit it.

    Every row is sent through ``UPSERT_SQL`` in a single ``executemany`` call.
    Non-empty ``body`` and ``error`` values are passed through
    ``_clean_for_pg`` first, so the dicts in *rows* are modified in place.
    """
    # Last safeguard: sanitize the text fields once more (the original note
    # says this strips NUL characters in case one slipped through earlier).
    for record in rows:
        for field in ("body", "error"):
            text = record.get(field)
            if text:
                record[field] = _clean_for_pg(text)

    with pg.cursor() as cursor:
        cursor.executemany(UPSERT_SQL, rows)
    pg.commit()
|
||||
|
||||
|
||||
def main() -> int:
    """Run the full-text enrichment for every configured Mongo collection.

    Connects to PostgreSQL, applies ``SCHEMA_SQL``, connects to MongoDB, calls
    ``process_collection`` once per name in ``MONGO_COLLECTIONS`` and prints a
    per-study summary.

    Returns:
        0 on completion. Exceptions propagate to the caller.
    """
    t0 = time.time()
    print("Pripojuji se k PostgreSQL...")
    pg = psycopg.connect(PG_DSN, connect_timeout=10)
    try:
        with pg.cursor() as cur:
            cur.execute(SCHEMA_SQL)
        pg.commit()
        print("Schema OK.")

        print("Pripojuji se k MongoDB...")
        mongo = MongoClient(MONGO_URI, serverSelectionTimeoutMS=5000)
        try:
            # Fail fast if the server is unreachable instead of on first query.
            mongo.admin.command("ping")
            db = mongo[MONGO_DB]
            print("Mongo OK.")

            results = [
                process_collection(pg, db[name], name)
                for name in MONGO_COLLECTIONS
            ]
        finally:
            # Previously the client was never closed.
            mongo.close()
    finally:
        # Previously skipped whenever anything above raised.
        pg.close()

    print("\n=== SHRNUTI ===")
    for r in results:
        print(f" {r['study']}: processed={r['processed']} ok={r['ok']} "
              f"errors={r['errors']} skipped={r['skipped']} too_big={r['too_big']}")
    print(f"\nCelkem trvalo: {time.time() - t0:.1f} s")
    return 0
|
||||
|
||||
|
||||
# Script entry point: the process exit status reports how the run ended.
if __name__ == "__main__":
    try:
        raise SystemExit(main())
    except KeyboardInterrupt:
        print("\nPreruseno uzivatelem")
        # An interrupted run is incomplete; do not report it as success (0).
        # 130 is the conventional status for termination by Ctrl+C.
        sys.exit(130)
    except Exception:
        # Top-level boundary: show the traceback and signal failure.
        traceback.print_exc()
        sys.exit(1)
|
||||
@@ -0,0 +1,63 @@
|
||||
# scan_files_v1.0
|
||||
|
||||
**Verze:** 1.0
|
||||
**Datum:** 2026-06-03
|
||||
**Skript:** `scan_files_v1.0.py`
|
||||
|
||||
## Účel
|
||||
Rekurzivní sken dvou Dropbox složek studií (`!!42847922MDD3003`, `!77242113UCO3001`) a zápis metadat všech souborů do MongoDB.
|
||||
|
||||
## Konfigurace
|
||||
- **MongoDB:** `mongodb://192.168.1.76:27017` (bez autentizace)
|
||||
- **DB:** `soubory`
|
||||
- **Kolekce:** `42847922MDD3003`, `77242113UCO3001` (jedna kolekce na studii)
|
||||
- **Cesta k Dropboxu:** zjištěna pomocí `Knihovny/najdi_dropbox.py` (přenositelné mezi PC)
|
||||
|
||||
## Struktura dokumentu v MongoDB
|
||||
| pole | popis |
|
||||
|---|---|
|
||||
| `path` | absolutní cesta (unikátní klíč) |
|
||||
| `study` | kód studie (= název kolekce) |
|
||||
| `rel_path` | relativní cesta od kořene studie |
|
||||
| `dir`, `rel_dir` | nadřazený adresář (absolutní/relativní) |
|
||||
| `parent_folders` | pole názvů složek (pro filtrování) |
|
||||
| `name`, `stem`, `ext` | jméno, jméno bez přípony, přípona (lower-case) |
|
||||
| `size_bytes` | velikost |
|
||||
| `mtime`, `ctime`, `atime` | časové údaje (UTC) |
|
||||
| `sha256` | hash obsahu |
|
||||
| `mime` | mimetype dle přípony |
|
||||
| `tokens` | jméno rozparsované na slova/čísla (lower-case) |
|
||||
| `dates_in_name` | datumy nalezené v názvu, formát `YYYY-MM-DD` |
|
||||
| `first_seen_at` | první sken, kdy byl soubor viděn |
|
||||
| `last_seen_at` | poslední sken, kdy byl viděn |
|
||||
| `deleted_at` | nastaveno, pokud soubor v posledním skenu už nebyl nalezen |
|
||||
|
||||
## Datumy v názvu
|
||||
Skript hledá tři varianty:
|
||||
- `12JAN2026`, `12Jan2026` (den + 3-písm. zkratka měsíce + rok)
|
||||
- `2026-01-12`, `2026_01_12`, `2026.01.12`
|
||||
- `12-01-2026`, `12_01_2026`, `12.01.2026` (čteno v pořadí den–měsíc–rok)
|
||||
|
||||
Všechny se normalizují do ISO `YYYY-MM-DD` v poli `dates_in_name`.
|
||||
|
||||
## Inkrementální chování
|
||||
- `size_bytes` + `mtime` souhlasí se záznamem v DB (a záznam už má `sha256`) → SHA256 se nepřepočítává, jen se aktualizuje `last_seen_at` a případné `deleted_at` se odstraní
|
||||
- nový soubor → vloží se s `first_seen_at`
|
||||
- chybějící v aktuálním běhu → `deleted_at` se nastaví na čas běhu
|
||||
|
||||
## Co se ignoruje
|
||||
- `.dropbox*`, `Thumbs.db`, `desktop.ini`, `~$*.*` (Office locky), `.DS_Store`
|
||||
- adresář `.dropbox.cache`
|
||||
|
||||
## Spuštění
|
||||
```
|
||||
python U:\PythonProject\Janssen\Soubory\scan_files_v1.0.py
|
||||
```
|
||||
|
||||
## Indexovaná pole pro rychlé dotazy
|
||||
`path` (unique), `ext`, `dates_in_name`, `tokens`, `sha256`
|
||||
|
||||
## Plán pokračování
|
||||
1. Spustit první sken → zjistit profil dat (přípony, hloubku stromů)
|
||||
2. Doplnit dle potřeby (např. počet stran PDF, autor DOCX, listy XLSX)
|
||||
3. Postavit `MCP_SOUBORY` server nad touto kolekcí
|
||||
@@ -0,0 +1,272 @@
|
||||
"""
|
||||
==============================================================================
|
||||
Skript: scan_files_v1.0.py
|
||||
Verze: 1.0
|
||||
Datum: 2026-06-03
|
||||
Autor: vladimir.buzalka
|
||||
Popis: Rekurzivni sken Dropbox slozek dvou studii a zapis metadat
|
||||
vsech souboru do MongoDB (db: soubory, kolekce = nazev studie).
|
||||
|
||||
- cesty k Dropboxu se zjisti pres Knihovny.najdi_dropbox
|
||||
- pro kazdy soubor: stat, sha256, mime (podle pripony),
|
||||
parsing data v nazvu (12JAN2026, 2026-01-12, 12-01-2026 ...)
|
||||
- inkrementalni: pokud size+mtime souhlasi se zaznamem v DB,
|
||||
sha256 se nepocita znovu (jen se aktualizuje last_seen_at)
|
||||
- smazane soubory dostanou deleted_at pri behu, ve kterem
|
||||
uz nebyly videny
|
||||
- vynechavaji se: .dropbox*, Thumbs.db, desktop.ini,
|
||||
~$*.* (Office lock), .DS_Store; adresar .dropbox.cache
|
||||
|
||||
MongoDB: 192.168.1.76:27017, bez autentizace
|
||||
DB: soubory
|
||||
Kolekce: 42847922MDD3003, 77242113UCO3001 (klice slovniku STUDIES)
|
||||
==============================================================================
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import hashlib
|
||||
import mimetypes
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
import time
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
|
||||
from pymongo import MongoClient, UpdateOne, ASCENDING
|
||||
|
||||
# --- prida Knihovny do path -------------------------------------------------
|
||||
HERE = Path(__file__).resolve().parent
|
||||
sys.path.insert(0, str(HERE.parent))
|
||||
from Knihovny.najdi_dropbox import get_dropbox_root # noqa: E402
|
||||
|
||||
# --- configuration ----------------------------------------------------------
# MongoDB server holding the file metadata.
MONGO_URI = "mongodb://192.168.1.76:27017"
# Database in which each study gets its own collection.
DB_NAME = "soubory"

# Study code (used as the Mongo collection name) -> name of the study folder
# directly under the Dropbox root.
STUDIES = {
    "42847922MDD3003": "!!42847922MDD3003",
    "77242113UCO3001": "!77242113UCO3001",
}

# File names matching any of these patterns are not recorded; should_skip()
# applies them with .match(), i.e. anchored at the start of the name.
SKIP_NAME_PATTERNS = [
    re.compile(r"^\.dropbox.*", re.IGNORECASE),
    re.compile(r"^Thumbs\.db$", re.IGNORECASE),
    re.compile(r"^desktop\.ini$", re.IGNORECASE),
    re.compile(r"^~\$.*", re.IGNORECASE),
    re.compile(r"^\.DS_Store$", re.IGNORECASE),
]

# Directory names pruned from the os.walk traversal (exact, case-sensitive match).
SKIP_DIR_NAMES = {".dropbox.cache"}

# Read size used by sha256_of() when hashing file contents.
HASH_CHUNK = 1024 * 1024  # 1 MiB
|
||||
|
||||
# --- parsing of dates embedded in file names --------------------------------
# Three-letter English month abbreviation (upper case) -> month number.
MONTHS = {
    "JAN": 1, "FEB": 2, "MAR": 3, "APR": 4, "MAY": 5, "JUN": 6,
    "JUL": 7, "AUG": 8, "SEP": 9, "OCT": 10, "NOV": 11, "DEC": 12,
}

# (compiled pattern, layout tag); the tag tells extract_dates() how to read
# the three captured groups.
DATE_PATTERNS = [
    # 12JAN2026 / 12Jan2026
    (re.compile(r"(\d{1,2})([A-Za-z]{3})(\d{4})"), "dmonth"),
    # 2026-01-12 / 2026_01_12 / 2026.01.12
    (re.compile(r"(20\d{2})[-_.](\d{1,2})[-_.](\d{1,2})"), "ymd"),
    # 12-01-2026 / 12_01_2026 / 12.01.2026
    (re.compile(r"(\d{1,2})[-_.](\d{1,2})[-_.](20\d{2})"), "dmy"),
]


def extract_dates(name: str) -> list[str]:
    """Return the unique ISO dates (YYYY-MM-DD) found in *name*, sorted.

    Every pattern in ``DATE_PATTERNS`` is tried over the whole string.
    Matches with an unknown month abbreviation or an impossible calendar
    date (e.g. 31 February) are ignored.
    """
    iso_dates: set[str] = set()
    for pattern, layout in DATE_PATTERNS:
        for match in pattern.finditer(name):
            first, second, third = match.groups()
            if layout == "dmonth":
                month = MONTHS.get(second.upper())
                if not month:
                    # Three letters that are not a month abbreviation.
                    continue
                day, year = int(first), int(third)
            elif layout == "ymd":
                year, month, day = int(first), int(second), int(third)
            else:  # dmy
                day, month, year = int(first), int(second), int(third)

            try:
                # Constructed only to validate the calendar date.
                datetime(year, month, day)
            except ValueError:
                continue
            iso_dates.add(f"{year:04d}-{month:02d}-{day:02d}")
    return sorted(iso_dates)
|
||||
|
||||
|
||||
# A token is a maximal run of letters/digits in any script. ``[^\W_]`` is the
# Unicode word class minus the underscore, so accented names such as
# "Příloha" stay one token instead of being cut at every non-ASCII letter.
# For pure-ASCII names this is equivalent to ``[A-Za-z0-9]+``.
TOKEN_RX = re.compile(r"[^\W_]+")


def tokenize(name: str) -> list[str]:
    """Split *name* into lower-cased alphanumeric tokens.

    Any character that is not a letter or digit (including ``_``) acts as a
    separator. Returns an empty list when *name* contains no such run.
    """
    return [token.lower() for token in TOKEN_RX.findall(name)]
|
||||
|
||||
|
||||
def should_skip(name: str) -> bool:
    """Return True when the file name *name* matches one of SKIP_NAME_PATTERNS."""
    for pattern in SKIP_NAME_PATTERNS:
        if pattern.match(name) is not None:
            return True
    return False
|
||||
|
||||
|
||||
def sha256_of(path: Path) -> str:
    """Return the hexadecimal SHA-256 digest of the file at *path*.

    The file is read in pieces of ``HASH_CHUNK`` bytes so large files are not
    loaded into memory at once. ``OSError`` from opening or reading the file
    propagates to the caller.
    """
    digest = hashlib.sha256()
    with path.open("rb") as stream:
        # iter() with a sentinel stops at the first empty read (end of file).
        for block in iter(lambda: stream.read(HASH_CHUNK), b""):
            digest.update(block)
    return digest.hexdigest()
|
||||
|
||||
|
||||
def to_dt(ts: float) -> datetime:
    """Convert a POSIX timestamp (seconds) to a timezone-aware UTC datetime."""
    utc = timezone.utc
    return datetime.fromtimestamp(ts, utc)
|
||||
|
||||
|
||||
def _mongo_dt(value):
    """Normalize a datetime to the form it has after a MongoDB round trip.

    BSON dates keep millisecond precision, and PyMongo by default decodes them
    as naive datetimes representing UTC. A naive and an aware datetime never
    compare equal, so both sides of a comparison must go through this helper.
    Non-datetime values (e.g. None for a missing field) are returned unchanged.
    """
    if not isinstance(value, datetime):
        return value
    if value.tzinfo is None:
        value = value.replace(tzinfo=timezone.utc)
    else:
        value = value.astimezone(timezone.utc)
    # Truncate (not round) to milliseconds, matching the BSON encoding.
    return value.replace(microsecond=value.microsecond // 1000 * 1000)


def _file_doc(study_code: str, study_root: Path, fpath: Path, fname: str,
              st, mtime: datetime, digest: str, scan_started_at: datetime) -> dict:
    """Build the metadata document stored for one (new or changed) file."""
    rel = fpath.relative_to(study_root)
    rel_dir = str(rel.parent)
    return {
        "path": str(fpath),
        "study": study_code,
        "rel_path": str(rel),
        "dir": str(fpath.parent),
        # Files directly in the study root get an empty relative directory.
        "rel_dir": rel_dir if rel_dir != "." else "",
        "parent_folders": list(rel.parts[:-1]),
        "name": fname,
        "stem": fpath.stem,
        "ext": fpath.suffix.lower().lstrip("."),
        "size_bytes": st.st_size,
        "mtime": mtime,
        "ctime": to_dt(st.st_ctime),
        "atime": to_dt(st.st_atime),
        "sha256": digest,
        "mime": mimetypes.guess_type(fname)[0],
        "tokens": tokenize(fpath.stem),
        "dates_in_name": extract_dates(fname),
        "last_seen_at": scan_started_at,
    }


def scan_study(study_code: str, study_root: Path, db, scan_started_at: datetime) -> dict:
    """Scan one study folder recursively and sync file metadata into MongoDB.

    Args:
        study_code: Study identifier; also the name of the target collection.
        study_root: Root directory of the study on disk.
        db: Database object; ``db[study_code]`` must yield the collection.
        scan_started_at: Timestamp of this run, written to ``last_seen_at``
            and used to mark files that were not seen as deleted.

    Returns:
        Dict with keys ``study``, ``seen``, ``rehashed``, ``unchanged``,
        ``skipped``, ``marked_deleted`` and ``errors`` (a list of
        ``(path, message)`` tuples).

    Files whose size and mtime match the stored record (and that already have
    a hash) only get ``last_seen_at`` refreshed; everything else is re-hashed
    and upserted. Records not touched in this run receive ``deleted_at``.
    """
    coll = db[study_code]
    coll.create_index([("path", ASCENDING)], unique=True)
    for field in ("ext", "dates_in_name", "tokens", "sha256"):
        coll.create_index([(field, ASCENDING)])

    # Existing records: path -> (size, normalized mtime datetime, sha256).
    # The stored mtime is normalized so it can be compared with a fresh stat
    # value; without this the comparison never matches (naive vs aware,
    # millisecond vs microsecond) and every file is re-hashed on every run.
    existing = {
        d["path"]: (d.get("size_bytes"), _mongo_dt(d.get("mtime")), d.get("sha256"))
        for d in coll.find({}, {"path": 1, "size_bytes": 1, "mtime": 1, "sha256": 1})
    }

    ops: list[UpdateOne] = []
    seen = 0
    rehashed = 0
    skipped = 0
    errors: list[tuple[str, str]] = []

    def _on_walk_error(exc: OSError) -> None:
        # os.walk ignores unreadable directories by default; record them so
        # they show up in the summary instead of vanishing silently.
        # NOTE(review): files below such a directory are still marked deleted
        # by the final update_many - confirm whether that is acceptable.
        errors.append((str(exc.filename), f"walk: {exc}"))

    print(f"[{study_code}] sken: {study_root}")
    for root, dirs, files in os.walk(study_root, onerror=_on_walk_error):
        # Prune skipped directories in place so os.walk does not descend.
        dirs[:] = [d for d in dirs if d not in SKIP_DIR_NAMES]
        for fname in files:
            if should_skip(fname):
                skipped += 1
                continue
            fpath = Path(root) / fname
            try:
                st = fpath.stat()
            except OSError as e:
                errors.append((str(fpath), f"stat: {e}"))
                continue

            path_str = str(fpath)
            size = st.st_size
            mtime = to_dt(st.st_mtime)

            prev = existing.get(path_str)
            if prev and prev[0] == size and prev[1] == _mongo_dt(mtime) and prev[2]:
                # Unchanged: refresh last_seen_at and clear a stale deleted_at.
                ops.append(UpdateOne(
                    {"path": path_str},
                    {"$set": {"last_seen_at": scan_started_at},
                     "$unset": {"deleted_at": ""}},
                ))
            else:
                try:
                    digest = sha256_of(fpath)
                except OSError as e:
                    errors.append((path_str, f"hash: {e}"))
                    continue
                rehashed += 1

                doc = _file_doc(study_code, study_root, fpath, fname,
                                st, mtime, digest, scan_started_at)
                ops.append(UpdateOne(
                    {"path": path_str},
                    {"$set": doc, "$unset": {"deleted_at": ""},
                     "$setOnInsert": {"first_seen_at": scan_started_at}},
                    upsert=True,
                ))

            seen += 1
            # Write in batches to bound memory and round trips.
            if len(ops) >= 500:
                coll.bulk_write(ops, ordered=False)
                ops.clear()
                print(f" ... {seen} souboru zpracovano")

    if ops:
        coll.bulk_write(ops, ordered=False)

    # Mark as deleted every record that was not seen in this run.
    res = coll.update_many(
        {"last_seen_at": {"$lt": scan_started_at}, "deleted_at": {"$exists": False}},
        {"$set": {"deleted_at": scan_started_at}},
    )

    return {
        "study": study_code,
        "seen": seen,
        "rehashed": rehashed,
        "unchanged": seen - rehashed,
        "skipped": skipped,
        "marked_deleted": res.modified_count,
        "errors": errors,
    }
|
||||
|
||||
|
||||
def main() -> int:
    """Scan every configured study folder and print a summary.

    Resolves the Dropbox root, connects to MongoDB, runs ``scan_study`` for
    each entry of ``STUDIES`` whose folder exists, and prints per-study
    counters plus at most five error messages per study.

    Returns:
        0 on completion. Exceptions propagate to the caller.
    """
    t0 = time.time()
    dropbox_root = Path(get_dropbox_root())
    print(f"Dropbox root: {dropbox_root}")

    client = MongoClient(MONGO_URI, serverSelectionTimeoutMS=5000)
    try:
        # Fail fast if the server is unreachable.
        client.admin.command("ping")
        db = client[DB_NAME]

        # One timestamp for the whole run, shared by all studies.
        scan_started_at = datetime.now(tz=timezone.utc)

        results = []
        for study_code, folder in STUDIES.items():
            study_root = dropbox_root / folder
            if not study_root.is_dir():
                print(f"[!] {study_root} neexistuje, preskakuji")
                continue
            results.append(scan_study(study_code, study_root, db, scan_started_at))
    finally:
        # Previously the client was never closed.
        client.close()

    print("\n=== SHRNUTI ===")
    for r in results:
        print(f" {r['study']}: seen={r['seen']} rehashed={r['rehashed']} "
              f"unchanged={r['unchanged']} skipped={r['skipped']} "
              f"deleted={r['marked_deleted']} errors={len(r['errors'])}")
        for path, err in r["errors"][:5]:
            print(f" ! {err} ({path})")
        if len(r["errors"]) > 5:
            print(f" ... +{len(r['errors']) - 5} dalsich chyb")
    print(f"\nCelkem trvalo: {time.time() - t0:.1f} s")
    return 0
|
||||
|
||||
|
||||
# Script entry point: the return value of main() becomes the exit status.
if __name__ == "__main__":
    sys.exit(main())
|
||||
Reference in New Issue
Block a user