nis2-agile/scripts/ingest-nis2-sources.php
DevEnv nis2-agile 5c545ea3d0 [FEAT] Integrazione analisi docs/nis2 v1.7.0 — scoring asset, tassonomia incidenti, PIR, NIST CSF, fonti certe
Fase 1 - Asset Relevance Scoring NIS2 (GV.OC-04): metodologia 0-100 a 6 criteri,
  AssetScoringService + endpoint scoringGrid/score/relevantSystems + UI assets.html + registro stampabile.
Fase 2 - Tassonomia incidenti Determina ACN 164179/2025: IS-1..4 + regime essenziale/importante (Allegati 3/4).
Fase 3 - Post-Incident Review (5-Whys) + metriche TTD/TTC/TTR + timestamp di fase.
Fase 4 - Mapping NIST CSF 2.0 (43 controlli) reference-only.
Fonti certe: registry config/nis2_sources.php + grounding AI (vieta riferimenti inventati) +
  citazioni help.js + ingest PDF normativi nella KB RAG (scripts/ingest-nis2-sources.php).
Migrazioni 020/021/022 (additive idempotenti). Fix VectorService IP Qdrant (drift .5->.3).
Analisi concorrenza Evix (docs/EVIX_ANALISI_CONCORRENZA.html, gap-driven).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-05-29 17:15:13 +02:00

214 lines
9.0 KiB
PHP

<?php
/**
* NIS2 Agile - Ingest Fonti Normative Certe nella Knowledge Base (RAG)
* ----------------------------------------------------------------------------
* Indicizza i PDF normativi ufficiali (docs/nis2/*.pdf, registrati in
* application/config/nis2_sources.php) nella collection Qdrant `nis2_kb` con
* scope SYSTEM, cosi' che AIService::askWithRag() possa citare le fonti certe.
*
* ESEGUIRE SU HETZNER (richiede accesso a Qdrant + Voyage), es:
* docker exec -i nis2-app php /var/www/nis2-agile/scripts/ingest-nis2-sources.php
* # oppure dalla root del progetto:
* php scripts/ingest-nis2-sources.php
*
* Estrazione testo: usa `pdftotext` (poppler-utils) se disponibile, altrimenti
* ricade sull'API document di Claude. Idempotente: cancella i chunk SYSTEM del
* documento (per `source` stabile) prima del re-upsert.
*
* Opzioni:
* --only=determina_164179_2025   ingerisce una sola fonte (key del registry)
* --dry-run                      estrae e mostra le statistiche senza upsert
* ============================================================================
*/
// Refuse to run through a web SAPI: this is a command-line maintenance script.
if (PHP_SAPI !== 'cli') {
    fwrite(STDERR, "Solo CLI\n");
    exit(1);
}

// Project paths, defined only when an including script has not set them already.
if (!defined('BASE_PATH')) {
    define('BASE_PATH', dirname(__DIR__));
}
if (!defined('APP_PATH')) {
    define('APP_PATH', BASE_PATH . '/application');
}

require_once APP_PATH . '/config/env.php';
require_once APP_PATH . '/config/config.php';
require_once APP_PATH . '/config/database.php';
require_once APP_PATH . '/services/EmbedService.php';
require_once APP_PATH . '/services/VectorService.php';

// CLI options:
//   --only=<key>  restrict the run to a single registry entry
//   --dry-run     extract and report statistics without writing anything
$opts = getopt('', ['only::', 'dry-run']);
if ($opts === false) {
    $opts = [];
}
$only = $opts['only'] ?? null;
$dryRun = isset($opts['dry-run']);

// Source registry: an array keyed by source identifier.
$sources = require APP_PATH . '/config/nis2_sources.php';
if (!is_array($sources)) {
    fwrite(STDERR, "Registry fonti non valido: " . APP_PATH . "/config/nis2_sources.php\n");
    exit(1);
}

// "only" takes an optional value, so a bare `--only` (or `--only key` with a
// space) is reported by getopt() as false. Treating that as "no filter" would
// silently ingest every source, so it is rejected instead.
if ($only === false || $only === '') {
    fwrite(STDERR, "Opzione --only: specificare la key nella forma --only=<key>\n");
    exit(1);
}

// A key that is not in the registry would otherwise make the run a silent no-op.
// The comparison is strict to match the `$key !== $only` filter used by the loop.
if ($only !== null && !in_array($only, array_keys($sources), true)) {
    fwrite(STDERR, "Opzione --only: key sconosciuta '{$only}'. Key valide: " . implode(', ', array_keys($sources)) . "\n");
    exit(1);
}
/**
 * Echoes one log line prefixed with the current local timestamp.
 *
 * @param string $m Message text, printed after the "[Y-m-d H:i:s] " prefix.
 */
function logln(string $m): void
{
    $stamp = date('Y-m-d H:i:s');
    echo sprintf("[%s] %s\n", $stamp, $m);
}
/**
 * Extracts the plain text of a PDF, trying the cheapest strategy first.
 *
 * Order of attempts:
 *   0. a pre-extracted text file stored next to the PDF (`<file>.pdf.txt`);
 *   1. the `pdftotext` binary, when one is found on PATH;
 *   2. the Anthropic Messages API, sending the PDF as a base64 document.
 *
 * The cache and pdftotext results are accepted only when they contain more
 * than 200 bytes after trimming; otherwise the next strategy is tried.
 *
 * @param string $absPath Absolute path of the PDF file.
 * @return string Extracted text, or '' when every strategy failed.
 */
function extractPdfText(string $absPath): string
{
    // 0) Pre-extracted text cache next to the PDF. Useful when the ingest runs
    //    in a container without pdftotext: extract on the host, read it here.
    $cache = $absPath . '.txt';
    if (is_file($cache)) {
        $t = (string) file_get_contents($cache);
        if (strlen(trim($t)) > 200) {
            logln(' uso cache testo: ' . basename($cache));
            return $t;
        }
    }

    // 1) pdftotext (fast, free)
    $bin = trim((string) @shell_exec('command -v pdftotext 2>/dev/null'));
    if ($bin !== '') {
        // tempnam() creates the file it names. Keep that name so the
        // placeholder is removed together with the ".txt" output file.
        $tmpBase = tempnam(sys_get_temp_dir(), 'nis2pdf');
        if ($tmpBase !== false) {
            $tmp = $tmpBase . '.txt';
            @shell_exec(escapeshellcmd($bin) . ' -enc UTF-8 -nopgbrk ' . escapeshellarg($absPath) . ' ' . escapeshellarg($tmp) . ' 2>/dev/null');
            $txt = is_file($tmp) ? (string) file_get_contents($tmp) : '';
            @unlink($tmp);
            @unlink($tmpBase);
            if (strlen(trim($txt)) > 200) {
                return $txt;
            }
        }
    }

    // 2) Fallback: Claude document API
    logln(' pdftotext non disponibile/insufficiente -> uso Claude document API');

    // Without a key the request cannot succeed; report it instead of raising
    // an "undefined constant" Error in the middle of the run.
    if (!defined('ANTHROPIC_API_KEY') || (string) ANTHROPIC_API_KEY === '') {
        logln(' ERRORE: ANTHROPIC_API_KEY non configurata');
        return '';
    }

    $raw = @file_get_contents($absPath);
    if ($raw === false || $raw === '') {
        logln(' ERRORE: impossibile leggere il PDF ' . $absPath);
        return '';
    }

    $body = [
        'model' => defined('ANTHROPIC_MODEL') ? ANTHROPIC_MODEL : 'claude-sonnet-4-5-20250929',
        'max_tokens' => 8000,
        'messages' => [[
            'role' => 'user',
            'content' => [
                ['type' => 'document', 'source' => ['type' => 'base64', 'media_type' => 'application/pdf', 'data' => base64_encode($raw)]],
                ['type' => 'text', 'text' => 'Estrai integralmente il testo di questo documento normativo in testo semplice, mantenendo numeri di articolo, commi, allegati e tabelle. Non riassumere, non commentare.'],
            ],
        ]],
    ];

    $ch = curl_init('https://api.anthropic.com/v1/messages');
    curl_setopt_array($ch, [
        CURLOPT_RETURNTRANSFER => true,
        CURLOPT_POST => true,
        CURLOPT_HTTPHEADER => [
            'content-type: application/json',
            'x-api-key: ' . ANTHROPIC_API_KEY,
            'anthropic-version: 2023-06-01',
        ],
        CURLOPT_POSTFIELDS => json_encode($body),
        CURLOPT_TIMEOUT => 180,
    ]);
    $res = curl_exec($ch);
    if ($res === false) {
        logln(' ERRORE curl: ' . curl_error($ch));
        curl_close($ch);
        return '';
    }
    $status = (int) curl_getinfo($ch, CURLINFO_RESPONSE_CODE);
    curl_close($ch);

    $j = json_decode($res, true);
    if ($status < 200 || $status >= 300 || !is_array($j)) {
        $detail = is_array($j) ? (string) ($j['error']['message'] ?? '') : '';
        logln(" ERRORE Claude API HTTP {$status}" . ($detail !== '' ? ": {$detail}" : ''));
        return '';
    }

    // A reply cut off at max_tokens is an incomplete copy of the document;
    // warn loudly so a truncated source is not mistaken for the full text.
    if (($j['stop_reason'] ?? null) === 'max_tokens') {
        logln(' ATTENZIONE: risposta Claude troncata (max_tokens): testo estratto incompleto');
    }

    $text = $j['content'][0]['text'] ?? '';
    return is_string($text) ? $text : '';
}
/**
 * Splits text into fixed-size, overlapping chunks measured in UTF-8 characters.
 *
 * Each chunk starts `$size - $overlap` characters after the previous one, so
 * consecutive chunks share `$overlap` characters. Chunks that contain only
 * whitespace are dropped. The last chunk may be shorter than `$size`.
 *
 * @param string $text    Text to split; invalid UTF-8 sequences are sanitised first.
 * @param int    $size    Maximum characters per chunk; must be greater than 0.
 * @param int    $overlap Characters shared by consecutive chunks; 0 <= $overlap < $size.
 * @return list<string> The chunks in document order ([] for empty/blank text).
 * @throws InvalidArgumentException When $size/$overlap would stop the cursor advancing.
 */
function chunkText(string $text, int $size = 2000, int $overlap = 200): array
{
    // With $overlap >= $size (or a non-positive $size) the start offset never
    // moves forward and the loop would run until memory is exhausted.
    if ($size <= 0) {
        throw new InvalidArgumentException('chunkText: $size must be greater than 0');
    }
    if ($overlap < 0 || $overlap >= $size) {
        throw new InvalidArgumentException('chunkText: $overlap must satisfy 0 <= $overlap < $size');
    }

    // Multibyte-safe: mb_* functions avoid splitting a UTF-8 character in half
    // (per the original author, a split character makes json_encode produce an
    // invalid body and the embedding API answer HTTP 400).
    $text = mb_convert_encoding($text, 'UTF-8', 'UTF-8'); // sanitise invalid sequences

    $chunks = [];
    $len = mb_strlen($text, 'UTF-8');
    $step = $size - $overlap;

    for ($start = 0; $start < $len; $start += $step) {
        $piece = mb_substr($text, $start, $size, 'UTF-8');
        if (trim($piece) !== '') {
            $chunks[] = $piece;
        }
        // This chunk already reached the end of the text: stop here, so no
        // extra chunk made only of the overlap tail is produced.
        if ($start + $size >= $len) {
            break;
        }
    }

    return $chunks;
}
/**
 * Generates a random RFC 4122 version-4 UUID.
 *
 * @return string Lowercase hexadecimal UUID in 8-4-4-4-12 form.
 */
function uuid(): string
{
    $raw = random_bytes(16);

    // Version nibble = 4 (high nibble of byte 6).
    $raw[6] = chr(0x40 | (ord($raw[6]) & 0x0f));
    // Variant bits = 10xxxxxx (byte 8).
    $raw[8] = chr(0x80 | (ord($raw[8]) & 0x3f));

    $hex = bin2hex($raw);

    return sprintf(
        '%s-%s-%s-%s-%s',
        substr($hex, 0, 8),
        substr($hex, 8, 4),
        substr($hex, 12, 4),
        substr($hex, 16, 4),
        substr($hex, 20, 12)
    );
}
// ---------------------------------------------------------------------------
// Main: for every registered source, extract the PDF text, chunk it, embed
// the chunks and replace the source's SYSTEM points in the vector store.
// ---------------------------------------------------------------------------
logln('=== Ingest fonti normative NIS2 nella KB (scope SYSTEM) ===');
if ($dryRun) {
    logln('MODALITA DRY-RUN: nessun upsert.');
}

// The embedding and vector services are created only when something will be written.
$embed = null;
$vector = null;
if (!$dryRun) {
    $embed = new EmbedService();
    $vector = new VectorService();
    $vector->ensureCollection($embed->dims);
}

$totalChunks = 0;
$done = 0;

foreach ($sources as $key => $src) {
    if ($only && $key !== $only) {
        continue;
    }
    if (empty($src['file'])) {
        logln("SKIP {$key}: nessun file PDF associato");
        continue;
    }
    $abs = BASE_PATH . '/' . $src['file'];
    if (!is_file($abs)) {
        logln("SKIP {$key}: file non trovato {$abs}");
        continue;
    }

    logln("Fonte: {$src['short']} ({$src['file']})");

    // Normalise whitespace: collapse runs of spaces/tabs and of 3+ newlines.
    // preg_replace() returns null on a PCRE error, hence the string casts.
    $text = extractPdfText($abs);
    $text = (string) preg_replace('/[ \t]+/', ' ', $text);
    $text = (string) preg_replace('/\n{3,}/', "\n\n", trim($text));
    if (strlen($text) < 200) {
        logln(" ERRORE: testo estratto troppo breve, salto.");
        continue;
    }

    // Citation header prepended to the document text before chunking; it
    // helps the model cite the source correctly.
    $header = "FONTE NORMATIVA: {$src['citation']}\nAUTORITA: {$src['authority']}\n\n";
    $chunks = chunkText($header . $text, 2000, 200);
    logln(' testo: ' . strlen($text) . ' char -> ' . count($chunks) . ' chunk');
    $totalChunks += count($chunks);

    if ($dryRun) {
        $done++;
        continue;
    }

    // Embed every chunk BEFORE deleting anything: if embedding gives up and
    // the run aborts, the points already stored for this source stay intact.
    $docUuid = uuid();
    $points = [];
    foreach ($chunks as $i => $chunk) {
        // Retry with backoff: the embedding API can fail transiently
        // (HTTP 0 timeout / 429 rate limit) on large volumes of chunks.
        $vec = null;
        for ($try = 1; $try <= 5; $try++) {
            try {
                $vec = $embed->embed($chunk);
                break;
            } catch (Throwable $e) {
                if ($try === 5) {
                    logln(" ERRORE embed chunk {$i} dopo 5 tentativi: " . $e->getMessage());
                    throw $e;
                }
                logln(" retry embed chunk {$i} (tentativo {$try}): " . substr($e->getMessage(), 0, 60));
                sleep($try); // linear backoff: 1s, 2s, 3s, 4s
            }
        }

        $points[] = [
            'id' => uuid(),
            'vector' => $vec,
            'payload' => [
                'doc_uuid' => $docUuid,
                'title' => $src['short'] . ($i > 0 ? ' (parte ' . ($i + 1) . ')' : ''),
                'chunk' => $chunk,
                'entity_type' => 'normativa',
                'source' => $src['citation'],
                'lang' => 'it',
                'scope' => 'SYSTEM',
                'consulting_firm_id' => null,
                'organization_id' => null,
                'shared_with_orgs' => [],
                'uploaded_by' => 0,
            ],
        ];
    }

    // Idempotency: remove the SYSTEM points already stored for this source.
    $purged = false;
    try {
        $vector->deleteByFilter(['must' => [
            ['key' => 'scope', 'match' => ['value' => 'SYSTEM']],
            ['key' => 'source', 'match' => ['value' => $src['citation']]],
        ]]);
        $purged = true;
    } catch (Exception $e) {
        logln(' (warning) delete precedente: ' . $e->getMessage());
    }

    // Upsert in batches (to stay below payload size limits).
    foreach (array_chunk($points, 64) as $batch) {
        $vector->upsertBatch($batch);
    }

    // MySQL tracking (best-effort).
    try {
        $db = Database::getInstance();

        // Rows written by earlier runs reference doc_uuids whose points were
        // just purged; remove them so the table keeps one row per source.
        // Skipped when the purge failed, because those points may still exist.
        if ($purged) {
            $db->prepare(
                "DELETE FROM kb_uploaded_documents WHERE scope = 'SYSTEM' AND source = ?"
            )->execute([$src['citation']]);
        }

        $stmt = $db->prepare(
            "INSERT INTO kb_uploaded_documents
            (qdrant_doc_uuid, scope, consulting_firm_id, organization_id, uploaded_by, title, entity_type, source, lang, chunk_count, shared_with_orgs, status)
            VALUES (?, 'SYSTEM', NULL, NULL, 0, ?, 'normativa', ?, 'it', ?, '[]', 'ready')"
        );
        $stmt->execute([$docUuid, $src['short'], $src['citation'], count($chunks)]);
    } catch (Exception $e) {
        logln(' (warning) tracking insert: ' . $e->getMessage());
    }

    logln(" OK indicizzato (doc_uuid={$docUuid})");
    $done++;
}

logln("=== Completato: {$done} fonti, {$totalChunks} chunk totali ===");