Fase 1 - Asset Relevance Scoring NIS2 (GV.OC-04): metodologia 0-100 a 6 criteri, AssetScoringService + endpoint scoringGrid/score/relevantSystems + UI assets.html + registro stampabile. Fase 2 - Tassonomia incidenti Determina ACN 164179/2025: IS-1..4 + regime essenziale/importante (Allegati 3/4). Fase 3 - Post-Incident Review (5-Whys) + metriche TTD/TTC/TTR + timestamp di fase. Fase 4 - Mapping NIST CSF 2.0 (43 controlli) reference-only. Fonti certe: registry config/nis2_sources.php + grounding AI (vieta riferimenti inventati) + citazioni help.js + ingest PDF normativi nella KB RAG (scripts/ingest-nis2-sources.php). Migrazioni 020/021/022 (additive idempotenti). Fix VectorService IP Qdrant (drift .5->.3). Analisi concorrenza Evix (docs/EVIX_ANALISI_CONCORRENZA.html, gap-driven). Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
214 lines
9.0 KiB
PHP
214 lines
9.0 KiB
PHP
<?php
|
|
/**
 * NIS2 Agile - Ingest Fonti Normative Certe nella Knowledge Base (RAG)
 * ----------------------------------------------------------------------------
 * Indicizza i PDF normativi ufficiali (docs/nis2/*.pdf, registrati in
 * application/config/nis2_sources.php) nella collection Qdrant `nis2_kb` con
 * scope SYSTEM, cosi' che AIService::askWithRag() possa citare le fonti certe.
 *
 * ESEGUIRE SU HETZNER (richiede accesso a Qdrant + Voyage), es:
 *   docker exec -i nis2-app php /var/www/nis2-agile/scripts/ingest-nis2-sources.php
 *   # oppure dalla root del progetto:
 *   php scripts/ingest-nis2-sources.php
 *
 * Estrazione testo: usa `pdftotext` (poppler-utils) se disponibile, altrimenti
 * ricade sull'API document di Claude. Idempotente: cancella i chunk SYSTEM del
 * documento (per `source` stabile) prima del re-upsert.
 *
 * Opzioni:
 *   --only=determina_164179_2025   ingerisce una sola fonte (key del registry)
 *   --dry-run                      estrae e mostra le statistiche senza upsert
 * ============================================================================
 */
|
|
|
|
// This script is a maintenance tool: refuse to run under any web SAPI.
if (PHP_SAPI !== 'cli') {
    fwrite(STDERR, "Solo CLI\n");
    exit(1);
}

// Project paths, defined only when a caller has not already provided them.
defined('BASE_PATH') || define('BASE_PATH', dirname(__DIR__));
defined('APP_PATH') || define('APP_PATH', BASE_PATH . '/application');

// Configuration first, then the two services used for embedding and storage.
require_once APP_PATH . '/config/env.php';
require_once APP_PATH . '/config/config.php';
require_once APP_PATH . '/config/database.php';
require_once APP_PATH . '/services/EmbedService.php';
require_once APP_PATH . '/services/VectorService.php';

// Command-line options: --only[=<registry key>] and the --dry-run flag.
$opts = getopt('', ['only::', 'dry-run']);
$only = $opts['only'] ?? null;
$dryRun = isset($opts['dry-run']);

// Registry of regulatory sources; the file returns the value iterated below.
$sources = require APP_PATH . '/config/nis2_sources.php';
|
|
|
|
/**
 * Prints one log line to standard output, prefixed with the current local
 * timestamp in "[Y-m-d H:i:s]" form and terminated by a newline.
 *
 * @param string $m Message to print.
 */
function logln(string $m): void
{
    $stamp = date('Y-m-d H:i:s');
    echo '[', $stamp, '] ', $m, "\n";
}
|
|
|
|
/**
 * Extracts the plain text of a PDF.
 *
 * Strategies are tried in order; the first one producing more than 200 bytes
 * of trimmed text wins:
 *   0. a pre-extracted cache stored next to the PDF as "<file>.pdf.txt"
 *      (useful when the ingest runs in a container without pdftotext);
 *   1. the `pdftotext` binary, when it is found on the PATH;
 *   2. the Anthropic Messages API, with the PDF attached as a base64 document.
 *
 * @param string $absPath Path of the PDF file to read.
 * @return string The extracted text, or '' when the API fallback cannot be
 *                used or fails (the reason is logged).
 */
function extractPdfText(string $absPath): string
{
    // 0) Pre-extracted text cache next to the PDF.
    $cache = $absPath . '.txt';
    if (is_file($cache)) {
        $cached = (string) file_get_contents($cache);
        if (strlen(trim($cached)) > 200) {
            logln(' uso cache testo: ' . basename($cache));
            return $cached;
        }
    }

    // 1) pdftotext (fast, free).
    $bin = trim((string) @shell_exec('command -v pdftotext 2>/dev/null'));
    if ($bin !== '') {
        // tempnam() already creates the file: write straight into it instead
        // of deriving a second "<name>.txt" path, which left the first file
        // behind on every call. A failed tempnam() skips this strategy rather
        // than writing to a bogus path.
        $tmp = tempnam(sys_get_temp_dir(), 'nis2pdf');
        if ($tmp !== false) {
            @shell_exec(
                escapeshellcmd($bin) . ' -enc UTF-8 -nopgbrk '
                . escapeshellarg($absPath) . ' ' . escapeshellarg($tmp) . ' 2>/dev/null'
            );
            $txt = is_file($tmp) ? (string) file_get_contents($tmp) : '';
            @unlink($tmp);
            if (strlen(trim($txt)) > 200) {
                return $txt;
            }
        }
    }

    // 2) Fallback: Claude document API.
    logln(' pdftotext non disponibile/insufficiente -> uso Claude document API');

    // Without a key the request cannot succeed; report it instead of dying on
    // an undefined-constant Error.
    if (!defined('ANTHROPIC_API_KEY') || (string) ANTHROPIC_API_KEY === '') {
        logln(' ERRORE: ANTHROPIC_API_KEY non configurata');
        return '';
    }

    $pdf = file_get_contents($absPath);
    if ($pdf === false || $pdf === '') {
        logln(' ERRORE: impossibile leggere il PDF ' . basename($absPath));
        return '';
    }

    $body = [
        'model' => defined('ANTHROPIC_MODEL') ? ANTHROPIC_MODEL : 'claude-sonnet-4-5-20250929',
        'max_tokens' => 8000,
        'messages' => [[
            'role' => 'user',
            'content' => [
                ['type' => 'document', 'source' => ['type' => 'base64', 'media_type' => 'application/pdf', 'data' => base64_encode($pdf)]],
                ['type' => 'text', 'text' => 'Estrai integralmente il testo di questo documento normativo in testo semplice, mantenendo numeri di articolo, commi, allegati e tabelle. Non riassumere, non commentare.'],
            ],
        ]],
    ];

    $ch = curl_init('https://api.anthropic.com/v1/messages');
    curl_setopt_array($ch, [
        CURLOPT_RETURNTRANSFER => true,
        CURLOPT_POST => true,
        CURLOPT_HTTPHEADER => [
            'content-type: application/json',
            'x-api-key: ' . ANTHROPIC_API_KEY,
            'anthropic-version: 2023-06-01',
        ],
        CURLOPT_POSTFIELDS => json_encode($body),
        CURLOPT_TIMEOUT => 180,
    ]);
    $res = curl_exec($ch);
    if ($res === false) {
        logln(' ERRORE curl: ' . curl_error($ch));
        curl_close($ch);
        return '';
    }
    $status = (int) curl_getinfo($ch, CURLINFO_RESPONSE_CODE);
    curl_close($ch);

    $j = json_decode((string) $res, true);
    if (!is_array($j) || $status >= 400 || isset($j['error'])) {
        // Surface API failures; previously they were indistinguishable from
        // an empty document.
        $reason = is_array($j) ? ($j['error']['message'] ?? 'risposta inattesa') : 'risposta non JSON';
        logln(' ERRORE API Claude (HTTP ' . $status . '): ' . $reason);
        return '';
    }

    // The output is capped by max_tokens: a long regulation may come back cut
    // short, so make that visible to whoever runs the ingest.
    if (($j['stop_reason'] ?? '') === 'max_tokens') {
        logln(' (warning) estrazione troncata a max_tokens: testo incompleto');
    }

    // Concatenate every text block of the reply, not just the first one.
    $text = '';
    $blocks = is_array($j['content'] ?? null) ? $j['content'] : [];
    foreach ($blocks as $block) {
        if (is_array($block) && ($block['type'] ?? '') === 'text') {
            $text .= (string) ($block['text'] ?? '');
        }
    }
    return $text;
}
|
|
|
|
/**
 * Splits a text into consecutive chunks of at most $size characters, each one
 * starting $size - $overlap characters after the previous one.
 *
 * Lengths are counted in UTF-8 characters (mb_*), never bytes, so a multibyte
 * character is never cut in half; a split character would make json_encode
 * produce an invalid body (Voyage HTTP 400). Chunks made only of whitespace
 * are dropped.
 *
 * @param string $text    Text to split; invalid UTF-8 sequences are sanitised.
 * @param int    $size    Maximum chunk length in characters (>= 1).
 * @param int    $overlap Characters shared by adjacent chunks (0 <= overlap < size).
 * @return list<string> The chunks, in document order.
 * @throws InvalidArgumentException When $size/$overlap would stop the cursor
 *                                  from advancing.
 */
function chunkText(string $text, int $size = 2000, int $overlap = 200): array
{
    // With overlap >= size (or size < 1) the cursor never moves forward and
    // the loop would run until memory is exhausted: reject such arguments.
    if ($size < 1) {
        throw new InvalidArgumentException('chunkText: $size must be >= 1');
    }
    if ($overlap < 0 || $overlap >= $size) {
        throw new InvalidArgumentException('chunkText: $overlap must satisfy 0 <= $overlap < $size');
    }

    // Re-encoding UTF-8 to UTF-8 sanitises invalid byte sequences.
    $text = mb_convert_encoding($text, 'UTF-8', 'UTF-8');

    $chunks = [];
    $len = mb_strlen($text, 'UTF-8');
    $step = $size - $overlap;

    for ($start = 0; $start < $len; $start += $step) {
        $take = min($size, $len - $start);
        $piece = mb_substr($text, $start, $take, 'UTF-8');
        if (trim($piece) !== '') {
            $chunks[] = $piece;
        }
        // The chunk that reaches the end of the text is the last one; stopping
        // here avoids emitting a trailing chunk made only of overlap.
        if ($start + $take >= $len) {
            break;
        }
    }

    return $chunks;
}
|
|
|
|
/**
 * Generates a random version-4 UUID in canonical lowercase
 * 8-4-4-4-12 hexadecimal form.
 *
 * @return string The UUID, e.g. "3f2b8c1e-9d4a-4e7b-a1c2-5f6e7d8c9b0a".
 */
function uuid(): string
{
    $bytes = random_bytes(16);

    // Force the version nibble to 4 and the variant bits to binary 10.
    $bytes[6] = chr((ord($bytes[6]) & 0x0f) | 0x40);
    $bytes[8] = chr((ord($bytes[8]) & 0x3f) | 0x80);

    $hex = bin2hex($bytes);

    return implode('-', [
        substr($hex, 0, 8),
        substr($hex, 8, 4),
        substr($hex, 12, 4),
        substr($hex, 16, 4),
        substr($hex, 20, 12),
    ]);
}
|
|
|
|
// ---------------------------------------------------------------------------
// Main ingest run: for every registered source with a PDF on disk, extract
// the text, chunk it, embed each chunk and store the points with scope SYSTEM.
// ---------------------------------------------------------------------------
logln('=== Ingest fonti normative NIS2 nella KB (scope SYSTEM) ===');
if ($dryRun) {
    logln('MODALITA DRY-RUN: nessun upsert.');
}

// A mistyped --only key would otherwise end in a silent "0 fonti" run.
if ($only && !array_key_exists($only, $sources)) {
    logln("ATTENZIONE: --only={$only} non corrisponde a nessuna fonte del registry");
}

// The embedding and vector services are only needed when actually writing.
$embed = null;
$vector = null;
if (!$dryRun) {
    $embed = new EmbedService();
    $vector = new VectorService();
    $vector->ensureCollection($embed->dims);
}

$totalChunks = 0;
$done = 0;
foreach ($sources as $key => $src) {
    if ($only && $key !== $only) {
        continue;
    }
    if (empty($src['file'])) {
        logln("SKIP {$key}: nessun file PDF associato");
        continue;
    }

    $abs = BASE_PATH . '/' . $src['file'];
    if (!is_file($abs)) {
        logln("SKIP {$key}: file non trovato {$abs}");
        continue;
    }

    logln("Fonte: {$src['short']} ({$src['file']})");

    // Normalise whitespace. preg_replace() returns null on a PCRE error, so
    // cast back to string: the source is then skipped as "too short" instead
    // of passing null to strlen()/chunkText().
    $text = extractPdfText($abs);
    $text = (string) preg_replace('/[ \t]+/', ' ', $text);
    $text = (string) preg_replace('/\n{3,}/', "\n\n", trim($text));
    if (strlen($text) < 200) {
        logln(" ERRORE: testo estratto troppo breve, salto.");
        continue;
    }

    // Citation header prepended to the document: helps the model cite correctly.
    $header = "FONTE NORMATIVA: {$src['citation']}\nAUTORITA: {$src['authority']}\n\n";
    $chunks = chunkText($header . $text, 2000, 200);
    // Report characters (not bytes), the same unit chunkText() works in.
    logln(' testo: ' . mb_strlen($text, 'UTF-8') . ' char -> ' . count($chunks) . ' chunk');
    $totalChunks += count($chunks);

    if ($dryRun) {
        $done++;
        continue;
    }

    // Embed everything BEFORE touching the stored data: if embedding fails
    // for good and aborts the run, the previously indexed chunks of this
    // source are still in place instead of having already been deleted.
    $docUuid = uuid();
    $points = [];
    foreach ($chunks as $i => $chunk) {
        // Retry with backoff: the embedding call can fail transiently
        // (timeouts, rate limiting) on large volumes of chunks.
        $vec = null;
        for ($try = 1; $try <= 5; $try++) {
            try {
                $vec = $embed->embed($chunk);
                break;
            } catch (Throwable $e) {
                if ($try === 5) {
                    logln(" ERRORE embed chunk {$i} dopo 5 tentativi: " . $e->getMessage());
                    throw $e;
                }
                logln(" retry embed chunk {$i} (tentativo {$try}): " . substr($e->getMessage(), 0, 60));
                sleep($try); // linear backoff: 1s, 2s, 3s, 4s
            }
        }

        $points[] = [
            'id' => uuid(),
            'vector' => $vec,
            'payload' => [
                'doc_uuid' => $docUuid,
                'title' => $src['short'] . ($i > 0 ? ' (parte ' . ($i + 1) . ')' : ''),
                'chunk' => $chunk,
                'entity_type' => 'normativa',
                'source' => $src['citation'],
                'lang' => 'it',
                'scope' => 'SYSTEM',
                'consulting_firm_id' => null,
                'organization_id' => null,
                'shared_with_orgs' => [],
                'uploaded_by' => 0,
            ],
        ];
    }

    // Idempotency: drop the SYSTEM chunks already stored for this source
    // (matched on the stable `source` citation) right before the re-upsert.
    try {
        $vector->deleteByFilter(['must' => [
            ['key' => 'scope', 'match' => ['value' => 'SYSTEM']],
            ['key' => 'source', 'match' => ['value' => $src['citation']]],
        ]]);
    } catch (Exception $e) {
        logln(' (warning) delete precedente: ' . $e->getMessage());
    }

    // Upsert in batches of 64 to stay within request payload limits.
    foreach (array_chunk($points, 64) as $batch) {
        $vector->upsertBatch($batch);
    }

    // MySQL tracking row (best-effort: a failure only produces a warning).
    // NOTE(review): every run inserts a new row, so re-ingesting a source
    // leaves earlier rows pointing at doc_uuids that no longer exist in the
    // vector store - confirm whether stale rows should be cleaned up.
    try {
        $stmt = Database::getInstance()->prepare(
            "INSERT INTO kb_uploaded_documents
                (qdrant_doc_uuid, scope, consulting_firm_id, organization_id, uploaded_by, title, entity_type, source, lang, chunk_count, shared_with_orgs, status)
             VALUES (?, 'SYSTEM', NULL, NULL, 0, ?, 'normativa', ?, 'it', ?, '[]', 'ready')"
        );
        $stmt->execute([$docUuid, $src['short'], $src['citation'], count($chunks)]);
    } catch (Exception $e) {
        logln(' (warning) tracking insert: ' . $e->getMessage());
    }

    logln(" OK indicizzato (doc_uuid={$docUuid})");
    $done++;
}

logln("=== Completato: {$done} fonti, {$totalChunks} chunk totali ===");
|