nis2-agile/scripts/ingest-acn-requirements.php
DevEnv nis2-agile 6365d5dfda [FIX] ingest-acn: usa upsertBatch/deleteByFilter (metodi reali VectorService)
I metodi corretti sono upsertBatch() e deleteByFilter(), non upsertPoints/deletePoints.
Ingest eseguito su host: 203 requisiti ACN confermati in Qdrant nis2_kb.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-05-31 17:24:38 +02:00

216 lines
9.0 KiB
PHP

<?php
/**
* NIS2 Agile - Ingest 203 requisiti ACN granulari nella Knowledge Base (RAG)
* ----------------------------------------------------------------------------
* Indicizza UNO chunk Qdrant PER OGNI requisito ACN granulare (87 importanti
* dell'Allegato 1 + 116 essenziali dell'Allegato 2 = 203) nella collection
* `nis2_kb` con scope SYSTEM, entity_type='requisito_acn'. Cosi'
* AIService::askWithRag() puo' fare grounding sul singolo requisito e citarlo
* con precisione (subcategory + req_index + funzione + allegato).
*
* Sorgente: docs/nis2/allegati_acn/acn_requirements.json
* struttura reale (verificata): { "importante":[87 record], "essenziale":[116] }
* campi record: entity, function, subcategory, subcategory_text, req_index, requirement
*
* ESEGUIRE SU HETZNER (Qdrant + Voyage raggiungibili):
* docker exec nis2-app php /var/www/nis2-agile/scripts/ingest-acn-requirements.php
*
* IDEMPOTENTE: point id deterministico (UUIDv5 calcolato sul nome
* "<ACN_SOURCE>|<entity>/<subcategory>/<req_index>" nel namespace ACN_NAMESPACE_UUID)
* + purge iniziale per filtro (scope SYSTEM, entity_type requisito_acn, source ACN).
*
* Opzioni: --only=importante|essenziale | --dry-run | --no-purge
*
* NB metodi VectorService REALI: upsertBatch(array), deleteByFilter(array filtro),
* ensureCollection(int dims). EmbedService: embed(text), ->dims (512).
* ============================================================================
*/
// Refuse to run under any SAPI other than the command line.
if (PHP_SAPI !== 'cli') {
    fwrite(STDERR, "Solo CLI\n");
    exit(1);
}

// Project root and application directory; values defined earlier by a caller are respected.
defined('BASE_PATH') || define('BASE_PATH', dirname(__DIR__));
defined('APP_PATH') || define('APP_PATH', BASE_PATH . '/application');

// Configuration first, then the two services used for embedding and vector storage.
require_once APP_PATH . '/config/env.php';
require_once APP_PATH . '/config/config.php';
require_once APP_PATH . '/config/database.php';
require_once APP_PATH . '/services/EmbedService.php';
require_once APP_PATH . '/services/VectorService.php';

// Long options only: --only takes an optional value (--only=<group>), the other two are flags.
$opts = getopt('', ['only::', 'dry-run', 'no-purge']);
$only = $opts['only'] ?? null;
$dryRun = isset($opts['dry-run']);
$noPurge = isset($opts['no-purge']);

// Requirements file, relative to BASE_PATH.
const ACN_JSON = 'docs/nis2/allegati_acn/acn_requirements.json';
// Source label stored in every payload and used in the purge filter.
const ACN_SOURCE = 'ACN, Misure di sicurezza di base NIS - Allegati 1 e 2 alla Determinazione n. 164179/2025';
// Value of the entity_type payload field for the points written by this script.
const ACN_ENTITY_TYPE = 'requisito_acn';
// Namespace fed to uuidv5() when deriving point ids.
// NOTE(review): this is not a well-formed UUID: it contains the non-hex characters 'n' and 's'
// and its last group has 13 characters. Changing it would change every generated point id,
// so confirm the impact on already-ingested points before correcting it.
const ACN_NAMESPACE_UUID = '7f3b6d2a-9c41-5e88-b0a4-2acn164179bas';
/**
 * Echoes one log line: the current date/time in square brackets, a space, the message, a newline.
 *
 * @param string $m Message to print.
 */
function logln(string $m): void
{
    $stamp = date('Y-m-d H:i:s');
    echo '[' . $stamp . "] $m\n";
}
/**
 * Builds a deterministic, name-based UUID (version 5, SHA-1; RFC 4122 section 4.3).
 *
 * The namespace is converted to bytes two hex characters at a time (dashes and braces
 * are removed first; a trailing unpaired character is dropped), the name is appended,
 * and the SHA-1 digest of the result is formatted as a UUID with the version nibble
 * forced to 5 and the two top variant bits forced to binary 10.
 *
 * Characters of the namespace that are not hexadecimal digits are ignored inside their
 * pair. hexdec() already ignored them, so the generated ids are unchanged, but it raised
 * an E_DEPRECATED notice (PHP >= 7.4) for every such pair on every call; they are now
 * removed before the conversion so that no notice is raised.
 *
 * @param string $namespaceUuid Namespace UUID in textual form (dashes/braces optional).
 * @param string $name          Name to hash inside the namespace.
 * @return string Lower-case UUID string in 8-4-4-4-12 form.
 */
function uuidv5(string $namespaceUuid, string $name): string
{
    $nsHex = str_replace(['-', '{', '}'], '', $namespaceUuid);
    $nsLen = strlen($nsHex);

    $nsBytes = '';
    for ($i = 0; $i + 1 < $nsLen; $i += 2) {
        // Keep only the hex digits of this pair; an empty remainder converts to 0.
        $pair = (string) preg_replace('/[^0-9a-fA-F]/', '', substr($nsHex, $i, 2));
        $nsBytes .= chr((int) hexdec($pair));
    }

    $hash = sha1($nsBytes . $name);

    return sprintf(
        '%08s-%04s-%04x-%04x-%12s',
        substr($hash, 0, 8),
        substr($hash, 8, 4),
        // time_hi_and_version: clear the top nibble, then set version 5.
        (hexdec(substr($hash, 12, 4)) & 0x0fff) | 0x5000,
        // clock_seq: clear the two top bits, then set the RFC 4122 variant.
        (hexdec(substr($hash, 16, 4)) & 0x3fff) | 0x8000,
        substr($hash, 20, 12)
    );
}
/**
 * Maps an entity name to the label of the ACN annex it belongs to.
 *
 * The comparison is done on the trimmed, lower-cased value and only looks at its prefix:
 * "import..." gives Allegato 1, "essenz..." gives Allegato 2, anything else the generic label.
 *
 * @param string $entity Entity name as found in the data.
 * @return string Annex label.
 */
function allegatoLabel(string $entity): string
{
    $normalized = strtolower(trim($entity));

    $labelsByPrefix = [
        'import' => 'Allegato 1 (soggetti importanti)',
        'essenz' => 'Allegato 2 (soggetti essenziali)',
    ];
    foreach ($labelsByPrefix as $prefix => $label) {
        if (strncmp($normalized, $prefix, strlen($prefix)) === 0) {
            return $label;
        }
    }

    return 'Allegato ACN';
}
/**
 * Reduces an entity name to its canonical form.
 *
 * The value is trimmed and lower-cased; a prefix of "import" becomes 'importante',
 * a prefix of "essenz" becomes 'essenziale'. Any other non-empty value is returned
 * in its normalized form, and an empty value becomes 'sconosciuta'.
 *
 * @param string $entity Entity name as found in the data.
 * @return string Canonical entity name.
 */
function canonEntity(string $entity): string
{
    $normalized = strtolower(trim($entity));

    if (strncmp($normalized, 'import', 6) === 0) {
        return 'importante';
    }
    if (strncmp($normalized, 'essenz', 6) === 0) {
        return 'essenziale';
    }
    if ($normalized === '') {
        return 'sconosciuta';
    }

    return $normalized;
}
// Locate the requirements file under the project root; stop immediately when it is missing.
$abs = BASE_PATH . '/' . ACN_JSON;
if (!is_file($abs)) {
    fwrite(STDERR, "ERRORE: file non trovato: $abs\n");
    exit(1);
}

// Decode to associative arrays; anything that does not decode to an array is rejected.
$json = json_decode((string)file_get_contents($abs), true);
if (!is_array($json)) {
    fwrite(STDERR, "ERRORE: JSON non valido\n");
    exit(1);
}

// An --only value that names no top-level group (typo, wrong case) used to select nothing
// and end the run with exit code 0. Report it as an error and list the groups that exist.
if ($only && !array_key_exists($only, $json)) {
    fwrite(
        STDERR,
        "ERRORE: --only={$only} non corrisponde ad alcun gruppo del JSON (disponibili: "
        . implode(', ', array_keys($json)) . ")\n"
    );
    exit(1);
}

// Flatten the groups into one list of normalized records.
$records = [];
foreach ($json as $groupKey => $items) {
    if (!is_array($items)) {
        continue;
    }
    // With --only, keep just the group whose key equals the requested value.
    if ($only && $groupKey !== $only) {
        continue;
    }
    foreach ($items as $r) {
        if (!is_array($r)) {
            continue;
        }
        // Records without requirement text are skipped.
        $requirement = trim((string)($r['requirement'] ?? ''));
        if ($requirement === '') {
            continue;
        }
        // The group key stands in for a missing "entity" field.
        $entity = canonEntity((string)($r['entity'] ?? $groupKey));
        $subcategory = trim((string)($r['subcategory'] ?? '')) ?: 'N/D';
        $function = trim((string)($r['function'] ?? '')) ?: 'N/D';
        $subcatText = trim((string)($r['subcategory_text'] ?? ''));
        // A missing req_index defaults to 1.
        $reqIndex = isset($r['req_index']) ? (int)$r['req_index'] : 1;
        // Human-readable code; it is also the variable part of the point id name.
        $code = sprintf('%s/%s/%d', $entity, $subcategory, $reqIndex);

        $records[] = [
            'entity' => $entity,
            'function' => $function,
            'subcategory' => $subcategory,
            'subcatText' => $subcatText,
            'reqIndex' => $reqIndex,
            'requirement' => $requirement,
            'code' => $code,
        ];
    }
}
logln('=== Ingest requisiti ACN granulari (scope SYSTEM, entity_type=' . ACN_ENTITY_TYPE . ') ===');
logln('Requisiti da indicizzare: ' . count($records) . ($only ? " (solo: {$only})" : ''));
if ($dryRun) {
    logln('MODALITA DRY-RUN: nessun embed/upsert.');
}
// Nothing to index: leave before any service is created or any point is deleted.
if (empty($records)) {
    logln('Nessun requisito. Esco.');
    exit(0);
}

// Both services stay null in dry-run mode, where no embedding or upsert is performed.
$embed = null;
$vector = null;
if (!$dryRun) {
    try {
        $embed = new EmbedService();
        $vector = new VectorService();
        // The collection is ensured with the dimension count exposed by the embedder.
        $vector->ensureCollection($embed->dims);
    } catch (Throwable $e) {
        fwrite(STDERR, 'ERRORE init Voyage/Qdrant: ' . $e->getMessage() . "\n");
        exit(1);
    }

    if (!$noPurge) {
        // Delete the points written by earlier runs of this script before re-creating them.
        $purgeMust = [
            ['key' => 'scope', 'match' => ['value' => 'SYSTEM']],
            ['key' => 'entity_type', 'match' => ['value' => ACN_ENTITY_TYPE]],
            ['key' => 'source', 'match' => ['value' => ACN_SOURCE]],
        ];
        // With --only just one group is re-indexed, so the purge must be limited to that
        // group too. Otherwise the points of the other group would be deleted and never
        // re-created by this run. The value is canonicalized the same way as the "entity"
        // payload field written at upsert time.
        if ($only) {
            $purgeMust[] = ['key' => 'entity', 'match' => ['value' => canonEntity((string) $only)]];
        }
        try {
            $vector->deleteByFilter(['must' => $purgeMust]);
            logln('Purge chunk pre-esistenti: OK');
        } catch (Throwable $e) {
            // Best effort: a failed purge is only reported and the ingest continues.
            logln(' (warning) purge fallita: ' . $e->getMessage());
        }
    }
}
// Counters and failed codes read by the final report, plus the queue of points awaiting upsert.
$ok = 0;
$fail = 0;
$failed = [];
$pending = [];
$batchLimit = 32;
$total = count($records);

// Sends the queued points with a single upsertBatch() call and empties the queue.
// Does nothing when the queue is empty. The queue is left untouched if the call throws.
$sendPending = function () use (&$pending, $vector) {
    if (!empty($pending)) {
        $vector->upsertBatch($pending);
        $pending = [];
    }
};

// After a failed upsert: moves every queued point from the "ok" count to the "failed" count,
// remembers its code, drops the queue and logs the (truncated) error.
$writeOffPending = function (string $logPrefix, Throwable $err) use (&$pending, &$ok, &$fail, &$failed) {
    $lost = count($pending);
    $fail += $lost;
    $ok -= $lost;
    foreach ($pending as $point) {
        $failed[] = $point['payload']['code'] ?? '(?)';
    }
    $pending = [];
    logln($logPrefix . $lost . ' persi): ' . substr($err->getMessage(), 0, 120));
};

foreach ($records as $idx => $rec) {
    $human = ($idx + 1) . '/' . $total;
    $allegato = allegatoLabel($rec['entity']);

    // Text that gets embedded: one labelled line per field; the sub-category
    // description line is present only when that text is not empty.
    $lines = [
        'FONTE NORMATIVA: ' . ACN_SOURCE,
        'RIFERIMENTO: ' . $allegato . ' - Soggetti ' . $rec['entity'],
        'FUNZIONE (Framework Nazionale): ' . $rec['function'],
        'SOTTOCATEGORIA: ' . $rec['subcategory'],
    ];
    if ($rec['subcatText'] !== '') {
        $lines[] = 'DESCRIZIONE SOTTOCATEGORIA: ' . $rec['subcatText'];
    }
    $lines[] = 'REQUISITO #' . $rec['reqIndex'] . ': ' . $rec['requirement'];
    $chunkText = implode("\n", $lines);

    // Dry run: show the first three codes, count the record as ok, touch no service.
    if ($dryRun) {
        if ($idx < 3) {
            logln(" [{$human}] {$rec['code']}");
        }
        $ok++;
        continue;
    }

    // Up to five embedding attempts; after a failed attempt k (k < 5) wait k seconds.
    $vec = null;
    $lastErr = '';
    $attempt = 0;
    while ($attempt < 5) {
        $attempt++;
        try {
            $vec = $embed->embed($chunkText);
            break;
        } catch (Throwable $err) {
            $lastErr = $err->getMessage();
            if ($attempt < 5) {
                sleep($attempt);
            }
        }
    }
    if (!is_array($vec)) {
        $fail++;
        $failed[] = $rec['code'];
        logln(" ERRORE [{$human}] embed {$rec['code']}: " . substr($lastErr, 0, 100) . ' -> SKIP');
        continue;
    }

    // Same source + code always yields the same id, so a re-run overwrites the same point.
    $pointId = uuidv5(ACN_NAMESPACE_UUID, ACN_SOURCE . '|' . $rec['code']);
    $payload = [
        'doc_uuid' => $pointId,
        'title' => $rec['subcategory'] . ' #' . $rec['reqIndex'] . ' (' . $rec['entity'] . ')',
        'chunk' => $chunkText,
        'entity_type' => ACN_ENTITY_TYPE,
        'source' => ACN_SOURCE,
        'lang' => 'it',
        'scope' => 'SYSTEM',
        'consulting_firm_id' => null,
        'organization_id' => null,
        'shared_with_orgs' => [],
        'code' => $rec['code'],
        'subcategory' => $rec['subcategory'],
        'subcategory_text' => $rec['subcatText'],
        'function' => $rec['function'],
        'req_index' => $rec['reqIndex'],
        'entity' => $rec['entity'],
        'allegato' => $allegato,
        'requirement_text' => $rec['requirement'],
    ];
    $pending[] = [
        'id' => $pointId,
        'vector' => $vec,
        'payload' => $payload,
    ];
    // Counted as ok as soon as it is queued; taken back if its batch fails to upsert.
    $ok++;

    if (count($pending) < $batchLimit) {
        continue;
    }
    try {
        $sendPending();
        logln(" upsert batch (ok cumulato={$ok})");
    } catch (Throwable $err) {
        $writeOffPending(' ERRORE upsert batch (', $err);
    }
}

// Send whatever is still queued after the last record (never in dry-run mode).
if (!$dryRun) {
    try {
        $sendPending();
    } catch (Throwable $err) {
        $writeOffPending(' ERRORE upsert finale (', $err);
    }
}
// Final report: totals, then at most the first 50 failed codes (an ellipsis marks a longer list).
logln('=== Completato ===');
logln("Indicizzati OK : {$ok}");
logln("Falliti : {$fail}");
if (!empty($failed)) {
    $shown = implode(', ', array_slice($failed, 0, 50));
    $more = count($failed) > 50 ? ' ...' : '';
    logln('Codici falliti : ' . $shown . $more);
}

// Exit status 2 when at least one requirement failed, 0 otherwise.
if ($fail > 0) {
    exit(2);
}
exit(0);