<?php
/**
 * scripts/ingest-acn-requirements.php
 *
 * CLI-only ingest of the granular ACN requirements (one vector point per
 * requirement) read from ACN_JSON into the vector store, with scope SYSTEM
 * and entity_type ACN_ENTITY_TYPE.
 *
 * Options:
 *   --only=<group>  ingest only the given top-level JSON group key
 *   --dry-run       parse and report, without calling embed/upsert
 *   --no-purge      do not delete pre-existing points before ingesting
 *
 * Exit codes: 0 = success, 1 = fatal setup error, 2 = at least one requirement failed.
 *
 * Point ids are deterministic (UUIDv5 of source + requirement code), so re-running
 * the script overwrites the same points instead of creating duplicates.
 */

if (PHP_SAPI !== 'cli') { fwrite(STDERR, "Solo CLI\n"); exit(1); }

if (!defined('BASE_PATH')) define('BASE_PATH', dirname(__DIR__));
if (!defined('APP_PATH')) define('APP_PATH', BASE_PATH . '/application');

require_once APP_PATH . '/config/env.php';
require_once APP_PATH . '/config/config.php';
require_once APP_PATH . '/config/database.php';
require_once APP_PATH . '/services/EmbedService.php';
require_once APP_PATH . '/services/VectorService.php';

$opts = getopt('', ['only::', 'dry-run', 'no-purge']);
// getopt() yields false for a bare "--only" and an array when the option is repeated:
// only a non-empty string is a usable group filter.
$onlyRaw = $opts['only'] ?? null;
$only = (is_string($onlyRaw) && $onlyRaw !== '') ? $onlyRaw : null;
$dryRun = isset($opts['dry-run']);
$noPurge = isset($opts['no-purge']);

const ACN_JSON = 'docs/nis2/allegati_acn/acn_requirements.json';
const ACN_SOURCE = 'ACN, Misure di sicurezza di base NIS - Allegati 1 e 2 alla Determinazione n. 164179/2025';
const ACN_ENTITY_TYPE = 'requisito_acn';
// NOTE: this is not a well-formed UUID (it contains the non-hex characters 'n' and 's'
// and its last group has 13 characters). uuidv5() tolerates that; do NOT "correct" the
// value, because every derived point id would change and idempotence would be lost.
const ACN_NAMESPACE_UUID = '7f3b6d2a-9c41-5e88-b0a4-2acn164179bas';

/** Prints a timestamped log line on stdout. */
function logln(string $m): void { echo '[' . date('Y-m-d H:i:s') . "] $m\n"; }

/**
 * Deterministic UUIDv5 (RFC 4122 §4.3): a stable point id gives idempotent upserts.
 *
 * The namespace is parsed leniently: characters are consumed in pairs, non-hex
 * characters inside a pair are ignored and a trailing unpaired character is dropped.
 * This yields the same bytes hexdec() produces when it skips invalid characters,
 * without triggering the deprecation notice it raises for them since PHP 7.4.
 *
 * @param string $namespaceUuid namespace identifier (dashes and braces are stripped)
 * @param string $name          name to hash inside the namespace
 * @return string               UUID in canonical 8-4-4-4-12 lowercase hex form
 */
function uuidv5(string $namespaceUuid, string $name): string
{
    $nhex = str_replace(['-', '{', '}'], '', $namespaceUuid);
    $nbin = '';
    $len = strlen($nhex);
    for ($i = 0; $i + 1 < $len; $i += 2) {
        $pair = (string)preg_replace('/[^0-9a-fA-F]/', '', substr($nhex, $i, 2));
        $nbin .= chr((int)hexdec($pair));
    }
    $hash = sha1($nbin . $name);
    return sprintf('%08s-%04s-%04x-%04x-%12s',
        substr($hash, 0, 8), substr($hash, 8, 4),
        (hexdec(substr($hash, 12, 4)) & 0x0fff) | 0x5000,   // version 5
        (hexdec(substr($hash, 16, 4)) & 0x3fff) | 0x8000,   // RFC 4122 variant
        substr($hash, 20, 12));
}

/** Human-readable annex label for an entity name (matched by its lowercase prefix). */
function allegatoLabel(string $entity): string
{
    $e = strtolower(trim($entity));
    if (strpos($e, 'import') === 0) return 'Allegato 1 (soggetti importanti)';
    if (strpos($e, 'essenz') === 0) return 'Allegato 2 (soggetti essenziali)';
    return 'Allegato ACN';
}

/** Canonical entity name: 'importante', 'essenziale', the lowercased input, or 'sconosciuta' when empty. */
function canonEntity(string $entity): string
{
    $e = strtolower(trim($entity));
    if (strpos($e, 'import') === 0) return 'importante';
    if (strpos($e, 'essenz') === 0) return 'essenziale';
    return $e !== '' ? $e : 'sconosciuta';
}

// ---------------------------------------------------------------------------
// Load and flatten the JSON: { groupKey: [ {requirement, entity, ...}, ... ] }
// ---------------------------------------------------------------------------
$abs = BASE_PATH . '/' . ACN_JSON;
if (!is_file($abs)) { fwrite(STDERR, "ERRORE: file non trovato: $abs\n"); exit(1); }
$raw = file_get_contents($abs);
if ($raw === false) { fwrite(STDERR, "ERRORE: impossibile leggere il file: $abs\n"); exit(1); }
$json = json_decode($raw, true);
if (!is_array($json)) {
    $why = json_last_error() !== JSON_ERROR_NONE ? ' (' . json_last_error_msg() . ')' : '';
    fwrite(STDERR, "ERRORE: JSON non valido{$why}\n");
    exit(1);
}

$records = [];
$seenCodes = [];
$duplicates = [];
foreach ($json as $groupKey => $items) {
    if (!is_array($items)) continue;
    // Numeric-looking keys are decoded as ints, so compare as strings.
    if ($only !== null && (string)$groupKey !== $only) continue;
    foreach ($items as $r) {
        if (!is_array($r)) continue;
        $requirement = trim((string)($r['requirement'] ?? ''));
        if ($requirement === '') continue;
        $entity = canonEntity((string)($r['entity'] ?? $groupKey));
        $subcategory = trim((string)($r['subcategory'] ?? '')) ?: 'N/D';
        $function = trim((string)($r['function'] ?? '')) ?: 'N/D';
        $subcatText = trim((string)($r['subcategory_text'] ?? ''));
        $reqIndex = isset($r['req_index']) ? (int)$r['req_index'] : 1;
        $code = sprintf('%s/%s/%d', $entity, $subcategory, $reqIndex);
        // The code determines the point id: two records with the same code would
        // silently overwrite each other in the vector store, so report them.
        if (isset($seenCodes[$code])) $duplicates[$code] = true;
        $seenCodes[$code] = true;
        $records[] = compact('entity', 'function', 'subcategory', 'subcatText', 'reqIndex', 'requirement', 'code');
    }
}
$total = count($records);

logln('=== Ingest requisiti ACN granulari (scope SYSTEM, entity_type=' . ACN_ENTITY_TYPE . ') ===');
logln('Requisiti da indicizzare: ' . $total . ($only !== null ? " (solo: {$only})" : ''));
if (!empty($duplicates)) {
    $dupCodes = array_keys($duplicates);
    logln(' (warning) codici duplicati (stesso point id, l\'ultimo sovrascrive): '
        . implode(', ', array_slice($dupCodes, 0, 20)) . (count($dupCodes) > 20 ? ' ...' : ''));
}
if ($dryRun) logln('MODALITA DRY-RUN: nessun embed/upsert.');
if (empty($records)) { logln('Nessun requisito. Esco.'); exit(0); }

// ---------------------------------------------------------------------------
// Service initialisation and optional purge of previously ingested points
// ---------------------------------------------------------------------------
$embed = null; $vector = null;
if (!$dryRun) {
    try {
        $embed = new EmbedService();
        $vector = new VectorService();
        $vector->ensureCollection($embed->dims);
    } catch (Throwable $e) {
        fwrite(STDERR, 'ERRORE init Voyage/Qdrant: ' . $e->getMessage() . "\n"); exit(1);
    }
    if ($only !== null && !$noPurge) {
        // The purge filter matches every ACN requirement point, not just the selected
        // group: purging here would delete the groups that are not re-indexed.
        // Upserts are idempotent (deterministic ids), so skipping the purge is safe.
        logln('Purge saltata: con --only cancellerebbe anche i gruppi non reindicizzati.');
    } elseif (!$noPurge) {
        try {
            $vector->deletePoints(['must' => [
                ['key' => 'scope', 'match' => ['value' => 'SYSTEM']],
                ['key' => 'entity_type', 'match' => ['value' => ACN_ENTITY_TYPE]],
                ['key' => 'source', 'match' => ['value' => ACN_SOURCE]],
            ]]);
            logln('Purge chunk pre-esistenti: OK');
        } catch (Throwable $e) {
            // Best effort: deterministic ids still overwrite the existing points.
            logln(' (warning) purge fallita: ' . $e->getMessage());
        }
    }
}

$ok = 0; $fail = 0; $failed = []; $batch = []; $BATCH_SIZE = 32;

/**
 * Upserts the pending batch and empties it. On failure the points of the batch are
 * moved from the "ok" count to the "failed" count and the error is logged.
 * $stage is the label used in the error message; returns true when nothing failed.
 */
$flush = function (string $stage) use (&$batch, &$ok, &$fail, &$failed, $vector): bool {
    if (empty($batch)) return true;
    try {
        $vector->upsertPoints($batch);
        $batch = [];
        return true;
    } catch (Throwable $e) {
        $lost = count($batch);
        $fail += $lost; $ok -= $lost;
        foreach ($batch as $point) $failed[] = $point['payload']['code'] ?? '(?)';
        $batch = [];
        logln(' ERRORE upsert ' . $stage . ' (' . $lost . ' persi): ' . substr($e->getMessage(), 0, 120));
        return false;
    }
};

// ---------------------------------------------------------------------------
// Embed each requirement and upsert in batches
// ---------------------------------------------------------------------------
foreach ($records as $n => $rec) {
    $human = ($n + 1) . '/' . $total;
    $allegato = allegatoLabel($rec['entity']);
    // array_filter() drops the null entry when the subcategory has no description.
    $chunkText = implode("\n", array_filter([
        'FONTE NORMATIVA: ' . ACN_SOURCE,
        'RIFERIMENTO: ' . $allegato . ' - Soggetti ' . $rec['entity'],
        'FUNZIONE (Framework Nazionale): ' . $rec['function'],
        'SOTTOCATEGORIA: ' . $rec['subcategory'],
        $rec['subcatText'] !== '' ? 'DESCRIZIONE SOTTOCATEGORIA: ' . $rec['subcatText'] : null,
        'REQUISITO #' . $rec['reqIndex'] . ': ' . $rec['requirement'],
    ]));

    if ($dryRun) { if ($n < 3) logln(" [{$human}] {$rec['code']}"); $ok++; continue; }

    // Up to 5 embed attempts, waiting 1s, 2s, 3s, 4s between them.
    $vec = null; $lastErr = '';
    for ($try = 1; $try <= 5; $try++) {
        try { $vec = $embed->embed($chunkText); break; }
        catch (Throwable $e) { $lastErr = $e->getMessage(); if ($try < 5) sleep($try); }
    }
    if (!is_array($vec)) {
        $fail++; $failed[] = $rec['code'];
        logln(" ERRORE [{$human}] embed {$rec['code']}: " . substr($lastErr, 0, 100) . ' -> SKIP');
        continue;
    }

    $pointId = uuidv5(ACN_NAMESPACE_UUID, ACN_SOURCE . '|' . $rec['code']);
    $batch[] = [
        'id' => $pointId,
        'vector' => $vec,
        'payload' => [
            'doc_uuid' => $pointId,
            'title' => $rec['subcategory'] . ' #' . $rec['reqIndex'] . ' (' . $rec['entity'] . ')',
            'chunk' => $chunkText,
            'entity_type' => ACN_ENTITY_TYPE,
            'source' => ACN_SOURCE,
            'lang' => 'it',
            'scope' => 'SYSTEM',
            'consulting_firm_id' => null,
            'organization_id' => null,
            'shared_with_orgs' => [],
            'code' => $rec['code'],
            'subcategory' => $rec['subcategory'],
            'subcategory_text' => $rec['subcatText'],
            'function' => $rec['function'],
            'req_index' => $rec['reqIndex'],
            'entity' => $rec['entity'],
            'allegato' => $allegato,
            'requirement_text' => $rec['requirement'],
        ],
    ];
    $ok++;
    if (count($batch) >= $BATCH_SIZE && $flush('batch')) {
        logln(" upsert batch (ok cumulato={$ok})");
    }
}

if (!$dryRun) {
    $flush('finale');
}

logln('=== Completato ===');
logln("Indicizzati OK : {$ok}");
logln("Falliti : {$fail}");
if (!empty($failed)) logln('Codici falliti : ' . implode(', ', array_slice($failed, 0, 50)) . (count($failed) > 50 ? ' ...' : ''));
exit($fail > 0 ? 2 : 0);