nis2-agile/scripts/ingest-nis2-sources.php

<?php
/**
 * NIS2 Agile - Ingest Fonti Normative Certe nella Knowledge Base (RAG)
 * ----------------------------------------------------------------------------
 * Indicizza i PDF normativi ufficiali (docs/nis2/*.pdf, registrati in
 * application/config/nis2_sources.php) nella collection Qdrant `nis2_kb` con
 * scope SYSTEM, cosi' che AIService::askWithRag() possa citare le fonti certe.
 *
 * ESEGUIRE SU HETZNER (richiede accesso a Qdrant + Voyage), es:
 *   docker exec -i nis2-app php /var/www/nis2-agile/scripts/ingest-nis2-sources.php
 *   # oppure dalla root del progetto:
 *   php scripts/ingest-nis2-sources.php
 *
 * Estrazione testo: usa `pdftotext` (poppler-utils) se disponibile, altrimenti
 * ricade sull'API document di Claude. Idempotente: cancella i chunk SYSTEM del
 * documento (per `source` stabile) prima del re-upsert.
 *
 * Opzioni:
 *   --only=determina_164179_2025   ingerisce una sola fonte (key del registry)
 *   --dry-run                      estrae e mostra le statistiche senza upsert
 * ============================================================================
 */

// This script is meant for the command line only: refuse any other SAPI.
if (PHP_SAPI !== 'cli') {
    fwrite(STDERR, "Solo CLI\n");
    exit(1);
}

// Project root (parent of scripts/) and application directory, unless a
// bootstrap that ran earlier has already defined them.
defined('BASE_PATH') || define('BASE_PATH', dirname(__DIR__));
defined('APP_PATH') || define('APP_PATH', BASE_PATH . '/application');

require_once APP_PATH . '/config/env.php';
require_once APP_PATH . '/config/config.php';
require_once APP_PATH . '/config/database.php';
require_once APP_PATH . '/services/EmbedService.php';
require_once APP_PATH . '/services/VectorService.php';

// Parse the command-line options. getopt() returns false on failure, which is
// normalised to "no options given".
$opts = getopt('', ['only::', 'dry-run']);
if (!is_array($opts)) {
    $opts = [];
}
$only = $opts['only'] ?? null;
$dryRun = isset($opts['dry-run']);

// `only::` is an optional-value option: `--only key` (space separated) and a
// bare `--only` are both reported as false, and a repeated option as an array.
// Without this check the filter would be dropped silently and every source in
// the registry would be re-ingested, so stop with a usage error instead.
if ($only !== null && (!is_string($only) || $only === '')) {
    fwrite(STDERR, "Uso: --only=<key> (una sola chiave del registry, separata da '=')\n");
    exit(1);
}

// Registry of normative sources, returned by the config file.
$sources = require APP_PATH . '/config/nis2_sources.php';

/**
 * Print a message to standard output, prefixed with the current timestamp
 * in `[Y-m-d H:i:s]` form and followed by a newline.
 *
 * @param string $m Message to print.
 */
function logln(string $m): void
{
    printf("[%s] %s\n", date('Y-m-d H:i:s'), $m);
}

/**
 * Extract the plain text of a PDF.
 *
 * Strategies are tried in order; the first two are accepted only when they
 * yield more than 200 bytes after trimming:
 *   0. a pre-extracted text cache next to the PDF (`<file>.pdf.txt`);
 *   1. the `pdftotext` binary, when it is found on PATH;
 *   2. the Anthropic Messages API, with the PDF attached as a base64 document.
 *
 * @param string $absPath Absolute path of the PDF file.
 * @return string Extracted text. The API result is returned without a length
 *                check; '' is returned when the API fallback cannot be used
 *                or fails (the reason is logged).
 */
function extractPdfText(string $absPath): string
{
    // 0) Pre-extracted text cache next to the PDF (<file>.pdf.txt).
    //    Useful when the ingest runs in a container without pdftotext:
    //    extract on the host first, then read the .txt back here.
    $cache = $absPath . '.txt';
    if (is_file($cache)) {
        $t = (string) file_get_contents($cache);
        if (strlen(trim($t)) > 200) { logln('  uso cache testo: ' . basename($cache)); return $t; }
    }

    // 1) pdftotext (fast, free)
    $bin = trim((string)@shell_exec('command -v pdftotext 2>/dev/null'));
    if ($bin !== '') {
        // tempnam() already creates the file: let pdftotext write straight into
        // it, so no orphaned temp file is left behind.
        $tmp = tempnam(sys_get_temp_dir(), 'nis2pdf');
        if ($tmp !== false) {
            @shell_exec(escapeshellcmd($bin) . ' -enc UTF-8 -nopgbrk ' . escapeshellarg($absPath) . ' ' . escapeshellarg($tmp) . ' 2>/dev/null');
            $txt = is_file($tmp) ? (string)file_get_contents($tmp) : '';
            @unlink($tmp);
            if (strlen(trim($txt)) > 200) return $txt;
        }
    }

    // 2) Fallback: Claude document API
    logln('  pdftotext non disponibile/insufficiente -> uso Claude document API');
    if (!defined('ANTHROPIC_API_KEY') || (string) ANTHROPIC_API_KEY === '') {
        logln('  ERRORE: ANTHROPIC_API_KEY non configurata, impossibile usare il fallback');
        return '';
    }
    $raw = file_get_contents($absPath);
    if ($raw === false || $raw === '') {
        // Do not spend an API call on an empty/unreadable document.
        logln('  ERRORE: impossibile leggere il PDF ' . basename($absPath));
        return '';
    }
    $body = [
        'model'      => defined('ANTHROPIC_MODEL') ? ANTHROPIC_MODEL : 'claude-sonnet-4-5-20250929',
        'max_tokens' => 8000,
        'messages'   => [[
            'role' => 'user',
            'content' => [
                ['type' => 'document', 'source' => ['type' => 'base64', 'media_type' => 'application/pdf', 'data' => base64_encode($raw)]],
                ['type' => 'text', 'text' => 'Estrai integralmente il testo di questo documento normativo in testo semplice, mantenendo numeri di articolo, commi, allegati e tabelle. Non riassumere, non commentare.'],
            ],
        ]],
    ];
    $ch = curl_init('https://api.anthropic.com/v1/messages');
    curl_setopt_array($ch, [
        CURLOPT_RETURNTRANSFER => true,
        CURLOPT_POST => true,
        CURLOPT_HTTPHEADER => [
            'content-type: application/json',
            'x-api-key: ' . ANTHROPIC_API_KEY,
            'anthropic-version: 2023-06-01',
        ],
        CURLOPT_POSTFIELDS => json_encode($body),
        CURLOPT_TIMEOUT => 180,
    ]);
    $res = curl_exec($ch);
    if ($res === false) { logln('  ERRORE curl: ' . curl_error($ch)); curl_close($ch); return ''; }
    $status = (int) curl_getinfo($ch, CURLINFO_RESPONSE_CODE);
    curl_close($ch);

    $j = json_decode($res, true);
    if ($status < 200 || $status >= 300 || !is_array($j)) {
        // Surface API failures (auth, payload too large, rate limit, ...)
        // instead of silently returning an empty text.
        $detail = is_array($j) ? ($j['error']['message'] ?? '') : '';
        logln("  ERRORE Claude API (HTTP {$status})" . (is_string($detail) && $detail !== '' ? ': ' . $detail : ''));
        return '';
    }
    if (($j['stop_reason'] ?? '') === 'max_tokens') {
        // The reply hit the max_tokens cap: the extracted text is incomplete.
        logln('  (warning) risposta Claude troncata a max_tokens: testo estratto incompleto');
    }

    // Join every text block of the reply, not only the first one.
    $text = '';
    $blocks = $j['content'] ?? [];
    if (is_array($blocks)) {
        foreach ($blocks as $block) {
            if (is_array($block) && ($block['type'] ?? '') === 'text' && is_string($block['text'] ?? null)) {
                $text .= $block['text'];
            }
        }
    }
    return $text;
}

/**
 * Split a text into overlapping chunks measured in characters (not bytes).
 *
 * Each chunk holds at most $size characters and starts ($size - $overlap)
 * characters after the previous one. Chunks made only of whitespace are
 * dropped. Invalid UTF-8 sequences are sanitised first so that every chunk
 * can be JSON-encoded.
 *
 * @param string $text    Text to split.
 * @param int    $size    Maximum chunk length in characters; must be >= 1.
 * @param int    $overlap Characters shared by consecutive chunks; must satisfy
 *                        0 <= $overlap < $size.
 * @return array List of chunk strings in document order.
 * @throws InvalidArgumentException When $size or $overlap is out of range
 *                                  (such values used to loop forever or to
 *                                  skip parts of the text).
 */
function chunkText(string $text, int $size = 2000, int $overlap = 200): array
{
    if ($size < 1) {
        throw new InvalidArgumentException('chunkText: $size must be >= 1');
    }
    if ($overlap < 0 || $overlap >= $size) {
        throw new InvalidArgumentException('chunkText: $overlap must satisfy 0 <= $overlap < $size');
    }

    // Multibyte-safe: mb_* functions never cut a UTF-8 character in half
    // (a broken character would make json_encode fail -> Voyage HTTP 400).
    $text = mb_convert_encoding($text, 'UTF-8', 'UTF-8'); // sanitise invalid sequences
    $len = mb_strlen($text, 'UTF-8');
    $step = $size - $overlap; // always >= 1 thanks to the checks above

    $chunks = [];
    for ($start = 0; $start < $len; $start += $step) {
        // mb_substr() stops at the end of the string, so the last chunk may be shorter.
        $piece = mb_substr($text, $start, $size, 'UTF-8');
        if (trim($piece) !== '') $chunks[] = $piece;
        // This chunk already reached the end: a further one would only repeat the overlap.
        if ($start + $size >= $len) break;
    }
    return $chunks;
}

/**
 * Generate a random (version 4, RFC 4122 variant) UUID.
 *
 * @return string Lowercase UUID in the canonical 8-4-4-4-12 hex layout.
 */
function uuid(): string
{
    $bytes = random_bytes(16);
    // High nibble of byte 6 carries the version (4).
    $bytes[6] = chr((ord($bytes[6]) & 0x0f) | 0x40);
    // Top two bits of byte 8 carry the variant (binary 10).
    $bytes[8] = chr((ord($bytes[8]) & 0x3f) | 0x80);

    $hex = bin2hex($bytes);
    return sprintf(
        '%s-%s-%s-%s-%s',
        substr($hex, 0, 8),
        substr($hex, 8, 4),
        substr($hex, 12, 4),
        substr($hex, 16, 4),
        substr($hex, 20, 12)
    );
}

// Start-up banner, plus a notice when nothing will be written.
logln('=== Ingest fonti normative NIS2 nella KB (scope SYSTEM) ===');
if ($dryRun) {
    logln('MODALITA DRY-RUN: nessun upsert.');
}

// The embedding and vector-store clients are created for real runs only;
// in dry-run mode both stay null and no external service is contacted here.
$embed  = $dryRun ? null : new EmbedService();
$vector = $dryRun ? null : new VectorService();
if ($vector !== null) {
    // Make sure the collection exists, passing the embedder's `dims`
    // (presumably the embedding vector size - confirm in EmbedService).
    $vector->ensureCollection($embed->dims);
}

// Fail fast when --only names a key that is not in the registry: otherwise the
// loop below would skip every source and still report a successful run.
if (is_string($only) && $only !== '' && is_array($sources) && !array_key_exists($only, $sources)) {
    logln("ERRORE: fonte '{$only}' non presente nel registry (chiavi: " . implode(', ', array_keys($sources)) . ')');
    exit(1);
}

// Main loop: for every registered source extract the text, chunk it, embed
// the chunks, replace the SYSTEM chunks in the vector store and record the
// document in MySQL.
$totalChunks = 0; $done = 0;
foreach ($sources as $key => $src) {
    if ($only && $key !== $only) continue;
    if (empty($src['file'])) { logln("SKIP {$key}: nessun file PDF associato"); continue; }

    $abs = BASE_PATH . '/' . $src['file'];
    if (!is_file($abs)) { logln("SKIP {$key}: file non trovato {$abs}"); continue; }

    logln("Fonte: {$src['short']} ({$src['file']})");
    // Collapse runs of spaces/tabs and of 3+ newlines. preg_replace() returns
    // null on a PCRE failure, hence the string casts.
    $text = (string) preg_replace('/[ \t]+/', ' ', extractPdfText($abs));
    $text = (string) preg_replace('/\n{3,}/', "\n\n", trim($text));
    if (strlen($text) < 200) { logln("  ERRORE: testo estratto troppo breve, salto."); continue; }

    // Citation header prepended to the document: helps the model cite correctly.
    $header = "FONTE NORMATIVA: {$src['citation']}\nAUTORITA: {$src['authority']}\n\n";
    $chunks = chunkText($header . $text, 2000, 200);
    logln('  testo: ' . strlen($text) . ' char -> ' . count($chunks) . ' chunk');
    $totalChunks += count($chunks);

    if ($dryRun) { $done++; continue; }

    // Compute every embedding BEFORE touching the vector store: if embedding
    // fails, the chunks already indexed for this source stay in place instead
    // of being deleted with nothing to replace them.
    $docUuid = uuid();
    $points = [];
    foreach ($chunks as $i => $chunk) {
        // Embedding with retry/backoff: Voyage may return transient errors
        // (HTTP 0 timeout / 429 rate limit) on large volumes of chunks.
        $vec = null;
        for ($try = 1; $try <= 5; $try++) {
            try { $vec = $embed->embed($chunk); break; }
            catch (Throwable $e) {
                if ($try === 5) { logln("  ERRORE embed chunk {$i} dopo 5 tentativi: " . $e->getMessage()); throw $e; }
                logln("  retry embed chunk {$i} (tentativo {$try}): " . substr($e->getMessage(), 0, 60));
                sleep($try); // linear backoff: 1s, 2s, 3s, 4s
            }
        }
        $points[] = [
            'id' => uuid(),
            'vector' => $vec,
            'payload' => [
                'doc_uuid'           => $docUuid,
                'title'              => $src['short'] . ($i > 0 ? ' (parte ' . ($i + 1) . ')' : ''),
                'chunk'              => $chunk,
                'entity_type'        => 'normativa',
                'source'             => $src['citation'],
                'lang'               => 'it',
                'scope'              => 'SYSTEM',
                'consulting_firm_id' => null,
                'organization_id'    => null,
                'shared_with_orgs'   => [],
                'uploaded_by'        => 0,
            ],
        ];
    }

    // Idempotence: remove the SYSTEM chunks already stored for this source
    // (best-effort: a failure is logged and the upsert still proceeds).
    try {
        $vector->deleteByFilter(['must' => [
            ['key' => 'scope', 'match' => ['value' => 'SYSTEM']],
            ['key' => 'source', 'match' => ['value' => $src['citation']]],
        ]]);
    } catch (Exception $e) { logln('  (warning) delete precedente: ' . $e->getMessage()); }

    // Upsert in batches (to stay within payload limits).
    foreach (array_chunk($points, 64) as $batch) {
        $vector->upsertBatch($batch);
    }

    // MySQL tracking (best-effort). Rows left by previous runs for the same
    // SYSTEM source point at doc_uuids whose chunks were just deleted, so they
    // are removed with the same scope/source criteria before the new insert.
    try {
        $db = Database::getInstance();
        $purge = $db->prepare("DELETE FROM kb_uploaded_documents WHERE scope = 'SYSTEM' AND source = ?");
        $purge->execute([$src['citation']]);
        $stmt = $db->prepare(
            "INSERT INTO kb_uploaded_documents
             (qdrant_doc_uuid, scope, consulting_firm_id, organization_id, uploaded_by, title, entity_type, source, lang, chunk_count, shared_with_orgs, status)
             VALUES (?, 'SYSTEM', NULL, NULL, 0, ?, 'normativa', ?, 'it', ?, '[]', 'ready')"
        );
        $stmt->execute([$docUuid, $src['short'], $src['citation'], count($chunks)]);
    } catch (Exception $e) { logln('  (warning) tracking insert: ' . $e->getMessage()); }

    logln("  OK indicizzato (doc_uuid={$docUuid})");
    $done++;
}

logln("=== Completato: {$done} fonti, {$totalChunks} chunk totali ===");