fallback Claude document API. */

/**
 * Extracts plain text from a PDF, trying three strategies in order:
 *   0) a pre-extracted "<file>.pdf.txt" cache sitting next to the PDF,
 *   1) the local `pdftotext` binary,
 *   2) the Anthropic Messages API with a base64 document block.
 *
 * A strategy's result is accepted only when it has more than 200 non-blank
 * bytes; otherwise the next strategy is tried.
 *
 * @param string $absPath Absolute path of the PDF file.
 * @return string Extracted text, or '' when every strategy failed.
 */
function extractPdfText(string $absPath): string
{
    // 0) Pre-extracted text cache next to the PDF (.pdf.txt).
    //    Useful when the ingest runs in a container without pdftotext:
    //    extract on the host first, then re-read the .txt here.
    $cache = $absPath . '.txt';
    if (is_file($cache)) {
        $t = (string) file_get_contents($cache);
        if (strlen(trim($t)) > 200) {
            logln(' uso cache testo: ' . basename($cache));
            return $t;
        }
    }

    // 1) pdftotext (fast, free)
    $bin = trim((string) @shell_exec('command -v pdftotext 2>/dev/null'));
    if ($bin !== '') {
        // tempnam() CREATES the file it names; the '.txt' sibling is a second
        // file, so both must be removed or the base file leaks on every call.
        $base = tempnam(sys_get_temp_dir(), 'nis2pdf');
        if ($base !== false) {
            $tmp = $base . '.txt';
            $txt = '';
            try {
                @shell_exec(
                    escapeshellarg($bin) . ' -enc UTF-8 -nopgbrk '
                    . escapeshellarg($absPath) . ' ' . escapeshellarg($tmp) . ' 2>/dev/null'
                );
                $txt = is_file($tmp) ? (string) file_get_contents($tmp) : '';
            } finally {
                @unlink($tmp);
                @unlink($base);
            }
            if (strlen(trim($txt)) > 200) {
                return $txt;
            }
        }
    }

    // 2) Fallback: Claude document API
    logln(' pdftotext non disponibile/insufficiente -> uso Claude document API');

    // Without a key the request cannot succeed; report it instead of dying
    // on an undefined-constant Error.
    if (!defined('ANTHROPIC_API_KEY') || (string) ANTHROPIC_API_KEY === '') {
        logln(' ERRORE: ANTHROPIC_API_KEY non configurata');
        return '';
    }

    $raw = file_get_contents($absPath);
    if ($raw === false || $raw === '') {
        logln(' ERRORE: impossibile leggere il PDF ' . basename($absPath));
        return '';
    }

    $body = [
        'model' => defined('ANTHROPIC_MODEL') ? ANTHROPIC_MODEL : 'claude-sonnet-4-5-20250929',
        'max_tokens' => 8000,
        'messages' => [[
            'role' => 'user',
            'content' => [
                [
                    'type' => 'document',
                    'source' => [
                        'type' => 'base64',
                        'media_type' => 'application/pdf',
                        'data' => base64_encode($raw),
                    ],
                ],
                [
                    'type' => 'text',
                    'text' => 'Estrai integralmente il testo di questo documento normativo in testo semplice, mantenendo numeri di articolo, commi, allegati e tabelle. Non riassumere, non commentare.',
                ],
            ],
        ]],
    ];

    $payload = json_encode($body);
    if ($payload === false) {
        logln(' ERRORE json_encode: ' . json_last_error_msg());
        return '';
    }

    $ch = curl_init('https://api.anthropic.com/v1/messages');
    curl_setopt_array($ch, [
        CURLOPT_RETURNTRANSFER => true,
        CURLOPT_POST => true,
        CURLOPT_HTTPHEADER => [
            'content-type: application/json',
            'x-api-key: ' . ANTHROPIC_API_KEY,
            'anthropic-version: 2023-06-01',
        ],
        CURLOPT_POSTFIELDS => $payload,
        CURLOPT_TIMEOUT => 180,
    ]);
    $res = curl_exec($ch);
    if ($res === false) {
        logln(' ERRORE curl: ' . curl_error($ch));
        curl_close($ch);
        return '';
    }
    $status = (int) curl_getinfo($ch, CURLINFO_HTTP_CODE);
    curl_close($ch);

    $j = json_decode((string) $res, true);

    // Surface API-level failures (auth, rate limit, oversized request, ...)
    // instead of silently returning an empty string.
    if ($status < 200 || $status >= 300) {
        $detail = (is_array($j) && isset($j['error']['message']) && is_string($j['error']['message']))
            ? $j['error']['message']
            : substr((string) $res, 0, 200);
        logln(' ERRORE Claude API HTTP ' . $status . ': ' . $detail);
        return '';
    }
    if (!is_array($j)) {
        return '';
    }

    // The response may hold several content blocks: join every text block
    // rather than reading only the first one.
    $out = '';
    $blocks = (isset($j['content']) && is_array($j['content'])) ? $j['content'] : [];
    foreach ($blocks as $block) {
        if (is_array($block) && ($block['type'] ?? '') === 'text' && is_string($block['text'] ?? null)) {
            $out .= $block['text'];
        }
    }

    // max_tokens caps the output: long documents come back cut off.
    if (($j['stop_reason'] ?? null) === 'max_tokens') {
        logln(' (warning) risposta Claude troncata per max_tokens: testo estratto incompleto');
    }

    return $out;
}

/**
 * Splits text into overlapping chunks measured in UTF-8 characters.
 *
 * Uses mb_* functions so multi-byte characters are never cut in half
 * (a split character makes json_encode produce an invalid body, which the
 * embedding API rejects with HTTP 400). Whitespace-only chunks are dropped.
 *
 * @param string $text    Text to split.
 * @param int    $size    Maximum characters per chunk; must be >= 1.
 * @param int    $overlap Characters shared by consecutive chunks; clamped to
 *                        0..$size-1 so the window always advances.
 * @return list<string>
 * @throws \InvalidArgumentException When $size is lower than 1.
 */
function chunkText(string $text, int $size = 2000, int $overlap = 200): array
{
    if ($size < 1) {
        throw new \InvalidArgumentException('chunkText: $size must be >= 1');
    }
    // An overlap >= size would give a zero or negative step (endless loop);
    // a negative overlap would skip text between chunks.
    $overlap = max(0, min($overlap, $size - 1));
    $step = $size - $overlap;

    $text = mb_convert_encoding($text, 'UTF-8', 'UTF-8'); // scrub invalid byte sequences
    $len = mb_strlen($text, 'UTF-8');

    $chunks = [];
    for ($start = 0; $start < $len; $start += $step) {
        $piece = mb_substr($text, $start, $size, 'UTF-8');
        if (trim($piece) !== '') {
            $chunks[] = $piece;
        }
        // This window already reached the end of the text: stop here so no
        // trailing chunk made only of overlap is emitted.
        if ($start + $size >= $len) {
            break;
        }
    }
    return $chunks;
}

/**
 * Generates a random UUID string (version 4 layout) from 16 random bytes.
 *
 * @return string Lower-case hex UUID in 8-4-4-4-12 form.
 */
function uuid(): string
{
    $b = random_bytes(16);
    $b[6] = chr((ord($b[6]) & 0x0f) | 0x40); // version nibble = 4
    $b[8] = chr((ord($b[8]) & 0x3f) | 0x80); // variant bits = 10
    return vsprintf('%s%s-%s-%s-%s-%s%s%s', str_split(bin2hex($b), 4));
}

logln('=== Ingest fonti normative NIS2 nella KB (scope SYSTEM) ===');
if ($dryRun) {
    logln('MODALITA DRY-RUN: nessun upsert.');
}

// Services are created only for a real run; a dry run never touches them.
$embed = null;
$vector = null;
if (!$dryRun) {
    $embed = new EmbedService();
    $vector = new VectorService();
    $vector->ensureCollection($embed->dims);
}

$totalChunks = 0;
$done = 0;

foreach ($sources as $key => $src) {
    if ($only && $key !== $only) {
        continue;
    }
    if (empty($src['file'])) {
        logln("SKIP {$key}: nessun file PDF associato");
        continue;
    }
    $abs = BASE_PATH . '/' . $src['file'];
    if (!is_file($abs)) {
        logln("SKIP {$key}: file non trovato {$abs}");
        continue;
    }

    logln("Fonte: {$src['short']} ({$src['file']})");
    $text = extractPdfText($abs);
    // preg_replace() returns null on a PCRE error; the cast turns that into
    // an empty string, which the length check below then skips.
    $text = (string) preg_replace('/[ \t]+/', ' ', $text);
    $text = (string) preg_replace('/\n{3,}/', "\n\n", trim($text));
    if (strlen($text) < 200) {
        logln(" ERRORE: testo estratto troppo breve, salto.");
        continue;
    }

    // Citation prefix on every document: helps the model cite correctly.
    $header = "FONTE NORMATIVA: {$src['citation']}\nAUTORITA: {$src['authority']}\n\n";
    $chunks = chunkText($header . $text, 2000, 200);
    logln(' testo: ' . strlen($text) . ' char -> ' . count($chunks) . ' chunk');
    $totalChunks += count($chunks);

    if ($dryRun) {
        $done++;
        continue;
    }

    // Compute every embedding BEFORE touching the existing vectors: if the
    // embedding step fails for good, the script aborts while the previously
    // indexed chunks of this source are still in place.
    $docUuid = uuid();
    $points = [];
    foreach ($chunks as $i => $chunk) {
        // Embedding with retry/backoff: the provider can return transient
        // errors (HTTP 0 timeout / 429 rate limit) on large chunk volumes.
        $vec = null;
        for ($try = 1; $try <= 5; $try++) {
            try {
                $vec = $embed->embed($chunk);
                break;
            } catch (Throwable $e) {
                if ($try === 5) {
                    logln(" ERRORE embed chunk {$i} dopo 5 tentativi: " . $e->getMessage());
                    throw $e;
                }
                logln(" retry embed chunk {$i} (tentativo {$try}): " . substr($e->getMessage(), 0, 60));
                sleep($try); // linear backoff 1s,2s,3s,4s
            }
        }
        $points[] = [
            'id' => uuid(),
            'vector' => $vec,
            'payload' => [
                'doc_uuid' => $docUuid,
                'title' => $src['short'] . ($i > 0 ? ' (parte ' . ($i + 1) . ')' : ''),
                'chunk' => $chunk,
                'entity_type' => 'normativa',
                'source' => $src['citation'],
                'lang' => 'it',
                'scope' => 'SYSTEM',
                'consulting_firm_id' => null,
                'organization_id' => null,
                'shared_with_orgs' => [],
                'uploaded_by' => 0,
            ],
        ];
    }

    // Idempotence: remove the existing SYSTEM chunks for this source. Must
    // run before the upsert, since the new points carry the same
    // scope/source values the filter matches.
    try {
        $vector->deleteByFilter(['must' => [
            ['key' => 'scope', 'match' => ['value' => 'SYSTEM']],
            ['key' => 'source', 'match' => ['value' => $src['citation']]],
        ]]);
    } catch (Exception $e) {
        logln(' (warning) delete precedente: ' . $e->getMessage());
    }

    // Upsert in batches (to stay under payload limits).
    foreach (array_chunk($points, 64) as $batch) {
        $vector->upsertBatch($batch);
    }

    // MySQL tracking (best-effort).
    // NOTE(review): this only INSERTs; rows written by earlier runs for the
    // same source are not removed here - confirm whether that is intended.
    try {
        $stmt = Database::getInstance()->prepare(
            "INSERT INTO kb_uploaded_documents (qdrant_doc_uuid, scope, consulting_firm_id, organization_id, uploaded_by, title, entity_type, source, lang, chunk_count, shared_with_orgs, status) "
            . "VALUES (?, 'SYSTEM', NULL, NULL, 0, ?, 'normativa', ?, 'it', ?, '[]', 'ready')"
        );
        $stmt->execute([$docUuid, $src['short'], $src['citation'], count($chunks)]);
    } catch (Exception $e) {
        logln(' (warning) tracking insert: ' . $e->getMessage());
    }

    logln(" OK indicizzato (doc_uuid={$docUuid})");
    $done++;
}

logln("=== Completato: {$done} fonti, {$totalChunks} chunk totali ===");