$options Per-call overrides; merged over the rule's `extractor_config['ai']` entries (later keys win).
     * @return array Decoded JSON from the AI reply, or [] when credentials are missing,
     *               the request fails, or the reply cannot be decoded (see lastError()).
     */
    public function extract(CrawlRule $rule, string $html, array $options = []): array
    {
        $this->lastError = null;

        $credentials = $this->resolveCredentials();
        if ($credentials === null) {
            $this->lastError = 'AI credentials not configured.';

            return [];
        }

        // Rule-level AI settings form the base; explicit call options override them.
        $ruleAiOptions = is_array($rule->extractor_config['ai'] ?? null) ? $rule->extractor_config['ai'] : [];
        $mergedOptions = array_merge($ruleAiOptions, $options);

        $model = $this->resolveModel($rule->ai_model, $mergedOptions);
        $temperature = $this->resolveTemperature($mergedOptions);
        $contentMaxChars = $this->resolveContentMaxChars($mergedOptions, 12000);

        // Field list advertised to the model; depends on whether the rule targets the "tool" module.
        $targetSchema = $rule->target_module?->value === 'tool'
            ? 'name, summary, official_url, pricing_type, platform, language, description, logo_url'
            : 'name, summary, provider, modality, deployment_mode, context_window, price_input, price_output, description';

        // NOTE(review): the heredoc holding the default user prompt, together with the start of the
        // `$systemPrompt = $this->` assignment, is truncated in this view of the file. The statement
        // below is reproduced exactly as it appears and must be restored from the original source.
        $defaultUserPrompt = <<resolveSystemPrompt($mergedOptions, '你是一个精确的信息抽取引擎。');
        $userPrompt = $this->resolveUserPrompt($mergedOptions, $defaultUserPrompt);

        // Tags are stripped here: only the page text is sent for data extraction.
        $content = $this->requestAiContent(
            credentials: $credentials,
            model: $model,
            temperature: $temperature,
            systemPrompt: $systemPrompt,
            userPrompt: $userPrompt,
            html: $html,
            contentMaxChars: $contentMaxChars,
            stripTags: true,
        );

        return $this->decodeJsonContent($content);
    }

    /**
     * Whether both an endpoint and an API key can be resolved from configuration.
     */
    public function isConfigured(): bool
    {
        return $this->resolveCredentials() !== null;
    }

    /**
     * Error message recorded by the most recent extract()/suggestExtractorConfig() call, or null.
     */
    public function lastError(): ?string
    {
        return $this->lastError;
    }

    /**
     * Ask the AI to propose an extractor configuration for the given page.
     *
     * Unlike extract(), the HTML is sent with its markup intact (stripTags: false).
     *
     * @param string $targetModule 'tool' or 'model'; any other value falls back to 'tool'.
     * @param string $html Raw page HTML.
     * @param array<string, mixed> $options Optional overrides read by the resolve*() helpers
     *                                      (model, temperature, content_max_chars, system_prompt, user_prompt).
     * @return array ['list_link_xpath' => string, 'fields' => array] on success, [] on failure
     *               (the reason is available through lastError()).
     */
    public function suggestExtractorConfig(string $targetModule, string $html, array $options = []): array
    {
        $this->lastError = null;

        $credentials = $this->resolveCredentials();
        if ($credentials === null) {
            $this->lastError = 'AI credentials not configured.';

            return [];
        }

        $targetModule = in_array($targetModule, ['tool', 'model'], true) ? $targetModule : 'tool';
        $fields = $targetModule === 'tool'
            ? ['name', 'summary', 'official_url', 'pricing_type', 'platform', 'language', 'description', 'logo_url']
            : ['name', 'summary', 'provider', 'modality', 'deployment_mode', 'context_window', 'price_input', 'price_output', 'description'];

        // NOTE(review): the heredoc body of the default user prompt is truncated in this view of the
        // file (only its tail, which interpolates implodeFields($fields), survives). The statement
        // below is reproduced exactly as it appears and must be restored from the original source.
        $defaultUserPrompt = <<implodeFields($fields)}。 PROMPT;

        $model = $this->resolveModel((string) ($options['model'] ?? null), $options);
        $temperature = $this->resolveTemperature($options);
        $contentMaxChars = $this->resolveContentMaxChars($options, 16000);
        $systemPrompt = $this->resolveSystemPrompt($options, '你是 XPath 规则设计专家,擅长从 HTML 生成稳定的抽取规则。');
        $userPrompt = $this->resolveUserPrompt($options, $defaultUserPrompt);

        $content = $this->requestAiContent(
            credentials: $credentials,
            model: $model,
            temperature: $temperature,
            systemPrompt: $systemPrompt,
            userPrompt: $userPrompt,
            html: $html,
            contentMaxChars: $contentMaxChars,
            stripTags: false,
        );

        // decodeJsonContent() always returns an array, so an undecodable reply arrives here as []
        // and is rejected by the same check as a reply that lacks a usable "fields" entry.
        $decoded = $this->decodeJsonContent($content);
        $fieldsConfig = is_array($decoded['fields'] ?? null) ? $decoded['fields'] : [];
        if ($fieldsConfig === []) {
            // Keep a more specific transport/HTTP error if one was already recorded.
            $this->lastError = $this->lastError ?: 'AI response does not include fields config.';

            return [];
        }

        return [
            'list_link_xpath' => is_string($decoded['list_link_xpath'] ?? null) ? $decoded['list_link_xpath'] : '',
            'fields' => $fieldsConfig,
        ];
    }

    /**
     * Collect the connection settings from the `crawler.*` configuration.
     *
     * @return array{endpoint: string, api_key: string, wire_api: string, disable_response_storage: bool, reasoning_effort: string}|null
     *         Null when either the endpoint or the API key is empty.
     */
    private function resolveCredentials(): ?array
    {
        $apiKey = (string) config('crawler.openai_compatible_key', '');
        $endpoint = $this->resolveEndpoint();

        if ($endpoint === '' || $apiKey === '') {
            return null;
        }

        return [
            'endpoint' => $endpoint,
            'api_key' => $apiKey,
            'wire_api' => $this->resolveWireApi(),
            'disable_response_storage' => (bool) config('crawler.openai_disable_response_storage', false),
            'reasoning_effort' => trim((string) config('crawler.openai_reasoning_effort', '')),
        ];
    }

    /**
     * Determine the full request URL.
     *
     * An explicitly configured endpoint is used verbatim. Otherwise the URL is built from the
     * configured base URL plus the path that matches the selected wire API.
     *
     * @return string The endpoint URL, or '' when neither setting is configured.
     */
    private function resolveEndpoint(): string
    {
        $configuredEndpoint = trim((string) config('crawler.openai_compatible_endpoint', ''));
        if ($configuredEndpoint !== '') {
            return $configuredEndpoint;
        }

        $baseUrl = trim((string) config('crawler.openai_compatible_base_url', ''));
        if ($baseUrl === '') {
            return '';
        }

        $baseUrl = rtrim($baseUrl, '/');

        // Base URLs are commonly written with the version segment included ("https://host/v1").
        // Drop it so the path appended below does not become "/v1/v1/...".
        if (str_ends_with($baseUrl, '/v1')) {
            $baseUrl = substr($baseUrl, 0, -3);
        }

        return $this->resolveWireApi() === 'responses'
            ? $baseUrl.'/v1/responses'
            : $baseUrl.'/v1/chat/completions';
    }

    /**
     * Normalise the configured wire API name.
     *
     * @return string 'responses' when configured as such (case-insensitive), otherwise 'chat_completions'.
     */
    private function resolveWireApi(): string
    {
        $wireApi = strtolower(trim((string) config('crawler.openai_wire_api', 'chat_completions')));

        return $wireApi === 'responses' ? 'responses' : 'chat_completions';
    }

    /**
     * Pick the model name: options['model'] first, then the rule's model, then the configured default.
     *
     * @param string|null $ruleModel
     * @param array<string, mixed> $options
     */
    private function resolveModel(?string $ruleModel, array $options): string
    {
        $model = '';

        if (is_string($options['model'] ?? null)) {
            $model = trim((string) $options['model']);
        }

        if ($model === '' && is_string($ruleModel)) {
            $model = trim($ruleModel);
        }

        if ($model === '') {
            $model = (string) config('crawler.openai_default_model', 'gpt-4o-mini');
        }

        return $model;
    }

    /**
     * Read options['temperature'] (default 0.0) and clamp it to the range 0.0–2.0.
     *
     * @param array<string, mixed> $options
     */
    private function resolveTemperature(array $options): float
    {
        $temperature = is_numeric($options['temperature'] ?? null) ? (float) $options['temperature'] : 0.0;

        return max(0.0, min(2.0, $temperature));
    }

    /**
     * Read options['content_max_chars'] (falling back to $default) and clamp it to 500–50000.
     *
     * @param array<string, mixed> $options
     */
    private function resolveContentMaxChars(array $options, int $default): int
    {
        $value = is_numeric($options['content_max_chars'] ?? null) ? (int) $options['content_max_chars'] : $default;

        return max(500, min(50000, $value));
    }

    /**
     * Return the trimmed options['system_prompt'], or $default when it is missing, non-string or blank.
     *
     * @param array<string, mixed> $options
     */
    private function resolveSystemPrompt(array $options, string $default): string
    {
        $prompt = is_string($options['system_prompt'] ?? null) ? trim((string) $options['system_prompt']) : '';

        return $prompt === '' ? $default : $prompt;
    }

    /**
     * Return the trimmed options['user_prompt'], or $default when it is missing, non-string or blank.
     *
     * @param array<string, mixed> $options
     */
    private function resolveUserPrompt(array $options, string $default): string
    {
        $prompt = is_string($options['user_prompt'] ?? null) ? trim((string) $options['user_prompt']) : '';

        return $prompt === '' ? $default : $prompt;
    }

    /**
     * Join field names with ", " for use inside a prompt.
     */
    private function implodeFields(array $fields): string
    {
        return implode(', ', $fields);
    }

    /**
     * Prepare the page content and dispatch it to the API selected by credentials['wire_api'].
     *
     * The content is optionally passed through strip_tags() and then cut to $contentMaxChars
     * characters. The temperature is forwarded only on the chat-completions path.
     *
     * @param array{endpoint: string, api_key: string, wire_api: string, disable_response_storage: bool, reasoning_effort: string} $credentials
     * @return string The model's text reply, or '' on failure.
     */
    private function requestAiContent(
        array $credentials,
        string $model,
        float $temperature,
        string $systemPrompt,
        string $userPrompt,
        string $html,
        int $contentMaxChars,
        bool $stripTags,
    ): string {
        $source = $stripTags ? strip_tags($html) : $html;
        $content = mb_substr($source, 0, $contentMaxChars);

        if ($credentials['wire_api'] === 'responses') {
            return $this->requestResponsesApi(
                credentials: $credentials,
                model: $model,
                systemPrompt: $systemPrompt,
                userPrompt: $userPrompt,
                content: $content,
            );
        }

        return $this->requestChatCompletionsApi(
            credentials: $credentials,
            model: $model,
            temperature: $temperature,
            systemPrompt: $systemPrompt,
            userPrompt: $userPrompt,
            content: $content,
        );
    }

    /**
     * Send the prompt using the chat-completions payload shape and return the first choice's text.
     *
     * Any failure (non-2xx status, thrown exception, empty reply) is recorded in lastError and
     * reported to the caller as ''.
     *
     * @param array{endpoint: string, api_key: string, wire_api: string, disable_response_storage: bool, reasoning_effort: string} $credentials
     */
    private function requestChatCompletionsApi(
        array $credentials,
        string $model,
        float $temperature,
        string $systemPrompt,
        string $userPrompt,
        string $content,
    ): string {
        try {
            $payload = [
                'model' => $model,
                'temperature' => $temperature,
                'messages' => [
                    ['role' => 'system', 'content' => $systemPrompt],
                    ['role' => 'user', 'content' => $userPrompt."\n\n页面内容:\n".$content],
                ],
            ];

            if ($credentials['disable_response_storage']) {
                $payload['store'] = false;
            }

            if ($credentials['reasoning_effort'] !== '') {
                // NOTE(review): OpenAI's own Chat Completions API documents a top-level
                // "reasoning_effort" string; the nested form used here matches the Responses API
                // and some compatible gateways. Confirm against the provider actually targeted.
                $payload['reasoning'] = ['effort' => $credentials['reasoning_effort']];
            }

            $response = $this->requestBuilder($credentials['api_key'])
                ->post($credentials['endpoint'], $payload);

            if (! $response->successful()) {
                $this->lastError = sprintf('AI HTTP %d: %s', $response->status(), mb_substr($response->body(), 0, 240));

                return '';
            }

            $reply = (string) data_get($response->json(), 'choices.0.message.content', '');
            if ($reply === '') {
                // Report an empty reply the same way the Responses path does, instead of
                // leaving lastError unset.
                $this->lastError = 'AI output is empty.';
            }

            return $reply;
        } catch (\Throwable $exception) {
            $this->lastError = $exception->getMessage();

            return '';
        }
    }

    /**
     * Send the prompt using the Responses payload shape and return the reply text.
     *
     * Prefers the top-level "output_text" value; otherwise concatenates every non-empty "text"
     * found under output[].content[] with newlines. Failures are recorded in lastError and
     * reported as ''.
     *
     * @param array{endpoint: string, api_key: string, wire_api: string, disable_response_storage: bool, reasoning_effort: string} $credentials
     */
    private function requestResponsesApi(
        array $credentials,
        string $model,
        string $systemPrompt,
        string $userPrompt,
        string $content,
    ): string {
        try {
            $payload = [
                'model' => $model,
                'input' => [
                    [
                        'role' => 'system',
                        'content' => [
                            ['type' => 'input_text', 'text' => $systemPrompt],
                        ],
                    ],
                    [
                        'role' => 'user',
                        'content' => [
                            ['type' => 'input_text', 'text' => $userPrompt."\n\n页面内容:\n".$content],
                        ],
                    ],
                ],
            ];

            if ($credentials['disable_response_storage']) {
                $payload['store'] = false;
            }

            if ($credentials['reasoning_effort'] !== '') {
                $payload['reasoning'] = ['effort' => $credentials['reasoning_effort']];
            }

            $response = $this->requestBuilder($credentials['api_key'])
                ->post($credentials['endpoint'], $payload);

            if (! $response->successful()) {
                $this->lastError = sprintf('AI HTTP %d: %s', $response->status(), mb_substr($response->body(), 0, 240));

                return '';
            }

            $json = $response->json();

            // Fast path: some servers expose the aggregated text directly.
            $outputText = (string) data_get($json, 'output_text', '');
            if ($outputText !== '') {
                return $outputText;
            }

            $output = data_get($json, 'output', []);
            if (! is_array($output)) {
                $this->lastError = 'AI output is empty.';

                return '';
            }

            // Fallback: walk output[].content[] and gather every non-empty text fragment,
            // skipping entries that do not have the expected array shape.
            $chunks = [];
            foreach ($output as $item) {
                if (! is_array($item)) {
                    continue;
                }

                $contents = $item['content'] ?? [];
                if (! is_array($contents)) {
                    continue;
                }

                foreach ($contents as $contentItem) {
                    if (! is_array($contentItem)) {
                        continue;
                    }

                    $text = (string) ($contentItem['text'] ?? '');
                    if ($text !== '') {
                        $chunks[] = $text;
                    }
                }
            }

            if ($chunks === []) {
                $this->lastError = 'AI output chunks are empty.';

                return '';
            }

            return implode("\n", $chunks);
        } catch (\Throwable $exception) {
            $this->lastError = $exception->getMessage();

            return '';
        }
    }

    /**
     * Build the HTTP client request with bearer auth and the crawler's network settings.
     *
     * The timeout comes from `crawler.ai_timeout_seconds` (falling back to
     * `crawler.request_timeout_seconds`, then 20) and is never lower than 5 seconds.
     */
    private function requestBuilder(string $apiKey): PendingRequest
    {
        $aiTimeout = (int) config('crawler.ai_timeout_seconds', (int) config('crawler.request_timeout_seconds', 20));

        $request = Http::timeout(max($aiTimeout, 5))
            ->withToken($apiKey);

        if (! (bool) config('crawler.verify_ssl', true)) {
            $request = $request->withoutVerifying();
        }

        $options = [];

        if ((bool) config('crawler.force_ipv4', false)) {
            $options['force_ip_resolve'] = 'v4';
        }

        // Custom DNS servers are applied only when the constant is defined in this PHP runtime.
        $dnsServers = trim((string) config('crawler.dns_servers', ''));
        if ($dnsServers !== '' && defined('CURLOPT_DNS_SERVERS')) {
            $options['curl'][CURLOPT_DNS_SERVERS] = $dnsServers;
        }

        if ($options !== []) {
            $request = $request->withOptions($options);
        }

        return $request;
    }

    /**
     * Decode the AI reply as JSON, tolerating text around the JSON object.
     *
     * The whole string is tried first. If that fails, the span from the first "{" to the last "}"
     * is extracted and decoded instead.
     *
     * @return array The decoded array, or [] when the content is empty or cannot be decoded.
     */
    private function decodeJsonContent(string $content): array
    {
        if ($content === '') {
            return [];
        }

        $decoded = json_decode($content, true);
        if (is_array($decoded)) {
            return $decoded;
        }

        // Greedy match with the "s" flag so the object may span multiple lines.
        if (preg_match('/\{.*\}/s', $content, $matches) === 1) {
            $decoded = json_decode($matches[0], true);

            return is_array($decoded) ? $decoded : [];
        }

        return [];
    }
}