Files
ai-web/app/Services/Crawler/OpenAiFallbackExtractor.php

495 lines
16 KiB
PHP
Raw Normal View History

2026-02-18 12:56:36 +08:00
<?php
declare(strict_types=1);
namespace App\Services\Crawler;
use App\Models\CrawlRule;
use Illuminate\Http\Client\PendingRequest;
use Illuminate\Support\Facades\Http;
/**
 * Extractor that delegates to an OpenAI-compatible HTTP endpoint.
 *
 * Two operations are offered: pulling structured fields out of a page
 * ({@see extract()}) and asking the model to propose XPath rules for a page
 * ({@see suggestExtractorConfig()}). Both return an empty array on failure and
 * expose the reason through {@see lastError()}; they do not throw for HTTP or
 * decoding problems.
 *
 * Connection settings are read from the `crawler.*` configuration namespace.
 */
class OpenAiFallbackExtractor
{
    /** Fields requested when the target module is "tool". */
    private const TOOL_FIELDS = [
        'name', 'summary', 'official_url', 'pricing_type', 'platform', 'language', 'description', 'logo_url',
    ];

    /** Fields requested for every other target module ("model"). */
    private const MODEL_FIELDS = [
        'name', 'summary', 'provider', 'modality', 'deployment_mode', 'context_window', 'price_input',
        'price_output', 'description',
    ];

    /** Model used when neither the options, the rule nor the configuration name one. */
    private const FALLBACK_MODEL = 'gpt-4o-mini';

    /** Human-readable reason for the most recent failure, or null when the last call succeeded. */
    private ?string $lastError = null;

    /**
     * Extract structured fields from a page using the AI endpoint.
     *
     * Options given in $options override the rule's own `extractor_config['ai']`
     * entries. Recognised keys: `model`, `temperature`, `content_max_chars`,
     * `system_prompt`, `user_prompt`.
     *
     * @param array<string, mixed> $options
     * @return array<string, mixed> decoded JSON object; empty on failure (see lastError())
     */
    public function extract(CrawlRule $rule, string $html, array $options = []): array
    {
        $this->lastError = null;

        $credentials = $this->resolveCredentials();
        if ($credentials === null) {
            return $this->fail('AI credentials not configured.');
        }

        $ruleAiOptions = is_array($rule->extractor_config['ai'] ?? null)
            ? $rule->extractor_config['ai']
            : [];
        $mergedOptions = array_merge($ruleAiOptions, $options);

        $targetSchema = $this->implodeFields(
            $rule->target_module?->value === 'tool' ? self::TOOL_FIELDS : self::MODEL_FIELDS,
        );
        $defaultUserPrompt = <<<PROMPT
            从页面内容中提取结构化字段并输出 JSON。
            仅返回 JSON 对象本身,不要 Markdown。
            目标字段:{$targetSchema}
            字段缺失时请直接省略该字段,不要输出 null
            PROMPT;

        $content = $this->requestAiContent(
            credentials: $credentials,
            model: $this->resolveModel($rule->ai_model, $mergedOptions),
            temperature: $this->resolveTemperature($mergedOptions),
            systemPrompt: $this->resolveSystemPrompt($mergedOptions, '你是一个精确的信息抽取引擎。'),
            userPrompt: $this->resolveUserPrompt($mergedOptions, $defaultUserPrompt),
            html: $html,
            contentMaxChars: $this->resolveContentMaxChars($mergedOptions, 12000),
            stripTags: true,
        );
        if ($content === '') {
            // The request layer normally records the precise reason; this is only a fallback.
            return $this->fail('AI output is empty.');
        }

        $decoded = $this->decodeJsonContent($content);
        if ($decoded === null) {
            return $this->fail('AI response is not valid JSON.');
        }

        return $decoded;
    }

    /**
     * Whether both an endpoint and an API key are configured.
     */
    public function isConfigured(): bool
    {
        return $this->resolveCredentials() !== null;
    }

    /**
     * Reason for the most recent failure, or null if the last call did not fail.
     */
    public function lastError(): ?string
    {
        return $this->lastError;
    }

    /**
     * Ask the AI endpoint to propose XPath extraction rules for a page.
     *
     * Unlike extract(), the raw HTML (tags included) is sent, because the model
     * needs the markup to derive XPath expressions.
     *
     * @param string $targetModule "tool" or "model"; any other value is treated as "tool"
     * @param array<string, mixed> $options same keys as extract()
     * @return array<string, mixed> `['list_link_xpath' => string, 'fields' => array]`, or empty on failure
     */
    public function suggestExtractorConfig(string $targetModule, string $html, array $options = []): array
    {
        $this->lastError = null;

        $credentials = $this->resolveCredentials();
        if ($credentials === null) {
            return $this->fail('AI credentials not configured.');
        }

        $targetModule = in_array($targetModule, ['tool', 'model'], true) ? $targetModule : 'tool';
        $fieldList = $this->implodeFields($targetModule === 'tool' ? self::TOOL_FIELDS : self::MODEL_FIELDS);
        $defaultUserPrompt = <<<PROMPT
            请根据页面 HTML 推断可复用的 XPath 抽取规则,输出格式必须是 JSON
            {
            "list_link_xpath": "...",
            "fields": {
            "name": "...",
            "summary": "..."
            }
            }
            要求:
            1. 仅输出 JSON不要代码块。
            2. fields 至少包含 name summary。
            3. 只返回 XPath 字符串,不返回 CSS 选择器。
            4. 如果页面看起来是详情页list_link_xpath 返回空字符串即可。
            5. 目标模块是 {$targetModule},优先考虑字段:{$fieldList}
            PROMPT;

        // Only cast scalars: casting an array here would raise "Array to string conversion".
        $requestedModel = is_scalar($options['model'] ?? null) ? (string) $options['model'] : null;

        $content = $this->requestAiContent(
            credentials: $credentials,
            model: $this->resolveModel($requestedModel, $options),
            temperature: $this->resolveTemperature($options),
            systemPrompt: $this->resolveSystemPrompt($options, '你是 XPath 规则设计专家,擅长从 HTML 生成稳定的抽取规则。'),
            userPrompt: $this->resolveUserPrompt($options, $defaultUserPrompt),
            html: $html,
            contentMaxChars: $this->resolveContentMaxChars($options, 16000),
            stripTags: false,
        );

        $decoded = $content === '' ? null : $this->decodeJsonContent($content);
        if ($decoded === null) {
            return $this->fail('AI response is not valid JSON.');
        }

        $fieldsConfig = is_array($decoded['fields'] ?? null) ? $decoded['fields'] : [];
        if ($fieldsConfig === []) {
            return $this->fail('AI response does not include fields config.');
        }

        return [
            'list_link_xpath' => is_string($decoded['list_link_xpath'] ?? null) ? $decoded['list_link_xpath'] : '',
            'fields' => $fieldsConfig,
        ];
    }

    /**
     * Record a failure reason (unless a more specific one is already stored) and
     * return the empty result used by the public methods.
     *
     * @return array<string, mixed> always empty
     */
    private function fail(string $message): array
    {
        $this->lastError = $this->lastError ?: $message;

        return [];
    }

    /**
     * Collect the connection settings, or null when endpoint or API key is missing.
     *
     * @return array{endpoint: string, api_key: string, wire_api: string, disable_response_storage: bool, reasoning_effort: string}|null
     */
    private function resolveCredentials(): ?array
    {
        $apiKey = (string) config('crawler.openai_compatible_key', '');
        $endpoint = $this->resolveEndpoint();
        if ($endpoint === '' || $apiKey === '') {
            return null;
        }

        return [
            'endpoint' => $endpoint,
            'api_key' => $apiKey,
            'wire_api' => $this->resolveWireApi(),
            'disable_response_storage' => (bool) config('crawler.openai_disable_response_storage', false),
            'reasoning_effort' => trim((string) config('crawler.openai_reasoning_effort', '')),
        ];
    }

    /**
     * Resolve the request URL: an explicitly configured endpoint wins, otherwise
     * it is derived from the configured base URL and wire API. Returns '' when
     * neither is configured.
     */
    private function resolveEndpoint(): string
    {
        $configuredEndpoint = trim((string) config('crawler.openai_compatible_endpoint', ''));
        if ($configuredEndpoint !== '') {
            return $configuredEndpoint;
        }

        $baseUrl = trim((string) config('crawler.openai_compatible_base_url', ''));
        if ($baseUrl === '') {
            return '';
        }

        return self::buildEndpoint($baseUrl, $this->resolveWireApi());
    }

    /**
     * Join a base URL with the resource path of the selected wire API.
     *
     * A base URL whose path already ends in a version segment (for example
     * `https://api.openai.com/v1`) is used as-is, so the result never contains
     * a doubled `/v1/v1/...`; otherwise `/v1` is inserted.
     */
    private static function buildEndpoint(string $baseUrl, string $wireApi): string
    {
        $baseUrl = rtrim($baseUrl, '/');
        $resource = $wireApi === 'responses' ? '/responses' : '/chat/completions';

        $path = parse_url($baseUrl, PHP_URL_PATH);
        $hasVersionSegment = is_string($path) && preg_match('#/v\d+$#', $path) === 1;

        return $hasVersionSegment
            ? $baseUrl.$resource
            : $baseUrl.'/v1'.$resource;
    }

    /**
     * Normalise the configured wire API to "responses" or "chat_completions"
     * (the latter for any unrecognised value).
     */
    private function resolveWireApi(): string
    {
        $wireApi = strtolower(trim((string) config('crawler.openai_wire_api', 'chat_completions')));

        return $wireApi === 'responses' ? 'responses' : 'chat_completions';
    }

    /**
     * Pick the model name: options first, then the rule's model, then the
     * configured default, then the built-in fallback.
     *
     * @param array<string, mixed> $options
     */
    private function resolveModel(?string $ruleModel, array $options): string
    {
        $model = is_string($options['model'] ?? null) ? trim($options['model']) : '';
        if ($model === '' && is_string($ruleModel)) {
            $model = trim($ruleModel);
        }
        if ($model !== '') {
            return $model;
        }

        // A blank configured default would otherwise be sent as an empty model name.
        $configured = trim((string) config('crawler.openai_default_model', self::FALLBACK_MODEL));

        return $configured !== '' ? $configured : self::FALLBACK_MODEL;
    }

    /**
     * Read `temperature` from the options, defaulting to 0.0 and clamped to [0.0, 2.0].
     *
     * @param array<string, mixed> $options
     */
    private function resolveTemperature(array $options): float
    {
        $temperature = is_numeric($options['temperature'] ?? null)
            ? (float) $options['temperature']
            : 0.0;

        return max(0.0, min(2.0, $temperature));
    }

    /**
     * Read `content_max_chars` from the options, clamped to [500, 50000].
     *
     * @param array<string, mixed> $options
     */
    private function resolveContentMaxChars(array $options, int $default): int
    {
        $value = is_numeric($options['content_max_chars'] ?? null)
            ? (int) $options['content_max_chars']
            : $default;

        return max(500, min(50000, $value));
    }

    /**
     * Return the trimmed `system_prompt` option, or $default when absent or blank.
     *
     * @param array<string, mixed> $options
     */
    private function resolveSystemPrompt(array $options, string $default): string
    {
        $prompt = is_string($options['system_prompt'] ?? null) ? trim($options['system_prompt']) : '';

        return $prompt === '' ? $default : $prompt;
    }

    /**
     * Return the trimmed `user_prompt` option, or $default when absent or blank.
     *
     * @param array<string, mixed> $options
     */
    private function resolveUserPrompt(array $options, string $default): string
    {
        $prompt = is_string($options['user_prompt'] ?? null) ? trim($options['user_prompt']) : '';

        return $prompt === '' ? $default : $prompt;
    }

    /**
     * @param array<int, string> $fields
     */
    private function implodeFields(array $fields): string
    {
        return implode(', ', $fields);
    }

    /**
     * Prepare the page content and send it through the configured wire API.
     *
     * Any exception raised while building or sending the request is converted
     * into an empty result with the message stored in lastError.
     *
     * @param array{endpoint: string, api_key: string, wire_api: string, disable_response_storage: bool, reasoning_effort: string} $credentials
     * @return string the model's text reply, or '' on failure
     */
    private function requestAiContent(
        array $credentials,
        string $model,
        float $temperature,
        string $systemPrompt,
        string $userPrompt,
        string $html,
        int $contentMaxChars,
        bool $stripTags,
    ): string {
        $source = $stripTags ? strip_tags($html) : $html;
        // Crawled pages may contain malformed UTF-8, which would make JSON encoding
        // of the request body fail; mb_scrub leaves well-formed input unchanged.
        $content = mb_substr(mb_scrub($source), 0, $contentMaxChars);

        try {
            if ($credentials['wire_api'] === 'responses') {
                return $this->requestResponsesApi(
                    credentials: $credentials,
                    model: $model,
                    systemPrompt: $systemPrompt,
                    userPrompt: $userPrompt,
                    content: $content,
                );
            }

            return $this->requestChatCompletionsApi(
                credentials: $credentials,
                model: $model,
                temperature: $temperature,
                systemPrompt: $systemPrompt,
                userPrompt: $userPrompt,
                content: $content,
            );
        } catch (\Throwable $exception) {
            // Deliberate best-effort: this extractor is a fallback and must not break the crawl.
            $this->lastError = $exception->getMessage();

            return '';
        }
    }

    /**
     * Call a Chat Completions style endpoint and return the first choice's text.
     *
     * @param array{endpoint: string, api_key: string, wire_api: string, disable_response_storage: bool, reasoning_effort: string} $credentials
     */
    private function requestChatCompletionsApi(
        array $credentials,
        string $model,
        float $temperature,
        string $systemPrompt,
        string $userPrompt,
        string $content,
    ): string {
        $payload = [
            'model' => $model,
            'temperature' => $temperature,
            'messages' => [
                ['role' => 'system', 'content' => $systemPrompt],
                ['role' => 'user', 'content' => $userPrompt."\n\n页面内容:\n".$content],
            ],
        ];
        if ($credentials['disable_response_storage']) {
            $payload['store'] = false;
        }
        if ($credentials['reasoning_effort'] !== '') {
            // Chat Completions takes a flat `reasoning_effort` string; the nested
            // `reasoning.effort` object belongs to the Responses API.
            $payload['reasoning_effort'] = $credentials['reasoning_effort'];
        }

        $json = $this->postJson($credentials, $payload);
        if ($json === null) {
            return '';
        }

        $message = data_get($json, 'choices.0.message.content');
        if (is_array($message)) {
            // Some backends return the message as a list of typed parts.
            $text = implode("\n", self::collectTextChunks($message));
        } else {
            $text = is_scalar($message) ? (string) $message : '';
        }

        if ($text === '') {
            $this->lastError = 'AI output is empty.';
        }

        return $text;
    }

    /**
     * Call a Responses style endpoint and return the concatenated output text.
     *
     * The temperature is intentionally not part of this payload, matching the
     * original request shape.
     *
     * @param array{endpoint: string, api_key: string, wire_api: string, disable_response_storage: bool, reasoning_effort: string} $credentials
     */
    private function requestResponsesApi(
        array $credentials,
        string $model,
        string $systemPrompt,
        string $userPrompt,
        string $content,
    ): string {
        $payload = [
            'model' => $model,
            'input' => [
                [
                    'role' => 'system',
                    'content' => [
                        ['type' => 'input_text', 'text' => $systemPrompt],
                    ],
                ],
                [
                    'role' => 'user',
                    'content' => [
                        ['type' => 'input_text', 'text' => $userPrompt."\n\n页面内容:\n".$content],
                    ],
                ],
            ],
        ];
        if ($credentials['disable_response_storage']) {
            $payload['store'] = false;
        }
        if ($credentials['reasoning_effort'] !== '') {
            $payload['reasoning'] = ['effort' => $credentials['reasoning_effort']];
        }

        $json = $this->postJson($credentials, $payload);
        if ($json === null) {
            return '';
        }

        // Prefer a ready-made `output_text` when the backend provides one.
        $outputText = data_get($json, 'output_text');
        if (is_string($outputText) && $outputText !== '') {
            return $outputText;
        }

        $output = data_get($json, 'output', []);
        if (! is_array($output)) {
            $this->lastError = 'AI output is empty.';

            return '';
        }

        $chunks = [];
        foreach ($output as $item) {
            if (is_array($item) && is_array($item['content'] ?? [])) {
                $chunks = array_merge($chunks, self::collectTextChunks($item['content'] ?? []));
            }
        }
        if ($chunks === []) {
            $this->lastError = 'AI output chunks are empty.';

            return '';
        }

        return implode("\n", $chunks);
    }

    /**
     * POST a JSON payload to the configured endpoint.
     *
     * @param array{endpoint: string, api_key: string, wire_api: string, disable_response_storage: bool, reasoning_effort: string} $credentials
     * @param array<string, mixed> $payload
     * @return array<mixed>|null decoded body (empty array when the body is not a JSON
     *                           object/array), or null on a non-2xx status, in which
     *                           case lastError holds the status and a body excerpt
     */
    private function postJson(array $credentials, array $payload): ?array
    {
        $response = $this->requestBuilder($credentials['api_key'])
            ->post($credentials['endpoint'], $payload);

        if (! $response->successful()) {
            $this->lastError = sprintf('AI HTTP %d: %s', $response->status(), mb_substr($response->body(), 0, 240));

            return null;
        }

        $json = $response->json();

        return is_array($json) ? $json : [];
    }

    /**
     * Gather the non-empty `text` values from a list of content parts,
     * ignoring entries that are not arrays or whose text is not scalar.
     *
     * @param array<mixed> $contentItems
     * @return array<int, string>
     */
    private static function collectTextChunks(array $contentItems): array
    {
        $chunks = [];
        foreach ($contentItems as $contentItem) {
            if (! is_array($contentItem)) {
                continue;
            }
            $text = $contentItem['text'] ?? '';
            $text = is_scalar($text) ? (string) $text : '';
            if ($text !== '') {
                $chunks[] = $text;
            }
        }

        return $chunks;
    }

    /**
     * Build the HTTP client: bearer token, timeout (at least 5 seconds) and the
     * optional SSL / IPv4 / DNS overrides from the crawler configuration.
     */
    private function requestBuilder(string $apiKey): PendingRequest
    {
        $aiTimeout = (int) config('crawler.ai_timeout_seconds', (int) config('crawler.request_timeout_seconds', 20));
        $request = Http::timeout(max($aiTimeout, 5))
            ->withToken($apiKey);

        if (! (bool) config('crawler.verify_ssl', true)) {
            $request = $request->withoutVerifying();
        }

        $options = [];
        if ((bool) config('crawler.force_ipv4', false)) {
            $options['force_ip_resolve'] = 'v4';
        }
        $dnsServers = trim((string) config('crawler.dns_servers', ''));
        // Only set the cURL option when the constant is defined in this PHP build.
        if ($dnsServers !== '' && defined('CURLOPT_DNS_SERVERS')) {
            $options['curl'][CURLOPT_DNS_SERVERS] = $dnsServers;
        }

        return $options === [] ? $request : $request->withOptions($options);
    }

    /**
     * Decode the model's reply as JSON.
     *
     * The reply is first decoded as-is; if that fails, the span from the first
     * `{` to the last `}` is tried, which copes with replies wrapped in a code
     * fence or surrounded by commentary.
     *
     * @return array<string, mixed>|null the decoded value, or null when nothing decodable was found
     */
    private function decodeJsonContent(string $content): ?array
    {
        if ($content === '') {
            return null;
        }

        $decoded = json_decode($content, true);
        if (is_array($decoded)) {
            return $decoded;
        }

        if (preg_match('/\{.*\}/s', $content, $matches) === 1) {
            $decoded = json_decode($matches[0], true);
            if (is_array($decoded)) {
                return $decoded;
            }
        }

        return null;
    }
}