<?php

declare(strict_types=1);

namespace App\Services\Crawler;

use App\Models\CrawlRule;
use Illuminate\Http\Client\PendingRequest;
use Illuminate\Support\Facades\Http;

/**
 * Extracts structured data from crawled HTML by asking an OpenAI-compatible
 * endpoint (Chat Completions or Responses wire format).
 *
 * Every public entry point is best-effort: on failure it returns an empty
 * array and records a human-readable reason retrievable via lastError().
 */
class OpenAiFallbackExtractor
{
    /**
     * Fields requested from the AI when the target module is "tool".
     *
     * @var list<string>
     */
    private const TOOL_FIELDS = [
        'name',
        'summary',
        'official_url',
        'pricing_type',
        'platform',
        'language',
        'description',
        'logo_url',
    ];

    /**
     * Fields requested from the AI for any other target module ("model").
     *
     * @var list<string>
     */
    private const MODEL_FIELDS = [
        'name',
        'summary',
        'provider',
        'modality',
        'deployment_mode',
        'context_window',
        'price_input',
        'price_output',
        'description',
    ];

    /** Reason for the most recent failure; reset at the start of every public call. */
    private ?string $lastError = null;

    /**
     * Ask the AI to extract the target fields from a page.
     *
     * Options from the rule's `extractor_config['ai']` are merged with
     * $options, the latter taking precedence. The HTML is reduced to text
     * before being sent.
     *
     * @param array<string, mixed> $options Recognised keys: model, temperature, content_max_chars, system_prompt, user_prompt.
     * @return array<string, mixed> Decoded JSON from the AI, or [] on failure (see lastError()).
     */
    public function extract(CrawlRule $rule, string $html, array $options = []): array
    {
        $this->lastError = null;

        $credentials = $this->resolveCredentials();
        if ($credentials === null) {
            $this->lastError = 'AI credentials not configured.';

            return [];
        }

        $ruleAiOptions = is_array($rule->extractor_config['ai'] ?? null)
            ? $rule->extractor_config['ai']
            : [];
        $mergedOptions = array_merge($ruleAiOptions, $options);

        $targetSchema = $this->implodeFields(
            $rule->target_module?->value === 'tool' ? self::TOOL_FIELDS : self::MODEL_FIELDS,
        );

        $defaultUserPrompt = <<<PROMPT
        从页面内容中提取结构化字段并输出 JSON。
        仅返回 JSON 对象本身,不要 Markdown。
        目标字段:{$targetSchema}
        字段缺失时请直接省略该字段,不要输出 null。
        PROMPT;

        $content = $this->requestAiContent(
            credentials: $credentials,
            model: $this->resolveModel($rule->ai_model, $mergedOptions),
            temperature: $this->resolveTemperature($mergedOptions),
            systemPrompt: $this->resolveSystemPrompt($mergedOptions, '你是一个精确的信息抽取引擎。'),
            userPrompt: $this->resolveUserPrompt($mergedOptions, $defaultUserPrompt),
            html: $html,
            contentMaxChars: $this->resolveContentMaxChars($mergedOptions, 12000),
            stripTags: true,
        );

        return $this->decodeAiResponse($content);
    }

    /**
     * Whether both an endpoint and an API key are configured.
     */
    public function isConfigured(): bool
    {
        return $this->resolveCredentials() !== null;
    }

    /**
     * Reason for the most recent failure, or null when the last call recorded none.
     */
    public function lastError(): ?string
    {
        return $this->lastError;
    }

    /**
     * Ask the AI to propose reusable XPath extraction rules for a page.
     *
     * Unlike extract(), the raw HTML (tags included) is sent, because the
     * model needs the markup to write XPath expressions.
     *
     * @param string $targetModule "tool" or "model"; any other value falls back to "tool".
     * @param array<string, mixed> $options Recognised keys: model, temperature, content_max_chars, system_prompt, user_prompt.
     * @return array<string, mixed> `['list_link_xpath' => string, 'fields' => array]`, or [] on failure (see lastError()).
     */
    public function suggestExtractorConfig(string $targetModule, string $html, array $options = []): array
    {
        $this->lastError = null;

        $credentials = $this->resolveCredentials();
        if ($credentials === null) {
            $this->lastError = 'AI credentials not configured.';

            return [];
        }

        $targetModule = in_array($targetModule, ['tool', 'model'], true) ? $targetModule : 'tool';
        $fieldList = $this->implodeFields(
            $targetModule === 'tool' ? self::TOOL_FIELDS : self::MODEL_FIELDS,
        );

        $defaultUserPrompt = <<<PROMPT
        请根据页面 HTML 推断可复用的 XPath 抽取规则,输出格式必须是 JSON:
        {
        "list_link_xpath": "...",
        "fields": {
        "name": "...",
        "summary": "..."
        }
        }
        要求:
        1. 仅输出 JSON,不要代码块。
        2. fields 至少包含 name 与 summary。
        3. 只返回 XPath 字符串,不返回 CSS 选择器。
        4. 如果页面看起来是详情页,list_link_xpath 返回空字符串即可。
        5. 目标模块是 {$targetModule},优先考虑字段:{$fieldList}。
        PROMPT;

        // Only cast scalars: casting an array would raise "Array to string
        // conversion" here, outside of any try/catch.
        $requestedModel = is_scalar($options['model'] ?? null) ? (string) $options['model'] : null;

        $content = $this->requestAiContent(
            credentials: $credentials,
            model: $this->resolveModel($requestedModel, $options),
            temperature: $this->resolveTemperature($options),
            systemPrompt: $this->resolveSystemPrompt($options, '你是 XPath 规则设计专家,擅长从 HTML 生成稳定的抽取规则。'),
            userPrompt: $this->resolveUserPrompt($options, $defaultUserPrompt),
            html: $html,
            contentMaxChars: $this->resolveContentMaxChars($options, 16000),
            stripTags: false,
        );

        // decodeAiResponse() records "empty" / "not valid JSON" itself, so the
        // message below only wins when the JSON was valid but lacked fields.
        $decoded = $this->decodeAiResponse($content);

        $fieldsConfig = is_array($decoded['fields'] ?? null) ? $decoded['fields'] : [];
        if ($fieldsConfig === []) {
            $this->noteError('AI response does not include fields config.');

            return [];
        }

        return [
            'list_link_xpath' => is_string($decoded['list_link_xpath'] ?? null) ? $decoded['list_link_xpath'] : '',
            'fields' => $fieldsConfig,
        ];
    }

    /**
     * Read endpoint, key and wire options from the `crawler.*` config.
     *
     * @return array{endpoint: string, api_key: string, wire_api: string, disable_response_storage: bool, reasoning_effort: string}|null
     *         Null when either the endpoint or the API key is missing.
     */
    private function resolveCredentials(): ?array
    {
        $apiKey = (string) config('crawler.openai_compatible_key', '');
        $endpoint = $this->resolveEndpoint();

        if ($endpoint === '' || $apiKey === '') {
            return null;
        }

        return [
            'endpoint' => $endpoint,
            'api_key' => $apiKey,
            'wire_api' => $this->resolveWireApi(),
            'disable_response_storage' => (bool) config('crawler.openai_disable_response_storage', false),
            'reasoning_effort' => trim((string) config('crawler.openai_reasoning_effort', '')),
        ];
    }

    /**
     * Resolve the full request URL.
     *
     * An explicitly configured endpoint wins; otherwise the URL is derived
     * from the base URL plus the path matching the configured wire API.
     * Returns '' when neither is configured.
     */
    private function resolveEndpoint(): string
    {
        $configuredEndpoint = trim((string) config('crawler.openai_compatible_endpoint', ''));
        if ($configuredEndpoint !== '') {
            return $configuredEndpoint;
        }

        $baseUrl = rtrim(trim((string) config('crawler.openai_compatible_base_url', '')), '/');
        if ($baseUrl === '') {
            return '';
        }

        return $this->resolveWireApi() === 'responses'
            ? $baseUrl.'/v1/responses'
            : $baseUrl.'/v1/chat/completions';
    }

    /**
     * Normalise the configured wire API to "responses" or "chat_completions"
     * (the latter for any unrecognised value).
     */
    private function resolveWireApi(): string
    {
        $wireApi = strtolower(trim((string) config('crawler.openai_wire_api', 'chat_completions')));

        return $wireApi === 'responses' ? 'responses' : 'chat_completions';
    }

    /**
     * Pick the model name: options override, then the rule's model, then the
     * configured default.
     *
     * @param array<string, mixed> $options
     */
    private function resolveModel(?string $ruleModel, array $options): string
    {
        $model = is_string($options['model'] ?? null) ? trim($options['model']) : '';

        if ($model === '' && $ruleModel !== null) {
            $model = trim($ruleModel);
        }

        if ($model === '') {
            $model = (string) config('crawler.openai_default_model', 'gpt-4o-mini');
        }

        return $model;
    }

    /**
     * Sampling temperature from options, defaulting to 0.0 and clamped to [0.0, 2.0].
     *
     * @param array<string, mixed> $options
     */
    private function resolveTemperature(array $options): float
    {
        $temperature = is_numeric($options['temperature'] ?? null)
            ? (float) $options['temperature']
            : 0.0;

        return max(0.0, min(2.0, $temperature));
    }

    /**
     * Maximum number of page characters to send, clamped to [500, 50000].
     *
     * @param array<string, mixed> $options
     */
    private function resolveContentMaxChars(array $options, int $default): int
    {
        $value = is_numeric($options['content_max_chars'] ?? null)
            ? (int) $options['content_max_chars']
            : $default;

        return max(500, min(50000, $value));
    }

    /**
     * System prompt from options, or $default when absent or blank.
     *
     * @param array<string, mixed> $options
     */
    private function resolveSystemPrompt(array $options, string $default): string
    {
        $prompt = is_string($options['system_prompt'] ?? null) ? trim($options['system_prompt']) : '';

        return $prompt === '' ? $default : $prompt;
    }

    /**
     * User prompt from options, or $default when absent or blank.
     *
     * @param array<string, mixed> $options
     */
    private function resolveUserPrompt(array $options, string $default): string
    {
        $prompt = is_string($options['user_prompt'] ?? null) ? trim($options['user_prompt']) : '';

        return $prompt === '' ? $default : $prompt;
    }

    /**
     * Join field names into the comma-separated list embedded in prompts.
     *
     * @param array<int, string> $fields
     */
    private function implodeFields(array $fields): string
    {
        return implode(', ', $fields);
    }

    /**
     * Record $message as the failure reason unless a more specific one
     * (HTTP status, exception message, ...) has already been recorded.
     */
    private function noteError(string $message): void
    {
        if ($this->lastError === null || $this->lastError === '') {
            $this->lastError = $message;
        }
    }

    /**
     * Reduce the page to the text that is sent to the AI, capped at $contentMaxChars characters.
     *
     * With $stripTags the markup is removed. strip_tags() drops the
     * <script>/<style> tags themselves but keeps their bodies, so those
     * elements are removed first; whitespace runs are then collapsed so that
     * inline code and layout indentation do not eat the character budget.
     */
    private function preparePageContent(string $html, int $contentMaxChars, bool $stripTags): string
    {
        if (! $stripTags) {
            return mb_substr($html, 0, $contentMaxChars);
        }

        // preg_replace() returns null on PCRE failure; fall back to the input
        // rather than sending nothing.
        $markup = preg_replace('~<(script|style)\b[^>]*>.*?</\1[^>]*>~is', ' ', $html) ?? $html;
        $text = strip_tags($markup);

        // Explicit ASCII classes instead of \s: without the /u modifier a
        // locale-dependent \s could match bytes inside multi-byte characters.
        $text = preg_replace('/[ \t\r\f\x0B]+/', ' ', $text) ?? $text;
        $text = preg_replace('/ ?\n[ \n]*/', "\n", $text) ?? $text;

        return mb_substr(trim($text), 0, $contentMaxChars);
    }

    /**
     * Prepare the page content and dispatch to the configured wire API.
     *
     * @param array{endpoint: string, api_key: string, wire_api: string, disable_response_storage: bool, reasoning_effort: string} $credentials
     * @return string Raw text returned by the AI, or '' on failure (lastError is set).
     */
    private function requestAiContent(
        array $credentials,
        string $model,
        float $temperature,
        string $systemPrompt,
        string $userPrompt,
        string $html,
        int $contentMaxChars,
        bool $stripTags,
    ): string {
        $content = $this->preparePageContent($html, $contentMaxChars, $stripTags);

        if ($credentials['wire_api'] === 'responses') {
            // The Responses payload is sent without a temperature.
            return $this->requestResponsesApi(
                credentials: $credentials,
                model: $model,
                systemPrompt: $systemPrompt,
                userPrompt: $userPrompt,
                content: $content,
            );
        }

        return $this->requestChatCompletionsApi(
            credentials: $credentials,
            model: $model,
            temperature: $temperature,
            systemPrompt: $systemPrompt,
            userPrompt: $userPrompt,
            content: $content,
        );
    }

    /**
     * Add the optional `store` / `reasoning` keys shared by both wire formats.
     *
     * NOTE(review): OpenAI's own Chat Completions API documents a top-level
     * `reasoning_effort` string rather than a `reasoning` object; the object
     * form is kept for both formats because some compatible gateways appear
     * to accept it. Confirm against the provider actually in use.
     *
     * @param array<string, mixed> $payload
     * @param array{endpoint: string, api_key: string, wire_api: string, disable_response_storage: bool, reasoning_effort: string} $credentials
     * @return array<string, mixed>
     */
    private function withOptionalPayloadFields(array $payload, array $credentials): array
    {
        if ($credentials['disable_response_storage']) {
            $payload['store'] = false;
        }

        if ($credentials['reasoning_effort'] !== '') {
            $payload['reasoning'] = ['effort' => $credentials['reasoning_effort']];
        }

        return $payload;
    }

    /**
     * POST the payload and return the decoded JSON body.
     *
     * On a non-2xx status, lastError is set to the status plus the first 240
     * characters of the body and null is returned.
     *
     * @param array{endpoint: string, api_key: string, wire_api: string, disable_response_storage: bool, reasoning_effort: string} $credentials
     * @param array<string, mixed> $payload
     */
    private function postPayload(array $credentials, array $payload): mixed
    {
        $response = $this->requestBuilder($credentials['api_key'])
            ->post($credentials['endpoint'], $payload);

        if (! $response->successful()) {
            $this->lastError = sprintf('AI HTTP %d: %s', $response->status(), mb_substr($response->body(), 0, 240));

            return null;
        }

        return $response->json();
    }

    /**
     * Call a Chat Completions endpoint and return the first choice's message text.
     *
     * @param array{endpoint: string, api_key: string, wire_api: string, disable_response_storage: bool, reasoning_effort: string} $credentials
     * @return string '' on any failure (lastError is set).
     */
    private function requestChatCompletionsApi(
        array $credentials,
        string $model,
        float $temperature,
        string $systemPrompt,
        string $userPrompt,
        string $content,
    ): string {
        try {
            $payload = $this->withOptionalPayloadFields([
                'model' => $model,
                'temperature' => $temperature,
                'messages' => [
                    ['role' => 'system', 'content' => $systemPrompt],
                    ['role' => 'user', 'content' => $userPrompt."\n\n页面内容:\n".$content],
                ],
            ], $credentials);

            $text = data_get($this->postPayload($credentials, $payload), 'choices.0.message.content');
            if (! is_string($text) || $text === '') {
                // Does not overwrite an HTTP error recorded by postPayload().
                $this->noteError('AI output is empty.');

                return '';
            }

            return $text;
        } catch (\Throwable $exception) {
            // Best-effort by design: transport errors become lastError.
            $this->lastError = $exception->getMessage();

            return '';
        }
    }

    /**
     * Call a Responses endpoint and return its text output.
     *
     * Uses the top-level `output_text` when present; otherwise joins (with
     * newlines) every non-empty `text` found under `output[].content[]`.
     *
     * @param array{endpoint: string, api_key: string, wire_api: string, disable_response_storage: bool, reasoning_effort: string} $credentials
     * @return string '' on any failure (lastError is set).
     */
    private function requestResponsesApi(
        array $credentials,
        string $model,
        string $systemPrompt,
        string $userPrompt,
        string $content,
    ): string {
        try {
            $payload = $this->withOptionalPayloadFields([
                'model' => $model,
                'input' => [
                    [
                        'role' => 'system',
                        'content' => [
                            ['type' => 'input_text', 'text' => $systemPrompt],
                        ],
                    ],
                    [
                        'role' => 'user',
                        'content' => [
                            ['type' => 'input_text', 'text' => $userPrompt."\n\n页面内容:\n".$content],
                        ],
                    ],
                ],
            ], $credentials);

            $json = $this->postPayload($credentials, $payload);
            if ($json === null && $this->lastError !== null) {
                // HTTP failure already recorded by postPayload().
                return '';
            }

            $outputText = data_get($json, 'output_text');
            if (is_string($outputText) && $outputText !== '') {
                return $outputText;
            }

            $output = data_get($json, 'output', []);
            if (! is_array($output)) {
                $this->lastError = 'AI output is empty.';

                return '';
            }

            $chunks = [];
            foreach ($output as $item) {
                $parts = is_array($item) ? ($item['content'] ?? []) : [];
                if (! is_array($parts)) {
                    continue;
                }

                foreach ($parts as $part) {
                    $text = is_array($part) ? ($part['text'] ?? '') : '';
                    if (is_string($text) && $text !== '') {
                        $chunks[] = $text;
                    }
                }
            }

            if ($chunks === []) {
                $this->lastError = 'AI output chunks are empty.';

                return '';
            }

            return implode("\n", $chunks);
        } catch (\Throwable $exception) {
            // Best-effort by design: transport errors become lastError.
            $this->lastError = $exception->getMessage();

            return '';
        }
    }

    /**
     * Build the HTTP client: bearer token, timeout (at least 5 seconds) and
     * the optional SSL / IPv4 / DNS overrides from the `crawler.*` config.
     */
    private function requestBuilder(string $apiKey): PendingRequest
    {
        $aiTimeout = (int) config('crawler.ai_timeout_seconds', (int) config('crawler.request_timeout_seconds', 20));

        $request = Http::timeout(max($aiTimeout, 5))
            ->withToken($apiKey);

        if (! (bool) config('crawler.verify_ssl', true)) {
            $request = $request->withoutVerifying();
        }

        $options = [];
        if ((bool) config('crawler.force_ipv4', false)) {
            $options['force_ip_resolve'] = 'v4';
        }

        $dnsServers = trim((string) config('crawler.dns_servers', ''));
        // The constant only exists when the cURL build supports custom DNS servers.
        if ($dnsServers !== '' && defined('CURLOPT_DNS_SERVERS')) {
            $options['curl'][CURLOPT_DNS_SERVERS] = $dnsServers;
        }

        return $options === [] ? $request : $request->withOptions($options);
    }

    /**
     * Decode the AI's reply, recording why when nothing usable came back.
     *
     * @return array<string, mixed> [] when the reply was empty or not JSON (lastError is set).
     */
    private function decodeAiResponse(string $content): array
    {
        if ($content === '') {
            $this->noteError('AI output is empty.');

            return [];
        }

        $decoded = $this->decodeJsonContent($content);
        if ($decoded === null) {
            $this->noteError('AI response is not valid JSON.');

            return [];
        }

        return $decoded;
    }

    /**
     * Parse JSON from an AI reply, tolerating surrounding text such as a
     * Markdown code fence.
     *
     * The whole string is tried first; failing that, the span from the first
     * "{" to the last "}" is tried.
     *
     * @return array<string, mixed>|null Null when no JSON array/object could be decoded.
     */
    private function decodeJsonContent(string $content): ?array
    {
        if ($content === '') {
            return null;
        }

        $decoded = json_decode($content, true);
        if (is_array($decoded)) {
            return $decoded;
        }

        $start = strpos($content, '{');
        $end = strrpos($content, '}');
        if ($start === false || $end === false || $end < $start) {
            return null;
        }

        $decoded = json_decode(substr($content, $start, $end - $start + 1), true);

        return is_array($decoded) ? $decoded : null;
    }
}