Files
ai-web/app/Services/Crawler/OpenAiFallbackExtractor.php
cjd 260460df03
Some checks failed
Tests / PHP 8.2 (push) Has been cancelled
Tests / PHP 8.3 (push) Has been cancelled
Tests / PHP 8.4 (push) Has been cancelled
爬虫开发
2026-02-18 12:56:36 +08:00

495 lines
16 KiB
PHP
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
<?php
declare(strict_types=1);
namespace App\Services\Crawler;
use App\Models\CrawlRule;
use Illuminate\Http\Client\PendingRequest;
use Illuminate\Support\Facades\Http;
/**
 * Fallback extractor that asks an OpenAI-compatible endpoint either to pull
 * structured fields out of a crawled page or to propose reusable XPath rules.
 *
 * Two wire formats are supported, selected by `crawler.openai_wire_api`:
 * Chat Completions (the default) and Responses. HTTP and transport failures
 * are captured rather than thrown: the public methods return an empty array
 * and the reason is available through lastError().
 */
class OpenAiFallbackExtractor
{
    /**
     * Candidate fields per target module, in the order they are listed in prompts.
     *
     * @var array<string, list<string>>
     */
    private const MODULE_FIELDS = [
        'tool' => ['name', 'summary', 'official_url', 'pricing_type', 'platform', 'language', 'description', 'logo_url'],
        'model' => ['name', 'summary', 'provider', 'modality', 'deployment_mode', 'context_window', 'price_input', 'price_output', 'description'],
    ];

    /** Reason for the most recent failure; reset to null at the start of each public call. */
    private ?string $lastError = null;

    /**
     * Extract structured fields for a rule's target module from page HTML.
     *
     * Options stored under the rule's `extractor_config['ai']` are merged with
     * $options; keys in $options win. Recognised keys: `model`, `temperature`,
     * `content_max_chars`, `system_prompt`, `user_prompt`. HTML tags are
     * stripped before the page content is sent.
     *
     * @param array<string, mixed> $options
     * @return array<string, mixed> Decoded JSON object, or [] on any failure (see lastError()).
     */
    public function extract(CrawlRule $rule, string $html, array $options = []): array
    {
        $this->lastError = null;

        $credentials = $this->resolveCredentials();
        if ($credentials === null) {
            $this->lastError = 'AI credentials not configured.';

            return [];
        }

        $ruleAiOptions = is_array($rule->extractor_config['ai'] ?? null)
            ? $rule->extractor_config['ai']
            : [];
        // array_merge lets call-site options override the ones stored on the rule.
        $mergedOptions = array_merge($ruleAiOptions, $options);

        // Anything that is not explicitly the "tool" module uses the "model" field set.
        $moduleKey = $rule->target_module?->value === 'tool' ? 'tool' : 'model';
        $targetSchema = $this->implodeFields(self::MODULE_FIELDS[$moduleKey]);

        $defaultUserPrompt = <<<PROMPT
        从页面内容中提取结构化字段并输出 JSON。
        仅返回 JSON 对象本身,不要 Markdown。
        目标字段:{$targetSchema}
        字段缺失时请直接省略该字段,不要输出 null。
        PROMPT;

        $content = $this->requestAiContent(
            credentials: $credentials,
            model: $this->resolveModel($rule->ai_model, $mergedOptions),
            temperature: $this->resolveTemperature($mergedOptions),
            systemPrompt: $this->resolveSystemPrompt($mergedOptions, '你是一个精确的信息抽取引擎。'),
            userPrompt: $this->resolveUserPrompt($mergedOptions, $defaultUserPrompt),
            html: $html,
            contentMaxChars: $this->resolveContentMaxChars($mergedOptions, 12000),
            stripTags: true,
        );

        return $this->decodeJsonContent($content);
    }

    /**
     * Whether both an endpoint (or base URL) and an API key are configured.
     */
    public function isConfigured(): bool
    {
        return $this->resolveCredentials() !== null;
    }

    /**
     * Reason for the failure of the most recent extract()/suggestExtractorConfig()
     * call, or null when none was recorded.
     */
    public function lastError(): ?string
    {
        return $this->lastError;
    }

    /**
     * Ask the AI to propose an XPath-based extractor configuration for a page.
     *
     * The raw HTML (tags included) is sent so the model can see the markup.
     * Unknown $targetModule values fall back to "tool".
     *
     * @param array<string, mixed> $options Same keys as extract().
     * @return array<string, mixed> `['list_link_xpath' => string, 'fields' => array]`, or [] on failure.
     */
    public function suggestExtractorConfig(string $targetModule, string $html, array $options = []): array
    {
        $this->lastError = null;

        $credentials = $this->resolveCredentials();
        if ($credentials === null) {
            $this->lastError = 'AI credentials not configured.';

            return [];
        }

        $targetModule = isset(self::MODULE_FIELDS[$targetModule]) ? $targetModule : 'tool';
        $fieldList = $this->implodeFields(self::MODULE_FIELDS[$targetModule]);

        $defaultUserPrompt = <<<PROMPT
        请根据页面 HTML 推断可复用的 XPath 抽取规则,输出格式必须是 JSON
        {
        "list_link_xpath": "...",
        "fields": {
        "name": "...",
        "summary": "..."
        }
        }
        要求:
        1. 仅输出 JSON不要代码块。
        2. fields 至少包含 name 与 summary。
        3. 只返回 XPath 字符串,不返回 CSS 选择器。
        4. 如果页面看起来是详情页list_link_xpath 返回空字符串即可。
        5. 目标模块是 {$targetModule},优先考虑字段:{$fieldList}。
        PROMPT;

        // Only scalars are cast: casting an array here would raise an
        // "Array to string conversion" warning and yield the model name "Array".
        $rawModel = $options['model'] ?? null;
        $fallbackModel = is_scalar($rawModel) ? (string) $rawModel : null;

        $content = $this->requestAiContent(
            credentials: $credentials,
            model: $this->resolveModel($fallbackModel, $options),
            temperature: $this->resolveTemperature($options),
            systemPrompt: $this->resolveSystemPrompt($options, '你是 XPath 规则设计专家,擅长从 HTML 生成稳定的抽取规则。'),
            userPrompt: $this->resolveUserPrompt($options, $defaultUserPrompt),
            html: $html,
            contentMaxChars: $this->resolveContentMaxChars($options, 16000),
            stripTags: false,
        );

        // decodeJsonContent() always returns an array; [] means nothing usable
        // was decoded (an empty JSON object is treated the same way). Keep any
        // more specific error already recorded by the request layer.
        $decoded = $this->decodeJsonContent($content);
        if ($decoded === []) {
            $this->lastError = $this->lastError ?: 'AI response is not valid JSON.';

            return [];
        }

        $fieldsConfig = is_array($decoded['fields'] ?? null) ? $decoded['fields'] : [];
        if ($fieldsConfig === []) {
            $this->lastError = $this->lastError ?: 'AI response does not include fields config.';

            return [];
        }

        return [
            'list_link_xpath' => is_string($decoded['list_link_xpath'] ?? null) ? $decoded['list_link_xpath'] : '',
            'fields' => $fieldsConfig,
        ];
    }

    /**
     * Collect connection settings from the `crawler.*` config.
     *
     * @return array{endpoint: string, api_key: string, wire_api: string, disable_response_storage: bool, reasoning_effort: string}|null
     *         Null when the endpoint or the API key is missing.
     */
    private function resolveCredentials(): ?array
    {
        $apiKey = (string) config('crawler.openai_compatible_key', '');
        $endpoint = $this->resolveEndpoint();

        if ($endpoint === '' || $apiKey === '') {
            return null;
        }

        return [
            'endpoint' => $endpoint,
            'api_key' => $apiKey,
            'wire_api' => $this->resolveWireApi(),
            'disable_response_storage' => (bool) config('crawler.openai_disable_response_storage', false),
            'reasoning_effort' => trim((string) config('crawler.openai_reasoning_effort', '')),
        ];
    }

    /**
     * Resolve the request URL: an explicitly configured endpoint wins; otherwise
     * the base URL is suffixed with the path matching the wire API. Returns ''
     * when neither is configured.
     */
    private function resolveEndpoint(): string
    {
        $configuredEndpoint = trim((string) config('crawler.openai_compatible_endpoint', ''));
        if ($configuredEndpoint !== '') {
            return $configuredEndpoint;
        }

        $baseUrl = trim((string) config('crawler.openai_compatible_base_url', ''));
        if ($baseUrl === '') {
            return '';
        }

        $baseUrl = rtrim($baseUrl, '/');

        return $this->resolveWireApi() === 'responses'
            ? $baseUrl.'/v1/responses'
            : $baseUrl.'/v1/chat/completions';
    }

    /**
     * Normalise `crawler.openai_wire_api` to 'responses' or 'chat_completions'
     * (any unrecognised value maps to 'chat_completions').
     */
    private function resolveWireApi(): string
    {
        $wireApi = strtolower(trim((string) config('crawler.openai_wire_api', 'chat_completions')));

        return $wireApi === 'responses' ? 'responses' : 'chat_completions';
    }

    /**
     * Pick the model name: options['model'], then the rule-level model, then
     * `crawler.openai_default_model`. Blank strings are skipped at each step.
     *
     * @param array<string, mixed> $options
     */
    private function resolveModel(?string $ruleModel, array $options): string
    {
        $model = '';

        if (is_string($options['model'] ?? null)) {
            $model = trim((string) $options['model']);
        }

        if ($model === '' && is_string($ruleModel)) {
            $model = trim($ruleModel);
        }

        if ($model === '') {
            $model = (string) config('crawler.openai_default_model', 'gpt-4o-mini');
        }

        return $model;
    }

    /**
     * Read `temperature` from the options (default 0.0) and clamp it to [0.0, 2.0].
     *
     * @param array<string, mixed> $options
     */
    private function resolveTemperature(array $options): float
    {
        $temperature = is_numeric($options['temperature'] ?? null)
            ? (float) $options['temperature']
            : 0.0;

        return max(0.0, min(2.0, $temperature));
    }

    /**
     * Read `content_max_chars` from the options (falling back to $default) and
     * clamp it to [500, 50000].
     *
     * @param array<string, mixed> $options
     */
    private function resolveContentMaxChars(array $options, int $default): int
    {
        $value = is_numeric($options['content_max_chars'] ?? null)
            ? (int) $options['content_max_chars']
            : $default;

        return max(500, min(50000, $value));
    }

    /**
     * Return the trimmed `system_prompt` option, or $default when it is absent,
     * not a string, or blank.
     *
     * @param array<string, mixed> $options
     */
    private function resolveSystemPrompt(array $options, string $default): string
    {
        $prompt = is_string($options['system_prompt'] ?? null)
            ? trim((string) $options['system_prompt'])
            : '';

        return $prompt === '' ? $default : $prompt;
    }

    /**
     * Return the trimmed `user_prompt` option, or $default when it is absent,
     * not a string, or blank.
     *
     * @param array<string, mixed> $options
     */
    private function resolveUserPrompt(array $options, string $default): string
    {
        $prompt = is_string($options['user_prompt'] ?? null)
            ? trim((string) $options['user_prompt'])
            : '';

        return $prompt === '' ? $default : $prompt;
    }

    /**
     * Join field names into the comma-separated form used inside prompts.
     *
     * @param array<int, string> $fields
     */
    private function implodeFields(array $fields): string
    {
        return implode(', ', $fields);
    }

    /**
     * Reduce HTML to its text content.
     *
     * strip_tags() alone keeps the text inside <script> and <style>, which
     * would consume the character budget with code instead of page copy, so
     * those elements are dropped together with their contents first.
     */
    private function htmlToPlainText(string $html): string
    {
        $withoutCode = preg_replace('#<(script|style)\b[^>]*>.*?</\1\s*>#is', ' ', $html);

        // preg_replace() returns null on a PCRE error; fall back to the raw HTML.
        return strip_tags($withoutCode ?? $html);
    }

    /**
     * Prepare the page content (optional tag stripping, then truncation to
     * $contentMaxChars characters) and dispatch to the configured wire API.
     *
     * @param array{endpoint: string, api_key: string, wire_api: string, disable_response_storage: bool, reasoning_effort: string} $credentials
     * @return string Raw text returned by the model, or '' on failure.
     */
    private function requestAiContent(
        array $credentials,
        string $model,
        float $temperature,
        string $systemPrompt,
        string $userPrompt,
        string $html,
        int $contentMaxChars,
        bool $stripTags,
    ): string {
        $source = $stripTags ? $this->htmlToPlainText($html) : $html;
        $content = mb_substr($source, 0, $contentMaxChars);

        if ($credentials['wire_api'] === 'responses') {
            // Note: temperature is not forwarded on the Responses path.
            return $this->requestResponsesApi(
                credentials: $credentials,
                model: $model,
                systemPrompt: $systemPrompt,
                userPrompt: $userPrompt,
                content: $content,
            );
        }

        return $this->requestChatCompletionsApi(
            credentials: $credentials,
            model: $model,
            temperature: $temperature,
            systemPrompt: $systemPrompt,
            userPrompt: $userPrompt,
            content: $content,
        );
    }

    /**
     * Call a Chat Completions style endpoint and return the first choice's
     * message content. Any failure (non-2xx status, exception, empty content)
     * is recorded in $this->lastError and '' is returned.
     *
     * @param array{endpoint: string, api_key: string, wire_api: string, disable_response_storage: bool, reasoning_effort: string} $credentials
     */
    private function requestChatCompletionsApi(
        array $credentials,
        string $model,
        float $temperature,
        string $systemPrompt,
        string $userPrompt,
        string $content,
    ): string {
        try {
            $payload = [
                'model' => $model,
                'temperature' => $temperature,
                'messages' => [
                    ['role' => 'system', 'content' => $systemPrompt],
                    ['role' => 'user', 'content' => $userPrompt."\n\n页面内容:\n".$content],
                ],
            ];

            if ($credentials['disable_response_storage']) {
                $payload['store'] = false;
            }

            if ($credentials['reasoning_effort'] !== '') {
                // Chat Completions takes a flat `reasoning_effort` string; the
                // nested `reasoning.effort` object is the Responses API shape.
                $payload['reasoning_effort'] = $credentials['reasoning_effort'];
            }

            $response = $this->requestBuilder($credentials['api_key'])
                ->post($credentials['endpoint'], $payload);

            if (! $response->successful()) {
                $this->lastError = sprintf('AI HTTP %d: %s', $response->status(), mb_substr($response->body(), 0, 240));

                return '';
            }

            $text = (string) data_get($response->json(), 'choices.0.message.content', '');
            if ($text === '') {
                // Mirror the Responses path: an empty reply is a reportable failure.
                $this->lastError = 'AI output is empty.';
            }

            return $text;
        } catch (\Throwable $exception) {
            $this->lastError = $exception->getMessage();

            return '';
        }
    }

    /**
     * Call a Responses style endpoint and return the generated text.
     *
     * Uses the top-level `output_text` when present; otherwise concatenates
     * (newline-separated) every non-empty `text` found in `output[*].content[*]`.
     * Any failure is recorded in $this->lastError and '' is returned.
     *
     * @param array{endpoint: string, api_key: string, wire_api: string, disable_response_storage: bool, reasoning_effort: string} $credentials
     */
    private function requestResponsesApi(
        array $credentials,
        string $model,
        string $systemPrompt,
        string $userPrompt,
        string $content,
    ): string {
        try {
            $payload = [
                'model' => $model,
                'input' => [
                    [
                        'role' => 'system',
                        'content' => [
                            ['type' => 'input_text', 'text' => $systemPrompt],
                        ],
                    ],
                    [
                        'role' => 'user',
                        'content' => [
                            ['type' => 'input_text', 'text' => $userPrompt."\n\n页面内容:\n".$content],
                        ],
                    ],
                ],
            ];

            if ($credentials['disable_response_storage']) {
                $payload['store'] = false;
            }

            if ($credentials['reasoning_effort'] !== '') {
                $payload['reasoning'] = ['effort' => $credentials['reasoning_effort']];
            }

            $response = $this->requestBuilder($credentials['api_key'])
                ->post($credentials['endpoint'], $payload);

            if (! $response->successful()) {
                $this->lastError = sprintf('AI HTTP %d: %s', $response->status(), mb_substr($response->body(), 0, 240));

                return '';
            }

            $json = $response->json();

            $outputText = (string) data_get($json, 'output_text', '');
            if ($outputText !== '') {
                return $outputText;
            }

            $output = data_get($json, 'output', []);
            if (! is_array($output)) {
                $this->lastError = 'AI output is empty.';

                return '';
            }

            $chunks = [];
            foreach ($output as $item) {
                $contents = is_array($item) ? ($item['content'] ?? []) : null;
                if (! is_array($contents)) {
                    continue;
                }

                foreach ($contents as $contentItem) {
                    if (! is_array($contentItem)) {
                        continue;
                    }

                    $text = (string) ($contentItem['text'] ?? '');
                    if ($text !== '') {
                        $chunks[] = $text;
                    }
                }
            }

            if ($chunks === []) {
                $this->lastError = 'AI output chunks are empty.';

                return '';
            }

            return implode("\n", $chunks);
        } catch (\Throwable $exception) {
            $this->lastError = $exception->getMessage();

            return '';
        }
    }

    /**
     * Build the HTTP client: bearer token, timeout, and the optional
     * crawler-wide network settings (TLS verification, IPv4-only, DNS servers).
     */
    private function requestBuilder(string $apiKey): PendingRequest
    {
        $aiTimeout = (int) config('crawler.ai_timeout_seconds', (int) config('crawler.request_timeout_seconds', 20));

        // The timeout never goes below 5 seconds, whatever is configured.
        $request = Http::timeout(max($aiTimeout, 5))
            ->withToken($apiKey);

        if (! (bool) config('crawler.verify_ssl', true)) {
            $request = $request->withoutVerifying();
        }

        $options = [];

        if ((bool) config('crawler.force_ipv4', false)) {
            $options['force_ip_resolve'] = 'v4';
        }

        $dnsServers = trim((string) config('crawler.dns_servers', ''));
        // CURLOPT_DNS_SERVERS is only defined on some cURL builds, hence the guard.
        if ($dnsServers !== '' && defined('CURLOPT_DNS_SERVERS')) {
            $options['curl'][CURLOPT_DNS_SERVERS] = $dnsServers;
        }

        if ($options !== []) {
            $request = $request->withOptions($options);
        }

        return $request;
    }

    /**
     * Decode the model's reply into an array.
     *
     * Tries the whole string first; if that is not a JSON array/object, retries
     * on the span from the first '{' to the last '}' so replies wrapped in
     * Markdown fences or surrounding prose still parse.
     *
     * @return array<string, mixed> [] when nothing decodable is found.
     */
    private function decodeJsonContent(string $content): array
    {
        if ($content === '') {
            return [];
        }

        $decoded = json_decode($content, true);
        if (is_array($decoded)) {
            return $decoded;
        }

        $start = strpos($content, '{');
        $end = strrpos($content, '}');
        if ($start === false || $end === false || $end < $start) {
            return [];
        }

        $decoded = json_decode(substr($content, $start, $end - $start + 1), true);

        return is_array($decoded) ? $decoded : [];
    }
}