254 lines
10 KiB
PHP
254 lines
10 KiB
PHP
|
|
<?php
|
||
|
|
|
||
|
|
declare(strict_types=1);
|
||
|
|
|
||
|
|
namespace App\Http\Requests\Admin;
|
||
|
|
|
||
|
|
use App\Enums\CrawlTargetModule;
|
||
|
|
use Illuminate\Foundation\Http\FormRequest;
|
||
|
|
use Illuminate\Validation\Rule;
|
||
|
|
|
||
|
|
class CrawlRuleRequest extends FormRequest
|
||
|
|
{
|
||
|
|
/**
 * Decide whether the current request may proceed to validation.
 *
 * This request class performs no authorization check of its own and
 * unconditionally allows the request.
 */
public function authorize(): bool
{
    return true;
}
|
||
|
|
|
||
|
|
/**
 * Normalise raw input before the validation rules run.
 *
 * - `enabled`, `render_js` and `ai_fallback_enabled` are replaced by the
 *   result of `boolean()`, so they are always present as real booleans.
 * - `extractor_mode` keeps the submitted value when it is truthy; otherwise
 *   it falls back to the `mode` entry of `extractor_json`, and finally to
 *   `'xpath'`.
 */
protected function prepareForValidation(): void
{
    $extractorConfig = $this->decodeJsonToArray($this->input('extractor_json'));

    // `mode` comes from user-supplied JSON and may be an array/object. Casting
    // such a value to string raises an "Array to string conversion" warning
    // (typically escalated to an exception by the framework's error handler),
    // so only scalar values are used; anything else is treated as absent.
    $configuredMode = $extractorConfig['mode'] ?? null;
    $fallbackMode = is_scalar($configuredMode) ? (string) $configuredMode : 'xpath';

    $this->merge([
        'enabled' => $this->boolean('enabled'),
        'render_js' => $this->boolean('render_js'),
        'ai_fallback_enabled' => $this->boolean('ai_fallback_enabled'),
        // `?:` is deliberate: an empty submitted value also triggers the fallback.
        'extractor_mode' => $this->input('extractor_mode') ?: $fallbackMode,
    ]);
}
|
||
|
|
|
||
|
|
/**
 * Validation rules for creating or updating a crawl rule.
 *
 * JSON-typed fields are validated as JSON strings here and decoded later by
 * `normalizedPayload()`.
 *
 * @return array<string, mixed>
 */
public function rules(): array
{
    return [
        'name' => ['required', 'string', 'max:150'],
        'target_module' => ['required', Rule::in(array_column(CrawlTargetModule::cases(), 'value'))],
        'enabled' => ['nullable', 'boolean'],
        'entry_urls' => [
            'required',
            'string',
            // parseEntryUrls() silently drops lines that are not valid URLs, so
            // without this check input such as "abc" would pass validation and
            // be normalised to an empty URL list.
            function (string $attribute, mixed $value, \Closure $fail): void {
                // Non-string values are already reported by the `string` rule.
                if (is_string($value) && $this->parseEntryUrls($value) === []) {
                    $fail('请至少填写一个有效的入口 URL。');
                }
            },
        ],
        'cron_expression' => ['required', 'string', 'max:64'],
        'timezone' => ['required', 'string', 'max:64'],
        'max_pages' => ['required', 'integer', 'between:1,2000'],
        'render_js' => ['nullable', 'boolean'],
        'user_agent' => ['nullable', 'string', 'max:255'],
        'headers_json' => ['nullable', 'json'],
        'cookies_json' => ['nullable', 'json'],
        'proxy' => ['nullable', 'string', 'max:255'],
        'rate_limit_per_minute' => ['required', 'integer', 'between:1,2000'],
        'retry_max' => ['required', 'integer', 'between:1,10'],
        'retry_backoff_seconds' => ['required', 'integer', 'between:1,3600'],
        'extractor_json' => ['required', 'json'],
        'extractor_mode' => ['required', Rule::in(['xpath', 'ai', 'hybrid'])],
        'mapping_json' => ['nullable', 'json'],
        'dedupe_json' => ['nullable', 'json'],
        'ai_fallback_enabled' => ['nullable', 'boolean'],
        'ai_provider' => ['nullable', 'string', 'max:64'],
        'ai_model' => ['nullable', 'string', 'max:128'],
        'ai_system_prompt' => ['nullable', 'string', 'max:4000'],
        'ai_user_prompt' => ['nullable', 'string', 'max:4000'],
        'ai_temperature' => ['nullable', 'numeric', 'between:0,2'],
        'ai_content_max_chars' => ['nullable', 'integer', 'between:500,50000'],
        // Only the "draft" policy is accepted.
        'publish_policy' => ['required', Rule::in(['draft'])],
        'alert_email' => ['nullable', 'email'],
    ];
}
|
||
|
|
|
||
|
|
/**
 * Custom validation error messages, keyed by "<field>.<rule>".
 *
 * The map is assembled from per-concern groups; the resulting key order is
 * the same as a single flat literal would produce.
 *
 * @return array<string, string>
 */
public function messages(): array
{
    // Basic identification and scheduling fields.
    $general = [
        'name.required' => '请填写规则名称。',
        'target_module.required' => '请选择目标模块。',
        'entry_urls.required' => '请至少填写一个入口 URL。',
        'cron_expression.required' => '请填写 Cron 表达式。',
        'timezone.required' => '请填写时区。',
    ];

    // Numeric crawl limits and retry settings.
    $limits = [
        'max_pages.required' => '请填写最大页面数。',
        'max_pages.integer' => '最大页面数必须是整数。',
        'max_pages.between' => '最大页面数需在 1 到 2000 之间。',
        'rate_limit_per_minute.required' => '请填写每分钟限流值。',
        'rate_limit_per_minute.integer' => '每分钟限流值必须是整数。',
        'rate_limit_per_minute.between' => '每分钟限流值需在 1 到 2000 之间。',
        'retry_max.required' => '请填写最大重试次数。',
        'retry_max.integer' => '最大重试次数必须是整数。',
        'retry_max.between' => '最大重试次数需在 1 到 10 之间。',
        'retry_backoff_seconds.required' => '请填写重试退避秒数。',
        'retry_backoff_seconds.integer' => '重试退避秒数必须是整数。',
        'retry_backoff_seconds.between' => '重试退避秒数需在 1 到 3600 之间。',
    ];

    // Extractor configuration and the other JSON-typed fields.
    $jsonFields = [
        'extractor_json.required' => '请填写 Extractor JSON。',
        'extractor_json.json' => 'Extractor JSON 格式不合法。',
        'extractor_mode.required' => '请选择抽取模式。',
        'extractor_mode.in' => '抽取模式仅支持 xpath、ai、hybrid。',
        'mapping_json.json' => 'Mapping JSON 格式不合法。',
        'dedupe_json.json' => 'Dedupe JSON 格式不合法。',
        'headers_json.json' => 'Headers JSON 格式不合法。',
        'cookies_json.json' => 'Cookies JSON 格式不合法。',
    ];

    // AI tuning parameters and alerting.
    $aiAndAlerts = [
        'ai_temperature.between' => 'AI 温度需在 0 到 2 之间。',
        'ai_content_max_chars.between' => 'AI 内容截断长度需在 500 到 50000 之间。',
        'alert_email.email' => '告警邮箱格式不合法。',
    ];

    return [...$general, ...$limits, ...$jsonFields, ...$aiAndAlerts];
}
|
||
|
|
|
||
|
|
/**
 * Human-readable field labels used in validation error messages.
 *
 * The map is assembled from per-concern groups; the resulting key order is
 * the same as a single flat literal would produce.
 *
 * @return array<string, string>
 */
public function attributes(): array
{
    // Basic identification and scheduling fields.
    $general = [
        'name' => '规则名称',
        'target_module' => '目标模块',
        'entry_urls' => '入口 URL',
        'cron_expression' => 'Cron 表达式',
        'timezone' => '时区',
    ];

    // Numeric crawl limits and retry settings.
    $limits = [
        'max_pages' => '最大页面数',
        'rate_limit_per_minute' => '每分钟限流',
        'retry_max' => '最大重试次数',
        'retry_backoff_seconds' => '重试退避秒数',
    ];

    // Extractor configuration and the other JSON-typed fields.
    $jsonFields = [
        'extractor_json' => 'Extractor JSON',
        'extractor_mode' => '抽取模式',
        'mapping_json' => 'Mapping JSON',
        'dedupe_json' => 'Dedupe JSON',
        'headers_json' => 'Headers JSON',
        'cookies_json' => 'Cookies JSON',
    ];

    // AI prompts/tuning and alerting.
    $aiAndAlerts = [
        'ai_system_prompt' => 'AI 系统提示词',
        'ai_user_prompt' => 'AI 用户提示词',
        'ai_temperature' => 'AI 温度',
        'ai_content_max_chars' => 'AI 内容截断长度',
        'alert_email' => '告警邮箱',
    ];

    return [...$general, ...$limits, ...$jsonFields, ...$aiAndAlerts];
}
|
||
|
|
|
||
|
|
/**
 * Build a normalised array from the validated input.
 *
 * JSON string fields are decoded to arrays, text fields are trimmed (empty
 * optional text becomes null), numeric fields are cast to int, flags to
 * bool, and `entry_urls` is split into a list of URLs. The extractor config
 * always carries a `mode` key, and carries an `ai` key only when at least
 * one AI setting was provided.
 *
 * @return array<string, mixed>
 */
public function normalizedPayload(): array
{
    $input = $this->validated();

    // Small accessors so the result map below reads as one line per field.
    $flag = static fn (string $key): bool => (bool) ($input[$key] ?? false);
    $text = fn (string $key): ?string => $this->nullableTrim($input[$key] ?? null);
    $json = fn (string $key): array => $this->decodeJsonToArray($input[$key] ?? null);

    $extractor = $json('extractor_json');

    // Prefer the validated mode, then the mode embedded in the JSON; anything
    // outside the supported set degrades to "xpath".
    $requestedMode = (string) ($input['extractor_mode'] ?? ($extractor['mode'] ?? 'xpath'));
    $extractor['mode'] = in_array($requestedMode, ['xpath', 'ai', 'hybrid'], true)
        ? $requestedMode
        : 'xpath';

    // The form's AI fields are authoritative: an `ai` section that came in via
    // the JSON is replaced, or removed when no AI field was filled in.
    $ai = $this->buildAiConfig($input);
    if ($ai === []) {
        unset($extractor['ai']);
    } else {
        $extractor['ai'] = $ai;
    }

    return [
        'name' => $input['name'],
        'target_module' => $input['target_module'],
        'enabled' => $flag('enabled'),
        'entry_urls' => $this->parseEntryUrls((string) ($input['entry_urls'] ?? '')),
        'cron_expression' => trim((string) $input['cron_expression']),
        'timezone' => trim((string) $input['timezone']),
        'max_pages' => (int) $input['max_pages'],
        'render_js' => $flag('render_js'),
        'user_agent' => $text('user_agent'),
        'headers' => $json('headers_json'),
        'cookies' => $json('cookies_json'),
        'proxy' => $text('proxy'),
        'rate_limit_per_minute' => (int) $input['rate_limit_per_minute'],
        'retry_max' => (int) $input['retry_max'],
        'retry_backoff_seconds' => (int) $input['retry_backoff_seconds'],
        'extractor_config' => $extractor,
        'mapping_config' => $json('mapping_json'),
        'dedupe_config' => $json('dedupe_json'),
        'ai_fallback_enabled' => $flag('ai_fallback_enabled'),
        'ai_provider' => $text('ai_provider'),
        'ai_model' => $text('ai_model'),
        'publish_policy' => (string) $input['publish_policy'],
        'alert_email' => $text('alert_email'),
    ];
}
|
||
|
|
|
||
|
|
/**
 * Collect the AI-related form fields into the extractor's `ai` section.
 *
 * Only fields that were actually provided are included, in the order
 * system_prompt, user_prompt, temperature, content_max_chars, model. An
 * empty array means no AI setting was supplied.
 *
 * @param array<string, mixed> $payload Validated request data.
 * @return array<string, mixed>
 */
private function buildAiConfig(array $payload): array
{
    $temperature = $payload['ai_temperature'] ?? null;
    $maxChars = $payload['ai_content_max_chars'] ?? null;

    // Every candidate is either a usable value or null; nulls are dropped below.
    $candidates = [
        'system_prompt' => $this->nullableTrim($payload['ai_system_prompt'] ?? null),
        'user_prompt' => $this->nullableTrim($payload['ai_user_prompt'] ?? null),
        'temperature' => ($temperature === null || $temperature === '') ? null : (float) $temperature,
        'content_max_chars' => ($maxChars === null || $maxChars === '') ? null : (int) $maxChars,
        'model' => $this->nullableTrim($payload['ai_model'] ?? null),
    ];

    // Strict null check so legitimate falsy values (e.g. temperature 0.0) survive.
    return array_filter($candidates, static fn (mixed $entry): bool => $entry !== null);
}
|
||
|
|
|
||
|
|
/**
 * Trim a string value, mapping "nothing useful" to null.
 *
 * Returns null for non-string input and for strings that are empty after
 * trimming; otherwise returns the trimmed string.
 */
private function nullableTrim(mixed $value): ?string
{
    $text = is_string($value) ? trim($value) : '';

    return $text !== '' ? $text : null;
}
|
||
|
|
|
||
|
|
/**
 * Split a newline-separated block of text into a de-duplicated list of
 * crawlable URLs.
 *
 * Each line is trimmed. Blank lines, lines rejected by FILTER_VALIDATE_URL
 * and lines whose scheme is not http/https are dropped. The scheme check is
 * needed because FILTER_VALIDATE_URL on its own accepts any scheme (file:,
 * ftp:, mailto:, ...), which must not end up as crawl entry points.
 *
 * @param string $entryUrls Raw textarea content, one URL per line (any of
 *                          "\r\n", "\r" or "\n" as line separator).
 * @return list<string> Unique URLs in first-seen order.
 */
private function parseEntryUrls(string $entryUrls): array
{
    // preg_split() returns false on failure; treat that as "no lines".
    $lines = preg_split('/\r\n|\r|\n/', $entryUrls) ?: [];

    $urls = [];
    foreach ($lines as $line) {
        $candidate = trim($line);
        if ($candidate === '' || filter_var($candidate, FILTER_VALIDATE_URL) === false) {
            continue;
        }

        // Schemes are case-insensitive, so compare in lower case.
        $scheme = strtolower((string) parse_url($candidate, PHP_URL_SCHEME));
        if ($scheme !== 'http' && $scheme !== 'https') {
            continue;
        }

        $urls[] = $candidate;
    }

    // array_unique() keeps the original keys; re-index to return a proper list.
    return array_values(array_unique($urls));
}
|
||
|
|
|
||
|
|
/**
 * Decode a JSON string into a PHP array, never failing.
 *
 * Returns an empty array when the value is not a string, is blank, is not
 * valid JSON, or decodes to something other than an array (scalar/null).
 * JSON objects become associative arrays; JSON lists are returned as lists.
 *
 * @return array<array-key, mixed>
 */
private function decodeJsonToArray(mixed $value): array
{
    $isBlank = ! is_string($value) || trim($value) === '';
    if ($isBlank) {
        return [];
    }

    $result = json_decode($value, true);
    if (! is_array($result)) {
        return [];
    }

    return $result;
}
|
||
|
|
}
|