爬虫开发
This commit is contained in:
42
app/Http/Controllers/Admin/CrawlAlertController.php
Normal file
42
app/Http/Controllers/Admin/CrawlAlertController.php
Normal file
@@ -0,0 +1,42 @@
|
||||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
namespace App\Http\Controllers\Admin;
|
||||
|
||||
use App\Http\Controllers\Controller;
|
||||
use App\Models\CrawlAlert;
|
||||
use Illuminate\Http\RedirectResponse;
|
||||
use Illuminate\Http\Request;
|
||||
use Illuminate\View\View;
|
||||
|
||||
/**
 * Admin screens for browsing crawl alerts and marking them as handled.
 */
class CrawlAlertController extends Controller
{
    /**
     * Paginated alert list (newest id first), optionally narrowed by the
     * `resolved` query parameter.
     */
    public function index(Request $request): View
    {
        $alerts = CrawlAlert::query()->with(['rule', 'run']);

        // Only filter when the parameter was supplied with a non-empty value.
        if ($request->filled('resolved')) {
            $alerts->where('is_resolved', $request->boolean('resolved'));
        }

        $items = $alerts->latest('id')->paginate(20)->withQueryString();
        $filters = $request->only(['resolved']);

        return view('admin.crawl-alerts.index', compact('items', 'filters'));
    }

    /**
     * Flag a single alert as resolved, stamp the resolution time and return
     * to the previous page with a flash message.
     */
    public function resolve(CrawlAlert $alert): RedirectResponse
    {
        $alert->forceFill([
            'is_resolved' => true,
            'resolved_at' => now(),
            'resolved_by' => null,
        ])->save();

        return back()->with('status', '告警已标记为已处理');
    }
}
|
||||
|
||||
54
app/Http/Controllers/Admin/CrawlRunController.php
Normal file
54
app/Http/Controllers/Admin/CrawlRunController.php
Normal file
@@ -0,0 +1,54 @@
|
||||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
namespace App\Http\Controllers\Admin;
|
||||
|
||||
use App\Http\Controllers\Controller;
|
||||
use App\Jobs\RunCrawlRuleJob;
|
||||
use App\Models\CrawlRun;
|
||||
use Illuminate\Http\RedirectResponse;
|
||||
use Illuminate\Http\Request;
|
||||
use Illuminate\View\View;
|
||||
|
||||
/**
 * Admin screens for inspecting crawl runs and re-queueing them.
 */
class CrawlRunController extends Controller
{
    /**
     * Paginated run list (newest id first), optionally filtered by `rule_id`.
     */
    public function index(Request $request): View
    {
        $items = CrawlRun::query()
            ->with('rule')
            ->when($request->filled('rule_id'), function ($query) use ($request): void {
                $query->where('rule_id', (int) $request->input('rule_id'));
            })
            ->latest('id')
            ->paginate(20)
            ->withQueryString();

        return view('admin.crawl-runs.index', [
            'items' => $items,
            'filters' => $request->only(['rule_id']),
        ]);
    }

    /**
     * Detail page for one run, with its rule, its items (newest first) and
     * its alerts eager-loaded.
     */
    public function show(CrawlRun $run): View
    {
        $run->load(['rule', 'items' => function ($query): void {
            $query->latest('id');
        }, 'alerts']);

        return view('admin.crawl-runs.show', [
            'run' => $run,
        ]);
    }

    /**
     * Queue a retry of the given run and return to the run list.
     *
     * A run without a rule cannot be retried. In that case nothing is
     * dispatched, and the flash message says so instead of reporting a
     * submission that never happened.
     */
    public function retry(CrawlRun $run): RedirectResponse
    {
        $redirect = redirect()->route('admin.crawl-runs.index', ['rule_id' => $run->rule_id]);

        if ($run->rule_id === null) {
            return $redirect->with('status', '该运行记录未关联采集规则,无法重试。');
        }

        RunCrawlRuleJob::dispatch($run->rule_id, 'retry', null, $run->id);

        return $redirect->with('status', '已提交重试任务');
    }
}
|
||||
|
||||
413
app/Http/Controllers/Admin/CrawlerRuleController.php
Normal file
413
app/Http/Controllers/Admin/CrawlerRuleController.php
Normal file
@@ -0,0 +1,413 @@
|
||||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
namespace App\Http\Controllers\Admin;
|
||||
|
||||
use App\Enums\CrawlTargetModule;
|
||||
use App\Enums\CrawlTriggerType;
|
||||
use App\Http\Controllers\Controller;
|
||||
use App\Http\Requests\Admin\CrawlRuleRequest;
|
||||
use App\Jobs\RunCrawlRuleJob;
|
||||
use App\Models\CrawlRule;
|
||||
use App\Services\Crawler\CrawlRuleScheduleService;
|
||||
use App\Services\Crawler\OpenAiFallbackExtractor;
|
||||
use Illuminate\Http\JsonResponse;
|
||||
use Illuminate\Http\RedirectResponse;
|
||||
use Illuminate\Http\Request;
|
||||
use Illuminate\Support\Facades\Http;
|
||||
use Illuminate\View\View;
|
||||
|
||||
class CrawlerRuleController extends Controller
|
||||
{
|
||||
/**
 * Both collaborators are stored as promoted, read-only properties.
 *
 * @param CrawlRuleScheduleService $scheduleService Computes a rule's next run time.
 * @param OpenAiFallbackExtractor  $aiExtractor     Produces AI-suggested extractor configs.
 */
public function __construct(
    private readonly CrawlRuleScheduleService $scheduleService,
    private readonly OpenAiFallbackExtractor $aiExtractor,
) {}
|
||||
|
||||
/**
 * Paginated rule list ordered by most recently updated, with a run counter
 * per rule and an optional name search via the `q` query parameter.
 */
public function index(Request $request): View
{
    $rules = CrawlRule::query()->withCount('runs');

    if ($request->filled('q')) {
        $needle = trim((string) $request->string('q'));
        $rules->where('name', 'like', '%'.$needle.'%');
    }

    $items = $rules->latest('updated_at')->paginate(20)->withQueryString();

    return view('admin.crawlers.index', [
        'items' => $items,
        'filters' => $request->only(['q']),
    ]);
}
|
||||
|
||||
/**
 * Render the rule form pre-filled with defaults for a new, unsaved rule.
 */
public function create(): View
{
    // Starter extractor: XPath mode with a link selector and two sample fields.
    $extractorDefaults = [
        'mode' => 'xpath',
        'list_link_xpath' => '//a/@href',
        'fields' => [
            'name' => '//h1/text()',
            'summary' => '//meta[@name="description"]/@content',
        ],
        'ai' => [
            'temperature' => 0,
            'content_max_chars' => 12000,
        ],
    ];

    $blankRule = new CrawlRule([
        'enabled' => true,
        'target_module' => CrawlTargetModule::Tool,
        'cron_expression' => '0 */6 * * *',
        'timezone' => 'Asia/Shanghai',
        'max_pages' => 50,
        'rate_limit_per_minute' => 30,
        'retry_max' => 3,
        'retry_backoff_seconds' => 60,
        'extractor_config' => $extractorDefaults,
        'mapping_config' => [],
        'dedupe_config' => [],
        'publish_policy' => 'draft',
        'ai_provider' => 'openai_compatible',
        'ai_fallback_enabled' => false,
    ]);

    return view('admin.crawlers.form', [
        'item' => $blankRule,
        'method' => 'POST',
        'submitRoute' => route('admin.crawlers.store'),
    ]);
}
|
||||
|
||||
/**
 * Persist a new rule from the validated form, then compute and save its
 * next run time before redirecting to the edit page.
 */
public function store(CrawlRuleRequest $request): RedirectResponse
{
    $attributes = $request->normalizedPayload();
    $attributes['created_by'] = null;
    $attributes['updated_by'] = null;

    $rule = CrawlRule::query()->create($attributes);

    // The schedule is derived from the freshly created rule, hence the second save.
    $rule->next_run_at = $this->scheduleService->nextRunAt($rule);
    $rule->save();

    $target = redirect()->route('admin.crawlers.edit', $rule);

    return $target->with('status', '采集规则已创建。');
}
|
||||
|
||||
/**
 * Render the rule form for an existing rule, submitting via PUT to `update`.
 */
public function edit(CrawlRule $crawler): View
{
    $formData = [
        'item' => $crawler,
        'method' => 'PUT',
        'submitRoute' => route('admin.crawlers.update', $crawler),
    ];

    return view('admin.crawlers.form', $formData);
}
|
||||
|
||||
/**
 * Apply the validated form to an existing rule, recompute its next run time
 * from the new attributes and save everything in one write.
 */
public function update(CrawlRuleRequest $request, CrawlRule $crawler): RedirectResponse
{
    $changes = $request->normalizedPayload();
    $changes['updated_by'] = null;

    $crawler->fill($changes);
    // Computed after fill() so the new attribute values are the ones used.
    $crawler->next_run_at = $this->scheduleService->nextRunAt($crawler);
    $crawler->save();

    $target = redirect()->route('admin.crawlers.edit', $crawler);

    return $target->with('status', '采集规则已更新。');
}
|
||||
|
||||
/**
 * Queue a manual execution of the rule and jump to its run list.
 */
public function run(CrawlRule $crawler): RedirectResponse
{
    $ruleId = $crawler->id;

    RunCrawlRuleJob::dispatch($ruleId, CrawlTriggerType::Manual->value);

    $runList = redirect()->route('admin.crawl-runs.index', ['rule_id' => $ruleId]);

    return $runList->with('status', '已提交手动执行任务。');
}
|
||||
|
||||
/**
 * Fetch a remote page and return a sanitised copy of it as JSON.
 *
 * Responds with HTTP 422 and `ok: false` when the URL fails the safety
 * check or the page cannot be downloaded.
 */
public function preview(Request $request): JsonResponse
{
    $input = $request->validate([
        'url' => ['required', 'url', 'max:2000'],
        'user_agent' => ['nullable', 'string', 'max:255'],
    ]);

    // Shared shape for every rejection below.
    $reject = static fn (string $message): JsonResponse => response()->json([
        'ok' => false,
        'message' => $message,
    ], 422);

    $url = (string) $input['url'];
    if (! $this->isSafePreviewUrl($url)) {
        return $reject('预览地址不安全,已拒绝请求。');
    }

    $page = $this->fetchHtml($url, $input['user_agent'] ?? null);
    if (! $page['ok']) {
        return $reject('页面抓取失败:'.($page['error'] ?? 'unknown'));
    }

    $cleanHtml = $this->sanitizePreviewHtml($page['body']);

    return response()->json([
        'ok' => true,
        'url' => $url,
        'title' => $this->extractTitle($cleanHtml),
        'html' => $cleanHtml,
    ]);
}
|
||||
|
||||
/**
 * Download a page and ask the AI extractor to propose an extractor config.
 *
 * Checks run in this order: request validation, AI configuration, URL
 * safety, page download. Every failure answers with HTTP 422 and `ok: false`.
 */
public function aiSuggestExtractor(Request $request): JsonResponse
{
    $input = $request->validate([
        'url' => ['required', 'url', 'max:2000'],
        'target_module' => ['required', 'in:tool,model'],
        'user_agent' => ['nullable', 'string', 'max:255'],
        'ai_model' => ['nullable', 'string', 'max:128'],
        'ai_system_prompt' => ['nullable', 'string', 'max:4000'],
        'ai_user_prompt' => ['nullable', 'string', 'max:4000'],
        'ai_temperature' => ['nullable', 'numeric', 'between:0,2'],
        'ai_content_max_chars' => ['nullable', 'integer', 'between:500,50000'],
    ]);

    $reject = static fn (string $message): JsonResponse => response()->json([
        'ok' => false,
        'message' => $message,
    ], 422);

    if (! $this->aiExtractor->isConfigured()) {
        return $reject('AI 配置缺失,请先设置 CRAWLER_AI_KEY 与接口地址。');
    }

    $url = (string) $input['url'];
    if (! $this->isSafePreviewUrl($url)) {
        return $reject('目标 URL 不安全,已拒绝请求。');
    }

    $page = $this->fetchHtml($url, $input['user_agent'] ?? null);
    if (! $page['ok']) {
        return $reject('页面抓取失败:'.($page['error'] ?? 'unknown'));
    }

    // Text overrides: option key => form field. Blank values are not forwarded.
    $options = [];
    $textOverrides = [
        'model' => 'ai_model',
        'system_prompt' => 'ai_system_prompt',
        'user_prompt' => 'ai_user_prompt',
    ];
    foreach ($textOverrides as $optionKey => $field) {
        $raw = $input[$field] ?? null;
        if (! is_string($raw)) {
            continue;
        }

        $clean = trim($raw);
        if ($clean !== '') {
            $options[$optionKey] = $clean;
        }
    }

    $temperature = $input['ai_temperature'] ?? null;
    if ($temperature !== null && $temperature !== '') {
        $options['temperature'] = (float) $temperature;
    }

    $maxChars = $input['ai_content_max_chars'] ?? null;
    if ($maxChars !== null && $maxChars !== '') {
        $options['content_max_chars'] = (int) $maxChars;
    }

    $suggestion = $this->aiExtractor->suggestExtractorConfig(
        (string) $input['target_module'],
        $this->sanitizePreviewHtml($page['body']),
        $options,
    );

    if ($suggestion === []) {
        $reason = $this->aiExtractor->lastError();

        if ($reason === null || $reason === '') {
            return $reject('AI 未生成有效规则,请调整页面或提示词后重试。');
        }

        return $reject('AI 生成失败:'.$reason);
    }

    return response()->json([
        'ok' => true,
        'extractor_config' => $suggestion,
    ]);
}
|
||||
|
||||
/**
 * Download a page for preview / AI analysis, retrying transient failures.
 *
 * Up to three attempts are made. Server errors (5xx) and thrown transport
 * errors are retried with a short, growing pause; any other unsuccessful
 * status ends the loop immediately.
 *
 * Redirects are followed (at most five hops), but every redirect target is
 * passed through isSafePreviewUrl() first. Without that check a public URL
 * could redirect to an internal address and bypass the safety check the
 * callers run on the original URL only.
 *
 * @param string      $url       Absolute URL; callers validate it beforehand.
 * @param string|null $userAgent Optional User-Agent; blank falls back to the
 *                               `crawler.default_user_agent` config value.
 *
 * @return array{ok: bool, body: string, error: string|null}
 */
private function fetchHtml(string $url, ?string $userAgent = null): array
{
    $ua = is_string($userAgent) && trim($userAgent) !== ''
        ? trim($userAgent)
        : (string) config('crawler.default_user_agent', 'AIWebCrawler/1.0');

    // Invoked by the HTTP client before it follows a redirect; throwing aborts the request.
    $redirectGuard = function ($originalRequest, $redirectResponse, $nextUri): void {
        if (! $this->isSafePreviewUrl((string) $nextUri)) {
            throw new \DomainException('重定向目标不安全');
        }
    };

    $maxAttempts = 3;
    $lastError = 'unknown';

    for ($attempt = 1; $attempt <= $maxAttempts; $attempt++) {
        try {
            // The configured timeout is never allowed to drop below five seconds.
            $request = Http::timeout(max((int) config('crawler.request_timeout_seconds', 20), 5));
            if (! (bool) config('crawler.verify_ssl', true)) {
                $request = $request->withoutVerifying();
            }

            $request = $this->applyNetworkOptions($request)->withOptions([
                'allow_redirects' => [
                    'max' => 5,
                    'on_redirect' => $redirectGuard,
                ],
            ]);
            $response = $request->withUserAgent($ua)->get($url);

            if ($response->successful()) {
                return [
                    'ok' => true,
                    'body' => $response->body(),
                    'error' => null,
                ];
            }

            $lastError = sprintf('HTTP %d', $response->status());
            if ($attempt < $maxAttempts && $response->serverError()) {
                // 0.25s, then 0.5s.
                usleep(250000 * $attempt);

                continue;
            }

            break;
        } catch (\DomainException $exception) {
            // An unsafe redirect is a policy rejection, not a transient fault: never retry it.
            $lastError = $exception->getMessage();

            break;
        } catch (\Throwable $exception) {
            $lastError = $exception->getMessage();
            if ($attempt < $maxAttempts) {
                usleep(250000 * $attempt);

                continue;
            }
        }
    }

    return [
        'ok' => false,
        'body' => '',
        'error' => $lastError,
    ];
}
|
||||
|
||||
/**
 * Apply the optional low-level network settings from the crawler config
 * (IPv4-only resolution, custom DNS servers) to a pending request.
 *
 * The request is returned untouched when neither setting is active.
 */
private function applyNetworkOptions(\Illuminate\Http\Client\PendingRequest $request): \Illuminate\Http\Client\PendingRequest
{
    $extra = [];

    $forceIpv4 = (bool) config('crawler.force_ipv4', false);
    if ($forceIpv4) {
        $extra['force_ip_resolve'] = 'v4';
    }

    // The cURL option only exists when the extension was built with DNS server support.
    $dnsList = trim((string) config('crawler.dns_servers', ''));
    if (defined('CURLOPT_DNS_SERVERS') && $dnsList !== '') {
        $extra['curl'][CURLOPT_DNS_SERVERS] = $dnsList;
    }

    return $extra === [] ? $request : $request->withOptions($extra);
}
|
||||
|
||||
/**
 * Decide whether a user-supplied URL may be fetched by the server.
 *
 * Accepts only http/https URLs whose host is a public IP literal, or a name
 * whose every resolved A/AAAA record is a public address.
 *
 * The check fails closed: a host that cannot be resolved through DNS is
 * rejected. Failing open would admit bracketed IPv6 literals ("[::1]"),
 * integer or short-form IPv4 hosts ("2130706433", "127.1") and names that
 * exist only in the local hosts file. None of those yield DNS records, yet
 * the HTTP client can still connect to them.
 *
 * @param string $url Candidate URL.
 *
 * @return bool True when the URL is considered safe to request.
 */
private function isSafePreviewUrl(string $url): bool
{
    $parts = parse_url($url);
    if (! is_array($parts)) {
        return false;
    }

    $scheme = strtolower((string) ($parts['scheme'] ?? ''));

    // parse_url() keeps the square brackets of IPv6 literals; drop them and
    // any trailing root dot so "[::1]" and "localhost." are seen for what they are.
    $host = strtolower((string) ($parts['host'] ?? ''));
    $host = rtrim(trim($host, '[]'), '.');

    if (! in_array($scheme, ['http', 'https'], true) || $host === '') {
        return false;
    }

    if ($host === 'localhost' || str_ends_with($host, '.localhost')) {
        return false;
    }

    if (filter_var($host, FILTER_VALIDATE_IP) !== false) {
        return $this->isPublicIp($host);
    }

    // dns_get_record() emits a warning on lookup failure; the failure is
    // handled explicitly just below, so the warning itself is suppressed.
    $records = @dns_get_record($host, DNS_A | DNS_AAAA);
    if (! is_array($records) || $records === []) {
        return false;
    }

    $publicAddresses = 0;
    foreach ($records as $record) {
        $ip = (string) ($record['ip'] ?? $record['ipv6'] ?? '');
        if ($ip === '') {
            continue;
        }

        // A single non-public address is enough to reject the whole host.
        if (! $this->isPublicIp($ip)) {
            return false;
        }

        $publicAddresses++;
    }

    return $publicAddresses > 0;
}
|
||||
|
||||
/**
 * True when the string is a valid IP address outside the private and
 * reserved ranges rejected by PHP's IP filter flags.
 */
private function isPublicIp(string $ip): bool
{
    $rejectedRanges = FILTER_FLAG_NO_PRIV_RANGE | FILTER_FLAG_NO_RES_RANGE;
    $validated = filter_var($ip, FILTER_VALIDATE_IP, $rejectedRanges);

    return $validated !== false;
}
|
||||
|
||||
/**
 * Strip active content from fetched HTML before it is handed to the admin
 * preview or to the AI extractor.
 *
 * Removes script, noscript, iframe, object, embed and base elements, every
 * `on*` event-handler attribute, and URL attributes that use a script
 * scheme (`javascript:` / `vbscript:`). The serialised result is capped at
 * 300000 characters.
 *
 * @param string $html Raw markup from the remote page.
 *
 * @return string Sanitised markup, or a small placeholder document when the
 *                input is blank.
 */
private function sanitizePreviewHtml(string $html): string
{
    if (trim($html) === '') {
        return '<!doctype html><html><head><meta charset="utf-8"></head><body>空页面</body></html>';
    }

    $dom = new \DOMDocument('1.0', 'UTF-8');

    // libxml error handling is process-wide state: remember the previous
    // mode and restore it so other XML/HTML consumers are not affected.
    $previousErrorMode = libxml_use_internal_errors(true);
    try {
        // The XML declaration prefix makes libxml read the markup as UTF-8.
        $dom->loadHTML('<?xml encoding="UTF-8">'.$html, LIBXML_NOWARNING | LIBXML_NOERROR);
    } finally {
        libxml_clear_errors();
        libxml_use_internal_errors($previousErrorMode);
    }

    $xpath = new \DOMXPath($dom);

    foreach (['//script', '//noscript', '//iframe', '//object', '//embed', '//base'] as $query) {
        $nodes = $xpath->query($query);
        if ($nodes === false) {
            continue;
        }

        // Walk backwards so removals do not disturb the nodes still to visit.
        for ($index = $nodes->length - 1; $index >= 0; $index--) {
            $node = $nodes->item($index);
            if ($node !== null && $node->parentNode !== null) {
                $node->parentNode->removeChild($node);
            }
        }
    }

    // Attributes whose value is a URL the browser may navigate to or load.
    $urlAttributes = ['href', 'src', 'action', 'formaction', 'data', 'poster', 'background'];

    $allNodes = $xpath->query('//*');
    if ($allNodes !== false) {
        foreach ($allNodes as $node) {
            if (! $node instanceof \DOMElement) {
                continue;
            }

            // Collect first, remove afterwards: the attribute map must not
            // be modified while it is being iterated.
            $attributesToRemove = [];
            foreach ($node->attributes as $attribute) {
                $name = strtolower($attribute->name);

                if (str_starts_with($name, 'on')) {
                    $attributesToRemove[] = $attribute->name;

                    continue;
                }

                if (in_array($name, $urlAttributes, true)) {
                    // Browsers ignore whitespace and control characters
                    // inside the scheme, so strip them before testing.
                    $compact = (string) preg_replace('/[\x00-\x20]+/', '', $attribute->value);
                    if (preg_match('/^(?:javascript|vbscript):/i', $compact) === 1) {
                        $attributesToRemove[] = $attribute->name;
                    }
                }
            }

            foreach ($attributesToRemove as $attributeName) {
                $node->removeAttribute($attributeName);
            }
        }
    }

    $output = (string) $dom->saveHTML();

    return mb_substr($output, 0, 300000);
}
|
||||
|
||||
/**
 * Return the text of the first <title> element, with nested tags stripped
 * and surrounding whitespace trimmed; empty string when there is none.
 */
private function extractTitle(string $html): string
{
    $found = preg_match('/<title[^>]*>(.*?)<\/title>/is', $html, $matches);

    return $found === 1
        ? trim(strip_tags((string) $matches[1]))
        : '';
}
|
||||
}
|
||||
254
app/Http/Requests/Admin/CrawlRuleRequest.php
Normal file
254
app/Http/Requests/Admin/CrawlRuleRequest.php
Normal file
@@ -0,0 +1,254 @@
|
||||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
namespace App\Http\Requests\Admin;
|
||||
|
||||
use App\Enums\CrawlTargetModule;
|
||||
use Illuminate\Foundation\Http\FormRequest;
|
||||
use Illuminate\Validation\Rule;
|
||||
|
||||
class CrawlRuleRequest extends FormRequest
|
||||
{
|
||||
/**
 * This request performs no authorization of its own and always allows.
 */
public function authorize(): bool
{
    $allowed = true;

    return $allowed;
}
|
||||
|
||||
/**
 * Normalise raw form input before the rules run: checkbox-style flags
 * become real booleans, and a missing extractor mode falls back to the
 * `mode` key of the submitted extractor JSON (default "xpath").
 */
protected function prepareForValidation(): void
{
    $submittedExtractor = $this->decodeJsonToArray($this->input('extractor_json'));
    $fallbackMode = (string) ($submittedExtractor['mode'] ?? 'xpath');

    $normalised = [];
    foreach (['enabled', 'render_js', 'ai_fallback_enabled'] as $flag) {
        $normalised[$flag] = $this->boolean($flag);
    }
    $normalised['extractor_mode'] = $this->input('extractor_mode') ?: $fallbackMode;

    $this->merge($normalised);
}
|
||||
|
||||
/**
 * Validation rules for creating or updating a crawl rule.
 *
 * Beyond basic type and range checks, two inputs are checked for meaning:
 * - `entry_urls` must contain at least one line that parseEntryUrls()
 *   accepts. That method silently drops invalid lines, so without this
 *   check a rule could be saved with an empty entry list.
 * - `timezone` must be a valid timezone identifier, not just any short
 *   string.
 *
 * @return array<string, array<int, mixed>>
 */
public function rules(): array
{
    return [
        'name' => ['required', 'string', 'max:150'],
        'target_module' => ['required', Rule::in(array_column(CrawlTargetModule::cases(), 'value'))],
        'enabled' => ['nullable', 'boolean'],
        'entry_urls' => [
            'required',
            'string',
            function (string $attribute, mixed $value, \Closure $fail): void {
                if (is_string($value) && $this->parseEntryUrls($value) === []) {
                    $fail('请至少填写一个合法的入口 URL。');
                }
            },
        ],
        'cron_expression' => ['required', 'string', 'max:64'],
        'timezone' => ['required', 'string', 'max:64', 'timezone'],
        'max_pages' => ['required', 'integer', 'between:1,2000'],
        'render_js' => ['nullable', 'boolean'],
        'user_agent' => ['nullable', 'string', 'max:255'],
        'headers_json' => ['nullable', 'json'],
        'cookies_json' => ['nullable', 'json'],
        'proxy' => ['nullable', 'string', 'max:255'],
        'rate_limit_per_minute' => ['required', 'integer', 'between:1,2000'],
        'retry_max' => ['required', 'integer', 'between:1,10'],
        'retry_backoff_seconds' => ['required', 'integer', 'between:1,3600'],
        'extractor_json' => ['required', 'json'],
        'extractor_mode' => ['required', Rule::in(['xpath', 'ai', 'hybrid'])],
        'mapping_json' => ['nullable', 'json'],
        'dedupe_json' => ['nullable', 'json'],
        'ai_fallback_enabled' => ['nullable', 'boolean'],
        'ai_provider' => ['nullable', 'string', 'max:64'],
        'ai_model' => ['nullable', 'string', 'max:128'],
        'ai_system_prompt' => ['nullable', 'string', 'max:4000'],
        'ai_user_prompt' => ['nullable', 'string', 'max:4000'],
        'ai_temperature' => ['nullable', 'numeric', 'between:0,2'],
        'ai_content_max_chars' => ['nullable', 'integer', 'between:500,50000'],
        'publish_policy' => ['required', Rule::in(['draft'])],
        'alert_email' => ['nullable', 'email'],
    ];
}
|
||||
|
||||
/**
 * Custom validation messages, keyed by "field.rule".
 *
 * @return array<string, string>
 */
public function messages(): array
{
    // Identity and scheduling fields.
    $general = [
        'name.required' => '请填写规则名称。',
        'target_module.required' => '请选择目标模块。',
        'entry_urls.required' => '请至少填写一个入口 URL。',
        'cron_expression.required' => '请填写 Cron 表达式。',
        'timezone.required' => '请填写时区。',
    ];

    // Numeric limits: page count, rate limit, retry policy.
    $limits = [
        'max_pages.required' => '请填写最大页面数。',
        'max_pages.integer' => '最大页面数必须是整数。',
        'max_pages.between' => '最大页面数需在 1 到 2000 之间。',
        'rate_limit_per_minute.required' => '请填写每分钟限流值。',
        'rate_limit_per_minute.integer' => '每分钟限流值必须是整数。',
        'rate_limit_per_minute.between' => '每分钟限流值需在 1 到 2000 之间。',
        'retry_max.required' => '请填写最大重试次数。',
        'retry_max.integer' => '最大重试次数必须是整数。',
        'retry_max.between' => '最大重试次数需在 1 到 10 之间。',
        'retry_backoff_seconds.required' => '请填写重试退避秒数。',
        'retry_backoff_seconds.integer' => '重试退避秒数必须是整数。',
        'retry_backoff_seconds.between' => '重试退避秒数需在 1 到 3600 之间。',
    ];

    // Extractor settings and the JSON text areas.
    $jsonFields = [
        'extractor_json.required' => '请填写 Extractor JSON。',
        'extractor_json.json' => 'Extractor JSON 格式不合法。',
        'extractor_mode.required' => '请选择抽取模式。',
        'extractor_mode.in' => '抽取模式仅支持 xpath、ai、hybrid。',
        'mapping_json.json' => 'Mapping JSON 格式不合法。',
        'dedupe_json.json' => 'Dedupe JSON 格式不合法。',
        'headers_json.json' => 'Headers JSON 格式不合法。',
        'cookies_json.json' => 'Cookies JSON 格式不合法。',
    ];

    // AI tuning and alerting.
    $aiAndAlerts = [
        'ai_temperature.between' => 'AI 温度需在 0 到 2 之间。',
        'ai_content_max_chars.between' => 'AI 内容截断长度需在 500 到 50000 之间。',
        'alert_email.email' => '告警邮箱格式不合法。',
    ];

    return array_merge($general, $limits, $jsonFields, $aiAndAlerts);
}
|
||||
|
||||
/**
 * Human-readable field names used inside validation messages.
 *
 * @return array<string, string>
 */
public function attributes(): array
{
    // Identity, scheduling and limits.
    $basics = [
        'name' => '规则名称',
        'target_module' => '目标模块',
        'entry_urls' => '入口 URL',
        'cron_expression' => 'Cron 表达式',
        'timezone' => '时区',
        'max_pages' => '最大页面数',
        'rate_limit_per_minute' => '每分钟限流',
        'retry_max' => '最大重试次数',
        'retry_backoff_seconds' => '重试退避秒数',
    ];

    // Extractor settings and JSON text areas.
    $jsonFields = [
        'extractor_json' => 'Extractor JSON',
        'extractor_mode' => '抽取模式',
        'mapping_json' => 'Mapping JSON',
        'dedupe_json' => 'Dedupe JSON',
        'headers_json' => 'Headers JSON',
        'cookies_json' => 'Cookies JSON',
    ];

    // AI tuning and alerting.
    $aiAndAlerts = [
        'ai_system_prompt' => 'AI 系统提示词',
        'ai_user_prompt' => 'AI 用户提示词',
        'ai_temperature' => 'AI 温度',
        'ai_content_max_chars' => 'AI 内容截断长度',
        'alert_email' => '告警邮箱',
    ];

    return array_merge($basics, $jsonFields, $aiAndAlerts);
}
|
||||
|
||||
/**
 * Turn the validated form input into the attribute array stored on a rule:
 * JSON text areas are decoded to arrays, numbers and flags are cast,
 * optional strings are trimmed to null, and the extractor config receives
 * its final `mode` and `ai` entries.
 *
 * @return array<string, mixed>
 */
public function normalizedPayload(): array
{
    $input = $this->validated();

    $extractor = $this->decodeJsonToArray($input['extractor_json'] ?? null);

    // The explicit form field wins over the mode embedded in the JSON;
    // anything unrecognised falls back to "xpath".
    $mode = (string) ($input['extractor_mode'] ?? ($extractor['mode'] ?? 'xpath'));
    $extractor['mode'] = in_array($mode, ['xpath', 'ai', 'hybrid'], true) ? $mode : 'xpath';

    // The `ai` section is rebuilt from the dedicated form fields, or dropped
    // entirely when none of them were filled in.
    $ai = $this->buildAiConfig($input);
    if ($ai === []) {
        unset($extractor['ai']);
    } else {
        $extractor['ai'] = $ai;
    }

    $optionalText = fn (string $key): ?string => $this->nullableTrim($input[$key] ?? null);
    $jsonArray = fn (string $key): array => $this->decodeJsonToArray($input[$key] ?? null);

    return [
        'name' => $input['name'],
        'target_module' => $input['target_module'],
        'enabled' => (bool) ($input['enabled'] ?? false),
        'entry_urls' => $this->parseEntryUrls((string) ($input['entry_urls'] ?? '')),
        'cron_expression' => trim((string) $input['cron_expression']),
        'timezone' => trim((string) $input['timezone']),
        'max_pages' => (int) $input['max_pages'],
        'render_js' => (bool) ($input['render_js'] ?? false),
        'user_agent' => $optionalText('user_agent'),
        'headers' => $jsonArray('headers_json'),
        'cookies' => $jsonArray('cookies_json'),
        'proxy' => $optionalText('proxy'),
        'rate_limit_per_minute' => (int) $input['rate_limit_per_minute'],
        'retry_max' => (int) $input['retry_max'],
        'retry_backoff_seconds' => (int) $input['retry_backoff_seconds'],
        'extractor_config' => $extractor,
        'mapping_config' => $jsonArray('mapping_json'),
        'dedupe_config' => $jsonArray('dedupe_json'),
        'ai_fallback_enabled' => (bool) ($input['ai_fallback_enabled'] ?? false),
        'ai_provider' => $optionalText('ai_provider'),
        'ai_model' => $optionalText('ai_model'),
        'publish_policy' => (string) $input['publish_policy'],
        'alert_email' => $optionalText('alert_email'),
    ];
}
|
||||
|
||||
/**
 * Collect the AI-related form fields into the extractor's `ai` section.
 * Blank or missing fields are left out, so the result may be empty.
 *
 * @param array<string, mixed> $payload Validated request data.
 *
 * @return array<string, mixed>
 */
private function buildAiConfig(array $payload): array
{
    $config = [];

    // Prompts first, in the order they are stored.
    foreach (['system_prompt' => 'ai_system_prompt', 'user_prompt' => 'ai_user_prompt'] as $key => $field) {
        $text = $this->nullableTrim($payload[$field] ?? null);
        if ($text !== null) {
            $config[$key] = $text;
        }
    }

    $temperature = $payload['ai_temperature'] ?? null;
    if ($temperature !== null && $temperature !== '') {
        $config['temperature'] = (float) $temperature;
    }

    $maxChars = $payload['ai_content_max_chars'] ?? null;
    if ($maxChars !== null && $maxChars !== '') {
        $config['content_max_chars'] = (int) $maxChars;
    }

    $model = $this->nullableTrim($payload['ai_model'] ?? null);
    if ($model !== null) {
        $config['model'] = $model;
    }

    return $config;
}
|
||||
|
||||
/**
 * Trim a string value; non-strings and blank strings both become null.
 */
private function nullableTrim(mixed $value): ?string
{
    $text = is_string($value) ? trim($value) : '';

    if ($text === '') {
        return null;
    }

    return $text;
}
|
||||
|
||||
/**
 * Split the entry-URL text area into one URL per line, keeping only
 * non-blank lines that pass FILTER_VALIDATE_URL, without duplicates and in
 * their original order.
 *
 * @return list<string>
 */
private function parseEntryUrls(string $entryUrls): array
{
    $rawLines = preg_split('/\r\n|\r|\n/', $entryUrls) ?: [];
    $candidates = array_map(trim(...), $rawLines);

    $accepted = array_filter(
        $candidates,
        static fn (string $candidate): bool => $candidate !== ''
            && filter_var($candidate, FILTER_VALIDATE_URL) !== false,
    );

    return array_values(array_unique($accepted));
}
|
||||
|
||||
/**
 * Decode a JSON string into an array. Non-strings, blank strings, invalid
 * JSON and JSON that does not decode to an array all yield an empty array.
 *
 * @return array<string, mixed>
 */
private function decodeJsonToArray(mixed $value): array
{
    $isUsable = is_string($value) && trim($value) !== '';
    if (! $isUsable) {
        return [];
    }

    $result = json_decode($value, true);
    if (! is_array($result)) {
        return [];
    }

    return $result;
}
|
||||
}
|
||||
Reference in New Issue
Block a user