diff --git a/.env.example b/.env.example
index 939bfcd..f05d3b6 100644
--- a/.env.example
+++ b/.env.example
@@ -61,4 +61,19 @@ AWS_USE_PATH_STYLE_ENDPOINT=false
VITE_APP_NAME="${APP_NAME}"
-
+CRAWLER_USER_AGENT="AIWebCrawler/1.0 (+https://dev.aiweb.com)"
+CRAWLER_REQUEST_TIMEOUT=20
+CRAWLER_AI_TIMEOUT=60
+CRAWLER_VERIFY_SSL=true
+CRAWLER_DNS_SERVERS=
+CRAWLER_FORCE_IPV4=false
+CRAWLER_AI_WIRE_API=chat_completions
+CRAWLER_AI_BASE_URL=
+CRAWLER_BROWSERLESS_ENDPOINT=
+CRAWLER_BROWSERLESS_TOKEN=
+CRAWLER_AI_ENDPOINT=
+CRAWLER_AI_KEY=
+CRAWLER_AI_MODEL=gpt-4o-mini
+CRAWLER_AI_REASONING_EFFORT=
+CRAWLER_AI_DISABLE_RESPONSE_STORAGE=false
+CRAWLER_ALERT_EMAIL=
diff --git a/app/Console/Commands/CrawlerHealthCheckCommand.php b/app/Console/Commands/CrawlerHealthCheckCommand.php
new file mode 100644
index 0000000..51f2fec
--- /dev/null
+++ b/app/Console/Commands/CrawlerHealthCheckCommand.php
@@ -0,0 +1,41 @@
+ 'Queue Connection', 'status' => (string) config('queue.default'), 'detail' => '当前队列连接'],
+ ['item' => 'Browserless Endpoint', 'status' => (string) (config('crawler.browserless_endpoint') ?: 'not-configured'), 'detail' => 'JS渲染服务'],
+ ['item' => 'AI Endpoint', 'status' => (string) (config('crawler.openai_compatible_endpoint') ?: 'not-configured'), 'detail' => 'AI兜底抽取'],
+ ['item' => 'Alert Email', 'status' => (string) (config('crawler.default_alert_email') ?: 'not-configured'), 'detail' => '默认告警邮箱'],
+ ];
+
+ $browserlessEndpoint = (string) config('crawler.browserless_endpoint', '');
+
+ if ($browserlessEndpoint !== '') {
+ try {
+ $response = Http::timeout(5)->get($browserlessEndpoint);
+ $checks[] = ['item' => 'Browserless Reachable', 'status' => $response->status() < 500 ? 'ok' : 'degraded', 'detail' => 'HTTP '.$response->status()];
+ } catch (\Throwable $exception) {
+ $checks[] = ['item' => 'Browserless Reachable', 'status' => 'failed', 'detail' => $exception->getMessage()];
+ }
+ }
+
+ $this->table(['Item', 'Status', 'Detail'], $checks);
+
+ return self::SUCCESS;
+ }
+}
+
diff --git a/app/Console/Commands/CrawlerRetryFailedCommand.php b/app/Console/Commands/CrawlerRetryFailedCommand.php
new file mode 100644
index 0000000..9c4bc3a
--- /dev/null
+++ b/app/Console/Commands/CrawlerRetryFailedCommand.php
@@ -0,0 +1,39 @@
+with('rule')->find((int) $this->argument('runId'));
+
+ if (! $run instanceof CrawlRun || $run->rule === null) {
+ $this->error('运行记录不存在或规则已删除');
+
+ return self::FAILURE;
+ }
+
+ if ((bool) $this->option('sync')) {
+ RunCrawlRuleJob::dispatchSync($run->rule_id, CrawlTriggerType::Retry->value, null, $run->id);
+ } else {
+ RunCrawlRuleJob::dispatch($run->rule_id, CrawlTriggerType::Retry->value, null, $run->id);
+ }
+
+ $this->info(sprintf('已提交重试任务,规则 #%d %s', $run->rule_id, $run->rule->name));
+
+ return self::SUCCESS;
+ }
+}
+
diff --git a/app/Console/Commands/CrawlerRunCommand.php b/app/Console/Commands/CrawlerRunCommand.php
new file mode 100644
index 0000000..71173ca
--- /dev/null
+++ b/app/Console/Commands/CrawlerRunCommand.php
@@ -0,0 +1,64 @@
+argument('ruleId');
+
+ $query = CrawlRule::query()->where('enabled', true);
+
+ if ($ruleId !== null) {
+ $query->whereKey((int) $ruleId);
+ }
+
+ $rules = $query->orderBy('id')->get();
+
+ if ($rules->isEmpty()) {
+ $this->warn('没有可执行的采集规则');
+
+ return self::SUCCESS;
+ }
+
+ $shouldRunAll = (bool) $this->option('all') || $ruleId !== null;
+ $sync = (bool) $this->option('sync');
+
+ $dispatched = 0;
+
+ foreach ($rules as $rule) {
+ if (! $shouldRunAll && ! $scheduleService->isDue($rule)) {
+ continue;
+ }
+
+ if ($sync) {
+ RunCrawlRuleJob::dispatchSync($rule->id, CrawlTriggerType::Schedule->value);
+ } else {
+ RunCrawlRuleJob::dispatch($rule->id, CrawlTriggerType::Schedule->value);
+ }
+
+ $dispatched++;
+ $this->info(sprintf('已提交规则 #%d %s', $rule->id, $rule->name));
+ }
+
+ if ($dispatched === 0) {
+ $this->line('当前无到期规则');
+ }
+
+ return self::SUCCESS;
+ }
+}
+
diff --git a/app/Enums/CrawlAlertSeverity.php b/app/Enums/CrawlAlertSeverity.php
new file mode 100644
index 0000000..08e1a39
--- /dev/null
+++ b/app/Enums/CrawlAlertSeverity.php
@@ -0,0 +1,14 @@
+ 'AI 工具',
+ self::Model => 'AI 模型',
+ };
+ }
+}
+
diff --git a/app/Enums/CrawlTriggerType.php b/app/Enums/CrawlTriggerType.php
new file mode 100644
index 0000000..1852276
--- /dev/null
+++ b/app/Enums/CrawlTriggerType.php
@@ -0,0 +1,13 @@
+with(['rule', 'run'])
+ ->when($request->filled('resolved'), function ($query) use ($request): void {
+ $query->where('is_resolved', (bool) $request->boolean('resolved'));
+ })
+ ->latest('id')
+ ->paginate(20)
+ ->withQueryString();
+
+ return view('admin.crawl-alerts.index', [
+ 'items' => $items,
+ 'filters' => $request->only(['resolved']),
+ ]);
+ }
+
+ /**
+  * Mark the given alert as resolved, stamp the resolution time and return to the previous page.
+  */
+ public function resolve(CrawlAlert $alert): RedirectResponse
+ {
+ $alert->is_resolved = true;
+ $alert->resolved_at = now();
+ // NOTE(review): the resolver is always stored as null — presumably no admin user id is available here; confirm.
+ $alert->resolved_by = null;
+ $alert->save();
+
+ return redirect()->back()->with('status', '告警已标记为已处理');
+ }
+}
+
diff --git a/app/Http/Controllers/Admin/CrawlRunController.php b/app/Http/Controllers/Admin/CrawlRunController.php
new file mode 100644
index 0000000..20d6727
--- /dev/null
+++ b/app/Http/Controllers/Admin/CrawlRunController.php
@@ -0,0 +1,54 @@
+with('rule')
+ ->when($request->filled('rule_id'), function ($query) use ($request): void {
+ $query->where('rule_id', (int) $request->input('rule_id'));
+ })
+ ->latest('id')
+ ->paginate(20)
+ ->withQueryString();
+
+ return view('admin.crawl-runs.index', [
+ 'items' => $items,
+ 'filters' => $request->only(['rule_id']),
+ ]);
+ }
+
+ /**
+  * Display a single crawl run with its rule, its items (highest id first) and its alerts loaded.
+  */
+ public function show(CrawlRun $run): View
+ {
+ $run->load(['rule', 'items' => function ($query): void {
+ $query->latest('id');
+ }, 'alerts']);
+
+ return view('admin.crawl-runs.show', [
+ 'run' => $run,
+ ]);
+ }
+
+ /**
+  * Queue a retry of the rule that produced the given run.
+  *
+  * When the run no longer references a rule nothing can be dispatched, so the
+  * user is told the retry was not submitted instead of seeing a success message.
+  * In both cases the response redirects to the run list filtered by the run's rule id.
+  */
+ public function retry(CrawlRun $run): RedirectResponse
+ {
+     $redirect = redirect()->route('admin.crawl-runs.index', ['rule_id' => $run->rule_id]);
+
+     if ($run->rule_id === null) {
+         return $redirect->with('status', '规则已删除,无法重试');
+     }
+
+     // Use the enum value (as the retry console command does) rather than a bare string,
+     // so the trigger type cannot silently drift from the enum definition.
+     RunCrawlRuleJob::dispatch(
+         $run->rule_id,
+         \App\Enums\CrawlTriggerType::Retry->value,
+         null,
+         $run->id,
+     );
+
+     return $redirect->with('status', '已提交重试任务');
+ }
+}
+
diff --git a/app/Http/Controllers/Admin/CrawlerRuleController.php b/app/Http/Controllers/Admin/CrawlerRuleController.php
new file mode 100644
index 0000000..1930647
--- /dev/null
+++ b/app/Http/Controllers/Admin/CrawlerRuleController.php
@@ -0,0 +1,413 @@
+withCount('runs')
+ ->when($request->filled('q'), function ($query) use ($request): void {
+ $keyword = '%'.trim((string) $request->string('q')).'%';
+ $query->where('name', 'like', $keyword);
+ })
+ ->latest('updated_at')
+ ->paginate(20)
+ ->withQueryString();
+
+ return view('admin.crawlers.index', [
+ 'items' => $items,
+ 'filters' => $request->only(['q']),
+ ]);
+ }
+
+ /**
+  * Show the rule creation form, pre-filled with an unsaved rule carrying the default settings.
+  */
+ public function create(): View
+ {
+     // Default extraction settings: XPath mode with a name/summary field pair and AI limits.
+     $defaultExtractor = [
+         'mode' => 'xpath',
+         'list_link_xpath' => '//a/@href',
+         'fields' => [
+             'name' => '//h1/text()',
+             'summary' => '//meta[@name="description"]/@content',
+         ],
+         'ai' => [
+             'temperature' => 0,
+             'content_max_chars' => 12000,
+         ],
+     ];
+
+     $draft = new CrawlRule([
+         'enabled' => true,
+         'target_module' => CrawlTargetModule::Tool,
+         'cron_expression' => '0 */6 * * *',
+         'timezone' => 'Asia/Shanghai',
+         'max_pages' => 50,
+         'rate_limit_per_minute' => 30,
+         'retry_max' => 3,
+         'retry_backoff_seconds' => 60,
+         'extractor_config' => $defaultExtractor,
+         'mapping_config' => [],
+         'dedupe_config' => [],
+         'publish_policy' => 'draft',
+         'ai_provider' => 'openai_compatible',
+         'ai_fallback_enabled' => false,
+     ]);
+
+     return view('admin.crawlers.form', [
+         'item' => $draft,
+         'method' => 'POST',
+         'submitRoute' => route('admin.crawlers.store'),
+     ]);
+ }
+
+ /**
+  * Create a crawl rule from the normalized form payload, then compute and store its next run time.
+  */
+ public function store(CrawlRuleRequest $request): RedirectResponse
+ {
+ $payload = $request->normalizedPayload();
+ // NOTE(review): author columns are always written as null — presumably no admin user id is available; confirm.
+ $payload['created_by'] = null;
+ $payload['updated_by'] = null;
+
+ $item = CrawlRule::query()->create($payload);
+ // The schedule service receives the already-created model, so next_run_at is written with a second save.
+ $item->next_run_at = $this->scheduleService->nextRunAt($item);
+ $item->save();
+
+ return redirect()->route('admin.crawlers.edit', $item)->with('status', '采集规则已创建。');
+ }
+
+ /**
+  * Show the rule form for an existing crawl rule, submitting via PUT to the update route.
+  */
+ public function edit(CrawlRule $crawler): View
+ {
+ return view('admin.crawlers.form', [
+ 'item' => $crawler,
+ 'method' => 'PUT',
+ 'submitRoute' => route('admin.crawlers.update', $crawler),
+ ]);
+ }
+
+ /**
+  * Apply the normalized form payload to an existing rule, recompute its next run time and save it.
+  */
+ public function update(CrawlRuleRequest $request, CrawlRule $crawler): RedirectResponse
+ {
+ $payload = $request->normalizedPayload();
+ // NOTE(review): the editor is always stored as null — presumably no admin user id is available; confirm.
+ $payload['updated_by'] = null;
+
+ $crawler->fill($payload);
+ // Computed after fill() so the schedule service sees the new cron expression and timezone.
+ $crawler->next_run_at = $this->scheduleService->nextRunAt($crawler);
+ $crawler->save();
+
+ return redirect()->route('admin.crawlers.edit', $crawler)->with('status', '采集规则已更新。');
+ }
+
+ /**
+  * Dispatch a manually triggered crawl job for the rule and redirect to that rule's run list.
+  */
+ public function run(CrawlRule $crawler): RedirectResponse
+ {
+ RunCrawlRuleJob::dispatch($crawler->id, CrawlTriggerType::Manual->value);
+
+ return redirect()->route('admin.crawl-runs.index', ['rule_id' => $crawler->id])
+ ->with('status', '已提交手动执行任务。');
+ }
+
+ /**
+  * Fetch a page on behalf of the admin UI and return its sanitized HTML and title as JSON.
+  *
+  * Responds with HTTP 422 and `ok: false` when the URL is rejected by the safety
+  * check or when the page cannot be fetched.
+  */
+ public function preview(Request $request): JsonResponse
+ {
+     $input = $request->validate([
+         'url' => ['required', 'url', 'max:2000'],
+         'user_agent' => ['nullable', 'string', 'max:255'],
+     ]);
+
+     $targetUrl = (string) $input['url'];
+
+     // Shared shape for every failure response of this endpoint.
+     $reject = static fn (string $message): JsonResponse => response()->json([
+         'ok' => false,
+         'message' => $message,
+     ], 422);
+
+     if (! $this->isSafePreviewUrl($targetUrl)) {
+         return $reject('预览地址不安全,已拒绝请求。');
+     }
+
+     $result = $this->fetchHtml($targetUrl, $input['user_agent'] ?? null);
+     if (! $result['ok']) {
+         return $reject('页面抓取失败:'.($result['error'] ?? 'unknown'));
+     }
+
+     $cleanHtml = $this->sanitizePreviewHtml($result['body']);
+
+     return response()->json([
+         'ok' => true,
+         'url' => $targetUrl,
+         'title' => $this->extractTitle($cleanHtml),
+         'html' => $cleanHtml,
+     ]);
+ }
+
+ /**
+  * Ask the AI extractor to propose an extractor configuration for the page at the given URL.
+  *
+  * The checks run in this order: request validation, AI configuration, URL safety,
+  * page fetch. Every failure is reported as HTTP 422 with `ok: false` and a message.
+  */
+ public function aiSuggestExtractor(Request $request): JsonResponse
+ {
+     $input = $request->validate([
+         'url' => ['required', 'url', 'max:2000'],
+         'target_module' => ['required', 'in:tool,model'],
+         'user_agent' => ['nullable', 'string', 'max:255'],
+         'ai_model' => ['nullable', 'string', 'max:128'],
+         'ai_system_prompt' => ['nullable', 'string', 'max:4000'],
+         'ai_user_prompt' => ['nullable', 'string', 'max:4000'],
+         'ai_temperature' => ['nullable', 'numeric', 'between:0,2'],
+         'ai_content_max_chars' => ['nullable', 'integer', 'between:500,50000'],
+     ]);
+
+     // Shared shape for every failure response of this endpoint.
+     $reject = static fn (string $message): JsonResponse => response()->json([
+         'ok' => false,
+         'message' => $message,
+     ], 422);
+
+     if (! $this->aiExtractor->isConfigured()) {
+         return $reject('AI 配置缺失,请先设置 CRAWLER_AI_KEY 与接口地址。');
+     }
+
+     $targetUrl = (string) $input['url'];
+
+     if (! $this->isSafePreviewUrl($targetUrl)) {
+         return $reject('目标 URL 不安全,已拒绝请求。');
+     }
+
+     $page = $this->fetchHtml($targetUrl, $input['user_agent'] ?? null);
+     if (! $page['ok']) {
+         return $reject('页面抓取失败:'.($page['error'] ?? 'unknown'));
+     }
+
+     // Only non-blank overrides are forwarded; option key => request field.
+     $options = [];
+     $textOverrides = [
+         'model' => 'ai_model',
+         'system_prompt' => 'ai_system_prompt',
+         'user_prompt' => 'ai_user_prompt',
+     ];
+
+     foreach ($textOverrides as $optionKey => $field) {
+         $raw = $input[$field] ?? null;
+         if (! is_string($raw)) {
+             continue;
+         }
+
+         $clean = trim($raw);
+         if ($clean !== '') {
+             $options[$optionKey] = $clean;
+         }
+     }
+
+     if (($input['ai_temperature'] ?? '') !== '') {
+         $options['temperature'] = (float) $input['ai_temperature'];
+     }
+
+     if (($input['ai_content_max_chars'] ?? '') !== '') {
+         $options['content_max_chars'] = (int) $input['ai_content_max_chars'];
+     }
+
+     $suggestion = $this->aiExtractor->suggestExtractorConfig(
+         (string) $input['target_module'],
+         $this->sanitizePreviewHtml($page['body']),
+         $options,
+     );
+
+     if ($suggestion !== []) {
+         return response()->json([
+             'ok' => true,
+             'extractor_config' => $suggestion,
+         ]);
+     }
+
+     // An empty suggestion means failure; surface the extractor's own error when it has one.
+     $reason = $this->aiExtractor->lastError();
+     $hasReason = $reason !== null && $reason !== '';
+
+     return $reject(
+         $hasReason
+             ? 'AI 生成失败:'.$reason
+             : 'AI 未生成有效规则,请调整页面或提示词后重试。'
+     );
+ }
+
+ /**
+  * Fetch a page body over HTTP with up to three attempts.
+  *
+  * Server errors (5xx) and thrown exceptions are retried after a pause of
+  * 250ms x attempt number; any other non-successful status ends the loop at once.
+  * Every redirect target is re-checked with isSafePreviewUrl() so that a public
+  * URL cannot bounce the request to an internal address; a rejected redirect is
+  * reported as a failure and is not retried.
+  *
+  * @param string $url URL to fetch; callers are expected to have validated it already
+  * @param string|null $userAgent Optional User-Agent override; blank values fall back to the configured default
+  * @return array{ok: bool, body: string, error: string|null}
+  */
+ private function fetchHtml(string $url, ?string $userAgent = null): array
+ {
+     $ua = is_string($userAgent) && trim($userAgent) !== ''
+         ? trim($userAgent)
+         : (string) config('crawler.default_user_agent', 'AIWebCrawler/1.0');
+
+     $maxAttempts = 3;
+     $lastError = 'unknown';
+     $redirectBlocked = false;
+
+     // Guzzle invokes on_redirect before following each redirect; throwing aborts the request.
+     $redirectPolicy = [
+         'max' => 5,
+         'on_redirect' => function ($originalRequest, $redirectResponse, $nextUri) use (&$redirectBlocked): void {
+             if (! $this->isSafePreviewUrl((string) $nextUri)) {
+                 $redirectBlocked = true;
+
+                 throw new \RuntimeException('redirect target is not a public address');
+             }
+         },
+     ];
+
+     for ($attempt = 1; $attempt <= $maxAttempts; $attempt++) {
+         try {
+             $request = Http::timeout(max((int) config('crawler.request_timeout_seconds', 20), 5));
+             if (! (bool) config('crawler.verify_ssl', true)) {
+                 $request = $request->withoutVerifying();
+             }
+
+             $request = $this->applyNetworkOptions($request)
+                 ->withOptions(['allow_redirects' => $redirectPolicy]);
+             $response = $request->withUserAgent($ua)->get($url);
+
+             if ($response->successful()) {
+                 return [
+                     'ok' => true,
+                     'body' => $response->body(),
+                     'error' => null,
+                 ];
+             }
+
+             $lastError = sprintf('HTTP %d', $response->status());
+             if ($attempt < $maxAttempts && $response->serverError()) {
+                 usleep(250000 * $attempt);
+                 continue;
+             }
+
+             break;
+         } catch (\Throwable $exception) {
+             $lastError = $exception->getMessage();
+
+             // A rejected redirect is a policy decision, not a transient failure.
+             if ($redirectBlocked) {
+                 break;
+             }
+
+             if ($attempt < $maxAttempts) {
+                 usleep(250000 * $attempt);
+                 continue;
+             }
+         }
+     }
+
+     return [
+         'ok' => false,
+         'body' => '',
+         'error' => $lastError,
+     ];
+ }
+
+ /**
+  * Apply the optional crawler network settings (forced IPv4, custom DNS servers) to a pending request.
+  *
+  * Returns the request untouched when neither setting is active. The DNS option is
+  * only set when the cURL build defines CURLOPT_DNS_SERVERS.
+  */
+ private function applyNetworkOptions(\Illuminate\Http\Client\PendingRequest $request): \Illuminate\Http\Client\PendingRequest
+ {
+     $clientOptions = [];
+
+     if ((bool) config('crawler.force_ipv4', false)) {
+         $clientOptions['force_ip_resolve'] = 'v4';
+     }
+
+     $dnsList = trim((string) config('crawler.dns_servers', ''));
+     if (defined('CURLOPT_DNS_SERVERS') && $dnsList !== '') {
+         $clientOptions['curl'] = [CURLOPT_DNS_SERVERS => $dnsList];
+     }
+
+     return $clientOptions === []
+         ? $request
+         : $request->withOptions($clientOptions);
+ }
+
+ /**
+  * Decide whether a user-supplied URL may be fetched by the server (SSRF guard).
+  *
+  * A URL is accepted only when it uses http/https, its host is not a localhost
+  * name, and every address the host resolves to is public according to
+  * isPublicIp(). Hosts that cannot be resolved are rejected rather than
+  * accepted, so names that only the HTTP client can resolve cannot slip through.
+  *
+  * NOTE(review): resolution here and in the HTTP client are separate lookups, so
+  * a DNS answer that changes between the two is not covered by this check.
+  */
+ private function isSafePreviewUrl(string $url): bool
+ {
+     $parts = parse_url($url);
+     if (! is_array($parts)) {
+         return false;
+     }
+
+     $scheme = strtolower((string) ($parts['scheme'] ?? ''));
+
+     // parse_url() keeps the brackets of IPv6 literals ("[::1]") and any trailing
+     // root dot ("localhost."); strip both so the checks below see the bare host.
+     $host = strtolower((string) ($parts['host'] ?? ''));
+     $host = rtrim(trim($host, '[]'), '.');
+
+     if (! in_array($scheme, ['http', 'https'], true) || $host === '') {
+         return false;
+     }
+
+     if ($host === 'localhost' || str_ends_with($host, '.localhost')) {
+         return false;
+     }
+
+     if (filter_var($host, FILTER_VALIDATE_IP) !== false) {
+         return $this->isPublicIp($host);
+     }
+
+     $addresses = [];
+
+     // dns_get_record() raises a warning on lookup failure; a failed lookup is
+     // handled below, so the warning is suppressed deliberately.
+     $records = @dns_get_record($host, DNS_A | DNS_AAAA);
+     foreach (is_array($records) ? $records : [] as $record) {
+         $ip = (string) ($record['ip'] ?? $record['ipv6'] ?? '');
+         if ($ip !== '') {
+             $addresses[] = $ip;
+         }
+     }
+
+     if ($addresses === []) {
+         // Fall back to the system resolver, which also honours the hosts file.
+         $resolved = gethostbynamel($host);
+         $addresses = is_array($resolved) ? $resolved : [];
+     }
+
+     if ($addresses === []) {
+         // Fail closed: an unresolvable host cannot be proven public.
+         return false;
+     }
+
+     foreach ($addresses as $ip) {
+         if (! $this->isPublicIp($ip)) {
+             return false;
+         }
+     }
+
+     return true;
+ }
+
+ /**
+  * Tell whether the string is a valid IP address outside the private and reserved ranges.
+  */
+ private function isPublicIp(string $ip): bool
+ {
+     $rejectNonPublic = FILTER_FLAG_NO_PRIV_RANGE | FILTER_FLAG_NO_RES_RANGE;
+     $validated = filter_var($ip, FILTER_VALIDATE_IP, $rejectNonPublic);
+
+     return $validated !== false;
+ }
+
+ /**
+  * Strip active content from fetched HTML so it can be shown in the admin preview.
+  *
+  * Removes script/noscript/iframe/object/embed/base elements, every `on*` event
+  * handler attribute, `srcdoc`, and URL attributes whose value uses the
+  * `javascript:` or `vbscript:` scheme. The input is parsed as UTF-8 and the
+  * serialized result is capped at 300000 characters.
+  *
+  * @param string $html Raw page markup
+  * @return string Sanitized markup, or a short placeholder when the input is blank
+  */
+ private function sanitizePreviewHtml(string $html): string
+ {
+     if (trim($html) === '') {
+         return '<p>空页面</p>';
+     }
+
+     $dom = new \DOMDocument('1.0', 'UTF-8');
+
+     // Collect parser errors internally, then restore the previous libxml mode
+     // so this call does not change global state for later code.
+     $previousErrorMode = libxml_use_internal_errors(true);
+     // The XML declaration makes libxml decode the markup as UTF-8 instead of ISO-8859-1.
+     $dom->loadHTML('<?xml encoding="UTF-8">'.$html, LIBXML_NOWARNING | LIBXML_NOERROR);
+     libxml_clear_errors();
+     libxml_use_internal_errors($previousErrorMode);
+
+     // Drop the helper declaration again so it is not part of the output.
+     foreach (iterator_to_array($dom->childNodes) as $child) {
+         if ($child->nodeType === XML_PI_NODE) {
+             $dom->removeChild($child);
+         }
+     }
+     $dom->encoding = 'UTF-8';
+
+     $xpath = new \DOMXPath($dom);
+
+     foreach (['//script', '//noscript', '//iframe', '//object', '//embed', '//base'] as $query) {
+         $nodes = $xpath->query($query);
+         if ($nodes === false) {
+             continue;
+         }
+
+         // Walk backwards: the list is live and shrinks as nodes are removed.
+         for ($index = $nodes->length - 1; $index >= 0; $index--) {
+             $node = $nodes->item($index);
+             if ($node !== null && $node->parentNode !== null) {
+                 $node->parentNode->removeChild($node);
+             }
+         }
+     }
+
+     $urlAttributes = ['href', 'src', 'action', 'formaction', 'xlink:href'];
+
+     $allNodes = $xpath->query('//*');
+     if ($allNodes !== false) {
+         foreach ($allNodes as $node) {
+             if (! $node instanceof \DOMElement) {
+                 continue;
+             }
+
+             // Collect first, remove afterwards: the attribute map must not change while iterated.
+             $unsafeAttributes = [];
+             foreach ($node->attributes as $attribute) {
+                 $name = strtolower($attribute->nodeName);
+
+                 if (str_starts_with($name, 'on') || $name === 'srcdoc') {
+                     $unsafeAttributes[] = $attribute;
+                     continue;
+                 }
+
+                 if (in_array($name, $urlAttributes, true)) {
+                     // Browsers ignore whitespace and control characters inside the scheme.
+                     $compact = (string) preg_replace('/[\x00-\x20]+/', '', $attribute->value);
+                     if (preg_match('/^(?:javascript|vbscript):/i', $compact) === 1) {
+                         $unsafeAttributes[] = $attribute;
+                     }
+                 }
+             }
+
+             foreach ($unsafeAttributes as $attribute) {
+                 $node->removeAttributeNode($attribute);
+             }
+         }
+     }
+
+     $output = (string) $dom->saveHTML();
+
+     return mb_substr($output, 0, 300000);
+ }
+
+ /**
+  * Return the text of the first <title> element in the markup, or an empty string when there is none.
+  */
+ private function extractTitle(string $html): string
+ {
+     // Anchor on the opening <title ...> tag; matching from an arbitrary ">" would
+     // capture everything between the first tag of the document and </title>.
+     if (preg_match('/<title\b[^>]*>(.*?)<\/title>/is', $html, $matches) !== 1) {
+         return '';
+     }
+
+     return trim(strip_tags((string) $matches[1]));
+ }
+}
diff --git a/app/Http/Requests/Admin/CrawlRuleRequest.php b/app/Http/Requests/Admin/CrawlRuleRequest.php
new file mode 100644
index 0000000..2b73d72
--- /dev/null
+++ b/app/Http/Requests/Admin/CrawlRuleRequest.php
@@ -0,0 +1,254 @@
+decodeJsonToArray($this->input('extractor_json'));
+
+ $this->merge([
+ 'enabled' => $this->boolean('enabled'),
+ 'render_js' => $this->boolean('render_js'),
+ 'ai_fallback_enabled' => $this->boolean('ai_fallback_enabled'),
+ 'extractor_mode' => $this->input('extractor_mode') ?: (string) ($extractorConfig['mode'] ?? 'xpath'),
+ ]);
+ }
+
+ /**
+  * Validation rules for creating or updating a crawl rule.
+  *
+  * Beyond the basic type and range checks, `entry_urls` must contain at least one
+  * line that survives parseEntryUrls() (otherwise the rule would be stored with no
+  * entry point), and `timezone` must be a timezone identifier known to PHP.
+  *
+  * @return array<string, array<int, mixed>>
+  */
+ public function rules(): array
+ {
+     return [
+         'name' => ['required', 'string', 'max:150'],
+         'target_module' => ['required', Rule::in(array_column(CrawlTargetModule::cases(), 'value'))],
+         'enabled' => ['nullable', 'boolean'],
+         'entry_urls' => [
+             'required',
+             'string',
+             function (string $attribute, mixed $value, \Closure $fail): void {
+                 // Non-string input is already reported by the "string" rule above.
+                 if (is_string($value) && $this->parseEntryUrls($value) === []) {
+                     $fail('请至少填写一个合法的入口 URL。');
+                 }
+             },
+         ],
+         'cron_expression' => ['required', 'string', 'max:64'],
+         'timezone' => ['required', 'string', 'max:64', 'timezone'],
+         'max_pages' => ['required', 'integer', 'between:1,2000'],
+         'render_js' => ['nullable', 'boolean'],
+         'user_agent' => ['nullable', 'string', 'max:255'],
+         'headers_json' => ['nullable', 'json'],
+         'cookies_json' => ['nullable', 'json'],
+         'proxy' => ['nullable', 'string', 'max:255'],
+         'rate_limit_per_minute' => ['required', 'integer', 'between:1,2000'],
+         'retry_max' => ['required', 'integer', 'between:1,10'],
+         'retry_backoff_seconds' => ['required', 'integer', 'between:1,3600'],
+         'extractor_json' => ['required', 'json'],
+         'extractor_mode' => ['required', Rule::in(['xpath', 'ai', 'hybrid'])],
+         'mapping_json' => ['nullable', 'json'],
+         'dedupe_json' => ['nullable', 'json'],
+         'ai_fallback_enabled' => ['nullable', 'boolean'],
+         'ai_provider' => ['nullable', 'string', 'max:64'],
+         'ai_model' => ['nullable', 'string', 'max:128'],
+         'ai_system_prompt' => ['nullable', 'string', 'max:4000'],
+         'ai_user_prompt' => ['nullable', 'string', 'max:4000'],
+         'ai_temperature' => ['nullable', 'numeric', 'between:0,2'],
+         'ai_content_max_chars' => ['nullable', 'integer', 'between:500,50000'],
+         'publish_policy' => ['required', Rule::in(['draft'])],
+         'alert_email' => ['nullable', 'email'],
+     ];
+ }
+
+ /**
+  * Custom validation messages, keyed by "field.rule".
+  *
+  * @return array<string, string>
+  */
+ public function messages(): array
+ {
+ return [
+ 'name.required' => '请填写规则名称。',
+ 'target_module.required' => '请选择目标模块。',
+ 'entry_urls.required' => '请至少填写一个入口 URL。',
+ 'cron_expression.required' => '请填写 Cron 表达式。',
+ 'timezone.required' => '请填写时区。',
+ 'max_pages.required' => '请填写最大页面数。',
+ 'max_pages.integer' => '最大页面数必须是整数。',
+ 'max_pages.between' => '最大页面数需在 1 到 2000 之间。',
+ 'rate_limit_per_minute.required' => '请填写每分钟限流值。',
+ 'rate_limit_per_minute.integer' => '每分钟限流值必须是整数。',
+ 'rate_limit_per_minute.between' => '每分钟限流值需在 1 到 2000 之间。',
+ 'retry_max.required' => '请填写最大重试次数。',
+ 'retry_max.integer' => '最大重试次数必须是整数。',
+ 'retry_max.between' => '最大重试次数需在 1 到 10 之间。',
+ 'retry_backoff_seconds.required' => '请填写重试退避秒数。',
+ 'retry_backoff_seconds.integer' => '重试退避秒数必须是整数。',
+ 'retry_backoff_seconds.between' => '重试退避秒数需在 1 到 3600 之间。',
+ 'extractor_json.required' => '请填写 Extractor JSON。',
+ 'extractor_json.json' => 'Extractor JSON 格式不合法。',
+ 'extractor_mode.required' => '请选择抽取模式。',
+ 'extractor_mode.in' => '抽取模式仅支持 xpath、ai、hybrid。',
+ 'mapping_json.json' => 'Mapping JSON 格式不合法。',
+ 'dedupe_json.json' => 'Dedupe JSON 格式不合法。',
+ 'headers_json.json' => 'Headers JSON 格式不合法。',
+ 'cookies_json.json' => 'Cookies JSON 格式不合法。',
+ 'ai_temperature.between' => 'AI 温度需在 0 到 2 之间。',
+ 'ai_content_max_chars.between' => 'AI 内容截断长度需在 500 到 50000 之间。',
+ 'alert_email.email' => '告警邮箱格式不合法。',
+ ];
+ }
+
+ /**
+  * Human-readable field labels, keyed by field name.
+  *
+  * @return array<string, string>
+  */
+ public function attributes(): array
+ {
+ return [
+ 'name' => '规则名称',
+ 'target_module' => '目标模块',
+ 'entry_urls' => '入口 URL',
+ 'cron_expression' => 'Cron 表达式',
+ 'timezone' => '时区',
+ 'max_pages' => '最大页面数',
+ 'rate_limit_per_minute' => '每分钟限流',
+ 'retry_max' => '最大重试次数',
+ 'retry_backoff_seconds' => '重试退避秒数',
+ 'extractor_json' => 'Extractor JSON',
+ 'extractor_mode' => '抽取模式',
+ 'mapping_json' => 'Mapping JSON',
+ 'dedupe_json' => 'Dedupe JSON',
+ 'headers_json' => 'Headers JSON',
+ 'cookies_json' => 'Cookies JSON',
+ 'ai_system_prompt' => 'AI 系统提示词',
+ 'ai_user_prompt' => 'AI 用户提示词',
+ 'ai_temperature' => 'AI 温度',
+ 'ai_content_max_chars' => 'AI 内容截断长度',
+ 'alert_email' => '告警邮箱',
+ ];
+ }
+
+ /**
+  * Convert the validated form input into the attribute array stored on a crawl rule.
+  *
+  * JSON text fields are decoded to arrays, blank strings become null, numeric
+  * fields are cast to int, and the extractor configuration receives the chosen
+  * mode plus the AI settings built from the dedicated AI form fields (the `ai`
+  * key is dropped when those fields yield nothing).
+  *
+  * @return array<string, mixed>
+  */
+ public function normalizedPayload(): array
+ {
+     $validated = $this->validated();
+
+     $extractor = $this->decodeJsonToArray($validated['extractor_json'] ?? null);
+
+     // Explicit form field first, then the mode embedded in the JSON, then the default.
+     $mode = (string) ($validated['extractor_mode'] ?? ($extractor['mode'] ?? 'xpath'));
+     $extractor['mode'] = in_array($mode, ['xpath', 'ai', 'hybrid'], true) ? $mode : 'xpath';
+
+     $ai = $this->buildAiConfig($validated);
+     if ($ai === []) {
+         unset($extractor['ai']);
+     } else {
+         $extractor['ai'] = $ai;
+     }
+
+     return [
+         'name' => $validated['name'],
+         'target_module' => $validated['target_module'],
+         'enabled' => (bool) ($validated['enabled'] ?? false),
+         'entry_urls' => $this->parseEntryUrls((string) ($validated['entry_urls'] ?? '')),
+         'cron_expression' => trim((string) $validated['cron_expression']),
+         'timezone' => trim((string) $validated['timezone']),
+         'max_pages' => (int) $validated['max_pages'],
+         'render_js' => (bool) ($validated['render_js'] ?? false),
+         'user_agent' => $this->nullableTrim($validated['user_agent'] ?? null),
+         'headers' => $this->decodeJsonToArray($validated['headers_json'] ?? null),
+         'cookies' => $this->decodeJsonToArray($validated['cookies_json'] ?? null),
+         'proxy' => $this->nullableTrim($validated['proxy'] ?? null),
+         'rate_limit_per_minute' => (int) $validated['rate_limit_per_minute'],
+         'retry_max' => (int) $validated['retry_max'],
+         'retry_backoff_seconds' => (int) $validated['retry_backoff_seconds'],
+         'extractor_config' => $extractor,
+         'mapping_config' => $this->decodeJsonToArray($validated['mapping_json'] ?? null),
+         'dedupe_config' => $this->decodeJsonToArray($validated['dedupe_json'] ?? null),
+         'ai_fallback_enabled' => (bool) ($validated['ai_fallback_enabled'] ?? false),
+         'ai_provider' => $this->nullableTrim($validated['ai_provider'] ?? null),
+         'ai_model' => $this->nullableTrim($validated['ai_model'] ?? null),
+         'publish_policy' => (string) $validated['publish_policy'],
+         'alert_email' => $this->nullableTrim($validated['alert_email'] ?? null),
+     ];
+ }
+
+ /**
+  * Collect the AI-related form fields into the `ai` section of the extractor configuration.
+  *
+  * Only fields that carry a value are included, so an empty result means no AI
+  * setting was provided.
+  *
+  * @param array<string, mixed> $payload Validated request data
+  * @return array<string, mixed>
+  */
+ private function buildAiConfig(array $payload): array
+ {
+     $config = [];
+
+     // Prompts: config key => form field; blank text is skipped.
+     foreach (['system_prompt' => 'ai_system_prompt', 'user_prompt' => 'ai_user_prompt'] as $key => $field) {
+         $text = $this->nullableTrim($payload[$field] ?? null);
+         if ($text !== null) {
+             $config[$key] = $text;
+         }
+     }
+
+     if (($payload['ai_temperature'] ?? '') !== '') {
+         $config['temperature'] = (float) $payload['ai_temperature'];
+     }
+
+     if (($payload['ai_content_max_chars'] ?? '') !== '') {
+         $config['content_max_chars'] = (int) $payload['ai_content_max_chars'];
+     }
+
+     $model = $this->nullableTrim($payload['ai_model'] ?? null);
+     if ($model !== null) {
+         $config['model'] = $model;
+     }
+
+     return $config;
+ }
+
+ /**
+  * Trim a string value; anything that is not a string, or is blank after trimming, becomes null.
+  */
+ private function nullableTrim(mixed $value): ?string
+ {
+     if (is_string($value)) {
+         $clean = trim($value);
+         if ($clean !== '') {
+             return $clean;
+         }
+     }
+
+     return null;
+ }
+
+ /**
+  * Split the multi-line entry URL field into a de-duplicated list of crawlable URLs.
+  *
+  * Blank lines and lines that are not valid URLs are dropped. Only http and https
+  * URLs are kept: FILTER_VALIDATE_URL alone also accepts schemes such as ftp or
+  * file, which are not usable as crawler entry points.
+  *
+  * @return list<string>
+  */
+ private function parseEntryUrls(string $entryUrls): array
+ {
+     $lines = preg_split('/\r\n|\r|\n/', $entryUrls) ?: [];
+
+     $urls = [];
+     foreach ($lines as $line) {
+         $candidate = trim($line);
+         if ($candidate === '' || filter_var($candidate, FILTER_VALIDATE_URL) === false) {
+             continue;
+         }
+
+         $scheme = strtolower((string) parse_url($candidate, PHP_URL_SCHEME));
+         if (! in_array($scheme, ['http', 'https'], true)) {
+             continue;
+         }
+
+         $urls[] = $candidate;
+     }
+
+     // array_unique() keeps the original keys; re-index so the result is a list.
+     return array_values(array_unique($urls));
+ }
+
+ /**
+  * Decode a JSON string into an array.
+  *
+  * Returns an empty array for non-string or blank input, for invalid JSON, and
+  * for JSON that decodes to something other than an array.
+  *
+  * @return array<mixed>
+  */
+ private function decodeJsonToArray(mixed $value): array
+ {
+     if (is_string($value) && trim($value) !== '') {
+         $decoded = json_decode($value, true);
+         if (is_array($decoded)) {
+             return $decoded;
+         }
+     }
+
+     return [];
+ }
+}
\ No newline at end of file
diff --git a/app/Jobs/RunCrawlRuleJob.php b/app/Jobs/RunCrawlRuleJob.php
new file mode 100644
index 0000000..f560e4b
--- /dev/null
+++ b/app/Jobs/RunCrawlRuleJob.php
@@ -0,0 +1,49 @@
+find($this->ruleId);
+
+ if (! $rule instanceof CrawlRule) {
+ return;
+ }
+
+ $trigger = CrawlTriggerType::tryFrom($this->triggerType) ?? CrawlTriggerType::Manual;
+
+ $metrics = [];
+ if ($this->retryFromRunId !== null) {
+ $metrics['retry_from_run_id'] = $this->retryFromRunId;
+ }
+
+ $executionService->runRule($rule, $trigger, $this->createdBy, $metrics);
+ }
+}
+
diff --git a/app/Models/CrawlAlert.php b/app/Models/CrawlAlert.php
new file mode 100644
index 0000000..e923a96
--- /dev/null
+++ b/app/Models/CrawlAlert.php
@@ -0,0 +1,48 @@
+ CrawlAlertSeverity::class,
+ 'context' => 'array',
+ 'is_resolved' => 'boolean',
+ 'resolved_at' => 'datetime',
+ ];
+ }
+
+ /**
+  * The crawl run this alert belongs to, linked through the `run_id` column.
+  */
+ public function run(): BelongsTo
+ {
+ return $this->belongsTo(CrawlRun::class, 'run_id');
+ }
+
+ /**
+  * The crawl rule this alert belongs to, linked through the `rule_id` column.
+  */
+ public function rule(): BelongsTo
+ {
+ return $this->belongsTo(CrawlRule::class, 'rule_id');
+ }
+}
+
diff --git a/app/Models/CrawlRule.php b/app/Models/CrawlRule.php
new file mode 100644
index 0000000..e0f3f69
--- /dev/null
+++ b/app/Models/CrawlRule.php
@@ -0,0 +1,74 @@
+ CrawlTargetModule::class,
+ 'enabled' => 'boolean',
+ 'entry_urls' => 'array',
+ 'headers' => 'array',
+ 'cookies' => 'array',
+ 'extractor_config' => 'array',
+ 'mapping_config' => 'array',
+ 'dedupe_config' => 'array',
+ 'render_js' => 'boolean',
+ 'ai_fallback_enabled' => 'boolean',
+ 'last_run_at' => 'datetime',
+ 'next_run_at' => 'datetime',
+ ];
+ }
+
+ /**
+  * All crawl runs recorded for this rule, linked through their `rule_id` column.
+  */
+ public function runs(): HasMany
+ {
+ return $this->hasMany(CrawlRun::class, 'rule_id');
+ }
+
+ /**
+  * All alerts recorded for this rule, linked through their `rule_id` column.
+  */
+ public function alerts(): HasMany
+ {
+ return $this->hasMany(CrawlAlert::class, 'rule_id');
+ }
+}
+
diff --git a/app/Models/CrawlRun.php b/app/Models/CrawlRun.php
new file mode 100644
index 0000000..b0d7d1e
--- /dev/null
+++ b/app/Models/CrawlRun.php
@@ -0,0 +1,59 @@
+ CrawlTriggerType::class,
+ 'status' => CrawlRunStatus::class,
+ 'started_at' => 'datetime',
+ 'finished_at' => 'datetime',
+ 'metrics' => 'array',
+ ];
+ }
+
+ /**
+  * The crawl rule this run belongs to, linked through the `rule_id` column.
+  */
+ public function rule(): BelongsTo
+ {
+ return $this->belongsTo(CrawlRule::class, 'rule_id');
+ }
+
+ /**
+  * The per-item records of this run, linked through their `run_id` column.
+  */
+ public function items(): HasMany
+ {
+ return $this->hasMany(CrawlRunItem::class, 'run_id');
+ }
+
+ /**
+  * The alerts raised during this run, linked through their `run_id` column.
+  */
+ public function alerts(): HasMany
+ {
+ return $this->hasMany(CrawlAlert::class, 'run_id');
+ }
+}
+
diff --git a/app/Models/CrawlRunItem.php b/app/Models/CrawlRunItem.php
new file mode 100644
index 0000000..ae48acb
--- /dev/null
+++ b/app/Models/CrawlRunItem.php
@@ -0,0 +1,46 @@
+ CrawlRunItemStatus::class,
+ 'raw_payload' => 'array',
+ 'normalized_payload' => 'array',
+ 'upsert_result' => 'array',
+ ];
+ }
+
+ /**
+  * The crawl run this item belongs to, linked through the `run_id` column.
+  */
+ public function run(): BelongsTo
+ {
+ return $this->belongsTo(CrawlRun::class, 'run_id');
+ }
+}
+
diff --git a/app/Services/Crawler/CrawlAlertService.php b/app/Services/Crawler/CrawlAlertService.php
new file mode 100644
index 0000000..45e6f88
--- /dev/null
+++ b/app/Services/Crawler/CrawlAlertService.php
@@ -0,0 +1,73 @@
+create([
+ 'run_id' => $run?->id,
+ 'rule_id' => $rule?->id,
+ 'severity' => $severity,
+ 'type' => $type,
+ 'message' => $message,
+ 'context' => $context,
+ 'is_resolved' => false,
+ ]);
+
+ $recipient = $rule?->alert_email ?: config('crawler.default_alert_email');
+
+ if (is_string($recipient) && $recipient !== '') {
+ try {
+ Mail::raw($this->buildEmailBody($alert), static function ($mail) use ($recipient, $severity): void {
+ $mail->to($recipient)
+ ->subject(sprintf('[Crawler][%s] 采集告警', strtoupper($severity->value)));
+ });
+ } catch (\Throwable $exception) {
+ Log::warning('Crawler alert email failed', [
+ 'alert_id' => $alert->id,
+ 'error' => $exception->getMessage(),
+ ]);
+ }
+ }
+
+ return $alert;
+ }
+
+ /**
+  * Render the plain-text body of an alert e-mail.
+  *
+  * The body lists severity, type, message, rule id, run id and creation time on
+  * separate lines, followed by the pretty-printed JSON context when the alert
+  * carries a non-empty context array.
+  */
+ private function buildEmailBody(CrawlAlert $alert): string
+ {
+     $body = implode("\n", [
+         '采集告警通知',
+         sprintf('等级: %s', $alert->severity?->value ?? 'unknown'),
+         sprintf('类型: %s', $alert->type),
+         sprintf('信息: %s', $alert->message),
+         sprintf('规则ID: %s', (string) ($alert->rule_id ?? '-')),
+         sprintf('运行ID: %s', (string) ($alert->run_id ?? '-')),
+         sprintf('时间: %s', (string) $alert->created_at),
+     ]);
+
+     $context = $alert->context;
+     if (! is_array($context) || $context === []) {
+         return $body;
+     }
+
+     // Fall back to an empty object when the context cannot be encoded.
+     $encoded = json_encode($context, JSON_UNESCAPED_UNICODE | JSON_PRETTY_PRINT) ?: '{}';
+
+     return $body."\n".'上下文:'."\n".$encoded;
+ }
+}
+
diff --git a/app/Services/Crawler/CrawlEntityUpsertService.php b/app/Services/Crawler/CrawlEntityUpsertService.php
new file mode 100644
index 0000000..8488b8e
--- /dev/null
+++ b/app/Services/Crawler/CrawlEntityUpsertService.php
@@ -0,0 +1,277 @@
+ $payload
+ * @return array
+ */
+    public function upsert(CrawlRule $rule, array $payload, string $detailUrl): array
+    {
+        // Apply the rule's field mapping (ignored unless it is an array), then route the
+        // mapped payload by target module: 'model' goes to the model upsert, anything
+        // else (including a missing target module) is treated as a tool.
+        $mappingConfig = is_array($rule->mapping_config) ? $rule->mapping_config : [];
+        $mappedPayload = $this->applyMapping($payload, $mappingConfig);
+
+        if ($rule->target_module?->value === 'model') {
+            return $this->upsertModel($mappedPayload, $detailUrl);
+        }
+
+        return $this->upsertTool($mappedPayload, $detailUrl);
+    }
+
+    /**
+     * Copy payload values onto additional keys according to a target => source mapping.
+     *
+     * The original payload keys are kept; each mapping entry whose source key exists in
+     * the payload adds (or overwrites) the target key with that value. Entries whose
+     * target or source is not a string are ignored.
+     *
+     * @param array<string, mixed> $payload
+     * @param array<array-key, mixed> $mapping
+     * @return array<string, mixed>
+     */
+    private function applyMapping(array $payload, array $mapping): array
+    {
+        $mappedPayload = $payload;
+
+        foreach ($mapping as $targetKey => $sourceKey) {
+            $isUsable = is_string($targetKey)
+                && is_string($sourceKey)
+                && array_key_exists($sourceKey, $payload);
+
+            if ($isUsable) {
+                // Always read from the untouched input so mappings cannot chain off each other.
+                $mappedPayload[$targetKey] = $payload[$sourceKey];
+            }
+        }
+
+        return $mappedPayload;
+    }
+
+    /**
+     * Create or update a Tool record from a mapped crawl payload.
+     *
+     * An existing tool is looked up by URL (official or canonical column equal to the
+     * resolved official URL) when that URL is non-empty, otherwise by slug or
+     * case-insensitive name. Every upsert writes status Draft and refreshes
+     * last_verified_at.
+     *
+     * @param array<string, mixed> $payload
+     * @return array<string, mixed> action ('created'|'updated'), entity class, entity id and name
+     *
+     * @throws \RuntimeException when the payload has no usable name
+     */
+    private function upsertTool(array $payload, string $detailUrl): array
+    {
+        $name = trim((string) ($payload['name'] ?? ''));
+        if ($name === '') {
+            throw new \RuntimeException('Tool payload missing name');
+        }
+
+        // Slug preference: explicit payload slug, then the name, then a random fallback
+        // for names that slugify to nothing.
+        $rawSlug = trim((string) ($payload['slug'] ?? ''));
+        $slug = Str::slug($rawSlug !== '' ? $rawSlug : $name);
+        if ($slug === '') {
+            $slug = 'tool-'.Str::lower(Str::random(8));
+        }
+
+        $officialUrl = trim((string) ($payload['official_url'] ?? $payload['url'] ?? $detailUrl));
+        $canonicalUrl = trim((string) ($payload['canonical_url'] ?? ''));
+
+        $summary = trim((string) ($payload['summary'] ?? ''));
+        if ($summary === '') {
+            $summary = mb_substr(trim((string) ($payload['description'] ?? $name)), 0, 240);
+        }
+
+        $source = $this->resolveSource($officialUrl !== '' ? $officialUrl : $detailUrl);
+        $categoryId = $this->resolveCategoryId('tool', $payload);
+
+        $lookup = Tool::query();
+        if ($officialUrl !== '') {
+            $lookup->where('official_url', $officialUrl)->orWhere('canonical_url', $officialUrl);
+        } else {
+            $lookup->where('slug', $slug)->orWhereRaw('LOWER(name) = ?', [mb_strtolower($name)]);
+        }
+        $existing = $lookup->first();
+
+        $action = $existing === null ? 'created' : 'updated';
+        $tool = $existing ?? new Tool();
+
+        $attributes = [
+            'category_id' => $categoryId,
+            'source_id' => $source?->id,
+            'name' => $name,
+            'slug' => $this->resolveUniqueSlug(Tool::class, $slug, $tool->id),
+            'summary' => mb_substr($summary, 0, 260),
+            'description' => (string) ($payload['description'] ?? ''),
+            'official_url' => $officialUrl !== '' ? $officialUrl : null,
+            'logo_url' => (string) ($payload['logo_url'] ?? ''),
+            'pricing_type' => (string) ($payload['pricing_type'] ?? 'unknown'),
+            'platform' => (string) ($payload['platform'] ?? ''),
+            'language' => (string) ($payload['language'] ?? ''),
+            'has_api' => (bool) ($payload['has_api'] ?? false),
+            'source_level' => $source?->trust_level ?? SourceLevel::Unknown,
+            'status' => EntityStatus::Draft,
+            'canonical_url' => $canonicalUrl !== '' ? $canonicalUrl : null,
+            'last_verified_at' => now(),
+        ];
+
+        $tool->fill($attributes);
+        $tool->save();
+
+        return [
+            'action' => $action,
+            'entity' => Tool::class,
+            'entity_id' => $tool->id,
+            'name' => $tool->name,
+        ];
+    }
+
+    /**
+     * Create or update an AiModel record from a mapped crawl payload.
+     *
+     * An existing model is looked up by its canonical_url column, probing both the
+     * payload's canonical URL and the resolved official URL; when neither is available
+     * the lookup falls back to slug or case-insensitive name. Scores are clamped to
+     * 0..100, the scoring service is applied before saving, and every upsert writes
+     * status Draft and refreshes last_verified_at.
+     *
+     * @param array<string, mixed> $payload
+     * @return array<string, mixed> action ('created'|'updated'), entity class, entity id and name
+     *
+     * @throws \RuntimeException when the payload has no usable name
+     */
+    private function upsertModel(array $payload, string $detailUrl): array
+    {
+        $name = trim((string) ($payload['name'] ?? ''));
+        if ($name === '') {
+            throw new \RuntimeException('Model payload missing name');
+        }
+
+        $slug = trim((string) ($payload['slug'] ?? ''));
+        $slug = $slug !== '' ? Str::slug($slug) : Str::slug($name);
+        $slug = $slug !== '' ? $slug : 'model-'.Str::lower(Str::random(8));
+
+        $summary = trim((string) ($payload['summary'] ?? ''));
+        if ($summary === '') {
+            $summary = mb_substr(trim((string) ($payload['description'] ?? $name)), 0, 240);
+        }
+
+        $officialUrl = trim((string) ($payload['official_url'] ?? $payload['url'] ?? $detailUrl));
+        $canonicalUrl = trim((string) ($payload['canonical_url'] ?? ''));
+
+        $source = $this->resolveSource($officialUrl !== '' ? $officialUrl : $detailUrl);
+        $categoryId = $this->resolveCategoryId('model', $payload);
+
+        // The value persisted in canonical_url (see the fill() below) is the payload's
+        // canonical URL when present and the official URL otherwise. A re-crawl therefore
+        // has to probe both candidates: matching on the official URL alone never finds a
+        // record that was stored under a distinct canonical URL and inserts a duplicate.
+        $lookupUrls = array_values(array_unique(array_filter(
+            [$canonicalUrl, $officialUrl],
+            static fn (string $url): bool => $url !== '',
+        )));
+
+        $entity = AiModel::query()
+            ->when($lookupUrls !== [], static function ($query) use ($lookupUrls): void {
+                $query->whereIn('canonical_url', $lookupUrls);
+            }, static function ($query) use ($slug, $name): void {
+                $query->where('slug', $slug)->orWhereRaw('LOWER(name) = ?', [mb_strtolower($name)]);
+            })
+            ->first();
+
+        $action = $entity === null ? 'created' : 'updated';
+
+        $entity ??= new AiModel();
+
+        $entity->fill([
+            'category_id' => $categoryId,
+            'source_id' => $source?->id,
+            'name' => $name,
+            'slug' => $this->resolveUniqueSlug(AiModel::class, $slug, $entity->id),
+            'provider' => (string) ($payload['provider'] ?? ''),
+            'summary' => mb_substr($summary, 0, 260),
+            'description' => (string) ($payload['description'] ?? ''),
+            'modality' => (string) ($payload['modality'] ?? 'text'),
+            'context_window' => $this->toNullableInt($payload['context_window'] ?? null),
+            'price_input' => $this->toNullableFloat($payload['price_input'] ?? null),
+            'price_output' => $this->toNullableFloat($payload['price_output'] ?? null),
+            'deployment_mode' => (string) ($payload['deployment_mode'] ?? 'api'),
+            'effectiveness_score' => $this->boundedScore($payload['effectiveness_score'] ?? 60),
+            'price_score' => $this->boundedScore($payload['price_score'] ?? 60),
+            'speed_score' => $this->boundedScore($payload['speed_score'] ?? 60),
+            'source_level' => $source?->trust_level ?? SourceLevel::Unknown,
+            'status' => EntityStatus::Draft,
+            'canonical_url' => $canonicalUrl !== '' ? $canonicalUrl : ($officialUrl !== '' ? $officialUrl : null),
+            'last_verified_at' => now(),
+        ]);
+
+        // Scoring runs on the filled (not yet persisted) entity, then everything is saved once.
+        $this->modelScoringService->apply($entity);
+        $entity->save();
+
+        return [
+            'action' => $action,
+            'entity' => AiModel::class,
+            'entity_id' => $entity->id,
+            'name' => $entity->name,
+        ];
+    }
+
+    /**
+     * Find the Source whose `domain` equals the host part of the given URL.
+     *
+     * Returns null when the URL has no parsable host or no source matches.
+     */
+    private function resolveSource(string $url): ?Source
+    {
+        $host = parse_url($url, PHP_URL_HOST);
+        $hasHost = is_string($host) && $host !== '';
+
+        return $hasHost
+            ? Source::query()->where('domain', $host)->first()
+            : null;
+    }
+
+    /**
+     * Resolve a category id from the payload's `category_slug` (preferred) or `category`.
+     *
+     * The candidate is matched against either the slug or the name of categories of
+     * the given type. Returns null when the payload names no category or none matches.
+     *
+     * @param array<string, mixed> $payload
+     */
+    private function resolveCategoryId(string $type, array $payload): ?int
+    {
+        $needle = trim((string) ($payload['category_slug'] ?? $payload['category'] ?? ''));
+        if ($needle === '') {
+            return null;
+        }
+
+        $matchesSlugOrName = static function ($query) use ($needle): void {
+            $query->where('slug', $needle)->orWhere('name', $needle);
+        };
+
+        $match = Category::query()
+            ->where('type', $type)
+            ->where($matchesSlugOrName)
+            ->first();
+
+        return $match?->id;
+    }
+
+    /**
+     * Return the first slug not used by another row of the given model.
+     *
+     * Tries the base slug, then `{slug}-1`, `{slug}-2`, ... until no row (other than
+     * the one identified by $exceptId, if given) has that slug.
+     *
+     * @param class-string<\Illuminate\Database\Eloquent\Model> $modelClass
+     */
+    private function resolveUniqueSlug(string $modelClass, string $slug, ?int $exceptId = null): string
+    {
+        $isTaken = static function (string $candidate) use ($modelClass, $exceptId): bool {
+            $query = $modelClass::query();
+
+            if ($exceptId !== null) {
+                $query->where('id', '!=', $exceptId);
+            }
+
+            return $query->where('slug', $candidate)->exists();
+        };
+
+        $candidate = $slug;
+        for ($counter = 1; $isTaken($candidate); $counter++) {
+            $candidate = sprintf('%s-%d', $slug, $counter);
+        }
+
+        return $candidate;
+    }
+
+    /**
+     * Cast a value to int and clamp it into the 0..100 range.
+     */
+    private function boundedScore(mixed $value): int
+    {
+        $asInt = (int) $value;
+
+        if ($asInt < 0) {
+            return 0;
+        }
+
+        return $asInt > 100 ? 100 : $asInt;
+    }
+
+    /**
+     * Cast a value to int, mapping null and the empty string to null.
+     */
+    private function toNullableInt(mixed $value): ?int
+    {
+        $isBlank = $value === null || $value === '';
+
+        return $isBlank ? null : (int) $value;
+    }
+
+    /**
+     * Cast a value to float, mapping null and the empty string to null.
+     */
+    private function toNullableFloat(mixed $value): ?float
+    {
+        $isBlank = $value === null || $value === '';
+
+        return $isBlank ? null : (float) $value;
+    }
+}
+
diff --git a/app/Services/Crawler/CrawlExecutionService.php b/app/Services/Crawler/CrawlExecutionService.php
new file mode 100644
index 0000000..2364726
--- /dev/null
+++ b/app/Services/Crawler/CrawlExecutionService.php
@@ -0,0 +1,335 @@
+ $metrics
+ */
+ public function runRule(
+ CrawlRule $rule,
+ CrawlTriggerType $triggerType,
+ ?int $createdBy = null,
+ array $metrics = [],
+ ): CrawlRun {
+ // Execute one crawl rule end to end: fetch each entry (list) URL, extract detail
+ // URLs, fetch and extract each detail page, upsert the result, and record one run
+ // item per step. Returns the refreshed CrawlRun with final status and counters.
+ // The run row is created up front in Running state so items can attach to it.
+ $run = CrawlRun::query()->create([
+ 'rule_id' => $rule->id,
+ 'trigger_type' => $triggerType,
+ 'status' => CrawlRunStatus::Running,
+ 'started_at' => now(),
+ 'metrics' => $metrics,
+ 'created_by' => $createdBy,
+ ]);
+
+ $successCount = 0;
+ $failedCount = 0;
+ $skippedCount = 0;
+ $totalUrls = 0;
+ $errors = [];
+
+ // Keep only entry URLs that are strings and pass FILTER_VALIDATE_URL.
+ $entryUrls = collect($rule->entry_urls)
+ ->filter(static fn ($url): bool => is_string($url) && filter_var($url, FILTER_VALIDATE_URL) !== false)
+ ->values()
+ ->all();
+
+ if ($entryUrls === []) {
+ $errors[] = 'No valid entry urls configured';
+ }
+
+ // Cap on the number of detail URLs processed, counted across all entry URLs.
+ $maxPages = max(1, (int) $rule->max_pages);
+
+ foreach ($entryUrls as $entryUrl) {
+ [$listResult, $listAttempt] = $this->fetchWithRetry($rule, $entryUrl);
+
+ // A failed list fetch counts as a failure and skips this entry URL entirely.
+ if (! $listResult['ok']) {
+ $failedCount++;
+ $errors[] = sprintf('List fetch failed: %s', (string) ($listResult['error'] ?? 'unknown'));
+ $this->createRunItem($run, [
+ 'url' => $entryUrl,
+ 'stage' => 'list',
+ 'attempt' => $listAttempt,
+ 'status' => CrawlRunItemStatus::Failed,
+ 'latency_ms' => $listResult['latency_ms'] ?? null,
+ 'http_code' => $listResult['http_code'] ?? null,
+ 'error_code' => 'fetch_failed',
+ 'error_message' => (string) ($listResult['error'] ?? 'Fetch failed'),
+ ]);
+
+ continue;
+ }
+
+ $this->createRunItem($run, [
+ 'url' => $entryUrl,
+ 'stage' => 'list',
+ 'attempt' => $listAttempt,
+ 'status' => CrawlRunItemStatus::Success,
+ 'latency_ms' => $listResult['latency_ms'] ?? null,
+ 'http_code' => $listResult['http_code'] ?? null,
+ ]);
+
+ $detailUrls = $this->extractor->extractListUrls(
+ $listResult['body'],
+ $entryUrl,
+ is_array($rule->extractor_config) ? $rule->extractor_config : [],
+ );
+
+ // No links extracted: treat the entry URL itself as the single detail page
+ // (it is fetched again below).
+ if ($detailUrls === []) {
+ $detailUrls = [$entryUrl];
+ }
+
+ foreach ($detailUrls as $detailUrl) {
+ // Page budget exhausted: leave both loops, remaining entry URLs are not visited.
+ if ($totalUrls >= $maxPages) {
+ break 2;
+ }
+
+ $totalUrls++;
+ [$detailResult, $detailAttempt] = $this->fetchWithRetry($rule, $detailUrl);
+
+ if (! $detailResult['ok']) {
+ $failedCount++;
+ $errors[] = sprintf('Detail fetch failed(%s): %s', $detailUrl, (string) ($detailResult['error'] ?? 'unknown'));
+
+ $this->createRunItem($run, [
+ 'url' => $detailUrl,
+ 'stage' => 'detail',
+ 'attempt' => $detailAttempt,
+ 'status' => CrawlRunItemStatus::Failed,
+ 'latency_ms' => $detailResult['latency_ms'] ?? null,
+ 'http_code' => $detailResult['http_code'] ?? null,
+ 'error_code' => 'fetch_failed',
+ 'error_message' => (string) ($detailResult['error'] ?? 'Fetch failed'),
+ ]);
+
+ continue;
+ }
+
+ $extracted = $this->extractPayload($rule, $detailResult['body']);
+ $missing = $this->missingRequiredFields($rule, $extracted);
+
+ // Pages lacking required fields are recorded as skipped, not failed; they add
+ // nothing to $errors and only the HTML length is stored as raw payload.
+ if ($missing !== []) {
+ $skippedCount++;
+ $this->createRunItem($run, [
+ 'url' => $detailUrl,
+ 'stage' => 'extract',
+ 'attempt' => $detailAttempt,
+ 'status' => CrawlRunItemStatus::Skipped,
+ 'latency_ms' => $detailResult['latency_ms'] ?? null,
+ 'http_code' => $detailResult['http_code'] ?? null,
+ 'error_code' => 'missing_fields',
+ 'error_message' => 'Missing required fields: '.implode(', ', $missing),
+ 'raw_payload' => ['html_length' => mb_strlen($detailResult['body'])],
+ 'normalized_payload' => $extracted,
+ ]);
+
+ continue;
+ }
+
+ // Any exception from the upsert is contained to this URL and recorded as a failed item.
+ try {
+ $upsertResult = $this->upsertService->upsert($rule, $extracted, $detailUrl);
+ $successCount++;
+
+ $this->createRunItem($run, [
+ 'url' => $detailUrl,
+ 'stage' => 'upsert',
+ 'attempt' => $detailAttempt,
+ 'status' => CrawlRunItemStatus::Success,
+ 'latency_ms' => $detailResult['latency_ms'] ?? null,
+ 'http_code' => $detailResult['http_code'] ?? null,
+ 'normalized_payload' => $extracted,
+ 'upsert_result' => $upsertResult,
+ ]);
+ } catch (\Throwable $exception) {
+ $failedCount++;
+ $errors[] = sprintf('Upsert failed(%s): %s', $detailUrl, $exception->getMessage());
+
+ $this->createRunItem($run, [
+ 'url' => $detailUrl,
+ 'stage' => 'upsert',
+ 'attempt' => $detailAttempt,
+ 'status' => CrawlRunItemStatus::Failed,
+ 'latency_ms' => $detailResult['latency_ms'] ?? null,
+ 'http_code' => $detailResult['http_code'] ?? null,
+ 'error_code' => 'upsert_failed',
+ 'error_message' => $exception->getMessage(),
+ 'normalized_payload' => $extracted,
+ ]);
+ }
+ }
+ }
+
+ // Persist the final status and counters; the error summary is the joined error
+ // list truncated via Str::limit(..., 1000).
+ $status = $this->finalizeStatus($successCount, $failedCount, $errors);
+ $run->fill([
+ 'status' => $status,
+ 'finished_at' => now(),
+ 'total_urls' => $totalUrls,
+ 'success_count' => $successCount,
+ 'failed_count' => $failedCount,
+ 'skipped_count' => $skippedCount,
+ 'error_summary' => $errors !== [] ? Str::limit(implode(' | ', $errors), 1000) : null,
+ 'metrics' => array_merge($metrics, ['entry_url_count' => count($entryUrls)]),
+ ]);
+ $run->save();
+
+ // Stamp the rule with this run and compute its next scheduled run.
+ $rule->last_run_at = now();
+ $rule->next_run_at = $this->scheduleService->nextRunAt($rule);
+ $rule->save();
+
+ // Alert on any failure or recorded error: Error severity when at least one item
+ // failed, Warning otherwise. Only the first 10 errors are attached as context.
+ if ($failedCount > 0 || $errors !== []) {
+ $this->alertService->notify(
+ $failedCount > 0 ? CrawlAlertSeverity::Error : CrawlAlertSeverity::Warning,
+ 'run_failed_or_partial',
+ sprintf('规则[%s]执行完成,成功%d,失败%d,跳过%d', $rule->name, $successCount, $failedCount, $skippedCount),
+ $rule,
+ $run,
+ [
+ 'errors' => array_slice($errors, 0, 10),
+ ],
+ );
+ }
+
+ return $run->refresh();
+ }
+
+    /**
+     * Fetch a URL, retrying failed attempts with a linearly growing delay.
+     *
+     * At least one attempt is always made; the rule's `retry_max` is the total number
+     * of attempts. Between attempts the method sleeps `retry_backoff_seconds * attempt`
+     * seconds, capped at 15. Returns the last fetch result together with the number of
+     * the attempt that produced it.
+     *
+     * @return array{0: array{ok: bool, http_code: int|null, body: string, error: string|null, latency_ms: int}, 1: int}
+     */
+    private function fetchWithRetry(CrawlRule $rule, string $url): array
+    {
+        $attemptLimit = max(1, (int) $rule->retry_max);
+        $baseDelay = max(1, (int) $rule->retry_backoff_seconds);
+
+        $attempt = 0;
+
+        while (true) {
+            $attempt++;
+            $result = $this->fetcher->fetch($rule, $url);
+
+            // Stop on success, or once the attempt budget is used up.
+            if ($result['ok'] || $attempt >= $attemptLimit) {
+                return [$result, $attempt];
+            }
+
+            sleep(min($baseDelay * $attempt, 15));
+        }
+    }
+
+    /**
+     * Extract a field payload from detail-page HTML according to the rule's mode.
+     *
+     * Modes (from `extractor_config.mode`, unknown values fall back to 'xpath'):
+     *  - 'ai':     AI extraction only.
+     *  - 'hybrid': XPath and AI both run; XPath values win on key conflicts.
+     *  - 'xpath':  XPath only, unless the rule enables AI fallback and required fields
+     *              are missing, in which case a non-empty AI result is merged underneath
+     *              the XPath values.
+     *
+     * @return array<string, mixed>
+     */
+    private function extractPayload(CrawlRule $rule, string $html): array
+    {
+        $config = is_array($rule->extractor_config) ? $rule->extractor_config : [];
+
+        $mode = strtolower((string) ($config['mode'] ?? 'xpath'));
+        $mode = in_array($mode, ['xpath', 'ai', 'hybrid'], true) ? $mode : 'xpath';
+
+        $aiOptions = is_array($config['ai'] ?? null) ? $config['ai'] : [];
+
+        if ($mode === 'ai') {
+            return $this->aiFallbackExtractor->extract($rule, $html, $aiOptions);
+        }
+
+        $fromXPath = $this->extractor->extractFields($html, $config);
+
+        // Hybrid always consults the AI; plain xpath does so only as an enabled fallback
+        // when the XPath result lacks required fields.
+        $needsAi = $mode === 'hybrid'
+            || ($rule->ai_fallback_enabled && $this->missingRequiredFields($rule, $fromXPath) !== []);
+
+        if (! $needsAi) {
+            return $fromXPath;
+        }
+
+        $fromAi = $this->aiFallbackExtractor->extract($rule, $html, $aiOptions);
+
+        if ($mode === 'hybrid' || $fromAi !== []) {
+            // Later array wins in array_merge, so XPath values override AI values.
+            return array_merge($fromAi, $fromXPath);
+        }
+
+        return $fromXPath;
+    }
+
+    /**
+     * List the required fields that are absent from a payload.
+     *
+     * Model rules require name, summary, modality and deployment_mode; all other rules
+     * require name and summary. A field counts as missing unless its value is a string
+     * that is non-empty after trimming.
+     *
+     * @param array<string, mixed> $payload
+     * @return list<string>
+     */
+    private function missingRequiredFields(CrawlRule $rule, array $payload): array
+    {
+        $requiredFields = ['name', 'summary'];
+        if ($rule->target_module?->value === 'model') {
+            $requiredFields = ['name', 'summary', 'modality', 'deployment_mode'];
+        }
+
+        $isMissing = static function (string $field) use ($payload): bool {
+            $value = Arr::get($payload, $field);
+
+            return ! is_string($value) || trim($value) === '';
+        };
+
+        return array_values(array_filter($requiredFields, $isMissing));
+    }
+
+    /**
+     * Derive the final run status from the counters.
+     *
+     * No successes means Failed; successes with no failures and no recorded errors
+     * means Completed; any other mix means Partial.
+     *
+     * @param list<string> $errors
+     */
+    private function finalizeStatus(int $successCount, int $failedCount, array $errors): CrawlRunStatus
+    {
+        if ($successCount <= 0) {
+            return CrawlRunStatus::Failed;
+        }
+
+        $isClean = $failedCount === 0 && $errors === [];
+
+        return $isClean ? CrawlRunStatus::Completed : CrawlRunStatus::Partial;
+    }
+
+    /**
+     * Persist one run item attached to the given run.
+     *
+     * @param array<string, mixed> $attributes
+     */
+    private function createRunItem(CrawlRun $run, array $attributes): CrawlRunItem
+    {
+        $item = $run->items()->create($attributes);
+
+        return $item;
+    }
+}
\ No newline at end of file
diff --git a/app/Services/Crawler/CrawlFetcherService.php b/app/Services/Crawler/CrawlFetcherService.php
new file mode 100644
index 0000000..eb139a4
--- /dev/null
+++ b/app/Services/Crawler/CrawlFetcherService.php
@@ -0,0 +1,103 @@
+render_js && is_string(config('crawler.browserless_endpoint')) && config('crawler.browserless_endpoint') !== '') {
+ $response = $this->browserlessRequest($rule)->post((string) config('crawler.browserless_endpoint'), [
+ 'url' => $url,
+ 'waitUntil' => 'networkidle2',
+ ]);
+ } else {
+ $response = $this->httpRequest($rule)->get($url);
+ }
+
+ return [
+ 'ok' => $response->successful(),
+ 'http_code' => $response->status(),
+ 'body' => $response->body(),
+ 'error' => $response->successful() ? null : sprintf('HTTP %d', $response->status()),
+ 'latency_ms' => (int) ((microtime(true) - $startedAt) * 1000),
+ ];
+ } catch (\Throwable $exception) {
+ return [
+ 'ok' => false,
+ 'http_code' => null,
+ 'body' => '',
+ 'error' => $exception->getMessage(),
+ 'latency_ms' => (int) ((microtime(true) - $startedAt) * 1000),
+ ];
+ }
+ }
+
+    /**
+     * Build the pending HTTP request for a rule.
+     *
+     * Applies, in order: timeout (configured value, minimum 5 seconds), the rule's
+     * headers, the user agent (rule value or configured default), optional TLS
+     * verification bypass, network options, and finally the rule's cookies scoped to
+     * the host of the rule's first entry URL.
+     */
+    private function httpRequest(CrawlRule $rule): PendingRequest
+    {
+        $timeoutSeconds = max((int) config('crawler.request_timeout_seconds', 20), 5);
+        $userAgent = (string) ($rule->user_agent ?: config('crawler.default_user_agent'));
+
+        $pending = Http::timeout($timeoutSeconds)
+            ->withHeaders(is_array($rule->headers) ? $rule->headers : [])
+            ->withUserAgent($userAgent);
+
+        $verifySsl = (bool) config('crawler.verify_ssl', true);
+        if (! $verifySsl) {
+            $pending = $pending->withoutVerifying();
+        }
+
+        $pending = $this->applyNetworkOptions($pending);
+
+        $cookieValues = is_array($rule->cookies) ? $rule->cookies : [];
+        if ($cookieValues === []) {
+            return $pending;
+        }
+
+        // Cookie domain is taken from the first entry URL; empty string when it has no host.
+        $cookieDomain = parse_url((string) ($rule->entry_urls[0] ?? ''), PHP_URL_HOST) ?: '';
+
+        return $pending->withCookies($cookieValues, $cookieDomain);
+    }
+
+    /**
+     * Apply the configured low-level network options to a pending request.
+     *
+     * Sets `force_ip_resolve` to 'v4' when `crawler.force_ipv4` is on, and passes
+     * `crawler.dns_servers` through CURLOPT_DNS_SERVERS when that constant is defined.
+     * The request is returned unchanged when neither option applies.
+     */
+    private function applyNetworkOptions(PendingRequest $request): PendingRequest
+    {
+        $clientOptions = [];
+
+        $forceIpv4 = (bool) config('crawler.force_ipv4', false);
+        if ($forceIpv4) {
+            $clientOptions['force_ip_resolve'] = 'v4';
+        }
+
+        $dnsServers = trim((string) config('crawler.dns_servers', ''));
+        $canSetDns = $dnsServers !== '' && defined('CURLOPT_DNS_SERVERS');
+        if ($canSetDns) {
+            $clientOptions['curl'][CURLOPT_DNS_SERVERS] = $dnsServers;
+        }
+
+        return $clientOptions === []
+            ? $request
+            : $request->withOptions($clientOptions);
+    }
+
+    /**
+     * Build the pending request used for the JS-rendering (browserless) endpoint.
+     *
+     * Starts from the regular rule request and adds the configured browserless token
+     * as a bearer token when one is set.
+     */
+    private function browserlessRequest(CrawlRule $rule): PendingRequest
+    {
+        $pending = $this->httpRequest($rule);
+        $bearer = (string) config('crawler.browserless_token', '');
+
+        return $bearer === ''
+            ? $pending
+            : $pending->withToken($bearer);
+    }
+}
+
diff --git a/app/Services/Crawler/CrawlRuleScheduleService.php b/app/Services/Crawler/CrawlRuleScheduleService.php
new file mode 100644
index 0000000..c1332ee
--- /dev/null
+++ b/app/Services/Crawler/CrawlRuleScheduleService.php
@@ -0,0 +1,44 @@
+enabled) {
+ return false;
+ }
+
+ $now ??= CarbonImmutable::now($rule->timezone ?: 'Asia/Shanghai');
+
+ try {
+ $cron = new CronExpression($rule->cron_expression);
+ } catch (\Throwable) {
+ return false;
+ }
+
+ return $cron->isDue($now);
+ }
+
+    /**
+     * Compute the rule's next run time from its cron expression, expressed in UTC.
+     *
+     * The reference time defaults to "now" in the rule's timezone ('Asia/Shanghai'
+     * when the rule has none). Returns null when the cron expression cannot be parsed
+     * or evaluated.
+     */
+    public function nextRunAt(CrawlRule $rule, ?CarbonImmutable $from = null): ?CarbonImmutable
+    {
+        $reference = $from ?? CarbonImmutable::now($rule->timezone ?: 'Asia/Shanghai');
+
+        try {
+            $expression = new CronExpression($rule->cron_expression);
+            $nextRun = CarbonImmutable::instance($expression->getNextRunDate($reference));
+        } catch (\Throwable) {
+            return null;
+        }
+
+        return $nextRun->setTimezone('UTC');
+    }
+}
+
diff --git a/app/Services/Crawler/OpenAiFallbackExtractor.php b/app/Services/Crawler/OpenAiFallbackExtractor.php
new file mode 100644
index 0000000..c7871fb
--- /dev/null
+++ b/app/Services/Crawler/OpenAiFallbackExtractor.php
@@ -0,0 +1,494 @@
+ $options
+ * @return array
+ */
+ public function extract(CrawlRule $rule, string $html, array $options = []): array
+ {
+ $this->lastError = null;
+
+ $credentials = $this->resolveCredentials();
+ if ($credentials === null) {
+ $this->lastError = 'AI credentials not configured.';
+ return [];
+ }
+
+ $ruleAiOptions = is_array($rule->extractor_config['ai'] ?? null)
+ ? $rule->extractor_config['ai']
+ : [];
+ $mergedOptions = array_merge($ruleAiOptions, $options);
+
+ $model = $this->resolveModel($rule->ai_model, $mergedOptions);
+ $temperature = $this->resolveTemperature($mergedOptions);
+ $contentMaxChars = $this->resolveContentMaxChars($mergedOptions, 12000);
+
+ $targetSchema = $rule->target_module?->value === 'tool'
+ ? 'name, summary, official_url, pricing_type, platform, language, description, logo_url'
+ : 'name, summary, provider, modality, deployment_mode, context_window, price_input, price_output, description';
+
+ $defaultUserPrompt = <<resolveSystemPrompt($mergedOptions, '你是一个精确的信息抽取引擎。');
+ $userPrompt = $this->resolveUserPrompt($mergedOptions, $defaultUserPrompt);
+
+ $content = $this->requestAiContent(
+ credentials: $credentials,
+ model: $model,
+ temperature: $temperature,
+ systemPrompt: $systemPrompt,
+ userPrompt: $userPrompt,
+ html: $html,
+ contentMaxChars: $contentMaxChars,
+ stripTags: true,
+ );
+
+ return $this->decodeJsonContent($content);
+ }
+
+    /**
+     * Whether both an AI endpoint and an API key can be resolved from configuration.
+     */
+    public function isConfigured(): bool
+    {
+        $credentials = $this->resolveCredentials();
+
+        return $credentials !== null;
+    }
+
+    /**
+     * Error message recorded by the most recent AI call, or null when none was recorded.
+     */
+    public function lastError(): ?string
+    {
+        $message = $this->lastError;
+
+        return $message;
+    }
+
+ /**
+ * @param array $options
+ * @return array
+ */
+ public function suggestExtractorConfig(string $targetModule, string $html, array $options = []): array
+ {
+ $this->lastError = null;
+
+ $credentials = $this->resolveCredentials();
+ if ($credentials === null) {
+ $this->lastError = 'AI credentials not configured.';
+ return [];
+ }
+
+ $targetModule = in_array($targetModule, ['tool', 'model'], true) ? $targetModule : 'tool';
+ $fields = $targetModule === 'tool'
+ ? ['name', 'summary', 'official_url', 'pricing_type', 'platform', 'language', 'description', 'logo_url']
+ : ['name', 'summary', 'provider', 'modality', 'deployment_mode', 'context_window', 'price_input', 'price_output', 'description'];
+
+ $defaultUserPrompt = <<implodeFields($fields)}。
+PROMPT;
+
+ $model = $this->resolveModel((string) ($options['model'] ?? null), $options);
+ $temperature = $this->resolveTemperature($options);
+ $contentMaxChars = $this->resolveContentMaxChars($options, 16000);
+
+ $systemPrompt = $this->resolveSystemPrompt($options, '你是 XPath 规则设计专家,擅长从 HTML 生成稳定的抽取规则。');
+ $userPrompt = $this->resolveUserPrompt($options, $defaultUserPrompt);
+
+ $content = $this->requestAiContent(
+ credentials: $credentials,
+ model: $model,
+ temperature: $temperature,
+ systemPrompt: $systemPrompt,
+ userPrompt: $userPrompt,
+ html: $html,
+ contentMaxChars: $contentMaxChars,
+ stripTags: false,
+ );
+
+ $decoded = $this->decodeJsonContent($content);
+ if (! is_array($decoded)) {
+ $this->lastError = $this->lastError ?: 'AI response is not valid JSON.';
+ return [];
+ }
+
+ $fieldsConfig = is_array($decoded['fields'] ?? null) ? $decoded['fields'] : [];
+ if ($fieldsConfig === []) {
+ $this->lastError = $this->lastError ?: 'AI response does not include fields config.';
+ return [];
+ }
+
+ return [
+ 'list_link_xpath' => is_string($decoded['list_link_xpath'] ?? null) ? $decoded['list_link_xpath'] : '',
+ 'fields' => $fieldsConfig,
+ ];
+ }
+
+    /**
+     * Collect the AI connection settings from configuration.
+     *
+     * Returns null when either the endpoint or the API key is empty.
+     *
+     * @return array{endpoint: string, api_key: string, wire_api: string, disable_response_storage: bool, reasoning_effort: string}|null
+     */
+    private function resolveCredentials(): ?array
+    {
+        $endpoint = $this->resolveEndpoint();
+        $apiKey = (string) config('crawler.openai_compatible_key', '');
+
+        $isIncomplete = $endpoint === '' || $apiKey === '';
+        if ($isIncomplete) {
+            return null;
+        }
+
+        $storageDisabled = (bool) config('crawler.openai_disable_response_storage', false);
+        $reasoningEffort = trim((string) config('crawler.openai_reasoning_effort', ''));
+
+        return [
+            'endpoint' => $endpoint,
+            'api_key' => $apiKey,
+            'wire_api' => $this->resolveWireApi(),
+            'disable_response_storage' => $storageDisabled,
+            'reasoning_effort' => $reasoningEffort,
+        ];
+    }
+
+    /**
+     * Resolve the full AI endpoint URL.
+     *
+     * An explicitly configured endpoint is used verbatim. Otherwise the endpoint is
+     * derived from the configured base URL by appending the route for the active wire
+     * API (`/v1/responses` or `/v1/chat/completions`). Returns an empty string when
+     * neither setting is present.
+     */
+    private function resolveEndpoint(): string
+    {
+        $configuredEndpoint = trim((string) config('crawler.openai_compatible_endpoint', ''));
+        if ($configuredEndpoint !== '') {
+            return $configuredEndpoint;
+        }
+
+        $baseUrl = trim((string) config('crawler.openai_compatible_base_url', ''));
+        if ($baseUrl === '') {
+            return '';
+        }
+
+        $baseUrl = rtrim($baseUrl, '/');
+
+        // OpenAI-style base URLs are commonly written with the version segment already
+        // included (".../v1"). Drop it so the route appended below does not become
+        // ".../v1/v1/...".
+        if (str_ends_with(strtolower($baseUrl), '/v1')) {
+            $baseUrl = substr($baseUrl, 0, -3);
+        }
+
+        return $this->resolveWireApi() === 'responses'
+            ? $baseUrl.'/v1/responses'
+            : $baseUrl.'/v1/chat/completions';
+    }
+
+    /**
+     * Normalise the configured wire API name.
+     *
+     * Only 'responses' (case-insensitive, trimmed) selects the Responses API; every
+     * other value yields 'chat_completions'.
+     */
+    private function resolveWireApi(): string
+    {
+        $configured = (string) config('crawler.openai_wire_api', 'chat_completions');
+
+        return match (strtolower(trim($configured))) {
+            'responses' => 'responses',
+            default => 'chat_completions',
+        };
+    }
+
+    /**
+     * Pick the AI model name.
+     *
+     * Precedence: a non-blank string `model` option, then the non-blank rule model,
+     * then the configured default ('gpt-4o-mini' when unset).
+     *
+     * @param string|null $ruleModel
+     * @param array<string, mixed> $options
+     */
+    private function resolveModel(?string $ruleModel, array $options): string
+    {
+        $optionModel = $options['model'] ?? null;
+        $fromOptions = is_string($optionModel) ? trim($optionModel) : '';
+        if ($fromOptions !== '') {
+            return $fromOptions;
+        }
+
+        $fromRule = $ruleModel === null ? '' : trim($ruleModel);
+        if ($fromRule !== '') {
+            return $fromRule;
+        }
+
+        return (string) config('crawler.openai_default_model', 'gpt-4o-mini');
+    }
+
+    /**
+     * Read the sampling temperature from the options, clamped to 0.0..2.0.
+     *
+     * A missing or non-numeric value yields 0.0.
+     *
+     * @param array<string, mixed> $options
+     */
+    private function resolveTemperature(array $options): float
+    {
+        $raw = $options['temperature'] ?? null;
+
+        $requested = 0.0;
+        if (is_numeric($raw)) {
+            $requested = (float) $raw;
+        }
+
+        return max(0.0, min(2.0, $requested));
+    }
+
+    /**
+     * Read the maximum number of page characters sent to the AI, clamped to 500..50000.
+     *
+     * A missing or non-numeric `content_max_chars` option falls back to $default
+     * (which is clamped as well).
+     *
+     * @param array<string, mixed> $options
+     */
+    private function resolveContentMaxChars(array $options, int $default): int
+    {
+        $raw = $options['content_max_chars'] ?? null;
+
+        $requested = $default;
+        if (is_numeric($raw)) {
+            $requested = (int) $raw;
+        }
+
+        return max(500, min(50000, $requested));
+    }
+
+    /**
+     * Return the trimmed `system_prompt` option, or $default when it is not a
+     * non-blank string.
+     *
+     * @param array<string, mixed> $options
+     */
+    private function resolveSystemPrompt(array $options, string $default): string
+    {
+        $custom = $options['system_prompt'] ?? null;
+
+        if (! is_string($custom)) {
+            return $default;
+        }
+
+        $custom = trim($custom);
+
+        return $custom !== '' ? $custom : $default;
+    }
+
+    /**
+     * Return the trimmed `user_prompt` option, or $default when it is not a
+     * non-blank string.
+     *
+     * @param array<string, mixed> $options
+     */
+    private function resolveUserPrompt(array $options, string $default): string
+    {
+        $custom = $options['user_prompt'] ?? null;
+
+        if (! is_string($custom)) {
+            return $default;
+        }
+
+        $custom = trim($custom);
+
+        return $custom !== '' ? $custom : $default;
+    }
+
+    /**
+     * Join field names into a single ", "-separated string for use in prompts.
+     */
+    private function implodeFields(array $fields): string
+    {
+        $joined = implode(separator: ', ', array: $fields);
+
+        return $joined;
+    }
+
+    /**
+     * Prepare the page content and send it to the configured AI wire API.
+     *
+     * The HTML is optionally stripped of tags, truncated to $contentMaxChars
+     * characters, and dispatched to the Responses API or the Chat Completions API
+     * depending on `wire_api`. The temperature is only forwarded on the Chat
+     * Completions path. Returns the raw text answer.
+     *
+     * @param array{endpoint: string, api_key: string, wire_api: string, disable_response_storage: bool, reasoning_effort: string} $credentials
+     */
+    private function requestAiContent(
+        array $credentials,
+        string $model,
+        float $temperature,
+        string $systemPrompt,
+        string $userPrompt,
+        string $html,
+        int $contentMaxChars,
+        bool $stripTags,
+    ): string {
+        $pageText = mb_substr($stripTags ? strip_tags($html) : $html, 0, $contentMaxChars);
+
+        if ($credentials['wire_api'] !== 'responses') {
+            return $this->requestChatCompletionsApi(
+                credentials: $credentials,
+                model: $model,
+                temperature: $temperature,
+                systemPrompt: $systemPrompt,
+                userPrompt: $userPrompt,
+                content: $pageText,
+            );
+        }
+
+        return $this->requestResponsesApi(
+            credentials: $credentials,
+            model: $model,
+            systemPrompt: $systemPrompt,
+            userPrompt: $userPrompt,
+            content: $pageText,
+        );
+    }
+
+    /**
+     * Call a Chat Completions style endpoint and return the first choice's message text.
+     *
+     * On a non-2xx response or any thrown error the message is stored in $this->lastError
+     * and an empty string is returned; this method never throws.
+     *
+     * @param array{endpoint: string, api_key: string, wire_api: string, disable_response_storage: bool, reasoning_effort: string} $credentials
+     */
+    private function requestChatCompletionsApi(
+        array $credentials,
+        string $model,
+        float $temperature,
+        string $systemPrompt,
+        string $userPrompt,
+        string $content,
+    ): string {
+        try {
+            $payload = [
+                'model' => $model,
+                'temperature' => $temperature,
+                'messages' => [
+                    ['role' => 'system', 'content' => $systemPrompt],
+                    ['role' => 'user', 'content' => $userPrompt."\n\n页面内容:\n".$content],
+                ],
+            ];
+
+            if ($credentials['disable_response_storage']) {
+                $payload['store'] = false;
+            }
+
+            if ($credentials['reasoning_effort'] !== '') {
+                // Chat Completions takes the effort as the top-level string parameter
+                // `reasoning_effort`; the nested `reasoning: {effort: ...}` object is the
+                // Responses API shape and is not a Chat Completions parameter.
+                $payload['reasoning_effort'] = $credentials['reasoning_effort'];
+            }
+
+            $response = $this->requestBuilder($credentials['api_key'])
+                ->post($credentials['endpoint'], $payload);
+
+            if (! $response->successful()) {
+                // Keep only the start of the body so the stored error stays short.
+                $this->lastError = sprintf('AI HTTP %d: %s', $response->status(), mb_substr($response->body(), 0, 240));
+
+                return '';
+            }
+
+            return (string) data_get($response->json(), 'choices.0.message.content', '');
+        } catch (\Throwable $exception) {
+            // Best-effort call: report through lastError instead of propagating.
+            $this->lastError = $exception->getMessage();
+
+            return '';
+        }
+    }
+
+    /**
+     * Call a Responses API style endpoint and return the answer text.
+     *
+     * The top-level `output_text` is used when non-empty; otherwise every non-empty
+     * `text` entry under `output[*].content[*]` is collected and joined with "\n".
+     * On a non-2xx response, an unusable output, or any thrown error the message is
+     * stored in $this->lastError and an empty string is returned.
+     *
+     * @param array{endpoint: string, api_key: string, wire_api: string, disable_response_storage: bool, reasoning_effort: string} $credentials
+     */
+    private function requestResponsesApi(
+        array $credentials,
+        string $model,
+        string $systemPrompt,
+        string $userPrompt,
+        string $content,
+    ): string {
+        try {
+            $systemMessage = [
+                'role' => 'system',
+                'content' => [
+                    ['type' => 'input_text', 'text' => $systemPrompt],
+                ],
+            ];
+            $userMessage = [
+                'role' => 'user',
+                'content' => [
+                    ['type' => 'input_text', 'text' => $userPrompt."\n\n页面内容:\n".$content],
+                ],
+            ];
+
+            $requestBody = [
+                'model' => $model,
+                'input' => [$systemMessage, $userMessage],
+            ];
+
+            if ($credentials['disable_response_storage']) {
+                $requestBody['store'] = false;
+            }
+
+            if ($credentials['reasoning_effort'] !== '') {
+                $requestBody['reasoning'] = ['effort' => $credentials['reasoning_effort']];
+            }
+
+            $response = $this->requestBuilder($credentials['api_key'])
+                ->post($credentials['endpoint'], $requestBody);
+
+            if (! $response->successful()) {
+                $this->lastError = sprintf('AI HTTP %d: %s', $response->status(), mb_substr($response->body(), 0, 240));
+
+                return '';
+            }
+
+            $decoded = $response->json();
+
+            $directText = (string) data_get($decoded, 'output_text', '');
+            if ($directText !== '') {
+                return $directText;
+            }
+
+            $outputItems = data_get($decoded, 'output', []);
+            if (! is_array($outputItems)) {
+                $this->lastError = 'AI output is empty.';
+
+                return '';
+            }
+
+            // Walk output[*].content[*] and keep every non-empty text part, skipping
+            // anything that does not have the expected array shape.
+            $textParts = [];
+            foreach ($outputItems as $outputItem) {
+                $parts = is_array($outputItem) ? ($outputItem['content'] ?? []) : null;
+                if (! is_array($parts)) {
+                    continue;
+                }
+
+                foreach ($parts as $part) {
+                    if (is_array($part)) {
+                        $partText = (string) ($part['text'] ?? '');
+                        if ($partText !== '') {
+                            $textParts[] = $partText;
+                        }
+                    }
+                }
+            }
+
+            if ($textParts === []) {
+                $this->lastError = 'AI output chunks are empty.';
+
+                return '';
+            }
+
+            return implode("\n", $textParts);
+        } catch (\Throwable $exception) {
+            $this->lastError = $exception->getMessage();
+
+            return '';
+        }
+    }
+
+    /**
+     * Build the pending HTTP request used for AI calls.
+     *
+     * Uses the AI timeout (falling back to the general request timeout, minimum 5
+     * seconds), bearer authentication with the given key, and the same TLS, IPv4 and
+     * DNS settings from the crawler configuration.
+     */
+    private function requestBuilder(string $apiKey): PendingRequest
+    {
+        $fallbackTimeout = (int) config('crawler.request_timeout_seconds', 20);
+        $timeoutSeconds = max((int) config('crawler.ai_timeout_seconds', $fallbackTimeout), 5);
+
+        $pending = Http::timeout($timeoutSeconds)->withToken($apiKey);
+
+        $verifySsl = (bool) config('crawler.verify_ssl', true);
+        if (! $verifySsl) {
+            $pending = $pending->withoutVerifying();
+        }
+
+        $clientOptions = [];
+
+        if ((bool) config('crawler.force_ipv4', false)) {
+            $clientOptions['force_ip_resolve'] = 'v4';
+        }
+
+        $dnsServers = trim((string) config('crawler.dns_servers', ''));
+        if ($dnsServers !== '' && defined('CURLOPT_DNS_SERVERS')) {
+            $clientOptions['curl'][CURLOPT_DNS_SERVERS] = $dnsServers;
+        }
+
+        return $clientOptions === []
+            ? $pending
+            : $pending->withOptions($clientOptions);
+    }
+
+    /**
+     * Decode an AI text answer into an array.
+     *
+     * Tries the whole string as JSON first; if that does not give an array, falls back
+     * to the span from the first '{' to the last '}' (greedy, across newlines), which
+     * covers answers that wrap the JSON in extra text. Returns [] when nothing decodes
+     * to an array.
+     *
+     * @return array<array-key, mixed>
+     */
+    private function decodeJsonContent(string $content): array
+    {
+        if ($content === '') {
+            return [];
+        }
+
+        $direct = json_decode($content, true);
+        if (is_array($direct)) {
+            return $direct;
+        }
+
+        if (preg_match('/\{.*\}/s', $content, $found) !== 1) {
+            return [];
+        }
+
+        $embedded = json_decode($found[0], true);
+
+        return is_array($embedded) ? $embedded : [];
+    }
+}
diff --git a/app/Services/Crawler/XPathExtractor.php b/app/Services/Crawler/XPathExtractor.php
new file mode 100644
index 0000000..bfe94a8
--- /dev/null
+++ b/app/Services/Crawler/XPathExtractor.php
@@ -0,0 +1,171 @@
+ $extractorConfig
+ * @return list
+ */
+    public function extractListUrls(string $html, string $baseUrl, array $extractorConfig): array
+    {
+        // Collects the unique absolute URLs selected by the `list_link_xpath` rule.
+        // Every failure mode (no rule, unparsable HTML, invalid XPath) yields [].
+        $rawRule = $extractorConfig['list_link_xpath'] ?? '';
+        // Non-scalar config values cannot be an XPath; casting them would only raise a warning.
+        $listXPath = is_scalar($rawRule) ? trim((string) $rawRule) : '';
+
+        if ($listXPath === '') {
+            return [];
+        }
+
+        $xpath = $this->buildXPath($html);
+
+        if ($xpath === null) {
+            return [];
+        }
+
+        try {
+            $nodes = $xpath->query($listXPath);
+        } catch (\ErrorException) {
+            // A malformed expression raises an E_WARNING; an error handler that promotes
+            // warnings to exceptions (as Laravel's does) would otherwise bypass the
+            // `false` check below and abort the crawl on a bad user-supplied rule.
+            $nodes = false;
+        }
+
+        if ($nodes === false) {
+            return [];
+        }
+
+        $urls = [];
+        foreach ($nodes as $node) {
+            $value = trim($node->nodeValue ?? '');
+            if ($value === '') {
+                continue;
+            }
+
+            // toAbsoluteUrl() returns null when the link cannot be resolved against the base URL.
+            $absolute = $this->toAbsoluteUrl($value, $baseUrl);
+            if ($absolute !== null) {
+                $urls[] = $absolute;
+            }
+        }
+
+        // array_unique() keeps the original keys; re-index so the result is a plain list.
+        return array_values(array_unique($urls));
+    }
+
+    /**
+     * Extract field values from a page using the XPath rules under `fields`.
+     *
+     * Each rule is either an XPath string or an array with an `xpath` key and an
+     * optional `multiple` flag. A single rule yields the whitespace-squished value
+     * of the first matching node; a `multiple` rule yields the unique, trimmed,
+     * non-empty values of every match. Fields whose rule is empty, malformed or
+     * matches nothing are omitted from the result.
+     *
+     * @param array $extractorConfig
+     * @return array field name => string (single rule) or list of strings (multiple rule)
+     */
+    public function extractFields(string $html, array $extractorConfig): array
+    {
+        $fieldRules = $extractorConfig['fields'] ?? [];
+
+        if (! is_array($fieldRules) || $fieldRules === []) {
+            return [];
+        }
+
+        $xpath = $this->buildXPath($html);
+
+        if ($xpath === null) {
+            return [];
+        }
+
+        $result = [];
+
+        foreach ($fieldRules as $field => $rule) {
+            if (! is_string($field)) {
+                continue;
+            }
+
+            $xpathExpr = '';
+            $multiple = false;
+
+            if (is_string($rule)) {
+                $xpathExpr = $rule;
+            } elseif (is_array($rule)) {
+                // Only accept a string expression; casting an array here would raise a warning.
+                $candidate = $rule['xpath'] ?? '';
+                $xpathExpr = is_string($candidate) ? $candidate : '';
+                $multiple = (bool) ($rule['multiple'] ?? false);
+            }
+
+            if ($xpathExpr === '') {
+                continue;
+            }
+
+            try {
+                $nodes = $xpath->query($xpathExpr);
+            } catch (\ErrorException) {
+                // A malformed expression raises an E_WARNING; when the error handler promotes
+                // warnings to exceptions (as Laravel's does) one bad rule would otherwise
+                // discard every other field instead of just being skipped.
+                $nodes = false;
+            }
+
+            if ($nodes === false || $nodes->length === 0) {
+                continue;
+            }
+
+            if ($multiple) {
+                $values = [];
+                foreach ($nodes as $node) {
+                    $value = trim($node->nodeValue ?? '');
+                    if ($value !== '') {
+                        $values[] = $value;
+                    }
+                }
+
+                if ($values !== []) {
+                    // Re-index after de-duplication so the field is a plain list.
+                    $result[$field] = array_values(array_unique($values));
+                }
+
+                continue;
+            }
+
+            // Single mode: only the first matching node is used.
+            $value = trim($nodes->item(0)?->nodeValue ?? '');
+            if ($value !== '') {
+                $result[$field] = Str::squish($value);
+            }
+        }
+
+        return $result;
+    }
+
+    /**
+     * Parse an HTML string and return an XPath evaluator over it.
+     *
+     * @return \DOMXPath|null the evaluator, or null when the markup could not be loaded
+     */
+    private function buildXPath(string $html): ?\DOMXPath
+    {
+        $dom = new \DOMDocument('1.0', 'UTF-8');
+
+        // Buffer libxml parse errors while loading, then put the process-wide
+        // setting back so other libxml users are not silently affected.
+        $previousSetting = libxml_use_internal_errors(true);
+
+        try {
+            // The XML declaration is an encoding hint: without one loadHTML() does not treat
+            // pages lacking a charset declaration as UTF-8, which garbles non-ASCII text.
+            // NOTE(review): assumes fetched HTML is UTF-8, matching the DOMDocument above — confirm.
+            $loaded = $dom->loadHTML('<?xml encoding="UTF-8">'.$html, LIBXML_NOWARNING | LIBXML_NOERROR);
+        } finally {
+            libxml_clear_errors();
+            libxml_use_internal_errors($previousSetting);
+        }
+
+        return $loaded ? new \DOMXPath($dom) : null;
+    }
+
+    /**
+     * Resolve a link found on a page against the page's URL.
+     *
+     * Handles absolute http(s) URLs, protocol-relative (`//host/...`), root-relative
+     * (`/path`), query-only (`?a=1`), fragment-only (`#top`) and path-relative links.
+     * Dot segments (`./`, `../`) are not collapsed.
+     *
+     * @return string|null the absolute URL, or null when the link uses a non-http(s)
+     *                     scheme (mailto:, javascript:, tel:, ...) or the base URL has
+     *                     no scheme/host to resolve against
+     */
+    private function toAbsoluteUrl(string $url, string $baseUrl): ?string
+    {
+        // Already absolute; scheme names are case-insensitive.
+        if (preg_match('#^https?://#i', $url) === 1) {
+            return $url;
+        }
+
+        // Protocol-relative: inherit the scheme of the page.
+        if (str_starts_with($url, '//')) {
+            $scheme = parse_url($baseUrl, PHP_URL_SCHEME) ?: 'https';
+
+            return sprintf('%s:%s', $scheme, $url);
+        }
+
+        // Any other "scheme:" prefix is not a crawlable web link and must not be
+        // glued onto the base path as if it were a relative file name.
+        if (preg_match('#^[a-z][a-z0-9+.\-]*:#i', $url) === 1) {
+            return null;
+        }
+
+        $baseParts = parse_url($baseUrl);
+        if (! is_array($baseParts) || ! isset($baseParts['scheme'], $baseParts['host'])) {
+            return null;
+        }
+
+        $prefix = sprintf('%s://%s', $baseParts['scheme'], $baseParts['host']);
+        if (isset($baseParts['port'])) {
+            $prefix .= ':'.$baseParts['port'];
+        }
+
+        if (str_starts_with($url, '/')) {
+            return $prefix.$url;
+        }
+
+        $basePath = $baseParts['path'] ?? '/';
+        if (! str_starts_with($basePath, '/')) {
+            $basePath = '/'.$basePath;
+        }
+
+        // Query-only reference: same document path, new query string.
+        if (str_starts_with($url, '?')) {
+            return $prefix.$basePath.$url;
+        }
+
+        // Fragment-only reference: same document (path and query), new fragment.
+        if (str_starts_with($url, '#')) {
+            $query = isset($baseParts['query']) ? '?'.$baseParts['query'] : '';
+
+            return $prefix.$basePath.$query.$url;
+        }
+
+        // Path-relative: keep everything up to and including the last "/" of the base path,
+        // so "/a/b/" resolves inside "/a/b/" while "/a/b" resolves inside "/a/".
+        $directory = substr($basePath, 0, (int) strrpos($basePath, '/') + 1);
+
+        return $prefix.$directory.$url;
+    }
+}
+
diff --git a/bootstrap/app.php b/bootstrap/app.php
index 16b3ac7..2484c2e 100644
--- a/bootstrap/app.php
+++ b/bootstrap/app.php
@@ -12,6 +12,9 @@ return Application::configure(basePath: dirname(__DIR__))
commands: __DIR__.'/../routes/console.php',
health: '/up',
)
+ ->withCommands([
+ __DIR__.'/../app/Console/Commands',
+ ])
->withMiddleware(function (Middleware $middleware): void {
$middleware->alias([
'admin.auth' => AdminAuthenticate::class,
diff --git a/config/crawler.php b/config/crawler.php
new file mode 100644
index 0000000..2886982
--- /dev/null
+++ b/config/crawler.php
@@ -0,0 +1,23 @@
+ env('CRAWLER_USER_AGENT', 'AIWebCrawler/1.0 (+https://dev.aiweb.com)'),
+ 'request_timeout_seconds' => (int) env('CRAWLER_REQUEST_TIMEOUT', 20),
+ 'ai_timeout_seconds' => (int) env('CRAWLER_AI_TIMEOUT', 60),
+ 'verify_ssl' => env('CRAWLER_VERIFY_SSL', true),
+ 'dns_servers' => env('CRAWLER_DNS_SERVERS', ''),
+ 'force_ipv4' => env('CRAWLER_FORCE_IPV4', false),
+ 'openai_wire_api' => env('CRAWLER_AI_WIRE_API', 'chat_completions'),
+ 'openai_compatible_base_url' => env('CRAWLER_AI_BASE_URL'),
+ 'openai_disable_response_storage' => env('CRAWLER_AI_DISABLE_RESPONSE_STORAGE', false),
+ 'openai_reasoning_effort' => env('CRAWLER_AI_REASONING_EFFORT', ''),
+ 'browserless_endpoint' => env('CRAWLER_BROWSERLESS_ENDPOINT'),
+ 'browserless_token' => env('CRAWLER_BROWSERLESS_TOKEN'),
+ 'openai_compatible_endpoint' => env('CRAWLER_AI_ENDPOINT'),
+ 'openai_compatible_key' => env('CRAWLER_AI_KEY'),
+ 'openai_default_model' => env('CRAWLER_AI_MODEL', 'gpt-4o-mini'),
+ 'default_alert_email' => env('CRAWLER_ALERT_EMAIL'),
+];
+
diff --git a/database/migrations/2026_02_18_100000_create_crawl_rules_table.php b/database/migrations/2026_02_18_100000_create_crawl_rules_table.php
new file mode 100644
index 0000000..3ead351
--- /dev/null
+++ b/database/migrations/2026_02_18_100000_create_crawl_rules_table.php
@@ -0,0 +1,54 @@
+id();
+ $table->string('name', 150);
+ $table->string('target_module', 32);
+ $table->boolean('enabled')->default(true);
+ $table->json('entry_urls');
+ $table->string('cron_expression', 64)->default('0 */6 * * *');
+ $table->string('timezone', 64)->default('Asia/Shanghai');
+ $table->unsignedSmallInteger('max_pages')->default(50);
+ $table->boolean('render_js')->default(false);
+ $table->string('user_agent', 255)->nullable();
+ $table->json('headers')->nullable();
+ $table->json('cookies')->nullable();
+ $table->string('proxy', 255)->nullable();
+ $table->unsignedSmallInteger('rate_limit_per_minute')->default(30);
+ $table->unsignedTinyInteger('retry_max')->default(3);
+ $table->unsignedSmallInteger('retry_backoff_seconds')->default(60);
+ $table->json('extractor_config')->nullable();
+ $table->json('mapping_config')->nullable();
+ $table->json('dedupe_config')->nullable();
+ $table->boolean('ai_fallback_enabled')->default(false);
+ $table->string('ai_provider', 64)->nullable();
+ $table->string('ai_model', 128)->nullable();
+ $table->string('publish_policy', 32)->default('draft');
+ $table->string('alert_email', 255)->nullable();
+ $table->timestamp('last_run_at')->nullable();
+ $table->timestamp('next_run_at')->nullable();
+ $table->unsignedBigInteger('created_by')->nullable();
+ $table->unsignedBigInteger('updated_by')->nullable();
+ $table->timestamps();
+
+ $table->index(['enabled', 'next_run_at']);
+ $table->index(['target_module', 'enabled']);
+ });
+ }
+
+ /**
+ * Reverse the migration: drop the `crawl_rules` table if it exists.
+ */
+ public function down(): void
+ {
+ Schema::dropIfExists('crawl_rules');
+ }
+};
+
diff --git a/database/migrations/2026_02_18_100100_create_crawl_runs_table.php b/database/migrations/2026_02_18_100100_create_crawl_runs_table.php
new file mode 100644
index 0000000..11cb845
--- /dev/null
+++ b/database/migrations/2026_02_18_100100_create_crawl_runs_table.php
@@ -0,0 +1,39 @@
+id();
+ $table->foreignId('rule_id')->constrained('crawl_rules')->cascadeOnDelete();
+ $table->string('trigger_type', 32)->default('manual');
+ $table->string('status', 32)->default('pending');
+ $table->timestamp('started_at')->nullable();
+ $table->timestamp('finished_at')->nullable();
+ $table->unsignedInteger('total_urls')->default(0);
+ $table->unsignedInteger('success_count')->default(0);
+ $table->unsignedInteger('failed_count')->default(0);
+ $table->unsignedInteger('skipped_count')->default(0);
+ $table->text('error_summary')->nullable();
+ $table->json('metrics')->nullable();
+ $table->unsignedBigInteger('created_by')->nullable();
+ $table->timestamps();
+
+ $table->index(['rule_id', 'created_at']);
+ $table->index(['status', 'created_at']);
+ });
+ }
+
+ /**
+ * Reverse the migration: drop the `crawl_runs` table if it exists.
+ */
+ public function down(): void
+ {
+ Schema::dropIfExists('crawl_runs');
+ }
+};
+
diff --git a/database/migrations/2026_02_18_100200_create_crawl_run_items_table.php b/database/migrations/2026_02_18_100200_create_crawl_run_items_table.php
new file mode 100644
index 0000000..181395c
--- /dev/null
+++ b/database/migrations/2026_02_18_100200_create_crawl_run_items_table.php
@@ -0,0 +1,39 @@
+id();
+ $table->foreignId('run_id')->constrained('crawl_runs')->cascadeOnDelete();
+ $table->string('url', 2048);
+ $table->string('stage', 32);
+ $table->unsignedTinyInteger('attempt')->default(1);
+ $table->string('status', 32)->default('success');
+ $table->unsignedInteger('latency_ms')->nullable();
+ $table->unsignedSmallInteger('http_code')->nullable();
+ $table->string('error_code', 64)->nullable();
+ $table->text('error_message')->nullable();
+ $table->json('raw_payload')->nullable();
+ $table->json('normalized_payload')->nullable();
+ $table->json('upsert_result')->nullable();
+ $table->timestamps();
+
+ $table->index(['run_id', 'status']);
+ $table->index(['run_id', 'stage']);
+ });
+ }
+
+ /**
+ * Reverse the migration: drop the `crawl_run_items` table if it exists.
+ */
+ public function down(): void
+ {
+ Schema::dropIfExists('crawl_run_items');
+ }
+};
+
diff --git a/database/migrations/2026_02_18_100300_create_crawl_alerts_table.php b/database/migrations/2026_02_18_100300_create_crawl_alerts_table.php
new file mode 100644
index 0000000..8ccc1c2
--- /dev/null
+++ b/database/migrations/2026_02_18_100300_create_crawl_alerts_table.php
@@ -0,0 +1,36 @@
+id();
+ $table->foreignId('run_id')->nullable()->constrained('crawl_runs')->nullOnDelete();
+ $table->foreignId('rule_id')->nullable()->constrained('crawl_rules')->nullOnDelete();
+ $table->string('severity', 32)->default('warning');
+ $table->string('type', 64);
+ $table->string('message', 500);
+ $table->json('context')->nullable();
+ $table->boolean('is_resolved')->default(false);
+ $table->unsignedBigInteger('resolved_by')->nullable();
+ $table->timestamp('resolved_at')->nullable();
+ $table->timestamps();
+
+ $table->index(['is_resolved', 'severity']);
+ $table->index(['rule_id', 'created_at']);
+ });
+ }
+
+ /**
+ * Reverse the migration: drop the `crawl_alerts` table if it exists.
+ */
+ public function down(): void
+ {
+ Schema::dropIfExists('crawl_alerts');
+ }
+};
+
diff --git a/docs/crawler-rule-guide.md b/docs/crawler-rule-guide.md
new file mode 100644
index 0000000..815f709
--- /dev/null
+++ b/docs/crawler-rule-guide.md
@@ -0,0 +1,219 @@
+# 采集规则使用文档(含 AI 工具 Demo)
+
+本文面向当前项目内置采集器,目标是让你从 0 到 1 跑通一条规则,并把数据入库到站点模块(如 `AI 工具`、`AI 模型`)。
+
+## 1. 功能概览
+
+当前支持:
+
+- 后台配置采集规则(入口 URL、定时、抓取参数、Extractor JSON、AI 配置)。
+- 定时执行(Laravel Scheduler)与手动触发。
+- 运行日志、失败明细、告警中心。
+- 目标模块入库:`AI 工具`、`AI 模型`。
+- 三种抽取模式:
+ - `xpath`:只用 XPath 规则。
+ - `ai`:只用 AI 抽取结构化数据。
+ - `hybrid`:XPath + AI 合并(XPath 优先)。
+- 页面预览 + 点选元素生成 XPath。
+- AI 一键生成 Extractor 规则(从页面内容推断)。
+
+## 2. 前置准备
+
+### 2.1 迁移数据库
+
+```bash
+php artisan migrate --force
+```
+
+确保存在以下表:
+
+- `crawl_rules`
+- `crawl_runs`
+- `crawl_run_items`
+- `crawl_alerts`
+
+### 2.2 启动队列与调度
+
+采集任务通过队列执行,建议至少一个 worker:
+
+```bash
+php artisan queue:work
+```
+
+系统 cron 每分钟执行一次调度器:
+
+```cron
+* * * * * cd /path/to/ai-web && php artisan schedule:run >> /dev/null 2>&1
+```
+
+### 2.3 AI 配置(用于 AI 抽取/AI 规则生成)
+
+在 `.env` 中配置:
+
+```env
+CRAWLER_AI_ENDPOINT=
+CRAWLER_AI_KEY=
+CRAWLER_AI_MODEL=gpt-4o-mini
+```
+
+## 3. 后台入口
+
+- 采集规则:`/admin/crawlers`
+- 运行记录:`/admin/crawl-runs`
+- 告警中心:`/admin/crawl-alerts`
+
+## 4. AI 工具 Demo(推荐先跑)
+
+### 4.1 新建规则
+
+在 `采集规则` 页面点击“新建采集规则”:
+
+- 规则名称:`AI工具-Demo`
+- 目标模块:`AI 工具`
+- 发布策略:`草稿待审核`
+- Cron:`0 */6 * * *`
+- 时区:`Asia/Shanghai`
+- 最大页面数:`30`
+- 启用规则:勾选
+- 入口 URL:
+
+```text
+https://your-demo-site.com/ai-tools
+```
+
+### 4.2 选择抽取模式
+
+可按场景选:
+
+- 页面结构稳定:`xpath`
+- 页面结构变化大:`ai`
+- 追求稳定 + 覆盖:`hybrid`
+
+### 4.3 配置 Extractor JSON(XPath 模式/Hybrid 建议)
+
+可直接用:
+
+```json
+{
+ "list_link_xpath": "//a[contains(@class,'tool-link')]/@href",
+ "fields": {
+ "name": "//h1/text()",
+ "summary": "//meta[@name='description']/@content",
+ "official_url": "//a[contains(@class,'visit-official')]/@href",
+ "logo_url": "//meta[@property='og:image']/@content",
+ "pricing_type": "//span[@data-field='pricing']/text()",
+ "platform": "//span[@data-field='platform']/text()",
+ "language": "//span[@data-field='language']/text()",
+ "description": "//article[contains(@class,'tool-content')]//text()"
+ }
+}
+```
+
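+或使用示例文件:`docs/examples/ai-tools-extractor.json`
+
+> 注意:字段规则写成字符串时,只取第一个匹配节点的文本。像上面 `description` 这样以 `//text()` 结尾的表达式只会得到第一个文本节点;如需整段正文,可把 XPath 指向元素本身(如 `//article[contains(@class,'tool-content')]`),或改用对象写法 `{"xpath": "...", "multiple": true}`,以数组形式返回全部匹配值。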
+
+### 4.4 用“页面预览 + 选元素”快速生成 XPath
+
+1. 在表单里输入 `预览 URL`。
+2. 点击“加载预览”。
+3. 在预览 iframe 点击目标元素,页面会显示当前 XPath。
+4. 填写“写入字段”(如 `name` / `summary` / `list_link_xpath`)。
+5. 点击“写入 Extractor JSON”。
+
+### 4.5 用 AI 一键生成规则
+
+1. 填写(可选)AI 提示词、模型、温度等。
+2. 点击“AI 生成抽取规则并合并到 Extractor JSON”。
+3. 检查合并后的 JSON 并微调。
+
+### 4.6 Mapping / Dedupe(可选)
+
+`Mapping JSON` 示例:`docs/examples/ai-tools-mapping.json`
+
+```json
+{
+ "name": "title",
+ "summary": "desc",
+ "official_url": "website"
+}
+```
+
+`Dedupe JSON` 当前可先留空对象:`docs/examples/ai-tools-dedupe.json`
+
+```json
+{}
+```
+
+## 5. 如何执行
+
+### 5.1 后台手动执行
+
+在规则列表点击“立即执行”。
+
+### 5.2 命令行执行
+
+执行指定规则:
+
+```bash
+php artisan crawler:run 规则ID --sync
+```
+
+按 cron 执行到期规则:
+
+```bash
+php artisan crawler:run
+```
+
+忽略 cron 执行全部启用规则:
+
+```bash
+php artisan crawler:run --all
+```
+
+重试某次运行:
+
+```bash
+php artisan crawler:retry-failed 运行ID
+```
+
+## 6. 验证结果
+
+1. 打开 `/admin/crawl-runs` 查看该次运行状态。
+2. 进入运行详情看 `list/detail/extract/upsert` 各阶段结果。
+3. 到目标模块(如 AI 工具)确认有新数据,状态应为 `draft`。
+
+## 7. 常见问题
+
+### 7.1 `Table '...crawl_rules' doesn't exist`
+
+未执行迁移:
+
+```bash
+php artisan migrate --force
+```
+
+### 7.2 保存时报 `validation.json` 或 JSON 格式错误
+
+检查以下字段是否是合法 JSON:
+
+- `Extractor JSON`
+- `Mapping JSON`
+- `Headers JSON`
+- `Cookies JSON`
+- `Dedupe JSON`
+
+### 7.3 运行成功但没入库
+
+通常是缺少必填字段:
+
+- AI 工具至少需 `name`、`summary`
+- AI 模型至少需 `name`、`summary`、`modality`、`deployment_mode`
+
+去运行详情看 `extract` 阶段的 `Missing required fields`。
+
+### 7.4 预览或 AI 规则生成失败
+
+常见原因:
+
+- URL 不可访问
+- URL 命中安全限制(内网/保留地址)
+- AI 配置缺失(`CRAWLER_AI_ENDPOINT` / `CRAWLER_AI_KEY`)
+
diff --git a/docs/examples/ai-tools-dedupe.json b/docs/examples/ai-tools-dedupe.json
new file mode 100644
index 0000000..9e26dfe
--- /dev/null
+++ b/docs/examples/ai-tools-dedupe.json
@@ -0,0 +1 @@
+{}
\ No newline at end of file
diff --git a/docs/examples/ai-tools-extractor.json b/docs/examples/ai-tools-extractor.json
new file mode 100644
index 0000000..b8ea34b
--- /dev/null
+++ b/docs/examples/ai-tools-extractor.json
@@ -0,0 +1,13 @@
+{
+ "list_link_xpath": "//a[contains(@class,'tool-link')]/@href",
+ "fields": {
+ "name": "//h1/text()",
+ "summary": "//meta[@name='description']/@content",
+ "official_url": "//a[contains(@class,'visit-official')]/@href",
+ "logo_url": "//meta[@property='og:image']/@content",
+ "pricing_type": "//span[@data-field='pricing']/text()",
+ "platform": "//span[@data-field='platform']/text()",
+ "language": "//span[@data-field='language']/text()",
+ "description": "//article[contains(@class,'tool-content')]//text()"
+ }
+}
\ No newline at end of file
diff --git a/docs/examples/ai-tools-mapping.json b/docs/examples/ai-tools-mapping.json
new file mode 100644
index 0000000..30cf69f
--- /dev/null
+++ b/docs/examples/ai-tools-mapping.json
@@ -0,0 +1,5 @@
+{
+ "name": "title",
+ "summary": "desc",
+ "official_url": "website"
+}
\ No newline at end of file
diff --git a/resources/views/admin/crawl-alerts/index.blade.php b/resources/views/admin/crawl-alerts/index.blade.php
new file mode 100644
index 0000000..f5ed2d3
--- /dev/null
+++ b/resources/views/admin/crawl-alerts/index.blade.php
@@ -0,0 +1,73 @@
+@extends('layouts.admin')
+
+@section('title', '采集告警中心')
+
+@section('head')
+ @include('admin.partials.modern-index-head')
+@endsection
+
+@section('content')
+
+
+
+
+
+
+
+ | ID |
+ 等级 |
+ 规则 |
+ 运行 |
+ 类型 |
+ 信息 |
+ 状态 |
+ 操作 |
+
+
+
+ @forelse($items as $item)
+
+ | #{{ $item->id }} |
+ {{ $item->severity?->value ?? '-' }} |
+ {{ $item->rule?->name ?? '-' }} |
+
+ @if($item->run)
+ #{{ $item->run_id }}
+ @else
+ -
+ @endif
+ |
+ {{ $item->type }} |
+ {{ $item->message }} |
+ {{ $item->is_resolved ? '已处理' : '未处理' }} |
+
+ @if(! $item->is_resolved)
+
+ @endif
+ |
+
+ @empty
+
+ | 暂无告警 |
+
+ @endforelse
+
+
+
+
+
+@endsection
diff --git a/resources/views/admin/crawl-runs/index.blade.php b/resources/views/admin/crawl-runs/index.blade.php
new file mode 100644
index 0000000..3fe8280
--- /dev/null
+++ b/resources/views/admin/crawl-runs/index.blade.php
@@ -0,0 +1,60 @@
+@extends('layouts.admin')
+
+@section('title', '采集运行记录')
+
+@section('head')
+ @include('admin.partials.modern-index-head')
+@endsection
+
+@section('content')
+
+
+
+
+
+
+
+ | ID |
+ 规则 |
+ 触发方式 |
+ 状态 |
+ 统计 |
+ 时间 |
+ 操作 |
+
+
+
+ @forelse($items as $item)
+
+ | #{{ $item->id }} |
+ {{ $item->rule?->name ?? '-' }} |
+ {{ $item->trigger_type?->value ?? '-' }} |
+ {{ $item->status?->value ?? '-' }} |
+ 成功 {{ $item->success_count }} / 失败 {{ $item->failed_count }} / 跳过 {{ $item->skipped_count }} |
+ {{ $item->created_at?->format('Y-m-d H:i:s') }} |
+
+ 详情
+
+ |
+
+ @empty
+
+ | 暂无运行记录 |
+
+ @endforelse
+
+
+
+
+
+@endsection
diff --git a/resources/views/admin/crawl-runs/show.blade.php b/resources/views/admin/crawl-runs/show.blade.php
new file mode 100644
index 0000000..c2f510f
--- /dev/null
+++ b/resources/views/admin/crawl-runs/show.blade.php
@@ -0,0 +1,101 @@
+@extends('layouts.admin')
+
+@section('title', '运行详情 #'.$run->id)
+
+@section('head')
+ @include('admin.partials.modern-index-head')
+@endsection
+
+@section('page_actions')
+ 返回列表
+@endsection
+
+@section('content')
+
+
+
+
规则:{{ $run->rule?->name ?? '-' }}
+
触发方式:{{ $run->trigger_type?->value ?? '-' }}
+
状态:{{ $run->status?->value ?? '-' }}
+
创建时间:{{ $run->created_at?->format('Y-m-d H:i:s') }}
+
总URL:{{ $run->total_urls }}
+
成功:{{ $run->success_count }}
+
失败:{{ $run->failed_count }}
+
跳过:{{ $run->skipped_count }}
+
+ @if($run->error_summary)
+
{{ $run->error_summary }}
+ @endif
+
+
+
+
+
+
+
+
+
+ | ID |
+ URL |
+ 阶段 |
+ 状态 |
+ HTTP |
+ 耗时(ms) |
+ 错误 |
+
+
+
+ @forelse($run->items as $item)
+
+ | #{{ $item->id }} |
+ {{ $item->url }} |
+ {{ $item->stage }} |
+ {{ $item->status?->value ?? '-' }} |
+ {{ $item->http_code ?? '-' }} |
+ {{ $item->latency_ms ?? '-' }} |
+ {{ $item->error_message ?? '-' }} |
+
+ @empty
+
+ | 无明细数据 |
+
+ @endforelse
+
+
+
+
+
+
+
+
+
+
+
+ | ID |
+ 等级 |
+ 类型 |
+ 信息 |
+ 状态 |
+ 时间 |
+
+
+
+ @forelse($run->alerts as $alert)
+
+ | #{{ $alert->id }} |
+ {{ $alert->severity?->value ?? '-' }} |
+ {{ $alert->type }} |
+ {{ $alert->message }} |
+ {{ $alert->is_resolved ? '已处理' : '未处理' }} |
+ {{ $alert->created_at?->format('Y-m-d H:i:s') }} |
+
+ @empty
+
+ | 无告警 |
+
+ @endforelse
+
+
+
+
+@endsection
diff --git a/resources/views/admin/crawlers/form.blade.php b/resources/views/admin/crawlers/form.blade.php
new file mode 100644
index 0000000..7764783
--- /dev/null
+++ b/resources/views/admin/crawlers/form.blade.php
@@ -0,0 +1,482 @@
+@extends('layouts.admin')
+
+@section('title', $item->exists ? '编辑采集规则' : '新建采集规则')
+
+@section('head')
+ @include('admin.partials.modern-form-head')
+@endsection
+
+@section('scripts')
+
+@endsection
+
+@section('content')
+
+@endsection
diff --git a/resources/views/admin/crawlers/index.blade.php b/resources/views/admin/crawlers/index.blade.php
new file mode 100644
index 0000000..9fb5177
--- /dev/null
+++ b/resources/views/admin/crawlers/index.blade.php
@@ -0,0 +1,74 @@
+@extends('layouts.admin')
+
+@section('title', '采集规则')
+
+@section('head')
+ @include('admin.partials.modern-index-head')
+@endsection
+
+@section('page_actions')
+ 新建规则
+@endsection
+
+@section('content')
+
+
+
+
+
+
+
+ | 规则 |
+ 目标模块 |
+ Cron |
+ 状态 |
+ 最近运行 |
+ 操作 |
+
+
+
+ @forelse($items as $item)
+
+ |
+ {{ $item->name }}
+ 运行次数:{{ $item->runs_count }} / 下次:{{ $item->next_run_at?->format('Y-m-d H:i') ?? '-' }}
+ |
+ {{ $item->target_module?->label() ?? '-' }} |
+ {{ $item->cron_expression }} |
+
+ @if($item->enabled)
+ 启用
+ @else
+ 停用
+ @endif
+ |
+ {{ $item->last_run_at?->format('Y-m-d H:i') ?? '-' }} |
+
+
+ 编辑
+ |
+
+ @empty
+
+ | 暂无采集规则 |
+
+ @endforelse
+
+
+
+
+
+@endsection
diff --git a/resources/views/admin/partials/admin-page-header.blade.php b/resources/views/admin/partials/admin-page-header.blade.php
index 0e50d20..c46dc37 100644
--- a/resources/views/admin/partials/admin-page-header.blade.php
+++ b/resources/views/admin/partials/admin-page-header.blade.php
@@ -9,17 +9,21 @@
'tools' => ['label' => 'AI 工具', 'index' => 'admin.tools.index', 'subtitle' => '维护工具信息、状态与展示内容。'],
'models' => ['label' => 'AI 模型', 'index' => 'admin.models.index', 'subtitle' => '管理模型参数、评分与发布状态。'],
'articles' => ['label' => 'AI 资讯', 'index' => 'admin.articles.index', 'subtitle' => '维护资讯内容、来源与发布质量。'],
- 'guides' => ['label' => 'AI 教程', 'index' => 'admin.guides.index', 'subtitle' => '维护教程内容与学习难度分层。'],
+ 'guides' => ['label' => 'AI 教程', 'index' => 'admin.guides.index', 'subtitle' => '维护教程内容与学习难度层级。'],
'categories' => ['label' => '分类管理', 'index' => 'admin.categories.index', 'subtitle' => '统一管理分类体系与启用状态。'],
- 'sources' => ['label' => '来源管理', 'index' => 'admin.sources.index', 'subtitle' => '维护可信来源白名单与抓取策略。'],
- 'settings' => ['label' => '首页配置', 'index' => 'admin.settings.index', 'subtitle' => '配置首页模块、条目与展示顺序。'],
- 'feedback' => ['label' => '反馈管理', 'index' => 'admin.feedback.index', 'subtitle' => '跟进用户反馈并及时更新处理状态。'],
+ 'sources' => ['label' => '来源管理', 'index' => 'admin.sources.index', 'subtitle' => '维护可用来源及可信度配置。'],
+ 'settings' => ['label' => '首页配置', 'index' => 'admin.settings.index', 'subtitle' => '配置首页模块、条目和展示顺序。'],
+ 'feedback' => ['label' => '反馈管理', 'index' => 'admin.feedback.index', 'subtitle' => '跟进用户反馈并更新处理状态。'],
+ 'crawlers' => ['label' => '采集规则', 'index' => 'admin.crawlers.index', 'subtitle' => '维护采集目标、字段映射与调度策略。'],
+ 'crawl-runs' => ['label' => '采集运行', 'index' => 'admin.crawl-runs.index', 'subtitle' => '查看每次采集执行结果、失败原因和重试。'],
+ 'crawl-alerts' => ['label' => '采集告警', 'index' => 'admin.crawl-alerts.index', 'subtitle' => '集中处理采集异常并追踪恢复情况。'],
][$moduleKey] ?? ['label' => '管理后台', 'index' => 'admin.dashboard', 'subtitle' => '维护站点内容与配置。'];
$actionLabel = [
'index' => '列表',
'create' => '新建',
'edit' => '编辑',
+ 'show' => '详情',
][$actionKey] ?? '详情';
$defaultTitle = $moduleMeta['label'];
@@ -30,7 +34,7 @@
if ($pageSubtitle === '') {
$pageSubtitle = $actionKey === 'index'
? $moduleMeta['subtitle']
- : '当前为'.$actionLabel.'页面,请按提示完成必填信息并保存。';
+ : '当前为'.$actionLabel.'页面,请按提示完善信息后保存。';
}
@endphp
diff --git a/resources/views/layouts/admin.blade.php b/resources/views/layouts/admin.blade.php
index ef0a388..dbed6e3 100644
--- a/resources/views/layouts/admin.blade.php
+++ b/resources/views/layouts/admin.blade.php
@@ -1,4 +1,4 @@
-
+
@@ -111,6 +111,10 @@
首页配置
反馈管理
+
+ 采集规则
+ 运行记录
+ 告警中心
访问前台
@@ -329,4 +333,3 @@
@yield('scripts')
-
diff --git a/routes/console.php b/routes/console.php
index 3c9adf1..ece3f41 100644
--- a/routes/console.php
+++ b/routes/console.php
@@ -2,7 +2,10 @@
use Illuminate\Foundation\Inspiring;
use Illuminate\Support\Facades\Artisan;
+use Illuminate\Support\Facades\Schedule;
Artisan::command('inspire', function () {
$this->comment(Inspiring::quote());
})->purpose('Display an inspiring quote');
+
+Schedule::command('crawler:run')->everyMinute()->withoutOverlapping();
diff --git a/routes/web.php b/routes/web.php
index a18aadb..8fedcc4 100644
--- a/routes/web.php
+++ b/routes/web.php
@@ -9,6 +9,9 @@ use App\Http\Controllers\Admin\DashboardController;
use App\Http\Controllers\Admin\FeedbackController as AdminFeedbackController;
use App\Http\Controllers\Admin\GuideController as AdminGuideController;
use App\Http\Controllers\Admin\CategoryController as AdminCategoryController;
+use App\Http\Controllers\Admin\CrawlAlertController as AdminCrawlAlertController;
+use App\Http\Controllers\Admin\CrawlerRuleController as AdminCrawlerRuleController;
+use App\Http\Controllers\Admin\CrawlRunController as AdminCrawlRunController;
use App\Http\Controllers\Admin\SiteSettingController as AdminSiteSettingController;
use App\Http\Controllers\Admin\UploadController as AdminUploadController;
use App\Http\Controllers\Admin\SourceController as AdminSourceController;
@@ -111,5 +114,21 @@ Route::prefix('admin')->name('admin.')->group(function (): void {
Route::get('/feedback', [AdminFeedbackController::class, 'index'])->name('feedback.index');
Route::put('/feedback/{feedback}', [AdminFeedbackController::class, 'updateStatus'])->name('feedback.status');
+
+ Route::get('/crawlers', [AdminCrawlerRuleController::class, 'index'])->name('crawlers.index');
+ Route::get('/crawlers/create', [AdminCrawlerRuleController::class, 'create'])->name('crawlers.create');
+ Route::post('/crawlers', [AdminCrawlerRuleController::class, 'store'])->name('crawlers.store');
+ Route::post('/crawlers/preview', [AdminCrawlerRuleController::class, 'preview'])->name('crawlers.preview');
+ Route::post('/crawlers/ai-suggest-extractor', [AdminCrawlerRuleController::class, 'aiSuggestExtractor'])->name('crawlers.ai-suggest-extractor');
+ Route::get('/crawlers/{crawler}/edit', [AdminCrawlerRuleController::class, 'edit'])->name('crawlers.edit');
+ Route::put('/crawlers/{crawler}', [AdminCrawlerRuleController::class, 'update'])->name('crawlers.update');
+ Route::post('/crawlers/{crawler}/run', [AdminCrawlerRuleController::class, 'run'])->name('crawlers.run');
+
+ Route::get('/crawl-runs', [AdminCrawlRunController::class, 'index'])->name('crawl-runs.index');
+ Route::get('/crawl-runs/{run}', [AdminCrawlRunController::class, 'show'])->name('crawl-runs.show');
+ Route::post('/crawl-runs/{run}/retry', [AdminCrawlRunController::class, 'retry'])->name('crawl-runs.retry');
+
+ Route::get('/crawl-alerts', [AdminCrawlAlertController::class, 'index'])->name('crawl-alerts.index');
+ Route::post('/crawl-alerts/{alert}/resolve', [AdminCrawlAlertController::class, 'resolve'])->name('crawl-alerts.resolve');
});
});
diff --git a/tests/Feature/CrawlerCommandTest.php b/tests/Feature/CrawlerCommandTest.php
new file mode 100644
index 0000000..3a87901
--- /dev/null
+++ b/tests/Feature/CrawlerCommandTest.php
@@ -0,0 +1,62 @@
+ABAlpha