From 260460df037874d3bae030deeaf21a9605a5586d Mon Sep 17 00:00:00 2001 From: cjd Date: Wed, 18 Feb 2026 12:56:36 +0800 Subject: [PATCH] =?UTF-8?q?=E7=88=AC=E8=99=AB=E5=BC=80=E5=8F=91?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .env.example | 17 +- .../Commands/CrawlerHealthCheckCommand.php | 41 ++ .../Commands/CrawlerRetryFailedCommand.php | 39 ++ app/Console/Commands/CrawlerRunCommand.php | 64 +++ app/Enums/CrawlAlertSeverity.php | 14 + app/Enums/CrawlRunItemStatus.php | 13 + app/Enums/CrawlRunStatus.php | 15 + app/Enums/CrawlTargetModule.php | 20 + app/Enums/CrawlTriggerType.php | 13 + .../Admin/CrawlAlertController.php | 42 ++ .../Controllers/Admin/CrawlRunController.php | 54 ++ .../Admin/CrawlerRuleController.php | 413 +++++++++++++++ app/Http/Requests/Admin/CrawlRuleRequest.php | 254 +++++++++ app/Jobs/RunCrawlRuleJob.php | 49 ++ app/Models/CrawlAlert.php | 48 ++ app/Models/CrawlRule.php | 74 +++ app/Models/CrawlRun.php | 59 +++ app/Models/CrawlRunItem.php | 46 ++ app/Services/Crawler/CrawlAlertService.php | 73 +++ .../Crawler/CrawlEntityUpsertService.php | 277 ++++++++++ .../Crawler/CrawlExecutionService.php | 335 ++++++++++++ app/Services/Crawler/CrawlFetcherService.php | 103 ++++ .../Crawler/CrawlRuleScheduleService.php | 44 ++ .../Crawler/OpenAiFallbackExtractor.php | 494 ++++++++++++++++++ app/Services/Crawler/XPathExtractor.php | 171 ++++++ bootstrap/app.php | 3 + config/crawler.php | 23 + ..._02_18_100000_create_crawl_rules_table.php | 54 ++ ...6_02_18_100100_create_crawl_runs_table.php | 39 ++ ...18_100200_create_crawl_run_items_table.php | 39 ++ ...02_18_100300_create_crawl_alerts_table.php | 36 ++ docs/crawler-rule-guide.md | 219 ++++++++ docs/examples/ai-tools-dedupe.json | 1 + docs/examples/ai-tools-extractor.json | 13 + docs/examples/ai-tools-mapping.json | 5 + .../views/admin/crawl-alerts/index.blade.php | 73 +++ .../views/admin/crawl-runs/index.blade.php | 60 +++ 
.../views/admin/crawl-runs/show.blade.php | 101 ++++ resources/views/admin/crawlers/form.blade.php | 482 +++++++++++++++++ .../views/admin/crawlers/index.blade.php | 74 +++ .../partials/admin-page-header.blade.php | 14 +- resources/views/layouts/admin.blade.php | 7 +- routes/console.php | 3 + routes/web.php | 19 + tests/Feature/CrawlerCommandTest.php | 62 +++ 45 files changed, 4091 insertions(+), 8 deletions(-) create mode 100644 app/Console/Commands/CrawlerHealthCheckCommand.php create mode 100644 app/Console/Commands/CrawlerRetryFailedCommand.php create mode 100644 app/Console/Commands/CrawlerRunCommand.php create mode 100644 app/Enums/CrawlAlertSeverity.php create mode 100644 app/Enums/CrawlRunItemStatus.php create mode 100644 app/Enums/CrawlRunStatus.php create mode 100644 app/Enums/CrawlTargetModule.php create mode 100644 app/Enums/CrawlTriggerType.php create mode 100644 app/Http/Controllers/Admin/CrawlAlertController.php create mode 100644 app/Http/Controllers/Admin/CrawlRunController.php create mode 100644 app/Http/Controllers/Admin/CrawlerRuleController.php create mode 100644 app/Http/Requests/Admin/CrawlRuleRequest.php create mode 100644 app/Jobs/RunCrawlRuleJob.php create mode 100644 app/Models/CrawlAlert.php create mode 100644 app/Models/CrawlRule.php create mode 100644 app/Models/CrawlRun.php create mode 100644 app/Models/CrawlRunItem.php create mode 100644 app/Services/Crawler/CrawlAlertService.php create mode 100644 app/Services/Crawler/CrawlEntityUpsertService.php create mode 100644 app/Services/Crawler/CrawlExecutionService.php create mode 100644 app/Services/Crawler/CrawlFetcherService.php create mode 100644 app/Services/Crawler/CrawlRuleScheduleService.php create mode 100644 app/Services/Crawler/OpenAiFallbackExtractor.php create mode 100644 app/Services/Crawler/XPathExtractor.php create mode 100644 config/crawler.php create mode 100644 database/migrations/2026_02_18_100000_create_crawl_rules_table.php create mode 100644 
database/migrations/2026_02_18_100100_create_crawl_runs_table.php create mode 100644 database/migrations/2026_02_18_100200_create_crawl_run_items_table.php create mode 100644 database/migrations/2026_02_18_100300_create_crawl_alerts_table.php create mode 100644 docs/crawler-rule-guide.md create mode 100644 docs/examples/ai-tools-dedupe.json create mode 100644 docs/examples/ai-tools-extractor.json create mode 100644 docs/examples/ai-tools-mapping.json create mode 100644 resources/views/admin/crawl-alerts/index.blade.php create mode 100644 resources/views/admin/crawl-runs/index.blade.php create mode 100644 resources/views/admin/crawl-runs/show.blade.php create mode 100644 resources/views/admin/crawlers/form.blade.php create mode 100644 resources/views/admin/crawlers/index.blade.php create mode 100644 tests/Feature/CrawlerCommandTest.php diff --git a/.env.example b/.env.example index 939bfcd..f05d3b6 100644 --- a/.env.example +++ b/.env.example @@ -61,4 +61,19 @@ AWS_USE_PATH_STYLE_ENDPOINT=false VITE_APP_NAME="${APP_NAME}" - +CRAWLER_USER_AGENT="AIWebCrawler/1.0 (+https://dev.aiweb.com)" +CRAWLER_REQUEST_TIMEOUT=20 +CRAWLER_AI_TIMEOUT=60 +CRAWLER_VERIFY_SSL=true +CRAWLER_DNS_SERVERS= +CRAWLER_FORCE_IPV4=false +CRAWLER_AI_WIRE_API=chat_completions +CRAWLER_AI_BASE_URL= +CRAWLER_BROWSERLESS_ENDPOINT= +CRAWLER_BROWSERLESS_TOKEN= +CRAWLER_AI_ENDPOINT= +CRAWLER_AI_KEY= +CRAWLER_AI_MODEL=gpt-4o-mini +CRAWLER_AI_REASONING_EFFORT= +CRAWLER_AI_DISABLE_RESPONSE_STORAGE=false +CRAWLER_ALERT_EMAIL= diff --git a/app/Console/Commands/CrawlerHealthCheckCommand.php b/app/Console/Commands/CrawlerHealthCheckCommand.php new file mode 100644 index 0000000..51f2fec --- /dev/null +++ b/app/Console/Commands/CrawlerHealthCheckCommand.php @@ -0,0 +1,41 @@ + 'Queue Connection', 'status' => (string) config('queue.default'), 'detail' => '当前队列连接'], + ['item' => 'Browserless Endpoint', 'status' => (string) (config('crawler.browserless_endpoint') ?: 'not-configured'), 'detail' => 'JS渲染服务'], + 
['item' => 'AI Endpoint', 'status' => (string) (config('crawler.openai_compatible_endpoint') ?: 'not-configured'), 'detail' => 'AI兜底抽取'], + ['item' => 'Alert Email', 'status' => (string) (config('crawler.default_alert_email') ?: 'not-configured'), 'detail' => '默认告警邮箱'], + ]; + + $browserlessEndpoint = (string) config('crawler.browserless_endpoint', ''); + + if ($browserlessEndpoint !== '') { + try { + $response = Http::timeout(5)->get($browserlessEndpoint); + $checks[] = ['item' => 'Browserless Reachable', 'status' => $response->status() < 500 ? 'ok' : 'degraded', 'detail' => 'HTTP '.$response->status()]; + } catch (\Throwable $exception) { + $checks[] = ['item' => 'Browserless Reachable', 'status' => 'failed', 'detail' => $exception->getMessage()]; + } + } + + $this->table(['Item', 'Status', 'Detail'], $checks); + + return self::SUCCESS; + } +} + diff --git a/app/Console/Commands/CrawlerRetryFailedCommand.php b/app/Console/Commands/CrawlerRetryFailedCommand.php new file mode 100644 index 0000000..9c4bc3a --- /dev/null +++ b/app/Console/Commands/CrawlerRetryFailedCommand.php @@ -0,0 +1,39 @@ +with('rule')->find((int) $this->argument('runId')); + + if (! 
$run instanceof CrawlRun || $run->rule === null) { + $this->error('运行记录不存在或规则已删除'); + + return self::FAILURE; + } + + if ((bool) $this->option('sync')) { + RunCrawlRuleJob::dispatchSync($run->rule_id, CrawlTriggerType::Retry->value, null, $run->id); + } else { + RunCrawlRuleJob::dispatch($run->rule_id, CrawlTriggerType::Retry->value, null, $run->id); + } + + $this->info(sprintf('已提交重试任务,规则 #%d %s', $run->rule_id, $run->rule->name)); + + return self::SUCCESS; + } +} + diff --git a/app/Console/Commands/CrawlerRunCommand.php b/app/Console/Commands/CrawlerRunCommand.php new file mode 100644 index 0000000..71173ca --- /dev/null +++ b/app/Console/Commands/CrawlerRunCommand.php @@ -0,0 +1,64 @@ +argument('ruleId'); + + $query = CrawlRule::query()->where('enabled', true); + + if ($ruleId !== null) { + $query->whereKey((int) $ruleId); + } + + $rules = $query->orderBy('id')->get(); + + if ($rules->isEmpty()) { + $this->warn('没有可执行的采集规则'); + + return self::SUCCESS; + } + + $shouldRunAll = (bool) $this->option('all') || $ruleId !== null; + $sync = (bool) $this->option('sync'); + + $dispatched = 0; + + foreach ($rules as $rule) { + if (! $shouldRunAll && ! 
$scheduleService->isDue($rule)) { + continue; + } + + if ($sync) { + RunCrawlRuleJob::dispatchSync($rule->id, CrawlTriggerType::Schedule->value); + } else { + RunCrawlRuleJob::dispatch($rule->id, CrawlTriggerType::Schedule->value); + } + + $dispatched++; + $this->info(sprintf('已提交规则 #%d %s', $rule->id, $rule->name)); + } + + if ($dispatched === 0) { + $this->line('当前无到期规则'); + } + + return self::SUCCESS; + } +} + diff --git a/app/Enums/CrawlAlertSeverity.php b/app/Enums/CrawlAlertSeverity.php new file mode 100644 index 0000000..08e1a39 --- /dev/null +++ b/app/Enums/CrawlAlertSeverity.php @@ -0,0 +1,14 @@ + 'AI 工具', + self::Model => 'AI 模型', + }; + } +} + diff --git a/app/Enums/CrawlTriggerType.php b/app/Enums/CrawlTriggerType.php new file mode 100644 index 0000000..1852276 --- /dev/null +++ b/app/Enums/CrawlTriggerType.php @@ -0,0 +1,13 @@ +with(['rule', 'run']) + ->when($request->filled('resolved'), function ($query) use ($request): void { + $query->where('is_resolved', (bool) $request->boolean('resolved')); + }) + ->latest('id') + ->paginate(20) + ->withQueryString(); + + return view('admin.crawl-alerts.index', [ + 'items' => $items, + 'filters' => $request->only(['resolved']), + ]); + } + + public function resolve(CrawlAlert $alert): RedirectResponse + { + $alert->is_resolved = true; + $alert->resolved_at = now(); + $alert->resolved_by = null; + $alert->save(); + + return redirect()->back()->with('status', '告警已标记为已处理'); + } +} + diff --git a/app/Http/Controllers/Admin/CrawlRunController.php b/app/Http/Controllers/Admin/CrawlRunController.php new file mode 100644 index 0000000..20d6727 --- /dev/null +++ b/app/Http/Controllers/Admin/CrawlRunController.php @@ -0,0 +1,54 @@ +with('rule') + ->when($request->filled('rule_id'), function ($query) use ($request): void { + $query->where('rule_id', (int) $request->input('rule_id')); + }) + ->latest('id') + ->paginate(20) + ->withQueryString(); + + return view('admin.crawl-runs.index', [ + 'items' => $items, + 'filters' => 
$request->only(['rule_id']), + ]); + } + + public function show(CrawlRun $run): View + { + $run->load(['rule', 'items' => function ($query): void { + $query->latest('id'); + }, 'alerts']); + + return view('admin.crawl-runs.show', [ + 'run' => $run, + ]); + } + + public function retry(CrawlRun $run): RedirectResponse + { + if ($run->rule_id !== null) { + RunCrawlRuleJob::dispatch($run->rule_id, 'retry', null, $run->id); + } + + return redirect()->route('admin.crawl-runs.index', ['rule_id' => $run->rule_id]) + ->with('status', '已提交重试任务'); + } +} + diff --git a/app/Http/Controllers/Admin/CrawlerRuleController.php b/app/Http/Controllers/Admin/CrawlerRuleController.php new file mode 100644 index 0000000..1930647 --- /dev/null +++ b/app/Http/Controllers/Admin/CrawlerRuleController.php @@ -0,0 +1,413 @@ +withCount('runs') + ->when($request->filled('q'), function ($query) use ($request): void { + $keyword = '%'.trim((string) $request->string('q')).'%'; + $query->where('name', 'like', $keyword); + }) + ->latest('updated_at') + ->paginate(20) + ->withQueryString(); + + return view('admin.crawlers.index', [ + 'items' => $items, + 'filters' => $request->only(['q']), + ]); + } + + public function create(): View + { + return view('admin.crawlers.form', [ + 'item' => new CrawlRule([ + 'enabled' => true, + 'target_module' => CrawlTargetModule::Tool, + 'cron_expression' => '0 */6 * * *', + 'timezone' => 'Asia/Shanghai', + 'max_pages' => 50, + 'rate_limit_per_minute' => 30, + 'retry_max' => 3, + 'retry_backoff_seconds' => 60, + 'extractor_config' => [ + 'mode' => 'xpath', + 'list_link_xpath' => '//a/@href', + 'fields' => [ + 'name' => '//h1/text()', + 'summary' => '//meta[@name="description"]/@content', + ], + 'ai' => [ + 'temperature' => 0, + 'content_max_chars' => 12000, + ], + ], + 'mapping_config' => [], + 'dedupe_config' => [], + 'publish_policy' => 'draft', + 'ai_provider' => 'openai_compatible', + 'ai_fallback_enabled' => false, + ]), + 'method' => 'POST', + 'submitRoute' => 
route('admin.crawlers.store'), + ]); + } + + public function store(CrawlRuleRequest $request): RedirectResponse + { + $payload = $request->normalizedPayload(); + $payload['created_by'] = null; + $payload['updated_by'] = null; + + $item = CrawlRule::query()->create($payload); + $item->next_run_at = $this->scheduleService->nextRunAt($item); + $item->save(); + + return redirect()->route('admin.crawlers.edit', $item)->with('status', '采集规则已创建。'); + } + + public function edit(CrawlRule $crawler): View + { + return view('admin.crawlers.form', [ + 'item' => $crawler, + 'method' => 'PUT', + 'submitRoute' => route('admin.crawlers.update', $crawler), + ]); + } + + public function update(CrawlRuleRequest $request, CrawlRule $crawler): RedirectResponse + { + $payload = $request->normalizedPayload(); + $payload['updated_by'] = null; + + $crawler->fill($payload); + $crawler->next_run_at = $this->scheduleService->nextRunAt($crawler); + $crawler->save(); + + return redirect()->route('admin.crawlers.edit', $crawler)->with('status', '采集规则已更新。'); + } + + public function run(CrawlRule $crawler): RedirectResponse + { + RunCrawlRuleJob::dispatch($crawler->id, CrawlTriggerType::Manual->value); + + return redirect()->route('admin.crawl-runs.index', ['rule_id' => $crawler->id]) + ->with('status', '已提交手动执行任务。'); + } + + public function preview(Request $request): JsonResponse + { + $payload = $request->validate([ + 'url' => ['required', 'url', 'max:2000'], + 'user_agent' => ['nullable', 'string', 'max:255'], + ]); + + $url = (string) $payload['url']; + + if (! $this->isSafePreviewUrl($url)) { + return response()->json([ + 'ok' => false, + 'message' => '预览地址不安全,已拒绝请求。', + ], 422); + } + + $fetched = $this->fetchHtml($url, $payload['user_agent'] ?? null); + if (! $fetched['ok']) { + return response()->json([ + 'ok' => false, + 'message' => '页面抓取失败:'.($fetched['error'] ?? 
'unknown'), + ], 422); + } + + $sanitizedHtml = $this->sanitizePreviewHtml($fetched['body']); + + return response()->json([ + 'ok' => true, + 'url' => $url, + 'title' => $this->extractTitle($sanitizedHtml), + 'html' => $sanitizedHtml, + ]); + } + + public function aiSuggestExtractor(Request $request): JsonResponse + { + $payload = $request->validate([ + 'url' => ['required', 'url', 'max:2000'], + 'target_module' => ['required', 'in:tool,model'], + 'user_agent' => ['nullable', 'string', 'max:255'], + 'ai_model' => ['nullable', 'string', 'max:128'], + 'ai_system_prompt' => ['nullable', 'string', 'max:4000'], + 'ai_user_prompt' => ['nullable', 'string', 'max:4000'], + 'ai_temperature' => ['nullable', 'numeric', 'between:0,2'], + 'ai_content_max_chars' => ['nullable', 'integer', 'between:500,50000'], + ]); + + if (! $this->aiExtractor->isConfigured()) { + return response()->json([ + 'ok' => false, + 'message' => 'AI 配置缺失,请先设置 CRAWLER_AI_KEY 与接口地址。', + ], 422); + } + + $url = (string) $payload['url']; + + if (! $this->isSafePreviewUrl($url)) { + return response()->json([ + 'ok' => false, + 'message' => '目标 URL 不安全,已拒绝请求。', + ], 422); + } + + $fetched = $this->fetchHtml($url, $payload['user_agent'] ?? null); + if (! $fetched['ok']) { + return response()->json([ + 'ok' => false, + 'message' => '页面抓取失败:'.($fetched['error'] ?? 'unknown'), + ], 422); + } + + $options = []; + foreach (['ai_model' => 'model', 'ai_system_prompt' => 'system_prompt', 'ai_user_prompt' => 'user_prompt'] as $source => $target) { + if (is_string($payload[$source] ?? 
null) && trim((string) $payload[$source]) !== '') { + $options[$target] = trim((string) $payload[$source]); + } + } + + if (isset($payload['ai_temperature']) && $payload['ai_temperature'] !== '') { + $options['temperature'] = (float) $payload['ai_temperature']; + } + + if (isset($payload['ai_content_max_chars']) && $payload['ai_content_max_chars'] !== '') { + $options['content_max_chars'] = (int) $payload['ai_content_max_chars']; + } + + $extractorConfig = $this->aiExtractor->suggestExtractorConfig( + (string) $payload['target_module'], + $this->sanitizePreviewHtml($fetched['body']), + $options, + ); + + if ($extractorConfig === []) { + $reason = $this->aiExtractor->lastError(); + + return response()->json([ + 'ok' => false, + 'message' => $reason !== null && $reason !== '' + ? 'AI 生成失败:'.$reason + : 'AI 未生成有效规则,请调整页面或提示词后重试。', + ], 422); + } + + return response()->json([ + 'ok' => true, + 'extractor_config' => $extractorConfig, + ]); + } + + /** + * @return array{ok: bool, body: string, error: string|null} + */ + private function fetchHtml(string $url, ?string $userAgent = null): array + { + $ua = is_string($userAgent) && trim($userAgent) !== '' + ? trim($userAgent) + : (string) config('crawler.default_user_agent', 'AIWebCrawler/1.0'); + + $maxAttempts = 3; + $lastError = 'unknown'; + + for ($attempt = 1; $attempt <= $maxAttempts; $attempt++) { + try { + $request = Http::timeout(max((int) config('crawler.request_timeout_seconds', 20), 5)); + if (! 
(bool) config('crawler.verify_ssl', true)) { + $request = $request->withoutVerifying(); + } + + $request = $this->applyNetworkOptions($request); + $response = $request->withUserAgent($ua)->get($url); + + if ($response->successful()) { + return [ + 'ok' => true, + 'body' => $response->body(), + 'error' => null, + ]; + } + + $lastError = sprintf('HTTP %d', $response->status()); + if ($attempt < $maxAttempts && $response->serverError()) { + usleep(250000 * $attempt); + continue; + } + + break; + } catch (\Throwable $exception) { + $lastError = $exception->getMessage(); + if ($attempt < $maxAttempts) { + usleep(250000 * $attempt); + continue; + } + } + } + + return [ + 'ok' => false, + 'body' => '', + 'error' => $lastError, + ]; + } + + private function applyNetworkOptions(\Illuminate\Http\Client\PendingRequest $request): \Illuminate\Http\Client\PendingRequest + { + $options = []; + + if ((bool) config('crawler.force_ipv4', false)) { + $options['force_ip_resolve'] = 'v4'; + } + + $dnsServers = trim((string) config('crawler.dns_servers', '')); + if ($dnsServers !== '' && defined('CURLOPT_DNS_SERVERS')) { + $options['curl'][CURLOPT_DNS_SERVERS] = $dnsServers; + } + + if ($options === []) { + return $request; + } + + return $request->withOptions($options); + } + + private function isSafePreviewUrl(string $url): bool + { + $parts = parse_url($url); + if (! is_array($parts)) { + return false; + } + + $scheme = strtolower((string) ($parts['scheme'] ?? '')); + $host = strtolower((string) ($parts['host'] ?? '')); + + if (! in_array($scheme, ['http', 'https'], true) || $host === '') { + return false; + } + + if ($host === 'localhost') { + return false; + } + + if (filter_var($host, FILTER_VALIDATE_IP) !== false) { + return $this->isPublicIp($host); + } + + $records = @dns_get_record($host, DNS_A + DNS_AAAA); + if (! is_array($records) || $records === []) { + return true; + } + + foreach ($records as $record) { + $ip = (string) ($record['ip'] ?? $record['ipv6'] ?? 
''); + if ($ip !== '' && ! $this->isPublicIp($ip)) { + return false; + } + } + + return true; + } + + private function isPublicIp(string $ip): bool + { + return filter_var( + $ip, + FILTER_VALIDATE_IP, + FILTER_FLAG_NO_PRIV_RANGE | FILTER_FLAG_NO_RES_RANGE, + ) !== false; + } + + private function sanitizePreviewHtml(string $html): string + { + if (trim($html) === '') { + return '空页面'; + } + + $dom = new \DOMDocument('1.0', 'UTF-8'); + + libxml_use_internal_errors(true); + $dom->loadHTML(''.$html, LIBXML_NOWARNING | LIBXML_NOERROR); + libxml_clear_errors(); + + $xpath = new \DOMXPath($dom); + + foreach (['//script', '//noscript', '//iframe', '//object', '//embed', '//base'] as $query) { + $nodes = $xpath->query($query); + if ($nodes === false) { + continue; + } + + for ($index = $nodes->length - 1; $index >= 0; $index--) { + $node = $nodes->item($index); + if ($node !== null && $node->parentNode !== null) { + $node->parentNode->removeChild($node); + } + } + } + + $allNodes = $xpath->query('//*'); + if ($allNodes !== false) { + foreach ($allNodes as $node) { + if (! 
$node instanceof \DOMElement) { + continue; + } + + $attributesToRemove = []; + foreach ($node->attributes as $attribute) { + if (str_starts_with(strtolower($attribute->name), 'on')) { + $attributesToRemove[] = $attribute->name; + } + } + + foreach ($attributesToRemove as $attributeName) { + $node->removeAttribute($attributeName); + } + } + } + + $output = (string) $dom->saveHTML(); + + return mb_substr($output, 0, 300000); + } + + private function extractTitle(string $html): string + { + if (preg_match('/]*>(.*?)<\/title>/is', $html, $matches) !== 1) { + return ''; + } + + return trim(strip_tags((string) $matches[1])); + } +} diff --git a/app/Http/Requests/Admin/CrawlRuleRequest.php b/app/Http/Requests/Admin/CrawlRuleRequest.php new file mode 100644 index 0000000..2b73d72 --- /dev/null +++ b/app/Http/Requests/Admin/CrawlRuleRequest.php @@ -0,0 +1,254 @@ +decodeJsonToArray($this->input('extractor_json')); + + $this->merge([ + 'enabled' => $this->boolean('enabled'), + 'render_js' => $this->boolean('render_js'), + 'ai_fallback_enabled' => $this->boolean('ai_fallback_enabled'), + 'extractor_mode' => $this->input('extractor_mode') ?: (string) ($extractorConfig['mode'] ?? 
'xpath'), + ]); + } + + public function rules(): array + { + return [ + 'name' => ['required', 'string', 'max:150'], + 'target_module' => ['required', Rule::in(array_column(CrawlTargetModule::cases(), 'value'))], + 'enabled' => ['nullable', 'boolean'], + 'entry_urls' => ['required', 'string'], + 'cron_expression' => ['required', 'string', 'max:64'], + 'timezone' => ['required', 'string', 'max:64'], + 'max_pages' => ['required', 'integer', 'between:1,2000'], + 'render_js' => ['nullable', 'boolean'], + 'user_agent' => ['nullable', 'string', 'max:255'], + 'headers_json' => ['nullable', 'json'], + 'cookies_json' => ['nullable', 'json'], + 'proxy' => ['nullable', 'string', 'max:255'], + 'rate_limit_per_minute' => ['required', 'integer', 'between:1,2000'], + 'retry_max' => ['required', 'integer', 'between:1,10'], + 'retry_backoff_seconds' => ['required', 'integer', 'between:1,3600'], + 'extractor_json' => ['required', 'json'], + 'extractor_mode' => ['required', Rule::in(['xpath', 'ai', 'hybrid'])], + 'mapping_json' => ['nullable', 'json'], + 'dedupe_json' => ['nullable', 'json'], + 'ai_fallback_enabled' => ['nullable', 'boolean'], + 'ai_provider' => ['nullable', 'string', 'max:64'], + 'ai_model' => ['nullable', 'string', 'max:128'], + 'ai_system_prompt' => ['nullable', 'string', 'max:4000'], + 'ai_user_prompt' => ['nullable', 'string', 'max:4000'], + 'ai_temperature' => ['nullable', 'numeric', 'between:0,2'], + 'ai_content_max_chars' => ['nullable', 'integer', 'between:500,50000'], + 'publish_policy' => ['required', Rule::in(['draft'])], + 'alert_email' => ['nullable', 'email'], + ]; + } + + public function messages(): array + { + return [ + 'name.required' => '请填写规则名称。', + 'target_module.required' => '请选择目标模块。', + 'entry_urls.required' => '请至少填写一个入口 URL。', + 'cron_expression.required' => '请填写 Cron 表达式。', + 'timezone.required' => '请填写时区。', + 'max_pages.required' => '请填写最大页面数。', + 'max_pages.integer' => '最大页面数必须是整数。', + 'max_pages.between' => '最大页面数需在 1 到 2000 之间。', + 
'rate_limit_per_minute.required' => '请填写每分钟限流值。', + 'rate_limit_per_minute.integer' => '每分钟限流值必须是整数。', + 'rate_limit_per_minute.between' => '每分钟限流值需在 1 到 2000 之间。', + 'retry_max.required' => '请填写最大重试次数。', + 'retry_max.integer' => '最大重试次数必须是整数。', + 'retry_max.between' => '最大重试次数需在 1 到 10 之间。', + 'retry_backoff_seconds.required' => '请填写重试退避秒数。', + 'retry_backoff_seconds.integer' => '重试退避秒数必须是整数。', + 'retry_backoff_seconds.between' => '重试退避秒数需在 1 到 3600 之间。', + 'extractor_json.required' => '请填写 Extractor JSON。', + 'extractor_json.json' => 'Extractor JSON 格式不合法。', + 'extractor_mode.required' => '请选择抽取模式。', + 'extractor_mode.in' => '抽取模式仅支持 xpath、ai、hybrid。', + 'mapping_json.json' => 'Mapping JSON 格式不合法。', + 'dedupe_json.json' => 'Dedupe JSON 格式不合法。', + 'headers_json.json' => 'Headers JSON 格式不合法。', + 'cookies_json.json' => 'Cookies JSON 格式不合法。', + 'ai_temperature.between' => 'AI 温度需在 0 到 2 之间。', + 'ai_content_max_chars.between' => 'AI 内容截断长度需在 500 到 50000 之间。', + 'alert_email.email' => '告警邮箱格式不合法。', + ]; + } + + public function attributes(): array + { + return [ + 'name' => '规则名称', + 'target_module' => '目标模块', + 'entry_urls' => '入口 URL', + 'cron_expression' => 'Cron 表达式', + 'timezone' => '时区', + 'max_pages' => '最大页面数', + 'rate_limit_per_minute' => '每分钟限流', + 'retry_max' => '最大重试次数', + 'retry_backoff_seconds' => '重试退避秒数', + 'extractor_json' => 'Extractor JSON', + 'extractor_mode' => '抽取模式', + 'mapping_json' => 'Mapping JSON', + 'dedupe_json' => 'Dedupe JSON', + 'headers_json' => 'Headers JSON', + 'cookies_json' => 'Cookies JSON', + 'ai_system_prompt' => 'AI 系统提示词', + 'ai_user_prompt' => 'AI 用户提示词', + 'ai_temperature' => 'AI 温度', + 'ai_content_max_chars' => 'AI 内容截断长度', + 'alert_email' => '告警邮箱', + ]; + } + + /** + * @return array + */ + public function normalizedPayload(): array + { + $payload = $this->validated(); + + $extractorConfig = $this->decodeJsonToArray($payload['extractor_json'] ?? null); + $extractorMode = (string) ($payload['extractor_mode'] ?? 
($extractorConfig['mode'] ?? 'xpath')); + + if (! in_array($extractorMode, ['xpath', 'ai', 'hybrid'], true)) { + $extractorMode = 'xpath'; + } + + $extractorConfig['mode'] = $extractorMode; + + $aiConfig = $this->buildAiConfig($payload); + if ($aiConfig !== []) { + $extractorConfig['ai'] = $aiConfig; + } else { + unset($extractorConfig['ai']); + } + + return [ + 'name' => $payload['name'], + 'target_module' => $payload['target_module'], + 'enabled' => (bool) ($payload['enabled'] ?? false), + 'entry_urls' => $this->parseEntryUrls((string) ($payload['entry_urls'] ?? '')), + 'cron_expression' => trim((string) $payload['cron_expression']), + 'timezone' => trim((string) $payload['timezone']), + 'max_pages' => (int) $payload['max_pages'], + 'render_js' => (bool) ($payload['render_js'] ?? false), + 'user_agent' => $this->nullableTrim($payload['user_agent'] ?? null), + 'headers' => $this->decodeJsonToArray($payload['headers_json'] ?? null), + 'cookies' => $this->decodeJsonToArray($payload['cookies_json'] ?? null), + 'proxy' => $this->nullableTrim($payload['proxy'] ?? null), + 'rate_limit_per_minute' => (int) $payload['rate_limit_per_minute'], + 'retry_max' => (int) $payload['retry_max'], + 'retry_backoff_seconds' => (int) $payload['retry_backoff_seconds'], + 'extractor_config' => $extractorConfig, + 'mapping_config' => $this->decodeJsonToArray($payload['mapping_json'] ?? null), + 'dedupe_config' => $this->decodeJsonToArray($payload['dedupe_json'] ?? null), + 'ai_fallback_enabled' => (bool) ($payload['ai_fallback_enabled'] ?? false), + 'ai_provider' => $this->nullableTrim($payload['ai_provider'] ?? null), + 'ai_model' => $this->nullableTrim($payload['ai_model'] ?? null), + 'publish_policy' => (string) $payload['publish_policy'], + 'alert_email' => $this->nullableTrim($payload['alert_email'] ?? 
null), + ]; + } + + /** + * @param array $payload + * @return array + */ + private function buildAiConfig(array $payload): array + { + $aiConfig = []; + + $systemPrompt = $this->nullableTrim($payload['ai_system_prompt'] ?? null); + if ($systemPrompt !== null) { + $aiConfig['system_prompt'] = $systemPrompt; + } + + $userPrompt = $this->nullableTrim($payload['ai_user_prompt'] ?? null); + if ($userPrompt !== null) { + $aiConfig['user_prompt'] = $userPrompt; + } + + if (isset($payload['ai_temperature']) && $payload['ai_temperature'] !== '') { + $aiConfig['temperature'] = (float) $payload['ai_temperature']; + } + + if (isset($payload['ai_content_max_chars']) && $payload['ai_content_max_chars'] !== '') { + $aiConfig['content_max_chars'] = (int) $payload['ai_content_max_chars']; + } + + $aiModel = $this->nullableTrim($payload['ai_model'] ?? null); + if ($aiModel !== null) { + $aiConfig['model'] = $aiModel; + } + + return $aiConfig; + } + + private function nullableTrim(mixed $value): ?string + { + if (! is_string($value)) { + return null; + } + + $trimmed = trim($value); + + return $trimmed === '' ? null : $trimmed; + } + + /** + * @return list + */ + private function parseEntryUrls(string $entryUrls): array + { + $lines = preg_split('/\r\n|\r|\n/', $entryUrls) ?: []; + + $urls = []; + foreach ($lines as $line) { + $candidate = trim($line); + if ($candidate === '') { + continue; + } + + if (filter_var($candidate, FILTER_VALIDATE_URL) !== false) { + $urls[] = $candidate; + } + } + + return array_values(array_unique($urls)); + } + + /** + * @return array + */ + private function decodeJsonToArray(mixed $value): array + { + if (! is_string($value) || trim($value) === '') { + return []; + } + + $decoded = json_decode($value, true); + + return is_array($decoded) ? 
$decoded : []; + } +} \ No newline at end of file diff --git a/app/Jobs/RunCrawlRuleJob.php b/app/Jobs/RunCrawlRuleJob.php new file mode 100644 index 0000000..f560e4b --- /dev/null +++ b/app/Jobs/RunCrawlRuleJob.php @@ -0,0 +1,49 @@ +find($this->ruleId); + + if (! $rule instanceof CrawlRule) { + return; + } + + $trigger = CrawlTriggerType::tryFrom($this->triggerType) ?? CrawlTriggerType::Manual; + + $metrics = []; + if ($this->retryFromRunId !== null) { + $metrics['retry_from_run_id'] = $this->retryFromRunId; + } + + $executionService->runRule($rule, $trigger, $this->createdBy, $metrics); + } +} + diff --git a/app/Models/CrawlAlert.php b/app/Models/CrawlAlert.php new file mode 100644 index 0000000..e923a96 --- /dev/null +++ b/app/Models/CrawlAlert.php @@ -0,0 +1,48 @@ + CrawlAlertSeverity::class, + 'context' => 'array', + 'is_resolved' => 'boolean', + 'resolved_at' => 'datetime', + ]; + } + + public function run(): BelongsTo + { + return $this->belongsTo(CrawlRun::class, 'run_id'); + } + + public function rule(): BelongsTo + { + return $this->belongsTo(CrawlRule::class, 'rule_id'); + } +} + diff --git a/app/Models/CrawlRule.php b/app/Models/CrawlRule.php new file mode 100644 index 0000000..e0f3f69 --- /dev/null +++ b/app/Models/CrawlRule.php @@ -0,0 +1,74 @@ + CrawlTargetModule::class, + 'enabled' => 'boolean', + 'entry_urls' => 'array', + 'headers' => 'array', + 'cookies' => 'array', + 'extractor_config' => 'array', + 'mapping_config' => 'array', + 'dedupe_config' => 'array', + 'render_js' => 'boolean', + 'ai_fallback_enabled' => 'boolean', + 'last_run_at' => 'datetime', + 'next_run_at' => 'datetime', + ]; + } + + public function runs(): HasMany + { + return $this->hasMany(CrawlRun::class, 'rule_id'); + } + + public function alerts(): HasMany + { + return $this->hasMany(CrawlAlert::class, 'rule_id'); + } +} + diff --git a/app/Models/CrawlRun.php b/app/Models/CrawlRun.php new file mode 100644 index 0000000..b0d7d1e --- /dev/null +++ b/app/Models/CrawlRun.php @@ 
-0,0 +1,59 @@ + CrawlTriggerType::class, + 'status' => CrawlRunStatus::class, + 'started_at' => 'datetime', + 'finished_at' => 'datetime', + 'metrics' => 'array', + ]; + } + + public function rule(): BelongsTo + { + return $this->belongsTo(CrawlRule::class, 'rule_id'); + } + + public function items(): HasMany + { + return $this->hasMany(CrawlRunItem::class, 'run_id'); + } + + public function alerts(): HasMany + { + return $this->hasMany(CrawlAlert::class, 'run_id'); + } +} + diff --git a/app/Models/CrawlRunItem.php b/app/Models/CrawlRunItem.php new file mode 100644 index 0000000..ae48acb --- /dev/null +++ b/app/Models/CrawlRunItem.php @@ -0,0 +1,46 @@ + CrawlRunItemStatus::class, + 'raw_payload' => 'array', + 'normalized_payload' => 'array', + 'upsert_result' => 'array', + ]; + } + + public function run(): BelongsTo + { + return $this->belongsTo(CrawlRun::class, 'run_id'); + } +} + diff --git a/app/Services/Crawler/CrawlAlertService.php b/app/Services/Crawler/CrawlAlertService.php new file mode 100644 index 0000000..45e6f88 --- /dev/null +++ b/app/Services/Crawler/CrawlAlertService.php @@ -0,0 +1,73 @@ +create([ + 'run_id' => $run?->id, + 'rule_id' => $rule?->id, + 'severity' => $severity, + 'type' => $type, + 'message' => $message, + 'context' => $context, + 'is_resolved' => false, + ]); + + $recipient = $rule?->alert_email ?: config('crawler.default_alert_email'); + + if (is_string($recipient) && $recipient !== '') { + try { + Mail::raw($this->buildEmailBody($alert), static function ($mail) use ($recipient, $severity): void { + $mail->to($recipient) + ->subject(sprintf('[Crawler][%s] 采集告警', strtoupper($severity->value))); + }); + } catch (\Throwable $exception) { + Log::warning('Crawler alert email failed', [ + 'alert_id' => $alert->id, + 'error' => $exception->getMessage(), + ]); + } + } + + return $alert; + } + + private function buildEmailBody(CrawlAlert $alert): string + { + $lines = [ + '采集告警通知', + sprintf('等级: %s', $alert->severity?->value ?? 
'unknown'), + sprintf('类型: %s', $alert->type), + sprintf('信息: %s', $alert->message), + sprintf('规则ID: %s', (string) ($alert->rule_id ?? '-')), + sprintf('运行ID: %s', (string) ($alert->run_id ?? '-')), + sprintf('时间: %s', (string) $alert->created_at), + ]; + + if (is_array($alert->context) && $alert->context !== []) { + $lines[] = '上下文:'; + $lines[] = json_encode($alert->context, JSON_UNESCAPED_UNICODE | JSON_PRETTY_PRINT) ?: '{}'; + } + + return implode("\n", $lines); + } +} + diff --git a/app/Services/Crawler/CrawlEntityUpsertService.php b/app/Services/Crawler/CrawlEntityUpsertService.php new file mode 100644 index 0000000..8488b8e --- /dev/null +++ b/app/Services/Crawler/CrawlEntityUpsertService.php @@ -0,0 +1,277 @@ + $payload + * @return array + */ + public function upsert(CrawlRule $rule, array $payload, string $detailUrl): array + { + $mapped = $this->applyMapping($payload, is_array($rule->mapping_config) ? $rule->mapping_config : []); + + return match ($rule->target_module?->value) { + 'model' => $this->upsertModel($mapped, $detailUrl), + default => $this->upsertTool($mapped, $detailUrl), + }; + } + + /** + * @param array $payload + * @param array $mapping + * @return array + */ + private function applyMapping(array $payload, array $mapping): array + { + if ($mapping === []) { + return $payload; + } + + $result = $payload; + + foreach ($mapping as $target => $source) { + if (! is_string($target) || ! is_string($source)) { + continue; + } + + if (array_key_exists($source, $payload)) { + $result[$target] = $payload[$source]; + } + } + + return $result; + } + + /** + * @param array $payload + * @return array + */ + private function upsertTool(array $payload, string $detailUrl): array + { + $name = trim((string) ($payload['name'] ?? '')); + if ($name === '') { + throw new \RuntimeException('Tool payload missing name'); + } + + $slug = trim((string) ($payload['slug'] ?? '')); + $slug = $slug !== '' ? Str::slug($slug) : Str::slug($name); + $slug = $slug !== '' ? 
$slug : 'tool-'.Str::lower(Str::random(8)); + + $officialUrl = trim((string) ($payload['official_url'] ?? $payload['url'] ?? $detailUrl)); + $canonicalUrl = trim((string) ($payload['canonical_url'] ?? '')); + $summary = trim((string) ($payload['summary'] ?? '')); + + if ($summary === '') { + $summary = mb_substr(trim((string) ($payload['description'] ?? $name)), 0, 240); + } + + $source = $this->resolveSource($officialUrl !== '' ? $officialUrl : $detailUrl); + $categoryId = $this->resolveCategoryId('tool', $payload); + + $entity = Tool::query() + ->when($officialUrl !== '', static function ($query) use ($officialUrl): void { + $query->where('official_url', $officialUrl)->orWhere('canonical_url', $officialUrl); + }, static function ($query) use ($slug, $name): void { + $query->where('slug', $slug)->orWhereRaw('LOWER(name) = ?', [mb_strtolower($name)]); + }) + ->first(); + + $action = $entity === null ? 'created' : 'updated'; + + $entity ??= new Tool(); + + $entity->fill([ + 'category_id' => $categoryId, + 'source_id' => $source?->id, + 'name' => $name, + 'slug' => $this->resolveUniqueSlug(Tool::class, $slug, $entity->id), + 'summary' => mb_substr($summary, 0, 260), + 'description' => (string) ($payload['description'] ?? ''), + 'official_url' => $officialUrl !== '' ? $officialUrl : null, + 'logo_url' => (string) ($payload['logo_url'] ?? ''), + 'pricing_type' => (string) ($payload['pricing_type'] ?? 'unknown'), + 'platform' => (string) ($payload['platform'] ?? ''), + 'language' => (string) ($payload['language'] ?? ''), + 'has_api' => (bool) ($payload['has_api'] ?? false), + 'source_level' => $source?->trust_level ?? SourceLevel::Unknown, + 'status' => EntityStatus::Draft, + 'canonical_url' => $canonicalUrl !== '' ? 
$canonicalUrl : null, + 'last_verified_at' => now(), + ]); + + $entity->save(); + + return [ + 'action' => $action, + 'entity' => Tool::class, + 'entity_id' => $entity->id, + 'name' => $entity->name, + ]; + } + + /** + * @param array $payload + * @return array + */ + private function upsertModel(array $payload, string $detailUrl): array + { + $name = trim((string) ($payload['name'] ?? '')); + if ($name === '') { + throw new \RuntimeException('Model payload missing name'); + } + + $slug = trim((string) ($payload['slug'] ?? '')); + $slug = $slug !== '' ? Str::slug($slug) : Str::slug($name); + $slug = $slug !== '' ? $slug : 'model-'.Str::lower(Str::random(8)); + + $summary = trim((string) ($payload['summary'] ?? '')); + if ($summary === '') { + $summary = mb_substr(trim((string) ($payload['description'] ?? $name)), 0, 240); + } + + $officialUrl = trim((string) ($payload['official_url'] ?? $payload['url'] ?? $detailUrl)); + $canonicalUrl = trim((string) ($payload['canonical_url'] ?? '')); + + $source = $this->resolveSource($officialUrl !== '' ? $officialUrl : $detailUrl); + $categoryId = $this->resolveCategoryId('model', $payload); + + $entity = AiModel::query() + ->when($officialUrl !== '', static function ($query) use ($officialUrl): void { + $query->where('canonical_url', $officialUrl); + }, static function ($query) use ($slug, $name): void { + $query->where('slug', $slug)->orWhereRaw('LOWER(name) = ?', [mb_strtolower($name)]); + }) + ->first(); + + $action = $entity === null ? 'created' : 'updated'; + + $entity ??= new AiModel(); + + $entity->fill([ + 'category_id' => $categoryId, + 'source_id' => $source?->id, + 'name' => $name, + 'slug' => $this->resolveUniqueSlug(AiModel::class, $slug, $entity->id), + 'provider' => (string) ($payload['provider'] ?? ''), + 'summary' => mb_substr($summary, 0, 260), + 'description' => (string) ($payload['description'] ?? ''), + 'modality' => (string) ($payload['modality'] ?? 
'text'), + 'context_window' => $this->toNullableInt($payload['context_window'] ?? null), + 'price_input' => $this->toNullableFloat($payload['price_input'] ?? null), + 'price_output' => $this->toNullableFloat($payload['price_output'] ?? null), + 'deployment_mode' => (string) ($payload['deployment_mode'] ?? 'api'), + 'effectiveness_score' => $this->boundedScore($payload['effectiveness_score'] ?? 60), + 'price_score' => $this->boundedScore($payload['price_score'] ?? 60), + 'speed_score' => $this->boundedScore($payload['speed_score'] ?? 60), + 'source_level' => $source?->trust_level ?? SourceLevel::Unknown, + 'status' => EntityStatus::Draft, + 'canonical_url' => $canonicalUrl !== '' ? $canonicalUrl : ($officialUrl !== '' ? $officialUrl : null), + 'last_verified_at' => now(), + ]); + + $this->modelScoringService->apply($entity); + $entity->save(); + + return [ + 'action' => $action, + 'entity' => AiModel::class, + 'entity_id' => $entity->id, + 'name' => $entity->name, + ]; + } + + private function resolveSource(string $url): ?Source + { + $host = parse_url($url, PHP_URL_HOST); + + if (! is_string($host) || $host === '') { + return null; + } + + return Source::query()->where('domain', $host)->first(); + } + + /** + * @param array $payload + */ + private function resolveCategoryId(string $type, array $payload): ?int + { + $candidate = trim((string) ($payload['category_slug'] ?? $payload['category'] ?? 
'')); + + if ($candidate === '') { + return null; + } + + $category = Category::query() + ->where('type', $type) + ->where(static function ($query) use ($candidate): void { + $query->where('slug', $candidate)->orWhere('name', $candidate); + }) + ->first(); + + return $category?->id; + } + + /** + * @param class-string<\Illuminate\Database\Eloquent\Model> $modelClass + */ + private function resolveUniqueSlug(string $modelClass, string $slug, ?int $exceptId = null): string + { + $finalSlug = $slug; + $suffix = 1; + + while ($modelClass::query() + ->when($exceptId !== null, static fn ($query) => $query->where('id', '!=', $exceptId)) + ->where('slug', $finalSlug) + ->exists()) { + $finalSlug = sprintf('%s-%d', $slug, $suffix); + $suffix++; + } + + return $finalSlug; + } + + private function boundedScore(mixed $value): int + { + $score = (int) $value; + + return max(0, min(100, $score)); + } + + private function toNullableInt(mixed $value): ?int + { + if ($value === null || $value === '') { + return null; + } + + return (int) $value; + } + + private function toNullableFloat(mixed $value): ?float + { + if ($value === null || $value === '') { + return null; + } + + return (float) $value; + } +} + diff --git a/app/Services/Crawler/CrawlExecutionService.php b/app/Services/Crawler/CrawlExecutionService.php new file mode 100644 index 0000000..2364726 --- /dev/null +++ b/app/Services/Crawler/CrawlExecutionService.php @@ -0,0 +1,335 @@
+    /**
+     * Execute one crawl rule end to end and return the persisted run record.
+     *
+     * Steps: create a run in `Running` state, fetch every valid entry URL, extract the
+     * detail URLs from it (the entry URL itself is used when none are found), then fetch,
+     * extract and upsert each detail page until `max_pages` detail URLs were processed.
+     * Every step is logged as a run item. Afterwards the run is finalized, the rule's
+     * `last_run_at` / `next_run_at` are updated and an alert is raised when anything failed.
+     *
+     * @param array<string, mixed> $metrics Extra metrics stored on the run; `entry_url_count` is added on completion.
+     */
+    public function runRule(
+        CrawlRule $rule,
+        CrawlTriggerType $triggerType,
+        ?int $createdBy = null,
+        array $metrics = [],
+    ): CrawlRun {
+        $run = CrawlRun::query()->create([
+            'rule_id' => $rule->id,
+            'trigger_type' => $triggerType,
+            'status' => CrawlRunStatus::Running,
+            'started_at' => now(),
+            'metrics' => $metrics,
+            'created_by' => $createdBy,
+        ]);
+
+        $successCount = 0;
+        $failedCount = 0;
+        $skippedCount = 0;
+        $totalUrls = 0;
+        $errors = [];
+
+        $entryUrls = collect($rule->entry_urls)
+            ->filter(static fn ($url): bool => is_string($url) && filter_var($url, FILTER_VALIDATE_URL) !== false)
+            ->values()
+            ->all();
+
+        if ($entryUrls === []) {
+            $errors[] = 'No valid entry urls configured';
+        }
+
+        $maxPages = max(1, (int) $rule->max_pages);
+
+        // Guard the whole crawl loop: list/field extraction and run-item logging are not
+        // covered by the upsert try/catch below, and an exception escaping from here would
+        // leave the run stuck in `Running` with no finish time, schedule update or alert.
+        try {
+            foreach ($entryUrls as $entryUrl) {
+                [$listResult, $listAttempt] = $this->fetchWithRetry($rule, $entryUrl);
+
+                if (! $listResult['ok']) {
+                    $failedCount++;
+                    $errors[] = sprintf('List fetch failed: %s', (string) ($listResult['error'] ?? 'unknown'));
+                    $this->createRunItem($run, [
+                        'url' => $entryUrl,
+                        'stage' => 'list',
+                        'attempt' => $listAttempt,
+                        'status' => CrawlRunItemStatus::Failed,
+                        'latency_ms' => $listResult['latency_ms'] ?? null,
+                        'http_code' => $listResult['http_code'] ?? null,
+                        'error_code' => 'fetch_failed',
+                        'error_message' => (string) ($listResult['error'] ?? 'Fetch failed'),
+                    ]);
+
+                    continue;
+                }
+
+                $this->createRunItem($run, [
+                    'url' => $entryUrl,
+                    'stage' => 'list',
+                    'attempt' => $listAttempt,
+                    'status' => CrawlRunItemStatus::Success,
+                    'latency_ms' => $listResult['latency_ms'] ?? null,
+                    'http_code' => $listResult['http_code'] ?? null,
+                ]);
+
+                $detailUrls = $this->extractor->extractListUrls(
+                    $listResult['body'],
+                    $entryUrl,
+                    is_array($rule->extractor_config) ? $rule->extractor_config : [],
+                );
+
+                // No list links found: treat the entry page itself as the detail page.
+                if ($detailUrls === []) {
+                    $detailUrls = [$entryUrl];
+                }
+
+                foreach ($detailUrls as $detailUrl) {
+                    // The page budget is shared by all entry URLs, so leave both loops.
+                    if ($totalUrls >= $maxPages) {
+                        break 2;
+                    }
+
+                    $totalUrls++;
+                    [$detailResult, $detailAttempt] = $this->fetchWithRetry($rule, $detailUrl);
+
+                    if (! $detailResult['ok']) {
+                        $failedCount++;
+                        $errors[] = sprintf('Detail fetch failed(%s): %s', $detailUrl, (string) ($detailResult['error'] ?? 'unknown'));
+
+                        $this->createRunItem($run, [
+                            'url' => $detailUrl,
+                            'stage' => 'detail',
+                            'attempt' => $detailAttempt,
+                            'status' => CrawlRunItemStatus::Failed,
+                            'latency_ms' => $detailResult['latency_ms'] ?? null,
+                            'http_code' => $detailResult['http_code'] ?? null,
+                            'error_code' => 'fetch_failed',
+                            'error_message' => (string) ($detailResult['error'] ?? 'Fetch failed'),
+                        ]);
+
+                        continue;
+                    }
+
+                    $extracted = $this->extractPayload($rule, $detailResult['body']);
+                    $missing = $this->missingRequiredFields($rule, $extracted);
+
+                    if ($missing !== []) {
+                        $skippedCount++;
+                        $this->createRunItem($run, [
+                            'url' => $detailUrl,
+                            'stage' => 'extract',
+                            'attempt' => $detailAttempt,
+                            'status' => CrawlRunItemStatus::Skipped,
+                            'latency_ms' => $detailResult['latency_ms'] ?? null,
+                            'http_code' => $detailResult['http_code'] ?? null,
+                            'error_code' => 'missing_fields',
+                            'error_message' => 'Missing required fields: '.implode(', ', $missing),
+                            'raw_payload' => ['html_length' => mb_strlen($detailResult['body'])],
+                            'normalized_payload' => $extracted,
+                        ]);
+
+                        continue;
+                    }
+
+                    try {
+                        $upsertResult = $this->upsertService->upsert($rule, $extracted, $detailUrl);
+                        $successCount++;
+
+                        $this->createRunItem($run, [
+                            'url' => $detailUrl,
+                            'stage' => 'upsert',
+                            'attempt' => $detailAttempt,
+                            'status' => CrawlRunItemStatus::Success,
+                            'latency_ms' => $detailResult['latency_ms'] ?? null,
+                            'http_code' => $detailResult['http_code'] ?? null,
+                            'normalized_payload' => $extracted,
+                            'upsert_result' => $upsertResult,
+                        ]);
+                    } catch (\Throwable $exception) {
+                        $failedCount++;
+                        $errors[] = sprintf('Upsert failed(%s): %s', $detailUrl, $exception->getMessage());
+
+                        $this->createRunItem($run, [
+                            'url' => $detailUrl,
+                            'stage' => 'upsert',
+                            'attempt' => $detailAttempt,
+                            'status' => CrawlRunItemStatus::Failed,
+                            'latency_ms' => $detailResult['latency_ms'] ?? null,
+                            'http_code' => $detailResult['http_code'] ?? null,
+                            'error_code' => 'upsert_failed',
+                            'error_message' => $exception->getMessage(),
+                            'normalized_payload' => $extracted,
+                        ]);
+                    }
+                }
+            }
+        } catch (\Throwable $exception) {
+            // The URL being processed when the crawl blew up counts as failed; the run is
+            // still finalized below so it is reported instead of hanging in `Running`.
+            $failedCount++;
+            $errors[] = sprintf('Run aborted: %s', $exception->getMessage());
+        }
+
+        $status = $this->finalizeStatus($successCount, $failedCount, $errors);
+        $run->fill([
+            'status' => $status,
+            'finished_at' => now(),
+            'total_urls' => $totalUrls,
+            'success_count' => $successCount,
+            'failed_count' => $failedCount,
+            'skipped_count' => $skippedCount,
+            'error_summary' => $errors !== [] ? Str::limit(implode(' | ', $errors), 1000) : null,
+            'metrics' => array_merge($metrics, ['entry_url_count' => count($entryUrls)]),
+        ]);
+        $run->save();
+
+        $rule->last_run_at = now();
+        $rule->next_run_at = $this->scheduleService->nextRunAt($rule);
+        $rule->save();
+
+        if ($failedCount > 0 || $errors !== []) {
+            $this->alertService->notify(
+                $failedCount > 0 ? CrawlAlertSeverity::Error : CrawlAlertSeverity::Warning,
+                'run_failed_or_partial',
+                sprintf('规则[%s]执行完成,成功%d,失败%d,跳过%d', $rule->name, $successCount, $failedCount, $skippedCount),
+                $rule,
+                $run,
+                [
+                    // Keep the alert context small; the full list lives in the run items.
+                    'errors' => array_slice($errors, 0, 10),
+                ],
+            );
+        }
+
+        return $run->refresh();
+    }
+ + /** + * @return array{0: array{ok: bool, http_code: int|null, body: string, error: string|null, latency_ms: int}, 1: int} + */ + private function fetchWithRetry(CrawlRule $rule, string $url): array + { + $maxAttempts = max(1, (int) $rule->retry_max); + $backoff = max(1, (int) $rule->retry_backoff_seconds); + + $lastResult = [ + 'ok' => false, + 'http_code' => null, + 'body' => '', + 'error' => 'not_started', + 'latency_ms' => 0, + ]; + + for ($attempt = 1; $attempt <= $maxAttempts; $attempt++) { + $lastResult = $this->fetcher->fetch($rule, $url); + if ($lastResult['ok']) { + return [$lastResult, $attempt]; + } + + if ($attempt < $maxAttempts) { + sleep(min($backoff * $attempt, 15)); + } + } + + return [$lastResult, $maxAttempts]; + } + + /** + * @return array + */ + private function extractPayload(CrawlRule $rule, string $html): array + { + $extractorConfig =
is_array($rule->extractor_config) ? $rule->extractor_config : []; + $mode = strtolower((string) ($extractorConfig['mode'] ?? 'xpath')); + if (! in_array($mode, ['xpath', 'ai', 'hybrid'], true)) { + $mode = 'xpath'; + } + + $aiOptions = is_array($extractorConfig['ai'] ?? null) ? $extractorConfig['ai'] : []; + + $xpathPayload = []; + $aiPayload = []; + + if ($mode !== 'ai') { + $xpathPayload = $this->extractor->extractFields($html, $extractorConfig); + } + + $shouldUseAi = $mode === 'ai' || $mode === 'hybrid'; + + if ($mode === 'xpath' && $rule->ai_fallback_enabled) { + $shouldUseAi = $this->missingRequiredFields($rule, $xpathPayload) !== []; + } + + if ($shouldUseAi) { + $aiPayload = $this->aiFallbackExtractor->extract($rule, $html, $aiOptions); + } + + if ($mode === 'ai') { + return $aiPayload; + } + + if ($mode === 'hybrid') { + return array_merge($aiPayload, $xpathPayload); + } + + if ($rule->ai_fallback_enabled && $aiPayload !== []) { + return array_merge($aiPayload, $xpathPayload); + } + + return $xpathPayload; + } + + /** + * @param array $payload + * @return list + */ + private function missingRequiredFields(CrawlRule $rule, array $payload): array + { + $required = $rule->target_module?->value === 'model' + ? ['name', 'summary', 'modality', 'deployment_mode'] + : ['name', 'summary']; + + $missing = []; + foreach ($required as $field) { + $value = Arr::get($payload, $field); + if (! 
is_string($value) || trim($value) === '') { + $missing[] = $field; + } + } + + return $missing; + } + + /** + * @param list $errors + */ + private function finalizeStatus(int $successCount, int $failedCount, array $errors): CrawlRunStatus + { + if ($successCount > 0 && $failedCount === 0 && $errors === []) { + return CrawlRunStatus::Completed; + } + + if ($successCount > 0) { + return CrawlRunStatus::Partial; + } + + return CrawlRunStatus::Failed; + } + + /** + * @param array $attributes + */ + private function createRunItem(CrawlRun $run, array $attributes): CrawlRunItem + { + return $run->items()->create($attributes); + } +} \ No newline at end of file diff --git a/app/Services/Crawler/CrawlFetcherService.php b/app/Services/Crawler/CrawlFetcherService.php new file mode 100644 index 0000000..eb139a4 --- /dev/null +++ b/app/Services/Crawler/CrawlFetcherService.php @@ -0,0 +1,103 @@ +render_js && is_string(config('crawler.browserless_endpoint')) && config('crawler.browserless_endpoint') !== '') { + $response = $this->browserlessRequest($rule)->post((string) config('crawler.browserless_endpoint'), [ + 'url' => $url, + 'waitUntil' => 'networkidle2', + ]); + } else { + $response = $this->httpRequest($rule)->get($url); + } + + return [ + 'ok' => $response->successful(), + 'http_code' => $response->status(), + 'body' => $response->body(), + 'error' => $response->successful() ? null : sprintf('HTTP %d', $response->status()), + 'latency_ms' => (int) ((microtime(true) - $startedAt) * 1000), + ]; + } catch (\Throwable $exception) { + return [ + 'ok' => false, + 'http_code' => null, + 'body' => '', + 'error' => $exception->getMessage(), + 'latency_ms' => (int) ((microtime(true) - $startedAt) * 1000), + ]; + } + } + + private function httpRequest(CrawlRule $rule): PendingRequest + { + $headers = is_array($rule->headers) ? $rule->headers : []; + $cookies = is_array($rule->cookies) ? 
$rule->cookies : []; + $timeout = max((int) config('crawler.request_timeout_seconds', 20), 5); + + $request = Http::timeout($timeout) + ->withHeaders($headers) + ->withUserAgent((string) ($rule->user_agent ?: config('crawler.default_user_agent'))); + + if (! (bool) config('crawler.verify_ssl', true)) { + $request = $request->withoutVerifying(); + } + + $request = $this->applyNetworkOptions($request); + + if ($cookies !== []) { + $request = $request->withCookies($cookies, parse_url((string) ($rule->entry_urls[0] ?? ''), PHP_URL_HOST) ?: ''); + } + + return $request; + } + + private function applyNetworkOptions(PendingRequest $request): PendingRequest + { + $options = []; + + if ((bool) config('crawler.force_ipv4', false)) { + $options['force_ip_resolve'] = 'v4'; + } + + $dnsServers = trim((string) config('crawler.dns_servers', '')); + if ($dnsServers !== '' && defined('CURLOPT_DNS_SERVERS')) { + $options['curl'][CURLOPT_DNS_SERVERS] = $dnsServers; + } + + if ($options === []) { + return $request; + } + + return $request->withOptions($options); + } + + private function browserlessRequest(CrawlRule $rule): PendingRequest + { + $request = $this->httpRequest($rule); + $token = (string) config('crawler.browserless_token', ''); + + if ($token !== '') { + $request = $request->withToken($token); + } + + return $request; + } +} + diff --git a/app/Services/Crawler/CrawlRuleScheduleService.php b/app/Services/Crawler/CrawlRuleScheduleService.php new file mode 100644 index 0000000..c1332ee --- /dev/null +++ b/app/Services/Crawler/CrawlRuleScheduleService.php @@ -0,0 +1,44 @@ +enabled) { + return false; + } + + $now ??= CarbonImmutable::now($rule->timezone ?: 'Asia/Shanghai'); + + try { + $cron = new CronExpression($rule->cron_expression); + } catch (\Throwable) { + return false; + } + + return $cron->isDue($now); + } + + public function nextRunAt(CrawlRule $rule, ?CarbonImmutable $from = null): ?CarbonImmutable + { + $from ??= CarbonImmutable::now($rule->timezone ?: 
'Asia/Shanghai'); + + try { + $cron = new CronExpression($rule->cron_expression); + $next = CarbonImmutable::instance($cron->getNextRunDate($from)); + } catch (\Throwable) { + return null; + } + + return $next->setTimezone('UTC'); + } +} + diff --git a/app/Services/Crawler/OpenAiFallbackExtractor.php b/app/Services/Crawler/OpenAiFallbackExtractor.php new file mode 100644 index 0000000..c7871fb --- /dev/null +++ b/app/Services/Crawler/OpenAiFallbackExtractor.php @@ -0,0 +1,494 @@ + $options + * @return array + */ + public function extract(CrawlRule $rule, string $html, array $options = []): array + { + $this->lastError = null; + + $credentials = $this->resolveCredentials(); + if ($credentials === null) { + $this->lastError = 'AI credentials not configured.'; + return []; + } + + $ruleAiOptions = is_array($rule->extractor_config['ai'] ?? null) + ? $rule->extractor_config['ai'] + : []; + $mergedOptions = array_merge($ruleAiOptions, $options); + + $model = $this->resolveModel($rule->ai_model, $mergedOptions); + $temperature = $this->resolveTemperature($mergedOptions); + $contentMaxChars = $this->resolveContentMaxChars($mergedOptions, 12000); + + $targetSchema = $rule->target_module?->value === 'tool' + ? 
'name, summary, official_url, pricing_type, platform, language, description, logo_url' + : 'name, summary, provider, modality, deployment_mode, context_window, price_input, price_output, description'; + + $defaultUserPrompt = <<resolveSystemPrompt($mergedOptions, '你是一个精确的信息抽取引擎。'); + $userPrompt = $this->resolveUserPrompt($mergedOptions, $defaultUserPrompt); + + $content = $this->requestAiContent( + credentials: $credentials, + model: $model, + temperature: $temperature, + systemPrompt: $systemPrompt, + userPrompt: $userPrompt, + html: $html, + contentMaxChars: $contentMaxChars, + stripTags: true, + ); + + return $this->decodeJsonContent($content); + } + + public function isConfigured(): bool + { + return $this->resolveCredentials() !== null; + } + + public function lastError(): ?string + { + return $this->lastError; + } + + /** + * @param array $options + * @return array + */ + public function suggestExtractorConfig(string $targetModule, string $html, array $options = []): array + { + $this->lastError = null; + + $credentials = $this->resolveCredentials(); + if ($credentials === null) { + $this->lastError = 'AI credentials not configured.'; + return []; + } + + $targetModule = in_array($targetModule, ['tool', 'model'], true) ? $targetModule : 'tool'; + $fields = $targetModule === 'tool' + ? ['name', 'summary', 'official_url', 'pricing_type', 'platform', 'language', 'description', 'logo_url'] + : ['name', 'summary', 'provider', 'modality', 'deployment_mode', 'context_window', 'price_input', 'price_output', 'description']; + + $defaultUserPrompt = <<implodeFields($fields)}。 +PROMPT; + + $model = $this->resolveModel((string) ($options['model'] ?? 
null), $options); + $temperature = $this->resolveTemperature($options); + $contentMaxChars = $this->resolveContentMaxChars($options, 16000); + + $systemPrompt = $this->resolveSystemPrompt($options, '你是 XPath 规则设计专家,擅长从 HTML 生成稳定的抽取规则。'); + $userPrompt = $this->resolveUserPrompt($options, $defaultUserPrompt); + + $content = $this->requestAiContent( + credentials: $credentials, + model: $model, + temperature: $temperature, + systemPrompt: $systemPrompt, + userPrompt: $userPrompt, + html: $html, + contentMaxChars: $contentMaxChars, + stripTags: false, + ); + + $decoded = $this->decodeJsonContent($content); + if (! is_array($decoded)) { + $this->lastError = $this->lastError ?: 'AI response is not valid JSON.'; + return []; + } + + $fieldsConfig = is_array($decoded['fields'] ?? null) ? $decoded['fields'] : []; + if ($fieldsConfig === []) { + $this->lastError = $this->lastError ?: 'AI response does not include fields config.'; + return []; + } + + return [ + 'list_link_xpath' => is_string($decoded['list_link_xpath'] ?? null) ? 
$decoded['list_link_xpath'] : '', + 'fields' => $fieldsConfig, + ]; + } + + /** + * @return array{endpoint: string, api_key: string, wire_api: string, disable_response_storage: bool, reasoning_effort: string}|null + */ + private function resolveCredentials(): ?array + { + $apiKey = (string) config('crawler.openai_compatible_key', ''); + $endpoint = $this->resolveEndpoint(); + + if ($endpoint === '' || $apiKey === '') { + return null; + } + + return [ + 'endpoint' => $endpoint, + 'api_key' => $apiKey, + 'wire_api' => $this->resolveWireApi(), + 'disable_response_storage' => (bool) config('crawler.openai_disable_response_storage', false), + 'reasoning_effort' => trim((string) config('crawler.openai_reasoning_effort', '')), + ]; + }
+
+    /**
+     * Work out the full URL the AI request is posted to.
+     *
+     * An explicit `crawler.openai_compatible_endpoint` wins and is used verbatim. Otherwise
+     * the URL is built from `crawler.openai_compatible_base_url` plus the path that matches
+     * the configured wire API. Returns '' when neither setting is present.
+     */
+    private function resolveEndpoint(): string
+    {
+        $configuredEndpoint = trim((string) config('crawler.openai_compatible_endpoint', ''));
+        if ($configuredEndpoint !== '') {
+            return $configuredEndpoint;
+        }
+
+        $baseUrl = trim((string) config('crawler.openai_compatible_base_url', ''));
+        if ($baseUrl === '') {
+            return '';
+        }
+
+        $baseUrl = rtrim($baseUrl, '/');
+
+        // Base URLs are commonly configured with the version segment already included
+        // (".../v1"); drop it here so the path below does not become ".../v1/v1/...".
+        if (str_ends_with(strtolower($baseUrl), '/v1')) {
+            $baseUrl = substr($baseUrl, 0, -3);
+        }
+
+        return $this->resolveWireApi() === 'responses'
+            ? $baseUrl.'/v1/responses'
+            : $baseUrl.'/v1/chat/completions';
+    }
+ + private function resolveWireApi(): string + { + $wireApi = strtolower(trim((string) config('crawler.openai_wire_api', 'chat_completions'))); + + return $wireApi === 'responses' ? 'responses' : 'chat_completions'; + } + + /** + * @param string|null $ruleModel + * @param array $options + */ + private function resolveModel(?string $ruleModel, array $options): string + { + $model = ''; + + if (is_string($options['model'] ??
null)) { + $model = trim((string) $options['model']); + } + + if ($model === '' && is_string($ruleModel)) { + $model = trim($ruleModel); + } + + if ($model === '') { + $model = (string) config('crawler.openai_default_model', 'gpt-4o-mini'); + } + + return $model; + } + + /** + * @param array $options + */ + private function resolveTemperature(array $options): float + { + $temperature = is_numeric($options['temperature'] ?? null) + ? (float) $options['temperature'] + : 0.0; + + return max(0.0, min(2.0, $temperature)); + } + + /** + * @param array $options + */ + private function resolveContentMaxChars(array $options, int $default): int + { + $value = is_numeric($options['content_max_chars'] ?? null) + ? (int) $options['content_max_chars'] + : $default; + + return max(500, min(50000, $value)); + } + + /** + * @param array $options + */ + private function resolveSystemPrompt(array $options, string $default): string + { + $prompt = is_string($options['system_prompt'] ?? null) + ? trim((string) $options['system_prompt']) + : ''; + + return $prompt === '' ? $default : $prompt; + } + + /** + * @param array $options + */ + private function resolveUserPrompt(array $options, string $default): string + { + $prompt = is_string($options['user_prompt'] ?? null) + ? trim((string) $options['user_prompt']) + : ''; + + return $prompt === '' ? $default : $prompt; + } + + private function implodeFields(array $fields): string + { + return implode(', ', $fields); + } + + /** + * @param array{endpoint: string, api_key: string, wire_api: string, disable_response_storage: bool, reasoning_effort: string} $credentials + */ + private function requestAiContent( + array $credentials, + string $model, + float $temperature, + string $systemPrompt, + string $userPrompt, + string $html, + int $contentMaxChars, + bool $stripTags, + ): string { + $source = $stripTags ? 
strip_tags($html) : $html; + $content = mb_substr($source, 0, $contentMaxChars); + + if ($credentials['wire_api'] === 'responses') { + return $this->requestResponsesApi( + credentials: $credentials, + model: $model, + systemPrompt: $systemPrompt, + userPrompt: $userPrompt, + content: $content, + ); + } + + return $this->requestChatCompletionsApi( + credentials: $credentials, + model: $model, + temperature: $temperature, + systemPrompt: $systemPrompt, + userPrompt: $userPrompt, + content: $content, + ); + }
+
+    /**
+     * Post the prompt to a Chat Completions style endpoint and return the assistant's reply text.
+     *
+     * Never throws: on an HTTP error, a transport exception or a reply without text content
+     * it returns '' and records the reason in `$this->lastError`.
+     *
+     * @param array{endpoint: string, api_key: string, wire_api: string, disable_response_storage: bool, reasoning_effort: string} $credentials
+     */
+    private function requestChatCompletionsApi(
+        array $credentials,
+        string $model,
+        float $temperature,
+        string $systemPrompt,
+        string $userPrompt,
+        string $content,
+    ): string {
+        try {
+            $payload = [
+                'model' => $model,
+                'temperature' => $temperature,
+                'messages' => [
+                    ['role' => 'system', 'content' => $systemPrompt],
+                    ['role' => 'user', 'content' => $userPrompt."\n\n页面内容:\n".$content],
+                ],
+            ];
+
+            if ($credentials['disable_response_storage']) {
+                $payload['store'] = false;
+            }
+
+            if ($credentials['reasoning_effort'] !== '') {
+                // Chat Completions takes a flat `reasoning_effort` string. The nested
+                // `reasoning: {effort: ...}` object is the Responses API shape only.
+                $payload['reasoning_effort'] = $credentials['reasoning_effort'];
+            }
+
+            $response = $this->requestBuilder($credentials['api_key'])
+                ->post($credentials['endpoint'], $payload);
+
+            if (! $response->successful()) {
+                $this->lastError = sprintf('AI HTTP %d: %s', $response->status(), mb_substr($response->body(), 0, 240));
+                return '';
+            }
+
+            $text = data_get($response->json(), 'choices.0.message.content', '');
+            if (! is_string($text) || $text === '') {
+                // Mirror requestResponsesApi(): an empty reply is reported, not silently ignored.
+                $this->lastError = 'AI response has no message content.';
+                return '';
+            }
+
+            return $text;
+        } catch (\Throwable $exception) {
+            $this->lastError = $exception->getMessage();
+
+            return '';
+        }
+    }
+ + /** + * @param array{endpoint: string, api_key: string, wire_api: string, disable_response_storage: bool, reasoning_effort: string} $credentials + */ + private function requestResponsesApi( + array $credentials, + string $model, + string $systemPrompt, + string $userPrompt, + string $content, + ): string { + try { + $payload = [ + 'model' => $model, + 'input' => [ + [ + 'role' => 'system', + 'content' => [ + ['type' => 'input_text', 'text' => $systemPrompt], + ], + ], + [ + 'role' => 'user', + 'content' => [ + ['type' => 'input_text', 'text' => $userPrompt."\n\n页面内容:\n".$content], + ], + ], + ], + ]; + + if ($credentials['disable_response_storage']) { + $payload['store'] = false; + } + + if ($credentials['reasoning_effort'] !== '') { + $payload['reasoning'] = ['effort' => $credentials['reasoning_effort']]; + } + + $response = $this->requestBuilder($credentials['api_key']) + ->post($credentials['endpoint'], $payload); + + if (! $response->successful()) { + $this->lastError = sprintf('AI HTTP %d: %s', $response->status(), mb_substr($response->body(), 0, 240)); + return ''; + } + + $json = $response->json(); + $outputText = (string) data_get($json, 'output_text', ''); + if ($outputText !== '') { + return $outputText; + } + + $output = data_get($json, 'output', []); + if (! is_array($output)) { + $this->lastError = 'AI output is empty.'; + return ''; + } + + $chunks = []; + foreach ($output as $item) { + if (! is_array($item)) { + continue; + } + + $contents = $item['content'] ?? []; + if (! is_array($contents)) { + continue; + } + + foreach ($contents as $contentItem) { + if (!
is_array($contentItem)) { + continue; + } + + $text = (string) ($contentItem['text'] ?? ''); + if ($text !== '') { + $chunks[] = $text; + } + } + } + + if ($chunks === []) { + $this->lastError = 'AI output chunks are empty.'; + return ''; + } + + return implode("\n", $chunks); + } catch (\Throwable $exception) { + $this->lastError = $exception->getMessage(); + + return ''; + } + } + + private function requestBuilder(string $apiKey): PendingRequest + { + $aiTimeout = (int) config('crawler.ai_timeout_seconds', (int) config('crawler.request_timeout_seconds', 20)); + $request = Http::timeout(max($aiTimeout, 5)) + ->withToken($apiKey); + + if (! (bool) config('crawler.verify_ssl', true)) { + $request = $request->withoutVerifying(); + } + + $options = []; + if ((bool) config('crawler.force_ipv4', false)) { + $options['force_ip_resolve'] = 'v4'; + } + + $dnsServers = trim((string) config('crawler.dns_servers', '')); + if ($dnsServers !== '' && defined('CURLOPT_DNS_SERVERS')) { + $options['curl'][CURLOPT_DNS_SERVERS] = $dnsServers; + } + + if ($options !== []) { + $request = $request->withOptions($options); + } + + return $request; + } + + /** + * @return array + */ + private function decodeJsonContent(string $content): array + { + if ($content === '') { + return []; + } + + $decoded = json_decode($content, true); + if (is_array($decoded)) { + return $decoded; + } + + if (preg_match('/\{.*\}/s', $content, $matches) === 1) { + $decoded = json_decode($matches[0], true); + + return is_array($decoded) ? $decoded : []; + } + + return []; + } +} diff --git a/app/Services/Crawler/XPathExtractor.php b/app/Services/Crawler/XPathExtractor.php new file mode 100644 index 0000000..bfe94a8 --- /dev/null +++ b/app/Services/Crawler/XPathExtractor.php @@ -0,0 +1,171 @@ + $extractorConfig + * @return list + */ + public function extractListUrls(string $html, string $baseUrl, array $extractorConfig): array + { + $listXPath = (string) ($extractorConfig['list_link_xpath'] ?? 
''); + + if ($listXPath === '') { + return []; + } + + $xpath = $this->buildXPath($html); + + if ($xpath === null) { + return []; + } + + $nodes = $xpath->query($listXPath); + + if ($nodes === false) { + return []; + } + + $urls = []; + foreach ($nodes as $node) { + $value = trim($node->nodeValue ?? ''); + if ($value === '') { + continue; + } + + $absolute = $this->toAbsoluteUrl($value, $baseUrl); + if ($absolute !== null) { + $urls[] = $absolute; + } + } + + return array_values(array_unique($urls)); + } + + /** + * @param array $extractorConfig + * @return array + */ + public function extractFields(string $html, array $extractorConfig): array + { + $fieldRules = $extractorConfig['fields'] ?? []; + + if (! is_array($fieldRules) || $fieldRules === []) { + return []; + } + + $xpath = $this->buildXPath($html); + + if ($xpath === null) { + return []; + } + + $result = []; + + foreach ($fieldRules as $field => $rule) { + if (! is_string($field)) { + continue; + } + + $xpathExpr = ''; + $multiple = false; + + if (is_string($rule)) { + $xpathExpr = $rule; + } elseif (is_array($rule)) { + $xpathExpr = (string) ($rule['xpath'] ?? ''); + $multiple = (bool) ($rule['multiple'] ?? false); + } + + if ($xpathExpr === '') { + continue; + } + + $nodes = $xpath->query($xpathExpr); + + if ($nodes === false || $nodes->length === 0) { + continue; + } + + if ($multiple) { + $values = []; + foreach ($nodes as $node) { + $value = trim($node->nodeValue ?? ''); + if ($value !== '') { + $values[] = $value; + } + } + + if ($values !== []) { + $result[$field] = array_values(array_unique($values)); + } + + continue; + } + + $value = trim($nodes->item(0)?->nodeValue ?? 
''); + if ($value !== '') { + $result[$field] = Str::squish($value); + } + } + + return $result; + } + + private function buildXPath(string $html): ?\DOMXPath + { + $dom = new \DOMDocument('1.0', 'UTF-8'); + + libxml_use_internal_errors(true); + $loaded = $dom->loadHTML(''.$html, LIBXML_NOWARNING | LIBXML_NOERROR); + libxml_clear_errors(); + + if (! $loaded) { + return null; + } + + return new \DOMXPath($dom); + }
+
+    /**
+     * Resolve a link found on a page against the URL of that page.
+     *
+     * Returns null for links that cannot be fetched over HTTP(S): empty values,
+     * fragment-only references ("#top"), other schemes ("mailto:", "javascript:", ...)
+     * and any relative link when the base URL has no scheme or host.
+     */
+    private function toAbsoluteUrl(string $url, string $baseUrl): ?string
+    {
+        $url = trim($url);
+        if ($url === '' || str_starts_with($url, '#')) {
+            return null;
+        }
+
+        // Already absolute: keep http(s) links untouched, drop every other scheme.
+        if (preg_match('/^([a-z][a-z0-9+.\-]*):/i', $url, $schemeMatch) === 1) {
+            return in_array(strtolower($schemeMatch[1]), ['http', 'https'], true) ? $url : null;
+        }
+
+        // Protocol-relative link: borrow the scheme of the page it was found on.
+        if (str_starts_with($url, '//')) {
+            $scheme = parse_url($baseUrl, PHP_URL_SCHEME) ?: 'https';
+
+            return sprintf('%s:%s', $scheme, $url);
+        }
+
+        $baseParts = parse_url($baseUrl);
+        if (! is_array($baseParts) || ! isset($baseParts['scheme'], $baseParts['host'])) {
+            return null;
+        }
+
+        $prefix = sprintf('%s://%s', $baseParts['scheme'], $baseParts['host']);
+        if (isset($baseParts['port'])) {
+            $prefix .= ':'.$baseParts['port'];
+        }
+
+        if (str_starts_with($url, '/')) {
+            return $prefix.$url;
+        }
+
+        $basePath = $baseParts['path'] ?? '/';
+        if ($basePath === '') {
+            $basePath = '/';
+        }
+
+        // Query-only reference: same document, different query string.
+        if (str_starts_with($url, '?')) {
+            return $prefix.$basePath.$url;
+        }
+
+        // The base directory is everything up to and including the last "/". dirname()
+        // is not usable here: it turns "/tools/" into "/" and loses the last directory.
+        $slashPos = strrpos($basePath, '/');
+        $dir = $slashPos === false ? '/' : substr($basePath, 0, $slashPos + 1);
+
+        // Split off "?query" / "#fragment" so dot-segment handling only touches the path.
+        $pathLength = strcspn($url, '?#');
+        $relativePath = substr($url, 0, $pathLength);
+        $tail = substr($url, $pathLength);
+
+        $parts = explode('/', ltrim($dir.$relativePath, '/'));
+        $lastIndex = count($parts) - 1;
+        $segments = [];
+
+        foreach ($parts as $index => $segment) {
+            if ($segment === '.' || $segment === '..') {
+                if ($segment === '..') {
+                    array_pop($segments);
+                }
+
+                // A trailing "." or ".." refers to a directory, so keep the final slash.
+                if ($index === $lastIndex) {
+                    $segments[] = '';
+                }
+
+                continue;
+            }
+
+            $segments[] = $segment;
+        }
+
+        return $prefix.'/'.implode('/', $segments).$tail;
+    }
+} + diff --git a/bootstrap/app.php b/bootstrap/app.php index 16b3ac7..2484c2e 100644 --- a/bootstrap/app.php +++ b/bootstrap/app.php @@ -12,6 +12,9 @@ return Application::configure(basePath: dirname(__DIR__)) commands: __DIR__.'/../routes/console.php', health: '/up', ) + ->withCommands([ + __DIR__.'/../app/Console/Commands', + ]) ->withMiddleware(function (Middleware $middleware): void { $middleware->alias([ 'admin.auth' => AdminAuthenticate::class, diff --git a/config/crawler.php b/config/crawler.php new file mode 100644 index 0000000..2886982 --- /dev/null +++ b/config/crawler.php @@ -0,0 +1,23 @@ + env('CRAWLER_USER_AGENT', 'AIWebCrawler/1.0 (+https://dev.aiweb.com)'), + 'request_timeout_seconds' => (int) env('CRAWLER_REQUEST_TIMEOUT', 20), + 'ai_timeout_seconds' => (int) env('CRAWLER_AI_TIMEOUT', 60), + 'verify_ssl' => env('CRAWLER_VERIFY_SSL', true), + 'dns_servers' => env('CRAWLER_DNS_SERVERS', ''), + 'force_ipv4' => env('CRAWLER_FORCE_IPV4', false), + 'openai_wire_api' => env('CRAWLER_AI_WIRE_API', 'chat_completions'), + 'openai_compatible_base_url' => env('CRAWLER_AI_BASE_URL'), + 'openai_disable_response_storage' => env('CRAWLER_AI_DISABLE_RESPONSE_STORAGE', false), + 'openai_reasoning_effort' => env('CRAWLER_AI_REASONING_EFFORT', ''), + 'browserless_endpoint' => env('CRAWLER_BROWSERLESS_ENDPOINT'), + 'browserless_token' => env('CRAWLER_BROWSERLESS_TOKEN'), + 'openai_compatible_endpoint' => env('CRAWLER_AI_ENDPOINT'), + 'openai_compatible_key' => env('CRAWLER_AI_KEY'), + 'openai_default_model' => env('CRAWLER_AI_MODEL', 'gpt-4o-mini'), + 'default_alert_email' => env('CRAWLER_ALERT_EMAIL'), +]; + diff --git a/database/migrations/2026_02_18_100000_create_crawl_rules_table.php b/database/migrations/2026_02_18_100000_create_crawl_rules_table.php new file mode 100644 index 0000000..3ead351
--- /dev/null +++ b/database/migrations/2026_02_18_100000_create_crawl_rules_table.php @@ -0,0 +1,54 @@ +id(); + $table->string('name', 150); + $table->string('target_module', 32); + $table->boolean('enabled')->default(true); + $table->json('entry_urls'); + $table->string('cron_expression', 64)->default('0 */6 * * *'); + $table->string('timezone', 64)->default('Asia/Shanghai'); + $table->unsignedSmallInteger('max_pages')->default(50); + $table->boolean('render_js')->default(false); + $table->string('user_agent', 255)->nullable(); + $table->json('headers')->nullable(); + $table->json('cookies')->nullable(); + $table->string('proxy', 255)->nullable(); + $table->unsignedSmallInteger('rate_limit_per_minute')->default(30); + $table->unsignedTinyInteger('retry_max')->default(3); + $table->unsignedSmallInteger('retry_backoff_seconds')->default(60); + $table->json('extractor_config')->nullable(); + $table->json('mapping_config')->nullable(); + $table->json('dedupe_config')->nullable(); + $table->boolean('ai_fallback_enabled')->default(false); + $table->string('ai_provider', 64)->nullable(); + $table->string('ai_model', 128)->nullable(); + $table->string('publish_policy', 32)->default('draft'); + $table->string('alert_email', 255)->nullable(); + $table->timestamp('last_run_at')->nullable(); + $table->timestamp('next_run_at')->nullable(); + $table->unsignedBigInteger('created_by')->nullable(); + $table->unsignedBigInteger('updated_by')->nullable(); + $table->timestamps(); + + $table->index(['enabled', 'next_run_at']); + $table->index(['target_module', 'enabled']); + }); + } + + public function down(): void + { + Schema::dropIfExists('crawl_rules'); + } +}; + diff --git a/database/migrations/2026_02_18_100100_create_crawl_runs_table.php b/database/migrations/2026_02_18_100100_create_crawl_runs_table.php new file mode 100644 index 0000000..11cb845 --- /dev/null +++ b/database/migrations/2026_02_18_100100_create_crawl_runs_table.php @@ -0,0 +1,39 @@ +id(); + 
$table->foreignId('rule_id')->constrained('crawl_rules')->cascadeOnDelete(); + $table->string('trigger_type', 32)->default('manual'); + $table->string('status', 32)->default('pending'); + $table->timestamp('started_at')->nullable(); + $table->timestamp('finished_at')->nullable(); + $table->unsignedInteger('total_urls')->default(0); + $table->unsignedInteger('success_count')->default(0); + $table->unsignedInteger('failed_count')->default(0); + $table->unsignedInteger('skipped_count')->default(0); + $table->text('error_summary')->nullable(); + $table->json('metrics')->nullable(); + $table->unsignedBigInteger('created_by')->nullable(); + $table->timestamps(); + + $table->index(['rule_id', 'created_at']); + $table->index(['status', 'created_at']); + }); + } + + public function down(): void + { + Schema::dropIfExists('crawl_runs'); + } +}; + diff --git a/database/migrations/2026_02_18_100200_create_crawl_run_items_table.php b/database/migrations/2026_02_18_100200_create_crawl_run_items_table.php new file mode 100644 index 0000000..181395c --- /dev/null +++ b/database/migrations/2026_02_18_100200_create_crawl_run_items_table.php @@ -0,0 +1,39 @@ +id(); + $table->foreignId('run_id')->constrained('crawl_runs')->cascadeOnDelete(); + $table->string('url', 2048); + $table->string('stage', 32); + $table->unsignedTinyInteger('attempt')->default(1); + $table->string('status', 32)->default('success'); + $table->unsignedInteger('latency_ms')->nullable(); + $table->unsignedSmallInteger('http_code')->nullable(); + $table->string('error_code', 64)->nullable(); + $table->text('error_message')->nullable(); + $table->json('raw_payload')->nullable(); + $table->json('normalized_payload')->nullable(); + $table->json('upsert_result')->nullable(); + $table->timestamps(); + + $table->index(['run_id', 'status']); + $table->index(['run_id', 'stage']); + }); + } + + public function down(): void + { + Schema::dropIfExists('crawl_run_items'); + } +}; + diff --git 
a/database/migrations/2026_02_18_100300_create_crawl_alerts_table.php b/database/migrations/2026_02_18_100300_create_crawl_alerts_table.php new file mode 100644 index 0000000..8ccc1c2 --- /dev/null +++ b/database/migrations/2026_02_18_100300_create_crawl_alerts_table.php @@ -0,0 +1,36 @@ +id(); + $table->foreignId('run_id')->nullable()->constrained('crawl_runs')->nullOnDelete(); + $table->foreignId('rule_id')->nullable()->constrained('crawl_rules')->nullOnDelete(); + $table->string('severity', 32)->default('warning'); + $table->string('type', 64); + $table->string('message', 500); + $table->json('context')->nullable(); + $table->boolean('is_resolved')->default(false); + $table->unsignedBigInteger('resolved_by')->nullable(); + $table->timestamp('resolved_at')->nullable(); + $table->timestamps(); + + $table->index(['is_resolved', 'severity']); + $table->index(['rule_id', 'created_at']); + }); + } + + public function down(): void + { + Schema::dropIfExists('crawl_alerts'); + } +}; + diff --git a/docs/crawler-rule-guide.md b/docs/crawler-rule-guide.md new file mode 100644 index 0000000..815f709 --- /dev/null +++ b/docs/crawler-rule-guide.md @@ -0,0 +1,219 @@ +# 采集规则使用文档(含 AI 工具 Demo) + +本文面向当前项目内置采集器,目标是让你从 0 到 1 跑通一条规则,并把数据入库到站点模块(如 `AI 工具`、`AI 模型`)。 + +## 1. 功能概览 + +当前支持: + +- 后台配置采集规则(入口 URL、定时、抓取参数、Extractor JSON、AI 配置)。 +- 定时执行(Laravel Scheduler)与手动触发。 +- 运行日志、失败明细、告警中心。 +- 目标模块入库:`AI 工具`、`AI 模型`。 +- 三种抽取模式: + - `xpath`:只用 XPath 规则。 + - `ai`:只用 AI 抽取结构化数据。 + - `hybrid`:XPath + AI 合并(XPath 优先)。 +- 页面预览 + 点选元素生成 XPath。 +- AI 一键生成 Extractor 规则(从页面内容推断)。 + +## 2. 
前置准备 + +### 2.1 迁移数据库 + +```bash +php artisan migrate --force +``` + +确保存在以下表: + +- `crawl_rules` +- `crawl_runs` +- `crawl_run_items` +- `crawl_alerts` + +### 2.2 启动队列与调度 + +采集任务通过队列执行,建议至少一个 worker: + +```bash +php artisan queue:work +``` + +系统 cron 每分钟执行一次调度器: + +```cron +* * * * * cd /path/to/ai-web && php artisan schedule:run >> /dev/null 2>&1 +``` + +### 2.3 AI 配置(用于 AI 抽取/AI 规则生成) + +在 `.env` 中配置: + +```env +CRAWLER_AI_ENDPOINT= +CRAWLER_AI_KEY= +CRAWLER_AI_MODEL=gpt-4o-mini +``` + +## 3. 后台入口 + +- 采集规则:`/admin/crawlers` +- 运行记录:`/admin/crawl-runs` +- 告警中心:`/admin/crawl-alerts` + +## 4. AI 工具 Demo(推荐先跑) + +### 4.1 新建规则 + +在 `采集规则` 页面点击“新建采集规则”: + +- 规则名称:`AI工具-Demo` +- 目标模块:`AI 工具` +- 发布策略:`草稿待审核` +- Cron:`0 */6 * * *` +- 时区:`Asia/Shanghai` +- 最大页面数:`30` +- 启用规则:勾选 +- 入口 URL: + +```text +https://your-demo-site.com/ai-tools +``` + +### 4.2 选择抽取模式 + +可按场景选: + +- 页面结构稳定:`xpath` +- 页面结构变化大:`ai` +- 追求稳定 + 覆盖:`hybrid` + +### 4.3 配置 Extractor JSON(XPath 模式/Hybrid 建议) + +可直接用: + +```json +{ + "list_link_xpath": "//a[contains(@class,'tool-link')]/@href", + "fields": { + "name": "//h1/text()", + "summary": "//meta[@name='description']/@content", + "official_url": "//a[contains(@class,'visit-official')]/@href", + "logo_url": "//meta[@property='og:image']/@content", + "pricing_type": "//span[@data-field='pricing']/text()", + "platform": "//span[@data-field='platform']/text()", + "language": "//span[@data-field='language']/text()", + "description": "//article[contains(@class,'tool-content')]//text()" + } +} +``` + +或使用示例文件:`docs/examples/ai-tools-extractor.json` + +### 4.4 用“页面预览 + 选元素”快速生成 XPath + +1. 在表单里输入 `预览 URL`。 +2. 点击“加载预览”。 +3. 在预览 iframe 点击目标元素,页面会显示当前 XPath。 +4. 填写“写入字段”(如 `name` / `summary` / `list_link_xpath`)。 +5. 点击“写入 Extractor JSON”。 + +### 4.5 用 AI 一键生成规则 + +1. 填写(可选)AI 提示词、模型、温度等。 +2. 点击“AI 生成抽取规则并合并到 Extractor JSON”。 +3. 
检查合并后的 JSON 并微调。 + +### 4.6 Mapping / Dedupe(可选) + +`Mapping JSON` 示例:`docs/examples/ai-tools-mapping.json` + +```json +{ + "name": "title", + "summary": "desc", + "official_url": "website" +} +``` + +`Dedupe JSON` 当前可先留空对象:`docs/examples/ai-tools-dedupe.json` + +```json +{} +``` + +## 5. 如何执行 + +### 5.1 后台手动执行 + +在规则列表点击“立即执行”。 + +### 5.2 命令行执行 + +执行指定规则: + +```bash +php artisan crawler:run 规则ID --sync +``` + +按 cron 执行到期规则: + +```bash +php artisan crawler:run +``` + +忽略 cron 执行全部启用规则: + +```bash +php artisan crawler:run --all +``` + +重试某次运行: + +```bash +php artisan crawler:retry-failed 运行ID +``` + +## 6. 验证结果 + +1. 打开 `/admin/crawl-runs` 查看该次运行状态。 +2. 进入运行详情看 `list/detail/extract/upsert` 各阶段结果。 +3. 到目标模块(如 AI 工具)确认有新数据,状态应为 `draft`。 + +## 7. 常见问题 + +### 7.1 `Table '...crawl_rules' doesn't exist` + +未执行迁移: + +```bash +php artisan migrate --force +``` + +### 7.2 保存时报 `validation.json` 或 JSON 格式错误 + +检查以下字段是否是合法 JSON: + +- `Extractor JSON` +- `Mapping JSON` +- `Dedupe JSON` +- `Headers JSON` +- `Cookies JSON` + +### 7.3 运行成功但没入库 + +通常是缺少必填字段: + +- AI 工具至少需 `name`、`summary` +- AI 模型至少需 `name`、`summary`、`modality`、`deployment_mode` + +去运行详情看 `extract` 阶段的 `Missing required fields`。 + +### 7.4 预览或 AI 规则生成失败 + +常见原因: + +- URL 不可访问 +- URL 命中安全限制(内网/保留地址) +- AI 配置缺失(`CRAWLER_AI_ENDPOINT` / `CRAWLER_AI_KEY`) + diff --git a/docs/examples/ai-tools-dedupe.json b/docs/examples/ai-tools-dedupe.json new file mode 100644 index 0000000..9e26dfe --- /dev/null +++ b/docs/examples/ai-tools-dedupe.json @@ -0,0 +1 @@ +{} \ No newline at end of file diff --git a/docs/examples/ai-tools-extractor.json b/docs/examples/ai-tools-extractor.json new file mode 100644 index 0000000..b8ea34b --- /dev/null +++ b/docs/examples/ai-tools-extractor.json @@ -0,0 +1,13 @@ +{ + "list_link_xpath": "//a[contains(@class,'tool-link')]/@href", + "fields": { + "name": "//h1/text()", + "summary": "//meta[@name='description']/@content", + "official_url": "//a[contains(@class,'visit-official')]/@href", + "logo_url": 
"//meta[@property='og:image']/@content", + "pricing_type": "//span[@data-field='pricing']/text()", + "platform": "//span[@data-field='platform']/text()", + "language": "//span[@data-field='language']/text()", + "description": "//article[contains(@class,'tool-content')]//text()" + } +} \ No newline at end of file diff --git a/docs/examples/ai-tools-mapping.json b/docs/examples/ai-tools-mapping.json new file mode 100644 index 0000000..30cf69f --- /dev/null +++ b/docs/examples/ai-tools-mapping.json @@ -0,0 +1,5 @@ +{ + "name": "title", + "summary": "desc", + "official_url": "website" +} \ No newline at end of file diff --git a/resources/views/admin/crawl-alerts/index.blade.php b/resources/views/admin/crawl-alerts/index.blade.php new file mode 100644 index 0000000..f5ed2d3 --- /dev/null +++ b/resources/views/admin/crawl-alerts/index.blade.php @@ -0,0 +1,73 @@ +@extends('layouts.admin') + +@section('title', '采集告警中心') + +@section('head') + @include('admin.partials.modern-index-head') +@endsection + +@section('content') +
+
+
+ + +
+
+
+ +
+
+ + + + + + + + + + + + + + + @forelse($items as $item) + + + + + + + + + + + @empty + + + + @endforelse + +
ID等级规则运行类型信息状态操作
#{{ $item->id }}{{ $item->severity?->value ?? '-' }}{{ $item->rule?->name ?? '-' }} + @if($item->run) + #{{ $item->run_id }} + @else + - + @endif + {{ $item->type }}{{ $item->message }}{{ $item->is_resolved ? '已处理' : '未处理' }} + @if(! $item->is_resolved) +
+ @csrf + +
+ @endif +
暂无告警
+
+ +
+@endsection diff --git a/resources/views/admin/crawl-runs/index.blade.php b/resources/views/admin/crawl-runs/index.blade.php new file mode 100644 index 0000000..3fe8280 --- /dev/null +++ b/resources/views/admin/crawl-runs/index.blade.php @@ -0,0 +1,60 @@ +@extends('layouts.admin') + +@section('title', '采集运行记录') + +@section('head') + @include('admin.partials.modern-index-head') +@endsection + +@section('content') +
+
+
+ + +
+
+
+ +
+
+ + + + + + + + + + + + + + @forelse($items as $item) + + + + + + + + + + @empty + + + + @endforelse + +
ID规则触发方式状态统计时间操作
#{{ $item->id }}{{ $item->rule?->name ?? '-' }}{{ $item->trigger_type?->value ?? '-' }}{{ $item->status?->value ?? '-' }}成功 {{ $item->success_count }} / 失败 {{ $item->failed_count }} / 跳过 {{ $item->skipped_count }}{{ $item->created_at?->format('Y-m-d H:i:s') }} + 详情 +
+ @csrf + +
+
暂无运行记录
+
+ +
+@endsection diff --git a/resources/views/admin/crawl-runs/show.blade.php b/resources/views/admin/crawl-runs/show.blade.php new file mode 100644 index 0000000..c2f510f --- /dev/null +++ b/resources/views/admin/crawl-runs/show.blade.php @@ -0,0 +1,101 @@ +@extends('layouts.admin') + +@section('title', '运行详情 #'.$run->id) + +@section('head') + @include('admin.partials.modern-index-head') +@endsection + +@section('page_actions') + 返回列表 +@endsection + +@section('content') +
+
+
+
规则:{{ $run->rule?->name ?? '-' }}
+
触发方式:{{ $run->trigger_type?->value ?? '-' }}
+
状态:{{ $run->status?->value ?? '-' }}
+
创建时间:{{ $run->created_at?->format('Y-m-d H:i:s') }}
+
总URL:{{ $run->total_urls }}
+
成功:{{ $run->success_count }}
+
失败:{{ $run->failed_count }}
+
跳过:{{ $run->skipped_count }}
+
+ @if($run->error_summary) +
{{ $run->error_summary }}
+ @endif +
+
+ +
+
运行明细
+
+ + + + + + + + + + + + + + @forelse($run->items as $item) + + + + + + + + + + @empty + + + + @endforelse + +
IDURL阶段状态HTTP耗时(ms)错误
#{{ $item->id }}{{ $item->url }}{{ $item->stage }}{{ $item->status?->value ?? '-' }}{{ $item->http_code ?? '-' }}{{ $item->latency_ms ?? '-' }}{{ $item->error_message ?? '-' }}
无明细数据
+
+
+ +
+
关联告警
+
+ + + + + + + + + + + + + @forelse($run->alerts as $alert) + + + + + + + + + @empty + + + + @endforelse + +
ID等级类型信息状态时间
#{{ $alert->id }}{{ $alert->severity?->value ?? '-' }}{{ $alert->type }}{{ $alert->message }}{{ $alert->is_resolved ? '已处理' : '未处理' }}{{ $alert->created_at?->format('Y-m-d H:i:s') }}
无告警
+
+
+@endsection diff --git a/resources/views/admin/crawlers/form.blade.php b/resources/views/admin/crawlers/form.blade.php new file mode 100644 index 0000000..7764783 --- /dev/null +++ b/resources/views/admin/crawlers/form.blade.php @@ -0,0 +1,482 @@ +@extends('layouts.admin') + +@section('title', $item->exists ? '编辑采集规则' : '新建采集规则') + +@section('head') + @include('admin.partials.modern-form-head') +@endsection + +@section('scripts') + +@endsection + +@section('content') +
+
+

{{ $item->exists ? '编辑采集规则' : '新建采集规则' }}

+ 返回列表 +
+
+
+ @csrf + @if($method !== 'POST') @method($method) @endif + + @php + $entryUrls = old('entry_urls', is_array($item->entry_urls) ? implode("\n", $item->entry_urls) : ''); + $headersJson = old('headers_json', json_encode($item->headers ?? [], JSON_UNESCAPED_UNICODE | JSON_PRETTY_PRINT)); + $cookiesJson = old('cookies_json', json_encode($item->cookies ?? [], JSON_UNESCAPED_UNICODE | JSON_PRETTY_PRINT)); + $extractorConfig = is_array($item->extractor_config) ? $item->extractor_config : []; + $extractorJson = old('extractor_json', json_encode($extractorConfig, JSON_UNESCAPED_UNICODE | JSON_PRETTY_PRINT)); + $mappingJson = old('mapping_json', json_encode($item->mapping_config ?? [], JSON_UNESCAPED_UNICODE | JSON_PRETTY_PRINT)); + $dedupeJson = old('dedupe_json', json_encode($item->dedupe_config ?? [], JSON_UNESCAPED_UNICODE | JSON_PRETTY_PRINT)); + $extractorAi = is_array($extractorConfig['ai'] ?? null) ? $extractorConfig['ai'] : []; + $mode = old('extractor_mode', $extractorConfig['mode'] ?? 'xpath'); + @endphp + +
+
+

基础配置

+
+
+ + +
+
+ + +
+
+ + +
+
+ + +
+
+ + +
+
+ + +
+
+ +
+ enabled))> + +
+
+
+ + +
+
+
+
+ +
+
+

抓取与 AI 配置

+
+
+ + +
+
+ + +
+
+ + +
+
+ + +
+
+ + +
+
+ + +
+
+ + +
+
+ + +
+
+ + +
+
+ + +
+
+ + +
+
+ + +
+
+ + +
+
+ +
+ ai_fallback_enabled))> + +
+
+
+ + +
+
+ + +
+
+
+
+ +
+
+

Extractor / Mapping / 预览选元素

+
+
+ + +
+
+ + + + +
+
+ +
+
+ +
+
+ +
未加载预览
+
+
+ +
未选择
+
+
+ + +
+
+ + +
+
+ +
+
+
+
+ +
+ 建议流程:加载预览 -> 点选元素写 XPath -> AI 补全规则 -> 保存。 + +
+
+
+
+@endsection diff --git a/resources/views/admin/crawlers/index.blade.php b/resources/views/admin/crawlers/index.blade.php new file mode 100644 index 0000000..9fb5177 --- /dev/null +++ b/resources/views/admin/crawlers/index.blade.php @@ -0,0 +1,74 @@ +@extends('layouts.admin') + +@section('title', '采集规则') + +@section('head') + @include('admin.partials.modern-index-head') +@endsection + +@section('page_actions') + 新建规则 +@endsection + +@section('content') +
+
+
+ + +
+
+ +
+ +
+
+ + + + + + + + + + + + + @forelse($items as $item) + + + + + + + + + @empty + + + + @endforelse + +
规则目标模块Cron状态最近运行操作
+
{{ $item->name }}
+
运行次数:{{ $item->runs_count }} / 下次:{{ $item->next_run_at?->format('Y-m-d H:i') ?? '-' }}
+
{{ $item->target_module?->label() ?? '-' }}{{ $item->cron_expression }} + @if($item->enabled) + 启用 + @else + 停用 + @endif + {{ $item->last_run_at?->format('Y-m-d H:i') ?? '-' }} +
+ @csrf + +
+ 编辑 +
暂无采集规则
+
+ +
+@endsection diff --git a/resources/views/admin/partials/admin-page-header.blade.php b/resources/views/admin/partials/admin-page-header.blade.php index 0e50d20..c46dc37 100644 --- a/resources/views/admin/partials/admin-page-header.blade.php +++ b/resources/views/admin/partials/admin-page-header.blade.php @@ -9,17 +9,21 @@ 'tools' => ['label' => 'AI 工具', 'index' => 'admin.tools.index', 'subtitle' => '维护工具信息、状态与展示内容。'], 'models' => ['label' => 'AI 模型', 'index' => 'admin.models.index', 'subtitle' => '管理模型参数、评分与发布状态。'], 'articles' => ['label' => 'AI 资讯', 'index' => 'admin.articles.index', 'subtitle' => '维护资讯内容、来源与发布质量。'], - 'guides' => ['label' => 'AI 教程', 'index' => 'admin.guides.index', 'subtitle' => '维护教程内容与学习难度分层。'], + 'guides' => ['label' => 'AI 教程', 'index' => 'admin.guides.index', 'subtitle' => '维护教程内容与学习难度层级。'], 'categories' => ['label' => '分类管理', 'index' => 'admin.categories.index', 'subtitle' => '统一管理分类体系与启用状态。'], - 'sources' => ['label' => '来源管理', 'index' => 'admin.sources.index', 'subtitle' => '维护可信来源白名单与抓取策略。'], - 'settings' => ['label' => '首页配置', 'index' => 'admin.settings.index', 'subtitle' => '配置首页模块、条目与展示顺序。'], - 'feedback' => ['label' => '反馈管理', 'index' => 'admin.feedback.index', 'subtitle' => '跟进用户反馈并及时更新处理状态。'], + 'sources' => ['label' => '来源管理', 'index' => 'admin.sources.index', 'subtitle' => '维护可用来源及可信度配置。'], + 'settings' => ['label' => '首页配置', 'index' => 'admin.settings.index', 'subtitle' => '配置首页模块、条目和展示顺序。'], + 'feedback' => ['label' => '反馈管理', 'index' => 'admin.feedback.index', 'subtitle' => '跟进用户反馈并更新处理状态。'], + 'crawlers' => ['label' => '采集规则', 'index' => 'admin.crawlers.index', 'subtitle' => '维护采集目标、字段映射与调度策略。'], + 'crawl-runs' => ['label' => '采集运行', 'index' => 'admin.crawl-runs.index', 'subtitle' => '查看每次采集执行结果、失败原因和重试。'], + 'crawl-alerts' => ['label' => '采集告警', 'index' => 'admin.crawl-alerts.index', 'subtitle' => '集中处理采集异常并追踪恢复情况。'], ][$moduleKey] ?? 
['label' => '管理后台', 'index' => 'admin.dashboard', 'subtitle' => '维护站点内容与配置。']; $actionLabel = [ 'index' => '列表', 'create' => '新建', 'edit' => '编辑', + 'show' => '详情', ][$actionKey] ?? '详情'; $defaultTitle = $moduleMeta['label']; @@ -30,7 +34,7 @@ if ($pageSubtitle === '') { $pageSubtitle = $actionKey === 'index' ? $moduleMeta['subtitle'] - : '当前为'.$actionLabel.'页面,请按提示完成必填信息并保存。'; + : '当前为'.$actionLabel.'页面,请按提示完善信息后保存。'; } @endphp diff --git a/resources/views/layouts/admin.blade.php b/resources/views/layouts/admin.blade.php index ef0a388..dbed6e3 100644 --- a/resources/views/layouts/admin.blade.php +++ b/resources/views/layouts/admin.blade.php @@ -1,4 +1,4 @@ - + @@ -111,6 +111,10 @@ + + + + @@ -329,4 +333,3 @@ @yield('scripts') - diff --git a/routes/console.php b/routes/console.php index 3c9adf1..ece3f41 100644 --- a/routes/console.php +++ b/routes/console.php @@ -2,7 +2,10 @@ use Illuminate\Foundation\Inspiring; use Illuminate\Support\Facades\Artisan; +use Illuminate\Support\Facades\Schedule; Artisan::command('inspire', function () { $this->comment(Inspiring::quote()); })->purpose('Display an inspiring quote'); + +Schedule::command('crawler:run')->everyMinute()->withoutOverlapping(); diff --git a/routes/web.php b/routes/web.php index a18aadb..8fedcc4 100644 --- a/routes/web.php +++ b/routes/web.php @@ -9,6 +9,9 @@ use App\Http\Controllers\Admin\DashboardController; use App\Http\Controllers\Admin\FeedbackController as AdminFeedbackController; use App\Http\Controllers\Admin\GuideController as AdminGuideController; use App\Http\Controllers\Admin\CategoryController as AdminCategoryController; +use App\Http\Controllers\Admin\CrawlAlertController as AdminCrawlAlertController; +use App\Http\Controllers\Admin\CrawlerRuleController as AdminCrawlerRuleController; +use App\Http\Controllers\Admin\CrawlRunController as AdminCrawlRunController; use App\Http\Controllers\Admin\SiteSettingController as AdminSiteSettingController; use App\Http\Controllers\Admin\UploadController as 
AdminUploadController; use App\Http\Controllers\Admin\SourceController as AdminSourceController; @@ -111,5 +114,21 @@ Route::prefix('admin')->name('admin.')->group(function (): void { Route::get('/feedback', [AdminFeedbackController::class, 'index'])->name('feedback.index'); Route::put('/feedback/{feedback}', [AdminFeedbackController::class, 'updateStatus'])->name('feedback.status'); + + Route::get('/crawlers', [AdminCrawlerRuleController::class, 'index'])->name('crawlers.index'); + Route::get('/crawlers/create', [AdminCrawlerRuleController::class, 'create'])->name('crawlers.create'); + Route::post('/crawlers', [AdminCrawlerRuleController::class, 'store'])->name('crawlers.store'); + Route::post('/crawlers/preview', [AdminCrawlerRuleController::class, 'preview'])->name('crawlers.preview'); + Route::post('/crawlers/ai-suggest-extractor', [AdminCrawlerRuleController::class, 'aiSuggestExtractor'])->name('crawlers.ai-suggest-extractor'); + Route::get('/crawlers/{crawler}/edit', [AdminCrawlerRuleController::class, 'edit'])->name('crawlers.edit'); + Route::put('/crawlers/{crawler}', [AdminCrawlerRuleController::class, 'update'])->name('crawlers.update'); + Route::post('/crawlers/{crawler}/run', [AdminCrawlerRuleController::class, 'run'])->name('crawlers.run'); + + Route::get('/crawl-runs', [AdminCrawlRunController::class, 'index'])->name('crawl-runs.index'); + Route::get('/crawl-runs/{run}', [AdminCrawlRunController::class, 'show'])->name('crawl-runs.show'); + Route::post('/crawl-runs/{run}/retry', [AdminCrawlRunController::class, 'retry'])->name('crawl-runs.retry'); + + Route::get('/crawl-alerts', [AdminCrawlAlertController::class, 'index'])->name('crawl-alerts.index'); + Route::post('/crawl-alerts/{alert}/resolve', [AdminCrawlAlertController::class, 'resolve'])->name('crawl-alerts.resolve'); }); }); diff --git a/tests/Feature/CrawlerCommandTest.php b/tests/Feature/CrawlerCommandTest.php new file mode 100644 index 0000000..3a87901 --- /dev/null +++ 
b/tests/Feature/CrawlerCommandTest.php @@ -0,0 +1,62 @@ +AB

Alpha

'; + + $extractor = new XPathExtractor(); + + $urls = $extractor->extractListUrls($html, 'https://example.com/list', [ + 'list_link_xpath' => '//a/@href', + ]); + + $this->assertSame([ + 'https://example.com/tools/a', + 'https://example.com/tools/b', + ], $urls); + + $fields = $extractor->extractFields($html, [ + 'fields' => [ + 'name' => '//h1/text()', + 'summary' => '//meta[@name="description"]/@content', + ], + ]); + + $this->assertSame('Alpha', $fields['name']); + $this->assertSame('Alpha summary', $fields['summary']); + } + + public function test_schedule_service_can_compute_due_and_next_time(): void + { + $rule = new CrawlRule([ + 'name' => 'test', + 'target_module' => CrawlTargetModule::Tool, + 'enabled' => true, + 'cron_expression' => '*/5 * * * *', + 'timezone' => 'Asia/Shanghai', + ]); + + $service = new CrawlRuleScheduleService(); + $now = CarbonImmutable::parse('2026-02-18 10:10:00', 'Asia/Shanghai'); + + $this->assertTrue($service->isDue($rule, $now)); + + $nextRunAt = $service->nextRunAt($rule, $now); + + $this->assertNotNull($nextRunAt); + $this->assertSame('2026-02-18 02:15:00', $nextRunAt?->format('Y-m-d H:i:s')); + } +} \ No newline at end of file