withCount('runs')
            ->when($request->filled('q'), function ($query) use ($request): void {
                $keyword = '%'.trim((string) $request->string('q')).'%';
                $query->where('name', 'like', $keyword);
            })
            ->latest('updated_at')
            ->paginate(20)
            ->withQueryString();

        return view('admin.crawlers.index', [
            'items' => $items,
            'filters' => $request->only(['q']),
        ]);
    }

    /**
     * Render the rule form pre-filled with the default settings for a new crawl rule.
     */
    public function create(): View
    {
        return view('admin.crawlers.form', [
            'item' => new CrawlRule([
                'enabled' => true,
                'target_module' => CrawlTargetModule::Tool,
                'cron_expression' => '0 */6 * * *',
                'timezone' => 'Asia/Shanghai',
                'max_pages' => 50,
                'rate_limit_per_minute' => 30,
                'retry_max' => 3,
                'retry_backoff_seconds' => 60,
                'extractor_config' => [
                    'mode' => 'xpath',
                    'list_link_xpath' => '//a/@href',
                    'fields' => [
                        'name' => '//h1/text()',
                        'summary' => '//meta[@name="description"]/@content',
                    ],
                    'ai' => [
                        'temperature' => 0,
                        'content_max_chars' => 12000,
                    ],
                ],
                'mapping_config' => [],
                'dedupe_config' => [],
                'publish_policy' => 'draft',
                'ai_provider' => 'openai_compatible',
                'ai_fallback_enabled' => false,
            ]),
            'method' => 'POST',
            'submitRoute' => route('admin.crawlers.store'),
        ]);
    }

    /**
     * Persist a new crawl rule from the normalized request payload, compute its
     * next run time through the schedule service, and redirect to the edit page.
     */
    public function store(CrawlRuleRequest $request): RedirectResponse
    {
        $payload = $request->normalizedPayload();
        $payload['created_by'] = null;
        $payload['updated_by'] = null;

        $item = CrawlRule::query()->create($payload);

        // The schedule service receives the stored model, so the next run time
        // is written with a second save.
        $item->next_run_at = $this->scheduleService->nextRunAt($item);
        $item->save();

        return redirect()->route('admin.crawlers.edit', $item)->with('status', '采集规则已创建。');
    }

    /**
     * Render the rule form for an existing crawl rule.
     */
    public function edit(CrawlRule $crawler): View
    {
        return view('admin.crawlers.form', [
            'item' => $crawler,
            'method' => 'PUT',
            'submitRoute' => route('admin.crawlers.update', $crawler),
        ]);
    }

    /**
     * Apply the normalized request payload to an existing rule, recompute its
     * next run time, save it, and redirect back to the edit page.
     */
    public function update(CrawlRuleRequest $request, CrawlRule $crawler): RedirectResponse
    {
        $payload = $request->normalizedPayload();
        $payload['updated_by'] = null;

        $crawler->fill($payload);
        $crawler->next_run_at = $this->scheduleService->nextRunAt($crawler);
        $crawler->save();

        return redirect()->route('admin.crawlers.edit', $crawler)->with('status', '采集规则已更新。');
    }

    /**
     * Dispatch a manually triggered crawl job for the rule and redirect to the
     * run list filtered by that rule.
     */
    public function run(CrawlRule $crawler): RedirectResponse
    {
        RunCrawlRuleJob::dispatch($crawler->id, CrawlTriggerType::Manual->value);

        return redirect()->route('admin.crawl-runs.index', ['rule_id' => $crawler->id])
            ->with('status', '已提交手动执行任务。');
    }

    /**
     * Fetch a remote page and return its sanitized HTML and title as JSON.
     *
     * Responds with HTTP 422 and `ok => false` when the URL fails the safety
     * check or when the page cannot be fetched.
     */
    public function preview(Request $request): JsonResponse
    {
        $payload = $request->validate([
            'url' => ['required', 'url', 'max:2000'],
            'user_agent' => ['nullable', 'string', 'max:255'],
        ]);

        $url = (string) $payload['url'];
        if (! $this->isSafePreviewUrl($url)) {
            return response()->json([
                'ok' => false,
                'message' => '预览地址不安全,已拒绝请求。',
            ], 422);
        }

        $fetched = $this->fetchHtml($url, $payload['user_agent'] ?? null);
        if (! $fetched['ok']) {
            return response()->json([
                'ok' => false,
                'message' => '页面抓取失败:'.($fetched['error'] ?? 'unknown'),
            ], 422);
        }

        $sanitizedHtml = $this->sanitizePreviewHtml($fetched['body']);

        return response()->json([
            'ok' => true,
            'url' => $url,
            'title' => $this->extractTitle($sanitizedHtml),
            'html' => $sanitizedHtml,
        ]);
    }

    /**
     * Fetch a remote page and ask the AI extractor to propose an extractor config.
     *
     * Responds with HTTP 422 and `ok => false` when the AI extractor is not
     * configured, the URL fails the safety check, the page cannot be fetched,
     * or the extractor returns an empty config.
     */
    public function aiSuggestExtractor(Request $request): JsonResponse
    {
        $payload = $request->validate([
            'url' => ['required', 'url', 'max:2000'],
            'target_module' => ['required', 'in:tool,model'],
            'user_agent' => ['nullable', 'string', 'max:255'],
            'ai_model' => ['nullable', 'string', 'max:128'],
            'ai_system_prompt' => ['nullable', 'string', 'max:4000'],
            'ai_user_prompt' => ['nullable', 'string', 'max:4000'],
            'ai_temperature' => ['nullable', 'numeric', 'between:0,2'],
            'ai_content_max_chars' => ['nullable', 'integer', 'between:500,50000'],
        ]);

        if (! $this->aiExtractor->isConfigured()) {
            return response()->json([
                'ok' => false,
                'message' => 'AI 配置缺失,请先设置 CRAWLER_AI_KEY 与接口地址。',
            ], 422);
        }

        $url = (string) $payload['url'];
        if (! $this->isSafePreviewUrl($url)) {
            return response()->json([
                'ok' => false,
                'message' => '目标 URL 不安全,已拒绝请求。',
            ], 422);
        }

        $fetched = $this->fetchHtml($url, $payload['user_agent'] ?? null);
        if (! $fetched['ok']) {
            return response()->json([
                'ok' => false,
                'message' => '页面抓取失败:'.($fetched['error'] ?? 'unknown'),
            ], 422);
        }

        // Forward only the overrides that were actually supplied; blank strings are skipped.
        $options = [];
        foreach (['ai_model' => 'model', 'ai_system_prompt' => 'system_prompt', 'ai_user_prompt' => 'user_prompt'] as $source => $target) {
            if (is_string($payload[$source] ?? null) && trim((string) $payload[$source]) !== '') {
                $options[$target] = trim((string) $payload[$source]);
            }
        }
        if (isset($payload['ai_temperature']) && $payload['ai_temperature'] !== '') {
            $options['temperature'] = (float) $payload['ai_temperature'];
        }
        if (isset($payload['ai_content_max_chars']) && $payload['ai_content_max_chars'] !== '') {
            $options['content_max_chars'] = (int) $payload['ai_content_max_chars'];
        }

        $extractorConfig = $this->aiExtractor->suggestExtractorConfig(
            (string) $payload['target_module'],
            $this->sanitizePreviewHtml($fetched['body']),
            $options,
        );

        if ($extractorConfig === []) {
            $reason = $this->aiExtractor->lastError();

            return response()->json([
                'ok' => false,
                'message' => $reason !== null && $reason !== ''
                    ? 'AI 生成失败:'.$reason
                    : 'AI 未生成有效规则,请调整页面或提示词后重试。',
            ], 422);
        }

        return response()->json([
            'ok' => true,
            'extractor_config' => $extractorConfig,
        ]);
    }

    /**
     * Download a page with up to three attempts.
     *
     * Server errors (5xx) and thrown exceptions are retried with a growing
     * delay; any other unsuccessful status stops immediately. Every redirect
     * hop is re-checked with isSafePreviewUrl() so that a public URL cannot
     * bounce the request to an internal address; a blocked redirect is not
     * retried.
     *
     * @param string      $url       Address to fetch; callers validate it first.
     * @param string|null $userAgent Optional User-Agent; blank falls back to the configured default.
     *
     * @return array{ok: bool, body: string, error: string|null}
     */
    private function fetchHtml(string $url, ?string $userAgent = null): array
    {
        $ua = is_string($userAgent) && trim($userAgent) !== ''
            ? trim($userAgent)
            : (string) config('crawler.default_user_agent', 'AIWebCrawler/1.0');

        $maxAttempts = 3;
        $lastError = 'unknown';

        // Set by the redirect hook so the catch block can tell a policy rejection
        // (deterministic, pointless to retry) from a transient transport failure.
        $redirectBlocked = false;
        $redirectGuard = [
            'allow_redirects' => [
                'on_redirect' => function ($originalRequest, $redirectResponse, $nextUri) use (&$redirectBlocked): void {
                    if (! $this->isSafePreviewUrl((string) $nextUri)) {
                        $redirectBlocked = true;

                        throw new \RuntimeException('重定向目标地址不安全,已拒绝请求。');
                    }
                },
            ],
        ];

        for ($attempt = 1; $attempt <= $maxAttempts; $attempt++) {
            try {
                // Never allow a timeout below five seconds, whatever the config says.
                $request = Http::timeout(max((int) config('crawler.request_timeout_seconds', 20), 5));
                if (! (bool) config('crawler.verify_ssl', true)) {
                    $request = $request->withoutVerifying();
                }

                $request = $this->applyNetworkOptions($request)->withOptions($redirectGuard);
                $response = $request->withUserAgent($ua)->get($url);

                if ($response->successful()) {
                    return [
                        'ok' => true,
                        'body' => $response->body(),
                        'error' => null,
                    ];
                }

                $lastError = sprintf('HTTP %d', $response->status());
                if ($attempt < $maxAttempts && $response->serverError()) {
                    usleep(250000 * $attempt);

                    continue;
                }

                break;
            } catch (\Throwable $exception) {
                $lastError = $exception->getMessage();
                if ($redirectBlocked) {
                    break;
                }
                if ($attempt < $maxAttempts) {
                    usleep(250000 * $attempt);

                    continue;
                }
            }
        }

        return [
            'ok' => false,
            'body' => '',
            'error' => $lastError,
        ];
    }

    /**
     * Apply the optional network settings from the crawler config to a pending request.
     *
     * Sets IPv4-only resolution when `crawler.force_ipv4` is enabled and custom
     * DNS servers when `crawler.dns_servers` is non-empty and the cURL constant
     * is defined. Returns the request untouched when neither applies.
     */
    private function applyNetworkOptions(\Illuminate\Http\Client\PendingRequest $request): \Illuminate\Http\Client\PendingRequest
    {
        $options = [];

        if ((bool) config('crawler.force_ipv4', false)) {
            $options['force_ip_resolve'] = 'v4';
        }

        $dnsServers = trim((string) config('crawler.dns_servers', ''));
        if ($dnsServers !== '' && defined('CURLOPT_DNS_SERVERS')) {
            $options['curl'][CURLOPT_DNS_SERVERS] = $dnsServers;
        }

        if ($options === []) {
            return $request;
        }

        return $request->withOptions($options);
    }

    /**
     * Decide whether a URL may be fetched by the server on behalf of a user.
     *
     * Accepts only http/https URLs whose host is a public IP literal or a name
     * for which every resolved address is public. The check fails closed: a
     * host that cannot be resolved here is rejected, because the HTTP client
     * could still resolve it (hosts file, shorthand IPv4 such as "127.1").
     *
     * NOTE(review): resolution happens separately from the later HTTP request,
     * so a DNS answer that changes in between is not covered by this check.
     */
    private function isSafePreviewUrl(string $url): bool
    {
        $parts = parse_url($url);
        if (! is_array($parts)) {
            return false;
        }

        $scheme = strtolower((string) ($parts['scheme'] ?? ''));
        $host = strtolower((string) ($parts['host'] ?? ''));

        // Normalise forms that would otherwise slip past the literal checks:
        // a trailing root dot ("localhost.") and bracketed IPv6 ("[::1]").
        $host = rtrim($host, '.');
        if (str_starts_with($host, '[') && str_ends_with($host, ']')) {
            $host = substr($host, 1, -1);
        }

        if (! in_array($scheme, ['http', 'https'], true) || $host === '') {
            return false;
        }

        if ($host === 'localhost' || str_ends_with($host, '.localhost')) {
            return false;
        }

        if (filter_var($host, FILTER_VALIDATE_IP) !== false) {
            return $this->isPublicIp($host);
        }

        // Longer names are not valid DNS names and make the resolver functions emit warnings.
        if (strlen($host) > 253) {
            return false;
        }

        $addresses = [];

        // Warnings from a failed lookup are suppressed on purpose; a failure
        // simply contributes no addresses.
        $records = @dns_get_record($host, DNS_A | DNS_AAAA);
        if (is_array($records)) {
            foreach ($records as $record) {
                $ip = (string) ($record['ip'] ?? $record['ipv6'] ?? '');
                if ($ip !== '') {
                    $addresses[] = $ip;
                }
            }
        }

        // Also ask the system resolver, which sees hosts-file entries and
        // numeric IPv4 shorthands that a plain DNS query does not.
        $systemResolved = gethostbynamel($host);
        if (is_array($systemResolved)) {
            $addresses = array_merge($addresses, $systemResolved);
        }

        if ($addresses === []) {
            return false;
        }

        foreach ($addresses as $ip) {
            if (! $this->isPublicIp($ip)) {
                return false;
            }
        }

        return true;
    }

    /**
     * Tell whether an IP address is valid and outside the private and reserved
     * ranges, as judged by filter_var().
     */
    private function isPublicIp(string $ip): bool
    {
        return filter_var(
            $ip,
            FILTER_VALIDATE_IP,
            FILTER_FLAG_NO_PRIV_RANGE | FILTER_FLAG_NO_RES_RANGE,
        ) !== false;
    }

    /**
     * Strip active content from fetched HTML before it is returned for preview.
     *
     * Removes script, noscript, iframe, object, embed and base elements, every
     * attribute whose name starts with "on", and link-type attributes whose
     * value is a javascript: or vbscript: URL. The serialized result is capped
     * at 300000 characters.
     */
    private function sanitizePreviewHtml(string $html): string
    {
        if (trim($html) === '') {
            return '
空页面';
        }

        $dom = new \DOMDocument('1.0', 'UTF-8');

        // Collect parse errors internally during loading, then put the
        // process-wide libxml setting back so other code is unaffected.
        $previousErrorMode = libxml_use_internal_errors(true);
        // NOTE(review): the empty-string prefix is a no-op; if an encoding hint
        // was intended here, confirm against the original file.
        $dom->loadHTML(''.$html, LIBXML_NOWARNING | LIBXML_NOERROR);
        libxml_clear_errors();
        libxml_use_internal_errors($previousErrorMode);

        $xpath = new \DOMXPath($dom);

        foreach (['//script', '//noscript', '//iframe', '//object', '//embed', '//base'] as $query) {
            $nodes = $xpath->query($query);
            if ($nodes === false) {
                continue;
            }

            // Walk backwards so removals do not disturb the remaining indexes.
            for ($index = $nodes->length - 1; $index >= 0; $index--) {
                $node = $nodes->item($index);
                if ($node !== null && $node->parentNode !== null) {
                    $node->parentNode->removeChild($node);
                }
            }
        }

        $urlAttributes = ['href', 'src', 'action', 'formaction', 'data', 'poster', 'background', 'xlink:href'];

        $allNodes = $xpath->query('//*');
        if ($allNodes !== false) {
            foreach ($allNodes as $node) {
                if (! $node instanceof \DOMElement) {
                    continue;
                }

                // Collect first, remove afterwards: the attribute map must not
                // change while it is being iterated.
                $attributesToRemove = [];
                foreach ($node->attributes as $attribute) {
                    $attributeName = strtolower($attribute->name);

                    if (str_starts_with($attributeName, 'on')) {
                        $attributesToRemove[] = $attribute->name;

                        continue;
                    }

                    if (in_array($attributeName, $urlAttributes, true)) {
                        // Browsers ignore whitespace and control characters inside
                        // the scheme, so drop them before comparing.
                        $compactValue = strtolower((string) preg_replace('/[\x00-\x20]+/', '', $attribute->value));
                        if (str_starts_with($compactValue, 'javascript:') || str_starts_with($compactValue, 'vbscript:')) {
                            $attributesToRemove[] = $attribute->name;
                        }
                    }
                }

                foreach ($attributesToRemove as $attributeName) {
                    $node->removeAttribute($attributeName);
                }
            }
        }

        $output = (string) $dom->saveHTML();

        return mb_substr($output, 0, 300000);
    }

    private function extractTitle(string $html): string
    {
        if (preg_match('/