Files
ai-web/app/Http/Controllers/Admin/CrawlerRuleController.php

414 lines
14 KiB
PHP
Raw Permalink Normal View History

2026-02-18 12:56:36 +08:00
<?php
declare(strict_types=1);
namespace App\Http\Controllers\Admin;
use App\Enums\CrawlTargetModule;
use App\Enums\CrawlTriggerType;
use App\Http\Controllers\Controller;
use App\Http\Requests\Admin\CrawlRuleRequest;
use App\Jobs\RunCrawlRuleJob;
use App\Models\CrawlRule;
use App\Services\Crawler\CrawlRuleScheduleService;
use App\Services\Crawler\OpenAiFallbackExtractor;
use Illuminate\Http\JsonResponse;
use Illuminate\Http\RedirectResponse;
use Illuminate\Http\Request;
use Illuminate\Support\Facades\Http;
use Illuminate\View\View;
class CrawlerRuleController extends Controller
{
/**
 * Receives the collaborators used by the actions of this controller.
 *
 * @param CrawlRuleScheduleService $scheduleService Computes a rule's next run time when it is stored or updated.
 * @param OpenAiFallbackExtractor  $aiExtractor     Produces extractor-config suggestions from fetched HTML.
 */
public function __construct(
    private readonly CrawlRuleScheduleService $scheduleService,
    private readonly OpenAiFallbackExtractor $aiExtractor,
) {}
/**
 * Lists crawl rules, most recently updated first, 20 per page.
 *
 * Each rule carries a count of its `runs` relation. When the request has a
 * non-empty `q` value, only rules whose name contains that keyword (SQL
 * LIKE match) are listed. The current query string is kept on the
 * pagination links.
 */
public function index(Request $request): View
{
    $rules = CrawlRule::query()->withCount('runs');

    if ($request->filled('q')) {
        $needle = trim((string) $request->string('q'));
        $rules->where('name', 'like', '%'.$needle.'%');
    }

    $page = $rules
        ->latest('updated_at')
        ->paginate(20)
        ->withQueryString();

    $filters = $request->only(['q']);

    return view('admin.crawlers.index', [
        'items' => $page,
        'filters' => $filters,
    ]);
}
/**
 * Shows the "new crawl rule" form, pre-filled with default values.
 *
 * The defaults are placed on an unsaved CrawlRule instance so the form view
 * can be shared with the edit action.
 */
public function create(): View
{
    // Default extractor: XPath mode with a minimal field set plus AI tuning values.
    $extractorDefaults = [
        'mode' => 'xpath',
        'list_link_xpath' => '//a/@href',
        'fields' => [
            'name' => '//h1/text()',
            'summary' => '//meta[@name="description"]/@content',
        ],
        'ai' => [
            'temperature' => 0,
            'content_max_chars' => 12000,
        ],
    ];

    $defaults = [
        'enabled' => true,
        'target_module' => CrawlTargetModule::Tool,
        'cron_expression' => '0 */6 * * *',
        'timezone' => 'Asia/Shanghai',
        'max_pages' => 50,
        'rate_limit_per_minute' => 30,
        'retry_max' => 3,
        'retry_backoff_seconds' => 60,
        'extractor_config' => $extractorDefaults,
        'mapping_config' => [],
        'dedupe_config' => [],
        'publish_policy' => 'draft',
        'ai_provider' => 'openai_compatible',
        'ai_fallback_enabled' => false,
    ];

    $draft = new CrawlRule($defaults);

    return view('admin.crawlers.form', [
        'item' => $draft,
        'method' => 'POST',
        'submitRoute' => route('admin.crawlers.store'),
    ]);
}
/**
 * Persists a new crawl rule from the validated form request.
 *
 * The rule is created first, then its next run time is computed from the
 * stored rule and saved. Both audit columns (`created_by`, `updated_by`)
 * are written as null. Redirects to the edit form with a status message.
 */
public function store(CrawlRuleRequest $request): RedirectResponse
{
    $attributes = $request->normalizedPayload();
    $attributes['created_by'] = null;
    $attributes['updated_by'] = null;

    $rule = CrawlRule::query()->create($attributes);

    // The schedule service needs the created rule, hence the second write.
    $nextRunAt = $this->scheduleService->nextRunAt($rule);
    $rule->next_run_at = $nextRunAt;
    $rule->save();

    return redirect()
        ->route('admin.crawlers.edit', $rule)
        ->with('status', '采集规则已创建。');
}
/**
 * Shows the edit form for the given crawl rule.
 *
 * Uses the same form view as the create action, submitting via PUT to the
 * update route.
 */
public function edit(CrawlRule $crawler): View
{
    $viewData = [
        'item' => $crawler,
        'method' => 'PUT',
        'submitRoute' => route('admin.crawlers.update', $crawler),
    ];

    return view('admin.crawlers.form', $viewData);
}
/**
 * Applies the validated form request to an existing crawl rule.
 *
 * The next run time is recomputed from the rule after the new values have
 * been filled in, and everything is saved in one write. `updated_by` is
 * written as null. Redirects back to the edit form with a status message.
 */
public function update(CrawlRuleRequest $request, CrawlRule $crawler): RedirectResponse
{
    $changes = $request->normalizedPayload();
    $changes['updated_by'] = null;

    $crawler->fill($changes);

    // Computed after fill() so the schedule reflects the submitted values.
    $nextRunAt = $this->scheduleService->nextRunAt($crawler);
    $crawler->next_run_at = $nextRunAt;
    $crawler->save();

    return redirect()
        ->route('admin.crawlers.edit', $crawler)
        ->with('status', '采集规则已更新。');
}
/**
 * Dispatches a manually triggered crawl job for the given rule.
 *
 * Redirects to the crawl-run list filtered to this rule, with a status
 * message.
 */
public function run(CrawlRule $crawler): RedirectResponse
{
    $ruleId = $crawler->id;
    $trigger = CrawlTriggerType::Manual->value;

    RunCrawlRuleJob::dispatch($ruleId, $trigger);

    return redirect()
        ->route('admin.crawl-runs.index', ['rule_id' => $crawler->id])
        ->with('status', '已提交手动执行任务。');
}
/**
 * Fetches a remote page and returns a sanitised copy of it as JSON.
 *
 * Request fields: `url` (required, valid URL, max 2000 chars) and
 * `user_agent` (optional, max 255 chars).
 *
 * Responds with HTTP 422 and `ok: false` when the URL is rejected by the
 * safety check or the page cannot be fetched. On success the response
 * carries the URL, the extracted page title and the sanitised HTML.
 */
public function preview(Request $request): JsonResponse
{
    $input = $request->validate([
        'url' => ['required', 'url', 'max:2000'],
        'user_agent' => ['nullable', 'string', 'max:255'],
    ]);

    // All failure responses share the same JSON shape and status code.
    $reject = static fn (string $message): JsonResponse => response()->json([
        'ok' => false,
        'message' => $message,
    ], 422);

    $target = (string) $input['url'];
    if (! $this->isSafePreviewUrl($target)) {
        return $reject('预览地址不安全,已拒绝请求。');
    }

    $page = $this->fetchHtml($target, $input['user_agent'] ?? null);
    if (! $page['ok']) {
        return $reject('页面抓取失败:'.($page['error'] ?? 'unknown'));
    }

    $cleanHtml = $this->sanitizePreviewHtml($page['body']);
    $title = $this->extractTitle($cleanHtml);

    return response()->json([
        'ok' => true,
        'url' => $target,
        'title' => $title,
        'html' => $cleanHtml,
    ]);
}
/**
 * Asks the AI extractor to propose an extractor configuration for a page.
 *
 * Steps, each of which can end the request with HTTP 422 and `ok: false`:
 * validate the input, confirm the AI extractor is configured, check the
 * URL with the safety check, fetch the page, then hand the sanitised HTML
 * and any overrides to the extractor. An empty suggestion is reported as a
 * failure, using the extractor's last error when it has one.
 *
 * Optional overrides (`ai_model`, `ai_system_prompt`, `ai_user_prompt`,
 * `ai_temperature`, `ai_content_max_chars`) are forwarded only when they
 * were actually supplied; blank strings are ignored.
 */
public function aiSuggestExtractor(Request $request): JsonResponse
{
    $input = $request->validate([
        'url' => ['required', 'url', 'max:2000'],
        'target_module' => ['required', 'in:tool,model'],
        'user_agent' => ['nullable', 'string', 'max:255'],
        'ai_model' => ['nullable', 'string', 'max:128'],
        'ai_system_prompt' => ['nullable', 'string', 'max:4000'],
        'ai_user_prompt' => ['nullable', 'string', 'max:4000'],
        'ai_temperature' => ['nullable', 'numeric', 'between:0,2'],
        'ai_content_max_chars' => ['nullable', 'integer', 'between:500,50000'],
    ]);

    // All failure responses share the same JSON shape and status code.
    $reject = static fn (string $message): JsonResponse => response()->json([
        'ok' => false,
        'message' => $message,
    ], 422);

    if (! $this->aiExtractor->isConfigured()) {
        return $reject('AI 配置缺失,请先设置 CRAWLER_AI_KEY 与接口地址。');
    }

    $target = (string) $input['url'];
    if (! $this->isSafePreviewUrl($target)) {
        return $reject('目标 URL 不安全,已拒绝请求。');
    }

    $page = $this->fetchHtml($target, $input['user_agent'] ?? null);
    if (! $page['ok']) {
        return $reject('页面抓取失败:'.($page['error'] ?? 'unknown'));
    }

    // Text overrides: extractor option name => request field name.
    $overrides = [];
    $textFields = [
        'model' => 'ai_model',
        'system_prompt' => 'ai_system_prompt',
        'user_prompt' => 'ai_user_prompt',
    ];
    foreach ($textFields as $optionName => $field) {
        $raw = $input[$field] ?? null;
        if (! is_string($raw)) {
            continue;
        }

        $clean = trim($raw);
        if ($clean !== '') {
            $overrides[$optionName] = $clean;
        }
    }

    // Numeric overrides: skipped when absent, null or an empty string.
    if (($input['ai_temperature'] ?? '') !== '') {
        $overrides['temperature'] = (float) $input['ai_temperature'];
    }
    if (($input['ai_content_max_chars'] ?? '') !== '') {
        $overrides['content_max_chars'] = (int) $input['ai_content_max_chars'];
    }

    $suggestion = $this->aiExtractor->suggestExtractorConfig(
        (string) $input['target_module'],
        $this->sanitizePreviewHtml($page['body']),
        $overrides,
    );

    if ($suggestion !== []) {
        return response()->json([
            'ok' => true,
            'extractor_config' => $suggestion,
        ]);
    }

    $reason = $this->aiExtractor->lastError();
    if ($reason !== null && $reason !== '') {
        return $reject('AI 生成失败:'.$reason);
    }

    return $reject('AI 未生成有效规则,请调整页面或提示词后重试。');
}
/**
 * Downloads a page over HTTP(S) for the preview and AI-suggestion actions.
 *
 * Up to three attempts are made. A retry (after a short, growing pause)
 * happens only for 5xx responses and for thrown transport errors; any
 * other non-2xx status ends the loop immediately.
 *
 * Redirects are still followed, but every redirect target is re-checked
 * with isSafePreviewUrl(). Without that check a public URL could answer
 * with a redirect to a loopback or private address and bypass the safety
 * check the callers performed on the original URL. A blocked redirect is
 * reported as a failure and is not retried.
 *
 * @param string      $url       Absolute URL to fetch; callers validate it before calling.
 * @param string|null $userAgent Optional User-Agent; null or blank falls back to
 *                               config('crawler.default_user_agent').
 *
 * @return array{ok: bool, body: string, error: string|null}
 */
private function fetchHtml(string $url, ?string $userAgent = null): array
{
    $ua = is_string($userAgent) && trim($userAgent) !== ''
        ? trim($userAgent)
        : (string) config('crawler.default_user_agent', 'AIWebCrawler/1.0');

    // Loop-invariant settings: read once instead of on every attempt.
    $timeout = max((int) config('crawler.request_timeout_seconds', 20), 5);
    $verifySsl = (bool) config('crawler.verify_ssl', true);

    $maxAttempts = 3;
    $lastError = 'unknown';

    // Set by the redirect guard so the catch block can tell a blocked
    // redirect apart from a transient transport error.
    $redirectBlocked = false;
    $redirectGuard = function (
        \Psr\Http\Message\RequestInterface $from,
        \Psr\Http\Message\ResponseInterface $via,
        \Psr\Http\Message\UriInterface $target,
    ) use (&$redirectBlocked): void {
        if (! $this->isSafePreviewUrl((string) $target)) {
            $redirectBlocked = true;

            throw new \RuntimeException('重定向目标地址不安全,已拒绝请求');
        }
    };

    for ($attempt = 1; $attempt <= $maxAttempts; $attempt++) {
        try {
            $request = Http::timeout($timeout);
            if (! $verifySsl) {
                $request = $request->withoutVerifying();
            }

            $request = $this->applyNetworkOptions($request)
                ->withOptions(['allow_redirects' => ['on_redirect' => $redirectGuard]]);

            $response = $request->withUserAgent($ua)->get($url);
            if ($response->successful()) {
                return [
                    'ok' => true,
                    'body' => $response->body(),
                    'error' => null,
                ];
            }

            $lastError = sprintf('HTTP %d', $response->status());

            // Only server-side failures are worth retrying; 3xx/4xx will not change.
            if ($attempt < $maxAttempts && $response->serverError()) {
                usleep(250000 * $attempt);
                continue;
            }

            break;
        } catch (\Throwable $exception) {
            $lastError = $exception->getMessage();

            // An unsafe redirect is a policy decision, not a transient error.
            if ($redirectBlocked) {
                break;
            }

            if ($attempt < $maxAttempts) {
                usleep(250000 * $attempt);
                continue;
            }
        }
    }

    return [
        'ok' => false,
        'body' => '',
        'error' => $lastError,
    ];
}
/**
 * Adds the optional network settings from the crawler config to a request.
 *
 * - `crawler.force_ipv4` (bool): sets the `force_ip_resolve` option to `v4`.
 * - `crawler.dns_servers` (string): passed to cURL as CURLOPT_DNS_SERVERS,
 *   but only when that constant is defined in the running PHP build.
 *
 * Returns the request unchanged when neither setting applies.
 */
private function applyNetworkOptions(\Illuminate\Http\Client\PendingRequest $request): \Illuminate\Http\Client\PendingRequest
{
    $extra = [];

    $forceIpv4 = (bool) config('crawler.force_ipv4', false);
    if ($forceIpv4) {
        $extra['force_ip_resolve'] = 'v4';
    }

    $servers = trim((string) config('crawler.dns_servers', ''));
    if (defined('CURLOPT_DNS_SERVERS') && $servers !== '') {
        $extra['curl'] = [CURLOPT_DNS_SERVERS => $servers];
    }

    return $extra === [] ? $request : $request->withOptions($extra);
}
/**
 * Decides whether a URL may be fetched server-side (SSRF guard).
 *
 * A URL is rejected when:
 * - it cannot be parsed, has no host, or uses a scheme other than http/https;
 * - its host is `localhost` or ends in `.localhost`;
 * - its host is an IP literal outside the public ranges (see isPublicIp());
 * - any address the host name resolves to is outside the public ranges.
 *
 * The host is normalised before the checks: the brackets parse_url() keeps
 * around IPv6 literals ("[::1]") are stripped so the literal is recognised
 * as an IP, and a trailing root dot ("localhost.") is removed.
 *
 * NOTE(review): a host that resolves to nothing at all is still accepted,
 * as before. Confirm whether that should fail closed instead; it may be
 * intentional for setups where the fetch uses `crawler.dns_servers`.
 *
 * @param string $url Absolute URL supplied by the client.
 */
private function isSafePreviewUrl(string $url): bool
{
    $parts = parse_url($url);
    if (! is_array($parts)) {
        return false;
    }

    $scheme = strtolower((string) ($parts['scheme'] ?? ''));
    $host = strtolower((string) ($parts['host'] ?? ''));

    // parse_url() returns IPv6 literals with their brackets; FILTER_VALIDATE_IP
    // needs the bare address, otherwise "[::1]" is mistaken for a host name.
    if (str_starts_with($host, '[') && str_ends_with($host, ']')) {
        $host = substr($host, 1, -1);
    }

    // "localhost." and "localhost" are the same name.
    $host = rtrim($host, '.');

    if (! in_array($scheme, ['http', 'https'], true) || $host === '') {
        return false;
    }

    if ($host === 'localhost' || str_ends_with($host, '.localhost')) {
        return false;
    }

    if (filter_var($host, FILTER_VALIDATE_IP) !== false) {
        return $this->isPublicIp($host);
    }

    $addresses = [];

    $records = @dns_get_record($host, DNS_A | DNS_AAAA);
    if (is_array($records)) {
        foreach ($records as $record) {
            $ip = (string) ($record['ip'] ?? $record['ipv6'] ?? '');
            if ($ip !== '') {
                $addresses[] = $ip;
            }
        }
    }

    // A raw DNS query can come back empty for names the system resolver
    // still knows (presumably hosts-file entries and numeric host forms),
    // so ask the resolver as well before accepting the host.
    if ($addresses === []) {
        $resolved = gethostbynamel($host);
        if (is_array($resolved)) {
            $addresses = $resolved;
        }
    }

    foreach ($addresses as $ip) {
        if (! $this->isPublicIp($ip)) {
            return false;
        }
    }

    return true;
}
/**
 * Tells whether a string is a valid IP address outside the private and
 * reserved ranges that PHP's IP filter flags exclude.
 *
 * Returns false for anything that is not a valid IP address.
 */
private function isPublicIp(string $ip): bool
{
    $excludedRanges = FILTER_FLAG_NO_PRIV_RANGE | FILTER_FLAG_NO_RES_RANGE;
    $validated = filter_var($ip, FILTER_VALIDATE_IP, $excludedRanges);

    return $validated !== false;
}
/**
 * Produces a copy of fetched HTML with active content removed.
 *
 * Removed: `script`, `noscript`, `iframe`, `object`, `embed` and `base`
 * elements; every attribute whose name starts with "on" (inline event
 * handlers); and URL-bearing attributes whose value uses the
 * `javascript:` or `vbscript:` scheme. The serialised result is cut to at
 * most 300000 characters.
 *
 * Blank input yields a small placeholder document instead.
 *
 * The libxml error mode is global state, so the previous setting is
 * restored before returning.
 *
 * @param string $html Raw HTML as fetched from the remote page.
 */
private function sanitizePreviewHtml(string $html): string
{
    if (trim($html) === '') {
        return '<!doctype html><html><head><meta charset="utf-8"></head><body>空页面</body></html>';
    }

    $previousErrorMode = libxml_use_internal_errors(true);

    try {
        $dom = new \DOMDocument('1.0', 'UTF-8');
        // The XML declaration prefix makes libxml read the markup as UTF-8.
        $dom->loadHTML('<?xml encoding="UTF-8">'.$html, LIBXML_NOWARNING | LIBXML_NOERROR);

        $xpath = new \DOMXPath($dom);

        foreach (['//script', '//noscript', '//iframe', '//object', '//embed', '//base'] as $query) {
            $nodes = $xpath->query($query);
            if ($nodes === false) {
                continue;
            }

            // Walk backwards so removals cannot disturb the remaining indexes.
            for ($index = $nodes->length - 1; $index >= 0; $index--) {
                $node = $nodes->item($index);
                if ($node !== null && $node->parentNode !== null) {
                    $node->parentNode->removeChild($node);
                }
            }
        }

        $urlAttributes = ['href', 'src', 'action', 'formaction', 'xlink:href', 'poster', 'background', 'data'];

        $allNodes = $xpath->query('//*');
        if ($allNodes !== false) {
            foreach ($allNodes as $node) {
                if (! $node instanceof \DOMElement) {
                    continue;
                }

                // Collect first, remove afterwards: the attribute map is live.
                $attributesToRemove = [];
                foreach ($node->attributes as $attribute) {
                    $name = strtolower($attribute->name);

                    if (str_starts_with($name, 'on')) {
                        $attributesToRemove[] = $attribute->name;
                        continue;
                    }

                    if (in_array($name, $urlAttributes, true)) {
                        // Whitespace and control characters are dropped before the
                        // scheme test so "java\nscript:" style values are caught too.
                        $compact = strtolower((string) preg_replace('/[\x00-\x20]+/', '', (string) $attribute->value));
                        if (str_starts_with($compact, 'javascript:') || str_starts_with($compact, 'vbscript:')) {
                            $attributesToRemove[] = $attribute->name;
                        }
                    }
                }

                foreach ($attributesToRemove as $attributeName) {
                    $node->removeAttribute($attributeName);
                }
            }
        }

        $output = (string) $dom->saveHTML();
    } finally {
        libxml_clear_errors();
        libxml_use_internal_errors($previousErrorMode);
    }

    return mb_substr($output, 0, 300000);
}
/**
 * Returns the text of the first `<title>` element found in the markup.
 *
 * Nested tags are stripped and surrounding whitespace is trimmed. Returns
 * an empty string when no title element is matched.
 */
private function extractTitle(string $html): string
{
    $found = preg_match('/<title[^>]*>(.*?)<\/title>/is', $html, $captures);

    return $found === 1
        ? trim(strip_tags((string) $captures[1]))
        : '';
}
}