ai-web/app/Http/Controllers/Admin/CrawlerRuleController.php

<?php

declare(strict_types=1);

namespace App\Http\Controllers\Admin;

use App\Enums\CrawlTargetModule;
use App\Enums\CrawlTriggerType;
use App\Http\Controllers\Controller;
use App\Http\Requests\Admin\CrawlRuleRequest;
use App\Jobs\RunCrawlRuleJob;
use App\Models\CrawlRule;
use App\Services\Crawler\CrawlRuleScheduleService;
use App\Services\Crawler\OpenAiFallbackExtractor;
use Illuminate\Http\JsonResponse;
use Illuminate\Http\RedirectResponse;
use Illuminate\Http\Request;
use Illuminate\Support\Facades\Http;
use Illuminate\View\View;

class CrawlerRuleController extends Controller
{
    /**
     * Inject the collaborators used by this controller.
     *
     * @param CrawlRuleScheduleService $scheduleService Used by store() and update() to compute a rule's next_run_at.
     * @param OpenAiFallbackExtractor  $aiExtractor     Used by aiSuggestExtractor() to generate an extractor config.
     */
    public function __construct(
        private readonly CrawlRuleScheduleService $scheduleService,
        private readonly OpenAiFallbackExtractor $aiExtractor,
    ) {
    }

    /**
     * Paginated list of crawl rules (20 per page), most recently updated first.
     *
     * When the request carries a non-empty `q` parameter, the list is narrowed
     * to rules whose name contains that keyword (SQL LIKE match).
     */
    public function index(Request $request): View
    {
        $rules = CrawlRule::query()->withCount('runs');

        if ($request->filled('q')) {
            $pattern = '%'.trim((string) $request->string('q')).'%';
            $rules->where('name', 'like', $pattern);
        }

        // withQueryString() keeps the active filter in the pagination links.
        $page = $rules
            ->latest('updated_at')
            ->paginate(20)
            ->withQueryString();

        $viewData = [
            'items' => $page,
            'filters' => $request->only(['q']),
        ];

        return view('admin.crawlers.index', $viewData);
    }

    /**
     * Show the rule form for a new, unsaved CrawlRule pre-filled with defaults.
     */
    public function create(): View
    {
        // Default XPath-based extractor; the `ai` sub-section carries AI tuning defaults.
        $extractorDefaults = [
            'mode' => 'xpath',
            'list_link_xpath' => '//a/@href',
            'fields' => [
                'name' => '//h1/text()',
                'summary' => '//meta[@name="description"]/@content',
            ],
            'ai' => [
                'temperature' => 0,
                'content_max_chars' => 12000,
            ],
        ];

        $ruleDefaults = [
            'enabled' => true,
            'target_module' => CrawlTargetModule::Tool,
            'cron_expression' => '0 */6 * * *',
            'timezone' => 'Asia/Shanghai',
            'max_pages' => 50,
            'rate_limit_per_minute' => 30,
            'retry_max' => 3,
            'retry_backoff_seconds' => 60,
            'extractor_config' => $extractorDefaults,
            'mapping_config' => [],
            'dedupe_config' => [],
            'publish_policy' => 'draft',
            'ai_provider' => 'openai_compatible',
            'ai_fallback_enabled' => false,
        ];

        $draft = new CrawlRule($ruleDefaults);

        return view('admin.crawlers.form', [
            'item' => $draft,
            'method' => 'POST',
            'submitRoute' => route('admin.crawlers.store'),
        ]);
    }

    /**
     * Persist a new crawl rule from the validated request, then redirect to its edit form.
     *
     * The rule is created first and saved a second time once the schedule
     * service has computed its next_run_at from the created model.
     */
    public function store(CrawlRuleRequest $request): RedirectResponse
    {
        $attributes = $request->normalizedPayload();

        // Both audit columns are explicitly written as null.
        foreach (['created_by', 'updated_by'] as $auditColumn) {
            $attributes[$auditColumn] = null;
        }

        $rule = CrawlRule::query()->create($attributes);

        $nextRunAt = $this->scheduleService->nextRunAt($rule);
        $rule->next_run_at = $nextRunAt;
        $rule->save();

        $redirect = redirect()->route('admin.crawlers.edit', $rule);

        return $redirect->with('status', '采集规则已创建。');
    }

    /**
     * Show the rule form bound to an existing crawl rule, submitting via PUT to the update route.
     */
    public function edit(CrawlRule $crawler): View
    {
        $updateUrl = route('admin.crawlers.update', $crawler);

        $viewData = [
            'item' => $crawler,
            'method' => 'PUT',
            'submitRoute' => $updateUrl,
        ];

        return view('admin.crawlers.form', $viewData);
    }

    /**
     * Apply the validated request to an existing rule, recompute next_run_at, and save.
     *
     * Redirects back to the rule's edit form with a status flash message.
     */
    public function update(CrawlRuleRequest $request, CrawlRule $crawler): RedirectResponse
    {
        $attributes = $request->normalizedPayload();
        // The updated_by audit column is explicitly written as null.
        $attributes['updated_by'] = null;

        $crawler->fill($attributes);

        // The schedule is derived from the already-filled (not yet saved) model.
        $nextRunAt = $this->scheduleService->nextRunAt($crawler);
        $crawler->next_run_at = $nextRunAt;
        $crawler->save();

        $redirect = redirect()->route('admin.crawlers.edit', $crawler);

        return $redirect->with('status', '采集规则已更新。');
    }

    /**
     * Dispatch a manually-triggered crawl job for the rule and redirect to its run list.
     */
    public function run(CrawlRule $crawler): RedirectResponse
    {
        $ruleId = $crawler->id;

        RunCrawlRuleJob::dispatch($ruleId, CrawlTriggerType::Manual->value);

        $redirect = redirect()->route('admin.crawl-runs.index', ['rule_id' => $ruleId]);

        return $redirect->with('status', '已提交手动执行任务。');
    }

    /**
     * Fetch a remote page and return a sanitised copy of its HTML as JSON.
     *
     * Responds with HTTP 422 and `ok: false` when the URL is rejected by the
     * safety check or when the page cannot be fetched.
     */
    public function preview(Request $request): JsonResponse
    {
        $input = $request->validate([
            'url' => ['required', 'url', 'max:2000'],
            'user_agent' => ['nullable', 'string', 'max:255'],
        ]);

        $targetUrl = (string) $input['url'];

        if ($this->isSafePreviewUrl($targetUrl) === false) {
            $rejection = [
                'ok' => false,
                'message' => '预览地址不安全，已拒绝请求。',
            ];

            return response()->json($rejection, 422);
        }

        $result = $this->fetchHtml($targetUrl, $input['user_agent'] ?? null);

        if ($result['ok'] === false) {
            $failure = [
                'ok' => false,
                'message' => '页面抓取失败：'.($result['error'] ?? 'unknown'),
            ];

            return response()->json($failure, 422);
        }

        $cleanHtml = $this->sanitizePreviewHtml($result['body']);
        $pageTitle = $this->extractTitle($cleanHtml);

        return response()->json([
            'ok' => true,
            'url' => $targetUrl,
            'title' => $pageTitle,
            'html' => $cleanHtml,
        ]);
    }

    /**
     * Ask the AI extractor to propose an extractor config for a remote page.
     *
     * Validation happens first, then in order: the AI configuration check, the
     * URL safety check, the page fetch, and the AI call on the sanitised HTML.
     * Every failure is reported as HTTP 422 with `ok: false` and a message.
     */
    public function aiSuggestExtractor(Request $request): JsonResponse
    {
        $input = $request->validate([
            'url' => ['required', 'url', 'max:2000'],
            'target_module' => ['required', 'in:tool,model'],
            'user_agent' => ['nullable', 'string', 'max:255'],
            'ai_model' => ['nullable', 'string', 'max:128'],
            'ai_system_prompt' => ['nullable', 'string', 'max:4000'],
            'ai_user_prompt' => ['nullable', 'string', 'max:4000'],
            'ai_temperature' => ['nullable', 'numeric', 'between:0,2'],
            'ai_content_max_chars' => ['nullable', 'integer', 'between:500,50000'],
        ]);

        if ($this->aiExtractor->isConfigured() === false) {
            return response()->json([
                'ok' => false,
                'message' => 'AI 配置缺失，请先设置 CRAWLER_AI_KEY 与接口地址。',
            ], 422);
        }

        $targetUrl = (string) $input['url'];

        if ($this->isSafePreviewUrl($targetUrl) === false) {
            return response()->json([
                'ok' => false,
                'message' => '目标 URL 不安全，已拒绝请求。',
            ], 422);
        }

        $result = $this->fetchHtml($targetUrl, $input['user_agent'] ?? null);

        if ($result['ok'] === false) {
            return response()->json([
                'ok' => false,
                'message' => '页面抓取失败：'.($result['error'] ?? 'unknown'),
            ], 422);
        }

        // Free-text overrides are forwarded only when they are non-blank strings.
        $textOverrides = [
            'model' => $input['ai_model'] ?? null,
            'system_prompt' => $input['ai_system_prompt'] ?? null,
            'user_prompt' => $input['ai_user_prompt'] ?? null,
        ];

        $aiOptions = [];
        foreach ($textOverrides as $optionKey => $rawValue) {
            if (! is_string($rawValue)) {
                continue;
            }

            $cleaned = trim($rawValue);
            if ($cleaned === '') {
                continue;
            }

            $aiOptions[$optionKey] = $cleaned;
        }

        // Numeric overrides are skipped when absent, null, or an empty string.
        if (($input['ai_temperature'] ?? '') !== '') {
            $aiOptions['temperature'] = (float) $input['ai_temperature'];
        }

        if (($input['ai_content_max_chars'] ?? '') !== '') {
            $aiOptions['content_max_chars'] = (int) $input['ai_content_max_chars'];
        }

        $cleanHtml = $this->sanitizePreviewHtml($result['body']);

        $suggestion = $this->aiExtractor->suggestExtractorConfig(
            (string) $input['target_module'],
            $cleanHtml,
            $aiOptions,
        );

        if ($suggestion !== []) {
            return response()->json([
                'ok' => true,
                'extractor_config' => $suggestion,
            ]);
        }

        // An empty suggestion is a failure; prefer the extractor's own error text.
        $reason = $this->aiExtractor->lastError();

        if ($reason !== null && $reason !== '') {
            $message = 'AI 生成失败：'.$reason;
        } else {
            $message = 'AI 未生成有效规则，请调整页面或提示词后重试。';
        }

        return response()->json([
            'ok' => false,
            'message' => $message,
        ], 422);
    }

    /**
     * GET a page with up to three attempts and return its body.
     *
     * Retries (with a linearly growing 250 ms back-off) happen on thrown
     * exceptions and on 5xx responses; any other non-2xx status fails at once.
     *
     * Redirects are still followed, but every hop is re-checked with
     * isSafePreviewUrl(). Without that, a public URL that passed the initial
     * check could redirect the server-side request to an internal address.
     * A rejected hop aborts the fetch immediately and is never retried.
     *
     * NOTE(review): the host is resolved once for validation and again by the
     * HTTP client, so a DNS answer that changes in between is not covered here.
     *
     * @param string      $url       Absolute http(s) URL, already validated by the caller.
     * @param string|null $userAgent Optional User-Agent; blank or null falls back to
     *                               config('crawler.default_user_agent').
     *
     * @return array{ok: bool, body: string, error: string|null}
     */
    private function fetchHtml(string $url, ?string $userAgent = null): array
    {
        $ua = is_string($userAgent) && trim($userAgent) !== ''
            ? trim($userAgent)
            : (string) config('crawler.default_user_agent', 'AIWebCrawler/1.0');

        $maxAttempts = 3;
        $lastError = 'unknown';
        $unsafeRedirectMessage = '重定向目标地址不安全，已拒绝跟随。';

        // Flag set by the redirect guard; a flag (rather than the exception type)
        // is used so the rejection is recognised even if the client wraps the exception.
        $redirectBlocked = false;

        $redirectGuard = function (
            \Psr\Http\Message\RequestInterface $request,
            \Psr\Http\Message\ResponseInterface $response,
            \Psr\Http\Message\UriInterface $target,
        ) use (&$redirectBlocked, $unsafeRedirectMessage): void {
            if (! $this->isSafePreviewUrl((string) $target)) {
                $redirectBlocked = true;

                throw new \RuntimeException($unsafeRedirectMessage);
            }
        };

        for ($attempt = 1; $attempt <= $maxAttempts; $attempt++) {
            try {
                // The configured timeout is floored at 5 seconds.
                $request = Http::timeout(max((int) config('crawler.request_timeout_seconds', 20), 5));
                if (! (bool) config('crawler.verify_ssl', true)) {
                    $request = $request->withoutVerifying();
                }

                $request = $this->applyNetworkOptions($request)->withOptions([
                    'allow_redirects' => [
                        'protocols' => ['http', 'https'],
                        'on_redirect' => $redirectGuard,
                    ],
                ]);
                $response = $request->withUserAgent($ua)->get($url);

                if ($response->successful()) {
                    return [
                        'ok' => true,
                        'body' => $response->body(),
                        'error' => null,
                    ];
                }

                $lastError = sprintf('HTTP %d', $response->status());
                if ($attempt < $maxAttempts && $response->serverError()) {
                    usleep(250000 * $attempt);
                    continue;
                }

                break;
            } catch (\Throwable $exception) {
                if ($redirectBlocked) {
                    // A rejected redirect target is a policy decision, not a transient error.
                    $lastError = $unsafeRedirectMessage;
                    break;
                }

                $lastError = $exception->getMessage();
                if ($attempt < $maxAttempts) {
                    usleep(250000 * $attempt);
                    continue;
                }
            }
        }

        return [
            'ok' => false,
            'body' => '',
            'error' => $lastError,
        ];
    }

    /**
     * Apply optional, config-driven network tweaks to an outgoing request.
     *
     * - crawler.force_ipv4 adds the `force_ip_resolve => v4` option.
     * - crawler.dns_servers (when non-empty and the cURL constant exists) is
     *   passed through as CURLOPT_DNS_SERVERS.
     *
     * The request is returned untouched when neither setting applies.
     */
    private function applyNetworkOptions(\Illuminate\Http\Client\PendingRequest $request): \Illuminate\Http\Client\PendingRequest
    {
        $extra = [];

        $forceIpv4 = (bool) config('crawler.force_ipv4', false);
        if ($forceIpv4) {
            $extra['force_ip_resolve'] = 'v4';
        }

        $resolvers = trim((string) config('crawler.dns_servers', ''));
        if (defined('CURLOPT_DNS_SERVERS') && $resolvers !== '') {
            $extra['curl'] = [CURLOPT_DNS_SERVERS => $resolvers];
        }

        return $extra === [] ? $request : $request->withOptions($extra);
    }

    /**
     * Decide whether a user-supplied URL may be fetched server-side.
     *
     * A URL is accepted only when it is http(s), its host is not a localhost
     * name, and every address the host resolves to passes isPublicIp().
     * The check fails closed: a host that cannot be resolved is rejected.
     *
     * Both DNS records (dns_get_record) and the system resolver
     * (gethostbynamel) are consulted. The system resolver is included so that
     * hosts-file entries and numeric host forms are also checked.
     *
     * @param string $url Candidate URL.
     *
     * @return bool True when the URL is considered safe to request.
     */
    private function isSafePreviewUrl(string $url): bool
    {
        $parts = parse_url($url);
        if (! is_array($parts)) {
            return false;
        }

        $scheme = strtolower((string) ($parts['scheme'] ?? ''));
        $host = strtolower((string) ($parts['host'] ?? ''));

        if (! in_array($scheme, ['http', 'https'], true) || $host === '') {
            return false;
        }

        // parse_url() keeps the brackets of an IPv6 literal ("[::1]"), which
        // FILTER_VALIDATE_IP does not accept; unwrap them before validating.
        if (str_starts_with($host, '[') && str_ends_with($host, ']')) {
            $host = substr($host, 1, -1);
        }

        // "localhost." is the same host as "localhost"; drop the root dot.
        $host = rtrim($host, '.');

        // 253 is the maximum length of a valid DNS name.
        if ($host === '' || strlen($host) > 253) {
            return false;
        }

        if ($host === 'localhost' || str_ends_with($host, '.localhost')) {
            return false;
        }

        if (filter_var($host, FILTER_VALIDATE_IP) !== false) {
            return $this->isPublicIp($host);
        }

        $addresses = [];

        // dns_get_record() raises a warning on lookup failure; a failed lookup
        // is handled below, so the warning itself is suppressed.
        $records = @dns_get_record($host, DNS_A | DNS_AAAA);
        if (is_array($records)) {
            foreach ($records as $record) {
                $ip = (string) ($record['ip'] ?? $record['ipv6'] ?? '');
                if ($ip !== '') {
                    $addresses[] = $ip;
                }
            }
        }

        $systemResolved = gethostbynamel($host);
        if (is_array($systemResolved)) {
            foreach ($systemResolved as $ip) {
                $addresses[] = $ip;
            }
        }

        // Fail closed: an unresolvable host is never treated as safe.
        if ($addresses === []) {
            return false;
        }

        foreach ($addresses as $ip) {
            if (! $this->isPublicIp($ip)) {
                return false;
            }
        }

        return true;
    }

    /**
     * Tell whether a string is a valid IP address outside the private and
     * reserved ranges, as defined by PHP's FILTER_VALIDATE_IP flags.
     */
    private function isPublicIp(string $ip): bool
    {
        $rangeFlags = FILTER_FLAG_NO_PRIV_RANGE | FILTER_FLAG_NO_RES_RANGE;
        $validated = filter_var($ip, FILTER_VALIDATE_IP, $rangeFlags);

        return $validated !== false;
    }

    /**
     * Parse untrusted HTML and strip the parts that could run code or move the
     * preview to another document, then return the re-serialised markup.
     *
     * Removed:
     * - script, noscript, iframe, frame, object, embed and base elements;
     * - meta elements whose http-equiv is "refresh" (any letter case);
     * - every attribute whose name starts with "on" (inline event handlers);
     * - URL-bearing attributes whose value is a javascript:, vbscript: or
     *   data:text/html URL.
     *
     * The result is truncated to 300000 characters. Blank input yields a fixed
     * placeholder document. The process-wide libxml error mode is restored to
     * its previous value before returning.
     *
     * @param string $html Raw markup fetched from a remote site.
     *
     * @return string Sanitised HTML.
     */
    private function sanitizePreviewHtml(string $html): string
    {
        if (trim($html) === '') {
            return '<!doctype html><html><head><meta charset="utf-8"></head><body>空页面</body></html>';
        }

        $dom = new \DOMDocument('1.0', 'UTF-8');

        // libxml's error mode is process-global; remember and restore it so
        // other XML/HTML code in the same process is not affected.
        $previousErrorMode = libxml_use_internal_errors(true);
        try {
            // The XML declaration prefix makes libxml read the markup as UTF-8.
            $dom->loadHTML('<?xml encoding="UTF-8">'.$html, LIBXML_NOWARNING | LIBXML_NOERROR);
        } finally {
            libxml_clear_errors();
            libxml_use_internal_errors($previousErrorMode);
        }

        $xpath = new \DOMXPath($dom);

        $removalQueries = [
            '//script',
            '//noscript',
            '//iframe',
            '//frame',
            '//object',
            '//embed',
            '//base',
            // XPath 1.0 has no lower-case(); translate() gives a case-insensitive match.
            '//meta[translate(@http-equiv, "ABCDEFGHIJKLMNOPQRSTUVWXYZ", "abcdefghijklmnopqrstuvwxyz") = "refresh"]',
        ];

        foreach ($removalQueries as $query) {
            $nodes = $xpath->query($query);
            if ($nodes === false) {
                continue;
            }

            // Walk backwards so nested matches are detached before their ancestors.
            for ($index = $nodes->length - 1; $index >= 0; $index--) {
                $node = $nodes->item($index);
                if ($node !== null && $node->parentNode !== null) {
                    $node->parentNode->removeChild($node);
                }
            }
        }

        $urlAttributes = ['href', 'src', 'action', 'formaction', 'xlink:href', 'data', 'poster', 'background'];

        // Browsers ignore whitespace and control characters inside a URL
        // scheme, so they are stripped before the scheme is inspected.
        $isScriptUrl = static function (string $value): bool {
            $compact = strtolower((string) preg_replace('/[\x00-\x20]+/', '', $value));

            return preg_match('/^(?:javascript:|vbscript:|data:text\/html)/', $compact) === 1;
        };

        $allNodes = $xpath->query('//*');
        if ($allNodes !== false) {
            foreach ($allNodes as $node) {
                if (! $node instanceof \DOMElement) {
                    continue;
                }

                // Collect first, remove afterwards: the attribute map must not
                // be modified while it is being iterated.
                $attributesToRemove = [];
                foreach ($node->attributes as $attribute) {
                    $lowerName = strtolower($attribute->name);

                    if (str_starts_with($lowerName, 'on')) {
                        $attributesToRemove[] = $attribute->name;
                        continue;
                    }

                    if (in_array($lowerName, $urlAttributes, true) && $isScriptUrl((string) $attribute->value)) {
                        $attributesToRemove[] = $attribute->name;
                    }
                }

                foreach ($attributesToRemove as $attributeName) {
                    $node->removeAttribute($attributeName);
                }
            }
        }

        $output = (string) $dom->saveHTML();

        return mb_substr($output, 0, 300000);
    }

    /**
     * Return the text of the first <title> element in the markup, with any
     * nested tags stripped and surrounding whitespace trimmed.
     *
     * Returns an empty string when no <title> element is found.
     */
    private function extractTitle(string $html): string
    {
        $captures = [];
        $found = preg_match('/<title[^>]*>(.*?)<\/title>/is', $html, $captures) === 1;

        if ($found) {
            $rawTitle = (string) $captures[1];

            return trim(strip_tags($rawTitle));
        }

        return '';
    }
}