414 lines
14 KiB
PHP
414 lines
14 KiB
PHP
<?php
|
|
|
|
declare(strict_types=1);
|
|
|
|
namespace App\Http\Controllers\Admin;
|
|
|
|
use App\Enums\CrawlTargetModule;
|
|
use App\Enums\CrawlTriggerType;
|
|
use App\Http\Controllers\Controller;
|
|
use App\Http\Requests\Admin\CrawlRuleRequest;
|
|
use App\Jobs\RunCrawlRuleJob;
|
|
use App\Models\CrawlRule;
|
|
use App\Services\Crawler\CrawlRuleScheduleService;
|
|
use App\Services\Crawler\OpenAiFallbackExtractor;
|
|
use Illuminate\Http\JsonResponse;
|
|
use Illuminate\Http\RedirectResponse;
|
|
use Illuminate\Http\Request;
|
|
use Illuminate\Support\Facades\Http;
|
|
use Illuminate\View\View;
|
|
|
|
class CrawlerRuleController extends Controller
|
|
{
|
|
/**
 * Receives the two collaborators this controller relies on.
 *
 * @param CrawlRuleScheduleService $scheduleService Used by store() and update() to compute a rule's next run time.
 * @param OpenAiFallbackExtractor  $aiExtractor     Used by aiSuggestExtractor() to generate an extractor configuration.
 */
public function __construct(
    private readonly CrawlRuleScheduleService $scheduleService,
    private readonly OpenAiFallbackExtractor $aiExtractor,
) {}
|
|
|
|
/**
 * Lists crawl rules, most recently updated first, 20 per page.
 *
 * When the request carries a non-empty `q` value, the list is narrowed to
 * rules whose name contains that (trimmed) keyword. Each rule is loaded with
 * a count of its `runs` relation.
 */
public function index(Request $request): View
{
    $rules = CrawlRule::query()->withCount('runs');

    if ($request->filled('q')) {
        $needle = trim((string) $request->string('q'));
        $rules->where('name', 'like', "%{$needle}%");
    }

    // Keep the current query string on the pagination links so the filter survives paging.
    $page = $rules
        ->latest('updated_at')
        ->paginate(20)
        ->withQueryString();

    $filters = $request->only(['q']);

    return view('admin.crawlers.index', [
        'items' => $page,
        'filters' => $filters,
    ]);
}
|
|
|
|
/**
 * Shows the "new crawl rule" form.
 *
 * The form is bound to an unsaved CrawlRule instance constructed with a set
 * of starting values (XPath extractor mode, a six-hourly cron expression,
 * draft publish policy, AI fallback switched off, ...).
 */
public function create(): View
{
    // Starting extractor configuration handed to the form.
    $extractorDefaults = [
        'mode' => 'xpath',
        'list_link_xpath' => '//a/@href',
        'fields' => [
            'name' => '//h1/text()',
            'summary' => '//meta[@name="description"]/@content',
        ],
        'ai' => [
            'temperature' => 0,
            'content_max_chars' => 12000,
        ],
    ];

    $blankRule = new CrawlRule([
        'enabled' => true,
        'target_module' => CrawlTargetModule::Tool,
        'cron_expression' => '0 */6 * * *',
        'timezone' => 'Asia/Shanghai',
        'max_pages' => 50,
        'rate_limit_per_minute' => 30,
        'retry_max' => 3,
        'retry_backoff_seconds' => 60,
        'extractor_config' => $extractorDefaults,
        'mapping_config' => [],
        'dedupe_config' => [],
        'publish_policy' => 'draft',
        'ai_provider' => 'openai_compatible',
        'ai_fallback_enabled' => false,
    ]);

    $viewData = [
        'item' => $blankRule,
        'method' => 'POST',
        'submitRoute' => route('admin.crawlers.store'),
    ];

    return view('admin.crawlers.form', $viewData);
}
|
|
|
|
/**
 * Persists a new crawl rule and redirects to its edit page.
 *
 * The rule is created first and then saved a second time once the schedule
 * service has computed `next_run_at` from the created model.
 *
 * NOTE(review): `created_by` / `updated_by` are always written as null here —
 * confirm whether the acting admin's id is supposed to be recorded.
 */
public function store(CrawlRuleRequest $request): RedirectResponse
{
    $attributes = array_replace($request->normalizedPayload(), [
        'created_by' => null,
        'updated_by' => null,
    ]);

    $rule = CrawlRule::query()->create($attributes);

    $rule->next_run_at = $this->scheduleService->nextRunAt($rule);
    $rule->save();

    return redirect()
        ->route('admin.crawlers.edit', $rule)
        ->with('status', '采集规则已创建。');
}
|
|
|
|
/**
 * Shows the edit form for an existing crawl rule.
 *
 * Renders the same view as create(), but bound to the given rule and
 * submitting via PUT to the update route.
 */
public function edit(CrawlRule $crawler): View
{
    $viewData = [
        'item' => $crawler,
        'method' => 'PUT',
        'submitRoute' => route('admin.crawlers.update', $crawler),
    ];

    return view('admin.crawlers.form', $viewData);
}
|
|
|
|
/**
 * Applies the submitted changes to a crawl rule and redirects back to its edit page.
 *
 * `next_run_at` is recomputed by the schedule service after the new
 * attributes have been filled in and before the single save.
 *
 * NOTE(review): `updated_by` is always written as null — confirm whether the
 * acting admin's id is supposed to be recorded.
 */
public function update(CrawlRuleRequest $request, CrawlRule $crawler): RedirectResponse
{
    $attributes = array_replace($request->normalizedPayload(), [
        'updated_by' => null,
    ]);

    $crawler->fill($attributes);
    $crawler->next_run_at = $this->scheduleService->nextRunAt($crawler);
    $crawler->save();

    return redirect()
        ->route('admin.crawlers.edit', $crawler)
        ->with('status', '采集规则已更新。');
}
|
|
|
|
/**
 * Dispatches a manually-triggered crawl job for the rule.
 *
 * After dispatching RunCrawlRuleJob, the admin is redirected to the
 * crawl-run list filtered to this rule.
 */
public function run(CrawlRule $crawler): RedirectResponse
{
    $ruleId = $crawler->id;
    $trigger = CrawlTriggerType::Manual->value;

    RunCrawlRuleJob::dispatch($ruleId, $trigger);

    $runsList = redirect()->route('admin.crawl-runs.index', ['rule_id' => $ruleId]);

    return $runsList->with('status', '已提交手动执行任务。');
}
|
|
|
|
/**
 * Fetches a remote page and returns a sanitised copy of it as JSON.
 *
 * Request input: `url` (required) and an optional `user_agent`.
 * Responds with HTTP 422 and `ok: false` when the URL is rejected by
 * isSafePreviewUrl() or the download fails; otherwise responds with the
 * URL, the extracted page title and the sanitised HTML.
 */
public function preview(Request $request): JsonResponse
{
    $input = $request->validate([
        'url' => ['required', 'url', 'max:2000'],
        'user_agent' => ['nullable', 'string', 'max:255'],
    ]);

    // Every failure path answers with the same 422 envelope.
    $reject = static fn (string $message): JsonResponse => response()->json([
        'ok' => false,
        'message' => $message,
    ], 422);

    $target = (string) $input['url'];
    if (! $this->isSafePreviewUrl($target)) {
        return $reject('预览地址不安全,已拒绝请求。');
    }

    $download = $this->fetchHtml($target, $input['user_agent'] ?? null);
    if (! $download['ok']) {
        return $reject('页面抓取失败:'.($download['error'] ?? 'unknown'));
    }

    $cleanHtml = $this->sanitizePreviewHtml($download['body']);
    $pageTitle = $this->extractTitle($cleanHtml);

    return response()->json([
        'ok' => true,
        'url' => $target,
        'title' => $pageTitle,
        'html' => $cleanHtml,
    ]);
}
|
|
|
|
/**
 * Asks the AI extractor to propose an extractor configuration for a page.
 *
 * Steps, in order: validate the input, check that the AI extractor is
 * configured, check the URL with isSafePreviewUrl(), download the page,
 * then pass the sanitised HTML plus any caller-supplied AI options to
 * suggestExtractorConfig(). Each failure answers with HTTP 422 and
 * `ok: false`; success answers with `ok: true` and the generated
 * `extractor_config`.
 */
public function aiSuggestExtractor(Request $request): JsonResponse
{
    $input = $request->validate([
        'url' => ['required', 'url', 'max:2000'],
        'target_module' => ['required', 'in:tool,model'],
        'user_agent' => ['nullable', 'string', 'max:255'],
        'ai_model' => ['nullable', 'string', 'max:128'],
        'ai_system_prompt' => ['nullable', 'string', 'max:4000'],
        'ai_user_prompt' => ['nullable', 'string', 'max:4000'],
        'ai_temperature' => ['nullable', 'numeric', 'between:0,2'],
        'ai_content_max_chars' => ['nullable', 'integer', 'between:500,50000'],
    ]);

    // Every failure path answers with the same 422 envelope.
    $reject = static fn (string $message): JsonResponse => response()->json([
        'ok' => false,
        'message' => $message,
    ], 422);

    if (! $this->aiExtractor->isConfigured()) {
        return $reject('AI 配置缺失,请先设置 CRAWLER_AI_KEY 与接口地址。');
    }

    $target = (string) $input['url'];
    if (! $this->isSafePreviewUrl($target)) {
        return $reject('目标 URL 不安全,已拒绝请求。');
    }

    $download = $this->fetchHtml($target, $input['user_agent'] ?? null);
    if (! $download['ok']) {
        return $reject('页面抓取失败:'.($download['error'] ?? 'unknown'));
    }

    // Translate the optional request fields into the option names the extractor expects.
    $aiOptions = [];

    $textOptions = [
        'ai_model' => 'model',
        'ai_system_prompt' => 'system_prompt',
        'ai_user_prompt' => 'user_prompt',
    ];
    foreach ($textOptions as $inputKey => $optionKey) {
        $raw = $input[$inputKey] ?? null;
        if (! is_string($raw)) {
            continue;
        }

        $trimmed = trim($raw);
        if ($trimmed !== '') {
            $aiOptions[$optionKey] = $trimmed;
        }
    }

    $temperature = $input['ai_temperature'] ?? null;
    if ($temperature !== null && $temperature !== '') {
        $aiOptions['temperature'] = (float) $temperature;
    }

    $maxChars = $input['ai_content_max_chars'] ?? null;
    if ($maxChars !== null && $maxChars !== '') {
        $aiOptions['content_max_chars'] = (int) $maxChars;
    }

    $suggestion = $this->aiExtractor->suggestExtractorConfig(
        (string) $input['target_module'],
        $this->sanitizePreviewHtml($download['body']),
        $aiOptions,
    );

    if ($suggestion === []) {
        // Prefer the extractor's own error text when it reported one.
        $reason = $this->aiExtractor->lastError();

        if ($reason !== null && $reason !== '') {
            return $reject('AI 生成失败:'.$reason);
        }

        return $reject('AI 未生成有效规则,请调整页面或提示词后重试。');
    }

    return response()->json([
        'ok' => true,
        'extractor_config' => $suggestion,
    ]);
}
|
|
|
|
/**
 * Downloads a page body on behalf of the preview / AI-suggestion endpoints.
 *
 * Makes up to three attempts. A 5xx response or a thrown exception is retried
 * after a short, growing pause (250 ms x attempt number); any other
 * unsuccessful status ends the loop immediately.
 *
 * Redirects are followed (at most five hops, http/https only), but every hop
 * is re-checked with isSafePreviewUrl(). Without that check a public URL that
 * passed validation could redirect the server to a loopback or private
 * address. A blocked redirect is reported as a failure and is not retried.
 *
 * NOTE(review): the host is still resolved again by the HTTP client when it
 * connects, so DNS answers that change between the check and the connection
 * are not covered by this guard.
 *
 * @param string      $url       Absolute URL to fetch; callers are expected to have validated it already.
 * @param string|null $userAgent Optional User-Agent override; blank values fall back to the configured default.
 *
 * @return array{ok: bool, body: string, error: string|null}
 */
private function fetchHtml(string $url, ?string $userAgent = null): array
{
    $ua = is_string($userAgent) && trim($userAgent) !== ''
        ? trim($userAgent)
        : (string) config('crawler.default_user_agent', 'AIWebCrawler/1.0');

    // Configuration does not change between attempts, so read it once.
    $timeoutSeconds = max((int) config('crawler.request_timeout_seconds', 20), 5);
    $verifySsl = (bool) config('crawler.verify_ssl', true);

    $redirectBlocked = false;
    $redirectPolicy = [
        'max' => 5,
        'protocols' => ['http', 'https'],
        // Invoked by the HTTP client before it follows each redirect.
        'on_redirect' => function ($originalRequest, $redirectResponse, $nextUri) use (&$redirectBlocked): void {
            if (! $this->isSafePreviewUrl((string) $nextUri)) {
                $redirectBlocked = true;

                throw new \RuntimeException('redirect target is not a public address');
            }
        },
    ];

    $maxAttempts = 3;
    $lastError = 'unknown';

    for ($attempt = 1; $attempt <= $maxAttempts; $attempt++) {
        try {
            $request = Http::timeout($timeoutSeconds)
                ->withOptions(['allow_redirects' => $redirectPolicy]);

            if (! $verifySsl) {
                $request = $request->withoutVerifying();
            }

            $request = $this->applyNetworkOptions($request);
            $response = $request->withUserAgent($ua)->get($url);

            if ($response->successful()) {
                return [
                    'ok' => true,
                    'body' => $response->body(),
                    'error' => null,
                ];
            }

            $lastError = sprintf('HTTP %d', $response->status());

            // Only server-side failures are worth another attempt.
            if ($attempt < $maxAttempts && $response->serverError()) {
                usleep(250000 * $attempt);

                continue;
            }

            break;
        } catch (\Throwable $exception) {
            $lastError = $exception->getMessage();

            // A rejected redirect is deterministic; retrying would only repeat it.
            if ($redirectBlocked) {
                break;
            }

            if ($attempt < $maxAttempts) {
                usleep(250000 * $attempt);

                continue;
            }
        }
    }

    return [
        'ok' => false,
        'body' => '',
        'error' => $lastError,
    ];
}
|
|
|
|
/**
 * Adds optional low-level network settings to an outgoing request.
 *
 * Two configuration values are consulted: `crawler.force_ipv4` (resolve
 * hosts to IPv4 only) and `crawler.dns_servers` (passed to cURL when the
 * CURLOPT_DNS_SERVERS constant is available). When neither applies, the
 * request is returned untouched.
 */
private function applyNetworkOptions(\Illuminate\Http\Client\PendingRequest $request): \Illuminate\Http\Client\PendingRequest
{
    $forceIpv4 = (bool) config('crawler.force_ipv4', false);
    $dnsServers = trim((string) config('crawler.dns_servers', ''));

    $extra = [];

    if ($forceIpv4) {
        $extra['force_ip_resolve'] = 'v4';
    }

    // The constant only exists when the cURL extension exposes this option.
    if ($dnsServers !== '' && defined('CURLOPT_DNS_SERVERS')) {
        $extra['curl'] = [CURLOPT_DNS_SERVERS => $dnsServers];
    }

    return $extra === [] ? $request : $request->withOptions($extra);
}
|
|
|
|
/**
 * Decides whether the server may fetch the given URL.
 *
 * A URL is accepted only when it uses http/https and its host is either a
 * public IP literal or a name whose A/AAAA records are all public addresses
 * (as judged by isPublicIp()).
 *
 * The check fails closed: a host that yields no address record is rejected.
 * Accepting such hosts would let through names the HTTP client can still
 * resolve by other means (for example "localhost." or integer/short IPv4
 * notations), which would reach internal services.
 *
 * @param string $url Absolute URL supplied by the caller.
 *
 * @return bool True when the URL points at a publicly routable target.
 */
private function isSafePreviewUrl(string $url): bool
{
    $parts = parse_url($url);
    if (! is_array($parts)) {
        return false;
    }

    $scheme = strtolower((string) ($parts['scheme'] ?? ''));
    $host = strtolower((string) ($parts['host'] ?? ''));

    if (! in_array($scheme, ['http', 'https'], true) || $host === '') {
        return false;
    }

    // parse_url() keeps the square brackets around IPv6 literals ("[::1]"),
    // which would otherwise make the IP check below miss them entirely.
    if (str_starts_with($host, '[') && str_ends_with($host, ']')) {
        $host = substr($host, 1, -1);
    }

    // "localhost." is the fully-qualified spelling of "localhost".
    $host = rtrim($host, '.');

    if ($host === '' || $host === 'localhost' || str_ends_with($host, '.localhost')) {
        return false;
    }

    if (filter_var($host, FILTER_VALIDATE_IP) !== false) {
        return $this->isPublicIp($host);
    }

    // dns_get_record() raises a warning on lookup failure; the false/empty
    // result is handled explicitly right below.
    $records = @dns_get_record($host, DNS_A | DNS_AAAA);
    if (! is_array($records) || $records === []) {
        return false;
    }

    $sawAddress = false;

    foreach ($records as $record) {
        $ip = (string) ($record['ip'] ?? $record['ipv6'] ?? '');
        if ($ip === '') {
            continue;
        }

        // A single non-public address is enough to reject the host.
        if (! $this->isPublicIp($ip)) {
            return false;
        }

        $sawAddress = true;
    }

    return $sawAddress;
}
|
|
|
|
/**
 * Tells whether a string is an IP address outside the private and reserved ranges.
 *
 * Relies on PHP's FILTER_VALIDATE_IP with the NO_PRIV_RANGE and
 * NO_RES_RANGE flags; anything that is not a valid IP is reported as false.
 */
private function isPublicIp(string $ip): bool
{
    $rangeFlags = FILTER_FLAG_NO_PRIV_RANGE | FILTER_FLAG_NO_RES_RANGE;

    $validated = filter_var($ip, FILTER_VALIDATE_IP, $rangeFlags);

    return $validated !== false;
}
|
|
|
|
/**
 * Strips active content from fetched HTML before it is shown or handed to the AI extractor.
 *
 * Removes script, noscript, iframe, object, embed and base elements, every
 * attribute whose name starts with "on" (inline event handlers), and
 * link-like attributes whose value uses a javascript:, vbscript: or
 * data:text/html URL. The serialised result is capped at 300000 characters.
 *
 * The libxml error mode is restored to its previous value afterwards so the
 * setting does not leak into unrelated XML/HTML handling in the same process.
 *
 * NOTE(review): the length cap is applied to the serialised markup, so a
 * very large page can be cut in the middle of an element.
 *
 * @param string $html Raw markup from the remote page.
 *
 * @return string Sanitised markup; a small placeholder document when the input is blank.
 */
private function sanitizePreviewHtml(string $html): string
{
    if (trim($html) === '') {
        return '<!doctype html><html><head><meta charset="utf-8"></head><body>空页面</body></html>';
    }

    $dom = new \DOMDocument('1.0', 'UTF-8');

    $previousErrorMode = libxml_use_internal_errors(true);
    try {
        // The XML prolog makes libxml treat the input as UTF-8.
        $dom->loadHTML('<?xml encoding="UTF-8">'.$html, LIBXML_NOWARNING | LIBXML_NOERROR);
    } finally {
        libxml_clear_errors();
        libxml_use_internal_errors($previousErrorMode);
    }

    $xpath = new \DOMXPath($dom);

    foreach (['//script', '//noscript', '//iframe', '//object', '//embed', '//base'] as $query) {
        $nodes = $xpath->query($query);
        if ($nodes === false) {
            continue;
        }

        // Walk backwards: the list is live and shrinks as nodes are detached.
        for ($index = $nodes->length - 1; $index >= 0; $index--) {
            $node = $nodes->item($index);
            if ($node !== null && $node->parentNode !== null) {
                $node->parentNode->removeChild($node);
            }
        }
    }

    // Attributes that carry a URL the browser may navigate to or load.
    $urlAttributes = ['href', 'src', 'action', 'formaction', 'xlink:href', 'data', 'poster', 'background'];

    $usesScriptScheme = static function (string $value): bool {
        // Browsers ignore whitespace and control characters inside the scheme.
        $compact = strtolower((string) preg_replace('/[\x00-\x20]+/', '', $value));

        return str_starts_with($compact, 'javascript:')
            || str_starts_with($compact, 'vbscript:')
            || str_starts_with($compact, 'data:text/html');
    };

    $allNodes = $xpath->query('//*');
    if ($allNodes !== false) {
        foreach ($allNodes as $node) {
            if (! $node instanceof \DOMElement) {
                continue;
            }

            // Collect first, remove afterwards, so the attribute map is not mutated mid-iteration.
            $attributesToRemove = [];
            foreach ($node->attributes as $attribute) {
                $name = strtolower($attribute->nodeName);

                $isEventHandler = str_starts_with($name, 'on');
                $isScriptUrl = in_array($name, $urlAttributes, true)
                    && $usesScriptScheme((string) $attribute->value);

                if ($isEventHandler || $isScriptUrl) {
                    $attributesToRemove[] = $attribute;
                }
            }

            foreach ($attributesToRemove as $attribute) {
                $node->removeAttributeNode($attribute);
            }
        }
    }

    $output = (string) $dom->saveHTML();

    return mb_substr($output, 0, 300000);
}
|
|
|
|
/**
 * Pulls the text of the first <title> element out of an HTML string.
 *
 * Matching is case-insensitive and spans line breaks. Any tags inside the
 * title are stripped and surrounding whitespace is trimmed. Returns an empty
 * string when no title element is found.
 */
private function extractTitle(string $html): string
{
    $captures = [];
    $found = preg_match('/<title[^>]*>(.*?)<\/title>/is', $html, $captures) === 1;

    return $found
        ? trim(strip_tags((string) $captures[1]))
        : '';
}
|
|
}
|