爬虫开发
Some checks failed
Tests / PHP 8.2 (push) Has been cancelled
Tests / PHP 8.3 (push) Has been cancelled
Tests / PHP 8.4 (push) Has been cancelled

This commit is contained in:
cjd
2026-02-18 12:56:36 +08:00
parent a98bc6f13c
commit 260460df03
45 changed files with 4091 additions and 8 deletions

View File

@@ -0,0 +1,482 @@
{{--
    Admin form view for a crawler rule (used for both create and edit).
    Renders inside the layouts.admin layout; the page title switches on
    $item->exists, and the head section pulls in the shared
    admin.partials.modern-form-head partial.
--}}
@extends('layouts.admin')
@section('title', $item->exists ? '编辑采集规则' : '新建采集规则')
@section('head')
@include('admin.partials.modern-form-head')
@endsection
@section('scripts')
<script>
(function () {
// Server endpoints, written into the script by Blade's echo syntax.
// NOTE(review): Blade's echo HTML-escapes its output; that is harmless for plain
// route URLs, but confirm neither route ever carries query parameters.
const previewEndpoint = '{{ route('admin.crawlers.preview') }}';
const aiSuggestEndpoint = '{{ route('admin.crawlers.ai-suggest-extractor') }}';
// CSRF token read from the csrf-token meta tag; empty string when the tag is absent.
const csrfToken = document.querySelector('meta[name="csrf-token"]')?.getAttribute('content') || '';
// DOM handles for the preview / picker widgets (each is null if the element is missing).
const previewUrlInput = document.getElementById('preview-url');
const previewFrame = document.getElementById('preview-frame');
const previewStatus = document.getElementById('preview-status');
const selectedXPathView = document.getElementById('selected-xpath');
const extractorJsonInput = document.getElementById('extractor-json');
const pickerFieldInput = document.getElementById('picker-field');
// XPath most recently reported by the preview iframe; '' until an element is picked.
let selectedXPath = '';
// Parses the Extractor JSON text into a plain key/value object.
//
// Returns null when the text is not valid JSON. Returns {} for empty input and
// for any valid JSON that is not a key/value object (primitives, null, arrays).
//
// Arrays are rejected on purpose: the server renders an empty config as "[]",
// and string-keyed properties assigned to an array are dropped by
// JSON.stringify(), so handing an array to the callers would silently discard
// every value they write into it.
const parseJson = (text) => {
    let parsed;
    try {
        parsed = JSON.parse(text || '{}');
    } catch (_) {
        return null;
    }
    const isPlainObject = parsed !== null && typeof parsed === 'object' && !Array.isArray(parsed);
    return isPlainObject ? parsed : {};
};
// Serialises the given config as 2-space-indented JSON and places it in the
// Extractor JSON textarea.
const writeExtractor = (config) => {
    const prettyJson = JSON.stringify(config, null, 2);
    extractorJsonInput.value = prettyJson;
};
// Collects the AI-related form inputs into an options object for the extractor config.
//
// Text inputs (model, system prompt, user prompt) are trimmed and included only
// when non-empty. Numeric inputs (temperature, content max chars) are included
// only when they hold a finite number: a missing element or unparsable text is
// skipped instead of producing NaN, which JSON.stringify() would emit as null.
//
// Returns an object with any of: model, system_prompt, user_prompt,
// temperature, content_max_chars. May be empty.
const collectAiOptions = () => {
    // Trimmed string value of the element with the given id; '' when it is missing.
    const readText = (id) => {
        const raw = document.getElementById(id)?.value;
        return typeof raw === 'string' ? raw.trim() : '';
    };
    // Finite number held by the element, or null when blank / missing / not numeric.
    const readNumber = (id) => {
        const text = readText(id);
        if (text === '') {
            return null;
        }
        const parsed = Number(text);
        return Number.isFinite(parsed) ? parsed : null;
    };

    const options = {};
    const model = readText('ai-model');
    const systemPrompt = readText('ai-system-prompt');
    const userPrompt = readText('ai-user-prompt');
    const temperature = readNumber('ai-temperature');
    const maxChars = readNumber('ai-content-max-chars');

    if (model) options.model = model;
    if (systemPrompt) options.system_prompt = systemPrompt;
    if (userPrompt) options.user_prompt = userPrompt;
    if (temperature !== null) options.temperature = temperature;
    if (maxChars !== null) options.content_max_chars = maxChars;
    return options;
};
// Brings an extractor config in line with the current form state.
//
// - mode is taken from the extractor-mode select (default 'xpath').
// - ai is replaced by the options gathered from the AI inputs, or removed when
//   there are none.
// - fields is guaranteed to be a key/value object.
//
// A key/value object passed in is updated in place and returned. Anything else
// (including arrays) is replaced by a fresh object: the server renders empty
// maps as "[]", and string-keyed properties set on an array are dropped by
// JSON.stringify(), which would silently lose the picked XPaths.
const normalizeConfig = (config) => {
    const isMap = (value) => value !== null && typeof value === 'object' && !Array.isArray(value);

    const normalized = isMap(config) ? config : {};
    normalized.mode = document.getElementById('extractor-mode')?.value || 'xpath';

    const aiOptions = collectAiOptions();
    if (Object.keys(aiOptions).length > 0) {
        normalized.ai = aiOptions;
    } else {
        delete normalized.ai;
    }

    if (!isMap(normalized.fields)) {
        normalized.fields = {};
    }
    return normalized;
};
// POSTs the payload as JSON to the given URL and resolves with the decoded body.
//
// The request carries the page's CSRF token and is flagged as an XHR request.
// A response body that cannot be decoded as JSON is treated as {}.
// Throws an Error when the HTTP status is not OK or the body has ok === false;
// the message is the body's message field, falling back to 'HTTP ' + status.
const postJson = async (url, payload) => {
    const requestHeaders = {
        'Content-Type': 'application/json',
        'X-CSRF-TOKEN': csrfToken,
        'X-Requested-With': 'XMLHttpRequest',
    };
    const response = await fetch(url, {
        method: 'POST',
        headers: requestHeaders,
        body: JSON.stringify(payload),
    });
    const body = await response.json().then((decoded) => decoded, () => ({}));
    const succeeded = response.ok && body.ok !== false;
    if (succeeded) {
        return body;
    }
    throw new Error(body.message || ('HTTP ' + response.status));
};
// Installs the element picker inside the preview iframe's document.
//
// The picker is a script element appended to the frame's body. Once running it:
//   - neutralises navigation (window.open, link hrefs, form actions, button clicks),
//   - forces a crosshair cursor and outlines the hovered element,
//   - on click, posts { source: 'crawler-picker', xpath } to the parent window.
//
// Does nothing when the frame has no window / document / body yet, or when the
// picker is already installed in the current frame window.
//
// NOTE(review): the script relies on the iframe allowing scripts and same-origin
// access; the frame content is presumably HTML fetched from an arbitrary URL, so
// confirm the preview endpoint strips page scripts before it reaches srcdoc.
const installPicker = () => {
    const frameWindow = previewFrame?.contentWindow;
    const frameDocument = previewFrame?.contentDocument;
    if (!frameWindow || !frameDocument || !frameDocument.body || frameWindow.__pickerInstalled) {
        return;
    }
    const script = frameDocument.createElement('script');
    script.text = `
        (function () {
            // Block pop-ups opened by the previewed page.
            try {
                window.open = function () { return null; };
            } catch (e) {}

            // Keep the original targets in data attributes and make links,
            // forms and buttons inert so a click can only select an element.
            const disableNavigation = () => {
                document.querySelectorAll('a[href], area[href]').forEach((node) => {
                    const href = node.getAttribute('href') || '';
                    node.setAttribute('data-original-href', href);
                    node.setAttribute('href', 'javascript:void(0)');
                    node.removeAttribute('target');
                });
                document.querySelectorAll('form').forEach((form) => {
                    form.setAttribute('data-original-action', form.getAttribute('action') || '');
                    form.setAttribute('action', 'javascript:void(0)');
                    form.addEventListener('submit', (e) => {
                        e.preventDefault();
                        e.stopPropagation();
                    }, true);
                });
                document.querySelectorAll('button, input[type="submit"], input[type="button"]').forEach((node) => {
                    node.addEventListener('click', (e) => {
                        e.preventDefault();
                        e.stopPropagation();
                    }, true);
                });
            };
            disableNavigation();

            document.documentElement.style.cursor = 'crosshair';
            document.body.style.cursor = 'crosshair';
            document.querySelectorAll('*').forEach((node) => {
                node.style.cursor = 'crosshair';
            });

            // Builds an XPath for the element: an id lookup when the element has
            // an id, otherwise an absolute path of tag[index] steps from the root.
            const xpath = (el) => {
                if (!el || el.nodeType !== 1) return '';
                // An id containing a double quote cannot be embedded in the
                // quoted literal below, so such ids use the positional path.
                if (el.id && !el.id.includes('"')) return '//*[@id="' + el.id + '"]';
                const parts = [];
                let node = el;
                while (node && node.nodeType === 1) {
                    // 1-based position among preceding siblings with the same tag.
                    let i = 1;
                    let p = node.previousElementSibling;
                    while (p) {
                        if (p.tagName === node.tagName) i += 1;
                        p = p.previousElementSibling;
                    }
                    parts.unshift(node.tagName.toLowerCase() + '[' + i + ']');
                    node = node.parentElement;
                }
                return '/' + parts.join('/');
            };

            // Hover highlight.
            document.addEventListener('mouseover', (e) => {
                if (e.target instanceof Element) {
                    e.target.style.outline = '2px solid #2563eb';
                }
            }, true);
            document.addEventListener('mouseout', (e) => {
                if (e.target instanceof Element) {
                    e.target.style.outline = '';
                }
            }, true);

            // Capture-phase click: swallow the event and report the XPath upward.
            document.addEventListener('click', (e) => {
                if (!(e.target instanceof Element)) return;
                e.preventDefault();
                e.stopPropagation();
                if (typeof e.stopImmediatePropagation === 'function') {
                    e.stopImmediatePropagation();
                }
                window.parent.postMessage({
                    source: 'crawler-picker',
                    xpath: xpath(e.target),
                }, '*');
            }, true);

            // Swallow the remaining pointer / Enter interactions of the page.
            document.addEventListener('mousedown', (e) => {
                e.preventDefault();
                e.stopPropagation();
            }, true);
            document.addEventListener('mouseup', (e) => {
                e.preventDefault();
                e.stopPropagation();
            }, true);
            document.addEventListener('auxclick', (e) => {
                e.preventDefault();
                e.stopPropagation();
            }, true);
            document.addEventListener('keydown', (e) => {
                if (e.key === 'Enter') {
                    e.preventDefault();
                    e.stopPropagation();
                }
            }, true);
        }());
    `;
    frameDocument.body.appendChild(script);
    // Flag the window only after the script is attached, so an install attempt
    // that could not attach anything does not block a later one.
    frameWindow.__pickerInstalled = true;
};
// Click handler for the preview-load button: requests the preview HTML for the
// entered URL (with the configured User-Agent), renders it in the preview frame
// via srcdoc, and arranges for the picker to be installed when the frame loads.
// Progress and failures are reported in the preview status line.
const handlePreviewLoad = async () => {
    const targetUrl = previewUrlInput?.value?.trim();
    if (!targetUrl) {
        alert('请先输入 URL');
        return;
    }
    previewStatus.textContent = '正在加载预览...';
    try {
        const userAgent = document.getElementById('user-agent')?.value || '';
        const preview = await postJson(previewEndpoint, {
            url: targetUrl,
            user_agent: userAgent,
        });
        previewFrame.srcdoc = preview.html || '';
        previewFrame.onload = installPicker;
        previewStatus.textContent = '预览已加载,可点击页面元素。';
    } catch (error) {
        previewStatus.textContent = '加载失败:' + (error.message || 'unknown');
    }
};
document.getElementById('preview-load-btn')?.addEventListener('click', handlePreviewLoad);
// Click handler for the apply-selector button: writes the currently picked XPath
// into the Extractor JSON. The target is the name typed into the picker-field
// input; a blank name or 'list_link_xpath' sets the top-level list_link_xpath,
// any other name is stored under fields. Aborts with an alert when nothing has
// been picked or the textarea does not contain valid JSON.
const applyPickedSelector = () => {
    if (!selectedXPath) {
        alert('请先在预览中点选元素');
        return;
    }
    const targetField = (pickerFieldInput?.value || '').trim() || 'list_link_xpath';
    const parsed = parseJson(extractorJsonInput.value);
    if (parsed === null) {
        alert('Extractor JSON 不是有效 JSON');
        return;
    }
    const config = normalizeConfig(parsed);
    if (targetField !== 'list_link_xpath') {
        config.fields[targetField] = selectedXPath;
    } else {
        config.list_link_xpath = selectedXPath;
    }
    writeExtractor(config);
};
document.getElementById('apply-selector-btn')?.addEventListener('click', applyPickedSelector);
// Click handler for the AI-suggest button: asks the server for an extractor
// config for the entered URL and merges it over the current Extractor JSON
// (keys returned by the AI win), then normalises and writes the result back.
//
// The textarea is validated before the request and again before writing, and
// the handler aborts with the same alert the apply-selector handler uses.
// Without this, invalid JSON in the textarea was silently replaced by the AI
// result and the user's edits were lost.
document.getElementById('ai-suggest-btn')?.addEventListener('click', async () => {
    const url = previewUrlInput?.value?.trim();
    if (!url) {
        alert('请先输入 URL');
        return;
    }
    // Fail fast: no point spending an AI call if the result cannot be merged.
    if (parseJson(extractorJsonInput.value) === null) {
        alert('Extractor JSON 不是有效 JSON');
        return;
    }
    const fieldValue = (id) => document.getElementById(id)?.value || '';
    try {
        const data = await postJson(aiSuggestEndpoint, {
            url,
            target_module: document.getElementById('target-module')?.value || 'tool',
            user_agent: fieldValue('user-agent'),
            ai_model: fieldValue('ai-model'),
            ai_system_prompt: fieldValue('ai-system-prompt'),
            ai_user_prompt: fieldValue('ai-user-prompt'),
            ai_temperature: fieldValue('ai-temperature'),
            ai_content_max_chars: fieldValue('ai-content-max-chars'),
        });
        // Re-read the textarea so edits made while the request was in flight are
        // kept; if those edits broke the JSON, leave the textarea untouched.
        const current = parseJson(extractorJsonInput.value);
        if (current === null) {
            alert('Extractor JSON 不是有效 JSON');
            return;
        }
        writeExtractor(normalizeConfig({
            ...current,
            ...data.extractor_config,
        }));
        previewStatus.textContent = 'AI 规则已生成并合并。';
    } catch (error) {
        previewStatus.textContent = 'AI 生成失败:' + (error.message || 'unknown');
    }
});
// Receives the XPath reported by the picker running inside the preview iframe
// and shows it in the selected-xpath box.
//
// Only messages whose sender is the preview frame's own window are accepted.
// The payload tag alone ('crawler-picker') can be forged by any window that
// holds a reference to this page, which would let it plant an arbitrary XPath.
window.addEventListener('message', (event) => {
    if (!previewFrame || event.source !== previewFrame.contentWindow) {
        return;
    }
    if (!event.data || event.data.source !== 'crawler-picker') {
        return;
    }
    selectedXPath = String(event.data.xpath || '').trim();
    selectedXPathView.textContent = selectedXPath || '未选择';
});
// Change handler for the extractor-mode select: re-normalises the Extractor JSON
// so it reflects the newly chosen mode. Invalid JSON in the textarea is left
// untouched.
const syncExtractorMode = () => {
    const parsed = parseJson(extractorJsonInput.value);
    if (parsed === null) {
        return;
    }
    writeExtractor(normalizeConfig(parsed));
};
document.getElementById('extractor-mode')?.addEventListener('change', syncExtractorMode);
}());
</script>
@endsection
@section('content')
<div class="card modern-form-card">
{{-- Card header: heading switches on $item->exists (edit vs. create), plus a link back to the rule list. --}}
<div class="card-header d-flex justify-content-between align-items-center">
<h3 class="card-title mb-0">{{ $item->exists ? '编辑采集规则' : '新建采集规则' }}</h3>
<a class="btn btn-sm btn-outline-secondary" href="{{ route('admin.crawlers.index') }}">返回列表</a>
</div>
<div class="card-body">
<form method="post" action="{{ $submitRoute }}" class="row g-3" id="crawler-form">
@csrf
@if($method !== 'POST') @method($method) @endif
@php
    // Prepares the textarea / select values for the form. Each value prefers
    // the previously submitted (old) input and falls back to the model.
    $jsonFlags = JSON_UNESCAPED_UNICODE | JSON_PRETTY_PRINT;

    // Entry URLs are edited as one URL per line.
    $entryUrls = old('entry_urls', is_array($item->entry_urls) ? implode("\n", $item->entry_urls) : '');
    $headersJson = old('headers_json', json_encode($item->headers ?? [], $jsonFlags));
    $cookiesJson = old('cookies_json', json_encode($item->cookies ?? [], $jsonFlags));

    $extractorConfig = is_array($item->extractor_config) ? $item->extractor_config : [];
    // The extractor config is a keyed map (mode / fields / ai / ...). An empty
    // PHP array would be encoded as the JSON list "[]", which the picker script
    // cannot add keys to, so an empty config is emitted as "{}" instead.
    $extractorJson = old(
        'extractor_json',
        json_encode($extractorConfig === [] ? new \stdClass() : $extractorConfig, $jsonFlags)
    );

    $mappingJson = old('mapping_json', json_encode($item->mapping_config ?? [], $jsonFlags));
    $dedupeJson = old('dedupe_json', json_encode($item->dedupe_config ?? [], $jsonFlags));

    // AI options stored inside the extractor config seed the AI form inputs.
    $extractorAi = is_array($extractorConfig['ai'] ?? null) ? $extractorConfig['ai'] : [];
    $mode = old('extractor_mode', $extractorConfig['mode'] ?? 'xpath');
@endphp
{{-- Basic configuration: name, target module, publish policy, schedule, page limit, enabled flag and entry URLs. --}}
<div class="col-12">
    <section class="form-section">
        <h4 class="form-section-title">基础配置</h4>
        <div class="row g-3">
            <div class="col-md-6">
                <label class="form-label">规则名称</label>
                <input class="form-control" name="name" value="{{ old('name', $item->name) }}" required>
            </div>
            <div class="col-md-3">
                <label class="form-label">目标模块</label>
                <select class="form-select" name="target_module" id="target-module" required>
                    <option value="tool" @selected(old('target_module', $item->target_module?->value ?? 'tool') === 'tool')>AI 工具</option>
                    <option value="model" @selected(old('target_module', $item->target_module?->value ?? 'tool') === 'model')>AI 模型</option>
                </select>
            </div>
            <div class="col-md-3">
                <label class="form-label">发布策略</label>
                <select class="form-select" name="publish_policy">
                    <option value="draft" @selected(old('publish_policy', $item->publish_policy ?? 'draft') === 'draft')>草稿待审核</option>
                </select>
            </div>
            <div class="col-md-4">
                <label class="form-label">Cron 表达式</label>
                <input class="form-control" name="cron_expression" value="{{ old('cron_expression', $item->cron_expression ?: '0 */6 * * *') }}" required>
            </div>
            <div class="col-md-4">
                <label class="form-label">时区</label>
                <input class="form-control" name="timezone" value="{{ old('timezone', $item->timezone ?: 'Asia/Shanghai') }}" required>
            </div>
            <div class="col-md-2">
                <label class="form-label">最大页面数</label>
                <input class="form-control" type="number" min="1" max="2000" name="max_pages" value="{{ old('max_pages', $item->max_pages ?: 50) }}" required>
            </div>
            <div class="col-md-2">
                <label class="form-label">启用</label>
                <div class="form-check mt-2">
                    {{--
                        An unchecked checkbox is absent from the submitted input, so a plain
                        old('enabled', ...) falls back to the stored value and re-checks the
                        box after a failed validation. When old input exists, trust it alone.
                    --}}
                    <input class="form-check-input" type="checkbox" name="enabled" id="rule-enabled" value="1" @checked(session()->hasOldInput() ? (bool) old('enabled') : (bool) $item->enabled)>
                    <label class="form-check-label" for="rule-enabled">启用规则</label>
                </div>
            </div>
            <div class="col-12">
                <label class="form-label">入口 URL(每行一个)</label>
                <textarea class="form-control" name="entry_urls" rows="4" required>{{ $entryUrls }}</textarea>
            </div>
        </div>
    </section>
</div>
{{-- Fetch and AI configuration: rate limit, retries, alerting, extraction mode, AI options, User-Agent, proxy, headers and cookies. --}}
<div class="col-12">
    <section class="form-section">
        <h4 class="form-section-title">抓取与 AI 配置</h4>
        <div class="row g-3">
            <div class="col-md-3">
                <label class="form-label">每分钟限流</label>
                <input class="form-control" type="number" min="1" max="2000" name="rate_limit_per_minute" value="{{ old('rate_limit_per_minute', $item->rate_limit_per_minute ?: 30) }}" required>
            </div>
            <div class="col-md-3">
                <label class="form-label">最大重试次数</label>
                <input class="form-control" type="number" min="1" max="10" name="retry_max" value="{{ old('retry_max', $item->retry_max ?: 3) }}" required>
            </div>
            <div class="col-md-3">
                <label class="form-label">退避秒数</label>
                <input class="form-control" type="number" min="1" max="3600" name="retry_backoff_seconds" value="{{ old('retry_backoff_seconds', $item->retry_backoff_seconds ?: 60) }}" required>
            </div>
            <div class="col-md-3">
                <label class="form-label">告警邮箱</label>
                <input class="form-control" type="email" name="alert_email" value="{{ old('alert_email', $item->alert_email) }}">
            </div>
            <div class="col-md-3">
                <label class="form-label">抽取模式</label>
                <select class="form-select" name="extractor_mode" id="extractor-mode" required>
                    <option value="xpath" @selected($mode === 'xpath')>XPath</option>
                    <option value="ai" @selected($mode === 'ai')>AI</option>
                    <option value="hybrid" @selected($mode === 'hybrid')>Hybrid</option>
                </select>
            </div>
            <div class="col-md-3">
                <label class="form-label">AI Provider</label>
                <input class="form-control" name="ai_provider" value="{{ old('ai_provider', $item->ai_provider ?: 'openai_compatible') }}">
            </div>
            <div class="col-md-3">
                <label class="form-label">AI Model</label>
                <input class="form-control" name="ai_model" id="ai-model" value="{{ old('ai_model', $item->ai_model ?: config('crawler.openai_default_model')) }}">
            </div>
            <div class="col-md-3">
                <label class="form-label">AI 温度</label>
                <input class="form-control" type="number" step="0.1" min="0" max="2" name="ai_temperature" id="ai-temperature" value="{{ old('ai_temperature', $extractorAi['temperature'] ?? 0) }}">
            </div>
            <div class="col-md-4">
                <label class="form-label">AI 截断长度</label>
                <input class="form-control" type="number" min="500" max="50000" name="ai_content_max_chars" id="ai-content-max-chars" value="{{ old('ai_content_max_chars', $extractorAi['content_max_chars'] ?? 12000) }}">
            </div>
            <div class="col-md-4">
                <label class="form-label">AI 系统提示词</label>
                <textarea class="form-control" name="ai_system_prompt" id="ai-system-prompt" rows="3">{{ old('ai_system_prompt', $extractorAi['system_prompt'] ?? '') }}</textarea>
            </div>
            <div class="col-md-4">
                <label class="form-label">AI 用户提示词</label>
                <textarea class="form-control" name="ai_user_prompt" id="ai-user-prompt" rows="3">{{ old('ai_user_prompt', $extractorAi['user_prompt'] ?? '') }}</textarea>
            </div>
            <div class="col-md-4">
                <label class="form-label">User-Agent</label>
                <input class="form-control" name="user_agent" id="user-agent" value="{{ old('user_agent', $item->user_agent) }}">
            </div>
            <div class="col-md-4">
                <label class="form-label">代理</label>
                <input class="form-control" name="proxy" value="{{ old('proxy', $item->proxy) }}">
            </div>
            <div class="col-md-4">
                <label class="form-label">AI 兜底</label>
                <div class="form-check mt-2">
                    {{--
                        An unchecked checkbox is absent from the submitted input, so a plain
                        old('ai_fallback_enabled', ...) falls back to the stored value and
                        re-checks the box after a failed validation. When old input exists,
                        trust it alone.
                    --}}
                    <input class="form-check-input" type="checkbox" name="ai_fallback_enabled" id="ai-fallback-enabled" value="1" @checked(session()->hasOldInput() ? (bool) old('ai_fallback_enabled') : (bool) $item->ai_fallback_enabled)>
                    <label class="form-check-label" for="ai-fallback-enabled">缺字段启用兜底</label>
                </div>
            </div>
            <div class="col-md-6">
                <label class="form-label">Headers JSON</label>
                <textarea class="form-control" name="headers_json" rows="5">{{ $headersJson }}</textarea>
            </div>
            <div class="col-md-6">
                <label class="form-label">Cookies JSON</label>
                <textarea class="form-control" name="cookies_json" rows="5">{{ $cookiesJson }}</textarea>
            </div>
        </div>
    </section>
</div>
{{-- Extractor / mapping / dedupe JSON editors, plus the live preview used to pick elements and the AI rule generator. --}}
<div class="col-12">
    <section class="form-section">
        <h4 class="form-section-title">Extractor / Mapping / 预览选元素</h4>
        <div class="row g-3">
            <div class="col-md-6">
                <label class="form-label">Extractor JSON</label>
                <textarea class="form-control" name="extractor_json" id="extractor-json" rows="14" required>{{ $extractorJson }}</textarea>
            </div>
            <div class="col-md-6">
                <label class="form-label">Mapping JSON</label>
                <textarea class="form-control" name="mapping_json" rows="6">{{ $mappingJson }}</textarea>
                <label class="form-label mt-3">Dedupe JSON</label>
                <textarea class="form-control" name="dedupe_json" rows="6">{{ $dedupeJson }}</textarea>
            </div>
            <div class="col-md-9">
                <input class="form-control" type="url" id="preview-url" placeholder="输入目标页面 URL(用于预览和 AI 生成规则)">
            </div>
            <div class="col-md-3 d-grid">
                <button class="btn btn-outline-primary" type="button" id="preview-load-btn">加载预览</button>
            </div>
            <div class="col-12">
                {{--
                    NOTE(review): this frame is filled through srcdoc with HTML returned by the
                    preview endpoint, presumably fetched from the entered third-party URL, and it
                    allows both scripts and same-origin access. With that combination embedded
                    content can reach the admin page and lift its own sandbox. Confirm the endpoint
                    strips page scripts; the picker script currently needs both tokens to run.
                --}}
                <iframe id="preview-frame" style="width:100%;height:480px;border:1px solid #d7e0ef;border-radius:.6rem;" sandbox="allow-same-origin allow-scripts"></iframe>
                <div class="small text-muted mt-2" id="preview-status">未加载预览</div>
            </div>
            <div class="col-md-5">
                <label class="form-label">当前 XPath</label>
                <div id="selected-xpath" class="form-control" style="height:auto;min-height:42px;">未选择</div>
            </div>
            <div class="col-md-4">
                <label class="form-label">写入字段(支持自定义)</label>
                <input class="form-control" id="picker-field" placeholder="list_link_xpath 或 name/summary/...">
            </div>
            <div class="col-md-3 d-grid">
                <label class="form-label">&nbsp;</label>
                <button class="btn btn-primary" type="button" id="apply-selector-btn">写入 Extractor JSON</button>
            </div>
            <div class="col-md-12 d-grid">
                <button class="btn btn-outline-success" type="button" id="ai-suggest-btn">AI 生成抽取规则并合并到 Extractor JSON</button>
            </div>
        </div>
    </section>
</div>
{{-- Form footer: a short hint describing the suggested workflow, and the submit button. --}}
<div class="col-12 d-flex justify-content-between align-items-center">
<small class="text-muted">建议流程:加载预览 -> 点选元素写 XPath -> AI 补全规则 -> 保存。</small>
<button class="btn btn-primary" type="submit">保存规则</button>
</div>
</form>
</div>
</div>
@endsection