爬虫开发
Some checks failed
Tests / PHP 8.2 (push) Has been cancelled
Tests / PHP 8.3 (push) Has been cancelled
Tests / PHP 8.4 (push) Has been cancelled

This commit is contained in:
cjd
2026-02-18 12:56:36 +08:00
parent a98bc6f13c
commit 260460df03
45 changed files with 4091 additions and 8 deletions

View File

@@ -0,0 +1,41 @@
<?php
declare(strict_types=1);
namespace App\Console\Commands;
use Illuminate\Console\Command;
use Illuminate\Support\Facades\Http;
/**
 * Artisan command `crawler:health-check`.
 *
 * Prints a table describing the crawler's configuration (queue connection,
 * Browserless endpoint, AI endpoint, alert e-mail) and, when a Browserless
 * endpoint is configured, sends it one GET request to see whether it answers.
 *
 * Exit code:
 *  - FAILURE when the Browserless request could not be completed at all
 *    (the HTTP client threw), so schedulers / CI / monitors can react to it;
 *  - SUCCESS in every other case, including missing optional configuration
 *    and a "degraded" (HTTP 5xx) answer, which are only reported in the table.
 */
class CrawlerHealthCheckCommand extends Command
{
    protected $signature = 'crawler:health-check';

    protected $description = '采集器依赖与配置自检';

    /**
     * Collect the configuration rows, probe Browserless, render the table.
     *
     * @return int self::FAILURE if the Browserless probe threw, otherwise self::SUCCESS
     */
    public function handle(): int
    {
        // Static rows: each one just echoes a config value (or a placeholder
        // when the value is empty/falsy) so the operator can eyeball the setup.
        $checks = [
            ['item' => 'Queue Connection', 'status' => (string) config('queue.default'), 'detail' => '当前队列连接'],
            ['item' => 'Browserless Endpoint', 'status' => (string) (config('crawler.browserless_endpoint') ?: 'not-configured'), 'detail' => 'JS渲染服务'],
            ['item' => 'AI Endpoint', 'status' => (string) (config('crawler.openai_compatible_endpoint') ?: 'not-configured'), 'detail' => 'AI兜底抽取'],
            ['item' => 'Alert Email', 'status' => (string) (config('crawler.default_alert_email') ?: 'not-configured'), 'detail' => '默认告警邮箱'],
        ];

        // Flipped to false only when the probe below cannot reach the service.
        $healthy = true;

        $browserlessEndpoint = (string) config('crawler.browserless_endpoint', '');
        if ($browserlessEndpoint !== '') {
            try {
                // 5-second timeout keeps the command responsive when the host is down.
                $response = Http::timeout(5)->get($browserlessEndpoint);
                $statusCode = $response->status();

                // Any status below 500 (including 4xx) counts as "the service answered".
                $checks[] = [
                    'item' => 'Browserless Reachable',
                    'status' => $statusCode < 500 ? 'ok' : 'degraded',
                    'detail' => 'HTTP '.$statusCode,
                ];
            } catch (\Throwable $exception) {
                // The request itself failed (connection error, timeout, ...):
                // report it in the table AND through the exit code.
                $healthy = false;
                $checks[] = [
                    'item' => 'Browserless Reachable',
                    'status' => 'failed',
                    'detail' => $exception->getMessage(),
                ];
            }
        }

        $this->table(['Item', 'Status', 'Detail'], $checks);

        return $healthy ? self::SUCCESS : self::FAILURE;
    }
}

View File

@@ -0,0 +1,39 @@
<?php
declare(strict_types=1);
namespace App\Console\Commands;
use App\Enums\CrawlTriggerType;
use App\Jobs\RunCrawlRuleJob;
use App\Models\CrawlRun;
use Illuminate\Console\Command;
/**
 * Artisan command `crawler:retry-failed`.
 *
 * Looks up an existing crawl run by ID and dispatches RunCrawlRuleJob again for
 * the rule that run belongs to, tagged with the Retry trigger type and the
 * original run's ID.
 */
class CrawlerRetryFailedCommand extends Command
{
    protected $signature = 'crawler:retry-failed {runId : 待重试的运行ID} {--sync : 同步执行,不走队列}';

    protected $description = '重试失败的采集运行(按原规则重跑)';

    /**
     * Resolve the run, then dispatch the retry job (queued, or inline with --sync).
     *
     * @return int self::FAILURE when the run or its rule cannot be loaded, otherwise self::SUCCESS
     */
    public function handle(): int
    {
        $requestedRunId = (int) $this->argument('runId');
        $run = CrawlRun::query()->with('rule')->find($requestedRunId);

        // Both the run and its related rule must exist before anything is dispatched.
        $isRetryable = $run instanceof CrawlRun && $run->rule !== null;
        if (! $isRetryable) {
            $this->error('运行记录不存在或规则已删除');

            return self::FAILURE;
        }

        // Same argument list for both dispatch modes.
        $jobArguments = [$run->rule_id, CrawlTriggerType::Retry->value, null, $run->id];

        if ((bool) $this->option('sync')) {
            // --sync: run the job in this process instead of queueing it.
            RunCrawlRuleJob::dispatchSync(...$jobArguments);
        } else {
            RunCrawlRuleJob::dispatch(...$jobArguments);
        }

        $message = sprintf('已提交重试任务,规则 #%d %s', $run->rule_id, $run->rule->name);
        $this->info($message);

        return self::SUCCESS;
    }
}

View File

@@ -0,0 +1,64 @@
<?php
declare(strict_types=1);
namespace App\Console\Commands;
use App\Enums\CrawlTriggerType;
use App\Jobs\RunCrawlRuleJob;
use App\Models\CrawlRule;
use App\Services\Crawler\CrawlRuleScheduleService;
use Illuminate\Console\Command;
/**
 * Artisan command `crawler:run`.
 *
 * Dispatches RunCrawlRuleJob for enabled crawl rules. Without arguments only the
 * rules that the schedule service reports as due are dispatched; passing a rule
 * ID or --all skips that due check.
 */
class CrawlerRunCommand extends Command
{
    protected $signature = 'crawler:run {ruleId? : 指定规则ID} {--all : 忽略cron执行全部启用规则} {--sync : 同步执行,不走队列}';

    protected $description = '执行采集规则';

    /**
     * Select the enabled rules, filter by schedule if required, and dispatch them.
     *
     * @param CrawlRuleScheduleService $scheduleService consulted via isDue() for each rule
     *                                                  unless a rule ID or --all was given
     * @return int always self::SUCCESS
     */
    public function handle(CrawlRuleScheduleService $scheduleService): int
    {
        $ruleId = $this->argument('ruleId');
        $targetsSingleRule = $ruleId !== null;

        // Only enabled rules are ever considered, even when an ID is given explicitly.
        $ruleQuery = CrawlRule::query()->where('enabled', true);
        if ($targetsSingleRule) {
            $ruleQuery->whereKey((int) $ruleId);
        }

        $rules = $ruleQuery->orderBy('id')->get();
        if ($rules->isEmpty()) {
            $this->warn('没有可执行的采集规则');

            return self::SUCCESS;
        }

        // An explicit rule ID or --all bypasses the per-rule due check.
        $ignoreSchedule = $targetsSingleRule || (bool) $this->option('all');
        $runInline = (bool) $this->option('sync');
        $trigger = CrawlTriggerType::Schedule->value;
        $submitted = 0;

        foreach ($rules as $rule) {
            $eligible = $ignoreSchedule || $scheduleService->isDue($rule);
            if (! $eligible) {
                continue;
            }

            if ($runInline) {
                // --sync: run the job in this process instead of queueing it.
                RunCrawlRuleJob::dispatchSync($rule->id, $trigger);
            } else {
                RunCrawlRuleJob::dispatch($rule->id, $trigger);
            }

            $submitted++;
            $this->info(sprintf('已提交规则 #%d %s', $rule->id, $rule->name));
        }

        if ($submitted === 0) {
            $this->line('当前无到期规则');
        }

        return self::SUCCESS;
    }
}