爬虫开发
This commit is contained in:
41
app/Console/Commands/CrawlerHealthCheckCommand.php
Normal file
41
app/Console/Commands/CrawlerHealthCheckCommand.php
Normal file
@@ -0,0 +1,41 @@
|
||||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
namespace App\Console\Commands;
|
||||
|
||||
use Illuminate\Console\Command;
|
||||
use Illuminate\Support\Facades\Http;
|
||||
|
||||
/**
 * Artisan command that prints a self-check table for the crawler.
 *
 * The table lists the default queue connection and the configured Browserless,
 * AI and alert e-mail settings. When a Browserless endpoint is configured it is
 * additionally probed over HTTP and the outcome is appended as an extra row.
 */
class CrawlerHealthCheckCommand extends Command
{
    protected $signature = 'crawler:health-check';

    protected $description = '采集器依赖与配置自检';

    /**
     * Collect the check rows, render them as a table and exit.
     *
     * @return int Always self::SUCCESS; probe failures are reported in the table only.
     */
    public function handle(): int
    {
        // Builds one table row with the keys the rows are rendered from.
        $row = static fn (string $item, string $status, string $detail): array => compact('item', 'status', 'detail');

        // Falsy config values (null, '', '0', false, ...) are shown as "not-configured".
        $configured = static fn (string $key): string => (string) (config($key) ?: 'not-configured');

        $rows = [
            $row('Queue Connection', (string) config('queue.default'), '当前队列连接'),
            $row('Browserless Endpoint', $configured('crawler.browserless_endpoint'), 'JS渲染服务'),
            $row('AI Endpoint', $configured('crawler.openai_compatible_endpoint'), 'AI兜底抽取'),
            $row('Alert Email', $configured('crawler.default_alert_email'), '默认告警邮箱'),
        ];

        $endpoint = (string) config('crawler.browserless_endpoint', '');

        if ($endpoint !== '') {
            try {
                $httpStatus = Http::timeout(5)->get($endpoint)->status();

                // Any answer below 500 counts as reachable; 5xx is reported as degraded.
                $rows[] = $row(
                    'Browserless Reachable',
                    $httpStatus >= 500 ? 'degraded' : 'ok',
                    'HTTP '.$httpStatus,
                );
            } catch (\Throwable $error) {
                // The probe is best-effort: any failure becomes a "failed" row instead of aborting.
                $rows[] = $row('Browserless Reachable', 'failed', $error->getMessage());
            }
        }

        $this->table(['Item', 'Status', 'Detail'], $rows);

        return self::SUCCESS;
    }
}
|
||||
|
||||
39
app/Console/Commands/CrawlerRetryFailedCommand.php
Normal file
39
app/Console/Commands/CrawlerRetryFailedCommand.php
Normal file
@@ -0,0 +1,39 @@
|
||||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
namespace App\Console\Commands;
|
||||
|
||||
use App\Enums\CrawlTriggerType;
|
||||
use App\Jobs\RunCrawlRuleJob;
|
||||
use App\Models\CrawlRun;
|
||||
use Illuminate\Console\Command;
|
||||
|
||||
/**
 * Artisan command that re-runs the crawl rule behind a given crawl run.
 *
 * The run is looked up by ID together with its rule; a new RunCrawlRuleJob is then
 * dispatched with the Retry trigger type and the original run ID, either through the
 * queue or synchronously when --sync is passed.
 */
class CrawlerRetryFailedCommand extends Command
{
    protected $signature = 'crawler:retry-failed {runId : 待重试的运行ID} {--sync : 同步执行,不走队列}';

    protected $description = '重试失败的采集运行(按原规则重跑)';

    /**
     * Look up the run, dispatch the retry job and report the outcome.
     *
     * @return int self::FAILURE when the run or its rule cannot be found, self::SUCCESS otherwise.
     */
    public function handle(): int
    {
        $runId = (int) $this->argument('runId');
        $run = CrawlRun::query()->with('rule')->find($runId);

        $hasRunWithRule = $run instanceof CrawlRun && $run->rule !== null;

        if (! $hasRunWithRule) {
            $this->error('运行记录不存在或规则已删除');

            return self::FAILURE;
        }

        // Same payload for both dispatch modes: rule, trigger type, no third argument, source run.
        $jobArguments = [$run->rule_id, CrawlTriggerType::Retry->value, null, $run->id];
        $viaQueue = ! (bool) $this->option('sync');

        if ($viaQueue) {
            RunCrawlRuleJob::dispatch(...$jobArguments);
        } else {
            RunCrawlRuleJob::dispatchSync(...$jobArguments);
        }

        $message = sprintf('已提交重试任务,规则 #%d %s', $run->rule_id, $run->rule->name);
        $this->info($message);

        return self::SUCCESS;
    }
}
|
||||
|
||||
64
app/Console/Commands/CrawlerRunCommand.php
Normal file
64
app/Console/Commands/CrawlerRunCommand.php
Normal file
@@ -0,0 +1,64 @@
|
||||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
namespace App\Console\Commands;
|
||||
|
||||
use App\Enums\CrawlTriggerType;
|
||||
use App\Jobs\RunCrawlRuleJob;
|
||||
use App\Models\CrawlRule;
|
||||
use App\Services\Crawler\CrawlRuleScheduleService;
|
||||
use Illuminate\Console\Command;
|
||||
|
||||
/**
 * Artisan command that dispatches crawl rule jobs.
 *
 * Only enabled rules are considered. Without arguments, a rule is dispatched only when
 * the schedule service reports it as due; passing a rule ID or --all skips that check.
 * Jobs go through the queue unless --sync is given.
 */
class CrawlerRunCommand extends Command
{
    protected $signature = 'crawler:run {ruleId? : 指定规则ID} {--all : 忽略cron,执行全部启用规则} {--sync : 同步执行,不走队列}';

    protected $description = '执行采集规则';

    /**
     * Select the rules to run and dispatch one RunCrawlRuleJob per selected rule.
     *
     * @param CrawlRuleScheduleService $scheduleService Decides whether a rule is due when
     *                                                  neither a rule ID nor --all is given.
     *
     * @return int self::FAILURE when an explicitly requested rule does not exist or is not
     *             enabled; self::SUCCESS otherwise (including "nothing to run").
     */
    public function handle(CrawlRuleScheduleService $scheduleService): int
    {
        $ruleId = $this->argument('ruleId');

        $query = CrawlRule::query()->where('enabled', true);

        if ($ruleId !== null) {
            $query->whereKey((int) $ruleId);
        }

        $rules = $query->orderBy('id')->get();

        if ($rules->isEmpty()) {
            if ($ruleId !== null) {
                // An explicitly requested rule that cannot run is an error, not a no-op:
                // exit non-zero so typos and disabled rules are visible to scripts.
                $this->error(sprintf('规则 #%d 不存在或未启用', (int) $ruleId));

                return self::FAILURE;
            }

            $this->warn('没有可执行的采集规则');

            return self::SUCCESS;
        }

        // An explicit rule ID implies "run now", exactly like --all.
        $shouldRunAll = (bool) $this->option('all') || $ruleId !== null;
        $sync = (bool) $this->option('sync');

        $dispatched = 0;

        foreach ($rules as $rule) {
            if (! $shouldRunAll && ! $scheduleService->isDue($rule)) {
                continue;
            }

            // NOTE(review): jobs are tagged with the Schedule trigger even when started
            // manually via a rule ID or --all — confirm whether that is intended.
            if ($sync) {
                RunCrawlRuleJob::dispatchSync($rule->id, CrawlTriggerType::Schedule->value);
            } else {
                RunCrawlRuleJob::dispatch($rule->id, CrawlTriggerType::Schedule->value);
            }

            $dispatched++;
            $this->info(sprintf('已提交规则 #%d %s', $rule->id, $rule->name));
        }

        if ($dispatched === 0) {
            $this->line('当前无到期规则');
        }

        return self::SUCCESS;
    }
}
|
||||
|
||||
Reference in New Issue
Block a user