Files
ai-web/app/Services/Crawler/CrawlFetcherService.php
cjd 260460df03
Some checks failed
Tests / PHP 8.2 (push) Has been cancelled
Tests / PHP 8.3 (push) Has been cancelled
Tests / PHP 8.4 (push) Has been cancelled
爬虫开发
2026-02-18 12:56:36 +08:00

104 lines
3.3 KiB
PHP

<?php
declare(strict_types=1);
namespace App\Services\Crawler;
use App\Models\CrawlRule;
use Illuminate\Http\Client\PendingRequest;
use Illuminate\Support\Facades\Http;
/**
 * Downloads a single URL on behalf of a crawl rule.
 *
 * Either performs a plain HTTP GET or, when the rule asks for JavaScript
 * rendering and a browserless endpoint is configured, POSTs the URL to that
 * endpoint. Failures are never thrown to the caller; they are reported through
 * the returned result array.
 */
class CrawlFetcherService
{
    /**
     * Fetch $url using the transport selected by the rule and configuration.
     *
     * The browserless endpoint is used only when `$rule->render_js` is truthy
     * and `crawler.browserless_endpoint` is a non-empty string; otherwise a
     * direct GET is issued.
     *
     * @param CrawlRule $rule Rule supplying headers, cookies, user agent and the render_js flag.
     * @param string $url URL to download.
     *
     * @return array{ok: bool, http_code: int|null, body: string, error: string|null, latency_ms: int}
     *         `http_code` is null and `body` is empty when the request threw;
     *         `error` is null only for a successful response.
     */
    public function fetch(CrawlRule $rule, string $url): array
    {
        // hrtime() is monotonic, so the measured latency cannot go negative or
        // jump when the wall clock is adjusted while the request is in flight.
        $startedAt = hrtime(true);

        try {
            $endpoint = config('crawler.browserless_endpoint');

            if ($rule->render_js && is_string($endpoint) && $endpoint !== '') {
                $response = $this->browserlessRequest($rule)->post($endpoint, [
                    'url' => $url,
                    'waitUntil' => 'networkidle2',
                ]);
            } else {
                $response = $this->httpRequest($rule)->get($url);
            }

            $successful = $response->successful();
            $status = $response->status();

            return [
                'ok' => $successful,
                'http_code' => $status,
                'body' => $response->body(),
                'error' => $successful ? null : sprintf('HTTP %d', $status),
                'latency_ms' => $this->elapsedMilliseconds($startedAt),
            ];
        } catch (\Throwable $exception) {
            // Deliberately broad: any failure (connection, timeout, invalid URL, ...)
            // is converted into a failed result instead of propagating.
            $message = $exception->getMessage();

            return [
                'ok' => false,
                'http_code' => null,
                'body' => '',
                // Some throwables carry no message; fall back to the class name
                // so a failed result always has a non-empty error.
                'error' => $message !== '' ? $message : $exception::class,
                'latency_ms' => $this->elapsedMilliseconds($startedAt),
            ];
        }
    }

    /**
     * Build the base request: timeout, rule headers, user agent, optional
     * TLS-verification opt-out, network options and rule cookies.
     */
    private function httpRequest(CrawlRule $rule): PendingRequest
    {
        $headers = is_array($rule->headers) ? $rule->headers : [];
        $cookies = is_array($rule->cookies) ? $rule->cookies : [];

        // Configured timeout (default 20 s), never lower than 5 s.
        $timeout = max((int) config('crawler.request_timeout_seconds', 20), 5);

        $request = Http::timeout($timeout)
            ->withHeaders($headers)
            // An empty/falsy rule user agent falls back to the configured default.
            ->withUserAgent((string) ($rule->user_agent ?: config('crawler.default_user_agent')));

        if (! (bool) config('crawler.verify_ssl', true)) {
            $request = $request->withoutVerifying();
        }

        $request = $this->applyNetworkOptions($request);

        if ($cookies !== []) {
            // Cookies are bound to the host of the rule's first entry URL; an
            // empty domain is passed when that host cannot be determined.
            $cookieDomain = parse_url((string) ($rule->entry_urls[0] ?? ''), PHP_URL_HOST) ?: '';
            $request = $request->withCookies($cookies, $cookieDomain);
        }

        return $request;
    }

    /**
     * Apply optional transport tweaks taken from configuration:
     * `crawler.force_ipv4` and `crawler.dns_servers`.
     */
    private function applyNetworkOptions(PendingRequest $request): PendingRequest
    {
        $options = [];

        if ((bool) config('crawler.force_ipv4', false)) {
            $options['force_ip_resolve'] = 'v4';
        }

        $dnsServers = trim((string) config('crawler.dns_servers', ''));

        // Only set when the cURL constant is defined in this PHP build.
        if ($dnsServers !== '' && defined('CURLOPT_DNS_SERVERS')) {
            $options['curl'][CURLOPT_DNS_SERVERS] = $dnsServers;
        }

        if ($options === []) {
            return $request;
        }

        return $request->withOptions($options);
    }

    /**
     * Build the request used to call the browserless endpoint: the base
     * request plus a bearer token when `crawler.browserless_token` is set.
     *
     * NOTE(review): because this reuses httpRequest(), the rule's headers and
     * user agent are attached to the call made to the browserless endpoint
     * itself, while the POST payload built in fetch() carries only `url` and
     * `waitUntil`. Confirm whether they should be forwarded to the rendered
     * page instead.
     */
    private function browserlessRequest(CrawlRule $rule): PendingRequest
    {
        $request = $this->httpRequest($rule);
        $token = (string) config('crawler.browserless_token', '');

        if ($token !== '') {
            $request = $request->withToken($token);
        }

        return $request;
    }

    /**
     * Whole milliseconds elapsed since a timestamp obtained from hrtime(true).
     *
     * @param int|float $startedAtNs Start time in nanoseconds (float on 32-bit builds).
     */
    private function elapsedMilliseconds(int|float $startedAtNs): int
    {
        return (int) ((hrtime(true) - $startedAtNs) / 1_000_000);
    }
}