104 lines
3.3 KiB
PHP
104 lines
3.3 KiB
PHP
<?php
|
|
|
|
declare(strict_types=1);
|
|
|
|
namespace App\Services\Crawler;
|
|
|
|
use App\Models\CrawlRule;
|
|
use Illuminate\Http\Client\PendingRequest;
|
|
use Illuminate\Support\Facades\Http;
|
|
|
|
class CrawlFetcherService
{
    /**
     * Fetch a single URL on behalf of a crawl rule and describe the outcome.
     *
     * When the rule's `render_js` attribute is truthy and the
     * `crawler.browserless_endpoint` config value is a non-empty string, the
     * URL is POSTed to that endpoint; otherwise it is requested with a plain
     * GET. Anything thrown while building or sending the request is caught
     * and reported as a failed result instead of propagating to the caller.
     *
     * @return array{ok: bool, http_code: int|null, body: string, error: string|null, latency_ms: int}
     */
    public function fetch(CrawlRule $rule, string $url): array
    {
        $startedAt = microtime(true);

        try {
            // The endpoint is only looked up when the rule asks for JS rendering.
            $endpoint = $rule->render_js ? config('crawler.browserless_endpoint') : null;
            $viaBrowserless = is_string($endpoint) && $endpoint !== '';

            $response = $viaBrowserless
                ? $this->browserlessRequest($rule)->post($endpoint, [
                    'url' => $url,
                    'waitUntil' => 'networkidle2',
                ])
                : $this->httpRequest($rule)->get($url);

            $succeeded = $response->successful();
            $statusCode = $response->status();

            return [
                'ok' => $succeeded,
                'http_code' => $statusCode,
                'body' => $response->body(),
                'error' => $succeeded ? null : sprintf('HTTP %d', $statusCode),
                // Measured last so the reported latency includes reading the body.
                'latency_ms' => $this->elapsedMilliseconds($startedAt),
            ];
        } catch (\Throwable $failure) {
            return [
                'ok' => false,
                'http_code' => null,
                'body' => '',
                'error' => $failure->getMessage(),
                'latency_ms' => $this->elapsedMilliseconds($startedAt),
            ];
        }
    }

    /**
     * Whole milliseconds elapsed since the given microtime(true) reading.
     */
    private function elapsedMilliseconds(float $startedAt): int
    {
        return (int) ((microtime(true) - $startedAt) * 1000);
    }

    /**
     * Build the base pending request for a rule: timeout, custom headers,
     * user agent, optional TLS-verification opt-out, network options and
     * cookies.
     */
    private function httpRequest(CrawlRule $rule): PendingRequest
    {
        // Configured timeout (default 20), never lower than 5.
        $pending = Http::timeout(max((int) config('crawler.request_timeout_seconds', 20), 5));

        $customHeaders = $rule->headers;
        $pending = $pending->withHeaders(is_array($customHeaders) ? $customHeaders : []);

        // A falsy rule-level user agent falls back to the configured default.
        $agent = $rule->user_agent ?: config('crawler.default_user_agent');
        $pending = $pending->withUserAgent((string) $agent);

        $verifyTls = (bool) config('crawler.verify_ssl', true);
        if ($verifyTls === false) {
            $pending = $pending->withoutVerifying();
        }

        $pending = $this->applyNetworkOptions($pending);

        $jar = $rule->cookies;
        if (! is_array($jar) || $jar === []) {
            return $pending;
        }

        // Cookies are scoped to the host of the rule's first entry URL; an
        // empty domain is used when no host can be extracted from it.
        $host = parse_url((string) ($rule->entry_urls[0] ?? ''), PHP_URL_HOST);

        return $pending->withCookies($jar, $host ?: '');
    }

    /**
     * Add the optional transport settings taken from config: forcing IPv4
     * resolution (`crawler.force_ipv4`) and custom DNS servers
     * (`crawler.dns_servers`). The request is returned untouched when
     * neither setting applies.
     */
    private function applyNetworkOptions(PendingRequest $request): PendingRequest
    {
        $extra = [];

        if ((bool) config('crawler.force_ipv4', false)) {
            $extra['force_ip_resolve'] = 'v4';
        }

        $resolvers = trim((string) config('crawler.dns_servers', ''));

        // The DNS option is only set when the cURL constant is defined.
        if (defined('CURLOPT_DNS_SERVERS') && $resolvers !== '') {
            $extra['curl'] = [CURLOPT_DNS_SERVERS => $resolvers];
        }

        return $extra === [] ? $request : $request->withOptions($extra);
    }

    /**
     * Build the request used for the browserless endpoint: the same base
     * request as httpRequest(), plus the configured
     * `crawler.browserless_token` passed through withToken() when it is
     * non-empty.
     */
    private function browserlessRequest(CrawlRule $rule): PendingRequest
    {
        $pending = $this->httpRequest($rule);
        $token = (string) config('crawler.browserless_token', '');

        return $token === '' ? $pending : $pending->withToken($token);
    }
}
|
|
|