爬虫开发
This commit is contained in:
103
app/Services/Crawler/CrawlFetcherService.php
Normal file
103
app/Services/Crawler/CrawlFetcherService.php
Normal file
@@ -0,0 +1,103 @@
|
||||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
namespace App\Services\Crawler;
|
||||
|
||||
use App\Models\CrawlRule;
|
||||
use Illuminate\Http\Client\PendingRequest;
|
||||
use Illuminate\Support\Facades\Http;
|
||||
|
||||
class CrawlFetcherService
{
    /**
     * Fetch a URL according to the given crawl rule.
     *
     * Uses a Browserless POST when the rule requires JS rendering and a
     * browserless endpoint is configured; otherwise performs a plain HTTP GET.
     * Never throws: transport failures are folded into the result array so
     * callers can treat every outcome uniformly.
     *
     * @return array{ok: bool, http_code: int|null, body: string, error: string|null, latency_ms: int}
     */
    public function fetch(CrawlRule $rule, string $url): array
    {
        $startedAt = microtime(true);

        try {
            // Read the endpoint once instead of three separate config() calls.
            $endpoint = config('crawler.browserless_endpoint');
            $useBrowserless = $rule->render_js && is_string($endpoint) && $endpoint !== '';

            $response = $useBrowserless
                ? $this->browserlessRequest($rule)->post($endpoint, [
                    'url' => $url,
                    'waitUntil' => 'networkidle2',
                ])
                : $this->httpRequest($rule)->get($url);

            return [
                'ok' => $response->successful(),
                'http_code' => $response->status(),
                'body' => $response->body(),
                'error' => $response->successful() ? null : sprintf('HTTP %d', $response->status()),
                'latency_ms' => $this->elapsedMs($startedAt),
            ];
        } catch (\Throwable $exception) {
            // Deliberate catch-all: callers consume a result array, not exceptions.
            return [
                'ok' => false,
                'http_code' => null,
                'body' => '',
                'error' => $exception->getMessage(),
                'latency_ms' => $this->elapsedMs($startedAt),
            ];
        }
    }

    /**
     * Milliseconds elapsed since a microtime(true) start mark.
     */
    private function elapsedMs(float $startedAt): int
    {
        return (int) ((microtime(true) - $startedAt) * 1000);
    }

    /**
     * Build the base HTTP client for a rule: timeout, custom headers, user
     * agent, optional SSL-verification bypass, network tweaks, and cookies.
     */
    private function httpRequest(CrawlRule $rule): PendingRequest
    {
        $headers = is_array($rule->headers) ? $rule->headers : [];
        $cookies = is_array($rule->cookies) ? $rule->cookies : [];
        // Clamp the configured timeout to a sane lower bound of 5 seconds.
        $timeout = max((int) config('crawler.request_timeout_seconds', 20), 5);

        $request = Http::timeout($timeout)
            ->withHeaders($headers)
            ->withUserAgent((string) ($rule->user_agent ?: config('crawler.default_user_agent')));

        if (! (bool) config('crawler.verify_ssl', true)) {
            $request = $request->withoutVerifying();
        }

        $request = $this->applyNetworkOptions($request);

        if ($cookies !== []) {
            // Cookies scoped to an empty domain can never match a host, so
            // only attach them when the first entry URL yields a usable host.
            $domain = parse_url((string) ($rule->entry_urls[0] ?? ''), PHP_URL_HOST);

            if (is_string($domain) && $domain !== '') {
                $request = $request->withCookies($cookies, $domain);
            }
        }

        return $request;
    }

    /**
     * Apply optional low-level network options: force IPv4 resolution and
     * custom DNS servers (the latter only when the local cURL build exposes
     * CURLOPT_DNS_SERVERS, i.e. was compiled against c-ares).
     */
    private function applyNetworkOptions(PendingRequest $request): PendingRequest
    {
        $options = [];

        if ((bool) config('crawler.force_ipv4', false)) {
            $options['force_ip_resolve'] = 'v4';
        }

        $dnsServers = trim((string) config('crawler.dns_servers', ''));
        if ($dnsServers !== '' && defined('CURLOPT_DNS_SERVERS')) {
            $options['curl'][CURLOPT_DNS_SERVERS] = $dnsServers;
        }

        return $options === [] ? $request : $request->withOptions($options);
    }

    /**
     * Same client as httpRequest(), plus a bearer token for the Browserless
     * endpoint when one is configured.
     */
    private function browserlessRequest(CrawlRule $rule): PendingRequest
    {
        $request = $this->httpRequest($rule);
        $token = (string) config('crawler.browserless_token', '');

        if ($token !== '') {
            $request = $request->withToken($token);
        }

        return $request;
    }
}
|
||||
|
||||
Reference in New Issue
Block a user