172 lines
4.2 KiB
PHP
172 lines
4.2 KiB
PHP
|
|
<?php
|
||
|
|
|
||
|
|
declare(strict_types=1);
|
||
|
|
|
||
|
|
namespace App\Services\Crawler;
|
||
|
|
|
||
|
|
use Illuminate\Support\Str;
|
||
|
|
|
||
|
|
/**
 * Extracts link lists and named fields from HTML documents using XPath
 * expressions supplied through an extractor configuration array.
 */
class XPathExtractor
{
    /**
     * Collects the URLs selected by the `list_link_xpath` expression.
     *
     * The trimmed node value of every matched node is resolved against
     * $baseUrl. Empty values and references that cannot be turned into an
     * http(s) URL are skipped. Duplicates are removed, keeping the first
     * occurrence.
     *
     * An empty list is returned when the expression is missing or empty, when
     * the HTML cannot be loaded, or when the XPath query returns false.
     *
     * @param array<string, mixed> $extractorConfig
     * @return list<string>
     */
    public function extractListUrls(string $html, string $baseUrl, array $extractorConfig): array
    {
        $listXPath = (string) ($extractorConfig['list_link_xpath'] ?? '');
        if ($listXPath === '') {
            return [];
        }

        $xpath = $this->buildXPath($html);
        if ($xpath === null) {
            return [];
        }

        $nodes = $xpath->query($listXPath);
        if ($nodes === false) {
            return [];
        }

        $urls = [];
        foreach ($nodes as $node) {
            $value = trim($node->nodeValue ?? '');
            if ($value === '') {
                continue;
            }

            $absolute = $this->toAbsoluteUrl($value, $baseUrl);
            if ($absolute !== null) {
                $urls[] = $absolute;
            }
        }

        return array_values(array_unique($urls));
    }

    /**
     * Extracts the fields described by `$extractorConfig['fields']`.
     *
     * Each rule is keyed by field name and is either an XPath string or an
     * array with an `xpath` entry and an optional boolean `multiple` entry.
     *
     * - Single mode (default): the first matched node's trimmed value is
     *   passed through Str::squish() and stored as a string.
     * - Multiple mode: every non-empty trimmed node value is collected and
     *   de-duplicated into a list (values are not squished).
     *
     * Fields with a non-string key, an empty expression, no matches, or only
     * empty values are omitted from the result.
     *
     * @param array<string, mixed> $extractorConfig
     * @return array<string, mixed>
     */
    public function extractFields(string $html, array $extractorConfig): array
    {
        $fieldRules = $extractorConfig['fields'] ?? [];
        if (! is_array($fieldRules) || $fieldRules === []) {
            return [];
        }

        $xpath = $this->buildXPath($html);
        if ($xpath === null) {
            return [];
        }

        $result = [];

        foreach ($fieldRules as $field => $rule) {
            if (! is_string($field)) {
                continue;
            }

            $xpathExpr = '';
            $multiple = false;

            if (is_string($rule)) {
                $xpathExpr = $rule;
            } elseif (is_array($rule)) {
                $xpathExpr = (string) ($rule['xpath'] ?? '');
                $multiple = (bool) ($rule['multiple'] ?? false);
            }

            if ($xpathExpr === '') {
                continue;
            }

            $nodes = $xpath->query($xpathExpr);
            if ($nodes === false || $nodes->length === 0) {
                continue;
            }

            if ($multiple) {
                $values = [];
                foreach ($nodes as $node) {
                    $value = trim($node->nodeValue ?? '');
                    if ($value !== '') {
                        $values[] = $value;
                    }
                }

                if ($values !== []) {
                    $result[$field] = array_values(array_unique($values));
                }

                continue;
            }

            $value = trim($nodes->item(0)?->nodeValue ?? '');
            if ($value !== '') {
                $result[$field] = Str::squish($value);
            }
        }

        return $result;
    }

    /**
     * Parses $html into a DOM document and returns an XPath evaluator for it,
     * or null when libxml refuses to load the markup.
     */
    private function buildXPath(string $html): ?\DOMXPath
    {
        $dom = new \DOMDocument('1.0', 'UTF-8');

        // libxml_use_internal_errors() toggles process-wide state, so remember
        // the caller's setting and put it back once parsing is done.
        $previous = libxml_use_internal_errors(true);

        try {
            // The XML declaration prefix makes libxml decode the markup as UTF-8.
            $loaded = $dom->loadHTML('<?xml encoding="UTF-8">'.$html, LIBXML_NOWARNING | LIBXML_NOERROR);
        } finally {
            libxml_clear_errors();
            libxml_use_internal_errors($previous);
        }

        return $loaded ? new \DOMXPath($dom) : null;
    }

    /**
     * Resolves a link reference against $baseUrl (RFC 3986, section 5.2).
     *
     * - Absolute http(s) URLs are returned unchanged.
     * - Protocol-relative references (`//host/path`) take the base URL's
     *   scheme, falling back to https.
     * - References with any other scheme (mailto:, javascript:, tel:, ...)
     *   yield null.
     * - Everything else is merged with the base URL's scheme, host and port;
     *   null is returned when the base URL lacks a scheme or host.
     */
    private function toAbsoluteUrl(string $url, string $baseUrl): ?string
    {
        if (preg_match('#^https?://#i', $url) === 1) {
            return $url;
        }

        if (str_starts_with($url, '//')) {
            $scheme = parse_url($baseUrl, PHP_URL_SCHEME) ?: 'https';

            return sprintf('%s:%s', $scheme, $url);
        }

        // A leading "scheme:" that is not http(s) cannot be fetched as a page.
        if (preg_match('/^[a-z][a-z0-9+.\-]*:/i', $url) === 1) {
            return null;
        }

        $baseParts = parse_url($baseUrl);
        if (! is_array($baseParts) || ! isset($baseParts['scheme'], $baseParts['host'])) {
            return null;
        }

        $origin = sprintf('%s://%s', $baseParts['scheme'], $baseParts['host']);
        if (isset($baseParts['port'])) {
            $origin .= ':'.$baseParts['port'];
        }

        // Split the reference into path, query and fragment.
        [$beforeFragment, $fragment] = array_pad(explode('#', $url, 2), 2, null);
        [$refPath, $query] = array_pad(explode('?', $beforeFragment, 2), 2, null);

        $basePath = $baseParts['path'] ?? '';
        if ($basePath === '') {
            $basePath = '/';
        }

        if ($refPath === '') {
            // Query-only or fragment-only reference: keep the base document.
            $path = $basePath;
            $query ??= $baseParts['query'] ?? null;
        } elseif ($refPath[0] === '/') {
            $path = $this->removeDotSegments($refPath);
        } else {
            // Relative path: replace everything after the last "/" of the base
            // path, so a base ending in "/" keeps its final directory.
            $lastSlash = strrpos($basePath, '/');
            $directory = $lastSlash === false ? '/' : substr($basePath, 0, $lastSlash + 1);
            $path = $this->removeDotSegments($directory.$refPath);
        }

        $resolved = $origin.$path;
        if ($query !== null) {
            $resolved .= '?'.$query;
        }
        if ($fragment !== null) {
            $resolved .= '#'.$fragment;
        }

        return $resolved;
    }

    /**
     * Collapses "." and ".." segments of an absolute path (RFC 3986, 5.2.4).
     */
    private function removeDotSegments(string $path): string
    {
        $segments = explode('/', $path);
        $lastIndex = count($segments) - 1;
        $output = [];

        foreach ($segments as $index => $segment) {
            if ($segment === '.' || $segment === '..') {
                // Never pop the leading empty segment that represents the root.
                if ($segment === '..' && count($output) > 1) {
                    array_pop($output);
                }

                // A trailing dot segment leaves a directory, so keep the slash.
                if ($index === $lastIndex) {
                    $output[] = '';
                }

                continue;
            }

            $output[] = $segment;
        }

        return implode('/', $output);
    }
}
|
||
|
|
|