Files
ai-web/app/Services/Crawler/XPathExtractor.php
cjd 260460df03
Some checks failed
Tests / PHP 8.2 (push) Has been cancelled
Tests / PHP 8.3 (push) Has been cancelled
Tests / PHP 8.4 (push) Has been cancelled
爬虫开发
2026-02-18 12:56:36 +08:00

172 lines
4.2 KiB
PHP

<?php
declare(strict_types=1);
namespace App\Services\Crawler;
use Illuminate\Support\Str;
/**
 * Pulls link lists and named field values out of an HTML document using
 * XPath expressions supplied through an extractor configuration array.
 */
class XPathExtractor
{
    /**
     * Collect the absolute URLs matched by the configured list XPath.
     *
     * The expression is read from `$extractorConfig['list_link_xpath']`. The value of
     * every matched node is trimmed, resolved against `$baseUrl` and de-duplicated.
     * An empty list is returned when the expression is missing, cannot be evaluated,
     * or the HTML cannot be parsed.
     *
     * @param array<string, mixed> $extractorConfig
     * @return list<string>
     */
    public function extractListUrls(string $html, string $baseUrl, array $extractorConfig): array
    {
        $listXPath = $this->stringOrEmpty($extractorConfig['list_link_xpath'] ?? '');
        if ($listXPath === '') {
            return [];
        }

        $xpath = $this->buildXPath($html);
        if ($xpath === null) {
            return [];
        }

        $nodes = $this->queryNodes($xpath, $listXPath);
        if ($nodes === null) {
            return [];
        }

        $urls = [];
        foreach ($this->nonEmptyNodeValues($nodes) as $value) {
            $absolute = $this->toAbsoluteUrl($value, $baseUrl);
            if ($absolute !== null) {
                $urls[] = $absolute;
            }
        }

        return array_values(array_unique($urls));
    }

    /**
     * Extract named field values according to `$extractorConfig['fields']`.
     *
     * Each rule is keyed by a string field name and is either an XPath string or an
     * array with an `xpath` entry and an optional `multiple` flag. With `multiple` set,
     * the field holds the unique, trimmed, non-empty values of all matched nodes;
     * otherwise it holds the whitespace-squished value of the first matched node.
     * Fields whose rule is unusable or yields no non-empty value are omitted.
     *
     * @param array<string, mixed> $extractorConfig
     * @return array<string, mixed>
     */
    public function extractFields(string $html, array $extractorConfig): array
    {
        $fieldRules = $extractorConfig['fields'] ?? [];
        if (! is_array($fieldRules) || $fieldRules === []) {
            return [];
        }

        $xpath = $this->buildXPath($html);
        if ($xpath === null) {
            return [];
        }

        $result = [];
        foreach ($fieldRules as $field => $rule) {
            if (! is_string($field)) {
                continue;
            }

            [$expression, $multiple] = match (true) {
                is_string($rule) => [$rule, false],
                is_array($rule) => [
                    $this->stringOrEmpty($rule['xpath'] ?? ''),
                    (bool) ($rule['multiple'] ?? false),
                ],
                default => ['', false],
            };
            if ($expression === '') {
                continue;
            }

            $nodes = $this->queryNodes($xpath, $expression);
            if ($nodes === null || $nodes->length === 0) {
                continue;
            }

            if ($multiple) {
                $values = $this->nonEmptyNodeValues($nodes);
                if ($values !== []) {
                    $result[$field] = array_values(array_unique($values));
                }

                continue;
            }

            $value = trim($nodes->item(0)?->nodeValue ?? '');
            if ($value !== '') {
                $result[$field] = Str::squish($value);
            }
        }

        return $result;
    }

    /**
     * Parse HTML into a DOM and wrap it in a DOMXPath, or return null when loading fails.
     */
    private function buildXPath(string $html): ?\DOMXPath
    {
        $dom = new \DOMDocument('1.0', 'UTF-8');

        // libxml's error mode is process-wide: remember the caller's setting and put it
        // back afterwards instead of leaving internal error collection switched on.
        $previousMode = libxml_use_internal_errors(true);
        try {
            // The XML declaration prefix makes libxml decode the markup as UTF-8.
            $loaded = $dom->loadHTML('<?xml encoding="UTF-8">'.$html, LIBXML_NOWARNING | LIBXML_NOERROR);
        } finally {
            libxml_clear_errors();
            libxml_use_internal_errors($previousMode);
        }

        return $loaded ? new \DOMXPath($dom) : null;
    }

    /**
     * Evaluate an XPath expression, returning null when it cannot be evaluated.
     *
     * DOMXPath::query() reports a malformed expression with an E_WARNING before
     * returning false. The warning is swallowed here so that an error handler which
     * converts warnings into exceptions cannot turn a bad expression into a crash.
     */
    private function queryNodes(\DOMXPath $xpath, string $expression): ?\DOMNodeList
    {
        set_error_handler(static fn (): bool => true, E_WARNING);
        try {
            $nodes = $xpath->query($expression);
        } finally {
            restore_error_handler();
        }

        return $nodes instanceof \DOMNodeList ? $nodes : null;
    }

    /**
     * Return the trimmed, non-empty node values of a node list in document order.
     *
     * @return list<string>
     */
    private function nonEmptyNodeValues(\DOMNodeList $nodes): array
    {
        $values = [];
        foreach ($nodes as $node) {
            $value = trim($node->nodeValue ?? '');
            if ($value !== '') {
                $values[] = $value;
            }
        }

        return $values;
    }

    /**
     * Cast a config value to string; values that cannot be cast cleanly become ''.
     */
    private function stringOrEmpty(mixed $value): string
    {
        return is_scalar($value) || $value instanceof \Stringable ? (string) $value : '';
    }

    /**
     * Resolve a link found in the document against the base URL.
     *
     * Returns null when the link is not an http(s) target (for example `javascript:`
     * or `mailto:`), when it is a bare fragment pointing back into the same document,
     * or when `$baseUrl` lacks the scheme/host needed to resolve a relative reference.
     */
    private function toAbsoluteUrl(string $url, string $baseUrl): ?string
    {
        if (preg_match('#^https?://#i', $url) === 1) {
            return $url;
        }

        // Any other "scheme:" prefix is not something that can be fetched over HTTP.
        if (preg_match('#^[a-z][a-z0-9+.\-]*:#i', $url) === 1) {
            return null;
        }

        // A bare fragment addresses the current document, so there is no new URL.
        if (str_starts_with($url, '#')) {
            return null;
        }

        // Protocol-relative reference: inherit the base scheme.
        if (str_starts_with($url, '//')) {
            $scheme = parse_url($baseUrl, PHP_URL_SCHEME) ?: 'https';

            return sprintf('%s:%s', $scheme, $url);
        }

        $baseParts = parse_url($baseUrl);
        if (! is_array($baseParts) || ! isset($baseParts['scheme'], $baseParts['host'])) {
            return null;
        }

        $prefix = sprintf('%s://%s', $baseParts['scheme'], $baseParts['host']);
        if (isset($baseParts['port'])) {
            $prefix .= ':'.$baseParts['port'];
        }

        $basePath = $baseParts['path'] ?? '/';
        if (! str_starts_with($basePath, '/')) {
            $basePath = '/'.$basePath;
        }

        // Split the reference into its path and the trailing "?query#fragment" part so
        // that dot-segment handling never touches the query string.
        $pathLength = strcspn($url, '?#');
        $relativePath = substr($url, 0, $pathLength);
        $suffix = substr($url, $pathLength);

        if ($relativePath === '') {
            // Query-only reference: same document path, different query.
            return $prefix.$basePath.$suffix;
        }

        if (str_starts_with($relativePath, '/')) {
            return $prefix.$this->removeDotSegments($relativePath).$suffix;
        }

        // The base directory is everything up to and including the last slash, so a
        // base of "/news/" keeps "/news/" while "/news/list.html" also yields "/news/".
        $baseDir = substr($basePath, 0, (int) strrpos($basePath, '/') + 1);

        return $prefix.$this->removeDotSegments($baseDir.$relativePath).$suffix;
    }

    /**
     * Collapse "." and ".." segments of an absolute path without climbing above the root.
     */
    private function removeDotSegments(string $path): string
    {
        $segments = explode('/', substr($path, 1));
        $lastIndex = count($segments) - 1;

        $resolved = [];
        foreach ($segments as $index => $segment) {
            if ($segment !== '.' && $segment !== '..') {
                $resolved[] = $segment;

                continue;
            }

            if ($segment === '..') {
                array_pop($resolved);
            }

            // A trailing dot segment names a directory, so keep the closing slash.
            if ($index === $lastIndex) {
                $resolved[] = '';
            }
        }

        return '/'.implode('/', $resolved);
    }
}