$extractorConfig Extractor settings; only the `list_link_xpath` key is read here.
     * @return list<string> Unique absolute URLs, in the order they were first matched.
     */
    public function extractListUrls(string $html, string $baseUrl, array $extractorConfig): array
    {
        $listXPath = (string) ($extractorConfig['list_link_xpath'] ?? '');
        if ($listXPath === '') {
            return [];
        }

        $xpath = $this->buildXPath($html);
        if ($xpath === null) {
            return [];
        }

        $nodes = $xpath->query($listXPath);
        if ($nodes === false) {
            return [];
        }

        $urls = [];
        foreach ($nodes as $node) {
            $value = trim($node->nodeValue ?? '');
            if ($value === '') {
                continue;
            }

            // Links that cannot be resolved against the base URL are dropped.
            $absolute = $this->toAbsoluteUrl($value, $baseUrl);
            if ($absolute !== null) {
                $urls[] = $absolute;
            }
        }

        return array_values(array_unique($urls));
    }

    /**
     * Extract the configured fields from an HTML document.
     *
     * `$extractorConfig['fields']` maps a field name to either an XPath string or
     * an array with an `xpath` key and an optional `multiple` flag. Rules with a
     * non-string name, an empty expression, or no matching nodes are skipped.
     * Values are whitespace-normalised with Str::squish(); in `multiple` mode
     * every non-empty match is collected and de-duplicated, otherwise only the
     * first matching node is used.
     *
     * @param array<string, mixed> $extractorConfig
     * @return array<string, string|list<string>> Field name => value(s); fields without a value are omitted.
     */
    public function extractFields(string $html, array $extractorConfig): array
    {
        $fieldRules = $extractorConfig['fields'] ?? [];
        if (! is_array($fieldRules) || $fieldRules === []) {
            return [];
        }

        $xpath = $this->buildXPath($html);
        if ($xpath === null) {
            return [];
        }

        $result = [];
        foreach ($fieldRules as $field => $rule) {
            if (! is_string($field)) {
                continue;
            }

            $xpathExpr = '';
            $multiple = false;
            if (is_string($rule)) {
                $xpathExpr = $rule;
            } elseif (is_array($rule)) {
                $xpathExpr = (string) ($rule['xpath'] ?? '');
                $multiple = (bool) ($rule['multiple'] ?? false);
            }

            if ($xpathExpr === '') {
                continue;
            }

            $nodes = $xpath->query($xpathExpr);
            if ($nodes === false || $nodes->length === 0) {
                continue;
            }

            if ($multiple) {
                $values = [];
                foreach ($nodes as $node) {
                    // Normalise exactly like the single-value branch so both modes agree.
                    $value = Str::squish($node->nodeValue ?? '');
                    if ($value !== '') {
                        $values[] = $value;
                    }
                }

                if ($values !== []) {
                    $result[$field] = array_values(array_unique($values));
                }

                continue;
            }

            // Squish before the emptiness check so whitespace-only values are never stored.
            $value = Str::squish($nodes->item(0)?->nodeValue ?? '');
            if ($value !== '') {
                $result[$field] = $value;
            }
        }

        return $result;
    }

    /**
     * Parse an HTML string into a DOMXPath instance.
     *
     * Returns null for blank input (DOMDocument::loadHTML() rejects an empty
     * source on PHP 8) and when libxml fails to load the document.
     */
    private function buildXPath(string $html): ?\DOMXPath
    {
        if (trim($html) === '') {
            return null;
        }

        $dom = new \DOMDocument('1.0', 'UTF-8');

        // Collect parser errors internally, and restore the caller's setting afterwards
        // because this flag is process-global.
        $previousSetting = libxml_use_internal_errors(true);

        try {
            // Without an encoding hint libxml decodes the markup as ISO-8859-1 unless the page
            // declares a charset itself; the XML declaration prefix forces UTF-8.
            $loaded = $dom->loadHTML('<?xml encoding="UTF-8">'.$html, LIBXML_NOWARNING | LIBXML_NOERROR);
        } finally {
            libxml_clear_errors();
            libxml_use_internal_errors($previousSetting);
        }

        if (! $loaded) {
            return null;
        }

        return new \DOMXPath($dom);
    }

    /**
     * Resolve a link found in a document against the document's base URL.
     *
     * Handles absolute http(s) URLs, protocol-relative URLs, root-relative paths,
     * query-only and fragment-only references, and relative paths including
     * "." and ".." segments.
     *
     * @return string|null The absolute URL, or null when the link uses a non-http(s)
     *                     scheme (mailto:, javascript:, tel:, ...) or the base URL has
     *                     no scheme/host to resolve against.
     */
    private function toAbsoluteUrl(string $url, string $baseUrl): ?string
    {
        // Schemes are case-insensitive, so "HTTP://" is as absolute as "http://".
        if (preg_match('~^https?://~i', $url) === 1) {
            return $url;
        }

        if (str_starts_with($url, '//')) {
            $scheme = parse_url($baseUrl, PHP_URL_SCHEME) ?: 'https';

            return sprintf('%s:%s', $scheme, $url);
        }

        // Any other explicit scheme is not a fetchable page link.
        if (preg_match('~^[a-z][a-z0-9+.\-]*:~i', $url) === 1) {
            return null;
        }

        $baseParts = parse_url($baseUrl);
        if (! is_array($baseParts) || ! isset($baseParts['scheme'], $baseParts['host'])) {
            return null;
        }

        $prefix = sprintf('%s://%s', $baseParts['scheme'], $baseParts['host']);
        if (isset($baseParts['port'])) {
            $prefix .= ':'.$baseParts['port'];
        }

        if (str_starts_with($url, '/')) {
            return $prefix.$url;
        }

        $basePath = $baseParts['path'] ?? '/';

        // Split the reference into its path and its "?query#fragment" tail.
        $pathLength = strcspn($url, '?#');
        $relativePath = substr($url, 0, $pathLength);
        $suffix = substr($url, $pathLength);

        if ($relativePath === '') {
            // Query-only or fragment-only reference: it targets the base document itself.
            // The base query is kept unless the reference supplies its own.
            $baseQuery = isset($baseParts['query']) && ! str_starts_with($suffix, '?')
                ? '?'.$baseParts['query']
                : '';

            return $prefix.$basePath.$baseQuery.$suffix;
        }

        // The base "directory" is everything up to and including the last slash, so a base
        // of "/news/" keeps "/news/" while "/news/index.html" also yields "/news/".
        $lastSlash = strrpos($basePath, '/');
        $directory = $lastSlash === false ? '/' : substr($basePath, 0, $lastSlash + 1);

        $segments = explode('/', substr($directory.$relativePath, 1));
        $lastIndex = count($segments) - 1;

        $resolved = [];
        foreach ($segments as $index => $segment) {
            if ($segment === '.' || $segment === '..') {
                if ($segment === '..') {
                    // Going above the root is ignored rather than treated as an error.
                    array_pop($resolved);
                }

                // A trailing dot segment names a directory, so keep the trailing slash.
                if ($index === $lastIndex) {
                    $resolved[] = '';
                }

                continue;
            }

            $resolved[] = $segment;
        }

        return $prefix.'/'.implode('/', $resolved).$suffix;
    }
}