Files
2026-04-28 15:13:50 +02:00

523 lines
11 KiB
PHP

<?php
namespace AIOSEO\Plugin\Pro\SeoAnalysis;
// Exit if accessed directly.
if ( ! defined( 'ABSPATH' ) ) {
exit;
}
/**
* Handles the HTML parser for the Seo Analysis.
*
* @since 4.8.6
*/
class PageParser {
/**
* The DOM document instance.
*
* @since 4.8.6
*
* @var \DOMDocument
*/
private $document;
/**
* The page URL/permalink.
*
* @since 4.8.6
*
* @var string
*/
private $url;
/**
* The URL util instance.
*
* @since 4.8.6
*
* @var Utils\Url
*/
private $urlUtil;
/**
* The content of the page.
*
* @since 4.8.6
*
* @var string
*/
private $content;
/**
* Whether the page is being scraped.
*
* @since 4.8.6
*
* @var boolean
*/
public $isScraping;
/**
* Class constructor.
*
* @since 4.8.6
*
* @param string $url Url of the page.
*/
public function __construct( $url, $content = null, $isScraping = true ) {
$this->url = $url;
$this->content = $content;
$this->urlUtil = new Utils\Url( trim( $this->url ) );
$this->isScraping = $isScraping;
$this->loadDocument();
}
/**
* Load the document parsing the HTML.
*
* @since 4.8.6
*
* @return void
*/
private function loadDocument() {
if ( ! $this->isScraping && empty( $this->content ) ) {
$this->document = null;
return;
}
$this->document = new \DOMDocument();
// Store the current libxml error handling state
$previousUseInternalErrors = libxml_use_internal_errors( true );
if ( ! $this->isScraping ) {
@$this->document->loadHTML( $this->content, LIBXML_NOWARNING | LIBXML_NOERROR ); // phpcs:ignore WordPress.PHP.NoSilencedErrors.Discouraged
libxml_clear_errors();
libxml_use_internal_errors( $previousUseInternalErrors );
return;
}
// Use WordPress HTTP API
$response = wp_remote_get($this->url, [
'timeout' => 10,
'user-agent' => 'Mozilla/5.0 (compatible; AIOSEO/1.0)'
]);
$httpCode = wp_remote_retrieve_response_code( $response );
if ( is_wp_error( $response ) || 200 !== $httpCode ) {
$this->document = null;
return;
}
$html = wp_remote_retrieve_body( $response );
@$this->document->loadHTML( $html, LIBXML_NOWARNING | LIBXML_NOERROR ); // phpcs:ignore WordPress.PHP.NoSilencedErrors.Discouraged
libxml_clear_errors();
libxml_use_internal_errors( $previousUseInternalErrors );
}
/**
* Returns the document.
*
* @since 4.8.6
*
* @return \DOMDocument|null
*/
public function hasDocument() {
return (bool) $this->document;
}
/**
* Returns the head from the document.
*
* @since 4.8.6
*
* @return \DOMElement|null
*/
public function getHead() {
if ( ! $this->hasDocument() ) {
return null;
}
$head = $this->document->getElementsByTagName( 'head' )->item( 0 );
if ( ! $head ) {
return null;
}
return $head;
}
/**
* Returns the body from the document.
*
* @since 4.8.6
*
* @return \DOMElement|null
*/
public function getBody() {
if ( ! $this->hasDocument() ) {
return null;
}
$body = $this->isScraping
? $this->document->getElementsByTagName( 'body' )->item( 0 )
: $this->document;
if ( ! $body ) {
return null;
}
return $body;
}
/**
* Returns the main from the document.
*
* @since 4.8.6
*
* @return \DOMElement|null
*/
public function getMain() {
if ( ! $this->hasDocument() ) {
return null;
}
$main = @$this->document->getElementsByTagName( 'main' )->item( 0 );
if ( ! $main ) {
return $this->getBody();
}
return $main;
}
/**
* Returns the elements from a specific section and tags.
*
* @since 4.8.6
*
* @param string $section The section to get the elements from (body, head)
* @param array $tags List of tags to get the elements from. (link, script, img, source)
* @return \DOMElement[]
*/
public function getSectionElements( $section, $tags ) {
if ( ! $this->hasDocument() ) {
return [];
}
switch ( $section ) {
case 'body':
$section = $this->getBody();
break;
case 'head':
$section = $this->getHead();
break;
case 'main':
$section = $this->getMain();
break;
default:
$section = $this->document;
break;
}
if ( ! $section ) {
return [];
}
$elements = [];
foreach ( $tags as $tag ) {
$obj = $section->getElementsByTagName( $tag );
if ( ! $obj instanceof \DOMNodeList ) {
continue;
}
for ( $i = 0; $i < $obj->length; $i++ ) {
$content = $obj->item( $i );
if ( ! $content instanceof \DOMElement ) {
continue;
}
array_push( $elements, $content );
}
}
return $elements;
}
/**
* Returns the content from a specific HTML tag.
*
* @since 4.8.6
*
* @param string $tag HTML tag.
* @return array Array of content from the tag.
*/
public function getElementsByTagName( $tag ) {
$result = [];
if ( ! $this->hasDocument() ) {
return $result;
}
foreach ( $this->document->getElementsByTagName( $tag ) as $item ) {
array_push( $result, trim( $item->nodeValue ) );
}
return $result;
}
/**
* Returns the content from a specific HTML meta tag.
*
* @since 4.8.6
*
* @param string $name Meta tag name attribute.
* @return array Array of content from the meta tags.
*/
public function getMetaTag( $name ) {
$result = [];
if ( ! $this->hasDocument() ) {
return $result;
}
foreach ( $this->document->getElementsByTagName( 'meta' ) as $item ) {
if ( $item instanceof \DOMElement && $item->getAttribute( 'name' ) === $name ) {
array_push( $result, $item->getAttribute( 'content' ) );
}
}
return $result;
}
/**
* Returns the images from the document body.
*
* @since 4.8.6
*
* @return array
*/
public function getBodyImages() {
if ( ! $this->hasDocument() ) {
return [];
}
// Just consider the body since we're not interested in images in the head.
$body = $this->getBody();
if ( ! $body ) {
return [];
}
$result = [
'imagesWithoutAltText' => [],
'imageUrls' => [],
];
foreach ( $body->getElementsByTagName( 'img' ) as $item ) {
if ( ! $item instanceof \DOMElement ) {
continue;
}
// If any of this image's parents are a noscript tag, let's skip it.
$parentTag = $item->parentNode;
while ( $parentTag ) {
if ( $parentTag instanceof \DOMElement && ! empty( $parentTag->tagName ) && 'noscript' === $parentTag->tagName ) {
continue 2;
}
$parentTag = $parentTag->parentNode;
}
if ( ! $item->hasAttribute( 'src' ) ) {
continue;
}
// If this is a lazy-loaded data URI image, let's use the data-src if possible.
$imageSrcUrl = $item->getAttribute( 'src' );
if ( false !== strpos( $imageSrcUrl, 'data:image' ) && $item->hasAttribute( 'data-src' ) ) {
$imageSrcUrl = $item->getAttribute( 'data-src' );
}
// If this is a relative link, let's append our domain to it.
$imageUrlUtil = new Utils\Url( $imageSrcUrl );
if ( empty( $imageUrlUtil->getTld() ) ) {
$imageUrlUtil = $this->urlUtil->relativeToAbsolute( $imageSrcUrl );
}
if ( ! $item->hasAttribute( 'alt' ) || empty( $item->getAttribute( 'alt' ) ) ) {
array_push( $result['imagesWithoutAltText'], $imageUrlUtil->getUrl() );
}
array_push( $result['imageUrls'], $imageUrlUtil->getUrl() );
}
return $result;
}
/**
* Returns the links from the document.
*
* @since 4.8.6
*
* @return array
*/
public function getLinks() {
$result = [
'internalLinks' => [],
'externalLinks' => [],
];
if ( ! $this->hasDocument() ) {
return $result;
}
foreach ( $this->document->getElementsByTagName( 'a' ) as $item ) {
/** @var \DOMElement $item */
$href = $item->getAttribute( 'href' );
$hrefUrlUtil = new Utils\Url( $href );
// If this is a relative link, let's append our domain to it.
if ( empty( $hrefUrlUtil->getTld() ) ) {
$hrefUrlUtil = $this->urlUtil->relativeToAbsolute( $href );
}
if ( $hrefUrlUtil->getTld() === $this->urlUtil->getTld() ) {
array_push( $result['internalLinks'], rtrim( $hrefUrlUtil->getUrl(), '/' ) );
continue;
}
array_push( $result['externalLinks'], rtrim( $hrefUrlUtil->getUrl(), '/' ) );
}
return $result;
}
/**
* Returns the canonical URL from the document.
*
* @since 4.8.6
*
* @return string The canonical URL.
*/
public function getCanonicalUrl() {
if ( ! $this->hasDocument() ) {
return '';
}
foreach ( $this->document->getElementsByTagName( 'link' ) as $item ) {
if ( $item instanceof \DOMElement && $item->getAttribute( 'rel' ) === 'canonical' ) {
return $item->getAttribute( 'href' );
}
}
return '';
}
/**
* Returns the social tags (open graph, twitter) from the document.
*
* @since 4.8.6
*
* @return array
*/
private function getSocialTags() {
$result = [];
if ( ! $this->hasDocument() ) {
return $result;
}
$tags = [
'og:title' => '',
'og:description' => '',
'og:type' => '',
'og:image' => '',
'og:url' => '',
'twitter:title' => '',
'twitter:image' => '',
];
foreach ( $this->document->getElementsByTagName( 'meta' ) as $item ) {
/** @var \DOMElement $item */
$property = $item->getAttribute( 'property' );
$name = $item->getAttribute( 'name' );
$content = $item->getAttribute( 'content' );
if ( ! empty( $property ) && array_key_exists( $property, $tags ) ) {
array_push( $result, [
'key' => $property,
'value' => $content
] );
}
if ( ! empty( $name ) && array_key_exists( $name, $tags ) ) {
array_push( $result, [
'key' => $name,
'value' => $content
] );
}
}
return $result;
}
/**
* Returns the open graph tags from the document.
*
* @since 4.8.6
*
* @return array
*/
public function getOpenGraphTags() {
$socialTags = $this->getSocialTags();
return array_filter( $socialTags, function( $tag ) {
return 'og:' === substr( $tag['key'], 0, 3 );
} );
}
/**
* Returns the twitter tags from the document.
*
* @since 4.8.6
*
* @return array
*/
public function getTwitterTags() {
$socialTags = $this->getSocialTags();
return array_filter( $socialTags, function( $tag ) {
return 'twitter:' === substr( $tag['key'], 0, 8 );
} );
}
/**
* Returns the schema.org rules from the document.
*
* @since 4.8.6
*
* @return string The schema.org rules.
*/
public function getSchemas() {
$result = '';
if ( ! $this->hasDocument() ) {
return $result;
}
foreach ( $this->document->getElementsByTagName( 'script' ) as $item ) {
if ( $item instanceof \DOMElement && 'application/ld+json' === $item->getAttribute( 'type' ) && 'aioseo-schema' === $item->getAttribute( 'class' ) ) {
$result = $item->nodeValue;
}
}
return $result;
}
}