preserveWhiteSpace = false;
		$document->formatOutput = true;

		return self::from_string( $xml, $document, array( $document, 'loadXML' ), $flags );
	}

	/**
	 * Creates a PLL_DOM_Document instance from a HTML string.
	 *
	 * @since 3.1
	 * @since 3.3 Added parameters $version, $encoding, and $flags.
	 * @since 3.3 Doesn't format the output anymore.
	 * @since 3.3 Doesn't add the `<html>` and `<body>` tags by default anymore.
	 *
	 * Doctype declaration is disallowed for security reasons (XEE vulnerability).
	 *
	 * @param string $html     A HTML valid string.
	 * @param string $version  Optional. XML version to use. Default is '1.0'.
	 * @param string $encoding Optional. Encoding to use. Default is 'UTF-8'.
	 * @param int    $flags    Optional. A series of libxml flags to parameterize the loading.
	 *                         Default is `LIBXML_HTML_NOIMPLIED | LIBXML_HTML_NODEFDTD`.
	 *                         {@link https://www.php.net/manual/en/libxml.constants.php}.
	 * @return PLL_DOM_Document
	 */
	public static function from_html( $html, $version = '1.0', $encoding = 'UTF-8', $flags = 0 ) {
		$document = new self( $version, $encoding );
		$document->strictErrorChecking = false;

		/*
		 * Hack to enforce that the string will be processed with the right encoding by DOMDocument.
		 * The added processing instruction is then removed by contains_not_allowed_node().
		 */
		$html = '<?xml encoding="' . $encoding . '">' . $html;

		// Fall back to the documented defaults when no flag is given.
		$flags    = ! empty( $flags ) ? $flags : LIBXML_HTML_NOIMPLIED | LIBXML_HTML_NODEFDTD;
		$document = self::from_string( $html, $document, array( $document, 'loadHTML' ), $flags );

		$document->encoding = $encoding; // Enforce encoding, as it is not set by DOMDocument.

		return $document;
	}

	/**
	 * Factory function to safely generate DOMDocument from strings.
	 *
	 * @since 3.1
	 *
	 * Entity loading is disabled to prevent External Entity Injections {@link https://phpsecurity.readthedocs.io/en/latest/Injection-Attacks.html#xml-external-entity-injection}.
	 *
	 * @param string           $string   A XML content to load.
	 * @param PLL_DOM_Document $document A document parameterized to load the content into.
* @param callable         $function Method name which will handle the loading.
	 * @param int              $flags    A series of libxml flags to parameterize the loading. {@link https://www.php.net/manual/en/libxml.constants.php}.
	 * @return PLL_DOM_Document The given document when `$string` is empty, the result of safe_load_string() otherwise.
	 */
	private static function from_string( $string, $document, $function, $flags = 0 ) {
		if ( empty( $string ) ) {
			// Nothing to load: hand the document back untouched.
			return $document;
		}

		// libxml2 version 2.9.0 and superior doesn't load external entities by default. libxml_disable_entity_loader() is deprecated since PHP 8.0.0 .
		$must_disable_loader = ! defined( 'LIBXML_DOTTED_VERSION' ) || version_compare( LIBXML_DOTTED_VERSION, '2.9.0', '<' );

		// Collect libxml errors internally, starting from a clean error buffer.
		$internal_errors = libxml_use_internal_errors( true );
		libxml_clear_errors();

		$entity_loader = null;
		if ( $must_disable_loader ) {
			$entity_loader = libxml_disable_entity_loader( true ); // phpcs:ignore Generic.PHP.DeprecatedFunctions.Deprecated
		}

		try {
			$document = self::safe_load_string( $string, $document, $function, $flags );
		} finally {
			// Always restore the global libxml state, even when the loading throws.
			if ( $must_disable_loader ) {
				libxml_disable_entity_loader( $entity_loader ); // phpcs:ignore Generic.PHP.DeprecatedFunctions.Deprecated
			}

			libxml_clear_errors();
			libxml_use_internal_errors( $internal_errors );
		}

		return $document;
	}

	/**
	 * Loads the string into the given document, returns the document if it's safe, or return an empty document with errors.
	 *
	 * @since 3.1
	 *
	 * @param string           $string   A string to be loaded and parsed as the document.
	 * @param PLL_DOM_Document $document A configured instance of PLL_DOM_Document to load the string into.
	 * @param callable         $function Name of the loading method to use.
	 * @param int              $flags    A series of libxml flags to parameterize the loading. {@link https://www.php.net/manual/en/libxml.constants.php}.
* @return PLL_DOM_Document
	 */
	private static function safe_load_string( $string, $document, $function, $flags = 0 ) {
		// LIBXML_NONET is always added to the flags given by the caller.
		call_user_func( $function, $string, LIBXML_NONET | $flags );

		if ( $document->contains_not_allowed_node() ) {
			// Unsafe content: discard what has been loaded and continue with a blank document.
			$document = new self();
		}

		// Must be read here: from_string() clears the libxml error buffer right after this call.
		$document->errors = array_merge( $document->errors, libxml_get_errors() );

		return $document;
	}

	/**
	 * Verifies that the document contains only nodes of allowed types.
	 *
	 * Only the direct children of the document are inspected. Processing instructions found
	 * among them are removed from the document as a side effect.
	 *
	 * @since 3.1
	 *
	 * @see https://www.php.net/manual/en/dom.constants.php.
	 *
	 * @return bool True if a direct child of the document is of a type that is not allowed, false otherwise.
	 */
	public function contains_not_allowed_node() {
		$allowed_types = array(
			XML_DOCUMENT_NODE,
			XML_ELEMENT_NODE,
			XML_ATTRIBUTE_NODE,
			XML_TEXT_NODE,
			XML_COMMENT_NODE,
			XML_CDATA_SECTION_NODE,
			XML_PI_NODE,
		);

		/*
		 * Iterate over a snapshot of the children: `childNodes` is a live list, so removing a node
		 * while looping over it directly may cause the following siblings to be skipped, and thus
		 * never checked.
		 */
		foreach ( iterator_to_array( $this->childNodes, false ) as $node ) {
			if ( ! $node instanceof DOMNode || ! in_array( $node->nodeType, $allowed_types, true ) ) {
				return true;
			}

			if ( XML_PI_NODE === $node->nodeType ) {
				$this->removeChild( $node ); // Remove top-level processing instructions, i.e. the encoding hack added by from_html().
			}
		}

		return false;
	}

	/**
	 * Returns the child nodes of the first `body` element of the document.
	 *
	 * @return DOMNodeList The children of the first `body` element, or an empty list if the document has no such element.
	 */
	public function get_first_level_html_nodes() {
		$body = $this->getElementsByTagName( 'body' )->item( 0 );

		return null !== $body ? $body->childNodes : new DOMNodeList();
	}

	/**
	 * Whether the document contains errors or not
	 *
	 * @since 3.1
	 *
	 * @return bool
	 */
	public function has_errors() {
		return ! empty( $this->errors );
	}

	/**
	 * Returns the document's errors.
	 *
	 * @since 3.3
	 *
	 * @return LibXMLError[]
	 */
	public function get_errors() {
		return $this->errors;
	}
}