227 lines
7.0 KiB
PHP
227 lines
7.0 KiB
PHP
<?php
|
||
|
||
namespace WordPress\XML;
|
||
|
||
use function WordPress\Encoding\codepoint_to_utf8_bytes;
|
||
|
||
/**
 * XML API: XMLDecoder class
 *
 * Decodes spans of raw text found inside XML content,
 * whether found in an attribute or in a text node.
 *
 * Do not use this class on the contents of a CDATA section,
 * as those sections are not encoded with the XML rules unless
 * they are embedded XML content.
 *
 * @package WordPress
 * @subpackage HTML-API
 * @since WP_VERSION
 */
class XMLDecoder {
	/**
	 * Decodes a span of XML text.
	 *
	 * Replaces the five predefined named references (`&amp;`, `&apos;`, `&gt;`,
	 * `&lt;`, `&quot;`) and decimal/hexadecimal numeric character references
	 * with the characters they represent. Anything which looks like a reference
	 * but cannot be decoded is left in the output untouched.
	 *
	 * Example:
	 *
	 *     '&' = XMLDecoder::decode( '&amp;' );
	 *     '…' = XMLDecoder::decode( '&#x2026;' );
	 *
	 * @todo Add examples of parse failures, and decide if it should fail or not.
	 *
	 * @since WP_VERSION
	 *
	 * @access private
	 *
	 * @param string $text Text document containing span of text to decode.
	 * @return string Decoded UTF-8 string.
	 */
	public static function decode( $text ) {
		$decoded = '';
		$end     = strlen( $text );
		$at      = 0; // Byte offset from which the next `&` is searched.
		$was_at  = 0; // Byte offset of the first byte not yet copied into `$decoded`.

		// The only named references this decoder recognizes, without the leading `&`.
		$named_references = array(
			'amp;'  => '&',
			'apos;' => "'",
			'lt;'   => '<',
			'gt;'   => '>',
			'quot;' => '"',
		);

		while ( $at < $end ) {
			$next_character_reference_at = strpos( $text, '&', $at );
			if ( false === $next_character_reference_at ) {
				break;
			}

			$start_of_potential_reference_at = $next_character_reference_at + 1;
			if ( $start_of_potential_reference_at >= $end ) {
				// @todo This is an error. The document ended too early; consume the rest as plaintext, which is wrong.
				break;
			}

			// First character after the opening `&`.
			$start_of_potential_reference = $text[ $start_of_potential_reference_at ];

			/*
			 * If it's a named character reference, it will be one of the five mandated references.
			 *
			 *  - `&amp;`
			 *  - `&apos;`
			 *  - `&gt;`
			 *  - `&lt;`
			 *  - `&quot;`
			 *
			 * No separate length guard is needed before comparing: `substr_compare()`
			 * reports a mismatch when fewer bytes remain than the name requires, and
			 * the offset passed to it is always inside the string at this point.
			 */
			if (
				'a' === $start_of_potential_reference ||
				'g' === $start_of_potential_reference ||
				'l' === $start_of_potential_reference ||
				'q' === $start_of_potential_reference
			) {
				foreach ( $named_references as $name => $substitution ) {
					$name_length = strlen( $name );
					if ( 0 === substr_compare( $text, $name, $start_of_potential_reference_at, $name_length ) ) {
						$decoded .= substr( $text, $was_at, $next_character_reference_at - $was_at ) . $substitution;
						$at       = $start_of_potential_reference_at + $name_length;
						$was_at   = $at;
						continue 2;
					}
				}

				// @todo This is an invalid document. It should be communicated. Treat as plaintext and continue.
				// Resume right after this ampersand so that it is not found again.
				$at = $start_of_potential_reference_at;
				continue;
			}

			/*
			 * The shortest numerical character reference is four characters,
			 * so at least four bytes must remain, counting the ampersand.
			 *
			 * Example:
			 *
			 *     &#9;
			 */
			if ( '#' !== $start_of_potential_reference || $next_character_reference_at + 4 > $end ) {
				// @todo This is an error. This ampersand _must_ be encoded. Treat as plaintext and move on.
				$at = $start_of_potential_reference_at;
				continue;
			}

			// Only a lower-case `x` is checked for as the hexadecimal marker.
			$is_hex = 'x' === $text[ $start_of_potential_reference_at + 1 ];
			if ( $is_hex ) {
				$zeros_at    = $start_of_potential_reference_at + 2;
				$base        = 16;
				$digit_chars = '0123456789abcdefABCDEF';
				$max_digits  = 6; // `&#x10FFFF;`.
			} else {
				$zeros_at    = $start_of_potential_reference_at + 1;
				$base        = 10;
				$digit_chars = '0123456789';
				$max_digits  = 7; // `&#1114111;`.
			}

			// Leading zeros are skipped and do not count against the digit limit.
			$zero_count  = strspn( $text, '0', $zeros_at );
			$digits_at   = $zeros_at + $zero_count;
			$digit_count = strspn( $text, $digit_chars, $digits_at, $max_digits );
			$semi_at     = $digits_at + $digit_count;

			if ( 0 === $digit_count || $semi_at >= $end || ';' !== $text[ $semi_at ] ) {
				// @todo This is an error. Treat as plaintext and move on.
				$at = $start_of_potential_reference_at;
				continue;
			}

			$codepoint           = intval( substr( $text, $digits_at, $digit_count ), $base );
			$character_reference = codepoint_to_utf8_bytes( $codepoint );

			// "\xEF\xBF\xBD" is U+FFFD REPLACEMENT CHARACTER encoded as UTF-8.
			if ( "\xEF\xBF\xBD" === $character_reference && 0xFFFD !== $codepoint ) {
				/*
				 * Stop processing this reference if we got an invalid character AND the
				 * reference does not specifically refer to code point U+FFFD.
				 *
				 * > It is a fatal error when an XML processor encounters an entity with an
				 * > encoding that it is unable to process. It is a fatal error if an XML entity
				 * > is determined (via default, encoding declaration, or higher-level protocol)
				 * > to be in a certain encoding but contains byte sequences that are not legal
				 * > in that encoding. Specifically, it is a fatal error if an entity encoded in
				 * > UTF-8 contains any ill-formed code unit sequences, as defined in section
				 * > 3.9 of Unicode [Unicode]. Unless an encoding is determined by a higher-level
				 * > protocol, it is also a fatal error if an XML entity contains no encoding
				 * > declaration and its content is not legal UTF-8 or UTF-16.
				 *
				 * See https://www.w3.org/TR/xml/#charencoding
				 */
				// @todo This is an error. Treat as plaintext and continue, which is wrong.
				$at = $start_of_potential_reference_at;
				continue;
			}

			// Copy everything between the previous match and this ampersand, then the decoded character.
			$decoded .= substr( $text, $was_at, $next_character_reference_at - $was_at );
			$decoded .= $character_reference;
			$at       = $semi_at + 1;
			$was_at   = $at;
		}

		// Nothing was decoded, so the input can be returned as-is.
		if ( 0 === $was_at ) {
			return $text;
		}

		// Copy whatever follows the last decoded reference.
		if ( $was_at < $end ) {
			$decoded .= substr( $text, $was_at, $end - $was_at );
		}

		return $decoded;
	}

	/**
	 * Finds and parses the next entity in a given text starting after the
	 * given byte offset, and being entirely found within the given max length.
	 *
	 * This function is not implemented yet: it scans for ampersands but
	 * never parses them, so it currently always returns `null` and never
	 * writes to `$entity_at`.
	 *
	 * @since WP_VERSION
	 *
	 * @todo Implement this function.
	 *
	 * @param string   $text                 Text in which to search for an XML entity.
	 * @param int      $starting_byte_offset Start looking after this byte offset.
	 * @param int      $ending_byte_offset   Stop looking if entity is not fully contained before this byte offset.
	 * @param int|null $entity_at            Optional. If provided, will be set to byte offset where entity was
	 *                                       found, if found. Otherwise, will not be set.
	 *
	 * @return string|null Parsed entity, if parsed, otherwise `null`.
	 */
	public static function next_entity( string $text, int $starting_byte_offset, int $ending_byte_offset, ?int &$entity_at = null ): ?string {
		$at  = $starting_byte_offset;
		$end = $ending_byte_offset;

		while ( $at < $end ) {
			$remaining = $end - $at;
			$amp_after = strcspn( $text, '&', $at, $remaining );

			// There are no more possible entities.
			if ( $amp_after === $remaining ) {
				return null;
			}

			/*
			 * @todo Move the decoding logic from `decode()` above into here,
			 *       then call this function in a loop from `decode()`.
			 */

			// Continue after the ampersand which was just found instead of rescanning up to it.
			$at += $amp_after + 1;
		}

		return null;
	}
}
|