XMLProcessor::create_fragment()'
),
'6.4.0'
);
}
$this->xml = isset( $xml ) ? $xml : '';
$this->document_namespaces = array_merge(
$document_namespaces,
// These initial namespaces cannot be overridden.
array(
'xml' => 'http://www.w3.org/XML/1998/namespace', // Predefined, cannot be unbound or changed.
'xmlns' => 'http://www.w3.org/2000/xmlns/', // Reserved for xmlns attributes, not a real namespace for elements/attributes.
'' => '', // Default namespace is initially empty (no namespace).
)
);
}
/**
 * Appends the next chunk of XML to the bytes that are still buffered.
 *
 * When the parser had paused because the input was incomplete, it is moved
 * back to the ready state so parsing can be attempted again. If a memory
 * budget is configured and the buffer has grown past it, the bytes that were
 * already processed are flushed from the buffer.
 *
 * @param string $next_chunk XML to append.
 * @return bool True when the chunk was appended, false when the processor
 *              no longer expects more input (e.g. after input_finished()).
 */
public function append_bytes( $next_chunk ) {
	if ( $this->expecting_more_input ) {
		$this->xml .= $next_chunk;

		// New bytes arrived, so a parser paused on incomplete input may resume.
		if ( self::STATE_INCOMPLETE_INPUT === $this->parser_state ) {
			$this->parser_state = self::STATE_READY;
		}

		// Keep memory usage bounded by dropping the bytes that were already processed.
		$budget = $this->memory_budget;
		if ( null !== $budget && strlen( $this->xml ) > $budget ) {
			$this->flush_processed_xml();
		}

		return true;
	}

	// The input stream was already declared complete; refuse any further bytes.
	_doing_it_wrong(
		__METHOD__,
		__( 'Cannot append bytes after the last input chunk was provided and input_finished() was called.' ),
		'WP_VERSION'
	);

	return false;
}
/**
 * Drops the already-processed prefix of the XML buffer to keep memory usage low.
 *
 * The cut point is the number of bytes already parsed, capped at the start of
 * the current token when one is set, so the current token always stays in the
 * buffer. The stored byte offsets are shifted back by the number of removed
 * bytes, and bookmarks, queued lexical updates and the seek counter are reset.
 *
 * @return string The flushed bytes.
 */
private function flush_processed_xml() {
	// Flush pending updates before any offsets are measured.
	$this->get_updated_xml();

	// Never cut past the start of the current token.
	$cut_at = null === $this->token_starts_at
		? $this->bytes_already_parsed
		: min( $this->bytes_already_parsed, $this->token_starts_at );

	$discarded = substr( $this->xml, 0, $cut_at );
	$this->xml = substr( $this->xml, $cut_at );

	// Rebase every stored offset onto the shortened buffer.
	$this->bytes_already_parsed -= $cut_at;
	if ( null !== $this->token_starts_at ) {
		$this->token_starts_at -= $cut_at;
	}
	if ( null !== $this->tag_name_starts_at ) {
		$this->tag_name_starts_at -= $cut_at;
	}
	if ( null !== $this->text_starts_at ) {
		$this->text_starts_at -= $cut_at;
	}

	// Keep track of how many bytes have been dropped from the buffer in total.
	$this->upstream_bytes_forgotten += $cut_at;

	// Reset the bookkeeping that referred to positions in the old buffer.
	$this->seek_count      = 0;
	$this->lexical_updates = array();
	$this->bookmarks       = array();

	return $discarded;
}
/**
 * Declares that every byte of the XML document has been provided.
 *
 * From this point on the processor reports errors in the places where it
 * would previously have paused in the STATE_INCOMPLETE_INPUT state. The
 * parser state is also reset to STATE_READY.
 */
public function input_finished() {
	$this->parser_state         = self::STATE_READY;
	$this->expecting_more_input = false;
}
/**
 * Tells whether more XML bytes may still be appended to the processor.
 *
 * When this is false, the processor expects the remaining XML bytes to form
 * a valid document and will not pause on incomplete input.
 *
 * @return bool Whether the processor is expecting more data bytes.
 */
public function is_expecting_more_input() {
	$more_input_expected = $this->expecting_more_input;

	return $more_input_expected;
}
/**
* Internal method which finds the next token in the XML document.
*
* This method is a protected internal function which implements the logic for
* finding the next token in a document. It exists so that the parser can update
* its state without affecting the location of the cursor in the document and
* without triggering subclass methods for things like `next_token()`, e.g. when
* applying patches before searching for the next token.
*
* @return bool Whether a token was parsed.
* @since 6.5.0
*
* @access private
*/
protected function parse_next_token() {
$was_at = $this->bytes_already_parsed;
$this->after_tag();
// Don't proceed if there's nothing more to scan.
if (
self::STATE_COMPLETE === $this->parser_state ||
self::STATE_INCOMPLETE_INPUT === $this->parser_state ||
null !== $this->last_error
) {
return false;
}
/*
* The next step in the parsing loop determines the parsing state;
* clear it so that state doesn't linger from the previous step.
*/
$this->parser_state = self::STATE_READY;
if ( $this->bytes_already_parsed >= strlen( $this->xml ) ) {
if ( $this->expecting_more_input ) {
$this->parser_state = self::STATE_INCOMPLETE_INPUT;
} else {
$this->parser_state = self::STATE_COMPLETE;
}
return false;
}
// Find the next tag if it exists.
if ( false === $this->parse_next_tag() ) {
if ( self::STATE_INCOMPLETE_INPUT === $this->parser_state ) {
$this->bytes_already_parsed = $was_at;
}
return false;
}
if ( null !== $this->last_error ) {
return false;
}
/*
* For legacy reasons the rest of this function handles tags and their
* attributes. If the processor has reached the end of the document
* or if it matched any other token then it should return here to avoid
* attempting to process tag-specific syntax.
*/
if (
self::STATE_INCOMPLETE_INPUT !== $this->parser_state &&
self::STATE_COMPLETE !== $this->parser_state &&
self::STATE_MATCHED_TAG !== $this->parser_state
) {
return true;
}
if ( $this->is_closing_tag ) {
$this->skip_whitespace();
} else {
// Parse all of its attributes.
while ( $this->parse_next_attribute() ) {
continue;
}
}
if ( null !== $this->last_error ) {
return false;
}
if ( self::STATE_INCOMPLETE_INPUT === $this->parser_state ) {
$this->bytes_already_parsed = $was_at;
return false;
}
// Ensure that the tag closes before the end of the document.
if ( $this->bytes_already_parsed >= strlen( $this->xml ) ) {
// Does this appropriately clear state (parsed attributes)?
$this->mark_incomplete_input( 'Tag attributes were not closed before the end of the document.' );
$this->bytes_already_parsed = $was_at;
return false;
}
$tag_ends_at = strpos( $this->xml, '>', $this->bytes_already_parsed );
if ( false === $tag_ends_at ) {
$this->mark_incomplete_input( 'No > found at the end of a tag.' );
$this->bytes_already_parsed = $was_at;
return false;
}
if ( $this->is_closing_tag && $tag_ends_at !== $this->bytes_already_parsed ) {
$this->bail(
'Invalid closing tag encountered.',
self::ERROR_SYNTAX
);
return false;
}
$this->parser_state = self::STATE_MATCHED_TAG;
$this->bytes_already_parsed = $tag_ends_at + 1;
$this->token_length = $this->bytes_already_parsed - $this->token_starts_at;
/**
* Resolve the namespaces defined in opening tags.
*/
if ( ! $this->is_closing_tag ) {
/**
* By default, inherit all namespaces from the parent element.
*/
$namespaces = $this->get_tag_namespaces_in_scope();
foreach ( $this->qualified_attributes as $attribute ) {
/**
* `xmlns` attribute is the default namespace
* `xmlns:Previous XMLMore XML
* ↑ │ back up by the length of the tag name plus the opening < * └←─┘ back up by strlen("em") + 1 ==> 3 */ $this->bytes_already_parsed = $before_current_token; $this->parse_next_token(); return $this->xml; } /** * Finds the next token in the XML document. * * An XML document can be viewed as a stream of tokens, * where tokens are things like XML tags, XML comments, * text nodes, etc. This method finds the next token in * the XML document and returns whether it found one. * * If it starts parsing a token and reaches the end of the * document then it will seek to the start of the last * token and pause, returning `false` to indicate that it * failed to find a complete token. * * Possible token types, based on the XML specification: * * - an XML tag * - a text node - the plaintext inside tags. * - a CData section * - an XML comment. * - a DOCTYPE declaration. * - a processing instruction, e.g. ``. * * @return bool Whether a token was parsed. */ public function next_token() { return $this->step(); } /** * Moves the internal cursor to the next token in the XML document * according to the XML specification. * * It considers the current XML context (prolog, element, or misc) * and only expects the nodes that are allowed in that context. * * @param int $node_to_process Whether to process the next node or * reprocess the current node, e.g. using another parser context. * * @return bool Whether a token was parsed. * @since WP_VERSION * * @access private */ private function step( $node_to_process = self::PROCESS_NEXT_NODE ) { // Refuse to proceed if there was a previous error. if ( null !== $this->last_error ) { return false; } // Finish stepping when there are no more tokens in the document. 
if ( self::STATE_INCOMPLETE_INPUT === $this->parser_state || self::STATE_COMPLETE === $this->parser_state ) { return false; } if ( self::PROCESS_NEXT_NODE === $node_to_process ) { if ( $this->is_empty_element() ) { array_pop( $this->stack_of_open_elements ); } } try { switch ( $this->parser_context ) { case self::IN_PROLOG_CONTEXT: return $this->step_in_prolog( $node_to_process ); case self::IN_ELEMENT_CONTEXT: return $this->step_in_element( $node_to_process ); case self::IN_MISC_CONTEXT: return $this->step_in_misc( $node_to_process ); default: $this->last_error = self::ERROR_UNSUPPORTED; return false; } } catch ( XMLUnsupportedException $e ) { /* * Exceptions are used in this class to escape deep call stacks that * otherwise might involve messier calling and return conventions. */ return false; } } /** * Parses the next node in the 'prolog' part of the XML document. * * @return bool Whether a node was found. * @see https://www.w3.org/TR/xml/#NT-document. * @see XMLProcessor::step * * @since WP_VERSION */ private function step_in_prolog( $node_to_process = self::PROCESS_NEXT_NODE ) { if ( self::PROCESS_NEXT_NODE === $node_to_process ) { $has_next_node = $this->parse_next_token(); if ( false === $has_next_node && ! $this->expecting_more_input ) { $this->bail( 'The root element was not found.', self::ERROR_SYNTAX ); } } // XML requires a root element. If we've reached the end of data in the prolog stage, // before finding a root element, then the document is incomplete. if ( self::STATE_COMPLETE === $this->parser_state ) { $this->mark_incomplete_input(); return false; } // Do not step if we paused due to an incomplete input. if ( self::STATE_INCOMPLETE_INPUT === $this->parser_state ) { return false; } switch ( $this->get_token_type() ) { case '#text': $text = $this->get_modifiable_text(); $whitespaces = strspn( $text, " \t\n\r" ); if ( strlen( $text ) !== $whitespaces ) { // @TODO: Only look for this in the 2 initial bytes of the document:. 
if ( "\xFF\xFE" === substr( $text, 0, 2 ) ) { $this->bail( 'Unexpected UTF-16 BOM byte sequence (0xFFFE) in the document. XMLProcessor only supports UTF-8.', self::ERROR_SYNTAX ); } $this->bail( 'Unexpected non-whitespace text token in prolog stage.', self::ERROR_SYNTAX ); } return $this->step(); // @TODO: Fail if there's more than one or if was found before the XML declaration token. case '#doctype': case '#comment': case '#xml-declaration': case '#processing-instructions': return true; case '#tag': $this->parser_context = self::IN_ELEMENT_CONTEXT; return $this->step( self::PROCESS_CURRENT_NODE ); default: $this->bail( 'Unexpected token type in prolog stage.', self::ERROR_SYNTAX ); } } /** * Parses the next node in the 'element' part of the XML document. * * @return bool Whether a node was found. * @see https://www.w3.org/TR/xml/#NT-document. * @see XMLProcessor::step * * @since WP_VERSION */ private function step_in_element( $node_to_process = self::PROCESS_NEXT_NODE ) { if ( self::PROCESS_NEXT_NODE === $node_to_process ) { $has_next_node = $this->parse_next_token(); if ( false === $has_next_node && ! $this->expecting_more_input ) { $this->bail( 'A tag was not closed.', self::ERROR_SYNTAX ); } } // Do not step if we paused due to an incomplete input. if ( self::STATE_INCOMPLETE_INPUT === $this->parser_state ) { return false; } switch ( $this->get_token_type() ) { case '#text': case '#cdata-section': case '#comment': case '#processing-instructions': return true; case '#tag': // Update the stack of open elements. $tag_qname = $this->get_tag_name_qualified(); if ( $this->is_tag_closer() ) { if ( ! count( $this->stack_of_open_elements ) ) { $this->bail( // Translators: 1: The closing tag name. 2: The opening tag name. __( 'The closing tag "%1$s" did not match the opening tag "%2$s".' 
), $tag_qname, $tag_qname ); return false; } $this->element = array_pop( $this->stack_of_open_elements ); $popped_qname = $this->element->qualified_name; if ( $popped_qname !== $tag_qname ) { $this->bail( sprintf( // translators: %1$s is the name of the closing HTML tag, %2$s is the name of the opening HTML tag. __( 'The closing tag "%1$s" did not match the opening tag "%2$s".' ), $tag_qname, $popped_qname ), self::ERROR_SYNTAX ); } if ( 0 === count( $this->stack_of_open_elements ) ) { $this->parser_context = self::IN_MISC_CONTEXT; } } else { array_push( $this->stack_of_open_elements, $this->element ); $this->element = $this->top_element(); } return true; default: $this->bail( sprintf( // translators: %1$s is the unexpected token type. __( 'Unexpected token type "%1$s" in element stage.', 'data-liberation' ), $this->get_token_type() ), self::ERROR_SYNTAX ); } } /** * Parses the next node in the 'misc' part of the XML document. * * @return bool Whether a node was found. * @see https://www.w3.org/TR/xml/#NT-document. * @see XMLProcessor::step * * @since WP_VERSION */ private function step_in_misc( $node_to_process = self::PROCESS_NEXT_NODE ) { if ( self::PROCESS_NEXT_NODE === $node_to_process ) { $has_next_node = $this->parse_next_token(); if ( false === $has_next_node && ! $this->expecting_more_input ) { // Parsing is complete. $this->parser_state = self::STATE_COMPLETE; return true; } } // Do not step if we paused due to an incomplete input. if ( self::STATE_INCOMPLETE_INPUT === $this->parser_state ) { return false; } if ( self::STATE_COMPLETE === $this->parser_state ) { return true; } switch ( $this->get_token_type() ) { case '#comment': case '#processing-instructions': return true; case '#text': $text = $this->get_modifiable_text(); $whitespaces = strspn( $text, " \t\n\r" ); if ( strlen( $text ) !== $whitespaces ) { $this->bail( 'Unexpected token type "' . $this->get_token_type() . 
'" in misc stage.', self::ERROR_SYNTAX ); } return $this->step(); default: $this->bail( 'Unexpected token type "' . $this->get_token_type() . '" in misc stage.', self::ERROR_SYNTAX ); } } /** * Computes the XML breadcrumbs for the currently-matched element, if matched. * * Breadcrumbs start at the outermost parent and descend toward the matched element. * They always include the entire path from the root XML node to the matched element. * Example * * $processor = XMLProcessor::create_fragment( '