File: //var/www/biblioteka/wp-content/plugins/wordpress-importer/php-toolkit/XML/class-xmldecoder.php
<?php
namespace WordPress\XML;
use function WordPress\Encoding\codepoint_to_utf8_bytes;
/**
* XML API: XMLDecoder class
*
* Decodes spans of raw text found inside XML content,
* whether found in an attribute or in a text node.
*
* Do not use this class on the contents of a CDATA section,
* as those sections are not encoded with the XML rules unless
* they are embedded XML content.
*
* @package WordPress
* @subpackage HTML-API
* @since WP_VERSION
*/
class XMLDecoder {
    /**
     * Decodes a span of XML text.
     *
     * Replaces the five predefined named references (`&amp;`, `&apos;`, `&lt;`,
     * `&gt;`, `&quot;`) and decimal/hexadecimal numeric character references
     * with the characters they represent. Anything that looks like a reference
     * but cannot be decoded is left in the output verbatim.
     *
     * Example:
     *
     *     '&' === XMLDecoder::decode( '&amp;' );
     *     '…' === XMLDecoder::decode( '&#x2026;' );
     *
     * @todo Add examples of parse failures, and decide if it should fail or not.
     *
     * @since WP_VERSION
     *
     * @access private
     *
     * @param string $text Text document containing span of text to decode.
     * @return string Decoded UTF-8 string.
     */
    public static function decode( $text ) {
        // Named references mandated by XML, keyed by the bytes that follow the `&`.
        $named_references = array(
            'amp;'  => '&',
            'apos;' => "'",
            'lt;'   => '<',
            'gt;'   => '>',
            'quot;' => '"',
        );

        $decoded = '';
        $end     = strlen( $text );
        $at      = 0; // Where the next search for `&` starts.
        $was_at  = 0; // Start of the plaintext not yet copied into $decoded.

        while ( $at < $end ) {
            $ampersand_at = strpos( $text, '&', $at );
            if ( false === $ampersand_at ) {
                break;
            }

            $name_at = $ampersand_at + 1;
            if ( $name_at >= $end ) {
                // @todo This is an error. The document ended too early; consume the rest as plaintext, which is wrong.
                break;
            }

            /*
             * Unless a reference is successfully decoded below, this ampersand is
             * treated as plaintext and the search resumes right after it. Jumping
             * here (instead of creeping forward one byte at a time from the old
             * search position) avoids finding the same ampersand repeatedly.
             */
            $at = $name_at;

            // First character after the opening `&`.
            $first_character = $text[ $name_at ];

            /*
             * If it's a named character reference, it will be one of the five mandated references.
             *
             *  - `&amp;`
             *  - `&apos;`
             *  - `&gt;`
             *  - `&lt;`
             *  - `&quot;`
             *
             * No up-front length check is needed: `substr_compare()` reports a
             * mismatch when fewer bytes remain than the name requires, so a
             * reference that ends exactly at the end of the text is still found.
             */
            if (
                'a' === $first_character ||
                'g' === $first_character ||
                'l' === $first_character ||
                'q' === $first_character
            ) {
                foreach ( $named_references as $name => $substitution ) {
                    $name_length = strlen( $name );
                    if ( 0 === substr_compare( $text, $name, $name_at, $name_length ) ) {
                        $decoded .= substr( $text, $was_at, $ampersand_at - $was_at ) . $substitution;
                        $at       = $name_at + $name_length;
                        $was_at   = $at;
                        continue 2;
                    }
                }

                // @todo This is an invalid document. It should be communicated. Treat as plaintext and continue.
                continue;
            }

            /*
             * The shortest numerical character reference is four characters.
             *
             * Example:
             *
             *     &#9;
             *
             * All four bytes must lie inside the text, i.e. `$ampersand_at + 4 <= $end`.
             */
            if ( '#' !== $first_character || $ampersand_at + 4 > $end ) {
                // @todo This is an error. This ampersand _must_ be encoded. Treat as plaintext and move on.
                continue;
            }

            $is_hex = 'x' === $text[ $name_at + 1 ];
            if ( $is_hex ) {
                $zeros_at    = $name_at + 2;
                $base        = 16;
                $digit_chars = '0123456789abcdefABCDEF';
                $max_digits  = 6; // `&#x10FFFF;`.
            } else {
                $zeros_at    = $name_at + 1;
                $base        = 10;
                $digit_chars = '0123456789';
                $max_digits  = 7; // `&#1114111;`.
            }

            // Leading zeros are allowed in any quantity and do not count against the digit limit.
            $zero_count  = strspn( $text, '0', $zeros_at );
            $digits_at   = $zeros_at + $zero_count;
            $digit_count = strspn( $text, $digit_chars, $digits_at, $max_digits );
            $semi_at     = $digits_at + $digit_count;

            if ( 0 === $digit_count || $semi_at >= $end || ';' !== $text[ $semi_at ] ) {
                // @todo This is an error. Treat as plaintext and move on.
                continue;
            }

            $codepoint           = intval( substr( $text, $digits_at, $digit_count ), $base );
            $character_reference = codepoint_to_utf8_bytes( $codepoint );

            if ( '�' === $character_reference && 0xFFFD !== $codepoint ) {
                /*
                 * Stop processing if we got an invalid character AND the reference does not
                 * specifically refer code point FFFD (�).
                 *
                 * > It is a fatal error when an XML processor encounters an entity with an
                 * > encoding that it is unable to process. It is a fatal error if an XML entity
                 * > is determined (via default, encoding declaration, or higher-level protocol)
                 * > to be in a certain encoding but contains byte sequences that are not legal
                 * > in that encoding. Specifically, it is a fatal error if an entity encoded in
                 * > UTF-8 contains any ill-formed code unit sequences, as defined in section
                 * > 3.9 of Unicode [Unicode]. Unless an encoding is determined by a higher-level
                 * > protocol, it is also a fatal error if an XML entity contains no encoding
                 * > declaration and its content is not legal UTF-8 or UTF-16.
                 *
                 * See https://www.w3.org/TR/xml/#charencoding
                 */
                // @todo This is an error. Treat as plaintext and continue, which is wrong.
                continue;
            }

            /*
             * Copy the plaintext that precedes the ampersand, then the decoded character.
             * The span must be measured up to the ampersand itself; measuring up to the
             * search start would drop the text between the two.
             */
            $decoded .= substr( $text, $was_at, $ampersand_at - $was_at ) . $character_reference;
            $at       = $semi_at + 1;
            $was_at   = $at;
        }

        // Nothing was decoded: hand back the input untouched.
        if ( 0 === $was_at ) {
            return $text;
        }

        if ( $was_at < $end ) {
            $decoded .= substr( $text, $was_at, $end - $was_at );
        }

        return $decoded;
    }

    /**
     * Finds and parses the next entity in a given text starting after the
     * given byte offset, and being entirely found within the given max length.
     *
     * NOTE: This is currently an unimplemented stub. It scans for ampersands
     * but never parses anything, so it always returns `null` and never
     * assigns `$entity_at`.
     *
     * @since {WP_VERSION}
     *
     * @todo Implement this function.
     *
     * @param string   $text                 Text in which to search for an XML entity.
     * @param int      $starting_byte_offset Start looking after this byte offset.
     * @param int      $ending_byte_offset   Stop looking if entity is not fully contained before this byte offset.
     * @param int|null $entity_at            Optional. If provided, will be set to byte offset where entity was
     *                                       found, if found. Otherwise, will not be set.
     *
     * @return string|null Parsed entity, if parsed, otherwise `null`.
     */
    public static function next_entity( string $text, int $starting_byte_offset, int $ending_byte_offset, ?int &$entity_at = null ): ?string {
        $at  = $starting_byte_offset;
        $end = $ending_byte_offset;

        while ( $at < $end ) {
            $remaining = $end - $at;
            $amp_after = strcspn( $text, '&', $at, $remaining );

            // There are no more possible entities.
            if ( $amp_after === $remaining ) {
                return null;
            }

            /*
             * @todo Move the decoding logic from `decode()` above into here,
             *       then call this function in a loop from `decode()`.
             *
             * Until then, skip past the ampersand that was just found rather
             * than advancing a single byte and rescanning the same span.
             */
            $at += (int) $amp_after + 1;
        }

        return null;
    }
}