HEX

File: /var/www/biblioteka/wp-content/plugins/wordpress-importer/php-toolkit/Encoding/utf8-decoder.php
<?php

namespace WordPress\Encoding;

/*
 * UTF-8 decoding pipeline by Dennis Snell (@dmsnell), originally
 * proposed in https://github.com/WordPress/wordpress-develop/pull/6883.
 *
 * It enables parsing XML documents with incomplete UTF-8 byte sequences
 * without crashing or depending on the mbstring extension.
 */

// Decoder state meaning "a complete code point was just decoded; ready for the next one".
defined( 'UTF8_DECODER_ACCEPT' ) || define( 'UTF8_DECODER_ACCEPT', 0 );

// Decoder state meaning "the bytes seen so far cannot be valid UTF-8".
defined( 'UTF8_DECODER_REJECT' ) || define( 'UTF8_DECODER_REJECT', 1 );

/**
 * Reports whether a string, from a given byte offset onward, is well-formed UTF-8.
 *
 * Every byte is fed through the UTF-8 decoder state machine. Validation stops at
 * the first byte the decoder rejects. Input that ends in the middle of a
 * multi-byte sequence is also treated as invalid. Encoded surrogate halves are
 * not valid UTF-8 and are rejected.
 *
 * Example:
 *
 *     true  === utf8_is_valid_byte_stream( 'Hello, World! 🌎' );
 *
 *     false === utf8_is_valid_byte_stream( "Latin1 is n\xF6t valid UTF-8.", 0, $error_at );
 *     11    === $error_at;
 *
 *     false === utf8_is_valid_byte_stream( "Surrogate halves like '\xED\xA0\x80' are not permitted.", 0, $error_at );
 *     23    === $error_at;
 *
 *     false === utf8_is_valid_byte_stream( "Broken stream: \xC2\xC2", 0, $error_at );
 *     15    === $error_at;
 *
 * @param  string   $bytes  Text to validate as UTF-8 bytes.
 * @param  int      $starting_byte  Byte offset in string where decoding should begin.
 * @param  int|null $first_error_byte_at  Optional. When validation fails, receives the byte
 *                                     offset at which the offending byte sequence started.
 *                                     Left untouched when the input is valid.
 *
 * @return bool Whether the given byte stream represents valid UTF-8.
 * @since {WP_VERSION}
 */
function utf8_is_valid_byte_stream( string $bytes, int $starting_byte = 0, ?int &$first_error_byte_at = null ): bool {
	$byte_length    = strlen( $bytes );
	$decoder_state  = UTF8_DECODER_ACCEPT;
	$sequence_start = $starting_byte;
	$cursor         = $starting_byte;

	while ( $cursor < $byte_length && UTF8_DECODER_REJECT !== $decoder_state ) {
		// A new code point begins whenever the previous one was fully accepted.
		if ( UTF8_DECODER_ACCEPT === $decoder_state ) {
			$sequence_start = $cursor;
		}

		$decoder_state = utf8_decoder_apply_byte( $bytes[ $cursor ], $decoder_state );
		++$cursor;
	}

	// Anything other than ACCEPT means a rejected byte or a truncated sequence.
	if ( UTF8_DECODER_ACCEPT !== $decoder_state ) {
		$first_error_byte_at = $sequence_start;

		return false;
	}

	return true;
}

/**
 * Counts the code points in a UTF-8 string, the way `strlen()` counts bytes.
 *
 * Counting stops at the first byte sequence that fails to decode. In that case,
 * and also when the input ends in the middle of a multi-byte sequence, the byte
 * offset where the offending sequence started is reported through
 * `$first_error_byte_at`, and the number of code points decoded up to that point
 * is returned.
 *
 * @param  string   $bytes  Text for which to count code points.
 * @param  int|null $first_error_byte_at  Optional. Receives the byte offset at which the
 *                                     first invalid or incomplete sequence started.
 *                                     Left untouched when the whole input decodes.
 *
 * @return int How many code points were decoded in the given byte stream before an error
 *             or before reaching the end of the string.
 * @since {WP_VERSION}
 */
function utf8_codepoint_count( string $bytes, ?int &$first_error_byte_at = null ): int {
	$byte_length    = strlen( $bytes );
	$decoder_state  = UTF8_DECODER_ACCEPT;
	$sequence_start = 0;
	$total          = 0;
	$scratch        = 0; // Receives the decoded value; only the state is needed here.
	$offset         = 0;

	while ( $offset < $byte_length ) {
		// Remember where each code point starts so an error can be located.
		if ( UTF8_DECODER_ACCEPT === $decoder_state ) {
			$sequence_start = $offset;
		}

		$decoder_state = utf8_decoder_apply_byte( $bytes[ $offset ], $decoder_state, $scratch );

		if ( UTF8_DECODER_REJECT === $decoder_state ) {
			break;
		}

		if ( UTF8_DECODER_ACCEPT === $decoder_state ) {
			++$total;
		}

		++$offset;
	}

	if ( UTF8_DECODER_ACCEPT === $decoder_state ) {
		return $total;
	}

	$first_error_byte_at = $sequence_start;

	return $total;
}

/**
 * Inner loop for a number of UTF-8 decoding-related functions.
 *
 * You probably don't need this! This is highly-specific and optimized
 * code for UTF-8 operations used in other functions.
 *
 * Feeds a single byte into a table-driven UTF-8 decoder and returns the
 * decoder's next state.
 *
 * @see http://bjoern.hoehrmann.de/utf-8/decoder/dfa/
 *
 * @since {WP_VERSION}
 *
 * @access private
 *
 * @param  string $byte  Next byte to be applied in UTF-8 decoding or validation.
 * @param  int    $state  UTF-8 decoding state, one of the following values:
 *                        - `UTF8_DECODER_ACCEPT`: Decoder is ready for a new code point.
 *                        - `UTF8_DECODER_REJECT`: An error has occurred.
 *                        - Any other positive value: Decoder is waiting for additional bytes.
 * @param  int    $codepoint  Optional. If provided, will accumulate the decoded code point as
 *                            each byte is processed. Only meaningful once the returned state
 *                            is `UTF8_DECODER_ACCEPT`; otherwise it holds partial or unusable data.
 *
 * @return int Next decoder state after processing the current byte.
 */
function utf8_decoder_apply_byte( string $byte, int $state, int &$codepoint = 0 ): int {
	/**
	 * State classification and transition table for UTF-8 validation.
	 *
	 * > The first part of the table maps bytes to character classes
	 * > to reduce the size of the transition table and create bitmasks.
	 * >
	 * > The second part is a transition table that maps a combination
	 * > of a state of the automaton and a character class to a state.
	 *
	 * The first 256 entries are the character class of each byte value. The
	 * remaining entries are the transition rows, 16 entries per state, so every
	 * character class must be below 16. In particular the bytes 0xE0 and 0xF0
	 * belong to classes 10 (0x0A) and 11 (0x0B): writing them as 0x10 and 0x11
	 * would index into the following state's row and reject every sequence
	 * starting with those bytes.
	 *
	 * @see http://bjoern.hoehrmann.de/utf-8/decoder/dfa/
	 */
	static $state_table = (
		"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" .
		"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" .
		"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" .
		"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" .
		"\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x09\x09\x09\x09\x09\x09\x09\x09\x09\x09\x09\x09\x09\x09\x09\x09" .
		"\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07" .
		"\x08\x08\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02" .
		"\x0A\x03\x03\x03\x03\x03\x03\x03\x03\x03\x03\x03\x03\x04\x03\x03" .
		"\x0B\x06\x06\x06\x05\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08" .
		"\x00\x01\x02\x03\x05\x08\x07\x01\x01\x01\x04\x06\x01\x01\x01\x01" .
		"\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x00\x01\x01\x01\x01\x01\x00\x01\x00\x01\x01\x01\x01\x01\x01" .
		"\x01\x02\x01\x01\x01\x01\x01\x02\x01\x02\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x02\x01\x01\x01\x01\x01\x01\x01\x01" .
		"\x01\x02\x01\x01\x01\x01\x01\x01\x01\x02\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x03\x01\x03\x01\x01\x01\x01\x01\x01" .
		"\x01\x03\x01\x01\x01\x01\x01\x03\x01\x03\x01\x01\x01\x01\x01\x01\x01\x03\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01"
	);

	$byte = ord( $byte );
	$type = ord( $state_table[ $byte ] );

	/*
	 * At the start of a code point the character class doubles as a shift that
	 * masks off the length-marker bits of the lead byte. Every later byte
	 * contributes its low six bits.
	 */
	$codepoint = ( UTF8_DECODER_ACCEPT === $state )
		? ( ( 0xFF >> $type ) & $byte )
		: ( ( $byte & 0x3F ) | ( $codepoint << 6 ) );

	// Transition rows start after the 256 class entries and are 16 entries wide.
	return ord( $state_table[ 256 + ( $state * 16 ) + $type ] );
}

/**
 * Extract a slice of a text by code point, where invalid byte sequences count
 * as a single code point, U+FFFD (the Unicode replacement character `�`).
 *
 * A run of invalid bytes is replaced by one U+FFFD. A byte that interrupts an
 * unfinished multi-byte sequence is not discarded along with it: it is decoded
 * again as the potential start of the next code point. An incomplete sequence at
 * the very end of the input is not emitted.
 *
 * This function does not permit passing negative indices and will return
 * the original string if such are provided.
 *
 * @param  string   $text  Input text from which to extract.
 * @param  int      $from  Start extracting after this many code points.
 * @param  int|null $length  Optional. Extract at most this many code points.
 *                           When null, extracts through the end of the text.
 *
 * @return string Extracted slice of input string.
 */
function utf8_substr( string $text, int $from = 0, ?int $length = null ): string {
	if ( $from < 0 || ( null !== $length && $length < 0 ) ) {
		return $text;
	}

	$position_in_input = 0;
	$codepoint_at      = 0;
	$end_byte          = strlen( $text );
	$buffer            = '';
	$seen_codepoints   = 0;
	$sliced_codepoints = 0;
	$decoder_state     = UTF8_DECODER_ACCEPT;

	/*
	 * The limit applies to the code points placed in the slice, not to the ones
	 * skipped before `$from`. A null length means "no limit".
	 */
	while ( $position_in_input < $end_byte && ( null === $length || $sliced_codepoints < $length ) ) {
		$decoder_state = utf8_decoder_apply_byte( $text[ $position_in_input ], $decoder_state );

		if ( UTF8_DECODER_ACCEPT === $decoder_state ) {
			++$position_in_input;

			if ( $seen_codepoints >= $from ) {
				++$sliced_codepoints;
				$buffer .= substr( $text, $codepoint_at, $position_in_input - $codepoint_at );
			}

			++$seen_codepoints;
			$codepoint_at = $position_in_input;
		} elseif ( UTF8_DECODER_REJECT === $decoder_state ) {
			// The replacement character is a code point like any other: honor `$from` and `$length`.
			if ( $seen_codepoints >= $from ) {
				++$sliced_codepoints;
				// "\u{FFFD}" is not supported in PHP 5.6.
				$buffer .= "\xEF\xBF\xBD";
			}

			/*
			 * If the rejected byte interrupted an unfinished sequence it may itself
			 * start a valid code point, so it is examined again. If it was the first
			 * byte of the sequence it is invalid on its own and is consumed.
			 */
			$resume_at = $position_in_input > $codepoint_at
				? $position_in_input
				: $position_in_input + 1;

			// Skip to the start of the next code point without reading past the end.
			while (
				$resume_at < $end_byte &&
				UTF8_DECODER_REJECT === utf8_decoder_apply_byte( $text[ $resume_at ], UTF8_DECODER_ACCEPT )
			) {
				++$resume_at;
			}

			++$seen_codepoints;
			$position_in_input = $resume_at;
			$codepoint_at      = $resume_at;
			$decoder_state     = UTF8_DECODER_ACCEPT;
		} else {
			// In the middle of a multi-byte sequence: keep consuming bytes.
			++$position_in_input;
		}
	}

	return $buffer;
}

/**
 * Extract a Unicode code point from a specific byte offset in text.
 * An invalid byte sequence counts as a single code point, U+FFFD
 * (the Unicode replacement character `�`).
 *
 * When the byte at `$byte_offset` is itself invalid, it is consumed, so that
 * `$matched_bytes` is at least 1 whenever U+FFFD is returned. When a multi-byte
 * sequence is interrupted by an unexpected byte, only the bytes before the
 * interrupting byte are consumed.
 *
 * This function does not permit passing negative indices and will return
 * null if such are provided. In that case `$matched_bytes` is left untouched.
 *
 * @param  string $text  Input text from which to extract.
 * @param  int    $byte_offset  Start at this byte offset in the input text.
 * @param  int    $matched_bytes  Receives how many bytes were consumed while decoding.
 *
 * @return int|null Unicode code point, or null when the offset is negative, lies at or
 *                  beyond the end of the text, or the text ends before the multi-byte
 *                  sequence is complete.
 */
function utf8_codepoint_at( string $text, int $byte_offset = 0, &$matched_bytes = 0 ) {
	if ( $byte_offset < 0 ) {
		return null;
	}

	$position_in_input = $byte_offset;
	$end_byte          = strlen( $text );
	$codepoint         = null;
	$decoder_state     = UTF8_DECODER_ACCEPT;

	// Feed bytes to the decoder until it settles on a code point or an error.
	while ( $position_in_input < $end_byte ) {
		$decoder_state = utf8_decoder_apply_byte( $text[ $position_in_input ], $decoder_state );

		if ( UTF8_DECODER_REJECT === $decoder_state ) {
			/*
			 * A byte that is invalid on its own must be consumed. Otherwise
			 * `$matched_bytes` would be zero and a caller advancing by it
			 * would never move past this offset.
			 */
			if ( $position_in_input === $byte_offset ) {
				++$position_in_input;
			}

			// U+FFFD, the replacement character.
			$codepoint = 0xFFFD;
			break;
		}

		++$position_in_input;

		if ( UTF8_DECODER_ACCEPT === $decoder_state ) {
			$codepoint = utf8_ord( substr( $text, $byte_offset, $position_in_input - $byte_offset ) );
			break;
		}
	}

	$matched_bytes = $position_in_input - $byte_offset;

	return $codepoint;
}

/**
 * Convert a UTF-8 byte sequence to its Unicode code point.
 *
 * The sequence length alone selects the decoding formula, and the bytes are not
 * validated. Input that is empty or longer than four bytes yields 0.
 *
 * @param  string $character  UTF-8 encoded byte sequence representing a single Unicode character.
 *
 * @return int Unicode code point.
 */
function utf8_ord( string $character ): int {
	switch ( strlen( $character ) ) {
		case 1:
			// Single byte: the byte value is the code point.
			return ord( $character[0] );

		case 2:
			// 110xxxxx 10xxxxxx
			return ( ( ord( $character[0] ) & 0x1F ) << 6 )
				| ( ord( $character[1] ) & 0x3F );

		case 3:
			// 1110xxxx 10xxxxxx 10xxxxxx
			return ( ( ord( $character[0] ) & 0x0F ) << 12 )
				| ( ( ord( $character[1] ) & 0x3F ) << 6 )
				| ( ord( $character[2] ) & 0x3F );

		case 4:
			// 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
			return ( ( ord( $character[0] ) & 0x07 ) << 18 )
				| ( ( ord( $character[1] ) & 0x3F ) << 12 )
				| ( ( ord( $character[2] ) & 0x3F ) << 6 )
				| ( ord( $character[3] ) & 0x3F );

		default:
			return 0;
	}
}