PHP code example of sjorek / unicode-normalization

1. Go to this page and download the library: Download sjorek/unicode-normalization library. Choose the download type "require".

2. Extract the ZIP file and open the index.php.

3. Add this code to the index.php.
    
        
<?php
// Load Composer's autoloader relative to this file, so the lookup does not
// depend on the include_path or on the directory the script is started from.
require_once __DIR__ . '/vendor/autoload.php';

/* Start to develop here. Best regards https://php-download.com/ */

    

sjorek / unicode-normalization example snippets




/**
 * Class for normalizing unicode.
 *
 *    “Normalization: A process of removing alternate representations of equivalent
 *    sequences from textual data, to convert the data into a form that can be
 *    binary-compared for equivalence. In the Unicode Standard, normalization refers
 *    specifically to processing to ensure that canonical-equivalent (and/or
 *    compatibility-equivalent) strings have unique representations.”
 *
 *     -- quoted from unicode glossary linked below
 *
 * This listing is an API summary: members are shown as bare declarations
 * without bodies, so it documents the public surface rather than being
 * loadable PHP on its own.
 *
 * @see http://www.unicode.org/glossary/#normalization
 * @see http://www.php.net/manual/en/class.normalizer.php
 * @see http://www.w3.org/wiki/I18N/CanonicalNormalization
 * @see http://www.w3.org/TR/charmod-norm/
 * @see http://blog.whatwg.org/tag/unicode
 * @see http://en.wikipedia.org/wiki/Unicode_equivalence
 * @see http://stackoverflow.com/questions/7931204/what-is-normalized-utf-8-all-about
 */
class Sjorek\UnicodeNormalization\Normalizer
    implements Sjorek\UnicodeNormalization\Implementation\NormalizerInterface
{

    /**
     * Constructor.
     *
     * @param null|bool|int|string $form (optional) Set normalization form, default: NFC
     *
     * Besides the normalization form class constants defined below,
     * the following case-insensitive aliases are supported:
     * <pre>
     * - Disable unicode-normalization     : 0,  false, null, empty
     * - Ignore/skip unicode-normalization : 1,  NONE, true, binary, default, validate
     * - Normalization form D              : 2,  NFD, FORM_D, D, form-d, decompose, collation
     * - Normalization form D (mac)        : 18, NFD_MAC, FORM_D_MAC, D_MAC, form-d-mac, d-mac, mac
     * - Normalization form KD             : 3,  NFKD, FORM_KD, KD, form-kd
     * - Normalization form C              : 4,  NFC, FORM_C, C, form-c, compose, recompose, legacy, html5
     * - Normalization form KC             : 5,  NFKC, FORM_KC, KC, form-kc, matching
     * </pre>
     *
     * Hints:
     * <pre>
     * - The W3C recommends NFC for HTML5 Output.
     * - Mac OS X's HFS+ filesystem uses a NFD variant to store paths. We provide one implementation for this
     *   special variant, but plain NFD works in most cases too. Even if you use something else than NFD or its
     *   variant HFS+ will always use decomposed NFD path-strings if needed.
     * </pre>
     */
    public function __construct($form = null);

    /**
     * Ignore any decomposition/composition.
     *
     * Ignoring decomposition/composition means nothing is automatically normalized.
     * Many Linux- and BSD-filesystems do not normalize paths and filenames, but treat them as binary data.
     * Apple™'s APFS filesystem treats paths and filenames as binary data.
     *
     * @var int
     */
    const NONE = 1;

    /**
     * Canonical decomposition.
     *
     *    “A normalization form that erases any canonical differences, and produces a
     *    decomposed result. For example, ä is converted to a + umlaut in this form.
     *    This form is most often used in internal processing, such as in collation.”
     *
     *    -- quoted from unicode glossary linked below
     *
     * @var int
     *
     * @see http://www.unicode.org/glossary/#normalization_form_d
     * @see https://developer.apple.com/library/content/qa/qa1173/_index.html
     * @see https://developer.apple.com/library/content/qa/qa1235/_index.html
     */
    const NFD = 2;

    /**
     * Compatibility decomposition.
     *
     *    “A normalization form that erases both canonical and compatibility differences,
     *    and produces a decomposed result: for example, the single dž character is
     *    converted to d + z + caron in this form.”
     *
     *    -- quoted from unicode glossary linked below
     *
     * @var int
     *
     * @see http://www.unicode.org/glossary/#normalization_form_kd
     */
    const NFKD = 3;

    /**
     * Canonical decomposition followed by canonical composition.
     *
     *    “A normalization form that erases any canonical differences, and generally produces
     *    a composed result. For example, a + umlaut is converted to ä in this form. This form
     *    most closely matches legacy usage.”
     *
     *    -- quoted from unicode glossary linked below
     *
     * W3C recommends NFC for HTML5 output.
     *
     * @var int
     *
     * @see http://www.unicode.org/glossary/#normalization_form_c
     */
    const NFC = 4;

    /**
     * Compatibility decomposition followed by canonical composition.
     *
     * Listed in the constructor documentation as "Normalization form KC" with the
     * value 5 and the alias "matching".
     *
     * @var int
     *
     * @see http://www.unicode.org/glossary/#normalization_form_kc
     */
    const NFKC = 5;

    /**
     * Canonical decomposition, Mac OS X HFS+ variant.
     *
     * Listed in the constructor documentation as "Normalization form D (mac)" with
     * the value 18; it is the NFD variant HFS+ uses to store paths.
     *
     * @var int
     */
    const NFD_MAC = 18;

    /**
     * Normalizes the $input provided to the default normalization form and returns the result.
     *
     * NOTE(review): signature inferred from the usage example
     * (`$nfc->normalize($string) === $string`); confirm against the library source.
     *
     * @param string $input the string to normalize
     *
     * @return null|string Normalized string; presumably null if an error occurred,
     *                     as documented for normalizeTo() — confirm
     */
    public function normalize($input);

    /**
     * Checks whether the $input provided is already in the given or default normalization form.
     *
     * According to the usage example, form NONE is never reported as normalized.
     *
     * NOTE(review): signature inferred from the usage example
     * (`$normalizer->isNormalized($string, Normalizer::NFKD)`); confirm against the library source.
     *
     * @param string $input the string to check
     * @param int    $form  (optional) normalization form to check against, overriding the default
     *
     * @return bool
     */
    public function isNormalized($input, $form = null);

    /**
     * Normalizes the $input provided to the given or default $form and returns the normalized string.
     *
     * Calls underlying implementation even if given $form is NONE, but finally it normalizes only if needed.
     *
     * @param string $input the string to normalize
     * @param int    $form  (optional) normalization form to use, overriding the default
     *
     * @throws \Sjorek\UnicodeNormalization\Exception\InvalidNormalizationForm
     *
     * @return null|string Normalized string or null if an error occurred
     */
    public function normalizeTo($input, $form = null);

    /**
     * Normalizes the $input provided to the given or default $form and returns the normalized string.
     *
     * Does not call underlying implementation if given normalization is NONE and normalizes only if needed.
     *
     * @param string $input the string to normalize
     * @param int    $form  (optional) normalization form to use, overriding the default
     *
     * @throws \Sjorek\UnicodeNormalization\Exception\InvalidNormalizationForm
     *
     * @return null|string Normalized string or null if an error occurred
     */
    public function normalizeStringTo($input, $form = null);

    /**
     * Get the supported unicode version level as version triple ("X.Y.Z").
     *
     * @return string
     */
    public static function getUnicodeVersion();

    /**
     * Get the supported unicode normalization forms as array.
     *
     * @return int[]
     */
    public static function getNormalizationForms();
}



/**
 * Signature sketch for attaching the unicode-normalization stream filter.
 *
 * The square brackets in the call below mark optional arguments; the line is
 * notation describing the accepted arguments, not runnable PHP.
 *
 * @var $stream        resource    The stream to filter.
 * @var $form          string      The form to normalize unicode to.
 * @var $read_write    int         (optional) STREAM_FILTER_* constant to override the filter injection point
 * @var $params        string|int  (optional) A normalization-form alias or value
 *
 * @link http://php.net/manual/en/function.stream-filter-append.php
 * @link http://php.net/manual/en/function.stream-filter-prepend.php
 */
stream_filter_append($stream, "convert.unicode-normalization.$form"[, $read_write[, $params]]);



use Sjorek\UnicodeNormalization\Normalizer;

// Sample text made of precomposed umlauts.
$sample = 'äöü';

// One normalizer per form under demonstration.
$passThrough = new Normalizer(Normalizer::NONE);
$composed = new Normalizer();                 // no argument: default form (NFC)
$decomposed = new Normalizer(Normalizer::NFD);
$compatComposed = new Normalizer('matching'); // 'matching' is an alias of NFKC

// Results are gathered in call order and dumped in one go further down.
$results = [
    // false: form NONE never counts as normalized
    $passThrough->isNormalized($sample),

    // true: NFC is the default for utf8 in the web
    $composed->isNormalized($sample),

    // false
    $decomposed->isNormalized($sample),

    // false
    $compatComposed->isNormalized($sample),

    // false
    $passThrough->isNormalized($sample, Normalizer::NFKD),

    // true
    $passThrough->normalize($sample) === $sample,

    // true
    $composed->normalize($sample) === $sample,

    // false
    $decomposed->normalize($sample) === $sample,

    // true: only combined characters (two or more letters in one character,
    // like the single dž character) are decomposed (for faster matching)
    $compatComposed->normalize($sample) === $sample,

    Normalizer::getUnicodeVersion(),
    Normalizer::getNormalizationForms(),
];

var_dump(...$results);




// Copy a UTF-8 file to a new file while normalizing its content to NFC.
$in_file = fopen('utf8-file.txt', 'r');
if ($in_file === false) {
    throw new \RuntimeException('Unable to open "utf8-file.txt" for reading.');
}

$out_file = fopen('utf8-normalized-to-nfc-file.txt', 'w');
if ($out_file === false) {
    // Do not leak the already opened input handle.
    fclose($in_file);
    throw new \RuntimeException('Unable to open "utf8-normalized-to-nfc-file.txt" for writing.');
}

try {
    // It works as a read filter:
    // stream_filter_append() returns false when the filter cannot be attached;
    // without this check the copy below would silently run un-normalized.
    if (stream_filter_append($in_file, 'convert.unicode-normalization.NFC') === false) {
        throw new \RuntimeException('Unable to attach the "convert.unicode-normalization.NFC" stream filter.');
    }

    // Normalization form may be given as fourth parameter:
    // stream_filter_append($in_file, 'convert.unicode-normalization', null, 'NFC');

    // And it also works as a write filter:
    // stream_filter_append($out_file, 'convert.unicode-normalization.NFC');

    if (stream_copy_to_stream($in_file, $out_file) === false) {
        throw new \RuntimeException('Copying the normalized stream failed.');
    }
} finally {
    // Always release both handles, whether the copy succeeded or not.
    fclose($in_file);
    fclose($out_file);
}
bash
php composer.phar