PHP code example of sjorek / unicode-normalization
1. Go to the download page for the sjorek/unicode-normalization library and choose the download type "require".
2. Extract the ZIP file and open the index.php.
3. Add this code to the index.php.
<?php
// Pull in the autoloader from the vendor directory next to this script.
require_once 'vendor/autoload.php';
/* Start to develop here. Best regards https://php-download.com/ */
sjorek / unicode-normalization example snippets
/**
* Class for normalizing unicode.
*
* “Normalization: A process of removing alternate representations of equivalent
* sequences from textual data, to convert the data into a form that can be
* binary-compared for equivalence. In the Unicode Standard, normalization refers
* specifically to processing to ensure that canonical-equivalent (and/or
* compatibility-equivalent) strings have unique representations.”
*
* -- quoted from unicode glossary linked below
*
* NOTE(review): this is an API listing (method declarations without bodies), not the
* implementation itself.
*
* @see http://www.unicode.org/glossary/#normalization
* @see http://www.php.net/manual/en/class.normalizer.php
* @see http://www.w3.org/wiki/I18N/CanonicalNormalization
* @see http://www.w3.org/TR/charmod-norm/
* @see http://blog.whatwg.org/tag/unicode
* @see http://en.wikipedia.org/wiki/Unicode_equivalence
* @see http://stackoverflow.com/questions/7931204/what-is-normalized-utf-8-all-about
* @see http://php.net/manual/en/class.normalizer.php
*/
class Sjorek\UnicodeNormalization\Normalizer
implements Sjorek\UnicodeNormalization\Implementation\NormalizerInterface
{
/**
* Constructor.
*
* @param null|bool|int|string $form (optional) Set normalization form, default: NFC
*
* Besides the normalization form class constants defined below,
* the following case-insensitive aliases are supported:
* <pre>
* - Disable unicode-normalization : 0, false, null, empty
* - Ignore/skip unicode-normalization : 1, NONE, true, binary, default, validate
* - Normalization form D : 2, NFD, FORM_D, D, form-d, decompose, collation
* - Normalization form D (mac) : 18, NFD_MAC, FORM_D_MAC, D_MAC, form-d-mac, d-mac, mac
* - Normalization form KD : 3, NFKD, FORM_KD, KD, form-kd
* - Normalization form C : 4, NFC, FORM_C, C, form-c, compose, recompose, legacy, html5
* - Normalization form KC : 5, NFKC, FORM_KC, KC, form-kc, matching
* </pre>
*
* Hints:
* <pre>
* - The W3C recommends NFC for HTML5 output.
* - Mac OS X's HFS+ filesystem uses an NFD variant to store paths. We provide one implementation for this
* special variant, but plain NFD works in most cases too. Even if you use something other than NFD or its
* variant, HFS+ will always use decomposed NFD path-strings if needed.
* </pre>
*/
public function __construct($form = null);
/**
* Ignore any decomposition/composition.
*
* Ignoring decomposition/composition means nothing is automatically normalized.
* Many Linux- and BSD-filesystems do not normalize paths and filenames, but treat them as binary data.
* Apple™'s APFS filesystem treats paths and filenames as binary data.
*
* @var int
*/
const NONE = 1;
/**
* Canonical decomposition.
*
* “A normalization form that erases any canonical differences, and produces a
* decomposed result. For example, ä is converted to a + umlaut in this form.
* This form is most often used in internal processing, such as in collation.”
*
* -- quoted from unicode glossary linked below
*
* @var int
*
* @see http://www.unicode.org/glossary/#normalization_form_d
* @see https://developer.apple.com/library/content/qa/qa1173/_index.html
* @see https://developer.apple.com/library/content/qa/qa1235/_index.html
*/
const NFD = 2;
/**
* Compatibility decomposition.
*
* “A normalization form that erases both canonical and compatibility differences,
* and produces a decomposed result: for example, the single dž character is
* converted to d + z + caron in this form.”
*
* -- quoted from unicode glossary linked below
*
* @var int
*
* @see http://www.unicode.org/glossary/#normalization_form_kd
*/
const NFKD = 3;
/**
* NOTE(review): this listing appears to be truncated at this point. The text below starts as
* the description of normalization form C (NFC; value 4 according to the constructor
* documentation above) and then runs straight into the description of normalizeTo(). The
* declarations that belong in between are not visible here.
*
* Canonical decomposition followed by canonical composition.
*
* “A normalization form that erases any canonical differences, and generally produces
* a composed result. For example, a + umlaut is converted to ä in this form. This form
* most closely matches legacy usage.”
*
* -- quoted from unicode glossary (the link itself is not visible in this listing)
*
* W3C recommends NFC for HTML5 output.
*
* normalizeTo(): normalizes the given $input to the given or default $form. Presumably calls
* the underlying implementation even if given $form is NONE, but finally it normalizes only
* if needed (sentence reconstructed from a truncated fragment — TODO confirm against the
* library source).
*
* @param string $input the string to normalize
* @param int $form (optional) normalization form to use, overriding the default
*
* @throws \Sjorek\UnicodeNormalization\Exception\InvalidNormalizationForm
*
* @return null|string Normalized string or null if an error occurred
*/
public function normalizeTo($input, $form = null);
/**
* Normalizes the $input provided to the given or default $form and returns the normalized string.
*
* Does not call underlying implementation if given normalization is NONE and normalizes only if needed.
*
* @param string $input the string to normalize
* @param int $form (optional) normalization form to use, overriding the default
*
* @throws \Sjorek\UnicodeNormalization\Exception\InvalidNormalizationForm
*
* @return null|string Normalized string or null if an error occurred
*/
public function normalizeStringTo($input, $form = null);
/**
* Get the supported unicode version level as version triple ("X.Y.Z").
*
* @return string
*/
public static function getUnicodeVersion();
/**
* Get the supported unicode normalization forms as array.
*
* @return int[]
*/
public static function getNormalizationForms();
}
/**
* Signature sketch for attaching the unicode-normalization stream filter to a stream.
*
* The square brackets mark the optional arguments (manual-style notation); the line below is
* not valid PHP as written and only illustrates the call shape.
*
* @var $stream resource The stream to filter.
* @var $form string The form to normalize unicode to.
* @var $read_write int (optional) STREAM_FILTER_* constant to override the filter injection point
* @var $params string|int (optional) A normalization-form alias or value
*
* @link http://php.net/manual/en/function.stream-filter-append.php
* @link http://php.net/manual/en/function.stream-filter-prepend.php
*/
stream_filter_append($stream, "convert.unicode-normalization.$form"[, $read_write[, $params]]);
use Sjorek\UnicodeNormalization\Normalizer;

// Sample text the checks below are run against.
$string = 'äöü';

// One normalizer per form: NONE, the default (NFC), NFD, and NFKC via its "matching" alias.
$normalizer = new Normalizer(Normalizer::NONE);
$nfc = new Normalizer();
$nfd = new Normalizer(Normalizer::NFD);
$nfkc = new Normalizer('matching');

// Collect every result first, in the order it is dumped; the expected values are those
// stated by the original example.
$results = [
    // false: form NONE never counts as normalized
    $normalizer->isNormalized($string),
    // true: NFC is the default for utf8 in the web
    $nfc->isNormalized($string),
    // false
    $nfd->isNormalized($string),
    // false
    // NOTE(review): normalize() with the same form is stated further down to leave the
    // string unchanged, which looks inconsistent with "false" here — confirm.
    $nfkc->isNormalized($string),
    // false: the form passed per call overrides the normalizer's own form
    $normalizer->isNormalized($string, Normalizer::NFKD),
    // true
    $normalizer->normalize($string) === $string,
    // true
    $nfc->normalize($string) === $string,
    // false
    $nfd->normalize($string) === $string,
    // true: only combined characters (two or more letters in one character, like the
    // single dž character) are decomposed (for faster matching)
    $nfkc->normalize($string) === $string,
    // supported unicode version, as "X.Y.Z"
    Normalizer::getUnicodeVersion(),
    // supported normalization forms
    Normalizer::getNormalizationForms(),
];
var_dump(...$results);
// Copies 'utf8-file.txt' to 'utf8-normalized-to-nfc-file.txt', normalizing the content to NFC
// by means of the 'convert.unicode-normalization' stream filter.
$in_file = fopen('utf8-file.txt', 'r');
if ($in_file === false) {
    throw new \RuntimeException('Unable to open "utf8-file.txt" for reading.');
}
$out_file = fopen('utf8-normalized-to-nfc-file.txt', 'w');
if ($out_file === false) {
    // Do not leak the already opened input handle.
    fclose($in_file);
    throw new \RuntimeException('Unable to open "utf8-normalized-to-nfc-file.txt" for writing.');
}
try {
    // It works as a read filter:
    // stream_filter_append() returns false on failure; without this check the copy below
    // would silently write non-normalized data.
    if (stream_filter_append($in_file, 'convert.unicode-normalization.NFC') === false) {
        throw new \RuntimeException('Unable to attach the "convert.unicode-normalization.NFC" stream filter.');
    }
    // Normalization form may be given as fourth parameter:
    // stream_filter_append($in_file, 'convert.unicode-normalization', null, 'NFC');
    // And it also works as a write filter:
    // stream_filter_append($out_file, 'convert.unicode-normalization.NFC');
    if (stream_copy_to_stream($in_file, $out_file) === false) {
        throw new \RuntimeException('Copying the normalized stream failed.');
    }
} finally {
    // Always release both handles; closing the output also flushes pending writes.
    fclose($in_file);
    fclose($out_file);
}
bash
php composer.phar
Loading please wait ...
Before you can download the PHP files, the dependencies should be resolved. This can take some minutes. Please be patient.