class PhpTransliteration
Implements transliteration without using the PECL extensions.
Transliterations are done character-by-character, by looking up non-US-ASCII characters in a transliteration database.
The database comes from two types of files, both of which are searched for in the PhpTransliteration::$dataDirectory directory. First, language-specific overrides are searched (see PhpTransliteration::readLanguageOverrides()). If there is no language-specific override for a character, the generic transliteration character tables are searched (see PhpTransliteration::readGenericData()). If looking up the character in the generic table results in a NULL value, or an illegal character is encountered, then a substitute character is returned.
Some parts of this code were derived from the MediaWiki project's UtfNormal class, Copyright © 2004 Brion Vibber <brion@pobox.com>, http://www.mediawiki.org/
Hierarchy
- class \Drupal\Component\Transliteration\PhpTransliteration implements \Drupal\Component\Transliteration\TransliterationInterface
 
Expanded class hierarchy of PhpTransliteration
4 files declare their use of PhpTransliteration
- MachineNameControllerTest.php in core/modules/system/tests/src/Unit/Transliteration/MachineNameControllerTest.php
- PhpTransliteration.php in core/lib/Drupal/Core/Transliteration/PhpTransliteration.php
- PhpTransliterationTest.php in core/tests/Drupal/Tests/Component/Transliteration/PhpTransliterationTest.php
- SanitizeNameTest.php in core/modules/file/tests/src/Unit/SanitizeNameTest.php
File
- core/lib/Drupal/Component/Transliteration/PhpTransliteration.php, line 26
Namespace
Drupal\Component\Transliteration
View source
/**
 * Implements transliteration without using the PECL extensions.
 *
 * Text is processed one Unicode character at a time. When transliterating,
 * characters below 0x80 are returned as-is; every other character is looked up
 * first in the language-specific overrides and then in the generic tables.
 * Both kinds of table are loaded on demand from PHP data files located in
 * PhpTransliteration::$dataDirectory.
 */
class PhpTransliteration implements TransliterationInterface {

  /**
   * Directory where data for transliteration resides.
   *
   * The constructor sets this (by default) to subdirectory 'data' underneath
   * the directory where the class's PHP file resides.
   *
   * @var string
   */
  protected $dataDirectory;

  /**
   * Associative array of language-specific character transliteration tables.
   *
   * The outermost array keys are language codes. For each language code key,
   * the value is an array whose keys are Unicode character codes, and whose
   * values are the transliterations of those characters to US-ASCII. This is
   * set up as needed in PhpTransliteration::replace() by calling
   * PhpTransliteration::readLanguageOverrides().
   *
   * @var array
   */
  protected $languageOverrides = [];

  /**
   * Non-language-specific transliteration tables.
   *
   * Array whose keys are "banks": the Unicode character code shifted right by
   * eight bits. Each value is an array of transliterations keyed by the lowest
   * byte of the character code. Banks are loaded as needed in
   * PhpTransliteration::lookupReplacement() by calling
   * PhpTransliteration::readGenericData().
   *
   * @var array
   */
  protected $genericMap = [];

  /**
   * Special characters for ::removeDiacritics().
   *
   * Characters which have accented variants but their base character
   * transliterates to more than one ASCII character require special
   * treatment: we want to remove their accent and use the un-
   * transliterated base character.
   *
   * Keys are the multi-character generic transliterations; values are the
   * unaccented base characters to use instead.
   *
   * @var string[]
   */
  protected $fixTransliterateForRemoveDiacritics = [
    'AE' => 'Æ',
    'ae' => 'æ',
    'ZH' => 'Ʒ',
    'zh' => 'ʒ',
  ];

  /**
   * Constructs a transliteration object.
   *
   * @param string $data_directory
   *   (optional) The directory where data files reside. If omitted, defaults
   *   to subdirectory 'data' underneath the directory where the class's PHP
   *   file resides.
   */
  public function __construct($data_directory = NULL) {
    $this->dataDirectory = $data_directory ?? __DIR__ . '/data';
  }

  /**
   * {@inheritdoc}
   */
  public function removeDiacritics($string) {
    // These two Unicode ranges include the accented US-ASCII letters, with a
    // few characters that aren't accented letters mixed in. The excluded code
    // points never change, so they are defined once, outside the loop.
    $exclusions_range1 = [
      0xd0,
      0xd7,
      0xf0,
      0xf7,
      0x138,
      0x14a,
      0x14b,
    ];
    $exclusions_range2 = [
      0x1dd,
      0x1f7,
      0x21c,
      0x21d,
      0x220,
      0x221,
      0x241,
      0x242,
      0x245,
    ];

    // Ensure the string is valid UTF-8 for preg_split(), exactly as
    // ::transliterate() does. Without this, preg_split() returns FALSE for
    // malformed input and the whole string would be lost.
    $string = mb_convert_encoding($string, 'UTF-8', 'UTF-8');
    $characters = preg_split('//u', $string, 0, PREG_SPLIT_NO_EMPTY);
    if ($characters === FALSE) {
      $characters = [];
    }

    $result = '';
    foreach ($characters as $character) {
      $code = self::ordUTF8($character);
      $in_range1 = $code > 0xbf && $code < 0x17f;
      $in_range2 = $code > 0x1cc && $code < 0x250;

      // Characters outside the ranges, or explicitly excluded, are kept as-is.
      $replacement = $character;
      if (($in_range1 && !in_array($code, $exclusions_range1, TRUE)) || ($in_range2 && !in_array($code, $exclusions_range2, TRUE))) {
        // 'xyz' is a sentinel for "no mapping": it is neither one byte long
        // nor a key of the special-case table, so the character is kept.
        $to_add = $this->lookupReplacement($code, 'xyz');
        if (strlen($to_add) === 1) {
          $replacement = $to_add;
        }
        elseif (isset($this->fixTransliterateForRemoveDiacritics[$to_add])) {
          $replacement = $this->fixTransliterateForRemoveDiacritics[$to_add];
        }
      }
      $result .= $replacement;
    }
    return $result;
  }

  /**
   * {@inheritdoc}
   */
  public function transliterate($string, $langcode = 'en', $unknown_character = '?', $max_length = NULL) {
    $result = '';
    $length = 0;
    $hash = FALSE;
    // Replace question marks with a unique hash if necessary. This is because
    // mb_convert_encoding() replaces all invalid characters with a question
    // mark.
    if ($unknown_character != '?' && str_contains($string, '?')) {
      $hash = hash('sha256', $string);
      $string = str_replace('?', $hash, $string);
    }
    // Ensure the string is valid UTF8 for preg_split(). Unknown characters will
    // be replaced by a question mark.
    $string = mb_convert_encoding($string, 'UTF-8', 'UTF-8');
    // Use the provided unknown character instead of a question mark.
    if ($unknown_character != '?') {
      $string = str_replace('?', $unknown_character, $string);
      // Restore original question marks if necessary.
      if ($hash !== FALSE) {
        $string = str_replace($hash, '?', $string);
      }
    }
    // Split into Unicode characters and transliterate each one.
    foreach (preg_split('//u', $string, 0, PREG_SPLIT_NO_EMPTY) as $character) {
      $code = self::ordUTF8($character);
      if ($code == -1) {
        $to_add = $unknown_character;
      }
      else {
        $to_add = $this->replace($code, $langcode, $unknown_character);
      }
      // Check if this exceeds the maximum allowed length. The length is
      // counted in bytes of the transliterated output, and a replacement is
      // never cut in half: if it does not fit, the result so far is returned.
      if (isset($max_length)) {
        $length += strlen($to_add);
        if ($length > $max_length) {
          // There is no more space.
          return $result;
        }
      }
      $result .= $to_add;
    }
    return $result;
  }

  /**
   * Finds the character code for a UTF-8 character: like ord() but for UTF-8.
   *
   * @param string $character
   *   A single UTF-8 character.
   *
   * @return int
   *   The character code, or -1 if an illegal character is found. An empty
   *   string, or a sequence shorter than its lead byte announces, is treated
   *   as illegal.
   */
  protected static function ordUTF8($character) {
    $available = strlen($character);
    if ($available === 0) {
      // Nothing to decode; avoid reading an undefined string offset.
      return -1;
    }
    $first_byte = ord($character[0]);
    if (($first_byte & 0x80) == 0) {
      // Single-byte form: 0xxxxxxx.
      return $first_byte;
    }
    if (($first_byte & 0xe0) == 0xc0) {
      // Two-byte form: 110xxxxx 10xxxxxx.
      if ($available < 2) {
        return -1;
      }
      return (($first_byte & 0x1f) << 6) + (ord($character[1]) & 0x3f);
    }
    if (($first_byte & 0xf0) == 0xe0) {
      // Three-byte form: 1110xxxx 10xxxxxx 10xxxxxx.
      if ($available < 3) {
        return -1;
      }
      return (($first_byte & 0xf) << 12) + ((ord($character[1]) & 0x3f) << 6) + (ord($character[2]) & 0x3f);
    }
    if (($first_byte & 0xf8) == 0xf0) {
      // Four-byte form: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx.
      if ($available < 4) {
        return -1;
      }
      return (($first_byte & 0x7) << 18) + ((ord($character[1]) & 0x3f) << 12) + ((ord($character[2]) & 0x3f) << 6) + (ord($character[3]) & 0x3f);
    }
    // Other forms are not legal.
    return -1;
  }

  /**
   * Replaces a single Unicode character using the transliteration database.
   *
   * @param int $code
   *   The character code of a Unicode character.
   * @param string $langcode
   *   The language code of the language the character is in.
   * @param string $unknown_character
   *   The character to substitute for characters without transliterated
   *   equivalents.
   *
   * @return string
   *   US-ASCII replacement character. If it has a mapping, it is returned;
   *   otherwise, $unknown_character is returned. The replacement can contain
   *   multiple characters.
   */
  protected function replace($code, $langcode, $unknown_character) {
    if ($code < 0x80) {
      // Already lower ASCII.
      return chr($code);
    }
    // See if there is a language-specific override for this character. The
    // overrides for a language are loaded the first time it is requested.
    if (!isset($this->languageOverrides[$langcode])) {
      $this->readLanguageOverrides($langcode);
    }
    if (isset($this->languageOverrides[$langcode][$code])) {
      return $this->languageOverrides[$langcode][$code];
    }
    // No override: fall back to the generic tables.
    return $this->lookupReplacement($code, $unknown_character);
  }

  /**
   * Look up the generic replacement for a Unicode character code.
   *
   * @param int $code
   *   The Unicode character code.
   * @param string $unknown_character
   *   (optional) The character to substitute for characters without entries in
   *   the replacement tables.
   *
   * @return string
   *   US-ASCII replacement characters. If it has a mapping, it is returned;
   *   otherwise, $unknown_character is returned. The replacement can contain
   *   multiple characters.
   */
  protected function lookupReplacement($code, $unknown_character = '?') {
    // The generic tables are split into banks of 256 characters: the bank is
    // the character code without its lowest byte.
    $bank = $code >> 8;
    if (!isset($this->genericMap[$bank])) {
      $this->readGenericData($bank);
    }
    // The lowest byte is the index within the bank. A missing or NULL entry
    // yields the substitute character.
    $code = $code & 0xff;
    return $this->genericMap[$bank][$code] ?? $unknown_character;
  }

  /**
   * Reads in language overrides for a language code.
   *
   * The data is read from files named "$langcode.php" in
   * PhpTransliteration::$dataDirectory. These files should set up an array
   * variable $overrides with an element whose key is $langcode and whose value
   * is an array whose keys are character codes, and whose values are their
   * transliterations in this language. The character codes can be for any valid
   * Unicode character, independent of the number of bytes.
   *
   * If the file does not exist, or does not define an element for $langcode,
   * an empty override table is stored for the language.
   *
   * @param string $langcode
   *   Code for the language to read.
   */
  protected function readLanguageOverrides($langcode) {
    // Figure out the file name to use by sanitizing the language code,
    // just in case.
    $file = $this->dataDirectory . '/' . preg_replace('/[^a-zA-Z\\-]/', '', $langcode) . '.php';
    // Read in this file, which should set up a variable called $overrides,
    // which will be local to this function.
    $overrides = [$langcode => []];
    if (is_file($file)) {
      include $file;
    }
    // The included file may have replaced $overrides without providing the
    // $langcode key. Default to an empty table so the result is always an
    // array and ::replace() does not re-read the file for every character.
    $this->languageOverrides[$langcode] = $overrides[$langcode] ?? [];
  }

  /**
   * Reads in generic transliteration data for a bank of characters.
   *
   * The data is read in from a file named "x$bank.php" (with $bank in
   * hexadecimal notation, zero-padded to at least two digits) in
   * PhpTransliteration::$dataDirectory. These files should set up a variable
   * $base containing an array whose numerical indices are the lowest byte of
   * the character code, and whose values are the transliterations of these
   * characters into US-ASCII. If the file does not exist, an empty table is
   * stored for the bank.
   *
   * @param int $bank
   *   The character code shifted right by eight bits; bank 0 covers character
   *   codes 0x00 through 0xff.
   */
  protected function readGenericData($bank) {
    // Figure out the file name.
    $file = $this->dataDirectory . '/x' . sprintf('%02x', $bank) . '.php';
    // Read in this file, which should set up a variable called $base, which
    // will be local to this function.
    $base = [];
    if (is_file($file)) {
      include $file;
    }
    $this->genericMap[$bank] = $base;
  }

}
Members
| Title Sort descending | Modifiers | Object type | Summary | Overridden Title | Overrides | 
|---|---|---|---|---|---|
| PhpTransliteration::$dataDirectory | protected | property | Directory where data for transliteration resides. | ||
| PhpTransliteration::$fixTransliterateForRemoveDiacritics | protected | property | Special characters for ::removeDiacritics(). | ||
| PhpTransliteration::$genericMap | protected | property | Non-language-specific transliteration tables. | ||
| PhpTransliteration::$languageOverrides | protected | property | Associative array of language-specific character transliteration tables. | ||
| PhpTransliteration::lookupReplacement | protected | function | Look up the generic replacement for a UTF-8 character code. | ||
| PhpTransliteration::ordUTF8 | protected static | function | Finds the character code for a UTF-8 character: like ord() but for UTF-8. | ||
| PhpTransliteration::readGenericData | protected | function | Reads in generic transliteration data for a bank of characters. | ||
| PhpTransliteration::readLanguageOverrides | protected | function | Reads in language overrides for a language code. | 1 | |
| PhpTransliteration::removeDiacritics | public | function | Removes diacritics (accents) from certain letters. | Overrides TransliterationInterface::removeDiacritics | |
| PhpTransliteration::replace | protected | function | Replaces a single Unicode character using the transliteration database. | ||
| PhpTransliteration::transliterate | public | function | Transliterates text from Unicode to US-ASCII. | Overrides TransliterationInterface::transliterate | |
| PhpTransliteration::__construct | public | function | Constructs a transliteration object. | 1 | 
Buggy or inaccurate documentation? Please file an issue. Need support? Need help programming? Connect with the Drupal community.