2010-08-22 16 views
15

Jaki jest najlepszy sposób na usunięcie akcentów bez używania iconv, np.

"ÈâuÑ" staje się "Eaun"

Bez wykorzystania iconv

+1

Dlaczego nie chcesz użyć 'iconv'. Jakiś konkretny powód? – shamittomar

+0

iconv daje różne wyniki w różnych systemach operacyjnych. –

+0

możliwy duplikat [Jak usunąć akcenty ze znaków w ciągu PHP?](http://stackoverflow.com/questions/1017599/how-do-i-remove-accents-from-characters-in-a-php-string) – outis

Odpowiedz

16

Kompletny, działający kod. Wiem, że jest długi, ale to niezawodny sposób używany przez WordPressa.

<?php 

/**
 * Checks whether a byte string is structurally shaped like UTF-8.
 *
 * Walks the string one sequence at a time: the lead byte decides how many
 * continuation bytes (10bbbbbb) must follow, and each of them is verified.
 * Lead bytes announcing up to five continuation bytes are accepted, so the
 * historical 5- and 6-byte forms pass this check as well.
 *
 * @param string $str Byte string to inspect
 * @return bool True when every sequence is well-formed, false otherwise.
 */
function seems_utf8($str)
{
    $total = strlen($str);
    $pos = 0;

    while ($pos < $total) {
        $lead = ord($str[$pos]);

        // Classify the lead byte by value range to get the number of
        // continuation bytes it announces.
        if ($lead < 0x80) {
            $expected = 0;          // 0bbbbbbb
        } elseif ($lead < 0xC0) {
            return false;           // 10bbbbbb: continuation byte without a lead
        } elseif ($lead < 0xE0) {
            $expected = 1;          // 110bbbbb
        } elseif ($lead < 0xF0) {
            $expected = 2;          // 1110bbbb
        } elseif ($lead < 0xF8) {
            $expected = 3;          // 11110bbb
        } elseif ($lead < 0xFC) {
            $expected = 4;          // 111110bb
        } elseif ($lead < 0xFE) {
            $expected = 5;          // 1111110b
        } else {
            return false;           // 0xFE / 0xFF match no model
        }

        // Each announced continuation byte must exist and look like 10bbbbbb.
        for ($left = $expected; $left > 0; $left--) {
            $pos++;
            if ($pos === $total) {
                return false;       // string ended in the middle of a sequence
            }
            if ((ord($str[$pos]) & 0xC0) !== 0x80) {
                return false;
            }
        }

        $pos++;
    }

    return true;
}

/**
 * Converts accented characters to their closest ASCII equivalents.
 *
 * A string with no bytes in the 0x80-0xFF range is returned unchanged.
 * Otherwise the string is handled as UTF-8 when seems_utf8() accepts it, and
 * as a single-byte string (ISO-8859-1 plus a few 0x80-0x9F extras) when it
 * does not.
 *
 * Only the precomposed characters listed in the tables below are replaced.
 * Anything else, including a base letter followed by a combining mark, is
 * passed through untouched.
 *
 * @param string $string Text that might have accent characters
 * @return string Filtered string with replaced "nice" characters.
 */
function remove_accents($string) {
    // Pure ASCII: nothing to do.
    if (!preg_match('/[\x80-\xff]/', $string))
        return $string;

    if (seems_utf8($string)) {
        // Keys are the raw UTF-8 byte sequences of each character.
        $chars = array(
        // Decompositions for Latin-1 Supplement (U+00C0 - U+00FF).
        // Multiplication and division signs (C3 97 / C3 B7) are deliberately absent.
        chr(195).chr(128) => 'A', chr(195).chr(129) => 'A',
        chr(195).chr(130) => 'A', chr(195).chr(131) => 'A',
        chr(195).chr(132) => 'A', chr(195).chr(133) => 'A',
        chr(195).chr(134) => 'AE', chr(195).chr(135) => 'C',
        chr(195).chr(136) => 'E', chr(195).chr(137) => 'E',
        chr(195).chr(138) => 'E', chr(195).chr(139) => 'E',
        chr(195).chr(140) => 'I', chr(195).chr(141) => 'I',
        chr(195).chr(142) => 'I', chr(195).chr(143) => 'I',
        chr(195).chr(144) => 'DH', chr(195).chr(145) => 'N',
        chr(195).chr(146) => 'O', chr(195).chr(147) => 'O',
        chr(195).chr(148) => 'O', chr(195).chr(149) => 'O',
        chr(195).chr(150) => 'O', chr(195).chr(152) => 'O', // 152 = O with stroke
        chr(195).chr(153) => 'U', chr(195).chr(154) => 'U',
        chr(195).chr(155) => 'U', chr(195).chr(156) => 'U',
        chr(195).chr(157) => 'Y', chr(195).chr(158) => 'TH',
        chr(195).chr(159) => 's', chr(195).chr(160) => 'a',
        chr(195).chr(161) => 'a', chr(195).chr(162) => 'a',
        chr(195).chr(163) => 'a', chr(195).chr(164) => 'a',
        chr(195).chr(165) => 'a', chr(195).chr(166) => 'ae',
        chr(195).chr(167) => 'c', chr(195).chr(168) => 'e',
        chr(195).chr(169) => 'e', chr(195).chr(170) => 'e',
        chr(195).chr(171) => 'e', chr(195).chr(172) => 'i',
        chr(195).chr(173) => 'i', chr(195).chr(174) => 'i',
        chr(195).chr(175) => 'i', chr(195).chr(176) => 'dh',
        chr(195).chr(177) => 'n', chr(195).chr(178) => 'o',
        chr(195).chr(179) => 'o', chr(195).chr(180) => 'o',
        chr(195).chr(181) => 'o', chr(195).chr(182) => 'o',
        // 184 = o with stroke; this slot used to repeat key 182, so the
        // character was never replaced.
        chr(195).chr(184) => 'o', chr(195).chr(185) => 'u',
        chr(195).chr(186) => 'u', chr(195).chr(187) => 'u',
        chr(195).chr(188) => 'u', chr(195).chr(189) => 'y',
        chr(195).chr(190) => 'th', chr(195).chr(191) => 'y',
        // Decompositions for Latin Extended-A (U+0100 - U+017F)
        chr(196).chr(128) => 'A', chr(196).chr(129) => 'a',
        chr(196).chr(130) => 'A', chr(196).chr(131) => 'a',
        chr(196).chr(132) => 'A', chr(196).chr(133) => 'a',
        chr(196).chr(134) => 'C', chr(196).chr(135) => 'c',
        chr(196).chr(136) => 'C', chr(196).chr(137) => 'c',
        chr(196).chr(138) => 'C', chr(196).chr(139) => 'c',
        chr(196).chr(140) => 'C', chr(196).chr(141) => 'c',
        chr(196).chr(142) => 'D', chr(196).chr(143) => 'd',
        chr(196).chr(144) => 'D', chr(196).chr(145) => 'd',
        chr(196).chr(146) => 'E', chr(196).chr(147) => 'e',
        chr(196).chr(148) => 'E', chr(196).chr(149) => 'e',
        chr(196).chr(150) => 'E', chr(196).chr(151) => 'e',
        chr(196).chr(152) => 'E', chr(196).chr(153) => 'e',
        chr(196).chr(154) => 'E', chr(196).chr(155) => 'e',
        chr(196).chr(156) => 'G', chr(196).chr(157) => 'g',
        chr(196).chr(158) => 'G', chr(196).chr(159) => 'g',
        chr(196).chr(160) => 'G', chr(196).chr(161) => 'g',
        chr(196).chr(162) => 'G', chr(196).chr(163) => 'g',
        chr(196).chr(164) => 'H', chr(196).chr(165) => 'h',
        chr(196).chr(166) => 'H', chr(196).chr(167) => 'h',
        chr(196).chr(168) => 'I', chr(196).chr(169) => 'i',
        chr(196).chr(170) => 'I', chr(196).chr(171) => 'i',
        chr(196).chr(172) => 'I', chr(196).chr(173) => 'i',
        chr(196).chr(174) => 'I', chr(196).chr(175) => 'i',
        chr(196).chr(176) => 'I', chr(196).chr(177) => 'i',
        chr(196).chr(178) => 'IJ', chr(196).chr(179) => 'ij',
        chr(196).chr(180) => 'J', chr(196).chr(181) => 'j',
        chr(196).chr(182) => 'K', chr(196).chr(183) => 'k',
        chr(196).chr(184) => 'k', chr(196).chr(185) => 'L',
        chr(196).chr(186) => 'l', chr(196).chr(187) => 'L',
        chr(196).chr(188) => 'l', chr(196).chr(189) => 'L',
        chr(196).chr(190) => 'l', chr(196).chr(191) => 'L',
        chr(197).chr(128) => 'l', chr(197).chr(129) => 'L',
        chr(197).chr(130) => 'l', chr(197).chr(131) => 'N',
        chr(197).chr(132) => 'n', chr(197).chr(133) => 'N',
        chr(197).chr(134) => 'n', chr(197).chr(135) => 'N',
        // 137 (U+0149) and 139 (U+014B) are lowercase, 138 (U+014A) is
        // uppercase; the case of these three was previously swapped.
        chr(197).chr(136) => 'n', chr(197).chr(137) => 'n',
        chr(197).chr(138) => 'N', chr(197).chr(139) => 'n',
        chr(197).chr(140) => 'O', chr(197).chr(141) => 'o',
        chr(197).chr(142) => 'O', chr(197).chr(143) => 'o',
        chr(197).chr(144) => 'O', chr(197).chr(145) => 'o',
        chr(197).chr(146) => 'OE', chr(197).chr(147) => 'oe',
        chr(197).chr(148) => 'R', chr(197).chr(149) => 'r',
        chr(197).chr(150) => 'R', chr(197).chr(151) => 'r',
        chr(197).chr(152) => 'R', chr(197).chr(153) => 'r',
        chr(197).chr(154) => 'S', chr(197).chr(155) => 's',
        chr(197).chr(156) => 'S', chr(197).chr(157) => 's',
        chr(197).chr(158) => 'S', chr(197).chr(159) => 's',
        chr(197).chr(160) => 'S', chr(197).chr(161) => 's',
        chr(197).chr(162) => 'T', chr(197).chr(163) => 't',
        chr(197).chr(164) => 'T', chr(197).chr(165) => 't',
        chr(197).chr(166) => 'T', chr(197).chr(167) => 't',
        chr(197).chr(168) => 'U', chr(197).chr(169) => 'u',
        chr(197).chr(170) => 'U', chr(197).chr(171) => 'u',
        chr(197).chr(172) => 'U', chr(197).chr(173) => 'u',
        chr(197).chr(174) => 'U', chr(197).chr(175) => 'u',
        chr(197).chr(176) => 'U', chr(197).chr(177) => 'u',
        chr(197).chr(178) => 'U', chr(197).chr(179) => 'u',
        chr(197).chr(180) => 'W', chr(197).chr(181) => 'w',
        chr(197).chr(182) => 'Y', chr(197).chr(183) => 'y',
        chr(197).chr(184) => 'Y', chr(197).chr(185) => 'Z',
        chr(197).chr(186) => 'z', chr(197).chr(187) => 'Z',
        chr(197).chr(188) => 'z', chr(197).chr(189) => 'Z',
        chr(197).chr(190) => 'z', chr(197).chr(191) => 's',
        // Euro Sign
        chr(226).chr(130).chr(172) => 'E',
        // GBP (Pound) Sign is dropped entirely
        chr(194).chr(163) => '');

        $string = strtr($string, $chars);
    } else {
        // Assume ISO-8859-1 if not UTF-8.
        // Single-byte characters that map to exactly one ASCII letter: the
        // n-th byte of $single_in is replaced by the n-th byte of $single_out.
        $single_in = chr(128).chr(131).chr(138).chr(142).chr(154).chr(158)
            .chr(159).chr(162).chr(165).chr(181).chr(192).chr(193).chr(194)
            .chr(195).chr(196).chr(197).chr(199).chr(200).chr(201).chr(202)
            .chr(203).chr(204).chr(205).chr(206).chr(207).chr(209).chr(210)
            .chr(211).chr(212).chr(213).chr(214).chr(216).chr(217).chr(218)
            .chr(219).chr(220).chr(221).chr(224).chr(225).chr(226).chr(227)
            .chr(228).chr(229).chr(231).chr(232).chr(233).chr(234).chr(235)
            .chr(236).chr(237).chr(238).chr(239).chr(241).chr(242).chr(243)
            .chr(244).chr(245).chr(246).chr(248).chr(249).chr(250).chr(251)
            .chr(252).chr(253).chr(255);

        $single_out = "EfSZszYcYuAAAAAACEEEEIIIINOOOOOOUUUUYaaaaaaceeeeiiiinoooooouuuuyy";

        $string = strtr($string, $single_in, $single_out);

        // Characters that expand to two ASCII letters. The UTF-8 table above
        // uses the same AE/DH/TH/ae/dh/th transliterations.
        $double_in = array(chr(140), chr(156), chr(198), chr(208), chr(222), chr(223), chr(230), chr(240), chr(254));
        $double_out = array('OE', 'oe', 'AE', 'DH', 'TH', 'ss', 'ae', 'dh', 'th');
        $string = str_replace($double_in, $double_out, $string);
    }

    return $string;
}


// Demo: strip the accents from a short sample string and print the result.
$str = 'ÈâuÑ';
print remove_accents($str); // Output: EauN
?> 
+1

To nie jest kompletne. Nie zastępuje akcentów, gdy są w formie rozłożonej. – Artefacto

+0

Co to jest forma rozłożona? –

+1

@ Mark Zaktualizowałem swoją odpowiedź. Rozłożona forma "ÈâuÑ foo bar 88." to "ÈâuÑ foo bar 88.". Zależy od czcionki, ale powinny wyglądać tak samo. Oto przykład niepowodzenia twojej funkcji: http://codepad.viper-7.com/FAvL9I – Artefacto

18

O wiele prostszy:

// Decompose each character into base letter + combining marks (NFD),
// then delete every combining mark (\p{M}) and dump what is left.
$text = "ÈâuÑ foo bar 88.";
$decomposed = Normalizer::normalize($text, Normalizer::FORM_D);
$withoutMarks = preg_replace('/\p{M}/u', '', $decomposed);
var_dump($withoutMarks);

daje:

string(16) "EauN foo bar 88." 

Chodzi o to, aby rozłożyć znaki do postaci, w której każdy z nich składa się w rzeczywistości z dwóch punktów kodowych Unicode: litery bazowej i osobnego punktu kodowego dla akcentu. Następnie usuwasz te akcenty.

`\p{M}` oznacza „znak przeznaczony do łączenia z innym znakiem (np. akcenty, umlauty, otaczające ramki itp.)”.

Należy pamiętać, że nie jest to ostateczne rozwiązanie. Może to wystarczyć dla tekstu wejściowego, ale wiele znaków, takich jak ø, nie ulega rozkładowi, więc ta metoda się nie powiedzie.

+0

Ale jego dostępność jest następująca: (PHP 5> = 5.3.0, PECL intl> = 1.0.0) – shamittomar

+0

Tak, mój pakiet nie ma klasy Normalizer ... –

+0

Biblioteka intl została dołączona do PHP 5.3. – Artefacto

-1

Albo spróbuj tej funkcji (z str_replace i wyczerpującą tabelą akcentów): http://idilix.net/php-strip-accents.php

+5

To nie jest wyczerpujące: to śmiesznie i żenująco naiwne! Po pierwsze, samych liter łacińskich z „akcentami” jest 756. Dodaj kolejny tysiąc dla liter niełacińskich. Po drugie, nie używa żadnego rodzaju dekompozycji, więc jest z gruntu wadliwe. Po trzecie, istnieje 237 liter łacińskich, które **NIE** mają żadnej dekompozycji, którą dałoby się usunąć, albo około dwa razy tyle, jeśli policzymy niełacińskie. Jest na to właściwy sposób, ale to jest tak złe, że nie powiem ci, jaki. Naucz się korzystać z Unicode, a nie go niszczyć. – tchrist

2

Napisałem to w PHP. Mam nadzieję, że komuś to pomoże. Dzięki temu nie muszę używać iconv.

//
// This function takes a string with international accented characters and anglicizes it.
// For example, "Hafþór Júlíus Björnsson" becomes "Hafthor Julius Bjornsson"
//
// Every precomposed Latin letter in the table below is replaced by the ASCII
// letter(s) it is grouped under; all other characters are returned untouched.
// Expects a UTF-8 encoded string and returns a string.
//

function anglicize($string) {
    // The lookup table is built once per request and then reused.
    static $map = null;

    if ($map === null) {
        // ASCII replacement => every accented letter that collapses to it.
        // Keeping each letter next to its replacement avoids the drift that two
        // separate parallel arrays are prone to.
        $groups = array(
            'A'  => 'ÀÁÂÃÄÅĀĂĄḀẠẢẤẦẨẪẬẮẰẲẴẶ',
            'a'  => 'àáâãäåāăąḁạảấầẩẫậắằẳẵặ',
            'AE' => 'Æ',
            'ae' => 'æ',
            'B'  => 'ḂḄḆ',
            'b'  => 'ḃḅḇ',
            'C'  => 'ÇĆĈĊČḈ',
            'c'  => 'çćĉċčḉ',
            'D'  => 'ÐĎĐḊḌḎḐḒ',
            'd'  => 'ðďđḋḍḏḑḓ',
            'E'  => 'ÈÉÊËĒĔĖĘĚḔḖḘḚḜẸẺẼẾỀỂỄỆ',
            'e'  => 'èéêëēĕėęěḕḗḙḛḝẹẻẽếềểễệ',
            'F'  => 'Ḟ',
            'f'  => 'ḟƒ',
            'G'  => 'ĜĞĠĢḠ',
            'g'  => 'ĝğġģḡ',
            'H'  => 'ĤĦḢḤḦḨḪ',
            'h'  => 'ĥħḣḥḧḩḫẖ',
            'I'  => 'ÌÍÎÏĨĪĬĮİḬḮỈỊ',
            'i'  => 'ìíîïĩīĭįıḭḯỉị',
            'J'  => 'Ĵ',
            'j'  => 'ĵ',
            'K'  => 'ĶḰḲḴ',
            'k'  => 'ķĸḱḳḵ',
            'L'  => 'ĹĻĽŁḶḸḺḼ',
            'l'  => 'ĺļľłḷḹḻḽ',
            'M'  => 'ḾṀṂ',
            'm'  => 'ḿṁṃ',
            'N'  => 'ÑŃŅŇŊṄṆṈṊ',
            'n'  => 'ñńņňŋṅṇṉṋ',
            'O'  => 'ÒÓÔÕÖØŌŎŐṌṎṐṒỌỎỐỒỔỖỘỚỜỞỠỢ',
            'o'  => 'òóôõöøōŏőṍṏṑṓọỏốồổỗộớờởỡợ',
            'OE' => 'Œ',
            'oe' => 'œ',
            'P'  => 'ṔṖ',
            'p'  => 'ṕṗ',
            'R'  => 'ŔŖŘṘṚṜṞ',
            'r'  => 'ŕŗřṙṛṝṟ',
            'S'  => 'ŚŜŞŠȘṠṢṤṦṨ',
            's'  => 'śŝşšșṡṣṥṧṩ',
            'ss' => 'ß',
            'T'  => 'ŢŤŦȚṪṬṮṰ',
            't'  => 'ţťŧțṫṭṯṱẗ',
            'Th' => 'Þ',
            'th' => 'þ',
            'U'  => 'ÙÚÛÜŨŪŬŮŰŲṲṴṶṸṺỤỦỨỪỬỮỰ',
            'u'  => 'ùúûüũūŭůűųṳṵṷṹṻụủứừửữự',
            'V'  => 'ṼṾ',
            'v'  => 'ṽṿ',
            'W'  => 'ŴẀẂẄẆẈ',
            'w'  => 'ŵẁẃẅẇẉẘ',
            'X'  => 'ẊẌ',
            'x'  => 'ẋẍ',
            // Y/y and Z/z now include the Latin-1 and Latin Extended-A letters
            // (acute, diaeresis, circumflex, dot above, caron) that were missing.
            'Y'  => 'ÝŶŸẎỲỴỶỸ',
            'y'  => 'ýÿŷẏẙỳỵỷỹ',
            'Z'  => 'ŹŻŽẐẒẔ',
            'z'  => 'źżžẑẓẕ',
        );

        $map = array();
        foreach ($groups as $ascii => $letters) {
            // Split on code points (u modifier), not on bytes.
            foreach (preg_split('//u', $letters, -1, PREG_SPLIT_NO_EMPTY) as $letter) {
                $map[$letter] = $ascii;
            }
        }

        // Characters with a compatibility decomposition are spelled as raw UTF-8
        // bytes so that normalising this source file cannot silently turn the
        // key into the very ASCII text it is meant to produce.
        $map += array(
            "\xC4\xB2"     => 'IJ', // U+0132 ligature IJ
            "\xC4\xB3"     => 'ij', // U+0133 ligature ij
            "\xC4\xBF"     => 'L',  // U+013F L with middle dot
            "\xC5\x80"     => 'l',  // U+0140 l with middle dot
            "\xC5\x89"     => 'n',  // U+0149 n preceded by apostrophe
            "\xC5\xBF"     => 's',  // U+017F long s
            "\xC7\x84"     => 'DZ', // U+01C4 DZ with caron
            "\xC7\x85"     => 'Dz', // U+01C5
            "\xC7\x86"     => 'dz', // U+01C6
            "\xC7\x87"     => 'LJ', // U+01C7
            "\xC7\x88"     => 'Lj', // U+01C8
            "\xC7\x89"     => 'lj', // U+01C9
            "\xC7\x8A"     => 'NJ', // U+01CA
            "\xC7\x8B"     => 'Nj', // U+01CB
            "\xC7\x8C"     => 'nj', // U+01CC
            "\xC7\xB1"     => 'DZ', // U+01F1
            "\xC7\xB2"     => 'Dz', // U+01F2
            "\xC7\xB3"     => 'dz', // U+01F3
            "\xE1\xBA\x9A" => 'a',  // U+1E9A a with right half ring
            "\xE1\xBA\x9E" => 'SS', // U+1E9E capital sharp s (was wrongly "B")
        );
    }

    // strtr() with an array makes a single pass over the input and never
    // re-scans text it has already substituted.
    return strtr($string, $map);
}
Powiązane problemy