From Java 1.6 onwards, we have the following normalize method, which clears most of those strange characters:
java.text.Normalizer.normalize(stringtobecleared, java.text.Normalizer.Form.NFD).replaceAll("\\p{InCombiningDiacriticalMarks}+","")
The above call cleans up a lot of characters, for example turning Á into A. What it does is split Á into two parts, the base letter A and the acute accent (the strange mark on top of the A), and then the regular expression removes the accent part. But some really nasty characters that cannot be split still survive this method, such as the German character ß, which is equivalent to ss.
So, inspired by the PHP code in this post, http://www.php.net/manual/en/function.preg-replace.php#96586, I wrote a convert method which takes in a String and returns a clean String. I have made some extra additions to the mappings in that post, and you can add any other characters you know of. If I have missed characters from any other language, please comment with the character along with its English equivalent.
The following are the lines of code:
public static String convert(String s) {
String output = “”;
boolean bProcessed = false;
StringBuilder buffer = new StringBuilder();
int len = (s != null) ? s.length() : 0;
for (int i = 0; i < len; i++) {
char ch = s.charAt(i);
bProcessed = false;
switch (ch) {
// extra addition Þ, þ
case ‘Þ’: {
buffer.append(“TH”);
bProcessed = true;
break;
}
case ‘þ’: {
buffer.append(“th”);
bProcessed = true;
break;
}
// extra addition Þ
case ‘À’: case ‘Á’: case ‘Â’: case ‘Ã’: case ‘Ä’: case ‘Å’: case ‘Ā’: case’Ă’: case ‘Ą’: case ‘Ǎ’: case’Ǻ’: {
buffer.append(“A”);
bProcessed = true;
break;
}
case ‘Æ’: case ‘Ǽ’: {
buffer.append(“AE”);
bProcessed = true;
break;
}
case ‘Ç’: case’Ć’: case ‘Ĉ’: case ‘Ċ’: case ‘Č’: {
buffer.append(“C”);
bProcessed = true;
break;
}
case ‘È’: case ‘É’: case ‘Ê’: case ‘Ë’: case ‘Ē’: case ‘Ĕ’: case ‘Ė’: case ‘Ę’: case ‘Ě’: {
buffer.append(“E”);
bProcessed = true;
break;
}
case ‘Ì’: case ‘Í’: case ‘Î’: case ‘Ï’: case ‘Ĩ’: case ‘Ī’: case ‘Ĭ’: case ‘Į’: case ‘İ’: case ‘Ǐ’: {
buffer.append(“I”);
bProcessed = true;
break;
}
case ‘Ð’: case ‘Ď’: case ‘Đ’: {
buffer.append(“D”);
bProcessed = true;
break;
}
case ‘Ñ’: case ‘Ń’: case ‘Ņ’: case ‘Ň’: {
buffer.append(“N”);
bProcessed = true;
break;
}
case ‘Ò’: case ‘Ó’: case ‘Ô’: case ‘Õ’: case ‘Ö’: case ‘Ø’: case ‘Ō’: case ‘Ŏ’: case ‘Ő’: case ‘Ơ’: case ‘Ǒ’: case ‘Ǿ’: {
buffer.append(“O”);
bProcessed = true;
break;
}
case ‘Ù’: case ‘Ú’: case ‘Û’: case ‘Ü’: case ‘Ũ’: case ‘Ū’: case ‘Ŭ’: case ‘Ů’: case ‘Ű’: case ‘Ų’: case ‘Ư’: case ‘Ǔ’: case ‘Ǖ’: case ‘Ǘ’: case ‘Ǚ’: case ‘Ǜ’: {
buffer.append(“U”);
bProcessed = true;
break;
}
case ‘Ý’: case ‘Ŷ’: case ‘’: {
buffer.append(“Y”);
bProcessed = true;
break;
}
case ‘ś’: case ‘ŝ’: case ‘ş’: case ‘’: case ‘ſ’: {
buffer.append(“s”);
bProcessed = true;
break;
}
case ‘ß’:{
buffer.append(“ss”);
bProcessed = true;
break;
}
case ‘à’: case ‘á’: case ‘â’: case ‘ã’: case ‘ä’: case ‘å’: case ‘ā’: case ‘ă’: case ‘ą’: case ‘ǎ’: case’ǻ’: {
buffer.append(“a”);
bProcessed = true;
break;
}
case ‘æ’: case ‘ǽ’:{
buffer.append(“ae”);
bProcessed = true;
break;
}
case ‘ç’: case ‘ć’: case ‘ĉ’: case ‘ċ’: case ‘č’: {
buffer.append(“c”);
bProcessed = true;
break;
}
case ‘è’: case ‘é’: case ‘ê’: case ‘ë’: case ‘ē’: case ‘ĕ’: case ‘ė’: case ‘ę’: case ‘ě’: {
buffer.append(“e”);
bProcessed = true;
break;
}
case ‘ì’: case ‘í’: case ‘î’: case ‘ï’: case ‘ĩ’: case ‘ī’: case ‘ĭ’: case ‘į’: case ‘ı’: case’ǐ’: {
buffer.append(“i”);
bProcessed = true;
break;
}
case ‘ñ’: case ‘ń’: case ‘ņ’: case ‘ň’: case ‘ʼn’: {
buffer.append(“n”);
bProcessed = true;
break;
}
case ‘ò’: case ‘ó’: case ‘ô’: case ‘õ’: case ‘ö’: case ‘ø’: case ‘ō’: case ‘ŏ’: case ‘ő’: case’ơ’: case’ǒ’: case’ǿ’: {
buffer.append(“o”);
bProcessed = true;
break;
}
case ‘ù’: case ‘ú’: case ‘û’: case ‘ü’: case ‘ũ’: case ‘ū’: case ‘ŭ’: case ‘ů’: case ‘ű’: case ‘ų’: case ‘ư’: case’ǔ’: case ‘ǖ’: case ‘ǘ’: case ‘ǚ’: case ‘ǜ’: {
buffer.append(“u”);
bProcessed = true;
break;
}
case ‘ý’: case ‘ÿ’: case ‘ŷ’: {
buffer.append(“y”);
bProcessed = true;
break;
}
case ‘ď’: case ‘đ’: case ‘ð’:{
buffer.append(“d”);
bProcessed = true;
break;
}
case ‘Ĝ’: case ‘Ğ’: case’Ġ’: case’Ģ’: {
buffer.append(“G”);
bProcessed = true;
break;
}
case ‘ĝ’: case ‘ğ’: case ‘ġ’: case ‘ģ’: {
buffer.append(“g”);
bProcessed = true;
break;
}
case ‘Ĥ’: case ‘Ħ’: {
buffer.append(“H”);
bProcessed = true;
break;
}
case ‘ĥ’: case ‘ħ’: {
buffer.append(“h”);
bProcessed = true;
break;
}
case ‘IJ’: {
buffer.append(“IJ”);
bProcessed = true;
break;
}
case ‘ij’: {
buffer.append(“ij”);
bProcessed = true;
break;
}
case ‘Ĵ’: {
buffer.append(“J”);
bProcessed = true;
break;
}
case ‘ĵ’: {
buffer.append(“j”);
bProcessed = true;
break;
}
case ‘Ķ’: {
buffer.append(“K”);
bProcessed = true;
break;
}
case ‘ķ’: {
buffer.append(“k”);
bProcessed = true;
break;
}
case ‘Ĺ’: case ‘Ļ’: case’Ľ’: case’Ŀ’: case ‘Ł’: {
buffer.append(“L”);
bProcessed = true;
break;
}
case ‘ĺ’: case ‘ļ’: case ‘ľ’: case ‘ŀ’: case ‘ł’: {
buffer.append(“l”);
bProcessed = true;
break;
}
case ‘’: {
buffer.append(“OE”);
bProcessed = true;
break;
}
case ‘’: {
buffer.append(“oe”);
bProcessed = true;
break;
}
case ‘Ŕ’: case ‘Ŗ’: case ‘Ř’: {
buffer.append(“R”);
bProcessed = true;
break;
}
case ‘ŕ’: case ‘ŗ’: case ‘ř’: {
buffer.append(“r”);
bProcessed = true;
break;
}
case ‘Ś’: case ‘Ŝ’: case ‘Ş’: case ‘’: {
buffer.append(“S”);
bProcessed = true;
break;
}
case ‘Ţ’: case ‘Ť’: case’Ŧ’: {
buffer.append(“T”);
bProcessed = true;
break;
}
case ‘ţ’: case ‘ť’: case ‘ŧ’: {
buffer.append(“t”);
bProcessed = true;
break;
}
case ‘Ŵ’: {
buffer.append(“W”);
bProcessed = true;
break;
}
case ‘ŵ’: {
buffer.append(“w”);
bProcessed = true;
break;
}
case ‘Ź’: case ‘Ż’: case ‘’: {
buffer.append(“Z”);
bProcessed = true;
break;
}
case ‘ź’: case ‘ż’: case ‘’: {
buffer.append(“z”);
bProcessed = true;
break;
}
case ‘’: {
buffer.append(“f”);
bProcessed = true;
break;
}
case ‘-‘: {
buffer.append(“-“);
bProcessed = true;
break;
}
}
// only look if the character is not yet processed
if (!bProcessed) {
if (Character.isLetterOrDigit(ch)) {
buffer.append(ch);
}
if (Character.isWhitespace(ch)) {
buffer.append(ch);
}
}
}
output = buffer.toString();
return output;
}