Converting accented and other non-English characters to their English equivalents.


From Java 1.6 onwards, the following normalize call can be used to clear most of those strange characters:

java.text.Normalizer.normalize(stringtobecleared, java.text.Normalizer.Form.NFD).replaceAll("\\p{InCombiningDiacriticalMarks}+","")

The above call converts a lot of characters, such as Á to A. It works by splitting Á into two parts, the base letter A and the acute accent (the mark on top of the A), and then removing the accent. However, some really nasty characters that cannot be split this way still survive, for example the German character ß, which is equivalent to ss.

So, inspired by the PHP code in this comment http://www.php.net/manual/en/function.preg-replace.php#96586, I wrote a convert method which takes a String and returns a cleaned-up String. I have made some extra additions to the mappings from that post, and you can add any other characters you know of. If I have missed any language's characters, please comment with the character and its English equivalent.

The following is the code:

public static String convert(String s) {

String output = “”;
boolean bProcessed = false;

StringBuilder buffer = new StringBuilder();
int len = (s != null) ? s.length() : 0;
for (int i = 0; i < len; i++) {
char ch = s.charAt(i);
bProcessed = false;
switch (ch) {
// extra addition  Þ, þ
case ‘Þ’: {
buffer.append(“TH”);
bProcessed = true;
break;
}
case ‘þ’: {
buffer.append(“th”);
bProcessed = true;
break;
}
// extra addition Þ
case ‘À’: case ‘Á’: case ‘Â’: case ‘Ã’: case ‘Ä’: case ‘Å’: case ‘Ā’: case’Ă’: case ‘Ą’: case ‘Ǎ’: case’Ǻ’: {
buffer.append(“A”);
bProcessed = true;
break;
}
case ‘Æ’: case ‘Ǽ’: {
buffer.append(“AE”);
bProcessed = true;
break;
}
case ‘Ç’: case’Ć’: case  ‘Ĉ’: case  ‘Ċ’: case  ‘Č’: {
buffer.append(“C”);
bProcessed = true;
break;
}
case ‘È’: case ‘É’: case ‘Ê’: case ‘Ë’: case ‘Ē’: case  ‘Ĕ’: case  ‘Ė’: case  ‘Ę’: case  ‘Ě’: {
buffer.append(“E”);
bProcessed = true;
break;
}
case ‘Ì’: case ‘Í’: case ‘Î’: case ‘Ï’: case ‘Ĩ’: case  ‘Ī’: case  ‘Ĭ’: case  ‘Į’: case  ‘İ’: case  ‘Ǐ’: {
buffer.append(“I”);
bProcessed = true;
break;
}
case ‘Ð’: case ‘Ď’: case ‘Đ’:   {
buffer.append(“D”);
bProcessed = true;
break;
}
case ‘Ñ’: case ‘Ń’: case  ‘Ņ’: case  ‘Ň’:   {
buffer.append(“N”);
bProcessed = true;
break;
}
case ‘Ò’: case ‘Ó’: case ‘Ô’: case ‘Õ’: case ‘Ö’: case ‘Ø’: case ‘Ō’: case  ‘Ŏ’: case  ‘Ő’: case  ‘Ơ’: case  ‘Ǒ’: case  ‘Ǿ’: {
buffer.append(“O”);
bProcessed = true;
break;
}
case ‘Ù’: case ‘Ú’: case ‘Û’: case ‘Ü’: case ‘Ũ’: case  ‘Ū’: case  ‘Ŭ’: case  ‘Ů’: case  ‘Ű’: case  ‘Ų’: case  ‘Ư’: case  ‘Ǔ’: case  ‘Ǖ’: case  ‘Ǘ’: case  ‘Ǚ’: case  ‘Ǜ’: {
buffer.append(“U”);
bProcessed = true;
break;
}
case ‘Ý’: case ‘Ŷ’: case  ‘Ÿ’:   {
buffer.append(“Y”);
bProcessed = true;
break;
}
case ‘ś’: case ‘ŝ’: case  ‘ş’: case  ‘š’: case  ‘ſ’: {
buffer.append(“s”);
bProcessed = true;
break;
}
case ‘ß’:{
buffer.append(“ss”);
bProcessed = true;
break;
}
case ‘à’: case ‘á’: case ‘â’: case ‘ã’: case ‘ä’: case ‘å’: case  ‘ā’: case  ‘ă’: case  ‘ą’: case ‘ǎ’: case’ǻ’:  {
buffer.append(“a”);
bProcessed = true;
break;
}
case ‘æ’: case  ‘ǽ’:{
buffer.append(“ae”);
bProcessed = true;
break;
}
case ‘ç’: case ‘ć’: case  ‘ĉ’: case  ‘ċ’: case  ‘č’:  {
buffer.append(“c”);
bProcessed = true;
break;
}
case ‘è’: case ‘é’: case ‘ê’: case ‘ë’: case ‘ē’: case  ‘ĕ’: case  ‘ė’: case  ‘ę’: case  ‘ě’: {
buffer.append(“e”);
bProcessed = true;
break;
}
case ‘ì’: case ‘í’: case ‘î’: case ‘ï’: case ‘ĩ’: case  ‘ī’: case  ‘ĭ’: case  ‘į’: case  ‘ı’: case’ǐ’: {
buffer.append(“i”);
bProcessed = true;
break;
}
case ‘ñ’: case ‘ń’: case  ‘ņ’: case  ‘ň’: case ‘ʼn’:  {
buffer.append(“n”);
bProcessed = true;
break;
}
case ‘ò’: case ‘ó’: case ‘ô’: case ‘õ’: case ‘ö’: case ‘ø’: case ‘ō’: case  ‘ŏ’: case  ‘ő’: case’ơ’: case’ǒ’: case’ǿ’: {
buffer.append(“o”);
bProcessed = true;
break;
}
case ‘ù’: case ‘ú’: case ‘û’: case ‘ü’: case ‘ũ’: case  ‘ū’: case  ‘ŭ’: case  ‘ů’: case  ‘ű’: case  ‘ų’: case ‘ư’: case’ǔ’: case  ‘ǖ’: case  ‘ǘ’: case  ‘ǚ’: case  ‘ǜ’:  {
buffer.append(“u”);
bProcessed = true;
break;
}
case ‘ý’: case ‘ÿ’: case ‘ŷ’: {
buffer.append(“y”);
bProcessed = true;
break;
}
case ‘ď’: case  ‘đ’:  case ‘ð’:{
buffer.append(“d”);
bProcessed = true;
break;
}

case ‘Ĝ’: case ‘Ğ’: case’Ġ’: case’Ģ’: {
buffer.append(“G”);
bProcessed = true;
break;
}
case ‘ĝ’: case  ‘ğ’: case  ‘ġ’: case  ‘ģ’:  {
buffer.append(“g”);
bProcessed = true;
break;
}
case ‘Ĥ’: case ‘Ħ’: {
buffer.append(“H”);
bProcessed = true;
break;
}
case ‘ĥ’: case  ‘ħ’:  {
buffer.append(“h”);
bProcessed = true;
break;
}

case ‘IJ’:  {
buffer.append(“IJ”);
bProcessed = true;
break;
}
case ‘ij’:  {
buffer.append(“ij”);
bProcessed = true;
break;
}
case ‘Ĵ’: {
buffer.append(“J”);
bProcessed = true;
break;
}
case ‘ĵ’:  {
buffer.append(“j”);
bProcessed = true;
break;
}
case ‘Ķ’: {
buffer.append(“K”);
bProcessed = true;
break;
}
case ‘ķ’:  {
buffer.append(“k”);
bProcessed = true;
break;
}
case ‘Ĺ’: case ‘Ļ’: case’Ľ’: case’Ŀ’: case ‘Ł’: {
buffer.append(“L”);
bProcessed = true;
break;
}
case ‘ĺ’: case  ‘ļ’: case  ‘ľ’: case  ‘ŀ’: case ‘ł’:  {
buffer.append(“l”);
bProcessed = true;
break;
}
case ‘Œ’:  {
buffer.append(“OE”);
bProcessed = true;
break;
}
case ‘œ’:  {
buffer.append(“oe”);
bProcessed = true;
break;
}
case ‘Ŕ’: case ‘Ŗ’: case ‘Ř’: {
buffer.append(“R”);
bProcessed = true;
break;
}
case ‘ŕ’: case  ‘ŗ’: case  ‘ř’:  {
buffer.append(“r”);
bProcessed = true;
break;
}
case ‘Ś’: case  ‘Ŝ’: case  ‘Ş’: case  ‘Š’:  {
buffer.append(“S”);
bProcessed = true;
break;
}
case ‘Ţ’: case ‘Ť’: case’Ŧ’:  {
buffer.append(“T”);
bProcessed = true;
break;
}
case ‘ţ’: case  ‘ť’: case ‘ŧ’:  {
buffer.append(“t”);
bProcessed = true;
break;
}

case ‘Ŵ’:  {
buffer.append(“W”);
bProcessed = true;
break;
}
case ‘ŵ’:  {
buffer.append(“w”);
bProcessed = true;
break;
}

case ‘Ź’: case ‘Ż’: case ‘Ž’: {
buffer.append(“Z”);
bProcessed = true;
break;
}
case ‘ź’: case  ‘ż’: case  ‘ž’:  {
buffer.append(“z”);
bProcessed = true;
break;
}
case ‘ƒ’:  {
buffer.append(“f”);
bProcessed = true;
break;
}

case ‘-‘: {
buffer.append(“-“);
bProcessed = true;
break;
}
}

// only look if the character is not yet processed
if (!bProcessed) {
if (Character.isLetterOrDigit(ch)) {
buffer.append(ch);
}
if (Character.isWhitespace(ch)) {
buffer.append(ch);
}
}
}

output = buffer.toString();
return output;
}

Advertisements

About Dominic

J for JAVA more about me : http://about.me/dominicdsouza
This entry was posted in Thechy Stuff and tagged . Bookmark the permalink.

Leave a Reply

Fill in your details below or click an icon to log in:

WordPress.com Logo

You are commenting using your WordPress.com account. Log Out / Change )

Twitter picture

You are commenting using your Twitter account. Log Out / Change )

Facebook photo

You are commenting using your Facebook account. Log Out / Change )

Google+ photo

You are commenting using your Google+ account. Log Out / Change )

Connecting to %s