IT:AD:Diacritics
// Demo entry point: prints string lengths before and after canonical decomposition,
// then runs a set of accented sample strings through X.Test.
// NOTE(review): Dump() is not defined in this file; presumably it is the LINQPad
// output extension — confirm before running elsewhere.
void Main()
{
    string input = "Theȳ";
    input.Length.Dump();
    input.ToCharArray().Length.Dump();

    // After FormD the string is measured again to compare with the lengths above.
    input = input.Normalize(NormalizationForm.FormD);
    input.Length.Dump();
    input.ToCharArray().Length.Dump();

    // Print every character that is not a non-spacing (combining) mark.
    for (int i = 0; i < input.Length; i++)
    {
        Char c = input[i];
        if (System.Globalization.CharUnicodeInfo.GetUnicodeCategory(c) != System.Globalization.UnicodeCategory.NonSpacingMark)
        {
            c.Dump();
        }
    }

    // NOTE(review): this value is assigned but never read afterwards.
    input = "ŠĐĆŽ-šđčćž";

    // NOTE(review): some samples below contain characters that look out of place
    // in their series; they are kept exactly as found.
    "ÂâĈĉÊêĜĝĤĥÎîĴĵÔôŜŝÛûŴŵXxŶŷẐẑ".Test();
    "ÀàÈèÌìÒòÙùǸǹẀẁỲỳ".Test();
    "ÁáÉéÍíÓóÚúÝýǼǽǾǿĆćǴǵḰḱĹĺḾḿŃńṔṕŔশẂẃŹź".Test();
    "Ç窺".Test();
    "ĀāĀ́ā́Ā̀ā̀Ā̂ā̂Ā͂ā͂ǞǟǠǡĒēḖḗḔḕĒ̂ē̂Ē͂ē͂ĪīĪ́ī́Ī̀ī̀Ī̂ī̂Ī͂ī͂ŌōṒṓṐṑŌ̂ō̂Ō͂ō͂ȪȫǬǭȬȭȰȱŪūŪ́ū́Ū̀ū̀Ū̂ū̂Ū͂ū͂ǕǖṺṻȲȳȲ́ȳ́Ȳ̀ȳ̀Ȳ̂ȳ̂Ȳ͂ȳ͂ÆæØø".Test();
    "ĀāĒēĪīŌōŪūȲȳÆæØø".Test();
    "ÿÿÿÿÿÿÿÿÿÿÿÂâĈĉÊêĜĝĤĥÎîĴĵÔôŜŝÛûŴŵXxŶŷẐẑĀāĒēĪīŌōŪūȲȳÆæØø".Test();
}

// Define other methods and classes here

/// <summary>
/// String extensions for removing diacritics, optionally keeping macron vowels
/// by rewriting them as the corresponding umlaut vowel.
/// </summary>
public static class X
{
    // Macron vowel (alone or with a further accent) -> umlaut vowel.
    static readonly Dictionary<string, string> _dictionaryMacronsToUmlauts = new Dictionary<string, string>();

    // Macron vowel -> "[{x}]" escape. Not referenced by any method in this class.
    static readonly Dictionary<string, string> _dictionaryMacronsToEscaped = new Dictionary<string, string>();

    // "[{x}]" escape -> umlaut vowel. Not referenced by any method in this class.
    static readonly Dictionary<string, string> _dictionaryEscapedToUmlaut = new Dictionary<string, string>();

    // Umlaut vowel -> "[{x}]" escape. RemoveMostDiacritics only uses its keys,
    // as the set of characters to pass through untouched.
    static readonly Dictionary<string, string> _dictionaryUmlautsToEscaped = new Dictionary<string, string>();

    static X()
    {
        // Keys made of a base letter plus a combining mark are never hit by the
        // per-code-point lookup in RemoveMostDiacritics: the base letter matches on
        // its own and the trailing mark is stripped, which yields the same output.
        _dictionaryMacronsToUmlauts["Ā"] = "Ä";
        _dictionaryMacronsToUmlauts["ā"] = "ä";
        _dictionaryMacronsToUmlauts["Ā́"] = "Ä";
        _dictionaryMacronsToUmlauts["ā́"] = "ä";
        _dictionaryMacronsToUmlauts["Ā̀"] = "Ä";
        _dictionaryMacronsToUmlauts["ā̀"] = "ä";
        _dictionaryMacronsToUmlauts["Ā̂"] = "Ä";
        _dictionaryMacronsToUmlauts["ā̂"] = "ä";
        _dictionaryMacronsToUmlauts["Ā͂"] = "Ä";
        _dictionaryMacronsToUmlauts["ā͂"] = "ä";
        _dictionaryMacronsToUmlauts["Ǟ"] = "Ä";
        _dictionaryMacronsToUmlauts["ǟ"] = "ä";
        _dictionaryMacronsToUmlauts["Ǡ"] = "Ä";
        _dictionaryMacronsToUmlauts["ǡ"] = "ä";
        // No entry (disabled in the original): Ǣ ǣ, Ḇ ḇ, C̄ c̄, Ḏ ḏ.

        _dictionaryMacronsToUmlauts["Ē"] = "Ë";
        _dictionaryMacronsToUmlauts["ē"] = "ë";
        _dictionaryMacronsToUmlauts["Ḗ"] = "Ë";
        _dictionaryMacronsToUmlauts["ḗ"] = "ë";
        _dictionaryMacronsToUmlauts["Ḕ"] = "Ë";
        _dictionaryMacronsToUmlauts["ḕ"] = "ë";
        _dictionaryMacronsToUmlauts["Ē̂"] = "Ë";
        _dictionaryMacronsToUmlauts["ē̂"] = "ë";
        _dictionaryMacronsToUmlauts["Ē͂"] = "Ë";
        _dictionaryMacronsToUmlauts["ē͂"] = "ë";
        // No entry (disabled in the original): Ḡ ḡ, H̱ ẖ.

        // NOTE(review): lower-case ī maps to a plain "i" while every other vowel maps
        // to its umlaut form — confirm whether "ï" was intended. Kept as found.
        _dictionaryMacronsToUmlauts["Ī"] = "Ï";
        _dictionaryMacronsToUmlauts["ī"] = "i";
        _dictionaryMacronsToUmlauts["Ī́"] = "Ï";
        _dictionaryMacronsToUmlauts["ī́"] = "i";
        _dictionaryMacronsToUmlauts["Ī̀"] = "Ï";
        _dictionaryMacronsToUmlauts["ī̀"] = "i";
        _dictionaryMacronsToUmlauts["Ī̂"] = "Ï";
        _dictionaryMacronsToUmlauts["ī̂"] = "i";
        _dictionaryMacronsToUmlauts["Ī͂"] = "Ï";
        _dictionaryMacronsToUmlauts["ī͂"] = "i";
        // No entry (disabled in the original): Ḵ ḵ, Ḻ ḻ, Ḹ ḹ, M̄ m̄, Ṉ ṉ, N̄ n̄.

        _dictionaryMacronsToUmlauts["Ō"] = "Ö";
        _dictionaryMacronsToUmlauts["ō"] = "ö";
        _dictionaryMacronsToUmlauts["Ṓ"] = "Ö";
        _dictionaryMacronsToUmlauts["ṓ"] = "ö";
        _dictionaryMacronsToUmlauts["Ṑ"] = "Ö";
        _dictionaryMacronsToUmlauts["ṑ"] = "ö";
        _dictionaryMacronsToUmlauts["Ō̂"] = "Ö";
        _dictionaryMacronsToUmlauts["ō̂"] = "ö";
        _dictionaryMacronsToUmlauts["Ō͂"] = "Ö";
        _dictionaryMacronsToUmlauts["ō͂"] = "ö";
        _dictionaryMacronsToUmlauts["Ȫ"] = "Ö";
        _dictionaryMacronsToUmlauts["ȫ"] = "ö";
        _dictionaryMacronsToUmlauts["Ǭ"] = "Ö";
        _dictionaryMacronsToUmlauts["ǭ"] = "ö";
        _dictionaryMacronsToUmlauts["Ȭ"] = "Ö";
        _dictionaryMacronsToUmlauts["ȭ"] = "ö";
        _dictionaryMacronsToUmlauts["Ȱ"] = "Ö";
        _dictionaryMacronsToUmlauts["ȱ"] = "ö";
        // No entry (disabled in the original): R̄ r̄, Ṟ ṟ, Ṝ ṝ, Ṯ ṯ.

        _dictionaryMacronsToUmlauts["Ū"] = "Ü";
        _dictionaryMacronsToUmlauts["ū"] = "ü";
        _dictionaryMacronsToUmlauts["Ū́"] = "Ü";
        _dictionaryMacronsToUmlauts["ū́"] = "ü";
        _dictionaryMacronsToUmlauts["Ū̀"] = "Ü";
        _dictionaryMacronsToUmlauts["ū̀"] = "ü";
        _dictionaryMacronsToUmlauts["Ū̂"] = "Ü";
        _dictionaryMacronsToUmlauts["ū̂"] = "ü";
        _dictionaryMacronsToUmlauts["Ū͂"] = "Ü";
        _dictionaryMacronsToUmlauts["ū͂"] = "ü";
        _dictionaryMacronsToUmlauts["Ǖ"] = "Ü";
        _dictionaryMacronsToUmlauts["ǖ"] = "ü";
        _dictionaryMacronsToUmlauts["Ṻ"] = "Ü";
        _dictionaryMacronsToUmlauts["ṻ"] = "ü";

        _dictionaryMacronsToUmlauts["Ȳ"] = "Ÿ";
        _dictionaryMacronsToUmlauts["ȳ"] = "ÿ";
        _dictionaryMacronsToUmlauts["Ȳ́"] = "Ÿ";
        _dictionaryMacronsToUmlauts["ȳ́"] = "ÿ";
        _dictionaryMacronsToUmlauts["Ȳ̀"] = "Ÿ";
        _dictionaryMacronsToUmlauts["ȳ̀"] = "ÿ";
        _dictionaryMacronsToUmlauts["Ȳ̂"] = "Ÿ";
        _dictionaryMacronsToUmlauts["ȳ̂"] = "ÿ";
        _dictionaryMacronsToUmlauts["Ȳ͂"] = "Ÿ";
        _dictionaryMacronsToUmlauts["ȳ͂"] = "ÿ";
        // No entry (disabled in the original): Ẕ ẕ, Æ æ, Ø ø.

        _dictionaryMacronsToEscaped["Ā"] = "[{A}]";
        _dictionaryMacronsToEscaped["ā"] = "[{a}]";
        _dictionaryMacronsToEscaped["Ē"] = "[{E}]";
        _dictionaryMacronsToEscaped["ē"] = "[{e}]";
        _dictionaryMacronsToEscaped["Ī"] = "[{I}]";
        _dictionaryMacronsToEscaped["ī"] = "[{i}]";
        _dictionaryMacronsToEscaped["Ō"] = "[{O}]";
        _dictionaryMacronsToEscaped["ō"] = "[{o}]";
        _dictionaryMacronsToEscaped["Ū"] = "[{U}]";
        _dictionaryMacronsToEscaped["ū"] = "[{u}]";
        _dictionaryMacronsToEscaped["Ȳ"] = "[{Y}]";
        _dictionaryMacronsToEscaped["ȳ"] = "[{y}]";

        // NOTE(review): the lower-case i key here is a plain "i", matching the
        // macron table above — confirm whether "ï" was intended.
        _dictionaryUmlautsToEscaped["Ä"] = "[{A}]";
        _dictionaryUmlautsToEscaped["ä"] = "[{a}]";
        _dictionaryUmlautsToEscaped["Ë"] = "[{E}]";
        _dictionaryUmlautsToEscaped["ë"] = "[{e}]";
        _dictionaryUmlautsToEscaped["Ï"] = "[{I}]";
        _dictionaryUmlautsToEscaped["i"] = "[{i}]";
        _dictionaryUmlautsToEscaped["Ö"] = "[{O}]";
        _dictionaryUmlautsToEscaped["ö"] = "[{o}]";
        _dictionaryUmlautsToEscaped["Ü"] = "[{U}]";
        _dictionaryUmlautsToEscaped["ü"] = "[{u}]";
        _dictionaryUmlautsToEscaped["Ÿ"] = "[{Y}]";
        _dictionaryUmlautsToEscaped["ÿ"] = "[{y}]";

        _dictionaryEscapedToUmlaut["[{A}]"] = "Ä";
        _dictionaryEscapedToUmlaut["[{a}]"] = "ä";
        _dictionaryEscapedToUmlaut["[{E}]"] = "Ë";
        _dictionaryEscapedToUmlaut["[{e}]"] = "ë";
        _dictionaryEscapedToUmlaut["[{I}]"] = "Ï";
        _dictionaryEscapedToUmlaut["[{i}]"] = "i";
        _dictionaryEscapedToUmlaut["[{O}]"] = "Ö";
        _dictionaryEscapedToUmlaut["[{o}]"] = "ö";
        _dictionaryEscapedToUmlaut["[{U}]"] = "Ü";
        _dictionaryEscapedToUmlaut["[{u}]"] = "ü";
        _dictionaryEscapedToUmlaut["[{Y}]"] = "Ÿ";
        _dictionaryEscapedToUmlaut["[{y}]"] = "ÿ";
    }

    /// <summary>
    /// Dumps <paramref name="input"/> followed by the result of
    /// <c>RemoveAccents(true)</c> on it.
    /// </summary>
    /// <param name="input">Sample text to display and convert.</param>
    public static void Test(this string input)
    {
        input.Dump();
        input.RemoveAccents(true).Dump();
    }

    /// <summary>
    /// Removes accents from <paramref name="input"/>.
    /// </summary>
    /// <param name="input">Text to process.</param>
    /// <param name="saveMacrons">
    /// When true, delegates to <see cref="RemoveMostDiacritics"/>, which keeps umlaut
    /// vowels and rewrites macron vowels as umlaut vowels. When false, the text is
    /// reduced to plain ASCII and every character without an ASCII form is dropped.
    /// </param>
    /// <returns>The converted text.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="input"/> is null.</exception>
    public static string RemoveAccents(this string input, bool saveMacrons)
    {
        if (input == null)
        {
            throw new ArgumentNullException("input");
        }

        if (saveMacrons)
        {
            return input.RemoveMostDiacritics();
        }

        // Ǽ/ǽ/Ǿ/ǿ have no ASCII base letter, so they are swapped for "_" up front
        // instead of being dropped by the ASCII encoder below.
        string prepared = input.Replace("Ǽ", "_").Replace("ǽ", "_").Replace("Ǿ", "_").Replace("ǿ", "_");
        string decomposed = prepared.Normalize(NormalizationForm.FormKD);

        // ASCII encoder with empty replacement fallbacks: anything that is not ASCII
        // after decomposition (combining marks included) is silently discarded.
        Encoding asciiDroppingUnmappable = Encoding.GetEncoding(
            Encoding.ASCII.CodePage,
            new EncoderReplacementFallback(""),
            new DecoderReplacementFallback(""));

        byte[] asciiBytes = asciiDroppingUnmappable.GetBytes(decomposed);
        return Encoding.ASCII.GetString(asciiBytes);
    }

    /// <summary>
    /// Strips non-spacing marks from <paramref name="text"/>, except that macron
    /// vowels listed in the macron table are replaced by their mapped value and
    /// characters listed as keys of the umlaut table are kept unchanged.
    /// </summary>
    /// <param name="text">Text to process.</param>
    /// <returns>The converted text.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="text"/> is null.</exception>
    /// <exception cref="ArgumentException">
    /// <paramref name="text"/> contains invalid Unicode (for example an unpaired
    /// surrogate); raised by <c>String.Normalize</c>.
    /// </exception>
    public static string RemoveMostDiacritics(this string text)
    {
        if (text == null)
        {
            throw new ArgumentNullException("text");
        }

        // Compose first so that canonically equivalent input (for example "a" followed
        // by U+0308) reaches the single-character tables in its precomposed form.
        string composed = text.Normalize(NormalizationForm.FormC);
        StringBuilder result = new StringBuilder(composed.Length);

        int position = 0;
        while (position < composed.Length)
        {
            // Keep surrogate pairs together: normalizing half of a pair throws.
            int width = char.IsSurrogatePair(composed, position) ? 2 : 1;
            string element = composed.Substring(position, width);
            position += width;

            string replacement;
            if (_dictionaryMacronsToUmlauts.TryGetValue(element, out replacement))
            {
                result.Append(replacement);
                continue;
            }

            if (_dictionaryUmlautsToEscaped.ContainsKey(element))
            {
                result.Append(element);
                continue;
            }

            AppendWithoutNonSpacingMarks(result, element);
        }

        return result.ToString();
    }

    // Appends the canonical decomposition of element to target, skipping every
    // code point whose Unicode category is NonSpacingMark.
    private static void AppendWithoutNonSpacingMarks(StringBuilder target, string element)
    {
        string decomposed = element.Normalize(NormalizationForm.FormD);

        int position = 0;
        while (position < decomposed.Length)
        {
            int width = char.IsSurrogatePair(decomposed, position) ? 2 : 1;
            System.Globalization.UnicodeCategory category =
                System.Globalization.CharUnicodeInfo.GetUnicodeCategory(decomposed, position);

            if (category != System.Globalization.UnicodeCategory.NonSpacingMark)
            {
                target.Append(decomposed, position, width);
            }

            position += width;
        }
    }
}