I had a client whose users were occasionally doing data entry that caused some downstream XML processing to fail. Yet when viewing the XML, everything appeared correct. I was able to track down the culprit: hidden characters that were being copied and pasted into a textbox along with the rest of the text.
I created some simple extension methods to help sanitize input by catching the rogue characters. You have the option to either remove the characters or replace them, if warranted.
/// <summary>
/// Extension methods that sanitize text by removing or replacing characters that
/// commonly break XML processing: Unicode control characters and characters that
/// are not legal in XML.
/// </summary>
/// <remarks>
/// A well-formed UTF-16 surrogate pair (for example an emoji) is always kept intact.
/// <see cref="XmlConvert.IsXmlChar(char)"/> rejects the two halves individually, but
/// together they encode a supplementary-plane code point, which XML allows.
/// </remarks>
public static class StringExtensions
{
    /// <summary>Removes every character that is not a legal XML character.</summary>
    /// <param name="input">The text to sanitize.</param>
    /// <returns>A copy of <paramref name="input"/> without non-XML characters.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="input"/> is null.</exception>
    public static string RemoveNonXmlCharacters(this string input)
    {
        if (input == null)
        {
            throw new ArgumentNullException(nameof(input));
        }

        return Rewrite(input, IsNonXml, string.Empty);
    }

    /// <summary>Removes every Unicode control character (this includes tab, CR and LF).</summary>
    /// <param name="input">The text to sanitize.</param>
    /// <returns>A copy of <paramref name="input"/> without control characters.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="input"/> is null.</exception>
    public static string RemoveControlCharacters(this string input)
    {
        if (input == null)
        {
            throw new ArgumentNullException(nameof(input));
        }

        return Rewrite(input, char.IsControl, string.Empty);
    }

    /// <summary>Removes both control characters and non-XML characters.</summary>
    /// <param name="input">The text to sanitize.</param>
    /// <returns>A copy of <paramref name="input"/> without either kind of character.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="input"/> is null.</exception>
    public static string RemoveUnsafeCharacters(this string input)
    {
        if (input == null)
        {
            throw new ArgumentNullException(nameof(input));
        }

        return Rewrite(input, IsUnsafe, string.Empty);
    }

    /// <summary>Replaces each control character with the given replacement text.</summary>
    /// <param name="input">The text to sanitize.</param>
    /// <param name="replacementCharacter">
    /// Text substituted for each control character. Control characters inside the
    /// replacement itself are stripped before it is used.
    /// </param>
    /// <returns>The sanitized text; all other characters, including whitespace, are untouched.</returns>
    /// <exception cref="ArgumentNullException">Either argument is null.</exception>
    public static string ReplaceControlCharacters(this string input, string replacementCharacter)
    {
        if (input == null)
        {
            throw new ArgumentNullException(nameof(input));
        }

        if (replacementCharacter == null)
        {
            throw new ArgumentNullException(nameof(replacementCharacter));
        }

        string sanitizedReplacement = Rewrite(replacementCharacter, char.IsControl, string.Empty);
        return Rewrite(input, char.IsControl, sanitizedReplacement);
    }

    /// <summary>Replaces each non-XML character with the given replacement text.</summary>
    /// <param name="input">The text to sanitize.</param>
    /// <param name="replacementCharacter">
    /// Text substituted for each non-XML character. Non-XML characters inside the
    /// replacement itself are stripped before it is used.
    /// </param>
    /// <returns>The sanitized text; all other characters, including whitespace, are untouched.</returns>
    /// <exception cref="ArgumentNullException">Either argument is null.</exception>
    public static string ReplaceNonXmlCharacters(this string input, string replacementCharacter)
    {
        if (input == null)
        {
            throw new ArgumentNullException(nameof(input));
        }

        if (replacementCharacter == null)
        {
            throw new ArgumentNullException(nameof(replacementCharacter));
        }

        string sanitizedReplacement = Rewrite(replacementCharacter, IsNonXml, string.Empty);
        return Rewrite(input, IsNonXml, sanitizedReplacement);
    }

    /// <summary>Replaces each control character and each non-XML character with the given replacement text.</summary>
    /// <param name="input">The text to sanitize.</param>
    /// <param name="replaceCharacter">
    /// Text substituted for each offending character. It is fully sanitized first
    /// (control and non-XML characters removed) so the result never contains an
    /// unsafe character.
    /// </param>
    /// <returns>The sanitized text; all other characters, including whitespace, are untouched.</returns>
    /// <exception cref="ArgumentNullException">Either argument is null.</exception>
    public static string ReplaceUnsafeCharacters(this string input, string replaceCharacter)
    {
        if (input == null)
        {
            throw new ArgumentNullException(nameof(input));
        }

        if (replaceCharacter == null)
        {
            throw new ArgumentNullException(nameof(replaceCharacter));
        }

        string sanitizedReplacement = Rewrite(replaceCharacter, IsUnsafe, string.Empty);
        return Rewrite(input, IsUnsafe, sanitizedReplacement);
    }

    // True for a single UTF-16 code unit that is not a legal XML character.
    private static bool IsNonXml(char c)
    {
        return !XmlConvert.IsXmlChar(c);
    }

    // True for a code unit that is a control character or not a legal XML character.
    private static bool IsUnsafe(char c)
    {
        return char.IsControl(c) || !XmlConvert.IsXmlChar(c);
    }

    // Copies input in one pass, substituting replacement for every code unit matched by
    // isUnwanted. Done character by character rather than via Split/Join because
    // String.Split treats an empty separator array as "split on whitespace", which
    // would corrupt inputs that contain no offending characters at all.
    private static string Rewrite(string input, Func<char, bool> isUnwanted, string replacement)
    {
        var builder = new System.Text.StringBuilder(input.Length);

        for (int i = 0; i < input.Length; i++)
        {
            // Keep a well-formed surrogate pair as a unit; only lone surrogates fall
            // through to the predicate below.
            if (char.IsSurrogatePair(input, i))
            {
                builder.Append(input[i]).Append(input[i + 1]);
                i++;
                continue;
            }

            char current = input[i];
            if (isUnwanted(current))
            {
                builder.Append(replacement);
            }
            else
            {
                builder.Append(current);
            }
        }

        return builder.ToString();
    }
}