I had a client whose users were occasionally entering data that caused some downstream XML processing to fail. Yet when viewing the XML, everything appeared correct. I was able to track down the culprit: hidden characters were being copied and pasted into a textbox along with the rest of the text.
I created some simple extension methods to help sanitize input by catching the rogue characters. You have the option to either remove the characters or replace them, if warranted.
/// <summary>
/// String extension methods that remove or replace characters which commonly
/// break downstream XML processing: Unicode control characters and characters
/// that fall outside the XML 1.0 character range.
/// </summary>
public static class StringExtensions
{
    /// <summary>
    /// Removes every character that is not a legal XML 1.0 character.
    /// Valid surrogate pairs (characters above U+FFFF) are preserved;
    /// unpaired surrogates are removed.
    /// </summary>
    /// <param name="input">The text to sanitize.</param>
    /// <returns>A copy of <paramref name="input"/> containing only XML characters.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="input"/> is null.</exception>
    public static string RemoveNonXmlCharacters(this string input)
    {
        if (input == null)
        {
            throw new ArgumentNullException("input");
        }

        return SubstituteNonXmlCharacters(input, string.Empty);
    }

    /// <summary>
    /// Removes every character for which <see cref="char.IsControl(char)"/> is true.
    /// </summary>
    /// <param name="input">The text to sanitize.</param>
    /// <returns>A copy of <paramref name="input"/> without control characters.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="input"/> is null.</exception>
    public static string RemoveControlCharacters(this string input)
    {
        if (input == null)
        {
            throw new ArgumentNullException("input");
        }

        return SubstituteControlCharacters(input, string.Empty);
    }

    /// <summary>
    /// Removes control characters and then any remaining non-XML characters.
    /// </summary>
    /// <param name="input">The text to sanitize.</param>
    /// <returns>A copy of <paramref name="input"/> with both kinds of characters removed.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="input"/> is null.</exception>
    public static string RemoveUnsafeCharacters(this string input)
    {
        return input.RemoveControlCharacters().RemoveNonXmlCharacters();
    }

    /// <summary>
    /// Replaces each control character in <paramref name="input"/> with
    /// <paramref name="replacementCharacter"/>. Control characters are stripped
    /// from the replacement first so it cannot re-introduce what is being removed.
    /// </summary>
    /// <param name="input">The text to sanitize.</param>
    /// <param name="replacementCharacter">Text substituted for each control character.</param>
    /// <returns>The sanitized text; input without control characters is returned unchanged in content.</returns>
    /// <exception cref="ArgumentNullException">Either argument is null.</exception>
    public static string ReplaceControlCharacters(this string input, string replacementCharacter)
    {
        if (replacementCharacter == null)
        {
            throw new ArgumentNullException("replacementCharacter");
        }

        if (input == null)
        {
            throw new ArgumentNullException("input");
        }

        string safeReplacement = replacementCharacter.RemoveControlCharacters();
        return SubstituteControlCharacters(input, safeReplacement);
    }

    /// <summary>
    /// Replaces each non-XML character in <paramref name="input"/> with
    /// <paramref name="replacementCharacter"/>. Non-XML characters are stripped
    /// from the replacement first. Valid surrogate pairs are left intact.
    /// </summary>
    /// <param name="input">The text to sanitize.</param>
    /// <param name="replacementCharacter">Text substituted for each non-XML character.</param>
    /// <returns>The sanitized text; input without non-XML characters is returned unchanged in content.</returns>
    /// <exception cref="ArgumentNullException">Either argument is null.</exception>
    public static string ReplaceNonXmlCharacters(this string input, string replacementCharacter)
    {
        if (replacementCharacter == null)
        {
            throw new ArgumentNullException("replacementCharacter");
        }

        if (input == null)
        {
            throw new ArgumentNullException("input");
        }

        string safeReplacement = replacementCharacter.RemoveNonXmlCharacters();
        return SubstituteNonXmlCharacters(input, safeReplacement);
    }

    /// <summary>
    /// Replaces control characters and non-XML characters in <paramref name="input"/>
    /// with <paramref name="replaceCharacter"/>.
    /// </summary>
    /// <param name="input">The text to sanitize.</param>
    /// <param name="replaceCharacter">Text substituted for each unsafe character.</param>
    /// <returns>The sanitized text.</returns>
    /// <exception cref="ArgumentNullException">Either argument is null.</exception>
    public static string ReplaceUnsafeCharacters(this string input, string replaceCharacter)
    {
        if (replaceCharacter == null)
        {
            throw new ArgumentNullException("replaceCharacter");
        }

        if (input == null)
        {
            throw new ArgumentNullException("input");
        }

        // Sanitize the replacement against BOTH rules once, up front. Otherwise a
        // replacement such as "\t" (a legal XML character that is also a control
        // character) would be inserted by the XML pass and leave a control
        // character in the supposedly safe result.
        string safeReplacement = replaceCharacter.RemoveUnsafeCharacters();
        string withoutControls = SubstituteControlCharacters(input, safeReplacement);
        return SubstituteNonXmlCharacters(withoutControls, safeReplacement);
    }

    // Copies text, emitting replacement in place of each control character.
    // A per-character scan is used instead of String.Split because Split treats
    // an empty separator array as "split on whitespace", which would mangle
    // input that contains no control characters at all.
    private static string SubstituteControlCharacters(string text, string replacement)
    {
        System.Text.StringBuilder result = new System.Text.StringBuilder(text.Length);
        foreach (char current in text)
        {
            if (char.IsControl(current))
            {
                result.Append(replacement);
            }
            else
            {
                result.Append(current);
            }
        }

        return result.ToString();
    }

    // Copies text, emitting replacement in place of each non-XML character.
    private static string SubstituteNonXmlCharacters(string text, string replacement)
    {
        System.Text.StringBuilder result = new System.Text.StringBuilder(text.Length);
        for (int i = 0; i < text.Length; i++)
        {
            char current = text[i];
            if (XmlConvert.IsXmlChar(current))
            {
                result.Append(current);
            }
            else if (i + 1 < text.Length && char.IsSurrogatePair(current, text[i + 1]))
            {
                // XmlConvert.IsXmlChar(char) rejects every lone surrogate, but a
                // well-formed high/low pair encodes U+10000..U+10FFFF, which XML 1.0
                // allows. Keep the pair together and skip over the low surrogate.
                result.Append(current).Append(text[i + 1]);
                i++;
            }
            else
            {
                result.Append(replacement);
            }
        }

        return result.ToString();
    }
}