FIX: stop stripping zero-width-whitespace

This character is used for formatting Khmer words.
This commit is contained in:
Sam 2015-03-27 13:01:31 +11:00
parent d5eed540ea
commit 58c95f64d2
2 changed files with 9 additions and 2 deletions

View File

@ -44,7 +44,7 @@ class TextCleaner
text
end
# Pattern of characters that normalize_whitespaces replaces with a plain space.
# NOTE(review): the variable is assigned twice in a row, so only the second
# pattern takes effect; the first assignment looks like the pre-change line of
# a diff rendering — confirm before relying on it.
@@whitespaces_regexp = Regexp.new("(\u00A0|\u1680|\u180E|[\u2000-\u200B]|\u2028|\u2029|\u202F|\u205F|\u3000|\uFEFF)", "u").freeze
# Effective pattern: compared with the line above, the range stops at \u200A
# and \uFEFF is no longer listed, so U+200B and U+FEFF are not matched.
@@whitespaces_regexp = Regexp.new("(\u00A0|\u1680|\u180E|[\u2000-\u200A]|\u2028|\u2029|\u202F|\u205F|\u3000)", "u").freeze
def self.normalize_whitespaces(text)
text.gsub(@@whitespaces_regexp, ' ')

View File

@ -193,10 +193,17 @@ describe TextCleaner do
# Specs for TextCleaner.normalize_whitespaces.
describe "#normalize_whitespaces" do
# After normalization, String#strip must be able to remove every character
# of the sample string.
it "normalize whitespaces" do
# NOTE(review): this first assignment is overwritten by the next line and so
# has no effect; it looks like the pre-change line of a diff rendering — confirm.
whitespaces = "\u0020\u00A0\u1680\u180E\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200A\u200B\u2028\u2029\u202F\u205F\u3000\uFEFF"
# Effective sample: the same characters minus \u200B and \uFEFF.
whitespaces = "\u0020\u00A0\u1680\u180E\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200A\u2028\u2029\u202F\u205F\u3000"
# Precondition: strip alone does not reduce the raw sample to an empty string.
expect(whitespaces.strip).not_to eq("")
# Once normalized, strip leaves nothing behind.
expect(TextCleaner.normalize_whitespaces(whitespaces).strip).to eq("")
end
# U+200B and U+FEFF must pass through normalization unchanged.
it "does not muck with zero width white space" do
# this is used for Khmer; don't mess with it
expect(TextCleaner.normalize_whitespaces("hello\u200Bworld").strip).to eq("hello\u200Bworld")
expect(TextCleaner.normalize_whitespaces("hello\uFEFFworld").strip).to eq("hello\uFEFFworld")
end
end
end