FIX: stop stripping zero-width whitespace
This character is used for formatting Khmer words.
This commit is contained in:
parent
d5eed540ea
commit
58c95f64d2
|
@ -44,7 +44,7 @@ class TextCleaner
|
|||
text
|
||||
end
|
||||
|
||||
@@whitespaces_regexp = Regexp.new("(\u00A0|\u1680|\u180E|[\u2000-\u200B]|\u2028|\u2029|\u202F|\u205F|\u3000|\uFEFF)", "u").freeze
|
||||
@@whitespaces_regexp = Regexp.new("(\u00A0|\u1680|\u180E|[\u2000-\u200A]|\u2028|\u2029|\u202F|\u205F|\u3000)", "u").freeze
|
||||
|
||||
def self.normalize_whitespaces(text)
|
||||
text.gsub(@@whitespaces_regexp, ' ')
|
||||
|
|
|
@ -193,10 +193,17 @@ describe TextCleaner do
|
|||
|
||||
describe "#normalize_whitespaces" do
|
||||
it "normalize whitespaces" do
|
||||
whitespaces = "\u0020\u00A0\u1680\u180E\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200A\u200B\u2028\u2029\u202F\u205F\u3000\uFEFF"
|
||||
whitespaces = "\u0020\u00A0\u1680\u180E\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200A\u2028\u2029\u202F\u205F\u3000"
|
||||
expect(whitespaces.strip).not_to eq("")
|
||||
expect(TextCleaner.normalize_whitespaces(whitespaces).strip).to eq("")
|
||||
end
|
||||
|
||||
it "does not muck with zero width white space" do
|
||||
# this is used for Khmer, don't mess with it
|
||||
expect(TextCleaner.normalize_whitespaces("hello\u200Bworld").strip).to eq("hello\u200Bworld")
|
||||
expect(TextCleaner.normalize_whitespaces("hello\uFEFFworld").strip).to eq("hello\uFEFFworld")
|
||||
|
||||
end
|
||||
end
|
||||
|
||||
end
|
||||
|
|
Loading…
Reference in New Issue