FIX: strip invalid byte sequences

This commit is contained in:
Maja Komel 2019-02-26 00:12:34 +01:00
parent ed0120171c
commit 3d9981ac5c
2 changed files with 10 additions and 0 deletions

View File

@@ -27,6 +27,8 @@ class TextCleaner
end
def self.clean(text, opts = {})
# Remove invalid byte sequences
text.scrub!("")
# Replace !!!!! with a single !
text.gsub!(/!+/, '!') if opts[:deduplicate_exclamation_marks]
# Replace ????? with a single ?

View File

@@ -229,4 +229,12 @@ describe TextCleaner do
end
end
context "invalid byte sequence" do
  # Feeds TextCleaner.clean a string that ends in a stray "\x81" byte and
  # expects that byte to be dropped while the rest of the text is untouched.
  it "removes invalid bytes" do
    # "\u3042" is a valid multi-byte character; the "\x81" that follows it
    # does not form a character on its own (assuming UTF-8 source encoding),
    # which makes the whole string an invalid byte sequence.
    dirty_input = "abc\u3042\x81"
    expected_output = "abc\u3042"

    cleaned = TextCleaner.clean(dirty_input)

    expect(cleaned).to eq(expected_output)
  end
end
end