FIX: strip invalid byte sequences
This commit is contained in:
parent
ed0120171c
commit
3d9981ac5c
|
@ -27,6 +27,8 @@ class TextCleaner
|
|||
end
|
||||
|
||||
def self.clean(text, opts = {})
|
||||
# Remove invalid byte sequences
|
||||
text.scrub!("")
|
||||
# Replace !!!!! with a single !
|
||||
text.gsub!(/!+/, '!') if opts[:deduplicate_exclamation_marks]
|
||||
# Replace ????? with a single ?
|
||||
|
|
|
@ -229,4 +229,12 @@ describe TextCleaner do
|
|||
end
|
||||
end
|
||||
|
||||
# A trailing \x81 byte is not valid UTF-8 on its own; TextCleaner.clean is
# expected to return the text with that byte dropped and the rest untouched.
context "invalid byte sequence" do
  let(:dirty_text) { "abc\u3042\x81" }
  let(:expected_text) { "abc\u3042" }

  it "removes invalid bytes" do
    cleaned = TextCleaner.clean(dirty_text)
    expect(cleaned).to eq(expected_text)
  end
end
|
||||
end
|
||||
|
|
Loading…
Reference in New Issue