FIX: strip invalid byte sequences
This commit is contained in:
parent
ed0120171c
commit
3d9981ac5c
|
@@ -27,6 +27,8 @@ class TextCleaner
|
||||||
end
|
end
|
||||||
|
|
||||||
def self.clean(text, opts = {})
|
def self.clean(text, opts = {})
|
||||||
|
# Remove invalid byte sequences
|
||||||
|
text.scrub!("")
|
||||||
# Replace !!!!! with a single !
|
# Replace !!!!! with a single !
|
||||||
text.gsub!(/!+/, '!') if opts[:deduplicate_exclamation_marks]
|
text.gsub!(/!+/, '!') if opts[:deduplicate_exclamation_marks]
|
||||||
# Replace ????? with a single ?
|
# Replace ????? with a single ?
|
||||||
|
|
|
@@ -229,4 +229,12 @@ describe TextCleaner do
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
|
context "invalid byte sequence" do
|
||||||
|
let(:with_invalid_bytes) { "abc\u3042\x81" }
|
||||||
|
let(:without_invalid_bytes) { "abc\u3042" }
|
||||||
|
|
||||||
|
it "removes invalid bytes" do
|
||||||
|
expect(TextCleaner.clean(with_invalid_bytes)).to eq(without_invalid_bytes)
|
||||||
|
end
|
||||||
|
end
|
||||||
end
|
end
|
||||||
|
|
Loading…
Reference in New Issue