FIX: OpenAI Tokenizer was failing to truncate mid emojis (#91)

* FIX: OpenAI Tokenizer was failing to truncate mid emojis

* Update spec/shared/tokenizer.rb

Co-authored-by: Joffrey JAFFEUX <j.jaffeux@gmail.com>

---------

Co-authored-by: Joffrey JAFFEUX <j.jaffeux@gmail.com>
This commit is contained in:
Rafael dos Santos Silva 2023-06-16 15:15:36 -03:00 committed by GitHub
parent 9e901dbfbf
commit e457c687ca
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 8 additions and 0 deletions

View File

@@ -49,6 +49,9 @@ module DiscourseAi
      return text if text.size < max_length
      tokenizer.decode(tokenize(text).take(max_length))
    rescue Tiktoken::UnicodeError
      max_length = max_length - 1
      retry
    end
  end
end

View File

@@ -76,5 +76,10 @@ describe DiscourseAi::Tokenizer::OpenAiTokenizer do
      sentence = "foo bar baz qux quux corge grault garply waldo fred plugh xyzzy thud"
      expect(described_class.truncate(sentence, 3)).to eq("foo bar baz")
    end

    it "truncates a sentence successfully at a multibyte unicode character" do
      sentence = "foo bar 👨🏿‍👩🏿‍👧🏿‍👧🏿 baz qux quux corge grault garply waldo fred plugh xyzzy thud"
      expect(described_class.truncate(sentence, 7)).to eq("foo bar 👨🏿")
    end
  end
end