FIX: OpenAI Tokenizer was failing to truncate mid emojis (#91)
* FIX: OpenAI Tokenizer was failing to truncate mid emojis

* Update spec/shared/tokenizer.rb

Co-authored-by: Joffrey JAFFEUX <j.jaffeux@gmail.com>
This commit is contained in:
parent 9e901dbfbf
commit e457c687ca
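
For context on the diff below: when a token-level cut lands in the middle of a multi-byte character such as an emoji, decoding the truncated token array raises Tiktoken::UnicodeError, so the fix drops one token and retries until the prefix decodes cleanly. A minimal standalone sketch of that pattern, assuming the tiktoken_ruby gem and its Tiktoken.get_encoding / encode / decode API; the truncate_tokens helper and the cl100k_base encoding name are illustrative, not taken verbatim from the plugin:

    require "tiktoken_ruby"

    ENCODER = Tiktoken.get_encoding("cl100k_base")

    # Truncate text to at most max_tokens tokens. Decoding a token prefix
    # that ends mid-character raises Tiktoken::UnicodeError; drop one more
    # token and retry until the prefix decodes cleanly (the same
    # rescue/retry pattern the commit adds below).
    def truncate_tokens(text, max_tokens)
      ENCODER.decode(ENCODER.encode(text).take(max_tokens))
    rescue Tiktoken::UnicodeError
      max_tokens -= 1
      retry
    end

    puts truncate_tokens("foo bar 👨🏿👩🏿👧🏿👧🏿 baz qux", 7)
    # Expected, per the new spec: "foo bar 👨🏿"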
@@ -49,6 +49,9 @@ module DiscourseAi
           return text if text.size < max_length
 
           tokenizer.decode(tokenize(text).take(max_length))
+        rescue Tiktoken::UnicodeError
+          max_length = max_length - 1
+          retry
         end
       end
     end
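The rescue/retry added above backs off one token per attempt, so when the decoded prefix ends inside a multi-token emoji sequence the loop only runs a few extra times before yielding valid UTF-8, shrinking the prefix further only if needed. The new spec below exercises exactly that case.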
@@ -76,5 +76,10 @@ describe DiscourseAi::Tokenizer::OpenAiTokenizer do
       sentence = "foo bar baz qux quux corge grault garply waldo fred plugh xyzzy thud"
       expect(described_class.truncate(sentence, 3)).to eq("foo bar baz")
     end
+
+    it "truncates a sentence successfully at a multibyte unicode character" do
+      sentence = "foo bar 👨🏿👩🏿👧🏿👧🏿 baz qux quux corge grault garply waldo fred plugh xyzzy thud"
+      expect(described_class.truncate(sentence, 7)).to eq("foo bar 👨🏿")
+    end
   end
 end