From e457c687cae28c9a1a8eec620b888eaf07a6d465 Mon Sep 17 00:00:00 2001 From: Rafael dos Santos Silva Date: Fri, 16 Jun 2023 15:15:36 -0300 Subject: [PATCH] FIX: OpenAI Tokenizer was failing to truncate mid emojis (#91) * FIX: OpenAI Tokenizer was failing to truncate mid emojis * Update spec/shared/tokenizer.rb Co-authored-by: Joffrey JAFFEUX --------- Co-authored-by: Joffrey JAFFEUX --- lib/shared/tokenizer/tokenizer.rb | 3 +++ spec/shared/tokenizer.rb | 5 +++++ 2 files changed, 8 insertions(+) diff --git a/lib/shared/tokenizer/tokenizer.rb b/lib/shared/tokenizer/tokenizer.rb index fc66c4e7..0fdcf2c9 100644 --- a/lib/shared/tokenizer/tokenizer.rb +++ b/lib/shared/tokenizer/tokenizer.rb @@ -49,6 +49,9 @@ module DiscourseAi return text if text.size < max_length tokenizer.decode(tokenize(text).take(max_length)) + rescue Tiktoken::UnicodeError + max_length = max_length - 1 + retry end end end diff --git a/spec/shared/tokenizer.rb b/spec/shared/tokenizer.rb index 9402445e..bfdf6510 100644 --- a/spec/shared/tokenizer.rb +++ b/spec/shared/tokenizer.rb @@ -76,5 +76,10 @@ describe DiscourseAi::Tokenizer::OpenAiTokenizer do sentence = "foo bar baz qux quux corge grault garply waldo fred plugh xyzzy thud" expect(described_class.truncate(sentence, 3)).to eq("foo bar baz") end + + it "truncates a sentence successfully at a multibyte unicode character" do + sentence = "foo bar 👨🏿‍👩🏿‍👧🏿‍👧🏿 baz qux quux corge grault garply waldo fred plugh xyzzy thud" + expect(described_class.truncate(sentence, 7)).to eq("foo bar 👨🏿") + end end end