From b82074850e64704c3b97fb2ecf9a77f308fb1315 Mon Sep 17 00:00:00 2001
From: Rafael dos Santos Silva
Date: Fri, 14 Jul 2023 11:37:21 -0300
Subject: [PATCH] DEV: Add tests to allmpnet tokenizer (#107)

* DEV: Add tests to allmpnet tokenizer

* lint
---
 spec/shared/{tokenizer.rb => tokenizer_spec.rb} | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)
 rename spec/shared/{tokenizer.rb => tokenizer_spec.rb} (83%)

diff --git a/spec/shared/tokenizer.rb b/spec/shared/tokenizer_spec.rb
similarity index 83%
rename from spec/shared/tokenizer.rb
rename to spec/shared/tokenizer_spec.rb
index bfdf6510..47ab900f 100644
--- a/spec/shared/tokenizer.rb
+++ b/spec/shared/tokenizer_spec.rb
@@ -83,3 +83,20 @@ describe DiscourseAi::Tokenizer::OpenAiTokenizer do
     end
   end
 end
+
+describe DiscourseAi::Tokenizer::AllMpnetBaseV2Tokenizer do
+  describe "#size" do
+    describe "returns a token count" do
+      it "for a sentence with punctuation and capitalization and numbers" do
+        expect(described_class.size("Hello, World! 123")).to eq(7)
+      end
+    end
+  end
+
+  describe "#truncate" do
+    it "truncates a sentence" do
+      sentence = "foo bar baz qux quux corge grault garply waldo fred plugh xyzzy thud"
+      expect(described_class.truncate(sentence, 3)).to eq("foo bar")
+    end
+  end
+end