2023-04-19 10:55:59 -04:00
|
|
|
# frozen_string_literal: true
|
|
|
|
|
|
|
|
require "rails_helper"
|
|
|
|
|
2023-05-15 14:10:42 -04:00
|
|
|
# Exercises the class-level API of the BERT tokenizer: `size`, `tokenizer`
# and `truncate`. The expected numbers and strings are pinned values for this
# specific tokenizer.
RSpec.describe DiscourseAi::Tokenizer::BertTokenizer do
  describe "#size" do
    describe "returns a token count" do
      it "for a single word" do
        expect(described_class.size("hello")).to eq(3)
      end

      it "for a sentence" do
        expect(described_class.size("hello world")).to eq(4)
      end

      it "for a sentence with punctuation" do
        expect(described_class.size("hello, world!")).to eq(6)
      end

      it "for a sentence with punctuation and capitalization" do
        expect(described_class.size("Hello, World!")).to eq(6)
      end

      it "for a sentence with punctuation and capitalization and numbers" do
        expect(described_class.size("Hello, World! 123")).to eq(7)
      end
    end
  end

  describe "#tokenizer" do
    it "returns a tokenizer" do
      expect(described_class.tokenizer).to be_a(Tokenizers::Tokenizer)
    end

    it "returns the same tokenizer" do
      # `be` asserts object identity (`equal?`), which is what "the same"
      # promises; `eq` would only compare the two results with `==`.
      # NOTE(review): presumes `.tokenizer` caches its instance — confirm
      # against the implementation.
      first_call = described_class.tokenizer

      expect(described_class.tokenizer).to be(first_call)
    end
  end

  describe "#truncate" do
    it "truncates a sentence" do
      sentence = "foo bar baz qux quux corge grault garply waldo fred plugh xyzzy thud"

      expect(described_class.truncate(sentence, 3)).to eq("foo bar")
    end
  end
end
|
|
|
|
|
|
|
|
# Token counting and truncation checks for the Anthropic tokenizer.
RSpec.describe DiscourseAi::Tokenizer::AnthropicTokenizer do
  describe "#size" do
    context "returns a token count" do
      it "for a sentence with punctuation and capitalization and numbers" do
        token_count = described_class.size("Hello, World! 123")

        expect(token_count).to eq(5)
      end
    end
  end

  describe "#truncate" do
    let(:sentence) { "foo bar baz qux quux corge grault garply waldo fred plugh xyzzy thud" }

    it "truncates a sentence" do
      truncated = described_class.truncate(sentence, 3)

      expect(truncated).to eq("foo bar baz")
    end
  end
end
|
|
|
|
|
|
|
|
# Covers the OpenAI tokenizer's `size`, `truncate` and `can_expand_tokens?`
# class methods, including inputs with emoji and CJK characters.
RSpec.describe DiscourseAi::Tokenizer::OpenAiTokenizer do
  describe "#size" do
    context "returns a token count" do
      it "for a sentence with punctuation and capitalization and numbers" do
        token_count = described_class.size("Hello, World! 123")

        expect(token_count).to eq(6)
      end
    end
  end

  describe "#truncate" do
    it "truncates a sentence" do
      text = "foo bar baz qux quux corge grault garply waldo fred plugh xyzzy thud"
      truncated = described_class.truncate(text, 3)

      expect(truncated).to eq("foo bar baz")
    end

    it "truncates a sentence successfully at a multibyte unicode character" do
      text = "foo bar 👨🏿👩🏿👧🏿👧🏿 baz qux quux corge grault garply waldo fred plugh xyzzy thud"
      truncated = described_class.truncate(text, 7)

      expect(truncated).to eq("foo bar 👨🏿")
    end

    it "truncates unicode characters properly when they use more than one token per char" do
      text = "我喜欢吃比萨"
      full_size = described_class.size(text)

      # Asking for one token fewer than the full size must yield a strictly
      # smaller token count once the result is measured again.
      shortened = described_class.truncate(text, full_size - 1)

      expect(described_class.size(shortened)).to be < full_size
    end
  end

  describe "#can_expand_tokens?" do
    it "returns true when the tokens can be expanded" do
      expandable = described_class.can_expand_tokens?("foo bar", "baz qux", 6)

      expect(expandable).to eq(true)
    end

    it "returns false when the tokens cannot be expanded" do
      expandable = described_class.can_expand_tokens?("foo bar", "baz qux", 3)

      expect(expandable).to eq(false)
    end

    it "returns false when the tokens cannot be expanded due to multibyte unicode characters" do
      expandable = described_class.can_expand_tokens?("foo bar 👨🏿", "baz qux", 6)

      expect(expandable).to eq(false)
    end

    it "handles unicode characters properly when they use more than one token per char" do
      expandable = described_class.can_expand_tokens?("我喜欢吃比萨", "萨", 10)

      expect(expandable).to eq(false)
    end
  end
end
|
2023-07-14 10:37:21 -04:00
|
|
|
|
|
|
|
# Token counting and truncation checks for the all-mpnet-base-v2 tokenizer.
RSpec.describe DiscourseAi::Tokenizer::AllMpnetBaseV2Tokenizer do
  describe "#size" do
    context "returns a token count" do
      it "for a sentence with punctuation and capitalization and numbers" do
        token_count = described_class.size("Hello, World! 123")

        expect(token_count).to eq(7)
      end
    end
  end

  describe "#truncate" do
    let(:sentence) { "foo bar baz qux quux corge grault garply waldo fred plugh xyzzy thud" }

    it "truncates a sentence" do
      truncated = described_class.truncate(sentence, 3)

      expect(truncated).to eq("foo bar")
    end
  end
end
|
2023-07-27 12:55:32 -04:00
|
|
|
|
2023-07-27 14:50:03 -04:00
|
|
|
# Token counting and truncation checks for the multilingual-e5-large tokenizer.
RSpec.describe DiscourseAi::Tokenizer::MultilingualE5LargeTokenizer do
  describe "#size" do
    context "returns a token count" do
      it "for a sentence with punctuation and capitalization and numbers" do
        token_count = described_class.size("Hello, World! 123")

        expect(token_count).to eq(7)
      end
    end
  end

  describe "#truncate" do
    let(:sentence) { "foo bar baz qux quux corge grault garply waldo fred plugh xyzzy thud" }

    it "truncates a sentence" do
      truncated = described_class.truncate(sentence, 3)

      expect(truncated).to eq("foo")
    end
  end
end
|
2023-10-04 12:47:51 -04:00
|
|
|
|
|
|
|
# Token counting and truncation checks for the bge-large-en tokenizer.
RSpec.describe DiscourseAi::Tokenizer::BgeLargeEnTokenizer do
  describe "#size" do
    context "returns a token count" do
      it "for a sentence with punctuation and capitalization and numbers" do
        token_count = described_class.size("Hello, World! 123")

        expect(token_count).to eq(7)
      end
    end
  end

  describe "#truncate" do
    let(:sentence) { "foo bar baz qux quux corge grault garply waldo fred plugh xyzzy thud" }

    it "truncates a sentence" do
      truncated = described_class.truncate(sentence, 3)

      expect(truncated).to eq("foo bar")
    end
  end
end
|
2024-04-10 16:24:01 -04:00
|
|
|
|
|
|
|
# Covers the bge-m3 tokenizer's `size` and `truncate` class methods,
# including inputs with emoji and CJK characters.
RSpec.describe DiscourseAi::Tokenizer::BgeM3Tokenizer do
  describe "#size" do
    context "returns a token count" do
      it "for a sentence with punctuation and capitalization and numbers" do
        token_count = described_class.size("Hello, World! 123")

        expect(token_count).to eq(7)
      end
    end
  end

  describe "#truncate" do
    it "truncates a sentence" do
      text = "foo bar baz qux quux corge grault garply waldo fred plugh xyzzy thud"
      truncated = described_class.truncate(text, 3)

      expect(truncated).to eq("foo")
    end

    it "truncates a sentence successfully at a multibyte unicode character" do
      text = "foo bar 👨🏿👩🏿👧🏿👧🏿 baz qux quux corge grault garply waldo fred plugh xyzzy thud"
      truncated = described_class.truncate(text, 7)

      expect(truncated).to eq("foo bar 👨🏿")
    end

    it "truncates unicode characters properly when they use more than one token per char" do
      text = "我喜欢吃比萨"
      full_size = described_class.size(text)

      # Asking for two tokens fewer than the full size must yield a strictly
      # smaller token count once the result is measured again.
      shortened = described_class.truncate(text, full_size - 2)

      expect(described_class.size(shortened)).to be < full_size
    end
  end
end
|
2024-05-13 11:45:52 -04:00
|
|
|
|
|
|
|
# Covers the Llama3 tokenizer's `size` and `truncate` class methods. The
# emoji truncation example is kept below as a comment because it is disabled.
RSpec.describe DiscourseAi::Tokenizer::Llama3Tokenizer do
  describe "#size" do
    context "returns a token count" do
      it "for a sentence with punctuation and capitalization and numbers" do
        token_count = described_class.size("Hello, World! 123")

        expect(token_count).to eq(7)
      end
    end
  end

  describe "#truncate" do
    it "truncates a sentence" do
      text = "foo bar baz qux quux corge grault garply waldo fred plugh xyzzy thud"
      truncated = described_class.truncate(text, 3)

      expect(truncated).to eq("foo bar")
    end

    # Llama3 fails here, so this example stays disabled:
    # it "truncates a sentence successfully at a multibyte unicode character" do
    #   sentence = "foo bar 👨🏿👩🏿👧🏿👧🏿 baz qux quux corge grault garply waldo fred plugh xyzzy thud"
    #   expect(described_class.truncate(sentence, 8)).to eq("foo bar 👨🏿")
    # end

    it "truncates unicode characters properly when they use more than one token per char" do
      text = "我喜欢吃比萨"
      full_size = described_class.size(text)

      # Asking for two tokens fewer than the full size must yield a strictly
      # smaller token count once the result is measured again.
      shortened = described_class.truncate(text, full_size - 2)

      expect(described_class.size(shortened)).to be < full_size
    end
  end
end
|