mirror of
https://github.com/discourse/discourse-ai.git
synced 2025-02-07 12:08:13 +00:00
* FIX/REFACTOR: FoldContent revamp We hit a snag with our hot topic gist strategy: the regex we used to split the content didn't work, so we cannot send the original post separately. This was important for letting the model focus on what's new in the topic. The algorithm doesn't give us full control over how prompts are written, and figuring out how to format the content isn't straightforward. This means we're having to use more complicated workarounds, like regex. To tackle this, I'm suggesting we simplify the approach a bit. Let's focus on summarizing as much as we can upfront, then gradually add new content until there's nothing left to summarize. Also, the "extend" part is mostly for models with small context windows, which shouldn't pose a problem 99% of the time with the content volume we're dealing with. * Fix fold docs * Use #shift instead of #pop to get the first elem, not the last
231 lines
7.6 KiB
Ruby
231 lines
7.6 KiB
Ruby
# frozen_string_literal: true
|
|
|
|
require "rails_helper"
|
|
|
|
describe DiscourseAi::Tokenizer::BertTokenizer do
  describe "#size" do
    describe "returns a token count" do
      # NOTE(review): counts exceed the word count even for a single word —
      # presumably BERT special tokens are included; confirm against tokenizer.
      it "for a single word" do
        token_count = described_class.size("hello")
        expect(token_count).to eq(3)
      end

      it "for a sentence" do
        token_count = described_class.size("hello world")
        expect(token_count).to eq(4)
      end

      it "for a sentence with punctuation" do
        token_count = described_class.size("hello, world!")
        expect(token_count).to eq(6)
      end

      it "for a sentence with punctuation and capitalization" do
        token_count = described_class.size("Hello, World!")
        expect(token_count).to eq(6)
      end

      it "for a sentence with punctuation and capitalization and numbers" do
        token_count = described_class.size("Hello, World! 123")
        expect(token_count).to eq(7)
      end
    end
  end

  describe "#tokenizer" do
    it "returns a tokenizer" do
      expect(described_class.tokenizer).to be_a(Tokenizers::Tokenizer)
    end

    it "returns the same tokenizer" do
      # Two consecutive calls must hand back an equal tokenizer instance.
      expect(described_class.tokenizer).to eq(described_class.tokenizer)
    end
  end

  describe "#truncate" do
    let(:sentence) { "foo bar baz qux quux corge grault garply waldo fred plugh xyzzy thud" }

    it "truncates a sentence" do
      # A 3-token budget keeps only the first two words for this tokenizer.
      expect(described_class.truncate(sentence, 3)).to eq("foo bar")
    end
  end
end
|
|
|
|
describe DiscourseAi::Tokenizer::AnthropicTokenizer do
  describe "#size" do
    describe "returns a token count" do
      it "for a sentence with punctuation and capitalization and numbers" do
        token_count = described_class.size("Hello, World! 123")
        expect(token_count).to eq(5)
      end
    end
  end

  describe "#truncate" do
    let(:sentence) { "foo bar baz qux quux corge grault garply waldo fred plugh xyzzy thud" }

    it "truncates a sentence" do
      # A 3-token budget keeps the first three words for this tokenizer.
      expect(described_class.truncate(sentence, 3)).to eq("foo bar baz")
    end
  end
end
|
|
|
|
describe DiscourseAi::Tokenizer::OpenAiTokenizer do
  describe "#size" do
    describe "returns a token count" do
      it "for a sentence with punctuation and capitalization and numbers" do
        token_count = described_class.size("Hello, World! 123")
        expect(token_count).to eq(6)
      end
    end
  end

  describe "#truncate" do
    let(:sentence) { "foo bar baz qux quux corge grault garply waldo fred plugh xyzzy thud" }

    it "truncates a sentence" do
      expect(described_class.truncate(sentence, 3)).to eq("foo bar baz")
    end

    it "truncates a sentence successfully at a multibyte unicode character" do
      # Truncation must not cut through the middle of a multibyte grapheme.
      emoji_sentence = "foo bar 👨🏿👩🏿👧🏿👧🏿 baz qux quux corge grault garply waldo fred plugh xyzzy thud"
      expect(described_class.truncate(emoji_sentence, 7)).to eq("foo bar 👨🏿")
    end

    it "truncates unicode characters properly when they use more than one token per char" do
      cjk_sentence = "我喜欢吃比萨"
      original_size = described_class.size(cjk_sentence)
      truncated = described_class.truncate(cjk_sentence, original_size - 1)
      # Dropping even one token from the budget must shrink the result.
      expect(described_class.size(truncated)).to be < original_size
    end
  end

  describe "#below_limit?" do
    it "returns true when the tokens can be expanded" do
      expect(described_class.below_limit?("foo bar baz qux", 6)).to eq(true)
    end

    it "returns false when the tokens cannot be expanded" do
      expect(described_class.below_limit?("foo bar baz qux", 3)).to eq(false)
    end

    it "returns false when the tokens cannot be expanded due to multibyte unicode characters" do
      expect(described_class.below_limit?("foo bar 👨🏿 baz qux", 6)).to eq(false)
    end

    it "handles unicode characters properly when they use more than one token per char" do
      expect(described_class.below_limit?("我喜欢吃比萨萨", 10)).to eq(false)
    end
  end
end
|
|
|
|
describe DiscourseAi::Tokenizer::OpenAiGpt4oTokenizer do
  describe "#size" do
    describe "returns a token count" do
      it "for a sentence with punctuation and capitalization and numbers" do
        token_count = described_class.size("Hello, World! 123")
        expect(token_count).to eq(6)
      end
    end
  end
end
|
|
|
|
describe DiscourseAi::Tokenizer::AllMpnetBaseV2Tokenizer do
  describe "#size" do
    describe "returns a token count" do
      it "for a sentence with punctuation and capitalization and numbers" do
        token_count = described_class.size("Hello, World! 123")
        expect(token_count).to eq(7)
      end
    end
  end

  describe "#truncate" do
    let(:sentence) { "foo bar baz qux quux corge grault garply waldo fred plugh xyzzy thud" }

    it "truncates a sentence" do
      # A 3-token budget keeps only the first two words for this tokenizer.
      expect(described_class.truncate(sentence, 3)).to eq("foo bar")
    end
  end
end
|
|
|
|
describe DiscourseAi::Tokenizer::MultilingualE5LargeTokenizer do
  describe "#size" do
    describe "returns a token count" do
      it "for a sentence with punctuation and capitalization and numbers" do
        token_count = described_class.size("Hello, World! 123")
        expect(token_count).to eq(7)
      end
    end
  end

  describe "#truncate" do
    let(:sentence) { "foo bar baz qux quux corge grault garply waldo fred plugh xyzzy thud" }

    it "truncates a sentence" do
      # A 3-token budget keeps only the first word for this tokenizer.
      expect(described_class.truncate(sentence, 3)).to eq("foo")
    end
  end
end
|
|
|
|
describe DiscourseAi::Tokenizer::BgeLargeEnTokenizer do
  describe "#size" do
    describe "returns a token count" do
      it "for a sentence with punctuation and capitalization and numbers" do
        token_count = described_class.size("Hello, World! 123")
        expect(token_count).to eq(7)
      end
    end
  end

  describe "#truncate" do
    let(:sentence) { "foo bar baz qux quux corge grault garply waldo fred plugh xyzzy thud" }

    it "truncates a sentence" do
      # A 3-token budget keeps only the first two words for this tokenizer.
      expect(described_class.truncate(sentence, 3)).to eq("foo bar")
    end
  end
end
|
|
|
|
describe DiscourseAi::Tokenizer::BgeM3Tokenizer do
  describe "#size" do
    describe "returns a token count" do
      it "for a sentence with punctuation and capitalization and numbers" do
        token_count = described_class.size("Hello, World! 123")
        expect(token_count).to eq(7)
      end
    end
  end

  describe "#truncate" do
    let(:sentence) { "foo bar baz qux quux corge grault garply waldo fred plugh xyzzy thud" }

    it "truncates a sentence" do
      # A 3-token budget keeps only the first word for this tokenizer.
      expect(described_class.truncate(sentence, 3)).to eq("foo")
    end

    it "truncates a sentence successfully at a multibyte unicode character" do
      # Truncation must not cut through the middle of a multibyte grapheme.
      emoji_sentence = "foo bar 👨🏿👩🏿👧🏿👧🏿 baz qux quux corge grault garply waldo fred plugh xyzzy thud"
      expect(described_class.truncate(emoji_sentence, 7)).to eq("foo bar 👨🏿")
    end

    it "truncates unicode characters properly when they use more than one token per char" do
      cjk_sentence = "我喜欢吃比萨"
      original_size = described_class.size(cjk_sentence)
      truncated = described_class.truncate(cjk_sentence, original_size - 2)
      # Shrinking the budget must shrink the tokenized result.
      expect(described_class.size(truncated)).to be < original_size
    end
  end
end
|
|
|
|
describe DiscourseAi::Tokenizer::Llama3Tokenizer do
  describe "#size" do
    describe "returns a token count" do
      it "for a sentence with punctuation and capitalization and numbers" do
        token_count = described_class.size("Hello, World! 123")
        expect(token_count).to eq(7)
      end
    end
  end

  describe "#truncate" do
    let(:sentence) { "foo bar baz qux quux corge grault garply waldo fred plugh xyzzy thud" }

    it "truncates a sentence" do
      # A 3-token budget keeps only the first two words for this tokenizer.
      expect(described_class.truncate(sentence, 3)).to eq("foo bar")
    end

    # Llama3 fails here
    # it "truncates a sentence successfully at a multibyte unicode character" do
    #   sentence = "foo bar 👨🏿👩🏿👧🏿👧🏿 baz qux quux corge grault garply waldo fred plugh xyzzy thud"
    #   expect(described_class.truncate(sentence, 8)).to eq("foo bar 👨🏿")
    # end

    it "truncates unicode characters properly when they use more than one token per char" do
      cjk_sentence = "我喜欢吃比萨"
      original_size = described_class.size(cjk_sentence)
      truncated = described_class.truncate(cjk_sentence, original_size - 2)
      # Shrinking the budget must shrink the tokenized result.
      expect(described_class.size(truncated)).to be < original_size
    end
  end
end
|