discourse-ai/spec/shared/tokenizer_spec.rb

# frozen_string_literal: true

require "rails_helper"

# Specs for the BERT tokenizer's class-level API: counting tokens, exposing the
# underlying tokenizer object, and truncating text to a token budget.
describe DiscourseAi::Tokenizer::BertTokenizer do
  describe "#size" do
    describe "returns a token count" do
      # Example description => [input text, expected token count].
      # NOTE(review): a lone word counting as 3 tokens suggests the count includes
      # special tokens added by the tokenizer -- confirm against the implementation.
      {
        "for a single word" => ["hello", 3],
        "for a sentence" => ["hello world", 4],
        "for a sentence with punctuation" => ["hello, world!", 6],
        "for a sentence with punctuation and capitalization" => ["Hello, World!", 6],
        "for a sentence with punctuation and capitalization and numbers" => ["Hello, World! 123", 7],
      }.each do |description, (text, expected_count)|
        it description do
          expect(described_class.size(text)).to eq(expected_count)
        end
      end
    end
  end

  describe "#tokenizer" do
    it "returns a tokenizer" do
      expect(described_class.tokenizer).to be_a(Tokenizers::Tokenizer)
    end

    it "returns the same tokenizer" do
      # `be` asserts object identity, so this example fails if a new tokenizer
      # instance is built on every call instead of the same one being reused.
      expect(described_class.tokenizer).to be(described_class.tokenizer)
    end
  end

  describe "#truncate" do
    it "truncates a sentence" do
      sentence = "foo bar baz qux quux corge grault garply waldo fred plugh xyzzy thud"
      # A budget of 3 keeps only the first two words for this tokenizer.
      expect(described_class.truncate(sentence, 3)).to eq("foo bar")
    end
  end
end

# Specs for the Anthropic tokenizer: token counting and token-based truncation.
describe DiscourseAi::Tokenizer::AnthropicTokenizer do
  describe "#size" do
    describe "returns a token count" do
      it "for a sentence with punctuation and capitalization and numbers" do
        token_count = described_class.size("Hello, World! 123")

        expect(token_count).to eq(5)
      end
    end
  end

  describe "#truncate" do
    # Long enough that a budget of 3 must cut it short.
    let(:long_text) { "foo bar baz qux quux corge grault garply waldo fred plugh xyzzy thud" }

    it "truncates a sentence" do
      shortened = described_class.truncate(long_text, 3)

      expect(shortened).to eq("foo bar baz")
    end
  end
end

# Specs for the OpenAI tokenizer: token counting, truncation (including text
# containing emoji and CJK characters), and the `below_limit?` predicate.
describe DiscourseAi::Tokenizer::OpenAiTokenizer do
  describe "#size" do
    describe "returns a token count" do
      it "for a sentence with punctuation and capitalization and numbers" do
        token_count = described_class.size("Hello, World! 123")

        expect(token_count).to eq(6)
      end
    end
  end

  describe "#truncate" do
    it "truncates a sentence" do
      text = "foo bar baz qux quux corge grault garply waldo fred plugh xyzzy thud"

      expect(described_class.truncate(text, 3)).to eq("foo bar baz")
    end

    it "truncates a sentence successfully at a multibyte unicode character" do
      # The emoji is a multi-codepoint sequence; a budget of 7 ends partway through it.
      text = "foo bar 👨🏿‍👩🏿‍👧🏿‍👧🏿 baz qux quux corge grault garply waldo fred plugh xyzzy thud"
      shortened = described_class.truncate(text, 7)

      expect(shortened).to eq("foo bar 👨🏿")
    end

    it "truncates unicode characters properly when they use more than one token per char" do
      text = "我喜欢吃比萨"
      full_count = described_class.size(text)

      # Asking for one token fewer than the full text must yield a strictly smaller count.
      shortened = described_class.truncate(text, full_count - 1)

      expect(described_class.size(shortened)).to be < full_count
    end
  end

  describe "#below_limit?" do
    # NOTE(review): presumably the predicate compares the text's token count to the
    # given limit -- the exact boundary semantics are not visible from this spec.
    it "returns true when the tokens can be expanded" do
      within_budget = described_class.below_limit?("foo bar baz qux", 6)

      expect(within_budget).to be(true)
    end

    it "returns false when the tokens cannot be expanded" do
      within_budget = described_class.below_limit?("foo bar baz qux", 3)

      expect(within_budget).to be(false)
    end

    it "returns false when the tokens cannot be expanded due to multibyte unicode characters" do
      within_budget = described_class.below_limit?("foo bar 👨🏿 baz qux", 6)

      expect(within_budget).to be(false)
    end

    it "handles unicode characters properly when they use more than one token per char" do
      within_budget = described_class.below_limit?("我喜欢吃比萨萨", 10)

      expect(within_budget).to be(false)
    end
  end
end

# Specs for the GPT-4o tokenizer: only token counting is covered here.
describe DiscourseAi::Tokenizer::OpenAiGpt4oTokenizer do
  describe "#size" do
    describe "returns a token count" do
      it "for a sentence with punctuation and capitalization and numbers" do
        token_count = described_class.size("Hello, World! 123")

        expect(token_count).to eq(6)
      end
    end
  end
end

# Specs for the all-mpnet-base-v2 tokenizer: token counting and truncation.
describe DiscourseAi::Tokenizer::AllMpnetBaseV2Tokenizer do
  describe "#size" do
    describe "returns a token count" do
      it "for a sentence with punctuation and capitalization and numbers" do
        token_count = described_class.size("Hello, World! 123")

        expect(token_count).to eq(7)
      end
    end
  end

  describe "#truncate" do
    # Long enough that a budget of 3 must cut it short.
    let(:long_text) { "foo bar baz qux quux corge grault garply waldo fred plugh xyzzy thud" }

    it "truncates a sentence" do
      shortened = described_class.truncate(long_text, 3)

      expect(shortened).to eq("foo bar")
    end
  end
end

# Specs for the multilingual-e5-large tokenizer: token counting and truncation.
describe DiscourseAi::Tokenizer::MultilingualE5LargeTokenizer do
  describe "#size" do
    describe "returns a token count" do
      it "for a sentence with punctuation and capitalization and numbers" do
        token_count = described_class.size("Hello, World! 123")

        expect(token_count).to eq(7)
      end
    end
  end

  describe "#truncate" do
    # Long enough that a budget of 3 must cut it short.
    let(:long_text) { "foo bar baz qux quux corge grault garply waldo fred plugh xyzzy thud" }

    it "truncates a sentence" do
      shortened = described_class.truncate(long_text, 3)

      # For this tokenizer a budget of 3 leaves only the first word.
      expect(shortened).to eq("foo")
    end
  end
end

# Specs for the bge-large-en tokenizer: token counting and truncation.
describe DiscourseAi::Tokenizer::BgeLargeEnTokenizer do
  describe "#size" do
    describe "returns a token count" do
      it "for a sentence with punctuation and capitalization and numbers" do
        token_count = described_class.size("Hello, World! 123")

        expect(token_count).to eq(7)
      end
    end
  end

  describe "#truncate" do
    # Long enough that a budget of 3 must cut it short.
    let(:long_text) { "foo bar baz qux quux corge grault garply waldo fred plugh xyzzy thud" }

    it "truncates a sentence" do
      shortened = described_class.truncate(long_text, 3)

      expect(shortened).to eq("foo bar")
    end
  end
end

# Specs for the BGE-M3 tokenizer: token counting and truncation, including
# text containing emoji and CJK characters.
describe DiscourseAi::Tokenizer::BgeM3Tokenizer do
  describe "#size" do
    describe "returns a token count" do
      it "for a sentence with punctuation and capitalization and numbers" do
        token_count = described_class.size("Hello, World! 123")

        expect(token_count).to eq(7)
      end
    end
  end

  describe "#truncate" do
    it "truncates a sentence" do
      text = "foo bar baz qux quux corge grault garply waldo fred plugh xyzzy thud"

      # For this tokenizer a budget of 3 leaves only the first word.
      expect(described_class.truncate(text, 3)).to eq("foo")
    end

    it "truncates a sentence successfully at a multibyte unicode character" do
      # The emoji is a multi-codepoint sequence; a budget of 7 ends partway through it.
      text = "foo bar 👨🏿‍👩🏿‍👧🏿‍👧🏿 baz qux quux corge grault garply waldo fred plugh xyzzy thud"
      shortened = described_class.truncate(text, 7)

      expect(shortened).to eq("foo bar 👨🏿")
    end

    it "truncates unicode characters properly when they use more than one token per char" do
      text = "我喜欢吃比萨"
      full_count = described_class.size(text)

      # Asking for two tokens fewer than the full text must yield a strictly smaller count.
      shortened = described_class.truncate(text, full_count - 2)

      expect(described_class.size(shortened)).to be < full_count
    end
  end
end

# Specs for the Llama 3 tokenizer: token counting and truncation. The
# emoji-truncation example is kept below as a comment because it fails here.
describe DiscourseAi::Tokenizer::Llama3Tokenizer do
  describe "#size" do
    describe "returns a token count" do
      it "for a sentence with punctuation and capitalization and numbers" do
        token_count = described_class.size("Hello, World! 123")

        expect(token_count).to eq(7)
      end
    end
  end

  describe "#truncate" do
    it "truncates a sentence" do
      text = "foo bar baz qux quux corge grault garply waldo fred plugh xyzzy thud"

      expect(described_class.truncate(text, 3)).to eq("foo bar")
    end

    # Llama3 fails here
    # it "truncates a sentence successfully at a multibyte unicode character" do
    #   sentence = "foo bar 👨🏿‍👩🏿‍👧🏿‍👧🏿 baz qux quux corge grault garply waldo fred plugh xyzzy thud"
    #   expect(described_class.truncate(sentence, 8)).to eq("foo bar 👨🏿")
    # end

    it "truncates unicode characters properly when they use more than one token per char" do
      text = "我喜欢吃比萨"
      full_count = described_class.size(text)

      # Asking for two tokens fewer than the full text must yield a strictly smaller count.
      shortened = described_class.truncate(text, full_count - 2)

      expect(described_class.size(shortened)).to be < full_count
    end
  end
end
FEATURE: Add a basic tokenizer API (#37) * FEATURE: Add a basic tokenizer API * Add tests * lint 2023-04-19 10:55:59 -04:00			`# frozen_string_literal: true`

			`require "rails_helper"`

Refinements to embeddings and tokenizers (#61) * Refinements to embeddings and tokenizers * lint * Truncate with tokenizers for summary * fix 2023-05-15 14:10:42 -04:00			`describe DiscourseAi::Tokenizer::BertTokenizer do`
FEATURE: Add a basic tokenizer API (#37) * FEATURE: Add a basic tokenizer API * Add tests * lint 2023-04-19 10:55:59 -04:00			`describe "#size" do`
			`describe "returns a token count" do`
			`it "for a single word" do`
Refinements to embeddings and tokenizers (#61) * Refinements to embeddings and tokenizers * lint * Truncate with tokenizers for summary * fix 2023-05-15 14:10:42 -04:00			`expect(described_class.size("hello")).to eq(3)`
FEATURE: Add a basic tokenizer API (#37) * FEATURE: Add a basic tokenizer API * Add tests * lint 2023-04-19 10:55:59 -04:00			`end`

			`it "for a sentence" do`
Refinements to embeddings and tokenizers (#61) * Refinements to embeddings and tokenizers * lint * Truncate with tokenizers for summary * fix 2023-05-15 14:10:42 -04:00			`expect(described_class.size("hello world")).to eq(4)`
FEATURE: Add a basic tokenizer API (#37) * FEATURE: Add a basic tokenizer API * Add tests * lint 2023-04-19 10:55:59 -04:00			`end`

			`it "for a sentence with punctuation" do`
Refinements to embeddings and tokenizers (#61) * Refinements to embeddings and tokenizers * lint * Truncate with tokenizers for summary * fix 2023-05-15 14:10:42 -04:00			`expect(described_class.size("hello, world!")).to eq(6)`
FEATURE: Add a basic tokenizer API (#37) * FEATURE: Add a basic tokenizer API * Add tests * lint 2023-04-19 10:55:59 -04:00			`end`

			`it "for a sentence with punctuation and capitalization" do`
Refinements to embeddings and tokenizers (#61) * Refinements to embeddings and tokenizers * lint * Truncate with tokenizers for summary * fix 2023-05-15 14:10:42 -04:00			`expect(described_class.size("Hello, World!")).to eq(6)`
FEATURE: Add a basic tokenizer API (#37) * FEATURE: Add a basic tokenizer API * Add tests * lint 2023-04-19 10:55:59 -04:00			`end`

			`it "for a sentence with punctuation and capitalization and numbers" do`
Refinements to embeddings and tokenizers (#61) * Refinements to embeddings and tokenizers * lint * Truncate with tokenizers for summary * fix 2023-05-15 14:10:42 -04:00			`expect(described_class.size("Hello, World! 123")).to eq(7)`
FEATURE: Add a basic tokenizer API (#37) * FEATURE: Add a basic tokenizer API * Add tests * lint 2023-04-19 10:55:59 -04:00			`end`
			`end`
			`end`

			`describe "#tokenizer" do`
			`it "returns a tokenizer" do`
Refinements to embeddings and tokenizers (#61) * Refinements to embeddings and tokenizers * lint * Truncate with tokenizers for summary * fix 2023-05-15 14:10:42 -04:00			`expect(described_class.tokenizer).to be_a(Tokenizers::Tokenizer)`
FEATURE: Add a basic tokenizer API (#37) * FEATURE: Add a basic tokenizer API * Add tests * lint 2023-04-19 10:55:59 -04:00			`end`

			`it "returns the same tokenizer" do`
Refinements to embeddings and tokenizers (#61) * Refinements to embeddings and tokenizers * lint * Truncate with tokenizers for summary * fix 2023-05-15 14:10:42 -04:00			`expect(described_class.tokenizer).to eq(described_class.tokenizer)`
			`end`
			`end`

			`describe "#truncate" do`
			`it "truncates a sentence" do`
			`sentence = "foo bar baz qux quux corge grault garply waldo fred plugh xyzzy thud"`
			`expect(described_class.truncate(sentence, 3)).to eq("foo bar")`
			`end`
			`end`
			`end`

			`describe DiscourseAi::Tokenizer::AnthropicTokenizer do`
			`describe "#size" do`
			`describe "returns a token count" do`
			`it "for a sentence with punctuation and capitalization and numbers" do`
			`expect(described_class.size("Hello, World! 123")).to eq(5)`
			`end`
			`end`
			`end`

			`describe "#truncate" do`
			`it "truncates a sentence" do`
			`sentence = "foo bar baz qux quux corge grault garply waldo fred plugh xyzzy thud"`
			`expect(described_class.truncate(sentence, 3)).to eq("foo bar baz")`
			`end`
			`end`
			`end`

			`describe DiscourseAi::Tokenizer::OpenAiTokenizer do`
			`describe "#size" do`
			`describe "returns a token count" do`
			`it "for a sentence with punctuation and capitalization and numbers" do`
			`expect(described_class.size("Hello, World! 123")).to eq(6)`
			`end`
			`end`
			`end`

			`describe "#truncate" do`
			`it "truncates a sentence" do`
			`sentence = "foo bar baz qux quux corge grault garply waldo fred plugh xyzzy thud"`
			`expect(described_class.truncate(sentence, 3)).to eq("foo bar baz")`
FEATURE: Add a basic tokenizer API (#37) * FEATURE: Add a basic tokenizer API * Add tests * lint 2023-04-19 10:55:59 -04:00			`end`
FIX: OpenAI Tokenizer was failing to truncate mid emojis (#91) * FIX: OpenAI Tokenizer was failing to truncate mid emojis * Update spec/shared/tokenizer.rb Co-authored-by: Joffrey JAFFEUX <j.jaffeux@gmail.com> --------- Co-authored-by: Joffrey JAFFEUX <j.jaffeux@gmail.com> 2023-06-16 14:15:36 -04:00
			`it "truncates a sentence successfully at a multibyte unicode character" do`
			`sentence = "foo bar 👨🏿‍👩🏿‍👧🏿‍👧🏿 baz qux quux corge grault garply waldo fred plugh xyzzy thud"`
			`expect(described_class.truncate(sentence, 7)).to eq("foo bar 👨🏿")`
			`end`
FIX: Handle unicode on tokenizer (#515) * FIX: Handle unicode on tokenizer Our fast track code broke when strings had characters who are longer in tokens than in UTF-8. Admins can set `DISCOURSE_AI_STRICT_TOKEN_COUNTING: true` in app.yml to ensure token counting is strict, even if slower. Co-authored-by: wozulong <sidle.pax_0e@icloud.com> 2024-03-14 16:33:30 -04:00
			`it "truncates unicode characters properly when they use more than one token per char" do`
			`sentence = "我喜欢吃比萨"`
			`original_size = described_class.size(sentence)`
			`expect(described_class.size(described_class.truncate(sentence, original_size - 1))).to be <`
			`original_size`
			`end`
			`end`

FIX/REFACTOR: FoldContent revamp (#866) * FIX/REFACTOR: FoldContent revamp We hit a snag with our hot topic gist strategy: the regex we used to split the content didn't work, so we cannot send the original post separately. This was important for letting the model focus on what's new in the topic. The algorithm doesn’t give us full control over how prompts are written, and figuring out how to format the content isn't straightforward. This means we're having to use more complicated workarounds, like regex. To tackle this, I'm suggesting we simplify the approach a bit. Let's focus on summarizing as much as we can upfront, then gradually add new content until there's nothing left to summarize. Also, the "extend" part is mostly for models with small context windows, which shouldn't pose a problem 99% of the time with the content volume we're dealing with. * Fix fold docs * Use #shift instead of #pop to get the first elem, not the last 2024-10-25 10:51:17 -04:00			`describe "#below_limit?" do`
FIX: Handle unicode on tokenizer (#515) * FIX: Handle unicode on tokenizer Our fast track code broke when strings had characters who are longer in tokens than in UTF-8. Admins can set `DISCOURSE_AI_STRICT_TOKEN_COUNTING: true` in app.yml to ensure token counting is strict, even if slower. Co-authored-by: wozulong <sidle.pax_0e@icloud.com> 2024-03-14 16:33:30 -04:00			`it "returns true when the tokens can be expanded" do`
FIX/REFACTOR: FoldContent revamp (#866) * FIX/REFACTOR: FoldContent revamp We hit a snag with our hot topic gist strategy: the regex we used to split the content didn't work, so we cannot send the original post separately. This was important for letting the model focus on what's new in the topic. The algorithm doesn’t give us full control over how prompts are written, and figuring out how to format the content isn't straightforward. This means we're having to use more complicated workarounds, like regex. To tackle this, I'm suggesting we simplify the approach a bit. Let's focus on summarizing as much as we can upfront, then gradually add new content until there's nothing left to summarize. Also, the "extend" part is mostly for models with small context windows, which shouldn't pose a problem 99% of the time with the content volume we're dealing with. * Fix fold docs * Use #shift instead of #pop to get the first elem, not the last 2024-10-25 10:51:17 -04:00			`expect(described_class.below_limit?("foo bar baz qux", 6)).to eq(true)`
FIX: Handle unicode on tokenizer (#515) * FIX: Handle unicode on tokenizer Our fast track code broke when strings had characters who are longer in tokens than in UTF-8. Admins can set `DISCOURSE_AI_STRICT_TOKEN_COUNTING: true` in app.yml to ensure token counting is strict, even if slower. Co-authored-by: wozulong <sidle.pax_0e@icloud.com> 2024-03-14 16:33:30 -04:00			`end`

			`it "returns false when the tokens cannot be expanded" do`
FIX/REFACTOR: FoldContent revamp (#866) * FIX/REFACTOR: FoldContent revamp We hit a snag with our hot topic gist strategy: the regex we used to split the content didn't work, so we cannot send the original post separately. This was important for letting the model focus on what's new in the topic. The algorithm doesn’t give us full control over how prompts are written, and figuring out how to format the content isn't straightforward. This means we're having to use more complicated workarounds, like regex. To tackle this, I'm suggesting we simplify the approach a bit. Let's focus on summarizing as much as we can upfront, then gradually add new content until there's nothing left to summarize. Also, the "extend" part is mostly for models with small context windows, which shouldn't pose a problem 99% of the time with the content volume we're dealing with. * Fix fold docs * Use #shift instead of #pop to get the first elem, not the last 2024-10-25 10:51:17 -04:00			`expect(described_class.below_limit?("foo bar baz qux", 3)).to eq(false)`
FIX: Handle unicode on tokenizer (#515) * FIX: Handle unicode on tokenizer Our fast track code broke when strings had characters who are longer in tokens than in UTF-8. Admins can set `DISCOURSE_AI_STRICT_TOKEN_COUNTING: true` in app.yml to ensure token counting is strict, even if slower. Co-authored-by: wozulong <sidle.pax_0e@icloud.com> 2024-03-14 16:33:30 -04:00			`end`

			`it "returns false when the tokens cannot be expanded due to multibyte unicode characters" do`
FIX/REFACTOR: FoldContent revamp (#866) * FIX/REFACTOR: FoldContent revamp We hit a snag with our hot topic gist strategy: the regex we used to split the content didn't work, so we cannot send the original post separately. This was important for letting the model focus on what's new in the topic. The algorithm doesn’t give us full control over how prompts are written, and figuring out how to format the content isn't straightforward. This means we're having to use more complicated workarounds, like regex. To tackle this, I'm suggesting we simplify the approach a bit. Let's focus on summarizing as much as we can upfront, then gradually add new content until there's nothing left to summarize. Also, the "extend" part is mostly for models with small context windows, which shouldn't pose a problem 99% of the time with the content volume we're dealing with. * Fix fold docs * Use #shift instead of #pop to get the first elem, not the last 2024-10-25 10:51:17 -04:00			`expect(described_class.below_limit?("foo bar 👨🏿 baz qux", 6)).to eq(false)`
FIX: Handle unicode on tokenizer (#515) * FIX: Handle unicode on tokenizer Our fast track code broke when strings had characters who are longer in tokens than in UTF-8. Admins can set `DISCOURSE_AI_STRICT_TOKEN_COUNTING: true` in app.yml to ensure token counting is strict, even if slower. Co-authored-by: wozulong <sidle.pax_0e@icloud.com> 2024-03-14 16:33:30 -04:00			`end`

			`it "handles unicode characters properly when they use more than one token per char" do`
FIX/REFACTOR: FoldContent revamp (#866) * FIX/REFACTOR: FoldContent revamp We hit a snag with our hot topic gist strategy: the regex we used to split the content didn't work, so we cannot send the original post separately. This was important for letting the model focus on what's new in the topic. The algorithm doesn’t give us full control over how prompts are written, and figuring out how to format the content isn't straightforward. This means we're having to use more complicated workarounds, like regex. To tackle this, I'm suggesting we simplify the approach a bit. Let's focus on summarizing as much as we can upfront, then gradually add new content until there's nothing left to summarize. Also, the "extend" part is mostly for models with small context windows, which shouldn't pose a problem 99% of the time with the content volume we're dealing with. * Fix fold docs * Use #shift instead of #pop to get the first elem, not the last 2024-10-25 10:51:17 -04:00			`expect(described_class.below_limit?("我喜欢吃比萨萨", 10)).to eq(false)`
FIX: Handle unicode on tokenizer (#515) * FIX: Handle unicode on tokenizer Our fast track code broke when strings had characters who are longer in tokens than in UTF-8. Admins can set `DISCOURSE_AI_STRICT_TOKEN_COUNTING: true` in app.yml to ensure token counting is strict, even if slower. Co-authored-by: wozulong <sidle.pax_0e@icloud.com> 2024-03-14 16:33:30 -04:00			`end`
FEATURE: Add a basic tokenizer API (#37) * FEATURE: Add a basic tokenizer API * Add tests * lint 2023-04-19 10:55:59 -04:00			`end`
			`end`
DEV: Add tests to allmpnet tokenizer (#107) * DEV: Add tests to allmpnet tokenizer * lint 2023-07-14 10:37:21 -04:00
FEATURE: GPT4o Tokenizer (#721) 2024-07-22 14:26:14 -04:00			`describe DiscourseAi::Tokenizer::OpenAiGpt4oTokenizer do`
			`describe "#size" do`
			`describe "returns a token count" do`
			`it "for a sentence with punctuation and capitalization and numbers" do`
			`expect(described_class.size("Hello, World! 123")).to eq(6)`
			`end`
			`end`
			`end`
			`end`

DEV: Add tests to allmpnet tokenizer (#107) * DEV: Add tests to allmpnet tokenizer * lint 2023-07-14 10:37:21 -04:00			`describe DiscourseAi::Tokenizer::AllMpnetBaseV2Tokenizer do`
			`describe "#size" do`
			`describe "returns a token count" do`
			`it "for a sentence with punctuation and capitalization and numbers" do`
			`expect(described_class.size("Hello, World! 123")).to eq(7)`
			`end`
			`end`
			`end`

			`describe "#truncate" do`
			`it "truncates a sentence" do`
			`sentence = "foo bar baz qux quux corge grault garply waldo fred plugh xyzzy thud"`
			`expect(described_class.truncate(sentence, 3)).to eq("foo bar")`
			`end`
			`end`
			`end`
FEATURE: Llama2 for summarization (#116) 2023-07-27 12:55:32 -04:00
FEATURE: Support for locally inferred embeddings in 100 languages (#115) * FEATURE: Support for locally inferred embeddings in 100 languages * add table 2023-07-27 14:50:03 -04:00			`describe DiscourseAi::Tokenizer::MultilingualE5LargeTokenizer do`
			`describe "#size" do`
			`describe "returns a token count" do`
			`it "for a sentence with punctuation and capitalization and numbers" do`
			`expect(described_class.size("Hello, World! 123")).to eq(7)`
			`end`
			`end`
			`end`

			`describe "#truncate" do`
			`it "truncates a sentence" do`
			`sentence = "foo bar baz qux quux corge grault garply waldo fred plugh xyzzy thud"`
			`expect(described_class.truncate(sentence, 3)).to eq("foo")`
			`end`
			`end`
			`end`
FEATURE: Bge-large-en embeddings via Cloudflare Workers AI API (#241) * FEATURE: Bge-large-en embeddings via Cloudflare Workers AI API * forgot a file * lint 2023-10-04 12:47:51 -04:00
			`describe DiscourseAi::Tokenizer::BgeLargeEnTokenizer do`
			`describe "#size" do`
			`describe "returns a token count" do`
			`it "for a sentence with punctuation and capitalization and numbers" do`
			`expect(described_class.size("Hello, World! 123")).to eq(7)`
			`end`
			`end`
			`end`

			`describe "#truncate" do`
			`it "truncates a sentence" do`
			`sentence = "foo bar baz qux quux corge grault garply waldo fred plugh xyzzy thud"`
			`expect(described_class.truncate(sentence, 3)).to eq("foo bar")`
			`end`
			`end`
			`end`
FEATURE: Add BGE-M3 embeddings support (#569) BAAI/bge-m3 is an interesting model that is multilingual and has a context size of 8192. Even with a 16x larger context, it's only 4x slower to compute its embeddings in the worst-case scenario. Also includes a minor refactor of the rake task, including setting model and concurrency levels when running the backfill task. 2024-04-10 16:24:01 -04:00
			`describe DiscourseAi::Tokenizer::BgeM3Tokenizer do`
			`describe "#size" do`
			`describe "returns a token count" do`
			`it "for a sentence with punctuation and capitalization and numbers" do`
			`expect(described_class.size("Hello, World! 123")).to eq(7)`
			`end`
			`end`
			`end`

			`describe "#truncate" do`
			`it "truncates a sentence" do`
			`sentence = "foo bar baz qux quux corge grault garply waldo fred plugh xyzzy thud"`
			`expect(described_class.truncate(sentence, 3)).to eq("foo")`
			`end`

			`it "truncates a sentence successfully at a multibyte unicode character" do`
			`sentence = "foo bar 👨🏿‍👩🏿‍👧🏿‍👧🏿 baz qux quux corge grault garply waldo fred plugh xyzzy thud"`
			`expect(described_class.truncate(sentence, 7)).to eq("foo bar 👨🏿")`
			`end`

			`it "truncates unicode characters properly when they use more than one token per char" do`
			`sentence = "我喜欢吃比萨"`
			`original_size = described_class.size(sentence)`
			`expect(described_class.size(described_class.truncate(sentence, original_size - 2))).to be <`
			`original_size`
			`end`
			`end`
			`end`
FEATURE: Llama 3 tokenizer (#615) 2024-05-13 11:45:52 -04:00
			`describe DiscourseAi::Tokenizer::Llama3Tokenizer do`
			`describe "#size" do`
			`describe "returns a token count" do`
			`it "for a sentence with punctuation and capitalization and numbers" do`
			`expect(described_class.size("Hello, World! 123")).to eq(7)`
			`end`
			`end`
			`end`

			`describe "#truncate" do`
			`it "truncates a sentence" do`
			`sentence = "foo bar baz qux quux corge grault garply waldo fred plugh xyzzy thud"`
			`expect(described_class.truncate(sentence, 3)).to eq("foo bar")`
			`end`

			`# Llama3 fails here`
			`# it "truncates a sentence successfully at a multibyte unicode character" do`
			`# sentence = "foo bar 👨🏿‍👩🏿‍👧🏿‍👧🏿 baz qux quux corge grault garply waldo fred plugh xyzzy thud"`
			`# expect(described_class.truncate(sentence, 8)).to eq("foo bar 👨🏿")`
			`# end`

			`it "truncates unicode characters properly when they use more than one token per char" do`
			`sentence = "我喜欢吃比萨"`
			`original_size = described_class.size(sentence)`
			`expect(described_class.size(described_class.truncate(sentence, original_size - 2))).to be <`
			`original_size`
			`end`
			`end`
			`end`