# discourse-ai/spec/shared/tokenizer_spec.rb

# frozen_string_literal: true
require "rails_helper"
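
# Token counting and truncation behaviour for the tokenizer wrappers bundled
# with the plugin. Expected token counts are tokenizer-specific; truncation is
# expected to cut on token boundaries and keep multibyte input valid.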
describe DiscourseAi::Tokenizer::BertTokenizer do
  describe "#size" do
    describe "returns a token count" do
      it "for a single word" do
        expect(described_class.size("hello")).to eq(3)
      end

      it "for a sentence" do
        expect(described_class.size("hello world")).to eq(4)
      end

      it "for a sentence with punctuation" do
        expect(described_class.size("hello, world!")).to eq(6)
      end

      it "for a sentence with punctuation and capitalization" do
        expect(described_class.size("Hello, World!")).to eq(6)
      end

      it "for a sentence with punctuation and capitalization and numbers" do
        expect(described_class.size("Hello, World! 123")).to eq(7)
      end
    end
  end

  describe "#tokenizer" do
    it "returns a tokenizer" do
      expect(described_class.tokenizer).to be_a(Tokenizers::Tokenizer)
    end

    it "returns the same tokenizer" do
      expect(described_class.tokenizer).to eq(described_class.tokenizer)
    end
  end

  describe "#truncate" do
    it "truncates a sentence" do
      sentence = "foo bar baz qux quux corge grault garply waldo fred plugh xyzzy thud"
      expect(described_class.truncate(sentence, 3)).to eq("foo bar")
    end
  end
end

describe DiscourseAi::Tokenizer::AnthropicTokenizer do
  describe "#size" do
    describe "returns a token count" do
      it "for a sentence with punctuation and capitalization and numbers" do
        expect(described_class.size("Hello, World! 123")).to eq(5)
      end
    end
  end

  describe "#truncate" do
    it "truncates a sentence" do
      sentence = "foo bar baz qux quux corge grault garply waldo fred plugh xyzzy thud"
      expect(described_class.truncate(sentence, 3)).to eq("foo bar baz")
    end
  end
end

describe DiscourseAi::Tokenizer::OpenAiTokenizer do
  describe "#size" do
    describe "returns a token count" do
      it "for a sentence with punctuation and capitalization and numbers" do
        expect(described_class.size("Hello, World! 123")).to eq(6)
      end
    end
  end

  describe "#truncate" do
    it "truncates a sentence" do
      sentence = "foo bar baz qux quux corge grault garply waldo fred plugh xyzzy thud"
      expect(described_class.truncate(sentence, 3)).to eq("foo bar baz")
    end

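    # Truncation must not produce invalid UTF-8 when the cut lands inside a
    # multibyte character or an emoji ZWJ sequence.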
it "truncates a sentence successfully at a multibyte unicode character" do
sentence = "foo bar 👨🏿‍👩🏿‍👧🏿‍👧🏿 baz qux quux corge grault garply waldo fred plugh xyzzy thud"
expect(described_class.truncate(sentence, 7)).to eq("foo bar 👨🏿")
end
it "truncates unicode characters properly when they use more than one token per char" do
sentence = "我喜欢吃比萨"
original_size = described_class.size(sentence)
expect(described_class.size(described_class.truncate(sentence, original_size - 1))).to be <
original_size
end
end
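  # The examples below only allow expansion when the combined text still fits
  # within the given token limit.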
describe "#can_expand_tokens?" do
it "returns true when the tokens can be expanded" do
expect(described_class.can_expand_tokens?("foo bar", "baz qux", 6)).to eq(true)
end
it "returns false when the tokens cannot be expanded" do
expect(described_class.can_expand_tokens?("foo bar", "baz qux", 3)).to eq(false)
end
it "returns false when the tokens cannot be expanded due to multibyte unicode characters" do
expect(described_class.can_expand_tokens?("foo bar 👨🏿", "baz qux", 6)).to eq(false)
end
it "handles unicode characters properly when they use more than one token per char" do
expect(described_class.can_expand_tokens?("我喜欢吃比萨", "", 10)).to eq(false)
end
end
end
describe DiscourseAi::Tokenizer::AllMpnetBaseV2Tokenizer do
  describe "#size" do
    describe "returns a token count" do
      it "for a sentence with punctuation and capitalization and numbers" do
        expect(described_class.size("Hello, World! 123")).to eq(7)
      end
    end
  end

  describe "#truncate" do
    it "truncates a sentence" do
      sentence = "foo bar baz qux quux corge grault garply waldo fred plugh xyzzy thud"
      expect(described_class.truncate(sentence, 3)).to eq("foo bar")
    end
  end
end

describe DiscourseAi::Tokenizer::Llama2Tokenizer do
  describe "#size" do
    describe "returns a token count" do
      it "for a sentence with punctuation and capitalization and numbers" do
        expect(described_class.size("Hello, World! 123")).to eq(9)
      end
    end
  end

  describe "#truncate" do
    it "truncates a sentence" do
      sentence = "foo bar baz qux quux corge grault garply waldo fred plugh xyzzy thud"
      expect(described_class.truncate(sentence, 3)).to eq("foo bar")
    end
  end
end

describe DiscourseAi::Tokenizer::MultilingualE5LargeTokenizer do
  describe "#size" do
    describe "returns a token count" do
      it "for a sentence with punctuation and capitalization and numbers" do
        expect(described_class.size("Hello, World! 123")).to eq(7)
      end
    end
  end

  describe "#truncate" do
    it "truncates a sentence" do
      sentence = "foo bar baz qux quux corge grault garply waldo fred plugh xyzzy thud"
      expect(described_class.truncate(sentence, 3)).to eq("foo")
    end
  end
end

describe DiscourseAi::Tokenizer::BgeLargeEnTokenizer do
  describe "#size" do
    describe "returns a token count" do
      it "for a sentence with punctuation and capitalization and numbers" do
        expect(described_class.size("Hello, World! 123")).to eq(7)
      end
    end
  end

  describe "#truncate" do
    it "truncates a sentence" do
      sentence = "foo bar baz qux quux corge grault garply waldo fred plugh xyzzy thud"
      expect(described_class.truncate(sentence, 3)).to eq("foo bar")
    end
  end
end

describe DiscourseAi::Tokenizer::BgeM3Tokenizer do
  describe "#size" do
    describe "returns a token count" do
      it "for a sentence with punctuation and capitalization and numbers" do
        expect(described_class.size("Hello, World! 123")).to eq(7)
      end
    end
  end

  describe "#truncate" do
    it "truncates a sentence" do
      sentence = "foo bar baz qux quux corge grault garply waldo fred plugh xyzzy thud"
      expect(described_class.truncate(sentence, 3)).to eq("foo")
    end

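    # As with the OpenAI tokenizer above, truncation has to stay on valid
    # character boundaries for multibyte input.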
it "truncates a sentence successfully at a multibyte unicode character" do
sentence = "foo bar 👨🏿‍👩🏿‍👧🏿‍👧🏿 baz qux quux corge grault garply waldo fred plugh xyzzy thud"
expect(described_class.truncate(sentence, 7)).to eq("foo bar 👨🏿")
end
it "truncates unicode characters properly when they use more than one token per char" do
sentence = "我喜欢吃比萨"
original_size = described_class.size(sentence)
expect(described_class.size(described_class.truncate(sentence, original_size - 2))).to be <
original_size
end
end
end