mirror of
				https://github.com/discourse/discourse-ai.git
				synced 2025-10-31 14:38:37 +00:00 
			
		
		
		
	
		
			
				
	
	
		
			154 lines
		
	
	
		
			4.6 KiB
		
	
	
	
		
			Ruby
		
	
	
	
	
	
			
		
		
	
	
			154 lines
		
	
	
		
			4.6 KiB
		
	
	
	
		
			Ruby
		
	
	
	
	
	
| # frozen_string_literal: true
 | |
| 
 | |
| require "rails_helper"
 | |
| 
 | |
| describe DiscourseAi::Tokenizer::BertTokenizer do
 | |
|   describe "#size" do
 | |
|     describe "returns a token count" do
 | |
|       it "for a single word" do
 | |
|         expect(described_class.size("hello")).to eq(3)
 | |
|       end
 | |
| 
 | |
|       it "for a sentence" do
 | |
|         expect(described_class.size("hello world")).to eq(4)
 | |
|       end
 | |
| 
 | |
|       it "for a sentence with punctuation" do
 | |
|         expect(described_class.size("hello, world!")).to eq(6)
 | |
|       end
 | |
| 
 | |
|       it "for a sentence with punctuation and capitalization" do
 | |
|         expect(described_class.size("Hello, World!")).to eq(6)
 | |
|       end
 | |
| 
 | |
|       it "for a sentence with punctuation and capitalization and numbers" do
 | |
|         expect(described_class.size("Hello, World! 123")).to eq(7)
 | |
|       end
 | |
|     end
 | |
|   end
 | |
| 
 | |
|   describe "#tokenizer" do
 | |
|     it "returns a tokenizer" do
 | |
|       expect(described_class.tokenizer).to be_a(Tokenizers::Tokenizer)
 | |
|     end
 | |
| 
 | |
|     it "returns the same tokenizer" do
 | |
|       expect(described_class.tokenizer).to eq(described_class.tokenizer)
 | |
|     end
 | |
|   end
 | |
| 
 | |
|   describe "#truncate" do
 | |
|     it "truncates a sentence" do
 | |
|       sentence = "foo bar baz qux quux corge grault garply waldo fred plugh xyzzy thud"
 | |
|       expect(described_class.truncate(sentence, 3)).to eq("foo bar")
 | |
|     end
 | |
|   end
 | |
| end
 | |
| 
 | |
| describe DiscourseAi::Tokenizer::AnthropicTokenizer do
 | |
|   describe "#size" do
 | |
|     describe "returns a token count" do
 | |
|       it "for a sentence with punctuation and capitalization and numbers" do
 | |
|         expect(described_class.size("Hello, World! 123")).to eq(5)
 | |
|       end
 | |
|     end
 | |
|   end
 | |
| 
 | |
|   describe "#truncate" do
 | |
|     it "truncates a sentence" do
 | |
|       sentence = "foo bar baz qux quux corge grault garply waldo fred plugh xyzzy thud"
 | |
|       expect(described_class.truncate(sentence, 3)).to eq("foo bar baz")
 | |
|     end
 | |
|   end
 | |
| end
 | |
| 
 | |
| describe DiscourseAi::Tokenizer::OpenAiTokenizer do
 | |
|   describe "#size" do
 | |
|     describe "returns a token count" do
 | |
|       it "for a sentence with punctuation and capitalization and numbers" do
 | |
|         expect(described_class.size("Hello, World! 123")).to eq(6)
 | |
|       end
 | |
|     end
 | |
|   end
 | |
| 
 | |
|   describe "#truncate" do
 | |
|     it "truncates a sentence" do
 | |
|       sentence = "foo bar baz qux quux corge grault garply waldo fred plugh xyzzy thud"
 | |
|       expect(described_class.truncate(sentence, 3)).to eq("foo bar baz")
 | |
|     end
 | |
| 
 | |
|     it "truncates a sentence successfully at a multibyte unicode character" do
 | |
|       sentence = "foo bar 👨🏿👩🏿👧🏿👧🏿 baz qux quux corge grault garply waldo fred plugh xyzzy thud"
 | |
|       expect(described_class.truncate(sentence, 7)).to eq("foo bar 👨🏿")
 | |
|     end
 | |
|   end
 | |
| end
 | |
| 
 | |
| describe DiscourseAi::Tokenizer::AllMpnetBaseV2Tokenizer do
 | |
|   describe "#size" do
 | |
|     describe "returns a token count" do
 | |
|       it "for a sentence with punctuation and capitalization and numbers" do
 | |
|         expect(described_class.size("Hello, World! 123")).to eq(7)
 | |
|       end
 | |
|     end
 | |
|   end
 | |
| 
 | |
|   describe "#truncate" do
 | |
|     it "truncates a sentence" do
 | |
|       sentence = "foo bar baz qux quux corge grault garply waldo fred plugh xyzzy thud"
 | |
|       expect(described_class.truncate(sentence, 3)).to eq("foo bar")
 | |
|     end
 | |
|   end
 | |
| end
 | |
| 
 | |
| describe DiscourseAi::Tokenizer::Llama2Tokenizer do
 | |
|   describe "#size" do
 | |
|     describe "returns a token count" do
 | |
|       it "for a sentence with punctuation and capitalization and numbers" do
 | |
|         expect(described_class.size("Hello, World! 123")).to eq(9)
 | |
|       end
 | |
|     end
 | |
|   end
 | |
| 
 | |
|   describe "#truncate" do
 | |
|     it "truncates a sentence" do
 | |
|       sentence = "foo bar baz qux quux corge grault garply waldo fred plugh xyzzy thud"
 | |
|       expect(described_class.truncate(sentence, 3)).to eq("foo bar")
 | |
|     end
 | |
|   end
 | |
| end
 | |
| 
 | |
| describe DiscourseAi::Tokenizer::MultilingualE5LargeTokenizer do
 | |
|   describe "#size" do
 | |
|     describe "returns a token count" do
 | |
|       it "for a sentence with punctuation and capitalization and numbers" do
 | |
|         expect(described_class.size("Hello, World! 123")).to eq(7)
 | |
|       end
 | |
|     end
 | |
|   end
 | |
| 
 | |
|   describe "#truncate" do
 | |
|     it "truncates a sentence" do
 | |
|       sentence = "foo bar baz qux quux corge grault garply waldo fred plugh xyzzy thud"
 | |
|       expect(described_class.truncate(sentence, 3)).to eq("foo")
 | |
|     end
 | |
|   end
 | |
| end
 | |
| 
 | |
| describe DiscourseAi::Tokenizer::BgeLargeEnTokenizer do
 | |
|   describe "#size" do
 | |
|     describe "returns a token count" do
 | |
|       it "for a sentence with punctuation and capitalization and numbers" do
 | |
|         expect(described_class.size("Hello, World! 123")).to eq(7)
 | |
|       end
 | |
|     end
 | |
|   end
 | |
| 
 | |
|   describe "#truncate" do
 | |
|     it "truncates a sentence" do
 | |
|       sentence = "foo bar baz qux quux corge grault garply waldo fred plugh xyzzy thud"
 | |
|       expect(described_class.truncate(sentence, 3)).to eq("foo bar")
 | |
|     end
 | |
|   end
 | |
| end
 |