mirror of
https://github.com/discourse/discourse-ai.git
synced 2025-07-10 08:03:28 +00:00
FEATURE: Add Qwen3 tokenizer and update Gemma to version 3 (#1440)
This commit is contained in:
parent
df925f8304
commit
9dccc1eb93
@ -24,6 +24,7 @@ class EmbeddingDefinition < ActiveRecord::Base
|
||||
DiscourseAi::Tokenizer::MultilingualE5LargeTokenizer,
|
||||
DiscourseAi::Tokenizer::OpenAiTokenizer,
|
||||
DiscourseAi::Tokenizer::MixtralTokenizer,
|
||||
DiscourseAi::Tokenizer::QwenTokenizer,
|
||||
].map(&:name)
|
||||
end
|
||||
|
||||
|
@ -11,6 +11,7 @@ module DiscourseAi
|
||||
DiscourseAi::Tokenizer::Llama3Tokenizer,
|
||||
DiscourseAi::Tokenizer::MixtralTokenizer,
|
||||
DiscourseAi::Tokenizer::OpenAiTokenizer,
|
||||
DiscourseAi::Tokenizer::QwenTokenizer,
|
||||
]
|
||||
end
|
||||
|
||||
|
@ -4,7 +4,7 @@ module DiscourseAi
|
||||
module Tokenizer
|
||||
class GeminiTokenizer < BasicTokenizer
|
||||
def self.tokenizer
|
||||
@@tokenizer ||= Tokenizers.from_file("./plugins/discourse-ai/tokenizers/gemma2.json")
|
||||
@@tokenizer ||= Tokenizers.from_file("./plugins/discourse-ai/tokenizers/gemma3.json")
|
||||
end
|
||||
end
|
||||
end
|
||||
|
11
lib/tokenizer/qwen_tokenizer.rb
Normal file
11
lib/tokenizer/qwen_tokenizer.rb
Normal file
@ -0,0 +1,11 @@
|
||||
# frozen_string_literal: true
|
||||
|
||||
module DiscourseAi
|
||||
module Tokenizer
|
||||
class QwenTokenizer < BasicTokenizer
|
||||
def self.tokenizer
|
||||
@@tokenizer ||= Tokenizers.from_file("./plugins/discourse-ai/tokenizers/qwen3.json")
|
||||
end
|
||||
end
|
||||
end
|
||||
end
|
@ -257,3 +257,32 @@ describe DiscourseAi::Tokenizer::GeminiTokenizer do
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
describe DiscourseAi::Tokenizer::QwenTokenizer do
|
||||
describe "#size" do
|
||||
describe "returns a token count" do
|
||||
it "for a sentence with punctuation and capitalization and numbers" do
|
||||
expect(described_class.size("Hello, World! 123")).to eq(8)
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
describe "#truncate" do
|
||||
it "truncates a sentence" do
|
||||
sentence = "foo bar baz qux quux corge grault garply waldo fred plugh xyzzy thud"
|
||||
expect(described_class.truncate(sentence, 3)).to eq("foo bar baz")
|
||||
end
|
||||
|
||||
it "truncates a sentence successfully at a multibyte unicode character" do
|
||||
sentence = "foo bar 👨🏿👩🏿👧🏿👧🏿 baz qux quux corge grault garply waldo fred plugh xyzzy thud"
|
||||
expect(described_class.truncate(sentence, 8)).to eq("foo bar 👨🏿👩")
|
||||
end
|
||||
|
||||
it "truncates unicode characters properly when they use more than one token per char" do
|
||||
sentence = "我喜欢吃比萨"
|
||||
original_size = described_class.size(sentence)
|
||||
expect(described_class.size(described_class.truncate(sentence, original_size - 2))).to be <
|
||||
original_size
|
||||
end
|
||||
end
|
||||
end
|
||||
|
@ -34,6 +34,10 @@ Licensed under MIT License
|
||||
|
||||
Licensed under META LLAMA 3 COMMUNITY LICENSE
|
||||
|
||||
## Gemma 2
|
||||
## Gemma 3
|
||||
|
||||
Licensed under the [Gemma Terms of Use](https://ai.google.dev/gemma/terms)
|
||||
|
||||
## Qwen 3
|
||||
|
||||
Licensed under the Apache 2.0 License
|
||||
|
838953
tokenizers/gemma2.json
838953
tokenizers/gemma2.json
File diff suppressed because it is too large
Load Diff
2379611
tokenizers/gemma3.json
Normal file
2379611
tokenizers/gemma3.json
Normal file
File diff suppressed because it is too large
Load Diff
757480
tokenizers/qwen3.json
Normal file
757480
tokenizers/qwen3.json
Normal file
File diff suppressed because it is too large
Load Diff
Loading…
x
Reference in New Issue
Block a user