Mirror of https://github.com/discourse/discourse-ai.git (synced 2025-04-21 14:44:55 +00:00)
FEATURE: Gemini Tokenizer (#1088)
parent 5a97752117
commit 67a1257b89
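Summary of the diff below: adds DiscourseAi::Tokenizer::GeminiTokenizer, backed by the Gemma 2 vocabulary shipped in tokenizers/gemma2.json, registers it as an available tokenizer for embeddings and LLMs, and switches the Google/Gemini presets away from the OpenAiTokenizer stand-in.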
@@ -20,7 +20,7 @@ class EmbeddingDefinition < ActiveRecord::Base
       DiscourseAi::Tokenizer::AllMpnetBaseV2Tokenizer,
       DiscourseAi::Tokenizer::BgeLargeEnTokenizer,
       DiscourseAi::Tokenizer::BgeM3Tokenizer,
-      DiscourseAi::Tokenizer::OpenAiTokenizer,
+      DiscourseAi::Tokenizer::GeminiTokenizer,
       DiscourseAi::Tokenizer::MultilingualE5LargeTokenizer,
       DiscourseAi::Tokenizer::OpenAiTokenizer,
     ].map(&:name)
@@ -61,7 +61,7 @@ class EmbeddingDefinition < ActiveRecord::Base
         pg_function: "<=>",
         url:
           "https://generativelanguage.googleapis.com/v1beta/models/embedding-001:embedContent",
-        tokenizer_class: "DiscourseAi::Tokenizer::OpenAiTokenizer",
+        tokenizer_class: "DiscourseAi::Tokenizer::GeminiTokenizer",
         provider: GOOGLE,
       },
       {
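Because tokenizer_class is persisted as a string, the embedding pipeline has to resolve it back to a class before it can count tokens. A minimal sketch of that lookup, assuming ActiveSupport's String#constantize is available (the tokens_for helper is illustrative, not part of this commit):

# Illustrative only: resolve the stored class name, then count tokens for a chunk.
def tokens_for(embedding_definition, text)
  klass = embedding_definition.tokenizer_class.constantize
  klass.size(text) # e.g. DiscourseAi::Tokenizer::GeminiTokenizer.size(text)
end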
@@ -56,7 +56,7 @@ module DiscourseAi
             display_name: "Gemini 1.5 Flash",
           },
         ],
-        tokenizer: DiscourseAi::Tokenizer::OpenAiTokenizer,
+        tokenizer: DiscourseAi::Tokenizer::GeminiTokenizer,
         provider: "google",
       },
       {
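Until now the Gemini presets counted prompt tokens with the OpenAI tokenizer as a stand-in; with the Gemma 2 vocabulary the counts should track Google's models much more closely. A hedged sketch of the kind of budget check this enables (prompt and max_prompt_tokens are illustrative names, not from this commit):

# Illustrative: trim a prompt to a model's context budget with the new tokenizer.
tokenizer = DiscourseAi::Tokenizer::GeminiTokenizer
max_prompt_tokens = 32_768
prompt = "..." # some long prompt assembled elsewhere
prompt = tokenizer.truncate(prompt, max_prompt_tokens) if tokenizer.size(prompt) > max_prompt_tokens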
@@ -7,6 +7,7 @@ module DiscourseAi
       def available_llm_tokenizers
         [
           DiscourseAi::Tokenizer::AnthropicTokenizer,
+          DiscourseAi::Tokenizer::GeminiTokenizer,
           DiscourseAi::Tokenizer::Llama3Tokenizer,
           DiscourseAi::Tokenizer::MixtralTokenizer,
           DiscourseAi::Tokenizer::OpenAiTokenizer,
lib/tokenizer/gemini_tokenizer.rb (new file, 11 lines)
@@ -0,0 +1,11 @@
+# frozen_string_literal: true
+
+module DiscourseAi
+  module Tokenizer
+    class GeminiTokenizer < BasicTokenizer
+      def self.tokenizer
+        @@tokenizer ||= Tokenizers.from_file("./plugins/discourse-ai/tokenizers/gemma2.json")
+      end
+    end
+  end
+end
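GeminiTokenizer only supplies the underlying Tokenizers instance; size and truncate are inherited from BasicTokenizer. A minimal sketch of that contract, assuming the tokenizers gem API (encode/decode, Encoding#ids); the plugin's actual base class may differ in details:

# Sketch of the assumed BasicTokenizer contract, not the plugin's verbatim source.
module DiscourseAi
  module Tokenizer
    class BasicTokenizer
      def self.tokenizer
        raise NotImplementedError # subclasses memoize a Tokenizers instance
      end

      # Number of tokens the underlying vocabulary produces for the text.
      def self.size(text)
        tokenizer.encode(text).ids.size
      end

      # Keep the first max_length token ids, then decode back to a string.
      def self.truncate(text, max_length)
        tokenizer.decode(tokenizer.encode(text).ids.take(max_length))
      end
    end
  end
end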
@@ -228,3 +228,32 @@ describe DiscourseAi::Tokenizer::Llama3Tokenizer do
     end
   end
 end
+
+describe DiscourseAi::Tokenizer::GeminiTokenizer do
+  describe "#size" do
+    describe "returns a token count" do
+      it "for a sentence with punctuation and capitalization and numbers" do
+        expect(described_class.size("Hello, World! 123")).to eq(9)
+      end
+    end
+  end
+
+  describe "#truncate" do
+    it "truncates a sentence" do
+      sentence = "foo bar baz qux quux corge grault garply waldo fred plugh xyzzy thud"
+      expect(described_class.truncate(sentence, 3)).to eq("foo bar")
+    end
+
+    it "truncates a sentence successfully at a multibyte unicode character" do
+      sentence = "foo bar 👨🏿👩🏿👧🏿👧🏿 baz qux quux corge grault garply waldo fred plugh xyzzy thud"
+      expect(described_class.truncate(sentence, 8)).to eq("foo bar 👨🏿👩")
+    end
+
+    it "truncates unicode characters properly when they use more than one token per char" do
+      sentence = "我喜欢吃比萨"
+      original_size = described_class.size(sentence)
+      expect(described_class.size(described_class.truncate(sentence, original_size - 2))).to be <
+        original_size
+    end
+  end
+end
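The expectations above double as a usage reference. Note the multibyte case: cutting at 8 tokens lands inside the ZWJ family-emoji sequence, so token-level truncation makes no guarantee of ending on a grapheme-cluster boundary. A quick console session, assuming the spec's counts hold for the Gemma 2 vocabulary:

tok = DiscourseAi::Tokenizer::GeminiTokenizer
tok.size("Hello, World! 123") # => 9
tok.truncate("foo bar baz qux quux corge grault garply waldo fred plugh xyzzy thud", 3)
# => "foo bar" (the limit is in tokens, not characters)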
@@ -33,3 +33,7 @@ Licensed under MIT License
 ## Meta-Llama-3-70B-Instruct
 
 Licensed under META LLAMA 3 COMMUNITY LICENSE
+
+## Gemma 2
+
+Licensed under the [Gemma Terms of Use](https://ai.google.dev/gemma/terms)
tokenizers/gemma2.json (new file, 838953 lines; file diff suppressed because it is too large)