mirror of
https://github.com/discourse/discourse-ai.git
synced 2025-02-16 16:34:45 +00:00
* FEATURE: allow tuning of RAG generation
  - change chunking to be token based vs char based (which is more accurate)
  - allow control over overlap / tokens per chunk and conversation snippets inserted
  - UI to control new settings
* improve ui a bit
* fix various reindex issues
* reduce concurrency
* try ultra low queue ... concurrency 1 is too slow.
47 lines
1.3 KiB
Ruby
47 lines
1.3 KiB
Ruby
# frozen_string_literal: true
|
|
|
|
module DiscourseAi
  module Tokenizer
    # Tokenizer backed by Tiktoken's "cl100k_base" encoding.
    #
    # All functionality is exposed as class-level methods; the underlying
    # encoder is created lazily on first use and then reused.
    class OpenAiTokenizer < BasicTokenizer
      class << self
        # Returns the memoized Tiktoken encoder, building it on first call.
        #
        # The encoder is cached in a class-level instance variable rather than
        # a class variable (@@): class variables are shared across the whole
        # inheritance tree, so a subclass memoizing a different encoding under
        # the same name would clobber (or be clobbered by) this cache.
        def tokenizer
          @tokenizer ||= Tiktoken.get_encoding("cl100k_base")
        end

        # Encodes +text+ with the encoder and returns the result
        # (presumably an array of integer token ids - confirm against Tiktoken).
        def tokenize(text)
          tokenizer.encode(text)
        end

        # Same operation as #tokenize: encodes +text+ with the encoder.
        def encode(text)
          tokenizer.encode(text)
        end

        # Decodes +token_ids+ back into text using the encoder.
        def decode(token_ids)
          tokenizer.decode(token_ids)
        end

        # Shortens +text+ so that it is built from at most +max_length+ tokens.
        #
        # @param text [String] the text to shorten
        # @param max_length [Integer] maximum number of tokens to keep
        # @return [String] +text+ itself on the fast path, otherwise the
        #   decoded prefix of its tokens
        # @raise [ArgumentError] if +max_length+ is negative and the fast path
        #   is not taken (raised by Array#take)
        def truncate(text, max_length)
          # Fast track for the common case: when strict counting is off, a text
          # whose character count is under half the token budget is returned
          # untouched. The /2 leaves headroom for unicode chars that can take
          # more than 1 token per char.
          return text if !SiteSetting.ai_strict_token_counting && text.size < max_length / 2

          # Tokenize once; retries below only shrink the slice that is decoded.
          token_ids = tokenize(text)

          # Never start above the real token count, so retries are not spent on
          # limits that select the exact same tokens. A negative max_length is
          # kept as-is and makes Array#take raise ArgumentError.
          limit = [max_length, token_ids.length].min

          begin
            tokenizer.decode(token_ids.take(limit))
          rescue Tiktoken::UnicodeError
            # The selected tokens do not decode to valid text (presumably the
            # cut fell inside a multi-token character). Drop one more trailing
            # token and try again; once nothing is left to drop, give up and
            # surface the decoding error.
            raise if limit <= 0

            limit -= 1
            retry
          end
        end

        # Tells whether +text+ followed by +addition+ stays strictly below
        # +max_length+ tokens. Both strings are encoded separately and their
        # token counts are summed.
        #
        # @param text [String] the existing text
        # @param addition [String] the text that would be appended
        # @param max_length [Integer] exclusive upper bound on the token count
        # @return [Boolean]
        def can_expand_tokens?(text, addition, max_length)
          # Fast track for the common case: when strict counting is off and the
          # combined character count is under half the token budget, skip
          # encoding. The /2 leaves headroom for unicode chars that can take
          # more than 1 token per char.
          if !SiteSetting.ai_strict_token_counting && text.size + addition.size < max_length / 2
            return true
          end

          tokenizer.encode(text).length + tokenizer.encode(addition).length < max_length
        end
      end
    end
  end
end
|