To ease the administrative burden of enabling the embeddings model, this change introduces an automatic backfill when the setting is enabled. It also moves topic-visit embedding creation to a lower-priority Sidekiq queue and adds an option to skip computing and persisting an embedding when the content digest matches the stored one.
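A minimal sketch of the digest short-circuit described above, assuming a simple persistence layer; `EmbeddingRecord` and `compute_embedding` are hypothetical names for illustration, not the plugin's actual API:

require "digest"

# Hypothetical sketch: skip the model call when the content is unchanged.
def persist_embedding_for(target, text)
  digest = Digest::SHA1.hexdigest(text)

  record = EmbeddingRecord.find_by(target: target) # hypothetical model
  return if record&.digest == digest # digest match: nothing to recompute

  embedding = compute_embedding(text) # hypothetical call to the embeddings model
  EmbeddingRecord.upsert_for(target, digest: digest, embedding: embedding)
end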
# frozen_string_literal: true

module DiscourseAi
  module Embeddings
    module Strategies
      class Truncation
        def id
          1
        end

        def version
          1
        end

        def prepare_text_from(target, tokenizer, max_length)
          case target
          when Topic
            topic_truncation(target, tokenizer, max_length)
          when Post
            post_truncation(target, tokenizer, max_length)
          else
            raise ArgumentError, "Invalid target type"
          end
        end

        private

        # Builds a header for the embedding text: title, category name,
        # and (when tagging is enabled) the topic's tags.
        def topic_information(topic)
          info = +""

          info << topic.title
          info << "\n\n"
          info << topic.category.name if topic.category&.name
          if SiteSetting.tagging_enabled
            info << "\n\n"
            info << topic.tags.pluck(:name).join(", ")
          end
          info << "\n\n"
        end

        # Concatenates post bodies until the token budget is reached,
        # then truncates to exactly max_length tokens.
        def topic_truncation(topic, tokenizer, max_length)
          text = +topic_information(topic)

          topic.posts.find_each do |post|
            text << post.raw
            break if tokenizer.size(text) >= max_length # maybe keep a partial counter to speed this up?
            text << "\n\n"
          end

          tokenizer.truncate(text, max_length)
        end

        def post_truncation(post, tokenizer, max_length)
          text = +topic_information(post.topic)
          text << post.raw

          tokenizer.truncate(text, max_length)
        end
      end
    end
  end
end
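For context, a hedged usage sketch of the strategy: `topic` and `tokenizer` stand in for whatever the caller supplies, and the 512-token budget is an arbitrary example, not a plugin default.

strategy = DiscourseAi::Embeddings::Strategies::Truncation.new
# Produce truncated text ready to be sent to the embeddings model.
text = strategy.prepare_text_from(topic, tokenizer, 512)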