2023-07-13 12:41:36 -03:00
|
|
|
# frozen_string_literal: true
|
|
|
|
|
|
|
|
module DiscourseAi
|
|
|
|
module Embeddings
|
|
|
|
module Strategies
|
|
|
|
class Truncation
|
|
|
|
# Numeric identifier for this strategy, persisted with embeddings so the
# strategy that produced them can be recognized later.
def id = 1
|
|
|
|
|
|
|
|
# Version of this strategy's behavior; bump when truncation output changes
# so stale embeddings can be detected and regenerated.
def version = 1
|
|
|
|
|
2024-12-16 09:55:39 -03:00
|
|
|
# Builds the text to embed for +target+ (a Topic, Post, or
# RagDocumentFragment), truncated to the model's context window.
# Two tokens are reserved out of the max sequence length, presumably for
# special tokens added by the tokenizer.
#
# When the vector definition declares an embed prompt, it is prepended
# to the truncated text separated by a single space.
#
# Raises ArgumentError for any other target type.
def prepare_target_text(target, vdef)
  token_budget = vdef.max_sequence_length - 2
  tokenizer = vdef.tokenizer

  truncated =
    if target.is_a?(Topic)
      topic_truncation(target, tokenizer, token_budget)
    elsif target.is_a?(Post)
      post_truncation(target, tokenizer, token_budget)
    elsif target.is_a?(RagDocumentFragment)
      tokenizer.truncate(target.fragment, token_budget)
    else
      raise ArgumentError, "Invalid target type"
    end

  if vdef.embed_prompt.blank?
    truncated
  else
    "#{vdef.embed_prompt} #{truncated}"
  end
end
|
|
|
|
|
2024-12-16 09:55:39 -03:00
|
|
|
# Prepares a search query for embedding, truncated to fit the model's
# context window (two tokens reserved out of the max sequence length).
#
# When +asymetric+ is true the vector definition's search prompt is
# prepended, for models that distinguish queries from documents.
# NOTE(review): "asymetric" is misspelled, but it is part of the public
# keyword interface — renaming it would break callers.
def prepare_query_text(text, vdef, asymetric: false)
  query = text
  query = "#{vdef.search_prompt} #{text}" if asymetric

  vdef.tokenizer.truncate(query, vdef.max_sequence_length - 2)
end
|
|
|
|
|
2023-09-05 11:08:23 -03:00
|
|
|
private
|
|
|
|
|
|
|
|
# Header metadata for a topic — title, category name, and (when tagging
# is enabled) the comma-joined tag names — each section followed by a
# blank line. Returns a mutable empty string when nothing is present;
# +topic+ may be nil.
def topic_information(topic)
  sections = []

  sections << topic.title if topic&.title.present?
  sections << topic.category.name if topic&.category&.name.present?
  if SiteSetting.tagging_enabled && topic&.tags.present?
    sections << topic.tags.pluck(:name).join(", ")
  end

  return +"" if sections.empty?

  sections.join("\n\n") + "\n\n"
end
|
|
|
|
|
|
|
|
# Flattens a topic into plain text for embedding: topic metadata, then any
# cached embed content, then post bodies (HTML stripped via Nokogiri),
# stopping once the token budget is reached, and finally truncating the
# accumulated text to +max_length+ tokens.
def topic_truncation(topic, tokenizer, max_length)
  buffer = +topic_information(topic)

  embed_cache = topic&.topic_embed&.embed_content_cache
  buffer << Nokogiri::HTML5.fragment(embed_cache).text << "\n\n" if embed_cache.present?

  # NOTE(review): find_each batches by primary key and ignores any declared
  # ordering on the association — confirm this traversal order is intended.
  topic.posts.find_each do |post|
    buffer << Nokogiri::HTML5.fragment(post.cooked).text
    # Stop as soon as the budget is hit; re-tokenizing the whole buffer each
    # iteration is quadratic — a running token count could speed this up.
    break if tokenizer.size(buffer) >= max_length
    buffer << "\n\n"
  end

  tokenizer.truncate(buffer, max_length)
end
|
|
|
|
|
2023-12-29 12:28:45 -03:00
|
|
|
# Flattens a single post into plain text for embedding: its topic's
# metadata followed by the post body (HTML stripped via Nokogiri), then
# truncated to +max_length+ tokens. For the first post, a cached embed
# content is preferred over the cooked body when present.
def post_truncation(post, tokenizer, max_length)
  buffer = +topic_information(post.topic)

  # Only consult the embed cache for the first post, mirroring the
  # original short-circuit so the association isn't touched otherwise.
  embed_cache = post.is_first_post? ? post.topic&.topic_embed&.embed_content_cache : nil
  source_html = embed_cache.present? ? embed_cache : post.cooked
  buffer << Nokogiri::HTML5.fragment(source_html).text

  tokenizer.truncate(buffer, max_length)
end
|
|
|
|
end
|
|
|
|
end
|
|
|
|
end
|
|
|
|
end
|