From 23b28096382839e5c57324852f977a593dc6b660 Mon Sep 17 00:00:00 2001
From: Rafael dos Santos Silva
Date: Fri, 5 Jan 2024 10:27:45 -0300
Subject: [PATCH] FEATURE: Generate proper embeddings for posts/topics with embedded content (#401)

---
Reviewer notes (this section sits after the "---" separator and is not part of
the commit message):

  * topic_truncation: when the topic has a topic_embed with a present
    embed_content_cache, the text of that HTML (extracted with
    Nokogiri::HTML5.fragment(...).text) is appended right after the topic
    information, followed by a blank-line separator. Each post now contributes
    the text extracted from post.cooked instead of post.raw.
  * post_truncation: for a first post whose topic has a present
    embed_content_cache, the embedded content text is used in place of the
    post's own cooked text; every other post keeps using the cooked text.
  * NOTE(review): this patch was recovered from a copy whose line breaks and
    indentation had been collapsed. Blank context lines were placed so that
    the hunk line counts (8/13 and 7/12) add up; the leading indentation
    inside the hunks assumes two-space nesting with "def" at 8 columns.
    Confirm against lib/embeddings/strategies/truncation.rb before applying.

 lib/embeddings/strategies/truncation.rb | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/lib/embeddings/strategies/truncation.rb b/lib/embeddings/strategies/truncation.rb
index 6f57b12c..b2b29041 100644
--- a/lib/embeddings/strategies/truncation.rb
+++ b/lib/embeddings/strategies/truncation.rb
@@ -47,8 +47,13 @@ module DiscourseAi
         def topic_truncation(topic, tokenizer, max_length)
           text = +topic_information(topic)
 
+          if topic&.topic_embed&.embed_content_cache&.present?
+            text << Nokogiri::HTML5.fragment(topic.topic_embed.embed_content_cache).text
+            text << "\n\n"
+          end
+
           topic.posts.find_each do |post|
-            text << post.raw
+            text << Nokogiri::HTML5.fragment(post.cooked).text
             break if tokenizer.size(text) >= max_length #maybe keep a partial counter to speed this up?
             text << "\n\n"
           end
@@ -58,7 +63,12 @@ module DiscourseAi
 
         def post_truncation(post, tokenizer, max_length)
           text = +topic_information(post.topic)
-          text << Nokogiri::HTML5.fragment(post.cooked).text
+
+          if post.is_first_post? && post.topic&.topic_embed&.embed_content_cache&.present?
+            text << Nokogiri::HTML5.fragment(post.topic.topic_embed.embed_content_cache).text
+          else
+            text << Nokogiri::HTML5.fragment(post.cooked).text
+          end
 
           tokenizer.truncate(text, max_length)
         end