FEATURE: Generate proper embeddings for posts/topics with embedded content (#401)

Rafael dos Santos Silva 2024-01-05 10:27:45 -03:00 committed by GitHub
parent 6fc1c9f7a6
commit 23b2809638
1 changed file with 12 additions and 2 deletions


@@ -47,8 +47,13 @@ module DiscourseAi
         def topic_truncation(topic, tokenizer, max_length)
           text = +topic_information(topic)
 
+          if topic&.topic_embed&.embed_content_cache&.present?
+            text << Nokogiri::HTML5.fragment(topic.topic_embed.embed_content_cache).text
+            text << "\n\n"
+          end
+
           topic.posts.find_each do |post|
-            text << post.raw
+            text << Nokogiri::HTML5.fragment(post.cooked).text
             break if tokenizer.size(text) >= max_length #maybe keep a partial counter to speed this up?
             text << "\n\n"
           end
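
In topic_truncation, the change prepends the embedded article's cached HTML (topic_embed.embed_content_cache) ahead of the posts and switches the per-post text from post.raw to the rendered post.cooked, both reduced to plain text with Nokogiri before the tokenizer measures them. A minimal sketch of that stripping step, with made-up HTML for illustration:

    require "nokogiri"

    # Both embed_content_cache and post.cooked hold HTML; fragment(...).text
    # keeps only the visible text that gets fed to the tokenizer.
    html = "<p>Embedded <strong>article</strong> body</p><blockquote>quoted reply</blockquote>"
    plain = Nokogiri::HTML5.fragment(html).text
    puts plain.inspect # => "Embedded article bodyquoted reply"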
@@ -58,7 +63,12 @@ module DiscourseAi
         def post_truncation(post, tokenizer, max_length)
           text = +topic_information(post.topic)
-          text << Nokogiri::HTML5.fragment(post.cooked).text
+
+          if post.is_first_post? && post.topic&.topic_embed&.embed_content_cache&.present?
+            text << Nokogiri::HTML5.fragment(post.topic.topic_embed.embed_content_cache).text
+          else
+            text << Nokogiri::HTML5.fragment(post.cooked).text
+          end
 
           tokenizer.truncate(text, max_length)
         end
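
In post_truncation, the first post of an embedded topic now contributes the cached article body instead of its own cooked HTML; in Discourse, that first post is usually just a short excerpt linking back to the original article, so the cache yields a far more representative embedding. A minimal sketch of the branch, using a hypothetical source_html_for helper and assuming ActiveSupport's present? is available:

    # Hypothetical helper mirroring the branch above; post, topic_embed and
    # embed_content_cache follow the Discourse models, the helper name does not.
    def source_html_for(post)
      embed = post.topic&.topic_embed
      if post.is_first_post? && embed&.embed_content_cache&.present?
        # Stub first post of an embedded topic: prefer the cached article body.
        embed.embed_content_cache
      else
        post.cooked
      end
    end

    # text << Nokogiri::HTML5.fragment(source_html_for(post)).text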