FEATURE: Generate proper embeddings for posts/topics with embedded content (#401)
parent 6fc1c9f7a6
commit 23b2809638
@@ -47,8 +47,13 @@ module DiscourseAi
       def topic_truncation(topic, tokenizer, max_length)
         text = +topic_information(topic)
 
+        if topic&.topic_embed&.embed_content_cache&.present?
+          text << Nokogiri::HTML5.fragment(topic.topic_embed.embed_content_cache).text
+          text << "\n\n"
+        end
+
         topic.posts.find_each do |post|
-          text << post.raw
+          text << Nokogiri::HTML5.fragment(post.cooked).text
           break if tokenizer.size(text) >= max_length #maybe keep a partial counter to speed this up?
           text << "\n\n"
         end
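For reference, this is how the updated topic_truncation method reads once the hunk above is applied. It is a sketch assembled from the diff, not copied verbatim from the repository: the surrounding truncation strategy class and its topic_information helper are assumed from context, and the closing tokenizer.truncate call sits just outside this hunk (the equivalent line is visible in the post_truncation hunk below).

      # Sketch of the post-change method. Cached embedded (crawled) content on the
      # topic is added first, then each post's rendered HTML converted to plain text,
      # stopping once the token budget is reached.
      def topic_truncation(topic, tokenizer, max_length)
        text = +topic_information(topic)

        if topic&.topic_embed&.embed_content_cache&.present?
          text << Nokogiri::HTML5.fragment(topic.topic_embed.embed_content_cache).text
          text << "\n\n"
        end

        topic.posts.find_each do |post|
          text << Nokogiri::HTML5.fragment(post.cooked).text
          break if tokenizer.size(text) >= max_length # maybe keep a partial counter to speed this up?
          text << "\n\n"
        end

        tokenizer.truncate(text, max_length) # inferred: the hunk ends just before this line
      end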
@@ -58,7 +63,12 @@ module DiscourseAi
 
       def post_truncation(post, tokenizer, max_length)
         text = +topic_information(post.topic)
-        text << Nokogiri::HTML5.fragment(post.cooked).text
+
+        if post.is_first_post? && post.topic&.topic_embed&.embed_content_cache&.present?
+          text << Nokogiri::HTML5.fragment(post.topic.topic_embed.embed_content_cache).text
+        else
+          text << Nokogiri::HTML5.fragment(post.cooked).text
+        end
 
         tokenizer.truncate(text, max_length)
       end
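Both branches rely on Nokogiri::HTML5.fragment(...).text to turn stored HTML (the cooked post body or the cached embed content) into plain text before it is tokenized and truncated. A minimal standalone illustration, outside Discourse; the html string here is made up:

require "nokogiri"

html = "<p>Embedded <strong>article</strong> body</p>"
plain = Nokogiri::HTML5.fragment(html).text
# => "Embedded article body"  (markup is stripped, only the text nodes remain)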