2023-07-13 12:41:36 -03:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								# frozen_string_literal: true
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								module DiscourseAi
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								  module Embeddings
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    module Strategies
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								      class Truncation
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        def id
							 | 
						
					
						
							
								
									
										
										
										
											2023-09-05 11:08:23 -03:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								          1
							 | 
						
					
						
							
								
									
										
										
										
											2023-07-13 12:41:36 -03:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        end
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        def version
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								          1
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        end
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2023-09-05 11:08:23 -03:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        def prepare_text_from(target, tokenizer, max_length)
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								          case target
							 | 
						
					
						
							
								
									
										
										
										
											2023-07-13 12:41:36 -03:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								          when Topic
							 | 
						
					
						
							
								
									
										
										
										
											2023-09-05 11:08:23 -03:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            topic_truncation(target, tokenizer, max_length)
							 | 
						
					
						
							
								
									
										
										
										
											2023-07-13 12:41:36 -03:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								          when Post
							 | 
						
					
						
							
								
									
										
										
										
											2023-09-05 11:08:23 -03:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            post_truncation(target, tokenizer, max_length)
							 | 
						
					
						
							
								
									
										
										
										
											2023-07-13 12:41:36 -03:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								          else
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            raise ArgumentError, "Invalid target type"
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								          end
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        end
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2023-09-05 11:08:23 -03:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        private
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        def topic_information(topic)
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								          info = +""
							 | 
						
					
						
							
								
									
										
										
										
											2023-07-13 12:41:36 -03:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2023-09-05 11:08:23 -03:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								          info << topic.title
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								          info << "\n\n"
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								          info << topic.category.name
							 | 
						
					
						
							
								
									
										
										
										
											2023-07-13 12:41:36 -03:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								          if SiteSetting.tagging_enabled
							 | 
						
					
						
							
								
									
										
										
										
											2023-09-05 11:08:23 -03:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            info << "\n\n"
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            info << topic.tags.pluck(:name).join(", ")
							 | 
						
					
						
							
								
									
										
										
										
											2023-07-13 12:41:36 -03:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								          end
							 | 
						
					
						
							
								
									
										
										
										
											2023-09-05 11:08:23 -03:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								          info << "\n\n"
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        end
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        def topic_truncation(topic, tokenizer, max_length)
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								          text = +topic_information(topic)
							 | 
						
					
						
							
								
									
										
										
										
											2023-07-13 12:41:36 -03:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2023-07-13 18:59:25 -03:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								          topic.posts.find_each do |post|
							 | 
						
					
						
							
								
									
										
										
										
											2023-09-05 11:08:23 -03:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            text << post.raw
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            break if tokenizer.size(text) >= max_length #maybe keep a partial counter to speed this up?
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            text << "\n\n"
							 | 
						
					
						
							
								
									
										
										
										
											2023-07-13 12:41:36 -03:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								          end
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2023-09-05 11:08:23 -03:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								          tokenizer.truncate(text, max_length)
							 | 
						
					
						
							
								
									
										
										
										
											2023-07-13 12:41:36 -03:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        end
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2023-09-05 11:08:23 -03:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        def post_truncation(topic, tokenizer, max_length)
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								          text = +topic_information(post.topic)
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								          text << post.raw
							 | 
						
					
						
							
								
									
										
										
										
											2023-07-13 12:41:36 -03:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2023-09-05 11:08:23 -03:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								          tokenizer.truncate(text, max_length)
							 | 
						
					
						
							
								
									
										
										
										
											2023-07-13 12:41:36 -03:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        end
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								      end
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    end
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								  end
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								end
							 |