# frozen_string_literal: true

module DiscourseAi
  module Embeddings
    # Topic search driven by embeddings.
    #
    # For a search term, a "hypothetical post" is generated by the configured
    # HyDE generator, turned into a vector by the current vector
    # representation, and used to look up similar topics. Both the generated
    # post and its embedding are stored in Discourse.cache for one week, keyed
    # by the SHA1 of the search term plus the model names from site settings.
    class SemanticSearch
      # Hard ceiling on the page size, applied on top of Search.per_filter.
      MAX_RESULTS_PER_PAGE = 100

      # Removes the cached hypothetical post and the cached embedding stored
      # for +query+ under the currently configured models.
      #
      # @param query [String] text whose SHA1 digest identifies the cache entries
      def self.clear_cache_for(query)
        digest = OpenSSL::Digest::SHA1.hexdigest(query)
        hyde_model = SiteSetting.ai_embeddings_semantic_search_hyde_model

        Discourse.cache.delete(hyde_cache_key(digest, hyde_model))
        Discourse.cache.delete(
          embedding_cache_key(digest, hyde_model, SiteSetting.ai_embeddings_model),
        )
      end

      # Cache key of the hypothetical post generated for a digest/model pair.
      # Single source of truth for the key format, shared by the class-level
      # cache clearing and the instance-level search path.
      #
      # @param digest [String] hex digest of the search text
      # @param hyde_model [String] name of the HyDE model
      # @return [String]
      def self.hyde_cache_key(digest, hyde_model)
        "semantic-search-#{digest}-#{hyde_model}"
      end

      # Cache key of the embedding computed from the hypothetical post; it is
      # the hypothetical-post key suffixed with the embedding model name.
      #
      # @param digest [String] hex digest of the search text
      # @param hyde_model [String] name of the HyDE model
      # @param embedding_model [String] name of the embedding model
      # @return [String]
      def self.embedding_cache_key(digest, hyde_model, embedding_model)
        "#{hyde_cache_key(digest, hyde_model)}-#{embedding_model}"
      end

      # @param guardian [Guardian] permission context used to scope results
      def initialize(guardian)
        @guardian = guardian
      end

      # Tells whether an embedding is already cached for +query+ under the
      # currently configured models.
      #
      # NOTE(review): this digests +query+ as given, while #search_for_topics
      # digests the term produced by Search.new(query).term. If those differ
      # for a given input the lookup here will miss — confirm that callers
      # pass the already-processed term.
      #
      # @param query [String]
      # @return [Boolean]
      def cached_query?(query)
        digest = OpenSSL::Digest::SHA1.hexdigest(query)
        embedding_key =
          build_embedding_key(
            digest,
            SiteSetting.ai_embeddings_semantic_search_hyde_model,
            SiteSetting.ai_embeddings_model,
          )

        Discourse.cache.read(embedding_key).present?
      end

      # Finds first posts of topics that are semantically close to +query+.
      #
      # @param query [String] raw search input; it is parsed by Search so that
      #   its filters can be applied to the semantic candidates
      # @param page [Integer] 1-based page number; values below 1 are treated
      #   as the first page
      # @return the filtered Post scope, ordered by similarity rank, after
      #   guardian.filter_allowed_categories has been applied
      def search_for_topics(query, page = 1)
        # One row more than the page size is requested — presumably so the
        # caller can tell whether another page exists (TODO confirm).
        limit = [Search.per_filter, MAX_RESULTS_PER_PAGE].min + 1
        # Clamp the page so a zero/negative value cannot yield a negative offset.
        offset = ([page.to_i, 1].max - 1) * limit

        search = Search.new(query, { guardian: guardian })
        search_term = search.term

        strategy = DiscourseAi::Embeddings::Strategies::Truncation.new
        vector_rep =
          DiscourseAi::Embeddings::VectorRepresentations::Base.current_representation(strategy)

        digest = OpenSSL::Digest::SHA1.hexdigest(search_term)
        hyde_key = build_hyde_key(digest, SiteSetting.ai_embeddings_semantic_search_hyde_model)

        embedding_key =
          build_embedding_key(
            digest,
            SiteSetting.ai_embeddings_semantic_search_hyde_model,
            SiteSetting.ai_embeddings_model,
          )

        hypothetical_post =
          Discourse
            .cache
            .fetch(hyde_key, expires_in: 1.week) do
              hyde_generator = DiscourseAi::Embeddings::HydeGenerators::Base.current_hyde_model.new
              hyde_generator.hypothetical_post_from(search_term)
            end

        hypothetical_post_embedding =
          Discourse
            .cache
            .fetch(embedding_key, expires_in: 1.week) { vector_rep.vector_from(hypothetical_post) }

        candidate_topic_ids =
          vector_rep.asymmetric_topics_similarity_search(
            hypothetical_post_embedding,
            limit: limit,
            offset: offset,
          )

        semantic_results =
          ::Post
            .where(post_type: ::Topic.visible_post_types(guardian.user))
            .public_posts
            .where("topics.visible")
            .where(topic_id: candidate_topic_ids, post_number: 1)
            # Raw SQL fragment built only from integers, hence safe to mark
            # as such with Arel.sql.
            .order(Arel.sql(similarity_order_sql(candidate_topic_ids)))

        query_filter_results = search.apply_filters(semantic_results)

        guardian.filter_allowed_categories(query_filter_results)
      end

      private

      attr_reader :guardian

      # Builds the ORDER BY fragment that keeps posts in the order in which
      # the similarity search returned their topic ids.
      #
      # The array literal is explicitly cast to int[]: without the cast an
      # empty candidate list renders as a bare `ARRAY[]`, which PostgreSQL
      # rejects because it cannot determine the element type. Ids are coerced
      # with to_i so nothing but integers can reach the SQL string, and the
      # column is table-qualified so it stays unambiguous if later filters
      # join tables that also expose a topic_id column.
      #
      # @param candidate_topic_ids [Array<Integer>]
      # @return [String]
      def similarity_order_sql(candidate_topic_ids)
        id_list = candidate_topic_ids.map(&:to_i).join(", ")
        "array_position(ARRAY[#{id_list}]::int[], posts.topic_id)"
      end

      # Instance-side shortcut to the shared hypothetical-post key format.
      def build_hyde_key(digest, hyde_model)
        self.class.hyde_cache_key(digest, hyde_model)
      end

      # Instance-side shortcut to the shared embedding key format.
      def build_embedding_key(digest, hyde_model, embedding_model)
        self.class.embedding_cache_key(digest, hyde_model, embedding_model)
      end
    end
  end
end