discourse-ai/spec/lib/modules/embeddings/semantic_search_spec.rb
Rafael dos Santos Silva 2c0f535bab
FEATURE: HyDE-powered semantic search. (#136)
* FEATURE: HyDE-powered semantic search.

It relies on the new outlet added on discourse/discourse#23390 to display semantic search results in an unobtrusive way.

We'll use a HyDE-backed approach for semantic search, which consists of generating a hypothetical document from the given keywords, which then gets transformed into a vector and used in an asymmetric similarity topic search.

This PR also reorganizes the internals to have fewer moving parts, maintaining one hierarchy of DAOish classes for vector-related operations like transformations and querying.

Completions and vectors created by HyDE will remain cached on Redis for now, but we could later use Postgres instead.

* Missing translation and rate limiting

---------

Co-authored-by: Roman Rizzi <rizziromanalejandro@gmail.com>
2023-09-05 11:08:23 -03:00

106 lines
3.0 KiB
Ruby

# frozen_string_literal: true
require_relative "../../../support/embeddings_generation_stubs"
require_relative "../../../support/openai_completions_inference_stubs"
# Specs for SemanticSearch#search_for_topics. The completion and embeddings
# calls are replaced with stubs, and the vector similarity lookup is mocked,
# so each example only controls which topic ids come back as candidates and
# then checks which posts the search returns for them.
RSpec.describe DiscourseAi::Embeddings::SemanticSearch do
  # Named subject instead of `let(:subject)`: it keeps `subject` available while
  # giving the examples an intention-revealing name to call.
  subject(:semantic_search) { described_class.new(Guardian.new(user)) }

  fab!(:post) { Fabricate(:post) }
  fab!(:user) { Fabricate(:user) }

  let(:query) { "test_query" }

  describe "#search_for_topics" do
    let(:hypothetical_post) do
      "This is an hypothetical post generated from the keyword test_query"
    end

    before do
      SiteSetting.ai_embeddings_discourse_service_api_endpoint = "http://test.com"

      # Stub the completion for the HyDE prompt built from `query` so that it
      # answers with `hypothetical_post`.
      prompt = DiscourseAi::Embeddings::HydeGenerators::OpenAi.new.prompt(query)
      OpenAiCompletionsInferenceStubs.stub_response(prompt, hypothetical_post)

      # Stub the embeddings service so that `hypothetical_post` maps to a fixed vector.
      hyde_embedding = [0.049382, 0.9999]
      EmbeddingsGenerationStubs.discourse_service(
        SiteSetting.ai_embeddings_model,
        hypothetical_post,
        hyde_embedding,
      )
    end

    # Clear whatever was cached for this query so examples don't affect each other.
    after { described_class.clear_cache_for(query) }

    # Sets a Mocha expectation on any AllMpnetBaseV2 instance: the asymmetric
    # similarity search must be called and will return `candidate_ids`.
    def stub_candidate_ids(candidate_ids)
      DiscourseAi::Embeddings::VectorRepresentations::AllMpnetBaseV2
        .any_instance
        .expects(:asymmetric_topics_similarity_search)
        .returns(candidate_ids)
    end

    it "returns the first post of a topic included in the asymmetric search results" do
      stub_candidate_ids([post.topic_id])

      posts = semantic_search.search_for_topics(query)

      expect(posts).to contain_exactly(post)
    end

    describe "applies different scopes to the candidates" do
      context "when the topic is not visible" do
        it "returns an empty list" do
          post.topic.update!(visible: false)
          stub_candidate_ids([post.topic_id])

          posts = semantic_search.search_for_topics(query)

          expect(posts).to be_empty
        end
      end

      context "when the post is not public" do
        it "returns an empty list" do
          pm_post = Fabricate(:private_message_post)
          stub_candidate_ids([pm_post.topic_id])

          posts = semantic_search.search_for_topics(query)

          expect(posts).to be_empty
        end
      end

      context "when the post type is not visible" do
        it "returns an empty list" do
          post.update!(post_type: Post.types[:whisper])
          stub_candidate_ids([post.topic_id])

          posts = semantic_search.search_for_topics(query)

          expect(posts).to be_empty
        end
      end

      context "when the post is not the first post in the topic" do
        it "returns an empty list" do
          reply = Fabricate(:reply)
          reply.topic.first_post.trash!
          stub_candidate_ids([reply.topic_id])

          posts = semantic_search.search_for_topics(query)

          expect(posts).to be_empty
        end
      end

      context "when the post is not a candidate" do
        it "doesn't include it in the results" do
          post_2 = Fabricate(:post)
          stub_candidate_ids([post.topic_id])

          posts = semantic_search.search_for_topics(query)

          # Assert the positive result as well: `not_to include` alone would
          # also pass if the search returned nothing at all.
          expect(posts).to contain_exactly(post)
          expect(posts).not_to include(post_2)
        end
      end
    end
  end
end