discourse-ai/spec/lib/modules/embeddings/strategies/truncation_spec.rb

# frozen_string_literal: true

# Spec for the Truncation embeddings strategy: text prepared from a topic must
# fit within the maximum sequence length of the vector representation in use.
RSpec.describe DiscourseAi::Embeddings::Strategies::Truncation do
  subject(:truncation) { described_class.new }

  describe "#prepare_text_from" do
    context "when using vector from OpenAI" do
      # Sets the max_post_length site setting to 100_000 for these examples.
      before { SiteSetting.max_post_length = 100_000 }

      fab!(:topic) { Fabricate(:topic) }

      # Three long, repetitive posts attached to the same topic. Every fixture
      # has its own name so that no declaration redefines another's accessor.
      # NOTE(review): presumably the combined text is meant to exceed the
      # model's max_sequence_length so truncation is really exercised — confirm.
      fab!(:first_post) do
        Fabricate(:post, topic: topic, raw: "Baby, bird, bird, bird\nBird is the word\n" * 500)
      end
      fab!(:second_post) do
        Fabricate(
          :post,
          topic: topic,
          raw: "Don't you know about the bird?\nEverybody knows that the bird is a word\n" * 400,
        )
      end
      fab!(:third_post) { Fabricate(:post, topic: topic, raw: "Surfin' bird\n" * 800) }

      # Vector representation built on top of the strategy under test; it
      # supplies the tokenizer and the max sequence length used below.
      let(:model) do
        DiscourseAi::Embeddings::VectorRepresentations::TextEmbeddingAda002.new(truncation)
      end

      it "truncates a topic" do
        prepared_text =
          truncation.prepare_text_from(topic, model.tokenizer, model.max_sequence_length)

        # The prepared text, measured with the model's own tokenizer, must not
        # exceed the model's limit.
        expect(model.tokenizer.size(prepared_text)).to be <= model.max_sequence_length
      end
    end
  end
end
FIX: Fix embeddings truncation strategy (#139) 2023-08-16 14:09:41 -04:00			`# frozen_string_literal: true`

			`RSpec.describe DiscourseAi::Embeddings::Strategies::Truncation do`
FEATURE: HyDE-powered semantic search. (#136) * FEATURE: HyDE-powered semantic search. It relies on the new outlet added on discourse/discourse#23390 to display semantic search results in an unobtrusive way. We'll use a HyDE-backed approach for semantic search, which consists on generating an hypothetical document from a given keywords, which gets transformed into a vector and used in a asymmetric similarity topic search. This PR also reorganizes the internals to have less moving parts, maintaining one hierarchy of DAOish classes for vector-related operations like transformations and querying. Completions and vectors created by HyDE will remain cached on Redis for now, but we could later use Postgres instead. * Missing translation and rate limiting --------- Co-authored-by: Roman Rizzi <rizziromanalejandro@gmail.com> 2023-09-05 10:08:23 -04:00			`subject(:truncation) { described_class.new }`

			`describe "#prepare_text_from" do`
			`context "when using vector from OpenAI" do`
FIX: Fix embeddings truncation strategy (#139) 2023-08-16 14:09:41 -04:00			`before { SiteSetting.max_post_length = 100_000 }`

			`fab!(:topic) { Fabricate(:topic) }`
			`fab!(:post) do`
			`Fabricate(:post, topic: topic, raw: "Baby, bird, bird, bird\nBird is the word\n" * 500)`
			`end`
			`fab!(:post) do`
			`Fabricate(`
			`:post,`
			`topic: topic,`
			`raw: "Don't you know about the bird?\nEverybody knows that the bird is a word\n" * 400,`
			`)`
			`end`
			`fab!(:post) { Fabricate(:post, topic: topic, raw: "Surfin' bird\n" * 800) }`

FEATURE: HyDE-powered semantic search. (#136) * FEATURE: HyDE-powered semantic search. It relies on the new outlet added on discourse/discourse#23390 to display semantic search results in an unobtrusive way. We'll use a HyDE-backed approach for semantic search, which consists on generating an hypothetical document from a given keywords, which gets transformed into a vector and used in a asymmetric similarity topic search. This PR also reorganizes the internals to have less moving parts, maintaining one hierarchy of DAOish classes for vector-related operations like transformations and querying. Completions and vectors created by HyDE will remain cached on Redis for now, but we could later use Postgres instead. * Missing translation and rate limiting --------- Co-authored-by: Roman Rizzi <rizziromanalejandro@gmail.com> 2023-09-05 10:08:23 -04:00			`let(:model) do`
			`DiscourseAi::Embeddings::VectorRepresentations::TextEmbeddingAda002.new(truncation)`
			`end`
FIX: Fix embeddings truncation strategy (#139) 2023-08-16 14:09:41 -04:00
			`it "truncates a topic" do`
FEATURE: HyDE-powered semantic search. (#136) * FEATURE: HyDE-powered semantic search. It relies on the new outlet added on discourse/discourse#23390 to display semantic search results in an unobtrusive way. We'll use a HyDE-backed approach for semantic search, which consists on generating an hypothetical document from a given keywords, which gets transformed into a vector and used in a asymmetric similarity topic search. This PR also reorganizes the internals to have less moving parts, maintaining one hierarchy of DAOish classes for vector-related operations like transformations and querying. Completions and vectors created by HyDE will remain cached on Redis for now, but we could later use Postgres instead. * Missing translation and rate limiting --------- Co-authored-by: Roman Rizzi <rizziromanalejandro@gmail.com> 2023-09-05 10:08:23 -04:00			`prepared_text =`
			`truncation.prepare_text_from(topic, model.tokenizer, model.max_sequence_length)`
FIX: Fix embeddings truncation strategy (#139) 2023-08-16 14:09:41 -04:00
FEATURE: HyDE-powered semantic search. (#136) * FEATURE: HyDE-powered semantic search. It relies on the new outlet added on discourse/discourse#23390 to display semantic search results in an unobtrusive way. We'll use a HyDE-backed approach for semantic search, which consists on generating an hypothetical document from a given keywords, which gets transformed into a vector and used in a asymmetric similarity topic search. This PR also reorganizes the internals to have less moving parts, maintaining one hierarchy of DAOish classes for vector-related operations like transformations and querying. Completions and vectors created by HyDE will remain cached on Redis for now, but we could later use Postgres instead. * Missing translation and rate limiting --------- Co-authored-by: Roman Rizzi <rizziromanalejandro@gmail.com> 2023-09-05 10:08:23 -04:00			`expect(model.tokenizer.size(prepared_text)).to be <= model.max_sequence_length`
FIX: Fix embeddings truncation strategy (#139) 2023-08-16 14:09:41 -04:00			`end`
			`end`
			`end`
			`end`