discourse-ai/spec/lib/modules/ai_bot/tools/web_browser_spec.rb

# frozen_string_literal: true

# Exercises WebBrowser#invoke against stubbed HTTP responses: a successful
# fetch, a failed fetch, text extraction from several HTML shapes, and a
# redirect to a second URL.
RSpec.describe DiscourseAi::AiBot::Tools::WebBrowser do
  let(:llm) { DiscourseAi::Completions::Llm.proxy("open_ai:gpt-4-turbo") }
  let(:bot_user) { DiscourseAi::AiBot::EntryPoint.find_user_from_model("gpt-3.5-turbo") }

  before do
    SiteSetting.ai_openai_api_key = "asd"
    SiteSetting.ai_bot_enabled = true
  end

  # Instantiates the tool for +target+ and returns whatever #invoke yields.
  def run_tool(target)
    described_class.new({ url: target }, bot_user: bot_user, llm: llm).invoke
  end

  describe "#invoke" do
    let(:page_url) { "https://arxiv.org/html/2403.17011v1" }

    it "can retrieve the content of a webpage and returns the processed text" do
      expected_text = "This is a simplified version of the webpage content."
      page =
        "<html><head><title>Test</title></head><body><p>This is a simplified version of the webpage content.</p></body></html>"

      # The GET is answered with a small, fixed document.
      stub_request(:get, page_url).to_return(status: 200, body: page)

      outcome = run_tool(page_url)

      expect(outcome).to have_key(:text)
      expect(outcome[:text]).to eq(expected_text)
      expect(outcome[:url]).to eq(page_url)
    end

    it "returns an error if the webpage cannot be retrieved" do
      # The GET is answered with a server-side failure.
      stub_request(:get, page_url).to_return(status: [500, "Internal Server Error"])

      outcome = run_tool(page_url)

      expect(outcome).to have_key(:error)
      expect(outcome[:error]).to include("Failed to retrieve the web page")
    end
  end

  describe "#invoke with various HTML structures" do
    let(:page_url) { "http://example.com" }

    # Serves +markup+ with a 200 at page_url, runs the tool, and returns the
    # :text entry of the result.
    def text_extracted_from(markup)
      stub_request(:get, page_url).to_return(status: 200, body: markup)
      run_tool(page_url)[:text]
    end

    it "extracts main content from a simple HTML structure" do
      extracted = text_extracted_from("<html><body><p>Simple content.</p></body></html>")

      expect(extracted).to eq("Simple content.")
    end

    it "correctly ignores script and style tags" do
      extracted =
        text_extracted_from(
          "<html><head><script>console.log('Ignore me')</script></head><body><style>body { background-color: #000; }</style><p>Only relevant content here.</p></body></html>",
        )

      expect(extracted).to eq("Only relevant content here.")
    end

    it "extracts content from nested structures" do
      extracted =
        text_extracted_from(
          "<html><body><div><section><p>Nested paragraph 1.</p></section><section><p>Nested paragraph 2.</p></section></div></body></html>",
        )

      expect(extracted).to eq("Nested paragraph 1. Nested paragraph 2.")
    end
  end

  describe "#invoke with redirects" do
    let(:origin) { "http://initial-example.com" }
    let(:destination) { "http://final-example.com" }
    let(:destination_body) { "<html><body><p>Redirected content.</p></body></html>" }

    it "follows redirects and retrieves content from the final destination" do
      # First hop answers 302 pointing at the destination; second hop serves the page.
      stub_request(:get, origin).to_return(status: 302, headers: { "Location" => destination })
      stub_request(:get, destination).to_return(status: 200, body: destination_body)

      outcome = run_tool(origin)

      expect(outcome[:url]).to eq(destination)
      expect(outcome[:text]).to eq("Redirected content.")
    end
  end
end
FEATURE: web browsing tool (#548) This pull request makes several improvements and additions to the GitHub-related tools and personas in the `discourse-ai` repository: 1. It adds the `WebBrowser` tool to the `Researcher` persona, allowing the AI to visit web pages, retrieve HTML content, extract the main content, and convert it to plain text. 2. It updates the `GithubFileContent`, `GithubPullRequestDiff`, and `GithubSearchCode` tools to handle HTTP responses more robustly (introducing size limits). 3. It refactors the `send_http_request` method in the `Tool` class to follow redirects when specified, and to read the response body in chunks to avoid memory issues with large responses. (only for WebBrowser) 4. It updates the system prompt for the `Researcher` persona to provide more detailed guidance on when to use Google search vs web browsing, and how to optimize tool usage and reduce redundant requests. 5. It adds a new `web_browser_spec.rb` file with tests for the `WebBrowser` tool, covering various scenarios like handling different HTML structures and following redirects. 2024-03-28 01:01:58 -04:00			`# frozen_string_literal: true`

			`RSpec.describe DiscourseAi::AiBot::Tools::WebBrowser do`
DEV: Rewire AI bot internals to use LlmModel (#638) * DRAFT: Create AI Bot users dynamically and support custom LlmModels * Get user associated to llm_model * Track enabled bots with attribute * Don't store bot username. Minor touches to migrate default values in settings * Handle scenario where vLLM uses a SRV record * Made 3.5-turbo-16k the default version so we can remove hack 2024-06-18 13:32:14 -04:00			`let(:bot_user) { DiscourseAi::AiBot::EntryPoint.find_user_from_model("gpt-3.5-turbo") }`
FEATURE: web browsing tool (#548) This pull request makes several improvements and additions to the GitHub-related tools and personas in the `discourse-ai` repository: 1. It adds the `WebBrowser` tool to the `Researcher` persona, allowing the AI to visit web pages, retrieve HTML content, extract the main content, and convert it to plain text. 2. It updates the `GithubFileContent`, `GithubPullRequestDiff`, and `GithubSearchCode` tools to handle HTTP responses more robustly (introducing size limits). 3. It refactors the `send_http_request` method in the `Tool` class to follow redirects when specified, and to read the response body in chunks to avoid memory issues with large responses. (only for WebBrowser) 4. It updates the system prompt for the `Researcher` persona to provide more detailed guidance on when to use Google search vs web browsing, and how to optimize tool usage and reduce redundant requests. 5. It adds a new `web_browser_spec.rb` file with tests for the `WebBrowser` tool, covering various scenarios like handling different HTML structures and following redirects. 2024-03-28 01:01:58 -04:00			`let(:llm) { DiscourseAi::Completions::Llm.proxy("open_ai:gpt-4-turbo") }`

			`before do`
			`SiteSetting.ai_openai_api_key = "asd"`
			`SiteSetting.ai_bot_enabled = true`
			`end`

			`describe "#invoke" do`
			`it "can retrieve the content of a webpage and returns the processed text" do`
			`url = "https://arxiv.org/html/2403.17011v1"`
			`processed_text = "This is a simplified version of the webpage content."`

			`# Mocking the web request to return a specific HTML structure`
			`stub_request(:get, url).to_return(`
			`status: 200,`
			`body:`
			`"<html><head><title>Test</title></head><body><p>This is a simplified version of the webpage content.</p></body></html>",`
			`)`

REFACTOR: Simplify tool invocation by removing bot_user and llm parameters (#603) * Well, it was quite a journey but now tools have "context" which can be critical for the stuff they generate This entire change was so Dall E and Artist generate images in the correct context * FIX: improve error handling around image generation - also corrects image markdown and clarifies code * fix spec 2024-05-07 07:55:46 -04:00			`tool = described_class.new({ url: url }, bot_user: bot_user, llm: llm)`
			`result = tool.invoke`
FEATURE: web browsing tool (#548) This pull request makes several improvements and additions to the GitHub-related tools and personas in the `discourse-ai` repository: 1. It adds the `WebBrowser` tool to the `Researcher` persona, allowing the AI to visit web pages, retrieve HTML content, extract the main content, and convert it to plain text. 2. It updates the `GithubFileContent`, `GithubPullRequestDiff`, and `GithubSearchCode` tools to handle HTTP responses more robustly (introducing size limits). 3. It refactors the `send_http_request` method in the `Tool` class to follow redirects when specified, and to read the response body in chunks to avoid memory issues with large responses. (only for WebBrowser) 4. It updates the system prompt for the `Researcher` persona to provide more detailed guidance on when to use Google search vs web browsing, and how to optimize tool usage and reduce redundant requests. 5. It adds a new `web_browser_spec.rb` file with tests for the `WebBrowser` tool, covering various scenarios like handling different HTML structures and following redirects. 2024-03-28 01:01:58 -04:00
			`expect(result).to have_key(:text)`
			`expect(result[:text]).to eq(processed_text)`
			`expect(result[:url]).to eq(url)`
			`end`

			`it "returns an error if the webpage cannot be retrieved" do`
			`url = "https://arxiv.org/html/2403.17011v1"`

			`# Simulating a failed request`
			`stub_request(:get, url).to_return(status: [500, "Internal Server Error"])`

REFACTOR: Simplify tool invocation by removing bot_user and llm parameters (#603) * Well, it was quite a journey but now tools have "context" which can be critical for the stuff they generate This entire change was so Dall E and Artist generate images in the correct context * FIX: improve error handling around image generation - also corrects image markdown and clarifies code * fix spec 2024-05-07 07:55:46 -04:00			`tool = described_class.new({ url: url }, bot_user: bot_user, llm: llm)`
			`result = tool.invoke`
FEATURE: web browsing tool (#548) This pull request makes several improvements and additions to the GitHub-related tools and personas in the `discourse-ai` repository: 1. It adds the `WebBrowser` tool to the `Researcher` persona, allowing the AI to visit web pages, retrieve HTML content, extract the main content, and convert it to plain text. 2. It updates the `GithubFileContent`, `GithubPullRequestDiff`, and `GithubSearchCode` tools to handle HTTP responses more robustly (introducing size limits). 3. It refactors the `send_http_request` method in the `Tool` class to follow redirects when specified, and to read the response body in chunks to avoid memory issues with large responses. (only for WebBrowser) 4. It updates the system prompt for the `Researcher` persona to provide more detailed guidance on when to use Google search vs web browsing, and how to optimize tool usage and reduce redundant requests. 5. It adds a new `web_browser_spec.rb` file with tests for the `WebBrowser` tool, covering various scenarios like handling different HTML structures and following redirects. 2024-03-28 01:01:58 -04:00
			`expect(result).to have_key(:error)`
			`expect(result[:error]).to include("Failed to retrieve the web page")`
			`end`
			`end`

			`describe "#invoke with various HTML structures" do`
			`let(:url) { "http://example.com" }`

			`it "extracts main content from a simple HTML structure" do`
			`simple_html = "<html><body><p>Simple content.</p></body></html>"`
			`stub_request(:get, url).to_return(status: 200, body: simple_html)`

REFACTOR: Simplify tool invocation by removing bot_user and llm parameters (#603) * Well, it was quite a journey but now tools have "context" which can be critical for the stuff they generate This entire change was so Dall E and Artist generate images in the correct context * FIX: improve error handling around image generation - also corrects image markdown and clarifies code * fix spec 2024-05-07 07:55:46 -04:00			`tool = described_class.new({ url: url }, bot_user: bot_user, llm: llm)`
			`result = tool.invoke`
FEATURE: web browsing tool (#548) This pull request makes several improvements and additions to the GitHub-related tools and personas in the `discourse-ai` repository: 1. It adds the `WebBrowser` tool to the `Researcher` persona, allowing the AI to visit web pages, retrieve HTML content, extract the main content, and convert it to plain text. 2. It updates the `GithubFileContent`, `GithubPullRequestDiff`, and `GithubSearchCode` tools to handle HTTP responses more robustly (introducing size limits). 3. It refactors the `send_http_request` method in the `Tool` class to follow redirects when specified, and to read the response body in chunks to avoid memory issues with large responses. (only for WebBrowser) 4. It updates the system prompt for the `Researcher` persona to provide more detailed guidance on when to use Google search vs web browsing, and how to optimize tool usage and reduce redundant requests. 5. It adds a new `web_browser_spec.rb` file with tests for the `WebBrowser` tool, covering various scenarios like handling different HTML structures and following redirects. 2024-03-28 01:01:58 -04:00
			`expect(result[:text]).to eq("Simple content.")`
			`end`

			`it "correctly ignores script and style tags" do`
			`complex_html =`
			`"<html><head><script>console.log('Ignore me')</script></head><body><style>body { background-color: #000; }</style><p>Only relevant content here.</p></body></html>"`
			`stub_request(:get, url).to_return(status: 200, body: complex_html)`

REFACTOR: Simplify tool invocation by removing bot_user and llm parameters (#603) * Well, it was quite a journey but now tools have "context" which can be critical for the stuff they generate This entire change was so Dall E and Artist generate images in the correct context * FIX: improve error handling around image generation - also corrects image markdown and clarifies code * fix spec 2024-05-07 07:55:46 -04:00			`tool = described_class.new({ url: url }, bot_user: bot_user, llm: llm)`
			`result = tool.invoke`
FEATURE: web browsing tool (#548) This pull request makes several improvements and additions to the GitHub-related tools and personas in the `discourse-ai` repository: 1. It adds the `WebBrowser` tool to the `Researcher` persona, allowing the AI to visit web pages, retrieve HTML content, extract the main content, and convert it to plain text. 2. It updates the `GithubFileContent`, `GithubPullRequestDiff`, and `GithubSearchCode` tools to handle HTTP responses more robustly (introducing size limits). 3. It refactors the `send_http_request` method in the `Tool` class to follow redirects when specified, and to read the response body in chunks to avoid memory issues with large responses. (only for WebBrowser) 4. It updates the system prompt for the `Researcher` persona to provide more detailed guidance on when to use Google search vs web browsing, and how to optimize tool usage and reduce redundant requests. 5. It adds a new `web_browser_spec.rb` file with tests for the `WebBrowser` tool, covering various scenarios like handling different HTML structures and following redirects. 2024-03-28 01:01:58 -04:00
			`expect(result[:text]).to eq("Only relevant content here.")`
			`end`

			`it "extracts content from nested structures" do`
			`nested_html =`
			`"<html><body><div><section><p>Nested paragraph 1.</p></section><section><p>Nested paragraph 2.</p></section></div></body></html>"`
			`stub_request(:get, url).to_return(status: 200, body: nested_html)`

REFACTOR: Simplify tool invocation by removing bot_user and llm parameters (#603) * Well, it was quite a journey but now tools have "context" which can be critical for the stuff they generate This entire change was so Dall E and Artist generate images in the correct context * FIX: improve error handling around image generation - also corrects image markdown and clarifies code * fix spec 2024-05-07 07:55:46 -04:00			`tool = described_class.new({ url: url }, bot_user: bot_user, llm: llm)`
			`result = tool.invoke`
FEATURE: web browsing tool (#548) This pull request makes several improvements and additions to the GitHub-related tools and personas in the `discourse-ai` repository: 1. It adds the `WebBrowser` tool to the `Researcher` persona, allowing the AI to visit web pages, retrieve HTML content, extract the main content, and convert it to plain text. 2. It updates the `GithubFileContent`, `GithubPullRequestDiff`, and `GithubSearchCode` tools to handle HTTP responses more robustly (introducing size limits). 3. It refactors the `send_http_request` method in the `Tool` class to follow redirects when specified, and to read the response body in chunks to avoid memory issues with large responses. (only for WebBrowser) 4. It updates the system prompt for the `Researcher` persona to provide more detailed guidance on when to use Google search vs web browsing, and how to optimize tool usage and reduce redundant requests. 5. It adds a new `web_browser_spec.rb` file with tests for the `WebBrowser` tool, covering various scenarios like handling different HTML structures and following redirects. 2024-03-28 01:01:58 -04:00
			`expect(result[:text]).to eq("Nested paragraph 1. Nested paragraph 2.")`
			`end`
			`end`

			`describe "#invoke with redirects" do`
			`let(:initial_url) { "http://initial-example.com" }`
			`let(:final_url) { "http://final-example.com" }`
			`let(:redirect_html) { "<html><body><p>Redirected content.</p></body></html>" }`

			`it "follows redirects and retrieves content from the final destination" do`
			`stub_request(:get, initial_url).to_return(status: 302, headers: { "Location" => final_url })`
			`stub_request(:get, final_url).to_return(status: 200, body: redirect_html)`

REFACTOR: Simplify tool invocation by removing bot_user and llm parameters (#603) * Well, it was quite a journey but now tools have "context" which can be critical for the stuff they generate This entire change was so Dall E and Artist generate images in the correct context * FIX: improve error handling around image generation - also corrects image markdown and clarifies code * fix spec 2024-05-07 07:55:46 -04:00			`tool = described_class.new({ url: initial_url }, bot_user: bot_user, llm: llm)`
			`result = tool.invoke`
FEATURE: web browsing tool (#548) This pull request makes several improvements and additions to the GitHub-related tools and personas in the `discourse-ai` repository: 1. It adds the `WebBrowser` tool to the `Researcher` persona, allowing the AI to visit web pages, retrieve HTML content, extract the main content, and convert it to plain text. 2. It updates the `GithubFileContent`, `GithubPullRequestDiff`, and `GithubSearchCode` tools to handle HTTP responses more robustly (introducing size limits). 3. It refactors the `send_http_request` method in the `Tool` class to follow redirects when specified, and to read the response body in chunks to avoid memory issues with large responses. (only for WebBrowser) 4. It updates the system prompt for the `Researcher` persona to provide more detailed guidance on when to use Google search vs web browsing, and how to optimize tool usage and reduce redundant requests. 5. It adds a new `web_browser_spec.rb` file with tests for the `WebBrowser` tool, covering various scenarios like handling different HTML structures and following redirects. 2024-03-28 01:01:58 -04:00
			`expect(result[:url]).to eq(final_url)`
			`expect(result[:text]).to eq("Redirected content.")`
			`end`
			`end`
			`end`