mirror of
				https://github.com/discourse/discourse-ai.git
				synced 2025-10-25 03:28:40 +00:00 
			
		
		
		
	* Well, it was quite a journey, but now tools have "context", which can be critical for the stuff they generate. This entire change was so Dall E and Artist generate images in the correct context. * FIX: improve error handling around image generation - also corrects image markdown and clarifies code * fix spec
		
			
				
	
	
		
			107 lines
		
	
	
		
			2.7 KiB
		
	
	
	
		
			Ruby
		
	
	
	
	
	
			
		
		
	
	
			107 lines
		
	
	
		
			2.7 KiB
		
	
	
	
		
			Ruby
		
	
	
	
	
	
# frozen_string_literal: true

module DiscourseAi
  module AiBot
    module Tools
      # Tool that visits a web page and returns its main content as plain text.
      class WebBrowser < Tool
        # Matches an explicit http:// or https:// scheme (case-insensitive).
        URL_SCHEME_REGEX = %r{\Ahttps?://}i

        # CSS selectors tried, in priority order, to locate the main content.
        MAIN_CONTENT_SELECTORS = [
          "article",
          "main",
          "[role='main']",
          "#main",
          ".main",
          "#content",
          ".content",
        ].freeze

        # Limits handed to the `truncate` helper when trimming the page text.
        MAX_TEXT_LENGTH = 50_000
        MAX_TEXT_PERCENT = 0.3

        # Describes the tool: its name, a description, and its single required
        # string parameter, "url".
        def self.signature
          {
            name: name,
            description:
              "Visits a web page, retrieves the HTML content, extracts the main content, converts it to plain text, and returns the result.",
            parameters: [
              {
                name: "url",
                description: "The URL of the web page to visit.",
                required: true,
                type: "string",
              },
            ],
          }
        end

        # Identifier of this tool.
        def self.name
          "web_browser"
        end

        # The URL to visit, memoized after the first call.
        #
        # Surrounding whitespace is stripped and "https://" is prepended unless
        # the value already starts with an http:// or https:// scheme. The full
        # scheme is checked (not just the "http" prefix) so hosts such as
        # "httpbin.org" still get a scheme, and the check is case-insensitive so
        # "HTTP://..." is left alone. A missing parameter yields an empty string
        # instead of raising.
        #
        # @return [String]
        def url
          return @url if defined?(@url)

          raw = parameters[:url].to_s.strip
          @url =
            if raw.empty? || raw.match?(URL_SCHEME_REGEX)
              raw
            else
              "https://#{raw}"
            end
        end

        # Fetches the page and returns a hash with either the extracted text or
        # an error message.
        #
        # @return [Hash] `{ url:, text: }` on an HTTP 200 response, otherwise
        #   `{ url:, error: }`
        def invoke
          return { url: url, error: "No URL was provided" } if url.empty?

          # NOTE: `return` inside the block exits `invoke` itself.
          send_http_request(url, follow_redirects: true) do |response|
            if response.code == "200"
              html = read_response_body(response)
              text = extract_main_content(html)
              text =
                truncate(
                  text,
                  max_length: MAX_TEXT_LENGTH,
                  percent_length: MAX_TEXT_PERCENT,
                  llm: llm,
                )
              # Report the response's own URI, which may differ from the
              # requested one because redirects are followed.
              return { url: response.uri.to_s, text: text.strip }
            else
              return { url: url, error: "Failed to retrieve the web page: #{response.code}" }
            end
          end

          # Reached only when the request helper never yielded a response.
          { url: url, error: "Failed to retrieve the web page" }
        end

        # Arguments exposing the visited URL.
        def description_args
          { url: url }
        end

        private

        # Converts an HTML string to whitespace-collapsed plain text, preferring
        # the page's main content region and falling back to <body>.
        #
        # @param html [String]
        # @return [String] empty when the document has no usable root node
        def extract_main_content(html)
          doc = Nokogiri.HTML(html)
          doc.search("script, style, comment").remove

          root = find_main_content(doc) || doc.at("body")

          buffer = +""
          # An empty or body-less document has no root; skip the walk rather
          # than calling node methods on nil.
          nodes_to_text(root, buffer) if root

          buffer.gsub(/\s+/, " ")
        end

        # Recursively appends the text beneath `nodes` to `buffer`.
        #
        # Text nodes are followed by a space, <br> becomes a newline, and links
        # are followed by their target as " [href] " when an href is present.
        #
        # @param nodes [Nokogiri::XML::Node]
        # @param buffer [String] mutable string that receives the text
        def nodes_to_text(nodes, buffer)
          if nodes.text?
            buffer << nodes.text
            buffer << " "
            return
          end

          nodes.children.each do |node|
            case node.name
            when "text"
              buffer << node.text
              buffer << " "
            when "br"
              buffer << "\n"
            when "a"
              nodes_to_text(node, buffer)
              href = node["href"]
              # Anchors without an href would otherwise add a meaningless " [] ".
              buffer << " [#{href}] " if href.present?
            else
              nodes_to_text(node, buffer)
            end
          end
        end

        # Returns the first node matching one of MAIN_CONTENT_SELECTORS, or nil.
        # Evaluated lazily so later selectors are not queried after a match.
        def find_main_content(doc)
          MAIN_CONTENT_SELECTORS.lazy.map { |selector| doc.at(selector) }.find(&:present?)
        end
      end
    end
  end
end