FIX: crawler requests not tracked for non UTF-8 user agents

Requests with a non-UTF-8 user_agent were bypassing logging because PG
always wants UTF-8 strings.

This adds a conversion step to ensure we are always dealing with UTF-8.
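
For context, a minimal sketch of the failure mode the message above describes (my own illustration, not code from this commit; the variable name and the ASCII-8BIT source encoding are assumptions):

# Illustration only -- not Discourse code. A crawler can send a user agent
# whose bytes are not valid UTF-8; writing that string to a UTF-8 Postgres
# database is rejected ("invalid byte sequence for encoding UTF8"), so the
# tracking row was never created.
raw = (+"Evil Bot \xc3\x28").force_encoding(Encoding::ASCII_8BIT)

raw.encoding != Encoding::UTF_8                           # => true
raw.dup.force_encoding(Encoding::UTF_8).valid_encoding?   # => false -- "\xc3\x28"
                                                          #    is a broken UTF-8
                                                          #    sequence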
Sam Saffron 2019-12-09 17:43:51 +11:00
parent 9b30922109
commit e440ec2519
2 changed files with 26 additions and 1 deletion

@@ -117,7 +117,12 @@ class Middleware::RequestTracker
     }
     if h[:is_crawler]
-      h[:user_agent] = env['HTTP_USER_AGENT']
+      user_agent = env['HTTP_USER_AGENT']
+      if user_agent.encoding != Encoding::UTF_8
+        user_agent = user_agent.encode("utf-8")
+        user_agent.scrub!
+      end
+      h[:user_agent] = user_agent
     end
     if cache = headers["X-Discourse-Cached"]

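For reference, a rough walk-through of what the added branch above does with the kind of header the spec below feeds in (a sketch of mine, assuming the user agent arrives tagged as Windows-1252, exactly as the test simulates):

# Sketch only: mirrors the added branch above on the spec's sample string.
user_agent = (+"Evil Googlebot String \xc3\x28").force_encoding("Windows-1252")

if user_agent.encoding != Encoding::UTF_8
  user_agent = user_agent.encode("utf-8")   # transcode Windows-1252 -> UTF-8 ("\xc3\x28" becomes "Ã(")
  user_agent.scrub!                         # replace any byte sequences that are still invalid
end

user_agent.encoding          # => #<Encoding:UTF-8>
user_agent.valid_encoding?   # => true, so the WebCrawlerRequest row can be inserted
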
@@ -15,6 +15,26 @@ describe Middleware::RequestTracker do
     }.merge(opts)
   end
+  context "full request" do
+    before do
+      @orig = WebCrawlerRequest.autoflush
+      WebCrawlerRequest.autoflush = 1
+    end
+    after do
+      WebCrawlerRequest.autoflush = @orig
+    end
+    it "can handle rogue user agents" do
+      agent = (+"Evil Googlebot String \xc3\x28").force_encoding("Windows-1252")
+      middleware = Middleware::RequestTracker.new(->(env) { ["200", { "Content-Type" => "text/html" }, [""]] })
+      middleware.call(env("HTTP_USER_AGENT" => agent))
+      expect(WebCrawlerRequest.where(user_agent: agent.encode('utf-8')).count).to eq(1)
+    end
+  end
   context "log_request" do
     before do
       freeze_time Time.now