diff --git a/src/site/apt/index.apt b/src/site/apt/index.apt index 3d6d147f9..139c4775f 100644 --- a/src/site/apt/index.apt +++ b/src/site/apt/index.apt @@ -62,7 +62,7 @@ Features * Tunneled HTTPS connections through HTTP proxies, via the CONNECT method. - * Basic, Digest authentication schemes. Please note NTLM is currently not supported. + * Basic, Digest authentication schemes. Please note NTLM is supported only partially. * Plug-in mechanism for custom authentication schemes. diff --git a/src/site/apt/ntlm.apt b/src/site/apt/ntlm.apt new file mode 100644 index 000000000..daaf61d94 --- /dev/null +++ b/src/site/apt/ntlm.apt @@ -0,0 +1,183 @@ +~~ $HeadURL$ +~~ $Revision$ +~~ $Date$ +~~ +~~ ==================================================================== +~~ Licensed to the Apache Software Foundation (ASF) under one +~~ or more contributor license agreements. See the NOTICE file +~~ distributed with this work for additional information +~~ regarding copyright ownership. The ASF licenses this file +~~ to you under the Apache License, Version 2.0 (the +~~ "License"); you may not use this file except in compliance +~~ with the License. You may obtain a copy of the License at +~~ +~~ http://www.apache.org/licenses/LICENSE-2.0 +~~ +~~ Unless required by applicable law or agreed to in writing, +~~ software distributed under the License is distributed on an +~~ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +~~ KIND, either express or implied. See the License for the +~~ specific language governing permissions and limitations +~~ under the License. +~~ ==================================================================== +~~ +~~ This software consists of voluntary contributions made by many +~~ individuals on behalf of the Apache Software Foundation. For more +~~ information on the Apache Software Foundation, please see +~~ . 
+ + ---------- + NTLM support in HttpClient + ---------- + ---------- + ---------- + +NTLM support in HttpClient + + Currently HttpClient 4.0 does not provide support for the NTLM authentication scheme + out of the box and probably never will. The reasons for that are legal rather than + technical. + +* Background + + NTLM is a proprietary authentication scheme developed by Microsoft and optimized for + Windows operating system. + + Until year 2008 there was no official, publicly available, complete documentation of + the protocol. {{{http://davenport.sourceforge.net/ntlm.html}Unofficial}} 3rd party + protocol descriptions existed as a result of reverse-engineering efforts. It was not + really known whether the protocol based on the reverse-engineering were complete or + even correct. + + Microsoft published {{{http://download.microsoft.com/download/a/e/6/ae6e4142-aa58-45c6-8dcf-a657e5900cd3/%5BMS-NLMP%5D.pdf}MS-NLMP}} + and {{{http://download.microsoft.com/download/a/e/6/ae6e4142-aa58-45c6-8dcf-a657e5900cd3/%5BMS-NTHT%5D.pdf}MS-NTHT}} + specifications in February 2008 as a part of its + {{{http://www.microsoft.com/interop/principles/default.mspx}Interoperability + Principles initiative}}. Unfortunately, it is still not entirely clear whether NTLM + encryption algorithms are covered by any patents held by Microsoft, which would make + commercial users of open-source NTLM implementations liable for the use of Microsoft + intellectual property. + +* Enabling NTLM support in HttpClient 4.x + + The good news is HttpClient is fully NTLM capable right out of the box. + HttpClient ships with the NTLM authentication scheme, which, if configured + to use an external NTLM engine, can handle NTLM challenges and authenticate + against NTLM servers. 
+ +---------------------------------------- +public interface NTLMEngine { + + String generateType1Msg( + String domain, + String workstation) throws NTLMEngineException; + + String generateType3Msg( + String username, + String password, + String domain, + String workstation, + String challenge) throws NTLMEngineException; + +} +---------------------------------------- + +* Using Samba JCIFS as an NTLM engine + + Follow these instructions to build an NTLMEngine implementation using JCIFS library + + <>. + + * Download the latest release of the JCIFS library from the + {{{http://jcifs.samba.org/}Samba}} web site + + * Implement NTLMEngine interface + +---------------------------------------- +import jcifs.ntlmssp.Type1Message; +import jcifs.ntlmssp.Type2Message; +import jcifs.ntlmssp.Type3Message; +import jcifs.util.Base64; + +import org.apache.http.impl.auth.NTLMEngine; +import org.apache.http.impl.auth.NTLMEngineException; + +public class JCIFSEngine implements NTLMEngine { + + public String generateType1Msg( + String domain, + String workstation) throws NTLMEngineException { + + Type1Message t1m = new Type1Message( + Type1Message.getDefaultFlags(), + domain, + workstation); + return Base64.encode(t1m.toByteArray()); + } + + public String generateType3Msg( + String username, + String password, + String domain, + String workstation, + String challenge) throws NTLMEngineException { + Type2Message t2m; + try { + t2m = new Type2Message(Base64.decode(challenge)); + } catch (IOException ex) { + throw new NTLMEngineException("Invalid Type2 message", ex); + } + Type3Message t3m = new Type3Message( + t2m, + password, + domain, + username, + workstation); + return Base64.encode(t3m.toByteArray()); + } + +} +---------------------------------------- + + * Implement AuthSchemeFactory interface + +---------------------------------------- +import org.apache.http.auth.AuthScheme; +import org.apache.http.auth.AuthSchemeFactory; +import org.apache.http.impl.auth.NTLMScheme; +import 
org.apache.http.params.HttpParams; + +public class NTLMSchemeFactory implements AuthSchemeFactory { + + public AuthScheme newInstance(final HttpParams params) { + return new NTLMScheme(new JCIFSEngine()); + } + +} +---------------------------------------- + + * Register NTLMSchemeFactory with the HttpClient instance you want to NTLM + enable. + +---------------------------------------- +httpclient.getAuthSchemes().register("ntlm", new NTLMSchemeFactory()); +---------------------------------------- + + * Set NTCredentials for the web server you are going to access. + +---------------------------------------- +httpclient.getCredentialsProvider().setCredentials( + new AuthScope("myserver", -1), + new NTCredentials("username", "password", "MYSERVER", "MYDOMAIN")); +----------------------------------------------------------- + + * You are done. + + +* Why this code is not distributed with HttpClient + + JCIFS is licensed under the Lesser General Public License (LGPL). This license + is not compatible with the Apache Licenses under which all Apache Software is + released. Lawyers of the Apache Software Foundation are currently investigating + under which conditions Apache software is allowed to make use of LGPL software. diff --git a/src/site/apt/primer.apt b/src/site/apt/primer.apt new file mode 100644 index 000000000..73503d5da --- /dev/null +++ b/src/site/apt/primer.apt @@ -0,0 +1,670 @@ +~~ $HeadURL$ +~~ $Revision$ +~~ $Date$ +~~ +~~ ==================================================================== +~~ Licensed to the Apache Software Foundation (ASF) under one +~~ or more contributor license agreements. See the NOTICE file +~~ distributed with this work for additional information +~~ regarding copyright ownership. The ASF licenses this file +~~ to you under the Apache License, Version 2.0 (the +~~ "License"); you may not use this file except in compliance +~~ with the License. 
You may obtain a copy of the License at +~~ +~~ http://www.apache.org/licenses/LICENSE-2.0 +~~ +~~ Unless required by applicable law or agreed to in writing, +~~ software distributed under the License is distributed on an +~~ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +~~ KIND, either express or implied. See the License for the +~~ specific language governing permissions and limitations +~~ under the License. +~~ ==================================================================== +~~ +~~ This software consists of voluntary contributions made by many +~~ individuals on behalf of the Apache Software Foundation. For more +~~ information on the Apache Software Foundation, please see +~~ . + + ---------- + Client HTTP Programming Primer + ---------- + ---------- + ---------- + +Client HTTP Programming Primer + +* About + + This document is intended for people who suddenly have to or want to implement + an application that automates something usually done with a browser, + but are missing the background to understand what they actually need to do. + It provides guidance on the steps required to implement a program that + interacts with a web site which is designed to be used with a browser. + It does not save you from eventually learning the background of what + you are doing, but it should help you to get started quickly and learn + the details later. + + This document has evolved from discussions on the HttpClient mailing lists. + Although it refers to HttpClient, the concepts described here apply equally + to HttpComponents or SUN's {{{http://java.sun.com/j2se/1.4.2/docs/api/java/net/HttpURLConnection.html}HttpURLConnection}} + or any other HTTP communication library for any programming language. So you + might find it useful even if you're not using Java and HttpClient. + + The existence of this document does not imply that the HttpClient community + feels responsible for teaching you how to program a client HTTP application. 
+ It is merely a way for us to reduce the noise on the mailing list without + just leaving the newbies out in the cold. + +* Scenario + + Let's assume that you have some kind of repetitive, web-based task that + you want to automate. Something like: + + * go to page http://xxx.yyy.zzz/login.html + + * enter username and password in a web form and hit the "login" button + + * navigate to a specific page + + * check the number/headline/whatever shown on that page + + [] + + At this time, we don't have a specific example which could be developed + into a sample application. So this document is all bla-bla, and you will + have to work out the details - all the details - yourself. Such is life. + +* Caveat + + This scenario describes a hobbyist usage of HTTP, in other words: + <>. Web sites are designed for user interaction, not + as an application programming interface (API). The interface of a + web site is the user interface displayed by a browser. The HTTP + communication between the browser and the server is an internal API, + subject to change without notice. + + A web site can be redesigned at any point in time. The server then + sends different documents and a browser will display the new content. + The user easily adjusts to click the appropriate links, and the browser + communicates via HTTP as specified by the new documents from the server. + Your application that only mimics a browser will simply break. + + Nevertheless, implementing this scenario will help you to get + familiar with HTTP communication. It is also "good enough" for + hobbyist applications, for example if you want to download the + latest installment of your favorite daily webcomic to install + it as the screen background. There is no big damage if such an + application breaks. + + If you want to implement a solid application, you should use only + published APIs. For example, to check for new mail on your webmail + account, you should ask the webmail provider for POP or IMAP access.
+ These are standardized protocols supported by most email client applications. + If you want to have a newsticker, look for RSS feeds from the provider and + applications that display them. + + As another example, if you want to perform a web search, there are + search companies that provide an API for using their search engines. + Unlike the examples before, such APIs are proprietary. You will still + have to implement an application, but then you are using a published API + that the provider will not change without notice. + + +* Not a Browser + + HttpClient is not a browser. Here's the difference. + + <> + +[images/browser.png] Browser + + The figure shows some of the components you will find in a browser. + To the left, there is the user interface. The browser needs a rendering + engine to display pages, and to interpret user input such as mouse clicks + somewhere on the displayed page. There is a layout engine which computes + how an HTML page should be displayed, including cascading style sheets + and images. A JavaScript interpreter runs JavaScript code embedded in + or referenced from HTML pages. Events from the user interface are passed + to the JavaScript interpreter for processing. + On the top, there are interfaces for plugins that can handle Applets, + embedded media objects like PDF files, Quicktime movies and Flash animations, + or ActiveX controls that can do anything. + + In the center of the figure you can find internal components. Browsers + have a cache of recently accessed documents and image files. They need + to remember cookies and passwords entered by the user. Such information + can be kept in memory or stored persistently in the file system at the + bottom of the figure, to be available again when the browser is restarted. + Certificates for secure communication are almost always stored persistently. + To the right of the figure is the network. Browsers support many protocols + on different levels of abstraction.
There are application protocols + such as FTP and HTTP to retrieve documents from servers, and transport + layer protocols such as TLS/SSL and Socks to establish connections for + the application protocols. + + One characteristic of browsers that is not shown in the figure is tolerance + for bad input. There needs to be tolerance for invalid user input to make + the browser user friendly. There also needs to be tolerance for malformed + documents retrieved from servers, and for flaws in server behavior when + executing protocols, to make as many websites as possible accessible to + the user. + + <> + +[images/httpclient.png] HTTP Client + + The figure shows some of the components you will find in a browser, + and highlights the scope of HttpClient. The primary responsibility + of HttpClient is the HTTP protocol, executed directly or through an + HTTP proxy. It provides interfaces and default implementations for + cookie and password management, but not for persisting such data. + User interfacing, HTML parsing, plugins or non-HTTP application level + protocols are not in the scope of HttpClient. It does provide interfaces + to plug in transport layer protocols, but it does not implement such + protocols. + + All the rest of a browser's functionality you require needs to be + provided by your application. HttpClient executes HTTP requests, but it + will not and can not assemble them. Since HttpClient does not interface + with the user, nor interpret content such as HTML files, there is + little or no tolerance for bad data passed to the API. There is some + tolerance for flaws in server behavior, but there are limits to the + deviations HttpClient can handle. + +* Terminology + + This section introduces some important terms you have to know to + understand the rest of this document. + + <<>> + + consists of a header section and an optional entity. There are two kinds + of messages, requests and responses. 
They differ in the format of the + first line, but both can have header fields and an optional entity. + + <<>> + + is sent from a client to a server. The first line includes the URI for + which the request is sent, and a method that the server should execute + for the client. + + <<>> + + is sent from a server to a client in response to a request. The first + line includes a status code that tells about success or failure of + the request. HTTP defines a set of status codes, like 200 for success + and 404 for not found. Other protocols based on HTTP can define + additional status codes. + + <<>> + + is an operation requested from the server. HTTP defines a set of + operations, the most frequent being GET and POST. Other protocols + based on HTTP can define additional methods. + + <<
>> + + are name-value pairs, where both name and value are text. The name of + a header field is not case sensitive. Multiple values can be assigned + to the same name. RFC 2616 defines a wide range + of header fields for handling various aspects of the HTTP protocol. + Other specifications, like RFC 2617 and RFC 2965, define additional + headers. Some of the defined headers are for general use, others are + meant for exclusive use with either requests or responses, still others + are meant for use only with an entity. + + <<>> + + is data sent with an HTTP message. For example, a response can contain + the page or image you are downloading as an entity, or a request can + include the parameters that you entered into a web form. + The entity of an HTTP message can have an arbitrary data format, which + is usually specified as a MIME type in a header field. + + <<>> + + is a series of requests from a single source to a server. The server + can keep session data, and needs to recognize the session to which + each incoming request belongs. For example, if you execute a web search, + the server will only return one page of search results. But it keeps + track of the other results and makes them available when you click on + the link to the "next" page. The server needs to know from the request + that it is you and your session for which more results are requested, + and not me and my session. That's because I searched for something else. + + <<>> + + are the preferred way for servers to track sessions. The server supplies + a piece of data, called a cookie, in response to a request. The server + expects the client to send that piece of data in a header field with each + following request of the same session. + The cookie is different for each session, so the server can identify to + which session a request belongs by looking at the cookie. If the cookie + is missing from a request, the server will not respond as expected. 
+ +* Step by Step + +** GET the Login Page + + Create and execute a GET request for the login page. + Just use the link you would type into the browser as the URL. + This is what a browser does when you enter a URL in the address bar + or when you click on a link that points to another web page. + + Inspect the response from the server: + + * do you get the page you expected? + + [] + + It should be sent as the entity of the response to your request. + The entity is also referred to as the response body. + + * do you get a session cookie? + + [] + + Cookies are sent in a header field named Set-Cookie or Set-Cookie2. + It is possible that you don't get a session cookie until you log in. + If there is no session cookie in the response, you'll have to perform + step 2 later, after you reach the point where the cookie is set. + + If you do not get the page you expect, check the URL you are requesting. + If it is correct, the server may use browser detection. You will have + to set the header field User-Agent to a value used by a popular browser + to pretend that the request is coming from that browser. + + If you can't get the login page, get the home page instead now. + Get the login page in the next step, when you establish the session. + +** Establish the Session + + Create and execute another GET request for a page. + You can simply request the login page again, or some other page + of which you know the URL. Do NOT try to get a page which would + be returned in response to submitting a web form. Use something + you can reach simply by clicking on a link in the browser. Something + where you can see the URL in the browser status line while the + mouse pointer is hovering over the link. + + This step is important when developing the application. Once you know + that your application does establish the session correctly, you may + be able to remove it.
Only if you couldn't get the login page directly + and had to get the home page first, you know you have to leave it in. + + Inspect the request being sent to the server. + + * is the session cookie sent with the request? + + [] + + You can see what is sent to the server by enabling the wire log + for HttpClient. You only need to see the request headers, not the body. + The session cookie should be sent in a header field called Cookie. + There may be several of those, and other cookies might be sent as well. + + Inspect the response from the server: + + * do you get another session cookie? + + [] + + You should not get another session cookie. If you get the same session + cookie as before, the server behaves a little strangely, but that should + not be a problem. If you get a new session cookie, then the server did + not recognize the session for the request. Usually, this happens if the + request did not contain the session cookie. But servers might use other + means to track sessions, or to detect session hijacking. + + If the session cookie is not sent in the request, one of two things + has gone wrong. Either the cookie was not detected in the previous + response, or the cookie was not selected for being sent with the new + request. + + HttpClient automatically parses cookies sent in responses and puts them + into a cookie store. HttpClient uses a configurable cookie policy + to decide whether a cookie being sent from a server is correct. + The default policy complies strictly with RFC 2109, but many servers + do not. Play around with the cookie policies until the cookie is + accepted and put into the cookie store. + + If the cookie is accepted from the previous response but still not + sent with the new request, make sure that HttpClient uses the same + cookie store object. Unless you explicitly manage cookie store + objects (not recommended for newbies!), this will be the case if you + use the same HttpClient object to execute both requests.
+ + If the cookie is still not sent with the request, make sure that the + URL you are requesting is in the scope for the cookie. Cookies are + only sent to the domain and path specified in the cookie scope. + A cookie for host "jakarta.apache.org" will not be sent to host + "tomcat.apache.org". A cookie for domain ".apache.org" will be sent + to both. A cookie for host "apache.org", without the leading dot, + will not be sent to "jakarta.apache.org". The latter case can be + resolved by using a different cookie spec that adds the leading dot. + In the other cases, use a URL that is in the cookie scope to establish + the session. + + If the session cookie is sent with the request, but a new session cookie + is set in the response anyway, check whether there are cookies other + than the session cookie in the request. Some servers are incapable of + detecting multiple cookies sent in individual header fields. HttpClient + can be advised to put all cookies into a single header field. + + If that doesn't help, you are in trouble. The server may use additional + means to track the session, for example the header field named Referer. + Set that field to the URL of the previous request. + ({{{http://mail-archives.apache.org/mod_mbox/jakarta-httpclient-user/200602.mbox/%3c19b.44e04b45.31166eaa@aol.com%3e}see this mail}}) + + If that doesn't help either, you will have to compare the request from + your application to a corresponding one generated by a browser. The + instructions in step 5 for POST requests apply for GET requests as well. + It's even simpler with GET, since you don't have an entity. + +** Analyze the Form + + Now it is time to analyze the form defined in the HTML markup of the page. + A form in HTML is a set of name-value-pairs called parameters, where some + of the values can be entered in the browser. By analyzing the HTML markup, + you can learn which parameters you have to define and how to send them + to the server. + + Look for the <form>
tag in the page source. There may be several forms in + the page, but they can not be nested. Locate the form you want to submit. + Locate the matching </form>
tag. Everything in between the two may be + relevant. Let's start with the attributes of the <form>
tag: + + <<<method>>> + + specifies the method used for submitting the form. If it is GET or + not specified at all, then you need to create a GET request. The parameters + will be added as a query string to the URL. If the method is POST, you + need to create a POST request. The parameters will be put in the entity + of the request, also referred to as the request body. + How to do that is discussed in step 5. + + <<<action>>> + + specifies the URL to which the request has to be sent. Do not try to + get this URL from the address bar of your browser! A browser will + automatically follow redirects and only displays the final URL, which + can be different from the URL in this attribute. + It is possible that the URL includes a query string that specifies + some parameters. If so, keep that in mind. + + <<<enctype>>> + + specifies the MIME type for the entity of the request generated by the + form. The two common cases are url-encoded (default) and multipart-mime. + Note that these terms are just informally used here, the exact values + that need to be written in an HTML document are specified elsewhere. + This attribute is only used for the POST method. If the method is GET, + the parameters will always be url-encoded, but not in an entity. + + <<<accept-charset>>> + + specifies the character set that the browser should allow for user input. + It will not be discussed here, but you will have to consider this value + if you experience charset related problems. + + Except for optional query parameters in the action attribute, the parameters + of a form are specified by HTML tags between <form> and </form>. + The following is a list of tags that can be used to define parameters. + Except where stated otherwise, they have a name attribute which specifies + the name of the parameter. The value of the parameter usually depends on + user input. + +---------------------------------------- + + +---------------------------------------- + + specify single-line input fields.
Using the return key in one of these + fields will submit the form, so the value really is a single line of + input from the user. + +---------------------------------------- + + +---------------------------------------- + + specify a parameter that can not be changed by the user. + The value of the parameter is given by the value attribute. + +---------------------------------------- + + +---------------------------------------- + + specify a parameter that can be included or omitted. There usually is + more than one tag with the same name. For radio buttons, only one can + be selected and the value of the parameter is the value of the selected + radio button. For checkboxes, more than one can be selected. There will + be one name-value-pair for each selected checkbox, with the same name + for all of them. + +---------------------------------------- + +