HttpClient
2020/12/07
共 1.1k 字
约 6 分钟
归档: 学习
HTTP通信实现库
框架的好处是方便,WebMagic让我们很快就能写出一个“有模有样”的爬虫程序,但也带来了框架共有的自由度限制。对于一些难搞的网站,依旧需要使用OkHttp、HttpClient等直接发起HTTP请求。所谓难搞,就是单用WebMagic发起的请求拿不到我们想要的页面:例如遇到重定向页面时,需要用同一个客户端连续发起两次请求;又如请求必须携带Cookies。
maven依赖
<!-- https://mvnrepository.com/artifact/org.apache.httpcomponents/httpclient -->
<!-- Pulls in Apache HttpComponents HttpClient, version 4.5.12. -->
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpclient</artifactId>
<version>4.5.12</version>
</dependency>
无参get请求
/**
 * Request settings applied by {@code doGetHtml} and {@code doPost}: every timeout is
 * 5000 ms and redirects are followed.
 */
public static RequestConfig requestConfig = RequestConfig.custom()
        // Maximum time to establish the connection to the server, in milliseconds.
        .setConnectTimeout(5000)
        // Maximum time to wait for a connection from the connection manager, in milliseconds.
        .setConnectionRequestTimeout(5000)
        // Maximum wait for data on the socket (read timeout), in milliseconds.
        .setSocketTimeout(5000)
        // Follow HTTP redirects automatically; true is already the library default.
        .setRedirectsEnabled(true)
        .build();
/**
 * Fetches {@code url} with a parameterless GET and returns the response body as text.
 *
 * <p>The body is decoded with the charset declared by the response entity; {@code gb2312}
 * is only the fallback used when the entity declares none. The HTTP status code is not
 * inspected, so error pages are returned as-is.
 *
 * @param url absolute URL to request
 * @return the response body, or an empty string when the response carries no entity or an
 *         I/O error occurs (the stack trace is printed to stderr)
 */
public static String doGetHtml(String url) {
    // Build the request before opening the client so that a rejected URL cannot leak a client.
    HttpGet httpGet = new HttpGet(url);
    httpGet.setConfig(requestConfig);
    httpGet.setHeader("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.79 Safari/537.1");

    // try-with-resources closes the response first and the client second, and still closes
    // the client when closing the response throws.
    try (CloseableHttpClient httpClient = HttpClientBuilder.create().build();
         CloseableHttpResponse response = httpClient.execute(httpGet)) {
        HttpEntity entity = response.getEntity();
        if (entity != null) {
            return EntityUtils.toString(entity, "gb2312");
        }
    } catch (IOException e) {
        e.printStackTrace();
    }
    return "";
}
带参get请求
感觉有点多此一举?直接写个URL不就好了,如果参数会变,用replace处理不就行了?不过URIBuilder会自动对参数值做URL编码,参数里带中文或特殊字符时比手工拼接更稳妥。
// Builds one list-page URL per page number and downloads it with doGetHtml.
// NOTE: new URIBuilder(String) and build() declare URISyntaxException, so the enclosing
// method has to catch or declare it.
URIBuilder builder = new URIBuilder("https://bid.yimei180.com/")
        // These two parameters are the same for every page, so they are set once.
        .setParameter("jsessionid", "6B3EB5A0981C01DD0B4BF36537D7C302")
        .setParameter("pageSize", "10");
for (int page = 1; page < 5; page++) {
    // setParameter overrides any existing value, so only pageNo changes per iteration.
    builder.setParameter("pageNo", String.valueOf(page));
    String html = doGetHtml(builder.build().toString());
}
post的表单模式
用List<NameValuePair>传参
// Form fields for the POST body: one BasicNameValuePair per field.
List<NameValuePair> params = new ArrayList<>();
params.add(new BasicNameValuePair("currentPage", "2"));
/**
 * Sends {@code params} to {@code url} as a UTF-8, URL-encoded form POST.
 *
 * @param url    absolute URL to post to
 * @param params form fields placed in the request body
 * @return the response body as text when the status code is 200 and a body is present;
 *         otherwise the literal string {@code "error"} (I/O errors are printed to stderr)
 * @throws UnsupportedEncodingException if the form body cannot be encoded
 */
private static String doPost(String url, List<NameValuePair> params) throws UnsupportedEncodingException {
    // Build the whole request before opening the client so a failure here cannot leak it.
    HttpPost httpPost = new HttpPost(url);
    httpPost.setConfig(requestConfig);
    httpPost.setHeader("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.79 Safari/537.1");
    httpPost.setEntity(new UrlEncodedFormEntity(params, "UTF-8"));

    // try-with-resources closes the response first and the client second, and still closes
    // the client when closing the response throws.
    try (CloseableHttpClient httpClient = HttpClientBuilder.create().build();
         CloseableHttpResponse response = httpClient.execute(httpPost)) {
        if (response.getStatusLine().getStatusCode() == 200) {
            HttpEntity entity = response.getEntity();
            if (entity != null) {
                return EntityUtils.toString(entity, "UTF-8");
            }
        }
    } catch (IOException e) {
        e.printStackTrace();
    }
    return "error";
}
post的json模式
// StringEntity(String, String) labels the body as text/plain, so the JSON media type is set
// explicitly; the client copies the entity's content type into the Content-Type header.
StringEntity jsonEntity = new StringEntity(json, "UTF-8");
jsonEntity.setContentType("application/json; charset=UTF-8");
httpPost.setEntity(jsonEntity);
post的xml模式
同上,使用StringEntity
使用同一个httpclient发起两次请求
用于解决window.location重定向(这类由JavaScript触发的跳转HttpClient不会自动跟随,需要从返回的页面中解析出目标地址,再用同一个HttpClient请求一次)
/**
 * Downloads {@code url}; when the returned page is the "跳转中" interstitial, extracts the
 * {@code /channel...} target from it and downloads that target with the same client.
 *
 * <p>The client comes from {@code getHttpClient(false, true)}, which is defined outside this
 * block; the meaning of its two flags is not visible here.
 *
 * @param url absolute URL of the page to download
 * @return the body of the final page; the interstitial body when the redirect target cannot
 *         be extracted or the follow-up request does not return 200; an empty string when
 *         the first request does not return 200 or fails (exceptions are printed to stderr)
 */
public String getContent(String url) {
    String result = "";
    try (CloseableHttpClient httpClient = getHttpClient(false, true)) {
        RequestConfig config = RequestConfig.custom()
                .setConnectionRequestTimeout(10 * 1000)
                .setConnectTimeout(10 * 1000)
                .setSocketTimeout(15 * 1000)
                .build();

        try (CloseableHttpResponse response = httpClient.execute(newBrowserGet(url, config))) {
            if (response.getStatusLine().getStatusCode() != 200) {
                return result;
            }
            result = EntityUtils.toString(response.getEntity(), "UTF-8");
        }

        if (!result.contains("跳转中")) {
            return result;
        }

        // The target path runs from "/channel" up to the last double quote on the page.
        int start = result.indexOf("/channel");
        int end = result.lastIndexOf("\"");
        if (start < 0 || end < start) {
            // No usable redirect target: hand back the interstitial page unchanged.
            return result;
        }
        String newUrl = "http://www.ynys.gov.cn" + result.substring(start, end);

        try (CloseableHttpResponse response2 = httpClient.execute(newBrowserGet(newUrl, config))) {
            // Check the status of the follow-up response, not the first one.
            if (response2.getStatusLine().getStatusCode() == 200) {
                result = EntityUtils.toString(response2.getEntity(), "UTF-8");
            }
        }
    } catch (Exception e) {
        e.printStackTrace();
    }
    return result;
}

/** Creates a GET for {@code url} with the Accept/User-Agent/Connection headers and {@code config}. */
private static HttpGet newBrowserGet(String url, RequestConfig config) {
    HttpGet httpGet = new HttpGet(url);
    httpGet.setHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9");
    httpGet.setHeader("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.79 Safari/537.1");
    httpGet.setHeader("Connection", "close");
    httpGet.setConfig(config);
    return httpGet;
}
获取并设置cookie
/**
 * Requests {@code url} once and returns every cookie the client stored, formatted for a
 * {@code Cookie} request header.
 *
 * @param url absolute URL whose response sets the cookies
 * @return the cookies as {@code name=value} pairs joined by {@code ;} with no trailing
 *         separator, or an empty string when no cookie was stored or the request failed
 *         (exceptions are printed to stderr)
 */
public String getCookie(String url) {
    StringBuilder cookie = new StringBuilder();
    BasicCookieStore cookieStore = new BasicCookieStore();
    // A single client bound to the cookie store; try-with-resources closes both it and the response.
    try (CloseableHttpClient client = HttpClients.custom().setDefaultCookieStore(cookieStore).build()) {
        HttpGet get = new HttpGet(url);
        RequestConfig timeouts = RequestConfig.custom().setConnectTimeout(10 * 1000).setSocketTimeout(30 * 1000).build();
        get.setConfig(timeouts);
        // The body is not needed; the request is executed only to populate the cookie store.
        try (CloseableHttpResponse response = client.execute(get)) {
            for (int i = 0; i < cookieStore.getCookies().size(); i++) {
                String name = cookieStore.getCookies().get(i).getName();
                String value = cookieStore.getCookies().get(i).getValue();
                if (cookie.length() > 0) {
                    cookie.append(';');
                }
                cookie.append(name).append('=').append(value);
            }
        }
    } catch (Exception e) {
        e.printStackTrace();
    }
    return cookie.toString();
}
/**
 * Downloads {@code path} with redirects disabled, sending the cookies obtained from
 * {@code getCookie} in a {@code Cookie} header.
 *
 * <p>The client comes from {@code getHttpClient(true, false)}, which is defined outside this
 * block; the meaning of its two flags is not visible here.
 *
 * @param path absolute URL of the page to download
 * @return the response body decoded as UTF-8 text, or an empty string when the request
 *         fails (exceptions are printed to stderr)
 */
public String getContent(String path) {
    String respStr = "";
    // getCookie requires the URL to probe. NOTE(review): this assumes the cookies are issued
    // by the same URL that is being downloaded - confirm against the target site.
    String cookie = getCookie(path);
    try (CloseableHttpClient client = getHttpClient(true, false)) {
        HttpGet httpGet = new HttpGet(path);
        httpGet.addHeader("User-Agent", "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36");
        httpGet.addHeader("Content-type", "text/html");
        httpGet.addHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8");
        if (!"".equals(cookie)) {
            httpGet.addHeader("Cookie", cookie);
        }
        RequestConfig config = RequestConfig.custom()
                .setConnectTimeout(30 * 1000)
                .setConnectionRequestTimeout(30 * 1000)
                .setSocketTimeout(30 * 1000)
                .setRedirectsEnabled(false)
                .build();
        httpGet.setConfig(config);
        // try-with-resources closes the response and then the client even when one close fails.
        try (CloseableHttpResponse res = client.execute(httpGet)) {
            respStr = EntityUtils.toString(res.getEntity(), "utf-8");
        }
    } catch (Exception e) {
        e.printStackTrace();
    }
    return respStr;
}
留言