HttpClient

2020/12/07
共 1.1k 字
约 6 分钟
归档: 学习

HTTP通信实现库


框架的好处是方便,webmagic让我们很快就能写出一个“有模有样”的爬虫程序,但也带来了框架共有的自由度限制。对于一些难搞的网站,依旧需要使用okhttp、httpclient等库直接发起HTTP请求。所谓难搞,就是单用webmagic发起的请求拿不到我们想要的页面,例如页面通过脚本重定向、需要用同一个客户端连续请求两次,或者请求必须携带Cookies。

maven依赖

<!-- https://mvnrepository.com/artifact/org.apache.httpcomponents/httpclient -->
<dependency>
    <groupId>org.apache.httpcomponents</groupId>
    <artifactId>httpclient</artifactId>
    <version>4.5.12</version>
</dependency>

无参get请求

/**
 * Request configuration shared by the request helpers below.
 * Every timeout is expressed in milliseconds.
 */
public static RequestConfig requestConfig = RequestConfig.custom()
            // Follow redirects automatically (true is also the default).
            .setRedirectsEnabled(true)
            // How long to wait for a connection to be handed out by the connection manager.
            .setConnectionRequestTimeout(5000)
            // How long to wait while establishing the connection.
            .setConnectTimeout(5000)
            // How long to wait for data on the socket (read/write timeout).
            .setSocketTimeout(5000)
            .build();

/**
 * Sends a GET request without extra parameters and returns the body decoded as GB2312.
 *
 * @param url absolute URL to request
 * @return the response body, or an empty string when the response carries no entity
 *         or an {@link IOException} occurs while executing the request
 */
public static String doGetHtml(String url) {
    // Build the request first: the HttpGet constructor rejects a malformed URL by throwing,
    // and at this point no client has been opened yet, so nothing can leak.
    HttpGet httpGet = new HttpGet(url);
    httpGet.setConfig(requestConfig);
    httpGet.setHeader("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.79 Safari/537.1");

    CloseableHttpClient httpClient = HttpClientBuilder.create().build();
    CloseableHttpResponse response = null;
    try {
        response = httpClient.execute(httpGet);
        HttpEntity entity = response.getEntity();
        if (entity != null) {
            return EntityUtils.toString(entity, "gb2312");
        }
    } catch (IOException e) {
        e.printStackTrace();
    } finally {
        // Each resource is closed in its own try so that a failure while closing
        // the response cannot prevent the client from being closed.
        try {
            if (response != null) {
                response.close();
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
        try {
            httpClient.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
    return "";
}

带参get请求

感觉有点多此一举?直接把url写好不就行了,参数会变的话用replace处理一下即可。不过URIBuilder会自动对参数做URL编码,参数里含有中文或特殊字符时比手工拼接更稳妥。

// Request pages 1 to 4 of the listing, building the query string with URIBuilder.
URIBuilder builder = new URIBuilder("https://bid.yimei180.com/");
for (int page = 1; page < 5; page++) {
    // setParameter overwrites a parameter of the same name, so the builder can be reused;
    // pageNo follows the loop counter so that every iteration asks for a different page.
    builder.setParameter("jsessionid", "6B3EB5A0981C01DD0B4BF36537D7C302")
            .setParameter("pageNo", String.valueOf(page))
            .setParameter("pageSize", "10");
    String html = doGetHtml(builder.build().toString());
}

post的表单模式

List<NameValuePair>传参


// Form fields for the POST body; each field is one name/value pair.
NameValuePair currentPage = new BasicNameValuePair("currentPage", "2");
List<NameValuePair> params = new ArrayList<NameValuePair>();
params.add(currentPage);

/**
 * Submits {@code params} as a URL-encoded form (UTF-8) with a POST request.
 *
 * @param url    absolute URL to post to
 * @param params form fields to send
 * @return the response body decoded as UTF-8 when the status code is 200 and an entity is present;
 *         the literal string {@code "error"} otherwise (non-200 status, empty response, or I/O failure)
 * @throws UnsupportedEncodingException if the form entity cannot be encoded
 */
private static String doPost(String url, List<NameValuePair> params) throws UnsupportedEncodingException {
    // Prepare the complete request before opening the client: both the HttpPost constructor and
    // the form-entity constructor can throw, and no client exists yet that would be left open.
    HttpPost httpPost = new HttpPost(url);
    httpPost.setConfig(requestConfig);
    httpPost.setHeader("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.79 Safari/537.1");
    httpPost.setEntity(new UrlEncodedFormEntity(params, "UTF-8"));

    CloseableHttpClient httpClient = HttpClientBuilder.create().build();
    CloseableHttpResponse response = null;
    try {
        response = httpClient.execute(httpPost);
        if (response.getStatusLine().getStatusCode() == 200) {
            HttpEntity entity = response.getEntity();
            if (entity != null) {
                return EntityUtils.toString(entity, "UTF-8");
            }
        }
    } catch (IOException e) {
        e.printStackTrace();
    } finally {
        // Each resource is closed in its own try so that a failure while closing
        // the response cannot prevent the client from being closed.
        try {
            if (response != null) {
                response.close();
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
        try {
            httpClient.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
    return "error";
}

post的json模式

// Use the JSON text as the request body. The (String, String) constructor labels the entity
// as text/plain, so the JSON media type is set explicitly before attaching it to the request.
StringEntity jsonEntity = new StringEntity(json, "UTF-8");
jsonEntity.setContentType("application/json; charset=UTF-8");
httpPost.setEntity(jsonEntity);

post的xml模式

同上,使用StringEntity

使用同一个httpclient发起两次请求

用于解决window.location重定向

/**
 * Fetches a page and, when the answer is an interstitial page containing "跳转中",
 * extracts the "/channel..." target from it and requests that target with the same client.
 *
 * @param url absolute URL of the page to fetch
 * @return the body of the final page decoded as UTF-8; the interstitial page when the redirect
 *         target cannot be extracted or does not answer with status 200; an empty string when
 *         the first request does not answer with status 200 or fails before a body is read
 */
public String getContent(String url) {
    CloseableHttpClient httpClient = null;
    CloseableHttpResponse response = null;
    CloseableHttpResponse response2 = null;
    String result = "";
    try {
        httpClient = getHttpClient(false, true);
        RequestConfig config = RequestConfig.custom()
                .setConnectionRequestTimeout(10 * 1000)
                .setConnectTimeout(10 * 1000)
                .setSocketTimeout(15 * 1000)
                .build();

        response = httpClient.execute(newBrowserGet(url, config));
        response.setHeader("Connection", "close");
        if (response.getStatusLine().getStatusCode() == 200) {
            result = EntityUtils.toString(response.getEntity(), "UTF-8");
            if (result.contains("跳转中")) {
                // The target path runs from the first "/channel" up to the last double quote.
                int pathStart = result.indexOf("/channel");
                int pathEnd = result.lastIndexOf("\"");
                // Only follow the redirect when both markers are present and in order;
                // otherwise the interstitial page is returned unchanged.
                if (pathStart >= 0 && pathEnd > pathStart) {
                    String newUrl = "http://www.ynys.gov.cn" + result.substring(pathStart, pathEnd);
                    response2 = httpClient.execute(newBrowserGet(newUrl, config));
                    response2.setHeader("Connection", "close");
                    // Check the status of the second response, not the first one.
                    if (response2.getStatusLine().getStatusCode() == 200) {
                        result = EntityUtils.toString(response2.getEntity(), "UTF-8");
                    }
                }
            }
        }
    } catch (Exception e) {
        e.printStackTrace();
    } finally {
        // Close every resource in its own try so that one failing close does not skip the others.
        try {
            if (response2 != null) {
                response2.close();
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
        try {
            if (response != null) {
                response.close();
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
        try {
            if (httpClient != null) {
                httpClient.close();
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
    return result;
}

/** Builds a GET request carrying the browser-like headers and the given request configuration. */
private static HttpGet newBrowserGet(String url, RequestConfig config) {
    HttpGet request = new HttpGet(url);
    request.setHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9");
    request.setHeader("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.79 Safari/537.1");
    request.setHeader("Connection", "close");
    request.setConfig(config);
    return request;
}

获取并设置cookie

/**
 * Requests {@code url} and returns the cookies collected in the cookie store afterwards.
 *
 * @param url absolute URL to request
 * @return the cookies formatted as {@code name=value} pairs joined by {@code ;},
 *         or an empty string when no cookie was stored or the request failed
 */
public String getCookie(String url) {
    StringBuilder cookie = new StringBuilder();
    BasicCookieStore cookieStore = new BasicCookieStore();
    // Build exactly one client, wired to the cookie store, so that it can be closed afterwards.
    CloseableHttpClient client = HttpClients.custom().setDefaultCookieStore(cookieStore).build();
    CloseableHttpResponse response = null;
    try {
        HttpGet get = new HttpGet(url);
        RequestConfig requestConfig = RequestConfig.custom().setConnectTimeout(10 * 1000).setSocketTimeout(30 * 1000).build();
        get.setConfig(requestConfig);
        response = client.execute(get);
        for (org.apache.http.cookie.Cookie stored : cookieStore.getCookies()) {
            // Separate pairs with ';' without leaving a trailing separator.
            if (cookie.length() > 0) {
                cookie.append(';');
            }
            cookie.append(stored.getName()).append('=').append(stored.getValue());
        }
    } catch (Exception e) {
        e.printStackTrace();
    } finally {
        // Close the response and the client separately so that one failing close does not skip the other.
        try {
            if (response != null) {
                response.close();
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
        try {
            client.close();
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
    return cookie.toString();
}

/**
 * Fetches {@code path} with a GET request that carries the cookies returned by {@code getCookie},
 * with automatic redirects disabled.
 *
 * @param path absolute URL of the page to fetch
 * @return the response body decoded as UTF-8, or an empty string when the request fails
 */
public String getContent(String path) {
    String respStr = "";
    // getCookie needs the URL to request.
    // NOTE(review): the cookies are taken from the same URL that is fetched below; confirm that
    // this is the page that issues the required cookies.
    String cookie = getCookie(path);
    CloseableHttpClient client = null;
    CloseableHttpResponse res = null;
    try {
        client = getHttpClient(true, false);
        HttpGet httpGet = new HttpGet(path);
        httpGet.addHeader("User-Agent", "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36");
        httpGet.addHeader("Content-type", "text/html");
        httpGet.addHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8");
        // Send the Cookie header only when at least one cookie was obtained.
        if (!"".equals(cookie)) {
            httpGet.addHeader("Cookie", cookie);
        }
        RequestConfig requestConfig = RequestConfig.custom().setConnectTimeout(30 * 1000).setConnectionRequestTimeout(30 * 1000).setSocketTimeout(30 * 1000)
                .setRedirectsEnabled(false)
                .build();
        httpGet.setConfig(requestConfig);
        res = client.execute(httpGet);
        respStr = EntityUtils.toString(res.getEntity(), "utf-8");
    } catch (Exception e) {
        e.printStackTrace();
    } finally {
        // Close the response and the client separately so that one failing close does not skip the other.
        try {
            if (res != null) {
                res.close();
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
        try {
            if (client != null) {
                client.close();
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
    return respStr;
}

留言

本站已运行
© 2024 Jack  由 Hexo 驱动
复制成功