HttpClient 连接池

    1. import org.apache.http.impl.client.CloseableHttpClient;
    2. import org.apache.http.impl.client.HttpClients;
    3. import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
    4. public class HttpClientCreater {
    5. private static PoolingHttpClientConnectionManager cm = new PoolingHttpClientConnectionManager();
    6. public static CloseableHttpClient create() {
    7. CloseableHttpClient httpClient = HttpClients.custom().setConnectionManager(cm).build();
    8. return httpClient;
    9. }
    10. }

爬取京东页面

    1. import java.io.File;
    2. import java.io.FileOutputStream;
    3. import java.io.IOException;
    4. import org.apache.http.client.ClientProtocolException;
    5. import org.apache.http.client.methods.CloseableHttpResponse;
    6. import org.apache.http.client.methods.HttpGet;
    7. import org.apache.http.impl.client.CloseableHttpClient;
    8. import org.apache.http.util.EntityUtils;
    9. import org.jsoup.Jsoup;
    10. import org.jsoup.nodes.Document;
    11. import org.jsoup.nodes.Element;
    12. import org.jsoup.select.Elements;
    13. import com.sftp.spider.util.HttpClientCreater;
    14. import cn.hutool.core.lang.UUID;
    15. public class TestJd {
    16. private static String baseUrl = "https://list.jd.com/list.html?cat=9987%2C653%2C655&s=117&click=0&page=";
    17. public static void main(String[] args) {
    18. try {
    19. CloseableHttpClient httpClient = HttpClientCreater.create();
    20. for (int i = 0; i < 10; i++) {
    21. HttpGet get = new HttpGet(baseUrl + (i * 2 + 1));
    22. get.setHeader("user-agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.63 Safari/537.36");
    23. CloseableHttpResponse httpResponse = httpClient.execute(get);
    24. String html = EntityUtils.toString(httpResponse.getEntity(), "UTF-8");
    25. parseHtml(html);
    26. httpResponse.close();
    27. }
    28. } catch (IOException e) {
    29. e.printStackTrace();
    30. }
    31. }
    32. private static void parseHtml(String html) throws ClientProtocolException, IOException {
    33. Document document = Jsoup.parse(html);
    34. Elements nodes = document.select("li.gl-item");
    35. for (Element node : nodes) {
    36. String sku = node.attr("data-sku");
    37. String spu = node.attr("data-spu");
    38. String title = node.select("div.p-name").text();
    39. String price = node.select("div.p-price i").text();
    40. String picUrl = node.select("div.p-img img").attr("data-lazy-img");
    41. String imgPath = downloadImg(picUrl);
    42. String itemUrl = node.select("div.p-img > a").attr("href");
    43. System.out.println("sku:" + sku + ",spu:" + spu + ",title:" + title + ",price:" + price + ",picUrl:" + picUrl + ",itemUrl:" + itemUrl);
    44. }
    45. }
    46. private static String downloadImg(String url) throws ClientProtocolException, IOException {
    47. File file = new File("D:/tmp/jd/mobile");
    48. if (!file.exists()) {
    49. file.mkdirs();
    50. }
    51. String extName = url.substring(url.lastIndexOf("."));
    52. String fileName = UUID.fastUUID().toString() + extName;
    53. CloseableHttpClient httpClient = HttpClientCreater.create();
    54. HttpGet get = new HttpGet("https:" + url);
    55. get.setHeader("user-agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.63 Safari/537.36");
    56. CloseableHttpResponse response = httpClient.execute(get);
    57. FileOutputStream outpuStream = new FileOutputStream("D:/tmp/jd/mobile/" + fileName);
    58. response.getEntity().writeTo(outpuStream);
    59. outpuStream.close();
    60. response.close();
    61. return fileName;
    62. }
    63. }