HttpClient 连接池
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;

/**
 * Factory for {@link CloseableHttpClient} instances that all draw their connections from a
 * single, statically held {@link PoolingHttpClientConnectionManager}.
 */
public class HttpClientCreater {

    /** The one connection pool backing every client returned by {@link #create()}. */
    private static final PoolingHttpClientConnectionManager cm = new PoolingHttpClientConnectionManager();

    /**
     * Builds a new client on top of the shared connection pool.
     *
     * <p>The pool is flagged as shared so that calling {@code close()} on one returned client
     * does not shut the pool down underneath every other client created by this factory.
     *
     * @return a new client bound to the shared pool
     */
    public static CloseableHttpClient create() {
        return HttpClients.custom()
                .setConnectionManager(cm)
                // Without this flag the client owns the manager and closes it on close().
                .setConnectionManagerShared(true)
                .build();
    }
}
爬取京东页面
import java.io.File;import java.io.FileOutputStream;import java.io.IOException;import org.apache.http.client.ClientProtocolException;import org.apache.http.client.methods.CloseableHttpResponse;import org.apache.http.client.methods.HttpGet;import org.apache.http.impl.client.CloseableHttpClient;import org.apache.http.util.EntityUtils;import org.jsoup.Jsoup;import org.jsoup.nodes.Document;import org.jsoup.nodes.Element;import org.jsoup.select.Elements;import com.sftp.spider.util.HttpClientCreater;import cn.hutool.core.lang.UUID;public class TestJd {private static String baseUrl = "https://list.jd.com/list.html?cat=9987%2C653%2C655&s=117&click=0&page=";public static void main(String[] args) {try {CloseableHttpClient httpClient = HttpClientCreater.create();for (int i = 0; i < 10; i++) {HttpGet get = new HttpGet(baseUrl + (i * 2 + 1));get.setHeader("user-agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.63 Safari/537.36");CloseableHttpResponse httpResponse = httpClient.execute(get);String html = EntityUtils.toString(httpResponse.getEntity(), "UTF-8");parseHtml(html);httpResponse.close();}} catch (IOException e) {e.printStackTrace();}}private static void parseHtml(String html) throws ClientProtocolException, IOException {Document document = Jsoup.parse(html);Elements nodes = document.select("li.gl-item");for (Element node : nodes) {String sku = node.attr("data-sku");String spu = node.attr("data-spu");String title = node.select("div.p-name").text();String price = node.select("div.p-price i").text();String picUrl = node.select("div.p-img img").attr("data-lazy-img");String imgPath = downloadImg(picUrl);String itemUrl = node.select("div.p-img > a").attr("href");System.out.println("sku:" + sku + ",spu:" + spu + ",title:" + title + ",price:" + price + ",picUrl:" + picUrl + ",itemUrl:" + itemUrl);}}private static String downloadImg(String url) throws ClientProtocolException, IOException {File file = new 
File("D:/tmp/jd/mobile");if (!file.exists()) {file.mkdirs();}String extName = url.substring(url.lastIndexOf("."));String fileName = UUID.fastUUID().toString() + extName;CloseableHttpClient httpClient = HttpClientCreater.create();HttpGet get = new HttpGet("https:" + url);get.setHeader("user-agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.63 Safari/537.36");CloseableHttpResponse response = httpClient.execute(get);FileOutputStream outpuStream = new FileOutputStream("D:/tmp/jd/mobile/" + fileName);response.getEntity().writeTo(outpuStream);outpuStream.close();response.close();return fileName;}}
