一、网络爬虫

爬虫的基本原理很简单,就是利用程序访问互联网,然后将数据保存到本地。我们都知道,互联网上的服务大多数是以网站的形式提供的,我们需要的数据一般也是从网站中获取的,如电商网站的商品信息、商品评论、微博信息等。爬虫和我们手动把看到的数据复制粘贴下来是类似的,只是要获取大量数据时,靠人工显然不太可能,因此需要借助工具来帮助我们获取数据。编写爬虫,就是用程序定义一些网络访问的规则,把目标数据保存下来。接下来,让我们从头开始搭建一个爬虫案例。

二、HttpClient的入门程序

maven依赖:

  <!-- Apache HttpClient 4.x: the HTTP client used by every example below -->
  <dependency>
      <groupId>org.apache.httpcomponents</groupId>
      <artifactId>httpclient</artifactId>
      <version>4.5.2</version>
  </dependency>
  <!-- SLF4J binding for log4j, declared with test scope -->
  <dependency>
      <groupId>org.slf4j</groupId>
      <artifactId>slf4j-log4j12</artifactId>
      <version>1.7.25</version>
      <scope>test</scope>
  </dependency>

HelloWorld代码

  1. /**
  2. * @author River
  3. * @date 2020/6/23 6:21
  4. * @description
  5. */
  6. public class HelloWorld {
  7. public static void main(String[] args) throws Exception{
  8. //1.打开浏览器,创建对象
  9. CloseableHttpClient httpClient = HttpClients.createDefault();
  10. //2.网址
  11. HttpGet httpGet = new HttpGet("http://www.itcast.cn");
  12. //3.发起请求
  13. CloseableHttpResponse response = httpClient.execute(httpGet);
  14. //4.解析响应,获取数据
  15. if(response.getStatusLine().getStatusCode()==200){
  16. HttpEntity entity = response.getEntity();
  17. String content = EntityUtils.toString(entity, "utf8");
  18. System.out.println(content);
  19. }
  20. }
  21. }

返回的是网站首页的数据:
image.png

三、HttpClient Get请求

  1. public static void main(String[] args) throws Exception{
  2. //1.创建对象
  3. CloseableHttpClient httpClient = HttpClients.createDefault();
  4. //2.访问地址
  5. HttpGet httpGet = new HttpGet("http://www.itcast.cn");
  6. //3.发起请
  7. CloseableHttpResponse response = httpClient.execute(httpGet);
  8. //4.解析响应
  9. if(response.getStatusLine().getStatusCode()==200){
  10. HttpEntity entity = response.getEntity();
  11. String content = EntityUtils.toString(entity, "utf8");
  12. System.out.println(content);
  13. }
  14. //5.关闭response
  15. response.close();
  16. httpClient.close();
  17. }

带参数的Get请求:使用URIBuilder来构造请求参数

  1. public static void main(String[] args) throws Exception{
  2. //1.创建对象
  3. CloseableHttpClient httpClient = HttpClients.createDefault();
  4. //设置请求地址
  5. URIBuilder uriBuilder = new URIBuilder("http://yun.itheima.com/search");
  6. uriBuilder.setParameter("keys","java");
  7. //2.访问地址
  8. HttpGet httpGet = new HttpGet(uriBuilder.build());
  9. //3.发起请求
  10. CloseableHttpResponse response = httpClient.execute(httpGet);
  11. //4.解析响应
  12. if(response.getStatusLine().getStatusCode()==200){
  13. HttpEntity entity = response.getEntity();
  14. String content = EntityUtils.toString(entity, "utf8");
  15. System.out.println(content);
  16. }
  17. //5.关闭response
  18. response.close();
  19. httpClient.close();
  20. }

四、Post请求

  1. public static void main(String[] args) throws Exception{
  2. //1.创建对象
  3. CloseableHttpClient httpClient = HttpClients.createDefault();
  4. //2.访问地址
  5. HttpPost httpPost = new HttpPost("http://www.itcast.cn");
  6. //3.发起请求
  7. CloseableHttpResponse response = httpClient.execute(httpPost);
  8. //4.解析响应
  9. if(response.getStatusLine().getStatusCode()==200){
  10. HttpEntity entity = response.getEntity();
  11. String content = EntityUtils.toString(entity, "utf8");
  12. System.out.println(content);
  13. }
  14. //5.关闭response
  15. response.close();
  16. httpClient.close();
  17. }

带参数的POST:

  1. //1.创建对象
  2. CloseableHttpClient httpClient = HttpClients.createDefault();
  3. //2.访问地址
  4. HttpPost httpPost = new HttpPost("http://www.itcast.cn/search");
  5. // 利用集合封装表单请求参数
  6. List<NameValuePair> params = new ArrayList<NameValuePair>();
  7. params.add(new BasicNameValuePair("keys","java"));
  8. UrlEncodedFormEntity formEntity = new UrlEncodedFormEntity(params,"utf8");
  9. httpPost.setEntity(formEntity);
  10. //3.发起请求
  11. CloseableHttpResponse response = httpClient.execute(httpPost);
  12. //4.解析响应
  13. if(response.getStatusLine().getStatusCode()==200){
  14. HttpEntity entity = response.getEntity();
  15. String content = EntityUtils.toString(entity, "utf8");
  16. System.out.println(content);
  17. }
  18. //5.关闭response
  19. response.close();
  20. httpClient.close();
  21. }

HttpClient连接池

  1. public static void main(String[] args) throws Exception {
  2. //1.创建连接池管理器
  3. PoolingHttpClientConnectionManager cm = new PoolingHttpClientConnectionManager();
  4. //设置连接数
  5. cm.setMaxTotal(100);
  6. //设置每个主机的最大连接数
  7. cm.setDefaultMaxPerRoute(10);
  8. //2.管理器发起请求
  9. doGet(cm);
  10. }
  11. private static void doGet(PoolingHttpClientConnectionManager cm) throws Exception{
  12. //1.从连接池中获取HttpClient
  13. CloseableHttpClient httpClient = HttpClients.custom().setConnectionManager(cm).build();
  14. //2.发请求
  15. HttpGet httpGet = new HttpGet("http://www.itcast.cn");
  16. CloseableHttpResponse response = httpClient.execute(httpGet);
  17. if (response.getStatusLine().getStatusCode()==200){
  18. String content = EntityUtils.toString(response.getEntity(), "utf8");
  19. System.out.println(content.length());
  20. }
  21. //3.关闭
  22. response.close();
  23. //httpClient.close(); 连接池管理 不必关闭
  24. }

HttpClient请求参数

  1. public static void main(String[] args) throws Exception {
  2. //1.创建对象
  3. CloseableHttpClient httpClient = HttpClients.createDefault();
  4. //2.访问地址
  5. HttpGet httpGet = new HttpGet("http://www.itcast.cn");
  6. // 配置请求信息
  7. RequestConfig config = RequestConfig.custom().setConnectTimeout(1000) //创建连接的做大连接时间 一天
  8. .setConnectionRequestTimeout(500) //获取连接的最长时间
  9. .setSocketTimeout(10*1000) //设置数据传输的最长时间
  10. .build();
  11. httpGet.setConfig(config);
  12. //3.发起请求
  13. CloseableHttpResponse response = httpClient.execute(httpGet);
  14. //4.解析响应
  15. if(response.getStatusLine().getStatusCode()==200){
  16. HttpEntity entity = response.getEntity();
  17. String content = EntityUtils.toString(entity, "utf8");
  18. System.out.println(content);
  19. }
  20. //5.关闭response
  21. response.close();
  22. httpClient.close();
  23. }

封装

  import com.sun.istack.Pool;
  import org.apache.http.client.config.RequestConfig;
  import org.apache.http.client.methods.CloseableHttpResponse;
  import org.apache.http.client.methods.HttpGet;
  import org.apache.http.impl.client.CloseableHttpClient;
  import org.apache.http.impl.client.HttpClients;
  import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
  import org.apache.http.util.EntityUtils;
  import org.springframework.stereotype.Component;

  import java.io.*;
  import java.nio.charset.StandardCharsets;
  import java.util.UUID;
  12. /**
  13. * @date 2020/2/8 12:43
  14. */
  15. @Component //实例才能使用工具类
  16. public class HttpUtils {
  17. private PoolingHttpClientConnectionManager cm;
  18. public HttpUtils() {
  19. cm = new PoolingHttpClientConnectionManager();
  20. //设置最大连接数
  21. cm.setMaxTotal(100);
  22. cm.setDefaultMaxPerRoute(10);
  23. }
  24. /**
  25. * 根据请求地址下载数据
  26. * @param url
  27. * @return
  28. * @throws Exception
  29. */
  30. public String doGetHtml(String url) throws Exception {
  31. //1.获取对象
  32. CloseableHttpClient httpClient = HttpClients.custom().setConnectionManager(cm).build();
  33. //2.地址
  34. HttpGet httpGet = new HttpGet(url);
  35. //设置请求信息
  36. httpGet.setConfig(getConfig());
  37. //3.请求数据
  38. CloseableHttpResponse response = httpClient.execute(httpGet);
  39. //4.解析
  40. if (response.getStatusLine().getStatusCode()==200){
  41. if (response.getEntity() != null){
  42. String content = EntityUtils.toString(response.getEntity(), "utf8");
  43. return content;
  44. }
  45. }
  46. if (response != null){
  47. response.close();
  48. }
  49. return ""; //没有数据时候返回空
  50. }
  51. /**
  52. * 下载图片
  53. * @param url
  54. * @return
  55. */
  56. public String doGetImage(String url) throws Exception {
  57. //1.获取对象
  58. CloseableHttpClient httpClient = HttpClients.custom().setConnectionManager(cm).build();
  59. //2.地址
  60. HttpGet httpGet = new HttpGet(url);
  61. //设置请求信息
  62. httpGet.setConfig(getConfig());
  63. //3.请求数据
  64. CloseableHttpResponse response = httpClient.execute(httpGet);
  65. //4.解析
  66. if (response.getStatusLine().getStatusCode()==200){
  67. if (response.getEntity() != null){
  68. //获取图片的后缀
  69. String extName = url.substring(url.lastIndexOf("."));
  70. //重命名图片
  71. String picName = UUID.randomUUID().toString()+extName;
  72. //下载图片
  73. OutputStream outputStream = new FileOutputStream(new File("D:\\APP\\IDEA\\workplace\\crawler\\images\\"+picName));
  74. response.getEntity().writeTo(outputStream);
  75. //返回图片名称
  76. return picName;
  77. }
  78. }
  79. if (response != null){
  80. response.close();
  81. }
  82. return ""; //没有数据时候返回空
  83. }
  84. private RequestConfig getConfig() {
  85. RequestConfig config = RequestConfig.custom()
  86. .setConnectTimeout(1000) //创建连接的最长时间
  87. .setConnectionRequestTimeout(500) //获取连接的最长时间
  88. .setSocketTimeout(10000) //数据传输的最长时间
  89. .build();
  90. return config;
  91. }
  92. }