网络爬虫就是用程序代替我们访问网络上的资源。我们平时一直通过HTTP协议访问互联网上的网页,编写爬虫程序时,同样使用HTTP协议来访问网页。
这里我们使用Java 的 HTTP协议客户端HttpClient这个技术,来实现抓取网页数据。


2.1 Get请求

  1. package cn.itbuild.crawler.test;
  2. import org.apache.http.client.methods.CloseableHttpResponse;
  3. import org.apache.http.client.methods.HttpGet;
  4. import org.apache.http.impl.client.CloseableHttpClient;
  5. import org.apache.http.impl.client.HttpClients;
  6. import org.apache.http.util.EntityUtils;
  7. import java.io.IOException;
  8. /**
  9. * @Date 2020/12/21 19:16
  10. * @Version 10.21
  11. * @Author DuanChaojie
  12. */
  13. public class HttpGetTest {
  14. public static void main(String[] args) {
  15. // 1.创建HttpClient对象
  16. CloseableHttpClient httpClient = HttpClients.createDefault();
  17. // 2.创建HttpGet对象, 设置url访问地址
  18. HttpGet httpGet = new HttpGet("http://www.itcast.cn");
  19. // 3.使用 HttpClient 发起请求, 获取 response
  20. CloseableHttpResponse response = null;
  21. try {
  22. response = httpClient.execute(httpGet);
  23. // 4.解析响应
  24. if (response.getStatusLine().getStatusCode() == 200){
  25. String content = EntityUtils.toString(response.getEntity(), "utf-8");
  26. System.out.println(content.length());
  27. System.out.println(content);
  28. }
  29. } catch (IOException e) {
  30. e.printStackTrace();
  31. }finally {
  32. // 5.关闭资源
  33. if (response != null) {
  34. try {
  35. response.close();
  36. } catch (IOException e) {
  37. e.printStackTrace();
  38. }
  39. }
  40. if (httpClient != null) {
  41. try {
  42. httpClient.close();
  43. } catch (IOException e) {
  44. e.printStackTrace();
  45. }
  46. }
  47. }
  48. }
  49. }


2.2 带参数的Get请求

在传智中搜索学习视频,地址为http://yun.itheima.com/search?keys=Java

  1. package cn.itbuild.crawler.test;
  2. import org.apache.http.client.methods.CloseableHttpResponse;
  3. import org.apache.http.client.methods.HttpGet;
  4. import org.apache.http.client.utils.URIBuilder;
  5. import org.apache.http.impl.client.CloseableHttpClient;
  6. import org.apache.http.impl.client.HttpClients;
  7. import org.apache.http.util.EntityUtils;
  8. /**
  9. * @Date 2020/12/21 19:16
  10. * @Version 10.21
  11. * @Author DuanChaojie
  12. */
  13. public class HttpGetParamTest {
  14. public static void main(String[] args) throws Exception {
  15. // 1.创建HttpClient对象
  16. CloseableHttpClient httpClient = HttpClients.createDefault();
  17. // 设置请求地址是: http://yun.itheima.com/search?keys=Java
  18. // 创建uriBuilder
  19. URIBuilder uriBuilder = new URIBuilder("http://yun.itheima.com/search");
  20. // 多个参数,使用连式编程
  21. uriBuilder.setParameter("keys","Java");
  22. // 2.创建HttpGet对象, 设置url访问地址☆
  23. HttpGet httpGet = new HttpGet(uriBuilder.build());
  24. System.out.println("发送请求的信息:");
  25. // 3.使用 HttpClient 发起请求, 获取 response
  26. CloseableHttpResponse response = httpClient.execute(httpGet);
  27. // 4.解析响应
  28. if (response.getStatusLine().getStatusCode() == 200){
  29. String content = EntityUtils.toString(response.getEntity(), "utf-8");
  30. System.out.println(content.length());
  31. System.out.println(content);
  32. }
  33. }
  34. }


2.3 Post请求

使用POST 请求访问传智官网,请求url地址:http://www.itcast.cn/

  1. package cn.itbuild.crawler.test;
  2. import org.apache.http.client.methods.CloseableHttpResponse;
  3. import org.apache.http.client.methods.HttpPost;
  4. import org.apache.http.impl.client.CloseableHttpClient;
  5. import org.apache.http.impl.client.HttpClients;
  6. import org.apache.http.util.EntityUtils;
  7. import java.io.IOException;
  8. /**
  9. * @Date 2020/12/21 19:16
  10. * @Version 10.21
  11. * @Author DuanChaojie
  12. */
  13. public class HttpPostTest {
  14. public static void main(String[] args) {
  15. // 1.创建HttpClient对象
  16. CloseableHttpClient httpClient = HttpClients.createDefault();
  17. // 2.创建HttpPost对象, 设置url访问地址
  18. HttpPost httpPost = new HttpPost("http://www.itcast.cn");
  19. // 3.使用 HttpClient 发起请求, 获取 response
  20. CloseableHttpResponse response = null;
  21. try {
  22. response = httpClient.execute(httpPost);
  23. // 4.解析响应
  24. if (response.getStatusLine().getStatusCode() == 200){
  25. String content = EntityUtils.toString(response.getEntity(), "utf-8");
  26. System.out.println(content.length());
  27. System.out.println(content);
  28. }
  29. } catch (IOException e) {
  30. e.printStackTrace();
  31. }finally {
  32. // 5.关闭资源
  33. if (response != null) {
  34. try {
  35. response.close();
  36. } catch (IOException e) {
  37. e.printStackTrace();
  38. }
  39. }
  40. if (httpClient != null) {
  41. try {
  42. httpClient.close();
  43. } catch (IOException e) {
  44. e.printStackTrace();
  45. }
  46. }
  47. }
  48. }
  49. }


2.4 带参数的Post请求

在传智中搜索学习视频,使用POST请求,url地址为:http://yun.itheima.com/search
url地址没有参数,参数 keys=Java 放到表单中进行提交。

  1. package cn.itbuild.crawler.test;
  2. import org.apache.http.NameValuePair;
  3. import org.apache.http.client.entity.UrlEncodedFormEntity;
  4. import org.apache.http.client.methods.CloseableHttpResponse;
  5. import org.apache.http.client.methods.HttpPost;
  6. import org.apache.http.impl.client.CloseableHttpClient;
  7. import org.apache.http.impl.client.HttpClients;
  8. import org.apache.http.message.BasicNameValuePair;
  9. import org.apache.http.util.EntityUtils;
  10. import java.util.ArrayList;
  11. import java.util.List;
  12. /**
  13. * @Date 2020/12/21 19:16
  14. * @Version 10.21
  15. * @Author DuanChaojie
  16. */
  17. public class HttpPostParamTest {
  18. public static void main(String[] args) throws Exception {
  19. // 1.创建HttpClient对象
  20. CloseableHttpClient httpClient = HttpClients.createDefault();
  21. // 2.创建HttpPost对象, 设置url访问地址
  22. HttpPost httpPost = new HttpPost("http://yun.itheima.com/search");
  23. /** 声明List集合, 封装表单中的参数
  24. * public interface NameValuePair {
  25. * String getName();
  26. *
  27. * String getValue();
  28. * }
  29. */
  30. List<NameValuePair> params = new ArrayList<NameValuePair>();
  31. // 注意BasicNameValuePair是NameValuePair唯一的实现类
  32. params.add(new BasicNameValuePair("keys","Java"));
  33. // 创建表单的Entity对象, 第一个参数就是封装好的表单数据, 第二个参数就是编码
  34. UrlEncodedFormEntity formEntity = new UrlEncodedFormEntity(params,"utf-8");
  35. // 设置表单的Entity对象到 Post 请求中
  36. httpPost.setEntity(formEntity);
  37. // 3.使用 HttpClient 发起请求, 获取 response
  38. CloseableHttpResponse response = httpClient.execute(httpPost);
  39. // 4.解析响应
  40. if (response.getStatusLine().getStatusCode() == 200){
  41. String content = EntityUtils.toString(response.getEntity(), "utf-8");
  42. System.out.println(content.length());
  43. }
  44. }
  45. }


2.5 连接池

如果每次请求都新建HttpClient并重新建立连接,会有频繁创建和销毁连接的开销,可以使用连接池来解决这个问题。
测试以下代码并打断点查看:每次build()得到的HttpClient对象并不相同,被复用的是连接池管理器(PoolingHttpClientConnectionManager)中的底层连接。

  1. package cn.itbuild.crawler.test;
  2. import org.apache.http.client.methods.CloseableHttpResponse;
  3. import org.apache.http.client.methods.HttpGet;
  4. import org.apache.http.impl.client.CloseableHttpClient;
  5. import org.apache.http.impl.client.HttpClients;
  6. import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
  7. import org.apache.http.util.EntityUtils;
  8. import java.io.IOException;
  9. /**
  10. * @Date 2020/12/21 20:08
  11. * @Version 10.21
  12. * @Author DuanChaojie
  13. */
  14. public class HttpClientPoolTest {
  15. public static void main(String[] args) {
  16. // 创建连接池管理器
  17. PoolingHttpClientConnectionManager cm = new PoolingHttpClientConnectionManager();
  18. // 设置连接数
  19. cm.setMaxTotal(100);
  20. // 设置每个主机的最大连接数
  21. cm.setDefaultMaxPerRoute(10);
  22. // 使用连接池管理器发起请求
  23. doGet(cm);
  24. doGet(cm);
  25. }
  26. /**
  27. * @param cm
  28. */
  29. private static void doGet(PoolingHttpClientConnectionManager cm) {
  30. // 不是每次创建新的HttpClient, 而是从连接池中获取 HttpClient 对象
  31. CloseableHttpClient httpClient = HttpClients.custom().setConnectionManager(cm).build();
  32. HttpGet httpGet = new HttpGet("http://www.itcast.cn");
  33. CloseableHttpResponse response = null;
  34. try {
  35. // 使用 HttpClient 发起请求, 获取 response
  36. response = httpClient.execute(httpGet);
  37. // 解析响应
  38. if (response.getStatusLine().getStatusCode() == 200) {
  39. String content = EntityUtils.toString(response.getEntity(), "utf8");
  40. System.out.println(content.length());
  41. }
  42. } catch(IOException e) {
  43. e.printStackTrace();
  44. } finally {
  45. // 关闭 response
  46. if (response != null) {
  47. try {
  48. response.close();
  49. } catch (IOException e) {
  50. e.printStackTrace();
  51. }
  52. // 不能关闭 HttpClient, 由连接池管理 HttpClient
  53. // httpClient. close();
  54. }
  55. }
  56. }
  57. }


2.6 请求参数

有时候因为网络,或者目标服务器的原因,请求需要更长的时间才能完成,我们需要自定义相关时间。

package cn.itbuild.crawler.test;

import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;

/**
 * @Date 2020/12/21 20:12
 * @Version 10.21
 * @Author DuanChaojie
 */
public class HttpConfigTest {

    /**
     * Sends a GET request to http://yun.itheima.com/search?keys=Java with explicit
     * timeouts and, when the status code is 200, prints the length of the response body.
     *
     * @param args command-line arguments (unused)
     * @throws Exception if the request fails or one of the timeouts expires
     */
    public static void main(String[] args) throws Exception {
        // 1. Create the HttpGet object and set the URL to visit.
        HttpGet httpGet = new HttpGet("http://yun.itheima.com/search?keys=Java");

        // 2. Configure the request timeouts (all values in milliseconds).
        RequestConfig config = RequestConfig.custom()
                .setConnectTimeout(1000)          // max time to establish the connection
                .setConnectionRequestTimeout(500) // max time to obtain a connection
                .setSocketTimeout(10 * 1000)      // max time for data transfer
                .build();

        // 3. Apply the configuration to this request.
        httpGet.setConfig(config);

        // 4. Create the HttpClient and send the request. Both resources are closed
        //    automatically, including when an exception propagates out of main.
        try (CloseableHttpClient httpClient = HttpClients.createDefault();
             CloseableHttpResponse response = httpClient.execute(httpGet)) {
            // 5. Parse the response.
            if (response.getStatusLine().getStatusCode() == 200) {
                String content = EntityUtils.toString(response.getEntity(), "utf8");
                System.out.println(content.length());
            }
        }
    }
}