网络爬虫就是用程序代替人工访问网络上的资源。我们平时通过HTTP协议访问互联网上的网页,网络爬虫程序同样使用HTTP协议来访问网页。
这里我们使用Java 的 HTTP协议客户端HttpClient这个技术,来实现抓取网页数据。
2.1 Get请求
package cn.itbuild.crawler.test;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import java.io.IOException;
/**
 * Demonstrates a plain HTTP GET with Apache HttpClient: requests a fixed URL and, when the
 * server answers 200, prints the length of the body followed by the body itself.
 *
 * @Date 2020/12/21 19:16
 * @Version 10.21
 * @Author DuanChaojie
 */
public class HttpGetTest {

    /**
     * Sends the GET request and prints the response body to standard output.
     * An {@link IOException} raised while sending or reading is reported with a stack
     * trace instead of being propagated.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        // Build the GET request for the target URL.
        HttpGet httpGet = new HttpGet("http://www.itcast.cn");

        // Create the client and execute the request. try-with-resources closes the
        // response first and then the client, even when reading the body fails.
        try (CloseableHttpClient httpClient = HttpClients.createDefault();
             CloseableHttpResponse response = httpClient.execute(httpGet)) {
            // Only a 200 response is decoded and printed; any other status is skipped.
            if (response.getStatusLine().getStatusCode() == 200) {
                String content = EntityUtils.toString(response.getEntity(), "utf-8");
                System.out.println(content.length());
                System.out.println(content);
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
2.2 带参数的Get请求
在传智中搜索学习视频,地址为http://yun.itheima.com/search?keys=Java
package cn.itbuild.crawler.test;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.utils.URIBuilder;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
/**
 * Demonstrates an HTTP GET with a query parameter: builds
 * {@code http://yun.itheima.com/search?keys=Java} with {@link URIBuilder} and, when the
 * server answers 200, prints the length of the body followed by the body itself.
 *
 * @Date 2020/12/21 19:16
 * @Version 10.21
 * @Author DuanChaojie
 */
public class HttpGetParamTest {

    /**
     * Builds the parameterised URI, sends the GET request and prints the response body.
     *
     * @param args unused
     * @throws Exception if the URI cannot be built or the request/response I/O fails
     */
    public static void main(String[] args) throws Exception {
        // Start from the bare search URL and attach the query parameter;
        // setParameter returns the builder, so further parameters can be chained.
        URIBuilder uriBuilder = new URIBuilder("http://yun.itheima.com/search");
        uriBuilder.setParameter("keys", "Java");

        // Build the GET request from the finished URI.
        HttpGet httpGet = new HttpGet(uriBuilder.build());
        // Show the request that is about to be sent (the label used to be printed alone).
        System.out.println("发送请求的信息:" + httpGet);

        // Create the client and execute the request. Both were previously left open;
        // try-with-resources closes the response and then the client on every path.
        try (CloseableHttpClient httpClient = HttpClients.createDefault();
             CloseableHttpResponse response = httpClient.execute(httpGet)) {
            // Only a 200 response is decoded and printed.
            if (response.getStatusLine().getStatusCode() == 200) {
                String content = EntityUtils.toString(response.getEntity(), "utf-8");
                System.out.println(content.length());
                System.out.println(content);
            }
        }
    }
}
2.3 Post请求
使用POST 请求访问传智官网,请求url地址:http://www.itcast.cn/
package cn.itbuild.crawler.test;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import java.io.IOException;
/**
 * Demonstrates a plain HTTP POST (no request body) with Apache HttpClient: posts to a
 * fixed URL and, when the server answers 200, prints the length of the body followed by
 * the body itself.
 *
 * @Date 2020/12/21 19:16
 * @Version 10.21
 * @Author DuanChaojie
 */
public class HttpPostTest {

    /**
     * Sends the POST request and prints the response body to standard output.
     * An {@link IOException} raised while sending or reading is reported with a stack
     * trace instead of being propagated.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        // Build the POST request for the target URL.
        HttpPost httpPost = new HttpPost("http://www.itcast.cn");

        // Create the client and execute the request. try-with-resources closes the
        // response first and then the client, even when reading the body fails.
        try (CloseableHttpClient httpClient = HttpClients.createDefault();
             CloseableHttpResponse response = httpClient.execute(httpPost)) {
            // Only a 200 response is decoded and printed; any other status is skipped.
            if (response.getStatusLine().getStatusCode() == 200) {
                String content = EntityUtils.toString(response.getEntity(), "utf-8");
                System.out.println(content.length());
                System.out.println(content);
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
2.4 带参数的Post请求
在传智中搜索学习视频,使用POST请求,url地址为:http://yun.itheima.com/search
url地址没有参数,参数 keys=Java 放到表单中进行提交。
package cn.itbuild.crawler.test;
import org.apache.http.NameValuePair;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.message.BasicNameValuePair;
import org.apache.http.util.EntityUtils;
import java.util.ArrayList;
import java.util.List;
/**
 * Demonstrates an HTTP POST that submits a form: sends {@code keys=Java} as a
 * URL-encoded form body to {@code http://yun.itheima.com/search} and, when the server
 * answers 200, prints the length of the response body.
 *
 * @Date 2020/12/21 19:16
 * @Version 10.21
 * @Author DuanChaojie
 */
public class HttpPostParamTest {

    /**
     * Builds the form entity, sends the POST request and prints the response body length.
     *
     * @param args unused
     * @throws Exception if the form cannot be encoded or the request/response I/O fails
     */
    public static void main(String[] args) throws Exception {
        // Build the POST request; the URL itself carries no parameters.
        HttpPost httpPost = new HttpPost("http://yun.itheima.com/search");

        // Collect the form fields. NameValuePair is a simple name/value holder
        // (getName / getValue); BasicNameValuePair is the implementation used here.
        List<NameValuePair> params = new ArrayList<NameValuePair>();
        params.add(new BasicNameValuePair("keys", "Java"));

        // Wrap the fields in a form entity: first argument is the form data,
        // second is the charset used to encode it.
        UrlEncodedFormEntity formEntity = new UrlEncodedFormEntity(params, "utf-8");
        // Attach the form entity to the POST request as its body.
        httpPost.setEntity(formEntity);

        // Create the client and execute the request. Both were previously left open;
        // try-with-resources closes the response and then the client on every path.
        try (CloseableHttpClient httpClient = HttpClients.createDefault();
             CloseableHttpResponse response = httpClient.execute(httpPost)) {
            // Only a 200 response is decoded; just its length is printed.
            if (response.getStatusLine().getStatusCode() == 200) {
                String content = EntityUtils.toString(response.getEntity(), "utf-8");
                System.out.println(content.length());
            }
        }
    }
}
2.5 连接池
如果每次请求都要创建HttpClient,会有频繁创建和销毁的问题,可以使用连接池来解决这个问题。
测试以下代码,并通过断点查看:每次获取的HttpClient对象都不一样,但它们共用同一个连接池管理器。
package cn.itbuild.crawler.test;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
import org.apache.http.util.EntityUtils;
import java.io.IOException;
/**
 * Demonstrates connection pooling: one {@link PoolingHttpClientConnectionManager} is
 * configured once and handed to every request, so each request builds its own client
 * object while all of them use the same connection manager.
 *
 * @Date 2020/12/21 20:08
 * @Version 10.21
 * @Author DuanChaojie
 */
public class HttpClientPoolTest {

    /**
     * Configures the pool, issues two GET requests through it, then shuts the pool down.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        // Create the pooling connection manager.
        PoolingHttpClientConnectionManager cm = new PoolingHttpClientConnectionManager();
        try {
            // Upper bound on connections across all hosts.
            cm.setMaxTotal(100);
            // Upper bound on connections per host (route).
            cm.setDefaultMaxPerRoute(10);

            // Issue the requests through the shared connection manager.
            doGet(cm);
            doGet(cm);
        } finally {
            // The pool is the resource that owns the connections, so it is released
            // here once all requests are done (it was previously never shut down).
            cm.shutdown();
        }
    }

    /**
     * Sends a GET request to a fixed URL using a client backed by the given connection
     * manager and, on a 200 response, prints the body length. An {@link IOException} is
     * reported with a stack trace instead of being propagated.
     *
     * @param cm the shared connection manager the client is built on; not shut down here
     */
    private static void doGet(PoolingHttpClientConnectionManager cm) {
        // A new client object is built on each call, but every one of them is handed
        // the same connection manager. The client is deliberately NOT closed here:
        // NOTE(review): closing it is expected to shut down the shared manager as well —
        // confirm against the HttpClient connection-management documentation.
        CloseableHttpClient httpClient = HttpClients.custom().setConnectionManager(cm).build();
        HttpGet httpGet = new HttpGet("http://www.itcast.cn");

        // Execute the request; try-with-resources closes only the response.
        try (CloseableHttpResponse response = httpClient.execute(httpGet)) {
            // Only a 200 response is decoded; just its length is printed.
            if (response.getStatusLine().getStatusCode() == 200) {
                String content = EntityUtils.toString(response.getEntity(), "utf8");
                System.out.println(content.length());
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
2.6 请求参数
有时候因为网络或者目标服务器的原因,请求需要更长的时间才能完成,这时我们需要自定义相关的超时时间。
package cn.itbuild.crawler.test;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
/**
 * Demonstrates per-request timeout configuration: attaches a {@link RequestConfig} with
 * custom connect, connection-request and socket timeouts to a GET request and, when the
 * server answers 200, prints the length of the response body.
 *
 * @Date 2020/12/21 20:12
 * @Version 10.21
 * @Author DuanChaojie
 */
public class HttpConfigTest {

    /**
     * Configures the timeouts, sends the GET request and prints the response body length.
     *
     * @param args unused
     * @throws Exception if the request/response I/O fails
     */
    public static void main(String[] args) throws Exception {
        // Build the GET request for the search URL.
        HttpGet httpGet = new HttpGet("http://yun.itheima.com/search?keys=Java");

        // Configure the request timeouts (all values in milliseconds).
        RequestConfig config = RequestConfig.custom()
                .setConnectTimeout(1000)          // max time to establish the connection
                .setConnectionRequestTimeout(500) // max time to obtain a connection
                .setSocketTimeout(10 * 1000)      // max time for data transfer
                .build();
        // Apply the configuration to this request only.
        httpGet.setConfig(config);

        // Create the client and execute the request. Both were previously left open;
        // try-with-resources closes the response and then the client on every path.
        try (CloseableHttpClient httpClient = HttpClients.createDefault();
             CloseableHttpResponse response = httpClient.execute(httpGet)) {
            // Only a 200 response is decoded; just its length is printed.
            if (response.getStatusLine().getStatusCode() == 200) {
                String content = EntityUtils.toString(response.getEntity(), "utf8");
                System.out.println(content.length());
            }
        }
    }
}