一、网络爬虫
爬虫的基本原理很简单,就是利用程序访问互联网,然后将数据保存到本地。我们都知道,互联网提供的服务大多数是以网站的形式提供的。我们需要的数据一般都是从网站中获取的,如电商网站的商品信息、商品的评论、微博的信息等。爬虫和我们手动将看到的数据复制粘贴下来是类似的,只是要获取大量的数据,靠人工显然不太可能。因此,我们需要使用工具来帮助获取数据。使用程序编写爬虫,就是用程序编写一些网络访问的规则,将我们的目标数据保存下来。接下来,让我们从头开始搭建一个爬虫的案例。
二、HttpClient的入门程序
maven依赖:
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpclient</artifactId>
<version>4.5.2</version>
</dependency>
<dependency>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-log4j12</artifactId>
<version>1.7.25</version>
<scope>test</scope>
</dependency>
HelloWorld代码
/**
 * Minimal HttpClient example: issues a GET request and prints the response body.
 *
 * @author River
 * @date 2020/6/23 6:21
 */
public class HelloWorld {
    /**
     * Requests http://www.itcast.cn and prints the body when the status code is 200.
     *
     * @param args unused
     * @throws Exception if sending the request or reading the response fails
     */
    public static void main(String[] args) throws Exception {
        // 1. The target address
        HttpGet httpGet = new HttpGet("http://www.itcast.cn");
        // 2. Create the client ("open the browser") and 3. send the request.
        //    try-with-resources closes the response and then the client on every
        //    path; the original snippet never closed either of them.
        try (CloseableHttpClient httpClient = HttpClients.createDefault();
             CloseableHttpResponse response = httpClient.execute(httpGet)) {
            // 4. Parse the response and print the data
            if (response.getStatusLine().getStatusCode() == 200) {
                HttpEntity entity = response.getEntity();
                String content = EntityUtils.toString(entity, "utf8");
                System.out.println(content);
            }
        }
    }
}
返回的数据是网站首页的HTML内容:
三、HttpClient Get请求
/**
 * Sends a plain GET request to http://www.itcast.cn and prints the body
 * when the status code is 200.
 *
 * @param args unused
 * @throws Exception if sending the request or reading the response fails
 */
public static void main(String[] args) throws Exception {
    // 1. The address to visit
    HttpGet httpGet = new HttpGet("http://www.itcast.cn");
    // 2. Create the client and 3. send the request.
    //    try-with-resources guarantees step 5 (closing the response, then the
    //    client) even when reading the entity throws.
    try (CloseableHttpClient httpClient = HttpClients.createDefault();
         CloseableHttpResponse response = httpClient.execute(httpGet)) {
        // 4. Parse the response
        if (response.getStatusLine().getStatusCode() == 200) {
            HttpEntity entity = response.getEntity();
            String content = EntityUtils.toString(entity, "utf8");
            System.out.println(content);
        }
    }
}
带参数的Get请求:使用URIBuilder来构造带查询参数的请求地址
/**
 * Sends a GET request whose query string is assembled with {@code URIBuilder}
 * (here {@code keys=java}) and prints the body when the status code is 200.
 *
 * @param args unused
 * @throws Exception if the URI is invalid, or sending the request or reading the response fails
 */
public static void main(String[] args) throws Exception {
    // Build the request address together with its query parameter
    URIBuilder uriBuilder = new URIBuilder("http://yun.itheima.com/search");
    uriBuilder.setParameter("keys", "java");
    HttpGet httpGet = new HttpGet(uriBuilder.build());
    // Create the client and send the request; both resources are closed
    // automatically, including when reading the entity throws.
    try (CloseableHttpClient httpClient = HttpClients.createDefault();
         CloseableHttpResponse response = httpClient.execute(httpGet)) {
        // Parse the response
        if (response.getStatusLine().getStatusCode() == 200) {
            HttpEntity entity = response.getEntity();
            String content = EntityUtils.toString(entity, "utf8");
            System.out.println(content);
        }
    }
}
四、Post请求
/**
 * Sends a POST request without a body to http://www.itcast.cn and prints the
 * response body when the status code is 200.
 *
 * @param args unused
 * @throws Exception if sending the request or reading the response fails
 */
public static void main(String[] args) throws Exception {
    // 1. The address to visit
    HttpPost httpPost = new HttpPost("http://www.itcast.cn");
    // 2. Create the client and 3. send the request; try-with-resources closes
    //    the response and the client on every path, exceptions included.
    try (CloseableHttpClient httpClient = HttpClients.createDefault();
         CloseableHttpResponse response = httpClient.execute(httpPost)) {
        // 4. Parse the response
        if (response.getStatusLine().getStatusCode() == 200) {
            HttpEntity entity = response.getEntity();
            String content = EntityUtils.toString(entity, "utf8");
            System.out.println(content);
        }
    }
}
带参数的POST:
//1.创建对象
CloseableHttpClient httpClient = HttpClients.createDefault();
//2.访问地址
HttpPost httpPost = new HttpPost("http://www.itcast.cn/search");
// 利用集合封装表单请求参数
List<NameValuePair> params = new ArrayList<NameValuePair>();
params.add(new BasicNameValuePair("keys","java"));
UrlEncodedFormEntity formEntity = new UrlEncodedFormEntity(params,"utf8");
httpPost.setEntity(formEntity);
//3.发起请求
CloseableHttpResponse response = httpClient.execute(httpPost);
//4.解析响应
if(response.getStatusLine().getStatusCode()==200){
HttpEntity entity = response.getEntity();
String content = EntityUtils.toString(entity, "utf8");
System.out.println(content);
}
//5.关闭response
response.close();
httpClient.close();
}
HttpClient连接池
/**
 * Sets up a pooling connection manager and issues one request through it.
 *
 * @param args unused
 * @throws Exception propagated from {@code doGet}
 */
public static void main(String[] args) throws Exception {
    PoolingHttpClientConnectionManager pool = new PoolingHttpClientConnectionManager();
    // Upper bound on connections across the whole pool
    pool.setMaxTotal(100);
    // Upper bound on connections per host
    pool.setDefaultMaxPerRoute(10);
    // Send a request using a client backed by this pool
    doGet(pool);
}
/**
 * Fetches http://www.itcast.cn through a client backed by the given connection
 * pool and prints the length of the body when the status code is 200.
 *
 * @param cm the pooling connection manager the client is built on
 * @throws Exception if sending the request or reading the response fails
 */
private static void doGet(PoolingHttpClientConnectionManager cm) throws Exception {
    // 1. Obtain an HttpClient that takes its connections from the pool
    CloseableHttpClient httpClient = HttpClients.custom().setConnectionManager(cm).build();
    // 2. Send the request. The response is closed by try-with-resources so the
    //    close is not skipped when reading the body throws.
    HttpGet httpGet = new HttpGet("http://www.itcast.cn");
    try (CloseableHttpResponse response = httpClient.execute(httpGet)) {
        if (response.getStatusLine().getStatusCode() == 200) {
            String content = EntityUtils.toString(response.getEntity(), "utf8");
            System.out.println(content.length());
        }
    }
    // 3. The client itself is intentionally not closed: it is managed by the pool.
}
HttpClient请求参数
/**
 * Sends a GET request with explicit timeout settings and prints the body when
 * the status code is 200.
 *
 * @param args unused
 * @throws Exception if a timeout elapses, or sending the request or reading the response fails
 */
public static void main(String[] args) throws Exception {
    // 1. The address to visit
    HttpGet httpGet = new HttpGet("http://www.itcast.cn");
    // Per-request configuration; all three timeouts are in milliseconds
    RequestConfig config = RequestConfig.custom()
            .setConnectTimeout(1000)          // max time to establish the connection (1 second)
            .setConnectionRequestTimeout(500) // max time to obtain a connection
            .setSocketTimeout(10 * 1000)      // max time for data transfer
            .build();
    httpGet.setConfig(config);
    // 2. Create the client and 3. send the request; try-with-resources closes
    //    the response and the client on every path, exceptions included.
    try (CloseableHttpClient httpClient = HttpClients.createDefault();
         CloseableHttpResponse response = httpClient.execute(httpGet)) {
        // 4. Parse the response
        if (response.getStatusLine().getStatusCode() == 200) {
            HttpEntity entity = response.getEntity();
            String content = EntityUtils.toString(entity, "utf8");
            System.out.println(content);
        }
    }
}
封装
import com.sun.istack.Pool;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
import org.apache.http.util.EntityUtils;
import org.springframework.stereotype.Component;
import java.io.*;
import java.util.UUID;
/**
 * HTTP helper built on a pooled Apache HttpClient: downloads a page as text or
 * saves an image to the local image directory.
 *
 * @date 2020/2/8 12:43
 */
@Component // registered as a Spring bean: the utility is used through an instance
public class HttpUtils {

    /** Directory that downloaded images are written to. */
    private static final String IMAGE_DIR = "D:\\APP\\IDEA\\workplace\\crawler\\images\\";

    private final PoolingHttpClientConnectionManager cm;

    /** Client backed by {@link #cm}; built once and reused for every request. */
    private final CloseableHttpClient httpClient;

    /** Creates the connection pool (100 connections in total, 10 per route) and the client using it. */
    public HttpUtils() {
        cm = new PoolingHttpClientConnectionManager();
        // Maximum number of connections
        cm.setMaxTotal(100);
        cm.setDefaultMaxPerRoute(10);
        httpClient = HttpClients.custom().setConnectionManager(cm).build();
    }

    /**
     * Downloads the content at the given address as a string.
     *
     * @param url the address to request
     * @return the response body decoded with "utf8", or an empty string when the
     *         status code is not 200 or the response has no entity
     * @throws Exception if sending the request or reading the response fails
     */
    public String doGetHtml(String url) throws Exception {
        HttpGet httpGet = new HttpGet(url);
        httpGet.setConfig(getConfig());
        // The response is closed on every path: success, non-200 and exception.
        try (CloseableHttpResponse response = httpClient.execute(httpGet)) {
            if (response.getStatusLine().getStatusCode() == 200 && response.getEntity() != null) {
                return EntityUtils.toString(response.getEntity(), "utf8");
            }
        }
        return ""; // no data
    }

    /**
     * Downloads the image at the given address into the image directory under a
     * random UUID-based file name.
     *
     * @param url the address of the image
     * @return the generated file name, or an empty string when the status code is
     *         not 200 or the response has no entity
     * @throws Exception if sending the request or writing the file fails
     */
    public String doGetImage(String url) throws Exception {
        HttpGet httpGet = new HttpGet(url);
        httpGet.setConfig(getConfig());
        try (CloseableHttpResponse response = httpClient.execute(httpGet)) {
            if (response.getStatusLine().getStatusCode() == 200 && response.getEntity() != null) {
                // New name = random UUID + the extension taken from the URL
                String picName = UUID.randomUUID().toString() + extensionOf(url);
                // Close the file stream once the image is written so the handle is not leaked
                try (OutputStream outputStream = new FileOutputStream(new File(IMAGE_DIR + picName))) {
                    response.getEntity().writeTo(outputStream);
                }
                return picName;
            }
        }
        return ""; // no data
    }

    /**
     * Extracts the file extension (including the leading dot) from the last path
     * segment of a URL, ignoring any query string or fragment.
     *
     * @param url the address to inspect
     * @return the extension such as ".jpg", or an empty string when the last path
     *         segment contains no dot
     */
    private static String extensionOf(String url) {
        int end = url.length();
        int query = url.indexOf('?');
        if (query >= 0) {
            end = query;
        }
        int fragment = url.indexOf('#');
        if (fragment >= 0 && fragment < end) {
            end = fragment;
        }
        String path = url.substring(0, end);
        int dot = path.lastIndexOf('.');
        // A dot located before the last '/' belongs to the host or a directory, not to the file name
        if (dot < 0 || dot < path.lastIndexOf('/')) {
            return "";
        }
        return path.substring(dot);
    }

    /** Builds the request configuration shared by all requests. */
    private RequestConfig getConfig() {
        return RequestConfig.custom()
                .setConnectTimeout(1000)          // max time to establish the connection
                .setConnectionRequestTimeout(500) // max time to obtain a connection
                .setSocketTimeout(10000)          // max time for data transfer
                .build();
    }
}