java 爬虫
引入
<!-- https://mvnrepository.com/artifact/org.apache.httpcomponents/httpclient -->
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpclient</artifactId>
<version>4.5.3</version>
</dependency>
Get测试
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.utils.URIBuilder;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import java.io.IOException;
import java.net.URISyntaxException;
public class GetController {
public static void main(String[] args) throws URISyntaxException {
CloseableHttpClient httpClient = HttpClients.createDefault();
String url = "https://www.baidu.com";
//设置参数
URIBuilder uriBuilder = new URIBuilder("url");
uriBuilder.setParameter("wd", "田云");
//新建get请求
HttpGet httpGet = new HttpGet(uriBuilder.build());
//设置请求头
//httpGet.setHeader("User-Agent", );
//设置请求参数
//RequestConfig config = RequestConfig.custom().setConnectTimeout(3000) //创建链接的最长时间
// .setConnectionRequestTimeout(500) //从连接池中获取到链接的最长时间
// .setSocketTimeout(10 * 1000) //数据传输的最长时间
// .setProxy(new HttpHost("ip",8080,"http"))
// .build();
//httpGet.setConfig(config);
CloseableHttpResponse response = null;
//发送请求
try {
response= httpClient.execute(httpGet);
if (response.getStatusLine().getStatusCode() == 200) {
String html = EntityUtils.toString(response.getEntity(), "utf-8");
System.out.println(html);
}
} catch (Exception e) {
e.printStackTrace();
}finally {
try {
response.close();
httpClient.close();
} catch (IOException e) {
e.printStackTrace();
}
}
}
}
Post测试
import org.apache.http.NameValuePair;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.client.utils.URIBuilder;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.message.BasicNameValuePair;
import org.apache.http.util.EntityUtils;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.net.URISyntaxException;
import java.util.ArrayList;
import java.util.List;
public class PostController {
public static void main(String[] args) throws URISyntaxException, UnsupportedEncodingException {
CloseableHttpClient httpClient = HttpClients.createDefault();
String url = "https://www.baidu.com";
//设置参数
URIBuilder uriBuilder = new URIBuilder("url");
uriBuilder.setParameter("wd", "田云");
//新建Post请求
HttpPost httpPost = new HttpPost(uriBuilder.build());
//设置请求头
httpPost.setHeader("User-Agent", "");
//设置form表单数据
List<NameValuePair> params = new ArrayList<NameValuePair>();
params.add(new BasicNameValuePair("key1", "value1"));
UrlEncodedFormEntity formEntity = new UrlEncodedFormEntity(params, "utf-8");
httpPost.setEntity(formEntity);
CloseableHttpResponse response = null;
//发送请求
try {
response= httpClient.execute(httpPost);
if (response.getStatusLine().getStatusCode() == 200) {
String html = EntityUtils.toString(response.getEntity(), "utf-8");
System.out.println(html);
}
} catch (Exception e) {
e.printStackTrace();
}finally {
try {
response.close();
httpClient.close();
} catch (IOException e) {
e.printStackTrace();
}
}
}
}
Jsoup解析
引用
<!-- https://mvnrepository.com/artifact/org.jsoup/jsoup -->
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.11.3</version>
</dependency>
使用:
# 解析
Document doc = Jsoup.parse(html);
# 筛选
Elements pngs = doc.select("img[src$=.png]");
# 获取元数据
select.get(0).attr("id");
select.get(0).text();
链接池
public class PoolController {
public static void main(String[] args) {
PoolingHttpClientConnectionManager manager = new PoolingHttpClientConnectionManager();
//最大链接数
manager.setMaxTotal(200);
//每个主机的最大连接数
manager.setDefaultMaxPerRoute(20);
userMamger(manager);
userMamger(manager);
}
public static void userMamger(PoolingHttpClientConnectionManager manager) {
CloseableHttpClient httpClient = HttpClients.custom().setConnectionManager(manager).build();
System.out.println(httpClient);
}
}
封装
代码
MyHttp.java
import com.google.gson.Gson;
import lombok.Data;
import lombok.experimental.Accessors;
import org.apache.http.*;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.client.methods.HttpUriRequest;
import org.apache.http.client.utils.URIBuilder;
import org.apache.http.cookie.Cookie;
import org.apache.http.entity.StringEntity;
import org.apache.http.impl.client.BasicCookieStore;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.impl.cookie.BasicClientCookie;
import org.apache.http.message.BasicNameValuePair;
import org.apache.http.util.EntityUtils;
import java.io.BufferedOutputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
/**
* 我的utils封装
* 读取设置:url、cookies、params
* 读取:response
* 设置 proxy、header、超时时间
*
* @author Mark
*/
@Data
@Accessors(chain = true)
public class HttpUtil {
/** 代理 */
private HttpHost proxy;
/** 超时时间 5秒 */
private Integer connectTimeout = 5000;
/** header */
private HashMap<String, String> headerMap;
/** urlStr */
private String urlStr;
private BasicCookieStore cookieStore = new BasicCookieStore();
private RequestConfig config;
private HttpResponse httpResponse;
/**
* params参数
*/
private List<NameValuePair> nameValuePairs = new ArrayList<>();
public HttpUtil(String urlStr) {
// 5秒
this.urlStr = urlStr;
}
/**
* 给url设置参数
*
* @param paramsMap
* @throws URISyntaxException
*/
public HttpUtil setParams(HashMap<String, String> paramsMap){
for (Map.Entry<String, String> entry : paramsMap.entrySet()) {
nameValuePairs.add(new BasicNameValuePair(entry.getKey(), entry.getValue()));
}
return this;
}
/**
* 获取get请求结果
*
* @return
* @throws IOException
*/
public String get() throws Exception {
return getExecuteResult(getHttpGet());
}
/**
* 获取post请求结果
*
* @param dataMap
* @return
* @throws IOException
*/
public String post(HashMap<String, String> dataMap) throws Exception {
HttpPost httpPost = getHttpPost();
//解决中文乱码
httpPost.setHeader("Content-Type", "text/html; charset=UTF-8");
// 添加data
if (dataMap != null) {
List<NameValuePair> nameValuePairs = new ArrayList<>();
for (Map.Entry<String, String> entry : dataMap.entrySet()) {
nameValuePairs.add(new BasicNameValuePair(entry.getKey(), entry.getValue()));
}
UrlEncodedFormEntity urlEncodedFormEntity = new UrlEncodedFormEntity(nameValuePairs, "utf-8");
httpPost.setEntity(urlEncodedFormEntity);
}
return getExecuteResult(httpPost);
}
/**
* 发送json请求
* @param object
* @return
* @throws Exception
*/
public String postJson(Object object) throws Exception {
HttpPost httpPost = getHttpPost();
//解决中文乱码
httpPost.setHeader("Content-Type", "application/json;charset=UTF-8");
if (object != null) {
StringEntity entity = new StringEntity(new Gson().toJson(object), "utf-8");
httpPost.setEntity(entity);
}
return getExecuteResult(httpPost);
}
/**
* 处理get和post请求,返回html
*
* @param httpUriRequest
* @return
* @throws IOException
*/
private String getExecuteResult(HttpUriRequest httpUriRequest) throws IOException {
String returnStr;
//获取一个链接
CloseableHttpClient httpClient = HttpClients.custom()
//.setConnectionManager(connectionManager)
.setDefaultCookieStore(cookieStore).build();
//设置header
httpUriRequest.setHeader("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36");
if (headerMap != null) {
for (Map.Entry<String, String> entry : headerMap.entrySet()) {
httpUriRequest.setHeader(entry.getKey(), entry.getValue());
}
}
httpResponse = httpClient.execute(httpUriRequest);
//获取返回结果中的实体
HttpEntity entity = httpResponse.getEntity();
//判断是否失败
if (httpResponse.getStatusLine().getStatusCode() != HttpStatus.SC_OK) {
returnStr = null;
} else {
//查看页面内容结果
String rawHTMLContent = EntityUtils.toString(entity, "utf-8");
returnStr = decodeUnicode(rawHTMLContent);
}
//关闭HttpEntity流
EntityUtils.consume(entity);
return returnStr;
}
/**
* 设置cookies
*
* @param cookieMap
*/
public void setCookies(HashMap<String, String> cookieMap) {
for (Map.Entry<String, String> entry : cookieMap.entrySet()) {
BasicClientCookie clientCookie = new BasicClientCookie(entry.getKey(), entry.getValue());
cookieStore.addCookie(clientCookie);
}
}
/**
* 获取cookies
*
* @return
*/
public List<Cookie> getCookies() {
return cookieStore.getCookies();
}
private HttpGet getHttpGet() throws Exception {
URIBuilder uriBuilder = new URIBuilder(urlStr);
if (nameValuePairs.size() > 0) {
uriBuilder.addParameters(nameValuePairs);
}
URI uri = uriBuilder.build();
HttpGet httpGet = new HttpGet(uri);
httpGet.setConfig(getConfig());
return httpGet;
}
private HttpPost getHttpPost() throws Exception {
URIBuilder uriBuilder = new URIBuilder(urlStr);
if (nameValuePairs.size() > 0) {
uriBuilder.addParameters(nameValuePairs);
}
URI uri = uriBuilder.build();
HttpPost httpPost = new HttpPost(uri);
httpPost.setConfig(getConfig());
return httpPost;
}
public boolean getDownLoad(String filePath) throws Exception {
return downLoadFile(getHttpGet(), filePath);
}
public boolean PostDownLoad(String filePath) throws Exception {
return downLoadFile(getHttpPost(), filePath);
}
/**
* 下载文件到指定路径
*
* @param httpUriRequest
* @param filePath
* @return
*/
private boolean downLoadFile(HttpUriRequest httpUriRequest, String filePath) throws IOException {
CloseableHttpClient httpClient = HttpClients.custom()
//.setConnectionManager(connectionManager)
.setDefaultCookieStore(cookieStore).build();
HttpResponse httpResponse = httpClient.execute(httpUriRequest);
//判断是否失败
if (httpResponse.getStatusLine().getStatusCode() != HttpStatus.SC_OK) {
return false;
}
//获取返回结果中的实体
HttpEntity entity = httpResponse.getEntity();
//存储
BufferedOutputStream bos = new BufferedOutputStream(new FileOutputStream(filePath));
entity.writeTo(bos);
bos.close();
//关闭HttpEntity流
EntityUtils.consume(entity);
return true;
}
/**
* 设置代理服务器
*
* @param ip
* @param port
* @param httpType
*/
public HttpUtil setProxy(String ip, int port, String httpType) {
this.proxy = new HttpHost(ip, port, httpType);
return this;
}
private RequestConfig getConfig() {
RequestConfig.Builder custom = RequestConfig.custom();
if (proxy != null) {
custom.setProxy(proxy);
}
/**
* 设置超时时间
*/
custom.setConnectTimeout(connectTimeout)
.setSocketTimeout(connectTimeout)
.setConnectionRequestTimeout(connectTimeout);
return custom.build();
}
/**
* 解析string里面的字符串,因为只有部分需要解析,所有需要判断
*
* @param s 字符串
* @return
*/
public String decodeUnicode(String s) {
char aChar;
int len = s.length();
StringBuffer outBuffer = new StringBuffer(len);
for (int x = 0; x < len; ) {
aChar = s.charAt(x++);
if (aChar == '\\') {
aChar = s.charAt(x++);
if (aChar == 'u') {
// Read the xxxx
int value = 0;
for (int i = 0; i < 4; i++) {
aChar = s.charAt(x++);
switch (aChar) {
case '0':
case '1':
case '2':
case '3':
case '4':
case '5':
case '6':
case '7':
case '8':
case '9':
value = (value << 4) + aChar - '0';
break;
case 'a':
case 'b':
case 'c':
case 'd':
case 'e':
case 'f':
value = (value << 4) + 10 + aChar - 'a';
break;
case 'A':
case 'B':
case 'C':
case 'D':
case 'E':
case 'F':
value = (value << 4) + 10 + aChar - 'A';
break;
default:
throw new IllegalArgumentException(
"Malformed \\uxxxx encoding.");
}
}
outBuffer.append((char) value);
} else {
switch (aChar) {
case 't':
aChar = '\t';
break;
case 'r':
aChar = '\r';
break;
case 'n':
aChar = '\n';
break;
case 'f':
aChar = '\f';
break;
default:
break;
}
outBuffer.append(aChar);
}
} else {
outBuffer.append(aChar);
}
}
return outBuffer.toString();
}
public static void main(String[] args) throws Exception {
String s = new HttpUtil().setUrlStr("https://www.baidu.com").get();
System.out.println(s);
}
}
使用示例
/**
* url
*/
String url1 = "http://www.baidu.com";
/**
* 代理
*/
String proxyIp = "219.141.153.41";
Integer proxyPort = 80;
String proxyType = "http";
/**
* header
*/
HashMap<String, String> header = new HashMap<String, String>() {{
put("Accept", "application/json");
}};
/**
* url参数
*/
HashMap<String, String> urlParam = new HashMap<String, String>() {{
put("name", "小明");
}};
/**
* 发送请求
*/
String html = new MyHttp()
.setUrlStr(url)
.setProxy(proxyIp, proxyPort, proxyType)
.setHeaderMap(header)
.setParams(urlParam)
.get();
这短短的一生我们最终都会失去,不放大胆一点,爱一个人、攀一座山、追一个梦!