java 爬虫

引入

  1. <!-- https://mvnrepository.com/artifact/org.apache.httpcomponents/httpclient -->
  2. <dependency>
  3. <groupId>org.apache.httpcomponents</groupId>
  4. <artifactId>httpclient</artifactId>
  5. <version>4.5.3</version>
  6. </dependency>

Get测试

  1. import org.apache.http.client.methods.CloseableHttpResponse;
  2. import org.apache.http.client.methods.HttpGet;
  3. import org.apache.http.client.utils.URIBuilder;
  4. import org.apache.http.impl.client.CloseableHttpClient;
  5. import org.apache.http.impl.client.HttpClients;
  6. import org.apache.http.util.EntityUtils;
  7. import java.io.IOException;
  8. import java.net.URISyntaxException;
  9. public class GetController {
  10. public static void main(String[] args) throws URISyntaxException {
  11. CloseableHttpClient httpClient = HttpClients.createDefault();
  12. String url = "https://www.baidu.com";
  13. //设置参数
  14. URIBuilder uriBuilder = new URIBuilder("url");
  15. uriBuilder.setParameter("wd", "田云");
  16. //新建get请求
  17. HttpGet httpGet = new HttpGet(uriBuilder.build());
  18. //设置请求头
  19. //httpGet.setHeader("User-Agent", );
  20. //设置请求参数
  21. //RequestConfig config = RequestConfig.custom().setConnectTimeout(3000) //创建链接的最长时间
  22. // .setConnectionRequestTimeout(500) //从连接池中获取到链接的最长时间
  23. // .setSocketTimeout(10 * 1000) //数据传输的最长时间
  24. // .setProxy(new HttpHost("ip",8080,"http"))
  25. // .build();
  26. //httpGet.setConfig(config);
  27. CloseableHttpResponse response = null;
  28. //发送请求
  29. try {
  30. response= httpClient.execute(httpGet);
  31. if (response.getStatusLine().getStatusCode() == 200) {
  32. String html = EntityUtils.toString(response.getEntity(), "utf-8");
  33. System.out.println(html);
  34. }
  35. } catch (Exception e) {
  36. e.printStackTrace();
  37. }finally {
  38. try {
  39. response.close();
  40. httpClient.close();
  41. } catch (IOException e) {
  42. e.printStackTrace();
  43. }
  44. }
  45. }
  46. }

Post测试

  1. import org.apache.http.NameValuePair;
  2. import org.apache.http.client.entity.UrlEncodedFormEntity;
  3. import org.apache.http.client.methods.CloseableHttpResponse;
  4. import org.apache.http.client.methods.HttpGet;
  5. import org.apache.http.client.methods.HttpPost;
  6. import org.apache.http.client.utils.URIBuilder;
  7. import org.apache.http.impl.client.CloseableHttpClient;
  8. import org.apache.http.impl.client.HttpClients;
  9. import org.apache.http.message.BasicNameValuePair;
  10. import org.apache.http.util.EntityUtils;
  11. import java.io.IOException;
  12. import java.io.UnsupportedEncodingException;
  13. import java.net.URISyntaxException;
  14. import java.util.ArrayList;
  15. import java.util.List;
  16. public class PostController {
  17. public static void main(String[] args) throws URISyntaxException, UnsupportedEncodingException {
  18. CloseableHttpClient httpClient = HttpClients.createDefault();
  19. String url = "https://www.baidu.com";
  20. //设置参数
  21. URIBuilder uriBuilder = new URIBuilder("url");
  22. uriBuilder.setParameter("wd", "田云");
  23. //新建Post请求
  24. HttpPost httpPost = new HttpPost(uriBuilder.build());
  25. //设置请求头
  26. httpPost.setHeader("User-Agent", "");
  27. //设置form表单数据
  28. List<NameValuePair> params = new ArrayList<NameValuePair>();
  29. params.add(new BasicNameValuePair("key1", "value1"));
  30. UrlEncodedFormEntity formEntity = new UrlEncodedFormEntity(params, "utf-8");
  31. httpPost.setEntity(formEntity);
  32. CloseableHttpResponse response = null;
  33. //发送请求
  34. try {
  35. response= httpClient.execute(httpPost);
  36. if (response.getStatusLine().getStatusCode() == 200) {
  37. String html = EntityUtils.toString(response.getEntity(), "utf-8");
  38. System.out.println(html);
  39. }
  40. } catch (Exception e) {
  41. e.printStackTrace();
  42. }finally {
  43. try {
  44. response.close();
  45. httpClient.close();
  46. } catch (IOException e) {
  47. e.printStackTrace();
  48. }
  49. }
  50. }
  51. }

Jsoup解析

引用

  1. <!-- https://mvnrepository.com/artifact/org.jsoup/jsoup -->
  2. <dependency>
  3. <groupId>org.jsoup</groupId>
  4. <artifactId>jsoup</artifactId>
  5. <version>1.11.3</version>
  6. </dependency>

参考指南

使用:

  1. # 解析
  2. Document doc = Jsoup.parse(html);
  3. # 筛选
  4. Elements pngs = doc.select("img[src$=.png]");
  5. # 获取元数据
  6. select.get(0).attr("id");
  7. select.get(0).text();

连接池

  1. public class PoolController {
  2. public static void main(String[] args) {
  3. PoolingHttpClientConnectionManager manager = new PoolingHttpClientConnectionManager();
  4. //最大链接数
  5. manager.setMaxTotal(200);
  6. //每个主机的最大连接数
  7. manager.setDefaultMaxPerRoute(20);
  8. userMamger(manager);
  9. userMamger(manager);
  10. }
  11. public static void userMamger(PoolingHttpClientConnectionManager manager) {
  12. CloseableHttpClient httpClient = HttpClients.custom().setConnectionManager(manager).build();
  13. System.out.println(httpClient);
  14. }
  15. }

封装

代码

HttpUtil.java

  1. import com.google.gson.Gson;
  2. import lombok.Data;
  3. import lombok.experimental.Accessors;
  4. import org.apache.http.*;
  5. import org.apache.http.client.config.RequestConfig;
  6. import org.apache.http.client.entity.UrlEncodedFormEntity;
  7. import org.apache.http.client.methods.HttpGet;
  8. import org.apache.http.client.methods.HttpPost;
  9. import org.apache.http.client.methods.HttpUriRequest;
  10. import org.apache.http.client.utils.URIBuilder;
  11. import org.apache.http.cookie.Cookie;
  12. import org.apache.http.entity.StringEntity;
  13. import org.apache.http.impl.client.BasicCookieStore;
  14. import org.apache.http.impl.client.CloseableHttpClient;
  15. import org.apache.http.impl.client.HttpClients;
  16. import org.apache.http.impl.cookie.BasicClientCookie;
  17. import org.apache.http.message.BasicNameValuePair;
  18. import org.apache.http.util.EntityUtils;
  19. import java.io.BufferedOutputStream;
  20. import java.io.FileOutputStream;
  21. import java.io.IOException;
  22. import java.net.URI;
  23. import java.net.URISyntaxException;
  24. import java.util.ArrayList;
  25. import java.util.HashMap;
  26. import java.util.List;
  27. import java.util.Map;
  28. /**
  29. * 我的utils封装
  30. * 读取设置:url、cookies、params
  31. * 读取:response
  32. * 设置 proxy、header、超时时间
  33. *
  34. * @author Mark
  35. */
  36. @Data
  37. @Accessors(chain = true)
  38. public class HttpUtil {
  39. /** 代理 */
  40. private HttpHost proxy;
  41. /** 超时时间 5秒 */
  42. private Integer connectTimeout = 5000;
  43. /** header */
  44. private HashMap<String, String> headerMap;
  45. /** urlStr */
  46. private String urlStr;
  47. private BasicCookieStore cookieStore = new BasicCookieStore();
  48. private RequestConfig config;
  49. private HttpResponse httpResponse;
  50. /**
  51. * params参数
  52. */
  53. private List<NameValuePair> nameValuePairs = new ArrayList<>();
  54. public HttpUtil(String urlStr) {
  55. // 5秒
  56. this.urlStr = urlStr;
  57. }
  58. /**
  59. * 给url设置参数
  60. *
  61. * @param paramsMap
  62. * @throws URISyntaxException
  63. */
  64. public HttpUtil setParams(HashMap<String, String> paramsMap){
  65. for (Map.Entry<String, String> entry : paramsMap.entrySet()) {
  66. nameValuePairs.add(new BasicNameValuePair(entry.getKey(), entry.getValue()));
  67. }
  68. return this;
  69. }
  70. /**
  71. * 获取get请求结果
  72. *
  73. * @return
  74. * @throws IOException
  75. */
  76. public String get() throws Exception {
  77. return getExecuteResult(getHttpGet());
  78. }
  79. /**
  80. * 获取post请求结果
  81. *
  82. * @param dataMap
  83. * @return
  84. * @throws IOException
  85. */
  86. public String post(HashMap<String, String> dataMap) throws Exception {
  87. HttpPost httpPost = getHttpPost();
  88. //解决中文乱码
  89. httpPost.setHeader("Content-Type", "text/html; charset=UTF-8");
  90. // 添加data
  91. if (dataMap != null) {
  92. List<NameValuePair> nameValuePairs = new ArrayList<>();
  93. for (Map.Entry<String, String> entry : dataMap.entrySet()) {
  94. nameValuePairs.add(new BasicNameValuePair(entry.getKey(), entry.getValue()));
  95. }
  96. UrlEncodedFormEntity urlEncodedFormEntity = new UrlEncodedFormEntity(nameValuePairs, "utf-8");
  97. httpPost.setEntity(urlEncodedFormEntity);
  98. }
  99. return getExecuteResult(httpPost);
  100. }
  101. /**
  102. * 发送json请求
  103. * @param object
  104. * @return
  105. * @throws Exception
  106. */
  107. public String postJson(Object object) throws Exception {
  108. HttpPost httpPost = getHttpPost();
  109. //解决中文乱码
  110. httpPost.setHeader("Content-Type", "application/json;charset=UTF-8");
  111. if (object != null) {
  112. StringEntity entity = new StringEntity(new Gson().toJson(object), "utf-8");
  113. httpPost.setEntity(entity);
  114. }
  115. return getExecuteResult(httpPost);
  116. }
  117. /**
  118. * 处理get和post请求,返回html
  119. *
  120. * @param httpUriRequest
  121. * @return
  122. * @throws IOException
  123. */
  124. private String getExecuteResult(HttpUriRequest httpUriRequest) throws IOException {
  125. String returnStr;
  126. //获取一个链接
  127. CloseableHttpClient httpClient = HttpClients.custom()
  128. //.setConnectionManager(connectionManager)
  129. .setDefaultCookieStore(cookieStore).build();
  130. //设置header
  131. httpUriRequest.setHeader("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36");
  132. if (headerMap != null) {
  133. for (Map.Entry<String, String> entry : headerMap.entrySet()) {
  134. httpUriRequest.setHeader(entry.getKey(), entry.getValue());
  135. }
  136. }
  137. httpResponse = httpClient.execute(httpUriRequest);
  138. //获取返回结果中的实体
  139. HttpEntity entity = httpResponse.getEntity();
  140. //判断是否失败
  141. if (httpResponse.getStatusLine().getStatusCode() != HttpStatus.SC_OK) {
  142. returnStr = null;
  143. } else {
  144. //查看页面内容结果
  145. String rawHTMLContent = EntityUtils.toString(entity, "utf-8");
  146. returnStr = decodeUnicode(rawHTMLContent);
  147. }
  148. //关闭HttpEntity流
  149. EntityUtils.consume(entity);
  150. return returnStr;
  151. }
  152. /**
  153. * 设置cookies
  154. *
  155. * @param cookieMap
  156. */
  157. public void setCookies(HashMap<String, String> cookieMap) {
  158. for (Map.Entry<String, String> entry : cookieMap.entrySet()) {
  159. BasicClientCookie clientCookie = new BasicClientCookie(entry.getKey(), entry.getValue());
  160. cookieStore.addCookie(clientCookie);
  161. }
  162. }
  163. /**
  164. * 获取cookies
  165. *
  166. * @return
  167. */
  168. public List<Cookie> getCookies() {
  169. return cookieStore.getCookies();
  170. }
  171. private HttpGet getHttpGet() throws Exception {
  172. URIBuilder uriBuilder = new URIBuilder(urlStr);
  173. if (nameValuePairs.size() > 0) {
  174. uriBuilder.addParameters(nameValuePairs);
  175. }
  176. URI uri = uriBuilder.build();
  177. HttpGet httpGet = new HttpGet(uri);
  178. httpGet.setConfig(getConfig());
  179. return httpGet;
  180. }
  181. private HttpPost getHttpPost() throws Exception {
  182. URIBuilder uriBuilder = new URIBuilder(urlStr);
  183. if (nameValuePairs.size() > 0) {
  184. uriBuilder.addParameters(nameValuePairs);
  185. }
  186. URI uri = uriBuilder.build();
  187. HttpPost httpPost = new HttpPost(uri);
  188. httpPost.setConfig(getConfig());
  189. return httpPost;
  190. }
  191. public boolean getDownLoad(String filePath) throws Exception {
  192. return downLoadFile(getHttpGet(), filePath);
  193. }
  194. public boolean PostDownLoad(String filePath) throws Exception {
  195. return downLoadFile(getHttpPost(), filePath);
  196. }
  197. /**
  198. * 下载文件到指定路径
  199. *
  200. * @param httpUriRequest
  201. * @param filePath
  202. * @return
  203. */
  204. private boolean downLoadFile(HttpUriRequest httpUriRequest, String filePath) throws IOException {
  205. CloseableHttpClient httpClient = HttpClients.custom()
  206. //.setConnectionManager(connectionManager)
  207. .setDefaultCookieStore(cookieStore).build();
  208. HttpResponse httpResponse = httpClient.execute(httpUriRequest);
  209. //判断是否失败
  210. if (httpResponse.getStatusLine().getStatusCode() != HttpStatus.SC_OK) {
  211. return false;
  212. }
  213. //获取返回结果中的实体
  214. HttpEntity entity = httpResponse.getEntity();
  215. //存储
  216. BufferedOutputStream bos = new BufferedOutputStream(new FileOutputStream(filePath));
  217. entity.writeTo(bos);
  218. bos.close();
  219. //关闭HttpEntity流
  220. EntityUtils.consume(entity);
  221. return true;
  222. }
  223. /**
  224. * 设置代理服务器
  225. *
  226. * @param ip
  227. * @param port
  228. * @param httpType
  229. */
  230. public HttpUtil setProxy(String ip, int port, String httpType) {
  231. this.proxy = new HttpHost(ip, port, httpType);
  232. return this;
  233. }
  234. private RequestConfig getConfig() {
  235. RequestConfig.Builder custom = RequestConfig.custom();
  236. if (proxy != null) {
  237. custom.setProxy(proxy);
  238. }
  239. /**
  240. * 设置超时时间
  241. */
  242. custom.setConnectTimeout(connectTimeout)
  243. .setSocketTimeout(connectTimeout)
  244. .setConnectionRequestTimeout(connectTimeout);
  245. return custom.build();
  246. }
  247. /**
  248. * 解析string里面的字符串,因为只有部分需要解析,所有需要判断
  249. *
  250. * @param s 字符串
  251. * @return
  252. */
  253. public String decodeUnicode(String s) {
  254. char aChar;
  255. int len = s.length();
  256. StringBuffer outBuffer = new StringBuffer(len);
  257. for (int x = 0; x < len; ) {
  258. aChar = s.charAt(x++);
  259. if (aChar == '\\') {
  260. aChar = s.charAt(x++);
  261. if (aChar == 'u') {
  262. // Read the xxxx
  263. int value = 0;
  264. for (int i = 0; i < 4; i++) {
  265. aChar = s.charAt(x++);
  266. switch (aChar) {
  267. case '0':
  268. case '1':
  269. case '2':
  270. case '3':
  271. case '4':
  272. case '5':
  273. case '6':
  274. case '7':
  275. case '8':
  276. case '9':
  277. value = (value << 4) + aChar - '0';
  278. break;
  279. case 'a':
  280. case 'b':
  281. case 'c':
  282. case 'd':
  283. case 'e':
  284. case 'f':
  285. value = (value << 4) + 10 + aChar - 'a';
  286. break;
  287. case 'A':
  288. case 'B':
  289. case 'C':
  290. case 'D':
  291. case 'E':
  292. case 'F':
  293. value = (value << 4) + 10 + aChar - 'A';
  294. break;
  295. default:
  296. throw new IllegalArgumentException(
  297. "Malformed \\uxxxx encoding.");
  298. }
  299. }
  300. outBuffer.append((char) value);
  301. } else {
  302. switch (aChar) {
  303. case 't':
  304. aChar = '\t';
  305. break;
  306. case 'r':
  307. aChar = '\r';
  308. break;
  309. case 'n':
  310. aChar = '\n';
  311. break;
  312. case 'f':
  313. aChar = '\f';
  314. break;
  315. default:
  316. break;
  317. }
  318. outBuffer.append(aChar);
  319. }
  320. } else {
  321. outBuffer.append(aChar);
  322. }
  323. }
  324. return outBuffer.toString();
  325. }
  326. public static void main(String[] args) throws Exception {
  327. String s = new HttpUtil().setUrlStr("https://www.baidu.com").get();
  328. System.out.println(s);
  329. }
  330. }

使用示例

  1. /**
  2. * url
  3. */
  4. String url1 = "http://www.baidu.com";
  5. /**
  6. * 代理
  7. */
  8. String proxyIp = "219.141.153.41";
  9. Integer proxyPort = 80;
  10. String proxyType = "http";
  11. /**
  12. * header
  13. */
  14. HashMap<String, String> header = new HashMap<String, String>() {{
  15. put("Accept", "application/json");
  16. }};
  17. /**
  18. * url参数
  19. */
  20. HashMap<String, String> urlParam = new HashMap<String, String>() {{
  21. put("name", "小明");
  22. }};
  23. /**
  24. * 发送请求
  25. */
  26. String html = new MyHttp()
  27. .setUrlStr(url)
  28. .setProxy(proxyIp, proxyPort, proxyType)
  29. .setHeaderMap(header)
  30. .setParams(urlParam)
  31. .get();

这短短的一生我们最终都会失去,不妨大胆一点,爱一个人、攀一座山、追一个梦!