Java SpringBoot WebMagic
WebMagic是一个开源的java爬虫框架。WebMagic框架的具体如何使用请参考官方文档:http://webmagic.io/docs/
这里对spring boot+WebMagic+MyBatis做了整合,使用WebMagic爬取数据,然后通过MyBatis持久化爬取的数据到mysql数据库。
基于 SpringBoot + WebMagic 实现的爬虫框架 - 图1

1、添加maven依赖

  1. <?xml version="1.0" encoding="UTF-8"?>
  2. <project xmlns="http://maven.apache.org/POM/4.0.0"
  3. xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
  4. xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  5. <modelVersion>4.0.0</modelVersion>
  6. <groupId>hyzx</groupId>
  7. <artifactId>qbasic-crawler</artifactId>
  8. <version>1.0.0</version>
  9. <parent>
  10. <groupId>org.springframework.boot</groupId>
  11. <artifactId>spring-boot-starter-parent</artifactId>
  12. <version>1.5.21.RELEASE</version>
  13. <relativePath/> <!-- lookup parent from repository -->
  14. </parent>
  15. <properties>
  16. <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
  17. <maven.test.skip>true</maven.test.skip>
  18. <java.version>1.8</java.version>
  19. <maven.compiler.plugin.version>3.8.1</maven.compiler.plugin.version>
  20. <maven.resources.plugin.version>3.1.0</maven.resources.plugin.version>
  21. <mysql.connector.version>5.1.47</mysql.connector.version>
  22. <druid.spring.boot.starter.version>1.1.17</druid.spring.boot.starter.version>
  23. <mybatis.spring.boot.starter.version>1.3.4</mybatis.spring.boot.starter.version>
  24. <fastjson.version>1.2.58</fastjson.version>
  25. <commons.lang3.version>3.9</commons.lang3.version>
  26. <joda.time.version>2.10.2</joda.time.version>
  27. <webmagic.core.version>0.7.3</webmagic.core.version>
  28. </properties>
  29. <dependencies>
  30. <dependency>
  31. <groupId>org.springframework.boot</groupId>
  32. <artifactId>spring-boot-devtools</artifactId>
  33. <scope>runtime</scope>
  34. <optional>true</optional>
  35. </dependency>
  36. <dependency>
  37. <groupId>org.springframework.boot</groupId>
  38. <artifactId>spring-boot-starter-test</artifactId>
  39. <scope>test</scope>
  40. </dependency>
  41. <dependency>
  42. <groupId>org.springframework.boot</groupId>
  43. <artifactId>spring-boot-configuration-processor</artifactId>
  44. <optional>true</optional>
  45. </dependency>
  46. <dependency>
  47. <groupId>mysql</groupId>
  48. <artifactId>mysql-connector-java</artifactId>
  49. <version>${mysql.connector.version}</version>
  50. </dependency>
  51. <dependency>
  52. <groupId>com.alibaba</groupId>
  53. <artifactId>druid-spring-boot-starter</artifactId>
  54. <version>${druid.spring.boot.starter.version}</version>
  55. </dependency>
  56. <dependency>
  57. <groupId>org.mybatis.spring.boot</groupId>
  58. <artifactId>mybatis-spring-boot-starter</artifactId>
  59. <version>${mybatis.spring.boot.starter.version}</version>
  60. </dependency>
  61. <dependency>
  62. <groupId>com.alibaba</groupId>
  63. <artifactId>fastjson</artifactId>
  64. <version>${fastjson.version}</version>
  65. </dependency>
  66. <dependency>
  67. <groupId>org.apache.commons</groupId>
  68. <artifactId>commons-lang3</artifactId>
  69. <version>${commons.lang3.version}</version>
  70. </dependency>
  71. <dependency>
  72. <groupId>joda-time</groupId>
  73. <artifactId>joda-time</artifactId>
  74. <version>${joda.time.version}</version>
  75. </dependency>
  76. <dependency>
  77. <groupId>us.codecraft</groupId>
  78. <artifactId>webmagic-core</artifactId>
  79. <version>${webmagic.core.version}</version>
  80. <exclusions>
  81. <exclusion>
  82. <groupId>org.slf4j</groupId>
  83. <artifactId>slf4j-log4j12</artifactId>
  84. </exclusion>
  85. </exclusions>
  86. </dependency>
  87. </dependencies>
  88. <build>
  89. <plugins>
  90. <plugin>
  91. <groupId>org.apache.maven.plugins</groupId>
  92. <artifactId>maven-compiler-plugin</artifactId>
  93. <version>${maven.compiler.plugin.version}</version>
  94. <configuration>
  95. <source>${java.version}</source>
  96. <target>${java.version}</target>
  97. <encoding>${project.build.sourceEncoding}</encoding>
  98. </configuration>
  99. </plugin>
  100. <plugin>
  101. <groupId>org.apache.maven.plugins</groupId>
  102. <artifactId>maven-resources-plugin</artifactId>
  103. <version>${maven.resources.plugin.version}</version>
  104. <configuration>
  105. <encoding>${project.build.sourceEncoding}</encoding>
  106. </configuration>
  107. </plugin>
  108. <plugin>
  109. <groupId>org.springframework.boot</groupId>
  110. <artifactId>spring-boot-maven-plugin</artifactId>
  111. <configuration>
  112. <fork>true</fork>
  113. <addResources>true</addResources>
  114. </configuration>
  115. <executions>
  116. <execution>
  117. <goals>
  118. <goal>repackage</goal>
  119. </goals>
  120. </execution>
  121. </executions>
  122. </plugin>
  123. </plugins>
  124. </build>
  125. <repositories>
  126. <repository>
  127. <id>public</id>
  128. <name>aliyun nexus</name>
  129. <url>http://maven.aliyun.com/nexus/content/groups/public/</url>
  130. <releases>
  131. <enabled>true</enabled>
  132. </releases>
  133. </repository>
  134. </repositories>
  135. <pluginRepositories>
  136. <pluginRepository>
  137. <id>public</id>
  138. <name>aliyun nexus</name>
  139. <url>http://maven.aliyun.com/nexus/content/groups/public/</url>
  140. <releases>
  141. <enabled>true</enabled>
  142. </releases>
  143. <snapshots>
  144. <enabled>false</enabled>
  145. </snapshots>
  146. </pluginRepository>
  147. </pluginRepositories>
  148. </project>

2、项目配置文件 application.properties

配置mysql数据源,druid数据库连接池以及MyBatis的mapper文件的位置。

# MySQL datasource configuration
spring.datasource.name=mysql
spring.datasource.type=com.alibaba.druid.pool.DruidDataSource
spring.datasource.driver-class-name=com.mysql.jdbc.Driver
spring.datasource.url=jdbc:mysql://192.168.0.63:3306/gjhzjl?useUnicode=true&characterEncoding=utf8&useSSL=false&allowMultiQueries=true
# NOTE(review): credentials are hard-coded in plain text — externalize for anything beyond local testing
spring.datasource.username=root
spring.datasource.password=root
# Druid connection-pool configuration
spring.datasource.druid.initial-size=5
spring.datasource.druid.min-idle=5
spring.datasource.druid.max-active=10
# Max time (ms) to wait when borrowing a connection
spring.datasource.druid.max-wait=60000
spring.datasource.druid.validation-query=SELECT 1 FROM DUAL
spring.datasource.druid.test-on-borrow=false
spring.datasource.druid.test-on-return=false
# Validate idle connections in the background instead of on borrow/return
spring.datasource.druid.test-while-idle=true
spring.datasource.druid.time-between-eviction-runs-millis=60000
spring.datasource.druid.min-evictable-idle-time-millis=300000
spring.datasource.druid.max-evictable-idle-time-millis=600000
# MyBatis configuration: where mapper XML files are found on the classpath
mybatis.mapperLocations=classpath:mapper/**/*.xml

3、数据库表结构

-- CMS content table: one row per crawled article, keyed by an application-generated UUID.
CREATE TABLE `cms_content` (
  `contentId` varchar(40) NOT NULL COMMENT '内容ID',
  `title` varchar(150) NOT NULL COMMENT '标题',
  `content` longtext COMMENT '文章内容',
  `releaseDate` datetime NOT NULL COMMENT '发布日期',
  PRIMARY KEY (`contentId`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8 COMMENT='CMS内容表';

4、实体类

  1. import java.util.Date;
  2. public class CmsContentPO {
  3. private String contentId;
  4. private String title;
  5. private String content;
  6. private Date releaseDate;
  7. public String getContentId() {
  8. return contentId;
  9. }
  10. public void setContentId(String contentId) {
  11. this.contentId = contentId;
  12. }
  13. public String getTitle() {
  14. return title;
  15. }
  16. public void setTitle(String title) {
  17. this.title = title;
  18. }
  19. public String getContent() {
  20. return content;
  21. }
  22. public void setContent(String content) {
  23. this.content = content;
  24. }
  25. public Date getReleaseDate() {
  26. return releaseDate;
  27. }
  28. public void setReleaseDate(Date releaseDate) {
  29. this.releaseDate = releaseDate;
  30. }
  31. }

5、mapper接口

/**
 * MyBatis mapper for crawler persistence; SQL lives in CrawlerMapper.xml.
 */
public interface CrawlerMapper {

    /**
     * Inserts one crawled article into the cms_content table.
     *
     * @param record the article to persist
     * @return number of rows inserted (1 on success)
     */
    int addCmsContent(CmsContentPO record);
}

6、CrawlerMapper.xml文件

<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE mapper PUBLIC "-//mybatis.org//DTD Mapper 3.0//EN" "http://mybatis.org/dtd/mybatis-3-mapper.dtd">
<mapper namespace="com.hyzx.qbasic.dao.CrawlerMapper">
    <!-- Inserts one crawled article; content uses LONGVARCHAR to match the longtext column -->
    <insert id="addCmsContent" parameterType="com.hyzx.qbasic.model.CmsContentPO">
        insert into cms_content (contentId,
        title,
        releaseDate,
        content)
        values (#{contentId,jdbcType=VARCHAR},
        #{title,jdbcType=VARCHAR},
        #{releaseDate,jdbcType=TIMESTAMP},
        #{content,jdbcType=LONGVARCHAR})
    </insert>
</mapper>

7、知乎页面内容处理类ZhihuPageProcessor

主要用于解析爬取到的知乎html页面。

  1. @Component
  2. public class ZhihuPageProcessor implements PageProcessor {
  3. private Site site = Site.me().setRetryTimes(3).setSleepTime(1000);
  4. @Override
  5. public void process(Page page) {
  6. page.addTargetRequests(page.getHtml().links().regex("https://www\\.zhihu\\.com/question/\\d+/answer/\\d+.*").all());
  7. page.putField("title", page.getHtml().xpath("//h1[@class='QuestionHeader-title']/text()").toString());
  8. page.putField("answer", page.getHtml().xpath("//div[@class='QuestionAnswer-content']/tidyText()").toString());
  9. if (page.getResultItems().get("title") == null) {
  10. // 如果是列表页,跳过此页,pipeline不进行后续处理
  11. page.setSkip(true);
  12. }
  13. }
  14. @Override
  15. public Site getSite() {
  16. return site;
  17. }
  18. }

8、知乎数据处理类ZhihuPipeline

主要用于将知乎html页面解析出的数据存储到mysql数据库。

  1. @Component
  2. public class ZhihuPipeline implements Pipeline {
  3. private static final Logger LOGGER = LoggerFactory.getLogger(ZhihuPipeline.class);
  4. @Autowired
  5. private CrawlerMapper crawlerMapper;
  6. public void process(ResultItems resultItems, Task task) {
  7. String title = resultItems.get("title");
  8. String answer = resultItems.get("answer");
  9. CmsContentPO contentPO = new CmsContentPO();
  10. contentPO.setContentId(UUID.randomUUID().toString());
  11. contentPO.setTitle(title);
  12. contentPO.setReleaseDate(new Date());
  13. contentPO.setContent(answer);
  14. try {
  15. boolean success = crawlerMapper.addCmsContent(contentPO) > 0;
  16. LOGGER.info("保存知乎文章成功:{}", title);
  17. } catch (Exception ex) {
  18. LOGGER.error("保存知乎文章失败", ex);
  19. }
  20. }
  21. }

9、知乎爬虫任务类ZhihuTask

每十分钟启动一次爬虫。

  1. @Component
  2. public class ZhihuTask {
  3. private static final Logger LOGGER = LoggerFactory.getLogger(ZhihuPipeline.class);
  4. @Autowired
  5. private ZhihuPipeline zhihuPipeline;
  6. @Autowired
  7. private ZhihuPageProcessor zhihuPageProcessor;
  8. private ScheduledExecutorService timer = Executors.newSingleThreadScheduledExecutor();
  9. public void crawl() {
  10. // 定时任务,每10分钟爬取一次
  11. timer.scheduleWithFixedDelay(() -> {
  12. Thread.currentThread().setName("zhihuCrawlerThread");
  13. try {
  14. Spider.create(zhihuPageProcessor)
  15. // 从https://www.zhihu.com/explore开始抓
  16. .addUrl("https://www.zhihu.com/explore")
  17. // 抓取到的数据存数据库
  18. .addPipeline(zhihuPipeline)
  19. // 开启2个线程抓取
  20. .thread(2)
  21. // 异步启动爬虫
  22. .start();
  23. } catch (Exception ex) {
  24. LOGGER.error("定时抓取知乎数据线程执行异常", ex);
  25. }
  26. }, 0, 10, TimeUnit.MINUTES);
  27. }
  28. }

10、Spring boot程序启动类

  1. @SpringBootApplication
  2. @MapperScan(basePackages = "com.hyzx.qbasic.dao")
  3. public class Application implements CommandLineRunner {
  4. @Autowired
  5. private ZhihuTask zhihuTask;
  6. public static void main(String[] args) throws IOException {
  7. SpringApplication.run(Application.class, args);
  8. }
  9. @Override
  10. public void run(String... strings) throws Exception {
  11. // 爬取知乎数据
  12. zhihuTask.crawl();
  13. }
  14. }