需求分析

产品经理召集开会,文章审核功能已经交付了,文章也能正常发布审核。对于上次提出的自管理敏感词也很满意,这次会议核心的内容如下:

  • 文章中包含的图片要识别文字,过滤掉图片文字的敏感词

8.2)图片文字识别

什么是OCR?

OCR (Optical Character Recognition,光学字符识别)是指电子设备(例如扫描仪或数码相机)检查纸上打印的字符,通过检测暗、亮的模式确定其形状,然后用字符识别方法将形状翻译成计算机文字的过程。

| 方案 | 说明 |
| --- | --- |
| 百度OCR | 收费 |
| Tesseract-OCR | Google维护的开源OCR引擎,支持Java,Python等语言调用 |
| Tess4J | 封装了Tesseract-OCR ,支持Java调用 |

8.3)Tess4j案例

①:创建项目导入tess4j对应的依赖

  1. <dependency>
  2. <groupId>net.sourceforge.tess4j</groupId>
  3. <artifactId>tess4j</artifactId>
  4. <version>4.1.1</version>
  5. </dependency>

②:导入中文字库(即Tesseract识别中文所需的语言训练数据), 把资料中的tessdata文件夹拷贝到自己的工作空间下(后面代码中setDatapath指向的就是这个目录)

图片识别文字审核敏感词 - 图1

③:编写测试类进行测试

  package com.heima.tess4j;

  import net.sourceforge.tess4j.ITesseract;
  import net.sourceforge.tess4j.Tesseract;

  import java.io.File;

  /**
   * Minimal Tess4J demo: runs OCR on one local image and prints the recognised
   * text as a single line.
   */
  public class Application {

      /**
       * Recognises the text in a hard-coded local image and prints it to stdout.
       * Any failure is reported by printing the stack trace.
       *
       * @param args unused
       */
      public static void main(String[] args) {
          try {
              // Configure the OCR engine: data path (the tessdata directory) and language.
              ITesseract ocrEngine = new Tesseract();
              ocrEngine.setDatapath("D:\\workspace\\tessdata");
              ocrEngine.setLanguage("chi_sim");

              // Run recognition on the local sample image.
              String recognized = ocrEngine.doOCR(new File("D:\\26.png"));

              // Collapse the output onto one line: each CR/LF becomes "-", spaces are removed.
              String singleLine = recognized
                      .replace("\r", "-")
                      .replace("\n", "-")
                      .replace(" ", "");
              System.out.println("识别的结果为:" + singleLine);
          } catch (Exception e) {
              e.printStackTrace();
          }
      }
  }

8.4)管理敏感词和图片文字识别集成到文章审核

①:在heima-leadnews-common中创建工具类,简单封装一下tess4j

需要先导入pom

  1. <dependency>
  2. <groupId>net.sourceforge.tess4j</groupId>
  3. <artifactId>tess4j</artifactId>
  4. <version>4.1.1</version>
  5. </dependency>

工具类

  package com.heima.common.tess4j;

  import lombok.Getter;
  import lombok.Setter;
  import net.sourceforge.tess4j.ITesseract;
  import net.sourceforge.tess4j.Tesseract;
  import net.sourceforge.tess4j.TesseractException;
  import org.springframework.boot.context.properties.ConfigurationProperties;
  import org.springframework.stereotype.Component;

  import java.awt.image.BufferedImage;

  /**
   * Small Spring-managed wrapper around Tess4J. Its two fields are bound from
   * configuration properties under the {@code tess4j} prefix.
   */
  @Getter
  @Setter
  @Component
  @ConfigurationProperties(prefix = "tess4j")
  public class Tess4jClient {

      /** Value handed to {@code ITesseract#setDatapath}. */
      private String dataPath;

      /** Value handed to {@code ITesseract#setLanguage}. */
      private String language;

      /**
       * Runs OCR on the given image and returns the recognised text on one line.
       *
       * @param image the image to recognise
       * @return the recognised text, with every CR/LF replaced by "-" and all spaces removed
       * @throws TesseractException if the OCR engine fails
       */
      public String doOCR(BufferedImage image) throws TesseractException {
          // A fresh engine is configured for every call.
          ITesseract engine = new Tesseract();
          engine.setDatapath(dataPath);
          engine.setLanguage(language);

          String rawText = engine.doOCR(image);

          // Flatten to a single line: line breaks become "-", spaces are dropped.
          return rawText
                  .replace("\r", "-")
                  .replace("\n", "-")
                  .replace(" ", "");
      }
  }

在spring.factories配置中添加该类,完整如下:

  1. org.springframework.boot.autoconfigure.EnableAutoConfiguration=\
  2. com.heima.common.exception.ExceptionCatch,\
  3. com.heima.common.swagger.SwaggerConfiguration,\
  4. com.heima.common.swagger.Swagger2Configuration,\
  5. com.heima.common.aliyun.GreenTextScan,\
  6. com.heima.common.aliyun.GreenImageScan,\
  7. com.heima.common.tess4j.Tess4jClient

②:在heima-leadnews-wemedia中的配置中添加两个属性

  tess4j:
    data-path: D:\workspace\tessdata
    language: chi_sim

③:在WmNewsAutoScanServiceImpl的handleImageScan方法中添加如下代码

  1. try {
  2. for (String image : images) {
  3. byte[] bytes = fileStorageService.downLoadFile(image);
  4. //图片识别文字审核---begin-----
  5. //从byte[]转换为butteredImage
  6. ByteArrayInputStream in = new ByteArrayInputStream(bytes);
  7. BufferedImage imageFile = ImageIO.read(in);
  8. //识别图片的文字
  9. String result = tess4jClient.doOCR(imageFile);
  10. //审核是否包含自管理的敏感词
  11. boolean isSensitive = handleSensitiveScan(result, wmNews);
  12. if(!isSensitive){
  13. return isSensitive;
  14. }
  15. //图片识别文字审核---end-----
  16. imageList.add(bytes);
  17. }
  18. }catch (Exception e){
  19. e.printStackTrace();
  20. }

最后附上文章审核的完整代码如下:

  package com.heima.wemedia.service.impl;

  import com.alibaba.fastjson.JSONArray;
  import com.baomidou.mybatisplus.core.toolkit.Wrappers;
  import com.heima.apis.article.IArticleClient;
  import com.heima.common.aliyun.GreenImageScan;
  import com.heima.common.aliyun.GreenTextScan;
  import com.heima.common.tess4j.Tess4jClient;
  import com.heima.file.service.FileStorageService;
  import com.heima.model.article.dtos.ArticleDto;
  import com.heima.model.common.dtos.ResponseResult;
  import com.heima.model.wemedia.pojos.WmChannel;
  import com.heima.model.wemedia.pojos.WmNews;
  import com.heima.model.wemedia.pojos.WmSensitive;
  import com.heima.model.wemedia.pojos.WmUser;
  import com.heima.utils.common.SensitiveWordUtil;
  import com.heima.wemedia.mapper.WmChannelMapper;
  import com.heima.wemedia.mapper.WmNewsMapper;
  import com.heima.wemedia.mapper.WmSensitiveMapper;
  import com.heima.wemedia.mapper.WmUserMapper;
  import com.heima.wemedia.service.WmNewsAutoScanService;
  import lombok.extern.slf4j.Slf4j;
  import org.apache.commons.lang3.StringUtils;
  import org.springframework.beans.BeanUtils;
  import org.springframework.beans.factory.annotation.Autowired;
  import org.springframework.scheduling.annotation.Async;
  import org.springframework.stereotype.Service;
  import org.springframework.transaction.annotation.Transactional;

  import javax.imageio.ImageIO;
  import java.awt.image.BufferedImage;
  import java.io.ByteArrayInputStream;
  import java.util.*;
  import java.util.stream.Collectors;

  /**
   * Automatic review of self-media articles: self-managed sensitive words, cloud text
   * scan, OCR of embedded images plus cloud image scan, and finally saving the
   * app-side article when every check passes.
   */
  @Service
  @Slf4j
  @Transactional
  public class WmNewsAutoScanServiceImpl implements WmNewsAutoScanService {

      /** Status written when a check finds violating content. */
      private static final short STATUS_AUDIT_FAILED = 2;
      /** Status written when a check is inconclusive and manual review is needed. */
      private static final short STATUS_MANUAL_REVIEW = 3;
      /** Status written when the whole review succeeded. */
      private static final short STATUS_AUDIT_SUCCESS = 9;

      @Autowired
      private WmNewsMapper wmNewsMapper;
      @Autowired
      private WmSensitiveMapper wmSensitiveMapper;
      @Autowired
      private IArticleClient articleClient;
      @Autowired
      private WmChannelMapper wmChannelMapper;
      @Autowired
      private WmUserMapper wmUserMapper;
      @Autowired
      private FileStorageService fileStorageService;
      @Autowired
      private GreenImageScan greenImageScan;
      @Autowired
      private Tess4jClient tess4jClient;
      @Autowired
      private GreenTextScan greenTextScan;

      /**
       * Reviews a self-media article. Only articles whose status equals
       * {@code WmNews.Status.SUBMIT} are processed; any failed check stops the review.
       *
       * @param id id of the self-media article
       * @throws RuntimeException if the article does not exist or the app-side save fails
       */
      @Override
      @Async // marks this method as asynchronous
      public void autoScanWmNews(Integer id) {
          // 1. Load the self-media article.
          WmNews wmNews = wmNewsMapper.selectById(id);
          if (wmNews == null) {
              throw new RuntimeException("WmNewsAutoScanServiceImpl-文章不存在");
          }
          if (!wmNews.getStatus().equals(WmNews.Status.SUBMIT.getCode())) {
              return;
          }

          // Extract the plain text and the image URLs from the article.
          Map<String, Object> textAndImages = handleTextAndImages(wmNews);
          String content = (String) textAndImages.get("content");
          // Safe: handleTextAndImages always stores a List<String> under "images".
          @SuppressWarnings("unchecked")
          List<String> images = (List<String>) textAndImages.get("images");

          // Self-managed sensitive words.
          if (!handleSensitiveScan(content, wmNews)) {
              return;
          }
          // 2. Text review through the cloud text-scan client.
          if (!handleTextScan(content, wmNews)) {
              return;
          }
          // 3. Image review (OCR + cloud image-scan client).
          if (!handleImageScan(images, wmNews)) {
              return;
          }

          // 4. Everything passed: save the app-side article data.
          ResponseResult responseResult = saveAppArticle(wmNews);
          if (!responseResult.getCode().equals(200)) {
              throw new RuntimeException("WmNewsAutoScanServiceImpl-文章审核,保存app端相关文章数据失败");
          }
          // Write the returned article id back onto the self-media article.
          wmNews.setArticleId((Long) responseResult.getData());
          updateWmNews(wmNews, STATUS_AUDIT_SUCCESS, "审核成功");
      }

      /**
       * Checks a piece of text against the self-managed sensitive words.
       *
       * @param content text to check (article text or OCR output); may be blank
       * @param wmNews  article being reviewed; marked as failed when a word matches
       * @return {@code true} if no sensitive word was found, {@code false} otherwise
       */
      private boolean handleSensitiveScan(String content, WmNews wmNews) {
          // Blank text cannot contain a sensitive word; skip the DB query and dictionary rebuild.
          if (StringUtils.isBlank(content)) {
              return true;
          }
          // Load all sensitive words.
          List<WmSensitive> wmSensitives = wmSensitiveMapper.selectList(
                  Wrappers.<WmSensitive>lambdaQuery().select(WmSensitive::getSensitives));
          List<String> sensitiveList = wmSensitives.stream()
                  .map(WmSensitive::getSensitives)
                  .collect(Collectors.toList());
          // (Re)build the dictionary, then match the text against it.
          // NOTE(review): initMap is a static call made on every invocation; confirm it is
          // safe when several @Async reviews run at the same time.
          SensitiveWordUtil.initMap(sensitiveList);
          Map<String, Integer> matches = SensitiveWordUtil.matchWords(content);
          if (matches.size() > 0) {
              updateWmNews(wmNews, STATUS_AUDIT_FAILED, "当前文章中存在违规内容" + matches);
              return false;
          }
          return true;
      }

      /**
       * Builds the app-side article from the self-media article and saves it through
       * the article client.
       *
       * @param wmNews the reviewed self-media article
       * @return the response returned by the article client
       */
      private ResponseResult saveAppArticle(WmNews wmNews) {
          ArticleDto dto = new ArticleDto();
          // Copy the properties that share a name.
          BeanUtils.copyProperties(wmNews, dto);
          // Layout of the article.
          dto.setLayout(wmNews.getType());
          // Channel name.
          WmChannel wmChannel = wmChannelMapper.selectById(wmNews.getChannelId());
          if (wmChannel != null) {
              dto.setChannelName(wmChannel.getName());
          }
          // Author.
          dto.setAuthorId(wmNews.getUserId().longValue());
          WmUser wmUser = wmUserMapper.selectById(wmNews.getUserId());
          if (wmUser != null) {
              dto.setAuthorName(wmUser.getName());
          }
          // Carry over the article id when the article was saved before.
          if (wmNews.getArticleId() != null) {
              dto.setId(wmNews.getArticleId());
          }
          dto.setCreatedTime(new Date());
          return articleClient.saveArticle(dto);
      }

      /**
       * Reviews the images of an article: every image is downloaded, its text is
       * recognised by OCR and checked against the sensitive words, then all images are
       * sent to the cloud image scan.
       *
       * @param images image URLs (content images and cover images); may be null or empty
       * @param wmNews article being reviewed
       * @return {@code true} if all images passed; {@code false} if a check failed or
       *         could not be completed
       */
      private boolean handleImageScan(List<String> images, WmNews wmNews) {
          if (images == null || images.size() == 0) {
              return true;
          }
          // De-duplicate before downloading.
          images = images.stream().distinct().collect(Collectors.toList());
          List<byte[]> imageList = new ArrayList<>();
          try {
              for (String image : images) {
                  byte[] bytes = fileStorageService.downLoadFile(image);
                  // ---- OCR sensitive-word review: begin ----
                  BufferedImage imageFile = ImageIO.read(new ByteArrayInputStream(bytes));
                  if (imageFile == null) {
                      // ImageIO.read returns null when no registered reader can decode the data.
                      log.error("Image could not be decoded for OCR review: {}", image);
                      return false;
                  }
                  String result = tess4jClient.doOCR(imageFile);
                  if (!handleSensitiveScan(result, wmNews)) {
                      return false;
                  }
                  // ---- OCR sensitive-word review: end ----
                  imageList.add(bytes);
              }
          } catch (Exception e) {
              // Fail closed: never continue with an incomplete image list, otherwise images
              // that were not downloaded or recognised would pass the review unchecked.
              log.error("Image download/OCR failed, wmNews id={}", wmNews.getId(), e);
              return false;
          }

          // Cloud image scan.
          try {
              Map<?, ?> scanResult = greenImageScan.imageScan(imageList);
              return applyScanSuggestion(scanResult, wmNews);
          } catch (Exception e) {
              log.error("Image scan failed, wmNews id={}", wmNews.getId(), e);
              return false;
          }
      }

      /**
       * Reviews the title and plain-text content through the cloud text scan.
       *
       * @param content plain-text content of the article
       * @param wmNews  article being reviewed
       * @return {@code true} if the text passed (or there is nothing to scan)
       */
      private boolean handleTextScan(String content, WmNews wmNews) {
          String title = wmNews.getTitle();
          // Nothing to scan. (The previous length check on title + "-" + content could never be 0.)
          if (StringUtils.isBlank(title) && StringUtils.isBlank(content)) {
              return true;
          }
          // Avoid sending the literal text "null" when one of the parts is missing.
          String text = Objects.toString(title, "") + "-" + Objects.toString(content, "");
          try {
              Map<?, ?> scanResult = greenTextScan.greeTextScan(text);
              return applyScanSuggestion(scanResult, wmNews);
          } catch (Exception e) {
              log.error("Text scan failed, wmNews id={}", wmNews.getId(), e);
              return false;
          }
      }

      /**
       * Interprets the "suggestion" entry of a scan result and updates the article.
       *
       * @param scanResult result map of a text or image scan; {@code null} is treated as passed
       * @param wmNews     article being reviewed
       * @return {@code false} for "block", "review" or a missing suggestion, {@code true} otherwise
       */
      private boolean applyScanSuggestion(Map<?, ?> scanResult, WmNews wmNews) {
          if (scanResult == null) {
              return true;
          }
          Object suggestion = scanResult.get("suggestion");
          if (suggestion == null) {
              // A result without a suggestion cannot be trusted; do not approve.
              log.error("Scan result has no suggestion, wmNews id={}", wmNews.getId());
              return false;
          }
          if ("block".equals(suggestion)) {
              // Review failed.
              updateWmNews(wmNews, STATUS_AUDIT_FAILED, "当前文章中存在违规内容");
              return false;
          }
          if ("review".equals(suggestion)) {
              // Inconclusive: manual review required.
              updateWmNews(wmNews, STATUS_MANUAL_REVIEW, "当前文章中存在不确定内容");
              return false;
          }
          return true;
      }

      /**
       * Sets status and reason on the article and persists it.
       *
       * @param wmNews article to update
       * @param status new status code
       * @param reason reason text stored with the status
       */
      private void updateWmNews(WmNews wmNews, short status, String reason) {
          wmNews.setStatus(status);
          wmNews.setReason(reason);
          wmNewsMapper.updateById(wmNews);
      }

      /**
       * Extracts the plain text and all image URLs from an article:
       * content entries of type "text"/"image" plus the comma-separated cover images.
       *
       * @param wmNews the article
       * @return map with "content" (String) and "images" (List of String)
       */
      private Map<String, Object> handleTextAndImages(WmNews wmNews) {
          StringBuilder text = new StringBuilder();
          List<String> images = new ArrayList<>();

          // 1. Text and images from the article content (a JSON array of typed entries).
          if (StringUtils.isNotBlank(wmNews.getContent())) {
              List<Map> entries = JSONArray.parseArray(wmNews.getContent(), Map.class);
              for (Map entry : entries) {
                  Object type = entry.get("type");
                  if ("text".equals(type)) {
                      text.append(entry.get("value"));
                  }
                  if ("image".equals(type)) {
                      images.add((String) entry.get("value"));
                  }
              }
          }

          // 2. Cover images.
          if (StringUtils.isNotBlank(wmNews.getImages())) {
              images.addAll(Arrays.asList(wmNews.getImages().split(",")));
          }

          Map<String, Object> resultMap = new HashMap<>();
          resultMap.put("content", text.toString());
          resultMap.put("images", images);
          return resultMap;
      }
  }

9)文章详情-静态文件生成

9.1)思路分析

文章端创建app相关文章时,生成文章详情静态页上传到MinIO中

图片识别文字审核敏感词 - 图2

9.2)实现步骤

1.新建ArticleFreemarkerService创建静态文件并上传到minIO中

  1. package com.heima.article.service;
  2. import com.heima.model.article.pojos.ApArticle;
  3. public interface ArticleFreemarkerService {
  4. /**
  5. * 生成静态文件上传到minIO中
  6. * @param apArticle
  7. * @param content
  8. */
  9. public void buildArticleToMinIO(ApArticle apArticle,String content);
  10. }

实现

  package com.heima.article.service.impl;

  import com.alibaba.fastjson.JSON;
  import com.alibaba.fastjson.JSONArray;
  import com.baomidou.mybatisplus.core.toolkit.Wrappers;
  import com.heima.article.mapper.ApArticleContentMapper;
  import com.heima.article.service.ApArticleService;
  import com.heima.article.service.ArticleFreemarkerService;
  import com.heima.file.service.FileStorageService;
  import com.heima.model.article.pojos.ApArticle;
  import freemarker.template.Configuration;
  import freemarker.template.Template;
  import lombok.extern.slf4j.Slf4j;
  import org.apache.commons.lang3.StringUtils;
  import org.springframework.beans.BeanUtils;
  import org.springframework.beans.factory.annotation.Autowired;
  import org.springframework.scheduling.annotation.Async;
  import org.springframework.stereotype.Service;
  import org.springframework.transaction.annotation.Transactional;

  import java.io.ByteArrayInputStream;
  import java.io.InputStream;
  import java.io.StringWriter;
  import java.nio.charset.StandardCharsets;
  import java.util.HashMap;
  import java.util.Map;

  /**
   * Renders an article into a static HTML page with FreeMarker, uploads the page to
   * MinIO and stores the resulting URL on the article.
   */
  @Service
  @Slf4j
  @Transactional
  public class ArticleFreemarkerServiceImpl implements ArticleFreemarkerService {

      @Autowired
      private ApArticleContentMapper apArticleContentMapper;
      @Autowired
      private Configuration configuration;
      @Autowired
      private FileStorageService fileStorageService;
      @Autowired
      private ApArticleService apArticleService;

      /**
       * Generates the static HTML file of an article and uploads it to MinIO, then saves
       * the returned path in the article's {@code static_url}. Nothing is uploaded when
       * the content is blank or the page cannot be rendered.
       *
       * @param apArticle the article; its id is used as the file name and update key
       * @param content   the article content as a JSON array string
       */
      @Async
      @Override
      public void buildArticleToMinIO(ApArticle apArticle, String content) {
          if (StringUtils.isBlank(content)) {
              return;
          }

          // 1. Render the content into HTML with the "article.ftl" template.
          StringWriter out = new StringWriter();
          try {
              Template template = configuration.getTemplate("article.ftl");
              Map<String, Object> contentDataModel = new HashMap<>();
              contentDataModel.put("content", JSONArray.parseArray(content));
              template.process(contentDataModel, out);
          } catch (Exception e) {
              // Do not upload an empty or half-rendered page, and do not point static_url at it.
              log.error("Failed to render static page, article id={}", apArticle.getId(), e);
              return;
          }

          // 2. Upload the HTML to MinIO. The charset is explicit so the bytes do not depend
          //    on the platform default (assumes the page is served as UTF-8 - TODO confirm).
          InputStream in = new ByteArrayInputStream(out.toString().getBytes(StandardCharsets.UTF_8));
          String path = fileStorageService.uploadHtmlFile("", apArticle.getId() + ".html", in);

          // 3. Save the returned path in ap_article.static_url.
          apArticleService.update(Wrappers.<ApArticle>lambdaUpdate()
                  .eq(ApArticle::getId, apArticle.getId())
                  .set(ApArticle::getStaticUrl, path));
      }
  }

2.在ApArticleService的saveArticle实现方法中添加调用生成文件的方法

  1. /**
  2. * 保存app端相关文章
  3. * @param dto
  4. * @return
  5. */
  6. @Override
  7. public ResponseResult saveArticle(ArticleDto dto) {
  8. // try {
  9. // Thread.sleep(3000);
  10. // } catch (InterruptedException e) {
  11. // e.printStackTrace();
  12. // }
  13. //1.检查参数
  14. if(dto == null){
  15. return ResponseResult.errorResult(AppHttpCodeEnum.PARAM_INVALID);
  16. }
  17. ApArticle apArticle = new ApArticle();
  18. BeanUtils.copyProperties(dto,apArticle);
  19. //2.判断是否存在id
  20. if(dto.getId() == null){
  21. //2.1 不存在id 保存 文章 文章配置 文章内容
  22. //保存文章
  23. save(apArticle);
  24. //保存配置
  25. ApArticleConfig apArticleConfig = new ApArticleConfig(apArticle.getId());
  26. apArticleConfigMapper.insert(apArticleConfig);
  27. //保存 文章内容
  28. ApArticleContent apArticleContent = new ApArticleContent();
  29. apArticleContent.setArticleId(apArticle.getId());
  30. apArticleContent.setContent(dto.getContent());
  31. apArticleContentMapper.insert(apArticleContent);
  32. }else {
  33. //2.2 存在id 修改 文章 文章内容
  34. //修改 文章
  35. updateById(apArticle);
  36. //修改文章内容
  37. ApArticleContent apArticleContent = apArticleContentMapper.selectOne(Wrappers.<ApArticleContent>lambdaQuery().eq(ApArticleContent::getArticleId, dto.getId()));
  38. apArticleContent.setContent(dto.getContent());
  39. apArticleContentMapper.updateById(apArticleContent);
  40. }
  41. //异步调用 生成静态文件上传到minio中
  42. articleFreemarkerService.buildArticleToMinIO(apArticle,dto.getContent());
  43. //3.结果返回 文章的id
  44. return ResponseResult.okResult(apArticle.getId());
  45. }

3.文章微服务开启异步调用

图片识别文字审核敏感词 - 图3