python数据爬取

安装scrapy

按照网上的教程完成安装(通常执行 `pip install scrapy` 即可)

分析网页

image.png

image.png
image.png

爬虫逻辑

项目结构

image.png

主要逻辑

从表格的每一行中提取 name(名称)和 code(行政区划代码)

  import scrapy

  from shengshixian.items import ShengshixianItem


  class SsxSpider(scrapy.Spider):
      """Scrape region codes and names from the 2020 MCA administrative-division table.

      Every ``<tr height="19">`` row of the start page is turned into one
      ``ShengshixianItem`` carrying the text of its second cell (``code``)
      and third cell (``name``).
      """

      name = 'ssx'
      allowed_domains = ['www.mca.gov.cn']
      start_urls = ['http://www.mca.gov.cn/article/sj/xzqh/2020/20201201.html']

      def parse(self, response):
          """Yield one item per table row that carries a code and/or a name.

          Args:
              response: The downloaded page containing the region table.

          Yields:
              ShengshixianItem: ``name`` and ``code`` are stripped strings, or
              ``None`` when the corresponding cell holds no text.
          """
          for row in response.xpath('//tr[@height="19"]'):
              code = self._cell_text(row, 2)
              name = self._cell_text(row, 3)
              # A row with neither value carries no region data; exporting it
              # would only put an all-null record into the JSON output.
              if code is None and name is None:
                  continue
              yield ShengshixianItem(name=name, code=code)

      @staticmethod
      def _cell_text(row, index):
          """Return the stripped text of the ``index``-th ``<td>`` of ``row``, or ``None``.

          ``./td[n]/text()`` only matches text nodes that are direct children
          of the cell, so a value wrapped in a nested element (e.g. a
          ``<span>``) would be missed -- presumably why some rows were scraped
          without a code. ``//text()`` collects every descendant text node.
          """
          fragments = row.xpath(f'./td[{index}]//text()').getall()
          text = ''.join(fragments).strip()
          return text or None

保存到json

  from scrapy.exporters import JsonItemExporter


  class ShengshixianPipeline:
      """Item pipeline that writes every scraped item into ``region.json``.

      The output file is opened when the spider opens (not when the pipeline
      object is constructed) and is always closed when the spider closes,
      even if finalising the JSON export fails.
      """

      def __init__(self):
          # Created in _start_export() so that merely instantiating the
          # pipeline does not create or truncate the output file.
          self.fp = None
          self.exporter = None

      def _start_export(self):
          """Open the output file and begin the JSON export (idempotent)."""
          if self.exporter is not None:
              return
          self.fp = open('region.json', 'wb')
          # ensure_ascii=False keeps Chinese names readable instead of \uXXXX escapes.
          self.exporter = JsonItemExporter(self.fp, ensure_ascii=False, encoding='utf8')
          self.exporter.start_exporting()

      def open_spider(self, spider):
          """Called by Scrapy when the spider is opened; starts the export."""
          print('begin')
          self._start_export()

      def process_item(self, item, spider):
          """Export ``item`` and return it unchanged for any later pipeline."""
          # Fallback for callers that use the pipeline without open_spider().
          self._start_export()
          self.exporter.export_item(item)
          return item

      def close_spider(self, spider):
          """Called by Scrapy when the spider is closed; finishes the export."""
          try:
              if self.exporter is not None:
                  self.exporter.finish_exporting()
              print('end')
          finally:
              # Release the file handle even when finish_exporting() raises.
              if self.fp is not None:
                  self.fp.close()

java数据处理并入库

发现问题

1.三沙市的两个区没有code

image.png
解决办法:手动加在json里面
image.png

2.新疆的石河子等十个市没有父级(没有对应的地级市);湖北的神农架以及海南的部分市县也属于这种省(自治区)直辖的县级行政区

image.png
解决办法:手动加个父节点。百度了一下,发现这些属于省(自治区)直辖的县级行政区,但编号比较特殊,形式上和普通的区、县一样,所以在代码里单独判断这类特殊的省直辖县级市
image.png

3.四个直辖市结构不一样

解决办法:代码逻辑去判断

java代码

image.png
service和mapper就用mybatis-plus通用的那种
使用 FileUtils 读取 json 文件,然后用 fastjson 解析 json,最后使用 mybatis-plus 批量写入 mysql

  1. @SneakyThrows
  2. @Test
  3. public void regionJson() {
  4. long l = System.currentTimeMillis();
  5. String jsonFilePath = "C:\\Users\\leo\\Desktop\\region.json";
  6. File file = new File(jsonFilePath );
  7. String input = FileUtils.readFileToString(file,"UTF-8");
  8. List<ProCityCounty> array = JSONArray.parseArray(input, ProCityCounty.class);
  9. Map<Integer, String> map = array.stream().collect(Collectors.toMap(ProCityCounty::getCode, ProCityCounty::getName));
  10. for (ProCityCounty proCityCounty : array) {
  11. proCityCounty.setId(Long.valueOf(proCityCounty.getCode()));
  12. //省逻辑
  13. if (proCityCounty.getCode() % 10000 == 0) {
  14. proCityCounty.setGrade(1);
  15. proCityCounty.setFullname(proCityCounty.getName());
  16. }
  17. //市逻辑
  18. else if (proCityCounty.getCode() % 100 == 0) {
  19. proCityCounty.setGrade(2);
  20. Integer pCode = proCityCounty.getCode() / 10000;
  21. String pname = map.get(pCode*10000);
  22. proCityCounty.setPid(pCode*10000);
  23. proCityCounty.setFullname(pname+"-"+proCityCounty.getName());
  24. }else {
  25. Integer pCode=proCityCounty.getCode()/10000;
  26. //国的四大直辖市下属区县的逻辑
  27. if (NumberUtil.equals(pCode,11)||NumberUtil.equals(pCode,12)||NumberUtil.equals(pCode,50)||NumberUtil.equals(pCode,31)) {
  28. proCityCounty.setGrade(2);
  29. proCityCounty.setPid(pCode*10000);
  30. String pname = map.get(pCode*10000);
  31. String fullName=pname+"-"+proCityCounty.getName();
  32. proCityCounty.setFullname(fullName);
  33. }else {
  34. Integer cCode = proCityCounty.getCode() / 100;
  35. proCityCounty.setGrade(3);
  36. String pname = map.get(pCode*10000);
  37. //省的直辖市逻辑
  38. if (cCode % 100 == 90) {
  39. proCityCounty.setPid(cCode*100);
  40. proCityCounty.setPid(pCode*10000);
  41. String fullName=pname+"-"+proCityCounty.getName();
  42. proCityCounty.setFullname(fullName);
  43. }else {
  44. String cName = map.get(cCode*100);
  45. proCityCounty.setPid(cCode*100);
  46. String fullName=pname+"-"+cName+"-"+proCityCounty.getName();
  47. proCityCounty.setFullname(fullName);
  48. }
  49. }
  50. }
  51. }
  52. proCityCountyService.saveBatch(array,1000);
  53. System.out.println((System.currentTimeMillis()-l)/1000+"秒");
  54. }

最终数据库

image.png
image.png

最终前端

前端用ztree
image.png