Day01

1.初识MongoDB数据库

将小说的每一行都保存在mongodb中

  import pymongo

  # Store every line of the novel as one document in MongoDB.
  client = pymongo.MongoClient('localhost', 27017)
  walden = client['walden']
  sheet_tab = walden['sheet_tab']

  path = '/Users/Hou/Desktop/walden.txt'
  # Fixed: the original opened the undefined name `path2`; the variable
  # defined above is `path`.
  with open(path, 'r') as f:
      lines = f.readlines()

  # One document per line: its position, its raw text and its word count.
  for index, line in enumerate(lines):
      data = {
          'index': index,
          'line': line,
          'words': len(line.split()),
      }
      sheet_tab.insert_one(data)

  # Query operators $lt/$lte/$gt/$gte/$ne correspond to </<=/>/>=/!=
  # (l = less, g = greater, e = equal, n = not).
  # Print every stored line that has fewer than 5 words.
  for item in sheet_tab.find({'words': {'$lt': 5}}):
      print(item)

image.png

Day02

1.爬取58同城的多个页面的数据(不一定成功的)

分析流程

  1. 观察页面特征(不同的页面是否具有一致性)

不同页面不同规则的问题,58同城的分页问题

  1. 设计工作流程(确保工作能高效运行)

设定有两个爬虫:spider1(先爬取所有的列表页,保存到url_list表中)和spider2(从表url_list中获取数据爬取详情页,将数据保存到item_info表中)。
Week02 - 图2

1.1 爬取58的所有分类页

  from bs4 import BeautifulSoup
  import requests

  headers = {
      # The value is empty in these notes; a real browser User-Agent string
      # presumably has to be supplied before running.
      'user-agent': ''
  }
  start_url = 'https://gz.58.com/sale.shtml'
  url_host = 'https://gz.58.com'


  def get_channel_url(url):
      """Print and return the absolute URL of every category link on `url`.

      Args:
          url: Address of the page that lists the categories.

      Returns:
          A list of absolute category URLs, each built as
          `url_host` + the link's `href`.
      """
      # Fixed: the original ignored `url` and always fetched `start_url`.
      wb_data = requests.get(url, headers=headers)
      soup = BeautifulSoup(wb_data.text, 'lxml')
      # Selector recorded in the original notes for a single category link:
      #   #ymenu-side > ul > li:nth-child(1) > ul > li:nth-child(1) > b > a
      # The shorter selector below is the one actually used.
      links = soup.select('ul.ym-submnu > li > b > a')
      print(links)
      page_urls = []
      for link in links:
          href = link.get('href')
          if not href:
              # An anchor without href cannot be joined onto url_host.
              continue
          page_url = url_host + href
          print(page_url)
          page_urls.append(page_url)
      return page_urls


  # NOTE: this call is at module level, so it also runs on import.
  get_channel_url(start_url)

image.png

1.2 根据url解析页面(获取商家链接)

  from bs4 import BeautifulSoup
  import requests
  import time
  import pymongo

  client = pymongo.MongoClient('localhost', 27017)
  tmp_58city = client['tmp_test']
  url_list = tmp_58city['url_list']
  item_info = tmp_58city['item_info']

  # Fixed: both spiders pass `headers` to requests.get, but this listing
  # never defined it. The value mirrors the empty placeholder used in the
  # category listing of these notes.
  headers = {
      'user-agent': ''
  }


  # spider 1
  def get_links_from(channel, pages, who_sells=0):
      """Store the item links found on one list page in `url_list`.

      The list page address is built as `<channel><who_sells>/pn<pages>`,
      for example https://bj.58.com/bijiben/0/pn2/ (per the original notes,
      0 means individual sellers and pn2 means page 2).

      Args:
          channel: Channel base URL, ending with a slash.
          pages: Page number to fetch.
          who_sells: Seller-type path segment; defaults to 0.
      """
      list_view = '{}{}/pn{}'.format(channel, str(who_sells), str(pages))
      wb_data = requests.get(list_view, headers=headers)
      time.sleep(2)  # pause after each request
      soup = BeautifulSoup(wb_data.text, 'lxml')
      # Without any <td class="t"> cell there is nothing to collect.
      if not soup.find('td', 't'):
          return
      for link in soup.select('td.t a.t'):
          href = link.get('href')
          if not href:
              # Skip anchors that carry no href instead of crashing on None.
              continue
          # Drop the query string so only the bare item URL is stored.
          item_link = href.split('?')[0]
          url_list.insert_one({'url': item_link})
          print(item_link)


  # spider 2
  def get_item_info(url):
      """Fetch one item page and store its details in `item_info`.

      Inserts a document with the keys 'title', 'price', 'date' and 'area'.
      'area' is None when the page has no matching area element.

      Args:
          url: Address of the item detail page.

      Raises:
          IndexError: If the price or date element is missing from the page.
      """
      wb_data = requests.get(url, headers=headers)
      # Fixed: the original passed the undefined name `wb_data_text`.
      soup = BeautifulSoup(wb_data.text, 'lxml')
      title = soup.title.text
      price = soup.select('span.price.c_f50')[0].text
      date = soup.select('.time')[0].text
      # Keep the original guard on span.c_25d, and also require that the
      # link it indexes exists, so a span without a link yields None
      # instead of an IndexError.
      area_links = soup.select('.c_25d a')
      if soup.find_all('span', 'c_25d') and area_links:
          area = list(area_links[0].stripped_strings)
      else:
          area = None
      item_info.insert_one({'title': title, 'price': price, 'date': date, 'area': area})


  # NOTE: this call is at module level, so it also runs on import.
  get_links_from('https://gz.58.com/bijiben/',2)

image.png

1.3 根据商家链接爬取详细信息

  path = './test/test.html'
  # Parse a locally saved item page. The `with` block closes the file once
  # BeautifulSoup has read it; the original left the handle open.
  with open(path, 'r', encoding='utf-8') as html_page:
      soup = BeautifulSoup(html_page, 'lxml')
  title = soup.title.text
  price = soup.select('span.infocard__container__item__main__text--price')[0].text.strip()
  date = soup.select('div.detail-title > div.detail-title__info > div:nth-child(1)')[0].text
  # select() already returns a list, so the extra list() call was dropped.
  area = soup.select('div.infocard__container__item__main a')[0].text
  # The database write is left disabled, as in the original notes:
  # item_info.insert_one({'title': title, 'price': price, 'date': date, 'area': area})
  ans = {'title': title, 'price': price, 'date': date, 'area': area}
  print(ans)

image.png

1.4 多进程爬虫的数据获取

创建main.py文件调用上述的几个函数
1.导入所需要的库
2.用函数填入页码
3.创建进程池

  from multiprocessing import Pool
  # Fixed import typos: `ahcnnel_extract` -> `channel_extract` (assumed
  # module filename; confirm against the project) and
  # `get_links_form` -> `get_links_from`, the name actually called below.
  from channel_extract import channel_list
  from page_parsing import get_links_from


  def get_all_links_from(channel):
      """Crawl list pages 1 through 100 of one channel.

      Args:
          channel: Channel base URL passed on to `get_links_from`.
      """
      for num in range(1, 101):
          get_links_from(channel, num)


  if __name__ == '__main__':
      # channel_list is split on whitespace; presumably it is one string of
      # channel URLs separated by whitespace.
      # map() blocks until every channel is done, so leaving the `with`
      # block afterwards only releases the worker processes.
      with Pool() as pool:
          pool.map(get_all_links_from, channel_list.split())