Web Scraping Notes
Topics covered so far:

- `requests`: sending HTTP requests
- `BeautifulSoup`: fetching and filtering data from static page source
- `pymysql`: MySQL client library
- `json`: JSON format handling
- `time`: `time.sleep(1)` to wait
- `urllib`: `urllib.request` can also send requests
- `selenium`: gets the source of dynamically rendered pages and simulates browser actions
- `re`: regular expressions
Rough basic flow:

```python
import requests

url = 'http://www.baidu.com'
r = requests.get(url)
print(r.encoding)           # encoding declared by the response headers
print(r.apparent_encoding)  # encoding guessed from the response body
# In some cases Chinese text comes out garbled; fix it with:
# r.encoding = r.apparent_encoding
print(r.text)               # page source
```
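In practice many sites refuse bare requests, so it helps to send a browser-like User-Agent and check the response status. A minimal sketch (the header value is just an example):

```python
import requests

headers = {'User-Agent': 'Mozilla/5.0'}  # example value; any browser-like UA works
r = requests.get('http://www.baidu.com', headers=headers, timeout=10)
r.raise_for_status()                 # raises HTTPError on 4xx/5xx responses
r.encoding = r.apparent_encoding     # avoid garbled Chinese text
print(r.status_code)                 # 200 on success
print(r.text[:200])                  # first 200 characters of the source
```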
Connecting to a MySQL database:

```python
import pymysql

try:
    db = pymysql.Connect(
        host='localhost',
        port=3306,
        user='root',
        password='your_password',
        db='database_name',   # name of the database, not a table
        charset='utf8'
    )
    cur = db.cursor()  # cursor
    insert = "insert into weather values('%s','%s','%s','%s')"
    data = ('a', 'b', 'c', 'd')
    cur.execute(insert % data)  # this style is a bit unusual
    db.commit()
except:
    print('Something went wrong...')
finally:
    cur.close()
    db.close()
```
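A note on that INSERT: building the SQL string with `%` means you do the quoting yourself, which breaks on values containing quotes and opens the door to SQL injection. pymysql will escape the values for you if you pass them as the second argument to `execute`. A minimal sketch against the same hypothetical `weather` table:

```python
import pymysql

db = pymysql.Connect(host='localhost', port=3306, user='root',
                     password='your_password', db='database_name', charset='utf8')
cur = db.cursor()
# Placeholders are bare %s with no quotes around them;
# pymysql quotes and escapes each value itself.
cur.execute("insert into weather values(%s, %s, %s, %s)", ('a', 'b', 'c', 'd'))
db.commit()
cur.close()
db.close()
```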
Extracting data:

```python
from bs4 import BeautifulSoup

def getone(string):
    soup = BeautifulSoup(string, 'lxml')  # first parse the source into a bs object
    # this form finds every tag whose class is board-index
    index = soup.find_all(attrs={'class': 'board-index'})

    # Similar ways to locate tags or tag attributes:
    # directly by tag name
    soup.title
    soup.p
    soup.a.name
    soup.a.parent
    # via attrs
    soup.a.attrs['href']  # get the href of an <a> tag
    # via find: a single match
    soup.find(id='newtitle')
    # via find_all: all matches (class is a Python keyword, so bs4 spells it class_)
    soup.find_all(class_='abc')
    soup.find_all(attrs={'class': 'abc'})
    # via select, with CSS selectors
    soup.select('a > .main_img')
    soup.select('#article_content > div > ul:nth-child(24)')  # can be copied straight from the browser
    # get a node's text
    atag = soup.find(id='new').string
```
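Putting the two pieces together: a minimal sketch that downloads a page and lists every link on it. It uses the built-in `html.parser` so it runs even without lxml installed:

```python
import requests
from bs4 import BeautifulSoup

r = requests.get('http://www.baidu.com')
r.encoding = r.apparent_encoding              # avoid garbled Chinese text
soup = BeautifulSoup(r.text, 'html.parser')   # stdlib parser, no extra install

for a in soup.find_all('a'):
    # some <a> tags lack an href or text, so fall back to defaults
    print(a.get_text(strip=True), a.get('href', ''))
```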
Extracting data from dynamic sites
- Using selenium requires installing the library itself, plus the driver for the browser you use
```python
from selenium import webdriver
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

driver = webdriver.Chrome()              # load the driver
driver.get('http://www.baidu.com')       # fetch the page
elem = driver.find_element_by_id('new')  # find an element by some condition

# Available locators:
# single element
#   find_element_by_id
#   find_element_by_name
#   find_element_by_xpath
#   find_element_by_link_text
#   find_element_by_partial_link_text
#   find_element_by_tag_name
#   find_element_by_class_name
#   find_element_by_css_selector
# multiple elements: add an s, and you get a list back
#   find_elements_by_name
#   find_elements_by_xpath
#   find_elements_by_link_text
#   find_elements_by_partial_link_text
#   find_elements_by_tag_name
#   find_elements_by_class_name
#   find_elements_by_css_selector
# or put a By.XXX inside the call
#   find_element(By.ID, 'new')
#   find_elements(By.CLASS_NAME, 'abc')

# get an attribute or content
elem = driver.find_element_by_id('new')
elem.get_attribute('href')

# wait up to 5 seconds (polling every 0.5 s) for the elements to appear
lily = WebDriverWait(driver, 5, 0.5).until(
    EC.presence_of_all_elements_located((By.CSS_SELECTOR, '.fcg'))
)
```
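When a script only needs the rendered source, Chrome can also run headless so no window pops up. A minimal sketch, assuming Chrome and a matching chromedriver are on the PATH:

```python
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

options = Options()
options.add_argument('--headless')   # render the page without opening a window
driver = webdriver.Chrome(options=options)
driver.get('http://www.baidu.com')
print(driver.title)                  # page title, proving the page rendered
html = driver.page_source            # rendered source, ready for BeautifulSoup
driver.quit()
```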
Simulating actions:

```python
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

browser = webdriver.Chrome()
try:
    browser.get('http://www.baidu.com')
    ainput = browser.find_element_by_id('kw')  # find the input box with id kw
    ainput.send_keys('python')                 # type into the box
    ainput.send_keys(Keys.ENTER)               # press Enter
    wait = WebDriverWait(browser, 10)          # wait up to 10 seconds
    wait.until(EC.presence_of_element_located((By.ID, 'content_left')))
    print(browser.current_url)    # URL of the current page
    print(browser.page_source)    # source of the current page
except:
    print('Something went wrong')
finally:
    browser.close()
```
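Besides typing and pressing Enter, selenium can click elements directly or chain lower-level mouse actions with ActionChains. A minimal sketch; the id 'su' (Baidu's search button) may change over time, so treat it as an example:

```python
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains

browser = webdriver.Chrome()
browser.get('http://www.baidu.com')

btn = browser.find_element_by_id('su')  # example id: Baidu's search button

# option 1: click the element directly
# btn.click()

# option 2: chain lower-level mouse actions, then run them with perform()
actions = ActionChains(browser)
actions.move_to_element(btn).click().perform()

browser.quit()
```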