python - 百度采集思路 - 《代码》

首先打开搜索界面，然后打开检查

用 res=browser.find_elements_by_class_name('result') 抓取页面的内容

x=browser.find_element_by_id('page')

然后抓取page里面的a标签：links=x.find_elements_by_tag_name('a')
然后用for循环遍历links：for a in links:
如果 page_num in a.get_attribute('href')，就点击该链接翻到下一页

百度采集

from time import sleep

import requests
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
# Browser-like request headers sent with every requests.get() call in this
# script, so the request carries a desktop Chrome User-Agent.
headers = {
    "Accept": (
        "text/html,application/xhtml+xml,"
        "application/xml;q=0.9,*/*;q=0.8"
    ),
    "Accept-Encoding": "gzip, deflate, compress",
    "Accept-Language": "en-us;q=0.5,en;q=0.3",
    "Cache-Control": "max-age=0",
    "Connection": "keep-alive",
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/90.0.4430.85 Safari/537.36"
    ),
}
def baidu_url():
    """Crawl Baidu result pages for the query ``inurl:asp?id=``.

    Opens Chrome, submits the query on https://www.baidu.com, and then, for
    each result page, passes the elements with class ``result`` to ``zhuqu``
    and clicks the pager link whose ``href`` contains the next ``&pn=``
    offset (10, 20, ... 990). Crawling stops early when the pager has no
    such link. The browser is closed when the crawl ends, including on error.
    """
    browser = webdriver.Chrome()
    try:
        browser.get('https://www.baidu.com')
        # Type the query and press Enter in a single call on the search box.
        browser.find_element(By.ID, 'kw').send_keys('inurl:asp?id=', Keys.ENTER)
        for p in range(10, 1000, 10):
            page_num = '&pn=' + str(p)
            # Fixed pause so the result page has time to load before scraping.
            sleep(5)
            zhuqu(browser.find_elements(By.CLASS_NAME, 'result'))
            pager = browser.find_element(By.ID, 'page')
            for a in pager.find_elements(By.TAG_NAME, 'a'):
                href = a.get_attribute('href')
                # Anchors without an href yield None; skip them.
                if href and page_num in href:
                    a.click()
                    break
            else:
                # No link to the next offset: stop instead of scraping the
                # same page again on every remaining iteration.
                print('未找到下一页链接，停止抓取')
                break
            print('当前抓取第%s 页内容' % (p // 10 + 1))
    finally:
        # Always release the Chrome/driver process.
        browser.quit()
def zhuqu(res):
    """Resolve each search result to its target URL and append it to a file.

    For every element in ``res`` the first ``<a>`` tag is read, its ``href``
    is requested without following redirects, and the ``Location`` response
    header is taken as the real URL. The link text and the URL are printed,
    and the URL is appended to ``d:/baidu_url.txt``, one per line.

    Args:
        res: iterable of Selenium elements, each containing an ``<a>`` tag.
    """
    if not res:
        # Nothing to resolve; do not touch the output file.
        return
    # Open the output file once and let the context manager close it.
    with open('d:/baidu_url.txt', 'a', encoding='utf-8') as f:
        for item in res:
            link = item.find_element(By.TAG_NAME, 'a')
            t = link.text
            zu = link.get_attribute('href')
            try:
                baiurl = requests.get(
                    zu, headers=headers, allow_redirects=False, timeout=10
                )
            except requests.RequestException as err:
                # One unreachable or malformed link must not abort the crawl.
                print('请求失败，已跳过: %s (%s)' % (zu, err))
                continue
            # A non-redirect response has no Location header; fall back to
            # the link itself instead of raising KeyError.
            true_url = baiurl.headers.get('Location', zu)
            print(t)
            print(true_url)
            f.write(true_url + '\n')
baidu_url()

模拟登录

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from time import sleep

# Open the login page at https://localhost:3443, fill in the e-mail and
# password fields, and click the login button.
# NOTE(review): the credentials below are hard-coded in source; consider
# loading them from the environment or a config file kept out of version
# control.
br = webdriver.Chrome()
br.get('https://localhost:3443/#/login')
# send_keys() and click() return nothing, so their results are not stored
# (the original bound them to `input`, shadowing the builtin, and to `dj`).
br.find_element(By.ID, 'txtEmailAddress').send_keys('2920015236@qq.com')
br.find_element(By.ID, 'txtPassword').send_keys('x5201314..')
br.find_element(By.ID, 'btnLogin').click()