# 博客园 (cnblogs) - 《python 爬虫》 (Python web crawler)

from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
# DingTalk robot webhook that the digest message is posted to.
# NOTE(review): the access token is a credential committed in source; consider
# loading it from configuration or the environment instead.
dingtalk_url = "https://oapi.dingtalk.com/robot/send?access_token=856406880ce7735633eebc761d00eabb11fa0ca47cba8de800c74512bc11fdf2"

# Headers sent with the page request.
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2) AppleWebKit/537.36 (KHTML, like Gecko)',
}

url = 'https://www.cnblogs.com/'
# A timeout keeps the script from waiting forever if the site never answers.
resp = requests.get(url, headers=headers, timeout=10)
html_content = resp.text  # body of the fetched page
soup = BeautifulSoup(html_content, 'lxml')  # parse the page with the lxml parser

# # Find the div whose id is list-container.
# list_container = soup.find('div', id='post_list')
#
# # Find all <a> elements by their title class (titlelnk).
# for link in list_container.find_all('a', class_='titlelnk'):
#     print(link.get('href'), link.text)


def send_message(Dingtalk_url, title, message, timeout=10):
    """Post a markdown message to a DingTalk robot webhook.

    Args:
        Dingtalk_url: Webhook URL the JSON payload is POSTed to.
        title: Used as the markdown title and as the heading of the text.
        message: Markdown body placed under the heading.
        timeout: Seconds to wait for the webhook before giving up, so a
            stalled connection cannot hang the caller indefinitely.

    Returns:
        The ``requests.Response`` returned by the POST. Its raw content is
        also printed.
    """
    data = {
        'msgtype': 'markdown',
        'markdown': {
            'title': '%s\n\n' % title,
            'text': "\n### %s\n\n %s" % (title, message),
        },
        'at': {
            # Nobody is @-mentioned by this message.
            'atMobiles': [],
            'isAtAll': False,
        },
    }
    response = requests.post(
        Dingtalk_url,
        json=data,
        verify=True,
        timeout=timeout)
    print(response.content)
    return response

msg = "\n\n"

# Find the div whose id is post_list.
list_container = soup.find('div', id='post_list')

# Find the <a class="titlelnk"> links inside it. `class` is a reserved word,
# so BeautifulSoup takes the filter as `class_`. If the container is missing
# (soup.find returned None), fall back to no links instead of raising
# AttributeError.
div = list_container.find_all('a', class_='titlelnk') if list_container is not None else []

# Only the first five links go into the digest.
for link in div[:5]:
    href = link.get('href')
    print(href, link.text)

    # urljoin leaves absolute hrefs untouched and resolves relative ones
    # against the site that was actually crawled (the previous hard-coded
    # jianshu host produced broken links for cnblogs pages).
    target = urljoin("https://www.cnblogs.com/", href or "")
    # Each entry is a markdown link; link.text is always a string, whereas
    # link.string is None when the anchor contains nested tags.
    msg += "> ● [%s](%s)\n\n\n" % (link.text, target)

send_message(dingtalk_url, "博客园", msg)