# http://www.17989.com/xiaohua/5.htm
# Scrape joke titles and bodies from 17989.com listing pages into xiaohua.csv.
import csv

import requests
from fake_useragent import UserAgent
from lxml import etree

# Accumulates one {'title', 'content'} dict per joke across all pages.
# BUG FIX: originally defined inside the __main__ guard, so get_data()'s
# global reference failed with NameError when the module was imported.
data_list = []


def download(url):
    """Fetch *url* with a random User-Agent and return the decoded HTML text."""
    headers = {
        'User-Agent': UserAgent().random
    }
    response = requests.get(url=url, headers=headers)
    # BUG FIX: the original assigned the invalid codec name 'utf-=8',
    # which makes .text raise LookupError when decoding the body.
    response.encoding = 'utf-8'
    return response.text


def get_data(html):
    """Parse one listing page, append its jokes to data_list, and return it."""
    xml = etree.HTML(html)
    for li in xml.xpath('//div[@class="module articlelist"]/ul/li'):
        titles = li.xpath('.//div[@class="hd"]/text()')
        content = li.xpath('.//pre/text()')
        data_list.append({
            # Guard against a missing title node instead of raising IndexError.
            'title': titles[0] if titles else '',
            'content': ' '.join(part.strip() for part in content),
        })
    return data_list


def saveData():
    """Write every scraped row to xiaohua.csv (UTF-8 with BOM, Excel-friendly)."""
    headers = ['title', 'content']
    with open('xiaohua.csv', 'w', encoding='utf-8-sig', newline='') as file:
        writer = csv.DictWriter(file, fieldnames=headers)
        writer.writeheader()
        writer.writerows(data_list)


def get_next_page(html):
    """Return the absolute URL of the next listing page, or None on the last page."""
    xml = etree.HTML(html)
    hrefs = xml.xpath('//div[@class="page"]/a[text()="下一页"]/@href')
    # BUG FIX: the original indexed [0] unconditionally and crashed with
    # IndexError on the final page, before the caller's `if not url` check.
    return ("http://www.17989.com" + hrefs[0]) if hrefs else None


if __name__ == '__main__':
    url = 'http://www.17989.com/xiaohua/1.htm'
    # Crawl at most 99 pages (as the original `while i < 100` loop did),
    # stopping early when no "next page" link exists.
    for _ in range(99):
        html = download(url)
        get_data(html)
        url = get_next_page(html)
        if not url:
            break
    saveData()