参考: https://juejin.cn/post/6988679718664929294
BeautifulSoup: 不支持 XPath; lxml: 支持 XPath
方法一:lxml
# Parse HTML with lxml, select nodes with XPath, and serialize a node back
# to an HTML string.
from lxml import etree

# The two assignments below are alternatives; keep the one that matches
# where the HTML comes from.
html = etree.parse('./test.html', etree.HTMLParser())  # read from a file
# NOTE(review): html_text is not defined in this snippet; it is presumably
# the HTML source string fetched from a web page -- supply it before running.
html = etree.HTML(html_text)  # build from HTML obtained from a web page

eles = html.xpath('//*')

# Convert the first matched element to a string. Without an explicit
# encoding, tostring() emits ASCII bytes and escapes non-ASCII characters
# as character references.
div_content = etree.tostring(
    eles[0], pretty_print=True, method='html'
).decode('utf-8')

# Same conversion, but serialized as UTF-8 so non-ASCII text is kept as-is.
content_html_str = etree.tostring(
    eles[0], encoding='utf-8', pretty_print=True, method='html'
).decode('utf-8')
方法二:BeautifulSoup
# Basic BeautifulSoup usage: build a soup with the lxml parser and access
# tags as attributes.
from bs4 import BeautifulSoup

soup = BeautifulSoup('<p>Hello</p>', 'lxml')
print(soup.p.string)

# NOTE(review): markup is not defined in this snippet; it is the HTML source
# as a str and must be supplied before running.
soup = BeautifulSoup(markup, "lxml")
# Print the document with standard indentation.
print(soup.prettify())

# NOTE(review): html is not defined in this snippet; the expected outputs
# below suggest it is a sample document titled "The Dormouse's story".
soup = BeautifulSoup(html, 'lxml')
print(soup.title)  # <title>The Dormouse's story</title>
print(type(soup.title))  # <class 'bs4.element.Tag'>
print(soup.title.string)  # The Dormouse's story
print(soup.head)  # <head><title>The Dormouse's story</title></head>
print(soup.p)
# Signature: find(name, attrs, recursive, text, **kwargs)
# NOTE(review): newer Beautiful Soup releases name the text argument
# `string`; `text` is kept here to match the notes -- confirm against the
# installed bs4 version.
import re

# NOTE(review): html is not defined in this snippet; it is the HTML source
# to parse and must be supplied before running.
soup = BeautifulSoup(html, 'lxml')
print(soup.find(name='ul'))
print(soup.find(attrs={'class': 'element'}))
# Returns the text of the first node matching the regular expression
# (the result is the text, not the node itself).
print(soup.find(text=re.compile('.*?o.*?', re.S)))
用法与find()类似
