参考:https://juejin.cn/post/6988679718664929294 。BeautifulSoup:不能使用 XPath;lxml:可以使用 XPath。
方法一:lxml
from lxml import etree
html = etree.parse('./test.html', etree.HTMLParser()) # 从文件读取
html = etree.HTML(html_text) # 从 HTML 字符串解析(例如已下载的网页源码)
eles = html.xpath('//*')
div_content = etree.tostring(eles[0], pretty_print=True, method='html').decode('utf-8') # 转为字符串(未指定 encoding 时默认按 ASCII 序列化,中文等非 ASCII 字符会变成 &#...; 形式的字符引用)
content_html_str = etree.tostring(eles[0], encoding='utf-8',pretty_print=True, method='html').decode('utf-8') # 转为字符串(指定 utf-8 编码,中文等字符原样保留)
方法二:BeautifulSoup
from bs4 import BeautifulSoup
soup = BeautifulSoup('<p>Hello</p>', 'lxml')
print(soup.p.string)
soup = BeautifulSoup(markup, "lxml") # markup 是 HTML 的 str
# 标准缩进格式输出
print(soup.prettify())
soup = BeautifulSoup(html, 'lxml')
print(soup.title) # <title>The Dormouse's story</title>
print(type(soup.title)) # <class 'bs4.element.Tag'>
print(soup.title.string) # The Dormouse's story
print(soup.head) # <head><title>The Dormouse's story</title></head>
print(soup.p)
find(name, attrs, recursive, text, **kwargs)  # 返回第一个匹配的节点;bs4 4.4.0 起 text 参数更名为 string,旧名 text 已不推荐使用
soup = BeautifulSoup(html, 'lxml')
print(soup.find(name='ul'))
print(soup.find(attrs={'class': 'element'}))
print(soup.find(text=re.compile('.*?o.*?', re.S))) # 结果会返回匹配正则表达式的第一个节点的文本(结果不是节点)
用法与find()类似