https://juejin.cn/post/6988679718664929294 BeautifulSoup: 不能使用 XPath;lxml: 可以使用 XPath

方法一:lxml

  1. from lxml import etree
  2. html = etree.parse('./test.html', etree.HTMLParser()) # 从文件读取
  3. html = etree.HTML(html_text) # 从网页获取
  4. eles = html.xpath('//*')
  5. div_content = etree.tostring(eles[0], pretty_print=True, method='html').decode('utf-8') # 转为字符串
  6. content_html_str = etree.tostring(eles[0], encoding='utf-8',pretty_print=True, method='html').decode('utf-8') # 转为字符串

方法二:BeautifulSoup

  1. from bs4 import BeautifulSoup
  2. soup = BeautifulSoup('<p>Hello</p>', 'lxml')
  3. print(soup.p.string)
  4. soup = BeautifulSoup(markup, "lxml") # markup 是 HTML 的 str
  5. # 标准缩进格式输出
  6. print(soup.prettify())
  7. soup = BeautifulSoup(html, 'lxml')
  8. print(soup.title) # <title>The Dormouse's story</title>
  9. print(type(soup.title)) # <class 'bs4.element.Tag'>
  10. print(soup.title.string) # The Dormouse's story
  11. print(soup.head) # <head><title>The Dormouse's story</title></head>
  12. print(soup.p)
  1. find(name, attrs, recursive, text, **kwargs)
  2. soup = BeautifulSoup(html, 'lxml')
  3. print(soup.find(name='ul'))
  4. print(soup.find(attrs={'class': 'element'}))
  5. print(soup.find(text=re.compile('.*?o.*?', re.S))) # 需要先 import re;结果会返回匹配正则表达式的第一个节点的文本(结果不是节点);新版 bs4 中 text 参数已更名为 string
  1. 用法与find()类似