lxml的使用

使用

lxml

  #!/usr/bin/env python3
  # -*- coding: utf-8 -*-
  # Walk-through of the basic lxml.etree Element API: building a tree,
  # indexing children, inserting, navigating siblings and copying nodes.
  # lxml tutorial: https://lxml.de/tutorial.html
  from copy import deepcopy

  from lxml import etree

  root = etree.Element("root")
  print(root.tag)  # root

  # Children can be attached with append() or created in place with SubElement().
  root.append(etree.Element("child1"))
  child2 = etree.SubElement(root, "child2")
  child3 = etree.SubElement(root, "child3")

  # b'<root>\n  <child1/>\n  <child2/>\n  <child3/>\n</root>\n'
  print(etree.tostring(root, pretty_print=True))

  # An Element behaves like a list of its children.
  child = root[0]
  print(child.tag)  # child1
  print(len(root))  # 3
  print(root.index(root[1]))  # 1

  # Insert a new first child; the existing children shift right by one.
  root.insert(0, etree.Element("child0"))
  for node in root:
      print(node.tag)  # child0, child1, child2, child3

  print(etree.iselement(root))  # True: root is an Element
  print(root is root[0].getparent())  # True: root is the parent of root[0]
  print(root[0] is root[1].getprevious())  # True: previous sibling
  print(root[1] is root[0].getnext())  # True: next sibling

  # deepcopy() duplicates a node; the copy is detached from the original tree.
  temp = deepcopy(root[1])
  print(temp.tag)  # child1

  # Attributes can be passed as keyword arguments when creating an Element.
  root = etree.Element("root", id="root")
  print(etree.tostring(root))  # b'<root id="root"/>'

简易的例子

  #!/usr/bin/env python3
  # -*- coding: utf-8 -*-
  # Minimal lxml example: parse HTML from a string, parse XML from a file,
  # and select elements with XPath.
  from lxml import etree

  text = '''
  <div>
  <ul>
  <li class="item1">1</li>
  <li class="item2">2</li>
  <li class="item3">3</li>
  </ul>
  </div>
  '''

  # etree.HTML() parses a string and returns the root lxml.etree._Element.
  html = etree.HTML(text)
  result = etree.tostring(html, encoding='utf-8')
  print(type(html))
  print(type(result))
  print(result.decode('utf-8'))
  print("======================")

  # etree.parse() reads a file and returns an lxml.etree._ElementTree
  # (the whole document), not a single _Element.
  htmlEmt = etree.parse('./test/text.xml')
  print(type(htmlEmt))
  result = etree.tostring(htmlEmt, pretty_print=True)
  print(result)

  # Select every <li> element in the document.
  result = htmlEmt.xpath('//li')
  for r in result:
      print(etree.tostring(r))  # serialized element
      print(r.text)  # text content of the element
  print("======================")

  # Select elements by id:
  #   //             match at any depth in the document
  #   li             element name
  #   [@id='root']   keep only elements whose id attribute equals "root"
  result = htmlEmt.xpath("//li[@id='root']")
  print(type(result))
  print(len(result))
  for r in result:
      print(etree.tostring(r))  # serialized element
      print(r.text)  # text content of the element