lxml的使用
使用
lxml
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# https://lxml.de/tutorial.html lxml 文档
from lxml import etree
from copy import deepcopy
# Walk-through of the basic lxml.etree element API: building a tree,
# list-style access to children, sibling/parent navigation, copying,
# and attributes.

# Build <root> and attach three children in two different ways.
root = etree.Element("root")
print(root.tag)  # root
root.append(etree.Element("child1"))  # create the element, then attach it
child2 = etree.SubElement(root, "child2")  # create and attach in one call
child3 = etree.SubElement(root, "child3")
# b'<root>\n  <child1/>\n  <child2/>\n  <child3/>\n</root>\n'
print(etree.tostring(root, pretty_print=True))

# An element can be indexed and measured like a list of its children.
child = root[0]
print(child.tag)  # child1
print(len(root))  # 3
print(root.index(root[1]))  # 1

# Insert a new element at the front of the children.
root.insert(0, etree.Element("child0"))
for child in root:
    print(child.tag)

print(etree.iselement(root))  # check whether the object is an element
print(root is root[0].getparent())  # is root the parent of root[0]?
print(root[0] is root[1].getprevious())  # previous sibling
print(root[1] is root[0].getnext())  # next sibling

temp = deepcopy(root[1])  # copy a node
print(temp.tag)

# Keyword arguments to Element() become attributes of the new element.
root = etree.Element("root", id="root")
print(etree.tostring(root))
简易的例子
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
from lxml import etree
# Simple example: parse HTML from a string, parse a document from a file,
# and select elements with XPath.
text = '''
<div>
<ul>
<li class="item1">1</li>
<li class="item2">2</li>
<li class="item3">3</li>
</ul>
</div>
'''

# Parse the string; etree.HTML() returns an lxml.etree._Element.
html = etree.HTML(text)
result = etree.tostring(html, encoding='utf-8')
print(type(html))
print(type(result))
print(result.decode('utf-8'))
print("======================")

# Parse a file; etree.parse() returns an lxml.etree._ElementTree (a whole
# document), not a single _Element.
# NOTE: the path is relative, so the script assumes ./test/text.xml exists
# under the current working directory.
htmlEmt = etree.parse('./test/text.xml')
print(type(htmlEmt))
result = etree.tostring(htmlEmt, pretty_print=True)
print(result)

# Select every <li> element in the document.
result = htmlEmt.xpath('//li')
for r in result:
    print(etree.tostring(r))  # the element serialized as a byte string
    print(r.text)  # the element's text content
print("======================")

# Select elements by id:
#   //             match at any position in the document
#   li             the element name
#   [@id='root']   keep only elements whose id attribute is 'root'
result = htmlEmt.xpath("//li[@id='root']")
print(type(result))
print(len(result))
for r in result:
    print(etree.tostring(r))  # the element serialized as a byte string
    print(r.text)  # the element's text content