input.xml

  1. <?xml version="1.0"?>
  2. <actors xmlns:fictional="http://characters.example.com"
  3. xmlns="http://people.example.com">
  4. <actor>
  5. <name>John Cleese</name>
  6. <fictional:character>Lancelot</fictional:character>
  7. <fictional:character>Archie Leach</fictional:character>
  8. </actor>
  9. <actor>
  10. <name>Eric Idle</name>
  11. <fictional:character>Sir Robin</fictional:character>
  12. <fictional:character>Gunther</fictional:character>
  13. <fictional:character>Commander Clement</fictional:character>
  14. </actor>
  15. </actors>

1 xml.dom.minidom(推荐*)

  # List every actor in input.xml together with the characters they played,
  # using the standard-library minidom DOM parser.
  from xml.dom import minidom

  # Namespace URI that input.xml binds to the ``fictional`` prefix.  Matching on
  # the URI rather than on the literal 'fictional:character' qualified name keeps
  # the lookup working even if a document chooses a different prefix.
  FICTIONAL_NS = 'http://characters.example.com'

  tree = minidom.parse('input.xml')
  actors = tree.getElementsByTagName('actor')
  for actor in actors:
      # The actor's name is the text node directly inside its first <name> child.
      name = actor.getElementsByTagName('name')[0].firstChild.nodeValue
      print(name)
      for character in actor.getElementsByTagNameNS(FICTIONAL_NS, 'character'):
          # Single-argument print() emits the same text on Python 2 and 3.
          print('%s %s' % (' |-->', character.firstChild.nodeValue))

image.png

2 bs4(不推荐)

需要将文件全部读入内存，因此只适用于文件不太大的情况

  # List every actor in input.xml together with the characters they played,
  # using BeautifulSoup.  The whole file is loaded into memory first.
  import bs4

  # Read the file exactly once: a second f.read() on the same handle returns an
  # empty string, so both parser attempts below must share this text.
  with open('input.xml') as f:
      markup = f.read()

  try:
      # Prefer the lxml parser, which is faster than the default one.
      soup = bs4.BeautifulSoup(markup, 'lxml')
  except bs4.FeatureNotFound:
      # lxml is not installed; fall back to the built-in default parser.
      soup = bs4.BeautifulSoup(markup, 'html.parser')

  # NOTE: soup.select('fictional:character') cannot be used for this prefixed
  # tag name, so find_all is used instead.
  for actor in soup.find_all('actor'):
      print(actor.find_all('name')[0].text)
      for character in actor.find_all('fictional:character'):
          # Single-argument print() emits the same text on Python 2 and 3.
          print('%s %s' % (' |-->', character.text))

3 xml.etree.ElementTree/cElementTree(推荐**)

  # List every actor in input.xml together with the characters they played,
  # using the standard-library ElementTree API with an explicit namespace map.
  try:
      import xml.etree.cElementTree as ET  # cElementTree is faster
  except ImportError:
      import xml.etree.ElementTree as ET

  # Same sample document that the other examples parse.
  infile = 'input.xml'
  tree = ET.parse(infile)
  root = tree.getroot()

  # The file declares two namespaces:
  #   prefixed: xmlns:fictional="http://characters.example.com"
  #   default:  xmlns="http://people.example.com"
  # Elements written without the ``fictional`` prefix belong to the default one.
  # The mapping below is our own; the prefixes ``foo`` and ``bar`` are arbitrary
  # names chosen for the lookups and need not match the prefixes in the file.
  ns = {
      'foo': 'http://people.example.com',
      'bar': 'http://characters.example.com',
  }
  for actor in root.findall('foo:actor', ns):
      print(actor.find('foo:name', ns).text)
      for character in actor.findall('bar:character', ns):
          # Single-argument print() emits the same text on Python 2 and 3.
          print('%s %s' % (' |-->', character.text))

4 lxml.etree(推荐*)

  # List every actor in input.xml together with the characters they played,
  # using lxml and the namespace map it exposes on the root element.
  import lxml.etree as ET

  tree = ET.parse('input.xml')
  root = tree.getroot()
  # nsmap gives the prefix -> URI mapping automatically; for input.xml it is
  # {'fictional': 'http://characters.example.com', None: 'http://people.example.com'}
  # (the default namespace is stored under the key None).
  ns = root.nsmap
  for actor in root.findall('actor', ns):
      print(actor.find('name', ns).text)
      for character in actor.findall('fictional:character', ns):
          # Single-argument print() emits the same text on Python 2 and 3.
          print('%s %s' % (' |-->', character.text))

5 xml.parsers.expat(不确定)

据说速度较快,不过用起来比较麻烦,还未测试

PS: 测试比较

image.png
GIF.gif