"""
<?xml version="1.0" encoding="UTF-8"?>
<Pensons>
<Penson id="1" cc="zc">
<name>name</name>
<sex>male</sex>
<age>30</age>
</Penson>
<Penson id="2" cc="cz">
<name>name</name>
<sex>male</sex>
<age>30</age>
</Penson>
</Pensons>
"""
from xml.etree.ElementTree import parse
import re
# Location of the XML document to walk (hard-coded Windows path).
xml_path = r'D:\Demo\Demo.xml'
# Parse the file into an ElementTree object.
doc = parse(xml_path)
# Root element of the parsed tree; the traversal starts here.
root = doc.getroot()
# Module-level depth value. NOTE(review): the functions in this file take
# their own `layer` parameter, so this global may be unused - confirm.
layer = 0
def getElementInfo(obj, layer, parentpath):
    """Summarise one XML element as a tuple of strings.

    Args:
        obj: Element to describe; its ``tag``, ``keys()`` and ``text`` are read.
        layer: Depth of the element in the tree; stringified into the result.
        parentpath: Path string for the element; returned unchanged.

    Returns:
        ``(parentpath, objname, objattr, objvalue, objlayer)`` where
        ``objname`` is the tag, ``objattr`` is the ``str()`` of the list of
        attribute names, ``objvalue`` is the element text (or ``"None"`` when
        the text is missing or whitespace-only) and ``objlayer`` is
        ``str(layer)``.
    """
    objname = str(obj.tag)
    # Only the attribute names are reported, rendered as a list literal.
    objattr = str(obj.keys())
    text = obj.text
    # Missing text and pure indentation/newline filler are both reported as
    # "None". isspace() requires the WHOLE string to be whitespace, so real
    # text that merely starts with a space is kept.
    objvalue = "None" if text is None or text.isspace() else str(text)
    objlayer = str(layer)
    return (parentpath, objname, objattr, objvalue, objlayer)
def iterobjxml(obj, layer, parentpath, xmldata):
    """Collect a getElementInfo record for *obj* and all of its descendants.

    Records are appended to *xmldata* in document (pre-)order.

    Args:
        obj: Element to start from.
        layer: Depth of *obj*; each level of children gets ``layer + 1``.
        parentpath: Path of the parent; ``"->" + tag`` is appended for *obj*.
        xmldata: List that receives the records (mutated in place).
    """
    parentpath += "->" + str(obj.tag)
    xmldata.append(getElementInfo(obj, layer, parentpath))
    # Iterate the element directly: Element.getchildren() was removed in
    # Python 3.9, and iterating yields the same direct children.
    for child in obj:
        iterobjxml(child, layer + 1, parentpath, xmldata)
# Walk the whole tree from the root, then print one record per element.
xmllist = []
iterobjxml(root, 0, '', xmllist)
for rlist in xmllist:
    print(rlist)
# ref: http://blog.csdn.net/menglei8625/article/details/7494509
# target: to parse xml
# ref: http://blog.csdn.net/menglei8625/article/details/7494509
from xml.etree.ElementTree import parse
import re
# demo.xml
"""
<?xml version="1.0"?>
<stop>
<id a='a' b='b' c='c'>14791</id>
<nm>Clark &amp; Balmoral</nm>
<sri>
<rt>22</rt>
<d>North Bound</d>
<dd>North Bound</dd>
</sri>
<cr>22</cr>
<pre>
<pt>5 MIN</pt>
<fd>Howard</fd>
<v>1378</v>
<rn>22</rn>
</pre>
</stop>
"""
# xml file path
xml_path = r'.\demo.xml'
print('{:-^20}'.format('start'))
# 1. Open (parse) the XML document
doc = parse(xml_path)
# 2. Get the root node
root = doc.getroot()
print("root:"+str(root))
# 3. Get the element name (tag)
print("root.tag:"+str(root.tag))
# 4. Get the element attributes
print("root.attrib:"+str(root.attrib))
# 5. Get the element value (text)
print("root.text:"+ascii(root.text))
# 6. Get the element tail (rarely used)
print("root.tail:"+ascii(root.tail))
print('{:-^20}'.format(''))
# Collects an element's name, attributes and value and returns them as a tuple
def getElementInfomation(obj):
    """Return ``(name, attributes, value)`` for one XML element, as strings.

    Args:
        obj: Element to describe; its ``tag``, ``attrib`` and ``text`` are read.

    Returns:
        A 3-tuple ``(objname, objattr, objvalue)``: the tag, the ``str()`` of
        the attribute dict (``"{}"`` when there are no attributes), and the
        element text, or ``"None"`` when the text is missing or consists only
        of whitespace such as spaces, newlines and tabs.
    """
    objname = str(obj.tag)
    # An element without attributes shows up as an empty dict: {}
    objattr = str(obj.attrib)
    text = obj.text
    # isspace() is true only when the WHOLE text is whitespace, so a real
    # value that merely begins with whitespace is still reported.
    objvalue = "None" if text is None or text.isspace() else str(text)
    return (objname, objattr, objvalue)
# Show what getElementInfomation returns for the root element
print(getElementInfomation(root))
print('{:-^20}'.format(''))
# 7. If root has child elements, summarise each direct child
for child in root:
    print(getElementInfomation(child))
print('{:-^20}'.format(''))
# Traverse the tree starting from a given node
def go_through(obj):
    """Print the summary tuple of *obj* and of every descendant.

    Elements are visited depth-first in document order: an element is
    printed before its children, and children are printed left to right.
    """
    pending = [obj]
    while pending:
        current = pending.pop()
        print(getElementInfomation(current))
        # Push children reversed so the first child is popped (visited) next.
        pending.extend(reversed(list(current)))
# Show the effect of go_through, starting from the root element
go_through(root)
print('{:-^20}'.format(''))
# Same traversal, with the layer (depth) number added
def go_through_layer(obj, layer: int):
    """Print ``[layer, name, attributes, value]`` for *obj* and its descendants.

    Args:
        obj: Element to start from.
        layer: Depth number printed for *obj*; each level of children is
            printed with the depth increased by one.
    """
    # Prepend the depth to the (name, attributes, value) summary.
    row = [layer, *getElementInfomation(obj)]
    print(row)
    for child in obj:
        go_through_layer(child, layer + 1)
layer_level = 0
go_through_layer(root, layer_level)
print('{:-^20}'.format(''))

# find(): first matching element among the direct children
print('{:*^20}'.format(''))
print(root.find('.'))  # the root element itself
pre_element = root.find('pre')  # the direct children are id/nm/sri/cr/pre
print(pre_element)
go_through_layer(pre_element, 1)
print(root.find('./sri/rt'))  # a path expression reaches deeper elements

# findall(): every matching element among the direct children
print('{:*^20}'.format(''))
print(root.findall('.'))  # the root element itself
sri_elements = root.findall('sri')  # the direct children are id/nm/sri/cr/pre
print(sri_elements)
for i in sri_elements:
    go_through_layer(i, 1)
print(root.findall('./sri/rt'))  # a path expression reaches deeper elements

# findtext(): text of the first matching element among the direct children
print('{:*^20}'.format(''))
print(ascii(root.findtext('id')))
print(ascii(root.findtext('sri')))

# The calls below return iterators that can be used to walk the elements
print('{:+<20}'.format('Left'))  # left-aligned
# iter(): iterate with the current element as the root
for y in root.iter():
    print(y)
print('{:+^20}'.format('center'))  # centered
# iterfind(): iterate over the elements matching the given name
for y in root.iterfind('sri'):
    print(y)
print('{:+>20}'.format('right'))  # right-aligned
# itertext(): iterate from the current element, yielding text values
for y in root.itertext():
    print(ascii(y))