xml parse

上一篇 / 下一篇  2016-10-26 21:06:09 / 个人分类:python xml

# Goal: parse an XML file with xml.etree.ElementTree
# ref: http://blog.csdn.net/menglei8625/article/details/7494509

from xml.etree.ElementTree import parse

import re

# demo.xml
"""
<?xml version="1.0"?>
<stop>
<id a='a' b='b' c='c'>14791</id>
<nm>Clark &amp; Balmoral</nm>
<sri>
<rt>22</rt>
<d>North Bound</d>
<dd>North Bound</dd>
</sri>
<cr>22</cr>
<pre>
<pt>5 MIN</pt>
<fd>Howard</fd>
<v>1378</v>
<rn>22</rn>
</pre>
</stop>
"""


# Windows-style relative path of the XML document to load.
xml_path = r'.\demo1.xml'


# Banner: the word "start" centred in a 20-column run of dashes.
print(format('start', '-^20'))

# 1. Parse the XML document into an element tree.
doc = parse(xml_path)

# 2. Fetch the root element of the tree.
root = doc.getroot()

print("root:", str(root), sep="")
# 3. Element name (tag).
print("root.tag:", str(root.tag), sep="")
# 4. Element attributes.
print("root.attrib:", str(root.attrib), sep="")
# 5. Element value (text); ascii() makes whitespace and non-ASCII visible.
print("root.text:", ascii(root.text), sep="")
# 6. Element tail (rarely needed).
print("root.tail:", ascii(root.tail), sep="")

# Separator line: 20 dashes.
print(format('', '-^20'))

# 此方法用来获取元素name, attribute, value, 组成tuple并返回
def getElementInfomation(obj):
    """Summarize an element as a ``(name, attributes, value)`` tuple.

    Args:
        obj: An element-like object exposing ``tag``, ``attrib`` (a mapping)
            and ``text``.

    Returns:
        A tuple ``(objname, objattr, objvalue)`` where ``objname`` is
        ``str(obj.tag)``, ``objattr`` is the list of ``(key, value)``
        attribute pairs in sorted order (an empty list when the element has
        no attributes), and ``objvalue`` is the element text, or the string
        ``"None"`` when the text is missing or consists solely of whitespace.
    """
    objname = str(obj.tag)
    objattr = sorted(obj.attrib.items())
    text = obj.text
    # fullmatch (not match): only text made up entirely of whitespace is a
    # layout placeholder. Text that merely starts with whitespace is a real
    # value and must be kept.
    if text is not None and re.fullmatch(r'\s+', text):
        objvalue = "None"
    else:
        # str(None) also yields "None" for elements without any text.
        objvalue = str(text)
    return (objname, objattr, objvalue)
# Show what getElementInfomation returns for the root element.
root_info = getElementInfomation(root)
print(root_info)
'''
print('{:-^20}'.format(''))

# 7.如果root下还有子元素
for child in root:
print(getElementInfomation(child))

print('{:-^20}'.format(''))

# 由某个节点开始进行遍历(迭代方法)
def go_through(obj):
print(getElementInfomation(obj))
for ele in obj:
# for ele in obj.getchildren():
go_through(ele)
# 使用go_through方法效果
go_through(root)

print('{:-^20}'.format(''))

# 加入层级数值
def go_through_layer(obj,layer:int):
# 将tuple转换成list并赋值给temp变量
temp = list(getElementInfomation(obj))
# 在list里追加层级数值
temp.insert(0,layer)
print(list(temp))
layer += 1
for ele in obj:
# for ele in obj.getchildren():
go_through_layer(ele,layer)
layer_level = 0
go_through_layer(root,layer_level)

print('{:-^20}'.format(''))

# 在紧邻的子元素层级中查找到第一个匹配的元素
print('{:*^20}'.format(''))
print(root.find('.')) # 根元素
print(root.find('pre')) # 其紧邻的子元素层级**'id/nm/sri/cr/pre'
go_through_layer(root.find('pre'),1)
print(root.find('./sri/rt')) # 根据xpath可以找到相应元素
# 在紧邻的子元素层级中查找到所有匹配的元素
print('{:*^20}'.format(''))
print(root.findall('.')) # # 根元素
print(root.findall('sri')) # 其紧邻的子元素层级**'id/nm/sri/cr/pre'
for i in root.findall('sri'):
go_through_layer(i,1)
print(root.findall('./sri/rt')) # 根据xpath可以找到相应元素

# 在紧邻的子元素层级中查找到匹配的元素,并返回元素name
print('{:*^20}'.format(''))
print(ascii(root.findtext('id')))
print(ascii(root.findtext('sri')))

# 以下几种是迭代器,可用于遍历元素,之后进行后续操作
print('{:+<20}'.format('Left')) #左缩进
# 以当前元素作为根节点做迭代
for y in root.iter():
print(y)

print('{:+^20}'.format('center')) #居中
# 以指定元素作为根节点做迭代
for y in root.iterfind('sri'):
print(y)

print('{:+>20}'.format('right')) #右缩进
# 以当前元素作为根节点做迭代并返回元素value
for y in root.itertext():
print(ascii(y))
'''

def go_through_layer_sort(obj, layer: int):
    """Print *obj* and all of its descendants, siblings ordered by tag.

    Every element is printed as a tuple ``(layer, name, attributes, value)``;
    the last three items are whatever ``getElementInfomation`` returns for
    that element. An element is printed before its children.

    Args:
        obj: Element to start from; iterating it yields its direct children.
        layer: Depth number printed for *obj*; its children are printed
            with ``layer + 1``.
    """
    # Put the depth number in front of the element summary.
    record = (layer, *getElementInfomation(obj))
    print(record)

    # Visit the children in tag order; the sort is stable, so children with
    # equal tags keep their document order.
    children = list(obj)
    children.sort(key=lambda child: child.tag)
    for child in children:
        go_through_layer_sort(child, layer + 1)
# Walk the whole document starting at the root, which is reported as layer 0.
layer_level = 0
go_through_layer_sort(obj=root, layer=layer_level)

TAG:

 

评分:0

我来说两句

Open Toolbar