262 lines
12 KiB
Python
262 lines
12 KiB
Python
#!/usr/bin/python310
|
|
|
|
"""Simple XML element sorter.
|
|
|
|
This module can be used by importing `sort_xml` or by running standalone from the command-line.
|
|
|
|
"""
|
|
|
|
# Copyright (c) 2022, Chris Koch <kopachris@gmail.com>
|
|
# Redistribution and use in source and binary forms, with or without
|
|
# modification, are permitted provided that the following conditions are
|
|
# met:
|
|
#
|
|
# (1) Redistributions of source code must retain the above copyright
|
|
# notice, this list of conditions and the following disclaimer.
|
|
#
|
|
# (2) Redistributions in binary form must reproduce the above copyright
|
|
# notice, this list of conditions and the following disclaimer in
|
|
# the documentation and/or other materials provided with the
|
|
# distribution.
|
|
#
|
|
# (3) The name of the author may not be used to
|
|
# endorse or promote products derived from this software without
|
|
# specific prior written permission.
|
|
#
|
|
# THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
|
|
# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
|
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
|
# DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT,
|
|
# INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
|
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
|
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
|
|
# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
|
|
# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
|
|
# IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
|
# POSSIBILITY OF SUCH DAMAGE.
|
|
|
|
# Version as a comparable tuple: (major, minor, patch).
__version__ = (0, 1, 0)
# Dotted string form of the version tuple, shown by the command-line --version flag.
__version_str__ = '.'.join(map(str, __version__))

# Help text shown at the top of the command-line usage message.
__description__ = """
A simple XML element sorter. Will sort the children of selected elements
using a given attribute's value or subelement's text as the sort key.
Example usage:
$ python sortxml.py ARForm_orig.rdl "./DataSets/DataSet[@Name='ARForm']/Fields" Name -o ARForm.rdl
"""
|
|
|
|
import argparse as ap
|
|
import xml.etree.ElementTree as ET
|
|
from pathlib import Path
|
|
from io import TextIOWrapper
|
|
from codecs import BOM_UTF8
|
|
from decimal import Decimal
|
|
from dateutil.parser import parse as parse_dt
|
|
|
|
|
|
class NSElement(ET.Element):
    """Element subclass that remembers the builder that created it and that builder's namespace map.

    The remembered prefix -> URI map is used as the default `namespaces` argument of `find`, `findall`,
    `findtext` and `iterfind`, so prefixed paths can be resolved without passing a map on every call.
    An explicitly supplied `namespaces` argument always takes precedence.
    """

    def __init__(self, *args, **kwargs):
        # `builder` is our own keyword argument; remove it before delegating to Element.
        creator = kwargs.pop('builder', None)
        self._builder = creator
        # Share the builder's map by reference when it has one, otherwise start with an empty map.
        self._ns_map = creator.ns_map if hasattr(creator, 'ns_map') else dict()
        super().__init__(*args, **kwargs)

    def find(self, path, namespaces=None):
        """Return the first subelement matching `path`, defaulting to the stored namespace map."""
        return super().find(path, self._ns_map if namespaces is None else namespaces)

    def findall(self, path, namespaces=None):
        """Return all subelements matching `path`, defaulting to the stored namespace map."""
        return super().findall(path, self._ns_map if namespaces is None else namespaces)

    def findtext(self, path, default=None, namespaces=None):
        """Return the text of the first subelement matching `path`, defaulting to the stored namespace map."""
        return super().findtext(path, default, self._ns_map if namespaces is None else namespaces)

    def iterfind(self, path, namespaces=None):
        """Iterate over subelements matching `path`, defaulting to the stored namespace map."""
        return super().iterfind(path, self._ns_map if namespaces is None else namespaces)
|
|
|
|
|
|
class NSTreeBuilder(ET.TreeBuilder):
    """TreeBuilder that records the document's namespace declarations and builds `NSElement` nodes.

    Every namespace declaration seen while parsing is stored in `ns_map` (prefix -> URI) and also added to
    ElementTree's global namespace registry so the original prefixes are reused on serialization. Each
    element is created as an `NSElement` bound to this builder, so all elements share the same `ns_map`.
    """

    def __init__(self, **kwargs):
        """Create the builder.

        Keyword arguments are forwarded to `ET.TreeBuilder`, except `element_factory`, which is ignored
        because this builder always produces `NSElement` instances.
        """
        # Prefix -> URI map, shared by reference with every element this builder creates.
        self.ns_map = dict()
        kwargs.pop('element_factory', None)
        # Go through the documented `element_factory` hook instead of overriding start(), so nothing here
        # depends on private attributes of a particular TreeBuilder implementation.
        super().__init__(element_factory=self._make_element, **kwargs)

    def _make_element(self, tag, attrs):
        """Element factory handed to the base class: build an `NSElement` that knows this builder."""
        return NSElement(tag, attrs, builder=self)

    def start_ns(self, prefix, uri):
        """Record a namespace declaration reported by the parser."""
        self.ns_map[prefix] = uri
        try:
            ET.register_namespace(prefix, uri)
        except ValueError:
            # register_namespace() rejects prefixes of the form "ns<digits>", which ElementTree reserves
            # for generated prefixes. The mapping is still kept in ns_map for path lookups; only the
            # global registration used for serialization is skipped.
            pass
|
|
|
|
|
|
def sort_xml(xml_doc, node_path, sort_attr, use_text=False, sort_as_datetime=False, sort_as_decimal=False,
             descending=False):
    """Sort the children of a selection of elements in an XML document.

    Returns an ElementTree representing the resulting whole document. An ElementTree can easily be
    converted to a string or written to a file like so:

        foo_str = ET.tostring(sort_xml(xml_doc, node_path, sort_attr).getroot())
        sort_xml(xml_doc, node_path, sort_attr).write('foo.xml')

    Required arguments:
    -------------------
    * `xml_doc` -- a text IO stream (such as an open file object), Path object pointing to an XML file,
      string representing the file path, or string containing the contents of a valid XML file. Can't take
      an ElementTree instance because we need to use our own parser to keep track of namespaces.
    * `node_path` -- a string containing the path to the node(s) you want to sort the children of, in the
      XPath subset understood by the etree module
    * `sort_attr` -- the attribute of the child elements to use as the sort key

    Optional arguments:
    -------------------
    * `use_text` -- use `sort_attr` as the name of a subelement of the path's children whose text will be
      the sort key (default: False)
    * `sort_as_datetime` -- parse the values of the sort key as a datetime using the `dateutil` module and
      sort chronologically (default: False; takes precedence if `sort_as_decimal` is also set)
    * `sort_as_decimal` -- parse the values of the sort key as a decimal and sort numerically (useful to
      keep '10' from showing up right after '1') (default: False)
    * `descending` -- sort in descending order instead of ascending (default: False)

    Children that do not have the sort key at all (missing attribute, or missing subelement when
    `use_text` is set) are kept in their original relative order after the sorted children.

    Raises:
    -------
    * `TypeError` -- `xml_doc` is not one of the accepted input kinds, or `sort_attr` is not a non-empty
      string
    * `ValueError` -- `sort_attr` is not a valid name (letters, digits and underscores, not starting with a
      digit)

    Errors raised by the XML parser, or by `Decimal` / `dateutil` for values that cannot be converted, are
    not caught here.
    """
    # The UTF-8 byte order mark decodes to a single character, so strip by its decoded length.
    bom = BOM_UTF8.decode('utf-8')

    def _is_existing_file(candidate):
        # A string holding XML content is not a usable path; the OS may reject it outright (for example
        # because it is too long). Treat any such failure as "not a file".
        try:
            return Path(candidate).is_file()
        except (OSError, ValueError):
            return False

    # ---- xml_doc: normalise every accepted input kind to a BOM-free string
    if isinstance(xml_doc, TextIOWrapper) and xml_doc.readable():
        # Readable text stream: switch to utf-8-sig so a byte order mark is dropped while reading.
        if xml_doc.encoding != 'utf-8-sig':
            xml_doc.reconfigure(encoding='utf-8-sig')
        xml_str = xml_doc.read()
    elif isinstance(xml_doc, Path) and xml_doc.is_file():
        xml_str = xml_doc.read_text('utf-8-sig')
    elif isinstance(xml_doc, str) and _is_existing_file(xml_doc):
        xml_str = Path(xml_doc).read_text('utf-8-sig')
    elif isinstance(xml_doc, str) and len(xml_doc) > 0:
        # Hopefully the string itself contains valid XML.
        xml_str = xml_doc[len(bom):] if xml_doc.startswith(bom) else xml_doc
    else:
        raise TypeError("sort_xml() requires first parameter must be a string, readable IO stream, or path for a "
                        f"valid xml file! xml_doc: {repr(xml_doc)}")

    # ---- sort_attr
    if not (isinstance(sort_attr, str) and len(sort_attr) > 0):
        raise TypeError("sort_xml() requires sort attribute must be a non-empty string!\n\t"
                        f"sort_attr: {repr(sort_attr)}")
    sort_attr = sort_attr.strip()
    if not (sort_attr.replace('_', '').isalnum() and (sort_attr[0].isalpha() or sort_attr[0] == '_')):
        raise ValueError("Sort attribute passed to sort_xml() is an invalid name!\n\t"
                         f"sort_attr: {repr(sort_attr)}")

    # ---- parse with the namespace-tracking builder and select the parents whose children get sorted
    dom = ET.fromstring(xml_str, ET.XMLParser(target=NSTreeBuilder()))
    matching_parents = dom.findall(node_path)

    # ---- decide once how a child's sort key is obtained and converted
    if use_text:
        def raw_key(child):
            return child.findtext(sort_attr)
    else:
        def raw_key(child):
            return child.get(sort_attr)

    if sort_as_datetime:
        convert = parse_dt
    elif sort_as_decimal:
        convert = Decimal
    else:
        convert = None

    for par in matching_parents:
        keyed = []    # (sort key, element) pairs for children that have the key
        unkeyed = []  # children without the key, kept in document order
        for child in par:
            value = raw_key(child)
            if value is None:
                unkeyed.append(child)
            else:
                keyed.append((value if convert is None else convert(value), child))
        # Sort on the key only, so elements themselves are never compared; the sort is stable.
        keyed.sort(key=lambda pair: pair[0], reverse=descending)
        par[:] = [child for _, child in keyed] + unkeyed

    return ET.ElementTree(dom)
|
|
|
|
|
|
def main(args=None):
    """Command-line entry point: sort an XML file and write the result to disk.

    `args` is the list of command-line arguments to parse; when None, argparse reads `sys.argv`.
    If no output file is given, the result is written next to the input file with '_sorted' appended
    to the file name's stem.
    """
    argp = ap.ArgumentParser(description=__description__, formatter_class=ap.RawDescriptionHelpFormatter)
    argp.add_argument('-v', '--version', action='version', version=f"%(prog)s -- version {__version_str__}")
    argp.add_argument('input_file', type=Path, help="File path to the source xml file.")
    argp.add_argument('sort_xpath',
                      help="XPath-style selector for elements to sort the children of. This has the same limitations "
                           "as Python's ElementTree module.")
    argp.add_argument('sort_attr', help="The name of the attribute to use as the sort key.")
    argp.add_argument('-r', '--reverse', '--descending', action='store_true', dest='descending',
                      help="Sort the child elements in reverse (descending) order.")
    argp.add_argument('-t', '--text', '--use-text', action='store_true', dest='use_text',
                      help="Treat the sort attribute name as the name of a subelement whose text is the sort key.")
    sort_style = argp.add_mutually_exclusive_group()
    sort_style.add_argument('--datetime', '--as-datetime', action='store_true', dest='as_datetime',
                            help="Try to parse the sort key as a date/time value. Mutually exclusive with --decimal.")
    sort_style.add_argument('--decimal', '--as-decimal', action='store_true', dest='as_decimal',
                            help="Try to parse the sort key as a decimal number. Mutually exclusive with --datetime.")
    argp.add_argument('-o', '--output', type=Path, dest='output_file',
                      help="File path to the destination file. (Default is to append '_sorted' to the filename.)")

    argv = argp.parse_args(args)

    xml_doc = argv.input_file
    sorted_xml = sort_xml(xml_doc, argv.sort_xpath, argv.sort_attr, argv.use_text, argv.as_datetime,
                          argv.as_decimal, argv.descending)

    # argparse always defines `output_file`; it is None when -o/--output was not given, so test the value
    # rather than the attribute's existence.
    if argv.output_file is None:
        out_file = xml_doc.with_stem(xml_doc.stem + '_sorted')
    else:
        out_file = argv.output_file

    out_file.write_text(ET.tostring(sorted_xml.getroot(), encoding='unicode'), encoding='utf-8')

    print(f"Output sorted file as `{out_file}`")


if __name__ == '__main__':
    main()
|