1
0
Fork 0
sortxml/sortxml.py
Daniel Baumann 638d6148c0
Adding upstream version 0.1.0.
Signed-off-by: Daniel Baumann <daniel@debian.org>
2025-05-30 18:55:23 +02:00

262 lines
12 KiB
Python

#!/usr/bin/python310
"""Simple XML element sorter.

This module can be used by importing `sort_xml` or by running it standalone from the command line.
"""
# Copyright (c) 2022, Chris Koch <kopachris@gmail.com>
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
#
# (1) Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
#
# (2) Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in
# the documentation and/or other materials provided with the
# distribution.
#
# (3)The name of the author may not be used to
# endorse or promote products derived from this software without
# specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT,
# INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
# IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
# Version as a (major, minor, patch) tuple; the dotted string form is derived from it.
__version__ = (0, 1, 0)
__version_str__ = '.'.join(map(str, __version__))

# Description text handed to the command-line argument parser.
__description__ = """
A simple XML element sorter. Will sort the children of selected elements
using a given attribute's value or subelement's text as the sort key.
Example usage:
$ python sortxml.py ARForm_orig.rdl "./DataSets/DataSet[@Name='ARForm']/Fields" Name -o ARForm.rdl
"""
import argparse as ap
import xml.etree.ElementTree as ET
from pathlib import Path
from io import TextIOWrapper
from codecs import BOM_UTF8
from decimal import Decimal
from dateutil.parser import parse as parse_dt
class NSElement(ET.Element):
    """Element subclass that remembers the builder which created it and that builder's namespace map.

    The search methods (`find`, `findall`, `findtext`, `iterfind`) use the remembered
    prefix-to-URI map whenever the caller does not pass `namespaces` explicitly.
    """

    def __init__(self, *args, **kwargs):
        """Create the element.

        Takes the same arguments as `ET.Element` plus an optional `builder`
        keyword. If the builder has an `ns_map` attribute, that very object
        (not a copy) becomes this element's default namespace map.
        """
        # `builder` is our own keyword: remove it before delegating so that
        # ET.Element does not receive it as an extra attribute.
        owner = kwargs.pop('builder', None)
        self._builder = owner
        self._ns_map = owner.ns_map if hasattr(owner, 'ns_map') else {}
        super().__init__(*args, **kwargs)

    def _default_ns(self, namespaces):
        """Return `namespaces`, or the stored namespace map when it is None."""
        return self._ns_map if namespaces is None else namespaces

    def find(self, path, namespaces=None):
        """Same as `Element.find`, with `namespaces` defaulting to the stored map."""
        return super().find(path, self._default_ns(namespaces))

    def findall(self, path, namespaces=None):
        """Same as `Element.findall`, with `namespaces` defaulting to the stored map."""
        return super().findall(path, self._default_ns(namespaces))

    def findtext(self, path, default=None, namespaces=None):
        """Same as `Element.findtext`, with `namespaces` defaulting to the stored map."""
        return super().findtext(path, default, self._default_ns(namespaces))

    def iterfind(self, path, namespaces=None):
        """Same as `Element.iterfind`, with `namespaces` defaulting to the stored map."""
        return super().iterfind(path, self._default_ns(namespaces))
class NSTreeBuilder(ET.TreeBuilder):
    """TreeBuilder that records namespace declarations and builds `NSElement` nodes.

    Every prefix/URI pair declared in the parsed document is stored in
    `ns_map` and also registered with ElementTree's global namespace registry,
    so the document's own prefixes are used again when the tree is serialized.
    Each element is created as an `NSElement` holding a reference to this
    builder, which gives it access to `ns_map` for path searches.
    """

    def __init__(self, **kwargs):
        """Create the builder.

        Accepts the keyword arguments of `ET.TreeBuilder`. Any
        `element_factory` passed in is discarded, because this builder always
        produces `NSElement` instances.
        """
        self.ns_map = {}
        kwargs.pop('element_factory', None)
        # A comment/PI factory explicitly set to NSElement also gets the
        # builder reference, as elements do.
        for option in ('comment_factory', 'pi_factory'):
            if option in kwargs and kwargs[option] is NSElement:
                kwargs[option] = self._make_element
        # Inject the builder through the public `element_factory` hook rather
        # than overriding `start()`: the private attributes of the pure-Python
        # TreeBuilder (`_factory`, `_elem`, `_flush`, ...) are not available on
        # the C-accelerated TreeBuilder that CPython normally uses, so relying
        # on them fails with AttributeError there.
        super().__init__(element_factory=self._make_element, **kwargs)

    def _make_element(self, *args):
        """Factory handed to the base class: build an `NSElement` bound to this builder."""
        return NSElement(*args, builder=self)

    def start_ns(self, prefix, uri):
        """Record a namespace declaration reported by the parser.

        The mapping is always stored in `ns_map`. Global registration is
        best-effort: `ET.register_namespace` rejects reserved prefixes such as
        `ns0` with ValueError, and a document that happens to use one must
        still be parseable.
        """
        self.ns_map[prefix] = uri
        try:
            ET.register_namespace(prefix, uri)
        except ValueError:
            # Reserved prefix: keep the local mapping and let the serializer
            # choose its own prefix for this URI.
            pass
def _names_existing_file(text):
    """Return True if the string `text` is the path of an existing regular file.

    Never raises: a string holding XML content can make the file-system lookup
    fail (for example with "file name too long"), which simply means it is not
    a file name.
    """
    try:
        return Path(text).is_file()
    except (OSError, ValueError):
        return False


def _load_xml_text(xml_doc):
    """Return the XML text for any of the input forms accepted by `sort_xml`, without a leading BOM."""
    if isinstance(xml_doc, TextIOWrapper) and xml_doc.readable():
        # Readable text stream: decode as utf-8-sig so a byte order mark is dropped.
        if xml_doc.encoding != 'utf-8-sig':
            xml_doc.reconfigure(encoding='utf-8-sig')
        return xml_doc.read()
    if isinstance(xml_doc, Path) and xml_doc.is_file():
        return xml_doc.read_text('utf-8-sig')
    if isinstance(xml_doc, str) and xml_doc:
        if _names_existing_file(xml_doc):
            return Path(xml_doc).read_text('utf-8-sig')
        # Otherwise the string hopefully contains valid XML. Once decoded, the
        # BOM is a single character, so strip exactly that many characters.
        bom = BOM_UTF8.decode('utf-8')
        return xml_doc[len(bom):] if xml_doc.startswith(bom) else xml_doc
    raise TypeError("sort_xml() requires first parameter must be a string, readable IO stream, or path for a "
                    f"valid xml file! xml_doc: {repr(xml_doc)}")


def _make_sort_key(sort_attr, use_text, sort_as_datetime, sort_as_decimal):
    """Build the key function used to order child elements; datetime takes precedence over decimal."""
    if use_text:
        def raw_value(elem):
            return elem.findtext(sort_attr)
    else:
        def raw_value(elem):
            return elem.get(sort_attr)

    if sort_as_datetime:
        convert = parse_dt
    elif sort_as_decimal:
        convert = Decimal
    else:
        return raw_value
    return lambda elem: convert(raw_value(elem))


def sort_xml(xml_doc, node_path, sort_attr, use_text=False, sort_as_datetime=False, sort_as_decimal=False,
             descending=False):
    """Sort the children of a selection of elements in an XML document.

    Returns an ElementTree representing the resulting whole document. It can be converted to a string or
    written to a file like so:

        foo_str = ET.tostring(sort_xml(xml_doc, node_path, sort_attr).getroot())
        sort_xml(xml_doc, node_path, sort_attr).write('foo.xml')

    Required arguments:
    -------------------
    * `xml_doc` -- a text IO stream (such as an open file object), Path object pointing to an XML file, string
      representing the file path, or string containing the file contents of a valid XML file. Can't take an
      ElementTree instance because we need to use our own parser to keep track of namespaces.
    * `node_path` -- a string containing the path to the node you want to sort the children of, in the XPath
      language of the etree module
    * `sort_attr` -- the attribute of the child elements to use as the sort key. Surrounding whitespace is
      stripped; the name must then consist of letters, digits and underscores and must not start with a digit.

    Optional arguments:
    -------------------
    * `use_text` -- use `sort_attr` as the name of a subelement of the path's children whose text will be the
      sort key (default: False)
    * `sort_as_datetime` -- parse the values of the sort key as a datetime using the `dateutil` module and sort
      chronologically (default: False). If both this and `sort_as_decimal` are set, this one wins.
    * `sort_as_decimal` -- parse the values of the sort key as a decimal and sort numerically (useful to keep
      '10' from showing up right after '1') (default: False)
    * `descending` -- sort in descending order instead of ascending (default: False)

    Raises:
    -------
    * `TypeError` -- `xml_doc` is none of the supported input forms, or `sort_attr` is not a non-empty string.
      Sorting itself can also raise TypeError when a child has no value for the sort key, because the key is
      then None.
    * `ValueError` -- `sort_attr` is not an acceptable name.
    """
    xml_str = _load_xml_text(xml_doc)

    if not (isinstance(sort_attr, str) and sort_attr):
        raise TypeError("sort_xml() requires sort attribute must be a non-empty string!\n\t"
                        f"sort_attr: {repr(sort_attr)}")
    sort_attr = sort_attr.strip()
    if not (sort_attr.replace('_', '').isalnum() and (sort_attr[0].isalpha() or sort_attr[0] == '_')):
        raise ValueError("Sort attribute passed to sort_xml() is an invalid name!\n\t"
                         f"sort_attr: {repr(sort_attr)}")

    # Parse with the custom tree builder so elements know the document's namespaces.
    dom = ET.fromstring(xml_str, ET.XMLParser(target=NSTreeBuilder()))

    # The kind of sorting is the same for every parent, so decide it once.
    sort_key = _make_sort_key(sort_attr, use_text, sort_as_datetime, sort_as_decimal)
    for parent in dom.findall(node_path):
        parent[:] = sorted(parent, key=sort_key, reverse=descending)
    return ET.ElementTree(dom)
if __name__ == '__main__':
    # Command-line entry point: parse the arguments, sort the document and
    # write the result to the requested (or default) output file.
    argp = ap.ArgumentParser(description=__description__, formatter_class=ap.RawDescriptionHelpFormatter)
    argp.add_argument('-v', '--version', action='version', version=f"%(prog)s -- version {__version_str__}")
    argp.add_argument('input_file', type=Path, help="File path to the source xml file.")
    argp.add_argument('sort_xpath',
                      help="XPath-style selector for elements to sort the children of. This has the same limitations "
                           "as Python's ElementTree module.")
    argp.add_argument('sort_attr', help="The name of the attribute to use as the sort key.")
    argp.add_argument('-r', '--reverse', '--descending', action='store_true', dest='descending',
                      help="Sort the child elements in reverse (descending) order.")
    argp.add_argument('-t', '--text', '--use-text', action='store_true', dest='use_text',
                      help="Treat the sort attribute name as the name of a subelement whose text is the sort key.")
    sort_style = argp.add_mutually_exclusive_group()
    sort_style.add_argument('--datetime', '--as-datetime', action='store_true', dest='as_datetime',
                            help="Try to parse the sort key as a date/time value. Mutually exclusive with --decimal.")
    sort_style.add_argument('--decimal', '--as-decimal', action='store_true', dest='as_decimal',
                            help="Try to parse the sort key as a decimal number. Mutually exclusive with --datetime.")
    argp.add_argument('-o', '--output', type=Path, dest='output_file',
                      help="File path to the destination file. (Default is to append '_sorted' to the filename.)")
    argv = argp.parse_args()

    xml_doc = argv.input_file
    sorted_xml = sort_xml(xml_doc, argv.sort_xpath, argv.sort_attr, argv.use_text, argv.as_datetime,
                          argv.as_decimal, argv.descending)

    # argparse always sets `output_file` (to None when -o is omitted), so the
    # default has to be detected by value, not by attribute presence.
    out_file = argv.output_file
    if out_file is None:
        out_file = xml_doc.with_stem(xml_doc.stem + '_sorted')

    out_file.write_text(ET.tostring(sorted_xml.getroot(), encoding='unicode'), encoding='utf-8')
    print(f"Output sorted file as `{out_file}`")