#!/usr/bin/python310
"""Simple XML element sorter.

This module can be used by importing `sort_xml` or by running standalone from the command-line.
"""

# Copyright (c) 2022, Chris Koch

# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
#
# (1) Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
#
# (2) Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in
# the documentation and/or other materials provided with the
# distribution.
#
# (3)The name of the author may not be used to
# endorse or promote products derived from this software without
# specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT,
# INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
# IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.

__version__ = (0, 1, 0)
__version_str__ = '.'.join([str(v) for v in __version__])
__description__ = """
A simple XML element sorter. Will sort the children of selected elements using a given attribute's value or
subelement's text as the sort key.

Example usage:
    $ python sortxml.py ARForm_orig.rdl "./DataSets/DataSet[@Name='ARForm']/Fields" Name -o ARForm.rdl
"""

import argparse as ap
import xml.etree.ElementTree as ET
from pathlib import Path
from io import TextIOBase, TextIOWrapper
from codecs import BOM_UTF8
from decimal import Decimal

try:
    from dateutil.parser import parse as parse_dt
except ImportError:
    # dateutil is only needed for `sort_as_datetime`; every other mode works without it.
    parse_dt = None

# The UTF-8 byte order mark once decoded: a single character, U+FEFF.
_BOM = BOM_UTF8.decode('utf-8')


class NSElement(ET.Element):
    """Subclass of ElementTree.Element which keeps track of its TreeBuilder and namespaces if available.

    The search methods (`find`, `findall`, `findtext`, `iterfind`) fall back to the
    namespace map collected by the builder whenever the caller passes no `namespaces`.
    """

    def __init__(self, *args, **kwargs):
        self._ns_map = {}
        self._builder = None
        if 'builder' in kwargs:
            builder = kwargs.pop('builder')
            self._builder = builder
            if hasattr(builder, 'ns_map'):
                # Share the dict (no copy) so namespaces declared later in the
                # document are visible from elements created earlier.
                self._ns_map = builder.ns_map
        super().__init__(*args, **kwargs)

    def find(self, path, namespaces=None):
        if namespaces is None:
            namespaces = self._ns_map
        return super().find(path, namespaces)

    def findall(self, path, namespaces=None):
        if namespaces is None:
            namespaces = self._ns_map
        return super().findall(path, namespaces)

    def findtext(self, path, default=None, namespaces=None):
        if namespaces is None:
            namespaces = self._ns_map
        return super().findtext(path, default, namespaces)

    def iterfind(self, path, namespaces=None):
        if namespaces is None:
            namespaces = self._ns_map
        return super().iterfind(path, namespaces)


class NSTreeBuilder(ET.TreeBuilder):
    """Subclass of ElementTree.TreeBuilder which adds namespaces in the document to the namespace registry.

    Every element is created as an `NSElement` bound to this builder. This is done
    through the public `element_factory` hook rather than by overriding `start()`,
    because the C-accelerated TreeBuilder does not expose the private attributes
    (`_factory`, `_flush`, `_elem`, ...) that the pure-Python implementation has.
    """

    def __init__(self, **kwargs):
        self.ns_map = {}
        # The element factory is not configurable: elements must be NSElements.
        kwargs.pop('element_factory', None)
        super().__init__(element_factory=self._make_element, **kwargs)

    def _make_element(self, tag, attrs):
        """Element factory handed to TreeBuilder; ties each new element to this builder."""
        return NSElement(tag, attrs, builder=self)

    def start_ns(self, prefix, uri):
        """Record a namespace declaration and register its prefix for serialization."""
        self.ns_map[prefix] = uri
        try:
            ET.register_namespace(prefix, uri)
        except ValueError:
            # Prefixes of the form "ns<digits>" are reserved by ElementTree and cannot
            # be registered. The mapping is still kept in ns_map for path lookups.
            pass


def _is_existing_file(candidate):
    """Return True if `candidate` names an existing file, False if it does not or cannot be checked."""
    try:
        return Path(candidate).is_file()
    except (OSError, ValueError):
        # Raw XML passed as a string is not a usable path (e.g. "File name too long").
        return False


def _load_xml_text(xml_doc):
    """Return the XML text denoted by `xml_doc` with any leading byte order mark removed.

    Raises TypeError when `xml_doc` is none of the supported input kinds.
    """
    if isinstance(xml_doc, TextIOBase) and xml_doc.readable():
        # For real file wrappers, let the codec drop a byte order marker.
        if isinstance(xml_doc, TextIOWrapper) and xml_doc.encoding != 'utf-8-sig':
            xml_doc.reconfigure(encoding='utf-8-sig')
        return xml_doc.read().removeprefix(_BOM)
    if isinstance(xml_doc, Path):
        if _is_existing_file(xml_doc):
            return xml_doc.read_text('utf-8-sig')  # utf-8-sig to remove byte order marker
    elif isinstance(xml_doc, str) and xml_doc:
        if _is_existing_file(xml_doc):
            # xml_doc is a filename
            return Path(xml_doc).read_text('utf-8-sig')
        # xml_doc hopefully contains valid XML
        return xml_doc.removeprefix(_BOM)
    raise TypeError("sort_xml() requires first parameter must be a string, readable IO stream, or path for a "
                    f"valid xml file! xml_doc: {repr(xml_doc)}")


def _make_sort_key(sort_attr, use_text, sort_as_datetime, sort_as_decimal):
    """Build the key function for `sorted()` from the requested sort options."""
    if use_text:
        def raw_value(elem):
            return elem.findtext(sort_attr)
    else:
        def raw_value(elem):
            return elem.get(sort_attr)

    # Datetime takes precedence over decimal when both flags are set.
    if sort_as_datetime:
        if parse_dt is None:
            raise ImportError("sort_xml() needs the `dateutil` package to sort as datetime")
        convert = parse_dt
    elif sort_as_decimal:
        convert = Decimal
    else:
        return raw_value
    return lambda elem: convert(raw_value(elem))


def sort_xml(xml_doc, node_path, sort_attr, use_text=False, sort_as_datetime=False, sort_as_decimal=False,
             descending=False):
    """Sort the children of a selection of elements in an XML document.

    Returns an ElementTree representing the resulting whole document. ElementTree can easily be converted to
    string or written to a file like so:

    >>> foo_str = ET.tostring(sort_xml(xml_doc, node_path, sort_attr).getroot())
    >>> sort_xml(xml_doc, node_path, sort_attr).write('foo.xml')

    Required arguments:
    -------------------
    * `xml_doc` -- a readable text IO stream (such as an open file object or a StringIO), Path object
      pointing to an XML file, string representing the file path, or string containing the file contents
      of a valid XML file. Can't take an ElementTree instance because we need to use our own parser to
      keep track of namespaces.
    * `node_path` -- a string containing the path to the node you want to sort the children of in the
      XPath language of the etree module
    * `sort_attr` -- the attribute of the child elements to use as the sort key

    Optional arguments:
    -------------------
    * `use_text` -- use `sort_attr` as the name of a subelement of the path's children whose text will be
      the sort key (default: False)
    * `sort_as_datetime` -- try to parse the values of the sort key as a datetime using the `dateutil`
      module and sort chronologically (default: False, mutually exclusive with `sort_as_decimal`)
    * `sort_as_decimal` -- try to parse the values of the sort key as a decimal and sort numerically
      (useful to keep '10' from showing up right after '1') (default: False, mutually exclusive with
      `sort_as_datetime`)
    * `descending` -- sort in descending order instead of ascending (default: False)

    Raises:
    -------
    * `TypeError` -- `xml_doc` is not a supported input, or `sort_attr` is not a non-empty string
    * `ValueError` -- `sort_attr` is not a valid name (letters, digits and underscores, not starting
      with a digit)
    * `ImportError` -- `sort_as_datetime` was requested but `dateutil` is not installed

    Children that lack the sort key are not handled specially: the key is then `None`, which the
    comparison or the datetime/decimal conversion rejects.
    """
    # check parameters
    # xml_doc
    xml_str = _load_xml_text(xml_doc)

    # sort_attr
    if not (isinstance(sort_attr, str) and len(sort_attr) > 0):
        raise TypeError("sort_xml() requires sort attribute must be a non-empty string!\n\t"
                        f"sort_attr: {repr(sort_attr)}")
    sort_attr = sort_attr.strip()
    if not (sort_attr.replace('_', '').isalnum() and (sort_attr[0].isalpha() or sort_attr[0] == '_')):
        raise ValueError("Sort attribute passed to sort_xml() is an invalid name!\n\t"
                         f"sort_attr: {repr(sort_attr)}")

    # make our element tree using our custom treebuilder and get all the parents we have to sort children of
    dom = ET.fromstring(xml_str, ET.XMLParser(target=NSTreeBuilder()))
    matching_parents = dom.findall(node_path)

    # decide once what kind of sorting we're doing, then apply it to every matching parent
    sort_key = _make_sort_key(sort_attr, use_text, sort_as_datetime, sort_as_decimal)
    for par in matching_parents:
        par[:] = sorted(par, key=sort_key, reverse=descending)

    return ET.ElementTree(dom)


def _build_arg_parser():
    """Create the command-line argument parser."""
    argp = ap.ArgumentParser(description=__description__, formatter_class=ap.RawDescriptionHelpFormatter)
    argp.add_argument('-v', '--version', action='version', version=f"%(prog)s -- version {__version_str__}")
    argp.add_argument('input_file', type=Path, help="File path to the source xml file.")
    argp.add_argument('sort_xpath',
                      help="XPath-style selector for elements to sort the children of. "
                           "This has the same limitations "
                           "as Python's ElementTree module.")
    argp.add_argument('sort_attr', help="The name of the attribute to use as the sort key.")
    argp.add_argument('-r', '--reverse', '--descending', action='store_true', dest='descending',
                      help="Sort the child elements in reverse (descending) order.")
    argp.add_argument('-t', '--text', '--use-text', action='store_true', dest='use_text',
                      help="Treat the sort attribute name as the name of a subelement whose text is the sort key.")
    sort_style = argp.add_mutually_exclusive_group()
    sort_style.add_argument('--datetime', '--as-datetime', action='store_true', dest='as_datetime',
                            help="Try to parse the sort key as a date/time value. Mutually exclusive with --decimal.")
    sort_style.add_argument('--decimal', '--as-decimal', action='store_true', dest='as_decimal',
                            help="Try to parse the sort key as a decimal number. Mutually exclusive with --datetime.")
    argp.add_argument('-o', '--output', type=Path, dest='output_file',
                      help="File path to the destination file. (Default is to append '_sorted' to the filename.)")
    return argp


def main(args=None):
    """Command-line entry point: sort the input file and write the result.

    `args` is the argument list to parse; None means `sys.argv[1:]`.
    """
    argv = _build_arg_parser().parse_args(args)
    xml_doc = argv.input_file

    sorted_xml = sort_xml(xml_doc, argv.sort_xpath, argv.sort_attr, argv.use_text,
                          argv.as_datetime, argv.as_decimal, argv.descending)

    # argparse always defines the attribute; it is None when -o was not given.
    if argv.output_file is None:
        out_file = xml_doc.with_stem(xml_doc.stem + '_sorted')
    else:
        out_file = argv.output_file

    out_file.write_text(ET.tostring(sorted_xml.getroot(), encoding='unicode'), encoding='utf-8')
    print(f"Output sorted file as `{out_file}`")


if __name__ == '__main__':
    main()