Adding upstream version 0.1.0.
Signed-off-by: Daniel Baumann <daniel@debian.org>
This commit is contained in:
parent
550be5da31
commit
638d6148c0
12 changed files with 539 additions and 0 deletions
262
sortxml.py
Normal file
262
sortxml.py
Normal file
|
@ -0,0 +1,262 @@
|
|||
#!/usr/bin/python310
|
||||
|
||||
"""Simple XML element sorter.
|
||||
|
||||
This module can be used by importing `sort_xml` or by running standalone from the command-line.
|
||||
|
||||
"""
|
||||
|
||||
# Copyright (c) 2022, Chris Koch <kopachris@gmail.com>
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are
|
||||
# met:
|
||||
#
|
||||
# (1) Redistributions of source code must retain the above copyright
|
||||
# notice, this list of conditions and the following disclaimer.
|
||||
#
|
||||
# (2) Redistributions in binary form must reproduce the above copyright
|
||||
# notice, this list of conditions and the following disclaimer in
|
||||
# the documentation and/or other materials provided with the
|
||||
# distribution.
|
||||
#
|
||||
# (3)The name of the author may not be used to
|
||||
# endorse or promote products derived from this software without
|
||||
# specific prior written permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
|
||||
# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
# DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT,
|
||||
# INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
||||
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
|
||||
# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
|
||||
# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
|
||||
# IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
# POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
__version__ = (0, 1, 0)
|
||||
__version_str__ = '.'.join([str(v) for v in __version__])
|
||||
|
||||
__description__ = """
|
||||
A simple XML element sorter. Will sort the children of selected elements
|
||||
using a given attribute's value or subelement's text as the sort key.
|
||||
Example usage:
|
||||
$ python sortxml.py ARForm_orig.rdl "./DataSets/DataSet[@Name='ARForm']/Fields" Name -o ARForm.rdl
|
||||
"""
|
||||
|
||||
import argparse as ap
|
||||
import xml.etree.ElementTree as ET
|
||||
from pathlib import Path
|
||||
from io import TextIOWrapper
|
||||
from codecs import BOM_UTF8
|
||||
from decimal import Decimal
|
||||
from dateutil.parser import parse as parse_dt
|
||||
|
||||
|
||||
class NSElement(ET.Element):
|
||||
"""Subclass of ElementTree.Element which keeps track of its TreeBuilder and namespaces if available."""
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
self._ns_map = dict()
|
||||
self._builder = None
|
||||
if 'builder' in kwargs:
|
||||
builder = kwargs.pop('builder')
|
||||
self._builder = builder
|
||||
if hasattr(builder, 'ns_map'):
|
||||
self._ns_map = builder.ns_map
|
||||
super().__init__(*args, **kwargs)
|
||||
|
||||
def find(self, path, namespaces=None):
|
||||
if namespaces is None:
|
||||
namespaces = self._ns_map
|
||||
return super().find(path, namespaces)
|
||||
|
||||
def findall(self, path, namespaces=None):
|
||||
if namespaces is None:
|
||||
namespaces = self._ns_map
|
||||
return super().findall(path, namespaces)
|
||||
|
||||
def findtext(self, path, default=None, namespaces=None):
|
||||
if namespaces is None:
|
||||
namespaces = self._ns_map
|
||||
return super().findtext(path, default, namespaces)
|
||||
|
||||
def iterfind(self, path, namespaces=None):
|
||||
if namespaces is None:
|
||||
namespaces = self._ns_map
|
||||
return super().iterfind(path, namespaces)
|
||||
|
||||
|
||||
class NSTreeBuilder(ET.TreeBuilder):
|
||||
"""Subclass of ElementTree.TreeBuilder which adds namespaces in the document to the namespace registry."""
|
||||
|
||||
def __init__(self, **kwargs):
|
||||
self.ns_map = dict()
|
||||
if 'element_factory' in kwargs:
|
||||
del kwargs['element_factory']
|
||||
super().__init__(element_factory=NSElement, **kwargs)
|
||||
|
||||
def start_ns(self, prefix, uri):
|
||||
self.ns_map[prefix] = uri
|
||||
ET.register_namespace(prefix, uri)
|
||||
|
||||
def start(self, tag, attrs):
|
||||
if self._factory is NSElement:
|
||||
self._flush()
|
||||
self._last = e = self._factory(tag, attrs, builder=self)
|
||||
if self._elem:
|
||||
self._elem[-1].append(e)
|
||||
elif self._root is None:
|
||||
self._root = e
|
||||
self._elem.append(e)
|
||||
self._tail = 0
|
||||
return e
|
||||
else:
|
||||
return super().start(tag, attrs)
|
||||
|
||||
def _handle_single(self, factory, insert, *args):
|
||||
if factory is NSElement:
|
||||
e = factory(*args, builder=self)
|
||||
if insert:
|
||||
self._flush()
|
||||
self._last = e
|
||||
if self._elem:
|
||||
self._elem[-1].append(e)
|
||||
self._tail = 1
|
||||
return e
|
||||
else:
|
||||
return super()._handle_single(factory, insert, *args)
|
||||
|
||||
|
||||
def sort_xml(xml_doc, node_path, sort_attr, use_text=False, sort_as_datetime=False, sort_as_decimal=False,
|
||||
descending=False):
|
||||
"""Sort the children of a selection of elements in an XML document. Returns an ElementTree representing the
|
||||
resulting whole document. ElementTree can easily be converted to string or written to a file like so:
|
||||
|
||||
>>> foo_str = ET.tostring(sort_xml(xml_doc, node_path, sort_attr).getroot())
|
||||
>>> sort_xml(xml_doc, node_path, sort_attr).write('foo.xml')
|
||||
|
||||
Required arguments:
|
||||
-------------------
|
||||
* `xml_doc` -- a text IO stream (such as an open file object), Path object pointing to an XML
|
||||
file, string representing the file path, or string containing the file contents of a valid XML file. Can't take
|
||||
an ElementTree instance because we need to use our own parser to keep track of namespaces.
|
||||
* `node_path` -- a string containing the path to the node you want to sort the children of in the XPath language
|
||||
of the etree module
|
||||
* `sort_attr` -- the attribute of the child elements to use as the sort key
|
||||
|
||||
Optional arguments:
|
||||
-------------------
|
||||
* `use_text` -- use `sort_attr` as the name of a subelement of the path's children whose text will be the
|
||||
sort key (default: False)
|
||||
* `sort_as_datetime` -- try to parse the values of the sort key as a datetime using the `dateutil` module and sort
|
||||
chronologically (default: False, mutually exclusive with `sort_as_decimal`)
|
||||
* `sort_as_decimal` -- try to parse the values of the sort key as a decimal and sort numerically (useful to keep
|
||||
'10' from showing up right after '1') (default: False, mutually exclusive with `sort_as_datetime`)
|
||||
* `descending` -- sort in descending order instead of ascending (default: False)
|
||||
|
||||
"""
|
||||
# check parameters
|
||||
|
||||
# xml_doc
|
||||
if isinstance(xml_doc, TextIOWrapper) and xml_doc.readable():
|
||||
# xml_doc is a readable text stream, let's read it
|
||||
# but first make sure to remove any byte order marker
|
||||
|
||||
if xml_doc.encoding != 'utf-8-sig':
|
||||
xml_doc.reconfigure(encoding='utf-8-sig')
|
||||
|
||||
xml_str = xml_doc.read()
|
||||
elif isinstance(xml_doc, Path) and xml_doc.is_file():
|
||||
# xml_doc is a Path object to a file
|
||||
xml_str = xml_doc.read_text('utf-8-sig') # utf-8-sig to remove byte order marker
|
||||
elif isinstance(xml_doc, str) and Path(xml_doc).is_file():
|
||||
# xml_doc is a filename
|
||||
xml_str = Path(xml_doc).read_text('utf-8-sig')
|
||||
elif isinstance(xml_doc, str) and len(xml_doc) > 0:
|
||||
# xml_doc hopefully contains valid XML
|
||||
if xml_doc.startswith(BOM_UTF8.decode('utf-8')):
|
||||
xml_str = xml_doc[3:]
|
||||
else:
|
||||
xml_str = xml_doc
|
||||
else:
|
||||
raise TypeError("sort_xml() requires first parameter must be a string, readable IO stream, or path for a "
|
||||
f"valid xml file! xml_doc: {repr(xml_doc)}")
|
||||
|
||||
# sort_attr
|
||||
if not (isinstance(sort_attr, str) and len(sort_attr) > 0):
|
||||
raise TypeError("sort_xml() requires sort attribute must be a non-empty string!\n\t"
|
||||
f"sort_attr: {repr(sort_attr)}")
|
||||
else:
|
||||
sort_attr = sort_attr.strip()
|
||||
if not (sort_attr.replace('_', '').isalnum() and (sort_attr[0].isalpha() or sort_attr[0] == '_')):
|
||||
raise ValueError("Sort attribute passed to sort_xml() is an invalid name!\n\t"
|
||||
f"sort_attr: {repr(sort_attr)}")
|
||||
|
||||
# make our element tree using our custom treebuilder and get all the parents we have to sort children of
|
||||
|
||||
dom = ET.fromstring(xml_str, ET.XMLParser(target=NSTreeBuilder()))
|
||||
matching_parents = dom.findall(node_path)
|
||||
|
||||
# check what kind of sorting we're doing and do it
|
||||
# TODO might be faster if we do the check once and then run the appropriate for loop?
|
||||
for par in matching_parents:
|
||||
if use_text:
|
||||
if sort_as_datetime:
|
||||
par[:] = sorted(par, key=lambda x: parse_dt(x.findtext(sort_attr)), reverse=descending)
|
||||
elif sort_as_decimal:
|
||||
par[:] = sorted(par, key=lambda x: Decimal(x.findtext(sort_attr)), reverse=descending)
|
||||
else:
|
||||
par[:] = sorted(par, key=lambda x: x.findtext(sort_attr), reverse=descending)
|
||||
elif sort_as_datetime:
|
||||
par[:] = sorted(par, key=lambda x: parse_dt(x.get(sort_attr)), reverse=descending)
|
||||
elif sort_as_decimal:
|
||||
par[:] = sorted(par, key=lambda x: Decimal(x.get(sort_attr)), reverse=descending)
|
||||
else:
|
||||
par[:] = sorted(par, key=lambda x: x.get(sort_attr), reverse=descending)
|
||||
|
||||
return ET.ElementTree(dom)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
argp = ap.ArgumentParser(description=__description__, formatter_class=ap.RawDescriptionHelpFormatter)
|
||||
argp.add_argument('-v', '--version', action='version', version=f"%(prog)s -- version {__version_str__}")
|
||||
argp.add_argument('input_file', type=Path, help="File path to the source xml file.")
|
||||
argp.add_argument('sort_xpath',
|
||||
help="XPath-style selector for elements to sort the children of. This has the same limitations "
|
||||
"as Python's ElementTree module.")
|
||||
argp.add_argument('sort_attr', help="The name of the attribute to use as the sort key.")
|
||||
argp.add_argument('-r', '--reverse', '--descending', action='store_true', dest='descending',
|
||||
help="Sort the child elements in reverse (descending) order.")
|
||||
argp.add_argument('-t', '--text', '--use-text', action='store_true', dest='use_text',
|
||||
help="Treat the sort attribute name as the name of a subelement whose text is the sort key.")
|
||||
sort_style = argp.add_mutually_exclusive_group()
|
||||
sort_style.add_argument('--datetime', '--as-datetime', action='store_true', dest='as_datetime',
|
||||
help="Try to parse the sort key as a date/time value. Mutually exclusive with --decimal.")
|
||||
sort_style.add_argument('--decimal', '--as-decimal', action='store_true', dest='as_decimal',
|
||||
help="Try to parse the sort key as a decimal number. Mutually exclusive with --datetime.")
|
||||
argp.add_argument('-o', '--output', type=Path, dest='output_file',
|
||||
help="File path to the destination file. (Default is to append '_sorted' to the filename.)")
|
||||
|
||||
argv = argp.parse_args()
|
||||
|
||||
xml_doc = argv.input_file
|
||||
sort_path = argv.sort_xpath
|
||||
sort_attr = argv.sort_attr
|
||||
sort_desc = argv.descending
|
||||
use_text = argv.use_text
|
||||
as_dt = argv.as_datetime
|
||||
as_dec = argv.as_decimal
|
||||
|
||||
sorted_xml = sort_xml(xml_doc, sort_path, sort_attr, use_text, as_dt, as_dec, sort_desc)
|
||||
|
||||
if not hasattr(argv, 'output_file'):
|
||||
new_filename = xml_doc.stem + '_sorted'
|
||||
out_file = xml_doc.with_stem(new_filename)
|
||||
else:
|
||||
out_file = argv.output_file
|
||||
|
||||
out_file.write_text(ET.tostring(sorted_xml.getroot(), encoding='unicode'), encoding='utf-8')
|
||||
|
||||
print(f"Output sorted file as `{out_file}`")
|
Loading…
Add table
Add a link
Reference in a new issue