#!/usr/bin/env python import lxml.etree as et import sys, os.path, optparse, itertools SHORT_DESCRIPTION = "An XPath file finder for XML files." __doc__ = SHORT_DESCRIPTION + ''' Evaluates an XPath expression against a series of files and prints the matching subtrees to stdout. Examples:: $ cat test.xml # find all leaf elements: $ SCRIPT '//*[not(*)]' test.xml # find all elements with attribute values containing "abc" ignoring case: $ SCRIPT '//*[@*[contains(py:lower(.), "abc")]]' test.xml # find all numeric attribute values: $ SCRIPT '//@*[re:match(., "^[0-9]+$")]' test.xml 1234 * find all elements with numeric attribute values: $ SCRIPT '//*[@*[re:match(., "^[0-9]+$")]]' test.xml * find all elements with numeric attribute values in more than one file: $ SCRIPT '//*[@*[re:match(., "^[0-9]+$")]]' test.xml test.xml test.xml >> test.xml >> test.xml >> test.xml * find XML files that have non-empty root nodes: $ SCRIPT -q '*' test.xml test.xml test.xml >> test.xml >> test.xml >> test.xml * find out if an XML file has at most depth three: $ SCRIPT 'not(/*/*/*)' test.xml True '''.replace('SCRIPT', os.path.basename(sys.argv[0])) REGEXP_NS = "http://exslt.org/regular-expressions" PYTHON_BUILTINS_NS = "PYTHON-BUILTINS" parser = et.XMLParser(remove_blank_text=True) def print_results(results): if isinstance(results, basestring) or isinstance(results, bool): print results return for result in results: if isinstance(result, basestring) or isinstance(result, bool): print result else: print et.tostring( result, xml_declaration=False, pretty_print=True) def find_in_file(f, xpath, print_name=True, xinclude=False): if hasattr(f, 'name'): filename = f.name else: filename = f try: try: tree = et.parse(f, parser) except IOError, e: print >> sys.stderr, "ERR: parsing %r failed: %s: %s" % ( filename, e.__class__.__name__, e) return False try: if xinclude: tree.xinclude() except IOError, e: print >> sys.stderr, "ERR: XInclude for %r failed: %s: %s" % ( filename, e.__class__.__name__, e) return False if not callable(xpath): xpath = et.XPath(xpath) results = xpath(tree) if results == []: return False if print_name: print ">> %s" % f if options.verbose: print_results(results) return True except Exception, e: print >> sys.stderr, "ERR: %r: %s: %s" % ( filename, e.__class__.__name__, e) return False def register_builtins(): ns = et.FunctionNamespace(PYTHON_BUILTINS_NS) for (name, builtin) in vars(__builtins__).iteritems(): if callable(builtin): if not name.startswith('_') and name == name.lower(): ns[name] = builtin str_xpath = et.XPath("string()") def lower(_, s): if isinstance(s, list): if not s: return '' s = s[0] if not isinstance(s, basestring): if isinstance(s, bool): s = str(s) else: s = str_xpath(s) return s.lower() def upper(_, s): if isinstance(s, list): if not s: return '' s = s[0] if not isinstance(s, basestring): if isinstance(s, bool): s = str(s) else: s = str_xpath(s) return s.upper() ns["lower"] = lower ns["upper"] = upper def parse_options(): from optparse import OptionParser usage = "usage: %prog [options] XPATH [FILE ...]" parser = OptionParser( usage = usage, version = "%prog using lxml.etree " + et.__version__, description = SHORT_DESCRIPTION) parser.add_option("-H", "--long-help", action="store_true", dest="long_help", default=False, help="a longer help text including usage examples") parser.add_option("-i", "--xinclude", action="store_true", dest="xinclude", default=False, help="run XInclude on the file before XPath") parser.add_option("--no-python", action="store_false", dest="python", default=True, help="disable Python builtins (prefix 'py')") parser.add_option("--no-regexp", action="store_false", dest="regexp", default=True, help="disable regular expressions (prefix 're')") parser.add_option("-q", "--quiet", action="store_false", dest="verbose", default=True, help="don't print status messages to stdout") options, args = parser.parse_args() if options.long_help: parser.print_help() print __doc__[__doc__.find('\n\n')+1:] sys.exit(0) if len(args) < 1: parser.error("first argument must be an XPath expression") return options, args if __name__ == "__main__": options, args = parse_options() namespaces = {} if options.regexp: namespaces["re"] = REGEXP_NS if options.python: register_builtins() namespaces["py"] = PYTHON_BUILTINS_NS xpath = et.XPath(args[0], namespaces) found = False if len(args) == 1: found = find_in_file( sys.stdin, xpath, print_name, options.xinclude) else: print_name = len(args) > 2 for filename in itertools.islice(args, 1, None): found |= find_in_file( filename, xpath, print_name, options.xinclude) if found: sys.exit(0) else: sys.exit(1)