import sys
import optparse
import os
import gc
import time
import subprocess

# Default corpus directory: 'sample-data/python.org' next to this script.
# Used as the default value of the --sample-data command-line option.
sample_data = os.path.join(os.path.dirname(__file__), 'sample-data', 'python.org')

# Maps the short names accepted by --type to the parser adapter classes
# defined below; each class registers itself right after its definition.
type_map = {}

class LxmlType(object):
    """Benchmark adapter that parses and serializes through ``lxml.html``."""

    name = 'lxml'

    def __init__(self):
        # Imported here rather than at module level, so lxml is only needed
        # when this adapter is actually instantiated.
        import lxml.html
        self.parse = lxml.html.parse
        self.tostring = lxml.html.tostring

    def parse_file(self, filename):
        """Parse *filename* and return the root of the resulting tree."""
        tree = self.parse(filename)
        return tree.getroot()

    def serialize(self, doc):
        """Return *doc* as produced by ``lxml.html.tostring``."""
        serialized = self.tostring(doc)
        return serialized

type_map['lxml'] = LxmlType

class BeautifulSoupType(object):
    """Benchmark adapter for the stand-alone ``BeautifulSoup`` parser."""

    name = 'BeautifulSoup'

    def __init__(self):
        # Imported here rather than at module level, so BeautifulSoup is only
        # needed when this adapter is actually instantiated.
        from BeautifulSoup import BeautifulSoup
        self.parser = BeautifulSoup

    def parse_file(self, filename):
        """Read *filename* as bytes and return the parsed document.

        The whole file is read before parsing; the handle is closed even if
        reading fails.
        """
        f = open(filename, 'rb')
        try:
            content = f.read()
        finally:
            f.close()
        return self.parser(content)

    def serialize(self, doc):
        """Return ``str(doc)``."""
        return str(doc)

type_map['bs'] = BeautifulSoupType

class BeautifulSoupLxmlType(object):
    """Benchmark adapter that parses through ``lxml.html.ElementSoup``.

    Serialization uses ``lxml.html.tostring``.
    """

    name = 'BeautifulSoup lxml'

    def __init__(self):
        # Imported here rather than at module level, so lxml is only needed
        # when this adapter is actually instantiated.
        import lxml.html
        import lxml.html.ElementSoup
        self.parse = lxml.html.ElementSoup.parse
        self.tostring = lxml.html.tostring

    def parse_file(self, filename):
        """Return whatever ``ElementSoup.parse`` produces for *filename*."""
        document = self.parse(filename)
        return document

    def serialize(self, doc):
        """Return *doc* as produced by ``lxml.html.tostring``."""
        serialized = self.tostring(doc)
        return serialized

type_map['lxml_bs'] = BeautifulSoupLxmlType

class HTML5SimpleType(object):
    """Benchmark adapter for html5lib with its default tree builder.

    Also serves as the base class of the other html5lib adapters, which
    reuse ``parse_file`` and replace the parser and serializer.
    """

    name = 'html5lib simpletree'

    def __init__(self):
        # Imported here rather than at module level, so html5lib is only
        # needed when this adapter is actually instantiated.
        import html5lib
        self.parser = html5lib.HTMLParser()

    def parse_file(self, filename):
        """Parse the binary contents of *filename* with ``self.parser``.

        The file is closed whether or not parsing succeeds.
        """
        stream = open(filename, 'rb')
        try:
            document = self.parser.parse(stream)
        finally:
            stream.close()
        return document

    def serialize(self, doc):
        """Return ``doc.toxml()``."""
        ## FIXME: better serializer
        xml_text = doc.toxml()
        return xml_text

type_map['html5_simple'] = HTML5SimpleType

class HTML5CETType(HTML5SimpleType):
    """html5lib adapter that builds its trees with cElementTree."""

    name = 'html5lib cElementTree'

    def __init__(self):
        import html5lib
        from html5lib import treebuilders
        # Prefer the copy under xml.etree; fall back to the stand-alone
        # cElementTree package when that import fails.
        try:
            from xml.etree import cElementTree as etree_impl
        except ImportError:
            import cElementTree as etree_impl
        builder = treebuilders.getTreeBuilder('etree', etree_impl)
        self.parser = html5lib.HTMLParser(tree=builder)
        # FIXME: html?
        self.tostring = etree_impl.tostring

    def serialize(self, doc):
        """Return *doc* as produced by cElementTree's ``tostring``."""
        serialized = self.tostring(doc)
        return serialized

type_map['html5_cet'] = HTML5CETType

class HTML5ETType(HTML5SimpleType):
    """html5lib adapter that builds its trees with pure-Python ElementTree."""

    name = 'html5lib ElementTree'

    def __init__(self):
        import html5lib
        from html5lib import treebuilders
        # Try the three places ElementTree may live, in this order:
        # xml.etree, a top-level ElementTree module, the elementtree package.
        try:
            from xml.etree import ElementTree as etree_impl
        except ImportError:
            try:
                import ElementTree as etree_impl
            except ImportError:
                from elementtree import ElementTree as etree_impl
        builder = treebuilders.getTreeBuilder('etree', etree_impl)
        self.parser = html5lib.HTMLParser(tree=builder)
        # FIXME: html?
        self.tostring = etree_impl.tostring

    def serialize(self, doc):
        """Return *doc* as produced by ElementTree's ``tostring``."""
        serialized = self.tostring(doc)
        return serialized

type_map['html5_et'] = HTML5ETType

class HTML5LxmlType(HTML5SimpleType):
    """html5lib adapter that uses the 'lxml' tree builder.

    Serialization uses ``lxml.html.tostring``.
    """

    name = 'html5lib lxml'

    def __init__(self):
        import html5lib
        import lxml.html
        from html5lib import treebuilders
        builder = treebuilders.getTreeBuilder('lxml')
        self.parser = html5lib.HTMLParser(tree=builder)
        self.tostring = lxml.html.tostring

    def serialize(self, doc):
        """Return *doc* as produced by ``lxml.html.tostring``."""
        serialized = self.tostring(doc)
        return serialized

type_map['html5_lxml'] = HTML5LxmlType

class HTML5MinidomType(HTML5SimpleType):
    """html5lib adapter that uses the 'dom' tree builder.

    Documents are serialized with their own ``toxml()`` method, so this
    adapter depends only on html5lib; it does not import lxml.
    """

    name = 'html5lib minidom'

    def __init__(self):
        from html5lib import HTMLParser
        from html5lib import treebuilders
        self.parser = HTMLParser(tree=treebuilders.getTreeBuilder('dom'))

    def serialize(self, doc):
        """Return ``doc.toxml()``."""
        return doc.toxml()

type_map['html5_minidom'] = HTML5MinidomType


class HTMLParserType(object):
    """Benchmark adapter for the standard-library ``HTMLParser`` class.

    The parser builds no document, so ``parse_file`` hands back the raw
    file contents and ``serialize`` returns its argument unchanged.
    """

    name = 'HTMLParser'

    def __init__(self):
        # Imported here rather than at module level, so the module is only
        # looked up when this adapter is actually instantiated.
        from HTMLParser import HTMLParser
        self.HTMLParser = HTMLParser

    def parse_file(self, filename):
        """Feed the contents of *filename* to a fresh parser.

        Returns the raw bytes that were read. Errors raised while feeding
        the parser are ignored on purpose, so that one malformed file does
        not abort the whole run. The file is closed even if reading fails.
        """
        f = open(filename, 'rb')
        try:
            doc = f.read()
        finally:
            f.close()
        parser = self.HTMLParser()
        try:
            parser.feed(doc)
        except Exception:
            # Deliberate best effort: skip files the parser rejects.
            pass
        return doc

    def serialize(self, doc):
        """Return *doc* unchanged; there is nothing to serialize."""
        return doc

type_map['htmlparser'] = HTMLParserType

def get_ps():
    """Return ``(vsz, rss)`` for the current process as reported by ``ps``.

    Runs ``ps uww -p <pid>``, echoes the raw output to stdout, and converts
    the fifth and sixth whitespace-separated fields of the first line after
    the header to integers.
    NOTE(review): assumes the ``u`` layout puts VSZ and RSS in those two
    columns on the target platform -- confirm against the local ps(1).

    Raises:
        RuntimeError: if ``ps`` prints no process line after the header, or
            that line has fewer than six fields.
    """
    command = ['ps', 'uww', '-p', str(os.getpid())]
    proc = subprocess.Popen(command, stdout=subprocess.PIPE)
    # Only stdout is piped, so the stderr half of the pair is always None.
    output = proc.communicate()[0]
    print(output)
    rows = output.splitlines()
    if len(rows) < 2:
        raise RuntimeError('ps printed no line for pid %s' % os.getpid())
    fields = rows[1].split()
    if len(fields) < 6:
        raise RuntimeError('Unexpected ps output line: %r' % rows[1])
    vsz = int(fields[4])
    rss = int(fields[5])
    return vsz, rss

def all_filenames(dir):
    """Return the paths of all files found anywhere under *dir*.

    Paths are built by joining each directory reported by ``os.walk`` with
    the file names inside it, in the order ``os.walk`` yields them.
    Directories themselves are not included.
    """
    collected = []
    for folder, _subdirs, names in os.walk(dir):
        for entry in names:
            collected.append(os.path.join(folder, entry))
    return collected

def _show_progress(done, total):
    """Rewrite the current console line with ``done/total`` and a percentage."""
    sys.stdout.write('\r%5i/%5i   %i%%' % (done, total, 100.0 * done / total))
    sys.stdout.flush()


def test_type(type, disable_gc, keep_docs, serialize, sample_data):
    """Benchmark one parser adapter over every file under *sample_data*.

    Args:
        type: adapter instance providing ``name``, ``parse_file(filename)``
            and ``serialize(doc)``.
        disable_gc: when true, the garbage collector is switched off for the
            duration of the run; its previous state is restored afterwards.
        keep_docs: when true, parsed documents are kept alive and the change
            in the VSZ/RSS figures reported by ``get_ps`` is printed.
        serialize: when true, all files are parsed first (untimed) and only
            the serialization of the parsed documents is timed; otherwise
            parsing is what gets timed.
        sample_data: directory that is walked recursively for input files.

    Returns:
        The elapsed wall-clock time of the timed phase, in seconds.

    Raises:
        AssertionError: if the adapter serializes a document to an empty
            result. Exceptions raised while parsing a file in the timed
            phase are re-raised after the file name has been printed.
    """
    print('Testing %s' % type.name)
    filenames = all_filenames(sample_data)
    total = len(filenames)
    size = sum([os.stat(fn).st_size for fn in filenames])
    print('Files: %s  Size: %sKb' % (total, size // 1000))
    all_docs = []
    # Progress is shown roughly every 5%; the step must be at least 1 so the
    # modulo below is defined when there are fewer than 20 files.
    segment = max(1, total // 20)
    gc_was_enabled = gc.isenabled()
    if disable_gc:
        gc.disable()
    try:
        if serialize:
            # Untimed warm-up: build every document that will be serialized.
            for i, filename in enumerate(filenames):
                all_docs.append(type.parse_file(filename))
                if not i % segment:
                    _show_progress(i, total)
            sys.stdout.write('\nFinished parsing.')
            sys.stdout.flush()
        start = time.time()
        if keep_docs:
            start_vsz, start_rss = get_ps()
        if serialize:
            for doc in all_docs:
                # An explicit check instead of ``assert`` so the call still
                # happens when Python runs with -O.
                if not type.serialize(doc):
                    raise AssertionError(
                        '%s produced an empty serialization' % type.name)
        else:
            for i, filename in enumerate(filenames):
                try:
                    doc = type.parse_file(filename)
                except:
                    # Name the offending file, then let the error propagate.
                    print('')
                    print('Error in file %s' % filename)
                    raise
                if keep_docs:
                    all_docs.append(doc)
                if not i % segment:
                    _show_progress(i, total)
        end = time.time()
    finally:
        # Do not leave the collector disabled for later runs in the same
        # process.
        if gc_was_enabled:
            gc.enable()
    gc.collect()
    print('')
    print('done.')
    if keep_docs:
        end_vsz, end_rss = get_ps()
        print('Increased VSZ: %s  Increased RSS: %s' % (
            end_vsz - start_vsz, end_rss - start_rss))
    print('Total time: %03.4f sec' % (end - start))
    print('')
    return end - start

def main(args=None):
    """Command-line entry point: benchmark each requested parser type.

    Args:
        args: list of command-line arguments; defaults to ``sys.argv[1:]``.

    Every ``--type`` name is validated against ``type_map`` before anything
    is run. Each type is then instantiated and passed to ``test_type``. When
    more than one distinct type was measured, a summary is printed that
    expresses each time as a percentage of the first type given.

    Exits with status 2 if no ``--type`` was given or a name is unknown.
    """
    if args is None:
        args = sys.argv[1:]
    parser = optparse.OptionParser()
    parser.add_option(
        '--type', '-t',
        action='append',
        dest='types',
        default=None,
        metavar='TYPE',
        help='Test this type of parser (from: %s)' % ', '.join(sorted(type_map.keys())))
    parser.add_option(
        '--no-gc',
        action='store_true',
        dest='disable_gc',
        help='Disable gc during run')
    parser.add_option(
        '--keep-docs',
        action='store_true',
        dest='keep_docs',
        help='Keep the documents after they are parsed (instead of letting them be collected)')
    parser.add_option(
        '--serialize',
        action='store_true',
        dest='serialize',
        help='Serialize after parsing')
    parser.add_option(
        '--sample-data',
        dest='sample_data',
        default=sample_data,
        metavar='DIR',
        help='Directory containing sample HTML files (default: %s)' % sample_data)
    options, args = parser.parse_args(args)
    if not options.types:
        print('No --type given')
        parser.print_help()
        sys.exit(2)
    # Reject unknown names up front so no benchmark runs before the failure.
    for type_name in options.types:
        if type_name not in type_map:
            print('No type %s' % type_name)
            sys.exit(2)
    results = {}
    for type_name in options.types:
        adapter = type_map[type_name]()
        results[type_name] = test_type(
            adapter, disable_gc=options.disable_gc,
            keep_docs=options.keep_docs,
            serialize=options.serialize,
            sample_data=options.sample_data)
    if len(results) > 1:
        print('')
        print('Summary:')
        reference_type = options.types[0]
        reference = results[reference_type]
        for type_name in options.types:
            elapsed = results[type_name]
            if reference:
                share = '%4i%%' % (100.0 * elapsed / reference)
            else:
                # A zero reference time (e.g. an empty sample directory on a
                # coarse clock) has no meaningful ratio.
                share = ' n/a'
            print('%s%s: % 04.4f sec (%s of %s)' % (
                type_name, (15 - len(type_name)) * ' ',
                elapsed, share, reference_type))
            
# Run the benchmark with the process's command-line arguments when this file
# is executed as a script rather than imported.
if __name__ == '__main__':
    main()
    
