[lxml-dev] Ask for help about lxml usage

Stefan Behnel stefan_ml at behnel.de
Wed May 13 09:18:50 CEST 2009


qhlonline wrote:
> My code of multi-thread parsing test is as follows, May be there are some
> problems, and I am very glad to get your suggestion. I have created eight
> threads in this program and they collaborate to parse a single HTML file
> 1000 times, the result I want is the time used.


> from lxml import etree
> import time
>
> import threading
> class TargetParser(object):
>  # The Target Parser Def:
>     def __init__(self):
>         self.ImgTag   = 0
>         self.StyleTag = 0
>         self.ScriptTag= 0
>     def start(self, tag, attrib):
>         if tag == 'img':
>             self.ImgTag = self.ImgTag+1
>         elif tag == 'style':
>             self.StyleTag = self.StyleTag+1
>         elif tag == 'script':
>             self.ScriptTag = self.ScriptTag+1
>         else:
>             pass
>     def end(self, tag):
>         pass
>     def close(self):
>         return self

I'd strip this altogether and go with separate parse/search phases.


> class MultiThread:
>     def __init__(self):
>         self.circle = 0
>         self.timeres = 0.0
>
>     def CircleParse(self, webpath, circles=1000):
>         self.webpath=webpath
>         self.circles=circles
>         self.lock=thread.allocate_lock()
>         self.lock2=thread.allocate_lock()
>         starttime = time.time()
>         thread.start_new_thread(self.NewParse,())
>         thread.start_new_thread(self.NewParse,())
>
>         thread.start_new_thread(self.NewParse,())
>         thread.start_new_thread(self.NewParse,())
>         thread.start_new_thread(self.NewParse,())
>         thread.start_new_thread(self.NewParse,())
>         thread.start_new_thread(self.NewParse,())
>         thread.start_new_thread(self.NewParse,())

I'd use a for-loop here.

>         self.lock2.acquire()
>         self.lock2.acquire()

This bothers me. Not only does it appear twice, but you also need an
explicit lock in your program.


>         timeres = time.time()-starttime  # total time
>         print 'In MultiThread Parser, the time Consume is : ',timeres, '
> Seconds!\n'
>
>     def NewParse(self):
>
>         MyParser = TargetParser()
>         Parser=etree.HTMLParser(target=MyParser)  #The parser used in this
> thread

A general note on naming in Python programs: you should skim through PEP
8. Your code is rather hard to read as it uses AllCamelCase names for
class names, method names and some local variables, i.e. things that are
semantically very different. It's common to use it for class names, but
everything else should use lower-case separated by underscores.


>         while (self.circle < self.circles):
>             if(self.circle >= self.circles):
>                 break
>
>             res = etree.parse(self.webpath,Parser)
>
>             if(self.lock.acquire()):
>                 self.circle = self.circle + 1
>                 if(self.circle >= self.circles and self.lock2.locked()):
>
>                     self.lock2.release()
>                     break
>                 self.lock.release()

Ok, my take would be this (although it's completely(!) untested):

    from Queue import Queue
    from threading import Thread
    from lxml import etree

    # One (tagname, compiled XPath) pair per tag we want to count; each XPath
    # evaluates count(//<tagname>) against a parsed document.
    counted_tags = ('img', 'style', 'script')
    tag_counters = [(name, etree.XPath('count(//%s)' % name))
                    for name in counted_tags]

    def start_threads(func, thread_count, *args):
        """Start ``thread_count`` daemon threads that each run ``func(*args)``.

        The threads are started immediately; they are neither returned nor
        joined.  A ``thread_count`` of zero starts nothing.
        """
        for _ in range(thread_count):
            thread = Thread(target=func, args=args)
            # Mark the thread as a daemon before start(); setDaemon() without
            # an argument raises TypeError, so set the flag explicitly.
            thread.daemon = True
            thread.start()

    def handle_urls(url_queue, result_queue):
        """Worker loop: parse each queued URL and report its tag counts.

        Blocks on ``url_queue`` for the next URL, parses it with its own
        ``etree.HTMLParser`` and puts a list of ``(tagname, count)`` pairs,
        one per entry in ``tag_counters``, on ``result_queue``.  If an
        ``Exception`` is raised along the way, the exception object is put
        on ``result_queue`` instead.  Never returns.
        """
        parser = etree.HTMLParser()
        while True:  # I'm a daemon, so I don't care
            try:
                url = url_queue.get()
                doc = etree.parse(url, parser)
                result = [(tagname, count(doc))
                          for tagname, count in tag_counters]
                result_queue.put(result)
            except Exception as e:
                # catch-all to make sure we report all 'normal' exceptions
                result_queue.put(e)
            # Free space while we wait -- on the error path as well, since a
            # document may already have been parsed when a counter failed.
            doc = result = e = None

    # run benchmark

    in_queue = Queue()
    out_queue = Queue()
    start_threads(handle_urls, 10, in_queue, out_queue)

    from time import time
    # The number of results collected must equal the number of URLs queued,
    # otherwise the second loop blocks forever on out_queue.get().
    job_count = 100
    t = time()
    for _ in range(job_count):
        # Three slashes: an empty host followed by the absolute path.  With
        # only two, "tmp" would be taken as the host name of the URL.
        in_queue.put("file:///tmp/somefile.html")
    for _ in range(job_count):
        print(out_queue.get())
    print(time() - t)


... minus some glitches, but I bet you can fix them and post a better
version.

Have fun,

Stefan



More information about the lxml-dev mailing list