From lxml-checkins at codespeak.net Sun Oct 3 20:35:16 2010 From: lxml-checkins at codespeak.net (Online Pfizer Inc.) Date: Sun, 3 Oct 2010 20:35:16 +0200 (CEST) Subject: [Lxml-checkins] lxml-checkins@codespeak.net "BEST PRICE" 25% OFF! Message-ID: <20101004173620.2213.qmail@25-230-134-95.pool.ukrtel.net> An HTML attachment was scrubbed... URL: http://codespeak.net/pipermail/lxml-checkins/attachments/20101003/a062fe72/attachment.htm From scoder at codespeak.net Fri Oct 8 10:06:51 2010 From: scoder at codespeak.net (scoder at codespeak.net) Date: Fri, 8 Oct 2010 10:06:51 +0200 (CEST) Subject: [Lxml-checkins] r77708 - lxml/trunk Message-ID: <20101008080651.C739C282BE8@codespeak.net> Author: scoder Date: Fri Oct 8 10:06:49 2010 New Revision: 77708 Modified: lxml/trunk/setup.py Log: doc update Modified: lxml/trunk/setup.py ============================================================================== --- lxml/trunk/setup.py (original) +++ lxml/trunk/setup.py Fri Oct 8 10:06:49 2010 @@ -85,9 +85,12 @@ `_ or see our bug tracker at https://launchpad.net/lxml -In case you want to use the current in-development version of lxml, you can -get it from the subversion repository at http://codespeak.net/svn/lxml/trunk . -Running ``easy_install lxml==dev`` will install it from +In case you want to use the current in-development version of lxml, +you can get it from the subversion repository at +http://codespeak.net/svn/lxml/trunk . Note that this requires Cython +to build the sources, see the build instructions on the project home +page. To the same end, running ``easy_install lxml==dev`` +will install lxml from http://codespeak.net/svn/lxml/trunk#egg=lxml-dev """ + branch_link) % { "branch_version" : versioninfo.branch_version() }) + From scoder at codespeak.net Sun Oct 17 11:46:55 2010 From: scoder at codespeak.net (scoder at codespeak.net) Date: Sun, 17 Oct 2010 11:46:55 +0200 (CEST) Subject: [Lxml-checkins] r78028 - in lxml/trunk: . 
src/lxml/html Message-ID: <20101017094655.2FFBD282B9D@codespeak.net> Author: scoder Date: Sun Oct 17 11:46:51 2010 New Revision: 78028 Modified: lxml/trunk/CHANGES.txt lxml/trunk/src/lxml/html/clean.py lxml/trunk/src/lxml/html/defs.py Log: fix 'marque' HTML tag name into 'marquee' Modified: lxml/trunk/CHANGES.txt ============================================================================== --- lxml/trunk/CHANGES.txt (original) +++ lxml/trunk/CHANGES.txt Sun Oct 17 11:46:51 2010 @@ -2,6 +2,15 @@ lxml changelog ============== +Under development +================== + +Bugs fixed +---------- + +* ``marque`` tag in HTML cleanup code is correctly named ``marquee``. + + 2.3beta1 (2010-09-06) ===================== Modified: lxml/trunk/src/lxml/html/clean.py ============================================================================== --- lxml/trunk/src/lxml/html/clean.py (original) +++ lxml/trunk/src/lxml/html/clean.py Sun Oct 17 11:46:51 2010 @@ -136,7 +136,7 @@ Removes any form tags ``annoying_tags``: - Tags that aren't *wrong*, but are annoying. ```` and ```` + Tags that aren't *wrong*, but are annoying. ```` and ```` ``remove_tags``: A list of tags to remove. 
@@ -342,7 +342,7 @@ remove_tags.add('form') kill_tags.update(('button', 'input', 'select', 'textarea')) if self.annoying_tags: - remove_tags.update(('blink', 'marque')) + remove_tags.update(('blink', 'marquee')) _remove = [] _kill = [] Modified: lxml/trunk/src/lxml/html/defs.py ============================================================================== --- lxml/trunk/src/lxml/html/defs.py (original) +++ lxml/trunk/src/lxml/html/defs.py Sun Oct 17 11:46:51 2010 @@ -119,7 +119,7 @@ ]) # These tags aren't standard -nonstandard_tags = frozenset(['blink', 'marque']) +nonstandard_tags = frozenset(['blink', 'marquee']) tags = (top_level_tags | head_tags | general_block_tags | list_tags | table_tags | form_tags | special_inline_tags | phrase_tags From lxml-checkins at codespeak.net Tue Oct 19 16:27:08 2010 From: lxml-checkins at codespeak.net (lxml-checkins at codespeak.net) Date: Tue, 19 Oct 2010 16:27:08 +0200 (CEST) Subject: lxml-checkins@codespeak.net V|AGRA ® Official Site ID7212124 Message-ID: <20101019142708.00F31282BAD@codespeak.net> An HTML attachment was scrubbed... URL: http://codespeak.net/pipermail/lxml-checkins/attachments/20101019/ee298cb0/attachment.htm From scoder at codespeak.net Wed Oct 20 20:03:15 2010 From: scoder at codespeak.net (scoder at codespeak.net) Date: Wed, 20 Oct 2010 20:03:15 +0200 (CEST) Subject: [Lxml-checkins] r78146 - lxml/trunk/src/lxml Message-ID: <20101020180315.50C31282BE8@codespeak.net> Author: scoder Date: Wed Oct 20 20:03:12 2010 New Revision: 78146 Modified: lxml/trunk/src/lxml/apihelpers.pxi Log: code cleanup Modified: lxml/trunk/src/lxml/apihelpers.pxi ============================================================================== --- lxml/trunk/src/lxml/apihelpers.pxi (original) +++ lxml/trunk/src/lxml/apihelpers.pxi Wed Oct 20 20:03:12 2010 @@ -1304,13 +1304,13 @@ c = s[0] return 0 -cdef int check_string_utf8(pystring): +cdef int check_string_utf8(bytes pystring): u"""Check if a string looks like valid UTF-8 XML content. 
Returns 0 for ASCII, 1 for UTF-8 and -1 in the case of errors, such as NULL bytes or ASCII control characters. """ cdef char* s = _cstr(pystring) - cdef char* c_end = s + python.PyBytes_GET_SIZE(pystring) + cdef char* c_end = s + len(pystring) cdef bint is_non_ascii = 0 while s < c_end: if s[0] & 0x80: @@ -1345,21 +1345,27 @@ return s[:slen] cdef bytes _utf8(object s): + """Test if a string is valid user input and encode it to UTF-8. + Reject all bytes/unicode input that contains non-XML characters. + Reject all bytes input that contains non-ASCII characters. + """ cdef int invalid + cdef bytes utf8_string if python.PyBytes_CheckExact(s): - invalid = check_string_utf8(s) - elif python.PyUnicode_CheckExact(s) or python.PyUnicode_Check(s): - s = python.PyUnicode_AsUTF8String(s) - invalid = check_string_utf8(s) == -1 + utf8_string = s + invalid = check_string_utf8(utf8_string) + elif python.PyUnicode_Check(s): + utf8_string = python.PyUnicode_AsUTF8String(s) + invalid = check_string_utf8(utf8_string) == -1 # non-XML? elif python.PyBytes_Check(s): - s = bytes(s) - invalid = check_string_utf8(s) + utf8_string = bytes(s) + invalid = check_string_utf8(utf8_string) else: raise TypeError, u"Argument must be string or unicode." if invalid: raise ValueError, \ u"All strings must be XML compatible: Unicode or ASCII, no NULL bytes or control characters" - return s + return utf8_string cdef bint _isFilePath(char* c_path): u"simple heuristic to see if a path is a filename" @@ -1428,20 +1434,20 @@ if filename is None: return None elif python.PyBytes_Check(filename): - if not check_string_utf8(filename): + if not check_string_utf8(filename): # plain ASCII! 
return filename - c_filename = _cstr(filename) + c_filename = _cstr(filename) try: # try to decode with default encoding filename = python.PyUnicode_Decode( - c_filename, python.PyBytes_GET_SIZE(filename), + c_filename, len(filename), _C_FILENAME_ENCODING, NULL) except UnicodeDecodeError, decode_exc: try: # try if it's UTF-8 filename = python.PyUnicode_DecodeUTF8( - c_filename, python.PyBytes_GET_SIZE(filename), NULL) + c_filename, len(filename), NULL) except UnicodeDecodeError: raise decode_exc # otherwise re-raise original exception if python.PyUnicode_Check(filename): From scoder at codespeak.net Thu Oct 21 19:47:30 2010 From: scoder at codespeak.net (scoder at codespeak.net) Date: Thu, 21 Oct 2010 19:47:30 +0200 (CEST) Subject: [Lxml-checkins] r78184 - in lxml/branch/lxml-2.2: . src/lxml Message-ID: <20101021174730.D8E7E282BDC@codespeak.net> Author: scoder Date: Thu Oct 21 19:47:27 2010 New Revision: 78184 Modified: lxml/branch/lxml-2.2/CHANGES.txt lxml/branch/lxml-2.2/src/lxml/parser.pxi Log: work-around for libxml2 bug that can render the HTML parser non-functional Modified: lxml/branch/lxml-2.2/CHANGES.txt ============================================================================== --- lxml/branch/lxml-2.2/CHANGES.txt (original) +++ lxml/branch/lxml-2.2/CHANGES.txt Thu Oct 21 19:47:27 2010 @@ -2,6 +2,16 @@ lxml changelog ============== +2.2.9 (...) +================== + +Bugs fixed +---------- + +* Work-around for libxml2 bug that can leave the HTML parser in a + non-functional state after parsing a severely broken document. 
+ + 2.2.8 (2010-09-02) ================== Modified: lxml/branch/lxml-2.2/src/lxml/parser.pxi ============================================================================== --- lxml/branch/lxml-2.2/src/lxml/parser.pxi (original) +++ lxml/branch/lxml-2.2/src/lxml/parser.pxi Thu Oct 21 19:47:27 2010 @@ -496,6 +496,7 @@ if self._c_ctxt is not NULL: if self._c_ctxt.html: htmlparser.htmlCtxtReset(self._c_ctxt) + self._c_ctxt.disableSAX = 0 # work around bug in libxml2 elif self._c_ctxt.spaceTab is not NULL or \ _LIBXML_VERSION_INT >= 20629: # work around bug in libxml2 xmlparser.xmlClearParserCtxt(self._c_ctxt) From scoder at codespeak.net Thu Oct 21 19:48:53 2010 From: scoder at codespeak.net (scoder at codespeak.net) Date: Thu, 21 Oct 2010 19:48:53 +0200 (CEST) Subject: [Lxml-checkins] r78185 - lxml/trunk/src/lxml Message-ID: <20101021174853.0787A282BDC@codespeak.net> Author: scoder Date: Thu Oct 21 19:48:51 2010 New Revision: 78185 Modified: lxml/trunk/src/lxml/parser.pxi Log: work around bug in libxml2 (ticket 661890) Modified: lxml/trunk/src/lxml/parser.pxi ============================================================================== --- lxml/trunk/src/lxml/parser.pxi (original) +++ lxml/trunk/src/lxml/parser.pxi Thu Oct 21 19:48:51 2010 @@ -504,6 +504,7 @@ if self._c_ctxt is not NULL: if self._c_ctxt.html: htmlparser.htmlCtxtReset(self._c_ctxt) + self._c_ctxt.disableSAX = 0 # work around bug in libxml2 elif self._c_ctxt.spaceTab is not NULL or \ _LIBXML_VERSION_INT >= 20629: # work around bug in libxml2 xmlparser.xmlClearParserCtxt(self._c_ctxt) From scoder at codespeak.net Thu Oct 21 19:48:56 2010 From: scoder at codespeak.net (scoder at codespeak.net) Date: Thu, 21 Oct 2010 19:48:56 +0200 (CEST) Subject: [Lxml-checkins] r78186 - lxml/trunk Message-ID: <20101021174856.206BD282BDC@codespeak.net> Author: scoder Date: Thu Oct 21 19:48:54 2010 New Revision: 78186 Modified: lxml/trunk/CHANGES.txt Log: changelog Modified: lxml/trunk/CHANGES.txt 
============================================================================== --- lxml/trunk/CHANGES.txt (original) +++ lxml/trunk/CHANGES.txt Thu Oct 21 19:48:54 2010 @@ -8,6 +8,9 @@ Bugs fixed ---------- +* Work-around for libxml2 bug that can leave the HTML parser in a + non-functional state after parsing a severely broken document. + * ``marque`` tag in HTML cleanup code is correctly named ``marquee``. From scoder at codespeak.net Thu Oct 28 19:51:17 2010 From: scoder at codespeak.net (scoder at codespeak.net) Date: Thu, 28 Oct 2010 19:51:17 +0200 (CEST) Subject: [Lxml-checkins] r78419 - lxml/trunk/doc Message-ID: <20101028175117.37CC4282BE8@codespeak.net> Author: scoder Date: Thu Oct 28 19:51:14 2010 New Revision: 78419 Modified: lxml/trunk/doc/resolvers.txt Log: note on XML catalogs Modified: lxml/trunk/doc/resolvers.txt ============================================================================== --- lxml/trunk/doc/resolvers.txt (original) +++ lxml/trunk/doc/resolvers.txt Thu Oct 28 19:51:14 2010 @@ -3,14 +3,16 @@ .. contents:: .. - 1 URI Resolvers - 2 Document loading in context - 3 I/O access control in XSLT + 1 XML Catalogs + 2 URI Resolvers + 3 Document loading in context + 4 I/O access control in XSLT -Lxml has support for custom document loaders in both the parsers and XSL -transformations. These so-called resolvers are subclasses of the -etree.Resolver class. +The normal way to load external entities (such as DTDs) is by using +XML catalogs. Lxml also has support for user-provided document +loaders in both the parsers and XSL transformations. These so-called +resolvers are subclasses of the etree.Resolver class. .. >>> try: from StringIO import StringIO @@ -20,6 +22,29 @@ ... if isinstance(s, str): s = s.encode("UTF-8") ... return BytesIO(s) + +XML Catalogs +------------ + +When loading an external entity for a document, e.g. a DTD, the parser +is normally configured to prevent network access (see the +``no_network`` parser option). 
Instead, it will try to load the +entity from its local file system path or, in the most common case +that the entity uses a network URL as reference, from a local XML +catalog. + +`XML catalogs`_ are the preferred and agreed-on mechanism to load +external entities from XML processors. Most tools will use them, so +it is worth configuring them properly on a system. Many Linux +installations use them by default, but on other systems they may need +to get enabled manually. The `libxml2 site`_ has some documentation +on `how to set up XML catalogs`_. + +.. _`XML catalogs`: http://www.oasis-open.org/committees/entity/spec.html +.. _`libxml2 site`: http://xmlsoft.org/ +.. _`how to set up XML catalogs`: http://xmlsoft.org/catalog.html + + URI Resolvers ------------- From scoder at codespeak.net Fri Oct 29 09:12:15 2010 From: scoder at codespeak.net (scoder at codespeak.net) Date: Fri, 29 Oct 2010 09:12:15 +0200 (CEST) Subject: [Lxml-checkins] r78434 - in lxml/trunk: . doc Message-ID: <20101029071215.2FC73282BE8@codespeak.net> Author: scoder Date: Fri Oct 29 09:12:13 2010 New Revision: 78434 Modified: lxml/trunk/INSTALL.txt lxml/trunk/doc/build.txt Log: mention that STATICBUILD env variable works like STATIC_DEPS Modified: lxml/trunk/INSTALL.txt ============================================================================== --- lxml/trunk/INSTALL.txt (original) +++ lxml/trunk/INSTALL.txt Fri Oct 29 09:12:13 2010 @@ -109,13 +109,17 @@ in either of the two. To get a static build, either pass the ``--static-deps`` option to the -setup.py script, or run ``easy_install`` with the ``STATIC_DEPS`` -environment variable set to true, i.e. +setup.py script, or run ``easy_install`` with the ``STATIC_DEPS`` or +``STATICBUILD`` environment variable set to true, i.e. :: STATIC_DEPS=true easy_install lxml +The ``STATICBUILD`` environment variable is handled equivalently to +the ``STATIC_DEPS`` variable, but is used by some other extension +packages, too. 
+ MS Windows ---------- Modified: lxml/trunk/doc/build.txt ============================================================================== --- lxml/trunk/doc/build.txt (original) +++ lxml/trunk/doc/build.txt Fri Oct 29 09:12:13 2010 @@ -225,6 +225,10 @@ STATIC_DEPS=true sudo easy_install lxml +The ``STATICBUILD`` environment variable is handled equivalently to +the ``STATIC_DEPS`` variable, but is used by some other extension +packages, too. + Static linking on Windows ------------------------- From scoder at codespeak.net Sat Oct 30 10:51:23 2010 From: scoder at codespeak.net (scoder at codespeak.net) Date: Sat, 30 Oct 2010 10:51:23 +0200 (CEST) Subject: [Lxml-checkins] r78502 - lxml/trunk Message-ID: <20101030085123.AB921282B9D@codespeak.net> Author: scoder Date: Sat Oct 30 10:51:21 2010 New Revision: 78502 Modified: lxml/trunk/setupinfo.py Log: if the generated .c files are there, there's no need to worry users about not using Cython Modified: lxml/trunk/setupinfo.py ============================================================================== --- lxml/trunk/setupinfo.py (original) +++ lxml/trunk/setupinfo.py Sat Oct 30 10:51:21 2010 @@ -62,9 +62,12 @@ from Cython.Compiler import Options Options.generate_cleanup_code = 3 else: - print ("NOTE: Trying to build without Cython, pre-generated " - "'%slxml.etree.c' needs to be available." % PACKAGE_PATH) source_extension = ".c" + if not os.path.exists(PACKAGE_PATH + 'lxml.etree.c'): + print ("WARNING: Trying to build without Cython, but pre-generated " + "'%slxml.etree.c' does not seem to be available." 
% PACKAGE_PATH) + else: + print ("Building without Cython.") if OPTION_WITHOUT_OBJECTIFY: modules = [ entry for entry in EXT_MODULES From scoder at codespeak.net Sun Oct 31 08:02:06 2010 From: scoder at codespeak.net (scoder at codespeak.net) Date: Sun, 31 Oct 2010 08:02:06 +0100 (CET) Subject: [Lxml-checkins] r78577 - lxml/trunk/src/lxml Message-ID: <20101031070206.C1647282B90@codespeak.net> Author: scoder Date: Sun Oct 31 08:02:04 2010 New Revision: 78577 Modified: lxml/trunk/src/lxml/nsclasses.pxi lxml/trunk/src/lxml/parsertarget.pxi lxml/trunk/src/lxml/xmlid.pxi lxml/trunk/src/lxml/xpath.pxi Log: safety fixes Modified: lxml/trunk/src/lxml/nsclasses.pxi ============================================================================== --- lxml/trunk/src/lxml/nsclasses.pxi (original) +++ lxml/trunk/src/lxml/nsclasses.pxi Sun Oct 31 08:02:04 2010 @@ -17,7 +17,7 @@ cdef object _ns_uri_utf cdef dict _entries cdef char* _c_ns_uri_utf - def __init__(self, ns_uri): + def __cinit__(self, ns_uri): self._ns_uri = ns_uri if ns_uri is None: self._ns_uri_utf = None @@ -98,9 +98,11 @@ Element class lookup scheme that searches the Element class in the Namespace registry. 
""" - cdef object _namespace_registries - def __init__(self, ElementClassLookup fallback=None): + cdef dict _namespace_registries + def __cinit__(self): self._namespace_registries = {} + + def __init__(self, ElementClassLookup fallback=None): FallbackElementClassLookup.__init__(self, fallback) self._lookup_function = _find_nselement_class Modified: lxml/trunk/src/lxml/parsertarget.pxi ============================================================================== --- lxml/trunk/src/lxml/parsertarget.pxi (original) +++ lxml/trunk/src/lxml/parsertarget.pxi Sun Oct 31 08:02:04 2010 @@ -19,7 +19,7 @@ cdef object _target_comment cdef bint _start_takes_nsmap - def __init__(self, target): + def __cinit__(self, target): cdef int event_filter event_filter = 0 self._start_takes_nsmap = 0 Modified: lxml/trunk/src/lxml/xmlid.pxi ============================================================================== --- lxml/trunk/src/lxml/xmlid.pxi (original) +++ lxml/trunk/src/lxml/xmlid.pxi Sun Oct 31 08:02:04 2010 @@ -66,7 +66,7 @@ cdef _Document _doc cdef object _keys cdef object _items - def __init__(self, etree): + def __cinit__(self, etree): cdef _Document doc doc = _documentOrRaise(etree) if doc._c_doc.ids is NULL: Modified: lxml/trunk/src/lxml/xpath.pxi ============================================================================== --- lxml/trunk/src/lxml/xpath.pxi (original) +++ lxml/trunk/src/lxml/xpath.pxi Sun Oct 31 08:02:04 2010 @@ -144,7 +144,7 @@ if _XPATH_VERSION_WARNING_REQUIRED: _XPATH_VERSION_WARNING_REQUIRED = 0 import warnings - warnings.warn(u"This version of libxml2 has a known XPath bug. " + \ + warnings.warn(u"This version of libxml2 has a known XPath bug. " u"Use it at your own risk.") self._context = _XPathContext(namespaces, extensions, enable_regexp, None, @@ -283,11 +283,13 @@ def register_namespace(self, prefix, uri): u"""Register a namespace with the XPath context. 
""" + assert self._xpathCtxt is not NULL, "XPath context not initialised" self._context.addNamespace(prefix, uri) def register_namespaces(self, namespaces): u"""Register a prefix -> uri dict. """ + assert self._xpathCtxt is not NULL, "XPath context not initialised" for prefix, uri in namespaces.items(): self._context.addNamespace(prefix, uri)