#!/usr/bin/env python from __future__ import generators import sys from unittest import TestCase try: enumerate except NameError: def enumerate(it): i = 0 while 1: v = it.next() yield i, v i += 1 def peek_token(p): tok = p.get_token() p.unget_token(tok) return tok class UnescapeTests(TestCase): def test_unescape_charref(self): from pullparser import unescape_charref, get_entitydefs mdash_utf8 = u"\u2014".encode("utf-8") for ref, codepoint, utf8, latin1 in [ ("38", 38, u"&".encode("utf-8"), "&"), ("x2014", 0x2014, mdash_utf8, "—"), ("8212", 8212, mdash_utf8, "—"), ]: self.assertEqual(unescape_charref(ref, None), unichr(codepoint)) self.assertEqual(unescape_charref(ref, 'latin-1'), latin1) self.assertEqual(unescape_charref(ref, 'utf-8'), utf8) def test_get_entitydefs(self): from pullparser import get_entitydefs ed = get_entitydefs() for name, char in [ ("&", u"&"), ("<", u"<"), (">", u">"), ("—", u"\u2014"), ("♠", u"\u2660"), ]: self.assertEqual(ed[name], char) def test_unescape(self): import htmlentitydefs import pullparser data = "& < — — —" mdash_utf8 = u"\u2014".encode("utf-8") ue = pullparser.unescape(data, pullparser.get_entitydefs(), "utf-8") self.assertEqual("& < %s %s %s" % ((mdash_utf8,)*3), ue) for text, expect in [ ("&a&", "&a&"), ("a&", "a&"), ]: got = pullparser.unescape(text, pullparser.get_entitydefs(), "latin-1") self.assertEqual(got, expect) class PullParserTests(TestCase): from pullparser import PullParser, TolerantPullParser PARSERS = [(PullParser, False), (TolerantPullParser, True)] def data_and_file(self): from StringIO import StringIO data = """ Title

This is a data blah & a & that was an entityref and this a is a charref. . """ #" f = StringIO(data) return data, f def test_encoding(self): import pullparser #for pc, tolerant in [(pullparser.PullParser, False)]:#PullParserTests.PARSERS: for pc, tolerant in PullParserTests.PARSERS: self._test_encoding(pc, tolerant) def _test_encoding(self, parser_class, tolerant): from StringIO import StringIO datas = ["ф", "ф"] def get_text(data, encoding): p = _get_parser(data, encoding) p.get_tag("a") return p.get_text() def get_attr(data, encoding, et_name, attr_name): p = _get_parser(data, encoding) while True: tag = p.get_tag(et_name) attrs = tag.attrs if attrs is not None: break return dict(attrs)[attr_name] def _get_parser(data, encoding): f = StringIO(data) p = parser_class(f, encoding=encoding) #print 'p._entitydefs>>%s<<' % p._entitydefs['—'] return p for data in datas: self.assertEqual(get_text(data, "KOI8-R"), "\xc6") self.assertEqual(get_text(data, "UTF-8"), "\xd1\x84") self.assertEqual(get_text("", "UTF-8"), u"\u2014".encode('utf8')) self.assertEqual( get_attr('blah', "UTF-8", "a", "name"), u"\u2014".encode('utf8')) self.assertEqual(get_text("", "ascii"), "—") # response = urllib.addinfourl(f, {"content-type": "text/html; charset=XXX"}, req.get_full_url()) def test_get_token(self): for pc, tolerant in PullParserTests.PARSERS: self._test_get_token(pc, tolerant) def _test_get_token(self, parser_class, tolerant): data, f = self.data_and_file() p = parser_class(f) from pullparser import NoMoreTokensError self.assertEqual( p.get_token(), ("decl", '''DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd"''', None)) self.assertEqual(p.get_token(), ("data", "\n", None)) self.assertEqual(p.get_token(), ("starttag", "html", [])) self.assertEqual(p.get_token(), ("data", "\n", None)) self.assertEqual(p.get_token(), ("starttag", "head", [])) self.assertEqual(p.get_token(), ("data", "\n", None)) self.assertEqual(p.get_token(), ("starttag", "title", [("an", "attr")])) self.assertEqual(p.get_token(), ("data", "Title", None)) self.assertEqual(p.get_token(), ("endtag", "title", None)) self.assertEqual(p.get_token(), ("data", "\n", None)) self.assertEqual(p.get_token(), ("endtag", "head", None)) self.assertEqual(p.get_token(), ("data", "\n", None)) self.assertEqual(p.get_token(), ("starttag", "body", [])) self.assertEqual(p.get_token(), ("data", "\n", None)) self.assertEqual(p.get_token(), ("starttag", "p", [])) self.assertEqual(p.get_token(), ("data", "This is a data ", None)) self.assertEqual(p.get_token(), ("starttag", "img", [("alt", "blah & a")])) self.assertEqual(p.get_token(), ("data", " ", None)) self.assertEqual(p.get_token(), ("entityref", "amp", None)) self.assertEqual(p.get_token(), ("data", " that was an entityref and this ", None)) self.assertEqual(p.get_token(), ("charref", "097", None)) self.assertEqual(p.get_token(), ("data", " is\na charref. ", None)) self.assertEqual(p.get_token(), ("starttag", "blah", [("foo", "bing"), ("blam", "wallop")])) self.assertEqual(p.get_token(), ("data", ".\n", None)) self.assertEqual(p.get_token(), ( "comment", " comment blah blah\n" "still a comment , blah and a space at the end \n", None)) self.assertEqual(p.get_token(), ("data", "\n", None)) self.assertEqual(p.get_token(), ("decl", "rheum", None)) self.assertEqual(p.get_token(), ("data", "\n", None)) self.assertEqual(p.get_token(), ("pi", "rhaponicum", None)) self.assertEqual(p.get_token(), ("data", "\n", None)) self.assertEqual(p.get_token(), ( (tolerant and "starttag" or "startendtag"), "randomtag", [("spam", "eggs")])) self.assertEqual(p.get_token(), ("data", "\n", None)) self.assertEqual(p.get_token(), ("endtag", "body", None)) self.assertEqual(p.get_token(), ("data", "\n", None)) self.assertEqual(p.get_token(), ("endtag", "html", None)) self.assertEqual(p.get_token(), ("data", "\n", None)) self.assertRaises(NoMoreTokensError, p.get_token) # print "token", p.get_token() # sys.exit() def test_unget_token(self): for pc, tolerant in PullParserTests.PARSERS: self._test_unget_token(pc, tolerant) def _test_unget_token(self, parser_class, tolerant): from pullparser import NoMoreTokensError data, f = self.data_and_file() p = parser_class(f) p.get_token() tok = p.get_token() self.assertEqual(tok, ("data", "\n", None)) p.unget_token(tok) self.assertEqual(p.get_token(), ("data", "\n", None)) tok = p.get_token() self.assertEqual(tok, ("starttag", "html", [])) p.unget_token(tok) self.assertEqual(tok, ("starttag", "html", [])) def test_get_tag(self): for pc, tolerant in PullParserTests.PARSERS: self._test_get_tag(pc, tolerant) def _test_get_tag(self, parser_class, tolerant): from pullparser import NoMoreTokensError data, f = self.data_and_file() p = parser_class(f) self.assertEqual(p.get_tag(), ("starttag", "html", [])) self.assertEqual(p.get_tag("blah", "body", "title"), ("starttag", "title", [("an", "attr")])) self.assertEqual(p.get_tag(), ("endtag", "title", None)) self.assertEqual(p.get_tag("randomtag"), ((tolerant and "starttag" or "startendtag"), "randomtag", [("spam", "eggs")])) self.assertEqual(p.get_tag(), ("endtag", "body", None)) self.assertEqual(p.get_tag(), ("endtag", "html", None)) self.assertRaises(NoMoreTokensError, p.get_tag) # print "tag", p.get_tag() # sys.exit() def test_get_text(self): for pc, tolerant in PullParserTests.PARSERS: self._test_get_text(pc, tolerant) def _test_get_text(self, parser_class, tolerant): from pullparser import NoMoreTokensError data, f = self.data_and_file() p = parser_class(f) self.assertEqual(p.get_text(), "\n") self.assertEqual(peek_token(p).data, "html") self.assertEqual(p.get_text(), "") self.assertEqual(peek_token(p).data, "html"); p.get_token() self.assertEqual(p.get_text(), "\n"); p.get_token() self.assertEqual(p.get_text(), "\n"); p.get_token() self.assertEqual(p.get_text(), "Title"); p.get_token() self.assertEqual(p.get_text(), "\n"); p.get_token() self.assertEqual(p.get_text(), "\n"); p.get_token() self.assertEqual(p.get_text(), "\n"); p.get_token() self.assertEqual(p.get_text(), "This is a data blah & a[IMG]"); p.get_token() self.assertEqual(p.get_text(), " & that was an entityref " "and this a is\na charref. "); p.get_token() self.assertEqual(p.get_text(), ".\n\n\n\n"); p.get_token() self.assertEqual(p.get_text(), "\n"); p.get_token() self.assertEqual(p.get_text(), "\n"); p.get_token() self.assertEqual(p.get_text(), "\n"); p.get_token() # no more tokens, so we just get empty string self.assertEqual(p.get_text(), "") self.assertEqual(p.get_text(), "") self.assertRaises(NoMoreTokensError, p.get_token) #print "text", `p.get_text()` #sys.exit() def test_get_text_2(self): for pc, tolerant in PullParserTests.PARSERS: self._test_get_text_2(pc, tolerant) def _test_get_text_2(self, parser_class, tolerant): # more complicated stuff from pullparser import NoMoreTokensError # endat data, f = self.data_and_file() p = parser_class(f) self.assertEqual(p.get_text(endat=("endtag", "html")), u"\n\n\nTitle\n\n\nThis is a data blah & a[IMG]" " & that was an entityref and this a is\na charref. ." "\n\n\n\n\n\n") f.close() data, f = self.data_and_file() p = parser_class(f) self.assertEqual(p.get_text(endat=("endtag", "title")), "\n\n\nTitle") self.assertEqual(p.get_text(endat=("starttag", "img")), "\n\n\nThis is a data blah & a[IMG]") f.close() # textify arg data, f = self.data_and_file() p = parser_class(f, textify={"title": "an", "img": lambda x: "YYY"}) self.assertEqual(p.get_text(endat=("endtag", "title")), "\n\n\nattr[TITLE]Title") self.assertEqual(p.get_text(endat=("starttag", "img")), "\n\n\nThis is a data YYY") f.close() # get_compressed_text data, f = self.data_and_file() p = parser_class(f) self.assertEqual(p.get_compressed_text(endat=("endtag", "html")), u"Title This is a data blah & a[IMG]" " & that was an entityref and this a is a charref. .") f.close() def test_tags(self): for pc, tolerant in PullParserTests.PARSERS: self._test_tags(pc, tolerant) def _test_tags(self, parser_class, tolerant): from pullparser import NoMoreTokensError # no args data, f = self.data_and_file() p = parser_class(f) expected_tag_names = [ "html", "head", "title", "title", "head", "body", "p", "img", "blah", "randomtag", "body", "html" ] for i, token in enumerate(p.tags()): self.assertEquals(token.data, expected_tag_names[i]) f.close() # tag name args data, f = self.data_and_file() p = parser_class(f) expected_tokens = [ ("starttag", "head", []), ("endtag", "head", None), ("starttag", "p", []), ] for i, token in enumerate(p.tags("head", "p")): self.assertEquals(token, expected_tokens[i]) f.close() def test_tokens(self): for pc, tolerant in PullParserTests.PARSERS: self._test_tokens(pc, tolerant) def _test_tokens(self, parser_class, tolerant): from pullparser import NoMoreTokensError # no args data, f = self.data_and_file() p = parser_class(f) expected_token_types = [ "decl", "data", "starttag", "data", "starttag", "data", "starttag", "data", "endtag", "data", "endtag", "data", "starttag", "data", "starttag", "data", "starttag", "data", "entityref", "data", "charref", "data", "starttag", "data", "comment", "data", "decl", "data", "pi", "data", (tolerant and "starttag" or "startendtag"), "data", "endtag", "data", "endtag", "data" ] for i, token in enumerate(p.tokens()): self.assertEquals(token.type, expected_token_types[i]) f.close() # token type args data, f = self.data_and_file() p = parser_class(f) expected_tokens = [ ("entityref", "amp", None), ("charref", "097", None), ] for i, token in enumerate(p.tokens("charref", "entityref")): self.assertEquals(token, expected_tokens[i]) f.close() def test_token_eq(self): from pullparser import Token for (a, b) in [ (Token('endtag', 'html', None), ('endtag', 'html', None)), (Token('endtag', 'html', {'woof': 'bark'}), ('endtag', 'html', {'woof': 'bark'})), ]: self.assertEquals(a, a) self.assertEquals(a, b) self.assertEquals(b, a) if __name__ == "__main__": import unittest unittest.main()