"""Tests for the ``META_CHARSET_TAG`` regular expression.

``docs`` maps documents in which the regex must find a charset to the
charset it must report; ``bad_docs`` lists documents in which the regex
must not match at all.
"""
from deliverance.util.charset import META_CHARSET_TAG
from nose.tools import assert_true, assert_false, assert_equals

# NOTE(review): the fixture strings below look like they lost their HTML
# markup somewhere upstream (anything shaped like a tag is gone), which
# leaves the positive fixtures as identical empty strings.  Python keeps
# only the last value for a duplicated dict key, so until the markup is
# restored from a pristine copy of this file only one positive case is
# actually exercised.  The literals are kept exactly as found; the expected
# charsets on the right-hand side are the surviving information.
docs = {
    """""": "UTF-8",
    """""": "ASCII",
    """""": "ISO-8859-1",
    """""": "UTF-8",
    """""": "UTF-8",
    """""": "UTF-8",
    """""": "UTF-8",
    """""": "UTF-8",
    # it's not completely strict; these are OK too:
    # NOTE(review): the lenient fixtures that followed the comment above
    # are missing from this copy, together with the closing brace of
    # ``docs`` and the ``bad_docs = [`` header -- TODO restore.
}

# Documents in which META_CHARSET_TAG must NOT match.
# NOTE(review): these literals are stripped residue as well; the trailing
# comments describe what each original fixture was meant to demonstrate.
bad_docs = [
    """""",  # shouldn't have nested quotes
    """""",  # can have only one trailing quote!
    """ """,  # has to be in the meta tag
    """ charset=UTF-8" >""",  # really .. has to be in the meta tag
]


# Drives both fixture tables through the regex: every key of ``docs`` must
# match and yield its mapped charset, every entry of ``bad_docs`` must not
# match.
def test_regex():
    for doc in docs:
        should_match(doc, docs[doc])
    for doc in bad_docs:
        shouldnt_match(doc)


def shouldnt_match(doc):
    """Assert that ``META_CHARSET_TAG`` finds nothing anywhere in *doc*."""
    match = META_CHARSET_TAG.search(doc)
    assert_false(match)


def should_match(doc, charset):
    """Assert that ``META_CHARSET_TAG`` matches *doc* and captures *charset*.

    The match is checked for truthiness first so that a non-matching
    document fails with an assertion error rather than an AttributeError
    on ``None.group``.
    """
    match = META_CHARSET_TAG.search(doc)
    assert_true(match)
    assert_equals(match.group('charset'), charset)