"""HTML table parsing.
ClientTable is a Python module for generic HTML table parsing. It is
most useful when used in conjunction with other parsers (htmllib or
HTMLParser, regular expressions, etc.), to divide up the parsing work
between your own code and ClientTable.
RFC 1866: HTML 2.0
RFC 1942: HTML Tables
HTML 4.01 Specification, W3C Recommendation 24 December 1999
Requires Python 2.2.
Copyright 2002-2003 John J. Lee
This code is free software; you can redistribute it and/or modify it
under the terms of the MIT License (see the file COPYING included with
the distribution).
"""
# XXXX
# get_cell method on HTMLForm! (row_nr, col_nr), and maybe other ways of
# specifying, too.
# TDs and THs can contain tables too!
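# Need to have a class representing data outside of any cell, e.g. text that
# appears directly inside a TABLE or TR element (the markup example that
# originally followed here has been lost from this comment).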
# Need to be able to match headers, rather than requiring exact string match.
# Maybe indexing should do this? Should indexing ignore HTML tags?
# Current plan: match by substring; treat string as re if re_search boolean arg
# is true; tags are ignored or not as per strip_tags arg of ParseFile.
# ignore_tags and exact_match args can be added later.
# Accept string or string-*like* args?
# Character entities.
# Implement single_span.
# Implement nr_toplevel_to_parse.
# XXX
# TD elements often indicate a scope -- this could be used to find cells,
# so there should probably be a method for this.
# Need an interface for getting at HTML attributes in tables
# What to do with illegally overlapping elements (rows and rows, cols and
# cols, rows and cols)?
# More relaxed parsing?
# Incremental parsing of tables, while HTML is still downloading?
# Deal with character sets and unicode properly.
# Notes
# -----
# How to cope with non-unity span of rows and columns?
# Cell class has rowspan and colspan attrs., and a data attr, with the
# text in it.
# Cells with non-unity span get returned twice (or more) by iterators
# and by indexing. Clients need to know when they're getting the same
# cell, so allow them to assume that identical cell objects will be
# returned.
# Directionality doesn't matter.
# Perl's HTML::TableExtractor has a very elaborate system for parsing only
# those tables that match particular constraints. I don't think I care
# enough about this to copy it. HTML::TableExtractor can (for example):
# Find table by headers, or depth and index, or both.
# Depth only returns all tables at that depth. Index only returns
# all tables at that index. Headers only returns all tables anywhere
# with those headers. Otherwise, all tables match.
# Chaining: {"headers": ["Senator", "Sugar?"],
# "chain": {"headers": ["blah"]}}
# Compatibility shim for interpreters that do not provide the True / False
# builtins: if looking up True raises NameError, define integer stand-ins.
try: True
except NameError:
    True = 1
    False = 0
import HTMLParser, re, copy, string
from htmlentitydefs import entitydefs
from types import StringType, UnicodeType, \
IntType, LongType, FloatType
# Version string of this module.
VERSION = "0.0.2a-pre1"

# Size, in bytes, of the chunks fed to the parser.
CHUNK = 1024

# Matches a run of one or more whitespace characters.
WHITESPACE_RE = re.compile(r"\s+")


def collapse_whitespace(text):
    """Return text with each run of whitespace replaced by a single space."""
    single_space = " "
    return WHITESPACE_RE.sub(single_space, text)
# Tuple of the built-in numeric type objects, suitable as the second argument
# to isinstance().
NumericTypes = (IntType, LongType, FloatType)
def issequence(object):
    """Test whether object is a sequence.

    The test is whether indexing with the integer 0 is supported.  Returns 1
    if object[0] succeeds or raises IndexError (an empty sequence), and 0 if
    it raises TypeError or KeyError.
    """
    try:
        object[0]
    except IndexError:
        # Indexable, just empty.
        return 1
    except (TypeError, KeyError):
        # Not indexable at all, or a mapping without the key 0.
        return 0
    else:
        return 1
def isstringlike(object):
    """Return True if object is a StringType or UnicodeType instance."""
    for string_type in (StringType, UnicodeType):
        if isinstance(object, string_type):
            return True
    return False
# Set to a true value to make debug() print its arguments.
DEBUG = False
def debug(*args):
    """Print args, separated by spaces, on one line when DEBUG is true.

    Does nothing when DEBUG is false.
    """
    if DEBUG:
        for obj in args:
            print obj,
        # terminate the line started by the trailing-comma prints above
        print
def ParseFile(file,
              nr_to_parse=-1,
              nr_toplevel_to_parse=-1,
              single_span=False,
              strip_tags=False,
              collapse_whitespace=False,
              recode_entities=None,
              ignore_errors=False):
    """Parse HTML tables and return a list of HTMLTable objects.

    file: file-like object; only its read(size) method is used
    nr_to_parse: stop after parsing this many tables; negative means parse all
     of them
    nr_toplevel_to_parse: passed to TableParser, which raises
     NotImplementedError for any non-negative value
    single_span: return cells that span multiple columns or rows only
     once, rather than once for every row / column they span (passed to
     TableParser, which raises NotImplementedError unless this is False)
    strip_tags: remove HTML tags from cell contents
    collapse_whitespace: collapse consecutive whitespace characters to a
     single space
    recode_entities: recode HTML entities according to this dict
    ignore_errors: if true, ParseError conditions detected by the parser are
     not raised

    Note that passing in the return value of urllib2.urlopen here as the file
    argument is fine.

    The input is read in CHUNK-sized pieces until read() returns an empty
    string, so streams that return fewer bytes than requested before EOF are
    still read completely.  fixup() is called on every returned table.
    """
    tp = TableParser(nr_to_parse,
                     nr_toplevel_to_parse,
                     single_span,
                     strip_tags,
                     collapse_whitespace,
                     recode_entities,
                     ignore_errors)
    while 1:
        data = file.read(CHUNK)
        if not data:
            # Only an empty read signals EOF; a short read does not.
            break
        try:
            tp.feed(data)
        except ParseFinished:
            # The parser has seen as many tables as were asked for.
            break
    for table in tp.tables:
        table.fixup()
    return tp.tables
class HTMLTable:
    """Represents (surprise!) an HTML table.

    HTMLTable instances are iterable, yielding TableRows.  This includes any
    header rows.

    Public attributes: headers_row, headers_col.

    headers_row is the first TableRow flagged as a header row; it is used to
    index columns in the table by name.
    headers_col is a TableColumn built from the first column consisting
    entirely of TH cells; it is used to index rows in the table by name.
    Both are None until fixup() has found a suitable row / column.
    """

    # Marker attribute: other code recognises tables with
    # hasattr(obj, "is_table").
    is_table = None

    def __init__(self, single_span=False):
        self._single_span = single_span
        self._data = []  # rows (and, before fixup, tables outside any row)
        self._sub_tables = []
        self._max_cols = 0  # number of columns rows are padded to
        self._fixed_up = False
        self.headers_row = None
        self.headers_col = None

    def push_row(self, row):
        """Append a row to the table."""
        self._data.append(row)

    def push_table(self, table):
        """Append a nested table that was found outside of any row."""
        self._data.append(table)

    def set_max_cols(self, max_cols):
        """Set the number of columns that fixup() pads every row to."""
        self._max_cols = max_cols

    def fixup(self):
        """Finish the table once all of its rows have been pushed.

        Wraps tables found outside of any row in a row of their own,
        duplicates cells into the rows covered by their rowspan, pads rows
        with None up to the column count given to set_max_cols(), and sets
        headers_row and headers_col.

        Calling fixup() again is a no-op.  A cell with a colspan greater than
        one occurs several times in its row, so a table nested in such a cell
        has fixup() called on it once per occurrence.
        """
        if self._fixed_up:
            return
        self._fixed_up = True

        for obj in self._data:
            obj.fixup()  # row or table

        # Duplicate cells according to cell rowspans, and pad rows to correct
        # number of columns.  Duplication of cells for colspans was already
        # done when the cells were pushed onto their row.
        nr_rows = len(self._data)
        # Maps each cell to the index of the row that owns it.  Occurrences in
        # later rows are copies inserted below and must not be spread again.
        owner_row = {}
        for i in range(nr_rows):
            row = self._data[i]
            if hasattr(row, "is_table"):
                # stick tables that are outside of any row in their own row
                table = row
                row = TableRow()
                row.push_table(table)
                self._data[i] = row
            else:
                for j in range(len(row)):
                    cell = row[j]
                    if cell is None or hasattr(cell, "is_table"):
                        continue
                    if cell in owner_row and owner_row[cell] != i:
                        continue
                    owner_row[cell] = i
                    # Every occurrence in the owning row is spread downwards,
                    # so a cell spanning several columns and several rows
                    # fills the whole rectangle.
                    for k in range(cell.rowspan - 1):
                        target = i + k + 1
                        if target >= nr_rows:
                            # rowspan runs past the last row of the table
                            break
                        row2 = self._data[target]
                        if hasattr(row2, "is_table"):
                            # a table outside any row has no cells to shift
                            continue
                        row2.insert(j, cell)
            # pad table to uniform number of columns
            to_pad = self._max_cols - len(row)
            while to_pad > 0:
                row.push_cell(None)
                to_pad -= 1

        # first row of headers are keys for indexing cols
        self.headers_row = None
        for row in self._data:
            if row.is_header:
                self.headers_row = row
                break
        for row in self._data:
            row.headers_row = self.headers_row

        # first col of headers are keys for indexing rows
        self.headers_col = None
        if self._data:
            for i in range(len(self._data[0])):
                if self._col_is_header(i):
                    self.headers_col = self._col_from_index(i)
                    break

        for table in self._sub_tables:
            table.fixup()

    def _col_is_header(self, colnr):
        """Return true iff every row has a TH cell at index colnr."""
        for row in self._data:
            try:
                el = row[colnr].element_type
            except (AttributeError, IndexError):
                # This entry is not a cell (padding that makes up for missing
                # TD/TH elements on this row, or a nested table), or the row
                # is too short.
                return False
            if el != "th":
                return False
        return True

    def _build_column(self, colnr, is_header):
        """Return a TableColumn holding the entry at colnr of every row."""
        if not self._data:
            # Without rows there are no columns; raising here also lets
            # col_iter() terminate on an empty table.
            raise IndexError("column index out of range")
        tc = TableColumn(is_header)
        for row in self._data:
            obj = row[colnr]
            if hasattr(obj, "is_table"):
                tc.push_table(obj)
            else:
                tc.push_cell(obj)
        tc.fixup()
        return tc

    def _col_from_index(self, colnr):
        """Return column colnr, flagged as header if all its cells are TH."""
        return self._build_column(colnr, self._col_is_header(colnr))

    def __getitem__(self, key):
        """Return a column for a string key, else the row at index key."""
        if isstringlike(key):
            return self.get_col_by_name(key)
        return self._data[key]

    def get_col_by_name(self, colname):
        """Get column by finding its name in the headers_row attribute.

        Raises KeyError if the table has no header row, and ValueError if
        colname is not found in it.
        """
        if self.headers_row is None:
            raise KeyError("no header row has been set")
        i = self.headers_row.index(colname)
        return self._col_from_index(i)

    def get_row_by_name(self, rowname):
        """Get row by finding its name in the headers_col attribute.

        Raises KeyError if the table has no header column, and ValueError if
        rowname is not found in it.
        """
        if self.headers_col is None:
            raise KeyError("no header column has been set")
        i = self.headers_col.index(rowname)
        return self._data[i]

    def get_col_by_nr(self, colnr):
        """Get column by integer index.

        Raises IndexError if colnr is out of range or the table has no rows.
        """
        return self._build_column(colnr, False)

    def get_row_by_nr(self, rownr):
        """Get row by integer index."""
        return self._data[rownr]

    def col_iter(self):
        """Return iterator over columns of table."""
        return iterator(self.get_col_by_nr)

    def __iter__(self):
        return iterator(self.get_row_by_nr)

    def __str__(self, expanded=False, abbr=False, indent=0):
        indent += 4
        rep = []
        for row in self._data:
            rep.append(" " * indent + row.__str__(expanded, abbr, indent))
        return "%s[\n%s]" % (self.__class__.__name__, "\n".join(rep))

    def __len__(self):
        return len(self._data)
class iterator:
    """Iterator that walks consecutive integer indices of a lookup function."""

    def __init__(self, index_fn):
        """
        index_fn: function behaving like __getitem__ for simple sequence
         object (ie. taking integer argument and returning a corresponding
         object, and raising IndexError if argument is out-of-bounds; valid
         indices must be consecutive)
        """
        self.__index_fn = index_fn
        self.__i = 0

    def __iter__(self):
        return self

    def next(self):
        """Return the item at the current index and advance by one.

        Raises StopIteration once index_fn raises IndexError.
        """
        position = self.__i
        try:
            item = self.__index_fn(position)
        except IndexError:
            raise StopIteration
        else:
            self.__i = position + 1
            return item
class TableSeq:
    """Abstract base class for Cell, TableRow and TableColumn.

    The methods here operate on the list held in self._data, which this
    class does not create itself.
    """

    def fixup(self):
        """Do nothing; Cell and TableRowOrColumn override this."""
        pass

    def push_table(self, table):
        """Append a nested HTMLTable to the contents."""
        assert isinstance(table, HTMLTable), table.__class__.__name__
        self._data.append(table)

    def __str__(self, expanded=False, abbr=False, indent=0):
        parts = []
        for item in self._data:
            if item is None:
                text = "None"
            elif not expanded:
                text = repr(item)
            else:
                text = item.__str__(expanded, abbr, indent)
            parts.append(text)
        joined = ", ".join(parts)
        return "%s[%s]" % (self.__class__.__name__, joined)

    def __contains__(self, item):
        return item in self._data

    def __len__(self):
        return len(self._data)

    def __getitem__(self, i):
        return self._data[i]

    def insert(self, i, item):
        """Insert the Cell item before index i."""
        assert isinstance(item, Cell), item.__class__.__name__
        self._data.insert(i, item)

    def count(self, item):
        """Return the number of entries equal to item."""
        return self._data.count(item)

    def index(self, item):
        """Return the index of the first entry equal to item."""
        return self._data.index(item)
class Cell(TableSeq):
    """A single cell of an HTML table.

    Note that a single cell may span many rows or columns (or both).  Cells may
    even contain tables!

    Public readable attributes:

     data: cell contents; the empty string until fixup() has been called

    Public attributes:

     rowspan: vertical span of cell (nr. of rows occupied by cell)
     colspan: horizontal span of cell (nr. of columns occupied by cell)
     element_type: "td" or "th"

    rowspan and colspan default to 1 and element_type to "td".

    Comparing a Cell for equality compares its data attribute, either with
    the data of another Cell or directly with a non-Cell object such as a
    string.  Hashing is by object identity.
    """

    def __init__(self, collapse_whitespace=False):
        """
        collapse_whitespace: if true, fixup() strips the cell text and
         collapses runs of whitespace in it to single spaces
        """
        self._data_chunks = []  # pieces of cell text, joined by fixup()
        self._data = []  # contains HTMLTables
        self._collapse_whitespace = collapse_whitespace
        # Defaults, so that a Cell is usable (repr, comparison, being pushed
        # onto a row) before the parser has filled these in.
        self.data = ""
        self.element_type = "td"
        self.rowspan = 1
        self.colspan = 1

    def fixup(self):
        """Fix up nested tables and build data from the pushed text."""
        for table in self._data:
            table.fixup()
        text = "".join(self._data_chunks)
        if self._collapse_whitespace:
            # The parameter of the same name is local to __init__, so this is
            # the module-level function.
            text = collapse_whitespace(text.strip())
        self.data = text

    def __cmp__(self, other):
        """Return 0 if the cell data equals other (or other's data), else 1."""
        if isinstance(other, Cell):
            if self.data == other.data: return 0
        elif self.data == other: return 0
        return 1

    def push_data(self, data):
        """
        data: string-like object: part of cell contents

        Raises TypeError if data is not string-like.
        """
        if not isstringlike(data):
            raise TypeError("a string-like object is required for data")
        self._data_chunks.append(data)

    def _span_prefix(self):
        """Return "<rowspan>x<colspan> " for a spanning cell, else ""."""
        if self.rowspan != 1 or self.colspan != 1:
            return "%dx%d " % (self.rowspan, self.colspan)
        return ""

    def __str__(self, expanded=False, abbr=False, indent=0):
        rep = []
        for obj in self._data:
            if expanded:
                rep.append(obj.__str__(expanded, abbr, indent))
            else:
                rep.append(repr(obj))
        data = self.data
        if abbr and len(data) > abbr:
            # abbr doubles as the maximum number of characters shown
            data = data[:abbr] + "..."
        return "%s%s(%s)[%s]" % (self._span_prefix(), self.__class__.__name__,
                                 data, ", ".join(rep))

    def __repr__(self):
        return "<%s%s(%s) instance at %x>" % (
            self._span_prefix(), self.__class__.__name__, self.data, id(self))

    def __hash__(self):
        return id(self)
class TableRowOrColumn(TableSeq):
    """Behaviour shared by TableRow and TableColumn.

    Public attributes:

     is_header: True or False, from the truth value passed to the constructor
     headers: initialised to None
    """

    def __init__(self, is_header=False):
        if is_header: self.is_header = True
        else: self.is_header = False
        self.headers = None
        self._data = []  # cells, nested tables, and None padding

    def fixup(self):
        """Call fixup() on every cell and table held, skipping padding."""
        for obj in self._data:
            # None entries are padding for missing cells and have nothing to
            # fix up.
            if obj is not None:
                obj.fixup()  # table or cell
# XXX TableRow and TableColumn are identical ATM, other than for names...
# ...and push_cell, but that may be because TableColumn.push_cell is
# incorrect
class TableRow(TableRowOrColumn):
    """Row of an HTML table.

    Indexing with a string gets a cell using the headers_row attribute.
    Indexing with an integer gets a cell.
    """

    def __init__(self, is_header=False):
        TableRowOrColumn.__init__(self, is_header)
        self.headers_row = None

    def push_cell(self, cell):
        """Append cell once for every column it spans.

        cell: a Cell, or None to append a single padding entry

        A Cell whose colspan is less than 1 is still appended once, so that
        it is not silently dropped from the row.
        """
        if cell is None:
            self._data.append(None)
            return
        assert isinstance(cell, Cell), cell.__class__.__name__
        copies = cell.colspan
        if copies < 1:
            copies = 1
        for i in range(copies):
            self._data.append(cell)

    def __getitem__(self, key):
        """Return the cell under header key (string) or at index key."""
        if isstringlike(key):
            return self.get_cell_by_name(key)
        return self._data[key]

    def get_cell_by_nr(self, colnr):
        """Get cell by integer index."""
        return self._data[colnr]

    def get_cell_by_name(self, colname):
        """Get cell by finding its column name in the headers_row attribute.

        Raises KeyError if headers_row is None, and ValueError if colname is
        not found in it.
        """
        if self.headers_row is None:
            raise KeyError("no header row has been set")
        i = self.headers_row.index(colname)
        return self._data[i]
class TableColumn(TableRowOrColumn):
    """Column of an HTML table.

    Indexing with a string gets a cell using the headers_col attribute.
    Indexing with an integer gets a cell.
    """

    def __init__(self, is_header=False):
        TableRowOrColumn.__init__(self, is_header)
        self.headers_col = None

    def push_cell(self, cell):
        """Append cell (a Cell, or None for padding) exactly once."""
        # XXX shouldn't this do the same as TableRow??
        self._data.append(cell)

    def __getitem__(self, key):
        """Return the cell under header key (string) or at index key."""
        if isstringlike(key):
            return self.get_cell_by_name(key)
        return self._data[key]

    def get_cell_by_nr(self, rownr):
        """Get cell by integer index."""
        return self._data[rownr]

    def get_cell_by_name(self, colname):
        """Get cell by finding its row name in the headers_col attribute.

        colname: the name to look up in headers_col (the parameter keeps its
         historical name although it identifies a row of this column)

        Raises KeyError if headers_col is None, and ValueError if the name is
        not found in it.
        """
        if self.headers_col is None:
            raise KeyError("no header column has been set")
        i = self.headers_col.index(colname)
        return self._data[i]
# Notes about cell spans.
# "rowspan" is nr rows spanned.
# "colspan" is nr cols spanned.
# COL has rowspan, colspan attributes.
# COLs default to unity span.
# TD and TH have rowspan, colspan attributes.
# COLGROUP has span attribute (which is a colspan).
# COLGROUPs default to unity span.
# COLGROUPs with no COLs have unity span.
# COLGROUPs with contained COLs have span equal to sum of colspans of
# contained COLs (the COLGROUP's span itself counts for nothing in this
# case).
# If no COLGROUPs or COLs, nr of cols is determined by max row length
# (taking TD and TH colspans into account). Pad short rows up to this nr
# of cols. Even if there *are* COLGROUPs or COLs, this col. width should
# agree (in theory) with that calculated from COLGROUP and COL spans, but
# should use COLGROUP/COL algorithm in preference to counting row
# lengths.
class ParseState:
    """Record of the parser's position within one table.

    TableParser creates a fresh instance for every table it enters and keeps
    the states of enclosing tables on a stack.
    """

    def __init__(self):
        # HTMLTable being built, and the row and cell currently being filled
        self.table = None
        self.tablerow = None
        self.tablecell = None

        # following are true iff in named element
        self.in_table = False
        self.in_tr = False
        self.in_th = False
        self.in_td = False
        self.in_colgroup = False

        # current position in table
        self.col = 0
        self.row = 0

        # true after parsing row iff this is a header row
        self.in_header_row = False
        # true after parsing table iff COL or COLGROUP was seen in this table
        self.have_col = False

        # column-counting for case where self.have_col is false after parsing
        self.tx_colspan = 1
        self.tx_rowspan = 1
        self.tx_cols = 0  # total span from TDs and THs

        # column-counting for case where self.have_col is true after parsing
        self.colgroup_span = 0  # span due to COLGROUP containing no COLs
        self.colgroup_cols = 0  # total span of COLGROUP due to contained COLs
        self.col_cols = 0  # total span from COLs and COLGROUPs

    def __str__(self):
        """Return one "name:value" line per attribute, sorted by name."""
        items = list(self.__dict__.items())
        items.sort()
        return "\n".join(["%s:%s" % (k, v) for k, v in items])
class ParseError(Exception):
    """Reported by TableParser for table markup it cannot make sense of."""


class ParseFinished(Exception):
    """Raised by TableParser once the requested number of tables is parsed."""
# XXX
# Look at HTML::TreeBuilder to check implicit td / th (etc.) rules.
# THEAD -- contains TRs, but is header.
# Move implicit element ending stuff to handle_starttag / handle_endtag?
class TableParser(HTMLParser.HTMLParser):
    """HTML Table parser.

    Feed HTML text to an instance with feed().  Every top-level table whose
    end tag has been seen is appended, as an HTMLTable, to the tables
    attribute.  Tables nested inside another table are attached to the
    enclosing cell, row or table instead.
    """

    # Tags that this parser interprets itself.  Any other tag met inside a
    # cell is kept as part of the cell text unless strip_tags is set.
    table_tags = "table", "tr", "td", "th", "col", "colgroup"  # XXXX

    def __init__(self, nr_to_parse=-1,
                 nr_toplevel_to_parse=-1,
                 single_span=False,
                 strip_tags=False,
                 collapse_whitespace=False,
                 recode_entities=None,
                 ignore_errors=False):
        """
        nr_to_parse: only parse this number of tables, then stop; if negative,
         parse all tables in the document
        nr_toplevel_to_parse: not yet implemented; any non-negative value
         raises NotImplementedError
        single_span: return cells that span multiple columns or rows only
         once, rather than once for every row / column they span (not yet
         implemented; anything other than False raises NotImplementedError)
        strip_tags: replace tags by a single space instead of keeping the
         text of non-table tags in the cell contents
        collapse_whitespace: convert all consecutive whitespace characters
         to a single space
        recode_entities: recode HTML entities according to this dict; if it
         is empty or None, entity references are dropped
        ignore_errors: if true, errors passed to error() are not raised
        """
        HTMLParser.HTMLParser.__init__(self)

        if nr_toplevel_to_parse >= 0:
            raise NotImplementedError(
                "nr_toplevel_to_parse not yet implemented")
        if single_span != False:
            raise NotImplementedError("single_span not yet implemented")

        self._nr_to_parse = nr_to_parse
        self._nr_toplevel_to_parse = nr_toplevel_to_parse
        if single_span: self._single_span = True
        else: self._single_span = False
        if strip_tags: self._strip_tags = True
        else: self._strip_tags = False
        if collapse_whitespace: self._collapse_whitespace = True
        else: self._collapse_whitespace = False
        self._recode_entities = recode_entities
        self._ignore_errors = ignore_errors

        # the end result
        self.tables = []

        self._stack = []  # parse state stack
        self._depth = 0  # table nesting depth
        # State for text outside of any table.  Its cell only absorbs the
        # spaces pushed in place of tags when strip_tags is set.
        self._ps = ParseState()
        self._ps.tablecell = Cell(
            collapse_whitespace=self._collapse_whitespace)

    def error(self, error):
        """Raise error unless errors are being ignored.

        error: a ParseError instance from this class, or a plain message
         string when called by the HTMLParser base class; messages are handed
         to the base class implementation, which raises its own exception
         type
        """
        if self._ignore_errors:
            return
        if isinstance(error, Exception):
            raise error
        HTMLParser.HTMLParser.error(self, error)

    def _span_attr(self, attrs, name):
        """Return the integer value of attribute name in attrs, default 1.

        If the attribute occurs more than once, the last occurrence wins.  A
        value that is not a decimal integer raises the exception from int()
        unless errors are being ignored, in which case 1 is used.
        """
        span = 1
        for k, v in attrs:
            if k == name:
                try:
                    span = int(v, 10)
                except (ValueError, TypeError):
                    if not self._ignore_errors:
                        raise
                    span = 1
        return span

    def handle_entityref(self, name):
        """Push the replacement text for entity name onto the current cell.

        Entities missing from the recode_entities dict are passed through
        unchanged, as "&name;".
        """
        debug("handle_entityref", name)
        if self._recode_entities:
            replacement = self._recode_entities.get(name)
            if replacement is None:
                replacement = "&%s;" % name
            self.handle_data(replacement)

    def handle_starttag(self, tag, attrs):
        """Record the tag text if appropriate, then dispatch to start_<tag>.

        Falls back to do_<tag>; tags with neither handler are not dispatched.
        """
        debug("<%s>" % tag)

        # Any start tag other than COL implicitly ends an open COLGROUP.
        if self._ps.in_colgroup and tag != "col":
            self.end_colgroup()

        if not self._strip_tags:
            if ((self._ps.in_td or self._ps.in_th) and
                tag not in self.table_tags):
                self._ps.tablecell.push_data(self.get_starttag_text())
        else:
            self._ps.tablecell.push_data(" ")

        method = getattr(self, "start_" + tag, None)
        if method is None:
            method = getattr(self, "do_" + tag, None)
        if method is not None:
            method(attrs)

    def handle_endtag(self, tag):
        """Record the end tag text if appropriate, then dispatch to end_<tag>."""
        debug("</%s>" % tag)

        # Any end tag implicitly ends an open COLGROUP, apart from COL and
        # COLGROUP itself, whose own handler is called below.
        if self._ps.in_colgroup and tag not in ("col", "colgroup"):
            self.end_colgroup()

        if not self._strip_tags:
            if ((self._ps.in_td or self._ps.in_th) and
                tag not in self.table_tags):
                self._ps.tablecell.push_data("</%s>" % tag)
        else:
            self._ps.tablecell.push_data(" ")

        method = getattr(self, "end_" + tag, None)
        if method is not None:
            method()

    def start_table(self, attrs):
        """Enter a table, saving the state of any enclosing table.

        Raises ParseFinished if the requested number of tables has already
        been parsed.
        """
        debug("start_table")
        # A td or th may contain a nested table, so an open cell is not
        # implicitly ended here.
        if self._nr_to_parse == 0:
            raise ParseFinished
        self._depth += 1
        self._stack.append(self._ps)
        self._ps = ParseState()
        self._ps.table = HTMLTable()
        self._ps.tablecell = Cell(
            collapse_whitespace=self._collapse_whitespace)
        self._ps.in_table = True
        self._ps.row = self._ps.col = 0

    def end_table(self):
        """Finish the current table and restore the enclosing parse state.

        A top-level table is appended to self.tables.  A nested table is
        attached to the enclosing cell, row or table, whichever is open.
        """
        debug("end_table")
        if not self._ps.in_table:
            self.error(ParseError("end of TABLE before start"))
            return
        if self._ps.in_tr:
            self.end_tr()  # also ends any open cell

        finished = self._ps
        table = finished.table
        # number of columns to pad to
        if finished.have_col: max_cols = finished.col_cols
        else: max_cols = finished.tx_cols
        table.set_max_cols(max_cols)  # XXXX yuck

        self._depth -= 1
        # Always return to the enclosing state, so that after a top-level
        # table the parser is outside of any table again.
        self._ps = self._stack.pop()
        if self._depth == 0:
            self.tables.append(table)
        elif self._ps.in_td or self._ps.in_th:
            self._ps.tablecell.push_table(table)
        elif self._ps.in_tr:
            self._ps.tablerow.push_table(table)
        else:
            self._ps.table.push_table(table)
        self._nr_to_parse -= 1

    def start_col(self, attrs):
        """Count the span of a COL element towards the table's columns."""
        debug("start_col")
        if not self._ps.in_table:
            self.error(ParseError("COL outside of TABLE"))
            return
        span = self._span_attr(attrs, "span")
        self._ps.have_col = True
        if self._ps.in_colgroup:
            # added to the table total when the COLGROUP ends
            self._ps.colgroup_cols += span
        else:
            # a COL outside any COLGROUP counts directly
            self._ps.col_cols += span

    def start_colgroup(self, attrs):
        """Enter a COLGROUP and remember its own span attribute."""
        debug("start_colgroup")
        if not self._ps.in_table:
            self.error(ParseError("COLGROUP outside of TABLE"))
            return
        self._ps.in_colgroup = True
        self._ps.have_col = True
        self._ps.colgroup_cols = 0
        self._ps.colgroup_span = self._span_attr(attrs, "span")

    def end_colgroup(self):
        """Leave a COLGROUP, adding its span to the table's column count.

        The spans of contained COLs are used if there were any, otherwise the
        COLGROUP's own span.
        """
        if not self._ps.in_colgroup:
            self.error(ParseError("end of COLGROUP before start"))
            return
        self._ps.in_colgroup = False
        if self._ps.colgroup_cols == 0:
            self._ps.col_cols += self._ps.colgroup_span
        else:
            self._ps.col_cols += self._ps.colgroup_cols

    def start_tr(self, attrs):
        """Start a new row, implicitly ending any row still open."""
        debug("start_tr")
        if not self._ps.in_table:
            self.error(ParseError("start of TR element outside of TABLE"))
            return
        if self._ps.in_tr:
            self.end_tr()  # also ends any open cell
        self._ps.tablerow = TableRow()
        self._ps.in_tr = True
        self._ps.col = 0

    def end_tr(self):
        """End the current row and push it onto the current table."""
        debug("end_tr")
        if not self._ps.in_table:
            self.error(ParseError("end of TR element outside of TABLE"))
            return
        if not self._ps.in_tr:
            # Without this check the previous row would be pushed again.
            self.error(ParseError("end of TR before start"))
            return
        if self._ps.in_td: self.end_td()
        if self._ps.in_th: self.end_th()
        self._ps.tx_cols = max(self._ps.tx_cols, len(self._ps.tablerow))
        self._ps.tablerow.is_header = self._ps.in_header_row
        self._ps.table.push_row(self._ps.tablerow)
        self._ps.in_header_row = False
        self._ps.in_tr = False
        self._ps.row += 1

    def _process_cell_data(self):
        """Finish the current cell and push it onto the current row.

        The cell gets its element type and spans from the parse state, and a
        fresh Cell is set up for the text that follows.  Must be called while
        in_th / in_td still reflect the cell being finished.
        """
        debug("_process_cell_data")
        if self._ps.in_th: el = "th"
        else: el = "td"
        cell = self._ps.tablecell
        cell.element_type = el
        cell.rowspan = self._ps.tx_rowspan
        cell.colspan = self._ps.tx_colspan
        self._ps.tablerow.push_cell(cell)
        self._ps.tablecell = Cell(
            collapse_whitespace=self._collapse_whitespace)

    def _open_cell(self, attrs, name):
        """Do the work shared by start_th and start_td.

        name: "TH" or "TD", used in the error message

        Ends any cell still open, opens an implicit row if the cell is not
        inside one, and records the cell's spans.  Returns false if the cell
        is outside of any table and must be ignored.
        """
        if not self._ps.in_table:
            self.error(ParseError("%s outside of TABLE" % name))
            return False
        if self._ps.in_th: self.end_th()
        if self._ps.in_td: self.end_td()
        if not self._ps.in_tr:
            # Cell without an enclosing TR: give it a row of its own instead
            # of adding it to a row that has already been finished.
            self.start_tr([])
        self._ps.tx_colspan = self._span_attr(attrs, "colspan")
        self._ps.tx_rowspan = self._span_attr(attrs, "rowspan")
        return True

    def start_th(self, attrs):
        """Start a header cell; this marks the current row as a header row."""
        debug("start_th")
        if self._open_cell(attrs, "TH"):
            self._ps.in_header_row = True
            self._ps.in_th = True

    def end_th(self):
        """End the current header cell."""
        debug("end_th")
        if not self._ps.in_th:
            self.error(ParseError("end of TH before start"))
            return
        self._process_cell_data()
        self._ps.in_th = False
        self._ps.col += 1

    def start_td(self, attrs):
        """Start a data cell."""
        debug("start_td")
        if self._open_cell(attrs, "TD"):
            self._ps.in_td = True

    def end_td(self):
        """End the current data cell."""
        debug("end_td")
        if not self._ps.in_td:
            self.error(ParseError("end of TD before start"))
            return
        self._process_cell_data()
        self._ps.in_td = False
        self._ps.col += 1

    def handle_data(self, data):
        """Add text to the current cell; text outside of cells is dropped."""
        if not self._ps.in_table: return
        debug("handle_data >>%s<<" % data)
        if self._ps.in_td or self._ps.in_th:
            self._ps.tablecell.push_data(data)