# Portal transform for images with captions
#
# We want to be able to support captions in images.
# The easiest way to do this is to define a Portal Transform which is
# applied to the HTML body text on output.
#
# The transform finds all the embedded images, and replaces them with
# an appropriate chunk of HTML to include the caption.
#
from Products.PortalTransforms.interfaces import itransform
from DocumentTemplate.DT_Util import html_quote
from DocumentTemplate.DT_Var import newline_to_br
from Products.CMFCore.utils import getToolByName
import re
from cgi import escape
from urlparse import urlsplit, urljoin, urlunsplit
from urllib import unquote_plus, quote_plus
from Acquisition import aq_base
from htmlentitydefs import name2codepoint
# Work on a private copy so the extra entry added below does not alter
# the table object imported from htmlentitydefs.
name2codepoint = name2codepoint.copy()
# Add an 'apos' entry mapped to the code point of the apostrophe character.
name2codepoint['apos']=ord("'")
__revision__ = '$Id$'
# IMAGE_PATTERN matches an image tag on its own, or an image tag
# enclosed in a simple <p> or <div>. In the latter case we strip out
# the enclosing tag since we are going to insert our own.
# An <img> tag whose class attribute value mentions "captioned".
PATIMG = r'<img[^>]+class\s*=[^=>]*captioned[^>]+>'
# The captioned image wrapped in a simple <a>...</a>, or just the image.
PATA = '(?:<a[^>]*>' + PATIMG + '</a>)' + '|' + PATIMG
# Capturing group around the (optionally linked) image.
PAT0 = '(' + PATA + ')'
# Two alternatives, each with its own capturing group:
#   group 1 - the image when it is enclosed in a simple <p> or <div>
#             (the whole match then also covers the enclosing tag);
#   group 2 - the image found anywhere else.
PAT1 = '<(?:p|div)[^>]*>' + PAT0 + '</(?:p|div)>' + '|' + PAT0
IMAGE_PATTERN = re.compile(PAT1, re.IGNORECASE)
# Regex to match stupid IE attributes. In IE generated HTML an
# attribute may not be enclosed by quotes if it doesn't contain
# certain punctuation.
# ATTR_VALUE is a template: "%s" is replaced by the name of the group that
# captures the attribute value (quoted, or unquoted up to space, "/" or ">").
ATTR_VALUE = r'=(?:"?)(?P<%s>(?<=")[^"]*|[^ \/>]*)'
ATTR_CLASS = ATTR_VALUE % 'class'
ATTR_WIDTH = ATTR_VALUE % 'width'
ATTR_ALT = ATTR_VALUE % 'alt'
# Matches a whole tag (group "tag") and, while scanning it, captures the
# class, width and alt values plus the uid that follows "resolveuid/" in
# the src attribute (group "src").
ATTR_PATTERN = re.compile(r'''
    (?P<tag>\<
     ( class%s
     | src\s*=\s*"resolveuid/(?P<src>([^/"#? ]*))
     | width%s
     | alt%s
     | .
     )*\>
    )''' % (ATTR_CLASS, ATTR_WIDTH, ATTR_ALT), re.VERBOSE | re.IGNORECASE | re.DOTALL)
# Path segment following the uid in a src value, e.g. "/image_preview".
SRC_TAIL = re.compile(r'/([^" \/>]+)')
CLASS_PATTERN = re.compile(r'\s*class\s*=\s*("[^"]*captioned[^"]*"|[^" \/>]+)')
ALT_PATTERN = re.compile(r'\balt\s*=\s*("[^"]*"|[^" \/>]+)')
# Splits a tag into everything before its closing "/>" or ">" and the
# closing part itself.
# NOTE(review): the tag name "img" is reconstructed - confirm.
END_TAG_PATTERN = re.compile(r'(<img[^>]*?)( */?>)')
# NOTE(review): this template appears to have lost its surrounding markup;
# it is left untouched because the original text cannot be determined here.
IMAGE_TEMPLATE = '''\
%(tag)s
%(caption)s
'''
# Finds src/href/data/value attributes that contain a "resolveuid/" link.
# Group "tag" is the tag text up to and including the opening quote,
# group "url" is the link up to the end of the uid, group "uid" is the uid.
UID_PATTERN = re.compile(
    r'(?P<tag><(?:a|img|object|param)\s[^>]*(?:src|href|data|value)\s*=\s*")'
    r'(?P<url>[^"]*resolveuid/(?P<uid>[^/"#? ]*))',
    re.DOTALL | re.IGNORECASE)
class HTMLToCaptioned:
    """Transform which adds captions to images embedded in HTML.

    Captioned images whose ``src`` is a ``resolveuid/<uid>`` link are
    replaced by the output of the context's ``kupu_captioned_image``
    template; afterwards any remaining ``resolveuid/<uid>`` links are
    rewritten to the URL of the object they resolve to.
    """
    __implements__ = itransform
    __name__ = "html_to_captioned"
    inputs = ('text/html',)
    output = "text/x-html-captioned"

    def __init__(self, name=None):
        """Set up the configuration metadata; *name* overrides ``__name__``."""
        self.config_metadata = {
            'inputs' : ('list', 'Inputs', 'Input(s) MIME type. Change with care.'),
            }
        if name:
            self.__name__ = name

    def name(self):
        """Return the name this transform is registered under."""
        return self.__name__

    def __getattr__(self, attr):
        """Fall back to ``self.config`` for ``inputs`` and ``output``.

        Python only calls this when normal attribute lookup fails.
        NOTE(review): ``config`` is not assigned anywhere in this class -
        presumably it is supplied by the transform machinery; confirm.
        """
        if attr == 'inputs':
            return self.config['inputs']
        if attr == 'output':
            return self.config['output']
        raise AttributeError(attr)

    def resolveuid(self, context, reference_catalog, uid):
        """Convert a uid to an object by looking it up in the reference catalog.

        If not found then tries to fall back to a possible hook
        (``kupu_resolveuid_hook`` on *context*), e.g. so you could resolve
        uids on another system. Returns None when nothing resolves the uid.
        """
        target = reference_catalog.lookupObject(uid)
        if target is not None:
            return target
        hook = getattr(context, 'kupu_resolveuid_hook', None)
        if hook:
            target = hook(uid)
        return target

    def convert(self, data, idata, filename=None, **kwargs):
        """Convert the data, store the result in idata and return that.

        The optional argument *filename* may give the original file name of
        the received data. Additional arguments given to the engine's
        convert, convertTo or __call__ are passed back to the transform.

        The object on which the translation was invoked is available as
        ``context`` (default: None). Without a usable context (or without
        its ``archetype_tool``) the data is passed through unchanged.
        """
        context = kwargs.get('context', None)
        at_tool = None
        if context:
            # Only dereference the context once we know we have one; doing
            # the template lookup first made a missing context blow up with
            # an AttributeError instead of falling through to the no-op path.
            template = context.kupu_captioned_image
            at_tool = context.archetype_tool

        if not (context and at_tool):
            # No context to use for replacements, so don't bother trying.
            idata.setData(data)
            return idata

        rc = at_tool.reference_catalog

        def replaceImage(match):
            # Group 1 is the image inside a simple <p>/<div>, group 2 the
            # image found elsewhere.
            tag = match.group(1) or match.group(2)
            attrs = ATTR_PATTERN.match(tag)
            src = attrs.group('src')
            if not src:
                return match.group(0)   # No change

            target = self.resolveuid(context, rc, src)
            if not target:
                return match.group(0)   # No change

            # Anything after the uid (e.g. an image scale name).
            tail_match = SRC_TAIL.match(tag, attrs.end('src'))
            if tail_match:
                srctail = tail_match.group(1)
            else:
                srctail = None

            d = attrs.groupdict()
            d['class'] = attrs.group('class')
            d['originalwidth'] = attrs.group('width')
            d['originalalt'] = attrs.group('alt')
            d['caption'] = newline_to_br(html_quote(target.Description()))
            d['image'] = d['fullimage'] = target
            if srctail:
                try:
                    subtarget = target.restrictedTraverse(srctail)
                except Exception:
                    # Best effort: fall back to a plain attribute lookup.
                    subtarget = getattr(target, srctail, None)
                if subtarget:
                    d['image'] = subtarget
            return template(**d)

        html = IMAGE_PATTERN.sub(replaceImage, data)

        # Replace urls that use UIDs with human friendly urls.
        def replaceUids(match):
            tag = match.group('tag')
            uid = match.group('uid')
            target = self.resolveuid(context, rc, uid)
            if target:
                try:
                    url = target.getRemoteUrl()
                except AttributeError:
                    url = target.absolute_url_path()
                return tag + url
            return match.group(0)

        html = UID_PATTERN.sub(replaceUids, html)
        idata.setData(html)
        return idata
def register():
    """Create and return a new HTMLToCaptioned transform instance."""
    transform = HTMLToCaptioned()
    return transform
def initialize():
    """Register the captioning transform with the portal_transforms tool."""
    # NOTE(review): ``portal`` is not defined in the visible part of this
    # module; unless it is provided elsewhere this call raises NameError.
    transforms_tool = getToolByName(portal, 'portal_transforms')
    new_transform = register()
    transforms_tool.registerTransform(new_transform)
# Attribute-value sub-pattern whose captured value lands in a group
# named "href" (built from the ATTR_VALUE template).
ATTR_HREF = ATTR_VALUE % 'href'
# Matches the src of an <img> or the href of an <a>.
# Group "prefix" is the tag text up to and including any opening quote;
# group "href" is the link value itself (quoted or unquoted).
LINK_PATTERN = re.compile(
    r'(?P<prefix>\<(?:img\s[^>]*src|a\s[^>]*href)\s*=\s*(?:"?))(?P<href>(?<=")[^"]*|[^ \/>]*)',
    re.IGNORECASE)
FRAGMENT_TYPE = 'CompositePack Fragments'
NAVIGATION_PAGE = 'Navigation Page'
# Matches a complete <a ...>...</a> element (group 1) or a single
# <img ...> tag (group 2); used to show a link in context.
SUMMARY_PATTERN = re.compile(r'(\<a[^>]*>.*?</a>)|(\<img[^>]*\>)', re.IGNORECASE|re.DOTALL)
class Migration:
# Names of the instance attributes that saveState() copies into the
# session, restoreState() reads back, and status() reports.
FIELDS = ('portal_type', 'typename', 'fieldname',
'fieldlabel', 'position', 'action', 'commit_changes',
'image_tails', 'paths', 'pathuids', 'uids', 'found',
'batch_size',
)
def __init__(self, tool):
    """Look up the tools needed for migration and reset progress flags.

    *tool* is the object used both for tool lookups and, later, for
    access to the request.
    """
    self.tool = tool

    # Portal location, derived from the portal_url tool.
    url_tool = getToolByName(tool, 'portal_url')
    portal = url_tool.getPortalObject()
    base_path = url_tool.getPortalPath()
    self.url_tool = url_tool
    self.portal = portal
    self.portal_base = base_path
    self.portal_base_url = portal.absolute_url()
    # Length of the portal path plus one extra character.
    self.prefix_length = len(base_path) + 1

    # Catalogs used while scanning and resolving links.
    self.uid_catalog = getToolByName(tool, 'uid_catalog')
    self.reference_tool = getToolByName(tool, 'reference_catalog')
    self.portal_catalog = getToolByName(tool, 'portal_catalog')

    # Progress / output state.
    self._continue = True
    self._firstoutput = False
    self.commit_changes = False
    self._objects = []
    self.image_tails = []
def initImageSizes(self):
    """Store the result of the tool's ``_getImageSizes()`` as ``image_tails``."""
    sizes = self.tool._getImageSizes()
    self.image_tails = sizes
def initFromRequest(self):
    """Initialise the migration settings from the request form.

    The first entry of the ``fields`` form value that is marked
    ``selected`` supplies the type and field to work on; without one only
    ``portal_type`` is read from the form. ``uids`` is taken from the form
    only when ``commit`` is set.
    """
    self.initImageSizes()
    self.uids = None
    self.found = 0
    form_get = self.tool.REQUEST.form.get

    chosen = [rec for rec in form_get('fields', ()) if rec.get('selected', 0)]
    if not chosen:
        self.portal_type = form_get('portal_type', None)
        self.fieldname = None
        self.fieldlabel = None
        self.typename = None
    else:
        first = chosen[0]
        self.portal_type = first.portal_type
        self.typename = first.type
        self.fieldname = first.name
        self.fieldlabel = first.label

    self.position = 0
    self.action = form_get('button', None)
    committing = form_get('commit', False)
    self.commit_changes = committing
    self.batch_size = 10
    if committing:
        self.uids = form_get('uids', [])

    # Folder restrictions arrive as uids and are converted to paths.
    folder_uids = form_get('folderpaths', [])
    self.paths = self.tool.convertUidsToPaths(folder_uids)
    self.pathuids = folder_uids
def initCommit(self):
    """Reset counters for a commit pass.

    Restores the saved state first, then switches to commit mode with a
    batch size of 5 and takes the uids to process from the request form.
    """
    self.restoreState()
    form = self.tool.REQUEST.form

    # Switch to commit mode and start again from the beginning.
    self.commit_changes = True
    self._firstoutput = True
    self.found = 0
    self.position = 0
    self.batch_size = 5
    self.uids = form.get('uids')
def saveState(self):
    """Store every attribute named in FIELDS in the session.

    Attributes that are not set are stored as None. The snapshot is kept
    under the ``kupu_migrator`` session key.
    """
    snapshot = {}
    for name in self.FIELDS:
        snapshot[name] = getattr(self, name, None)
    session = self.tool.REQUEST.SESSION
    session['kupu_migrator'] = snapshot
def restoreState(self):
    """Copy every FIELDS entry from the saved session state onto self.

    Raises KeyError when the ``kupu_migrator`` session key or one of the
    fields is missing from it.
    """
    saved = self.tool.REQUEST.SESSION['kupu_migrator']
    for name in self.FIELDS:
        value = saved[name]
        setattr(self, name, value)
# def clearState(self):
# return
# SESSION = self.tool.REQUEST.SESSION
# if SESSION.has_key('kupu_migrator'):
# del SESSION['kupu_migrator']
def status(self):
    """Return one ``name=value`` line per FIELDS entry.

    Attributes that are not set are reported with the value ``unset``.
    """
    lines = []
    for name in self.FIELDS:
        value = getattr(self, name, 'unset')
        lines.append('%s=%s' % (name, value))
    return '\n'.join(lines)
def mkQuery(self):
    """Build the catalog query for the documents to scan.

    ``portal_type`` and ``path`` restrictions are only included when the
    corresponding attribute is set; ``Language`` is always ``'all'``.
    """
    # presumably 'all' stops the catalog filtering by language - confirm
    query = {'Language': 'all'}
    wanted_type = self.portal_type
    if wanted_type:
        query['portal_type'] = sanitize_portal_type(wanted_type)
    wanted_paths = self.paths
    if wanted_paths:
        query['path'] = wanted_paths
    return query
def getInfo(self, saveState=True):
    """Return a dict describing the current migration progress.

    The dict carries the uri to request next (None when finished), an
    optional ``delay``, progress figures when a total is known, the
    objects found in the last batch and flags describing the chosen
    action. When *saveState* is true the state is saved to the session
    before returning.
    """
    action = self.action
    committing = self.commit_changes

    info = {'nexturi': None}
    if self._continue:
        info['nexturi'] = self.tool.absolute_url_path()+'/kupu_migration.xml?button=continue'
        unfinished = self.position < getattr(self, '_total', -1)
        if committing and self._objects and unfinished:
            info['delay'] = 5 # Avoid killing everyone else with conflict errors.

    info['firstoutput'] = self._firstoutput

    if hasattr(self, '_total'):
        total = self._total
        info['total'] = total
        info['position'] = self.position
        if total == 0:
            percent = '100%'
        else:
            percent = '%d%%' % ((100.*self.position)/total)
        info['percent'] = percent

    info['objects'] = self._objects
    info['action'] = action
    info['action_check'] = action=='check'
    info['action_touid'] = action=='touid'
    info['action_topath'] = action=='topath'
    info['typename'] = self.typename
    info['fieldlabel'] = self.fieldlabel
    info['checkboxes'] = action != 'check' and not committing
    info['commit_changes'] = committing
    info['dryrun'] = not (action == 'check' or committing)
    info['found'] = self.found

    if saveState:
        self.saveState()
    return info
def docontinue(self):
    """Scan selected documents looking for convertible links.

    The first call (``self.uids`` is None) only builds the list of uids
    from a catalog search. Every later call checks the next
    ``batch_size`` uids and stores the results in ``self._objects``.

    Returns True while there may be more work to do and False once the
    list of uids is exhausted.
    """
    uids = self.uids
    if uids is None:
        # Build the work list; the actual checking starts on the next call.
        self.uids = uids = []
        brains = self.portal_catalog.searchResults(self.mkQuery())
        for b in brains:
            uid = self.UIDfromBrain(b)
            if uid:
                uids.append(uid)
        self._firstoutput = True
        self._continue = True
        return True

    pos = self.position
    self._total = len(uids)
    batch = uids[pos:pos+self.batch_size]
    self.position = pos + len(batch)
    if not batch:
        self._continue = False
        return False # Done

    self._objects = res = []
    for uid in batch:
        obj = self.reference_tool.lookupObject(uid)
        if obj is None:
            # The uid no longer resolves to an object: nothing to check.
            continue
        if self.portal_type==FRAGMENT_TYPE and obj.portal_type!=FRAGMENT_TYPE:
            # Looking for fragments: check those held in the object's
            # cp_container.titles, if it has one.
            try:
                fldr = obj.cp_container.titles
            except Exception:
                continue
            for o in fldr.objectValues([FRAGMENT_TYPE]):
                objinfo = self.object_check(o)
                if objinfo:
                    res.append(objinfo)
        else:
            objinfo = self.object_check(obj)
            if objinfo:
                res.append(objinfo)
    self._continue = True
    return True
def brain_check(self, brain):
    """Fetch the object behind *brain* and return ``object_check`` of it."""
    return self.object_check(brain.getObject())
def link_summary(self, data, start, link):
    """Summary information for a link.

    Takes the text SUMMARY_PATTERN matches at *start* in *data* (or the
    next 200 characters when it does not match) and returns a two-item
    list: the text before the first occurrence of *link* and the text
    after it. When *link* does not occur the second item is ''.
    """
    found = SUMMARY_PATTERN.match(data, start)
    text = found.group(0) if found else data[start:start+200]
    before, _sep, after = text.partition(link)
    return [before, after]
def object_check(self, object):
"""Check the relative links within this object."""
def checklink(match):
matched = match.group(0)
newlink = link = decodeEntities(match.group('href'))
classification, uid, relpath, tail = self.classifyLink(link, base)
if self.action=='check':
if classification=='bad':
abslink = urljoin(base, link)
before, after = self.link_summary(data, match.start(), link)
summary = {'text':link, 'url':abslink,
'before': before,
'after': after, }
info.append(summary)
elif self.action=='touid':
if classification=='internal':
if uid and uid==objuid:
newlink = tail
elif uid:
newlink = 'resolveuid/%s%s' % (uid, tail)
else:
newlink = relpath+tail
elif self.action=='topath':
if classification=='internal':
newlink = relpath+tail
if newlink != link:
prefix = match.group('prefix')
newlink = html_quote(newlink).encode('ascii', 'xmlcharrefreplace')
changes.append((match.start()+len(prefix), match.end(), newlink))
return prefix + newlink
return matched
info = []
changes = []
try:
objuid = aq_base(object).UID
except:
return None # only archetypes objects
baseobj = object
if object.portal_type==FRAGMENT_TYPE:
baseobj = object.aq_parent.aq_parent.aq_parent
base = baseobj.absolute_url()
if getattr(baseobj.aq_explicit, 'isPrincipiaFolderish', 0):
base += '/'
field = object.getField(self.fieldname)
if field is None:
return None
content_type = field.getContentType(object)
if content_type != 'text/html':
# Don't attempt to modify non-html
return None
data = field.getEditAccessor(object)().decode('utf8')
__traceback_info__ = (object, data)
newdata = LINK_PATTERN.sub(checklink, data)
if data != newdata and self.commit_changes:
mutator = field.getMutator(object)
if mutator:
mutator(newdata.encode('utf8'), mimetype='text/html')
object.reindexObject() # Need to flag update
if info or changes:
self.found += 1
title = object.Title()
if not title:
title = object.getId()
if not title:
title = '