"""HTML Stripper Strips away all the HTML from a string. Contains portions of Universal Feed Parser by Mark Pilgrim (http://feedparser.org/) """ __license__ = """Copyright (c) 2006, Lawrence Oluyede, All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 'AS IS' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.""" __feedparser_license__ = """Copyright (c) 2002-2006, Mark Pilgrim, All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 'AS IS' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.""" import sgmllib import re class _BaseHTMLProcessor(sgmllib.SGMLParser): elements_no_end_tag = ['area', 'base', 'basefont', 'br', 'col', 'frame', 'hr', 'img', 'input', 'isindex', 'link', 'meta', 'param'] def __init__(self, encoding): self.encoding = encoding sgmllib.SGMLParser.__init__(self) def reset(self): self.pieces = [] sgmllib.SGMLParser.reset(self) def _shorttag_replace(self, match): tag = match.group(1) if tag in self.elements_no_end_tag: return '<' + tag + ' />' else: return '<' + tag + '>' + tag + '>' def feed(self, data): data = re.compile(r'', self._shorttag_replace, data) # bug [ 1399464 ] Bad regexp for _shorttag_replace data = re.sub(r'<([^<\s]+?)\s*/>', self._shorttag_replace, data) data = data.replace(''', "'") data = data.replace('"', '"') if self.encoding and type(data) == type(u''): data = data.encode(self.encoding) sgmllib.SGMLParser.feed(self, data) def normalize_attrs(self, attrs): # utility method to be called by descendants attrs = [(k.lower(), v) for k, v in attrs] attrs = [(k, k in ('rel', 'type') and v.lower() or v) for k, v in attrs] return attrs def unknown_starttag(self, tag, attrs): # called for each start tag # attrs is a list of (attr, value) tuples # e.g. for
, tag='pre', attrs=[('class', 'screen')]
uattrs = []
# thanks to Kevin Marks for this breathtaking hack to deal with (valid) high-bit attribute values in UTF-8 feeds
for key, value in attrs:
if type(value) != type(u''):
value = unicode(value, self.encoding)
uattrs.append((unicode(key, self.encoding), value))
strattrs = u''.join([u' %s="%s"' % (key, value) for key, value in uattrs]).encode(self.encoding)
if tag in self.elements_no_end_tag:
self.pieces.append('<%(tag)s%(strattrs)s />' % locals())
else:
self.pieces.append('<%(tag)s%(strattrs)s>' % locals())
def unknown_endtag(self, tag):
# called for each end tag, e.g. for , tag will be 'pre'
# Reconstruct the original end tag.
if tag not in self.elements_no_end_tag:
self.pieces.append("%(tag)s>" % locals())
def handle_charref(self, ref):
# called for each character reference, e.g. for ' ', ref will be '160'
# Reconstruct the original character reference.
self.pieces.append('%(ref)s;' % locals())
def handle_entityref(self, ref):
# called for each entity reference, e.g. for '©', ref will be 'copy'
# Reconstruct the original entity reference.
self.pieces.append('&%(ref)s;' % locals())
def handle_data(self, text):
# called for each block of plain text, i.e. outside of any tag and
# not containing any character or entity references
# Store the original text verbatim.
self.pieces.append(text)
def handle_comment(self, text):
# called for each HTML comment, e.g.
# Reconstruct the original comment.
self.pieces.append('' % locals())
def handle_pi(self, text):
# called for each processing instruction, e.g.
# Reconstruct original processing instruction.
self.pieces.append('%(text)s>' % locals())
def handle_decl(self, text):
# called for the DOCTYPE, if present, e.g.
#
# Reconstruct original DOCTYPE
self.pieces.append('' % locals())
_new_declname_match = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9:]*\s*').match
def _scan_name(self, i, declstartpos):
rawdata = self.rawdata
n = len(rawdata)
if i == n:
return None, -1
m = self._new_declname_match(rawdata, i)
if m:
s = m.group()
name = s.strip()
if (i + len(s)) == n:
return None, -1 # end of buffer
return name.lower(), m.end()
else:
self.handle_data(rawdata)
return None, -1
def output(self):
'''Return processed HTML as a single string'''
return ''.join([str(p) for p in self.pieces])
class _HTMLSanitizer(_BaseHTMLProcessor):
acceptable_elements = []
## ['a', 'abbr', 'acronym', 'address', 'area', 'b', 'big',
## 'blockquote', 'br', 'button', 'caption', 'center', 'cite', 'code', 'col',
## 'colgroup', 'dd', 'del', 'dfn', 'dir', 'div', 'dl', 'dt', 'em', 'fieldset',
## 'font', 'form', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'i', 'img', 'input',
## 'ins', 'kbd', 'label', 'legend', 'li', 'map', 'menu', 'ol', 'optgroup',
## 'option', 'p', 'pre', 'q', 's', 'samp', 'select', 'small', 'span', 'strike',
## 'strong', 'sub', 'sup', 'table', 'tbody', 'td', 'textarea', 'tfoot', 'th',
## 'thead', 'tr', 'tt', 'u', 'ul', 'var']
acceptable_attributes = []
## ['abbr', 'accept', 'accept-charset', 'accesskey',
## 'action', 'align', 'alt', 'axis', 'border', 'cellpadding', 'cellspacing',
## 'char', 'charoff', 'charset', 'checked', 'cite', 'class', 'clear', 'cols',
## 'colspan', 'color', 'compact', 'coords', 'datetime', 'dir', 'disabled',
## 'enctype', 'for', 'frame', 'headers', 'height', 'href', 'hreflang', 'hspace',
## 'id', 'ismap', 'label', 'lang', 'longdesc', 'maxlength', 'media', 'method',
## 'multiple', 'name', 'nohref', 'noshade', 'nowrap', 'prompt', 'readonly',
## 'rel', 'rev', 'rows', 'rowspan', 'rules', 'scope', 'selected', 'shape', 'size',
## 'span', 'src', 'start', 'summary', 'tabindex', 'target', 'title', 'type',
## 'usemap', 'valign', 'value', 'vspace', 'width']
unacceptable_elements_with_end_tag = ['script', 'applet']
def reset(self):
_BaseHTMLProcessor.reset(self)
self.unacceptablestack = 0
def unknown_starttag(self, tag, attrs):
if not tag in self.acceptable_elements:
if tag in self.unacceptable_elements_with_end_tag:
self.unacceptablestack += 1
return
attrs = self.normalize_attrs(attrs)
attrs = [(key, value) for key, value in attrs if key in self.acceptable_attributes]
_BaseHTMLProcessor.unknown_starttag(self, tag, attrs)
def unknown_endtag(self, tag):
if not tag in self.acceptable_elements:
if tag in self.unacceptable_elements_with_end_tag:
self.unacceptablestack -= 1
return
_BaseHTMLProcessor.unknown_endtag(self, tag)
def handle_pi(self, text):
pass
def handle_decl(self, text):
pass
def handle_data(self, text):
if not self.unacceptablestack:
_BaseHTMLProcessor.handle_data(self, text)
def stripHTML(htmlSource, encoding):
p = _HTMLSanitizer(encoding)
p.feed(htmlSource)
data = p.output()
data = data.strip().replace('\r\n', '\n')
return data