#!/usr/bin/env python
"""Time several HTML/XML parsers over the files found on disk.

Every file under ``HTML_PATH`` and ``XML_PATH`` is read into memory once and
then fed to each parser in turn.  For each parser the wall-clock time and the
number of pages that raised an exception are printed, first for the HTML
pages, then for the XML pages, followed by the combined total.

This is a Python 2 script (it depends on ``HTMLParser``, ``sgmllib``,
``BeautifulSoup`` 3 and ``sgmlop``).  ``print`` is always called with a single
parenthesised argument so that the output is identical whether it is parsed
as a statement or as a function call.
"""
import BeautifulSoup
import HTMLParser
import os
import sgmllib
import sgmlop
import time

HTML_PATH = 'html'
XML_PATH = 'xml'


def _read_all(directory):
    """Return the contents of every entry in *directory* as a list."""
    pages = []
    for name in os.listdir(directory):
        # Close each file explicitly instead of relying on garbage collection.
        with open(os.path.join(directory, name)) as handle:
            pages.append(handle.read())
    return pages


def get_data():
    """Load the benchmark corpus.

    Returns:
        A ``(html, xml)`` tuple; each element is a list holding the contents
        of every file in ``HTML_PATH`` and ``XML_PATH`` respectively.
    """
    return _read_all(HTML_PATH), _read_all(XML_PATH)


def parse(klass, data):
    """Feed every page in *data* to a fresh ``klass()`` instance.

    Args:
        klass: Zero-argument callable returning an object with ``feed()``.
        data: Iterable of page contents.

    Returns:
        A list with ``str(exception)`` for every page whose ``feed()`` raised.
    """
    errors = []
    for page in data:
        # Instantiation is deliberately outside the try: only failures while
        # feeding a page are counted as parse errors.
        parser = klass()
        try:
            parser.feed(page)
        except Exception as e:
            errors.append(str(e))
    return errors


def soup_parse(data):
    """Build a ``BeautifulSoup.BeautifulSoup`` object from every page.

    Args:
        data: Iterable of page contents.

    Returns:
        A list with ``str(exception)`` for every page that raised.
    """
    errors = []
    for page in data:
        try:
            BeautifulSoup.BeautifulSoup(page)
        except Exception as e:
            errors.append(str(e))
    return errors


def sgmlop_parse(klass, handler, data):
    """Feed every page to a fresh ``klass()`` with *handler* registered.

    Args:
        klass: Zero-argument callable returning an object that provides
            ``register()`` and ``feed()``.
        handler: Object passed to ``register()`` on each new parser.
        data: Iterable of page contents.

    Returns:
        A list with ``str(exception)`` for every page whose ``feed()`` raised.
    """
    errors = []
    for page in data:
        # Construction and registration stay outside the try so that only
        # failures while feeding a page are counted.
        parser = klass()
        parser.register(handler)
        try:
            parser.feed(page)
        except Exception as e:
            errors.append(str(e))
    return errors


class _NullHandler(object):
    """Handler whose callbacks do nothing, registered with the sgmlop parsers."""

    def handle_special(self, text):
        pass

    def handle_proc(self, target, value):
        pass

    def finish_starttag(self, tag, attrs):
        pass

    def finish_endtag(self, tag):
        pass

    def handle_data(self, data):
        pass


def _timed(label, run, pages):
    """Time ``run(pages)``, print one result line and return the elapsed time.

    ``run`` must return the list of errors it collected; only its length is
    reported.
    """
    start = time.time()
    errors = run(pages)
    elapsed = time.time() - start
    print("%s - time: %s, errors: %d" % (label, elapsed, len(errors)))
    return elapsed


def _report(name, run, html, xml):
    """Benchmark *run* on the HTML pages, then the XML pages, and print the total."""
    html_time = _timed("%s, only HTML" % name, run, html)
    xml_time = _timed("%s, only XML" % name, run, xml)
    print("Total: %f" % (html_time + xml_time,))
    print("")  # blank separator line between parsers


def bench_with_time():
    """Run every parser over the corpus and print wall-clock timings."""
    html, xml = get_data()

    # (label, callable taking a list of pages and returning a list of errors)
    runs = (
        ("HTMLParser",
         lambda pages: parse(HTMLParser.HTMLParser, pages)),
        ("sgmllib.SGMLParser",
         lambda pages: parse(sgmllib.SGMLParser, pages)),
        ("BeautifulSoup",
         soup_parse),
        ("sgmlop.SGMLParser",
         lambda pages: sgmlop_parse(sgmlop.SGMLParser, _NullHandler(), pages)),
        ("sgmlop.XMLParser",
         lambda pages: sgmlop_parse(sgmlop.XMLParser, _NullHandler(), pages)),
    )
    for name, run in runs:
        # _report feeds `html` to the HTML measurement and `xml` to the XML
        # measurement for every parser, so the two corpora cannot be mixed up
        # (the sgmlop.XMLParser "only XML" run used to be fed the HTML pages).
        _report(name, run, html, xml)


if __name__ == '__main__':
    bench_with_time()