#!/usr/bin/env python
from HTMLParser import HTMLParser
import greenlet
class IterParser(HTMLParser):
    """HTMLParser that hands every parse event to the parent greenlet.

    The parser must be driven from inside a greenlet.  Each ``handle_*``
    callback switches to the parent greenlet recorded by :meth:`feed` (or
    :meth:`close`), passing a single tuple whose first item names the
    event: ``'START'``, ``'STARTEND'``, ``'END'``, ``'DATA'``,
    ``'CHARREF'``, ``'ENTITYREF'``, ``'COMMENT'``, ``'DECL'`` or ``'PI'``.
    Parsing resumes when the parent switches back into the parser's
    greenlet.
    """

    def feed(self, data):
        """Parse *data*, switching to the parent greenlet for each event."""
        # Remember who to hand tokens to before any handler can fire.
        self.parent = greenlet.getcurrent().parent
        HTMLParser.feed(self, data)

    def close(self):
        """Flush buffered input, switching to the parent for each event.

        Closing the base parser processes whatever is still buffered and
        may therefore invoke the handlers, so the parent greenlet is
        recorded here exactly as in :meth:`feed`.
        """
        self.parent = greenlet.getcurrent().parent
        HTMLParser.close(self)

    def _emit(self, *token):
        # Hand one event tuple to the parent greenlet; execution continues
        # here once the parent switches back.
        self.parent.switch(token)

    def handle_starttag(self, tag, attrs):
        self._emit('START', tag, attrs)

    def handle_startendtag(self, tag, attrs):
        self._emit('STARTEND', tag, attrs)

    def handle_endtag(self, tag):
        self._emit('END', tag)

    def handle_data(self, data):
        self._emit('DATA', data)

    def handle_charref(self, ref):
        self._emit('CHARREF', ref)

    def handle_entityref(self, name):
        self._emit('ENTITYREF', name)

    def handle_comment(self, data):
        self._emit('COMMENT', data)

    def handle_decl(self, decl):
        self._emit('DECL', decl)

    def handle_pi(self, data):
        self._emit('PI', data)
def iterparse(stream):
    """Lazily yield parse events for the HTML read from *stream*.

    *stream* is any object with a ``read(size)`` method that returns an
    empty value at end of input.  It is consumed in 8192-character chunks;
    each chunk is fed to an :class:`IterParser` running in its own
    greenlet, and every event tuple the parser switches back with is
    yielded to the caller.  After the last chunk the parser is closed so
    that input it was still buffering is reported as well.
    """
    parser = IterParser()

    def drain(work):
        # Run *work* in a fresh greenlet and yield each token it switches
        # back with.
        coro = greenlet.greenlet(work)
        while True:
            token = coro.switch()
            if coro.dead:
                # This last switch returned work()'s return value, not a
                # parse event, so it must not be yielded.
                break
            yield token

    def finish():
        # close() may fire handlers too; they switch to parser.parent, so
        # point it at this greenlet's parent just as feed() does.
        parser.parent = greenlet.getcurrent().parent
        parser.close()

    while True:
        chunk = stream.read(8192)
        if not chunk:
            # Covers both '' and b'' at end of input.
            break
        # Bind the chunk as a default so the callable owns its own data.
        for token in drain(lambda data=chunk: parser.feed(data)):
            yield token

    for token in drain(finish):
        yield token
import sys
if __name__ == '__main__':
    # Tokenise the file named by the first argument, or stdin when no
    # argument is given, printing one event tuple per line.
    if len(sys.argv) == 1:
        stream = sys.stdin
    else:
        stream = open(sys.argv[1], 'r')
    try:
        for token in iterparse(stream):
            # Parenthesised single-argument form prints the same text
            # whether print is a statement or a function.
            print(token)
    finally:
        # Only close what this script opened; leave stdin alone.
        if stream is not sys.stdin:
            stream.close()