from lxml import etree
import re
def fast_iter(context, func, *args, **kwargs):
    """Apply *func* to every element yielded by *context*, pruning as it goes.

    For each ``(event, elem)`` pair produced by *context*, call
    ``func(elem, *args, **kwargs)``, then clear the element and delete the
    preceding siblings of the element and of each of its ancestors, so the
    already-processed part of the tree does not keep accumulating.

    Args:
        context: Iterable of ``(event, element)`` pairs, e.g. the object
            returned by ``etree.iterparse``.
        func: Callable invoked with each element as its first argument.
        *args: Extra positional arguments forwarded to *func*.
        **kwargs: Extra keyword arguments forwarded to *func*.
    """
    for _event, elem in context:
        func(elem, *args, **kwargs)
        # It's safe to call clear() here because no descendants will be
        # accessed after func has returned.
        elem.clear()
        # Also eliminate now-empty references from the root node to elem.
        for ancestor in elem.xpath('ancestor-or-self::*'):
            parent = ancestor.getparent()
            if parent is None:
                # The root has no parent element to delete from, yet it can
                # still report a previous sibling (a comment or processing
                # instruction placed before the root); without this guard
                # `del None[0]` would raise TypeError.
                continue
            while ancestor.getprevious() is not None:
                # parent[0] is always an earlier sibling of `ancestor` here.
                del parent[0]
    # Drop this function's own reference to the parsing context.
    del context
def process_element(elem):
    """Print the first ``@id`` match and then the first ``text()`` match of *elem*.

    Each value is the first item of the corresponding ``elem.xpath(...)``
    result; an empty result raises ``IndexError``.
    """
    # Evaluate and print one query at a time, id first, so the output order
    # matches the order of the queries.
    for query in ('@id', 'text()'):
        matches = elem.xpath(query)
        print(matches[0])
if __name__ == '__main__':
    # Stream the <doc> elements of the sample file through process_element,
    # letting fast_iter prune each element once it has been handled.
    fast_iter(
        etree.iterparse('Wikipedia/small_wiki', tag='doc'),
        process_element,
    )