[ create a new paste ] login | about

Link: http://codepad.org/bPqMyuly    [ raw code | output | fork ]

Python, pasted on May 13:
from lxml import etree
import re

def fast_iter(context, func, *args, **kwargs):
    """Apply ``func`` to each element from ``context``, pruning the tree as it goes.

    Args:
        context: Iterable yielding ``(event, elem)`` pairs, such as the
            object returned by ``lxml.etree.iterparse``.
        func: Callable invoked as ``func(elem, *args, **kwargs)`` for every
            yielded element.
        *args: Extra positional arguments forwarded to ``func``.
        **kwargs: Extra keyword arguments forwarded to ``func``.

    After ``func`` returns, the element is cleared and the earlier siblings
    of the element and of each of its ancestors are deleted from their
    parents, so already-processed nodes are no longer referenced from the
    root.
    """
    for _event, elem in context:
        func(elem, *args, **kwargs)
        # Safe to clear here: func has returned, so no descendants of elem
        # will be accessed again.
        elem.clear()
        # Also eliminate now-empty references from the root node to elem.
        for ancestor in elem.xpath('ancestor-or-self::*'):
            parent = ancestor.getparent()
            if parent is None:
                # A parentless node (the root) can still report a previous
                # sibling, e.g. a comment placed before it; there is no
                # container to delete from, so skip it instead of failing
                # on ``del None[0]``.
                continue
            while ancestor.getprevious() is not None:
                del parent[0]
 
def process_element(elem):
    """Print the element's ``id`` attribute, then its first text node.

    Each value is the first result of an XPath query on ``elem`` and is
    printed on its own line. An ``IndexError`` propagates if either query
    returns no results.
    """
    for query in ('@id', 'text()'):
        matches = elem.xpath(query)
        print(matches[0])

if __name__ == '__main__':
    # Stream the sample dump, restricting iterparse to 'doc' elements, and
    # print each one's id and text via process_element.
    fast_iter(
        etree.iterparse('Wikipedia/small_wiki', tag='doc'),
        process_element,
    )


Output:
1
2
3
4
Traceback (most recent call last):
  Line 1, in <module>
    from lxml import etree
ImportError: No module named lxml


Create a new paste based on this one


Comments: