codepad
[
create a new paste
]
login
|
about
Language:
C
C++
D
Haskell
Lua
OCaml
PHP
Perl
Plain Text
Python
Ruby
Scheme
Tcl
# -*- coding: utf-8 -*-
try:
    import urlparse  # Python 2 module name
except ImportError:  # Python 3 exposes the same functions in urllib.parse
    from urllib import parse as urlparse

import scrapy
from scrapy.http import Request

from dmoz2pages.items import Dmoz2PagesItem


def _first_stripped(selection, default=''):
    """Return the first string extracted from *selection*, stripped.

    ``selection.extract()[0]`` raises ``IndexError`` when the XPath matched
    nothing; this helper returns *default* in that case so one malformed
    entry does not abort the whole callback.
    """
    values = selection.extract()
    return values[0].strip() if values else default


class Spider1Spider(scrapy.Spider):
    """Two-step spider: read links from a listing page, then visit each one.

    ``parse`` builds a ``Dmoz2PagesItem`` per listing anchor (``name`` and
    ``link``) and requests the linked page; ``parse_link`` adds the linked
    page's ``title`` and yields the finished item.
    """

    name = "spider1"
    # allowed_domains = ["dmoz.com"]
    start_urls = (
        'http://www.dmoz.org/Computers/Programming/Languages/Python/Books/',
    )

    def parse(self, response):
        """Yield one follow-up ``Request`` per listing anchor on the page.

        The partially filled item travels to ``parse_link`` through
        ``Request.meta['item']``.
        """
        anchors = response.xpath(
            '/html/body/div[5]/div/section[3]/div/div/div/div[3]/a'
        )
        for sel in anchors:
            name = _first_stripped(sel.xpath('div/text()'))
            link = _first_stripped(sel.xpath('@href'))
            desc = _first_stripped(sel.xpath('text()'))
            if not link:
                # An anchor without an href gives us nothing to follow.
                continue
            # Request needs an absolute URL; urljoin leaves absolute links
            # untouched and resolves relative ones against the current page.
            link = urlparse.urljoin(response.url, link)
            # Single-argument form prints identically on Python 2 and 3.
            print("%s %s %s" % (name, link, desc))

            item = Dmoz2PagesItem()
            item['name'] = name
            item['link'] = link
            yield Request(link, meta={'item': item}, callback=self.parse_link)

    def parse_link(self, response):
        """Complete the item carried in ``response.meta`` with the page title."""
        item = response.meta['item']
        # Select the text node: '/html/head/title' alone would extract the
        # element markup, i.e. '<title>...</title>'.
        item['title'] = _first_stripped(
            response.xpath('/html/head/title/text()')
        )
        yield item
Private
[
?
]
Run code
Submit