# -*- coding: utf-8 -*-
import scrapy
from dmoz2pages.items import Dmoz2PagesItem
import urlparse
from scrapy.http import Request
class Spider1Spider(scrapy.Spider):
    """Two-stage spider: read a listing page, then visit every listed link.

    ``parse`` walks the anchors found on the start page and schedules one
    follow-up request per anchor, carrying a partially filled
    ``Dmoz2PagesItem`` in the request ``meta``. ``parse_link`` completes the
    item with the title of the linked page and yields it.
    """

    name = "spider1"
    # allowed_domains = ["dmoz.com"]
    start_urls = (
        'http://www.dmoz.org/Computers/Programming/Languages/Python/Books/',
    )

    @staticmethod
    def _first_text(selection, default=''):
        """Return the first extracted value of *selection*, stripped.

        Falls back to *default* when the selection matched nothing, instead
        of raising ``IndexError`` the way ``extract()[0]`` does.
        """
        values = selection.extract()
        return values[0].strip() if values else default

    def parse(self, response):
        """Yield one follow-up ``Request`` per anchor on the listing page.

        Each request carries an item with ``name`` and ``link`` set and is
        handled by ``parse_link``. Anchors without an ``href`` are skipped.
        """
        anchors = response.xpath(
            '/html/body/div[5]/div/section[3]/div/div/div/div[3]/a')
        for sel in anchors:
            link = self._first_text(sel.xpath('@href'))
            if not link:
                # Nothing to follow; skip this anchor rather than letting an
                # exception abort the remaining entries of the page.
                continue

            name = self._first_text(sel.xpath('div/text()'))
            desc = self._first_text(sel.xpath('text()'))
            self.log('%s %s %s' % (name, link, desc))

            item = Dmoz2PagesItem()
            item['name'] = name
            # Request() rejects URLs without a scheme, so resolve relative
            # hrefs against the page URL. Absolute hrefs pass through as-is.
            item['link'] = urlparse.urljoin(response.url, link)
            yield Request(item['link'], meta={'item': item},
                          callback=self.parse_link)

    def parse_link(self, response):
        """Add the linked page's title to the item from ``meta`` and yield it.

        ``title`` is the text content of ``<title>`` (without the surrounding
        tags), or an empty string when the page has no title.
        """
        item = response.meta['item']
        item['title'] = self._first_text(
            response.xpath('/html/head/title/text()'))
        yield item