import scrapy
from movie.items import MovieItem

class MeijuSpider(scrapy.Spider):
    name = "meiju"
    allowed_domains = ["alexa.cn"]
    # Scrapy requires a scheme on start URLs, otherwise it raises
    # "Missing scheme in request url"
    start_urls = ['http://www.alexa.cn/siterank']

    def parse(self, response):
        movies = response.xpath('//ul[@class="siterank-sitelist"]/li')
        for each_movie in movies:
            item = MovieItem()
            item['name'] = each_movie.xpath('.//p[@class="infos"]').extract()[0]
            yield item
That's my code. The pages I want to crawl in a loop are:
www.alexa.cn/siterank/2
www.alexa.cn/siterank/3
www.alexa.cn/siterank/4
.....
I figure the loop should be something like for i in range(2, 10):
yield scrapy.Request('www.alexa.cn/siterank/%d' % i), but I don't know where in the spider to put it. Any help appreciated.
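For reference, one place such a loop can live is start_requests(), the method Scrapy calls to produce a spider's initial requests. A minimal sketch, assuming the 2-9 range from the question and reusing the parse() from above (note the scheme must be added to the URL):

import scrapy
from movie.items import MovieItem

class MeijuSpider(scrapy.Spider):
    name = "meiju"
    allowed_domains = ["alexa.cn"]

    def start_requests(self):
        # Yield one request per page; Scrapy schedules them all and
        # sends each response to self.parse.
        for i in range(2, 10):
            yield scrapy.Request('http://www.alexa.cn/siterank/%d' % i,
                                 callback=self.parse)

    def parse(self, response):
        ...  # same item extraction as in the question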
If the page range is already known, the simplest approach is to build it into start_urls:
start_urls = ['http://www.alexa.cn/siterank/{n}'.format(n=x) for x in range(2,10)]
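This works because Scrapy's default start_requests() simply turns each entry of start_urls into a request handled by parse(). Simplified, the default behaves roughly like this:

def start_requests(self):
    # Simplified view of the default implementation: one request per
    # start URL, all routed to the spider's parse() callback.
    for url in self.start_urls:
        yield scrapy.Request(url, dont_filter=True)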
The official docs have an example of following the next page: the spider schedules its own parse method as the callback again, so it keeps following "next" links recursively. The code from the docs:
import scrapy

class QuotesSpider(scrapy.Spider):
    name = "quotes"
    start_urls = [
        'http://quotes.toscrape.com/page/1/',
    ]

    def parse(self, response):
        for quote in response.css('div.quote'):
            yield {
                'text': quote.css('span.text::text').extract_first(),
                'author': quote.css('small.author::text').extract_first(),
                'tags': quote.css('div.tags a.tag::text').extract(),
            }
        # next_page is the next-page link obtained with a CSS selector;
        # below, parse is scheduled again as the callback, so the spider
        # recursively follows "next" until no such link exists.
        next_page = response.css('li.next a::attr(href)').extract_first()
        if next_page is not None:
            next_page = response.urljoin(next_page)
            yield scrapy.Request(next_page, callback=self.parse)
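Assuming Scrapy 1.4 or later (where response.follow was added), the tail of parse() can drop the urljoin step, since response.follow accepts relative URLs directly:

        next_page = response.css('li.next a::attr(href)').extract_first()
        if next_page is not None:
            # response.follow resolves relative URLs itself,
            # so no response.urljoin() call is needed.
            yield response.follow(next_page, callback=self.parse)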
I wrote a Tieba (Baidu forum) crawler with Scrapy myself, and it follows the next page in the same recursive way. The code:
import scrapy
from tieba_crawler.items import ImageItem

class TiebaSpider(scrapy.Spider):
    name = 'tbimg'

    def start_requests(self):
        url = 'http://tieba.baidu.com/f?kw=%E6%B8%A1%E8%BE%B9%E9%BA%BB%E5%8F%8B'
        yield scrapy.Request(url=url, callback=self.parse_post)

    def parse_post(self, response):
        post_list = response.css('ul#thread_list li.j_thread_list')
        for item in post_list:
            title = item.css('a.j_th_tit::text').extract_first()
            url = 'http://tieba.baidu.com' \
                  + item.css('a.j_th_tit::attr(href)').extract_first()
            yield scrapy.Request(url=url, callback=self.parse_image)
        # The second-to-last link in the pager is the "next page" link
        # (the last one is "last page").
        page_list = response.css('div#frs_list_pager a::attr(href)').extract()
        if not page_list:
            return
        next_page = page_list[-2]
        if next_page:
            yield response.follow(next_page, callback=self.parse_post)

    def parse_image(self, response):
        img_urls = response.css('div#j_p_postlist img.BDE_Image::attr(src)').extract()
        yield ImageItem(image_urls=img_urls)
        page_list = response.css('ul.l_posts_num li.pb_list_pager a::attr(href)').extract()
        if not page_list:
            return
        next_page = page_list[-2]
        if next_page:
            yield response.follow(next_page, callback=self.parse_image)
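For completeness: yielding ImageItem(image_urls=...) like this usually pairs with Scrapy's built-in ImagesPipeline, which downloads every URL in the item's image_urls field. A sketch of the supporting pieces; the item definition and settings below are assumptions, since the original project's files aren't shown:

# items.py (assumed definition)
import scrapy

class ImageItem(scrapy.Item):
    image_urls = scrapy.Field()  # read by ImagesPipeline: URLs to download
    images = scrapy.Field()      # filled in by ImagesPipeline after download

# settings.py (assumed configuration)
ITEM_PIPELINES = {'scrapy.pipelines.images.ImagesPipeline': 1}
IMAGES_STORE = '/path/to/downloaded/images'  # hypothetical storage path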