# [Python] View as plain text | Copy code
import scrapy
from scrapy import cmdline
class DoubanSpider(scrapy.Spider):
    """Spider for the Douban movie Top 250 list.

    For every list entry it collects the title, rating and detail-page URL,
    then follows the detail page to add the text found under
    ``div.indent`` before yielding the finished item (a plain ``dict``).
    """

    name = 'douban'
    allowed_domains = ['movie.douban.com']
    start_urls = ['https://movie.douban.com/top250']
    # Number of list pages to request. The ``start`` query offset advances by
    # 25 per page. May be overridden with ``-a page_count=N`` (arrives as str).
    page_count = 1

    def start_requests(self):
        """Yield one request per list page, handled by :meth:`parse`."""
        for page in range(int(self.page_count)):
            url = 'https://movie.douban.com/top250?start={}&filter='.format(page * 25)
            yield scrapy.Request(url)

    def parse(self, response):
        """Extract the entries of one list page and follow each detail link.

        The partially filled item travels to :meth:`my_parse` through
        ``Request.meta['item']``.
        """
        rows = response.xpath('//ol[@class="grid_view"]/li/div/div[@class="info"]')
        for row in rows:
            item = {}
            item['title'] = row.xpath('./div/a/span[1]/text()').extract_first()
            item['rating_num'] = row.xpath('./div[@class="bd"]/div/span[2]/text()').extract_first()
            item['href'] = row.xpath('./div[@class="hd"]/a/@href').extract_first()
            print(item)
            if not item['href']:
                # extract_first() returns None when the link is missing;
                # scrapy.Request would raise on a non-string URL.
                self.logger.warning('Skipping entry without detail link: %r', item)
                continue
            # dont_filter is an argument of Request itself; placed inside
            # ``meta`` it is silently ignored by Scrapy.
            yield scrapy.Request(
                item['href'],
                callback=self.my_parse,
                meta={'item': item},
                dont_filter=True,
            )

    def my_parse(self, response):
        """Add the detail-page text to the item from ``parse`` and yield it."""
        item = response.meta['item']
        # Collect the text of every <p> located under a div with class "indent".
        fragments = response.xpath('//div[@class="indent"]//p/text()').extract()
        # Merge the extracted text fragments into a single string.
        item['indent'] = ' '.join(fragments).strip()
        print(item)
        yield item
if __name__ == '__main__':
    # Run the equivalent of ``scrapy crawl douban`` when executed directly.
    crawl_argv = ['scrapy', 'crawl', 'douban']
    cmdline.execute(crawl_argv)
改了一下，再试试。
|