Scrapy: scrapy_redis

Published 2023-08-07 13:01:23 · Author: 看一百次夜空里的深蓝
# Install
pip3 install scrapy_redis
# Source
https://github.com/rmax/scrapy-redis.git
# Docs
https://github.com/rmax/scrapy-redis

# Configuration notes: https://github.com/rmax/scrapy-redis/wiki/Usage
REDIS_HOST = 'localhost'
REDIS_PORT = 6379
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
# Keep scheduler and dupefilter state in Redis so the spider can be paused and resumed
SCHEDULER_PERSIST = True
# SCHEDULER_QUEUE_CLASS = "scrapy_redis.queue.SpiderPriorityQueue"
# SCHEDULER_QUEUE_CLASS = "scrapy_redis.queue.SpiderQueue"
# SCHEDULER_QUEUE_CLASS = "scrapy_redis.queue.SpiderStack"

# Store scraped items in Redis via the bundled pipeline
ITEM_PIPELINES = {
    'scrapy_redis.pipelines.RedisPipeline': 400,
}

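# RedisPipeline serializes each item to JSON and pushes it onto a Redis list
# (by default '<spider>:items'), so items can be consumed by a separate process
# while the spiders run. A minimal consumer sketch, assuming the default key name,
# a local Redis instance, and the 'baidu' spider defined below (all assumptions,
# adjust to your setup):
import json

import redis

r = redis.Redis(host='localhost', port=6379)

while True:
    # blpop blocks until an item is available, then returns (key, raw_json)
    _key, raw = r.blpop('baidu:items')
    item = json.loads(raw)
    print(item.get('name'), item.get('url'))
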
# Spider
from scrapy_redis.spiders import RedisSpider

class BaiduSpider(RedisSpider):
    """Spider that reads start urls from the redis queue 'myspider:baidu'."""
    name = 'baidu'
    redis_key = 'myspider:baidu'
    # allowed_domains = ['baidu.com']

    def __init__(self, *args, **kwargs):
        # Dynamically define the allowed domains list,
        # e.g. scrapy crawl baidu -a domain=baidu.com
        domain = kwargs.pop('domain', '')
        # filter() returns an iterator in Python 3, so materialize it as a list
        self.allowed_domains = list(filter(None, domain.split(',')))
        super(BaiduSpider, self).__init__(*args, **kwargs)

    def parse(self, response):
        print(response.text)
        # return {
        #     'name': response.css('title::text').extract_first(),
        #     'url': response.url,
        # }

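# The spider above defines no start_urls; it idles until urls are pushed to its
# redis_key. A minimal seeding sketch with redis-py, equivalent to
# 'redis-cli lpush myspider:baidu https://www.baidu.com' (the url is just an example).
# Start the spider itself with: scrapy crawl baidu -a domain=baidu.com
import redis

r = redis.Redis(host='localhost', port=6379)
r.lpush('myspider:baidu', 'https://www.baidu.com')
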
# CrawlSpider
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import Rule
from scrapy_redis.spiders import RedisCrawlSpider


class FanqienovelSpider(RedisCrawlSpider):
    name = 'fanqienovel'
    redis_key = 'mycrawler:fanqienovel'
    # allowed_domains = ['fanqienovel.com']

    rules = (
        # follow all links
        Rule(LinkExtractor(), callback='parse_page', follow=True),
    )

    def __init__(self, *args, **kwargs):
        # Dynamically define the allowed domains list.
        domain = kwargs.pop('domain', '')
        # filter() returns an iterator in Python 3, so materialize it as a list
        self.allowed_domains = list(filter(None, domain.split(',')))
        super(FanqienovelSpider, self).__init__(*args, **kwargs)

    def parse_page(self, response):
        print(response.text)
        return {
            'name': response.css('title::text').extract_first(),
            'url': response.url,
        }
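
# Start the crawler with 'scrapy crawl fanqienovel -a domain=fanqienovel.com' (domain is
# an example) and seed 'mycrawler:fanqienovel' the same way as above. Because
# SCHEDULER_PERSIST = True, the request queue and dupefilter survive a shutdown, which is
# what allows pausing and resuming. A small inspection/reset sketch, assuming scrapy-redis'
# default key names '<spider>:requests' and '<spider>:dupefilter' and the default priority
# queue, which keeps pending requests in a sorted set:
import redis

r = redis.Redis(host='localhost', port=6379)

print('pending requests:', r.zcard('fanqienovel:requests'))
print('seen fingerprints:', r.scard('fanqienovel:dupefilter'))

# To start from scratch instead of resuming, delete the persisted state:
# r.delete('fanqienovel:requests', 'fanqienovel:dupefilter')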