Scrapy-CrawlSpider爬虫类使用案例

发布时间 2023-07-17 11:46:18作者: 蕝戀

CrawlSpider类型的爬虫会根据指定的rules规则自动找到url并自动爬取。

优点:适合整站爬取,自动翻页爬取

缺点:比较难通过meta传参,只适合在一个页面内就能拿完数据的场景。


import scrapy
from scrapy.http import HtmlResponse
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

from zolwallpaper.items import Bang123Item


class Bang123Spider(CrawlSpider):
    """Crawl bang123.cn index pages and scrape every linked article page.

    Starting from the site root, the rules below follow paginated index
    pages and hand each article detail page to :meth:`parse_item`.
    """

    name = "bang123"
    allowed_domains = ["bang123.cn"]
    start_urls = ["https://www.bang123.cn/"]

    # NOTE: dots are escaped so that "." only matches a literal dot in the
    # URL instead of any character.
    rules = (
        # Pagination: follow index pages so further links are discovered.
        Rule(
            LinkExtractor(allow=r"https://www\.bang123\.cn/index_\d+\.html"),
            follow=True,
        ),
        # Detail pages: links inside them are not followed (follow=False),
        # but their content is extracted by parse_item.
        Rule(
            LinkExtractor(allow=r"https://www\.bang123\.cn/gongshi/\d+\.html"),
            callback="parse_item",
            follow=False,
        ),
    )

    def parse_item(self, response: HtmlResponse):
        """Build a ``Bang123Item`` from one article detail page.

        Args:
            response: The downloaded detail page.

        Returns:
            A ``Bang123Item`` whose ``title`` is the text of the ``<h1>``
            inside the first article container (``None`` when the container
            or heading is absent) and whose ``main`` is the list of
            serialized ``<p>`` elements found in the content ``<div>``.
        """
        bang_item = Bang123Item()

        containers = response.xpath(
            '//div[@class="article_content layui-field-box"]'
        )
        if containers:
            title = containers[0].xpath('./h1/text()').get()
        else:
            # Indexing an empty SelectorList would raise IndexError and drop
            # the whole item; record the anomaly and keep going instead.
            self.logger.warning(
                "Article container not found on %s", response.url
            )
            title = None

        main = response.xpath(
            '//div[@class="content tindent of_table"]/p'
        ).getall()

        bang_item["title"] = title
        bang_item["main"] = main

        # Console output kept for demonstration/debugging purposes.
        print(f"【{title=}】")
        print(f"{main=}")
        print("-"*150)

        return bang_item