Scrapy Data Saving Example: Saving a Novel

Published 2023-06-24 19:07:11 · Author: jiang_jiayun

spider

import scrapy


class XiaoshuoSpider(scrapy.Spider):
    name = "爬虫名"
    allowed_domains = ["域名"]
    start_urls = ["第一章url地址"]

    def parse(self, response):
        # Chapter title
        title = response.xpath('//h1/text()').get()     # .get() is the newer name for extract_first()
        # Chapter content
        content = response.xpath('//div[@id="content"]/text()').getall()   # .getall() is the newer name for extract()
        # Link to the next chapter
        next_url = response.xpath('//div[@class="bottem2"]/a[4]/@href').get()

        yield {
            'title': title,
            'content': content
        }
        # next_url is None on the last chapter, so follow it only when present
        if next_url:
            yield scrapy.Request('https://www.tycqzw.la' + next_url, callback=self.parse)
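
The three XPath expressions above are specific to the target site's page layout. Expressions of this shape can be checked offline with scrapy.Selector before running the full spider; here is a minimal sketch against a hand-written HTML fragment (the fragment and its structure are assumptions for illustration, not the real page). scrapy shell <url> performs the same check against the live page.

from scrapy import Selector

# Hand-written fragment mimicking the assumed page structure
html = '''
<h1>Chapter 1</h1>
<div id="content">First paragraph.<br>Second paragraph.</div>
<div class="bottem2"><a>prev</a><a>menu</a><a>top</a><a href="/book/2.html">next</a></div>
'''
sel = Selector(text=html)
print(sel.xpath('//h1/text()').get())                         # Chapter 1
print(sel.xpath('//div[@id="content"]/text()').getall())      # ['First paragraph.', 'Second paragraph.']
print(sel.xpath('//div[@class="bottem2"]/a[4]/@href').get())  # /book/2.html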

pipeline

class Scrapy05Pipeline:
    def open_spider(self, spider):
        # Open the output file once, when the spider starts
        self.file = open('xiaoshuo.txt', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        # Write the chapter title, then the joined content paragraphs
        self.file.write(item['title'] + '\n')
        self.file.write(''.join(item['content']) + '\n\n\n\n')
        return item

    def close_spider(self, spider):
        # Close the file when the spider finishes
        self.file.close()
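
open_spider and close_spider run once per crawl, while process_item runs once per yielded item. If structured output is preferred over plain text, a JSON Lines variant is a small change. This is a sketch of an alternative, not part of the original post; the JsonLinesPipeline name and xiaoshuo.jl filename are made up here. To use it, register it in ITEM_PIPELINES like the text pipeline.

import json

class JsonLinesPipeline:
    def open_spider(self, spider):
        self.file = open('xiaoshuo.jl', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        # One JSON object per line; ensure_ascii=False keeps Chinese text readable
        self.file.write(json.dumps(dict(item), ensure_ascii=False) + '\n')
        return item

    def close_spider(self, spider):
        self.file.close()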

settings

# Request header (User-Agent)
USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36"
# Comment out robots.txt compliance
# ROBOTSTXT_OBEY = True
# Delay between requests, in seconds
DOWNLOAD_DELAY = 2
# Enable the pipeline
ITEM_PIPELINES = {
   "scrapy05.pipelines.Scrapy05Pipeline": 300,
}
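
With the settings in place, the spider is started with scrapy crawl <spider name> from the project root, or from a plain Python script via CrawlerProcess. A minimal sketch, assuming the project package is scrapy05 (taken from the pipeline path above) and a guessed spider module path; run it from the project root so get_project_settings() can find settings.py.

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

# Hypothetical module path; adjust to where XiaoshuoSpider actually lives
from scrapy05.spiders.xiaoshuo import XiaoshuoSpider

# get_project_settings() picks up settings.py, including the pipeline above
process = CrawlerProcess(get_project_settings())
process.crawl(XiaoshuoSpider)
process.start()  # blocks until the crawl finishes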