Extracting links with LinkExtractor
Create the spider
scrapy genspider <spider_name> <domain> -t crawl
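For example, with a hypothetical spider named xs targeting the hypothetical domain example.com, the call would look like this:

scrapy genspider xs example.com -t crawl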
spider
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class XsSpider(CrawlSpider):
    name = "spider_name"
    allowed_domains = ["domain"]
    start_urls = ["homepage_url"]

    # Single-rule variant:
    # rules = (Rule(LinkExtractor(restrict_xpaths='//*[@id="list"]/dl/dd[10]/a'), callback="parse_item", follow=True),)
    rules = (
        # Link to the first chapter
        Rule(LinkExtractor(restrict_xpaths='//div[@id="list"]/dl/dd[10]/a'), callback="parse_item", follow=True),
        # Link to the next chapter
        Rule(LinkExtractor(restrict_xpaths='//div[@class="bottem2"]/a[4]'), callback="parse_item", follow=True),
    )

    def parse_item(self, response):
        # Chapter title
        title = response.xpath('//h1/text()').get()  # older API: extract_first()
        # Chapter body text
        content = response.xpath('//div[@id="content"]/text()').getall()  # older API: extract()
        yield {
            'title': title,
            'content': content,
        }
Crawling with only the next-chapter rule may skip the first chapter, so the first chapter's URL is extracted with its own rule; the shell check below shows how to verify both XPaths.
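A quick way to confirm that the two XPaths really select the intended links is to test the extractors in scrapy shell. The URL below is a placeholder for the novel's table-of-contents page:

# Inside `scrapy shell "https://example.com/book/"` (placeholder URL),
# the shell already provides a `response` object.
from scrapy.linkextractors import LinkExtractor

first_chapter = LinkExtractor(restrict_xpaths='//div[@id="list"]/dl/dd[10]/a')
next_chapter = LinkExtractor(restrict_xpaths='//div[@class="bottem2"]/a[4]')

print([link.url for link in first_chapter.extract_links(response)])
print([link.url for link in next_chapter.extract_links(response)])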
pipeline
class Scrapy05Pipeline:
    def open_spider(self, spider):
        # Open the output file once when the spider starts
        self.file = open('filename.txt', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        self.file.write(item['title'] + '\n')
        self.file.write(''.join(item['content']) + '\n\n\n\n')
        return item

    def close_spider(self, spider):
        self.file.close()
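For the pipeline to run, it must be enabled in settings.py. The package name scrapy05 below is an assumption based on the class name; substitute your own project package:

# settings.py
ITEM_PIPELINES = {
    "scrapy05.pipelines.Scrapy05Pipeline": 300,  # lower number = earlier in the pipeline chain
}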
begin
Run the command scrapy crawl <spider_name> from a script:
from scrapy.cmdline import execute

execute(['scrapy', 'crawl', 'spider_name'])
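Save this as a small script (for example begin.py) in the project root next to scrapy.cfg so that Scrapy can locate the project settings; running it from an IDE then has the same effect as typing the command in a terminal.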