Installation
1. Install wheel
pip install wheel
2. Install lxml
pip install lxml
3. Install pyopenssl
pip install pyopenssl
4. Install pywin32
pip install pywin32
5. Download the Twisted wheel file
Download page: http://www.lfd.uci.edu/~gohlke/pythonlibs/#twisted
6. Install the wheel file (the path and filename depend on the wheel you downloaded)
pip install D:\Twisted-20.3.0-cp37-cp37m-win32.whl
7. Install scrapy
pip install scrapy
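To confirm everything installed correctly, print the version from any shell; if the install worked it reports something like "Scrapy 2.x.x":
scrapy version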
Creating a project and spider
1. Create a project
scrapy startproject projectname
2. Create a spider
scrapy genspider firstspider www.cnblogs.com
3. Run the spider
scrapy crawl firstspider          # run with log output
scrapy crawl firstspider --nolog  # run without log output
4. Create a main.py file to run the spider (put it in the project root, next to scrapy.cfg, so Scrapy can locate the project settings)
from scrapy.cmdline import execute
execute(['scrapy', 'crawl', 'firstspider', '--nolog'])
Usage
settings configuration
- Whether to obey the robots.txt protocol (no)
  ROBOTSTXT_OBEY = False
- Configure the user agent
  USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36'
- Configure the log level
  LOG_LEVEL = 'ERROR'
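Taken together, these lines go in the project's settings.py (the user agent above is just one example of a real browser UA):
# settings.py
ROBOTSTXT_OBEY = False   # do not obey robots.txt
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36'
LOG_LEVEL = 'ERROR'      # silence everything below ERROR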
Configure main.py to run the spider (the same two lines shown in step 4 of the previous section).
Writing the spider
- Flesh out firstspider.py, i.e. the file generated by: scrapy genspider firstspider www.cnblogs.com
It starts out like this:
import scrapy

class FirstspiderSpider(scrapy.Spider):
    name = "firstspider"
    allowed_domains = ["www.cnblogs.com"]
    start_urls = ["http://www.cnblogs.com/"]

    def parse(self, response):
        pass
e.g.:
import scrapy
from bs4 import BeautifulSoup

class FirstspiderSpider(scrapy.Spider):
    name = "firstspider"
    allowed_domains = ["www.cnblogs.com"]
    start_urls = ["http://www.cnblogs.com/"]

    def parse(self, response):
        soup = BeautifulSoup(response.text, 'lxml')
        title_list = soup.find_all(class_="post-item")  # one node per post
        for i in title_list:
            title = i.find(class_="post-item-title").text
            link = i.find(class_="post-item-title")['href']
            print(title, link)
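BeautifulSoup is optional here; Scrapy's built-in CSS selectors can do the same extraction without a second parser. A minimal sketch of an equivalent parse method, assuming the same cnblogs markup:
def parse(self, response):
    # iterate over the same .post-item blocks using Scrapy's own selectors
    for post in response.css('.post-item'):
        title = post.css('.post-item-title::text').get()
        link = post.css('.post-item-title::attr(href)').get()
        print(title, link)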
Saving the scraped data
- Save as csv, json, or pickle files
  The parse method must return the data as a list of dicts; run these from the console:
  scrapy crawl firstspider -o first.csv   # export as CSV
  scrapy crawl firstspider -o first.json  # export as JSON
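Note that since Scrapy 2.0, -o appends to an existing file, while uppercase -O overwrites it:
scrapy crawl firstspider -O first.json  # overwrite instead of append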
e.g.:
import scrapy
from bs4 import BeautifulSoup

class FirstspiderSpider(scrapy.Spider):
    name = "firstspider"
    allowed_domains = ["www.cnblogs.com"]
    start_urls = ["http://www.cnblogs.com/"]

    def parse(self, response):
        l = []
        soup = BeautifulSoup(response.text, 'lxml')
        title_list = soup.find_all(class_="post-item")
        for i in title_list:
            title = i.find(class_="post-item-title").text
            link = i.find(class_="post-item-title")['href']
            l.append({'title': title, 'link': link})  # the dict keys become the export column names
        return l
- Save to a file, MySQL, etc. (via items and pipelines; a MySQL sketch follows at the end of this section)
The freshly generated items.py looks like this:
import scrapy

class ScrapyprojectItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    pass
The freshly generated pipelines.py looks like this:
class ScrapyprojectPipeline:
    def process_item(self, item, spider):
        return item
Configure settings (register the pipeline; the number is a priority from 0 to 1000, lower runs first):
ITEM_PIPELINES = {
    "ScrapyProject.pipelines.firstPipeline": 300,
}
e.g.:
firstspider.py
import scrapy
from bs4 import BeautifulSoup
from ScrapyProject.items import firstItem

class FirstspiderSpider(scrapy.Spider):
    name = "firstspider"
    allowed_domains = ["www.cnblogs.com"]
    start_urls = ["http://www.cnblogs.com/"]

    def parse(self, response):
        soup = BeautifulSoup(response.text, 'lxml')
        title_list = soup.find_all(class_="post-item")
        for i in title_list:
            item = firstItem()  # create a fresh item per post instead of reusing one object
            item['title'] = i.find(class_="post-item-title").text
            item['link'] = i.find(class_="post-item-title")['href']
            yield item
items.py
import scrapy

class firstItem(scrapy.Item):
    title = scrapy.Field()
    link = scrapy.Field()
pipelines.py (saving to a file)
class firstPipeline:
    def open_spider(self, spider):
        # called once when the spider starts: open the output file
        self.f = open('first.txt', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        self.f.write(item['title'] + '\n')
        self.f.write(item['link'] + '\n')
        return item

    def close_spider(self, spider):
        # called once when the spider finishes: release the file handle
        self.f.close()
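Since saving to MySQL was mentioned above, here is a minimal sketch of a MySQL pipeline using the pymysql driver. The connection parameters, the database name scrapy_db, and the table posts(title, link) are all assumptions for illustration; create the table beforehand and register the class in ITEM_PIPELINES the same way as firstPipeline:
import pymysql  # third-party driver: pip install pymysql

class MysqlPipeline:
    def open_spider(self, spider):
        # assumed local server and credentials; adjust to your setup
        self.conn = pymysql.connect(host='127.0.0.1', user='root',
                                    password='root', database='scrapy_db',
                                    charset='utf8mb4')
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        # parameterized query; assumes a table posts(title VARCHAR, link VARCHAR)
        sql = "INSERT INTO posts (title, link) VALUES (%s, %s)"
        self.cursor.execute(sql, (item['title'], item['link']))
        self.conn.commit()
        return item

    def close_spider(self, spider):
        self.cursor.close()
        self.conn.close()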