多线程下载nadc上的数据

发布时间: 2023-04-11 15:31:53 作者: 裏表異体
import wget
from bs4 import BeautifulSoup as bs
import requests
import random
import requests
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, Future, as_completed, wait
from multiprocessing import cpu_count

# Pool of User-Agent strings to choose from; any browser UA will do.
headers = [
    (
        "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/46.0.2490.76 Mobile Safari/537.36"
    ),
    (
        "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) "
        "AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50"
    ),
    "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)",
    "Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
]

def open_url(url, timeout=30):
    """Fetch ``url`` and return the response body decoded as UTF-8 text.

    A User-Agent is picked at random from the module-level ``headers`` list
    for every request.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait on the server before giving up; forwarded
            to ``requests.get``. Without it a stalled server would block
            the script indefinitely.

    Returns:
        The response body as a ``str``.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection failure or timeout.
        UnicodeDecodeError: If the body is not valid UTF-8.
    """
    user_agent = random.choice(headers)
    respon = requests.get(
        url,
        headers={"user-agent": user_agent},
        timeout=timeout,
    )
    # Fail loudly on an error status instead of handing an error page to
    # the HTML parser, which would silently yield an empty file list.
    respon.raise_for_status()
    return respon.content.decode("utf-8")


# Download the dataset page and collect every element whose id is 'pd-files'.
response = open_url('https://nadc.china-vo.org/res/r101217/')
soup = bs(response, 'html.parser')
pd_files = soup.find_all(id='pd-files')

# Build one record per matched element: displayed name, size text and the
# download URL (site root + the href of the link in the 'col-sm-1' cell).
# NOTE(review): only the first matching child inside each element is read;
# confirm that every file has its own id='pd-files' element on the page.
file_info_list = [
    {
        'file_name': entry.find(class_='paperinfo-files-filename').text.strip(),
        'file_size': entry.find(class_='paperinfo-files-filesize').text,
        'download_link': (
            'https://nadc.china-vo.org/'
            + entry.find(class_='col-sm-1').find('a')['href']
        ),
    }
    for entry in pd_files
]
    

# python 源码
import requests
import time



def download(file_info, timeout=30):
    """Stream one remote file into the local ``download/`` directory.

    A tqdm progress bar labelled with the file name tracks the bytes written.

    Args:
        file_info: Mapping with at least ``'download_link'`` (URL to fetch)
            and ``'file_name'`` (name of the file created under
            ``download/``).
        timeout: Seconds to wait on the server (connect, and between chunks)
            before giving up; forwarded to ``requests.get``.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection failure or timeout.
        OSError: If the target file cannot be created or written.
    """
    import os  # local import: the file's import block does not provide os

    # Make sure the target directory exists; open() would fail otherwise.
    os.makedirs('download', exist_ok=True)

    # Fetch the body as a stream; the with-block releases the connection
    # back to the pool even if writing fails midway.
    with requests.get(
        file_info['download_link'], stream=True, timeout=timeout
    ) as resp:
        # Do not save an HTML error page under the data file's name.
        resp.raise_for_status()
        # Expected size for the progress bar; 0 when the server sends no
        # Content-Length header.
        total = int(resp.headers.get('content-length', 0))
        # Open the destination file and the progress bar together, then
        # write chunk by chunk, advancing the bar by the bytes written.
        with open('download/' + file_info['file_name'], 'wb') as file, tqdm(
            desc=file_info['file_name'],
            total=total,
            unit='iB',
            unit_scale=True,
            unit_divisor=1024,
        ) as bar:
            for data in resp.iter_content(chunk_size=1024):
                size = file.write(data)
                bar.update(size)

# Report how many CPUs multiprocessing.cpu_count() detects.
print(f'总共有:{cpu_count()} 个核心')
def test_tqdm():
    """Download every entry of ``file_info_list`` concurrently.

    Each entry is handed to ``download`` on a thread pool with one worker
    per CPU reported by ``cpu_count()``. The function returns only after
    all downloads have finished. A failed download is reported on stdout
    and does not stop the others.
    """
    # The with-block shuts the pool down once all submitted work is done.
    with ThreadPoolExecutor(max_workers=cpu_count()) as executor:
        # Keep every future (not just the last one) so all of them are
        # waited on; an empty file list simply yields an empty mapping.
        tasks = {
            executor.submit(download, file_info): file_info
            for file_info in file_info_list
        }
        wait(tasks)

    # Exceptions raised inside workers are stored on the futures; surface
    # them here so a failed download does not go unnoticed.
    for task, file_info in tasks.items():
        error = task.exception()
        if error is not None:
            print(f"download failed for {file_info['file_name']}: {error!r}")
 
# Run the concurrent download of everything listed in file_info_list.
test_tqdm()