Asynchronous Crawlers: Principles and Analysis
First, a baseline program that scrapes 100 pages one after another:
import requests
import logging
import time

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s: %(message)s')

TOTAL_NUMBER = 100
# The {id} placeholder is filled in by .format(id=id) below.
BASE_URL = 'https://ssr4.scrape.center/detail/{id}'

start_time = time.time()
for id in range(1, TOTAL_NUMBER + 1):
    url = BASE_URL.format(id=id)
    logging.info('scraping %s', url)
    response = requests.get(url)
end_time = time.time()
logging.info('total time: %s seconds', end_time - start_time)
This takes a long time: the 100 requests run strictly one after another, and each requests.get blocks until its response arrives (ssr4.scrape.center is a deliberately slow demo site), so the per-request latency adds up.
Synchronous: different program units must coordinate with one another through some means of communication while executing in order to complete a task; such program units are said to execute synchronously. Synchronous implies ordered.

Asynchronous: a way for different program units to complete a task without needing to communicate or coordinate; unrelated program units can be asynchronous with respect to each other. Asynchronous implies unordered.
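To make this concrete, here is a minimal sketch (my illustration, not from the original): three coroutines started together finish in whatever order their sleeps happen to expire, so asynchronous completion is unordered.

import asyncio
import random

async def job(name):
    # await suspends this coroutine and lets the others run in the meantime.
    await asyncio.sleep(random.random())
    print('finished', name)

async def main():
    # The three jobs run concurrently; their completion order varies run to run.
    await asyncio.gather(job('a'), job('b'), job('c'))

asyncio.run(main())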
Coroutines

A coroutine (coroutine) in Python refers to the coroutine object type. A coroutine object can be registered on an event loop, and the event loop will schedule and call it.

A method can be defined with the async keyword; calling such a method does not execute it immediately, but instead returns a coroutine object.

Example:
import asyncio

async def execute(x):
    print('Number:', x)

coroutine = execute(1)
print('Coroutine:', coroutine)
print('After calling execute')

loop = asyncio.get_event_loop()
loop.run_until_complete(coroutine)
print('After calling loop')
The output is as follows (the memory address will vary):

Coroutine: <coroutine object execute at 0x...>
After calling execute
Number: 1
After calling loop

The coroutine can also be wrapped in a Task explicitly with loop.create_task:
import asyncio

async def execute(x):
    print('Number:', x)
    return x

coroutine = execute(1)
print('Coroutine:', coroutine)
print('After calling execute')

loop = asyncio.get_event_loop()
task = loop.create_task(coroutine)
print('Task:', task)
loop.run_until_complete(task)
print('Task:', task)
print('After calling loop')
A Task can also be created with asyncio.ensure_future, and a done-callback can be attached to it:

import asyncio
import requests

async def request():
    url = 'https://www.baidu.com'
    status = requests.get(url)
    return status

def callback(task):
    # Called when the task finishes; task.result() is the coroutine's return value.
    print('Status:', task.result())

coroutine = request()
task = asyncio.ensure_future(coroutine)
task.add_done_callback(callback)
print('Task:', task)

loop = asyncio.get_event_loop()
loop.run_until_complete(task)
print('Task:', task)
The example above makes only a single request.
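A side note, not from the original: since Python 3.7 the preferred entry point is asyncio.run, which creates and closes the event loop for you; the loop = asyncio.get_event_loop() / run_until_complete pattern used above is the older style. A minimal equivalent sketch:

import asyncio

async def execute(x):
    print('Number:', x)
    return x

# asyncio.run creates an event loop, runs the coroutine to completion,
# and closes the loop afterwards.
result = asyncio.run(execute(1))
print('Result:', result)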
Multi-task coroutines
import asyncio
import requests

async def request():
    url = 'https://www.baidu.com'
    status = requests.get(url)
    return status

tasks = [asyncio.ensure_future(request()) for _ in range(5)]
print('Tasks:', tasks)

loop = asyncio.get_event_loop()
loop.run_until_complete(asyncio.wait(tasks))

for task in tasks:
    print('Status:', task.result())
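An alternative worth knowing (my addition, not in the original): asyncio.gather collects the return values directly, so the explicit loop over task.result() is unnecessary. A sketch under the same setup:

import asyncio
import requests

async def request():
    url = 'https://www.baidu.com'
    # Still a blocking call; gather changes how results are collected, not the concurrency.
    status = requests.get(url)
    return status

async def main():
    # gather returns the results in the order the coroutines were passed in.
    results = await asyncio.gather(*(request() for _ in range(5)))
    for status in results:
        print('Status:', status)

asyncio.run(main())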
Timing 10 such requests against the slow demo site:

import asyncio
import requests
import time

start = time.time()

async def request():
    url = 'https://ssr4.scrape.center/'
    print('Waiting for', url)
    response = requests.get(url)
    print('Get response from', url, 'response', response)

tasks = [asyncio.ensure_future(request()) for _ in range(10)]
loop = asyncio.get_event_loop()
loop.run_until_complete(asyncio.wait(tasks))

end = time.time()
print('Cost time:', end - start)
This takes less time than the first program, but only because it makes 10 requests instead of 100: requests.get is a blocking call, so the coroutines still run one after another, and asyncio alone gives no real concurrency here.
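Before reaching for aiohttp, one standard workaround is to hand the blocking requests.get call to a thread pool via loop.run_in_executor, which lets the event loop start all ten requests at once. A minimal sketch (the fetch helper is mine, not from the original):

import asyncio
import time
import requests

async def fetch(url):
    loop = asyncio.get_running_loop()
    # Run the blocking call in the default thread pool so the event loop stays free.
    return await loop.run_in_executor(None, requests.get, url)

async def main():
    start = time.time()
    responses = await asyncio.gather(*(fetch('https://ssr4.scrape.center/') for _ in range(10)))
    print('Got', len(responses), 'responses, cost time:', time.time() - start)

asyncio.run(main())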
Using the aiohttp module, the coroutine itself is suspended at each await while the request is in flight, so the event loop can run the other coroutines in the meantime:
import asyncio
import aiohttp
import time

start = time.time()

async def get(url):
    session = aiohttp.ClientSession()
    # await suspends this coroutine here, letting the event loop run the others.
    response = await session.get(url)
    await response.text()
    await session.close()
    return response

async def request():
    url = 'https://ssr4.scrape.center/'
    print('Waiting for', url)
    response = await get(url)
    print('Get response from', url, 'response', response)

tasks = [asyncio.ensure_future(request()) for _ in range(10)]
loop = asyncio.get_event_loop()
loop.run_until_complete(asyncio.wait(tasks))

end = time.time()
print('Cost time:', end - start)
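As a closing note (my suggestion, not part of the original): the aiohttp documentation recommends creating the ClientSession with async with and sharing one session across all requests, rather than opening and closing a session per request. The same 10-request benchmark in that style:

import asyncio
import time
import aiohttp

async def get(session, url):
    # async with releases the response; the shared session reuses connections.
    async with session.get(url) as response:
        return await response.text()

async def main():
    start = time.time()
    async with aiohttp.ClientSession() as session:
        await asyncio.gather(*(get(session, 'https://ssr4.scrape.center/') for _ in range(10)))
    print('Cost time:', time.time() - start)

asyncio.run(main())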