数据获取
小红书单个帖子的图片保存到本地
import os
import re

import requests

# Script: download every image of one Xiaohongshu post into ./img.
#
# The post page is fetched once, the title and image URLs are read from its
# Open Graph <meta> tags, and each image is saved as "<title><n>.jpg".

# Directory that receives the downloaded images.
IMG_DIR = 'img'
# Seconds to wait for the server before giving up, so a stalled connection
# cannot hang the script forever.
REQUEST_TIMEOUT = 10

# exist_ok avoids the check-then-create race of os.path.exists + os.mkdir.
os.makedirs(IMG_DIR, exist_ok=True)

# NOTE(review): the cookie below is a hard-coded logged-in session value.
# It will expire and should not be committed to a shared repository;
# consider loading it from an environment variable or a local config file.
headers = {
    'cookie': 'abRequestId=5e194f90-2bfd-585c-9fb2-713bd6df6f77; xsecappid=xhs-pc-web; a1=197b07f1d6a4g2nxvwx0wmu3qbrmi34rcwoc34t7250000269640; webId=da2bebd1ab71b8ece8eae5a8fd5de8c5; gid=yjWD8WiJdJSdyjWD8WiyfE49K04kJIChEC8ElWJ67qADFS2863q4Fq888JKjK488KKjWY2q2; acw_tc=0a00d49317510155369181162e4aad58e7ed63ff6b41170ad226c54d8a5e9b; webBuild=4.68.0; web_session=040069b92a168371f895ef286b3a4bc2a4ab56; loadts=1751015741844; websectiga=9730ffafd96f2d09dc024760e253af6ab1feb0002827740b95a255ddf6847fc8; sec_poison_id=4f647338-d155-4f9a-898c-b865ee36a1ce',
    'referer': 'https://www.xiaohongshu.com/user/profile/610dfa37000000000101d41d?xsec_token=ABP8LcWXwugkR5oQ8uZ0mjGEMHvgi_rbK6e_n6v22wQpM=&xsec_source=pc_note',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36',
}

url = 'https://www.xiaohongshu.com/explore/67958c98000000002a00e6a9?xsec_token=ABYXrvCI5tqlTaTgrn5w86OSaVPFq8ONY6XjTmQLUDNoc=&xsec_source=pc_user'

response = requests.get(url=url, headers=headers, timeout=REQUEST_TIMEOUT)
# Fail early on an HTTP error instead of parsing an error page.
response.raise_for_status()
html = response.text

title_matches = re.findall('<meta name="og:title" content="(.*?)">', html)
if not title_matches:
    # Without the tag there is nothing to name the files after; exit with a
    # readable message rather than an IndexError on [0].
    raise SystemExit('og:title meta tag not found; the cookie may have expired')

# Strip characters that are not allowed in Windows file names (and newlines)
# so the title can be used as a file-name prefix.
title = re.sub(r'[\\/:*?"<>|\n]', '', title_matches[0]) or 'untitled'
print(title)

img_list = re.findall('<meta name="og:image" content="(.*?)">', html)

for num, img in enumerate(img_list, start=1):
    print(img)
    try:
        img_response = requests.get(url=img, timeout=REQUEST_TIMEOUT)
        # Do not save an HTTP error body as a .jpg file.
        img_response.raise_for_status()
    except requests.RequestException as e:
        # One broken image should not stop the remaining downloads.
        print(e)
        continue
    # os.path.join keeps the path valid on non-Windows systems too.
    with open(os.path.join(IMG_DIR, title + str(num) + '.jpg'), 'wb') as f:
        f.write(img_response.content)
如下图所示:
多页帖子采集:
import os
import re

import requests
from DrissionPage import ChromiumPage

# Script: download the images of every post returned by one Xiaohongshu
# search-result response.
#
# A browser page controlled through DrissionPage opens the search page while
# listening for the 'web/v1/search/notes' request. The post ids and tokens
# are read from the captured response, then each post page is fetched with
# requests and its Open Graph images are saved as "<title><n>.jpg" in ./img.

# Directory that receives the downloaded images.
IMG_DIR = 'img'
# Seconds to wait for the server before giving up, so a stalled connection
# cannot hang the script forever.
REQUEST_TIMEOUT = 10

# exist_ok avoids the check-then-create race of os.path.exists + os.mkdir.
os.makedirs(IMG_DIR, exist_ok=True)

# NOTE(review): the cookie below is a hard-coded logged-in session value.
# It will expire and should not be committed to a shared repository;
# consider loading it from an environment variable or a local config file.
headers = {
    'cookie': 'abRequestId=5e194f90-2bfd-585c-9fb2-713bd6df6f77; xsecappid=xhs-pc-web; a1=197b07f1d6a4g2nxvwx0wmu3qbrmi34rcwoc34t7250000269640; webId=da2bebd1ab71b8ece8eae5a8fd5de8c5; gid=yjWD8WiJdJSdyjWD8WiyfE49K04kJIChEC8ElWJ67qADFS2863q4Fq888JKjK488KKjWY2q2; acw_tc=0a00d49317510155369181162e4aad58e7ed63ff6b41170ad226c54d8a5e9b; webBuild=4.68.0; web_session=040069b92a168371f895ef286b3a4bc2a4ab56; loadts=1751015741844; websectiga=9730ffafd96f2d09dc024760e253af6ab1feb0002827740b95a255ddf6847fc8; sec_poison_id=4f647338-d155-4f9a-898c-b865ee36a1ce',
    'referer': 'https://www.xiaohongshu.com/user/profile/610dfa37000000000101d41d?xsec_token=ABP8LcWXwugkR5oQ8uZ0mjGEMHvgi_rbK6e_n6v22wQpM=&xsec_source=pc_note',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36',
}

dp = ChromiumPage()
# Start listening before navigating so the search request is not missed.
dp.listen.start('web/v1/search/notes')
dp.get('https://www.xiaohongshu.com/search_result/?keyword=%25E5%25A3%2581%25E7%25BA%25B8&source=web_explore_feed&type=51')
r = dp.listen.wait()
# presumably the body is already parsed JSON (a dict) - verify against the
# DrissionPage listener documentation.
json_data = r.response.body
items = json_data['data']['items']

for item in items:
    # Best-effort per post: any failure (missing key, missing og:title,
    # network or file error) is printed and the loop moves to the next post.
    try:
        id_ = item['id']
        token = item['xsec_token']
        url = f'https://www.xiaohongshu.com/explore/{id_}?xsec_token={token}&xsec_source=pc_user'
        print(url)

        response = requests.get(url=url, headers=headers, timeout=REQUEST_TIMEOUT)
        # Skip this post on an HTTP error instead of parsing an error page.
        response.raise_for_status()
        html = response.text

        old_title = re.findall('<meta name="og:title" content="(.*?)">', html)[0]
        # Strip characters that are not allowed in Windows file names (and
        # newlines). If nothing is left, fall back to the note id so files
        # from different posts do not overwrite each other as "1.jpg", ...
        title = re.sub(r'[\\/:*?"<>|\n]', '', old_title) or str(id_)
        print(title)

        img_list = re.findall('<meta name="og:image" content="(.*?)">', html)
        for num, img in enumerate(img_list, start=1):
            print(img)
            img_response = requests.get(url=img, timeout=REQUEST_TIMEOUT)
            # Do not save an HTTP error body as a .jpg file.
            img_response.raise_for_status()
            # os.path.join keeps the path valid on non-Windows systems too.
            with open(os.path.join(IMG_DIR, title + str(num) + '.jpg'), 'wb') as f:
                f.write(img_response.content)
    except Exception as e:
        print(e)