Python Web Scraping Case Studies (updated from time to time)
Table of Contents
- I. Web scraping: the DrissionPage module (saving + pagination)
  - 1. BOSS Zhipin (CSV file + scroll pagination)
  - 2. 51job (CSV file + click pagination)
  - 3. Zhaopin (CSV file + click pagination)
  - 4. Liepin (CSV file + click pagination)
- II. The Requests module
  - 1. Dewu (XLSX file + reverse-engineered JS signing)
I. Web Scraping: the DrissionPage Module (Saving + Pagination)
DrissionPage documentation: 【DrissionPage official site】
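All four case studies below follow the same packet-capture pattern: start a listener for a URL fragment, load the page that fires the matching request, then read the captured JSON. Here is a minimal sketch of that pattern; the URL fragment and page address are placeholders, not a real target:

from DrissionPage import ChromiumPage

# Minimal packet-listening sketch; the target values are placeholders
page = ChromiumPage()                      # launch / attach to a Chromium browser
page.listen.start('example/api/list')      # placeholder URL fragment to match
page.get('https://example.com/jobs')       # placeholder page that triggers the request
packet = page.listen.wait(timeout=5)       # returns the packet, or False on timeout
if packet:
    data = packet.response.body            # JSON responses arrive as a Python dict
    print(data)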
1. BOSS Zhipin (CSV File + Scroll Pagination)
Python code:
# Import the browser-automation module
from DrissionPage import ChromiumPage
import time
import csv


def deal_with(data_r, cd):
    # Pull the job list out of the response dict
    data_list = data_r['zpData']['jobList']
    # Loop over every job entry in the list
    for index in data_list:
        # Split the salary from the pay scheme
        salary_list = index['salaryDesc'].split('·')
        salary = salary_list[0]
        if len(salary_list) == 2:
            salary_system = salary_list[1]
        else:
            salary_system = '12薪'
        # Collect the fields of interest into a dict
        temporarily_dict = {
            '公司名称': index['brandName'],
            '公司行业': index['brandIndustry'],
            '公司规模': index['brandScaleName'],
            '融资阶段': index['brandStageName'],
            '工作区域': index['cityName'] + ' ' + index['areaDistrict'] + ' ' + index['businessDistrict'],
            '学历要求': index['jobDegree'],
            '工作经验': index['jobExperience'],
            '职位名称': index['jobName'],
            '薪资待遇': salary,
            '薪资制度': salary_system,
            '沟通职员': index['bossTitle'] + '-' + index['bossName'],
            '所需技能': ' '.join(index['skills']),
            '公司福利': ' '.join(index['welfareList']),
        }
        cd.writerow(temporarily_dict)


def main():
    # Launch the browser (instantiate the browser object)
    google = ChromiumPage()
    # Listen for the data packet (adjust to your target)
    google.listen.start(r"wapi/zpgeek/search/joblist.json")
    # Visit the target page (adjust to your target)
    google.get(r"https://www.zhipin.com/web/geek/jobs?city=101280100&query=%E9%A1%B9%E7%9B%AE%E5%8A%A9%E7%90%86")
    # Create the output file object
    f = open('boss_project_assistant.csv', mode='a', encoding='utf-8-sig', newline='')
    # Dict-based CSV writer
    cd = csv.DictWriter(f, fieldnames=['公司名称', '公司行业', '公司规模', '融资阶段', '工作区域', '学历要求',
                                       '工作经验', '职位名称', '薪资待遇', '薪资制度', '沟通职员', '所需技能',
                                       '公司福利'])
    cd.writeheader()
    num = 50
    for page in range(1, num + 1):
        print(f'Processing page {page}…')
        # Wait for the data packet to load
        try:
            data_load = google.listen.wait(timeout=2)
        except TimeoutError:
            print('Timed out')
            exit(0)
        else:
            if data_load:
                # Get the response data (a dict)
                data_response = data_load.response.body
                # Process the data
                deal_with(data_response, cd)
                if page < num:
                    # Scroll to the bottom of the page
                    google.scroll.to_bottom()
                    time.sleep(1)
            else:
                print('No more information!')
                exit(1)


if __name__ == '__main__':
    main()
Run results:
2. 51job (CSV File + Click Pagination)
Python code:
# Import the browser-automation module
from DrissionPage import ChromiumPage
import csv


def deal_with(data_r, cd):
    # Pull the job list out of the response dict
    data_list = data_r['resultbody']['job']['items']
    # Loop over every job entry in the list
    for index in data_list:
        # Salary
        salary_list = index['provideSalaryString'].split('·')
        salary = salary_list[0]
        salary_system = '12薪'
        if len(salary_list) == 2:
            salary_system = salary_list[1]
        # District
        district_string = '未知'
        if 'districtString' in index['jobAreaLevelDetail']:
            district_string = index['jobAreaLevelDetail']['districtString']
        # Company industry type
        company_type = index['companyIndustryType1Str']
        if 'companyIndustryType2Str' in index and index['companyIndustryType2Str'] != index['companyIndustryType1Str']:
            company_type = index['companyIndustryType1Str'] + ';' + index['companyIndustryType2Str']
        # HR status
        hr_labels, hr_active_status_green, hr_info = '未知', '未知', '未知'
        if 'hrLabels' in index and index['hrLabels'] != []:
            hr_labels = index['hrLabels'][0]
        if 'hrActiveStatusGreen' in index:
            hr_active_status_green = index['hrActiveStatusGreen']
        if 'hrPosition' in index and 'hrName' in index:
            hr_info = index['hrPosition'] + '-' + index['hrName']
        # Collect the fields of interest into a dict
        temporarily_dict = {
            '公司名称': index['fullCompanyName'],
            '公司性质': index['companyTypeString'],
            '公司领域': company_type,
            '公司规模': index['companySizeString'],
            '职位名称': index['jobName'],
            '优先专业': index['major1Str'] + ' ' + index['major2Str'],
            '所在省份': index['jobAreaLevelDetail']['provinceString'],
            '所在城市': index['jobAreaLevelDetail']['cityString'],
            '所在地区': district_string,
            '薪资范围': salary,
            '薪资制度': salary_system,
            '工作形式': index['termStr'],
            '所需学历': index['degreeString'],
            '所需经验': index['workYearString'],
            '沟通HR': hr_info,
            '处理速度': hr_labels,
            '在线时间': hr_active_status_green,
            '投递频率': index['applyTimeText'],
            '公司详情页': index['companyHref'],
            '其他标签': ','.join(index['jobTags'])
        }
        cd.writerow(temporarily_dict)


def main():
    # Launch the browser (instantiate the browser object)
    google = ChromiumPage()
    # Listen for the data packet
    google.listen.start(r"api/job/search-pc")
    # Visit the target page
    google.get(r"https://we.51job.com/pc/search?jobArea=260200&keyword=Python&searchType=2&keywordType=")
    # Create the output file object
    f = open('51job_artificial_intelligence.csv', mode='a', encoding='utf-8-sig', newline='')
    # Dict-based CSV writer
    cd = csv.DictWriter(f, fieldnames=['公司名称', '公司性质', '公司领域', '公司规模',
                                       '职位名称', '优先专业', '所在省份', '所在城市',
                                       '所在地区', '薪资范围', '薪资制度', '工作形式',
                                       '所需学历', '所需经验', '沟通HR', '处理速度',
                                       '在线时间', '投递频率', '公司详情页', '其他标签'])
    cd.writeheader()
    num = 10
    for page in range(1, num + 1):
        if page == 1:
            # Scroll to the bottom of the page
            google.scroll.to_bottom()
            # Locate and click the next-page button
            button = google.ele('css:.el-icon-arrow-right')
            button.run_js('this.click();')
            google.scroll.to_bottom()
            # Pause listening and clear the captured queue
            google.listen.pause(clear=True)
            # Resume the paused listener
            google.listen.resume()
            # Locate and click the previous-page button (back to page 1, so its packet gets captured)
            button = google.ele('css:.el-icon-arrow-left')
            button.run_js('this.click();')
        # Wait for the data packet to load
        try:
            data_load = google.listen.wait(timeout=2)
        except TimeoutError:
            print('Timed out')
            exit(0)
        else:
            if data_load:
                print(f'Processing page {page}…')
                # Get the response data (a dict)
                data_response = data_load.response.body
                # Process the data
                deal_with(data_response, cd)
                if page < num:
                    # Scroll to the bottom of the page
                    google.scroll.to_bottom()
                    # Locate and click the next-page button
                    button = google.ele('css:.el-icon-arrow-right')
                    button.run_js('this.click();')
            else:
                print('No more information!')
                exit(1)


if __name__ == '__main__':
    main()
dp.ele()
→ Locates an element on the page, where dp is the browser (page) object and ele is short for element. It accepts the same selectors you would build from the DevTools Elements panel.
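For example (the page address and selectors below are illustrative, not taken from the scripts above):

from DrissionPage import ChromiumPage

dp = ChromiumPage()
dp.get('https://example.com')               # placeholder page
btn = dp.ele('css:.el-icon-arrow-right')    # locate by CSS selector
link = dp.ele('下一页')                      # a bare string is matched against element text (as in the Liepin script later)
btn.run_js('this.click();')                 # click the element via injected JS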
Run results:
3. Zhaopin (CSV File + Click Pagination)
Python code:
# Import the browser-automation module
from DrissionPage import ChromiumPage
import csv


def deal_with(data_r, cd):
    # Pull the job list out of the response dict
    data_list = data_r['data']['list']
    # Loop over every job entry in the list
    for index in data_list:
        # Skills
        skill_result = ''
        for skill_dictionary in index['skillLabel']:
            for key, value in skill_dictionary.items():
                if key == 'value':
                    skill_result += value + ' '
        # Salary
        salary_list = index['salary60'].split('·')
        salary = salary_list[0]
        salary_system = '12薪'
        if len(salary_list) == 2:
            salary_system = salary_list[1]
        # Welfare
        welfare_str = ' '.join(index['welfareTagList'])
        if 'jobKnowledgeWelfareFeatures' in index and len(index['jobKnowledgeWelfareFeatures']) > len(index['welfareTagList']):
            welfare_str = ' '.join(index['jobKnowledgeWelfareFeatures'])
        # HR reply speed
        hr_processing_speed = '未知'
        if 'hrStateInfo' in index and len(index['hrStateInfo']) > 0:
            hr_processing_speed = index['hrStateInfo']
        # Collect the fields of interest into a dict
        temporarily_dict = {
            '公司名称': index['companyName'],
            '公司性质': index['property'],
            '公司领域': index['industryName'],
            '公司规模': index['companySize'],
            '职位名称': index['name'],
            '所在城市': index['workCity'],
            '所在地区': index['cityDistrict'],
            '所在街道': index['streetName'],
            '公司源址': index['jobRootOrgInfo']['cityName'],
            '薪资范围': salary,
            '薪资制度': salary_system,
            '工作形式': index['workType'],
            '所需学历': index['education'],
            '所需经验': index['workingExp'],
            '所需技能': skill_result,
            '沟通HR': index['staffCard']['hrJob'] + '-' + index['staffCard']['staffName'],
            '处理速度': hr_processing_speed,
            '在线时间': index['staffCard']['hrOnlineState'],
            '公司详情页': index['companyUrl'],
            '职位详情页': index['positionUrl'],
            '其他福利': welfare_str
        }
        cd.writerow(temporarily_dict)


def main():
    # Launch the browser (instantiate the browser object)
    google = ChromiumPage()
    # Listen for the data packet
    google.listen.start(r"c/i/search/positions")
    # Visit the target page
    google.get(r"https://www.zhaopin.com/sou/jl765/kw01800U80EG06G03F01N0/p2?kt=3")
    # Create the output file object
    f = open('zhaopin_python.csv', mode='a', encoding='utf-8-sig', newline='')
    # Dict-based CSV writer
    cd = csv.DictWriter(f, fieldnames=['公司名称', '公司性质', '公司领域', '公司规模', '职位名称',
                                       '所在城市', '所在地区', '所在街道', '公司源址', '薪资范围',
                                       '薪资制度', '工作形式', '所需学历', '所需经验', '所需技能',
                                       '沟通HR', '处理速度', '在线时间', '公司详情页', '职位详情页', '其他福利'])
    cd.writeheader()
    num = 10
    for page in range(1, num + 1):
        if page == 1:
            # Pause listening and clear the captured queue
            google.listen.pause(clear=True)
            # Resume the paused listener
            google.listen.resume()
            # Locate and click the previous-page button (back to page 1, so its packet gets captured)
            button = google.ele('css:.soupager a:first-of-type')
            # Scroll the button into view
            google.scroll.to_see(button)
            button.run_js('this.click();')
        # Wait for the data packet to load
        try:
            data_load = google.listen.wait(timeout=2)
        except TimeoutError:
            print('Timed out')
            exit(0)
        else:
            if data_load:
                print(f'Processing page {page}…')
                # Get the response data (a dict)
                data_response = data_load.response.body
                # Process the data
                deal_with(data_response, cd)
                if page < num:
                    # Locate and click the next-page button
                    button = google.ele('css:.soupager a:last-of-type')
                    # Scroll the button into view
                    google.scroll.to_see(button)
                    button.run_js('this.click();')
            else:
                print('No more information!')
                exit(1)


if __name__ == '__main__':
    main()
css:.soupager a:first-of-type
→ css:.soupager targets the element whose class is soupager; a:first-of-type selects the first a tag inside it.
css:.soupager a:last-of-type
→ css:.soupager targets the element whose class is soupager; a:last-of-type selects the last a tag inside it.
a:nth-of-type(even) selects the a tags at even positions; a:nth-of-type(odd) selects those at odd positions.
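A quick illustration of these selectors, assuming a hypothetical pager like <div class="soupager"><a>…</a><a>…</a> … <a>…</a></div>:

from DrissionPage import ChromiumPage

google = ChromiumPage()
prev_btn  = google.ele('css:.soupager a:first-of-type')        # first <a> inside .soupager
next_btn  = google.ele('css:.soupager a:last-of-type')         # last <a> inside .soupager
even_btns = google.eles('css:.soupager a:nth-of-type(even)')   # all <a> tags at even positions
odd_btns  = google.eles('css:.soupager a:nth-of-type(odd)')    # all <a> tags at odd positions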
Run results:
4. Liepin (CSV File + Click Pagination)
Python code:
# Import the browser-automation module
from DrissionPage import ChromiumPage
import csv


def deal_with(data_r, cd):
    # Pull the job list out of the response dict
    data_list = data_r['data']['soJobForms']
    # Loop over every job entry in the list
    for index in data_list:
        # Salary
        salary_list = index['salary'].split('·')
        salary_value = salary_list[0]
        salary_system = '12薪'
        if len(salary_list) == 2:
            salary_system = salary_list[1]
        # Skills and welfare
        skill_value, welfare_value = '', ''
        if 'jobLabels' in index and 'sellingPointList' in index:
            skill_list = [item for item in index['jobLabels'] if item not in index['sellingPointList']]
            if len(skill_list) >= 1:
                skill_value = ';'.join(skill_list)
            if len(index['sellingPointList']) >= 1:
                welfare_value = ';'.join(index['sellingPointList'])
        elif 'jobLabels' in index:
            if len(index['jobLabels']) >= 1:
                skill_value = ';'.join(index['jobLabels'])
        elif 'sellingPointList' in index:
            if len(index['sellingPointList']) >= 1:
                welfare_value = ';'.join(index['sellingPointList'])
        # HR contact
        hr = index['recruiterName']
        if 'recruiterTitle' in index:
            hr = index['recruiterTitle'] + '-' + index['recruiterName']
        # Company scale
        scale_value = '未知'
        if 'compScale' in index:
            scale_value = index['compScale']
        # Collect the fields of interest into a dict
        temporarily_dict = {
            '公司名称': index['company'],
            '公司领域': index['industry'],
            '公司规模': scale_value,
            '职位名称': index['title'],
            '所在地址': index['dq'],
            '薪资范围': salary_value,
            '薪资制度': salary_system,
            '所需学历': index['requireEduLevel'],
            '所需技能': skill_value,
            '所需经验': index['requireWorkYears'],
            '沟通HR': hr,
            '发布时间': index['date'],
            '公司福利': welfare_value
        }
        cd.writerow(temporarily_dict)


def main():
    # Launch the browser (instantiate the browser object)
    google = ChromiumPage()
    # Listen for the data packet
    google.listen.start(r"api/com.liepin.searchfront4c.h5-search-job")
    # Visit the target page
    google.get(r"https://m.liepin.com/zhaopin/?dqs=170020&keyword=Python")
    # Create the output file object
    f = open('liepin_python.csv', mode='a', encoding='utf-8-sig', newline='')
    # Dict-based CSV writer
    cd = csv.DictWriter(f, fieldnames=['公司名称', '公司领域', '公司规模', '职位名称', '所在地址',
                                       '薪资范围', '薪资制度', '所需学历', '所需技能', '所需经验',
                                       '沟通HR', '发布时间', '公司福利'])
    cd.writeheader()
    num = 10
    for page in range(1, num + 1):
        # Wait for the data packet to load
        try:
            data_load = google.listen.wait(timeout=2)
        except TimeoutError:
            print('Timed out')
            exit(0)
        else:
            if data_load:
                print(f'Processing page {page}…')
                # Get the response data (a dict)
                data_response = data_load.response.body
                # Process the data
                deal_with(data_response, cd)
                if page < num:
                    # Scroll to the bottom of the page
                    google.scroll.to_bottom()
                    # Locate the next-page button by its text and click it
                    button = google('下一页')
                    button.run_js('this.click();')
            else:
                print('No more information!')
                exit(1)


if __name__ == '__main__':
    main()
Run results:
II. The Requests Module
1. Dewu (XLSX File + Reverse-Engineered JS Signing)
js_file.js module code: 【the JS module used when scraping Dewu】
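The script below feeds that JS file to execjs to compute the sign parameter. The general compile-and-call pattern looks like this; the JS body here is a stand-in, not Dewu's actual signing algorithm, and only the function name c matches the call in the script:

import execjs

# Stand-in JS source; the real js_file.js implements the site's signing logic
js_source = '''
function c(params) {
    // placeholder body: the real function derives a hash from the request payload
    return "signature-placeholder";
}
'''
ctx = execjs.compile(js_source)         # compile the JS into a callable context
sign = ctx.call('c', {'pageNum': 1})    # invoke function c with a Python dict argument
print(sign)                             # -> signature-placeholder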
Python code:
import requests
import pandas as pd
# Import the module that compiles and runs JS code
import execjs
# ------------------------------------------------
import openpyxl
from openpyxl.drawing.image import Image as xlImage
from openpyxl.utils import get_column_letter
from PIL import Image
from io import BytesIO


def get_data_xlsx(js_path, save_path):
    # Request headers
    request_header = {
        'accept': '*/*',
        'accept-encoding': 'gzip, deflate, br, zstd',
        'accept-language': 'zh-CN,zh;q=0.9',
        'connection': 'keep-alive',
        'content-length': '124',
        'content-type': 'application/json',
        'cookie': 'sajssdk_2015_cross_new_user=1; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%221977d73069b152b-02f7c860143a274-26011e51-1474560-1977d73069c1620%22%2C%22first_id%22%3A%22%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E8%87%AA%E7%84%B6%E6%90%9C%E7%B4%A2%E6%B5%81%E9%87%8F%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC%22%2C%22%24latest_referrer%22%3A%22https%3A%2F%2Fwww.google.com%2F%22%7D%2C%22identities%22%3A%22eyIkaWRlbnRpdHlfY29va2llX2lkIjoiMTk3N2Q3MzA2OWIxNTJiLTAyZjdjODYwMTQzYTI3NC0yNjAxMWU1MS0xNDc0NTYwLTE5NzdkNzMwNjljMTYyMCJ9%22%2C%22history_login_id%22%3A%7B%22name%22%3A%22%22%2C%22value%22%3A%22%22%7D%2C%22%24device_id%22%3A%221977d73069b152b-02f7c860143a274-26011e51-1474560-1977d73069c1620%22%7D',
        'host': 'app.dewu.com',
        'ltk': 'P8K6wonDvcOIO8K3w4XCoMK2GMKrwrfCusO3VMOmwpbCnMKiNjnCkHvDmMOjXMOPKAvCv3LCrinCsMOCwqhrBcKdQWPCrcO7w5PCicO4wojDkMK+',
        'origin': 'https://www.dewu.com',
        'referer': 'https://www.dewu.com/',
        'sec-ch-ua': '"Google Chrome";v="137", "Chromium";v="137", "Not/A)Brand";v="24"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
        'sec-fetch-dest': 'empty',
        'sec-fetch-mode': 'cors',
        'sec-fetch-site': 'same-site',
        'sessionid': '1lsivpac-zyhm-oncb-nxp4-qn3gc0sqkf5xifpu',
        'shumeiid': '2025061718330014de1bd23c8e4b38edb76ef2c501261700c05efa0c81576f',
        'sk': '',
        'traceparent': '00-f55b95896851448f2eb277d7ad722eb9-a4839e063b590e48-01',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/137.0.0.0 Safari/537.36'
    }
    # Request URL
    request_url = r'https://app.dewu.com/api/v1/h5/commodity-pick-interfaces/pc/pick-rule-result/feeds/info'
    # Request payload
    request_parameters = {
        'filterUnbid': True,
        'pageNum': 1,  # page number
        'pageSize': 24,
        'pickRuleId': 644443,  # category ID
        'showCspu': True
    }
    # Compile the JS code
    js_code = execjs.compile(open(js_path, encoding='utf-8').read())
    # Get the sign encryption parameter
    sign_data = js_code.call('c', request_parameters)  # e.g. 0e5d10fb111f2afef6ac0a1776187e23
    # Add sign to the request payload
    request_parameters['sign'] = sign_data
    print('Data is being requested and processed…')
    # Request the data
    response = requests.post(url=request_url, json=request_parameters, headers=request_header)
    # Parse the JSON response
    data_json = response.json()
    # Create an empty list
    dewu_info = []
    # Extract the data
    info_list = data_json['data']['list']
    for index in info_list:
        info_dict = {
            '标题': index['title'],
            '价格': index['price'] / 100,
            '图片网址': index['logoUrl']
        }
        # Append the record
        dewu_info.append(info_dict)
    # Convert to a DataFrame
    df = pd.DataFrame(dewu_info)
    # Export and save as an Excel spreadsheet
    df.to_excel(save_path, index=False)
    print(f'The data is already saved in {save_path}')


def download_image(url):
    rg_url = requests.get(url)
    # Check the response status code
    if rg_url.status_code == 200:
        # Create the image object
        image = Image.open(BytesIO(rg_url.content))
        # Normalize the image mode
        if image.mode != 'RGB':
            image = image.convert('RGB')
        # Resize the image
        return image.resize((150, 96))
    else:
        # Raise OSError so the caller's except clause catches it
        raise OSError(f"Unable to download image, status code: {rg_url.status_code}")


def link_to_png(source_path, destination_path):
    # Load the Excel file
    wb = openpyxl.load_workbook(source_path)
    # Default to the first sheet
    sheet = wb.active
    # Adjust the row heights and the column width
    for row in range(2, sheet.max_row + 1):
        sheet.row_dimensions[row].height = 75
    sheet.column_dimensions['C'].width = 20
    # Read each link, download the image, and insert it in place
    for row in range(2, sheet.max_row + 1):
        # Links are assumed to start at row 2; column C (column=3) holds the link
        link = sheet.cell(row=row, column=3).value
        # Clear the cell contents
        sheet.cell(row=row, column=3).value = None
        # If the link is not empty
        if link:
            # Send an HTTP request to download the image
            try:
                resized_image = download_image(link)
            except OSError:
                print(f"Failed to download image {link}")
                continue
            else:
                # Insert the resized image into the worksheet
                img_bytes = BytesIO()
                resized_image.save(img_bytes, format='PNG')  # save the image into memory
                img = xlImage(img_bytes)
                sheet.add_image(img, f'{get_column_letter(3)}{row}')  # insert the image at the target cell
    wb.save(destination_path)  # required
    wb.close()  # required


if __name__ == '__main__':
    j_path = './js_file.js'
    s_path = './dewu_link.xlsx'
    # Fetch the data and save it as an Excel file
    get_data_xlsx(j_path, s_path)
    d_path = './dewu_png.xlsx'
    print('Excel file is being processed…')
    link_to_png(s_path, d_path)
    print(f'The data is already saved in {d_path}')
Run results: