Python Example Problem: Web Crawler and Data Visualization
Python Example Problem
Problem
Web Crawler and Data Visualization
Requirements:
- Write a crawler that scrapes the movie title, rating, director, lead actors, and release year from the Douban Movie Top 250 pages (https://movie.douban.com/top250).
- Store the data in a SQLite database.
- Using the data in the database, generate a bar chart with Matplotlib that shows the 10 highest-rated movies.
- Add exception handling (e.g., failed network requests, parsing errors, and database operation failures).
Solution approach:
- Use requests and BeautifulSoup to fetch and parse the pages (a parsing sketch follows this list).
- Use sqlite3 to create the database and store the data.
- Use matplotlib to draw the bar chart.
- Add a retry mechanism and exception handling.
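Before the full implementation, here is a minimal, self-contained sketch of how the BeautifulSoup selectors used below pull fields out of a single list entry. The HTML fragment is a simplified stand-in for Douban's real markup: only the class names (item, title, rating_num, bd) and the "导演: … 主演: …" text layout are taken from the live page, and the movie data is placeholder text.

from bs4 import BeautifulSoup

# Simplified stand-in for one entry of the Douban Top 250 list page.
sample_html = """
<div class="item">
  <span class="title">Example Movie</span>
  <span class="rating_num">9.5</span>
  <div class="bd">
    <p>导演: Director Name&nbsp;&nbsp;&nbsp;主演: Actor Name<br>
       1994&nbsp;/&nbsp;USA&nbsp;/&nbsp;Drama</p>
  </div>
</div>
"""

item = BeautifulSoup(sample_html, 'html.parser').select_one('div.item')
title = item.select_one('span.title').text.strip()
rating = float(item.select_one('span.rating_num').text)
info = item.select_one('div.bd p').text.strip()   # "导演: ...主演: ...\n 1994 / USA / Drama"
first_line, last_line = info.split('\n')[0], info.split('\n')[-1]
director = first_line.split('主演:')[0].replace('导演:', '').strip()
actors = first_line.split('主演:')[1].strip() if '主演:' in first_line else ''
year = last_line.strip().split('/')[0].strip()
print(title, rating, director, actors, year)      # Example Movie 9.5 Director Name Actor Name 1994

The full script below applies exactly this parsing logic to every div.item on each page.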
Code implementation:
import requests
from bs4 import BeautifulSoup
import sqlite3
import matplotlib.pyplot as plt
from requests.exceptions import RequestException
import time

def fetch_movie_data(url):
    """Fetch one page of the Douban Top 250 list, retrying on network errors."""
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
    }
    max_retries = 3
    for attempt in range(max_retries):
        try:
            response = requests.get(url, headers=headers, timeout=10)
            response.raise_for_status()
            return response.text
        except RequestException as e:
            print(f"Request failed ({attempt + 1}/{max_retries}): {e}")
            if attempt < max_retries - 1:
                time.sleep(2)  # wait 2 seconds before retrying
    return None

def parse_movie_data(html_content):
    """Parse one page of HTML and return a list of movie dicts."""
    if not html_content:
        return []
    soup = BeautifulSoup(html_content, 'html.parser')
    movies = []
    for item in soup.select('div.item'):
        try:
            title = item.select_one('span.title').text.strip()
            rating = float(item.select_one('span.rating_num').text)
            info = item.select_one('div.bd p').text.strip()
            # First line looks like "导演: ...   主演: ...",
            # last line looks like "1994 / 美国 / 犯罪 剧情".
            first_line = info.split('\n')[0]
            parts = first_line.split('主演:')
            director = parts[0].replace('导演:', '').strip()
            actors = parts[1].strip() if len(parts) > 1 else ''
            year = info.split('\n')[-1].strip().split('/')[0].strip()
            movies.append({
                'title': title,
                'rating': rating,
                'director': director,
                'actors': actors,
                'year': year
            })
        except (AttributeError, ValueError, IndexError) as e:
            print(f"Parse error: {e}")
    return movies

def save_to_database(movies):
    """Create the movies table if needed and insert the scraped rows."""
    conn = sqlite3.connect('douban_movies.db')
    c = conn.cursor()
    try:
        c.execute('''CREATE TABLE IF NOT EXISTS movies
                     (id INTEGER PRIMARY KEY AUTOINCREMENT,
                      title TEXT NOT NULL,
                      rating REAL NOT NULL,
                      director TEXT,
                      actors TEXT,
                      year TEXT)''')
        for movie in movies:
            c.execute('''INSERT INTO movies (title, rating, director, actors, year)
                         VALUES (?, ?, ?, ?, ?)''',
                      (movie['title'], movie['rating'], movie['director'],
                       movie['actors'], movie['year']))
        conn.commit()
    except sqlite3.Error as e:
        print(f"Database error: {e}")
        conn.rollback()
    finally:
        conn.close()

def plot_top_movies():
    """Read the 10 highest-rated movies from the database and plot a bar chart."""
    conn = sqlite3.connect('douban_movies.db')
    c = conn.cursor()
    try:
        c.execute('SELECT title, rating FROM movies ORDER BY rating DESC LIMIT 10')
        top_movies = c.fetchall()
        if not top_movies:
            print("No movie data in the database")
            return
        titles, ratings = zip(*top_movies)
        plt.figure(figsize=(10, 6))
        plt.barh(titles, ratings, color='skyblue')
        plt.gca().invert_yaxis()  # show the highest-rated movie at the top
        plt.xlabel('Rating')
        plt.ylabel('Movie title')
        plt.title('Douban Top 10 Movies by Rating')
        plt.tight_layout()
        plt.savefig('top_movies.png')
        plt.show()
    except sqlite3.Error as e:
        print(f"Database error: {e}")
    finally:
        conn.close()

if __name__ == "__main__":
    all_movies = []
    for start in range(0, 250, 25):
        url = f"https://movie.douban.com/top250?start={start}"
        html = fetch_movie_data(url)
        movies = parse_movie_data(html)
        all_movies.extend(movies)
        print(f"Scraped {start + 25}/250 movies")
        time.sleep(1)  # throttle requests to avoid hammering the site
    if all_movies:
        save_to_database(all_movies)
        plot_top_movies()
    else:
        print("No movie data was scraped")
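After a run, a quick way to sanity-check the result is to query the database directly. The sketch below assumes the script above has already created douban_movies.db in the current working directory and populated the movies table.

import sqlite3

# Quick sanity check of the scraped data.
conn = sqlite3.connect('douban_movies.db')
c = conn.cursor()

c.execute('SELECT COUNT(*) FROM movies')
print('rows stored:', c.fetchone()[0])  # roughly 250 after one full run (re-running appends duplicates)

c.execute('SELECT title, rating, year FROM movies ORDER BY rating DESC LIMIT 5')
for title, rating, year in c.fetchall():
    print(f'{rating}  {title} ({year})')

conn.close()

Note that the bar chart's y-axis is labeled with the movie titles, which are Chinese. On systems whose default Matplotlib font cannot render CJK characters, those labels show up as empty boxes; this can usually be fixed by setting plt.rcParams['font.sans-serif'] to an installed CJK font (for example SimHei on Windows) before plotting.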