from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup
# Scrape one IMDb search-results page and save every listed movie to a CSV file.

# Step 1: the target site and the search page to scrape.
base_url = 'https://www.imdb.com/'
search_url = 'https://www.imdb.com/search/title?genres=action&title_type=feature&sort=user_rating,desc&page=1'
# Seconds to wait for the server. requests applies no timeout by default, so
# without this a stalled connection would block the script indefinitely.
request_timeout = 10
# Column order of the output file; also used to write a header-only CSV when
# the page yields no results.
csv_columns = ['title', 'genre', 'rating', 'link']


def _node_text(node):
    """Return the text of a parsed tag with whitespace collapsed, or None if the tag is missing."""
    if node is None:
        return None
    # Joining the stripped fragments with a single space keeps the CSV cells
    # free of the newlines and padding that surround text in the markup.
    return node.get_text(' ', strip=True)


# Step 2: download and parse the search-results page.
response = requests.get(search_url, timeout=request_timeout)
# Fail loudly on 4xx/5xx instead of parsing an error page as if it were results.
# NOTE(review): if the site rejects the default requests User-Agent, this is
# where it will surface (as requests.HTTPError) - confirm against the live site.
response.raise_for_status()
soup = BeautifulSoup(response.text, 'html.parser')

# Step 3: extract one row per result entry.
movies = soup.find_all('div', class_='lister-item-content')
rows = []
for movie in movies:
    anchor = movie.find('a')
    link = anchor.get('href') if anchor is not None else None
    rows.append({
        'title': _node_text(movie.find('h3')),
        'genre': _node_text(movie.find('span', class_='genre')),
        'rating': _node_text(movie.find('span', class_='rating')),
        # urljoin resolves the href against the site root, so a leading slash
        # in the href does not produce a doubled "//" in the final URL.
        'link': urljoin(base_url, link) if link else None,
    })
    # The per-movie detail page is deliberately not requested: none of the
    # stored columns comes from it.

# Step 4: store all collected rows (not just the last one) as a CSV file.
df = pd.DataFrame(rows, columns=csv_columns)
df.to_csv('imdb_movies.csv', index=False)