Python爬虫实战:requests+BeautifulSoup

requests和BeautifulSoup是Python爬虫最经典的组合,上手简单、功能够用,适合绝大多数静态页面的数据抓取场景。本文通过一个完整的实战案例,带你从零开始写一个真正能跑的爬虫。

环境准备

pip install requests beautifulsoup4 lxml

lxml是推荐的HTML解析器,比Python内置的html.parser快很多。

requests基础

requests库把HTTP请求封装得非常简洁:

import requests

# GET request with query parameters supplied as a dict
resp = requests.get("https://httpbin.org/get", params={"key": "value"})
print(resp.status_code)  # 200
print(resp.text)         # response body as text
print(resp.json())       # response body parsed as JSON

# Custom request headers
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
}
resp = requests.get("https://example.com", headers=headers)

# POST request with form data
resp = requests.post("https://httpbin.org/post", data={"username": "test"})

# Set a timeout (in seconds) for the request
resp = requests.get("https://example.com", timeout=10)

几个实用技巧:

  • 永远设置timeout,不然程序可能卡死
  • 带上合理的User-Agent,很多网站会拒绝默认UA
  • 请求之后调用resp.raise_for_status():状态码为4xx/5xx时会抛出HTTPError异常,避免把错误页面当成正常数据处理

Session保持

如果需要登录或者保持Cookie,用Session对象:

# Create one Session and set a default User-Agent on it
session = requests.Session()
session.headers.update({"User-Agent": "Mozilla/5.0 ..."})

# Log in by posting the credentials as form data
session.post("https://example.com/login", data={
    "username": "admin",
    "password": "123456"
})

# Later requests on the same session send the cookies automatically
resp = session.get("https://example.com/dashboard")

BeautifulSoup解析HTML

拿到HTML之后,用BeautifulSoup来解析:

from bs4 import BeautifulSoup

# Parse an inline HTML fragment with the lxml parser
html = '<div class="article"><h2>标题</h2><p>内容</p></div>'
soup = BeautifulSoup(html, "lxml")

# find / find_all
tag = soup.find("h2")
print(tag.text)  # prints "标题"

# Print the text of every <p> element found
tags = soup.find_all("p")
for t in tags:
    print(t.text)

CSS选择器

CSS选择器写起来更直观,推荐优先使用:

# Class selector: elements with class "article"
items = soup.select(".article")

# Hierarchy selector: <h2> elements that are direct children of div.article
titles = soup.select("div.article > h2")

# Attribute selector: <a> elements whose href starts with "https://"
links = soup.select('a[href^="https://"]')

# Combined usage: walk the body rows of table.data-table
rows = soup.select("table.data-table tbody tr")
for row in rows:
    cols = row.select("td")
    # Print the stripped text of each cell in the row
    print([col.text.strip() for col in cols])

获取属性和文本

link = soup.select_one("a.download")  # select a single element by CSS selector
href = link["href"]          # read an attribute
text = link.get_text(strip=True)  # read the text; strip removes surrounding whitespace

实战:爬取豆瓣Top250

下面写一个完整的爬虫,抓取豆瓣电影Top250的基本信息:

import requests
from bs4 import BeautifulSoup
import csv
import time

def _parse_movie_item(item):
    """Build one movie record from a ``div.item`` node.

    Returns:
        A dict with the keys ``title``, ``rating``, ``director`` and ``quote``,
        or None when the node has no title or no rating element.
    """
    title_tag = item.select_one("span.title")
    rating_tag = item.select_one("span.rating_num")
    if title_tag is None or rating_tag is None:
        # select_one yields None when nothing matches; report that to the
        # caller instead of failing with AttributeError on ``.text``.
        return None

    # Only the director is extracted: it is the part of the first non-empty
    # info line that precedes the run of three non-breaking spaces.
    info_tag = item.select_one("div.bd p")
    info = info_tag.text.strip() if info_tag else ""
    lines = [line.strip() for line in info.split("\n") if line.strip()]
    director = lines[0].split("\xa0\xa0\xa0")[0] if lines else ""

    # Not every entry has a one-line quote.
    quote_tag = item.select_one("span.inq")

    return {
        "title": title_tag.text,
        "rating": rating_tag.text,
        "director": director,
        "quote": quote_tag.text if quote_tag else "",
    }


def crawl_douban_top250():
    """Scrape title, rating, director and quote for the Douban Top250 list.

    The list is requested page by page (``start`` = 0, 25, ..., 225) with a
    two-second pause between pages.

    Returns:
        A list of dicts with the keys ``title``, ``rating``, ``director``
        and ``quote``, in page order.

    Raises:
        requests.HTTPError: If a page responds with an error status.
        requests.RequestException: On connection problems or timeouts.
    """
    base_url = "https://movie.douban.com/top250"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                      "AppleWebKit/537.36 (KHTML, like Gecko) "
                      "Chrome/91.0.4472.124 Safari/537.36"
    }
    page_size = 25
    total = 250

    all_movies = []

    # A single Session lets all pages share one pooled connection.
    with requests.Session() as session:
        session.headers.update(headers)

        for start in range(0, total, page_size):
            print(f"正在抓取第 {start // page_size + 1} 页...")
            resp = session.get(base_url, params={"start": start}, timeout=15)
            resp.raise_for_status()

            soup = BeautifulSoup(resp.text, "lxml")
            for item in soup.select("div.item"):
                movie = _parse_movie_item(item)
                if movie is None:
                    # Keep what was scraped so far rather than aborting the run.
                    print(f"跳过无法解析的条目 (start={start})")
                    continue
                all_movies.append(movie)

            # Be polite between pages; no need to wait after the last one.
            if start + page_size < total:
                time.sleep(2)

    return all_movies


def save_to_csv(movies, filename="douban_top250.csv"):
    """Write movie records to a CSV file and print a short summary.

    Args:
        movies: Sequence of dicts with the keys ``title``, ``rating``,
            ``director`` and ``quote``.
        filename: Path of the CSV file to create or overwrite.
    """
    columns = ["title", "rating", "director", "quote"]

    # utf-8-sig writes a BOM at the start of the file; newline="" leaves
    # line endings to the csv module.
    with open(filename, mode="w", encoding="utf-8-sig", newline="") as out_file:
        csv_writer = csv.DictWriter(out_file, fieldnames=columns)
        csv_writer.writeheader()
        for record in movies:
            csv_writer.writerow(record)

    print(f"保存完成: {filename}, 共 {len(movies)} 条")


if __name__ == "__main__":
    # Script entry point: scrape the list, then save it under the default CSV filename.
    movies = crawl_douban_top250()
    save_to_csv(movies)

异常处理

生产环境的爬虫必须做好异常处理:

import time

import requests
from requests.exceptions import RequestException

def safe_get(url, headers=None, retries=3, timeout=10, backoff=3):
    """GET *url*, retrying failed attempts with a linearly growing delay.

    Args:
        url: Address to fetch.
        headers: Optional request headers passed to ``requests.get``.
        retries: Maximum number of attempts.
        timeout: Per-attempt timeout in seconds passed to ``requests.get``.
        backoff: Base delay in seconds; the wait after the n-th failed
            attempt is ``backoff * n``.

    Returns:
        The response of the first successful attempt, or None when every
        attempt failed (also when ``retries`` is less than 1).
    """
    for attempt in range(1, retries + 1):
        try:
            resp = requests.get(url, headers=headers, timeout=timeout)
            # Treat 4xx/5xx responses as failures so they are retried too.
            resp.raise_for_status()
            return resp
        except RequestException as e:
            print(f"请求失败 (第{attempt}次): {e}")
            # No point in waiting after the final attempt.
            if attempt < retries:
                time.sleep(backoff * attempt)
    return None

小结

requests + BeautifulSoup这套组合处理静态页面完全够用。如果遇到JS动态渲染的页面,可以考虑Selenium或者Playwright。爬虫的核心不难,难的是反爬对抗和数据清洗——这些后面有机会再聊。