跳至内容

requests进阶操作

本篇在 requests 基本操作 基础上,介绍爬虫中常见的进阶场景:Session 保持、Cookie 反爬、代理 IP、图片懒加载识别和并发下载。

Session:模拟登录与 Cookie 保持

有些网站的数据接口需要先访问主页,由服务端下发 Cookie,之后才能访问目标 API;Session 可以自动保存并携带这些 Cookie:

import requests

# Browser-like User-Agent sent with every request made through the session.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36"
    )
}

with requests.Session() as sess:
    # Register the headers once on the session instead of repeating them per call.
    sess.headers.update(headers)

    # First request: the server sets cookies and the session stores them.
    sess.get("https://xueqiu.com/", timeout=10)

    # Second request: the stored cookies are sent automatically.
    api_url = "https://xueqiu.com/statuses/hot/listV2.json?since_id=-1&max_id=72813&size=15"
    resp = sess.get(api_url, timeout=10)
    # Fail on 4xx/5xx here so an error page is not handed to the JSON decoder.
    resp.raise_for_status()
    data = resp.json()
    print(data)

手动指定 Cookie

import requests

# Defined in this snippet so it does not depend on a `headers` variable
# created by some other example.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36"
    )
}

# Copied from the browser developer tools -> Application -> Cookies.
cookies = {
    "session_id": "abc123",
    "user_token": "xyz789",
}

with requests.Session() as sess:
    # Headers and cookies set on the session apply to every request it makes.
    sess.headers.update(headers)
    sess.cookies.update(cookies)
    resp = sess.get("https://example.com/api/data", timeout=10)
    # Surface HTTP errors directly instead of as a JSON decoding failure.
    resp.raise_for_status()
    print(resp.json())

代理 IP

当频繁请求同一网站导致 IP 被封禁时,使用代理 IP 转发请求:

import requests
import random

# Each entry maps a URL scheme to the proxy that should carry it.
proxies_pool = [
    {"https": "http://1.2.3.4:8080"},
    {"https": "http://5.6.7.8:3128"},
]

headers = {"User-Agent": "Mozilla/5.0"}
url = "https://httpbin.org/ip"

# Try the proxies in random order and move on to the next one when a proxy
# fails, so the "switching proxy" message matches what actually happens.
for proxy in random.sample(proxies_pool, k=len(proxies_pool)):
    try:
        resp = requests.get(url, headers=headers, proxies=proxy, timeout=10)
    except (requests.exceptions.ProxyError, requests.exceptions.ConnectTimeout):
        # A proxy that never answers raises ConnectTimeout, not ProxyError.
        print("代理连接失败,切换代理")
        continue
    print(resp.json())
    break
else:
    # Reached only when no proxy in the pool produced a response.
    print("所有代理均不可用")

代理匿名度说明

  • 透明代理:对方知道真实 IP 和代理 IP。
  • 匿名代理:对方知道在使用代理,但不知道真实 IP。
  • 高匿代理:对方无法判断是否使用了代理。

爬虫使用高匿代理效果最佳,可从代理平台 API 动态获取 IP 列表。

图片懒加载

部分网站使用懒加载反爬:图片的真实地址不放在 src 属性中,而放在 src2、data-src、original 等伪属性里,当用户滚动到该区域时才动态赋值给 src。

import requests
from lxml import etree
from pathlib import Path

headers = {"User-Agent": "Mozilla/5.0"}
resp = requests.get("https://example.com/gallery", headers=headers, timeout=10)
# Stop on 4xx/5xx so an error page is not parsed as if it were the gallery.
resp.raise_for_status()
tree = etree.HTML(resp.text)

# Eagerly loaded images: the address is in src.
normal_srcs = tree.xpath('//img/@src')

# Lazily loaded images: the address is in another attribute, whose name
# differs from site to site (data-src and src2 are tried here).
lazy_srcs = tree.xpath('//img/@data-src')
lazy_srcs2 = tree.xpath('//img/@src2')

# Show the data-src matches, or the src2 matches when there are none.
print("懒加载图片:", lazy_srcs or lazy_srcs2)

识别懒加载:在浏览器开发者工具 Elements 面板中查看 img 标签,若 src 为空白图或 base64 占位图,说明使用了懒加载,需找到真实属性名。

并发下载(ThreadPoolExecutor)

I/O 密集型的批量下载场景,用线程池可以显著提速:

import requests
from concurrent.futures import ThreadPoolExecutor, as_completed
from pathlib import Path
import time

headers = {"User-Agent": "Mozilla/5.0"}


def download_image(url: str, save_dir: Path) -> str:
    """Download one image into ``save_dir`` and return a status message.

    The file name is the last segment of the URL path; the query string and
    fragment are ignored, so ``.../a.png?w=100`` is stored as ``a.png``.

    Args:
        url: Address of the image to download.
        save_dir: Existing directory the file is written into.

    Returns:
        ``"已存在: <name>"`` when the target file already exists (no request
        is made), otherwise ``"完成: <name>"`` after a successful download.

    Raises:
        ValueError: If the URL path does not end in a file name.
        requests.HTTPError: If the server answers with a 4xx/5xx status.
    """
    from urllib.parse import urlsplit

    filename = urlsplit(url).path.rsplit("/", 1)[-1]
    if not filename:
        # Without this check the target would be save_dir itself, which
        # exists, and the URL would be reported as already downloaded.
        raise ValueError(f"URL 中没有文件名: {url}")

    filepath = save_dir / filename
    if filepath.exists():
        return f"已存在: {filename}"

    # Write to a temporary name first: a download that fails half-way must
    # not leave a truncated file that later runs would treat as complete.
    part_path = filepath.with_name(filename + ".part")
    try:
        # The context manager releases the streamed connection on exit.
        with requests.get(url, headers=headers, timeout=30, stream=True) as resp:
            resp.raise_for_status()
            with part_path.open("wb") as f:
                for chunk in resp.iter_content(chunk_size=8192):
                    f.write(chunk)
        part_path.replace(filepath)
    finally:
        # No-op after a successful rename; removes the leftover on failure.
        part_path.unlink(missing_ok=True)

    time.sleep(0.2)   # short pause to avoid tripping rate limits
    return f"完成: {filename}"

if __name__ == "__main__":
    img_urls = [
        "https://httpbin.org/image/png",
        "https://httpbin.org/image/jpeg",
    ]
    save_dir = Path("downloads")
    save_dir.mkdir(exist_ok=True)

    with ThreadPoolExecutor(max_workers=4) as executor:
        # Map each future back to its URL so a failure can name the URL.
        futures = {executor.submit(download_image, url, save_dir): url for url in img_urls}
        # Handle results in completion order, not submission order.
        for future in as_completed(futures):
            try:
                print(future.result())
            except Exception as e:
                # One failed download is reported and the rest continue.
                # The ": " keeps the URL and the error text from running together.
                print(f"下载失败: {futures[future]}: {e}")

翻页爬取

构建通用 URL 模板,循环生成各页 URL:

import requests
from bs4 import BeautifulSoup
import time

headers = {"User-Agent": "Mozilla/5.0"}


def crawl_page(url: str) -> list[dict]:
    """Fetch one listing page and return its products.

    Args:
        url: Address of the listing page.

    Returns:
        One dict per ``article.product_pod`` element, with the keys
        ``"title"`` and ``"price"``.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
    """
    resp = requests.get(url, headers=headers, timeout=10)
    resp.raise_for_status()
    # Pass the raw bytes so BeautifulSoup detects the encoding itself.
    # resp.text relies on the charset in the HTTP header and can garble
    # non-ASCII characters when the server does not send one.
    soup = BeautifulSoup(resp.content, "lxml")

    return [
        {
            "title": article.h3.a["title"],
            "price": article.select_one(".price_color").get_text(strip=True),
        }
        for article in soup.select("article.product_pod")
    ]

base_url = "https://books.toscrape.com/catalogue/page-{}.html"
all_data = []

# Walk pages 1 through 5, collecting the items of every page that loads.
for page in range(1, 6):
    print(f"正在爬取第 {page} 页...")
    try:
        page_items = crawl_page(base_url.format(page))
    except requests.RequestException as e:
        # A failed page is reported and skipped; the crawl continues.
        print(f"第 {page} 页失败: {e}")
    else:
        all_data.extend(page_items)
    # Pause after every page, successful or not, to keep the request rate low.
    time.sleep(0.5)

print(f"共爬取 {len(all_data)} 条数据")

robots.txt 合规

在爬取网站前,先检查 robots.txt 了解允许和禁止的路径:

import urllib.robotparser

url_to_check = "https://books.toscrape.com/catalogue/page-1.html"
user_agent = "my-bot"

# Passing the URL to the constructor is equivalent to calling set_url().
rp = urllib.robotparser.RobotFileParser("https://books.toscrape.com/robots.txt")
rp.read()  # download and parse robots.txt

# Ask whether the rules allow this user agent to fetch the URL.
allowed = rp.can_fetch(user_agent, url_to_check)
print(f"允许爬取: {url_to_check}" if allowed else f"禁止爬取: {url_to_check}")
最后更新于