Crawling the posts under a cnblogs collection

✅ Full code (save as spider.py)

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import os
import re
import time
import requests
import html2text
from bs4 import BeautifulSoup
from tqdm import tqdm
from urllib.parse import urljoin, urlparse

BASE_URL = "https://chuna2.787528.xyz/xxx/collections/11111"
HEADERS = {
    "User-Agent": ("Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                   "AppleWebKit/537.36 (KHTML, like Gecko) "
                   "Chrome/123 Safari/537.36")
}
SAVE_DIR = "blog"
IMG_DIR = os.path.join(SAVE_DIR, "images")
os.makedirs(IMG_DIR, exist_ok=True)

h = html2text.HTML2Text()
h.ignore_links = False  # keep hyperlinks in the Markdown output
h.wrap_links = False    # don't line-wrap long links
h.mark_code = True      # wrap code blocks in [code]...[/code] markers

# ------------------ utils ------------------
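# Replace characters that are illegal in filenames; empty titles become "untitled".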
def safe_name(title: str) -> str:
    return re.sub(r'[\\/:*?"<>|]', '_', title) or "untitled"

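# GET a page, fail fast on HTTP errors, and decode with the detected encoding.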
def fetch(url: str) -> str:
    resp = requests.get(url, headers=HEADERS, timeout=15)
    resp.raise_for_status()
    resp.encoding = resp.apparent_encoding
    return resp.text

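# Download one image into IMG_DIR (skipped if already on disk) and return
# a path relative to SAVE_DIR, suitable for Markdown image links.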
def download_img(src: str) -> str:
    if src.startswith("//"):
        src = "https:" + src
    fname = os.path.basename(urlparse(src).path) or "img"
    local_path = os.path.join(IMG_DIR, fname)
    if not os.path.exists(local_path):
        try:
            resp = requests.get(src, headers=HEADERS, timeout=15)
            resp.raise_for_status()  # don't save an error page as an image
            with open(local_path, "wb") as f:
                f.write(resp.content)
        except Exception as e:
            print(f"⚠️ Image download failed: {src} ({e})")
            return src  # fall back to the remote URL
    # forward slashes keep the Markdown links portable across OSes
    return os.path.relpath(local_path, SAVE_DIR).replace(os.sep, "/")

# ------------------ main logic ------------------
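# Parse the collection index page into (zero-padded index, title, absolute URL).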
def collect_links():
    html = fetch(BASE_URL)
    soup = BeautifulSoup(html, "lxml")
    links = []
    for idx, a in enumerate(soup.select("a.entrylistItemTitle"), 1):
        title = a.get_text(strip=True)
        href = a["href"]
        if not href.startswith("http"):
            href = urljoin(BASE_URL, href)
        links.append((f"{idx:03d}", title, href))
    return links

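# Fetch a single post, localize its images, convert the body to Markdown,
# and write the result into SAVE_DIR.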
def post2md(prefix: str, title: str, url: str):
    html = fetch(url)
    soup = BeautifulSoup(html, "lxml")
    body = soup.find("div", id="cnblogs_post_body") or soup

    # Localize images: download each one and point src at the local copy
    for img in body.select("img"):
        src = img.get("src")
        if src:
            img["src"] = download_img(src)

    md_body = h.handle(str(body))
    meta = f"Source: <{url}>\n\n---\n\n"
    filename = f"{prefix}_{safe_name(title)}.md"
    with open(os.path.join(SAVE_DIR, filename), "w", encoding="utf-8") as f:
        f.write(meta + md_body)

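# Enumerate every post, skipping ones already saved so that an interrupted
# run can simply be restarted.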
def main():
    links = collect_links()
    print(f"🔍 共发现 {len(links)} 篇文章")
    for prefix, title, url in tqdm(links, desc="📥 下载"):
        outfile = os.path.join(SAVE_DIR, f"{prefix}_{safe_name(title)}.md")
        if os.path.exists(outfile):
            continue
        try:
            post2md(prefix, title, url)
            time.sleep(1)  # polite delay between requests
        except Exception as e:
            tqdm.write(f"❌ Failed {url} - {e}")
    print(f"✅ 全部完成!文件保存在 → {os.path.abspath(SAVE_DIR)}")

if __name__ == "__main__":
    main()

✅ Usage

  1. Install the dependencies (including lxml, which the BeautifulSoup parser calls require):

    pip install requests beautifulsoup4 html2text tqdm lxml
    
  2. Run the script:

    python spider.py
    
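Before running, replace BASE_URL in spider.py with the URL of the collection
you want to crawl (the value in the listing above is a placeholder).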

✅ Sample output

blog/
├── 001_aaa.md
├── 002_bbb.md
├── 003_ccc.md
└── images/
    ├── kkksc01.png
    ├── kkksc02.png
    └── kkksc03.png
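
Note: collect_links only reads the first index page. If your collection spans
multiple pages, a minimal sketch of a paginated variant (assuming the index
accepts a ?page=N query parameter, which you should verify against your own
collection) could look like this:

def collect_links_paged(max_pages: int = 50):
    links, idx = [], 0
    for page in range(1, max_pages + 1):
        # assumption: the index is paginated via ?page=N
        html = fetch(f"{BASE_URL}?page={page}")
        soup = BeautifulSoup(html, "lxml")
        anchors = soup.select("a.entrylistItemTitle")
        if not anchors:  # an empty page means no more entries
            break
        for a in anchors:
            idx += 1
            href = urljoin(BASE_URL, a["href"])
            links.append((f"{idx:03d}", a.get_text(strip=True), href))
    return links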