809 字
4 分钟
使用 Python 爬取 P 站收藏
AI文章摘要
快速了解文章核心内容
介绍
都是一些基础的东西啦,需要下载 tqdm、requests 等库,然后把 user_id 和 cookie 换成自己的,直接运行代码。参数自己看注释调整吧。
cookie 可以用浏览器插件获取,或 F12 开发者工具查看。
本代码参考自 CSDN 的博文。
添加什么额外功能自己看着改吧。
仅供学习交流使用,请勿用于商业用途。
相对来说还是比较简单的一个东西,喜欢的话发个评论互动一下吧。
注意:部分接口在未登录或 Cookie/权限不正确时可能返回 404;请确保使用自己的账号、有效的 Cookie,并遵守站点条款。
代码
import osimport reimport timeimport requestsimport concurrent.futures as futuresfrom typing import Set, Iterable, Callable, Dict, Optional, Tuplefrom tqdm import tqdm
class Downloader:
    """Downloads images concurrently until a total-flow cap (in MB) is reached."""

    def __init__(self, capacity, headers, threads, standard_time, date):
        self.url_group: Set[str] = set()    # de-duplicated URLs queued for download
        self.capacity = capacity            # maximum total download volume (MB)
        self.store_path = date + "/"        # images stored under a per-date folder
        self.standard_time = standard_time  # read timeout (seconds) per request
        self.threads = threads              # thread-pool size
        self.headers = headers.copy()       # private base copy; never mutated per request

    def add(self, urls: Iterable[str]):
        """Queue URLs for download (duplicates are ignored)."""
        for url in urls:
            self.url_group.add(url)

    def download_image(self, url: str) -> float:
        """Download one image; return its size in MB (0 if skipped or failed)."""
        image_name = url[url.rfind("/") + 1:]
        image_path = os.path.join(self.store_path, image_name)

        # Build a per-request header dict instead of mutating self.headers:
        # this method runs on multiple worker threads, and updating the shared
        # dict would race (one thread's Referer could leak into another's request).
        headers = self.headers.copy()
        match = re.search(r"/(\d+)_", url)
        if match:
            # The site rejects image requests without a matching artwork Referer.
            headers["Referer"] = f"https://www.pixiv.net/artworks/{match.group(1)}"

        os.makedirs(self.store_path, exist_ok=True)
        if os.path.exists(image_path):
            return 0  # already downloaded earlier; don't count it against the cap

        for _ in range(10):  # up to 10 attempts per image
            try:
                response = requests.get(url, headers=headers,
                                        timeout=(4, self.standard_time))
                if response.status_code == 200:
                    with open(image_path, "wb") as f:
                        f.write(response.content)
                    # Fall back to the actual payload length when the server
                    # omits Content-Length, so flow accounting stays correct.
                    image_size = int(response.headers.get("content-length",
                                                          len(response.content)))
                    return image_size / (1 << 20)
            except Exception:
                pass  # best-effort: transient error, retry
        return 0  # every attempt failed

    def download(self):
        """Download every queued URL; return the total size (MB) transferred."""
        flow_size = .0
        print("===== downloader start =====")
        with futures.ThreadPoolExecutor(self.threads) as executor:
            with tqdm(total=len(self.url_group), desc="downloading") as pbar:
                for image_size in executor.map(self.download_image, self.url_group):
                    flow_size += image_size
                    pbar.update()
                    pbar.set_description(f"downloading / flow {flow_size:.2f}MB")
                    if flow_size > self.capacity:
                        # Cap reached: stop consuming and drop pending work.
                        executor.shutdown(wait=False, cancel_futures=True)
                        break
        print("===== downloader complete =====")
        return flow_size
class Collector:
    """Resolves queued artwork IDs into original-image URLs and feeds a Downloader."""

    def __init__(self, threads, user_id, headers, downloader):
        self.id_group: Set[str] = set()  # artwork IDs pending resolution
        self.threads = threads           # thread-pool size
        self.user_id = user_id
        self.headers = headers.copy()    # private base copy; never mutated per request
        self.downloader = downloader

    def add(self, image_ids):
        """Queue a single artwork ID (duplicates are ignored)."""
        self.id_group.add(image_ids)

    def select_page(self, response) -> Set[str]:
        """Extract the original-image URL of every page in a /pages response."""
        return {page["urls"]["original"] for page in response.json()["body"]}

    def get_artworks_urls(self, args: Tuple[str, Callable, Optional[Dict]]) -> Optional[Iterable[str]]:
        """Fetch one API URL and apply the selector; return None on failure.

        Merges the extra headers into a fresh per-request dict instead of
        updating self.headers in place: this runs on multiple worker threads,
        and mutating the shared dict would both race and permanently pollute
        later requests with stale Referer values.
        """
        url, selector, additional_headers = args
        headers = {**self.headers, **(additional_headers or {})}
        time.sleep(1)  # crude rate limiting

        for _ in range(10):  # up to 10 attempts
            try:
                response = requests.get(url, headers=headers, timeout=4)
                if response.status_code == 200:
                    return selector(response)
            except Exception as e:
                print(e)
            time.sleep(1)  # back off before retrying
        return None

    def collect(self):
        """Resolve every queued ID concurrently and hand the URLs to the downloader."""
        print("===== collector start =====")
        urls_list = [
            f"https://www.pixiv.net/ajax/illust/{illust_id}/pages?lang=zh"
            for illust_id in self.id_group
        ]
        additional_headers = [
            {
                "Referer": f"https://www.pixiv.net/artworks/{illust_id}",
                "x-user-id": self.user_id,
            }
            for illust_id in self.id_group
        ]
        with futures.ThreadPoolExecutor(self.threads) as executor:
            with tqdm(total=len(self.id_group), desc="collecting urls") as pbar:
                for urls in executor.map(
                        self.get_artworks_urls,
                        zip(urls_list, [self.select_page] * len(urls_list), additional_headers)):
                    if urls is not None:
                        self.downloader.add(urls)
                    pbar.update()
        print("===== collector complete =====")
        return self.id_group
class BookmarkCrawler:
    """Crawls a user's bookmarks page by page, then collects and downloads images."""

    def __init__(self, user_id, max_pages=5):
        self.user_id = user_id
        self.max_pages = max_pages  # number of 48-item bookmark pages to scan
        self.headers = {
            "User-Agent": "Mozilla/5.0",
            "Cookie": "自己提取",
        }
        self.threads = 12           # shared thread-pool size
        self.capacity = 10000       # download cap in MB
        self.standard_time = 10     # read timeout in seconds
        self.date = time.strftime("%Y%m%d")
        self.downloader = Downloader(self.capacity, self.headers, self.threads,
                                     self.standard_time, self.date)
        self.collector = Collector(self.threads, self.user_id, self.headers,
                                   self.downloader)

    def get_bookmarks(self):
        """Fetch up to max_pages of bookmarks (48 per page) and queue artwork IDs."""
        for page in range(1, self.max_pages + 1):
            offset = (page - 1) * 48
            url = (f"https://www.pixiv.net/ajax/user/{self.user_id}/illusts/bookmarks"
                   f"?tag=&offset={offset}&limit=48&rest=show&lang=zh")
            try:
                response = requests.get(url, headers=self.headers, timeout=10)
            except Exception as e:
                # A transient network failure on one page should not abort the crawl.
                print(f"Failed to fetch bookmarks from page {page}: {e}")
                time.sleep(1)
                continue
            if response.status_code == 200:
                works = response.json()["body"]["works"]
                for work in works:
                    self.collector.add(str(work["id"]))
                if not works:
                    break  # past the last bookmark page: stop requesting early
            else:
                print(f"Failed to fetch bookmarks from page {page}")
            time.sleep(1)  # be polite between page requests

    def run(self):
        """Full pipeline: fetch bookmark IDs, resolve image URLs, download images."""
        self.get_bookmarks()
        self.collector.collect()
        self.downloader.download()
if __name__ == "__main__":
    # Parameter setup: replace user_id with your own numeric ID and choose
    # how many 48-item bookmark pages to crawl.
    BookmarkCrawler(user_id="96765879", max_pages=7).run()

注意事项
- 接口在非登录或 Cookie 无效时可能返回 404;需确保你的账号权限与 Cookie 正确。
- 合理设置下载并发与速率,避免请求过快导致被限制。
- 请遵守站点使用条款,仅用于学习交流。
使用 Python 爬取 P 站收藏
https://chuishui.top/posts/pixiv-bookmark-crawler/