豆瓣前250名作品采集

自动采集豆瓣电影 Top250 榜单作品的详细信息并导出为 JSON/CSV(当前版本抓取前100部,通过请求限速与 418 拦截后重试来应对反爬)

// ==UserScript==
// @name         豆瓣前250名作品采集
// @namespace    http://tampermonkey.net/
// @version      2.0
// @description  自动采集豆瓣排行榜前250作品详细信息并导出(反爬)
// @author       专业开发
// @license      GPL License
// @match        https://movie.douban.com/top250*
// @grant        GM_setClipboard
// @grant        GM_addStyle
// @grant        GM_xmlhttpRequest
// ==/UserScript==

(function(){
    'use strict';

    // Stylesheet for the crawler's floating panel. It covers the fixed-position
    // container (#douban-crawler-box), the action buttons, the scrollable log
    // area, the progress bar, and the status-success / status-error /
    // status-warning colour classes.
    const STYLE = `
        #douban-crawler-box {
            position: fixed;
            top: 60px;
            right: 30px;
            background: #fff;
            border: 1px solid #67c23a;
            box-shadow: 0 8px 20px rgba(0,0,0,0.2);
            z-index: 99999;
            border-radius: 8px;
            padding: 16px;
            font-size: 14px;
            min-width: 280px;
            max-width: 400px;
        }
        .crawler-progress {
            margin: 10px 0;
        }
        .crawler-btn {
            background: #67c23a;
            color: #fff;
            border: none;
            padding: 8px 16px;
            border-radius: 6px;
            cursor: pointer;
            font-weight: bold;
            margin: 4px;
        }
        .crawler-btn:hover {
            background: #409EFF;
        }
        .crawler-btn:disabled {
            background: #ccc;
            cursor: not-allowed;
        }
        .crawler-log {
            max-height: 150px;
            overflow-y: auto;
            color: #333;
            font-size: 12px;
            margin-top: 6px;
            line-height: 1.5;
            border: 1px solid #eee;
            padding: 8px;
            border-radius: 4px;
            background: #f9f9f9;
        }
        .progress-bar {
            width: 100%;
            height: 6px;
            background: #eee;
            border-radius: 3px;
            overflow: hidden;
            margin: 8px 0;
        }
        .progress-fill {
            height: 100%;
            background: linear-gradient(90deg, #67c23a, #409EFF);
            width: 0%;
            transition: width 0.3s ease;
        }
        .status-success { color: #67c23a; font-weight: bold; }
        .status-error { color: #f56c6c; font-weight: bold; }
        .status-warning { color: #e6a23c; font-weight: bold; }
    `;
    // Register the stylesheet through the userscript API granted in the header (@grant GM_addStyle).
    GM_addStyle(STYLE);

    // Control panel: title, progress read-out, the four action buttons and the
    // log area. The element ids used here are looked up by the functions below.
    const panelMarkup = `
        <b style='color:#67c23a'>🎬 豆瓣前100采集器</b>
        <div class='crawler-progress'>
            进度:<span id='crawler-rate'>0/100</span>
            <div class='progress-bar'>
                <div class='progress-fill' id='progress-fill'></div>
            </div>
        </div>
        <div>
            <button class='crawler-btn' id='startCrawler'>开始采集</button>
            <button class='crawler-btn' id='pauseCrawler' disabled>暂停</button>
            <button class='crawler-btn' id='exportJSON'>导出JSON</button>
            <button class='crawler-btn' id='exportCSV'>导出CSV</button>
        </div>
        <div class='crawler-log' id='crawlerLog'>点击"开始采集"启动抓取...</div>
    `;
    const box = document.createElement('div');
    Object.assign(box, {
        id: 'douban-crawler-box',
        innerHTML: panelMarkup,
    });
    document.body.appendChild(box);

    // Shared crawler data and state, read and written by the functions below.

    // Movie records collected so far; reset by startCrawling and read by the exporters.
    let movieList = [];
    // True while a startCrawling run is active; guards against starting twice.
    let isRunning = false;
    // Set by pauseCrawling; the crawl loop checks it before each page request.
    let isPaused = false;
    // Number of records collected so far (updated after each page).
    let currentIndex = 0;

    /**
     * Issue a GET request through GM_xmlhttpRequest with browser-like headers
     * and expose the outcome as a Promise.
     *
     * @param {string} url - Address to fetch.
     * @returns {Promise<string>} Resolves with the response body on HTTP 200.
     *   Rejects with an Error on any other status (418 gets a dedicated
     *   message that callers match on), on network failure, or after the
     *   15-second timeout.
     */
    function makeRequest(url) {
        // Headers sent with every request so that it resembles a regular browser page load.
        const browserLikeHeaders = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
            'Accept-Encoding': 'gzip, deflate, br',
            'DNT': '1',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
        };

        return new Promise((resolve, reject) => {
            const handleLoad = (response) => {
                switch (response.status) {
                    case 200:
                        resolve(response.responseText);
                        break;
                    case 418:
                        // The crawl loop looks for "418" in the message to trigger its back-off.
                        reject(new Error('被豆瓣反爬虫拦截 (418)'));
                        break;
                    default:
                        reject(new Error(`HTTP ${response.status}: ${response.statusText}`));
                }
            };

            GM_xmlhttpRequest({
                method: 'GET',
                url,
                headers: browserLikeHeaders,
                timeout: 15000,
                onload: handleLoad,
                onerror: () => reject(new Error('网络请求失败')),
                ontimeout: () => reject(new Error('请求超时')),
            });
        });
    }

    /**
     * Append a timestamped entry to the on-page log panel (#crawlerLog), keep
     * the panel scrolled to the newest entry, and mirror the message to the
     * browser console.
     *
     * The entry is built with createElement/textContent rather than by
     * concatenating onto innerHTML. Messages can contain text that originates
     * from HTTP responses or exceptions, which must be shown literally and not
     * parsed as markup. Appending a node also avoids re-parsing the whole log
     * on every call.
     *
     * @param {string} message - Text to display.
     * @param {string} [type='info'] - 'success', 'error' or 'warning' select
     *   the matching status-* colour class; any other value is unstyled.
     */
    function log(message, type = 'info') {
        const statusClasses = new Map([
            ['success', 'status-success'],
            ['error', 'status-error'],
            ['warning', 'status-warning'],
        ]);
        const logEl = document.getElementById('crawlerLog');
        const time = new Date().toLocaleTimeString();

        const entry = document.createElement('div');
        const className = statusClasses.get(type) || '';
        if (className) {
            entry.className = className;
        }
        entry.textContent = `[${time}] ${message}`;

        logEl.appendChild(entry);
        logEl.scrollTop = logEl.scrollHeight;
        console.log(`[豆瓣采集] ${message}`);
    }

    /**
     * Refresh the progress read-out: the "current/total" counter text and the
     * width of the progress bar fill, as a percentage.
     *
     * @param {number} current - Number of records collected so far.
     * @param {number} total - Number of records expected.
     */
    function updateProgress(current, total) {
        const rateLabel = document.getElementById('crawler-rate');
        rateLabel.textContent = `${current}/${total}`;

        const ratio = current / total;
        const fillBar = document.getElementById('progress-fill');
        fillBar.style.width = `${ratio * 100}%`;
    }

    /**
     * Parse the HTML of one listing page into movie records.
     *
     * Each `.item` element yields one record. The info paragraph (`.bd p`) is
     * split into non-empty, trimmed lines: the first is stored as
     * `director_actors`, the second as `region_genre`, and the first 4-digit
     * number on the second line becomes `year`. Missing elements produce
     * empty strings. An item that throws while being parsed is logged as a
     * warning and skipped, so one bad entry does not lose the page.
     *
     * `rank` continues from the records already in the shared `movieList`, so
     * the function is expected to be called before the page is appended to it.
     *
     * @param {string} html - Raw HTML of a listing page.
     * @returns {Array<{title: string, director_actors: string, year: string,
     *   region_genre: string, rating: string, quote: string, link: string,
     *   rank: number}>} Records in page order.
     */
    function parsePage(html) {
        const doc = new DOMParser().parseFromString(html, 'text/html');
        const movies = [];
        // Trimmed text of an element, or '' when the element is absent.
        const textOf = (el) => (el ? el.textContent.trim() : '');

        for (const item of doc.querySelectorAll('.item')) {
            try {
                // Trim every line: lines after the first carry the indentation of the HTML source.
                const infoLines = textOf(item.querySelector('.bd p'))
                    .split('\n')
                    .map((line) => line.trim())
                    .filter(Boolean);
                // First line: director / cast. Second line: year / region / genre.
                const [firstLine = '', secondLine = ''] = infoLines;

                const yearMatch = secondLine.match(/(\d{4})/);
                const linkEl = item.querySelector('.hd a');

                movies.push({
                    title: textOf(item.querySelector('.title')),
                    director_actors: firstLine,
                    year: yearMatch ? yearMatch[1] : '',
                    region_genre: secondLine,
                    rating: textOf(item.querySelector('.rating_num')),
                    quote: textOf(item.querySelector('.quote .inq')),
                    link: linkEl ? linkEl.href : '',
                    rank: movieList.length + movies.length + 1,
                });
            } catch (e) {
                log(`解析单个条目失败: ${e.message}`, 'warning');
            }
        }

        return movies;
    }

    /**
     * Main crawl routine: fetch the first four listing pages (25 entries each)
     * one after another, parse them into the shared `movieList`, and keep the
     * progress read-out and the Start/Pause buttons in sync.
     *
     * Behaviour:
     * - Does nothing if a crawl is already running.
     * - Checks for a pause request before each page; when one is seen the run
     *   stops and a "stopped" summary is logged instead of the completion
     *   message.
     * - Waits 3 s between successful pages to limit the request rate.
     * - A failure whose message contains "418" is retried on the same page
     *   after a 10 s wait, at most MAX_BLOCKED_RETRIES times in a row; beyond
     *   that the run stops rather than retrying forever. Any other failure is
     *   logged and that page is skipped.
     * - The running flag and button states are always restored on exit.
     *
     * @returns {Promise<void>} Settles when the run has finished or stopped.
     */
    async function startCrawling() {
        if (isRunning) return;

        const PAGE_SIZE = 25;
        const PAGE_COUNT = 4;
        const TARGET_COUNT = PAGE_SIZE * PAGE_COUNT;
        const PAGE_DELAY_MS = 3000;
        const BLOCKED_DELAY_MS = 10000;
        const MAX_BLOCKED_RETRIES = 3;
        const sleep = (ms) => new Promise((resolve) => setTimeout(resolve, ms));

        isRunning = true;
        isPaused = false;
        movieList = [];
        currentIndex = 0;

        document.getElementById('startCrawler').disabled = true;
        document.getElementById('pauseCrawler').disabled = false;

        log('🚀 开始采集豆瓣TOP250前100部电影...', 'success');

        try {
            let stoppedEarly = false;
            let blockedRetries = 0;
            let page = 0;

            while (page < PAGE_COUNT) {
                if (!isRunning || isPaused) {
                    stoppedEarly = true;
                    break;
                }

                const url = `https://movie.douban.com/top250?start=${page * PAGE_SIZE}`;
                log(`正在采集第${page + 1}页...`);

                try {
                    const html = await makeRequest(url);
                    const movies = parsePage(html);
                    movieList.push(...movies);
                    blockedRetries = 0;

                    currentIndex = movieList.length;
                    updateProgress(currentIndex, TARGET_COUNT);

                    log(`第${page + 1}页采集完成,获得${movies.length}部电影`, 'success');

                    // Throttle between pages; no wait is needed after the last one.
                    if (page < PAGE_COUNT - 1) {
                        log('等待3秒防止反爬...');
                        await sleep(PAGE_DELAY_MS);
                    }
                } catch (error) {
                    log(`第${page + 1}页采集失败: ${error.message}`, 'error');

                    if (error.message.includes('418')) {
                        if (blockedRetries >= MAX_BLOCKED_RETRIES) {
                            log(`第${page + 1}页重试${MAX_BLOCKED_RETRIES}次后仍被拦截,停止采集`, 'error');
                            stoppedEarly = true;
                            break;
                        }
                        blockedRetries++;
                        log('检测到反爬拦截,等待10秒后重试...', 'warning');
                        await sleep(BLOCKED_DELAY_MS);
                        continue; // retry the same page (page is not advanced)
                    }
                    // Any other failure: fall through and move on to the next page.
                }

                page++;
            }

            // Keep at most the target number of records.
            movieList = movieList.slice(0, TARGET_COUNT);
            updateProgress(movieList.length, TARGET_COUNT);

            if (stoppedEarly) {
                log(`⏹️ 采集已中止,已获取${movieList.length}部电影信息`, 'warning');
            } else {
                log(`🎉 采集完成!成功获取${movieList.length}部电影信息`, 'success');
            }

        } catch (error) {
            log(`采集过程出错: ${error.message}`, 'error');
        } finally {
            isRunning = false;
            document.getElementById('startCrawler').disabled = false;
            document.getElementById('pauseCrawler').disabled = true;
        }
    }

    /**
     * Request that the running crawl stop.
     *
     * Only the pause flag is raised here. The crawl loop checks it before the
     * next page and, on exit, clears `isRunning` and re-enables the Start
     * button itself. `isRunning` is deliberately left set and Start left
     * disabled until then: releasing them while the loop is still awaiting a
     * request or a delay would let a second crawl start and run concurrently
     * against the same `movieList`.
     *
     * Does nothing when no crawl is running.
     */
    function pauseCrawling() {
        if (!isRunning) return;

        isPaused = true;
        log('⏸️ 用户暂停采集', 'warning');
        document.getElementById('pauseCrawler').disabled = true;
    }

    /**
     * Export the collected records as pretty-printed JSON (2-space indent).
     * The text is copied to the clipboard through GM_setClipboard and also
     * offered as a download named with today's date. Shows an alert and
     * returns early when nothing has been collected.
     */
    function exportJSON() {
        if (!movieList.length) {
            alert('没有数据可导出!');
            return;
        }

        const payload = JSON.stringify(movieList, null, 2);
        GM_setClipboard(payload);

        // Trigger the download through a temporary anchor pointing at a blob URL.
        const objectUrl = URL.createObjectURL(new Blob([payload], {type: 'application/json'}));
        const dateStamp = new Date().toISOString().slice(0,10);
        const anchor = Object.assign(document.createElement('a'), {
            href: objectUrl,
            download: `豆瓣TOP100_${dateStamp}.json`,
        });
        anchor.click();
        URL.revokeObjectURL(objectUrl);

        log(`✅ JSON数据已复制到剪贴板并下载,共${movieList.length}条记录`, 'success');
    }

    /**
     * Export the collected records as a CSV download named with today's date.
     *
     * The file starts with a UTF-8 byte-order mark so spreadsheet applications
     * detect the encoding. Text columns are always wrapped in double quotes,
     * with embedded double quotes doubled, so titles or quotes containing `"`
     * or `,` do not break the column layout. Rank, year, rating and link are
     * written as-is. The temporary blob URL is revoked after the download is
     * triggered. Shows an alert and returns early when nothing has been
     * collected.
     */
    function exportCSV() {
        if (movieList.length === 0) {
            alert('没有数据可导出!');
            return;
        }

        // Quote a text field, doubling embedded quotes; null/undefined become an empty field.
        const quoteField = (value) => `"${String(value == null ? '' : value).replace(/"/g, '""')}"`;

        const headers = ['排名', '电影名称', '导演主演', '年份', '地区类型', '评分', '经典台词', '豆瓣链接'];
        const dataRows = movieList.map((movie) => [
            movie.rank,
            quoteField(movie.title),
            quoteField(movie.director_actors),
            movie.year,
            quoteField(movie.region_genre),
            movie.rating,
            quoteField(movie.quote),
            movie.link,
        ]);

        const csvContent = [headers, ...dataRows].map((row) => row.join(',')).join('\n');
        const blob = new Blob(['\uFEFF' + csvContent], {type: 'text/csv;charset=utf-8'});
        const url = URL.createObjectURL(blob);
        const a = document.createElement('a');
        a.href = url;
        a.download = `豆瓣TOP100_${new Date().toISOString().slice(0,10)}.csv`;
        a.click();
        // Release the blob URL (the original created a second URL here instead of revoking).
        URL.revokeObjectURL(url);

        log(`✅ CSV文件已下载,共${movieList.length}条记录`, 'success');
    }

    // Wire each panel button (by element id) to its click handler.
    const clickHandlers = [
        ['startCrawler', startCrawling],
        ['pauseCrawler', pauseCrawling],
        ['exportJSON', exportJSON],
        ['exportCSV', exportCSV],
    ];
    clickHandlers.forEach(([buttonId, handler]) => {
        document.getElementById(buttonId).onclick = handler;
    });

    // Announce in the log panel that the script has finished initialising.
    log('🎬 豆瓣采集器已就绪!请点击"开始采集"按钮开始抓取数据', 'success');
})();