您需要先安装一个扩展,例如 篡改猴、Greasemonkey 或 暴力猴,之后才能安装此脚本。
您需要先安装一个扩展,例如 篡改猴 或 暴力猴,之后才能安装此脚本。
您需要先安装一个扩展,例如 篡改猴 或 暴力猴,之后才能安装此脚本。
您需要先安装一个扩展,例如 篡改猴 或 Userscripts ,之后才能安装此脚本。
您需要先安装一款用户脚本管理器扩展,例如 Tampermonkey,才能安装此脚本。
您需要先安装用户脚本管理器扩展后才能安装此脚本。
自动采集豆瓣排行榜前250作品详细信息并导出(反爬)
// ==UserScript==
// @name 豆瓣前250名作品采集
// @namespace http://tampermonkey.net/
// @version 2.0
// @description 自动采集豆瓣排行榜前250作品详细信息并导出(反爬)
// @author 专业开发
// @license GPL License
// @match https://movie.douban.com/top250*
// @grant GM_setClipboard
// @grant GM_addStyle
// @grant GM_xmlhttpRequest
// ==/UserScript==

/**
 * Userscript that adds a floating panel to the Douban Top 250 page, fetches
 * the first TARGET_COUNT entries page by page with GM_xmlhttpRequest, and
 * exports the collected records as JSON (clipboard + download) or CSV.
 */
(function () {
  'use strict';

  // ---- Tunables -----------------------------------------------------------
  // Number of entries the list endpoint is requested in steps of (`start=`).
  const PAGE_SIZE = 25;
  // Number of entries to collect; the UI labels and file names follow it.
  const TARGET_COUNT = 100;
  const PAGE_COUNT = Math.ceil(TARGET_COUNT / PAGE_SIZE);
  // Pause between two successful page requests.
  const PAGE_DELAY_MS = 3000;
  // Pause before retrying a page that was answered with HTTP 418.
  const BLOCKED_DELAY_MS = 10000;
  // Upper bound on consecutive 418 retries so a persistent block cannot
  // keep the loop spinning forever.
  const MAX_BLOCKED_RETRIES = 3;
  const HTTP_BLOCKED = 418;

  // CSS class applied to a log entry for each non-default log type.
  const LOG_CLASSES = new Map([
    ['success', 'status-success'],
    ['error', 'status-error'],
    ['warning', 'status-warning'],
  ]);

  // ---- Styles -------------------------------------------------------------
  const STYLE = `
    #douban-crawler-box {
      position: fixed;
      top: 60px;
      right: 30px;
      background: #fff;
      border: 1px solid #67c23a;
      box-shadow: 0 8px 20px rgba(0,0,0,0.2);
      z-index: 99999;
      border-radius: 8px;
      padding: 16px;
      font-size: 14px;
      min-width: 280px;
      max-width: 400px;
    }
    .crawler-progress { margin: 10px 0; }
    .crawler-btn {
      background: #67c23a;
      color: #fff;
      border: none;
      padding: 8px 16px;
      border-radius: 6px;
      cursor: pointer;
      font-weight: bold;
      margin: 4px;
    }
    .crawler-btn:hover { background: #409EFF; }
    .crawler-btn:disabled { background: #ccc; cursor: not-allowed; }
    .crawler-log {
      max-height: 150px;
      overflow-y: auto;
      color: #333;
      font-size: 12px;
      margin-top: 6px;
      line-height: 1.5;
      border: 1px solid #eee;
      padding: 8px;
      border-radius: 4px;
      background: #f9f9f9;
    }
    .progress-bar {
      width: 100%;
      height: 6px;
      background: #eee;
      border-radius: 3px;
      overflow: hidden;
      margin: 8px 0;
    }
    .progress-fill {
      height: 100%;
      background: linear-gradient(90deg, #67c23a, #409EFF);
      width: 0%;
      transition: width 0.3s ease;
    }
    .status-success { color: #67c23a; font-weight: bold; }
    .status-error { color: #f56c6c; font-weight: bold; }
    .status-warning { color: #e6a23c; font-weight: bold; }
  `;
  GM_addStyle(STYLE);

  // ---- UI -----------------------------------------------------------------
  const box = document.createElement('div');
  box.id = 'douban-crawler-box';
  box.innerHTML = `
    <b style='color:#67c23a'>🎬 豆瓣前${TARGET_COUNT}采集器</b>
    <div class='crawler-progress'>
      进度:<span id='crawler-rate'>0/${TARGET_COUNT}</span>
      <div class='progress-bar'>
        <div class='progress-fill' id='progress-fill'></div>
      </div>
    </div>
    <div>
      <button class='crawler-btn' id='startCrawler'>开始采集</button>
      <button class='crawler-btn' id='pauseCrawler' disabled>暂停</button>
      <button class='crawler-btn' id='exportJSON'>导出JSON</button>
      <button class='crawler-btn' id='exportCSV'>导出CSV</button>
    </div>
    <div class='crawler-log' id='crawlerLog'>点击"开始采集"启动抓取...</div>
  `;
  document.body.appendChild(box);

  // ---- State --------------------------------------------------------------
  let movieList = [];
  let isRunning = false;
  let isPaused = false;
  // Incremented on every start. A crawl loop that wakes up from an await and
  // finds a different value knows a newer run has replaced it and must not
  // touch shared state any more.
  let activeRunId = 0;

  /**
   * Resolve after the given delay.
   * @param {number} ms - Delay in milliseconds.
   * @returns {Promise<void>}
   */
  function sleep(ms) {
    return new Promise((resolve) => setTimeout(resolve, ms));
  }

  /**
   * Toggle the start/pause buttons to reflect whether a crawl is in progress.
   * @param {boolean} running - True while a crawl is active.
   */
  function setRunningUi(running) {
    document.getElementById('startCrawler').disabled = running;
    document.getElementById('pauseCrawler').disabled = !running;
  }

  /**
   * GET a URL through GM_xmlhttpRequest with browser-like request headers.
   * @param {string} url - Address to fetch.
   * @returns {Promise<string>} Resolves with the response body on HTTP 200.
   *   Rejects with an Error otherwise; for non-200 responses the Error
   *   carries the HTTP code in a `status` property.
   */
  function makeRequest(url) {
    return new Promise((resolve, reject) => {
      GM_xmlhttpRequest({
        method: 'GET',
        url: url,
        headers: {
          'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
          'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
          'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
          'Accept-Encoding': 'gzip, deflate, br',
          'DNT': '1',
          'Connection': 'keep-alive',
          'Upgrade-Insecure-Requests': '1',
        },
        timeout: 15000,
        onload: function (response) {
          if (response.status === 200) {
            resolve(response.responseText);
            return;
          }
          const message = response.status === HTTP_BLOCKED
            ? '被豆瓣反爬虫拦截 (418)'
            : `HTTP ${response.status}: ${response.statusText}`;
          // Expose the code so callers do not have to parse the message.
          reject(Object.assign(new Error(message), { status: response.status }));
        },
        onerror: function () {
          reject(new Error('网络请求失败'));
        },
        ontimeout: function () {
          reject(new Error('请求超时'));
        },
      });
    });
  }

  /**
   * Append a timestamped entry to the on-page log and mirror it to the console.
   * @param {string} message - Text to show.
   * @param {string} [type='info'] - 'success', 'error' or 'warning' select a
   *   coloured style; any other value is shown unstyled.
   */
  function log(message, type = 'info') {
    const logEl = document.getElementById('crawlerLog');
    const time = new Date().toLocaleTimeString();
    const entry = document.createElement('div');
    const className = LOG_CLASSES.get(type);
    if (className) {
      entry.className = className;
    }
    // textContent, not innerHTML: messages can contain server-provided text
    // (status text, error messages) that must not be interpreted as markup.
    entry.textContent = `[${time}] ${message}`;
    logEl.appendChild(entry);
    logEl.scrollTop = logEl.scrollHeight;
    console.log(`[豆瓣采集] ${message}`);
  }

  /**
   * Update the "current/total" counter and the progress bar width.
   * @param {number} current - Number of records collected so far.
   * @param {number} total - Number of records aimed for.
   */
  function updateProgress(current, total) {
    document.getElementById('crawler-rate').textContent = `${current}/${total}`;
    // Guard against division by zero and never draw past the full width.
    const percentage = total > 0 ? Math.min(100, (current / total) * 100) : 0;
    document.getElementById('progress-fill').style.width = `${percentage}%`;
  }

  /**
   * Extract movie records from the HTML of one list page.
   * @param {string} html - Markup of the page.
   * @param {number} [rankOffset] - Rank of the entry preceding the first
   *   `.item` on this page; defaults to the number of records collected so
   *   far. Ranks are assigned by position on the page, so one entry that
   *   fails to parse does not shift the ranks of the following entries.
   * @returns {Array<Object>} One record per successfully parsed `.item`.
   */
  function parsePage(html, rankOffset = movieList.length) {
    const doc = new DOMParser().parseFromString(html, 'text/html');
    const movies = [];

    doc.querySelectorAll('.item').forEach((item, index) => {
      try {
        // Trimmed text of the first match for `selector`, or '' when absent.
        const textOf = (selector) => {
          const el = item.querySelector(selector);
          return el ? el.textContent.trim() : '';
        };

        // The info paragraph is read as two non-empty lines: the first is
        // stored as director_actors, the second as region_genre.
        const infoLines = textOf('.bd p')
          .split('\n')
          .map((line) => line.trim())
          .filter((line) => line !== '');
        const firstLine = infoLines[0] || '';
        const secondLine = infoLines[1] || '';

        const linkEl = item.querySelector('.hd a');
        // The first 4-digit run on the second line is taken as the year.
        const yearMatch = secondLine.match(/(\d{4})/);

        movies.push({
          title: textOf('.title'),
          director_actors: firstLine,
          year: yearMatch ? yearMatch[1] : '',
          region_genre: secondLine,
          rating: textOf('.rating_num'),
          quote: textOf('.quote .inq'),
          link: linkEl ? linkEl.href : '',
          rank: rankOffset + index + 1,
        });
      } catch (e) {
        log(`解析单个条目失败: ${e.message}`, 'warning');
      }
    });

    return movies;
  }

  /**
   * Crawl the list pages sequentially until TARGET_COUNT entries have been
   * requested, the user pauses, or the site keeps answering with HTTP 418.
   * Starting a crawl discards previously collected records.
   * @returns {Promise<void>}
   */
  async function startCrawling() {
    if (isRunning) {
      return;
    }

    const runId = ++activeRunId;
    isRunning = true;
    isPaused = false;
    movieList = [];
    updateProgress(0, TARGET_COUNT);
    setRunningUi(true);
    log(`🚀 开始采集豆瓣TOP250前${TARGET_COUNT}部电影...`, 'success');

    // False once the user paused or a newer run took over; re-checked after
    // every await because state may have changed while this loop was waiting.
    const isCurrent = () => runId === activeRunId && isRunning && !isPaused;

    try {
      let page = 0;
      let blockedRetries = 0;
      let failedPages = 0;

      while (page < PAGE_COUNT && isCurrent()) {
        const url = `https://movie.douban.com/top250?start=${page * PAGE_SIZE}`;
        log(`正在采集第${page + 1}页...`);

        try {
          const html = await makeRequest(url);
          if (!isCurrent()) {
            break;
          }

          const movies = parsePage(html, page * PAGE_SIZE);
          movieList.push(...movies);
          updateProgress(movieList.length, TARGET_COUNT);
          log(`第${page + 1}页采集完成,获得${movies.length}部电影`, 'success');

          blockedRetries = 0;
          page += 1;

          // Throttle between pages; no wait is needed after the last one.
          if (page < PAGE_COUNT) {
            log(`等待${PAGE_DELAY_MS / 1000}秒防止反爬...`);
            await sleep(PAGE_DELAY_MS);
          }
        } catch (error) {
          if (!isCurrent()) {
            break;
          }
          log(`第${page + 1}页采集失败: ${error.message}`, 'error');

          if (error.status === HTTP_BLOCKED) {
            if (blockedRetries >= MAX_BLOCKED_RETRIES) {
              log(`连续${MAX_BLOCKED_RETRIES}次重试仍被拦截,已停止采集`, 'error');
              break;
            }
            blockedRetries += 1;
            log(`检测到反爬拦截,等待${BLOCKED_DELAY_MS / 1000}秒后重试(${blockedRetries}/${MAX_BLOCKED_RETRIES})...`, 'warning');
            await sleep(BLOCKED_DELAY_MS);
            continue; // retry the same page
          }

          // Any other failure: give up on this page and move on.
          failedPages += 1;
          blockedRetries = 0;
          page += 1;
        }
      }

      // A newer run owns movieList and the UI now; leave both alone.
      if (runId !== activeRunId) {
        return;
      }

      movieList = movieList.slice(0, TARGET_COUNT);
      updateProgress(movieList.length, TARGET_COUNT);

      // pauseCrawling() already logged the pause; do not call it "complete".
      if (!isPaused) {
        if (failedPages > 0) {
          log(`有${failedPages}页采集失败,结果不完整`, 'warning');
        }
        log(`🎉 采集完成!成功获取${movieList.length}部电影信息`, 'success');
      }
    } catch (error) {
      log(`采集过程出错: ${error.message}`, 'error');
    } finally {
      // Only the newest run may reset shared state and the buttons.
      if (runId === activeRunId) {
        isRunning = false;
        setRunningUi(false);
      }
    }
  }

  /**
   * Stop the crawl in progress. Records collected so far stay available for
   * export; pressing start again begins a fresh crawl.
   */
  function pauseCrawling() {
    isPaused = true;
    isRunning = false;
    log('⏸️ 用户暂停采集', 'warning');
    setRunningUi(false);
  }

  /**
   * Offer `parts` as a file download through a temporary object URL.
   * @param {Array} parts - Blob parts making up the file content.
   * @param {string} mimeType - MIME type of the file.
   * @param {string} filename - Suggested download file name.
   */
  function downloadBlob(parts, mimeType, filename) {
    const url = URL.createObjectURL(new Blob(parts, { type: mimeType }));
    try {
      const anchor = document.createElement('a');
      anchor.href = url;
      anchor.download = filename;
      anchor.click();
    } finally {
      // Always release the object URL, even if triggering the download threw.
      URL.revokeObjectURL(url);
    }
  }

  /**
   * Build the dated export file name for the given extension.
   * @param {string} extension - File extension without the dot.
   * @returns {string}
   */
  function exportFilename(extension) {
    return `豆瓣TOP${TARGET_COUNT}_${new Date().toISOString().slice(0, 10)}.${extension}`;
  }

  /**
   * Copy the collected records to the clipboard as pretty-printed JSON and
   * download them as a .json file.
   */
  function exportJSON() {
    if (movieList.length === 0) {
      alert('没有数据可导出!');
      return;
    }

    const jsonData = JSON.stringify(movieList, null, 2);
    GM_setClipboard(jsonData);
    downloadBlob([jsonData], 'application/json', exportFilename('json'));
    log(`✅ JSON数据已复制到剪贴板并下载,共${movieList.length}条记录`, 'success');
  }

  /**
   * Render one CSV field, doubling embedded quotes.
   * @param {*} value - Field value; converted with String().
   * @param {boolean} [alwaysQuote=false] - Quote even when not required.
   * @returns {string}
   */
  function csvField(value, alwaysQuote = false) {
    const text = String(value);
    if (!alwaysQuote && !/[",\r\n]/.test(text)) {
      return text;
    }
    return `"${text.replace(/"/g, '""')}"`;
  }

  /**
   * Download the collected records as a CSV file. The content is prefixed
   * with U+FEFF, a byte-order mark.
   */
  function exportCSV() {
    if (movieList.length === 0) {
      alert('没有数据可导出!');
      return;
    }

    const headers = ['排名', '电影名称', '导演主演', '年份', '地区类型', '评分', '经典台词', '豆瓣链接'];
    const rows = movieList.map((movie) => [
      csvField(movie.rank),
      csvField(movie.title, true),
      csvField(movie.director_actors, true),
      csvField(movie.year),
      csvField(movie.region_genre, true),
      csvField(movie.rating),
      csvField(movie.quote, true),
      csvField(movie.link),
    ]);
    const csvContent = [headers, ...rows].map((row) => row.join(',')).join('\n');

    downloadBlob(['\uFEFF' + csvContent], 'text/csv;charset=utf-8', exportFilename('csv'));
    log(`✅ CSV文件已下载,共${movieList.length}条记录`, 'success');
  }

  // ---- Event wiring -------------------------------------------------------
  document.getElementById('startCrawler').onclick = startCrawling;
  document.getElementById('pauseCrawler').onclick = pauseCrawling;
  document.getElementById('exportJSON').onclick = exportJSON;
  document.getElementById('exportCSV').onclick = exportCSV;

  log('🎬 豆瓣采集器已就绪!请点击"开始采集"按钮开始抓取数据', 'success');
})();