您需要先安装一个扩展,例如 篡改猴、Greasemonkey 或 暴力猴,之后才能安装此脚本。
您需要先安装一个扩展,例如 篡改猴 或 暴力猴,之后才能安装此脚本。
您需要先安装一个扩展,例如 篡改猴 或 暴力猴,之后才能安装此脚本。
您需要先安装一个扩展,例如 篡改猴 或 Userscripts,之后才能安装此脚本。
您需要先安装一款用户脚本管理器扩展,例如 Tampermonkey,才能安装此脚本。
您需要先安装用户脚本管理器扩展后才能安装此脚本。
Download site in single file automatically
// ==UserScript==
// @name         Crawler base on SingleFile
// @author       Mark
// @description  Download site in single file automatically
// @license      MIT
// @version      0.0.21
// @match        https://*/*
// @run-at       document-idle
// @grant        GM.setValue
// @grant        GM.getValue
// @grant        GM.xmlHttpRequest
// @grant        GM_registerMenuCommand
// @grant        unsafeWindow
// @require      https://update.greasyfork.org/scripts/483730/1305396/gm-fetch.js
// @require      https://openuserjs.org/src/libs/sizzle/GM_config.js
// @connect      *
// @noframes
// @namespace    https://greasyfork.org/users/1106595
// ==/UserScript==

// All *_TIME constants are in seconds (they are passed to `sleep`, which multiplies by 1000).
const REPORT_ADDRESS = "https://crawler-hit.deno.dev/api/update";
const PAGE_LOADING_TIME = 7;
const ERROR_RELOAD_TIME = 10;
const ERROR_RELOAD_LONG_TIME = 60;
const NEXT_TASK_WAITING_TIME = 10;
const NO_TASK_WAITING_TIME = 90;
const CF_CHALLENGE_WAITING_TIME = 20;
const QUICK_SLEEP_TIME = 5;
// Accepts only a bare origin such as "https://host.example" (no path, no trailing slash).
const DOMAIN_REG = /^(https?):\/\/([^\s\/?\.#]+\.?)+$/;
const TASK_MAX_RETRY_TIMES = 3;
// Names of the per-task timing marks; stored on the task as `timePoint_<name>`.
const TIME_POINT_TYPES = {
  PREPARE_START: "prepareStart",
  TASK_LOADED: "taskLoaded",
  TASK_REPORTED: "taskReported",
  PRESIGN_INDEX: "presignIndex",
  PRESIGN_SINGLEFILE: "presignSinglefile",
  SINGLE_FILE_SUCCESS: "singleFileSuccess",
  INDEX_FILE_UPLOADED: "indexFileUploaded",
  SINGLE_FILE_UPLOADED: "singleFileUploaded",
  VALIDATE_FAILED: "validateFailed",
};

// Settings panel (opened from the "Config" menu command).
const gmc = new GM_config({
  id: "CrawlerConfig",
  title: "Crawler setting",
  fields: {
    Name: {
      label: "Name",
      type: "text",
    },
    Password: {
      label: "Password",
      type: "text",
    },
    taskInterval: {
      label: "Task Interval (s)",
      type: "int",
      default: NEXT_TASK_WAITING_TIME,
    },
    taskMaxRetryTimes: {
      label: "Task Max Retry Times",
      type: "int",
      default: TASK_MAX_RETRY_TIMES,
    },
    preferServer: {
      label: "Prefer preSign Server",
      type: "text",
    },
    reportServer: {
      label: "Report Server",
      type: "text",
      default: REPORT_ADDRESS,
    },
  },
  events: {
    init() {
      // runs after initialization completes
    },
    save() {
      // runs after values are saved
      // The password is intentionally NOT logged.
      console.log("save", this.get("Name"));
      this.close();
    },
  },
});

/**
 * Helpers shared by the crawler.
 *
 * The members are destructured by the main function, so they must not rely on
 * `this`; members that call each other go through `crawlerUtil.<name>`.
 */
const crawlerUtil = {
  /**
   * Append a `<script src>` element to the page, retrying every 2 s on load error.
   * @param {string} url - Script URL.
   */
  addScript: (url) => {
    const s = document.createElement("script");
    s.src = url;
    s.onerror = () => {
      // Drop the failed element so retries do not pile up dead nodes.
      s.remove();
      setTimeout(() => {
        crawlerUtil.addScript(url);
      }, 2000);
    };
    document.body.append(s);
  },

  /**
   * Download a script with GM.xmlHttpRequest and inject it as an inline `<script>`
   * tagged with `data-crawler="true"`.
   * @param {string} url - Script URL.
   * @param {boolean} [cache=false] - Reuse / store the text in the "scriptCache" GM value.
   * @param {number} [retry=0] - Current retry count (internal).
   * @returns {Promise<boolean>} true when injected, false once retries are exhausted.
   */
  addScriptByText: async (url, cache = false, retry = 0) => {
    const s = document.createElement("script");
    s.dataset.crawler = "true";
    const scriptCache = (await GM.getValue("scriptCache")) || {};
    if (cache && scriptCache[url]) {
      s.innerHTML = scriptCache[url];
      document.body.append(s);
      return true;
    }
    try {
      const res = await GM.xmlHttpRequest({
        url: url,
        method: "GET",
      });
      // An HTTP error page must not be injected or cached as if it were the script.
      if (res.status < 200 || res.status >= 300) {
        throw new Error(`Unexpected status ${res.status} for ${url}`);
      }
      const text = res.responseText;
      if (cache) {
        scriptCache[url] = text;
        // Best-effort cache write; not awaited so it cannot delay injection.
        GM.setValue("scriptCache", scriptCache);
      }
      s.innerHTML = text;
      document.body.append(s);
      return true;
    } catch (error) {
      if (retry > 3) {
        return false;
      }
      await crawlerUtil.sleep(2);
      return await crawlerUtil.addScriptByText(url, cache, retry + 1);
    }
  },

  /**
   * Ask a presign server for an upload URL, falling back through the server list.
   * @param {string} doi - Sanitized DOI used as the object prefix.
   * @param {string} fileName - Object file name.
   * @param {string} name - Account name.
   * @param {string} pass - Account password.
   * @param {string} [preferServer=""] - Optional server origin tried first (must match DOMAIN_REG).
   * @returns {Promise<string|null>} The URL, `null` when the response carries no `url`,
   *   or the string "RELOAD" when every server failed.
   */
  getPreSignUrl: async (doi, fileName, name, pass, preferServer = "") => {
    const configServer = DOMAIN_REG.test(preferServer) ? [preferServer] : [];
    const preSignServers = [
      ...configServer,
      "https://electrolyte-brain-minio.deno.dev",
    ];
    // Encode every value so characters like "&" or "#" cannot corrupt the query string.
    const query =
      `doi=${encodeURIComponent(doi)}` +
      `&file_name=${encodeURIComponent(fileName)}` +
      `&account=${encodeURIComponent(name)}` +
      `&pass=${encodeURIComponent(pass)}`;
    for (const server of preSignServers) {
      try {
        const response = await GM_fetch(
          `${server}/api/presignedPutObject?${query}`
        );
        const preSignRes = await response.json();
        if (preSignRes?.reload) {
          return "RELOAD";
        }
        return preSignRes?.url || null;
      } catch (error) {
        // Request or JSON parsing failed: fall through to the next server.
        console.warn(error);
      }
    }
    return "RELOAD";
  },

  /**
   * Gzip `content` (via the page-injected `pako`) and PUT it to `url`.
   * @param {string} url - Presigned upload URL.
   * @param {string} content - Payload to compress and upload.
   * @returns {Promise<object>} The GM.xmlHttpRequest response.
   */
  uploader: async (url, content) => {
    const mime = "application/gzip";
    const gzipData = pako.gzip(content, { level: 9 });
    const uploadBlob = new Blob([gzipData], { type: mime });
    return await GM.xmlHttpRequest({
      method: "PUT",
      url,
      headers: {
        "Content-Type": mime,
        "Content-Length": uploadBlob.size,
      },
      data: uploadBlob,
    });
  },

  /**
   * Trigger a browser download of `data` under `fileName`.
   * @param {string|BlobPart} data - File content.
   * @param {string} fileName - Suggested file name.
   */
  downloadFile: (data, fileName) => {
    const a = document.createElement("a");
    document.body.appendChild(a);
    a.style = "display: none";
    const blob = new Blob([data], {
      type: "application/octet-stream",
    });
    const url = window.URL.createObjectURL(blob);
    a.href = url;
    a.download = fileName;
    a.click();
    window.URL.revokeObjectURL(url);
    a.remove();
  },

  /** @returns {string} A short random base-32 identifier. */
  generateClientId: () => (1e6 * Math.random()).toString(32).replace(".", ""),

  /**
   * @param {number} duration - Seconds to wait.
   * @returns {Promise<void>}
   */
  sleep: (duration) => {
    return new Promise((res) => {
      setTimeout(() => res(), duration * 1000);
    });
  },
};

// main function
(function () {
  "use strict";

  const {
    addScriptByText,
    generateClientId,
    uploader,
    downloadFile,
    getPreSignUrl,
    sleep,
  } = crawlerUtil;

  const printStyle = "color: blue;background-color: #ccc;font-size: 20px";

  /**
   * Inject every runtime dependency into the page.
   * @returns {Promise<Function>} Cleanup callback that removes the injected `<script>` elements.
   */
  const dependenciesInit = async () => {
    await addScriptByText(
      "https://cdn.jsdelivr.net/gh/gildas-lormeau/SingleFile-MV3/lib/single-file-bootstrap.js",
      true
    );
    await addScriptByText(
      "https://cdn.jsdelivr.net/gh/gildas-lormeau/SingleFile-MV3/lib/single-file-hooks-frames.js",
      true
    );
    await addScriptByText(
      "https://cdn.jsdelivr.net/gh/gildas-lormeau/SingleFile-MV3/lib/single-file-frames.js",
      true
    );
    await addScriptByText(
      "https://cdn.jsdelivr.net/gh/gildas-lormeau/SingleFile-MV3/lib/single-file.js",
      true
    );
    await addScriptByText(
      "https://cdn.jsdelivr.net/gh/IKKEM-Lin/crawler-base-on-singlefile/config.js"
    );
    await addScriptByText("https://crawal-validator.deno.dev/");
    // NOTE(review): the published listing shows this URL with its "pako@<version>"
    // segment mangled to "[email protected]" by e-mail obfuscation, which cannot load.
    // 2.1.0 is assumed here - confirm the intended version pin.
    await addScriptByText(
      "https://cdn.jsdelivr.net/npm/pako@2.1.0/dist/pako.min.js"
    );
    return () => {
      document
        .querySelectorAll("script[data-crawler='true']")
        .forEach((el) => el.remove());
    };
  };

  /**
   * Strip every `<script>` and `<style>` element from a document (in place).
   * @param {Document} doc - Document to clean.
   */
  const pureHTMLCleaner = (doc) => {
    doc.querySelectorAll("script").forEach((el) => el.remove());
    doc.querySelectorAll("style").forEach((el) => el.remove());
  };

  // Overwrite fetch function to bypass CORS
  window.unsafeWindow.fetch = async (...args) => {
    return await fetch(...args).catch(async () => {
      return await GM_fetch(...args);
    });
  };

  /**
   * Log a warning, wait, then reload the page.
   * @param {number} [waiting=60] - Seconds to wait before reloading.
   * @param {string} [message=""] - Reason shown in the console.
   */
  async function reload(waiting = 60, message = "") {
    console.warn(`%c${message}, reload ${waiting}s later`, printStyle);
    await sleep(waiting);
    location.reload();
  }

  /**
   * Open a file picker.
   * @param {string} [accept=""] - Value for the input's `accept` attribute.
   * @param {boolean} [multiple=false] - Allow selecting several files.
   * @returns {Promise<File|FileList>} The chosen file (or FileList when `multiple`);
   *   rejects when the next window click happens with nothing selected.
   */
  function readFile(accept = "", multiple = false) {
    const inputEl = document.createElement("input");
    inputEl.type = "file";
    inputEl.accept = accept;
    // `multiple` is a boolean attribute: setAttribute("multiple", false) would still enable it.
    inputEl.multiple = Boolean(multiple);
    inputEl.style.display = "none";
    return new Promise((resolve, reject) => {
      const cleanup = () => {
        window.removeEventListener("click", onWindowClick, true);
        inputEl.remove();
      };
      const onWindowClick = () => {
        if (!inputEl.value) {
          reject(new Error("用户取消选择"));
        }
        cleanup();
      };
      inputEl.addEventListener("change", () => {
        resolve(multiple ? inputEl.files : inputEl.files[0]);
        cleanup();
      });
      document.body.append(inputEl);
      inputEl.click();
      // Delay so the click that opened the picker is not treated as a cancel.
      setTimeout(() => {
        window.addEventListener("click", onWindowClick, true);
      }, 100);
    });
  }

  /**
   * Add the floating "Import" button, which replaces the stored task list with an
   * uploaded JSON array of `{doi, validator, ...}` objects and reloads.
   * @returns {Function} Callback that removes the button from the page.
   */
  function AddImportBtn() {
    const btnWrapImport = document.createElement("div");
    btnWrapImport.id = "CRAWLER_ID";
    btnWrapImport.innerHTML = `<button style="padding: 4px 8px;position: fixed;bottom: 40%;right: 8px;border-radius: 4px;background-color: #224466;color: #fff;">Import</button>`;
    const importBtn = btnWrapImport.querySelector("button");
    importBtn.onclick = async () => {
      if (
        !window.confirm(
          "The data in browser will be clear up. Please make sure you have to do this !!!"
        )
      ) {
        return;
      }
      let file;
      try {
        file = await readFile(".json");
      } catch (err) {
        // User dismissed the picker: nothing to import.
        console.warn(err);
        return;
      }
      const reader = new FileReader();
      reader.onload = async (event) => {
        let json = null;
        try {
          json = JSON.parse(event.target.result);
        } catch (err) {
          console.warn(err);
        }
        if (
          Array.isArray(json) &&
          json.every((item) => item && item.doi && item.validator)
        ) {
          // Persist before reloading, otherwise the write may be dropped.
          await GM.setValue("tasks", json);
          location.reload();
        } else {
          alert(
            "Please upload json file like [{doi: string, validator: string, ...}]"
          );
        }
      };
      reader.readAsText(file);
    };
    document.body.appendChild(btnWrapImport);
    return () => {
      const wrapper = document.getElementById("CRAWLER_ID");
      if (wrapper) {
        wrapper.remove();
      }
    };
  }

  /**
   * @returns {object} The page-provided `validators` table, or `{}` when it is not loaded.
   */
  const getValidators = () =>
    typeof validators === "undefined" || !validators ? {} : validators;

  /**
   * A task is waiting when it is neither downloaded nor validated and its
   * validator is known.
   * @param {object} task
   * @returns {boolean}
   */
  const isWaitingTask = (task) =>
    !task.downloaded &&
    task.validated === undefined &&
    Boolean(getValidators()[task.validator]);

  GM_registerMenuCommand("Download", async () => {
    const taskData = (await GM.getValue("tasks")) || [];
    const waitingTasks = taskData.filter(isWaitingTask);
    const now = new Date();
    downloadFile(
      JSON.stringify(taskData),
      `${now.getFullYear()}-${
        now.getMonth() + 1
      }-${now.getDate()}-${now.getHours()}${now.getMinutes()}${now.getSeconds()}-${
        taskData.length
      }-${taskData.length - waitingTasks.length}.json`
    );
  });

  GM_registerMenuCommand("Config", async () => {
    gmc.open();
  });

  /**
   * Wait the configured interval and navigate to the next task, or schedule a
   * reload when there is none.
   * @param {string} [nextDoi] - URL of the next task.
   */
  const prepareNextTask = async (nextDoi) => {
    const taskInterval = gmc.get("taskInterval") || NEXT_TASK_WAITING_TIME;
    if (!nextDoi) {
      await reload(NO_TASK_WAITING_TIME, "No tasks waiting");
      return;
    }
    console.log(
      `%cStart next task ${taskInterval}s later...`,
      printStyle,
      nextDoi
    );
    await sleep(taskInterval);
    const taskData = (await GM.getValue("tasks")) || [];
    const task = taskData.find((item) => item.doi === nextDoi);
    if (task) {
      await saveTaskTimepoint(TIME_POINT_TYPES.PREPARE_START, task, taskData);
    }
    location.href = nextDoi;
  };

  // Timestamp (ms) of the previous recorded time point; 0 means "not known yet".
  let latestTimepoint = 0;

  /**
   * Record a timing mark on `task` and persist the task list.
   * PREPARE_START stores an absolute timestamp; every other point stores the
   * milliseconds elapsed since the previous point (0 when no start is known).
   * @param {string} pointName - One of TIME_POINT_TYPES.
   * @param {object} task - Task to annotate (mutated).
   * @param {object[]} taskData - Full task list, written to the "tasks" GM value.
   */
  const saveTaskTimepoint = async (pointName, task, taskData) => {
    const key = `timePoint_${pointName}`;
    if (pointName === TIME_POINT_TYPES.PREPARE_START) {
      task[key] = new Date().valueOf();
    } else {
      if (latestTimepoint === 0) {
        latestTimepoint =
          task[`timePoint_${TIME_POINT_TYPES.PREPARE_START}`] || 0;
      }
      task[key] =
        latestTimepoint === 0 ? 0 : new Date().valueOf() - latestTimepoint;
      latestTimepoint = new Date().valueOf();
    }
    await GM.setValue("tasks", taskData);
  };

  /**
   * Account for one retry of `task`.
   * @param {object} task - Current task (mutated).
   * @param {object[]} taskData - Full task list, persisted.
   * @param {string} [nextDoi] - Task to move on to when retries are exhausted.
   * @returns {Promise<boolean>} true when the caller may retry; false when the task
   *   was given up (marked invalid) and the next task has already been prepared.
   */
  const checkRetry = async (task, taskData, nextDoi) => {
    const taskMaxRetryTimes =
      gmc.get("taskMaxRetryTimes") || TASK_MAX_RETRY_TIMES;
    const retryTimes = task.retryTimes || 0;
    if (retryTimes >= taskMaxRetryTimes) {
      console.log(
        `%cTask have been retry ${taskMaxRetryTimes} times! ${task.doi}`,
        printStyle
      );
      task.validated = false;
      task.updateTime = new Date().valueOf();
      // Persist BEFORE moving on: prepareNextTask re-reads storage and then
      // navigates away, so a later write could be lost.
      await GM.setValue("tasks", taskData);
      await prepareNextTask(nextDoi);
      return false;
    }
    task.retryTimes = retryTimes + 1;
    await GM.setValue("tasks", taskData);
    return true;
  };

  /**
   * Entry point: load dependencies, pick the first waiting task, report
   * progress, validate the current page, upload the captures, then move on.
   */
  async function start() {
    console.log(new Date());
    const importBtnHandler = AddImportBtn();
    let clientId = await GM.getValue("clientId");
    if (typeof clientId !== "string" || !clientId) {
      clientId = generateClientId();
      await GM.setValue("clientId", clientId);
    }

    // ---------------------------- Script dependencies handler -----------------------------------------------------
    const dependenciesHandler = await dependenciesInit();
    // `typeof` guards: a dependency that failed to load is an undeclared name,
    // and touching it directly would throw instead of reaching the reload.
    if (
      typeof singlefile === "undefined" ||
      !singlefile ||
      !singlefile.getPageData
    ) {
      await reload(ERROR_RELOAD_TIME, `singlefile error! ${location.href}`);
      return;
    }
    if (
      typeof validators === "undefined" ||
      typeof DEFAULT_CONFIG === "undefined" ||
      !(validators && DEFAULT_CONFIG)
    ) {
      await reload(
        ERROR_RELOAD_TIME,
        "Can not get validators or DEFAULT_CONFIG"
      );
      return;
    }

    // ---------------------------- Get Task -----------------------------------------------------
    const tasks = (await GM.getValue("tasks")) || [];
    // find task which not downloaded and not validated before
    const waitingTasks = tasks.filter(isWaitingTask);
    console.log(
      `%cTry to get tasks firstly(${waitingTasks.length} / ${tasks.length}):`,
      printStyle,
      tasks
    );
    if (!waitingTasks.length) {
      await reload(NO_TASK_WAITING_TIME, "No tasks waiting");
      return;
    }

    const invalidatedTasks = tasks.filter((task) => task.validated === false);
    // Most recently updated first.
    const doneTasks = tasks
      .filter((task) => task.downloaded)
      .sort((a, b) => (b.updateTime || 0) - (a.updateTime || 0));
    const previousDay = new Date().valueOf() - 24 * 3600 * 1000;
    const last24hDoneTasks = doneTasks.filter(
      (task) => task.updateTime > previousDay
    );
    const lastDoneTime = new Date(doneTasks[0]?.updateTime);
    const lastDoneText = Number.isNaN(lastDoneTime.valueOf())
      ? "N/A"
      : lastDoneTime.toLocaleString();

    const currentTask = waitingTasks[0];
    const nextTask = waitingTasks[1] || {};
    await saveTaskTimepoint(TIME_POINT_TYPES.TASK_LOADED, currentTask, tasks);

    /** Mark the current task's validation result and persist the list. */
    const updateCurrentTask = async (isSuccess) => {
      currentTask.validated = isSuccess;
      currentTask.updateTime = new Date().valueOf();
      await GM.setValue("tasks", tasks);
    };

    /** Shared handling for "object already uploaded": mark invalid and move on. */
    const skipExistingTask = async () => {
      console.error("%cFile existed!!!", printStyle, currentTask.doi);
      await updateCurrentTask(false);
      await prepareNextTask(nextTask.doi);
    };

    // ---------------------------- Report progress -----------------------------------------------------
    const reportUrl = gmc.get("reportServer") || REPORT_ADDRESS;
    const reportTip = `Last download time: ${lastDoneText} Speed: ${last24hDoneTasks.length} / last 24h`;
    // Fire-and-forget: reporting must never block or break the crawl.
    GM.xmlHttpRequest({
      url: reportUrl,
      method: "POST",
      headers: { "Content-Type": "application/json" },
      data: JSON.stringify({
        account: clientId,
        invalidate_count: invalidatedTasks.length,
        done_count: doneTasks.length,
        queue_count: waitingTasks.length,
        tip: reportTip,
      }),
    })
      .then((res) => {
        console.log("Report successfully", { res });
      })
      .catch((err) => {
        console.warn(err);
      })
      .finally(() => {
        saveTaskTimepoint(TIME_POINT_TYPES.TASK_REPORTED, currentTask, tasks);
      });

    // -------------------------- Detect Cloudflare challenge -------------------------------------------------------
    await sleep(PAGE_LOADING_TIME);
    if (document.getElementById("challenge-form")) {
      console.log(`%cCloudflare challenge! ${currentTask.doi}`, printStyle);
      await sleep(CF_CHALLENGE_WAITING_TIME);
      currentTask.cloudflareBlock = true;
      await updateCurrentTask(false);
      await prepareNextTask(nextTask.doi);
      return;
    }

    // bypass els institution check
    const elsCloseBtn = document.querySelector('.sec-A #bdd-els-close');
    if (elsCloseBtn) {
      elsCloseBtn.click();
    }

    // ---------------------------- validated task ------------------------------------------------
    const doi = currentTask.doi.replace("https://doi.org/", "").toLowerCase();
    const doiFixed = doi.replaceAll("/", "_");

    /**
     * The page is valid when at least one "sel_A" selector and at least one
     * "sel_P" selector match; an empty list counts as satisfied, but both lists
     * empty means invalid.
     */
    const validator = (doc) => {
      const absSelectors = validators[currentTask.validator]["sel_A"];
      const paraSelectors = validators[currentTask.validator]["sel_P"];
      if (absSelectors.length === 0 && paraSelectors.length === 0) {
        return false;
      }
      const absValidated =
        absSelectors.length === 0 ||
        absSelectors.some((selector) => doc.querySelector(selector));
      const paraValidated =
        paraSelectors.length === 0 ||
        paraSelectors.some(
          (selector) => doc.querySelectorAll(selector).length > 0
        );
      return absValidated && paraValidated;
    };

    let name = "";
    let pass = "";
    let preferServer = "";
    try {
      name = gmc.get("Name");
      pass = gmc.get("Password");
      preferServer = gmc.get("preferServer");
      if (!name || !pass) {
        throw new Error();
      }
    } catch (err) {
      console.error(
        `%cMiss name or password. Please input in config panel.`,
        printStyle
      );
      return;
    }

    const indexUrl = await getPreSignUrl(
      doiFixed,
      `_.html.gz`,
      name,
      pass,
      preferServer
    );
    await saveTaskTimepoint(TIME_POINT_TYPES.PRESIGN_INDEX, currentTask, tasks);
    const singlefileUrl = await getPreSignUrl(
      doiFixed,
      `_.sf.html.gz`,
      name,
      pass,
      preferServer
    );
    await saveTaskTimepoint(
      TIME_POINT_TYPES.PRESIGN_SINGLEFILE,
      currentTask,
      tasks
    );
    if (indexUrl === "RELOAD" || singlefileUrl === "RELOAD") {
      await reload(
        ERROR_RELOAD_LONG_TIME,
        "Minio PreSignUrl error, please check url or account"
      );
      return;
    }
    if (!indexUrl && !singlefileUrl) {
      await skipExistingTask();
      return;
    }
    // Also probe the legacy (un-gzipped) object names.
    const oldIndexUrl = await getPreSignUrl(
      doiFixed,
      `_.html`,
      name,
      pass,
      preferServer
    );
    const oldSinglefileUrl = await getPreSignUrl(
      doiFixed,
      `_.sf.html`,
      name,
      pass,
      preferServer
    );
    if (!oldIndexUrl && !oldSinglefileUrl) {
      await skipExistingTask();
      return;
    }

    // --------------------------- Page validate ------------------------------------------------------
    if (!document.body.textContent.toLowerCase().includes(doi)) {
      console.log(
        `%cURL not match, will redirect to ${currentTask.doi} 5s later`,
        printStyle
      );
      await sleep(QUICK_SLEEP_TIME);
      if (await checkRetry(currentTask, tasks, nextTask.doi)) {
        location.href = currentTask.doi;
      }
      return;
    }

    if (validator(document)) {
      console.log(
        "%cValidate successfully! Downloading page...",
        printStyle,
        waitingTasks,
        tasks
      );
      importBtnHandler();
      // repair special page
      if (
        typeof documentFixer !== "undefined" &&
        documentFixer &&
        typeof documentFixer[currentTask.validator] === "function"
      ) {
        documentFixer[currentTask.validator](document);
      }
      try {
        const data = await singlefile.getPageData(DEFAULT_CONFIG);
        await saveTaskTimepoint(
          TIME_POINT_TYPES.SINGLE_FILE_SUCCESS,
          currentTask,
          tasks
        );
        if (singlefileUrl) {
          await uploader(singlefileUrl, data.content);
          await saveTaskTimepoint(
            TIME_POINT_TYPES.SINGLE_FILE_UPLOADED,
            currentTask,
            tasks
          );
        }
        if (indexUrl) {
          // Remove injected scripts, then all scripts/styles, before capturing raw HTML.
          dependenciesHandler();
          pureHTMLCleaner(document);
          await uploader(indexUrl, document.body.parentElement.outerHTML);
          await saveTaskTimepoint(
            TIME_POINT_TYPES.INDEX_FILE_UPLOADED,
            currentTask,
            tasks
          );
        }
        console.log("%cUpload successfully!", printStyle);
        currentTask.downloaded = true;
        await updateCurrentTask(true);
      } catch (error) {
        console.error(error);
        if (await checkRetry(currentTask, tasks, nextTask.doi)) {
          await reload(
            ERROR_RELOAD_TIME,
            `singlefile or upload error! ${currentTask.doi}`
          );
        }
        return;
      }
    } else {
      console.log(`%cValidate failed! ${currentTask.doi}`, printStyle);
      await saveTaskTimepoint(
        TIME_POINT_TYPES.VALIDATE_FAILED,
        currentTask,
        tasks
      );
      await updateCurrentTask(false);
    }

    // --------------------------- Prepare next task ------------------------------------------------------
    await prepareNextTask(nextTask.doi);
  }

  start().catch((err) => {
    console.error(err);
  });
})();