您需要先安装一个扩展,例如 篡改猴、Greasemonkey 或 暴力猴,之后才能安装此脚本。
您需要先安装一个扩展,例如 篡改猴 或 暴力猴,之后才能安装此脚本。
您需要先安装一个扩展,例如 篡改猴 或 暴力猴,之后才能安装此脚本。
您需要先安装一个扩展,例如 篡改猴 或 Userscripts ,之后才能安装此脚本。
您需要先安装一款用户脚本管理器扩展,例如 Tampermonkey,才能安装此脚本。
您需要先安装用户脚本管理器扩展后才能安装此脚本。
Automatically download the current site as a single file
当前为
// ==UserScript==
// @name         Crawler base on SingleFile
// @author       Mark
// @description  Download site in single file automatically
// @license      MIT
// @version      0.0.5
// @match        https://*/*
// @run-at       document-idle
// @grant        GM.setValue
// @grant        GM.getValue
// @grant        GM.xmlHttpRequest
// @grant        GM_registerMenuCommand
// @grant        unsafeWindow
// @connect      crawler-hit.deno.dev
// @noframes
// @namespace    https://greasyfork.org/users/1106595
// ==/UserScript==

// Configuration object handed to `singlefile.getPageData()`.
const BACKGROUND_SAVE_SUPPORTED = !/Mobile.*Firefox/.test(navigator.userAgent);
const DEFAULT_CONFIG = {
  removeHiddenElements: true,
  removeUnusedStyles: true,
  removeUnusedFonts: true,
  removeFrames: false,
  compressHTML: true,
  compressCSS: false,
  loadDeferredImages: true,
  loadDeferredImagesMaxIdleTime: 1500,
  loadDeferredImagesBlockCookies: false,
  loadDeferredImagesBlockStorage: false,
  loadDeferredImagesKeepZoomLevel: false,
  loadDeferredImagesDispatchScrollEvent: false,
  loadDeferredImagesBeforeFrames: false,
  filenameTemplate:
    "%if-empty<{page-title}|No title> ({date-locale} {time-locale}).{filename-extension}",
  infobarTemplate: "",
  includeInfobar: false,
  confirmInfobarContent: false,
  autoClose: false,
  confirmFilename: false,
  filenameConflictAction: "uniquify",
  filenameMaxLength: 192,
  filenameMaxLengthUnit: "bytes",
  filenameReplacedCharacters: [
    "~",
    "+",
    "\\\\",
    "?",
    "%",
    "*",
    ":",
    "|",
    '"',
    "<",
    ">",
    "\x00-\x1f",
    "\x7F",
  ],
  filenameReplacementCharacter: "_",
  replaceEmojisInFilename: false,
  saveFilenameTemplateData: false,
  contextMenuEnabled: true,
  tabMenuEnabled: true,
  browserActionMenuEnabled: true,
  shadowEnabled: true,
  logsEnabled: true,
  progressBarEnabled: true,
  maxResourceSizeEnabled: false,
  maxResourceSize: 10,
  displayInfobar: true,
  displayStats: false,
  backgroundSave: BACKGROUND_SAVE_SUPPORTED,
  defaultEditorMode: "normal",
  applySystemTheme: true,
  autoSaveDelay: 1,
  autoSaveLoad: false,
  autoSaveUnload: false,
  autoSaveLoadOrUnload: true,
  autoSaveDiscard: false,
  autoSaveRemove: false,
  autoSaveRepeat: false,
  autoSaveRepeatDelay: 10,
  removeAlternativeFonts: true,
  removeAlternativeMedias: true,
  removeAlternativeImages: true,
  groupDuplicateImages: true,
  maxSizeDuplicateImages: 512 * 1024,
  saveRawPage: false,
  saveToClipboard: false,
  addProof: false,
  saveToGDrive: false,
  saveToDropbox: false,
  saveWithWebDAV: false,
  webDAVURL: "",
  webDAVUser: "",
  webDAVPassword: "",
  saveToGitHub: false,
  githubToken: "",
  githubUser: "",
  githubRepository: "SingleFile-Archives",
  githubBranch: "main",
  saveWithCompanion: false,
  forceWebAuthFlow: false,
  resolveFragmentIdentifierURLs: false,
  userScriptEnabled: false,
  openEditor: false,
  openSavedPage: false,
  autoOpenEditor: false,
  saveCreatedBookmarks: false,
  allowedBookmarkFolders: [],
  ignoredBookmarkFolders: [],
  replaceBookmarkURL: true,
  saveFavicon: true,
  includeBOM: false,
  warnUnsavedPage: true,
  displayInfobarInEditor: false,
  compressContent: false,
  createRootDirectory: false,
  selfExtractingArchive: true,
  extractDataFromPage: true,
  preventAppendedData: false,
  insertTextBody: false,
  autoSaveExternalSave: false,
  insertMetaNoIndex: false,
  insertMetaCSP: true,
  passReferrerOnError: false,
  password: "",
  insertSingleFileComment: true,
  removeSavedDate: false,
  blockMixedContent: false,
  saveOriginalURLs: false,
  acceptHeaders: {
    font: "application/font-woff2;q=1.0,application/font-woff;q=0.9,*/*;q=0.8",
    image: "image/avif,image/webp,image/apng,image/svg+xml,image/*,*/*;q=0.8",
    stylesheet: "text/css,*/*;q=0.1",
    script: "*/*",
    document: "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    video:
      "video/webm,video/ogg,video/*;q=0.9,application/ogg;q=0.7,audio/*;q=0.6,*/*;q=0.5",
    audio:
      "audio/webm,audio/ogg,audio/wav,audio/*;q=0.9,application/ogg;q=0.7,video/*;q=0.6,*/*;q=0.5",
  },
  moveStylesInHead: false,
  networkTimeout: 0,
  woleetKey: "",
  blockImages: false,
  blockStylesheets: false,
  blockFonts: false,
  blockScripts: true,
  blockVideos: true,
  blockAudios: true,
  _migratedTemplateFormat: true,
};

/**
 * Shared page check for validators 1088 and 1149: the page must contain
 * abstract paragraphs and article-body paragraphs (up to three nested
 * `.article-text` wrappers deep).
 *
 * @param {Document} doc - Document to inspect.
 * @returns {boolean} Whether both the abstract and the body text are present.
 */
const hasAbstractAndArticleBody = (doc) =>
  doc.querySelectorAll(".wd-jnl-art-abstract > p").length > 0 &&
  doc.querySelectorAll(`:where(
      div[itemprop="articleBody"] > p,
      div[itemprop="articleBody"] > .article-text > p,
      div[itemprop="articleBody"] > .article-text > .article-text > p,
      div[itemprop="articleBody"] > .article-text > .article-text > .article-text > p)
    `).length > 0;

// Page validators, one per press. The key is the task's `validator` value;
// each function returns a truthy value when the document contains both an
// abstract and full-text paragraphs, i.e. the page is worth downloading.
const validators = {
  1002: (doc) =>
    doc.querySelector(
      ".article__body .abstract-group .article-section__abstract .article-section__content"
    ) &&
    doc.querySelectorAll(
      ".article__body .article-section__full :where(.article-section__content > p, .article-section__sub-content > p)"
    ).length > 0,
  1016: (doc) =>
    (doc.querySelector("div.abstract.author > div") ||
      doc.querySelector('[data-left-hand-nav="Summary"]')) &&
    (doc.querySelectorAll(
      "div#body > div:first-child > section[id^=s] p[id^=p]"
    ).length > 0 ||
      doc.querySelectorAll(
        "div#body > div:first-child :where(section[id^=aep-section] > p, section[id^=aep-section] div > p)"
      ).length > 0 ||
      doc.querySelectorAll("[id^='sec'] .section-paragraph").length > 0),
  3390: (doc) =>
    doc.querySelector("#html-abstract .html-p") &&
    doc.querySelectorAll("article .html-body .html-p").length > 0,
  1039: (doc) =>
    doc.querySelector("article .capsule__text") &&
    doc.querySelectorAll("#pnlArticleContentLoaded > p").length > 0,
  1021: (doc) =>
    doc.querySelector("p.articleBody_abstractText") &&
    (doc.querySelectorAll("div.NLM_p").length > 0 ||
      doc.querySelectorAll(".article_content-left > p").length > 0),
  1038: (doc) =>
    doc.querySelector("#Abs1-content") &&
    doc.querySelectorAll(
      "article .main-content .c-article-section__content > p"
    ).length > 0,
  1007: (doc) =>
    doc.querySelectorAll("#Abs1-content p").length > 0 &&
    doc.querySelectorAll(".main-content .c-article-section__content > p")
      .length > 0,
  1088: hasAbstractAndArticleBody,
  1063: (doc) =>
    doc.querySelectorAll("#ContentTab .abstract p").length > 0 &&
    doc.querySelectorAll("#ContentTab .article-section-wrapper > p").length >
      0,
  1149: hasAbstractAndArticleBody,
};

/**
 * Copies `data-src` into `src` for every figure image under `<main>` whose
 * `data-src` starts with "http", so the images carry a real URL before the
 * page is captured.
 *
 * @param {Document} doc - Document to patch in place.
 */
const applyFigureDataSrc = (doc) => {
  doc.querySelectorAll('main figure img[data-src^="http"]').forEach((img) => {
    img.src = img.dataset.src;
  });
};

// Optional per-validator DOM repairs that run right before the page is saved.
const documentFixer = {
  1088: applyFigureDataSrc,
  1149: applyFigureDataSrc,
};

/**
 * Injects an external script into the page by appending a `<script>` tag to
 * `<body>`.
 *
 * @param {string} url - Script URL.
 */
const addScript = (url) => {
  const s = document.createElement("script");
  s.src = url;
  document.body.append(s);
};

/**
 * Builds a short pseudo-random identifier for this browser profile.
 * Not cryptographically secure; it only labels progress reports.
 *
 * @returns {string} Base-32 string without the decimal point.
 */
const generateClientId = () =>
  (1e6 * Math.random()).toString(32).replace(".", "");

// main function
(function () {
  "use strict";

  // Declared first so every helper below can log with it.
  const printStyle = "color: blue;background-color: #ccc;font-size: 20px";
  const CORS_PROXY = "https://fetch-url2.deno.dev";
  const IMPORT_FORMAT_HINT =
    "Please upload json file like [{doi: string, validator: string, ...}]";

  addScript(
    "https://cdn.jsdelivr.net/gh/gildas-lormeau/SingleFile-MV3/lib/single-file-bootstrap.js"
  );
  addScript(
    "https://cdn.jsdelivr.net/gh/gildas-lormeau/SingleFile-MV3/lib/single-file-hooks-frames.js"
  );
  addScript(
    "https://cdn.jsdelivr.net/gh/gildas-lormeau/SingleFile-MV3/lib/single-file-frames.js"
  );

  // Overwrite fetch function to bypass CORS
  /** The "fetch-url2.deno.dev" code as follow
   *
   * serve((req: Request) => handleRequest(req));
   *
   * async function handleRequest(req: Request) {
   *   const url = req.url;
   *   const finalUrl = url && url.split("?url=")[1];
   *   if (!finalUrl) {
   *     return new Response(url + " no match '?url='");
   *   }
   *   const res = await fetch(finalUrl);
   *   return new Response(res.body, {
   *     headers: {
   *       ...res.headers,
   *       "Access-Control-Allow-Origin": "*",
   *       "Access-Control-Expose-Headers":
   *         "Request-Context,api-supported-versions,Content-Length,Date,Server",
   *     },
   *   });
   * }
   **/
  unsafeWindow.fetch = async (...args) => {
    // Calls without an init argument are passed through untouched.
    if (args.length <= 1) {
      return await fetch(...args);
    }
    const [input, ...otherArgs] = args;
    try {
      return await fetch(...args);
    } catch (err) {
      // Only string / URL inputs can be re-routed through the proxy. A Request
      // object carries its own method/body, so the original error is rethrown.
      const rawUrl =
        typeof input === "string"
          ? input
          : input instanceof URL
          ? input.href
          : null;
      // Never proxy the proxy itself; rethrow instead of resolving to
      // `undefined`, which would crash the caller at an unrelated place.
      if (rawUrl === null || rawUrl.startsWith(CORS_PROXY)) {
        throw err;
      }
      // Resolve relative and protocol-relative URLs against the current page.
      // The URL is appended verbatim because the proxy takes everything after
      // "?url=" as-is (it does not decode it).
      const absoluteUrl = new URL(rawUrl.trim(), location.href).href;
      return await fetch(`${CORS_PROXY}?url=${absoluteUrl}`, ...otherArgs);
    }
  };

  /**
   * Triggers a browser download of `data` under the given file name.
   * The temporary anchor is removed again so it cannot end up in a page
   * snapshot taken afterwards.
   *
   * @param {BlobPart} data - Content to save.
   * @param {string} fileName - Suggested file name.
   */
  const downloadFile = (data, fileName) => {
    const blob = new Blob([data], {
      type: "application/octet-stream",
    });
    const url = window.URL.createObjectURL(blob);
    const a = document.createElement("a");
    a.style.display = "none";
    a.href = url;
    a.download = fileName;
    document.body.appendChild(a);
    try {
      a.click();
    } finally {
      a.remove();
      window.URL.revokeObjectURL(url);
    }
  };

  /**
   * Resolves after `duration` seconds.
   *
   * @param {number} duration - Delay in seconds.
   * @returns {Promise<void>}
   */
  const sleep = (duration) =>
    new Promise((resolve) => {
      setTimeout(resolve, duration * 1000);
    });

  /**
   * Logs `message`, waits `waiting` seconds, then reloads the page.
   *
   * @param {number} [waiting=60] - Delay in seconds before reloading.
   * @param {string} [message=""] - Reason shown in the console.
   */
  async function reload(waiting = 60, message = "") {
    console.warn(`%c${message}, reload ${waiting}s later`, printStyle);
    await sleep(waiting);
    location.reload();
  }

  /**
   * Opens the native file picker.
   *
   * @param {string} [accept=""] - Value for the input's `accept` attribute.
   * @param {boolean} [multiple=false] - Allow selecting several files.
   * @returns {Promise<File|FileList>} The chosen file, or the FileList when
   *   `multiple` is true. Rejects when the next click on the window arrives
   *   while nothing has been chosen (treated as a cancel).
   */
  function readFile(accept = "", multiple = false) {
    const inputEl = document.createElement("input");
    inputEl.type = "file";
    inputEl.accept = accept;
    // `multiple` is a boolean attribute: setAttribute("multiple", false) would
    // still switch it on, so the property is set instead.
    inputEl.multiple = Boolean(multiple);
    inputEl.style.display = "none";

    return new Promise((resolve, reject) => {
      let settled = false;
      const onWindowClick = () => {
        window.removeEventListener("click", onWindowClick, true);
        if (!inputEl.value) {
          settled = true;
          inputEl.remove();
          // Message means "user cancelled the selection".
          reject(new Error("用户取消选择"));
        }
      };
      inputEl.addEventListener("change", () => {
        settled = true;
        window.removeEventListener("click", onWindowClick, true);
        resolve(multiple ? inputEl.files : inputEl.files[0]);
        inputEl.remove();
      });
      document.body.append(inputEl);
      inputEl.click();
      // Registered with a delay so the click that opened the picker is not
      // mistaken for a cancellation.
      setTimeout(() => {
        if (!settled) {
          window.addEventListener("click", onWindowClick, true);
        }
      }, 100);
    });
  }

  /**
   * Adds the floating "Import" button. Clicking it replaces the stored task
   * list with the content of a user-selected JSON file and reloads the page.
   */
  function AddImportBtn() {
    const btnWrapImport = document.createElement("div");
    btnWrapImport.id = "CRAWLER_ID";
    btnWrapImport.innerHTML = `<button style="padding: 4px 8px;position: fixed;bottom: 40%;right: 8px;border-radius: 4px;background-color: #224466;color: #fff;">Import</button>`;
    const importBtn = btnWrapImport.querySelector("button");
    importBtn.onclick = async () => {
      if (
        !window.confirm(
          "The data in browser will be clear up. Please make sure you have to do this !!!"
        )
      ) {
        return;
      }

      let file;
      try {
        file = await readFile(".json");
      } catch (error) {
        // The picker was dismissed; nothing to import.
        console.warn(error);
        return;
      }

      let json;
      try {
        json = JSON.parse(await file.text());
      } catch (error) {
        console.error(error);
        alert(IMPORT_FORMAT_HINT);
        return;
      }

      if (
        Array.isArray(json) &&
        json.every((item) => item?.doi && item?.validator)
      ) {
        // Wait for the write to finish, otherwise the reload can race it.
        await GM.setValue("tasks", json);
        location.reload();
      } else {
        alert(IMPORT_FORMAT_HINT);
      }
    };
    document.body.appendChild(btnWrapImport);
  }

  /** Removes the "Import" button so it is not part of the saved page. */
  function removeImportBtn() {
    document.getElementById("CRAWLER_ID")?.remove();
  }

  /**
   * A task is waiting when it has been neither downloaded nor validated yet
   * and a validator exists for it.
   *
   * @param {object} task - Stored task entry.
   * @returns {boolean}
   */
  const isWaitingTask = (task) =>
    Boolean(
      !task.downloaded &&
        task.validated === undefined &&
        validators[task.validator]
    );

  /**
   * Reads the stored task list, falling back to an empty list when nothing
   * (or something that is not an array) is stored.
   *
   * @returns {Promise<object[]>}
   */
  const loadTasks = async () => {
    const stored = await GM.getValue("tasks");
    return Array.isArray(stored) ? stored : [];
  };

  // Menu entry that exports the current task list as
  // `<date>-<time>-<total>-<processed>.json`.
  GM_registerMenuCommand("Download", async () => {
    const taskData = await loadTasks();
    const waitingTasks = taskData.filter(isWaitingTask);
    const now = new Date();
    downloadFile(
      JSON.stringify(taskData),
      `${now.getFullYear()}-${
        now.getMonth() + 1
      }-${now.getDate()}-${now.getHours()}${now.getMinutes()}${now.getSeconds()}-${
        taskData.length
      }-${taskData.length - waitingTasks.length}.json`
    );
  });

  /**
   * Sends the progress counters to the reporting endpoint. Best effort:
   * failures are logged and never interrupt the crawl.
   *
   * @param {object} payload - JSON-serialisable report body.
   */
  function reportProgress(payload) {
    const request = GM.xmlHttpRequest({
      url: "https://crawler-hit.deno.dev/api/update",
      method: "POST",
      headers: { "Content-Type": "application/json" },
      data: JSON.stringify(payload),
      onload: (res) => {
        console.log({ res });
      },
      onerror: (error) => {
        console.warn("Progress report failed", error);
      },
    });
    // Depending on the script manager the call may or may not return a
    // promise. The failure is already logged by `onerror`; this only avoids
    // an unhandled rejection when a promise is returned.
    request?.catch?.(() => {});
  }

  /**
   * One crawl step for the current page: report progress, check that the page
   * belongs to the first waiting task, save it when its validator passes,
   * persist the result and navigate to the next task.
   */
  async function start() {
    console.log(new Date());
    AddImportBtn();
    await sleep(7);
    addScript(
      "https://cdn.jsdelivr.net/gh/gildas-lormeau/SingleFile-MV3/lib/single-file.js"
    );

    const tasks = await loadTasks();
    // find task which not downloaded and not validated before
    const waitingTasks = tasks.filter(isWaitingTask);
    console.log(
      `%cTry to get tasks firstly(${waitingTasks.length} / ${tasks.length}):`,
      printStyle,
      tasks
    );

    // ---------------------------- Report progress -----------------------------------------------------
    let clientId = await GM.getValue("clientId");
    if (typeof clientId !== "string" || !clientId) {
      clientId = generateClientId();
      await GM.setValue("clientId", clientId);
    }
    const invalidatedTasks = tasks.filter((task) => task.validated === false);
    // Most recently finished task first; `filter` already returns a copy, so
    // sorting does not reorder the stored list.
    const doneTasks = tasks
      .filter((task) => task.downloaded)
      .sort((a, b) => (b.updateTime ?? 0) - (a.updateTime ?? 0));
    const lastUpdateTime = doneTasks[0]?.updateTime;
    const lastDoneLabel =
      lastUpdateTime == null
        ? "never"
        : new Date(lastUpdateTime).toLocaleString();
    reportProgress({
      account: clientId,
      invalidate_count: invalidatedTasks.length,
      done_count: doneTasks.length,
      queue_count: waitingTasks.length,
      tip: `Last download time: ${lastDoneLabel}`,
    });

    if (!waitingTasks.length) {
      await reload(90, "No tasks waiting");
      return;
    }

    // -------------------------- Detect Cloudflare challenge -------------------------------------------------------
    await sleep(10);
    const currentTask = waitingTasks[0];
    const doi = currentTask.doi.replace("https://doi.org/", "").toLowerCase();
    const validator = validators[currentTask.validator];
    if (document.getElementById("challenge-form")) {
      console.log(`%cCloudflare challenge! ${currentTask.doi}`, printStyle);
      await sleep(20);
      currentTask.validated = false;
      currentTask.cloudflareBlock = true;
    }

    // --------------------------- Page validate ------------------------------------------------------
    // NOTE(review): a target page that never contains its DOI text is
    // redirected to again on every run; consider a retry limit.
    if (
      !currentTask.cloudflareBlock &&
      !document.body.textContent.toLowerCase().includes(doi)
    ) {
      console.log(
        `%cURL not match, will redirect to ${currentTask.doi} 5s later`,
        printStyle
      );
      await sleep(5);
      location.href = currentTask.doi;
      return;
    }

    if (!currentTask.cloudflareBlock && validator(document)) {
      console.log(
        "%cValidate successfully! Downloading page...",
        printStyle,
        waitingTasks,
        tasks
      );
      removeImportBtn();
      // repair special page
      if (typeof documentFixer[currentTask.validator] === "function") {
        documentFixer[currentTask.validator](document);
      }
      try {
        const data = await singlefile.getPageData(DEFAULT_CONFIG);
        const baseName = doi.replaceAll("/", "_");
        downloadFile(data.content, `${baseName}.singlefile.html`);
        downloadFile(document.documentElement.outerHTML, `${baseName}.html`);
        currentTask.downloaded = true;
        currentTask.validated = true;
        currentTask.updateTime = new Date().valueOf();
      } catch (error) {
        console.error(error);
        await reload(10, `singlefile error! ${currentTask.doi}`);
        return;
      }
    } else {
      console.log(`%cValidate failed! ${currentTask.doi}`, printStyle);
      currentTask.validated = false;
    }

    await GM.setValue("tasks", tasks);

    // --------------------------- Prepare next task ------------------------------------------------------
    const nextTask = waitingTasks[1];
    if (nextTask) {
      console.log(
        `%cStart next task 10s later...`,
        printStyle,
        nextTask.doi,
        tasks
      );
      await sleep(10);
      location.href = nextTask.doi;
    } else {
      await reload(60, "No tasks waiting");
    }
  }

  start();
})();