- // ==UserScript==
- // @name Crawler base on SingleFile
- // @author Mark
- // @description Download site in single file automatically
- // @license MIT
- // @version 0.0.6
- // @match https://*/*
- // @run-at document-idle
- // @grant GM.setValue
- // @grant GM.getValue
- // @grant GM.xmlHttpRequest
- // @grant GM_registerMenuCommand
- // @grant unsafeWindow
- // @noframes
- // @namespace https://greasyfork.org/users/1106595
- // ==/UserScript==
-
- // config for singleFile
- const BACKGROUND_SAVE_SUPPORTED = !/Mobile.*Firefox/.test(navigator.userAgent);
- const DEFAULT_CONFIG = {
- removeHiddenElements: true,
- removeUnusedStyles: true,
- removeUnusedFonts: true,
- removeFrames: false,
- compressHTML: true,
- compressCSS: false,
- loadDeferredImages: true,
- loadDeferredImagesMaxIdleTime: 1500,
- loadDeferredImagesBlockCookies: false,
- loadDeferredImagesBlockStorage: false,
- loadDeferredImagesKeepZoomLevel: false,
- loadDeferredImagesDispatchScrollEvent: false,
- loadDeferredImagesBeforeFrames: false,
- filenameTemplate:
- "%if-empty<{page-title}|No title> ({date-locale} {time-locale}).{filename-extension}",
- infobarTemplate: "",
- includeInfobar: false,
- confirmInfobarContent: false,
- autoClose: false,
- confirmFilename: false,
- filenameConflictAction: "uniquify",
- filenameMaxLength: 192,
- filenameMaxLengthUnit: "bytes",
- filenameReplacedCharacters: [
- "~",
- "+",
- "\\\\",
- "?",
- "%",
- "*",
- ":",
- "|",
- '"',
- "<",
- ">",
- "\x00-\x1f",
- "\x7F",
- ],
- filenameReplacementCharacter: "_",
- replaceEmojisInFilename: false,
- saveFilenameTemplateData: false,
- contextMenuEnabled: true,
- tabMenuEnabled: true,
- browserActionMenuEnabled: true,
- shadowEnabled: true,
- logsEnabled: true,
- progressBarEnabled: true,
- maxResourceSizeEnabled: false,
- maxResourceSize: 10,
- displayInfobar: true,
- displayStats: false,
- backgroundSave: BACKGROUND_SAVE_SUPPORTED,
- defaultEditorMode: "normal",
- applySystemTheme: true,
- autoSaveDelay: 1,
- autoSaveLoad: false,
- autoSaveUnload: false,
- autoSaveLoadOrUnload: true,
- autoSaveDiscard: false,
- autoSaveRemove: false,
- autoSaveRepeat: false,
- autoSaveRepeatDelay: 10,
- removeAlternativeFonts: true,
- removeAlternativeMedias: true,
- removeAlternativeImages: true,
- groupDuplicateImages: true,
- maxSizeDuplicateImages: 512 * 1024,
- saveRawPage: false,
- saveToClipboard: false,
- addProof: false,
- saveToGDrive: false,
- saveToDropbox: false,
- saveWithWebDAV: false,
- webDAVURL: "",
- webDAVUser: "",
- webDAVPassword: "",
- saveToGitHub: false,
- githubToken: "",
- githubUser: "",
- githubRepository: "SingleFile-Archives",
- githubBranch: "main",
- saveWithCompanion: false,
- forceWebAuthFlow: false,
- resolveFragmentIdentifierURLs: false,
- userScriptEnabled: false,
- openEditor: false,
- openSavedPage: false,
- autoOpenEditor: false,
- saveCreatedBookmarks: false,
- allowedBookmarkFolders: [],
- ignoredBookmarkFolders: [],
- replaceBookmarkURL: true,
- saveFavicon: true,
- includeBOM: false,
- warnUnsavedPage: true,
- displayInfobarInEditor: false,
- compressContent: false,
- createRootDirectory: false,
- selfExtractingArchive: true,
- extractDataFromPage: true,
- preventAppendedData: false,
- insertTextBody: false,
- autoSaveExternalSave: false,
- insertMetaNoIndex: false,
- insertMetaCSP: true,
- passReferrerOnError: false,
- password: "",
- insertSingleFileComment: true,
- removeSavedDate: false,
- blockMixedContent: false,
- saveOriginalURLs: false,
- acceptHeaders: {
- font: "application/font-woff2;q=1.0,application/font-woff;q=0.9,*/*;q=0.8",
- image: "image/avif,image/webp,image/apng,image/svg+xml,image/*,*/*;q=0.8",
- stylesheet: "text/css,*/*;q=0.1",
- script: "*/*",
- document: "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
- video:
- "video/webm,video/ogg,video/*;q=0.9,application/ogg;q=0.7,audio/*;q=0.6,*/*;q=0.5",
- audio:
- "audio/webm,audio/ogg,audio/wav,audio/*;q=0.9,application/ogg;q=0.7,video/*;q=0.6,*/*;q=0.5",
- },
- moveStylesInHead: false,
- networkTimeout: 0,
- woleetKey: "",
- blockImages: false,
- blockStylesheets: false,
- blockFonts: false,
- blockScripts: true,
- blockVideos: true,
- blockAudios: true,
- _migratedTemplateFormat: true,
- };
-
/**
 * Page validators, looked up by a task's `validator` field.
 *
 * Each validator receives a Document and returns a truthy value when the page
 * exposes both an abstract-like element and at least one body paragraph for
 * that site layout; callers only rely on truthiness.
 * NOTE(review): the numeric keys look like DOI registrant prefixes
 * (e.g. 1002 for 10.1002/...) - confirm against the task producer.
 */
const validators = {
  1002: (document) =>
    (document.querySelector(
      ".article__body .abstract-group .article-section__abstract .article-section__content"
    ) || document.querySelector("article .abstract-group")) &&
    document.querySelectorAll(
      ".article__body .article-section__full :where(.article-section__content > p, .article-section__sub-content > p)"
    ).length > 0,
  1016: (document) =>
    (document.querySelector("div.abstract.author > div") ||
      document.querySelector('[data-left-hand-nav="Summary"]')) &&
    (document.querySelectorAll(
      "div#body > div:first-child > section[id^=s] p[id^=p]"
    ).length > 0 ||
      document.querySelectorAll(
        "div#body > div:first-child :where(section[id^=aep-section] > p, section[id^=aep-section] div > p)"
      ).length > 0 ||
      document.querySelectorAll("[id^='sec'] .section-paragraph").length > 0 ||
      document.querySelectorAll("div#body [id^='sec'] p[id^='par']").length > 0),
  3390: (document) =>
    document.querySelector("#html-abstract .html-p") &&
    document.querySelectorAll("article .html-body .html-p").length > 0,
  1039: (document) =>
    document.querySelector("article .capsule__text") &&
    document.querySelectorAll("#pnlArticleContentLoaded > p").length > 0,
  1021: (document) =>
    (document.querySelector("p.articleBody_abstractText") ||
      document.querySelector("#specialIssueNotice") ||
      // The dc.Type meta tag is optional: without `?.` a page lacking it threw
      // a TypeError here instead of simply failing validation.
      document.querySelector('meta[name="dc.Type"]')?.content === 'review-article') &&
    (document.querySelectorAll("div.NLM_p").length > 0 ||
      document.querySelectorAll(".article_content-left > p").length > 0),
  1038: (document) =>
    (document.querySelector("#Abs1-content") ||
      document.querySelector('article [data-title="Abstract"]')) &&
    document.querySelectorAll(
      "article .main-content .c-article-section__content > p"
    ).length > 0,
  1007: (document) =>
    document.querySelectorAll("#Abs1-content p").length > 0 &&
    document.querySelectorAll(".main-content .c-article-section__content > p")
      .length > 0,
  1088: (document) =>
    document.querySelectorAll(".wd-jnl-art-abstract > p").length > 0 &&
    document.querySelectorAll(`:where(
      div[itemprop="articleBody"] > p,
      div[itemprop="articleBody"] > .article-text > p,
      div[itemprop="articleBody"] > .article-text > .article-text > p,
      div[itemprop="articleBody"] > .article-text > .article-text > .article-text > p)
    `).length > 0,
  1063: (document) =>
    document.querySelectorAll("#ContentTab .abstract p").length > 0 &&
    document.querySelectorAll("#ContentTab .article-section-wrapper > p")
      .length > 0,
  1126: (document) =>
    document.querySelectorAll('[role="doc-abstract"] > [role="paragraph"]')
      .length > 0 &&
    document.querySelectorAll(`#bodymatter [role="paragraph"]`).length > 0,
  1155: (document) =>
    document.querySelector(".articleBody #abstract") &&
    document.querySelectorAll(".articleBody .xml-content > p:not(#abstract + p)").length > 0,
  1074: (document) =>
    document.querySelector('.article__sections section:first-child:not(section[id^="cesec"])') &&
    document.querySelectorAll('.article__sections section[id^="cesec"] > .section-paragraph').length > 0,
  3389: (document) =>
    document.querySelector('.JournalAbstract .authors+.notes+p') &&
    document.querySelectorAll('.article-container .JournalFullText > p').length > 0,
  1186: (document) =>
    document.querySelector('[data-title="Abstract"] .c-article-section__content') &&
    document.querySelectorAll('main > article > section:not([data-title="Abstract"]):not(#MagazineFulltextArticleBodySuffix ~ section) .c-article-section__content > p').length > 0,
  3762: (document) =>
    document.querySelector('#articleContent #abstract p') &&
    document.querySelectorAll('#articleContent .text-bs > p').length > 0,
  1371: (document) =>
    document.querySelector('.article-content .abstract-content p') &&
    document.querySelectorAll('.article-content #artText div[id^="section"] > p').length > 0,
};

// Keys that reuse another key's validator.
validators["1006"] = validators["1016"];
validators["1149"] = validators["1088"];
-
/**
 * Per-validator page repairs applied just before the page is captured.
 * Both fixers copy each matched image's `data-src` value into `src`.
 */
const documentFixer = {
  1088: (document) => {
    const lazyImages = document.querySelectorAll(
      'main figure img[data-src^="http"]'
    );
    for (const image of lazyImages) {
      image.src = image.dataset.src;
    }
  },
  3389: (document) => {
    const lazyImages = document.querySelectorAll(
      '.article-container .JournalFullText .FigureDesc img[data-src^="http"]'
    );
    for (const image of lazyImages) {
      image.src = image.dataset.src;
    }
  },
};
// Key 1149 reuses the 1088 fixer.
documentFixer["1149"] = documentFixer["1088"];
-
/**
 * Appends a <script> element pointing at `url` to the end of document.body.
 * @param {string} url - Script source URL.
 */
const addScript = (url) => {
  const scriptEl = Object.assign(document.createElement("script"), { src: url });
  document.body.append(scriptEl);
};
-
/**
 * Builds a short random identifier: a random number in [0, 1e6) rendered in
 * base 32 with the first radix point removed.
 * @returns {string}
 */
const generateClientId = () => {
  const seed = 1e6 * Math.random();
  const base32 = seed.toString(32);
  return base32.replace(".", "");
};
- // main function
- (function () {
- "use strict";
-
// Inject the SingleFile helper scripts into the page, in this order.
const bootstrapScriptUrls = [
  "https://cdn.jsdelivr.net/gh/gildas-lormeau/SingleFile-MV3/lib/single-file-bootstrap.js",
  "https://cdn.jsdelivr.net/gh/gildas-lormeau/SingleFile-MV3/lib/single-file-hooks-frames.js",
  "https://cdn.jsdelivr.net/gh/gildas-lormeau/SingleFile-MV3/lib/single-file-frames.js",
];
for (const scriptUrl of bootstrapScriptUrls) {
  addScript(scriptUrl);
}
// Override the page's fetch so requests that fail (typically because of CORS)
// are retried through a relay.
//
// The relay at "fetch-url2.deno.dev" runs the following code:
//
//   serve((req: Request) => handleRequest(req));
//
//   async function handleRequest(req: Request) {
//     const url = req.url;
//     const finalUrl = url && url.split("?url=")[1];
//     if (!finalUrl) {
//       return new Response(url + " no match '?url='");
//     }
//     const res = await fetch(finalUrl);
//     return new Response(res.body, {
//       headers: {
//         ...res.headers,
//         "Access-Control-Allow-Origin": "*",
//         "Access-Control-Expose-Headers":
//           "Request-Context,api-supported-versions,Content-Length,Date,Server",
//       },
//     });
//   }
//
// The relay takes everything after "?url=" verbatim, so the target URL is
// appended without percent-encoding.
const CORS_RELAY_ORIGIN = "https://fetch-url2.deno.dev";

window.unsafeWindow.fetch = async (...args) => {
  // Single-argument calls are passed through with no relay fallback.
  if (args.length <= 1) {
    return fetch(...args);
  }

  const [resource, ...otherArgs] = args;
  try {
    return await fetch(...args);
  } catch (err) {
    // `resource` may be a string, a URL or a Request; normalise it to text so
    // the fallback itself cannot throw a TypeError that hides `err`.
    const rawUrl =
      typeof resource === "string"
        ? resource
        : (resource?.url ?? String(resource));

    // The relay itself failed: surface the error instead of resolving to
    // undefined, which callers would then dereference as a Response.
    if (rawUrl.trim().startsWith(CORS_RELAY_ORIGIN)) {
      throw err;
    }

    let targetUrl;
    try {
      // Resolve relative URLs the way fetch does (against the document base),
      // not by gluing them onto location.origin.
      targetUrl = new URL(rawUrl, document.baseURI).href;
    } catch {
      throw err;
    }
    // NOTE: when `resource` is a Request, only its URL is forwarded; method
    // and body come from `otherArgs` as before.
    return fetch(`${CORS_RELAY_ORIGIN}?url=${targetUrl}`, ...otherArgs);
  }
};
-
/**
 * Triggers a browser download of `data` under the name `fileName`.
 *
 * The data is wrapped in an "application/octet-stream" Blob and downloaded by
 * clicking a temporary hidden anchor. The anchor is removed again afterwards
 * so it does not accumulate in the page or leak into a later snapshot of the
 * document's HTML.
 *
 * @param {BlobPart} data - Content to save.
 * @param {string} fileName - Suggested file name.
 */
const downloadFile = (data, fileName) => {
  const blob = new Blob([data], {
    type: "application/octet-stream",
  });
  const url = window.URL.createObjectURL(blob);

  const a = document.createElement("a");
  a.style = "display: none";
  a.href = url;
  a.download = fileName;
  document.body.appendChild(a);
  try {
    a.click();
  } finally {
    // Always clean up, even if click() throws.
    a.remove();
    window.URL.revokeObjectURL(url);
  }
};
-
/**
 * Returns a promise that resolves (with no value) after `duration` seconds.
 * @param {number} duration - Delay in seconds.
 * @returns {Promise<void>}
 */
const sleep = (duration) => {
  const milliseconds = duration * 1000;
  return new Promise((resolve) => {
    setTimeout(() => resolve(), milliseconds);
  });
};
-
/**
 * Logs a styled warning, waits, then reloads the current page.
 * @param {number} [waiting=60] - Seconds to wait before reloading.
 * @param {string} [message=""] - Text placed at the start of the warning.
 */
async function reload(waiting = 60, message = "") {
  const notice = `%c${message}, reload ${waiting}s later`;
  console.warn(notice, printStyle);
  await sleep(waiting);
  location.reload();
}
-
/**
 * Opens a file picker and resolves with the user's selection.
 *
 * @param {string} [accept=""] - Value for the input's `accept` attribute.
 * @param {boolean} [multiple=false] - Allow selecting several files.
 * @returns {Promise<File|FileList>} The first selected file, or the whole
 *   FileList when `multiple` is truthy. Rejects with an Error when the picker
 *   appears to have been dismissed without a selection.
 */
function readFile(accept = "", multiple = false) {
  const inputEl = document.createElement("input");
  inputEl.setAttribute("type", "file");
  inputEl.setAttribute("accept", accept);
  // `multiple` is a boolean attribute: its mere presence enables multi-select,
  // so setAttribute("multiple", false) would turn it ON. Use the property.
  inputEl.multiple = Boolean(multiple);

  return new Promise((resolve, reject) => {
    let settled = false;
    let armTimer;

    const onWindowClick = () => {
      if (inputEl.value) {
        // A file was chosen; the change handler settles the promise.
        window.removeEventListener("click", onWindowClick, true);
        return;
      }
      cancel();
    };

    // Releases the timer, the window listener and the temporary input.
    const cleanup = () => {
      settled = true;
      clearTimeout(armTimer);
      window.removeEventListener("click", onWindowClick, true);
      inputEl.remove();
    };

    const cancel = () => {
      cleanup();
      reject(new Error("用户取消选择"));
    };

    inputEl.addEventListener("change", () => {
      const picked = multiple ? inputEl.files : inputEl.files[0];
      cleanup();
      resolve(picked);
    });
    // Browsers that support it report a dismissed picker directly.
    inputEl.addEventListener("cancel", cancel);

    document.body.append(inputEl);
    inputEl.click();

    // Fallback cancel detection: the first click on the page after the picker
    // closed with no file selected counts as a cancel. Armed after a short
    // delay so the click that opened the picker is not counted.
    armTimer = setTimeout(() => {
      if (!settled) {
        window.addEventListener("click", onWindowClick, true);
      }
    }, 100);
  });
}
-
/**
 * Adds a fixed-position "Import" button (wrapped in a div with id
 * "CRAWLER_ID") to the page.
 *
 * Clicking it asks for confirmation, lets the user pick a .json file and, when
 * the file holds an array whose items all have truthy `doi` and `validator`
 * fields, stores it under the "tasks" key and reloads the page. Anything else
 * produces an alert describing the expected shape.
 */
function AddImportBtn() {
  const btnWrapImport = document.createElement("div");
  btnWrapImport.id = "CRAWLER_ID";
  btnWrapImport.innerHTML = `<button style="padding: 4px 8px;position: fixed;bottom: 40%;right: 8px;border-radius: 4px;background-color: #224466;color: #fff;">Import</button>`;
  const importBtn = btnWrapImport.querySelector("button");
  importBtn.onclick = async () => {
    if (
      !window.confirm(
        "The data in browser will be clear up. Please make sure you have to do this !!!"
      )
    ) {
      return;
    }

    let file;
    try {
      file = await readFile(".json");
    } catch (error) {
      // The user dismissed the picker; nothing to import.
      console.warn(error);
      return;
    }

    let json;
    try {
      json = JSON.parse(await file.text());
    } catch (error) {
      // Unreadable or malformed file: fall through to the format alert.
      console.error(error);
    }

    const looksLikeTasks =
      Array.isArray(json) &&
      json.every((item) => item?.doi && item?.validator);
    if (!looksLikeTasks) {
      alert(
        "Please upload json file like [{doi: string, validator: string, ...}]"
      );
      return;
    }

    // Wait for the write to finish; reloading first could drop the import.
    await GM.setValue("tasks", json);
    location.reload();
  };
  document.body.appendChild(btnWrapImport);
}
-
/**
 * Detaches the element with id "CRAWLER_ID" from its parent, if it exists.
 */
function removeImportBtn() {
  const wrapper = document.getElementById("CRAWLER_ID");
  if (!wrapper) {
    return;
  }
  wrapper.parentElement.removeChild(wrapper);
}
-
// "Download" menu command: exports the stored task list as a JSON file named
// <year>-<month>-<day>-<HHMMSS>-<total>-<total minus still-waiting>.json
GM_registerMenuCommand("Download", async () => {
  const stored = await GM.getValue("tasks");
  // Nothing stored yet (or an unexpected value): export an empty list instead
  // of throwing on `.filter`.
  const taskData = Array.isArray(stored) ? stored : [];
  const waitingTasks = taskData.filter(
    (task) =>
      !task.downloaded &&
      task.validated === undefined &&
      validators[task.validator]
  );

  const now = new Date();
  // The time fields are concatenated without separators, so pad them to two
  // digits; otherwise e.g. 1:23:04 and 12:30:4 produce the same text.
  const pad = (value) => String(value).padStart(2, "0");
  const datePart = `${now.getFullYear()}-${now.getMonth() + 1}-${now.getDate()}`;
  const timePart = `${pad(now.getHours())}${pad(now.getMinutes())}${pad(now.getSeconds())}`;
  const processedCount = taskData.length - waitingTasks.length;

  downloadFile(
    JSON.stringify(taskData),
    `${datePart}-${timePart}-${taskData.length}-${processedCount}.json`
  );
});
-
// console "%c" style shared by the crawler's log messages.
const printStyle = "color: blue;background-color: #ccc;font-size: 20px";

// How many times one task may be redirected to its DOI because the loaded page
// does not mention that DOI. Without a bound, a DOI that never resolves to a
// matching page made the crawler redirect to the same address forever.
const MAX_REDIRECT_ATTEMPTS = 3;

/**
 * Runs one crawl step on the current page.
 *
 * Steps: load the task list from storage, report progress, pick the first
 * waiting task, make sure the current page belongs to it (redirecting if not),
 * validate the page, download it (SingleFile snapshot plus raw HTML), persist
 * the updated task list, then navigate to the next waiting task or schedule a
 * reload when none is left.
 *
 * A task is "waiting" when it is not downloaded, has never been validated and
 * names a known validator.
 */
async function start() {
  console.log(new Date());
  AddImportBtn();
  await sleep(7);
  addScript(
    "https://cdn.jsdelivr.net/gh/gildas-lormeau/SingleFile-MV3/lib/single-file.js"
  );
  const taskData = await GM.getValue("tasks");
  // Treat a missing or non-array stored value as "no tasks".
  const tasks = Array.isArray(taskData) ? taskData : [];

  // find task which not downloaded and not validated before
  const waitingTasks = tasks.filter(
    (task) =>
      !task.downloaded &&
      task.validated === undefined &&
      validators[task.validator]
  );
  console.log(
    `%cTry to get tasks firstly(${waitingTasks.length} / ${tasks.length}):`,
    printStyle,
    tasks
  );

  // ---------------------------- Report progress -----------------------------------------------------

  let clientId = await GM.getValue("clientId");
  if (typeof clientId !== "string" || !clientId) {
    clientId = generateClientId();
    await GM.setValue("clientId", clientId);
  }
  const invalidatedTasks = tasks.filter((task) => task.validated === false);
  const doneTasks = tasks
    .filter((task) => task.downloaded)
    .sort((a, b) => (a.updateTime > b.updateTime ? -1 : 1));
  const lastDoneStamp = doneTasks[0]?.updateTime;
  // Avoid reporting "Invalid Date" when nothing has been downloaded yet.
  const lastDoneText =
    lastDoneStamp === undefined
      ? "never"
      : new Date(lastDoneStamp).toLocaleString();
  const reportTip = `Last download time: ${lastDoneText}`;

  // Fire-and-forget: a failed report must never stop the crawl. The async
  // wrapper also covers script managers whose GM.xmlHttpRequest throws or
  // does not return a promise.
  (async () =>
    GM.xmlHttpRequest({
      url: "https://crawler-hit.deno.dev/api/update",
      method: "POST",
      headers: { "Content-Type": "application/json" },
      data: JSON.stringify({
        account: clientId,
        invalidate_count: invalidatedTasks.length,
        done_count: doneTasks.length,
        queue_count: waitingTasks.length,
        tip: reportTip,
      }),
    }))()
    .then((res) => {
      console.log({ res });
    })
    .catch((error) => {
      console.warn("Progress report failed", error);
    });

  if (!waitingTasks.length) {
    await reload(90, "No tasks waiting");
    return;
  }

  // -------------------------- Detect Cloudflare challenge -------------------------------------------------------
  await sleep(10);
  const currentTask = waitingTasks[0];
  const doi = currentTask.doi.replace("https://doi.org/", "").toLowerCase();
  const validator = validators[currentTask.validator];
  if (document.getElementById("challenge-form")) {
    console.log(`%cCloudflare challenge! ${currentTask.doi}`, printStyle);
    await sleep(20);
    currentTask.validated = false;
    currentTask.cloudflareBlock = true;
  }

  // --------------------------- Page validate ------------------------------------------------------
  if (
    !currentTask.cloudflareBlock &&
    !document.body.textContent.toLowerCase().includes(doi)
  ) {
    const attempts = (currentTask.redirectAttempts ?? 0) + 1;
    if (attempts <= MAX_REDIRECT_ATTEMPTS) {
      // Persist the counter before leaving the page so it survives the
      // navigation.
      currentTask.redirectAttempts = attempts;
      await GM.setValue("tasks", tasks);
      console.log(
        `%cURL not match, will redirect to ${currentTask.doi} 5s later`,
        printStyle
      );
      await sleep(5);
      location.href = currentTask.doi;
      return;
    }
    console.log(
      `%cURL still not match after ${MAX_REDIRECT_ATTEMPTS} redirects, skip ${currentTask.doi}`,
      printStyle
    );
    currentTask.redirectFailed = true;
  }

  let pageIsValid = false;
  if (!currentTask.cloudflareBlock && !currentTask.redirectFailed) {
    try {
      pageIsValid = Boolean(validator(document));
    } catch (error) {
      // A validator that throws counts as a failed validation rather than
      // aborting the whole crawl.
      console.error(error);
    }
  }

  if (pageIsValid) {
    console.log(
      "%cValidate successfully! Downloading page...",
      printStyle,
      waitingTasks,
      tasks
    );
    removeImportBtn();
    // repair special page
    if (typeof documentFixer[currentTask.validator] === "function") {
      documentFixer[currentTask.validator](document);
    }
    try {
      const data = await singlefile.getPageData(DEFAULT_CONFIG);
      const baseName = doi.replaceAll("/", "_");
      downloadFile(data.content, `${baseName}.singlefile.html`);
      downloadFile(document.body.parentElement.outerHTML, `${baseName}.html`);
      currentTask.downloaded = true;
      currentTask.validated = true;
      currentTask.updateTime = new Date().valueOf();
    } catch (error) {
      console.error(error);
      await reload(10, `singlefile error! ${currentTask.doi}`);
      return;
    }
  } else {
    console.log(`%cValidate failed! ${currentTask.doi}`, printStyle);
    currentTask.validated = false;
  }

  await GM.setValue("tasks", tasks);

  // --------------------------- Prepare next task ------------------------------------------------------
  const nextTask = waitingTasks[1];
  if (nextTask) {
    console.log(
      `%cStart next task 10s later...`,
      printStyle,
      nextTask.doi,
      tasks
    );
    await sleep(10);
    location.href = nextTask.doi;
  } else {
    await reload(60, "No tasks waiting");
  }
}
-
- start();
- })();