Crawler base on SingleFile

Download site in single file automatically

目前为 2023-12-22 提交的版本。查看 最新版本

  1. // ==UserScript==
  2. // @name Crawler base on SingleFile
  3. // @author Mark
  4. // @description Download site in single file automatically
  5. // @license MIT
  6. // @version 0.0.6
  7. // @match https://*/*
  8. // @run-at document-idle
  9. // @grant GM.setValue
  10. // @grant GM.getValue
  11. // @grant GM.xmlHttpRequest
  12. // @grant GM_registerMenuCommand
  13. // @grant unsafeWindow
  14. // @noframes
  15. // @namespace https://greasyfork.org/users/1106595
  16. // ==/UserScript==
  17.  
  // config for singleFile: the options object passed to `singlefile.getPageData`.

  // Background saving is reported as supported unless the user agent matches
  // the "Mobile ... Firefox" pattern.
  const IS_MOBILE_FIREFOX = /Mobile.*Firefox/.test(navigator.userAgent);
  const BACKGROUND_SAVE_SUPPORTED = !IS_MOBILE_FIREFOX;

  // Characters (or character ranges) replaced in generated file names.
  const FILENAME_REPLACED_CHARACTERS = [
    "~",
    "+",
    "\\\\",
    "?",
    "%",
    "*",
    ":",
    "|",
    '"',
    "<",
    ">",
    "\x00-\x1f",
    "\x7F",
  ];

  // `Accept` header values keyed by resource kind.
  const ACCEPT_HEADERS = {
    font: "application/font-woff2;q=1.0,application/font-woff;q=0.9,*/*;q=0.8",
    image: "image/avif,image/webp,image/apng,image/svg+xml,image/*,*/*;q=0.8",
    stylesheet: "text/css,*/*;q=0.1",
    script: "*/*",
    document: "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    video:
      "video/webm,video/ogg,video/*;q=0.9,application/ogg;q=0.7,audio/*;q=0.6,*/*;q=0.5",
    audio:
      "audio/webm,audio/ogg,audio/wav,audio/*;q=0.9,application/ogg;q=0.7,video/*;q=0.6,*/*;q=0.5",
  };

  const DEFAULT_CONFIG = {
    // --- page clean-up and deferred image loading ---
    removeHiddenElements: true,
    removeUnusedStyles: true,
    removeUnusedFonts: true,
    removeFrames: false,
    compressHTML: true,
    compressCSS: false,
    loadDeferredImages: true,
    loadDeferredImagesMaxIdleTime: 1500,
    loadDeferredImagesBlockCookies: false,
    loadDeferredImagesBlockStorage: false,
    loadDeferredImagesKeepZoomLevel: false,
    loadDeferredImagesDispatchScrollEvent: false,
    loadDeferredImagesBeforeFrames: false,

    // --- file naming and infobar ---
    filenameTemplate:
      "%if-empty<{page-title}|No title> ({date-locale} {time-locale}).{filename-extension}",
    infobarTemplate: "",
    includeInfobar: false,
    confirmInfobarContent: false,
    autoClose: false,
    confirmFilename: false,
    filenameConflictAction: "uniquify",
    filenameMaxLength: 192,
    filenameMaxLengthUnit: "bytes",
    filenameReplacedCharacters: FILENAME_REPLACED_CHARACTERS,
    filenameReplacementCharacter: "_",
    replaceEmojisInFilename: false,
    saveFilenameTemplateData: false,

    // --- UI related switches ---
    contextMenuEnabled: true,
    tabMenuEnabled: true,
    browserActionMenuEnabled: true,
    shadowEnabled: true,
    logsEnabled: true,
    progressBarEnabled: true,
    maxResourceSizeEnabled: false,
    maxResourceSize: 10,
    displayInfobar: true,
    displayStats: false,
    backgroundSave: BACKGROUND_SAVE_SUPPORTED,
    defaultEditorMode: "normal",
    applySystemTheme: true,

    // --- auto-save ---
    autoSaveDelay: 1,
    autoSaveLoad: false,
    autoSaveUnload: false,
    autoSaveLoadOrUnload: true,
    autoSaveDiscard: false,
    autoSaveRemove: false,
    autoSaveRepeat: false,
    autoSaveRepeatDelay: 10,

    // --- alternative resources and duplicates ---
    removeAlternativeFonts: true,
    removeAlternativeMedias: true,
    removeAlternativeImages: true,
    groupDuplicateImages: true,
    maxSizeDuplicateImages: 512 * 1024,

    // --- save destinations ---
    saveRawPage: false,
    saveToClipboard: false,
    addProof: false,
    saveToGDrive: false,
    saveToDropbox: false,
    saveWithWebDAV: false,
    webDAVURL: "",
    webDAVUser: "",
    webDAVPassword: "",
    saveToGitHub: false,
    githubToken: "",
    githubUser: "",
    githubRepository: "SingleFile-Archives",
    githubBranch: "main",
    saveWithCompanion: false,
    forceWebAuthFlow: false,

    // --- editor, bookmarks and miscellaneous output options ---
    resolveFragmentIdentifierURLs: false,
    userScriptEnabled: false,
    openEditor: false,
    openSavedPage: false,
    autoOpenEditor: false,
    saveCreatedBookmarks: false,
    allowedBookmarkFolders: [],
    ignoredBookmarkFolders: [],
    replaceBookmarkURL: true,
    saveFavicon: true,
    includeBOM: false,
    warnUnsavedPage: true,
    displayInfobarInEditor: false,
    compressContent: false,
    createRootDirectory: false,
    selfExtractingArchive: true,
    extractDataFromPage: true,
    preventAppendedData: false,
    insertTextBody: false,
    autoSaveExternalSave: false,
    insertMetaNoIndex: false,
    insertMetaCSP: true,
    passReferrerOnError: false,
    password: "",
    insertSingleFileComment: true,
    removeSavedDate: false,
    blockMixedContent: false,
    saveOriginalURLs: false,

    // --- network ---
    acceptHeaders: ACCEPT_HEADERS,
    moveStylesInHead: false,
    networkTimeout: 0,
    woleetKey: "",

    // --- resource blocking ---
    blockImages: false,
    blockStylesheets: false,
    blockFonts: false,
    blockScripts: true,
    blockVideos: true,
    blockAudios: true,
    _migratedTemplateFormat: true,
  };
  154.  
  /**
   * Page validators, one per publisher ("press") layout.
   *
   * Each entry receives a `document` and returns a truthy value when the page
   * contains both an abstract-like element and at least one body paragraph
   * matching that layout's selectors; otherwise it returns a falsy value.
   * Callers only rely on truthiness.
   *
   * Entries are looked up with `task.validator`.
   * NOTE(review): the numeric keys look like DOI registrant prefixes
   * (e.g. "10.1002") — confirm against the task data.
   */
  const validators = {
    1002: (document) =>
      (document.querySelector(
        ".article__body .abstract-group .article-section__abstract .article-section__content"
      ) || document.querySelector("article .abstract-group")) &&
      document.querySelectorAll(
        ".article__body .article-section__full :where(.article-section__content > p, .article-section__sub-content > p)"
      ).length > 0,
    1016: (document) =>
      (document.querySelector("div.abstract.author > div") ||
        document.querySelector('[data-left-hand-nav="Summary"]')) &&
      (document.querySelectorAll(
        "div#body > div:first-child > section[id^=s] p[id^=p]"
      ).length > 0 ||
        document.querySelectorAll(
          "div#body > div:first-child :where(section[id^=aep-section] > p, section[id^=aep-section] div > p)"
        ).length > 0 ||
        document.querySelectorAll("[id^='sec'] .section-paragraph").length > 0 ||
        document.querySelectorAll("div#body [id^='sec'] p[id^='par']").length > 0),
    3390: (document) =>
      document.querySelector("#html-abstract .html-p") &&
      document.querySelectorAll("article .html-body .html-p").length > 0,
    1039: (document) =>
      document.querySelector("article .capsule__text") &&
      document.querySelectorAll("#pnlArticleContentLoaded > p").length > 0,
    1021: (document) =>
      (document.querySelector("p.articleBody_abstractText") ||
        document.querySelector("#specialIssueNotice") ||
        // The meta tag may be absent; `?.` keeps the validator from throwing
        // a TypeError and lets it simply report "not valid".
        document.querySelector('meta[name="dc.Type"]')?.content === "review-article") &&
      (document.querySelectorAll("div.NLM_p").length > 0 ||
        document.querySelectorAll(".article_content-left > p").length > 0),
    1038: (document) =>
      (document.querySelector("#Abs1-content") ||
        document.querySelector('article [data-title="Abstract"]')) &&
      document.querySelectorAll(
        "article .main-content .c-article-section__content > p"
      ).length > 0,
    1007: (document) =>
      document.querySelectorAll("#Abs1-content p").length > 0 &&
      document.querySelectorAll(".main-content .c-article-section__content > p")
        .length > 0,
    1088: (document) =>
      document.querySelectorAll(".wd-jnl-art-abstract > p").length > 0 &&
      document.querySelectorAll(`:where(
        div[itemprop="articleBody"] > p,
        div[itemprop="articleBody"] > .article-text > p,
        div[itemprop="articleBody"] > .article-text > .article-text > p,
        div[itemprop="articleBody"] > .article-text > .article-text > .article-text > p)
      `).length > 0,
    1063: (document) =>
      document.querySelectorAll("#ContentTab .abstract p").length > 0 &&
      document.querySelectorAll("#ContentTab .article-section-wrapper > p")
        .length > 0,
    1126: (document) =>
      document.querySelectorAll('[role="doc-abstract"] > [role="paragraph"]')
        .length > 0 &&
      document.querySelectorAll(`#bodymatter [role="paragraph"]`).length > 0,
    1155: (document) =>
      document.querySelector(".articleBody #abstract") &&
      document.querySelectorAll(".articleBody .xml-content > p:not(#abstract + p)").length > 0,
    1074: (document) =>
      document.querySelector('.article__sections section:first-child:not(section[id^="cesec"])') &&
      document.querySelectorAll('.article__sections section[id^="cesec"] > .section-paragraph').length > 0,
    3389: (document) =>
      document.querySelector('.JournalAbstract .authors+.notes+p') &&
      document.querySelectorAll('.article-container .JournalFullText > p').length > 0,
    1186: (document) =>
      document.querySelector('[data-title="Abstract"] .c-article-section__content') &&
      document.querySelectorAll('main > article > section:not([data-title="Abstract"]):not(#MagazineFulltextArticleBodySuffix ~ section) .c-article-section__content > p').length > 0,
    3762: (document) =>
      document.querySelector('#articleContent #abstract p') &&
      document.querySelectorAll('#articleContent .text-bs > p').length > 0,
    1371: (document) =>
      document.querySelector('.article-content .abstract-content p') &&
      document.querySelectorAll('.article-content #artText div[id^="section"] > p').length > 0,
  };

  // Aliases: these keys reuse the validator of another layout.
  validators["1006"] = validators["1016"];
  validators["1149"] = validators["1088"];
  232.  
  /**
   * Builds a fixer that copies `data-src` into `src` for every image matched
   * by `selector`, so lazily loaded images carry a real source before capture.
   */
  const copyDataSrcToSrc = (selector) => (document) => {
    for (const img of Array.from(document.querySelectorAll(selector))) {
      img.src = img.dataset.src;
    }
  };

  // Per-layout page repairs applied before the page is captured; keyed like
  // the validators.
  const documentFixer = {
    1088: copyDataSrcToSrc('main figure img[data-src^="http"]'),
    3389: copyDataSrcToSrc(
      '.article-container .JournalFullText .FigureDesc img[data-src^="http"]'
    ),
  };
  documentFixer["1149"] = documentFixer["1088"];
  252.  
  /**
   * Appends a `<script>` element pointing at `url` to the end of `<body>`.
   */
  const addScript = (url) => {
    const scriptEl = Object.assign(document.createElement("script"), {
      src: url,
    });
    document.body.append(scriptEl);
  };
  258.  
  /**
   * Produces a short random identifier: a random number below one million,
   * written in base 32 with the radix point removed.
   */
  const generateClientId = () => {
    const scaled = 1e6 * Math.random();
    const base32 = scaled.toString(32);
    return base32.replace(".", "");
  };
  261. // main function
  262. (function () {
  263. "use strict";
  264.  
  // Inject the SingleFile helper scripts (bootstrap, frame hooks, frames) from
  // the jsDelivr mirror of the SingleFile-MV3 repository, in this order.
  [
    "single-file-bootstrap.js",
    "single-file-hooks-frames.js",
    "single-file-frames.js",
  ].forEach((fileName) => {
    addScript(
      `https://cdn.jsdelivr.net/gh/gildas-lormeau/SingleFile-MV3/lib/${fileName}`
    );
  });
  // Overwrite the page's fetch function to bypass CORS: when a request made
  // with two or more arguments fails, it is retried through a proxy.
  /** The "fetch-url2.deno.dev" proxy code is as follows:
   *
  serve((req: Request) => handleRequest(req));

  async function handleRequest(req: Request) {
    const url = req.url;
    const finalUrl = url && url.split("?url=")[1];
    if (!finalUrl) {
      return new Response(url + " no match '?url='");
    }
    const res = await fetch(finalUrl);
    return new Response(res.body, {
      headers: {
        ...res.headers,
        "Access-Control-Allow-Origin": "*",
        "Access-Control-Expose-Headers":
          "Request-Context,api-supported-versions,Content-Length,Date,Server",
      },
    });
  }
  **/
  window.unsafeWindow.fetch = async (...args) => {
    // Single-argument calls are passed through without the proxy fallback.
    if (args.length <= 1) {
      return fetch(...args);
    }

    const proxyOrigin = "https://fetch-url2.deno.dev";
    const [resource, ...otherArgs] = args;
    try {
      return await fetch(...args);
    } catch (err) {
      // `resource` may be a string, a URL or a Request; normalise it to a
      // string so the checks below never throw on non-string inputs.
      const url = String(resource?.url ?? resource).trim();
      if (url.startsWith(proxyOrigin)) {
        // The proxy request itself failed: reject instead of resolving with
        // `undefined`, which callers would mistake for a Response.
        throw err;
      }
      // Resolve relative URLs (including "foo.png" and "//host/path") against
      // the current page. The target is appended unencoded because the proxy
      // takes everything after "?url=" verbatim.
      const absoluteUrl = url.startsWith("http")
        ? url
        : new URL(url, location.href).href;
      return fetch(`${proxyOrigin}?url=${absoluteUrl}`, ...otherArgs);
    }
  };
  314.  
  /**
   * Triggers a browser download of `data` saved as `fileName`.
   *
   * The data is wrapped in an `application/octet-stream` Blob and downloaded
   * through a temporary hidden `<a download>` element.
   *
   * @param {*} data - content of the file (any valid Blob part)
   * @param {string} fileName - suggested name for the downloaded file
   */
  const downloadFile = (data, fileName) => {
    const blob = new Blob([data], {
      type: "application/octet-stream",
    });
    const url = window.URL.createObjectURL(blob);
    const a = document.createElement("a");
    a.style.display = "none";
    a.href = url;
    a.download = fileName;
    document.body.appendChild(a);
    try {
      a.click();
    } finally {
      // Remove the helper anchor again: otherwise every call leaves one more
      // element in the page, and it would be included in later snapshots of
      // the document's HTML.
      a.remove();
      window.URL.revokeObjectURL(url);
    }
  };
  328.  
  /**
   * Returns a promise that resolves (with no value) after `duration` seconds.
   */
  const sleep = (duration) =>
    new Promise((resolve) => {
      const milliseconds = duration * 1000;
      setTimeout(() => {
        resolve();
      }, milliseconds);
    });
  334.  
  /**
   * Logs a warning, waits `waiting` seconds, then reloads the current page.
   *
   * @param {number} waiting - delay before the reload, in seconds
   * @param {string} message - reason shown in the console warning
   */
  async function reload(waiting = 60, message = "") {
    const notice = `%c${message}, reload ${waiting}s later`;
    console.warn(notice, printStyle);
    await sleep(waiting);
    location.reload();
  }
  340.  
  /**
   * Opens the browser's file picker and resolves with the user's selection.
   *
   * @param {string} accept - value for the input's `accept` attribute
   * @param {boolean} multiple - allow selecting several files
   * @returns {Promise<File|FileList>} the first selected file, or the whole
   *   FileList when `multiple` is truthy
   * @throws {Error} (as a rejection) when a click on the window is seen while
   *   the input is still empty, which is treated as "picker cancelled"
   */
  function readFile(accept = "", multiple = false) {
    const inputEl = document.createElement("input");
    inputEl.setAttribute("type", "file");
    inputEl.setAttribute("accept", accept);
    // `multiple` is a boolean attribute: its mere presence enables it, so
    // setAttribute("multiple", false) would still allow multi-selection.
    // Setting the property adds or removes the attribute correctly.
    inputEl.multiple = Boolean(multiple);

    return new Promise((resolve, reject) => {
      let armTimer;

      // First click on the page after the picker was opened: if nothing was
      // chosen, the picker is assumed to have been cancelled.
      const onWindowClick = () => {
        window.removeEventListener("click", onWindowClick, true);
        if (!inputEl.value) {
          inputEl.remove();
          // Message text means "the user cancelled the selection".
          reject(new Error("用户取消选择"));
        }
      };

      inputEl.addEventListener("change", () => {
        clearTimeout(armTimer);
        window.removeEventListener("click", onWindowClick, true);
        const picked = multiple ? inputEl.files : inputEl.files[0];
        // The temporary input is no longer needed; do not leave it in the page.
        inputEl.remove();
        resolve(picked);
      });

      document.body.append(inputEl);
      inputEl.click();

      // Arm the cancel detection slightly later so the click that opened the
      // picker is not mistaken for a cancellation.
      armTimer = setTimeout(() => {
        window.addEventListener("click", onWindowClick, true);
      }, 100);
    });
  }
  365.  
  /**
   * Adds a floating "Import" button (wrapper id "CRAWLER_ID") to the page.
   *
   * Clicking it asks for confirmation, lets the user pick a .json file and,
   * when the file holds an array whose items all have `doi` and `validator`,
   * stores it under the "tasks" key and reloads the page. Anything else
   * (unreadable JSON, wrong shape) shows an alert describing the format.
   */
  function AddImportBtn() {
    const btnWrapImport = document.createElement("div");
    btnWrapImport.id = "CRAWLER_ID";
    btnWrapImport.innerHTML = `<button style="padding: 4px 8px;position: fixed;bottom: 40%;right: 8px;border-radius: 4px;background-color: #224466;color: #fff;">Import</button>`;
    const importBtn = btnWrapImport.querySelector("button");
    importBtn.onclick = async () => {
      if (
        !window.confirm(
          "The data in browser will be clear up. Please make sure you have to do this !!!"
        )
      ) {
        return;
      }

      let file;
      try {
        file = await readFile(".json");
      } catch (error) {
        // The picker was cancelled: nothing to import.
        console.warn(error);
        return;
      }

      const formatHint =
        "Please upload json file like [{doi: string, validator: string, ...}]";
      let json;
      try {
        json = JSON.parse(await file.text());
      } catch (error) {
        // Not valid JSON: report it instead of failing silently.
        console.error(error);
        alert(formatHint);
        return;
      }

      const isTaskList =
        Array.isArray(json) &&
        json.every((item) => item?.doi && item?.validator);
      if (!isTaskList) {
        alert(formatHint);
        return;
      }

      // Wait until the tasks are stored; reloading earlier could interrupt
      // the asynchronous write and lose the import.
      await GM.setValue("tasks", json);
      location.reload();
    };
    document.body.appendChild(btnWrapImport);
  }
  403.  
  /**
   * Removes the import button wrapper (id "CRAWLER_ID") from the page, if present.
   */
  function removeImportBtn() {
    const wrapper = document.getElementById("CRAWLER_ID");
    if (!wrapper) {
      return;
    }
    const { parentElement: host } = wrapper;
    host.removeChild(wrapper);
  }
  410.  
  /**
   * Userscript menu command "Download": exports the stored task list as JSON.
   *
   * File name: `<YYYY>-<MM>-<DD>-<HHMMSS>-<total>-<processed>.json`, where
   * `processed` is the total minus the tasks still waiting (not downloaded,
   * never validated, and with a known validator).
   */
  GM_registerMenuCommand("Download", async () => {
    // Nothing may be stored yet; fall back to an empty list instead of
    // throwing on `undefined.filter`.
    const taskData = (await GM.getValue("tasks")) || [];
    const waitingTasks = taskData.filter(
      (task) =>
        !task.downloaded &&
        task.validated === undefined &&
        validators[task.validator]
    );

    // Zero-pad every component so names are unambiguous and sort correctly
    // (e.g. 01:23:04 and 12:03:04 no longer both become "1234").
    const pad = (value) => String(value).padStart(2, "0");
    const now = new Date();
    const datePart = `${now.getFullYear()}-${pad(now.getMonth() + 1)}-${pad(
      now.getDate()
    )}`;
    const timePart = `${pad(now.getHours())}${pad(now.getMinutes())}${pad(
      now.getSeconds()
    )}`;
    const processedCount = taskData.length - waitingTasks.length;

    downloadFile(
      JSON.stringify(taskData),
      `${datePart}-${timePart}-${taskData.length}-${processedCount}.json`
    );
  });
  429.  
  // CSS applied to the crawler's "%c" console messages.
  const printStyle = "color: blue;background-color: #ccc;font-size: 20px";

  /**
   * Runs one crawl step on the current page.
   *
   * Steps, in order:
   *  1. show the import button, wait, and inject the SingleFile library;
   *  2. load the stored tasks and pick those still waiting (not downloaded,
   *     never validated, with a known validator);
   *  3. report progress counters to the tracking endpoint (fire and forget);
   *  4. if nothing is waiting, reload later;
   *  5. mark the current task as blocked when a Cloudflare challenge form is
   *     on the page;
   *  6. if the page text does not contain the task's DOI, navigate to the DOI
   *     URL and stop;
   *  7. validate the page, download it (SingleFile output plus raw HTML) on
   *     success, and store the result on the task;
   *  8. navigate to the next waiting task, or reload later when none is left.
   *
   * NOTE(review): step 6 has no retry limit — a page that never contains the
   * DOI text is redirected to again on every run.
   */
  async function start() {
    console.log(new Date());
    AddImportBtn();
    await sleep(7);
    addScript(
      "https://cdn.jsdelivr.net/gh/gildas-lormeau/SingleFile-MV3/lib/single-file.js"
    );
    const taskData = await GM.getValue("tasks");
    const tasks = taskData || [];

    // find task which not downloaded and not validated before
    const waitingTasks = tasks.filter(
      (task) =>
        !task.downloaded &&
        task.validated === undefined &&
        validators[task.validator]
    );
    console.log(
      `%cTry to get tasks firstly(${waitingTasks.length} / ${tasks.length}):`,
      printStyle,
      tasks
    );

    // ---------------------------- Report progress -----------------------------------------------------

    let clientId = await GM.getValue("clientId");
    if (typeof clientId !== "string" || !clientId) {
      clientId = generateClientId();
      await GM.setValue("clientId", clientId);
    }
    const invalidatedTasks = tasks.filter((task) => task.validated === false);
    const doneTasks = tasks.filter((task) => task.downloaded);
    // Latest valid update time among the finished tasks; stays -Infinity when
    // nothing has been downloaded yet (or no task carries a usable time).
    const lastDoneTimestamp = doneTasks.reduce((latest, task) => {
      const time = new Date(task.updateTime).getTime();
      return Number.isFinite(time) && time > latest ? time : latest;
    }, -Infinity);
    // Avoid reporting "Invalid Date" when there is no finished task.
    const reportTip = Number.isFinite(lastDoneTimestamp)
      ? `Last download time: ${new Date(lastDoneTimestamp).toLocaleString()}`
      : "Last download time: N/A";
    try {
      // Not awaited on purpose: reporting must not delay or break the crawl.
      // Promise.resolve() also covers script managers whose
      // GM.xmlHttpRequest does not return a promise.
      Promise.resolve(
        GM.xmlHttpRequest({
          url: "https://crawler-hit.deno.dev/api/update",
          method: "POST",
          headers: { "Content-Type": "application/json" },
          data: JSON.stringify({
            account: clientId,
            invalidate_count: invalidatedTasks.length,
            done_count: doneTasks.length,
            queue_count: waitingTasks.length,
            tip: reportTip,
          }),
        })
      )
        .then((res) => {
          window.tts = res;
          console.log({ res });
        })
        .catch((error) => {
          console.error("Progress report failed", error);
        });
    } catch (error) {
      console.error("Progress report failed", error);
    }

    if (!waitingTasks.length) {
      await reload(90, "No tasks waiting");
      return;
    }

    // -------------------------- Detect Cloudflare challenge -------------------------------------------------------
    await sleep(10);
    const currentTask = waitingTasks[0];
    const doi = currentTask.doi.replace("https://doi.org/", "").toLowerCase();
    const validator = validators[currentTask.validator];
    if (document.getElementById("challenge-form")) {
      console.log(`%cCloudflare challenge! ${currentTask.doi}`, printStyle);
      await sleep(20);
      currentTask.validated = false;
      currentTask.cloudflareBlock = true;
    }

    // --------------------------- Page validate ------------------------------------------------------
    if (
      !currentTask.cloudflareBlock &&
      !document.body.textContent.toLowerCase().includes(doi)
    ) {
      console.log(
        `%cURL not match, will redirect to ${currentTask.doi} 5s later`,
        printStyle
      );
      await sleep(5);
      location.href = currentTask.doi;
      return;
    }

    // A validator that throws is treated as "page not valid" so that a single
    // unexpected page layout cannot stall the whole queue.
    let isValid = false;
    if (!currentTask.cloudflareBlock) {
      try {
        isValid = Boolean(validator(document));
      } catch (error) {
        console.error(error);
      }
    }

    if (isValid) {
      console.log(
        "%cValidate successfully! Downloading page...",
        printStyle,
        waitingTasks,
        tasks
      );
      removeImportBtn();
      // repair special page
      if (typeof documentFixer[currentTask.validator] === "function") {
        documentFixer[currentTask.validator](document);
      }
      try {
        const data = await singlefile.getPageData(DEFAULT_CONFIG);
        downloadFile(
          data.content,
          `${doi.replaceAll("/", "_")}.singlefile.html`
        );
        downloadFile(
          document.body.parentElement.outerHTML,
          `${doi.replaceAll("/", "_")}.html`
        );
        currentTask.downloaded = true;
        currentTask.validated = true;
        currentTask.updateTime = new Date().valueOf();
      } catch (error) {
        console.error(error);
        await reload(10, `singlefile error! ${currentTask.doi}`);
        return;
      }
    } else {
      console.log(`%cValidate failed! ${currentTask.doi}`, printStyle);
      currentTask.validated = false;
    }

    await GM.setValue("tasks", tasks);

    // --------------------------- Prepare next task ------------------------------------------------------
    const nextTask = waitingTasks[1];
    if (nextTask) {
      console.log(
        `%cStart next task 10s later...`,
        printStyle,
        nextTask.doi,
        tasks
      );
      await sleep(10);
      location.href = nextTask.doi;
    } else {
      await reload(60, "No tasks waiting");
    }
  }
  566.  
  567. start();
  568. })();