Crawler base on SingleFile

Download site in single file automatically

当前为 2023-12-22 提交的版本,查看 最新版本

  1. // ==UserScript==
  2. // @name Crawler base on SingleFile
  3. // @author Mark
  4. // @description Download site in single file automatically
  5. // @license MIT
  6. // @version 0.0.8
  7. // @match https://*/*
  8. // @run-at document-idle
  9. // @grant GM.setValue
  10. // @grant GM.getValue
  11. // @grant GM.xmlHttpRequest
  12. // @grant GM_registerMenuCommand
  13. // @grant unsafeWindow
  14. // @noframes
  15. // @namespace https://greasyfork.org/users/1106595
  16. // ==/UserScript==
  17.  
  18. // config for singleFile
  19. const BACKGROUND_SAVE_SUPPORTED = !/Mobile.*Firefox/.test(navigator.userAgent);
  20. const DEFAULT_CONFIG = {
  21. removeHiddenElements: true,
  22. removeUnusedStyles: true,
  23. removeUnusedFonts: true,
  24. removeFrames: false,
  25. compressHTML: true,
  26. compressCSS: false,
  27. loadDeferredImages: true,
  28. loadDeferredImagesMaxIdleTime: 1500,
  29. loadDeferredImagesBlockCookies: false,
  30. loadDeferredImagesBlockStorage: false,
  31. loadDeferredImagesKeepZoomLevel: false,
  32. loadDeferredImagesDispatchScrollEvent: false,
  33. loadDeferredImagesBeforeFrames: false,
  34. filenameTemplate:
  35. "%if-empty<{page-title}|No title> ({date-locale} {time-locale}).{filename-extension}",
  36. infobarTemplate: "",
  37. includeInfobar: false,
  38. confirmInfobarContent: false,
  39. autoClose: false,
  40. confirmFilename: false,
  41. filenameConflictAction: "uniquify",
  42. filenameMaxLength: 192,
  43. filenameMaxLengthUnit: "bytes",
  44. filenameReplacedCharacters: [
  45. "~",
  46. "+",
  47. "\\\\",
  48. "?",
  49. "%",
  50. "*",
  51. ":",
  52. "|",
  53. '"',
  54. "<",
  55. ">",
  56. "\x00-\x1f",
  57. "\x7F",
  58. ],
  59. filenameReplacementCharacter: "_",
  60. replaceEmojisInFilename: false,
  61. saveFilenameTemplateData: false,
  62. contextMenuEnabled: true,
  63. tabMenuEnabled: true,
  64. browserActionMenuEnabled: true,
  65. shadowEnabled: true,
  66. logsEnabled: true,
  67. progressBarEnabled: true,
  68. maxResourceSizeEnabled: false,
  69. maxResourceSize: 10,
  70. displayInfobar: true,
  71. displayStats: false,
  72. backgroundSave: BACKGROUND_SAVE_SUPPORTED,
  73. defaultEditorMode: "normal",
  74. applySystemTheme: true,
  75. autoSaveDelay: 1,
  76. autoSaveLoad: false,
  77. autoSaveUnload: false,
  78. autoSaveLoadOrUnload: true,
  79. autoSaveDiscard: false,
  80. autoSaveRemove: false,
  81. autoSaveRepeat: false,
  82. autoSaveRepeatDelay: 10,
  83. removeAlternativeFonts: true,
  84. removeAlternativeMedias: true,
  85. removeAlternativeImages: true,
  86. groupDuplicateImages: true,
  87. maxSizeDuplicateImages: 512 * 1024,
  88. saveRawPage: false,
  89. saveToClipboard: false,
  90. addProof: false,
  91. saveToGDrive: false,
  92. saveToDropbox: false,
  93. saveWithWebDAV: false,
  94. webDAVURL: "",
  95. webDAVUser: "",
  96. webDAVPassword: "",
  97. saveToGitHub: false,
  98. githubToken: "",
  99. githubUser: "",
  100. githubRepository: "SingleFile-Archives",
  101. githubBranch: "main",
  102. saveWithCompanion: false,
  103. forceWebAuthFlow: false,
  104. resolveFragmentIdentifierURLs: false,
  105. userScriptEnabled: false,
  106. openEditor: false,
  107. openSavedPage: false,
  108. autoOpenEditor: false,
  109. saveCreatedBookmarks: false,
  110. allowedBookmarkFolders: [],
  111. ignoredBookmarkFolders: [],
  112. replaceBookmarkURL: true,
  113. saveFavicon: true,
  114. includeBOM: false,
  115. warnUnsavedPage: true,
  116. displayInfobarInEditor: false,
  117. compressContent: false,
  118. createRootDirectory: false,
  119. selfExtractingArchive: true,
  120. extractDataFromPage: true,
  121. preventAppendedData: false,
  122. insertTextBody: false,
  123. autoSaveExternalSave: false,
  124. insertMetaNoIndex: false,
  125. insertMetaCSP: true,
  126. passReferrerOnError: false,
  127. password: "",
  128. insertSingleFileComment: true,
  129. removeSavedDate: false,
  130. blockMixedContent: false,
  131. saveOriginalURLs: false,
  132. acceptHeaders: {
  133. font: "application/font-woff2;q=1.0,application/font-woff;q=0.9,*/*;q=0.8",
  134. image: "image/avif,image/webp,image/apng,image/svg+xml,image/*,*/*;q=0.8",
  135. stylesheet: "text/css,*/*;q=0.1",
  136. script: "*/*",
  137. document: "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
  138. video:
  139. "video/webm,video/ogg,video/*;q=0.9,application/ogg;q=0.7,audio/*;q=0.6,*/*;q=0.5",
  140. audio:
  141. "audio/webm,audio/ogg,audio/wav,audio/*;q=0.9,application/ogg;q=0.7,video/*;q=0.6,*/*;q=0.5",
  142. },
  143. moveStylesInHead: false,
  144. networkTimeout: 0,
  145. woleetKey: "",
  146. blockImages: false,
  147. blockStylesheets: false,
  148. blockFonts: false,
  149. blockScripts: true,
  150. blockVideos: true,
  151. blockAudios: true,
  152. _migratedTemplateFormat: true,
  153. };
  154.  
  155. // validator define for different press
  156. const validators = {
  157. 1002: (document) =>
  158. (document.querySelector(
  159. ".article__body .abstract-group .article-section__abstract .article-section__content"
  160. ) || document.querySelector("article .abstract-group")) &&
  161. document.querySelectorAll(
  162. ".article__body .article-section__full :where(.article-section__content > p, .article-section__sub-content > p)"
  163. ).length > 0,
  164. 1016: (document) =>
  165. (document.querySelector("div.abstract.author > div") ||
  166. document.querySelector('[data-left-hand-nav="Summary"]')) &&
  167. (document.querySelectorAll(
  168. "div#body > div:first-child > section[id^=s] p[id^=p]"
  169. ).length > 0 ||
  170. document.querySelectorAll(
  171. "div#body > div:first-child :where(section[id^=aep-section] > p, section[id^=aep-section] div > p)"
  172. ).length > 0 ||
  173. document.querySelectorAll("[id^='sec'] .section-paragraph").length > 0 ||
  174. document.querySelectorAll("div#body [id^='sec'] p[id^='par']").length > 0 ),
  175. 3390: (document) =>
  176. document.querySelector("#html-abstract .html-p") &&
  177. document.querySelectorAll("article .html-body .html-p").length > 0,
  178. 1039: (document) =>
  179. document.querySelector("article .capsule__text") &&
  180. document.querySelectorAll("#pnlArticleContentLoaded > p").length > 0,
  181. 1021: (document) =>
  182. (document.querySelector("p.articleBody_abstractText") || document.querySelector("#specialIssueNotice") || document.querySelector('meta[name="dc.Type"]').content === 'review-article') &&
  183. (document.querySelectorAll("div.NLM_p").length > 0 ||
  184. document.querySelectorAll(".article_content-left > p").length > 0),
  185. 1038: (document) =>
  186. (document.querySelector("#Abs1-content") || document.querySelector('article [data-title="Abstract"]')) &&
  187. document.querySelectorAll(
  188. "article .main-content .c-article-section__content > p"
  189. ).length > 0,
  190. 1007: (document) =>
  191. document.querySelectorAll("#Abs1-content p").length > 0 &&
  192. document.querySelectorAll(".main-content .c-article-section__content > p")
  193. .length > 0,
  194. 1088: (document) =>
  195. document.querySelectorAll(".wd-jnl-art-abstract > p").length > 0 &&
  196. document.querySelectorAll(`:where(
  197. div[itemprop="articleBody"] > p,
  198. div[itemprop="articleBody"] > .article-text > p,
  199. div[itemprop="articleBody"] > .article-text > .article-text > p,
  200. div[itemprop="articleBody"] > .article-text > .article-text > .article-text > p)
  201. `).length > 0,
  202. 1063: (document) =>
  203. document.querySelectorAll("#ContentTab .abstract p").length > 0 &&
  204. document.querySelectorAll("#ContentTab .article-section-wrapper > p")
  205. .length > 0,
  206. 1126: (document) =>
  207. document.querySelectorAll('[role="doc-abstract"] > [role="paragraph"]')
  208. .length > 0 &&
  209. document.querySelectorAll(`#bodymatter [role="paragraph"]`).length > 0,
  210.  
  211. // Cancel "1155" because of CSP, need to fix "fetch-url2.deno.dev" fetch
  212. // const addScript = async (url) => {
  213. // const s = document.createElement("script");
  214. // const res = await GM.xmlHttpRequest({
  215. // url: url,
  216. // method: "GET",
  217. // });
  218.  
  219. // const text = res.responseText;
  220. // s.innerHTML = text;
  221. // document.body.append(s);
  222. // };
  223.  
  224. // 1155: (document) =>
  225. // document.querySelector(".articleBody #abstract") &&
  226. // document.querySelectorAll(".articleBody .xml-content > p:not(#abstract + p)").length > 0,
  227. 1074: (document) =>
  228. document.querySelector('.article__sections section:first-child:not(section[id^="cesec"])') &&
  229. document.querySelectorAll('.article__sections section[id^="cesec"] > .section-paragraph').length > 0,
  230. 3389: (document) =>
  231. document.querySelector('.JournalAbstract .authors+.notes+p') &&
  232. document.querySelectorAll('.article-container .JournalFullText > p').length > 0,
  233. 1186: (document) =>
  234. document.querySelector('[data-title="Abstract"] .c-article-section__content') &&
  235. document.querySelectorAll('main > article > section:not([data-title="Abstract"]):not(#MagazineFulltextArticleBodySuffix ~ section) .c-article-section__content > p').length > 0,
  236. 3762: (document) =>
  237. document.querySelector('#articleContent #abstract p') &&
  238. document.querySelectorAll('#articleContent .text-bs > p').length > 0,
  239. 1371: (document) =>
  240. document.querySelector('.article-content .abstract-content p') &&
  241. document.querySelectorAll('.article-content #artText div[id^="section"] > p').length > 0,
  242. };
  243.  
  244. validators["1006"] = validators["1016"];
  245. validators["1149"] = validators["1088"];
  246.  
  247. const documentFixer = {
  248. 1088: (document) => {
  249. const imgs = Array.from(
  250. document.querySelectorAll('main figure img[data-src^="http"]')
  251. );
  252. imgs.forEach((item) => {
  253. item.src = item.dataset.src;
  254. });
  255. },
  256. 3389: (document) => {
  257. const imgs = Array.from(
  258. document.querySelectorAll('.article-container .JournalFullText .FigureDesc img[data-src^="http"]')
  259. );
  260. imgs.forEach((item) => {
  261. item.src = item.dataset.src;
  262. });
  263. },
  264. };
  265. documentFixer["1149"] = documentFixer["1088"];
  266.  
  267. const addScript = (url) => {
  268. const s = document.createElement("script");
  269. s.src = url;
  270. document.body.append(s);
  271. };
  272.  
  273. const generateClientId = () =>
  274. (1e6 * Math.random()).toString(32).replace(".", "");
  275. // main function
  276. (function () {
  277. "use strict";
  278.  
  // Inject the SingleFile bootstrap / frame helper scripts, in this order.
  for (const fileName of [
    "single-file-bootstrap.js",
    "single-file-hooks-frames.js",
    "single-file-frames.js",
  ]) {
    addScript(
      `https://cdn.jsdelivr.net/gh/gildas-lormeau/SingleFile-MV3/lib/${fileName}`
    );
  }
  // Overwrite the page's fetch function to bypass CORS: when a request made
  // with two or more arguments fails, it is retried once through the
  // "fetch-url2.deno.dev" proxy.
  /** The "fetch-url2.deno.dev" code as follow
   *
   * serve((req: Request) => handleRequest(req));
   *
   * async function handleRequest(req: Request) {
   *   const url = req.url;
   *   const finalUrl = url && url.split("?url=")[1];
   *   if (!finalUrl) {
   *     return new Response(url + " no match '?url='");
   *   }
   *   const res = await fetch(finalUrl);
   *   return new Response(res.body, {
   *     headers: {
   *       ...res.headers,
   *       "Access-Control-Allow-Origin": "*",
   *       "Access-Control-Expose-Headers":
   *         "Request-Context,api-supported-versions,Content-Length,Date,Server",
   *     },
   *   });
   * }
   **/
  const CORS_PROXY_ORIGIN = "https://fetch-url2.deno.dev";

  /**
   * Replacement for the page's `fetch`.
   *
   * - Called with a single argument: forwarded unchanged, no proxy fallback.
   * - Called with more arguments: forwarded; on rejection the same request is
   *   retried through the proxy with the remaining arguments.
   *
   * @param {...*} args - the arguments of a regular `fetch` call
   * @returns {Promise<Response>} the direct response, or the proxied one
   * @throws the original error when the failing request already targeted the proxy
   */
  unsafeWindow.fetch = async (...args) => {
    if (args.length <= 1) {
      return fetch(...args);
    }
    const [resource, ...otherArgs] = args;
    try {
      return await fetch(...args);
    } catch (err) {
      // `resource` may be a string, a URL or a Request; normalise it to text.
      const rawUrl =
        typeof resource === "string"
          ? resource.trim()
          : String(resource?.url ?? resource);
      if (rawUrl.startsWith(CORS_PROXY_ORIGIN)) {
        // The proxy itself failed: surface the error instead of resolving
        // with `undefined`.
        throw err;
      }
      // Resolve relative and protocol-relative URLs against the current page.
      // The target is appended raw because the proxy splits on "?url=" and
      // does not decode it.
      const targetUrl = new URL(rawUrl, location.href).href;
      return fetch(`${CORS_PROXY_ORIGIN}?url=${targetUrl}`, ...otherArgs);
    }
  };
  328.  
  /**
   * Triggers a browser download of `data` under the name `fileName`.
   *
   * The data is wrapped in an "application/octet-stream" Blob and offered
   * through a temporary hidden <a download> element.
   *
   * @param {*} data - content passed to the Blob constructor
   * @param {string} fileName - suggested file name
   */
  const downloadFile = (data, fileName) => {
    const blob = new Blob([data], {
      type: "application/octet-stream",
    });
    const url = window.URL.createObjectURL(blob);
    const a = document.createElement("a");
    a.style.display = "none";
    a.href = url;
    a.download = fileName;
    document.body.appendChild(a);
    try {
      a.click();
    } finally {
      // Remove the helper anchor so it neither piles up in the DOM nor ends
      // up inside a page snapshot taken after this call.
      a.remove();
      window.URL.revokeObjectURL(url);
    }
  };
  342.  
  // Returns a promise that resolves (with no value) after `duration` seconds.
  const sleep = (duration) =>
    new Promise((resolve) => {
      const milliseconds = duration * 1000;
      setTimeout(() => {
        resolve();
      }, milliseconds);
    });
  348.  
  /**
   * Logs a styled warning, waits, then reloads the current page.
   *
   * @param {number} [waiting=60] - delay in seconds before the reload
   * @param {string} [message=""] - text placed at the start of the warning
   */
  async function reload(waiting = 60, message = "") {
    const notice = `%c${message}, reload ${waiting}s later`;
    console.warn(notice, printStyle);
    await sleep(waiting);
    location.reload();
  }
  354.  
  /**
   * Opens the browser's file picker and resolves with the chosen file(s).
   *
   * A temporary <input type="file"> is appended to the page and clicked. It
   * is removed again once the promise settles. Cancelling is detected
   * heuristically: the first click on the window while the input is still
   * empty rejects the promise.
   *
   * @param {string} [accept=""] - value for the input's `accept` attribute
   * @param {boolean} [multiple=false] - allow selecting several files
   * @returns {Promise<File|FileList>} the first file, or the whole FileList
   *   when `multiple` is truthy
   * @throws {Error} (as a rejection) when the selection is considered cancelled
   */
  function readFile(accept = "", multiple = false) {
    const inputEl = document.createElement("input");
    inputEl.setAttribute("type", "file");
    inputEl.setAttribute("accept", accept);
    // `multiple` is a boolean attribute: its mere presence enables
    // multi-select, so it must only be set when actually requested.
    if (multiple) {
      inputEl.setAttribute("multiple", "");
    }
    return new Promise((resolve, reject) => {
      let cancelWatchTimer;

      const stopCancelWatch = () => {
        clearTimeout(cancelWatchTimer);
        window.removeEventListener("click", onWindowClick, true);
      };

      const onWindowClick = () => {
        stopCancelWatch();
        if (!inputEl.value) {
          inputEl.remove();
          reject(new Error("用户取消选择"));
        }
      };

      inputEl.addEventListener("change", () => {
        stopCancelWatch();
        resolve(multiple ? inputEl.files : inputEl.files[0]);
        inputEl.remove();
      });
      document.body.append(inputEl);
      inputEl.click();

      // Start watching a little later so the click that opened the picker is
      // not mistaken for a cancellation.
      cancelWatchTimer = setTimeout(() => {
        window.addEventListener("click", onWindowClick, true);
      }, 100);
    });
  }
  379.  
  /**
   * Adds a fixed-position "Import" button to the page.
   *
   * Clicking it asks for confirmation, lets the user pick a JSON file and,
   * when the file holds an array whose items all have `doi` and `validator`,
   * stores that array under the "tasks" key and reloads the page. Anything
   * else results in an alert describing the expected format.
   */
  function AddImportBtn() {
    const btnWrapImport = document.createElement("div");
    btnWrapImport.id = "CRAWLER_ID";
    btnWrapImport.innerHTML = `<button style="padding: 4px 8px;position: fixed;bottom: 40%;right: 8px;border-radius: 4px;background-color: #224466;color: #fff;">Import</button>`;
    const importBtn = btnWrapImport.querySelector("button");
    importBtn.onclick = async () => {
      if (
        !window.confirm(
          "The data in browser will be clear up. Please make sure you have to do this !!!"
        )
      ) {
        return;
      }

      let file;
      try {
        file = await readFile(".json");
      } catch (error) {
        // readFile rejects when the picker is cancelled; nothing to import.
        console.warn(error);
        return;
      }

      let json = null;
      try {
        json = JSON.parse(await file.text());
      } catch (error) {
        // Unreadable or malformed JSON falls through to the format alert below.
        console.error(error);
      }

      if (
        Array.isArray(json) &&
        json.every((item) => item?.doi && item?.validator)
      ) {
        // Wait for the write to finish before reloading, otherwise the
        // reload can race the asynchronous storage update.
        await GM.setValue("tasks", json);
        location.reload();
      } else {
        alert(
          "Please upload json file like [{doi: string, validator: string, ...}]"
        );
      }
    };
    document.body.appendChild(btnWrapImport);
  }
  417.  
  // Detaches the element with id "CRAWLER_ID" from its parent, if it exists.
  function removeImportBtn() {
    const wrapper = document.getElementById("CRAWLER_ID");
    if (!wrapper) {
      return;
    }
    wrapper.parentElement.removeChild(wrapper);
  }
  424.  
  // Menu command "Download": saves the stored task list as a JSON file named
  // <year>-<month>-<day>-<hours><minutes><seconds>-<total>-<not waiting>.json
  GM_registerMenuCommand("Download", async () => {
    // Nothing may be stored yet; fall back to an empty list like start() does.
    const taskData = (await GM.getValue("tasks")) || [];
    const waitingTasks = taskData.filter(
      (task) =>
        !task.downloaded &&
        task.validated === undefined &&
        validators[task.validator]
    );
    const now = new Date();
    downloadFile(
      JSON.stringify(taskData),
      `${now.getFullYear()}-${
        now.getMonth() + 1
      }-${now.getDate()}-${now.getHours()}${now.getMinutes()}${now.getSeconds()}-${
        taskData.length
      }-${taskData.length - waitingTasks.length}.json`
    );
  });
  443.  
  // CSS used with "%c" for the crawler's console messages.
  const printStyle = "color: blue;background-color: #ccc;font-size: 20px";

  /**
   * Runs one crawl step on the current page.
   *
   * Steps, in order:
   *  1. show the Import button and inject single-file.js;
   *  2. load the stored tasks and pick those still waiting;
   *  3. post a progress report (fire-and-forget);
   *  4. with nothing waiting, reload after 90s;
   *  5. otherwise handle the first waiting task: detect a Cloudflare
   *     challenge, redirect to the task's DOI when the page text does not
   *     contain it, validate the page and download it on success;
   *  6. persist the task list and move on to the next waiting task, or
   *     reload after 60s when there is none.
   *
   * @returns {Promise<void>}
   */
  async function start() {
    console.log(new Date());
    AddImportBtn();
    await sleep(7);
    addScript(
      "https://cdn.jsdelivr.net/gh/gildas-lormeau/SingleFile-MV3/lib/single-file.js"
    );
    const tasks = (await GM.getValue("tasks")) || [];

    // find task which not downloaded and not validated before
    const waitingTasks = tasks.filter(
      (task) =>
        !task.downloaded &&
        task.validated === undefined &&
        validators[task.validator]
    );
    console.log(
      `%cTry to get tasks firstly(${waitingTasks.length} / ${tasks.length}):`,
      printStyle,
      tasks
    );

    // ---------------------------- Report progress -----------------------------------------------------

    let clientId = await GM.getValue("clientId");
    if (typeof clientId !== "string" || !clientId) {
      clientId = generateClientId();
      await GM.setValue("clientId", clientId);
    }
    const invalidatedTasks = tasks.filter((task) => task.validated === false);
    const doneTasks = tasks
      .filter((task) => task.downloaded)
      .sort((a, b) => (a.updateTime > b.updateTime ? -1 : 1));
    const previousDay = Date.now() - 24 * 3600 * 1000;
    const last24hDoneTasks = doneTasks.filter(
      (task) => task.updateTime > previousDay
    );
    // Without any finished task there is no timestamp to format; report
    // "N/A" rather than "Invalid Date".
    const lastUpdateTime = doneTasks[0]?.updateTime;
    const lastDoneText =
      lastUpdateTime == null ? "N/A" : new Date(lastUpdateTime).toLocaleString();
    const reportTip = `Last download time: ${lastDoneText}\nSpeed: ${last24hDoneTasks.length} / last 24h`;
    // Deliberately not awaited: reporting must never block or break crawling.
    Promise.resolve(
      GM.xmlHttpRequest({
        url: "https://crawler-hit.deno.dev/api/update",
        method: "POST",
        headers: { "Content-Type": "application/json" },
        data: JSON.stringify({
          account: clientId,
          invalidate_count: invalidatedTasks.length,
          done_count: doneTasks.length,
          queue_count: waitingTasks.length,
          tip: reportTip,
        }),
      })
    )
      .then((res) => {
        console.log({ res });
      })
      .catch((error) => {
        console.warn("Progress report failed", error);
      });

    if (!waitingTasks.length) {
      await reload(90, "No tasks waiting");
      return;
    }

    // -------------------------- Detect Cloudflare challenge -------------------------------------------------------
    await sleep(10);
    const currentTask = waitingTasks[0];
    const doi = currentTask.doi.replace("https://doi.org/", "").toLowerCase();
    const validator = validators[currentTask.validator];
    if (document.getElementById("challenge-form")) {
      console.log(`%cCloudflare challenge! ${currentTask.doi}`, printStyle);
      await sleep(20);
      currentTask.validated = false;
      currentTask.cloudflareBlock = true;
    }

    // --------------------------- Page validate ------------------------------------------------------
    if (
      !currentTask.cloudflareBlock &&
      !document.body.textContent.toLowerCase().includes(doi)
    ) {
      console.log(
        `%cURL not match, will redirect to ${currentTask.doi} 5s later`,
        printStyle
      );
      await sleep(5);
      location.href = currentTask.doi;
      return;
    }

    // A validator that throws is treated as a failed validation so the task
    // is still recorded and the crawler can move on.
    let isValid = false;
    if (!currentTask.cloudflareBlock) {
      try {
        isValid = Boolean(validator(document));
      } catch (error) {
        console.error(error);
      }
    }

    if (isValid) {
      console.log(
        "%cValidate successfully! Downloading page...",
        printStyle,
        waitingTasks,
        tasks
      );
      removeImportBtn();
      // repair special page
      if (typeof documentFixer[currentTask.validator] === "function") {
        documentFixer[currentTask.validator](document);
      }
      try {
        const data = await singlefile.getPageData(DEFAULT_CONFIG);
        const baseName = doi.replaceAll("/", "_");
        downloadFile(data.content, `${baseName}.singlefile.html`);
        downloadFile(
          document.body.parentElement.outerHTML,
          `${baseName}.html`
        );
        currentTask.downloaded = true;
        currentTask.validated = true;
        currentTask.updateTime = new Date().valueOf();
      } catch (error) {
        console.error(error);
        await reload(10, `singlefile error! ${currentTask.doi}`);
        return;
      }
    } else {
      console.log(`%cValidate failed! ${currentTask.doi}`, printStyle);
      currentTask.validated = false;
    }

    await GM.setValue("tasks", tasks);

    // --------------------------- Prepare next task ------------------------------------------------------
    const nextTask = waitingTasks[1];
    if (nextTask) {
      console.log(
        `%cStart next task 10s later...`,
        printStyle,
        nextTask.doi,
        tasks
      );
      await sleep(10);
      location.href = nextTask.doi;
    } else {
      await reload(60, "No tasks waiting");
    }
  }
  583.  
  // Kick off the crawl step; log a failure instead of leaving the returned
  // promise's rejection unhandled.
  start().catch((error) => {
    console.error(error);
  });
  585. })();