Crawler based on SingleFile

Automatically download a site as a single file

目前为 2023-12-21 提交的版本,查看 最新版本

  1. // ==UserScript==
  2. // @name Crawler base on SingleFile
  3. // @author Mark
  4. // @description Download site in single file automatically
  5. // @license MIT
  6. // @version 0.0.5
  7. // @match https://*/*
  8. // @run-at document-idle
  9. // @grant GM.setValue
  10. // @grant GM.getValue
  11. // @grant GM.xmlHttpRequest
  12. // @grant GM_registerMenuCommand
  13. // @grant unsafeWindow
  14. // @noframes
  15. // @namespace https://greasyfork.org/users/1106595
  16. // ==/UserScript==
  17.  
  // Configuration object handed to SingleFile when a page is captured (it is
  // the argument of `singlefile.getPageData(DEFAULT_CONFIG)` in `start`).
  // NOTE(review): the option names appear to mirror SingleFile's own option
  // set; their exact semantics are defined by that library, so check its
  // documentation before changing a value.

  // False when the user agent matches /Mobile.*Firefox/, true otherwise.
  // Only used below as the value of `backgroundSave`.
  const BACKGROUND_SAVE_SUPPORTED = !/Mobile.*Firefox/.test(navigator.userAgent);
  const DEFAULT_CONFIG = {
    // --- Clean-up / compression flags ---
    removeHiddenElements: true,
    removeUnusedStyles: true,
    removeUnusedFonts: true,
    removeFrames: false,
    compressHTML: true,
    compressCSS: false,
    // --- Deferred (lazy) image options ---
    loadDeferredImages: true,
    loadDeferredImagesMaxIdleTime: 1500,
    loadDeferredImagesBlockCookies: false,
    loadDeferredImagesBlockStorage: false,
    loadDeferredImagesKeepZoomLevel: false,
    loadDeferredImagesDispatchScrollEvent: false,
    loadDeferredImagesBeforeFrames: false,
    // --- File naming ---
    // NOTE(review): `start` builds its own file names from the DOI when it
    // calls `downloadFile`; whether SingleFile still reads these naming
    // options in this setup cannot be told from this file.
    filenameTemplate:
      "%if-empty<{page-title}|No title> ({date-locale} {time-locale}).{filename-extension}",
    infobarTemplate: "",
    includeInfobar: false,
    confirmInfobarContent: false,
    autoClose: false,
    confirmFilename: false,
    filenameConflictAction: "uniquify",
    filenameMaxLength: 192,
    filenameMaxLengthUnit: "bytes",
    filenameReplacedCharacters: [
      "~",
      "+",
      "\\\\",
      "?",
      "%",
      "*",
      ":",
      "|",
      '"',
      "<",
      ">",
      "\x00-\x1f",
      "\x7F",
    ],
    filenameReplacementCharacter: "_",
    replaceEmojisInFilename: false,
    saveFilenameTemplateData: false,
    // --- UI-related flags ---
    contextMenuEnabled: true,
    tabMenuEnabled: true,
    browserActionMenuEnabled: true,
    shadowEnabled: true,
    logsEnabled: true,
    progressBarEnabled: true,
    maxResourceSizeEnabled: false,
    maxResourceSize: 10,
    displayInfobar: true,
    displayStats: false,
    backgroundSave: BACKGROUND_SAVE_SUPPORTED,
    defaultEditorMode: "normal",
    applySystemTheme: true,
    // --- Auto-save options ---
    autoSaveDelay: 1,
    autoSaveLoad: false,
    autoSaveUnload: false,
    autoSaveLoadOrUnload: true,
    autoSaveDiscard: false,
    autoSaveRemove: false,
    autoSaveRepeat: false,
    autoSaveRepeatDelay: 10,
    // --- Alternative / duplicate resource handling ---
    removeAlternativeFonts: true,
    removeAlternativeMedias: true,
    removeAlternativeImages: true,
    groupDuplicateImages: true,
    maxSizeDuplicateImages: 512 * 1024,
    saveRawPage: false,
    saveToClipboard: false,
    addProof: false,
    // --- Remote save targets: every one of them is switched off ---
    saveToGDrive: false,
    saveToDropbox: false,
    saveWithWebDAV: false,
    webDAVURL: "",
    webDAVUser: "",
    webDAVPassword: "",
    saveToGitHub: false,
    githubToken: "",
    githubUser: "",
    githubRepository: "SingleFile-Archives",
    githubBranch: "main",
    saveWithCompanion: false,
    forceWebAuthFlow: false,
    resolveFragmentIdentifierURLs: false,
    userScriptEnabled: false,
    openEditor: false,
    openSavedPage: false,
    autoOpenEditor: false,
    // --- Bookmark integration ---
    saveCreatedBookmarks: false,
    allowedBookmarkFolders: [],
    ignoredBookmarkFolders: [],
    replaceBookmarkURL: true,
    // --- Output document options ---
    saveFavicon: true,
    includeBOM: false,
    warnUnsavedPage: true,
    displayInfobarInEditor: false,
    compressContent: false,
    createRootDirectory: false,
    selfExtractingArchive: true,
    extractDataFromPage: true,
    preventAppendedData: false,
    insertTextBody: false,
    autoSaveExternalSave: false,
    insertMetaNoIndex: false,
    insertMetaCSP: true,
    passReferrerOnError: false,
    password: "",
    insertSingleFileComment: true,
    removeSavedDate: false,
    blockMixedContent: false,
    saveOriginalURLs: false,
    // Accept header strings, keyed by resource kind.
    acceptHeaders: {
      font: "application/font-woff2;q=1.0,application/font-woff;q=0.9,*/*;q=0.8",
      image: "image/avif,image/webp,image/apng,image/svg+xml,image/*,*/*;q=0.8",
      stylesheet: "text/css,*/*;q=0.1",
      script: "*/*",
      document: "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
      video:
        "video/webm,video/ogg,video/*;q=0.9,application/ogg;q=0.7,audio/*;q=0.6,*/*;q=0.5",
      audio:
        "audio/webm,audio/ogg,audio/wav,audio/*;q=0.9,application/ogg;q=0.7,video/*;q=0.6,*/*;q=0.5",
    },
    moveStylesInHead: false,
    networkTimeout: 0,
    woleetKey: "",
    // --- Resource blocking: the script/video/audio flags are on, the
    // image/stylesheet/font flags are off ---
    blockImages: false,
    blockStylesheets: false,
    blockFonts: false,
    blockScripts: true,
    blockVideos: true,
    blockAudios: true,
    _migratedTemplateFormat: true,
  };
  154.  
  // Page validators, one per publisher.
  //
  // Each entry receives a Document and returns `true` only when the page shows
  // both an abstract and at least one body paragraph, i.e. when the full text
  // (not a paywall or an error page) appears to be loaded. A task selects its
  // validator through `task.validator`, which is used as the key here.
  // NOTE(review): the numeric keys look like DOI registrant prefixes
  // (10.<key>/...) — confirm against the task data.

  /**
   * @param {Document} doc
   * @param {string} selector
   * @returns {boolean} true when `selector` matches at least one element.
   */
  const hasMatch = (doc, selector) => Boolean(doc.querySelector(selector));

  /**
   * @param {Document} doc
   * @param {string} selector
   * @returns {boolean} true when `querySelectorAll(selector)` is non-empty.
   */
  const hasAnyMatch = (doc, selector) =>
    doc.querySelectorAll(selector).length > 0;

  // Body-paragraph selector shared by validators 1088 and 1149, which check
  // the same page structure.
  const NESTED_ARTICLE_TEXT_SELECTOR = `:where(${[
    'div[itemprop="articleBody"] > p',
    'div[itemprop="articleBody"] > .article-text > p',
    'div[itemprop="articleBody"] > .article-text > .article-text > p',
    'div[itemprop="articleBody"] > .article-text > .article-text > .article-text > p',
  ].join(", ")})`;

  /**
   * Validator shared by 1088 and 1149.
   * @param {Document} doc
   * @returns {boolean}
   */
  const hasAbstractAndNestedArticleText = (doc) =>
    hasAnyMatch(doc, ".wd-jnl-art-abstract > p") &&
    hasAnyMatch(doc, NESTED_ARTICLE_TEXT_SELECTOR);

  const validators = {
    1002: (doc) =>
      hasMatch(
        doc,
        ".article__body .abstract-group .article-section__abstract .article-section__content"
      ) &&
      hasAnyMatch(
        doc,
        ".article__body .article-section__full :where(.article-section__content > p, .article-section__sub-content > p)"
      ),
    // Two abstract layouts and three body layouts are accepted.
    1016: (doc) =>
      (hasMatch(doc, "div.abstract.author > div") ||
        hasMatch(doc, '[data-left-hand-nav="Summary"]')) &&
      (hasAnyMatch(
        doc,
        "div#body > div:first-child > section[id^=s] p[id^=p]"
      ) ||
        hasAnyMatch(
          doc,
          "div#body > div:first-child :where(section[id^=aep-section] > p, section[id^=aep-section] div > p)"
        ) ||
        hasAnyMatch(doc, "[id^='sec'] .section-paragraph")),
    3390: (doc) =>
      hasMatch(doc, "#html-abstract .html-p") &&
      hasAnyMatch(doc, "article .html-body .html-p"),
    1039: (doc) =>
      hasMatch(doc, "article .capsule__text") &&
      hasAnyMatch(doc, "#pnlArticleContentLoaded > p"),
    1021: (doc) =>
      hasMatch(doc, "p.articleBody_abstractText") &&
      (hasAnyMatch(doc, "div.NLM_p") ||
        hasAnyMatch(doc, ".article_content-left > p")),
    1038: (doc) =>
      hasMatch(doc, "#Abs1-content") &&
      hasAnyMatch(doc, "article .main-content .c-article-section__content > p"),
    1007: (doc) =>
      hasAnyMatch(doc, "#Abs1-content p") &&
      hasAnyMatch(doc, ".main-content .c-article-section__content > p"),
    1088: hasAbstractAndNestedArticleText,
    1063: (doc) =>
      hasAnyMatch(doc, "#ContentTab .abstract p") &&
      hasAnyMatch(doc, "#ContentTab .article-section-wrapper > p"),
    1149: hasAbstractAndNestedArticleText,
  };
  214.  
  /**
   * Copy `data-src` into `src` for every `main figure img` whose `data-src`
   * starts with "http".
   * @param {Document} document - document to patch in place.
   */
  const copyDataSrcToSrc = (document) => {
    const lazyImages = Array.from(
      document.querySelectorAll('main figure img[data-src^="http"]')
    );
    for (const image of lazyImages) {
      image.src = image.dataset.src;
    }
  };

  // Per-validator page repairs applied right before a page is captured.
  // Keys are the same identifiers used by `validators`.
  const documentFixer = {
    1088: copyDataSrcToSrc,
    1149: copyDataSrcToSrc,
  };
  233.  
  /**
   * Append a <script> element pointing at `url` to the end of <body>.
   * @param {string} url - address of the script to load.
   */
  const addScript = (url) => {
    const scriptEl = Object.assign(document.createElement("script"), {
      src: url,
    });
    document.body.append(scriptEl);
  };
  239.  
  /**
   * Build a short pseudo-random identifier: a random number in [0, 1e6)
   * rendered in base 32 with the first "." removed.
   * @returns {string}
   */
  const generateClientId = () => {
    const scaled = Math.random() * 1e6;
    const base32 = scaled.toString(32);
    return base32.replace(".", "");
  };
  242. // main function
  243. (function () {
  244. "use strict";
  245.  
  // Inject the SingleFile helper scripts into the page, in this order.
  for (const scriptUrl of [
    "https://cdn.jsdelivr.net/gh/gildas-lormeau/SingleFile-MV3/lib/single-file-bootstrap.js",
    "https://cdn.jsdelivr.net/gh/gildas-lormeau/SingleFile-MV3/lib/single-file-hooks-frames.js",
    "https://cdn.jsdelivr.net/gh/gildas-lormeau/SingleFile-MV3/lib/single-file-frames.js",
  ]) {
    addScript(scriptUrl);
  }
  // Overwrite the page's fetch function to bypass CORS.
  //
  // The "fetch-url2.deno.dev" proxy code is as follows:
  //
  //   serve((req: Request) => handleRequest(req));
  //
  //   async function handleRequest(req: Request) {
  //     const url = req.url;
  //     const finalUrl = url && url.split("?url=")[1];
  //     if (!finalUrl) {
  //       return new Response(url + " no match '?url='");
  //     }
  //     const res = await fetch(finalUrl);
  //     return new Response(res.body, {
  //       headers: {
  //         ...res.headers,
  //         "Access-Control-Allow-Origin": "*",
  //         "Access-Control-Expose-Headers":
  //           "Request-Context,api-supported-versions,Content-Length,Date,Server",
  //       },
  //     });
  //   }
  //
  // Behaviour of the replacement:
  //   - Calls with zero or one argument are forwarded unchanged (no fallback).
  //   - Calls with two or more arguments are tried directly first; if that
  //     rejects, the same request is retried through the proxy.
  //   - If the failing request was already addressed to the proxy, or the URL
  //     cannot be resolved, the ORIGINAL error is rethrown so callers see a
  //     rejection instead of an `undefined` response.
  window.unsafeWindow.fetch = async (...args) => {
    const PROXY_URL = "https://fetch-url2.deno.dev";

    if (args.length <= 1) {
      return fetch(...args);
    }

    const [resource, ...otherArgs] = args;
    try {
      return await fetch(...args);
    } catch (err) {
      // `resource` may be a string, a URL object or a Request; only strings
      // have `startsWith`/`trim`, so normalise to a string first.
      const requestedUrl =
        typeof resource === "string"
          ? resource
          : String(resource?.url ?? resource);

      if (requestedUrl.startsWith(PROXY_URL)) {
        throw err;
      }

      let absoluteUrl;
      try {
        // Resolves root-relative AND document-relative URLs against the page.
        absoluteUrl = new URL(requestedUrl.trim(), location.href).href;
      } catch {
        throw err;
      }

      // The target is appended verbatim (not percent-encoded) because the
      // proxy extracts it with `url.split("?url=")[1]`.
      return fetch(`${PROXY_URL}?url=${absoluteUrl}`, ...otherArgs);
    }
  };
  295.  
  /**
   * Trigger a browser download of `data` under the name `fileName`.
   *
   * The data is wrapped in an "application/octet-stream" Blob and downloaded
   * through a temporary hidden <a download> element. The anchor is removed and
   * the object URL revoked afterwards, so repeated downloads do not pile up
   * hidden anchors in <body> (which would otherwise end up in later
   * `outerHTML` snapshots of the page).
   *
   * @param {BlobPart} data - content of the file.
   * @param {string} fileName - suggested file name.
   */
  const downloadFile = (data, fileName) => {
    const blob = new Blob([data], {
      type: "application/octet-stream",
    });
    const url = window.URL.createObjectURL(blob);
    const a = document.createElement("a");
    a.style.display = "none";
    a.href = url;
    a.download = fileName;
    document.body.appendChild(a);
    try {
      a.click();
    } finally {
      a.remove();
      window.URL.revokeObjectURL(url);
    }
  };
  309.  
  /**
   * Wait for a number of seconds.
   * @param {number} duration - delay in seconds (converted to milliseconds).
   * @returns {Promise<void>} resolves once the timer fires; never rejects.
   */
  const sleep = (duration) =>
    new Promise((resolve) => {
      const delayMs = duration * 1000;
      setTimeout(() => resolve(), delayMs);
    });
  315.  
  /**
   * Log a styled warning, wait, then reload the current page.
   * @param {number} [waiting=60] - seconds to wait before reloading.
   * @param {string} [message=""] - text placed at the start of the warning.
   */
  async function reload(waiting = 60, message = "") {
    const notice = `%c${message}, reload ${waiting}s later`;
    console.warn(notice, printStyle);
    await sleep(waiting);
    location.reload();
  }
  321.  
  /**
   * Open the browser's file picker and resolve with the user's selection.
   *
   * A temporary <input type="file"> is appended to <body>, clicked, and
   * removed again once the promise settles.
   *
   * Cancellation is detected heuristically: 100 ms after the picker is opened
   * a capturing click listener is installed on `window`; the first click that
   * arrives while the input still has no value is treated as "picker
   * dismissed" and rejects the promise.
   *
   * @param {string} [accept=""] - value for the input's `accept` attribute.
   * @param {boolean} [multiple=false] - allow selecting several files.
   * @returns {Promise<File|FileList>} the first file, or the whole FileList
   *   when `multiple` is truthy.
   */
  function readFile(accept = "", multiple = false) {
    const inputEl = document.createElement("input");
    inputEl.setAttribute("type", "file");
    inputEl.setAttribute("accept", accept);
    // `multiple` is a boolean attribute: its mere presence enables it, so
    // setAttribute("multiple", false) would still allow multi-select. Setting
    // the property adds or omits the attribute correctly.
    inputEl.multiple = Boolean(multiple);

    return new Promise((resolve, reject) => {
      let listenerTimer;

      const onWindowClick = () => {
        window.removeEventListener("click", onWindowClick, true);
        if (!inputEl.value) {
          inputEl.remove();
          reject(new Error("用户取消选择"));
        }
      };

      inputEl.addEventListener("change", () => {
        // A selection was made: stop the cancel detection and clean up.
        clearTimeout(listenerTimer);
        window.removeEventListener("click", onWindowClick, true);
        inputEl.remove();
        resolve(multiple ? inputEl.files : inputEl.files[0]);
      });

      document.body.append(inputEl);
      inputEl.click();

      // Delayed so that the click that opened the picker is not itself
      // mistaken for a cancellation.
      listenerTimer = setTimeout(() => {
        window.addEventListener("click", onWindowClick, true);
      }, 100);
    });
  }
  346.  
  /**
   * Add a fixed-position "Import" button (wrapped in a div with id
   * "CRAWLER_ID") to the page.
   *
   * Clicking it asks for confirmation, lets the user pick a .json file and,
   * when the file holds an array whose items all have truthy `doi` and
   * `validator` fields, stores it under the "tasks" key and reloads the page.
   * Anything else triggers an alert describing the expected format.
   */
  function AddImportBtn() {
    const btnWrapImport = document.createElement("div");
    btnWrapImport.id = "CRAWLER_ID";
    btnWrapImport.innerHTML = `<button style="padding: 4px 8px;position: fixed;bottom: 40%;right: 8px;border-radius: 4px;background-color: #224466;color: #fff;">Import</button>`;
    const importBtn = btnWrapImport.querySelector("button");
    importBtn.onclick = async () => {
      if (
        !window.confirm(
          "The data in browser will be clear up. Please make sure you have to do this !!!"
        )
      ) {
        return;
      }

      let file;
      try {
        file = await readFile(".json");
      } catch (error) {
        // readFile rejects when the picker is dismissed: nothing to import.
        console.warn(error);
        return;
      }

      let json;
      try {
        json = JSON.parse(await file.text());
      } catch (error) {
        // Unreadable file or invalid JSON: report it like any other bad input.
        console.error(error);
        json = undefined;
      }

      const isTaskList =
        Array.isArray(json) &&
        json.every((item) => item?.doi && item?.validator);
      if (!isTaskList) {
        alert(
          "Please upload json file like [{doi: string, validator: string, ...}]"
        );
        return;
      }

      // Wait for the write to finish; reloading while it is still pending can
      // lose the imported list.
      await GM.setValue("tasks", json);
      location.reload();
    };
    document.body.appendChild(btnWrapImport);
  }
  384.  
  /**
   * Detach the element with id "CRAWLER_ID" (the Import button wrapper) from
   * its parent, if it is present in the document.
   */
  function removeImportBtn() {
    const wrapper = document.getElementById("CRAWLER_ID");
    if (!wrapper) {
      return;
    }
    wrapper.parentElement.removeChild(wrapper);
  }
  391.  
  // "Download" menu command: export the stored task list as a JSON file.
  // File name: <year>-<month>-<day>-<h><m><s>-<total>-<processed>.json, where
  // "processed" is the number of tasks that are no longer waiting.
  GM_registerMenuCommand("Download", async () => {
    // Nothing is stored before the first import; fall back to an empty list
    // (same guard as in `start`) instead of crashing on `.filter`.
    const taskData = (await GM.getValue("tasks")) || [];
    const waitingTasks = taskData.filter(
      (task) =>
        !task.downloaded &&
        task.validated === undefined &&
        validators[task.validator]
    );
    const now = new Date();
    downloadFile(
      JSON.stringify(taskData),
      `${now.getFullYear()}-${
        now.getMonth() + 1
      }-${now.getDate()}-${now.getHours()}${now.getMinutes()}${now.getSeconds()}-${
        taskData.length
      }-${taskData.length - waitingTasks.length}.json`
    );
  });
  410.  
  // CSS applied to the crawler's `%c`-styled console messages.
  const printStyle = "color: blue;background-color: #ccc;font-size: 20px";

  /**
   * Return the identifier stored under "clientId", creating and storing a new
   * one when none (or a non-string / empty value) is stored.
   * @returns {Promise<string>}
   */
  async function getClientId() {
    let clientId = await GM.getValue("clientId");
    if (typeof clientId !== "string" || !clientId) {
      clientId = generateClientId();
      await GM.setValue("clientId", clientId);
    }
    return clientId;
  }

  /**
   * POST queue statistics to the progress endpoint. Best effort: any failure
   * is logged and swallowed so that reporting can never stop the crawl.
   * @param {string} clientId - value sent as `account`.
   * @param {Array<object>} tasks - all stored tasks.
   * @param {Array<object>} waitingTasks - tasks still to be processed.
   * @returns {Promise<void>} never rejects.
   */
  async function sendProgressReport(clientId, tasks, waitingTasks) {
    try {
      const invalidatedTasks = tasks.filter((task) => task.validated === false);
      // Most recently updated first.
      const doneTasks = tasks
        .filter((task) => task.downloaded)
        .sort((a, b) => (a.updateTime > b.updateTime ? -1 : 1));
      // Avoid reporting "Invalid Date" when nothing has been downloaded yet.
      const lastDone =
        doneTasks.length > 0
          ? new Date(doneTasks[0].updateTime).toLocaleString()
          : "never";
      const reportTip = `Last download time: ${lastDone}`;
      // `await` copes with managers whose GM.xmlHttpRequest returns a promise
      // as well as with those that return a non-promise value.
      const res = await GM.xmlHttpRequest({
        url: "https://crawler-hit.deno.dev/api/update",
        method: "POST",
        headers: { "Content-Type": "application/json" },
        data: JSON.stringify({
          account: clientId,
          invalidate_count: invalidatedTasks.length,
          done_count: doneTasks.length,
          queue_count: waitingTasks.length,
          tip: reportTip,
        }),
      });
      console.log({ res });
    } catch (error) {
      console.warn("Progress report failed", error);
    }
  }

  /**
   * Main crawl step, executed once per page load.
   *
   * 1. Shows the Import button, waits, injects the SingleFile core script.
   * 2. Loads the task list and picks the tasks that are neither downloaded
   *    nor validated and whose validator is known.
   * 3. Reports progress (fire-and-forget).
   * 4. With no waiting task: reloads after 90 s.
   * 5. Otherwise handles the first waiting task:
   *    - a Cloudflare challenge form marks the task as invalid;
   *    - if the page text does not contain the task's DOI, navigates to the
   *      DOI URL and stops (the next page load continues from there);
   *    - if the validator accepts the page, saves the SingleFile capture and
   *      the raw HTML and marks the task as downloaded;
   *    - otherwise marks the task as invalid.
   * 6. Persists the task list and navigates to the next waiting task, or
   *    reloads after 60 s when there is none.
   *
   * NOTE(review): when a landing page never contains the DOI text, step 5
   * redirects to the DOI URL on every load — there is no retry limit.
   */
  async function start() {
    console.log(new Date());
    AddImportBtn();
    await sleep(7);
    addScript(
      "https://cdn.jsdelivr.net/gh/gildas-lormeau/SingleFile-MV3/lib/single-file.js"
    );
    const taskData = await GM.getValue("tasks");
    const tasks = taskData || [];

    // find task which not downloaded and not validated before
    const waitingTasks = tasks.filter(
      (task) =>
        !task.downloaded &&
        task.validated === undefined &&
        validators[task.validator]
    );
    console.log(
      `%cTry to get tasks firstly(${waitingTasks.length} / ${tasks.length}):`,
      printStyle,
      tasks
    );

    // ---------------------------- Report progress -----------------------------------------------------
    const clientId = await getClientId();
    // Deliberately not awaited: sendProgressReport handles its own errors.
    void sendProgressReport(clientId, tasks, waitingTasks);

    if (!waitingTasks.length) {
      await reload(90, "No tasks waiting");
      return;
    }

    // -------------------------- Detect Cloudflare challenge -------------------------------------------------------
    await sleep(10);
    const currentTask = waitingTasks[0];
    const doi = currentTask.doi.replace("https://doi.org/", "").toLowerCase();
    const validator = validators[currentTask.validator];
    if (document.getElementById("challenge-form")) {
      console.log(`%cCloudflare challenge! ${currentTask.doi}`, printStyle);
      await sleep(20);
      currentTask.validated = false;
      currentTask.cloudflareBlock = true;
    }

    // --------------------------- Page validate ------------------------------------------------------
    if (
      !currentTask.cloudflareBlock &&
      !document.body.textContent.toLowerCase().includes(doi)
    ) {
      console.log(
        `%cURL not match, will redirect to ${currentTask.doi} 5s later`,
        printStyle
      );
      await sleep(5);
      location.href = currentTask.doi;
      return;
    }
    if (!currentTask.cloudflareBlock && validator(document)) {
      console.log(
        "%cValidate successfully! Downloading page...",
        printStyle,
        waitingTasks,
        tasks
      );
      removeImportBtn();
      // repair special page
      if (typeof documentFixer[currentTask.validator] === "function") {
        documentFixer[currentTask.validator](document);
      }
      try {
        const data = await singlefile.getPageData(DEFAULT_CONFIG);
        const fileStem = doi.replaceAll("/", "_");
        downloadFile(data.content, `${fileStem}.singlefile.html`);
        downloadFile(
          document.body.parentElement.outerHTML,
          `${fileStem}.html`
        );
        currentTask.downloaded = true;
        currentTask.validated = true;
        currentTask.updateTime = new Date().valueOf();
      } catch (error) {
        // Nothing is persisted here, so the same task is retried after reload.
        console.error(error);
        await reload(10, `singlefile error! ${currentTask.doi}`);
        return;
      }
    } else {
      console.log(`%cValidate failed! ${currentTask.doi}`, printStyle);
      currentTask.validated = false;
    }

    // `currentTask` is an element of `tasks`, so the flags set above are saved.
    await GM.setValue("tasks", tasks);

    // --------------------------- Prepare next task ------------------------------------------------------
    const nextTask = waitingTasks[1];
    if (nextTask) {
      console.log(
        `%cStart next task 10s later...`,
        printStyle,
        nextTask.doi,
        tasks
      );
      await sleep(10);
      location.href = nextTask.doi;
    } else {
      await reload(60, "No tasks waiting");
    }
  }

  // Do not leave the promise floating: surface unexpected failures.
  start().catch((error) => {
    console.error("Crawler step failed", error);
  });
  549. })();