Crawler based on SingleFile

Automatically download a site as a single file

当前为 2023-12-20 提交的版本,查看 最新版本

  1. // ==UserScript==
  2. // @name Crawler base on SingleFile
  3. // @author Mark
  4. // @description Download site in single file automatically
  5. // @license MIT
  6. // @version 0.0.1
  7. // @match https://*/*
  8. // @run-at document-idle
  9. // @grant GM.setValue
  10. // @grant GM.getValue
  11. // @grant GM.xmlHttpRequest
  12. // @grant GM_registerMenuCommand
  13. // @grant unsafeWindow
  14. // @noframes
  15. // @namespace https://greasyfork.org/users/1106595
  16. // ==/UserScript==
  17.  
// Configuration for SingleFile.
// DEFAULT_CONFIG is the options object handed to singlefile.getPageData() when a page is saved.
// NOTE(review): the option names mirror the SingleFile extension's own option set; their exact
// meaning is defined by SingleFile, not by this script — confirm against the SingleFile docs.

// False when the user agent matches /Mobile.*Firefox/, true otherwise.
const BACKGROUND_SAVE_SUPPORTED = !/Mobile.*Firefox/.test(navigator.userAgent);
const DEFAULT_CONFIG = {
  removeHiddenElements: true,
  removeUnusedStyles: true,
  removeUnusedFonts: true,
  removeFrames: false,
  compressHTML: true,
  compressCSS: false,
  loadDeferredImages: true,
  loadDeferredImagesMaxIdleTime: 1500,
  loadDeferredImagesBlockCookies: false,
  loadDeferredImagesBlockStorage: false,
  loadDeferredImagesKeepZoomLevel: false,
  loadDeferredImagesDispatchScrollEvent: false,
  loadDeferredImagesBeforeFrames: false,
  filenameTemplate:
    "%if-empty<{page-title}|No title> ({date-locale} {time-locale}).{filename-extension}",
  infobarTemplate: "",
  includeInfobar: false,
  confirmInfobarContent: false,
  autoClose: false,
  confirmFilename: false,
  filenameConflictAction: "uniquify",
  filenameMaxLength: 192,
  filenameMaxLengthUnit: "bytes",
  // NOTE(review): entries such as "\x00-\x1f" look like character-class fragments rather than
  // single characters — presumably consumed by SingleFile when sanitizing file names; confirm.
  filenameReplacedCharacters: [
    "~",
    "+",
    "\\\\",
    "?",
    "%",
    "*",
    ":",
    "|",
    '"',
    "<",
    ">",
    "\x00-\x1f",
    "\x7F",
  ],
  filenameReplacementCharacter: "_",
  replaceEmojisInFilename: false,
  saveFilenameTemplateData: false,
  contextMenuEnabled: true,
  tabMenuEnabled: true,
  browserActionMenuEnabled: true,
  shadowEnabled: true,
  logsEnabled: true,
  progressBarEnabled: true,
  maxResourceSizeEnabled: false,
  maxResourceSize: 10,
  displayInfobar: true,
  displayStats: false,
  backgroundSave: BACKGROUND_SAVE_SUPPORTED,
  defaultEditorMode: "normal",
  applySystemTheme: true,
  autoSaveDelay: 1,
  autoSaveLoad: false,
  autoSaveUnload: false,
  autoSaveLoadOrUnload: true,
  autoSaveDiscard: false,
  autoSaveRemove: false,
  autoSaveRepeat: false,
  autoSaveRepeatDelay: 10,
  removeAlternativeFonts: true,
  removeAlternativeMedias: true,
  removeAlternativeImages: true,
  groupDuplicateImages: true,
  maxSizeDuplicateImages: 512 * 1024,
  saveRawPage: false,
  saveToClipboard: false,
  addProof: false,
  saveToGDrive: false,
  saveToDropbox: false,
  saveWithWebDAV: false,
  webDAVURL: "",
  webDAVUser: "",
  webDAVPassword: "",
  saveToGitHub: false,
  githubToken: "",
  githubUser: "",
  githubRepository: "SingleFile-Archives",
  githubBranch: "main",
  saveWithCompanion: false,
  forceWebAuthFlow: false,
  resolveFragmentIdentifierURLs: false,
  userScriptEnabled: false,
  openEditor: false,
  openSavedPage: false,
  autoOpenEditor: false,
  saveCreatedBookmarks: false,
  allowedBookmarkFolders: [],
  ignoredBookmarkFolders: [],
  replaceBookmarkURL: true,
  saveFavicon: true,
  includeBOM: false,
  warnUnsavedPage: true,
  displayInfobarInEditor: false,
  compressContent: false,
  createRootDirectory: false,
  selfExtractingArchive: true,
  extractDataFromPage: true,
  preventAppendedData: false,
  insertTextBody: false,
  autoSaveExternalSave: false,
  insertMetaNoIndex: false,
  insertMetaCSP: true,
  passReferrerOnError: false,
  password: "",
  insertSingleFileComment: true,
  removeSavedDate: false,
  blockMixedContent: false,
  saveOriginalURLs: false,
  // Accept header values, keyed by resource kind.
  acceptHeaders: {
    font: "application/font-woff2;q=1.0,application/font-woff;q=0.9,*/*;q=0.8",
    image: "image/avif,image/webp,image/apng,image/svg+xml,image/*,*/*;q=0.8",
    stylesheet: "text/css,*/*;q=0.1",
    script: "*/*",
    document: "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    video:
      "video/webm,video/ogg,video/*;q=0.9,application/ogg;q=0.7,audio/*;q=0.6,*/*;q=0.5",
    audio:
      "audio/webm,audio/ogg,audio/wav,audio/*;q=0.9,application/ogg;q=0.7,video/*;q=0.6,*/*;q=0.5",
  },
  moveStylesInHead: false,
  networkTimeout: 0,
  woleetKey: "",
  blockImages: false,
  blockStylesheets: false,
  blockFonts: false,
  blockScripts: true,
  blockVideos: true,
  blockAudios: true,
  _migratedTemplateFormat: true,
};
  154.  
  155. // validator define for different press
  156. const validators = {
  157. 1002: (document) =>
  158. document.querySelector(
  159. ".article__body .abstract-group .article-section__abstract .article-section__content"
  160. ) &&
  161. document.querySelectorAll(
  162. ".article__body .article-section__full :where(.article-section__content > p, .article-section__sub-content > p)"
  163. ).length > 0,
  164. 1016: (document) =>
  165. (document.querySelector("div.abstract.author > div") ||
  166. document.querySelector('[data-left-hand-nav="Summary"]')) &&
  167. (document.querySelectorAll(
  168. "div#body > div:first-child > section[id^=s] p[id^=p]"
  169. ).length > 0 ||
  170. document.querySelectorAll(
  171. "div#body > div:first-child :where(section[id^=aep-section] > p, section[id^=aep-section] div > p)"
  172. ).length > 0 ||
  173. document.querySelectorAll("[id^='sec'] .section-paragraph").length > 0),
  174. 3390: (document) =>
  175. document.querySelector("#html-abstract .html-p") &&
  176. document.querySelectorAll("article .html-body .html-p").length > 0,
  177. 1039: (document) =>
  178. document.querySelector("article .capsule__text") &&
  179. document.querySelectorAll("#pnlArticleContentLoaded > p").length > 0,
  180. 1021: (document) =>
  181. document.querySelector("p.articleBody_abstractText") &&
  182. (document.querySelectorAll("div.NLM_p").length > 0 ||
  183. document.querySelectorAll(".article_content-left > p").length > 0),
  184. 1038: (document) =>
  185. document.querySelector("#Abs1-content") &&
  186. document.querySelectorAll(
  187. "article .main-content .c-article-section__content > p"
  188. ).length > 0,
  189. 1007: (document) =>
  190. document.querySelectorAll("#Abs1-content p").length > 0 &&
  191. document.querySelectorAll(".main-content .c-article-section__content > p")
  192. .length > 0,
  193. 1088: (document) =>
  194. document.querySelectorAll(".wd-jnl-art-abstract > p").length > 0 &&
  195. document.querySelectorAll(`:where(
  196. div[itemprop="articleBody"] > p,
  197. div[itemprop="articleBody"] > .article-text > p,
  198. div[itemprop="articleBody"] > .article-text > .article-text > p,
  199. div[itemprop="articleBody"] > .article-text > .article-text > .article-text > p)
  200. `).length > 0,
  201. 1063: (document) =>
  202. document.querySelectorAll("#ContentTab .abstract p").length > 0 &&
  203. document.querySelectorAll("#ContentTab .article-section-wrapper > p")
  204. .length > 0,
  205. 1149: (document) =>
  206. document.querySelectorAll(".wd-jnl-art-abstract > p").length > 0 &&
  207. document.querySelectorAll(`:where(
  208. div[itemprop="articleBody"] > p,
  209. div[itemprop="articleBody"] > .article-text > p,
  210. div[itemprop="articleBody"] > .article-text > .article-text > p,
  211. div[itemprop="articleBody"] > .article-text > .article-text > .article-text > p)
  212. `).length > 0,
  213. };
  214.  
  215. const addScript = (url) => {
  216. const s = document.createElement("script");
  217. s.src = url;
  218. document.body.append(s);
  219. };
  220.  
  221. // main function
  222. (function () {
  223. "use strict";
  224.  
  225. addScript(
  226. "https://cdn.jsdelivr.net/gh/gildas-lormeau/SingleFile-MV3/lib/single-file-bootstrap.js"
  227. );
  228. addScript(
  229. "https://cdn.jsdelivr.net/gh/gildas-lormeau/SingleFile-MV3/lib/single-file-hooks-frames.js"
  230. );
  231. addScript(
  232. "https://cdn.jsdelivr.net/gh/gildas-lormeau/SingleFile-MV3/lib/single-file-frames.js"
  233. );
  234. // Overwrite fetch function to bypass CORS
  235. /** The "fetch-url2.deno.dev" code as follow
  236. *
  237. serve((req: Request) => handleRequest(req));
  238.  
  239. async function handleRequest(req: Request) {
  240. const url = req.url;
  241. const finalUrl = url && url.split("?url=")[1];
  242. if (!finalUrl) {
  243. return new Response(url + " no match '?url='");
  244. }
  245. const res = await fetch(finalUrl);
  246. return new Response(res.body, {
  247. headers: {
  248. ...res.headers,
  249. "Access-Control-Allow-Origin": "*",
  250. "Access-Control-Expose-Headers":
  251. "Request-Context,api-supported-versions,Content-Length,Date,Server",
  252. },
  253. });
  254. }
  255. **/
  256. window.unsafeWindow.fetch = async (...args) => {
  257. console.log(args);
  258. if (args.length <= 1) {
  259. return await fetch(...args);
  260. } else {
  261. const [url, ...otherArgs] = args;
  262. return await fetch(...args).catch(
  263. async (err) =>
  264. await fetch("https://fetch-url2.deno.dev?url=" + url, ...otherArgs)
  265. );
  266. }
  267. };
  268.  
  269. const downloadFile = (data, fileName) => {
  270. const a = document.createElement("a");
  271. document.body.appendChild(a);
  272. a.style = "display: none";
  273. const blob = new Blob([data], {
  274. type: "application/octet-stream",
  275. });
  276. const url = window.URL.createObjectURL(blob);
  277. a.href = url;
  278. a.download = fileName;
  279. a.click();
  280. window.URL.revokeObjectURL(url);
  281. };
  282.  
  283. const sleep = (duration) => {
  284. return new Promise((res, rej) => {
  285. setTimeout(() => res(), duration * 1000);
  286. });
  287. };
  288.  
  289. async function reload(waiting = 60, message = "") {
  290. console.warn(`%c${message}, reload ${waiting}s later`, printStyle);
  291. await sleep(waiting);
  292. location.reload();
  293. }
  294.  
  295. function readFile(accept = "", multiple = false) {
  296. const inputEl = document.createElement("input");
  297. inputEl.setAttribute("type", "file");
  298. inputEl.setAttribute("accept", accept);
  299. inputEl.setAttribute("multiple", !!multiple);
  300. return new Promise((resolve, reject) => {
  301. inputEl.addEventListener("change", (e) => {
  302. resolve(multiple ? inputEl.files : inputEl.files[0]);
  303. window.removeEventListener("click", onWindowClick, true);
  304. });
  305. document.body.append(inputEl);
  306. inputEl.click();
  307.  
  308. const onWindowClick = () => {
  309. if (!inputEl.value) {
  310. reject(new Error("用户取消选择"));
  311. }
  312. window.removeEventListener("click", onWindowClick, true);
  313. };
  314. setTimeout(() => {
  315. window.addEventListener("click", onWindowClick, true);
  316. }, 100);
  317. });
  318. }
  319.  
  320. function AddImportBtn() {
  321. const btnWrapImport = document.createElement("div");
  322. btnWrapImport.id = "CRAWLER_ID";
  323. btnWrapImport.innerHTML = `<button style="padding: 4px 8px;position: fixed;bottom: 40%;right: 8px;border-radius: 4px;background-color: #224466;color: #fff;">Import</button>`;
  324. const importBtn = btnWrapImport.querySelector("button");
  325. importBtn.onclick = async () => {
  326. if (
  327. !window.confirm(
  328. "The data in browser will be clear up. Please make sure you have to do this !!!"
  329. )
  330. ) {
  331. return;
  332. }
  333. const file = await readFile(".json");
  334. const reader = new FileReader();
  335.  
  336. reader.onload = (event) => {
  337. const json = JSON.parse(event.target.result);
  338. // console.log({json}, 'json')
  339. // this.importFromBackUp.bind(this)(json);
  340. if (
  341. json instanceof Array &&
  342. json.every((item) => item.doi && item.validator)
  343. ) {
  344. GM.setValue("tasks", json);
  345. location.reload();
  346. } else {
  347. alert(
  348. "Please upload json file like [{doi: string, validator: string, ...}]"
  349. );
  350. }
  351. };
  352.  
  353. reader.readAsText(file);
  354. };
  355. document.body.appendChild(btnWrapImport);
  356. }
  357.  
  358. function removeImportBtn() {
  359. const importBtn = document.getElementById("CRAWLER_ID");
  360. if (importBtn) {
  361. importBtn.parentElement.removeChild(importBtn);
  362. }
  363. }
  364.  
  365. GM_registerMenuCommand("Download", async () => {
  366. const taskData = await GM.getValue("tasks");
  367. const waitingTasks = taskData.filter(
  368. (task) =>
  369. !task.downloaded &&
  370. task.validated === undefined &&
  371. validators[task.validator]
  372. );
  373. const now = new Date();
  374. downloadFile(
  375. JSON.stringify(taskData),
  376. `${now.getFullYear()}-${
  377. now.getMonth() + 1
  378. }-${now.getDate()}-${now.getHours()}${now.getMinutes()}${now.getSeconds()}-${
  379. taskData.length
  380. }-${taskData.length - waitingTasks.length}.json`
  381. );
  382. });
  383.  
// CSS applied to this script's console messages through the "%c" format directive.
const printStyle = "color: blue;background-color: #ccc;font-size: 20px";
  385.  
  386. async function start() {
  387. console.log(new Date());
  388. AddImportBtn();
  389. await sleep(7);
  390. addScript(
  391. "https://cdn.jsdelivr.net/gh/gildas-lormeau/SingleFile-MV3/lib/single-file.js"
  392. );
  393. const taskData = await GM.getValue("tasks");
  394. let tasks = taskData || [];
  395.  
  396. // find task which not downloaded and not validated before
  397. const waitingTasks = tasks.filter(
  398. (task) =>
  399. !task.downloaded &&
  400. task.validated === undefined &&
  401. validators[task.validator]
  402. );
  403. console.log(
  404. `%cTry to get tasks firstly(${waitingTasks.length} / ${tasks.length}):`,
  405. printStyle,
  406. tasks
  407. );
  408.  
  409. const invalidatedTasks = tasks.filter(
  410. (task) =>task.validated === false
  411. )
  412. const doneTasks = tasks.filter(
  413. (task) =>task.downloaded
  414. ).sort((a,b)=>a.updateTime > b.updateTime ? -1 : 1);
  415. const lastDoneTime = new Date(doneTasks[0]?.updateTime);
  416. const reportTip = `Last download time: ${lastDoneTime.toLocaleString()}`
  417. GM.xmlHttpRequest({
  418. url: "https://crawler-hit.deno.dev/api/update",
  419. method: "POST",
  420. headers: {"Content-Type" : "application/json"},
  421. data: JSON.stringify({"account": "开发机", "invalidate_count": invalidatedTasks.length, "done_count": doneTasks.length, "queue_count": waitingTasks.length, "tip": reportTip}),
  422. }).then((res) => {
  423. window.tts = res;
  424. console.log({ res });
  425. });
  426.  
  427. if (!waitingTasks.length) {
  428. await reload(90, "No tasks waiting");
  429. return;
  430. }
  431.  
  432. // ---------------------------------------------------------------------------------
  433. await sleep(10);
  434. const currentTask = waitingTasks[0];
  435. const doi = currentTask.doi.replace("https://doi.org/", "").toLowerCase();
  436. const validator = validators[currentTask.validator];
  437. if (document.getElementById('challenge-form')) {
  438. console.log(`%cCloudflare challenge! ${currentTask.doi}`, printStyle);
  439. await sleep(20);
  440. currentTask.validated = false;
  441. currentTask.cloudflareBlock = true;
  442. }
  443. if (!currentTask.cloudflareBlock && !document.body.textContent.toLowerCase().includes(doi)) {
  444. console.log(
  445. `%cURL not match, will redirect to ${currentTask.doi} 5s later`,
  446. printStyle
  447. );
  448. await sleep(5);
  449. location.href = currentTask.doi;
  450. return;
  451. }
  452. if (!currentTask.cloudflareBlock && validator(document)) {
  453. console.log(
  454. "%cValidate successfully! Downloading page...",
  455. printStyle,
  456. waitingTasks,
  457. tasks
  458. );
  459. removeImportBtn();
  460. try {
  461. const data = await singlefile.getPageData(DEFAULT_CONFIG);
  462. downloadFile(
  463. data.content,
  464. `${doi.replaceAll("/", "_")}.singlefile.html`
  465. );
  466. downloadFile(
  467. document.body.parentElement.outerHTML,
  468. `${doi.replaceAll("/", "_")}.html`
  469. );
  470. currentTask.downloaded = true;
  471. currentTask.validated = true;
  472. currentTask.updateTime = new Date().valueOf();
  473. } catch (error) {
  474. console.error(error);
  475. await reload(10, `singlefile error! ${currentTask.doi}`);
  476. return;
  477. }
  478. } else {
  479. console.log(`%cValidate failed! ${currentTask.doi}`, printStyle);
  480. currentTask.validated = false;
  481. }
  482.  
  483. await GM.setValue("tasks", tasks);
  484.  
  485. // ---------------------------------------------------------------------------------
  486. const nextTask = waitingTasks[1];
  487. if (nextTask) {
  488. console.log(
  489. `%cStart next task 10s later...`,
  490. printStyle,
  491. nextTask.doi,
  492. tasks
  493. );
  494. await sleep(10);
  495. location.href = nextTask.doi;
  496. } else {
  497. await reload(60, "No tasks waiting");
  498. }
  499. }
  500.  
  501. start();
  502. })();