Crawler base on SingleFile

Automatically download a site as a single file

当前为 2024-01-11 提交的版本,查看 最新版本

  1. // ==UserScript==
  2. // @name Crawler base on SingleFile
  3. // @author Mark
  4. // @description Download site in single file automatically
  5. // @license MIT
  6. // @version 0.0.16
  7. // @match https://*/*
  8. // @run-at document-idle
  9. // @grant GM.setValue
  10. // @grant GM.getValue
  11. // @grant GM.xmlHttpRequest
  12. // @grant GM_registerMenuCommand
  13. // @grant unsafeWindow
  14. // @require https://update.greasyfork.org/scripts/483730/1305396/gm-fetch.js
  15. // @require https://openuserjs.org/src/libs/sizzle/GM_config.js
  16. // @connect *
  17. // @noframes
  18. // @namespace https://greasyfork.org/users/1106595
  19. // ==/UserScript==
  20.  
  // ---- Timing knobs. All values are in seconds: they are handed to sleep()/reload(). ----
  const PAGE_LOADING_TIME = 7;
  const ERROR_RELOAD_TIME = 10;
  const ERROR_RELOAD_LONG_TIME = 60;
  const NEXT_TASK_WAITING_TIME = 10;

  const NO_TASK_WAITING_TIME = 90;
  const CF_CHALLENGE_WAITING_TIME = 20;
  const QUICK_SLEEP_TIME = 5;
  // Accepts "http(s)://label(.label)*" with an optional trailing dot and nothing after the host
  // (no path, query or fragment). Labels and dots are matched by disjoint character sets with a
  // mandatory separator, so a non-matching input fails in polynomial time instead of triggering
  // the exponential backtracking of a nested `(label\.?)+` group.
  const DOMAIN_REG = /^(https?):\/\/(?:[^\s\/?\.#]+\.)*[^\s\/?\.#]+\.?$/;
  // Default number of retries per task (can be overridden from the config panel).
  const TASK_MAX_RETRY_TIMES = 3;
  // Names of the per-task time points; each one is stored on the task as `timePoint_<name>`.
  const TIME_POINT_TYPES = {
    PREPARE_START: "prepareStart",
    TASK_LOADED: "taskLoaded",
    TASK_REPORTED: "taskReported",
    PRESIGN_INDEX: "presignIndex",
    PRESIGN_SINGLEFILE: "presignSinglefile",
    SINGLE_FILE_SUCCESS: "singleFileSuccess",
    INDEX_FILE_UPLOADED: "indexFileUploaded",
    SINGLE_FILE_UPLOADED: "singleFileUploaded",
    VALIDATE_FAILED: "validateFailed",
  };
  // Settings panel built with GM_config. "Name"/"Password" are read by start() and forwarded
  // to the pre-sign server as account/pass; the remaining fields tune the crawl loop.
  let gmc = new GM_config({
    id: "CrawlerConfig",
    title: "Crawler setting",
    fields: {
      Name: {
        label: "Name",
        type: "text",
      },
      Password: {
        label: "Password",
        type: "text",
      },
      taskInterval: {
        label: "Task Interval (s)",
        type: "int",
        default: NEXT_TASK_WAITING_TIME,
      },
      taskMaxRetryTimes: {
        label: "Task Max Retry Times",
        type: "int",
        default: TASK_MAX_RETRY_TIMES,
      },
      preferServer: {
        label: "Prefer preSign Server",
        type: "text",
      },
    },
    events: {
      init: function () {
        // runs after initialization completes
      },
      save: function () {
        // runs after values are saved
        // Only the account name is logged: the password must never be written to the console.
        console.log("save", this.get("Name"));
        this.close();
      },
    },
  });
  80.  
  /**
   * Helpers shared by the crawler: script injection, pre-signed URL lookup, upload,
   * local download, client id generation and a promise-based sleep.
   *
   * Members call each other through `crawlerUtil.<name>`: the object's properties are not
   * free identifiers in this scope, so a bare `sleep(...)` / `addScript(...)` would throw.
   */
  const crawlerUtil = {
    /**
     * Append a <script src=url> to the page; on a load error, try again 2 seconds later.
     * @param {string} url - script URL
     */
    addScript: (url) => {
      const s = document.createElement("script");
      s.src = url;
      s.onerror = () => {
        setTimeout(() => {
          crawlerUtil.addScript(url);
        }, 2000);
      };
      document.body.append(s);
    },

    /**
     * Download a script with GM.xmlHttpRequest and inject its text as an inline <script>
     * marked with data-crawler="true".
     * @param {string} url - script URL
     * @param {boolean} [cache=false] - reuse / update the copy kept under the "scriptCache" GM value
     * @param {number} [retry=0] - number of attempts already made (used by the recursive retry)
     * @returns {Promise<boolean>} true once injected, false after the retries are exhausted
     */
    addScriptByText: async (url, cache = false, retry = 0) => {
      const s = document.createElement("script");
      s.dataset.crawler = "true";
      const scriptCache = (await GM.getValue("scriptCache")) || {};
      if (cache && scriptCache[url]) {
        s.innerHTML = scriptCache[url];
        document.body.append(s);
        return true;
      }
      try {
        const res = await GM.xmlHttpRequest({
          url: url,
          method: "GET",
        });
        // An error page must neither be executed as a script nor stored in the cache.
        if (
          typeof res.status === "number" &&
          (res.status < 200 || res.status >= 300)
        ) {
          throw new Error(`Unexpected HTTP status ${res.status} for ${url}`);
        }

        const text = res.responseText;
        if (cache) {
          scriptCache[url] = text;
          // Best effort: the cache write is not awaited, as before.
          GM.setValue("scriptCache", scriptCache);
        }
        s.innerHTML = text;
        document.body.append(s);
        return true;
      } catch (error) {
        if (retry > 3) {
          return false;
        }
        await crawlerUtil.sleep(2);
        // Keep the caller's cache flag and advance the attempt counter.
        return await crawlerUtil.addScriptByText(url, cache, retry + 1);
      }
    },

    /**
     * Ask the pre-sign servers, in order, for a pre-signed PUT URL.
     * @param {string} doi - object prefix (the caller passes the DOI with "/" replaced by "_")
     * @param {string} fileName - object file name
     * @param {string} name - account name
     * @param {string} pass - account password
     * @param {string} [preferServer=""] - tried first when it matches DOMAIN_REG
     * @returns {Promise<string|null>} the URL, null when the answer carries no URL,
     *   or "RELOAD" when every server failed or the answer asks for a reload
     */
    getPreSignUrl: async (doi, fileName, name, pass, preferServer = "") => {
      const configServer = DOMAIN_REG.test(preferServer) ? [preferServer] : [];
      const preSignServers = [
        ...configServer,
        "https://minio-presign.hzc.pub",
        "https://minio-presign-ali.hzc.pub",
        "https://chem-brain-minio.deno.dev",
      ];
      // Encode every value: DOIs and passwords may contain "#", "&", "<" and similar characters.
      const query = [
        ["doi", doi],
        ["file_name", fileName],
        ["account", name],
        ["pass", pass],
      ]
        .map(([key, value]) => `${key}=${encodeURIComponent(value)}`)
        .join("&");

      let preSignRes = { reload: true };
      for (const server of preSignServers) {
        try {
          const response = await GM_fetch(
            `${server}/api/presignedPutObject?${query}`
          );
          preSignRes = await response.json();
          break;
        } catch (error) {
          // This server is unreachable or answered with invalid JSON: fall through to the next one.
          preSignRes = { reload: true };
        }
      }

      if (preSignRes?.reload) {
        return "RELOAD";
      }
      return preSignRes?.url || null;
    },

    /**
     * PUT `content` as an HTML file to a pre-signed URL.
     * @param {string} url - pre-signed PUT URL
     * @param {string} content - HTML text to upload
     * @returns {Promise<object>} the GM.xmlHttpRequest response
     * @throws {Error} when the server answers with a non-2xx status
     */
    uploader: async (url, content) => {
      const file = new File([content], "default.html");

      const res = await GM.xmlHttpRequest({
        method: "PUT",
        url,
        headers: {
          "Content-Type": "text/html",
          "Content-Length": file.size,
        },
        data: file,
      });
      // Without this check a rejected upload (e.g. an expired signature) would count as success.
      if (
        typeof res?.status === "number" &&
        (res.status < 200 || res.status >= 300)
      ) {
        throw new Error(`Upload failed with HTTP status ${res.status}`);
      }
      return res;
    },

    /**
     * Trigger a browser download of `data` under `fileName`.
     * @param {*} data - Blob part to save
     * @param {string} fileName - suggested file name
     */
    downloadFile: (data, fileName) => {
      const a = document.createElement("a");
      document.body.appendChild(a);
      a.style = "display: none";
      const blob = new Blob([data], {
        type: "application/octet-stream",
      });
      const url = window.URL.createObjectURL(blob);
      a.href = url;
      a.download = fileName;
      a.click();
      window.URL.revokeObjectURL(url);
      // Do not leave the helper anchor behind in the page.
      a.remove();
    },

    /** @returns {string} a random base-32 identifier (not cryptographically secure) */
    generateClientId: () => (1e6 * Math.random()).toString(32).replace(".", ""),

    /**
     * @param {number} duration - delay in seconds
     * @returns {Promise<void>} resolves after the delay
     */
    sleep: (duration) => {
      return new Promise((res) => {
        setTimeout(() => res(), duration * 1000);
      });
    },
  };
  192.  
  193. // main function
  194. (function () {
  195. "use strict";
  196. const {
  197. addScript,
  198. addScriptByText,
  199. generateClientId,
  200. uploader,
  201. downloadFile,
  202. getPreSignUrl,
  203. sleep,
  204. } = crawlerUtil;
  205.  
  /**
   * Inject, one after another, the SingleFile runtime scripts (served from the script cache
   * when available) followed by the crawler's remote config and validator scripts (always
   * fetched again).
   * @returns {Promise<Function>} a function that removes every injected
   *   `<script data-crawler="true">` element from the document
   */
  const dependenciesInit = async () => {
    const cachedScripts = [
      "https://cdn.jsdelivr.net/gh/gildas-lormeau/SingleFile-MV3/lib/single-file-bootstrap.js",
      "https://cdn.jsdelivr.net/gh/gildas-lormeau/SingleFile-MV3/lib/single-file-hooks-frames.js",
      "https://cdn.jsdelivr.net/gh/gildas-lormeau/SingleFile-MV3/lib/single-file-frames.js",
      "https://cdn.jsdelivr.net/gh/gildas-lormeau/SingleFile-MV3/lib/single-file.js",
    ];
    const uncachedScripts = [
      "https://cdn.jsdelivr.net/gh/IKKEM-Lin/crawler-base-on-singlefile/config.js",
      "https://cdn.jsdelivr.net/gh/IKKEM-Lin/crawler-base-on-singlefile/validator.js",
    ];

    // Strictly sequential: each script is injected before the next one is requested.
    for (const scriptUrl of cachedScripts) {
      await addScriptByText(scriptUrl, true);
    }
    for (const scriptUrl of uncachedScripts) {
      await addScriptByText(scriptUrl);
    }

    return () => {
      const injected = document.querySelectorAll("script[data-crawler='true']");
      for (const node of injected) {
        node.parentElement.removeChild(node);
      }
    };
  };
  236.  
  /**
   * Detach every <script> element, then every <style> element, from the given document.
   * @param {Document} document - document to clean in place
   */
  const pureHTMLCleaner = (document) => {
    for (const tagName of ["script", "style"]) {
      for (const node of document.querySelectorAll(tagName)) {
        node.parentElement.removeChild(node);
      }
    }
  };
  245.  
  // Replace the fetch exposed on unsafeWindow: try the regular fetch first and, when that
  // promise rejects (the original note names CORS as the reason), repeat the request via GM_fetch.
  window.unsafeWindow.fetch = async (...requestArgs) => {
    const regularAttempt = fetch(...requestArgs);
    return await regularAttempt.catch(async () => {
      return await GM_fetch(...requestArgs);
    });
  };
  252.  
  /**
   * Print a styled warning, wait, then reload the current page.
   * @param {number} [waiting=60] - delay before the reload, in seconds
   * @param {string} [message=""] - reason shown in the console
   */
  async function reload(waiting = 60, message = "") {
    const notice = `%c${message}, reload ${waiting}s later`;
    console.warn(notice, printStyle);
    await sleep(waiting);
    location.reload();
  }
  258.  
  /**
   * Open the browser's file picker through a temporary <input type="file">.
   * @param {string} [accept=""] - value of the input's `accept` attribute
   * @param {boolean} [multiple=false] - allow selecting several files
   * @returns {Promise<File|FileList>} the first selected file, or the whole FileList when
   *   `multiple` is true; rejects when the window is clicked while nothing has been chosen
   *   (used as the "picker was cancelled" signal)
   */
  function readFile(accept = "", multiple = false) {
    const inputEl = document.createElement("input");
    inputEl.setAttribute("type", "file");
    inputEl.setAttribute("accept", accept);
    // `multiple` is a boolean attribute: its presence alone enables multi-select, so
    // setting it to the string "false" would still turn the feature on.
    if (multiple) {
      inputEl.setAttribute("multiple", "");
    }
    return new Promise((resolve, reject) => {
      let listenerTimer = null;

      const onWindowClick = () => {
        const cancelled = !inputEl.value;
        if (cancelled) {
          reject(new Error("用户取消选择"));
        }
        window.removeEventListener("click", onWindowClick, true);
        if (cancelled) {
          inputEl.remove();
        }
      };

      inputEl.addEventListener("change", (e) => {
        resolve(multiple ? inputEl.files : inputEl.files[0]);
        // A selection was made: drop the pending cancel detection and the temporary input.
        clearTimeout(listenerTimer);
        window.removeEventListener("click", onWindowClick, true);
        inputEl.remove();
      });
      document.body.append(inputEl);
      inputEl.click();

      // Registered with a delay so that the click that opened the picker is not itself
      // taken for a cancellation.
      listenerTimer = setTimeout(() => {
        window.addEventListener("click", onWindowClick, true);
      }, 100);
    });
  }
  283.  
  /**
   * Add a fixed "Import" button that replaces the stored task list with the content of a
   * JSON file chosen by the user, then reloads the page.
   * @returns {Function} removes the button (looked up by its id) from the page
   */
  function AddImportBtn() {
    const formatHint =
      "Please upload json file like [{doi: string, validator: string, ...}]";
    const btnWrapImport = document.createElement("div");
    btnWrapImport.id = "CRAWLER_ID";
    btnWrapImport.innerHTML = `<button style="padding: 4px 8px;position: fixed;bottom: 40%;right: 8px;border-radius: 4px;background-color: #224466;color: #fff;">Import</button>`;
    const importBtn = btnWrapImport.querySelector("button");
    importBtn.onclick = async () => {
      if (
        !window.confirm(
          "The data in browser will be clear up. Please make sure you have to do this !!!"
        )
      ) {
        return;
      }

      let file;
      try {
        file = await readFile(".json");
      } catch (error) {
        // The picker was dismissed without a selection: nothing to import.
        console.warn(error);
        return;
      }

      let json;
      try {
        json = JSON.parse(await file.text());
      } catch (error) {
        // Unreadable file or malformed JSON: tell the user instead of failing silently.
        console.error(error);
        alert(formatHint);
        return;
      }

      const isTaskList =
        Array.isArray(json) &&
        json.every((item) => item?.doi && item?.validator);
      if (!isTaskList) {
        alert(formatHint);
        return;
      }

      // The write must finish before the page goes away, otherwise the import can be lost.
      await GM.setValue("tasks", json);
      location.reload();
    };
    document.body.appendChild(btnWrapImport);
    return () => {
      const importBtn = document.getElementById("CRAWLER_ID");
      if (importBtn) {
        importBtn.parentElement.removeChild(importBtn);
      }
    };
  }
  327.  
  // Menu entry "Download": export the stored task list as a JSON file. The file name carries
  // the current date/time, the total number of tasks and the number of tasks that are no
  // longer waiting.
  GM_registerMenuCommand("Download", async () => {
    // Nothing stored yet (fresh install): export an empty list instead of throwing.
    const taskData = (await GM.getValue("tasks")) || [];
    const waitingTasks = taskData.filter(
      (task) =>
        !task.downloaded &&
        task.validated === undefined &&
        validators[task.validator]
    );
    const now = new Date();
    downloadFile(
      JSON.stringify(taskData),
      `${now.getFullYear()}-${
        now.getMonth() + 1
      }-${now.getDate()}-${now.getHours()}${now.getMinutes()}${now.getSeconds()}-${
        taskData.length
      }-${taskData.length - waitingTasks.length}.json`
    );
  });

  // Menu entry "Config": open the settings panel.
  GM_registerMenuCommand("Config", async () => {
    gmc.open();
  });

  // CSS passed to the `%c` console messages printed by the crawler.
  const printStyle = "color: blue;background-color: #ccc;font-size: 20px";
  352.  
  /**
   * Move on to the next task: wait for the configured interval, stamp the task's
   * PREPARE_START time point and navigate to it. Without a next task, reload the page after
   * NO_TASK_WAITING_TIME seconds.
   * @param {string} [nextDoi] - `doi` (URL) of the next task, if any
   */
  const prepareNextTask = async (nextDoi) => {
    const taskInterval = gmc.get("taskInterval") || NEXT_TASK_WAITING_TIME;
    if (!nextDoi) {
      await reload(NO_TASK_WAITING_TIME, "No tasks waiting");
      return;
    }

    console.log(
      `%cStart next task ${taskInterval}s later...`,
      printStyle,
      nextDoi
    );
    await sleep(taskInterval);
    // Re-read the list: it may have changed (e.g. been re-imported) while waiting.
    const taskData = (await GM.getValue("tasks")) || [];
    const task = taskData.find((item) => item.doi === nextDoi);
    if (task) {
      await saveTaskTimepoint(TIME_POINT_TYPES.PREPARE_START, task, taskData);
    }
    // Navigate even when the task is no longer stored, so the crawl loop never stalls here.
    location.href = nextDoi;
  };
  370.  
  // Timestamp (ms) of the previous recorded time point on this page; 0 until one is known.
  let latestTimepoint = 0;

  /**
   * Record a time point on `task` and persist the whole task list.
   * PREPARE_START stores the absolute timestamp; every other point stores the milliseconds
   * elapsed since the previous point (seeded from the task's PREPARE_START), or 0 when no
   * reference timestamp is available.
   * @param {string} pointName - one of TIME_POINT_TYPES
   * @param {object} task - task to stamp (mutated)
   * @param {Array<object>} taskData - full task list written to the "tasks" GM value
   */
  const saveTaskTimepoint = async (pointName, task, taskData) => {
    const field = `timePoint_${pointName}`;
    if (pointName === TIME_POINT_TYPES.PREPARE_START) {
      task[field] = new Date().valueOf();
    } else {
      if (latestTimepoint == 0) {
        const prepareField = `timePoint_${TIME_POINT_TYPES.PREPARE_START}`;
        latestTimepoint = task[prepareField] || 0;
      }
      task[field] =
        latestTimepoint == 0 ? 0 : new Date().valueOf() - latestTimepoint;
      latestTimepoint = new Date().valueOf();
    }
    await GM.setValue("tasks", taskData);
  };
  389.  
  /**
   * Decide whether `task` may be attempted again.
   * While the retry budget is not used up, the task's counter is incremented and saved.
   * Once it is used up, the task is marked as failed, saved, and the crawler moves on to
   * the next task.
   * @param {object} task - current task (mutated)
   * @param {Array<object>} taskData - full task list written to the "tasks" GM value
   * @param {string} [nextDoi] - `doi` of the task to continue with when giving up
   * @returns {Promise<boolean>} true when the caller should retry, false when the task was abandoned
   */
  const checkRetry = async (task, taskData, nextDoi) => {
    const taskMaxRetryTimes =
      gmc.get("taskMaxRetryTimes") || TASK_MAX_RETRY_TIMES;
    const retryTimes = task.retryTimes || 0;

    if (retryTimes < taskMaxRetryTimes) {
      task.retryTimes = retryTimes + 1;
      await GM.setValue("tasks", taskData);
      return true;
    }

    console.log(
      `%cTask have been retry ${taskMaxRetryTimes} times! ${task.doi}`,
      printStyle
    );
    task.validated = false;
    task.updateTime = new Date().valueOf();
    // Persist the failure BEFORE moving on: prepareNextTask re-reads the stored list and
    // navigates away, so saving afterwards could be lost or could overwrite its changes.
    await GM.setValue("tasks", taskData);
    await prepareNextTask(nextDoi);
    return false;
  };
  406.  
  /**
   * Run one crawl step on the current page:
   *  1. make sure a client id exists and load the script dependencies;
   *  2. pick the first waiting task and report progress to the statistics endpoint;
   *  3. skip the task when a Cloudflare challenge is shown;
   *  4. request pre-signed upload URLs for the plain page and the SingleFile snapshot;
   *  5. check that the page belongs to the task, validate it, capture and upload it;
   *  6. move on to the next task.
   * Failures lead either to a page reload or to a retry governed by checkRetry().
   */
  async function start() {
    console.log(new Date());

    const importBtnHandler = AddImportBtn();

    let clientId = await GM.getValue("clientId");
    if (typeof clientId !== "string" || !clientId) {
      clientId = generateClientId();
      await GM.setValue("clientId", clientId);
    }

    // ---------------------------- Script dependencies handler -----------------------------------------------------
    const dependenciesHandler = await dependenciesInit();

    // `typeof` guards: these globals only exist when the injected scripts actually loaded,
    // and a bare reference to a missing global would throw instead of triggering the reload.
    // No task has been selected at this point, so the message reports the page URL.
    if (
      typeof singlefile === "undefined" ||
      !singlefile ||
      !singlefile.getPageData
    ) {
      await reload(ERROR_RELOAD_TIME, `singlefile error! ${location.href}`);
      return;
    }

    if (
      typeof validators === "undefined" ||
      typeof DEFAULT_CONFIG === "undefined" ||
      !(validators && DEFAULT_CONFIG)
    ) {
      await reload(
        ERROR_RELOAD_TIME,
        "Can not get validators or DEFAULT_CONFIG"
      );
      return;
    }

    // ---------------------------- Get Task -----------------------------------------------------
    const taskData = await GM.getValue("tasks");
    const tasks = taskData || [];

    // find task which not downloaded and not validated before
    const waitingTasks = tasks.filter(
      (task) =>
        !task.downloaded &&
        task.validated === undefined &&
        validators[task.validator]
    );
    console.log(
      `%cTry to get tasks firstly(${waitingTasks.length} / ${tasks.length}):`,
      printStyle,
      tasks
    );

    if (!waitingTasks.length) {
      await reload(NO_TASK_WAITING_TIME, "No tasks waiting");
      return;
    }

    const invalidatedTasks = tasks.filter((task) => task.validated === false);
    const doneTasks = tasks
      .filter((task) => task.downloaded)
      .sort((a, b) => (a.updateTime > b.updateTime ? -1 : 1));
    const previousDay = new Date().valueOf() - 24 * 3600 * 1000;
    const last24hDoneTasks = doneTasks.filter(
      (task) => task.updateTime > previousDay
    );

    const lastDoneTime = new Date(doneTasks[0]?.updateTime);
    const currentTask = waitingTasks[0];
    const nextTask = waitingTasks[1] || {};
    await saveTaskTimepoint(TIME_POINT_TYPES.TASK_LOADED, currentTask, tasks);

    const updateCurrentTask = async (isSuccess) => {
      currentTask.validated = isSuccess;
      currentTask.updateTime = new Date().valueOf();
      await GM.setValue("tasks", tasks);
    };

    // ---------------------------- Report progress -----------------------------------------------------

    const reportTip = `Last download time: ${lastDoneTime.toLocaleString()}\nSpeed: ${last24hDoneTasks.length} / last 24h`;
    const reportProgress = async () => {
      try {
        const res = await GM.xmlHttpRequest({
          url: "https://crawler-hit.deno.dev/api/update",
          method: "POST",
          headers: { "Content-Type": "application/json" },
          data: JSON.stringify({
            account: clientId,
            invalidate_count: invalidatedTasks.length,
            done_count: doneTasks.length,
            queue_count: waitingTasks.length,
            tip: reportTip,
          }),
        });
        console.log("Report successfully", { res });
      } catch (error) {
        // Reporting is best effort: a failure must not interrupt the crawl.
        console.warn("Report failed", error);
      } finally {
        await saveTaskTimepoint(
          TIME_POINT_TYPES.TASK_REPORTED,
          currentTask,
          tasks
        );
      }
    };
    // Deliberately not awaited: the report runs in the background while the page settles.
    reportProgress().catch((error) => console.error(error));

    // -------------------------- Detect Cloudflare challenge -------------------------------------------------------
    await sleep(PAGE_LOADING_TIME);
    if (document.getElementById("challenge-form")) {
      console.log(`%cCloudflare challenge! ${currentTask.doi}`, printStyle);
      await sleep(CF_CHALLENGE_WAITING_TIME);
      currentTask.cloudflareBlock = true;
      await updateCurrentTask(false);
      await prepareNextTask(nextTask.doi);
      return;
    }
    // bypass els institution check
    const elsCloseBtn = document.querySelector(".sec-A #bdd-els-close");
    if (elsCloseBtn) {
      elsCloseBtn.click();
    }

    // ---------------------------- validated task ------------------------------------------------

    const doi = currentTask.doi.replace("https://doi.org/", "").toLowerCase();
    const doiFixed = doi.replaceAll("/", "_");
    const validator = validators[currentTask.validator];

    let name = "";
    let pass = "";
    let preferServer = "";
    try {
      name = gmc.get("Name");
      pass = gmc.get("Password");
      preferServer = gmc.get("preferServer");
      if (!name || !pass) {
        throw new Error();
      }
    } catch (err) {
      // Without credentials nothing can be uploaded: stop here until the user fills the panel.
      console.error(
        `%cMiss name or password. Please input in config panel.`,
        printStyle
      );
      return;
    }

    const indexUrl = await getPreSignUrl(
      doiFixed,
      `_.html`,
      name,
      pass,
      preferServer
    );
    await saveTaskTimepoint(TIME_POINT_TYPES.PRESIGN_INDEX, currentTask, tasks);
    const singlefileUrl = await getPreSignUrl(
      doiFixed,
      `_.sf.html`,
      name,
      pass,
      preferServer
    );
    await saveTaskTimepoint(
      TIME_POINT_TYPES.PRESIGN_SINGLEFILE,
      currentTask,
      tasks
    );
    if (indexUrl === "RELOAD" || singlefileUrl === "RELOAD") {
      await reload(
        ERROR_RELOAD_LONG_TIME,
        "Minio PreSignUrl error, please check url or account"
      );
      return;
    }
    if (!indexUrl && !singlefileUrl) {
      console.error("%cFile existed!!!", printStyle, currentTask.doi);
      await updateCurrentTask(false);
      await prepareNextTask(nextTask.doi);
      return;
    }

    // --------------------------- Page validate ------------------------------------------------------
    if (!document.body.textContent.toLowerCase().includes(doi)) {
      console.log(
        `%cURL not match, will redirect to ${currentTask.doi} 5s later`,
        printStyle
      );
      await sleep(QUICK_SLEEP_TIME);
      if (await checkRetry(currentTask, tasks, nextTask.doi)) {
        location.href = currentTask.doi;
      }
      return;
    }
    if (validator(document)) {
      console.log(
        "%cValidate successfully! Downloading page...",
        printStyle,
        waitingTasks,
        tasks
      );
      // Remove the Import button so it does not end up in the captured page.
      importBtnHandler();
      // repair special page
      if (
        typeof documentFixer !== "undefined" &&
        documentFixer &&
        typeof documentFixer[currentTask.validator] === "function"
      ) {
        documentFixer[currentTask.validator](document);
      }
      try {
        const data = await singlefile.getPageData(DEFAULT_CONFIG);
        await saveTaskTimepoint(
          TIME_POINT_TYPES.SINGLE_FILE_SUCCESS,
          currentTask,
          tasks
        );
        if (singlefileUrl) {
          await uploader(singlefileUrl, data.content);
          await saveTaskTimepoint(
            TIME_POINT_TYPES.SINGLE_FILE_UPLOADED,
            currentTask,
            tasks
          );
        }
        if (indexUrl) {
          // The plain copy is uploaded without the injected scripts and without any
          // script/style element.
          dependenciesHandler();
          pureHTMLCleaner(document);
          await uploader(indexUrl, document.body.parentElement.outerHTML);
          await saveTaskTimepoint(
            TIME_POINT_TYPES.INDEX_FILE_UPLOADED,
            currentTask,
            tasks
          );
        }
        console.log("%cUpload successfully!", printStyle);
        currentTask.downloaded = true;
        await updateCurrentTask(true);
      } catch (error) {
        console.error(error);
        if (await checkRetry(currentTask, tasks, nextTask.doi)) {
          await reload(
            ERROR_RELOAD_TIME,
            `singlefile or upload error! ${currentTask.doi}`
          );
        }
        return;
      }
    } else {
      console.log(`%cValidate failed! ${currentTask.doi}`, printStyle);
      await saveTaskTimepoint(
        TIME_POINT_TYPES.VALIDATE_FAILED,
        currentTask,
        tasks
      );
      await updateCurrentTask(false);
    }

    // --------------------------- Prepare next task ------------------------------------------------------
    await prepareNextTask(nextTask.doi);
  }
  645.  
  646. start();
  647. })();