Crawler base on SingleFile

Automatically download a site as a single file

目前为 2024-02-27 提交的版本,查看 最新版本

  1. // ==UserScript==
  2. // @name Crawler base on SingleFile
  3. // @author Mark
  4. // @description Download site in single file automatically
  5. // @license MIT
  6. // @version 0.0.18
  7. // @match https://*/*
  8. // @run-at document-idle
  9. // @grant GM.setValue
  10. // @grant GM.getValue
  11. // @grant GM.xmlHttpRequest
  12. // @grant GM_registerMenuCommand
  13. // @grant unsafeWindow
  14. // @require https://update.greasyfork.org/scripts/483730/1305396/gm-fetch.js
  15. // @require https://openuserjs.org/src/libs/sizzle/GM_config.js
  16. // @connect *
  17. // @noframes
  18. // @namespace https://greasyfork.org/users/1106595
  19. // ==/UserScript==
  20.  
  // Default endpoint that receives the progress report POSTed by start().
  const REPORT_ADDRESS = "https://crawler-hit.deno.dev/api/update";

  // Every *_TIME value below is in seconds: they are handed to sleep()/reload(),
  // and sleep() multiplies its argument by 1000 before calling setTimeout.
  const PAGE_LOADING_TIME = 7;
  const ERROR_RELOAD_TIME = 10;
  const ERROR_RELOAD_LONG_TIME = 60;
  const NEXT_TASK_WAITING_TIME = 10;

  const NO_TASK_WAITING_TIME = 90;
  const CF_CHALLENGE_WAITING_TIME = 20;
  const QUICK_SLEEP_TIME = 5;

  // Accepts "http(s)://" followed by dot-separated labels (optional trailing dot)
  // and nothing else: no path, query or fragment.
  // Written as label(.label)* instead of the former (label+\.?)+, which accepted
  // the same strings but backtracked exponentially on inputs such as a long host
  // name followed by "/".
  const DOMAIN_REG = /^(https?):\/\/[^\s\/?.#]+(?:\.[^\s\/?.#]+)*\.?$/;

  // Fallback for the "Task Max Retry Times" setting.
  const TASK_MAX_RETRY_TIMES = 3;

  // Names of the checkpoints recorded on a task as `timePoint_<name>`.
  const TIME_POINT_TYPES = Object.freeze({
    PREPARE_START: "prepareStart",
    TASK_LOADED: "taskLoaded",
    TASK_REPORTED: "taskReported",
    PRESIGN_INDEX: "presignIndex",
    PRESIGN_SINGLEFILE: "presignSinglefile",
    SINGLE_FILE_SUCCESS: "singleFileSuccess",
    INDEX_FILE_UPLOADED: "indexFileUploaded",
    SINGLE_FILE_UPLOADED: "singleFileUploaded",
    VALIDATE_FAILED: "validateFailed",
  });
  // Settings dialog built with GM_config. The rest of the script reads the
  // values through gmc.get(<field name>) and opens the dialog with gmc.open().
  const gmc = new GM_config({
    id: "CrawlerConfig",
    title: "Crawler setting",
    fields: {
      // Account name sent to the presign server.
      Name: {
        label: "Name",
        type: "text",
      },
      // Account password sent to the presign server.
      Password: {
        label: "Password",
        type: "text",
      },
      // Seconds to wait before navigating to the next task.
      taskInterval: {
        label: "Task Interval (s)",
        type: "int",
        default: NEXT_TASK_WAITING_TIME,
      },
      // How many times a task is retried before it is marked as failed.
      taskMaxRetryTimes: {
        label: "Task Max Retry Times",
        type: "int",
        default: TASK_MAX_RETRY_TIMES,
      },
      // Optional presign server that is tried before the built-in ones.
      preferServer: {
        label: "Prefer preSign Server",
        type: "text",
      },
      // Endpoint that receives the progress report.
      reportServer: {
        label: "Report Server",
        type: "text",
        default: REPORT_ADDRESS,
      },
    },
    events: {
      init() {
        // runs after initialization completes
      },
      save() {
        // runs after values are saved
        // Only the account name is logged: the password must never reach the
        // console, where any page script or shared screenshot could expose it.
        console.log("save", this.get("Name"));
        this.close();
      },
    },
  });
  86.  
  // Helpers shared by the crawler. Members call each other through
  // `crawlerUtil.<name>`: inside this object literal there are no bare
  // `addScript`, `addScriptByText` or `sleep` bindings in scope.
  const crawlerUtil = {
    /**
     * Inject an external script with a <script src> tag.
     * If the tag fails to load, a fresh tag is tried again 2 seconds later
     * (there is no retry limit).
     * @param {string} url - Script URL.
     */
    addScript: (url) => {
      const s = document.createElement("script");
      s.src = url;
      s.onerror = () => {
        setTimeout(() => {
          crawlerUtil.addScript(url);
        }, 2000);
      };
      document.body.append(s);
    },

    /**
     * Download a script with GM.xmlHttpRequest and inject its text inline,
     * tagged with data-crawler="true".
     * @param {string} url - Script URL.
     * @param {boolean} [cache=false] - Reuse and update the copy stored under
     *   the "scriptCache" GM value.
     * @param {number} [retry=0] - Number of failed attempts so far (internal).
     * @returns {Promise<boolean>} true once injected, false after the retries
     *   are used up.
     */
    addScriptByText: async (url, cache = false, retry = 0) => {
      const s = document.createElement("script");
      s.dataset.crawler = "true";
      const scriptCache = (await GM.getValue("scriptCache")) || {};
      if (cache && scriptCache[url]) {
        s.textContent = scriptCache[url];
        document.body.append(s);
        return true;
      }
      try {
        const res = await GM.xmlHttpRequest({
          url,
          method: "GET",
        });
        // An HTTP error page must not be injected (or cached) as script text.
        if (res.status < 200 || res.status >= 300) {
          throw new Error(`Unexpected HTTP status ${res.status} for ${url}`);
        }

        const text = res.responseText;
        if (cache) {
          scriptCache[url] = text;
          try {
            await GM.setValue("scriptCache", scriptCache);
          } catch (cacheError) {
            // A failed cache write only costs a re-download next time.
            console.warn("Can not update scriptCache", cacheError);
          }
        }
        s.textContent = text;
        document.body.append(s);
        return true;
      } catch (error) {
        if (retry > 3) {
          return false;
        }
        await crawlerUtil.sleep(2);
        // Forward `cache` unchanged; only the attempt counter advances.
        return crawlerUtil.addScriptByText(url, cache, retry + 1);
      }
    },

    /**
     * Ask the presign servers, in order, for an upload URL.
     * @param {string} doi - Object prefix (the caller passes the DOI with "/" replaced).
     * @param {string} fileName - Object file name.
     * @param {string} name - Account name.
     * @param {string} pass - Account password.
     * @param {string} [preferServer=""] - Tried first when it matches DOMAIN_REG.
     * @returns {Promise<string|null>} the `url` field of the first response that
     *   could be read, null when that response carries no url, or "RELOAD"
     *   when every server failed.
     */
    getPreSignUrl: async (doi, fileName, name, pass, preferServer = "") => {
      const configServer = DOMAIN_REG.test(preferServer) ? [preferServer] : [];
      const preSignServers = [
        ...configServer,
        "https://minio-presign.hzc.pub",
        "https://minio-presign-ali.hzc.pub",
        "https://chem-brain-minio.deno.dev",
      ];
      // Encode every value so characters such as "&", "#" or spaces in a DOI
      // or password cannot corrupt the query string.
      const query = [
        ["doi", doi],
        ["file_name", fileName],
        ["account", name],
        ["pass", pass],
      ]
        .map(([key, value]) => `${key}=${encodeURIComponent(value)}`)
        .join("&");

      for (const server of preSignServers) {
        try {
          const response = await GM_fetch(
            `${server}/api/presignedPutObject?${query}`
          );
          const preSignRes = await response.json();
          if (preSignRes?.reload) {
            return "RELOAD";
          }
          return preSignRes?.url || null;
        } catch (error) {
          // Network or JSON failure: fall through to the next server.
          console.warn(`preSign server failed: ${server}`, error);
        }
      }
      return "RELOAD";
    },

    /**
     * Gzip `content` with pako and PUT it to `url`.
     * @param {string} url - Presigned upload URL.
     * @param {string} content - Text to upload.
     * @returns {Promise<object>} the GM.xmlHttpRequest response.
     * @throws {Error} when the response carries a non-2xx status, so callers
     *   do not record a rejected upload as a success.
     */
    uploader: async (url, content) => {
      const mime = "application/gzip";
      const gzip_data = pako.gzip(content, { level: 9 });
      const upload_blob = new Blob([gzip_data], { type: mime });

      const res = await GM.xmlHttpRequest({
        method: "PUT",
        url,
        headers: {
          "Content-Type": mime,
          "Content-Length": upload_blob.size,
        },
        data: upload_blob,
      });
      if (res && (res.status < 200 || res.status >= 300)) {
        throw new Error(`Upload failed with HTTP status ${res.status}`);
      }
      return res;
    },

    /**
     * Offer `data` to the user as a file download.
     * @param {*} data - Blob part (the script passes a JSON string).
     * @param {string} fileName - Suggested file name.
     */
    downloadFile: (data, fileName) => {
      const a = document.createElement("a");
      document.body.appendChild(a);
      a.style.display = "none";
      const blob = new Blob([data], {
        type: "application/octet-stream",
      });
      const url = window.URL.createObjectURL(blob);
      a.href = url;
      a.download = fileName;
      a.click();
      window.URL.revokeObjectURL(url);
      // Do not leave one hidden anchor behind per download.
      a.remove();
    },

    /** @returns {string} a random base-32 string with the "." removed. */
    generateClientId: () => (1e6 * Math.random()).toString(32).replace(".", ""),

    /**
     * @param {number} duration - Seconds to wait.
     * @returns {Promise<void>} resolved after `duration` seconds.
     */
    sleep: (duration) =>
      new Promise((resolve) => {
        setTimeout(() => resolve(), duration * 1000);
      }),
  };
  200.  
  201. // main function
  202. (function () {
  203. "use strict";
  204. const {
  205. addScript,
  206. addScriptByText,
  207. generateClientId,
  208. uploader,
  209. downloadFile,
  210. getPreSignUrl,
  211. sleep,
  212. } = crawlerUtil;
  213.  
  /**
   * Inject every script the crawler depends on, one after another in the
   * listed order. The four SingleFile scripts are requested with the cache
   * flag; config.js, validator.js and pako are requested without it.
   * @returns {Promise<() => void>} a cleanup function that removes every
   *   <script data-crawler="true"> element from the document.
   */
  const dependenciesInit = async () => {
    // Each entry is the exact argument list handed to addScriptByText.
    const scripts = [
      [
        "https://cdn.jsdelivr.net/gh/gildas-lormeau/SingleFile-MV3/lib/single-file-bootstrap.js",
        true,
      ],
      [
        "https://cdn.jsdelivr.net/gh/gildas-lormeau/SingleFile-MV3/lib/single-file-hooks-frames.js",
        true,
      ],
      [
        "https://cdn.jsdelivr.net/gh/gildas-lormeau/SingleFile-MV3/lib/single-file-frames.js",
        true,
      ],
      [
        "https://cdn.jsdelivr.net/gh/gildas-lormeau/SingleFile-MV3/lib/single-file.js",
        true,
      ],
      ["https://cdn.jsdelivr.net/gh/IKKEM-Lin/crawler-base-on-singlefile/config.js"],
      ["https://cdn.jsdelivr.net/gh/IKKEM-Lin/crawler-base-on-singlefile/validator.js"],
      ["https://cdn.jsdelivr.net/npm/pako@2.1.0/dist/pako.min.js"],
    ];
    for (const args of scripts) {
      await addScriptByText(...args);
    }
    return () => {
      document.querySelectorAll("script[data-crawler='true']").forEach((el) => {
        el.remove();
      });
    };
  };
  247.  
  /**
   * Remove every <script> and <style> element from the given document.
   * @param {Document} document - Document to strip (mutated in place).
   */
  const pureHTMLCleaner = (document) => {
    // el.remove() is a no-op for a parentless node, where the former
    // el.parentElement.removeChild(el) would throw on a null parentElement.
    document.querySelectorAll("script, style").forEach((el) => {
      el.remove();
    });
  };
  256.  
  // Overwrite the page's fetch to bypass CORS: a request is first sent with
  // the regular fetch and, only if that rejects, repeated through GM_fetch.
  // The regular fetch is captured before the overwrite so the wrapper can
  // never end up calling itself if the bare `fetch` identifier resolves to the
  // page property that is replaced below.
  const sandboxFetch = fetch;
  window.unsafeWindow.fetch = async (...args) => {
    try {
      return await sandboxFetch(...args);
    } catch (err) {
      return await GM_fetch(...args);
    }
  };
  263.  
  /**
   * Print a warning, wait, then reload the current page.
   * @param {number} [waiting=60] - Seconds to wait before reloading.
   * @param {string} [message=""] - Reason shown in the console warning.
   * @returns {Promise<void>}
   */
  async function reload(waiting = 60, message = "") {
    console.warn(`%c${message}, reload ${waiting}s later`, printStyle);
    await sleep(waiting);
    location.reload();
  }
  269.  
  /**
   * Open the browser file picker and wait for the user's choice.
   * Cancellation is detected heuristically: the first click anywhere in the
   * window after the picker was opened, while the input is still empty,
   * rejects the promise.
   * @param {string} [accept=""] - Value for the input's `accept` attribute.
   * @param {boolean} [multiple=false] - Allow selecting several files.
   * @returns {Promise<File|FileList>} the FileList when `multiple` is truthy,
   *   otherwise the first selected file.
   */
  function readFile(accept = "", multiple = false) {
    const inputEl = document.createElement("input");
    inputEl.setAttribute("type", "file");
    inputEl.setAttribute("accept", accept);
    // `multiple` is a boolean attribute: its mere presence enables it, so
    // setAttribute("multiple", false) would still allow multi-selection.
    inputEl.multiple = Boolean(multiple);
    return new Promise((resolve, reject) => {
      let settled = false;

      const onWindowClick = () => {
        window.removeEventListener("click", onWindowClick, true);
        if (!inputEl.value) {
          settled = true;
          inputEl.remove();
          reject(new Error("用户取消选择"));
        }
      };

      inputEl.addEventListener("change", () => {
        settled = true;
        window.removeEventListener("click", onWindowClick, true);
        const picked = multiple ? inputEl.files : inputEl.files[0];
        // The temporary input has done its job; do not leave it in the page.
        inputEl.remove();
        resolve(picked);
      });
      document.body.append(inputEl);
      inputEl.click();

      // Registered with a delay so the click that opened the picker is not
      // taken for a cancellation; skipped when a file was already chosen.
      setTimeout(() => {
        if (!settled) {
          window.addEventListener("click", onWindowClick, true);
        }
      }, 100);
    });
  }
  294.  
  /**
   * Add a floating "Import" button to the page. After a confirmation it lets
   * the user pick a JSON file, replaces the stored "tasks" value with its
   * content and reloads the page.
   * @returns {() => void} a function that removes the button again.
   */
  function AddImportBtn() {
    const btnWrapImport = document.createElement("div");
    btnWrapImport.id = "CRAWLER_ID";
    btnWrapImport.innerHTML = `<button style="padding: 4px 8px;position: fixed;bottom: 40%;right: 8px;border-radius: 4px;background-color: #224466;color: #fff;">Import</button>`;
    const importBtn = btnWrapImport.querySelector("button");
    importBtn.onclick = async () => {
      if (
        !window.confirm(
          "The data in browser will be clear up. Please make sure you have to do this !!!"
        )
      ) {
        return;
      }

      let file;
      try {
        file = await readFile(".json");
      } catch (error) {
        // readFile rejects when the picker is cancelled: nothing to import.
        console.warn(error);
        return;
      }

      let json = null;
      try {
        json = JSON.parse(await file.text());
      } catch (error) {
        // Unreadable or malformed file: handled by the shape check below.
        console.warn(error);
      }

      const isTaskList =
        Array.isArray(json) &&
        json.every((item) => item && item.doi && item.validator);
      if (!isTaskList) {
        alert(
          "Please upload json file like [{doi: string, validator: string, ...}]"
        );
        return;
      }

      // Wait for the write to finish; reloading first could drop the import.
      await GM.setValue("tasks", json);
      location.reload();
    };
    document.body.appendChild(btnWrapImport);
    return () => {
      document.getElementById("CRAWLER_ID")?.remove();
    };
  }
  338.  
  // Menu entry "Download": export the stored task list as a JSON file named
  // <year>-<month>-<day>-<h><m><s>-<total>-<total minus waiting>.json.
  GM_registerMenuCommand("Download", async () => {
    // Nothing may be stored yet, and the validators script may not be loaded;
    // neither case should make the export throw.
    const taskData = (await GM.getValue("tasks")) || [];
    const knownValidators = typeof validators === "undefined" ? {} : validators;
    const waitingTasks = taskData.filter(
      (task) =>
        !task.downloaded &&
        task.validated === undefined &&
        knownValidators[task.validator]
    );
    const now = new Date();
    downloadFile(
      JSON.stringify(taskData),
      `${now.getFullYear()}-${
        now.getMonth() + 1
      }-${now.getDate()}-${now.getHours()}${now.getMinutes()}${now.getSeconds()}-${
        taskData.length
      }-${taskData.length - waitingTasks.length}.json`
    );
  });

  // Menu entry "Config": open the settings dialog.
  GM_registerMenuCommand("Config", async () => {
    gmc.open();
  });

  // Console style shared by the crawler's "%c" log messages.
  const printStyle = "color: blue;background-color: #ccc;font-size: 20px";
  363.  
  /**
   * Move on to the next task, or reload later when there is none.
   * With a DOI: waits for the configured task interval, records the
   * PREPARE_START time point on the stored copy of that task and navigates
   * to the DOI.
   * @param {string} [nextDoi] - URL of the next task.
   * @returns {Promise<void>}
   */
  const prepareNextTask = async (nextDoi) => {
    const taskInterval = gmc.get("taskInterval") || NEXT_TASK_WAITING_TIME;
    if (!nextDoi) {
      await reload(NO_TASK_WAITING_TIME, "No tasks waiting");
      return;
    }

    console.log(
      `%cStart next task ${taskInterval}s later...`,
      printStyle,
      nextDoi
    );
    await sleep(taskInterval);
    // The list is read again because it may have changed during the wait.
    const taskData = (await GM.getValue("tasks")) || [];
    const task = taskData.find((item) => item.doi === nextDoi);
    // A task that vanished from storage only loses its time point; the
    // navigation below must still happen.
    if (task) {
      await saveTaskTimepoint(TIME_POINT_TYPES.PREPARE_START, task, taskData);
    }
    location.href = nextDoi;
  };
  381.  
  // Timestamp (ms) of the most recent checkpoint recorded on this page,
  // 0 while none is known yet.
  let latestTimepoint = 0;

  /**
   * Record a checkpoint on `task` as `timePoint_<pointName>` and persist the
   * whole task list.
   * PREPARE_START stores the absolute timestamp in ms. Every other point
   * stores the milliseconds elapsed since the previous checkpoint, starting
   * from the task's PREPARE_START value, or 0 when no reference exists.
   * @param {string} pointName - One of TIME_POINT_TYPES.
   * @param {object} task - Task to annotate (mutated).
   * @param {object[]} taskData - Full list written to the "tasks" GM value.
   * @returns {Promise<void>}
   */
  const saveTaskTimepoint = async (pointName, task, taskData) => {
    const now = Date.now();
    const key = `timePoint_${pointName}`;
    if (pointName === TIME_POINT_TYPES.PREPARE_START) {
      task[key] = now;
    } else {
      if (latestTimepoint === 0) {
        latestTimepoint =
          task[`timePoint_${TIME_POINT_TYPES.PREPARE_START}`] || 0;
      }
      task[key] = latestTimepoint === 0 ? 0 : now - latestTimepoint;
      latestTimepoint = now;
    }
    await GM.setValue("tasks", taskData);
  };
  400.  
  /**
   * Decide whether `task` may be retried.
   * Below the retry limit: increments `task.retryTimes`, stores the list and
   * returns true. At the limit: marks the task as failed, stores the list,
   * moves on to `nextDoi` and returns false.
   * @param {object} task - Current task (mutated).
   * @param {object[]} taskData - Full list written to the "tasks" GM value.
   * @param {string} [nextDoi] - URL of the next task.
   * @returns {Promise<boolean>} true when the caller should retry the task.
   */
  const checkRetry = async (task, taskData, nextDoi) => {
    const taskMaxRetryTimes =
      gmc.get("taskMaxRetryTimes") || TASK_MAX_RETRY_TIMES;
    const retryTimes = task.retryTimes || 0;

    if (retryTimes < taskMaxRetryTimes) {
      task.retryTimes = retryTimes + 1;
      await GM.setValue("tasks", taskData);
      return true;
    }

    console.log(
      `%cTask have been retry ${taskMaxRetryTimes} times! ${task.doi}`,
      printStyle
    );
    task.validated = false;
    task.updateTime = Date.now();
    // Persist the failure BEFORE moving on: prepareNextTask reads the list
    // back from storage and then navigates away, so a write issued after it
    // could be lost or could overwrite what prepareNextTask just saved.
    await GM.setValue("tasks", taskData);
    await prepareNextTask(nextDoi);
    return false;
  };
  417.  
  /**
   * Entry point, run once per page load.
   * Steps: add the import button, make sure a client id exists, inject the
   * dependencies, pick the first waiting task, report progress, skip pages
   * blocked by a Cloudflare challenge, request presigned upload URLs,
   * validate the page, upload the SingleFile snapshot and the cleaned HTML,
   * then move on to the next task.
   * Every exit path either navigates/reloads or returns after logging.
   * @returns {Promise<void>}
   */
  async function start() {
    console.log(new Date());

    const importBtnHandler = AddImportBtn();

    let clientId = await GM.getValue("clientId");
    if (typeof clientId !== "string" || !clientId) {
      clientId = generateClientId();
      await GM.setValue("clientId", clientId);
    }

    // ---------------------------- Script dependencies handler -----------------------------------------------------
    const dependenciesHandler = await dependenciesInit();

    // `singlefile`, `validators`, `DEFAULT_CONFIG` and `documentFixer` are
    // expected to be created by the injected scripts. They are probed with
    // typeof because reading an undeclared identifier throws a ReferenceError
    // instead of yielding a falsy value.
    if (
      typeof singlefile === "undefined" ||
      !singlefile ||
      !singlefile.getPageData
    ) {
      // No task has been selected yet, so the page URL identifies the failure.
      await reload(ERROR_RELOAD_TIME, `singlefile error! ${location.href}`);
      return;
    }

    if (
      typeof validators === "undefined" ||
      typeof DEFAULT_CONFIG === "undefined" ||
      !(validators && DEFAULT_CONFIG)
    ) {
      await reload(
        ERROR_RELOAD_TIME,
        "Can not get validators or DEFAULT_CONFIG"
      );
      return;
    }

    // ---------------------------- Get Task -----------------------------------------------------
    const taskData = await GM.getValue("tasks");
    const tasks = taskData || [];

    // find task which not downloaded and not validated before
    const waitingTasks = tasks.filter(
      (task) =>
        !task.downloaded &&
        task.validated === undefined &&
        validators[task.validator]
    );
    console.log(
      `%cTry to get tasks firstly(${waitingTasks.length} / ${tasks.length}):`,
      printStyle,
      tasks
    );

    if (!waitingTasks.length) {
      await reload(NO_TASK_WAITING_TIME, "No tasks waiting");
      return;
    }

    const invalidatedTasks = tasks.filter((task) => task.validated === false);
    // Most recently updated first.
    const doneTasks = tasks
      .filter((task) => task.downloaded)
      .sort((a, b) => (b.updateTime || 0) - (a.updateTime || 0));
    const previousDay = Date.now() - 24 * 3600 * 1000;
    const last24hDoneTasks = doneTasks.filter(
      (task) => task.updateTime > previousDay
    );

    const currentTask = waitingTasks[0];
    const nextTask = waitingTasks[1] || {};
    await saveTaskTimepoint(TIME_POINT_TYPES.TASK_LOADED, currentTask, tasks);

    // Store the outcome of the current task.
    const updateCurrentTask = async (isSuccess) => {
      currentTask.validated = isSuccess;
      currentTask.updateTime = Date.now();
      await GM.setValue("tasks", tasks);
    };

    // ---------------------------- Report progress -----------------------------------------------------

    const reportUrl = gmc.get("reportServer") || REPORT_ADDRESS;
    // Avoid reporting "Invalid Date" while nothing has been downloaded yet.
    const lastDoneText = doneTasks.length
      ? new Date(doneTasks[0].updateTime).toLocaleString()
      : "N/A";
    const reportTip = `Last download time: ${lastDoneText}\nSpeed: ${last24hDoneTasks.length} / last 24h`;
    // Deliberately not awaited: reporting must not delay the crawl. The final
    // catch keeps a failed report from becoming an unhandled rejection.
    GM.xmlHttpRequest({
      url: reportUrl,
      method: "POST",
      headers: { "Content-Type": "application/json" },
      data: JSON.stringify({
        account: clientId,
        invalidate_count: invalidatedTasks.length,
        done_count: doneTasks.length,
        queue_count: waitingTasks.length,
        tip: reportTip,
      }),
    })
      .then((res) => {
        console.log("Report successfully", { res });
      })
      .finally(() =>
        saveTaskTimepoint(TIME_POINT_TYPES.TASK_REPORTED, currentTask, tasks)
      )
      .catch((error) => {
        console.warn("Report failed", error);
      });

    // -------------------------- Detect Cloudflare challenge -------------------------------------------------------
    await sleep(PAGE_LOADING_TIME);
    if (document.getElementById("challenge-form")) {
      console.log(`%cCloudflare challenge! ${currentTask.doi}`, printStyle);
      await sleep(CF_CHALLENGE_WAITING_TIME);
      currentTask.cloudflareBlock = true;
      await updateCurrentTask(false);
      await prepareNextTask(nextTask.doi);
      return;
    }
    // bypass els institution check
    document.querySelector(".sec-A #bdd-els-close")?.click();

    // ---------------------------- validated task ------------------------------------------------

    const doi = currentTask.doi.replace("https://doi.org/", "").toLowerCase();
    const doiFixed = doi.replaceAll("/", "_");
    const validator = validators[currentTask.validator];

    let name = "";
    let pass = "";
    let preferServer = "";
    try {
      name = gmc.get("Name");
      pass = gmc.get("Password");
      preferServer = gmc.get("preferServer");
    } catch (err) {
      // Treated like missing credentials below.
      console.error(err);
    }
    if (!name || !pass) {
      console.error(
        `%cMiss name or password. Please input in config panel.`,
        printStyle
      );
      return;
    }

    const indexUrl = await getPreSignUrl(
      doiFixed,
      `_.html.gz`,
      name,
      pass,
      preferServer
    );
    await saveTaskTimepoint(TIME_POINT_TYPES.PRESIGN_INDEX, currentTask, tasks);
    const singlefileUrl = await getPreSignUrl(
      doiFixed,
      `_.sf.html.gz`,
      name,
      pass,
      preferServer
    );
    await saveTaskTimepoint(
      TIME_POINT_TYPES.PRESIGN_SINGLEFILE,
      currentTask,
      tasks
    );
    if (indexUrl === "RELOAD" || singlefileUrl === "RELOAD") {
      await reload(
        ERROR_RELOAD_LONG_TIME,
        "Minio PreSignUrl error, please check url or account"
      );
      return;
    }
    // No presigned URL for either file is treated as "already uploaded".
    if (!indexUrl && !singlefileUrl) {
      console.error("%cFile existed!!!", printStyle, currentTask.doi);
      await updateCurrentTask(false);
      await prepareNextTask(nextTask.doi);
      return;
    }
    // Same check for the uncompressed file names.
    const old_index = await getPreSignUrl(
      doiFixed,
      `_.html`,
      name,
      pass,
      preferServer
    );
    const old_singlefileUrl = await getPreSignUrl(
      doiFixed,
      `_.sf.html`,
      name,
      pass,
      preferServer
    );
    if (!old_index && !old_singlefileUrl) {
      console.error("%cFile existed!!!", printStyle, currentTask.doi);
      await updateCurrentTask(false);
      await prepareNextTask(nextTask.doi);
      return;
    }

    // --------------------------- Page validate ------------------------------------------------------
    // The page text must mention the DOI; otherwise this is not the task page.
    if (!document.body.textContent.toLowerCase().includes(doi)) {
      console.log(
        `%cURL not match, will redirect to ${currentTask.doi} 5s later`,
        printStyle
      );
      await sleep(QUICK_SLEEP_TIME);
      if (await checkRetry(currentTask, tasks, nextTask.doi)) {
        location.href = currentTask.doi;
      }
      return;
    }
    if (validator(document)) {
      console.log(
        "%cValidate successfully! Downloading page...",
        printStyle,
        waitingTasks,
        tasks
      );
      // Remove the import button so it is not part of the snapshot.
      importBtnHandler();
      // repair special page
      if (
        typeof documentFixer !== "undefined" &&
        documentFixer &&
        typeof documentFixer[currentTask.validator] === "function"
      ) {
        documentFixer[currentTask.validator](document);
      }
      try {
        const data = await singlefile.getPageData(DEFAULT_CONFIG);
        await saveTaskTimepoint(
          TIME_POINT_TYPES.SINGLE_FILE_SUCCESS,
          currentTask,
          tasks
        );
        if (singlefileUrl) {
          await uploader(singlefileUrl, data.content);
          await saveTaskTimepoint(
            TIME_POINT_TYPES.SINGLE_FILE_UPLOADED,
            currentTask,
            tasks
          );
        }
        if (indexUrl) {
          // Strip the injected crawler scripts, then all scripts and styles,
          // before uploading the plain HTML.
          dependenciesHandler();
          pureHTMLCleaner(document);
          await uploader(indexUrl, document.body.parentElement.outerHTML);
          await saveTaskTimepoint(
            TIME_POINT_TYPES.INDEX_FILE_UPLOADED,
            currentTask,
            tasks
          );
        }
        console.log("%cUpload successfully!", printStyle);
        currentTask.downloaded = true;
        await updateCurrentTask(true);
      } catch (error) {
        console.error(error);
        if (await checkRetry(currentTask, tasks, nextTask.doi)) {
          await reload(
            ERROR_RELOAD_TIME,
            `singlefile or upload error! ${currentTask.doi}`
          );
        }
        return;
      }
    } else {
      console.log(`%cValidate failed! ${currentTask.doi}`, printStyle);
      await saveTaskTimepoint(
        TIME_POINT_TYPES.VALIDATE_FAILED,
        currentTask,
        tasks
      );
      await updateCurrentTask(false);
    }

    // --------------------------- Prepare next task ------------------------------------------------------
    await prepareNextTask(nextTask.doi);
  }
  672.  
  673. start();
  674. })();