Crawler base on SingleFile

Download site in single file automatically

  1. // ==UserScript==
  2. // @name Crawler base on SingleFile
  3. // @author Mark
  4. // @description Download site in single file automatically
  5. // @license MIT
  6. // @version 0.0.21
  7. // @match https://*/*
  8. // @run-at document-idle
  9. // @grant GM.setValue
  10. // @grant GM.getValue
  11. // @grant GM.xmlHttpRequest
  12. // @grant GM_registerMenuCommand
  13. // @grant unsafeWindow
  14. // @require https://update.greasyfork.org/scripts/483730/1305396/gm-fetch.js
  15. // @require https://openuserjs.org/src/libs/sizzle/GM_config.js
  16. // @connect *
  17. // @noframes
  18. // @namespace https://greasyfork.org/users/1106595
  19. // ==/UserScript==
  // ---------------------------------------------------------------------------
  // Tunables. Every *_TIME value is in seconds (sleep() multiplies by 1000).
  // ---------------------------------------------------------------------------

  /** Default endpoint that receives the progress report POSTed by start(). */
  const REPORT_ADDRESS = "https://crawler-hit.deno.dev/api/update";
  /** Pause before the page is inspected (Cloudflare check, validation). */
  const PAGE_LOADING_TIME = 7;
  /** Delay before reloading after a recoverable error. */
  const ERROR_RELOAD_TIME = 10;
  /** Delay before reloading when no pre-sign server could be reached. */
  const ERROR_RELOAD_LONG_TIME = 60;
  /** Default pause between two tasks (overridable via the "taskInterval" setting). */
  const NEXT_TASK_WAITING_TIME = 10;
  /** Delay before reloading when the task queue is empty. */
  const NO_TASK_WAITING_TIME = 90;
  /** Extra wait applied when a Cloudflare challenge form is detected. */
  const CF_CHALLENGE_WAITING_TIME = 20;
  /** Short pause used before redirecting to the task URL. */
  const QUICK_SLEEP_TIME = 5;
  /**
   * Matches a bare origin such as "https://host.example.com" (no path, query or
   * fragment). Group 1 is the protocol, group 2 the last host label.
   *
   * Written as "labels separated by single dots" so that every character can be
   * consumed in exactly one way: the earlier `([^...]+\.?)+` form matched the
   * same strings but backtracked exponentially on non-matching input (for
   * example a long host name followed by "/").
   */
  const DOMAIN_REG = /^(https?):\/\/(?:[^\s\/?.#]+\.)*([^\s\/?.#]+\.?)$/;
  /** Default number of retries per task (overridable via "taskMaxRetryTimes"). */
  const TASK_MAX_RETRY_TIMES = 3;
  /** Names of the timing checkpoints stored on a task as `timePoint_<name>`. */
  const TIME_POINT_TYPES = Object.freeze({
    PREPARE_START: "prepareStart",
    TASK_LOADED: "taskLoaded",
    TASK_REPORTED: "taskReported",
    PRESIGN_INDEX: "presignIndex",
    PRESIGN_SINGLEFILE: "presignSinglefile",
    SINGLE_FILE_SUCCESS: "singleFileSuccess",
    INDEX_FILE_UPLOADED: "indexFileUploaded",
    SINGLE_FILE_UPLOADED: "singleFileUploaded",
    VALIDATE_FAILED: "validateFailed",
  });
  /**
   * Settings panel backed by GM_config (loaded through @require).
   *
   * Fields read elsewhere in this script: Name, Password, taskInterval,
   * taskMaxRetryTimes, preferServer and reportServer.
   */
  const gmc = new GM_config({
    id: "CrawlerConfig",
    title: "Crawler setting",
    fields: {
      Name: {
        label: "Name",
        type: "text",
      },
      Password: {
        label: "Password",
        type: "text",
      },
      taskInterval: {
        label: "Task Interval (s)",
        type: "int",
        default: NEXT_TASK_WAITING_TIME,
      },
      taskMaxRetryTimes: {
        label: "Task Max Retry Times",
        type: "int",
        default: TASK_MAX_RETRY_TIMES,
      },
      preferServer: {
        label: "Prefer preSign Server",
        type: "text",
      },
      reportServer: {
        label: "Report Server",
        type: "text",
        default: REPORT_ADDRESS,
      },
    },
    events: {
      init: function () {
        // runs after initialization completes
      },
      save: function () {
        // runs after values are saved
        // Only the account name is logged: the password must never reach the
        // console, where any script running on the page could read it.
        console.log("save", this.get("Name"));
        this.close();
      },
    },
  });
  /**
   * DOM and network helpers shared by the crawler.
   *
   * Members reach their siblings through `crawlerUtil.<name>`: the arrow
   * functions below have no local binding for the other helpers, so a bare
   * `sleep(...)` or `addScriptByText(...)` would throw a ReferenceError.
   */
  const crawlerUtil = {
    /**
     * Append a `<script src>` tag to the page, retrying every 2 s while it fails to load.
     * @param {string} url - Script URL.
     */
    addScript: (url) => {
      const s = document.createElement("script");
      s.src = url;
      s.onerror = () => {
        // Drop the failed tag so retries do not pile up dead elements.
        s.remove();
        setTimeout(() => {
          crawlerUtil.addScript(url);
        }, 2000);
      };
      document.body.append(s);
    },

    /**
     * Download a script with GM.xmlHttpRequest and inject it inline, tagged with
     * `data-crawler="true"` so it can be removed later.
     * @param {string} url - Script URL.
     * @param {boolean} [cache=false] - Reuse / store the source under the "scriptCache" GM value.
     * @param {number} [retry=0] - Number of attempts already made (internal).
     * @returns {Promise<boolean>} true once injected, false after the retries are exhausted.
     */
    addScriptByText: async (url, cache = false, retry = 0) => {
      const inject = (code) => {
        const s = document.createElement("script");
        s.dataset.crawler = "true";
        s.textContent = code;
        document.body.append(s);
      };
      const scriptCache = (await GM.getValue("scriptCache")) || {};
      if (cache && scriptCache[url]) {
        inject(scriptCache[url]);
        return true;
      }
      try {
        const res = await GM.xmlHttpRequest({
          url,
          method: "GET",
        });
        // An error page must not be injected (or cached) as if it were the script.
        if (res.status < 200 || res.status >= 300) {
          throw new Error(`Unexpected HTTP status ${res.status} for ${url}`);
        }
        const text = res.responseText;
        if (cache) {
          scriptCache[url] = text;
          try {
            await GM.setValue("scriptCache", scriptCache);
          } catch (cacheError) {
            // Caching is best-effort; the script is still injected below.
            console.warn("Can not cache script", url, cacheError);
          }
        }
        inject(text);
        return true;
      } catch (error) {
        if (retry > 3) {
          return false;
        }
        await crawlerUtil.sleep(2);
        // Forward `cache` explicitly so the attempt counter lands in `retry`.
        return crawlerUtil.addScriptByText(url, cache, retry + 1);
      }
    },

    /**
     * Ask a pre-sign server for an upload URL, trying `preferServer` first (when
     * it looks like a bare origin) and then the built-in server.
     * @param {string} doi - Object key prefix.
     * @param {string} fileName - Object file name.
     * @param {string} name - Account name.
     * @param {string} pass - Account password.
     * @param {string} [preferServer=""] - Optional origin to try first.
     * @returns {Promise<string|null>} The URL, null when the response has none,
     *   or the string "RELOAD" when no server answered.
     */
    getPreSignUrl: async (doi, fileName, name, pass, preferServer = "") => {
      const configServer = DOMAIN_REG.test(preferServer) ? [preferServer] : [];
      const preSignSevers = configServer.concat([
        "https://electrolyte-brain-minio.deno.dev",
      ]);
      // URLSearchParams percent-encodes the values, so characters such as "&",
      // "#" or "+" in a DOI or password can not corrupt the query string.
      const query = new URLSearchParams({
        doi,
        file_name: fileName,
        account: name,
        pass,
      });
      for (const server of preSignSevers) {
        try {
          const response = await GM_fetch(
            `${server}/api/presignedPutObject?${query}`
          );
          const preSignRes = await response.json();
          if (preSignRes?.reload) {
            return "RELOAD";
          }
          return preSignRes?.url || null;
        } catch (error) {
          console.warn(`PreSign server failed: ${server}`, error);
        }
      }
      return "RELOAD";
    },

    /**
     * Gzip `content` with pako and PUT it to `url`.
     * @param {string} url - Pre-signed upload URL.
     * @param {string} content - Payload to compress and upload.
     * @returns {Promise<object>} The GM.xmlHttpRequest response.
     * @throws {Error} When the server answers with a non-2xx status.
     */
    uploader: async (url, content) => {
      const mime = "application/gzip";
      const gzipData = pako.gzip(content, { level: 9 });
      const uploadBlob = new Blob([gzipData], { type: mime });
      const res = await GM.xmlHttpRequest({
        method: "PUT",
        url,
        headers: {
          "Content-Type": mime,
          "Content-Length": uploadBlob.size,
        },
        data: uploadBlob,
      });
      // The request promise also resolves for 4xx/5xx answers; surface them so
      // the caller does not mark a failed upload as done.
      if (res && (res.status < 200 || res.status >= 300)) {
        throw new Error(`Upload failed with HTTP status ${res.status}`);
      }
      return res;
    },

    /**
     * Offer `data` to the user as a file download.
     * @param {*} data - Blob part to save.
     * @param {string} fileName - Suggested file name.
     */
    downloadFile: (data, fileName) => {
      const a = document.createElement("a");
      a.style.display = "none";
      document.body.appendChild(a);
      const blob = new Blob([data], {
        type: "application/octet-stream",
      });
      const url = window.URL.createObjectURL(blob);
      a.href = url;
      a.download = fileName;
      a.click();
      a.remove();
      // Revoke later: the browser may not have started reading the blob yet.
      setTimeout(() => window.URL.revokeObjectURL(url), 1000);
    },

    /** @returns {string} A short random base-32 identifier. */
    generateClientId: () => (1e6 * Math.random()).toString(32).replace(".", ""),

    /**
     * @param {number} duration - Seconds to wait.
     * @returns {Promise<void>} Resolves after `duration` seconds.
     */
    sleep: (duration) =>
      new Promise((resolve) => {
        setTimeout(resolve, duration * 1000);
      }),
  };
  185. // main function
  186. (function () {
  187. "use strict";
  188. const {
  189. addScript,
  190. addScriptByText,
  191. generateClientId,
  192. uploader,
  193. downloadFile,
  194. getPreSignUrl,
  195. sleep,
  196. } = crawlerUtil;
  /**
   * Inject, one after another, every script the crawler depends on.
   * The four SingleFile libraries are requested with caching enabled; the
   * remaining scripts are fetched on every run.
   * @returns {Promise<Function>} A cleanup callback that removes every script
   *   tag marked with data-crawler='true' from the document.
   */
  const dependenciesInit = async () => {
    // Each entry is the argument list handed to addScriptByText.
    const loadPlan = [
      [
        "https://cdn.jsdelivr.net/gh/gildas-lormeau/SingleFile-MV3/lib/single-file-bootstrap.js",
        true,
      ],
      [
        "https://cdn.jsdelivr.net/gh/gildas-lormeau/SingleFile-MV3/lib/single-file-hooks-frames.js",
        true,
      ],
      [
        "https://cdn.jsdelivr.net/gh/gildas-lormeau/SingleFile-MV3/lib/single-file-frames.js",
        true,
      ],
      [
        "https://cdn.jsdelivr.net/gh/gildas-lormeau/SingleFile-MV3/lib/single-file.js",
        true,
      ],
      ["https://cdn.jsdelivr.net/gh/IKKEM-Lin/crawler-base-on-singlefile/config.js"],
      ["https://crawal-validator.deno.dev/"],
      ["https://cdn.jsdelivr.net/npm/pako@2.1.0/dist/pako.min.js"],
    ];
    for (const args of loadPlan) {
      await addScriptByText(...args);
    }
    const removeInjectedScripts = () => {
      const injected = document.querySelectorAll("script[data-crawler='true']");
      for (const node of injected) {
        node.parentElement.removeChild(node);
      }
    };
    return removeInjectedScripts;
  };
  /**
   * Strip every script element, then every style element, from the given document.
   * @param {Document} document - Document to clean in place.
   */
  const pureHTMLCleaner = (document) => {
    for (const tagName of ["script", "style"]) {
      const nodes = document.querySelectorAll(tagName);
      for (const node of nodes) {
        node.parentElement.removeChild(node);
      }
    }
  };
  // Replace the page's fetch so that a request rejected by the regular fetch
  // (for instance because of CORS) is retried through GM_fetch.
  window.unsafeWindow.fetch = async (...args) => {
    const nativeAttempt = fetch(...args);
    try {
      return await nativeAttempt;
    } catch (err) {
      return await GM_fetch(...args);
    }
  };
  /**
   * Log a warning, wait, then reload the current page.
   * @param {number} [waiting=60] - Seconds to wait before reloading.
   * @param {string} [message=""] - Reason shown in the console.
   */
  async function reload(waiting = 60, message = "") {
    const notice = `%c${message}, reload ${waiting}s later`;
    console.warn(notice, printStyle);
    await sleep(waiting);
    location.reload();
  }
  /**
   * Open the browser's file picker and resolve with the user's choice.
   *
   * Cancellation is detected heuristically: the first click seen on the window
   * while the input is still empty rejects the promise.
   *
   * @param {string} [accept=""] - Value for the input's `accept` attribute.
   * @param {boolean} [multiple=false] - Allow selecting several files.
   * @returns {Promise<File|FileList>} The FileList when `multiple` is set,
   *   otherwise the first selected file.
   */
  function readFile(accept = "", multiple = false) {
    const inputEl = document.createElement("input");
    inputEl.setAttribute("type", "file");
    inputEl.setAttribute("accept", accept);
    // `multiple` is a boolean attribute: its mere presence enables it, so
    // setAttribute("multiple", false) would still allow several files.
    inputEl.multiple = Boolean(multiple);
    inputEl.style.display = "none";
    return new Promise((resolve, reject) => {
      let armTimer;
      const onWindowClick = () => {
        window.removeEventListener("click", onWindowClick, true);
        if (!inputEl.value) {
          inputEl.remove();
          reject(new Error("用户取消选择"));
        }
      };
      inputEl.addEventListener("change", () => {
        clearTimeout(armTimer);
        window.removeEventListener("click", onWindowClick, true);
        const picked = multiple ? inputEl.files : inputEl.files[0];
        // The input is only a trigger; do not leave one behind per call.
        inputEl.remove();
        resolve(picked);
      });
      document.body.append(inputEl);
      inputEl.click();
      // Arm the cancel detector a little later so the click that opened the
      // picker is not mistaken for a cancellation.
      armTimer = setTimeout(() => {
        window.addEventListener("click", onWindowClick, true);
      }, 100);
    });
  }
  /**
   * Add a floating "Import" button that replaces the stored task list with the
   * content of a user-selected JSON file and then reloads the page.
   *
   * The file must contain an array whose items all have a truthy `doi` and
   * `validator`.
   *
   * @returns {Function} Callback that removes the button from the page.
   */
  function AddImportBtn() {
    const FORMAT_HINT =
      "Please upload json file like [{doi: string, validator: string, ...}]";
    const btnWrapImport = document.createElement("div");
    btnWrapImport.id = "CRAWLER_ID";
    btnWrapImport.innerHTML = `<button style="padding: 4px 8px;position: fixed;bottom: 40%;right: 8px;border-radius: 4px;background-color: #224466;color: #fff;">Import</button>`;
    const importBtn = btnWrapImport.querySelector("button");
    importBtn.onclick = async () => {
      if (
        !window.confirm(
          "The data in browser will be clear up. Please make sure you have to do this !!!"
        )
      ) {
        return;
      }
      let file;
      try {
        file = await readFile(".json");
      } catch (error) {
        // The user closed the picker without choosing a file: nothing to do.
        console.warn(error);
        return;
      }
      let json;
      try {
        json = JSON.parse(await file.text());
      } catch (error) {
        console.error(error);
        alert(FORMAT_HINT);
        return;
      }
      const isTaskList =
        Array.isArray(json) &&
        json.every((item) => item && item.doi && item.validator);
      if (!isTaskList) {
        alert(FORMAT_HINT);
        return;
      }
      // Wait for the write to finish: reloading first could drop the import.
      await GM.setValue("tasks", json);
      location.reload();
    };
    document.body.appendChild(btnWrapImport);
    return () => {
      btnWrapImport.remove();
    };
  }
  // "Download" menu entry: export the stored task list as a JSON file whose
  // name carries the export time, the total task count and the number of tasks
  // that are no longer waiting.
  GM_registerMenuCommand("Download", async () => {
    // Nothing may be stored yet; export an empty list instead of throwing.
    const taskData = (await GM.getValue("tasks")) || [];
    // `validators` comes from an injected script and may not be loaded yet.
    // Without it the validator check is skipped rather than crashing the export.
    const hasValidator = (task) =>
      typeof validators === "undefined" || Boolean(validators[task.validator]);
    const waitingTasks = taskData.filter(
      (task) =>
        !task.downloaded && task.validated === undefined && hasValidator(task)
    );
    const now = new Date();
    downloadFile(
      JSON.stringify(taskData),
      `${now.getFullYear()}-${
        now.getMonth() + 1
      }-${now.getDate()}-${now.getHours()}${now.getMinutes()}${now.getSeconds()}-${
        taskData.length
      }-${taskData.length - waitingTasks.length}.json`
    );
  });
  // "Config" menu entry: open the settings panel.
  GM_registerMenuCommand("Config", async () => {
    gmc.open();
  });
  // Console style applied to the crawler's "%c" log messages.
  const printStyle = "color: blue;background-color: #ccc;font-size: 20px";
  /**
   * Move on to the next task: wait for the configured interval, stamp the
   * task's PREPARE_START time point and navigate to its URL. Without a next
   * task the page is reloaded after NO_TASK_WAITING_TIME seconds instead.
   * @param {string} [nextDoi] - URL of the next task, if any.
   */
  const prepareNextTask = async (nextDoi) => {
    const taskInterval = gmc.get("taskInterval") || NEXT_TASK_WAITING_TIME;
    if (!nextDoi) {
      await reload(NO_TASK_WAITING_TIME, "No tasks waiting");
      return;
    }
    const announcement = `%cStart next task ${taskInterval}s later...`;
    console.log(announcement, printStyle, nextDoi);
    await sleep(taskInterval);
    // Re-read the list so the stamp is written onto the stored state.
    const storedTasks = await GM.getValue("tasks");
    const upcoming = storedTasks.find((candidate) => candidate.doi === nextDoi);
    await saveTaskTimepoint(TIME_POINT_TYPES.PREPARE_START, upcoming, storedTasks);
    location.href = nextDoi;
  };
  // Epoch milliseconds of the most recent non-PREPARE_START checkpoint recorded
  // on this page; 0 means "none yet".
  let latestTimepoint = 0;
  /**
   * Record a timing checkpoint on `task` and persist the whole task list.
   *
   * PREPARE_START stores the absolute time (epoch ms). Every other checkpoint
   * stores the milliseconds elapsed since the previous checkpoint; the first
   * one on a page is measured from the task's stored PREPARE_START, or is 0
   * when that is missing.
   *
   * @param {string} pointName - One of TIME_POINT_TYPES.
   * @param {object} task - Task that receives the `timePoint_<name>` property.
   * @param {Array<object>} taskData - Full task list written to the "tasks" value.
   */
  const saveTaskTimepoint = async (pointName, task, taskData) => {
    // One clock read per call: the stored delta and the next baseline must
    // agree, otherwise the time between two reads is lost from the totals.
    const now = Date.now();
    const key = `timePoint_${pointName}`;
    if (pointName === TIME_POINT_TYPES.PREPARE_START) {
      task[key] = now;
    } else {
      if (latestTimepoint === 0) {
        latestTimepoint =
          task[`timePoint_${TIME_POINT_TYPES.PREPARE_START}`] || 0;
      }
      task[key] = latestTimepoint === 0 ? 0 : now - latestTimepoint;
      latestTimepoint = now;
    }
    await GM.setValue("tasks", taskData);
  };
  /**
   * Decide whether `task` may be attempted again.
   *
   * While the retry budget lasts, the task's `retryTimes` counter is incremented
   * and saved. Once it is exhausted the task is marked as not validated, saved,
   * and the crawler moves on to `nextDoi`.
   *
   * @param {object} task - Task that just failed.
   * @param {Array<object>} taskData - Full task list written to the "tasks" value.
   * @param {string} [nextDoi] - URL of the next task, if any.
   * @returns {Promise<boolean>} true when the caller should retry the task.
   */
  const checkRetry = async (task, taskData, nextDoi) => {
    const taskMaxRetryTimes =
      gmc.get("taskMaxRetryTimes") || TASK_MAX_RETRY_TIMES;
    const retryTimes = task.retryTimes || 0;
    if (retryTimes < taskMaxRetryTimes) {
      task.retryTimes = retryTimes + 1;
      await GM.setValue("tasks", taskData);
      return true;
    }
    console.log(
      `%cTask have been retry ${taskMaxRetryTimes} times! ${task.doi}`,
      printStyle
    );
    task.validated = false;
    task.updateTime = Date.now();
    // Persist BEFORE moving on: prepareNextTask re-reads the stored list and
    // then navigates away, so a later write could be lost (leaving the task
    // queued forever) or could overwrite what prepareNextTask just saved.
    await GM.setValue("tasks", taskData);
    await prepareNextTask(nextDoi);
    return false;
  };
  /**
   * Process one task on the current page, then move on to the next one.
   *
   * Steps: load the injected dependencies, pick the first waiting task, report
   * progress, skip pages blocked by a Cloudflare challenge, request the upload
   * URLs, check that the page belongs to the task, validate it with the task's
   * selectors, capture it with SingleFile and upload both the capture and the
   * cleaned page HTML.
   */
  async function start() {
    console.log(new Date());
    const importBtnHandler = AddImportBtn();
    let clientId = await GM.getValue("clientId");
    if (typeof clientId !== "string" || !clientId) {
      clientId = generateClientId();
      await GM.setValue("clientId", clientId);
    }
    // ---------------------------- Script dependencies handler -----------------------------------------------------
    const dependenciesHandler = await dependenciesInit();
    // The globals below are defined by the injected scripts. `typeof` is used
    // because reading an undeclared identifier throws instead of being falsy.
    // No task has been selected yet, so the page URL identifies the failure.
    if (typeof singlefile === "undefined" || !singlefile || !singlefile.getPageData) {
      await reload(ERROR_RELOAD_TIME, `singlefile error! ${location.href}`);
      return;
    }
    if (
      typeof validators === "undefined" ||
      typeof DEFAULT_CONFIG === "undefined" ||
      !(validators && DEFAULT_CONFIG)
    ) {
      await reload(
        ERROR_RELOAD_TIME,
        "Can not get validators or DEFAULT_CONFIG"
      );
      return;
    }
    // ---------------------------- Get Task -----------------------------------------------------
    const taskData = await GM.getValue("tasks");
    const tasks = taskData || [];
    // find task which not downloaded and not validated before
    const waitingTasks = tasks.filter(
      (task) =>
        !task.downloaded &&
        task.validated === undefined &&
        validators[task.validator]
    );
    console.log(
      `%cTry to get tasks firstly(${waitingTasks.length} / ${tasks.length}):`,
      printStyle,
      tasks
    );
    if (!waitingTasks.length) {
      await reload(NO_TASK_WAITING_TIME, "No tasks waiting");
      return;
    }
    const invalidatedTasks = tasks.filter((task) => task.validated === false);
    const doneTasks = tasks
      .filter((task) => task.downloaded)
      .sort((a, b) => (a.updateTime > b.updateTime ? -1 : 1));
    const previousDay = Date.now() - 24 * 3600 * 1000;
    const last24hDoneTasks = doneTasks.filter(
      (task) => task.updateTime > previousDay
    );
    const lastDoneTime = new Date(doneTasks[0]?.updateTime);
    const currentTask = waitingTasks[0];
    const nextTask = waitingTasks[1] || {};
    await saveTaskTimepoint(TIME_POINT_TYPES.TASK_LOADED, currentTask, tasks);
    const updateCurrentTask = async (isSuccess) => {
      currentTask.validated = isSuccess;
      currentTask.updateTime = Date.now();
      await GM.setValue("tasks", tasks);
    };
    // ---------------------------- Report progress -----------------------------------------------------
    const reportUrl = gmc.get("reportServer") || REPORT_ADDRESS;
    const reportTip = `Last download time: ${lastDoneTime.toLocaleString()}
Speed: ${last24hDoneTasks.length} / last 24h`;
    // Fire-and-forget: reporting must never delay or abort the crawl, but a
    // failure is still logged instead of surfacing as an unhandled rejection.
    GM.xmlHttpRequest({
      url: reportUrl,
      method: "POST",
      headers: { "Content-Type": "application/json" },
      data: JSON.stringify({
        account: clientId,
        invalidate_count: invalidatedTasks.length,
        done_count: doneTasks.length,
        queue_count: waitingTasks.length,
        tip: reportTip,
      }),
    })
      .then((res) => {
        console.log("Report successfully", { res });
      })
      .catch((error) => {
        console.warn("Report failed", error);
      })
      .finally(() =>
        saveTaskTimepoint(
          TIME_POINT_TYPES.TASK_REPORTED,
          currentTask,
          tasks
        ).catch((error) => {
          console.warn("Can not save report time point", error);
        })
      );
    // -------------------------- Detect Cloudflare challenge -------------------------------------------------------
    await sleep(PAGE_LOADING_TIME);
    if (document.getElementById("challenge-form")) {
      console.log(`%cCloudflare challenge! ${currentTask.doi}`, printStyle);
      await sleep(CF_CHALLENGE_WAITING_TIME);
      currentTask.cloudflareBlock = true;
      await updateCurrentTask(false);
      await prepareNextTask(nextTask.doi);
      return;
    }
    // bypass els institution check
    const elsCloseBtn = document.querySelector(".sec-A #bdd-els-close");
    if (elsCloseBtn) {
      elsCloseBtn.click();
    }
    // ---------------------------- validated task ------------------------------------------------
    const doi = currentTask.doi.replace("https://doi.org/", "").toLowerCase();
    const doiFixed = doi.replaceAll("/", "_");
    // A page is valid when at least one selector of each non-empty list
    // ("sel_A" and "sel_P") matches; a validator with two empty lists never passes.
    const validator = (doc) => {
      const rules = validators[currentTask.validator];
      const absSelectors = rules["sel_A"] || [];
      const paraSelectors = rules["sel_P"] || [];
      if (absSelectors.length === 0 && paraSelectors.length === 0) {
        return false;
      }
      const absValidated =
        absSelectors.length === 0 ||
        absSelectors.some((selector) => doc.querySelector(selector));
      const paraValidated =
        paraSelectors.length === 0 ||
        paraSelectors.some(
          (selector) => doc.querySelectorAll(selector).length > 0
        );
      return absValidated && paraValidated;
    };
    let name = "";
    let pass = "";
    let preferServer = "";
    try {
      name = gmc.get("Name");
      pass = gmc.get("Password");
      preferServer = gmc.get("preferServer");
      if (!name || !pass) {
        throw new Error("Missing name or password");
      }
    } catch (err) {
      console.error(
        `%cMiss name or password. Please input in config panel.`,
        printStyle
      );
      return;
    }
    const indexUrl = await getPreSignUrl(
      doiFixed,
      `_.html.gz`,
      name,
      pass,
      preferServer
    );
    await saveTaskTimepoint(TIME_POINT_TYPES.PRESIGN_INDEX, currentTask, tasks);
    const singlefileUrl = await getPreSignUrl(
      doiFixed,
      `_.sf.html.gz`,
      name,
      pass,
      preferServer
    );
    await saveTaskTimepoint(
      TIME_POINT_TYPES.PRESIGN_SINGLEFILE,
      currentTask,
      tasks
    );
    if (indexUrl === "RELOAD" || singlefileUrl === "RELOAD") {
      await reload(
        ERROR_RELOAD_LONG_TIME,
        "Minio PreSignUrl error, please check url or account"
      );
      return;
    }
    if (!indexUrl && !singlefileUrl) {
      console.error("%cFile existed!!!", printStyle, currentTask.doi);
      await updateCurrentTask(false);
      await prepareNextTask(nextTask.doi);
      return;
    }
    // Also probe the file names without the ".gz" suffix.
    const oldIndex = await getPreSignUrl(
      doiFixed,
      `_.html`,
      name,
      pass,
      preferServer
    );
    const oldSinglefileUrl = await getPreSignUrl(
      doiFixed,
      `_.sf.html`,
      name,
      pass,
      preferServer
    );
    if (!oldIndex && !oldSinglefileUrl) {
      console.error("%cFile existed!!!", printStyle, currentTask.doi);
      await updateCurrentTask(false);
      await prepareNextTask(nextTask.doi);
      return;
    }
    // --------------------------- Page validate ------------------------------------------------------
    if (!document.body.textContent.toLowerCase().includes(doi)) {
      console.log(
        `%cURL not match, will redirect to ${currentTask.doi} 5s later`,
        printStyle
      );
      await sleep(QUICK_SLEEP_TIME);
      if (await checkRetry(currentTask, tasks, nextTask.doi)) {
        location.href = currentTask.doi;
      }
      return;
    }
    if (validator(document)) {
      console.log(
        "%cValidate successfully! Downloading page...",
        printStyle,
        waitingTasks,
        tasks
      );
      importBtnHandler();
      // repair special page (documentFixer is optional and comes from an injected script)
      if (
        typeof documentFixer !== "undefined" &&
        documentFixer &&
        typeof documentFixer[currentTask.validator] === "function"
      ) {
        documentFixer[currentTask.validator](document);
      }
      try {
        const data = await singlefile.getPageData(DEFAULT_CONFIG);
        await saveTaskTimepoint(
          TIME_POINT_TYPES.SINGLE_FILE_SUCCESS,
          currentTask,
          tasks
        );
        if (singlefileUrl) {
          await uploader(singlefileUrl, data.content);
          await saveTaskTimepoint(
            TIME_POINT_TYPES.SINGLE_FILE_UPLOADED,
            currentTask,
            tasks
          );
        }
        if (indexUrl) {
          // Remove the injected scripts, then all scripts and styles, so only
          // the page's own markup is uploaded.
          dependenciesHandler();
          pureHTMLCleaner(document);
          await uploader(indexUrl, document.body.parentElement.outerHTML);
          await saveTaskTimepoint(
            TIME_POINT_TYPES.INDEX_FILE_UPLOADED,
            currentTask,
            tasks
          );
        }
        console.log("%cUpload successfully!", printStyle);
        currentTask.downloaded = true;
        await updateCurrentTask(true);
      } catch (error) {
        console.error(error);
        if (await checkRetry(currentTask, tasks, nextTask.doi)) {
          await reload(
            ERROR_RELOAD_TIME,
            `singlefile or upload error! ${currentTask.doi}`
          );
        }
        return;
      }
    } else {
      console.log(`%cValidate failed! ${currentTask.doi}`, printStyle);
      await saveTaskTimepoint(
        TIME_POINT_TYPES.VALIDATE_FAILED,
        currentTask,
        tasks
      );
      await updateCurrentTask(false);
    }
    // --------------------------- Prepare next task ------------------------------------------------------
    await prepareNextTask(nextTask.doi);
  }
  627. start();
  628. })();