Crawler base on SingleFile

Download site in single file automatically

目前为 2024-01-10 提交的版本,查看 最新版本

  1. // ==UserScript==
  2. // @name Crawler base on SingleFile
  3. // @author Mark
  4. // @description Download site in single file automatically
  5. // @license MIT
  6. // @version 0.0.14
  7. // @match https://*/*
  8. // @run-at document-idle
  9. // @grant GM.setValue
  10. // @grant GM.getValue
  11. // @grant GM.xmlHttpRequest
  12. // @grant GM_registerMenuCommand
  13. // @grant unsafeWindow
  14. // @require https://update.greasyfork.org/scripts/483730/1305396/gm-fetch.js
  15. // @require https://openuserjs.org/src/libs/sizzle/GM_config.js
  16. // @connect *
  17. // @noframes
  18. // @namespace https://greasyfork.org/users/1106595
  19. // ==/UserScript==
  20.  
  // Timing constants. All values are in seconds: they are handed to sleep()/reload(),
  // and sleep() multiplies its argument by 1000 before calling setTimeout.
  const PAGE_LOADING_TIME = 7;
  const ERROR_RELOAD_TIME = 10;
  const ERROR_RELOAD_LONG_TIME = 60;
  const NEXT_TASK_WAITING_TIME = 10;

  const NO_TASK_WAITING_TIME = 90;
  const CF_CHALLENGE_WAITING_TIME = 20;
  const QUICK_SLEEP_TIME = 5;

  // Accepts "http(s)://" followed by dot-separated labels (optional trailing dot) and
  // nothing else: no path, query or fragment. Labels may contain ":" so a port is allowed.
  // Written as label(.label)* instead of (label.?)+ so that a non-matching input cannot
  // trigger exponential backtracking; the set of accepted strings is unchanged.
  const DOMAIN_REG = /^https?:\/\/[^\s\/?.#]+(?:\.[^\s\/?.#]+)*\.?$/;

  // Names of the checkpoints recorded per task; each is stored on the task object
  // under the key `timePoint_<value>`.
  const TIME_POINT_TYPES = Object.freeze({
    PREPARE_START: "prepareStart",
    TASK_LOADED: "taskLoaded",
    TASK_REPORTED: "taskReported",
    PRESIGN_INDEX: "presignIndex",
    PRESIGN_SINGLEFILE: "presignSinglefile",
    SINGLE_FILE_SUCCESS: "singleFileSuccess",
    INDEX_FILE_UPLOADED: "indexFileUploaded",
    SINGLE_FILE_UPLOADED: "singleFileUploaded",
    VALIDATE_FAILED: "validateFailed",
  });
  // Settings panel built with the GM_config library (loaded through @require).
  // Field values are read elsewhere with gmc.get("<field>") and the panel is opened
  // from the "Config" menu command.
  const gmc = new GM_config({
    id: "CrawlerConfig",
    title: "Crawler setting",
    fields: {
      Name: {
        label: "Name",
        type: "text",
      },
      Password: {
        label: "Password",
        type: "text",
      },
      taskInterval: {
        label: "Task Interval (s)",
        type: "int",
        default: NEXT_TASK_WAITING_TIME,
      },
      preferServer: {
        label: "Prefer preSign Server",
        type: "text",
      },
    },
    events: {
      init() {
        // runs after initialization completes
      },
      save() {
        // runs after values are saved
        // Only the account name is logged; the password must never reach the console.
        console.log("save", this.get("Name"));
        this.close();
      },
    },
  });
  75.  
  /**
   * Stateless helpers used by the crawler: script injection, presigned-URL lookup,
   * upload, local download, client-id generation and a seconds-based sleep.
   *
   * Helpers that call each other do so through `crawlerUtil.<name>`, because the
   * bare names are not in scope inside this object literal.
   */
  const crawlerUtil = {
    /**
     * Append a <script src=url> element to the page; if it fails to load, try again
     * after 2 seconds.
     * @param {string} url - script location.
     */
    addScript: (url) => {
      const s = document.createElement("script");
      s.src = url;
      s.onerror = () => {
        setTimeout(() => {
          crawlerUtil.addScript(url);
        }, 2000);
      };
      document.body.append(s);
    },

    /**
     * Download a script with GM.xmlHttpRequest and inject its text as an inline
     * <script data-crawler="true"> element.
     * @param {string} url - script location.
     * @param {boolean} [cache=false] - when true, reuse/store the text in the
     *   "scriptCache" GM value keyed by url.
     * @param {number} [retry=0] - number of attempts already made (internal).
     * @returns {Promise<boolean>} true once injected, false after the retries run out.
     */
    addScriptByText: async (url, cache = false, retry = 0) => {
      const s = document.createElement("script");
      s.dataset.crawler = "true";
      const scriptCache = (await GM.getValue("scriptCache")) || {};
      if (cache && scriptCache[url]) {
        s.innerHTML = scriptCache[url];
        document.body.append(s);
        return true;
      }
      try {
        const res = await GM.xmlHttpRequest({
          url,
          method: "GET",
        });
        // An HTTP error page must not be injected, and above all must not be cached:
        // cache entries are never invalidated.
        if (typeof res.status === "number" && res.status >= 400) {
          throw new Error(`Unexpected status ${res.status} for ${url}`);
        }

        const text = res.responseText;
        if (cache) {
          scriptCache[url] = text;
          // Persisting the cache is best-effort; a failure must not block injection.
          Promise.resolve(GM.setValue("scriptCache", scriptCache)).catch((err) => {
            console.warn("Failed to persist scriptCache", err);
          });
        }
        s.innerHTML = text;
        document.body.append(s);
        return true;
      } catch (error) {
        if (retry > 3) {
          return false;
        }
        await crawlerUtil.sleep(2);
        // Keep the caller's cache flag and advance the retry counter.
        return await crawlerUtil.addScriptByText(url, cache, retry + 1);
      }
    },

    /**
     * Ask the presign servers, in order, for a presigned PUT URL.
     * @param {string} doi - object identifier sent as the `doi` query parameter.
     * @param {string} fileName - sent as `file_name`.
     * @param {string} name - sent as `account`.
     * @param {string} pass - sent as `pass`.
     * @param {string} [preferServer=""] - tried first when it matches DOMAIN_REG.
     * @returns {Promise<string|null>} the `url` field of the first response that
     *   parses as JSON, `null` when that response has no url, or the string
     *   "RELOAD" when every server failed or the response carries a truthy `reload`.
     */
    getPreSignUrl: async (doi, fileName, name, pass, preferServer = "") => {
      const configServer = DOMAIN_REG.test(preferServer) ? [preferServer] : [];
      const preSignServers = [
        ...configServer,
        "https://minio-presign.hzc.pub",
        "https://minio-presign-ali.hzc.pub",
        "https://chem-brain-minio.deno.dev",
      ];
      // Encode every value: identifiers and passwords may contain "&", "#", "+" or
      // spaces, which would otherwise corrupt the query string.
      const query = [
        ["doi", doi],
        ["file_name", fileName],
        ["account", name],
        ["pass", pass],
      ]
        .map(([key, value]) => `${key}=${encodeURIComponent(value)}`)
        .join("&");

      for (const server of preSignServers) {
        try {
          const response = await GM_fetch(
            `${server}/api/presignedPutObject?${query}`
          );
          const preSignRes = await response.json();
          if (preSignRes?.reload) {
            return "RELOAD";
          }
          return preSignRes?.url || null;
        } catch (error) {
          // Request or JSON parsing failed: fall through to the next server.
        }
      }
      return "RELOAD";
    },

    /**
     * PUT `content` to `url` as an HTML file.
     * NOTE(review): the response status is not inspected here — confirm whether the
     * userscript manager rejects the promise on HTTP error statuses.
     * @param {string} url - destination (a presigned URL in this script).
     * @param {string} content - file body.
     * @returns {Promise<object>} whatever GM.xmlHttpRequest resolves with.
     */
    uploader: async (url, content) => {
      const file = new File([content], "default.html");

      return await GM.xmlHttpRequest({
        method: "PUT",
        url,
        headers: {
          "Content-Type": "text/html",
          "Content-Length": file.size,
        },
        data: file,
      });
    },

    /**
     * Offer `data` to the user as a file download named `fileName`.
     * The temporary anchor and object URL are released afterwards.
     */
    downloadFile: (data, fileName) => {
      const a = document.createElement("a");
      a.style.display = "none";
      document.body.appendChild(a);
      const blob = new Blob([data], {
        type: "application/octet-stream",
      });
      const url = window.URL.createObjectURL(blob);
      try {
        a.href = url;
        a.download = fileName;
        a.click();
      } finally {
        window.URL.revokeObjectURL(url);
        a.remove();
      }
    },

    /** @returns {string} a short random base-32 identifier (not cryptographically strong). */
    generateClientId: () => (1e6 * Math.random()).toString(32).replace(".", ""),

    /**
     * @param {number} duration - time to wait, in seconds.
     * @returns {Promise<void>} resolves after the delay.
     */
    sleep: (duration) =>
      new Promise((resolve) => {
        setTimeout(resolve, duration * 1000);
      }),
  };
  187.  
  188. // main function
  189. (function () {
  190. "use strict";
  191. const {
  192. addScript,
  193. addScriptByText,
  194. generateClientId,
  195. uploader,
  196. downloadFile,
  197. getPreSignUrl,
  198. sleep,
  199. } = crawlerUtil;
  200.  
  /**
   * Inject, one after another, the SingleFile library scripts (with caching enabled)
   * and then the project's config and validator scripts (without caching).
   * @returns {Promise<Function>} a function that removes every injected
   *   <script data-crawler='true'> element from the document.
   */
  const dependenciesInit = async () => {
    const cachedSources = [
      "https://cdn.jsdelivr.net/gh/gildas-lormeau/SingleFile-MV3/lib/single-file-bootstrap.js",
      "https://cdn.jsdelivr.net/gh/gildas-lormeau/SingleFile-MV3/lib/single-file-hooks-frames.js",
      "https://cdn.jsdelivr.net/gh/gildas-lormeau/SingleFile-MV3/lib/single-file-frames.js",
      "https://cdn.jsdelivr.net/gh/gildas-lormeau/SingleFile-MV3/lib/single-file.js",
    ];
    const freshSources = [
      "https://cdn.jsdelivr.net/gh/IKKEM-Lin/crawler-base-on-singlefile/config.js",
      "https://cdn.jsdelivr.net/gh/IKKEM-Lin/crawler-base-on-singlefile/validator.js",
    ];

    // Order matters: each script is injected only after the previous one finished.
    for (const source of cachedSources) {
      await addScriptByText(source, true);
    }
    for (const source of freshSources) {
      await addScriptByText(source);
    }

    return () => {
      const injected = document.querySelectorAll("script[data-crawler='true']");
      for (const node of injected) {
        node.parentElement.removeChild(node);
      }
    };
  };
  231.  
  /**
   * Strip every <script> element, then every <style> element, from the given document.
   * @param {Document} document - the document to clean (mutated in place).
   */
  const pureHTMLCleaner = (document) => {
    const detach = (node) => {
      node.parentElement.removeChild(node);
    };
    for (const tagName of ["script", "style"]) {
      document.querySelectorAll(tagName).forEach(detach);
    }
  };
  240.  
  // Overwrite fetch function to bypass CORS:
  // the page's fetch first goes through the regular fetch and, only if that rejects,
  // the same arguments are replayed through GM_fetch.
  window.unsafeWindow.fetch = async (...requestArgs) => {
    try {
      return await fetch(...requestArgs);
    } catch (err) {
      return await GM_fetch(...requestArgs);
    }
  };
  247.  
  /**
   * Log a warning, wait, then reload the current page.
   * @param {number} [waiting=60] - delay before reloading, in seconds.
   * @param {string} [message=""] - reason shown in the console warning.
   */
  async function reload(waiting = 60, message = "") {
    const notice = `%c${message}, reload ${waiting}s later`;
    console.warn(notice, printStyle);
    await sleep(waiting);
    location.reload();
  }
  253.  
  /**
   * Open the browser's file picker and resolve with the user's choice.
   * @param {string} [accept=""] - value for the input's `accept` attribute.
   * @param {boolean} [multiple=false] - allow selecting several files.
   * @returns {Promise<File|FileList>} the FileList when `multiple` is truthy,
   *   otherwise the first selected File. Rejects with an Error when the picker is
   *   dismissed without a selection.
   */
  function readFile(accept = "", multiple = false) {
    const inputEl = document.createElement("input");
    inputEl.setAttribute("type", "file");
    inputEl.setAttribute("accept", accept);
    // `multiple` is a boolean attribute: setAttribute("multiple", false) would still
    // switch it on, so set the property instead.
    inputEl.multiple = Boolean(multiple);
    // The input only exists to open the picker; keep it out of the page layout.
    inputEl.style.display = "none";

    return new Promise((resolve, reject) => {
      let armTimer = null;

      // Release everything this call installed, whichever way it ends.
      const cleanup = () => {
        clearTimeout(armTimer);
        window.removeEventListener("click", onWindowClick, true);
        inputEl.remove();
      };

      // Fallback cancel detection: the first click on the page after the picker
      // closed, with no file chosen, is treated as a cancellation.
      const onWindowClick = () => {
        if (!inputEl.value) {
          reject(new Error("用户取消选择"));
        }
        cleanup();
      };

      inputEl.addEventListener("change", () => {
        resolve(multiple ? inputEl.files : inputEl.files[0]);
        cleanup();
      });
      // Browsers that support it report a dismissed picker directly.
      inputEl.addEventListener("cancel", () => {
        reject(new Error("用户取消选择"));
        cleanup();
      });

      document.body.append(inputEl);
      inputEl.click();

      // Arm the fallback a little later so the click that opened the picker is not
      // mistaken for a cancellation.
      armTimer = setTimeout(() => {
        window.addEventListener("click", onWindowClick, true);
      }, 100);
    });
  }
  278.  
  /**
   * Add a fixed-position "Import" button to the page. Clicking it asks for
   * confirmation, lets the user pick a .json file and, if the file holds an array of
   * objects that each have `doi` and `validator`, stores it as the "tasks" GM value
   * and reloads the page.
   * @returns {Function} a function that removes the button from the page.
   */
  function AddImportBtn() {
    const btnWrapImport = document.createElement("div");
    btnWrapImport.id = "CRAWLER_ID";
    btnWrapImport.innerHTML = `<button style="padding: 4px 8px;position: fixed;bottom: 40%;right: 8px;border-radius: 4px;background-color: #224466;color: #fff;">Import</button>`;
    const importBtn = btnWrapImport.querySelector("button");

    importBtn.onclick = async () => {
      const confirmed = window.confirm(
        "The data in browser will be clear up. Please make sure you have to do this !!!"
      );
      if (!confirmed) {
        return;
      }

      let file;
      try {
        file = await readFile(".json");
      } catch (err) {
        // The user closed the picker without choosing a file: nothing to import.
        return;
      }

      let json = null;
      try {
        json = JSON.parse(await file.text());
      } catch (err) {
        // Unreadable or malformed file: handled by the validation message below.
        json = null;
      }

      const isTaskList =
        Array.isArray(json) &&
        json.every((item) => item?.doi && item?.validator);
      if (!isTaskList) {
        alert(
          "Please upload json file like [{doi: string, validator: string, ...}]"
        );
        return;
      }

      // Wait for the write to finish; reloading earlier could discard the import.
      await GM.setValue("tasks", json);
      location.reload();
    };

    document.body.appendChild(btnWrapImport);
    return () => {
      const mounted = document.getElementById("CRAWLER_ID");
      if (mounted) {
        mounted.parentElement.removeChild(mounted);
      }
    };
  }
  322.  
  // Menu command "Download": export the stored task list as a JSON file whose name
  // is <year>-<month>-<day>-<h><m><s>-<total tasks>-<tasks no longer waiting>.json.
  GM_registerMenuCommand("Download", async () => {
    // Nothing may have been imported yet; export an empty list instead of throwing.
    const taskData = (await GM.getValue("tasks")) || [];
    const waitingTasks = taskData.filter(
      (task) =>
        !task.downloaded &&
        task.validated === undefined &&
        validators[task.validator]
    );
    const now = new Date();
    downloadFile(
      JSON.stringify(taskData),
      `${now.getFullYear()}-${
        now.getMonth() + 1
      }-${now.getDate()}-${now.getHours()}${now.getMinutes()}${now.getSeconds()}-${
        taskData.length
      }-${taskData.length - waitingTasks.length}.json`
    );
  });

  // Menu command "Config": open the settings panel.
  GM_registerMenuCommand("Config", async () => {
    gmc.open();
  });

  // Second argument for console calls whose message starts with "%c".
  const printStyle = "color: blue;background-color: #ccc;font-size: 20px";
  347.  
  /**
   * Move on to the next task, or idle when there is none.
   * With a `nextDoi`: wait the configured interval, record the PREPARE_START time
   * point on that task, then navigate to it. Without one: reload the page after
   * NO_TASK_WAITING_TIME seconds.
   * @param {string} [nextDoi] - the `doi` value (a URL) of the task to open next.
   */
  const prepareNextTask = async (nextDoi) => {
    const taskInterval = gmc.get("taskInterval") || NEXT_TASK_WAITING_TIME;

    if (!nextDoi) {
      await reload(NO_TASK_WAITING_TIME, "No tasks waiting");
      return;
    }

    console.log(
      `%cStart next task ${taskInterval}s later...`,
      printStyle,
      nextDoi
    );
    await sleep(taskInterval);

    // Re-read the stored list so the time point is written to its current state.
    const storedTasks = await GM.getValue("tasks");
    const upcoming = storedTasks.find((candidate) => candidate.doi === nextDoi);
    await saveTaskTimepoint(
      TIME_POINT_TYPES.PREPARE_START,
      upcoming,
      storedTasks
    );
    location.href = nextDoi;
  };
  365.  
  // Wall-clock time (ms since epoch) of the last time point saved on this page;
  // 0 means none has been saved since the script started.
  let latestTimepoint = 0;

  /**
   * Record how long it took to reach `pointName` and persist the task list.
   *
   * `task["timePoint_<pointName>"]` receives the milliseconds elapsed since the
   * previous time point, or 0 when no baseline is known. Because the script
   * restarts on every navigation, the absolute time of PREPARE_START is also stored
   * (under `timePointAt_<PREPARE_START>`) and used as the baseline for the first
   * point recorded after the page loads. The `timePoint_*` values are durations and
   * cannot serve as that baseline.
   *
   * @param {string} pointName - one of the TIME_POINT_TYPES values.
   * @param {object} task - task object to annotate (mutated).
   * @param {Array<object>} taskData - full task list, written to the "tasks" GM value.
   */
  const saveTaskTimepoint = async (pointName, task, taskData) => {
    // Read the clock once so consecutive intervals line up exactly.
    const now = Date.now();
    const isPrepareStart = pointName === TIME_POINT_TYPES.PREPARE_START;
    const prepareStartAtKey = `timePointAt_${TIME_POINT_TYPES.PREPARE_START}`;

    if (latestTimepoint === 0 && !isPrepareStart) {
      latestTimepoint = Number(task[prepareStartAtKey]) || 0;
    }

    task[`timePoint_${pointName}`] =
      latestTimepoint === 0 ? 0 : now - latestTimepoint;
    if (isPrepareStart) {
      task[prepareStartAtKey] = now;
    }

    latestTimepoint = now;
    await GM.setValue("tasks", taskData);
  };
  380.  
  /**
   * Process the task that belongs to the current page, then move to the next one.
   *
   * Steps, in order:
   *  1. add the Import button and make sure a client id is stored;
   *  2. inject the script dependencies and check the globals they are expected to
   *     provide (`singlefile`, `validators`, `DEFAULT_CONFIG`; presumably defined by
   *     the injected scripts — they are not declared in this file);
   *  3. pick the first waiting task from the "tasks" GM value;
   *  4. report progress to the hit-counter endpoint (fire-and-forget);
   *  5. skip the task if a Cloudflare challenge form is present;
   *  6. request presigned upload URLs for the plain and the SingleFile snapshot;
   *  7. validate the page, upload both snapshots, mark the task;
   *  8. schedule the next task.
   * Recoverable failures end in `reload(...)`, which reloads the page after a delay.
   */
  async function start() {
    console.log(new Date());

    const importBtnHandler = AddImportBtn();

    let clientId = await GM.getValue("clientId");
    if (typeof clientId !== "string" || !clientId) {
      clientId = generateClientId();
      await GM.setValue("clientId", clientId);
    }

    // ---------------------------- Script dependencies handler -----------------------------------------------------
    const dependenciesHandler = await dependenciesInit();

    // `typeof` guards: reading a global that was never defined throws a
    // ReferenceError instead of yielding undefined. No task has been selected yet
    // at this point, so the message names the current page.
    if (
      typeof singlefile === "undefined" ||
      !singlefile ||
      !singlefile.getPageData
    ) {
      await reload(ERROR_RELOAD_TIME, `singlefile error! ${location.href}`);
      return;
    }

    if (
      typeof validators === "undefined" ||
      typeof DEFAULT_CONFIG === "undefined" ||
      !(validators && DEFAULT_CONFIG)
    ) {
      await reload(
        ERROR_RELOAD_TIME,
        "Can not get validators or DEFAULT_CONFIG"
      );
      return;
    }

    // ---------------------------- Get Task -----------------------------------------------------
    const taskData = await GM.getValue("tasks");
    const tasks = taskData || [];

    // find task which not downloaded and not validated before
    const waitingTasks = tasks.filter(
      (task) =>
        !task.downloaded &&
        task.validated === undefined &&
        validators[task.validator]
    );
    console.log(
      `%cTry to get tasks firstly(${waitingTasks.length} / ${tasks.length}):`,
      printStyle,
      tasks
    );

    if (!waitingTasks.length) {
      await reload(NO_TASK_WAITING_TIME, "No tasks waiting");
      return;
    }

    const invalidatedTasks = tasks.filter((task) => task.validated === false);
    // Most recently updated first.
    const doneTasks = tasks
      .filter((task) => task.downloaded)
      .sort((a, b) => (b.updateTime || 0) - (a.updateTime || 0));
    const previousDay = Date.now() - 24 * 3600 * 1000;
    const last24hDoneTasks = doneTasks.filter(
      (task) => task.updateTime > previousDay
    );

    // Avoid printing "Invalid Date" when nothing has been downloaded yet.
    const lastDoneAt = doneTasks[0]?.updateTime;
    const lastDoneLabel = lastDoneAt
      ? new Date(lastDoneAt).toLocaleString()
      : "N/A";
    const currentTask = waitingTasks[0];
    const nextTask = waitingTasks[1] || {};
    await saveTaskTimepoint(TIME_POINT_TYPES.TASK_LOADED, currentTask, tasks);

    // Store the outcome of the current task and persist the whole list.
    const updateCurrentTask = async (isSuccess) => {
      currentTask.validated = isSuccess;
      currentTask.updateTime = Date.now();
      await GM.setValue("tasks", tasks);
    };

    // ---------------------------- Report progress -----------------------------------------------------

    const reportTip = `Last download time: ${lastDoneLabel}\nSpeed: ${last24hDoneTasks.length} / last 24h`;
    // Deliberately not awaited: reporting must not delay or break the crawl.
    GM.xmlHttpRequest({
      url: "https://crawler-hit.deno.dev/api/update",
      method: "POST",
      headers: { "Content-Type": "application/json" },
      data: JSON.stringify({
        account: clientId,
        invalidate_count: invalidatedTasks.length,
        done_count: doneTasks.length,
        queue_count: waitingTasks.length,
        tip: reportTip,
      }),
    })
      .then((res) => {
        console.log("Report successfully", { res });
      })
      .catch((err) => {
        console.warn("Report failed", err);
      })
      .finally(() =>
        saveTaskTimepoint(TIME_POINT_TYPES.TASK_REPORTED, currentTask, tasks)
      )
      .catch((err) => {
        console.warn("Saving report time point failed", err);
      });

    // -------------------------- Detect Cloudflare challenge -------------------------------------------------------
    await sleep(PAGE_LOADING_TIME);
    if (document.getElementById("challenge-form")) {
      console.log(`%cCloudflare challenge! ${currentTask.doi}`, printStyle);
      await sleep(CF_CHALLENGE_WAITING_TIME);
      currentTask.cloudflareBlock = true;
      await updateCurrentTask(false);
      await prepareNextTask(nextTask.doi);
      return;
    }
    // ---------------------------- validated task ------------------------------------------------

    const doi = currentTask.doi.replace("https://doi.org/", "").toLowerCase();
    const doiFixed = doi.replaceAll("/", "_");
    const validator = validators[currentTask.validator];

    let name = "";
    let pass = "";
    let preferServer = "";
    try {
      name = gmc.get("Name");
      pass = gmc.get("Password");
      preferServer = gmc.get("preferServer");
      if (!name || !pass) {
        throw new Error("Missing name or password");
      }
    } catch (err) {
      // Without credentials nothing can be uploaded; stop until the user fills
      // in the config panel.
      console.error(
        `%cMiss name or password. Please input in config panel.`,
        printStyle
      );
      return;
    }

    const indexUrl = await getPreSignUrl(
      doiFixed,
      `_.html`,
      name,
      pass,
      preferServer
    );
    await saveTaskTimepoint(TIME_POINT_TYPES.PRESIGN_INDEX, currentTask, tasks);
    const singlefileUrl = await getPreSignUrl(
      doiFixed,
      `_.sf.html`,
      name,
      pass,
      preferServer
    );
    await saveTaskTimepoint(
      TIME_POINT_TYPES.PRESIGN_SINGLEFILE,
      currentTask,
      tasks
    );
    if (indexUrl === "RELOAD" || singlefileUrl === "RELOAD") {
      await reload(
        ERROR_RELOAD_LONG_TIME,
        "Minio PreSignUrl error, please check url or account"
      );
      return;
    }
    // NOTE(review): only the case where BOTH urls are missing is treated as
    // "already uploaded"; confirm with the presign service whether a single
    // missing url can occur.
    if (!indexUrl && !singlefileUrl) {
      console.error("%cFile existed!!!", printStyle, currentTask.doi);
      await updateCurrentTask(false);
      await prepareNextTask(nextTask.doi);
      return;
    }

    // --------------------------- Page validate ------------------------------------------------------
    // The page text must mention the DOI; otherwise go (back) to the task's URL.
    if (!document.body.textContent.toLowerCase().includes(doi)) {
      console.log(
        `%cURL not match, will redirect to ${currentTask.doi} 5s later`,
        printStyle
      );
      await sleep(QUICK_SLEEP_TIME);
      location.href = currentTask.doi;
      return;
    }
    if (validator(document)) {
      console.log(
        "%cValidate successfully! Downloading page...",
        printStyle,
        waitingTasks,
        tasks
      );
      // Remove the Import button so it does not end up in the snapshot.
      importBtnHandler();
      // repair special page
      if (
        typeof documentFixer !== "undefined" &&
        documentFixer &&
        typeof documentFixer[currentTask.validator] === "function"
      ) {
        documentFixer[currentTask.validator](document);
      }
      try {
        const data = await singlefile.getPageData(DEFAULT_CONFIG);
        await saveTaskTimepoint(
          TIME_POINT_TYPES.SINGLE_FILE_SUCCESS,
          currentTask,
          tasks
        );
        await uploader(singlefileUrl, data.content);
        await saveTaskTimepoint(
          TIME_POINT_TYPES.SINGLE_FILE_UPLOADED,
          currentTask,
          tasks
        );
        // Drop the injected scripts, then all scripts/styles, before taking the
        // plain-HTML snapshot.
        dependenciesHandler();
        pureHTMLCleaner(document);
        await uploader(indexUrl, document.body.parentElement.outerHTML);
        await saveTaskTimepoint(
          TIME_POINT_TYPES.INDEX_FILE_UPLOADED,
          currentTask,
          tasks
        );
        console.log("%cUpload successfully!", printStyle);
        currentTask.downloaded = true;
        await updateCurrentTask(true);
      } catch (error) {
        console.error(error);
        await reload(
          ERROR_RELOAD_TIME,
          `singlefile or upload error! ${currentTask.doi}`
        );
        return;
      }
    } else {
      console.log(`%cValidate failed! ${currentTask.doi}`, printStyle);
      await saveTaskTimepoint(
        TIME_POINT_TYPES.VALIDATE_FAILED,
        currentTask,
        tasks
      );
      await updateCurrentTask(false);
    }

    // --------------------------- Prepare next task ------------------------------------------------------
    await prepareNextTask(nextTask.doi);
  }

  // Surface unexpected failures instead of leaving an unhandled rejection.
  start().catch((error) => {
    console.error(error);
  });
  607. })();