Crawler base on SingleFile

Download site in single file automatically

当前为 2024-01-10 提交的版本,查看 最新版本

  1. // ==UserScript==
  2. // @name Crawler base on SingleFile
  3. // @author Mark
  4. // @description Download site in single file automatically
  5. // @license MIT
  6. // @version 0.0.13
  7. // @match https://*/*
  8. // @run-at document-idle
  9. // @grant GM.setValue
  10. // @grant GM.getValue
  11. // @grant GM.xmlHttpRequest
  12. // @grant GM_registerMenuCommand
  13. // @grant unsafeWindow
  14. // @require https://update.greasyfork.org/scripts/483730/1305396/gm-fetch.js
  15. // @require https://openuserjs.org/src/libs/sizzle/GM_config.js
  16. // @connect *
  17. // @noframes
  18. // @namespace https://greasyfork.org/users/1106595
  19. // ==/UserScript==
  20.  
  // ---------------------------------------------------------------------------
  // Timing configuration. Every value is a duration in SECONDS: they are passed
  // to `sleep()` / `reload()`, which convert to milliseconds themselves.
  // ---------------------------------------------------------------------------

  // Waits tied to the normal task flow.
  /** Grace period given to the target page before it is inspected. */
  const PAGE_LOADING_TIME = 7;
  /** Default pause between two consecutive tasks (overridable in the config panel). */
  const NEXT_TASK_WAITING_TIME = 10;
  /** Pause before re-checking the queue when no task is waiting. */
  const NO_TASK_WAITING_TIME = 90;
  /** Short pause before redirecting to the task URL when the page does not match. */
  const QUICK_SLEEP_TIME = 5;

  // Waits tied to error handling.
  /** Delay before reloading after a recoverable error. */
  const ERROR_RELOAD_TIME = 10;
  /** Delay before reloading after a pre-sign (account / URL) error. */
  const ERROR_RELOAD_LONG_TIME = 60;
  /** Extra wait applied when a Cloudflare challenge form is detected. */
  const CF_CHALLENGE_WAITING_TIME = 20;
  29.  
  /**
   * Names of the milestones recorded for each task. Each value becomes the
   * suffix of a `timePoint_<value>` property written onto the task object.
   *
   * Frozen so that a typo such as `TIME_POINT_TYPES.X = ...` fails loudly in
   * strict-mode code instead of silently altering the shared enum.
   */
  const TIME_POINT_TYPES = Object.freeze({
    PREPARE_START: "prepareStart",
    TASK_LOADED: "taskLoaded",
    TASK_REPORTED: "taskReported",
    PRESIGN_INDEX: "presignIndex",
    PRESIGN_SINGLEFILE: "presignSinglefile",
    SINGLE_FILE_SUCCESS: "singleFileSuccess",
    INDEX_FILE_UPLOADED: "indexFileUploaded",
    SINGLE_FILE_UPLOADED: "singleFileUploaded",
    VALIDATE_FAILED: "validateFailed",
  });
  /**
   * Settings panel (GM_config) holding the upload account and the pause
   * between tasks. Opened from the "Config" menu command.
   */
  const gmc = new GM_config({
    id: "CrawlerConfig",
    title: "Crawler setting",
    fields: {
      Name: {
        label: "Name",
        type: "text",
      },
      Password: {
        label: "Password",
        type: "text",
      },
      taskInterval: {
        label: "Task Interval (s)",
        type: "int",
        default: NEXT_TASK_WAITING_TIME,
      },
    },
    events: {
      init: function () {
        // runs after initialization completes
      },
      save: function () {
        // runs after values are saved
        // Only the account name is logged: the password must never be written
        // to the page console, where any script or bystander could read it.
        console.log("save", this.get("Name"));
        this.close();
      },
    },
  });
  70.  
  /**
   * Stateless helpers shared by the crawler: script injection, pre-signed URL
   * lookup, upload/download and small utilities.
   *
   * Helpers that call a sibling do so through `crawlerUtil.<name>`; the bare
   * names are not in scope at this level and would throw a ReferenceError.
   */
  const crawlerUtil = {
    /**
     * Inject an external script by URL, retrying every 2 seconds while loading fails.
     * @param {string} url - Script URL.
     */
    addScript: (url) => {
      const s = document.createElement("script");
      s.src = url;
      s.onerror = () => {
        // Drop the failed element so retries do not pile up dead <script> tags.
        s.remove();
        setTimeout(() => {
          crawlerUtil.addScript(url);
        }, 2000);
      };
      document.body.append(s);
    },

    /**
     * Download a script with GM.xmlHttpRequest and inject it inline, marked
     * with `data-crawler="true"`.
     * @param {string} url - Script URL.
     * @param {boolean} [cache=false] - Reuse / store the source in the
     *   "scriptCache" GM value.
     * @param {number} [retry=0] - Number of retries already performed.
     * @returns {Promise<boolean>} true once injected, false after the retries
     *   are exhausted.
     */
    addScriptByText: async (url, cache = false, retry = 0) => {
      const s = document.createElement("script");
      s.dataset.crawler = "true";
      const scriptCache = (await GM.getValue("scriptCache")) || {};
      if (cache && scriptCache[url]) {
        s.textContent = scriptCache[url];
        document.body.append(s);
        return true;
      }
      try {
        const res = await GM.xmlHttpRequest({
          url,
          method: "GET",
        });
        // An HTTP error still resolves; never inject (or cache) an error page as a script.
        if (
          typeof res.status === "number" &&
          (res.status < 200 || res.status >= 300)
        ) {
          throw new Error(`Unexpected HTTP status ${res.status} for ${url}`);
        }

        const text = res.responseText;
        if (cache) {
          scriptCache[url] = text;
          try {
            await GM.setValue("scriptCache", scriptCache);
          } catch (cacheError) {
            // Caching is best-effort; the script itself is still usable.
            console.warn("Can not cache script", url, cacheError);
          }
        }
        s.textContent = text;
        document.body.append(s);
        return true;
      } catch (error) {
        if (retry > 3) {
          return false;
        }
        await crawlerUtil.sleep(2);
        // Forward `cache` unchanged; only the retry counter advances.
        return crawlerUtil.addScriptByText(url, cache, retry + 1);
      }
    },

    /**
     * Ask the pre-sign servers, in order, for an upload URL; the first server
     * that answers wins.
     * @param {string} doi - Object prefix (DOI with "/" already replaced).
     * @param {string} fileName - Target file name.
     * @param {string} name - Account name.
     * @param {string} pass - Account password.
     * @returns {Promise<string|null>} The upload URL, "RELOAD" when the server
     *   asks for a reload, or null when no URL was returned or every server failed.
     */
    getPreSignUrl: async (doi, fileName, name, pass) => {
      const preSignServers = [
        "minio-presign.hzc.pub",
        "minio-presign-ali.hzc.pub",
        "chem-brain-minio.deno.dev",
      ];
      // Encode every value: a "&", "#" or "+" in a DOI or password would
      // otherwise corrupt the query string.
      const query = [
        ["doi", doi],
        ["file_name", fileName],
        ["account", name],
        ["pass", pass],
      ]
        .map(([key, value]) => `${key}=${encodeURIComponent(value)}`)
        .join("&");

      let preSignRes = null;
      for (const server of preSignServers) {
        try {
          const response = await GM_fetch(
            `https://${server}/api/presignedPutObject?${query}`
          );
          preSignRes = await response.json();
          break;
        } catch (error) {
          console.warn(`Pre-sign server ${server} failed, trying next`, error);
        }
      }

      // `preSignRes` stays null when every server failed.
      if (preSignRes?.reload) {
        return "RELOAD";
      }
      return preSignRes?.url || null;
    },

    /**
     * PUT `content` to `url` as an HTML file.
     * @param {string} url - Pre-signed upload URL.
     * @param {string} content - HTML source to upload.
     * @returns {Promise<object>} The GM.xmlHttpRequest response.
     * @throws {Error} When the server answers with a non-2xx status, so that a
     *   rejected upload is not mistaken for a success.
     */
    uploader: async (url, content) => {
      const file = new File([content], "default.html");

      const res = await GM.xmlHttpRequest({
        method: "PUT",
        url,
        headers: {
          "Content-Type": "text/html",
          "Content-Length": file.size,
        },
        data: file,
      });
      if (
        res &&
        typeof res.status === "number" &&
        (res.status < 200 || res.status >= 300)
      ) {
        throw new Error(`Upload failed with HTTP status ${res.status}`);
      }
      return res;
    },

    /**
     * Trigger a browser download of `data` under `fileName`.
     * @param {BlobPart} data - File content.
     * @param {string} fileName - Suggested file name.
     */
    downloadFile: (data, fileName) => {
      const a = document.createElement("a");
      document.body.appendChild(a);
      a.style = "display: none";
      const blob = new Blob([data], {
        type: "application/octet-stream",
      });
      const url = window.URL.createObjectURL(blob);
      a.href = url;
      a.download = fileName;
      a.click();
      window.URL.revokeObjectURL(url);
      // Remove the helper anchor so it does not leak into the captured page.
      a.remove();
    },

    /** @returns {string} A short random base-32 identifier. */
    generateClientId: () => (1e6 * Math.random()).toString(32).replace(".", ""),

    /**
     * @param {number} duration - Time to wait, in seconds.
     * @returns {Promise<void>} Resolves once the time has elapsed.
     */
    sleep: (duration) => {
      return new Promise((resolve) => {
        setTimeout(() => resolve(), duration * 1000);
      });
    },
  };
  181.  
  182. // main function
  183. (function () {
  184. "use strict";
  185. const {
  186. addScript,
  187. addScriptByText,
  188. generateClientId,
  189. uploader,
  190. downloadFile,
  191. getPreSignUrl,
  192. sleep,
  193. } = crawlerUtil;
  194.  
  /**
   * Inject, one after another and in a fixed order, the SingleFile library
   * scripts (cached) followed by the crawler's config and validator scripts
   * (always fetched fresh).
   * @returns {Promise<Function>} A cleanup function that removes every script
   *   element tagged `data-crawler="true"` from the document.
   */
  const dependenciesInit = async () => {
    const cachedLibraries = [
      "https://cdn.jsdelivr.net/gh/gildas-lormeau/SingleFile-MV3/lib/single-file-bootstrap.js",
      "https://cdn.jsdelivr.net/gh/gildas-lormeau/SingleFile-MV3/lib/single-file-hooks-frames.js",
      "https://cdn.jsdelivr.net/gh/gildas-lormeau/SingleFile-MV3/lib/single-file-frames.js",
      "https://cdn.jsdelivr.net/gh/gildas-lormeau/SingleFile-MV3/lib/single-file.js",
    ];
    const uncachedScripts = [
      "https://cdn.jsdelivr.net/gh/IKKEM-Lin/crawler-base-on-singlefile/config.js",
      "https://cdn.jsdelivr.net/gh/IKKEM-Lin/crawler-base-on-singlefile/validator.js",
    ];

    // Sequential on purpose: the scripts are injected in list order.
    for (const libraryUrl of cachedLibraries) {
      await addScriptByText(libraryUrl, true);
    }
    for (const scriptUrl of uncachedScripts) {
      await addScriptByText(scriptUrl);
    }

    const removeInjectedScripts = () => {
      const injected = document.querySelectorAll("script[data-crawler='true']");
      for (const scriptEl of injected) {
        scriptEl.parentElement.removeChild(scriptEl);
      }
    };
    return removeInjectedScripts;
  };
  225.  
  /**
   * Remove every <script> element, then every <style> element, from the given
   * document (in place).
   * @param {Document} document - Document to clean.
   */
  const pureHTMLCleaner = (document) => {
    for (const tagName of ["script", "style"]) {
      const matches = document.querySelectorAll(tagName);
      matches.forEach((node) => {
        node.parentElement.removeChild(node);
      });
    }
  };
  234.  
  /**
   * Replace `target.fetch` with a wrapper that first tries `nativeFetch` and,
   * if that throws or rejects, retries the same call through `fallbackFetch`.
   *
   * `nativeFetch` is captured once by the caller. Looking `fetch` up again
   * inside the wrapper could resolve to the wrapper itself and recurse.
   * @param {object} target - Object whose `fetch` property is replaced.
   * @param {Function} nativeFetch - Original fetch implementation.
   * @param {Function} fallbackFetch - Fetch used when the native one fails.
   */
  function installFetchFallback(target, nativeFetch, fallbackFetch) {
    target.fetch = async (...args) => {
      try {
        return await nativeFetch(...args);
      } catch (err) {
        return fallbackFetch(...args);
      }
    };
  }

  // Overwrite fetch function to bypass CORS
  {
    // Prefer the bare `unsafeWindow` binding granted by the userscript manager;
    // fall back to the `window.unsafeWindow` property the script used before.
    const pageWindow =
      typeof unsafeWindow !== "undefined"
        ? unsafeWindow
        : typeof window !== "undefined"
        ? window.unsafeWindow
        : undefined;
    if (pageWindow) {
      // GM_fetch is resolved lazily, at call time.
      installFetchFallback(pageWindow, fetch, (...args) => GM_fetch(...args));
    }
  }
  241.  
  /**
   * Log a styled warning, wait, then reload the current page.
   * @param {number} [waiting=60] - Delay before reloading, in seconds.
   * @param {string} [message=""] - Reason shown in the console.
   */
  async function reload(waiting = 60, message = "") {
    const notice = `%c${message}, reload ${waiting}s later`;
    console.warn(notice, printStyle);
    await sleep(waiting);
    location.reload();
  }
  247.  
  /**
   * Open the browser file picker and resolve with the user's choice.
   *
   * A temporary <input type="file"> is added to the page and removed again as
   * soon as the promise settles, so it never ends up in the captured HTML.
   * @param {string} [accept=""] - Value for the input's `accept` attribute.
   * @param {boolean} [multiple=false] - Allow selecting several files.
   * @returns {Promise<File|FileList>} The first file, or the whole FileList
   *   when `multiple` is true.
   * @throws {Error} (rejection) When the user dismisses the picker without
   *   choosing a file.
   */
  function readFile(accept = "", multiple = false) {
    const inputEl = document.createElement("input");
    inputEl.setAttribute("type", "file");
    inputEl.setAttribute("accept", accept);
    // `multiple` is a boolean attribute: its mere presence enables it, so it
    // must be omitted (not set to "false") for single selection.
    if (multiple) {
      inputEl.setAttribute("multiple", "");
    }
    return new Promise((resolve, reject) => {
      let armTimer = null;

      const cleanup = () => {
        clearTimeout(armTimer);
        window.removeEventListener("click", onWindowClick, true);
        inputEl.remove();
      };

      // Fallback cancel detection: the first click on the page after the
      // dialog closed with no file selected means the user cancelled.
      const onWindowClick = () => {
        const cancelled = !inputEl.value;
        cleanup();
        if (cancelled) {
          reject(new Error("用户取消选择"));
        }
      };

      inputEl.addEventListener("change", () => {
        const result = multiple ? inputEl.files : inputEl.files[0];
        cleanup();
        resolve(result);
      });
      // Browsers that support it fire `cancel` when the picker is dismissed.
      inputEl.addEventListener("cancel", () => {
        cleanup();
        reject(new Error("用户取消选择"));
      });

      document.body.append(inputEl);
      inputEl.click();

      // Arm the click listener slightly later so the click that opened the
      // picker is not itself taken for a cancellation.
      armTimer = setTimeout(() => {
        window.addEventListener("click", onWindowClick, true);
      }, 100);
    });
  }
  272.  
  /**
   * Add a floating "Import" button that replaces the stored task list with
   * the content of a user-selected JSON file, then reloads the page.
   *
   * The file must contain an array whose items all have a truthy `doi` and
   * `validator`; anything else is refused with an alert.
   * @returns {Function} A function that removes the button from the page.
   */
  function AddImportBtn() {
    const INVALID_FILE_MESSAGE =
      "Please upload json file like [{doi: string, validator: string, ...}]";

    const btnWrapImport = document.createElement("div");
    btnWrapImport.id = "CRAWLER_ID";
    btnWrapImport.innerHTML = `<button style="padding: 4px 8px;position: fixed;bottom: 40%;right: 8px;border-radius: 4px;background-color: #224466;color: #fff;">Import</button>`;
    const importBtn = btnWrapImport.querySelector("button");
    importBtn.onclick = async () => {
      if (
        !window.confirm(
          "The data in browser will be clear up. Please make sure you have to do this !!!"
        )
      ) {
        return;
      }

      let file;
      try {
        file = await readFile(".json");
      } catch (error) {
        // The user dismissed the picker; nothing to import.
        console.warn("Import cancelled", error);
        return;
      }

      let json;
      try {
        json = JSON.parse(await file.text());
      } catch (error) {
        console.error(error);
        alert(INVALID_FILE_MESSAGE);
        return;
      }

      const isTaskList =
        Array.isArray(json) &&
        json.every((item) => item && item.doi && item.validator);
      if (!isTaskList) {
        alert(INVALID_FILE_MESSAGE);
        return;
      }

      // Wait for the write to finish: reloading first could drop the import.
      await GM.setValue("tasks", json);
      location.reload();
    };
    document.body.appendChild(btnWrapImport);
    return () => {
      const importBtn = document.getElementById("CRAWLER_ID");
      if (importBtn) {
        importBtn.parentElement.removeChild(importBtn);
      }
    };
  }
  316.  
  // "Download" menu entry: export the stored task list as a JSON file named
  // <date>-<time>-<total tasks>-<tasks no longer waiting>.json
  GM_registerMenuCommand("Download", async () => {
    // Nothing may have been imported yet; export an empty list in that case.
    const taskData = (await GM.getValue("tasks")) || [];
    // `validators` is created by an injected script and may not exist yet; a
    // bare reference would throw, so probe it with `typeof`. When it is
    // unavailable the validator check is skipped.
    const knownValidators =
      typeof validators !== "undefined" && validators ? validators : null;
    const waitingTasks = taskData.filter(
      (task) =>
        !task.downloaded &&
        task.validated === undefined &&
        (!knownValidators || knownValidators[task.validator])
    );
    const now = new Date();
    downloadFile(
      JSON.stringify(taskData),
      `${now.getFullYear()}-${
        now.getMonth() + 1
      }-${now.getDate()}-${now.getHours()}${now.getMinutes()}${now.getSeconds()}-${
        taskData.length
      }-${taskData.length - waitingTasks.length}.json`
    );
  });

  // "Config" menu entry: open the settings panel.
  GM_registerMenuCommand("Config", async () => {
    gmc.open();
  });

  // Console style applied (through %c) to the crawler's status messages.
  const printStyle = "color: blue;background-color: #ccc;font-size: 20px";
  341.  
  /**
   * Move on after the current task: wait for the configured interval, record
   * the PREPARE_START time point on the next task and navigate to it. When
   * there is no next task, reload the page after NO_TASK_WAITING_TIME.
   * @param {string} [nextDoi] - URL (`doi` field) of the next task, if any.
   */
  const prepareNextTask = async (nextDoi) => {
    if (!nextDoi) {
      await reload(NO_TASK_WAITING_TIME, "No tasks waiting");
      return;
    }

    const taskInterval = gmc.get("taskInterval") || NEXT_TASK_WAITING_TIME;
    console.log(
      `%cStart next task ${taskInterval}s later...`,
      printStyle,
      nextDoi
    );
    await sleep(taskInterval);

    // The stored list may have been cleared or replaced while waiting, so the
    // next task is not guaranteed to be there any more.
    const taskData = (await GM.getValue("tasks")) || [];
    const task = taskData.find((item) => item.doi === nextDoi);
    if (task) {
      await saveTaskTimepoint(TIME_POINT_TYPES.PREPARE_START, task, taskData);
    } else {
      console.warn("Next task not found in stored tasks", nextDoi);
    }
    location.href = nextDoi;
  };
  359.  
  // Wall-clock time (ms since epoch) of the last time point recorded during
  // this page load; 0 means none has been recorded yet.
  let latestTimepoint = 0;

  /**
   * Record a time point on `task` and persist the whole task list.
   *
   * - PREPARE_START stores the absolute timestamp (ms since epoch). It is
   *   written just before navigating to the task, and read back after the
   *   page load to seed the clock, so it has to be an absolute time: storing
   *   an elapsed duration here made the first measurement on the new page
   *   meaningless.
   * - Every other point stores the milliseconds elapsed since the previous
   *   point, or 0 when there is no reference time at all.
   * @param {string} pointName - One of the TIME_POINT_TYPES values.
   * @param {object} task - Task to annotate (mutated in place).
   * @param {object[]} taskData - Full task list, written to the "tasks" value.
   */
  const saveTaskTimepoint = async (pointName, task, taskData) => {
    const now = Date.now();
    const prepareKey = `timePoint_${TIME_POINT_TYPES.PREPARE_START}`;

    if (pointName === TIME_POINT_TYPES.PREPARE_START) {
      task[prepareKey] = now;
    } else {
      if (latestTimepoint === 0) {
        // First point after a navigation: continue from the stored start time.
        latestTimepoint = task[prepareKey] || 0;
      }
      task[`timePoint_${pointName}`] =
        latestTimepoint === 0 ? 0 : now - latestTimepoint;
    }

    latestTimepoint = now;
    await GM.setValue("tasks", taskData);
  };
  374.  
  /**
   * Entry point, run once per page load. In order:
   *  1. adds the Import button and makes sure a client id exists;
   *  2. injects SingleFile plus the crawler's config/validator scripts;
   *  3. picks the first waiting task and reports progress (fire-and-forget);
   *  4. requests the two pre-signed upload URLs;
   *  5. waits for the page, then handles a Cloudflare challenge or a page that
   *     does not mention the task's DOI;
   *  6. validates the page, captures it with SingleFile and uploads both the
   *     capture and the script/style-free HTML;
   *  7. schedules the next task.
   * Every failure path either reloads the page or moves on to the next task.
   */
  async function start() {
    console.log(new Date());

    const importBtnHandler = AddImportBtn();

    let clientId = await GM.getValue("clientId");
    if (typeof clientId !== "string" || !clientId) {
      clientId = generateClientId();
      await GM.setValue("clientId", clientId);
    }

    // ---------------------------- Script dependencies handler -----------------------------------------------------
    const dependenciesHandler = await dependenciesInit();

    // `singlefile`, `validators`, `DEFAULT_CONFIG` and `documentFixer` are not
    // defined in this file. A bare reference to a missing global throws a
    // ReferenceError, so each one is probed with `typeof` first.
    if (
      typeof singlefile === "undefined" ||
      !singlefile ||
      !singlefile.getPageData
    ) {
      // No task has been selected yet at this point; report the page URL.
      await reload(ERROR_RELOAD_TIME, `singlefile error! ${location.href}`);
      return;
    }

    if (
      typeof validators === "undefined" ||
      typeof DEFAULT_CONFIG === "undefined" ||
      !(validators && DEFAULT_CONFIG)
    ) {
      await reload(
        ERROR_RELOAD_TIME,
        "Can not get validators or DEFAULT_CONFIG"
      );
      return;
    }

    // ---------------------------- Get Task -----------------------------------------------------
    const taskData = await GM.getValue("tasks");
    const tasks = taskData || [];

    // find task which not downloaded and not validated before
    const waitingTasks = tasks.filter(
      (task) =>
        !task.downloaded &&
        task.validated === undefined &&
        validators[task.validator]
    );
    console.log(
      `%cTry to get tasks firstly(${waitingTasks.length} / ${tasks.length}):`,
      printStyle,
      tasks
    );

    if (!waitingTasks.length) {
      await reload(NO_TASK_WAITING_TIME, "No tasks waiting");
      return;
    }

    const invalidatedTasks = tasks.filter((task) => task.validated === false);
    // Most recently updated first.
    const doneTasks = tasks
      .filter((task) => task.downloaded)
      .sort((a, b) => (a.updateTime > b.updateTime ? -1 : 1));
    const previousDay = new Date().valueOf() - 24 * 3600 * 1000;
    const last24hDoneTasks = doneTasks.filter(
      (task) => task.updateTime > previousDay
    );

    const lastDoneTime = new Date(doneTasks[0]?.updateTime);
    // With no finished task (or no update time) the date is invalid; report
    // "N/A" rather than the literal "Invalid Date".
    const lastDoneLabel = Number.isNaN(lastDoneTime.getTime())
      ? "N/A"
      : lastDoneTime.toLocaleString();
    const currentTask = waitingTasks[0];
    const nextTask = waitingTasks[1] || {};
    await saveTaskTimepoint(TIME_POINT_TYPES.TASK_LOADED, currentTask, tasks);

    // Store the outcome of the current task; `validated` is set to `isSuccess`.
    const updateCurrentTask = async (isSuccess) => {
      currentTask.validated = isSuccess;
      currentTask.updateTime = new Date().valueOf();
      await GM.setValue("tasks", tasks);
    };

    // ---------------------------- Report progress -----------------------------------------------------

    const reportTip = [
      `Last download time: ${lastDoneLabel}`,
      `Speed: ${last24hDoneTasks.length} / last 24h`,
    ].join("\n");
    const reportProgress = async () => {
      try {
        const res = await GM.xmlHttpRequest({
          url: "https://crawler-hit.deno.dev/api/update",
          method: "POST",
          headers: { "Content-Type": "application/json" },
          data: JSON.stringify({
            account: clientId,
            invalidate_count: invalidatedTasks.length,
            done_count: doneTasks.length,
            queue_count: waitingTasks.length,
            tip: reportTip,
          }),
        });
        console.log("Report successfully", { res });
      } catch (error) {
        // Reporting is best-effort and must never break the crawl.
        console.warn("Report failed", error);
      }
      await saveTaskTimepoint(TIME_POINT_TYPES.TASK_REPORTED, currentTask, tasks);
    };
    // Deliberately not awaited: the crawl continues while the report is sent.
    reportProgress().catch((error) => {
      console.warn("Can not save report time point", error);
    });

    // ---------------------------- validated task ------------------------------------------------

    const doi = currentTask.doi.replace("https://doi.org/", "").toLowerCase();
    const doiFixed = doi.replaceAll("/", "_");
    const validator = validators[currentTask.validator];

    let name = "";
    let pass = "";
    try {
      name = gmc.get("Name");
      pass = gmc.get("Password");
      if (!name || !pass) {
        throw new Error();
      }
    } catch (err) {
      console.error(
        `%cMiss name or password. Please input in config panel.`,
        printStyle
      );
      return;
    }

    const indexUrl = await getPreSignUrl(doiFixed, `_.html`, name, pass);
    await saveTaskTimepoint(TIME_POINT_TYPES.PRESIGN_INDEX, currentTask, tasks);
    const singlefileUrl = await getPreSignUrl(
      doiFixed,
      `_.sf.html`,
      name,
      pass
    );
    await saveTaskTimepoint(
      TIME_POINT_TYPES.PRESIGN_SINGLEFILE,
      currentTask,
      tasks
    );
    if (indexUrl === "RELOAD" || singlefileUrl === "RELOAD") {
      await reload(
        ERROR_RELOAD_LONG_TIME,
        "Minio PreSignUrl error, please check url or account"
      );
      return;
    }
    // NOTE(review): when exactly one of the two URLs is missing, the matching
    // upload below fails and the page reloads onto the same task — confirm
    // whether that case should be skipped instead.
    if (!indexUrl && !singlefileUrl) {
      console.error("%cFile existed!!!", printStyle, currentTask.doi);
      await updateCurrentTask(false);
      await prepareNextTask(nextTask.doi);
      return;
    }

    // -------------------------- Detect Cloudflare challenge -------------------------------------------------------
    await sleep(PAGE_LOADING_TIME);
    if (document.getElementById("challenge-form")) {
      console.log(`%cCloudflare challenge! ${currentTask.doi}`, printStyle);
      await sleep(CF_CHALLENGE_WAITING_TIME);
      currentTask.cloudflareBlock = true;
      await updateCurrentTask(false);
      await prepareNextTask(nextTask.doi);
      return;
    }

    // --------------------------- Page validate ------------------------------------------------------
    if (!document.body.textContent.toLowerCase().includes(doi)) {
      console.log(
        `%cURL not match, will redirect to ${currentTask.doi} 5s later`,
        printStyle
      );
      await sleep(QUICK_SLEEP_TIME);
      location.href = currentTask.doi;
      return;
    }
    if (validator(document)) {
      console.log(
        "%cValidate successfully! Downloading page...",
        printStyle,
        waitingTasks,
        tasks
      );
      importBtnHandler();
      // repair special page
      if (
        typeof documentFixer !== "undefined" &&
        documentFixer &&
        typeof documentFixer[currentTask.validator] === "function"
      ) {
        documentFixer[currentTask.validator](document);
      }
      try {
        const data = await singlefile.getPageData(DEFAULT_CONFIG);
        await saveTaskTimepoint(
          TIME_POINT_TYPES.SINGLE_FILE_SUCCESS,
          currentTask,
          tasks
        );
        await uploader(singlefileUrl, data.content);
        await saveTaskTimepoint(
          TIME_POINT_TYPES.SINGLE_FILE_UPLOADED,
          currentTask,
          tasks
        );
        // Strip the injected scripts, then all scripts/styles, before
        // uploading the plain HTML.
        dependenciesHandler();
        pureHTMLCleaner(document);
        await uploader(indexUrl, document.body.parentElement.outerHTML);
        await saveTaskTimepoint(
          TIME_POINT_TYPES.INDEX_FILE_UPLOADED,
          currentTask,
          tasks
        );
        console.log("%cUpload successfully!", printStyle);
        currentTask.downloaded = true;
        await updateCurrentTask(true);
      } catch (error) {
        console.error(error);
        await reload(
          ERROR_RELOAD_TIME,
          `singlefile or upload error! ${currentTask.doi}`
        );
        return;
      }
    } else {
      console.log(`%cValidate failed! ${currentTask.doi}`, printStyle);
      await saveTaskTimepoint(
        TIME_POINT_TYPES.VALIDATE_FAILED,
        currentTask,
        tasks
      );
      await updateCurrentTask(false);
    }

    // --------------------------- Prepare next task ------------------------------------------------------
    await prepareNextTask(nextTask.doi);
  }
  596.  
  597. start();
  598. })();