Crawler base on SingleFile

Automatically download a site as a single file

目前為 2024-01-04 提交的版本,檢視 最新版本

  1. // ==UserScript==
  2. // @name Crawler base on SingleFile
  3. // @author Mark
  4. // @description Download site in single file automatically
  5. // @license MIT
  6. // @version 0.0.11
  7. // @match https://*/*
  8. // @run-at document-idle
  9. // @grant GM.setValue
  10. // @grant GM.getValue
  11. // @grant GM.xmlHttpRequest
  12. // @grant GM_registerMenuCommand
  13. // @grant unsafeWindow
  14. // @require https://update.greasyfork.org/scripts/483730/1305396/gm-fetch.js
  15. // @connect *
  16. // @noframes
  17. // @namespace https://greasyfork.org/users/1106595
  18. // ==/UserScript==
  19.  
  20.  
  /**
   * Inject an external script into the page and retry when loading fails.
   *
   * A failed <script> element is removed before the next attempt so broken
   * tags do not pile up in the document (they would otherwise also end up in
   * any HTML captured from the page).
   *
   * @param {string} url - Absolute URL of the script to load.
   * @param {number} [maxRetries=Infinity] - How many more attempts to make
   *   after a failure. The default keeps retrying indefinitely.
   * @param {number} [retryDelayMs=2000] - Delay between attempts, in ms.
   */
  const addScript = (url, maxRetries = Infinity, retryDelayMs = 2000) => {
    const s = document.createElement("script");
    s.src = url;
    s.onerror = () => {
      // Drop the failed tag; a fresh element is created for the retry.
      s.remove();
      if (maxRetries <= 0) {
        console.error(`Failed to load script: ${url}`);
        return;
      }
      setTimeout(() => {
        addScript(url, maxRetries - 1, retryDelayMs);
      }, retryDelayMs);
    };
    document.body.append(s);
  };
  29.  
  /**
   * Produce a short pseudo-random identifier for this client.
   *
   * A random number in [0, 1e6) is rendered in base 32 and the radix point
   * is stripped from the resulting text.
   *
   * @returns {string} Identifier made of base-32 digits.
   */
  const generateClientId = () => {
    const scaled = Math.random() * 1e6;
    const base32Text = scaled.toString(32);
    return base32Text.replace(".", "");
  };
  32. // main function
  33. (function () {
  34. "use strict";
  35.  
  // Inject the crawler's config.js and validator.js plus the SingleFile
  // bootstrap / frame helper scripts, in the order listed.
  const bootstrapScriptUrls = [
    "https://cdn.jsdelivr.net/gh/IKKEM-Lin/crawler-base-on-singlefile/config.js",
    "https://cdn.jsdelivr.net/gh/IKKEM-Lin/crawler-base-on-singlefile/validator.js",
    "https://cdn.jsdelivr.net/gh/gildas-lormeau/SingleFile-MV3/lib/single-file-bootstrap.js",
    "https://cdn.jsdelivr.net/gh/gildas-lormeau/SingleFile-MV3/lib/single-file-hooks-frames.js",
    "https://cdn.jsdelivr.net/gh/gildas-lormeau/SingleFile-MV3/lib/single-file-frames.js",
  ];
  for (const scriptUrl of bootstrapScriptUrls) {
    addScript(scriptUrl);
  }
  // Replace the page's fetch so that a request which rejects (the original
  // author's stated purpose: bypassing CORS) is retried through GM_fetch.
  window.unsafeWindow.fetch = async (...args) => {
    try {
      return await fetch(...args);
    } catch {
      return await GM_fetch(...args);
    }
  };
  57.  
  /**
   * Trigger a browser download of `data` under the name `fileName`.
   *
   * The data is wrapped in an "application/octet-stream" Blob and downloaded
   * by clicking a temporary hidden link. The link is detached again afterwards
   * so it does not accumulate in the document or show up in HTML that is
   * captured from the page after a download.
   *
   * @param {BlobPart} data - Content of the file.
   * @param {string} fileName - Suggested file name for the download.
   */
  const downloadFile = (data, fileName) => {
    const blob = new Blob([data], {
      type: "application/octet-stream",
    });
    const url = window.URL.createObjectURL(blob);
    const a = document.createElement("a");
    a.style = "display: none";
    a.href = url;
    a.download = fileName;
    document.body.appendChild(a);
    try {
      a.click();
    } finally {
      // Always clean up, even if click() throws.
      a.remove();
      window.URL.revokeObjectURL(url);
    }
  };
  71.  
  /**
   * Pause for the given number of seconds.
   *
   * @param {number} duration - Time to wait, in seconds.
   * @returns {Promise<void>} Resolves (with no value) once the time elapsed.
   */
  const sleep = (duration) => {
    const delayMs = duration * 1000;
    return new Promise((resolve) => {
      setTimeout(() => {
        resolve();
      }, delayMs);
    });
  };
  77.  
  /**
   * Log a styled warning, wait, then reload the current page.
   *
   * @param {number} [waiting=60] - Seconds to wait before reloading.
   * @param {string} [message=""] - Reason shown in the console warning.
   * @returns {Promise<void>}
   */
  async function reload(waiting = 60, message = "") {
    const notice = `%c${message}, reload ${waiting}s later`;
    console.warn(notice, printStyle);
    await sleep(waiting);
    location.reload();
  }
  83.  
  /**
   * Open the browser's file picker and resolve with the user's selection.
   *
   * @param {string} [accept=""] - Value for the input's `accept` attribute.
   * @param {boolean} [multiple=false] - Allow selecting several files.
   * @returns {Promise<File|FileList>} The first selected file, or the whole
   *   FileList when `multiple` is truthy. Rejects with an Error when the
   *   selection is cancelled.
   */
  function readFile(accept = "", multiple = false) {
    const inputEl = document.createElement("input");
    inputEl.setAttribute("type", "file");
    inputEl.setAttribute("accept", accept);
    // `multiple` is a boolean attribute: its mere presence enables
    // multi-select, so setAttribute("multiple", false) would switch it ON.
    inputEl.multiple = Boolean(multiple);

    return new Promise((resolve, reject) => {
      let armTimer;

      // Fallback cancel detection: the first click on the page after the
      // dialog closed with nothing selected counts as a cancel.
      const onWindowClick = () => {
        window.removeEventListener("click", onWindowClick, true);
        if (!inputEl.value) {
          abort();
        }
      };

      // Undo everything this call attached to the page.
      const release = () => {
        clearTimeout(armTimer);
        window.removeEventListener("click", onWindowClick, true);
        inputEl.remove();
      };

      const abort = () => {
        release();
        reject(new Error("用户取消选择"));
      };

      inputEl.addEventListener("change", () => {
        const picked = multiple ? inputEl.files : inputEl.files[0];
        release();
        resolve(picked);
      });
      // Browsers that fire `cancel` on file inputs report a dismissed dialog
      // immediately instead of waiting for the next click.
      inputEl.addEventListener("cancel", abort);

      document.body.append(inputEl);
      inputEl.click();

      // Arm the click fallback slightly later so the click that opened the
      // picker is not mistaken for a cancel.
      armTimer = setTimeout(() => {
        window.addEventListener("click", onWindowClick, true);
      }, 100);
    });
  }
  108.  
  /**
   * Add a fixed-position "Import" button (wrapper id "CRAWLER_ID") to the page.
   *
   * Clicking it asks for confirmation, lets the user pick a .json file and,
   * when the file holds an array whose items all have `doi` and `validator`,
   * stores that array under the "tasks" key and reloads the page. Anything
   * else (unparsable JSON, wrong shape) shows an alert; a cancelled file
   * selection is ignored.
   */
  function AddImportBtn() {
    const formatHint =
      "Please upload json file like [{doi: string, validator: string, ...}]";
    const btnWrapImport = document.createElement("div");
    btnWrapImport.id = "CRAWLER_ID";
    btnWrapImport.innerHTML = `<button style="padding: 4px 8px;position: fixed;bottom: 40%;right: 8px;border-radius: 4px;background-color: #224466;color: #fff;">Import</button>`;
    const importBtn = btnWrapImport.querySelector("button");
    importBtn.onclick = async () => {
      if (
        !window.confirm(
          "The data in browser will be clear up. Please make sure you have to do this !!!"
        )
      ) {
        return;
      }

      let file;
      try {
        file = await readFile(".json");
      } catch (err) {
        // readFile rejects when the selection is cancelled; nothing to import.
        console.warn(err);
        return;
      }

      let json;
      try {
        json = JSON.parse(await file.text());
      } catch (err) {
        console.error(err);
        alert(formatHint);
        return;
      }

      const isTaskList =
        Array.isArray(json) &&
        json.every((item) => item?.doi && item?.validator);
      if (!isTaskList) {
        alert(formatHint);
        return;
      }

      // Wait for the write to finish; reloading first could discard it.
      await GM.setValue("tasks", json);
      location.reload();
    };
    document.body.appendChild(btnWrapImport);
  }
  146.  
  /**
   * Detach the element with id "CRAWLER_ID" from the document, if present.
   */
  function removeImportBtn() {
    const wrapper = document.getElementById("CRAWLER_ID");
    if (!wrapper) {
      return;
    }
    const { parentElement } = wrapper;
    parentElement.removeChild(wrapper);
  }
  153.  
  // Menu command "Download": export the stored task list as a JSON file named
  // <year>-<month>-<day>-<h><m><s>-<total>-<total minus waiting>.json
  GM_registerMenuCommand("Download", async () => {
    // Nothing may have been imported yet; treat that as an empty list
    // instead of crashing on `undefined.filter`.
    const taskData = (await GM.getValue("tasks")) ?? [];
    const waitingTasks = taskData.filter(
      (task) =>
        !task.downloaded &&
        task.validated === undefined &&
        validators[task.validator]
    );
    const now = new Date();
    const datePart = `${now.getFullYear()}-${now.getMonth() + 1}-${now.getDate()}`;
    const timePart = `${now.getHours()}${now.getMinutes()}${now.getSeconds()}`;
    const handledCount = taskData.length - waitingTasks.length;
    downloadFile(
      JSON.stringify(taskData),
      `${datePart}-${timePart}-${taskData.length}-${handledCount}.json`
    );
  });
  172.  
  // CSS applied to the crawler's "%c"-styled console messages.
  const printStyle = [
    "color: blue",
    "background-color: #ccc",
    "font-size: 20px",
  ].join(";");
  174.  
  /**
   * Check whether the globals `validators` and `DEFAULT_CONFIG` are available.
   *
   * Polls up to `attempts` times, waiting `intervalSeconds` between attempts
   * only. It returns as soon as both globals are present and does not wait
   * after the final failed attempt.
   *
   * @param {number} [attempts=2] - Maximum number of checks.
   * @param {number} [intervalSeconds=5] - Pause between checks, in seconds.
   * @returns {Promise<boolean>} true when both globals exist and are truthy.
   */
  const checker = async (attempts = 2, intervalSeconds = 5) => {
    for (let i = 0; i < attempts; i++) {
      if (i > 0) {
        await sleep(intervalSeconds);
      }
      // `typeof` never throws for an undeclared global, so no try/catch needed.
      const ready =
        typeof validators !== "undefined" &&
        typeof DEFAULT_CONFIG !== "undefined" &&
        Boolean(validators) &&
        Boolean(DEFAULT_CONFIG);
      if (ready) {
        return true;
      }
    }
    return false;
  };
  187.  
  /**
   * Run one crawl step for the current page load.
   *
   * Steps, in order:
   *  1. Show the import button, wait, inject single-file.js and load the
   *     stored task list.
   *  2. Reload if `validators` / `DEFAULT_CONFIG` never became available.
   *  3. Report progress counters to the remote endpoint (fire and forget).
   *  4. Reload later when no task is waiting.
   *  5. For the first waiting task: flag a Cloudflare challenge page, redirect
   *     to the task's DOI URL when the page text does not contain the DOI, or
   *     validate the page and download it (SingleFile output + raw HTML).
   *  6. Persist the task list and navigate to the next waiting task.
   *
   * @returns {Promise<void>}
   */
  async function start() {
    console.log(new Date());
    AddImportBtn();
    await sleep(7);
    addScript(
      "https://cdn.jsdelivr.net/gh/gildas-lormeau/SingleFile-MV3/lib/single-file.js"
    );
    const taskData = await GM.getValue("tasks");
    const tasks = taskData || [];

    const available = await checker();
    if (!available) {
      await reload(5, "Can not get validators or DEFAULT_CONFIG");
      return;
    }

    // find task which not downloaded and not validated before
    const waitingTasks = tasks.filter(
      (task) =>
        !task.downloaded &&
        task.validated === undefined &&
        validators[task.validator]
    );
    console.log(
      `%cTry to get tasks firstly(${waitingTasks.length} / ${tasks.length}):`,
      printStyle,
      tasks
    );

    // ---------------------------- Report progress -----------------------------------------------------

    let clientId = await GM.getValue("clientId");
    if (typeof clientId !== "string" || !clientId) {
      clientId = generateClientId();
      await GM.setValue("clientId", clientId);
    }
    const invalidatedTasks = tasks.filter((task) => task.validated === false);
    // Newest first. Returning 0 for equal timestamps keeps the comparator
    // consistent, which Array.prototype.sort requires.
    const doneTasks = tasks
      .filter((task) => task.downloaded)
      .sort((a, b) => {
        if (a.updateTime === b.updateTime) {
          return 0;
        }
        return a.updateTime > b.updateTime ? -1 : 1;
      });
    const previousDay = new Date().valueOf() - 24 * 3600 * 1000;
    const last24hDoneTasks = doneTasks.filter(
      (task) => task.updateTime > previousDay
    );
    const lastDoneTime = new Date(doneTasks[0]?.updateTime);
    // Avoid sending "Invalid Date" when nothing has been downloaded yet.
    const lastDoneLabel = Number.isNaN(lastDoneTime.getTime())
      ? "N/A"
      : lastDoneTime.toLocaleString();
    const reportTip = `Last download time: ${lastDoneLabel}\nSpeed: ${last24hDoneTasks.length} / last 24h`;
    // Fire and forget: a failed report must not stop the crawl.
    GM.xmlHttpRequest({
      url: "https://crawler-hit.deno.dev/api/update",
      method: "POST",
      headers: { "Content-Type": "application/json" },
      data: JSON.stringify({
        account: clientId,
        invalidate_count: invalidatedTasks.length,
        done_count: doneTasks.length,
        queue_count: waitingTasks.length,
        tip: reportTip,
      }),
    })
      .then((res) => {
        window.tts = res;
        console.log({ res });
      })
      .catch((err) => {
        console.error("Progress report failed", err);
      });

    if (!waitingTasks.length) {
      await reload(90, "No tasks waiting");
      return;
    }

    // -------------------------- Detect Cloudflare challenge -------------------------------------------------------
    await sleep(10);
    const currentTask = waitingTasks[0];
    const doi = currentTask.doi.replace("https://doi.org/", "").toLowerCase();
    const validator = validators[currentTask.validator];
    if (document.getElementById("challenge-form")) {
      console.log(`%cCloudflare challenge! ${currentTask.doi}`, printStyle);
      await sleep(20);
      currentTask.validated = false;
      currentTask.cloudflareBlock = true;
    }

    // --------------------------- Page validate ------------------------------------------------------
    // NOTE(review): if the target page never contains the DOI text, this
    // redirect repeats on every load — confirm whether a retry cap is needed.
    if (
      !currentTask.cloudflareBlock &&
      !document.body.textContent.toLowerCase().includes(doi)
    ) {
      console.log(
        `%cURL not match, will redirect to ${currentTask.doi} 5s later`,
        printStyle
      );
      await sleep(5);
      location.href = currentTask.doi;
      return;
    }
    if (!currentTask.cloudflareBlock && validator(document)) {
      console.log(
        "%cValidate successfully! Downloading page...",
        printStyle,
        waitingTasks,
        tasks
      );
      removeImportBtn();
      // repair special page (documentFixer is optional: checker() does not
      // verify it, so guard against it being undefined)
      if (
        typeof documentFixer !== "undefined" &&
        typeof documentFixer?.[currentTask.validator] === "function"
      ) {
        documentFixer[currentTask.validator](document);
      }
      try {
        const data = await singlefile.getPageData(DEFAULT_CONFIG);
        const baseName = doi.replaceAll("/", "_");
        downloadFile(data.content, `${baseName}.singlefile.html`);
        downloadFile(
          document.body.parentElement.outerHTML,
          `${baseName}.html`
        );
        currentTask.downloaded = true;
        currentTask.validated = true;
        currentTask.updateTime = new Date().valueOf();
      } catch (error) {
        console.error(error);
        await reload(10, `singlefile error! ${currentTask.doi}`);
        return;
      }
    } else {
      console.log(`%cValidate failed! ${currentTask.doi}`, printStyle);
      currentTask.validated = false;
      currentTask.updateTime = new Date().valueOf();
    }

    await GM.setValue("tasks", tasks);

    // --------------------------- Prepare next task ------------------------------------------------------
    const nextTask = waitingTasks[1];
    if (nextTask) {
      console.log(
        `%cStart next task 10s later...`,
        printStyle,
        nextTask.doi,
        tasks
      );
      await sleep(10);
      location.href = nextTask.doi;
    } else {
      await reload(60, "No tasks waiting");
    }
  }

  // Surface unexpected failures instead of leaving a floating promise.
  start().catch((error) => {
    console.error(error);
  });
  334. })();