懒人小说下载器

通用网站内容抓取工具,可批量抓取小说、论坛内容等并保存为TXT文档

当前为 2021-12-17 提交的版本,查看 最新版本

  1. // ==UserScript==
  2. // @name DownloadAllContent
  3. // @name:zh-CN 懒人小说下载器
  4. // @name:zh-TW 懶人小説下載器
  5. // @name:ja 怠惰者小説ダウンロードツール
  6. // @namespace hoothin
  7. // @version 2.5.2
  8. // @description Fetch and download main content on current page, provide special support for chinese novel
  9. // @description:zh-CN 通用网站内容抓取工具,可批量抓取小说、论坛内容等并保存为TXT文档
  10. // @description:zh-TW 通用網站內容抓取工具,可批量抓取小說、論壇內容等並保存為TXT文檔
  11. // @description:ja ユニバーサルサイトコンテンツクロールツール、クロール、フォーラム内容など
  12. // @author hoothin
  13. // @include *
  14. // @grant GM_xmlhttpRequest
  15. // @grant GM_registerMenuCommand
  16. // @grant GM_setValue
  17. // @grant GM_getValue
  18. // @require https://cdn.jsdelivr.net/npm/file-saver@1.3.8/FileSaver.min.js
  19. // @license MIT License
  20. // @compatible chrome
  21. // @compatible firefox
  22. // @compatible opera 未测试
  23. // @compatible safari 未测试
  24. // @contributionURL https://www.paypal.com/cgi-bin/webscr?cmd=_donations&business=rixixi@sina.com&item_name=Greasy+Fork+donation
  25. // @contributionAmount 1
  26. // ==/UserScript==
  27.  
  28. (function() {
  29. 'use strict';
  30. var indexReg=/PART\b|^Prologue|^\D+\-\d+|分卷|Chapter\s*[\-_]?\d+|^序$|^序\s*言|^序\s*章|^前\s*言|^引\s*言|^引\s*子|^摘\s*要|^楔\s*子|^契\s*子|^后\s*记|^後\s*記|^附\s*言|^结\s*语|^結\s*語|^尾\s*声|^最終話|^最终话|^番\s*外|[第^\s(][\d〇零一二三四五六七八九十百千万萬-]+\s*(、|)|\.\D|章|节|節|回|卷|折|篇|幕|集|话|話)/i;
  31. var innerNextPage=/下一(页|张)|next\s*page/i;
  // Browser UI language tag (for example "zh-CN"), used to pick the UI strings.
  // `navigator.language` is the standard property; `navigator.userLanguage` is
  // kept only as a fallback for legacy browsers that lack it. This replaces
  // sniffing of the deprecated `navigator.appName`.
  var lang = navigator.language || navigator.userLanguage;
  // UI string tables. "%s" marks a positional placeholder filled by getI18n().
  var i18nTables={
    // Simplified Chinese
    zhHans:{
      fetch:"开始下载小说或其他【Ctrl+F9】",
      info:"本文是使用懒人小说下载器(DownloadAllContent)下载的",
      error:"该段内容获取失败",
      downloading:"已下载完成 %s 段,剩余 %s 段<br>正在下载 %s",
      complete:"已全部下载完成,共 %s 段",
      del:"设置文本干扰码的CSS选择器",
      custom:"自定义下载",
      customInfo:"输入网址或者章节CSS选择器",
      reSort:"按标题名重新排序",
      setting:"懒人小说下载设置",
      abort:"跳过此章",
      save:"临时保存",
      downThreadNum:"设置同时下载的线程数"
    },
    // Traditional Chinese
    zhHant:{
      fetch:"開始下載小說或其他【Ctrl+F9】",
      info:"本文是使用懶人小說下載器(DownloadAllContent)下載的",
      error:"該段內容獲取失敗",
      downloading:"已下載完成 %s 段,剩餘 %s 段<br>正在下載 %s",
      complete:"已全部下載完成,共 %s 段",
      del:"設置文本干擾碼的CSS選擇器",
      custom:"自定義下載",
      customInfo:"輸入網址或者章節CSS選擇器",
      reSort:"按標題名重新排序",
      setting:"懶人小說下載設置",
      abort:"跳過此章",
      save:"保存當前",
      downThreadNum:"設置同時下載的綫程數"
    },
    // English (default for every non-Chinese language)
    en:{
      fetch:"Download All Content[Ctrl+F9]",
      info:"The TXT is downloaded by 'DownloadAllContent'",
      error:"Failed in downloading current chapter",
      downloading:"%s pages are downloaded, there are still %s pages left<br>Downloading %s ......",
      complete:"Completed! Get %s pages in total",
      del:"Set css selectors for ignore",
      custom:"Custom to download",
      customInfo:"Input urls OR css selectors for chapter links",
      reSort:"ReSort by title",
      setting:"DownloadAllContent Setting",
      abort:"Abort",
      save:"Save",
      downThreadNum:"Set threadNum for download"
    }
  };

  /**
   * Picks the UI string table for a language tag.
   *
   * "zh-CN"/"zh-SG" map to Simplified Chinese and "zh-TW"/"zh-HK" to
   * Traditional Chinese, as before. In addition the match is case-insensitive
   * and covers bare "zh", script subtags ("zh-Hans-*", "zh-Hant-*") and
   * "zh-MO". Everything else, including a missing tag, gets English.
   *
   * @param {string} langTag language tag such as "zh-TW"; may be undefined
   * @returns {Object} one of the tables in i18nTables
   */
  function selectI18n(langTag){
    var tag=String(langTag||"").toLowerCase().replace(/_/g,"-");
    if(!/^zh(-|$)/.test(tag))return i18nTables.en;
    // An explicit script subtag wins over the region.
    if(/-hant(-|$)/.test(tag))return i18nTables.zhHant;
    if(/-hans(-|$)/.test(tag))return i18nTables.zhHans;
    if(/-(tw|hk|mo)(-|$)/.test(tag))return i18nTables.zhHant;
    return i18nTables.zhHans;
  }

  // Active UI strings for the detected browser language.
  var i18n=selectI18n(lang);
  // Downloaded text, one entry per chapter link (filled by indexDownload).
  var rCats=[];
  // firefox: true when the user-agent string contains "firefox"; getPageContent
  // reads textContent instead of innerText in that case.
  // curRequests: [chapterIndex, requestHandle] pairs for downloads in flight.
  var firefox=navigator.userAgent.toLowerCase().indexOf('firefox')!=-1,curRequests=[];
  // Progress-panel elements, assigned by initTxtDownDiv(); txtDownDivInited
  // stops the panel from being built twice.
  var rocketContent,txtDownContent,txtDownWords,txtDownQuit,txtDownDivInited=false;
  92.  
  /**
   * Builds the floating progress panel (status text, close button, abort and
   * temp-save buttons), appends it to the page and wires up its handlers.
   * Does nothing when the panel already exists.
   *
   * Closing the panel removes it from the page and clears the "initialised"
   * flag, so the next download can build a fresh panel instead of writing its
   * progress into a detached element.
   */
  function initTxtDownDiv(){
    if(txtDownDivInited)return;
    txtDownDivInited=true;
    rocketContent=document.createElement("div");
    document.body.appendChild(rocketContent);
    // Assigning outerHTML replaces the placeholder element with the panel markup.
    rocketContent.outerHTML=`
<div id="txtDownContent">
<div style="width:360px;height:90px;position:fixed;left:50%;top:50%;margin-top:-25px;margin-left:-150px;z-index:100000;background-color:#ffffff;border:1px solid #afb3b6;border-radius:10px;opacity:0.95;filter:alpha(opacity=95);box-shadow:5px 5px 20px 0px #000;">
<div id="txtDownWords" style="position:absolute;width:275px;max-height: 90%;border: 1px solid #f3f1f1;padding: 8px;border-radius: 10px;overflow: auto;">
Downloading......
</div>
<div id="txtDownQuit" style="width:36px;height:28px;border-radius:10px;position:absolute;right:2px;top:2px;cursor: pointer;background-color:#ff5a5a;">
<span style="height:28px;line-height:28px;display:block;color:#FFF;text-align:center;font-size:20px;">╳</span>
</div>
<div style="position:absolute;right:0px;bottom:2px;cursor: pointer;max-width:85px">
<button id="abortRequest" style="background: #008aff;border: 0;padding: 5px;border-radius: 10px;color: white;float: right;margin: 1px;height: 25px;display:none;">${getI18n('abort')}</button>
<button id="tempSaveTxt" style="background: #008aff;border: 0;padding: 5px;border-radius: 10px;color: white;float: right;margin: 1px;height: 25px;">${getI18n('save')}</button>
</div>
</div>
</div>`;
    txtDownContent=document.querySelector("#txtDownContent");
    txtDownWords=document.querySelector("#txtDownWords");
    txtDownQuit=document.querySelector("#txtDownQuit");
    txtDownQuit.onclick=function(){
      txtDownContent.style.display="none";
      txtDownContent.parentNode.removeChild(txtDownContent);
      // Allow the panel to be rebuilt by a later download.
      txtDownDivInited=false;
    };
    initTempSave();
  }
  122.  
  /**
   * Wires the panel's two buttons:
   *  - "save" writes everything collected in rCats so far to a TXT file,
   *  - "abort" cancels the most recently tracked in-flight request.
   */
  function initTempSave(){
    var saveButton = document.getElementById('tempSaveTxt');
    var abortButton = document.getElementById('abortRequest');
    saveButton.onclick = function(){
      var header = i18n.info+"\r\n"+document.title+"\r\n\r\n";
      var text = header+rCats.join("\r\n\r\n");
      saveAs(new Blob([text], {type: "text/plain;charset=utf-8"}), document.title+".txt");
    };
    abortButton.onclick = function(){
      // Entries are [chapterIndex, requestHandle]; nothing to do when none is pending.
      var latest = curRequests.pop();
      if(!latest)return;
      latest[1].abort();
    };
  }
  135.  
  /**
   * Downloads every chapter link in aEles, extracts the main text of each page
   * and, once all are done, saves the result as one TXT file.
   *
   * Requests run in parallel (stored "downThreadNum", default 20). When a
   * downloaded page contains an in-chapter "next page" link, that link is
   * appended to aEles and later merged back right after its chapter.
   *
   * @param {Array} aEles array of anchor-like objects (href, innerText); it is
   *        sorted in place when "contentSort" is set and grows when
   *        continuation pages are discovered
   */
  function indexDownload(aEles){
    if(aEles.length<1)return;
    initTxtDownDiv();
    if(GM_getValue("contentSort")){
      // Order by the digits contained in the link text.
      aEles.sort(function(a,b){
        return parseInt(a.innerText.replace(/[^0-9]/ig,""),10) - parseInt(b.innerText.replace(/[^0-9]/ig,""),10);
      });
    }
    rCats=[];
    // insertSigns[k] lists the indexes (into aEles/rCats) of continuation pages
    // that belong right after chapter k.
    var insertSigns=[];
    // downIndex: index of the most recently scheduled link; downNum: finished requests.
    var downIndex=0,downNum=0;

    // Link text comes from the page and ends up in innerHTML, so escape it.
    function escapeHtml(text){
      return String(text).replace(/[&<>"']/g,function(ch){return "&#"+ch.charCodeAt(0)+";";});
    }

    // Queues a discovered "next page" link and remembers which chapter it continues.
    function registerNextPage(curIndex, aTag, nextPage){
      nextPage.innerText=aTag.innerText+"\t>>";
      aEles.push(nextPage);
      let targetIndex=curIndex;
      // When curIndex is itself a continuation page, attach to its root chapter.
      for(let a=0;a<insertSigns.length;a++){
        let signs=insertSigns[a];
        if(signs && signs.indexOf(curIndex)!=-1){
          targetIndex=a;
          break;
        }
      }
      if(!insertSigns[targetIndex])insertSigns[targetIndex]=[];
      insertSigns[targetIndex].push(aEles.length-1);
    }

    // Shared tail of every request outcome: count it, store the text, start the next one.
    function finishRequest(curIndex, aTag, doc){
      downIndex++;
      downNum++;
      processDoc(curIndex, aTag, doc);
      let next=downOnce();
      if(next)curRequests.push(next);
    }

    // Starts the download of one link; returns [index, requestHandle].
    function startRequest(aTag, curIndex){
      return [curIndex,GM_xmlhttpRequest({
        method: 'GET',
        url: aTag.href,
        headers:{referer:aTag.href},
        timeout:15000,
        overrideMimeType:"text/html;charset="+document.charset,
        onload: function(result) {
          var doc = getDocEle(result.responseText);
          // getDocEle yields null when the document cannot be created.
          let nextPage=doc?checkNextPage(doc):null;
          if(nextPage)registerNextPage(curIndex, aTag, nextPage);
          finishRequest(curIndex, aTag, doc);
        },
        onerror: function(e) {
          console.warn("error:");
          console.log(e);
          finishRequest(curIndex, aTag, null);
        },
        ontimeout: function(e) {
          console.warn("timeout:");
          console.log(e);
          finishRequest(curIndex, aTag, null);
        },
        // Without this an aborted request would never be counted and the
        // download could not complete.
        onabort: function() {
          console.warn("abort:");
          finishRequest(curIndex, aTag, null);
        },
      })];
    }

    // Schedules the link at downIndex. Returns the request pair, or a falsy
    // value when everything is finished or the link does not exist yet.
    function downOnce(){
      if(downNum>=aEles.length)return;
      let curIndex=downIndex;
      let aTag=aEles[curIndex];
      if(!aTag){
        // The slot may be filled later by a continuation page of a request
        // that is still running; poll until it appears or all work is done.
        let waitAtagReadyInterval=setInterval(function(){
          if(downNum>=aEles.length)clearInterval(waitAtagReadyInterval);
          aTag=aEles[curIndex];
          if(aTag){
            clearInterval(waitAtagReadyInterval);
            curRequests.push(startRequest(aTag, curIndex));
          }
        },1000);
        return null;
      }
      return startRequest(aTag, curIndex);
    }

    // Parses an HTML string into a detached document; null when that fails.
    function getDocEle(str){
      var doc = null;
      try {
        doc = document.implementation.createHTMLDocument('');
        doc.documentElement.innerHTML = str;
      }
      catch (e) {
        console.log('parse error');
      }
      return doc;
    }

    // Moves the text of continuation pages from the tail of rCats to the
    // position right after the chapter they belong to.
    function sortInnerPage(){
      var pageArrs=[],i,j;
      for(i=0;i<insertSigns.length;i++){
        var signs=insertSigns[i];
        if(signs){
          for(j=0;j<signs.length;j++){
            var sign=signs[j];
            var cat=rCats[sign];
            rCats[sign]=null;
            if(!pageArrs[i])pageArrs[i]=[];
            pageArrs[i].push(cat);
          }
        }
      }
      // Insert from the back so earlier indexes stay valid.
      for(i=pageArrs.length-1;i>=0;i--){
        let pageArr=pageArrs[i];
        if(pageArr){
          for(j=pageArr.length-1;j>=0;j--){
            rCats.splice(i+1, 0, pageArr[j]);
          }
        }
      }
      rCats = rCats.filter(function(e){return e!=null});
    }

    // Stores the text of one finished page, updates the panel and saves the
    // file when this was the last outstanding page.
    function processDoc(i, aTag, doc){
      curRequests = curRequests.filter(function(e){return e[0]!=i});
      var content;
      try {
        content=getPageContent(doc);
      } catch (e) {
        // One unparsable page must not stall the whole download.
        console.warn(e);
        content=i18n.error;
      }
      rCats[i]=(aTag.innerText+"\r\n"+content);
      txtDownContent.style.display="block";
      txtDownWords.innerHTML=getI18n("downloading",[downNum,(aEles.length-downNum),escapeHtml(aTag.innerText)]);
      if(downNum==aEles.length){
        txtDownWords.innerHTML=getI18n("complete",[downNum]);
        sortInnerPage();
        var blob = new Blob([i18n.info+"\r\n"+document.title+"\r\n\r\n"+rCats.join("\r\n\r\n")], {type: "text/plain;charset=utf-8"});
        saveAs(blob, document.title+".txt");
      }
    }

    var downThreadNum = parseInt(GM_getValue("downThreadNum"),10);
    downThreadNum=downThreadNum>0?downThreadNum:20;
    // Start the first batch; downIndex ends up on the last scheduled link.
    for(var i=0;i<downThreadNum;i++){
      let request=downOnce();
      if(request)curRequests.push(request);
      if(downIndex>=aEles.length-1 || downIndex>=downThreadNum-1)break;
      else downIndex++;
    }
  }
  291.  
  /**
   * Finds the first link in a downloaded page that points to the next page of
   * the same chapter: its text matches innerNextPage and its href starts with
   * "http".
   *
   * @param {Document} doc parsed page; may be null when parsing failed
   * @returns {Element|null} the matching anchor, or null when there is none
   */
  function checkNextPage(doc){
    if(!doc)return null;
    let aTags=doc.querySelectorAll("a");
    for(let i=0;i<aTags.length;i++){
      let aTag=aTags[i];
      if(innerNextPage.test(aTag.innerText) && /^http/i.test(aTag.href))return aTag;
    }
    return null;
  }
  303.  
  /**
   * Extracts the main text of a page.
   *
   * Works on a clone of the page body: hidden elements, "font.jammer" nodes,
   * nodes matching the user's ignore selectors and non-text resources are
   * removed, then the span/div/article/p/td holding the most text is chosen.
   * The text of that element and of same-tag elements at the same DOM depth is
   * concatenated with CRLF line breaks.
   *
   * The document passed in is left untouched: hidden elements are only tagged
   * temporarily so that they can be dropped from the clone.
   *
   * @param {Document} doc page to read; null/undefined yields the error text
   * @returns {string} extracted text, or i18n.error when nothing is found
   */
  function getPageContent(doc){
    if(!doc)return i18n.error;
    function removeNode(item){item.parentNode.removeChild(item);}
    // Computed styles exist only for a rendered document (defaultView set).
    // Tag elements that are display:none, and zero-font-size spans, so the
    // clone can drop them without altering the page the user is looking at.
    var hiddenAttr="data-dac-hidden",hiddenItems=[];
    if(doc.defaultView){
      [].forEach.call(doc.querySelectorAll("span,div"),function(item){
        var thisStyle=doc.defaultView.getComputedStyle(item);
        if(thisStyle && (thisStyle.display=="none" || (item.tagName=="SPAN" && thisStyle.fontSize=="0px"))){
          item.setAttribute(hiddenAttr,"1");
          hiddenItems.push(item);
        }
      });
    }
    var i,j,k,rStr="",pageData=(doc.body?doc.body:doc).cloneNode(true),delList=[];
    hiddenItems.forEach(function(item){item.removeAttribute(hiddenAttr);});
    [].forEach.call(pageData.querySelectorAll("["+hiddenAttr+"]"),removeNode);
    [].forEach.call(pageData.querySelectorAll("font.jammer"),removeNode);
    var selectors=GM_getValue("selectors");
    if(selectors){
      // The selector list is typed in by the user; an invalid one must not
      // abort the extraction.
      var ignored=[];
      try {
        ignored=pageData.querySelectorAll(selectors);
      } catch (e) {
        console.warn("Invalid ignore selector: "+selectors);
      }
      [].forEach.call(ignored,removeNode);
    }
    [].forEach.call(pageData.querySelectorAll("script,style,link,img,noscript,iframe"),function(item){delList.push(item);});
    [].forEach.call(delList,removeNode);

    // Pick the candidate element with the largest amount of text.
    var largestContent,contents=pageData.querySelectorAll("span,div,article,p,td"),largestNum=0;
    for(i=0;i<contents.length;i++){
      let content=contents[i],hasText=false,allSingle=true,item,curNum=0;
      // Drop whitespace-only text nodes; note whether any direct text exists.
      for(j=content.childNodes.length-1;j>=0;j--){
        item=content.childNodes[j];
        if(item.nodeType==3){
          if(/^\s*$/.test(item.data))
            item.parentNode.removeChild(item);
          else hasText=true;
        }else if(/^(I|A|STRONG|B|FONT|P|DL|DD|H\d)$/.test(item.tagName))hasText=true;
      }
      // Drop empty child elements (inline tags and BR are kept).
      for(j=content.childNodes.length-1;j>=0;j--){
        item=content.childNodes[j];
        if(item.nodeType==1 && !/^(I|A|STRONG|B|FONT|BR)$/.test(item.tagName) && /^\s*$/.test(item.innerHTML))
          item.parentNode.removeChild(item);
      }
      // allSingle: several children whose own children are only text or
      // inline tags, i.e. a flat list of paragraphs.
      if(content.childNodes.length>1){
        for(j=0;j<content.childNodes.length;j++){
          item=content.childNodes[j];
          if(item.nodeType==1){
            for(k=0;k<item.childNodes.length;k++){
              var childNode=item.childNodes[k];
              if(childNode.nodeType!=3 && !/^(I|A|STRONG|B|FONT|BR)$/.test(childNode.tagName)){
                allSingle=false;
                break;
              }
            }
            if(!allSingle)break;
          }
        }
      }else{
        allSingle=false;
      }
      if(allSingle){
        curNum=(firefox?content.textContent.length:content.innerText.length);
      }else {
        if(!hasText)continue;
        // Count only direct text and the text of text-like direct children.
        [].forEach.call(content.childNodes,function(item){
          if(item.nodeType==3)curNum+=item.data.length;
          else if(/^(I|A|STRONG|B|FONT|P|DL|DD|H\d)$/.test(item.tagName))curNum+=(firefox?item.textContent.length:item.innerText.length);
        });
      }
      if(curNum>largestNum){
        largestNum=curNum;
        largestContent=content;
      }
    }
    if(!largestContent)return i18n.error;

    var childlist=pageData.querySelectorAll(largestContent.tagName);
    // Appends the text of ele to rStr, one line per block-level child.
    // Elements without direct text are skipped unless noTextEnable is set or
    // ele is the chosen main element.
    function getRightStr(ele, noTextEnable){
      let childNodes=ele.childNodes,cStr="\r\n",hasText=false;
      for(let j=0;j<childNodes.length;j++){
        let childNode=childNodes[j];
        if(childNode.nodeType==3 && childNode.data && !/^\s*$/.test(childNode.data))hasText=true;
        if(childNode.innerHTML){
          // Turn <br> into line breaks and collapse repeated ones.
          childNode.innerHTML=childNode.innerHTML.replace(/\<\s*br\s*\>/gi,"\r\n").replace(/\n+/gi,"\n").replace(/\r+/gi,"\r");
        }
        if(childNode.textContent){
          cStr+=childNode.textContent.replace(/ +/g," ").replace(/([^\r]|^)\n([^\r]|$)/gi,"$1\r\n$2");
        }
        if(childNode.nodeType!=3 && !/^(I|A|STRONG|B|FONT)$/.test(childNode.tagName))cStr+="\r\n";
      }
      if(hasText || noTextEnable || ele==largestContent)rStr+=cStr+"\r\n";
    }
    // Collect same-tag elements at the same depth as the main element.
    for(i=0;i<childlist.length;i++){
      var child=childlist[i];
      if(getDepth(child)==getDepth(largestContent)){
        // Skip when only one of the two carries a class name.
        if((!largestContent.className && child.className) || (largestContent.className && !child.className))continue;
        if((largestContent.className && largestContent.className==child.className)||largestContent.parentNode ==child.parentNode){
          getRightStr(child, true);
        }else {
          getRightStr(child, false);
        }
      }
    }
    return rStr;
  }
  403.  
  /**
   * Returns the UI string for key, with each "%s" placeholder replaced, in
   * order, by the matching entry of args.
   *
   * Arguments are inserted literally: "$&" or "$1" inside an argument is not
   * treated as a replacement pattern, and a "%s" inside an argument is not
   * consumed by a later argument. Placeholders without an argument are left as
   * they are; surplus arguments are ignored.
   *
   * @param {string} key name of the entry in i18n
   * @param {Array} [args] values for the placeholders
   * @returns {string|undefined} the formatted string; undefined for an unknown key
   */
  function getI18n(key, args){
    var template=i18n[key];
    if(typeof template!="string" || !args || args.length<1)return template;
    var next=0;
    return template.replace(/%s/g,function(placeholder){
      return next<args.length?String(args[next++]):placeholder;
    });
  }
  413.  
  /**
   * Counts how many ancestors a node has by following parentNode until it
   * runs out.
   *
   * @param {Node} dom node to measure
   * @returns {number} number of parentNode hops (0 for a node without parent)
   */
  function getDepth(dom){
    var depth=0;
    for(var node=dom.parentNode;node;node=node.parentNode){
      depth++;
    }
    return depth;
  }
  422.  
  /**
   * Entry point of a download. Collects the links on the current page that
   * look like chapter links (text matches indexReg, or the URL contains
   * "chapter<digit>"), keeping one link per URL. With more than two such links
   * they are downloaded as a book; otherwise the text of the current page
   * alone is saved.
   *
   * When a URL that was already collected shows up again, the later anchor
   * replaces the earlier one and moves to the end of the list. A Map keeps
   * that order (delete + set re-inserts at the end) without rescanning the
   * list for every link.
   */
  function fetch(){
    var aEles=document.querySelectorAll("a"),byHref=new Map();
    for(var i=0;i<aEles.length;i++){
      var aEle=aEles[i];
      if(byHref.has(aEle.href)){
        byHref.delete(aEle.href);
        byHref.set(aEle.href,aEle);
        continue;
      }
      if(!aEle.href || !/^http/i.test(aEle.href))continue;
      var title=aEle.innerText.trim();
      if((title!="" && indexReg.test(title)) || /chapter[\-_]?\d/.test(aEle.href)){
        byHref.set(aEle.href,aEle);
      }
    }
    var list=Array.from(byHref.values());
    if(list.length>2){
      indexDownload(list);
    }else{
      var blob = new Blob([i18n.info+"\r\n"+document.title+"\r\n\r\n"+getPageContent(document)], {type: "text/plain;charset=utf-8"});
      saveAs(blob, document.title+".txt");
    }
  }
  446.  
  // Keyboard shortcut: Ctrl+F9 (keyCode 120) starts the download.
  document.addEventListener("keydown", function(event) {
    if(event.keyCode != 120 || !event.ctrlKey)return;
    fetch();
  });
  /**
   * Settings dialog: asks for the CSS selectors to strip from downloaded
   * pages, the number of parallel downloads, and whether chapters should be
   * re-sorted by title, then stores the answers.
   *
   * A cancelled prompt (null) leaves the stored value untouched; an empty
   * answer is stored and clears the setting.
   */
  function setDel(){
    var selValue=GM_getValue("selectors");
    var selectors=prompt(i18n.del,selValue?selValue:"");
    if(selectors!==null)GM_setValue("selectors",selectors);
    selValue=GM_getValue("downThreadNum");
    var downThreadNum=prompt(i18n.downThreadNum,selValue?selValue:"20");
    if(downThreadNum!==null)GM_setValue("downThreadNum",downThreadNum);
    GM_setValue("contentSort", !!window.confirm(i18n.reSort));
  }
  /**
   * Custom download. Asks the user for either
   *  - a comma-separated list of URLs, each optionally containing one numeric
   *    range "[from-to]" that is expanded into one URL per number (a leading
   *    zero in "from" keeps the numbers zero-padded to its width), or
   *  - a CSS selector for the chapter links on the current page, optionally
   *    followed by "@@pattern@@replacement" to rewrite every link URL with a
   *    regular expression,
   * and downloads the resulting list.
   *
   * URLs without a range are downloaded as they are, and a range such as
   * "[5-5]" yields its single page. Links taken from the page are always
   * cloned, so rewriting their URLs never alters the page itself.
   */
  function customDown(){
    var urls=window.prompt(i18n.customInfo,"https://xxx.xxx/book-[20-99].html, https://xxx.xxx/book-[01-10].html");
    if(!urls)return;
    var processEles=[];
    // Adds one URL as a synthetic link titled with its running number.
    function addUrl(url){
      var curEle=document.createElement("a");
      curEle.href=url;
      processEles.push(curEle);
      curEle.innerText=processEles.length.toString();
    }
    if(/^http|^ftp/.test(urls)){
      urls.split(",").forEach(function(part){
        var range=/\[(\d+)\-(\d+)\]/.exec(part);
        if(!range){
          var plainUrl=part.trim();
          if(plainUrl)addUrl(plainUrl);
          return;
        }
        var num1=range[1];
        var num1Int=parseInt(num1,10);
        var num2Int=parseInt(range[2],10);
        var numLen=num1.length;
        var needAdd=num1.charAt(0)=="0";
        if(num1Int>num2Int)return;
        for(var j=num1Int;j<=num2Int;j++){
          var urlIndex=j.toString();
          if(needAdd){
            while(urlIndex.length<numLen)urlIndex="0"+urlIndex;
          }
          addUrl(part.replace(/\[\d+\-\d+\]/,urlIndex).trim());
        }
      });
    }else{
      let urlsArr=urls.split("@@");
      [].forEach.call(document.querySelectorAll(urlsArr[0]),function(item){
        let has=false;
        // A repeated URL replaces the earlier link and moves to the end.
        for(var j=0;j<processEles.length;j++){
          if(processEles[j].href==item.href){
            processEles.splice(j,1);
            processEles.push(item.cloneNode(true));
            has=true;
            break;
          }
        }
        if(!has && item.href && /^http/i.test(item.href)){
          processEles.push(item.cloneNode(true));
        }
      });
      if(urlsArr.length>1){
        let pattern=new RegExp(urlsArr[1]);
        // Without a replacement part the matched text is simply removed.
        let replacement=urlsArr.length>2?urlsArr[2]:"";
        processEles.forEach(ele=>{
          ele.href=ele.href.replace(pattern, replacement);
        });
      }
    }
    indexDownload(processEles);
  }
  // Userscript menu entries: start a download, open the custom-download
  // prompt, and open the settings prompts.
  GM_registerMenuCommand(i18n.fetch, fetch);
  GM_registerMenuCommand(i18n.custom, customDown);
  GM_registerMenuCommand(i18n.setting, setDel);
  519. })();