懶人小説下載器

通用網站內容抓取工具,可批量抓取小說、論壇內容等並保存為TXT文檔

目前為 2017-02-04 提交的版本,檢視 最新版本

  1. // ==UserScript==
  2. // @name DownloadAllContent
  3. // @name:zh-CN 懒人小说下载器
  4. // @name:zh-TW 懶人小説下載器
  5. // @name:ja 怠惰者小説ダウンロードツール
  6. // @namespace hoothin
  7. // @version 1.09
  8. // @description Fetch and download the main content of the current page, with special support for Chinese novels
  9. // @description:zh-CN 通用网站内容抓取工具,可批量抓取小说、论坛内容等并保存为TXT文档
  10. // @description:zh-TW 通用網站內容抓取工具,可批量抓取小說、論壇內容等並保存為TXT文檔
  11. // @description:ja ユニバーサルサイトコンテンツクロールツール、クロール、フォーラム内容など
  12. // @author hoothin
  13. // @include *
  14. // @grant GM_xmlhttpRequest
  15. // @grant GM_registerMenuCommand
  16. // @require https://cdnjs.cloudflare.com/ajax/libs/FileSaver.js/1.3.3/FileSaver.min.js
  17. // @license MIT License
  18. // @compatible chrome
  19. // @compatible firefox
  20. // @compatible opera 未测试
  21. // @compatible safari 未测试
  22. // @contributionURL https://www.paypal.com/cgi-bin/webscr?cmd=_donations&business=rixixi@sina.com&item_name=Greasy+Fork+donation
  23. // @contributionAmount 1
  24. // ==/UserScript==
  25.  
  26. (function() {
  27. 'use strict';
  /**
   * Returns the UI strings for a browser language tag.
   *
   * The tag is compared case-insensitively because some browsers report
   * lowercase tags such as "zh-cn"; an exact "zh-CN" comparison would send
   * those users to the English fallback.
   *
   * @param {string} [langTag] Language tag such as "zh-CN"; anything else
   *   (including undefined) selects the English strings.
   * @returns {{fetch: string, info: string, error: string}} Menu label, TXT
   *   header line and per-chapter failure placeholder.
   */
  function pickI18n(langTag){
    switch (String(langTag || "").toLowerCase()){
      case "zh-cn":
        return {
          fetch:"开始下载小说或其他【Ctrl+F9】",
          info:"本文是使用懒人小说下载器(DownloadAllContent)脚本下载的",
          error:"该段内容获取失败"
        };
      default:
        return {
          fetch:"Download All Content[Ctrl+F9]",
          info:"The TXT is downloaded by 'DownloadAllContent'",
          error:"Failed in downloading current chapter"
        };
    }
  }

  // Prefer the standard navigator.language and fall back to the legacy
  // userLanguage property; this replaces the navigator.appName sniff, which
  // no longer distinguishes browsers.
  var lang = typeof navigator === "undefined" ? "" : (navigator.language || navigator.userLanguage);
  var i18n = pickI18n(lang);
  46.  
  /**
   * Downloads every linked chapter and saves them as one TXT file.
   *
   * Injects a small fixed progress box, requests each link with
   * GM_xmlhttpRequest, extracts the text of each response with
   * getPageContent and, once every request has settled, joins the chapters
   * in link order and passes the result to saveAs.
   *
   * A request that errors or times out is counted as settled and its chapter
   * body becomes the i18n.error placeholder, so a single failure can no
   * longer leave the download stuck forever.
   *
   * @param {ArrayLike<HTMLAnchorElement>} aEles Links to download; `href` and
   *   `textContent` are read from each one.
   */
  function indexDownload(aEles){
    var rocketContent=document.createElement("div");
    document.body.appendChild(rocketContent);
    // Replacing outerHTML swaps the placeholder div for the progress box markup.
    rocketContent.outerHTML=`
<div id="txtDownContent" style="display: none;">
<div style="width:300px;height:70px;position:fixed;left:50%;top:50%;margin-top:-25px;margin-left:-150px;z-index:100000;background-color:#ffffff;border:1px solid #afb3b6;border-radius:10px;opacity:0.95;filter:alpha(opacity=95);box-shadow:5px 5px 20px 0px #000;">
<div id="txtDownWords" style="position:absolute;left:20px;top:10px;width:260px;">
</div>
<img src="data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAB4AAAAeCAMAAAAM7l6QAAAA5FBMVEUAAAD+/v7////9/f7////////+/v7+/v7////+/v7+/v7////+/v7+/v7////+/v7+/v7+/v7+/v7+/v7+/v7+/v7+/v7+/v7+/v7+/v7+/v7+/v7////////////+/v7+/v7+/v7+/v7+/v4uje3///82ke7s9P3N5PtQoPDI4fqCu/Tu9v5Im+/6/P+VxfZgqPFNnvDp8/3f7fq42Pmnz/d1tPNvsfNkq/JCmO/4+/7X6fz19/rn8PqYx/aNwfV8uPRqrvJZpfFUovAzkO3Q5vu92/mr0fieyva92fWx0vQ6lO5pygFTAAAAJHRSTlMAmfD+RMGwgj2mknlIKR/36+XGnIyHfnJfVDk2My8S4E1CJBvTatKDAAABY0lEQVQoz4WSZ1fCMBRA05ahLPfemkspBUFwgGz3+P//x/hK6ZBzvB/ak96+kZeoGCtFK5eziitqCSfWOnM2CuWUrOyQYPc0bjM2htHXh+/ffQ4lQ6zEPoZ6TwdUp10M56EtAe5MR7y0HGAeX7Ghe6MTzIZwFtTfBrepUzw4UFSGY8CUTTMGLo3eglv58By21pRnB7aNBqTwPfWq/OVey9sH22wZXiWo6Yr3GtRl/f0IRyoPYy14v97YWjVY12BPrcK9XvhaZPUdWCoLbR35yOoW5P7R8eQ3JrmbSp6HVmSrXuTrmNYyUAs2JkL6D6YjG1PgNGXK87F4nWAsUxmL2gyH6oVD7cuhdqFg9CE4T/oPE6CsgvBOP21715AP7ugaDFK+3YD1KyUcAG47bn0TSylxFd8WDTwMMByoBSUHw+jd9/3JbQODnVExygUS7FRUksPVtdDZW8dqCZm8lc1auxcq4gc02GVGTUchmgAAAABJRU5ErkJggg==" id="txtDownQuit" style="position:absolute;right:0px;top:0px;cursor: pointer;" />
</div>
</div>`;
    var txtDownContent=document.querySelector("#txtDownContent");
    var txtDownWords=document.querySelector("#txtDownWords");
    var txtDownQuit=document.querySelector("#txtDownQuit");
    // The close icon hides the box and removes it from the page.
    txtDownQuit.onclick=function(){
      txtDownContent.style.display="none";
      txtDownContent.parentNode.removeChild(txtDownContent);
    };

    // j counts settled requests; rCats holds chapter text indexed by link position.
    var j=0,rCats=[];

    /** Escapes text so it is shown literally when assigned through innerHTML. */
    function escapeHtml(str){
      return String(str).replace(/&/g,"&amp;").replace(/</g,"&lt;").replace(/>/g,"&gt;");
    }

    /**
     * Parses an HTML string into a detached document.
     * @returns {?Document} The parsed document, or null if parsing threw.
     */
    function getDocEle(str){
      var doc = null;
      try {
        doc = document.implementation.createHTMLDocument('');
        doc.documentElement.innerHTML = str;
      }
      catch (e) {
        console.log('parse error');
      }
      return doc;
    }

    /**
     * Records one settled chapter, updates the progress box and saves the
     * file when the last chapter has settled.
     * @param {number} i Position of the chapter in aEles.
     * @param {HTMLAnchorElement} aTag Link whose text is used as the chapter title.
     * @param {?Document} doc Parsed page, or null when the request/parse failed
     *   (getPageContent then yields the i18n.error placeholder).
     */
    function processDoc(i, aTag, doc){
      j++;
      rCats[i]=(aTag.textContent+"\r\n"+getPageContent(doc));
      txtDownContent.style.display="block";
      // The link text comes from the page, so escape it before it goes into innerHTML.
      txtDownWords.innerHTML="已下载完成 "+j+" 段,剩余 "+(aEles.length-j)+" 段"+"<br>正在下载 "+escapeHtml(aTag.textContent);
      if(j==aEles.length){
        txtDownWords.innerHTML="已全部下载完成,共 "+j+" 段";
        var blob = new Blob([i18n.info+"\r\n"+document.title+"\r\n\r\n"+rCats.join("\r\n\r\n")], {type: "text/plain;charset=utf-8"});
        saveAs(blob, document.title+".txt");
      }
    }

    for(let i=0;i<aEles.length;i++){
      let aTag=aEles[i];
      // A failed request still has to be counted, otherwise j never reaches aEles.length.
      let fail=function(){
        processDoc(i, aTag, null);
      };
      GM_xmlhttpRequest({
        method: 'GET',
        url: aTag.href,
        // Decode the response with the current page's charset.
        overrideMimeType:"text/html;charset="+document.charset,
        onload: function(result) {
          var doc = getDocEle(result.responseText);
          processDoc(i, aTag, doc);
        },
        onerror: fail,
        ontimeout: fail
      });
    }
  }
  101.  
  /**
   * Extracts the main text of a page.
   *
   * Works on a deep clone of the document body: script/style/link elements
   * are removed, every span/div/article/p/td is examined, and the candidate
   * with the longest text is taken as the main content. The text of every
   * element with the same tag, the same classes and the same DOM depth as
   * that candidate is then concatenated.
   *
   * Sibling lookup filters by classList instead of building a `.class`
   * selector from raw class names: class names containing characters such as
   * ":" or "/" (or a whitespace-only class attribute) made the built selector
   * invalid and querySelectorAll threw.
   *
   * @param {?Document} doc Document to read; a falsy value yields i18n.error.
   * @returns {string} Extracted text with CRLF line breaks, or i18n.error
   *   when no candidate element is found.
   */
  function getPageContent(doc){
    if(!doc)return i18n.error;
    var blank=/^\s*$/;
    var inlineTag=/^(I|A|STRONG|B|FONT|BR)$/;
    var i,j,k,rStr="",pageData=(doc.body?doc.body:doc).cloneNode(true),delList=[];

    // Collect first, remove afterwards, so the node list is not mutated while it is walked.
    [].forEach.call(pageData.querySelectorAll("script,style,link"),function(item){delList.push(item);});
    [].forEach.call(delList,function(item){item.parentNode.removeChild(item);});

    var largestContent,contents=pageData.querySelectorAll("span,div,article,p,td");
    for(i=0;i<contents.length;i++){
      let content=contents[i],hasText=false,allSingle=true,item;

      // Drop blank text nodes, <font class="jammer"> nodes and inline-hidden nodes.
      // Iterates backwards because children are removed during the walk.
      for(j=content.childNodes.length-1;j>=0;j--){
        item=content.childNodes[j];
        if((item.nodeType==3 && blank.test(item.data)) || (item.tagName=="FONT" && item.className=="jammer") || (item.style && item.style.display=="none"))
          item.parentNode.removeChild(item);
      }

      // hasText: the candidate directly owns at least one non-blank text node.
      [].forEach.call(content.childNodes,function(item){
        if(item.nodeType==3 && item.data && !blank.test(item.data))
          hasText=true;
      });

      // allSingle: the candidate has several children and every element child
      // contains only text or inline tags (I/A/STRONG/B/FONT/BR).
      if(content.childNodes.length>1){
        for(j=0;j<content.childNodes.length;j++){
          item=content.childNodes[j];
          if(item.nodeType==1){
            for(k=0;k<item.childNodes.length;k++){
              var childNode=item.childNodes[k];
              if(childNode.nodeType!=3 && !inlineTag.test(childNode.tagName)){
                allSingle=false;
                break;
              }
            }
            if(!allSingle)break;
            // Strip blank text nodes and empty elements from this child.
            delList=[];
            [].forEach.call(item.childNodes,function(n){if((n.nodeType==3 && blank.test(n.data)) || (n.nodeType==1 && blank.test(n.textContent)))delList.push(n);});
            [].forEach.call(delList,function(n){n.parentNode.removeChild(n);});
          }
        }
      }else{
        allSingle=false;
      }

      if(!allSingle){
        // Skip candidates without own text, or whose first child is neither text nor an inline tag.
        if(!hasText)continue;
        if(content.firstChild && (
          (content.firstChild.nodeType!=3 && !inlineTag.test(content.firstChild.tagName)) ||
          (content.firstChild.nodeType==3 && blank.test(content.firstChild.data) &&
            (!content.childNodes[1] || !(content.childNodes[1].nodeType==3 || inlineTag.test(content.childNodes[1].tagName))))
        ))
          continue;
        // NOTE(review): pageData is always a clone, so this comparison with
        // `document` is never true as written; kept unchanged.
        if(pageData==document && content.offsetWidth<=0 && content.offsetHeight<=0)
          continue;
      }

      // Keep the candidate with the longest text (textContent on Firefox, innerText elsewhere).
      if(navigator.userAgent.toLowerCase().indexOf('firefox')!=-1){
        if(!largestContent || largestContent.textContent.length<content.textContent.length){
          largestContent=content;
        }
      }else{
        if(!largestContent || largestContent.innerText.length<content.innerText.length){
          largestContent=content;
        }
      }
    }
    if(!largestContent)return i18n.error;

    // Elements with the same tag that carry every class of the chosen candidate.
    var classTokens=[].slice.call(largestContent.classList);
    var childlist=[].filter.call(pageData.querySelectorAll(largestContent.tagName),function(el){
      return classTokens.every(function(token){return el.classList.contains(token);});
    });

    /**
     * Appends the text of `ele` to rStr. The element is skipped unless it owns
     * a non-blank text node, noTextEnable is set, or it is the chosen candidate.
     */
    function getRightStr(ele, noTextEnable){
      let childNodes=ele.childNodes,cStr="\r\n",hasText=false;
      for(let j=0;j<childNodes.length;j++){
        let childNode=childNodes[j];
        if(childNode.nodeType==3 && childNode.data && !blank.test(childNode.data))hasText=true;
        if(childNode.textContent){
          // Collapse runs of spaces and turn bare LF into CRLF.
          cStr+=childNode.textContent.replace(/ +/g," ").replace(/([^\r]|^)\n([^\r]|$)/g,"$1\r\n$2");
        }
        // Anything other than text or an inline tag ends the current line.
        if(childNode.nodeType!=3 && !/^(I|A|STRONG|B|FONT)$/.test(childNode.tagName))cStr+="\r\n";
      }
      if(hasText || noTextEnable || ele==largestContent)rStr+=cStr+"\r\n";
    }

    for(i=0;i<childlist.length;i++){
      var child=childlist[i];
      // Only elements at the same depth as the chosen candidate are merged.
      if(getDepth(child)==getDepth(largestContent)){
        if(largestContent.className && largestContent.className==child.className){
          getRightStr(child, true);
        }else {
          getRightStr(child, false);
        }
      }
    }
    return rStr;
  }
  186.  
  /**
   * Counts the ancestors of a node by following parentNode links.
   * @param {Node} dom Node to measure.
   * @returns {number} Number of parentNode hops until a node without a parent is reached.
   */
  function getDepth(dom){
    let depth=0;
    for(let node=dom;node.parentNode;node=node.parentNode){
      depth+=1;
    }
    return depth;
  }
  195.  
  /**
   * Starts a download for the current page.
   *
   * Collects every link that has an href and whose innerHTML matches the
   * chapter-title pattern. With more than two such links the page is treated
   * as an index and the links are handed to indexDownload; otherwise the
   * current page's own content is saved as "<title>.txt".
   */
  function fetch(){
    const chapterPattern=/第.+[章|节|回|卷|折|篇|幕|集]|序|序\s*言|序\s*章|前\s*言|引\s*言|引\s*子|摘\s*要|楔\s*子|后\s*记|附\s*言|结\s*语|[\d|〇|零|一|二|三|四|五|六|七|八|九|十|百|千|万|萬|-]+(、|)/;
    const list=[].filter.call(document.querySelectorAll("a"),function(aEle){
      return aEle.href && chapterPattern.test(aEle.innerHTML);
    });
    if(list.length<=2){
      // Too few chapter links: save just this page.
      const pageText=i18n.info+"\r\n"+document.title+"\r\n\r\n"+getPageContent(document);
      const blob=new Blob([pageText], {type: "text/plain;charset=utf-8"});
      saveAs(blob, document.title+".txt");
      return;
    }
    indexDownload(list);
  }
  211.  
  // Ctrl+F9 (keyCode 120) starts the download; the same action is also
  // registered as a userscript menu command labelled with i18n.fetch.
  const onKeyDown=function(e){
    if(e.keyCode != 120 || !e.ctrlKey)return;
    fetch();
  };
  document.addEventListener("keydown", onKeyDown);
  GM_registerMenuCommand(i18n.fetch, fetch);
  218. })();