Wenku Doc Downloader

下载“百度文库”“豆丁网”文档

目前为 2021-11-25 提交的版本。查看 最新版本

// ==UserScript==
// @name Wenku Doc Downloader
// @namespace http://tampermonkey.net/
// @version 0.4
// @description 下载“百度文库”“豆丁网”文档
// @author allenlv2690@gmail.com
// @match https://wenku.baidu.com/view/*
// @match https://www.docin.com/p-*
// @icon https://www.google.com/s2/favicons?domain=limestart.cn
// @grant none
// @license GPL-3.0-only
// @create 2021-11-22
// @note 更新了对豆丁网的简单支持:按下绿色按钮打印页面(等同于按下ctrl+p),得到pdf。
// @note 产生的pdf文件每一页都是图片,文字无法选中。
// ==/UserScript==
  16.  
  17. /*
  18. * 附属功能函数部分
  19. */
  20.  
  /**
   * Wrap `content` in a Blob and trigger a browser download for it.
   *
   * @param {string} fileName - File name suggested to the browser for the download.
   * @param {*} content - Data placed in the Blob as its single part.
   */
  function createAndDownloadFile(fileName, content) {
    const blob = new Blob([content]);
    const objectUrl = URL.createObjectURL(blob);

    const anchor = document.createElement('a');
    anchor.download = fileName;
    anchor.href = objectUrl;
    anchor.click();

    // revokeObjectURL expects the URL string returned by createObjectURL, not
    // the Blob. Revoking on the next tick lets the click start the download
    // before the URL becomes invalid.
    setTimeout(() => {
      URL.revokeObjectURL(objectUrl);
    }, 0);
  }
  30.  
  /**
   * Tidy up text extracted from a plain-text document.
   *
   * Two passes are applied, in this order:
   *   1. every single plain space that sits directly between two CJK
   *      ideographs (U+4E00–U+9FA5) is removed;
   *   2. every Unicode space separator other than U+0020 (no-break space,
   *      ideographic space, en/em spaces, ...) is replaced by a plain space.
   *
   * @param {string} text - Raw text to clean up.
   * @returns {string} The cleaned-up text.
   */
  function formatText(text) {
    // Lookarounds keep the neighbouring ideographs out of the match, so every
    // gap in a run such as "你 好 世 界" is removed in a single global pass and
    // no placeholder token has to be inserted into (and stripped from) the text.
    const spaceBetweenCjk = /(?<=[\u4e00-\u9fa5]) (?=[\u4e00-\u9fa5])/g;

    // Written as escapes because these characters are indistinguishable from a
    // plain space when typed literally into a pattern.
    const unusualSpaces = /[\u00a0\u1680\u2000-\u200a\u202f\u205f\u3000]/g;

    return text.replace(spaceBetweenCjk, "").replace(unusualSpaces, " ");
  }
  43.  
  /**
   * Tidy up text extracted from a document that mixes text and graphics.
   *
   * Steps, in order:
   *   1. each run of two or more space characters becomes a line break;
   *   2. space characters directly before or after a line break are dropped;
   *   3. remaining unusual space characters (no-break, ideographic, ...) become
   *      a plain space;
   *   4. each "精选文档" marker delimited by a space or line break on both sides
   *      is replaced by a line break (presumably a watermark — confirm);
   *   5. consecutive line breaks collapse into one.
   *
   * @param {string} text - Raw text to clean up.
   * @returns {string} The cleaned-up text.
   */
  function formatText2(text) {
    // Plain space plus the other Unicode space separators, spelled as escapes
    // because they cannot be told apart from U+0020 when typed literally.
    const space = "[ \\u00a0\\u1680\\u2000-\\u200a\\u202f\\u205f\\u3000]";
    const spaceRun = new RegExp(space + "{2,}", "g");
    const spacesAroundNewline = new RegExp(space + "*\\n" + space + "*", "g");
    const unusualSpaces = /[\u00a0\u1680\u2000-\u200a\u202f\u205f\u3000]/g;

    return text
      .replace(spaceRun, "\n")
      .replace(spacesAroundNewline, "\n")
      .replace(unusualSpaces, " ")
      // An explicit replacement is required: calling replace() without one
      // inserts the text "undefined" at every match.
      .replace(/[ \n]精选文档[ \n]/g, "\n")
      .replace(/\n{2,}/g, "\n");
  }
  62.  
  63. /*
  64. * 主要功能函数部分
  65. */
  66.  
  /**
   * Expand the document, or switch the UI to export mode once it is expanded.
   *
   * If an element with class "read-all" exists, it is clicked and nothing else
   * happens. Otherwise the user is told that the document is fully expanded,
   * the "init-btn" button is hidden and the "save-doc-btn" button is shown.
   */
  function readAll() {
    const readAllButton = document.getElementsByClassName("read-all")[0];
    if (readAllButton) {
      readAllButton.click();
      return;
    }

    alert("文档已经完全展开,可以导出");

    const initButton = document.getElementsByClassName("init-btn")[0];
    const saveDocButton = document.getElementsByClassName("save-doc-btn")[0];
    // Either button may be absent if the page removed it; skip rather than throw.
    if (initButton) {
      initButton.style.display = "none";
    }
    if (saveDocButton) {
      saveDocButton.style.removeProperty("display");
    }
  }
  87.  
  /**
   * Export the image URLs of a PDF-type document (content assumed to be
   * pictures).
   *
   * Reads the inline `style` attribute of every "reader-pic-item" element,
   * extracts the address inside its first `url(...)`, and downloads the
   * addresses as "urls.csv", one per line. Elements without a style attribute
   * or without a non-empty `url(...)` are skipped.
   */
  function savePDFData() {
    alert("Function savePDFData was called.");

    // Accepts url("..."), url('...') and unquoted url(...); the address lands
    // in capture group 1, 2 or 3 respectively, always without the quotes.
    const cssUrl = /url\(\s*(?:"([^"]*)"|'([^']*)'|([^)]*?))\s*\)/;

    const urls = [];
    for (const item of document.getElementsByClassName("reader-pic-item")) {
      const match = cssUrl.exec(item.getAttribute("style") || "");
      const url = match ? match[1] || match[2] || match[3] : "";
      if (url) {
        urls.push(url);
      }
    }

    createAndDownloadFile("urls.csv", urls.join("\n"));
  }
  109.  
  /**
   * Export a DOC-type document (content is text).
   *
   * Concatenates the text of every "reader-word-layer" element with no
   * separator, passes the result through formatText, and downloads it as
   * "纯文本文档.txt".
   */
  function saveDocData() {
    alert("Function saveDocData was called.");

    const layers = document.getElementsByClassName("reader-word-layer");
    const rawText = Array.from(layers, (layer) => layer.textContent).join("");

    createAndDownloadFile("纯文本文档.txt", formatText(rawText));
  }
  124.  
  /**
   * Export the slide image URLs of a PPT-type document (content is pictures).
   *
   * Takes the first child of every "ppt-image-wrap" element, reads its `src`,
   * and downloads the collected addresses as "urls.csv", one per line.
   * Wrappers with no child, or whose first child has no `src`, are skipped.
   */
  function savePPTData() {
    alert("Function savePPTData was called.");

    const urls = [];
    for (const wrapper of document.getElementsByClassName("ppt-image-wrap")) {
      const picture = wrapper.children[0];
      // Guard against empty wrappers so one missing slide cannot abort the export.
      if (picture && picture.src) {
        urls.push(picture.src);
      }
    }

    createAndDownloadFile("urls.csv", urls.join("\n"));
  }
  140.  
  /**
   * Export an Excel-type document: the table image address plus the table text.
   *
   * The image address is taken from the `background-image` style of the first
   * "reader-pic-item" element. The text is the content of every
   * "reader-word-layer" element, one per line, with unusual space characters
   * replaced by plain spaces. Both are combined into one text file named
   * "图片地址和表格内容.txt".
   */
  function saveExcelData() {
    // 1. Table image: pull the address out of url(...), whether it is written
    //    with double quotes, single quotes or no quotes at all.
    const tablePicture = document.getElementsByClassName("reader-pic-item")[0];
    const backgroundImage = tablePicture
      ? tablePicture.style.getPropertyValue("background-image")
      : "";
    const cssUrl = /url\(\s*(?:"([^"]*)"|'([^']*)'|([^)]*?))\s*\)/;
    const match = cssUrl.exec(backgroundImage);
    const pureUrl = match ? match[1] || match[2] || match[3] || "" : "";

    // 2. Table text, one layer per line. The unusual spaces are written as
    //    escapes because they look identical to U+0020 when typed literally.
    const unusualSpaces = /[\u00a0\u1680\u2000-\u200a\u202f\u205f\u3000]/g;
    const layers = document.getElementsByClassName("reader-word-layer");
    const text = Array.from(layers, (layer) => layer.textContent)
      .join("\n")
      .replace(unusualSpaces, " ");

    // 3. Merge both parts into one string and export it.
    const head = "表格图形链接如下(复制到浏览器中打开):";
    const content = `${head}\n\n${pureUrl}\n\n${text}`;
    createAndDownloadFile("图片地址和表格内容.txt", content);
  }
  163.  
  /**
   * Export a document that mixes text and graphics; only the plain text can be
   * saved.
   *
   * Concatenates the text of every "reader-word-layer" element with no
   * separator, passes the result through formatText2, and downloads it as
   * "纯文本文档.txt".
   */
  function saveDocAndPicData() {
    alert("Function saveDocAndPicData was called.");

    const layers = document.getElementsByClassName("reader-word-layer");
    const rawText = Array.from(layers, (layer) => layer.textContent).join("");

    createAndDownloadFile("纯文本文档.txt", formatText2(rawText));
  }
  179.  
  /**
   * Classify the current document from the reader elements present on the page.
   *
   * Counts "reader-pic-item" (pdf), "reader-word-layer" (doc) and
   * "ppt-image-wrap" (ppt) elements and maps the combination to a category.
   *
   * @returns {string|Object} "pdf", "doc", "ppt", "excel" or "docANDpic"; for
   *   any other combination, an object holding the three element counts.
   */
  function detectType() {
    const countOf = (className) => document.getElementsByClassName(className).length;
    const pdf = countOf("reader-pic-item");
    const doc = countOf("reader-word-layer");
    const ppt = countOf("ppt-image-wrap");

    // Exactly one kind of element present.
    if (pdf && !doc && !ppt) {
      return "pdf";
    }
    if (doc && !pdf && !ppt) {
      return "doc";
    }
    if (ppt && !pdf && !doc) {
      return "ppt";
    }

    // Pictures and text together, no slides.
    if (pdf === 1 && doc > 1 && !ppt) {
      return "excel";
    }
    if (pdf > 2 && doc > 2 && !ppt) {
      return "docANDpic";
    }

    // Unrecognised combination: hand the raw counts back to the caller.
    return { "pdf元素数量": pdf, "doc元素数量": doc, "ppt元素数量": ppt };
  }
  205.  
  /**
   * Save the document to the local machine.
   *
   * Asks detectType for the document category and calls the matching exporter.
   * When the category is not one of the known names, the value returned by
   * detectType is listed key by key in an "unknown document type" alert.
   */
  function saveData() {
    const category = detectType();

    switch (category) {
      case "pdf":
        savePDFData();
        break;
      case "doc":
        saveDocData();
        break;
      case "ppt":
        savePPTData();
        break;
      case "excel":
        saveExcelData();
        break;
      case "docANDpic":
        saveDocAndPicData();
        break;
      default: {
        const details = Object.entries(category).map(
          ([key, value]) => `${key} : ${value}`
        );
        alert("未知文档类型\n" + details.join("\n"));
      }
    }
  }
  232.  
  233. /*
  234. * 主函数部分
  235. */
  236.  
  /**
   * Entry point for Baidu Wenku pages.
   *
   * Appends two bar-shaped buttons to the page body: a blue "init-btn" wired to
   * readAll, and a green "save-doc-btn" wired to saveData that starts hidden.
   * Logs "Program Loaded" when done.
   */
  function baiduWenku() {
    // Both buttons share size and position; only class and colour differ.
    const createBarButton = (className, color) => {
      const button = document.createElement("button");
      button.setAttribute("class", className);
      button.style.height = "25px";
      button.style.width = "50%";
      button.style.marginLeft = "25%";
      button.style.backgroundColor = color;
      return button;
    };

    const expandButton = createBarButton("init-btn", "blue");
    const exportButton = createBarButton("save-doc-btn", "green");
    // The export button stays hidden until readAll reveals it.
    exportButton.style.display = "none";

    expandButton.addEventListener("click", readAll);
    exportButton.addEventListener("click", saveData);

    document.body.appendChild(expandButton);
    document.body.appendChild(exportButton);

    // Confirms that the script finished setting up.
    console.log("Program Loaded");
  }
  264.  
  /**
   * Entry point for Docin pages.
   *
   * Appends one green bar-shaped button to the page body; clicking it calls
   * window.print(). Logs "Program Loaded" when done.
   */
  function docin() {
    const printButton = document.createElement("button");
    printButton.style.height = "25px";
    printButton.style.width = "50%";
    printButton.style.marginLeft = "25%";
    printButton.style.backgroundColor = "green";

    // Wrapped in a function so print() receives no click-event argument.
    printButton.addEventListener("click", () => {
      window.print();
    });

    document.body.appendChild(printButton);

    // Confirms that the script finished setting up.
    console.log("Program Loaded");
  }
  282.  
  /**
   * Dispatch to the site-specific entry point based on the current host.
   *
   * "wenku.baidu.com" runs baiduWenku, "www.docin.com" runs docin; any other
   * host only logs that an unsupported page was matched.
   */
  function main() {
    const host = window.location.host;

    switch (host) {
      case "wenku.baidu.com":
        baiduWenku();
        break;
      case "www.docin.com":
        docin();
        break;
      default:
        console.log("匹配到了无效网页");
    }
  }
  295.  
  // Run the script as soon as it is loaded.
  main();