Wenku Doc Downloader

下载百度文库文档

当前为 2021-11-22 提交的版本,查看 最新版本

  1. // ==UserScript==
  2. // @name Wenku Doc Downloader
  3. // @namespace http://tampermonkey.net/
  4. // @version 0.1
  5. // @description 下载百度文库文档
  6. // @author allenlv2690@gmail.com
  7. // @match https://wenku.baidu.com/view/*
  8. // @icon https://www.google.com/s2/favicons?domain=limestart.cn
  9. // @grant none
  10. // @license GPL-3.0-only
  11. // @create 2021-11-22
  12. // ==/UserScript==
  13.  
  14. /*
  15. * 附属功能函数部分
  16. */
  17.  
  /**
   * Build an in-memory file from `content` and ask the browser to download it.
   *
   * A temporary <a> element is pointed at an object URL for the new Blob and
   * clicked programmatically; the element is never attached to the page.
   *
   * @param {string} fileName - File name assigned to the link's `download` property.
   * @param {*} content - Data used as the single part of the new Blob.
   */
  function createAndDownloadFile(fileName, content) {
    var aTag = document.createElement('a');
    var blob = new Blob([content]);
    var objectUrl = URL.createObjectURL(blob);
    aTag.download = fileName;
    aTag.href = objectUrl;
    try {
      aTag.click();
    } finally {
      // revokeObjectURL takes the URL string returned by createObjectURL.
      // Passing the Blob itself releases nothing, so the object URL would
      // stay alive for the lifetime of the page.
      URL.revokeObjectURL(objectUrl);
    }
  }
  27.  
  /**
   * Tidy text extracted from the document before it is saved.
   *
   * Two normalisations are applied, in this order:
   *   1. a single space that sits directly between two CJK characters
   *      (U+4E00..U+9FA5) is removed;
   *   2. every non-breaking space (U+00A0) becomes a regular space.
   *
   * The earlier implementation tried to do step 1 with a "TEMP" sentinel, but
   * its lookahead was malformed and none of its replaces were global, so the
   * spacing was left untouched while the first literal "TEMP" in the text was
   * deleted. No sentinel is used here, so the text itself is never altered.
   *
   * NOTE(review): step 2 assumes the original final replace targeted U+00A0;
   * in the listing it shows as a plain space, which would be a no-op.
   *
   * @param {string} text - Raw text to clean up.
   * @returns {string} The cleaned text.
   */
  function formatText(text) {
    // Lookbehind/lookahead are zero-width, so runs like "A B C" collapse fully.
    var space_between_cjk = /(?<=[\u4e00-\u9fa5]) (?=[\u4e00-\u9fa5])/g;
    var non_breaking_space = /\u00a0/g;

    return text
      .replace(space_between_cjk, "")
      .replace(non_breaking_space, " ");
  }
  39.  
  40. /*
  41. * 主要功能函数部分
  42. */
  43.  
  /**
   * Click handler for the "init-btn" button: expand the document one step.
   *
   * While an element with class "read-all" exists, one call clicks it once and
   * returns. The function does not loop or recurse, so it has to be invoked
   * again until no such element is found. At that point the user is notified,
   * the "init-btn" button is hidden and the "save-doc-btn" button is shown.
   */
  function readAll() {
    const expandButton = document.getElementsByClassName("read-all")[0];

    // Something is still collapsed: expand it and wait for the next call.
    if (expandButton) {
      expandButton.click();
      return;
    }

    alert("文档已经完全展开,可以导出");

    const startButton = document.getElementsByClassName("init-btn")[0];
    const exportButton = document.getElementsByClassName("save-doc-btn")[0];
    startButton.style.display = "none";
    // Dropping the inline "display" value makes the export button visible again.
    exportButton.style.removeProperty("display");
  }
  64.  
  /**
   * Export the image URLs of a picture-based ("pdf") document.
   *
   * Each element with class "reader-pic-item" is expected to carry an inline
   * style containing `...: url(<URL>); background-position...`. The URL part
   * is cut out of every such style, double quotes are stripped, and the URLs
   * are saved one per line as "pic-urls.csv" via createAndDownloadFile.
   *
   * Elements with no style attribute, or whose style contains no `url(`, are
   * skipped instead of raising a TypeError. Quotes are removed from every URL,
   * not only the first. If nothing usable is found, a console warning is
   * logged and no file is produced.
   */
  function savePDFData() {
    alert("Function savePDFData was called.");
    var pic_items = document.getElementsByClassName("reader-pic-item");
    // Everything up to and including "url(" is the prefix to drop.
    var prefix_exp = new RegExp(": ?url[(]");
    // Everything from ")" followed by "background-position" is the suffix to drop.
    var suffix_exp = new RegExp("[)]; ?background-position");
    var url_list = [];

    for (var i = 0; i < pic_items.length; i++) {
      var style_text = pic_items[i].getAttribute("style");
      if (!style_text) {
        continue;
      }
      var after_prefix = style_text.split(prefix_exp)[1];
      if (after_prefix === undefined) {
        continue;
      }
      var url = after_prefix.split(suffix_exp)[0].replace(/"/g, "");
      if (url) {
        url_list.push(url);
      }
    }

    if (url_list.length === 0) {
      console.warn("savePDFData: no picture URLs found, nothing to export.");
      return;
    }
    createAndDownloadFile("pic-urls.csv", url_list.join("\n"));
  }
  86.  
  /**
   * Export the text of a text-based ("doc") document.
   *
   * The textContent of every element with class "reader-word-layer" is
   * concatenated without a separator, passed through formatText, and saved
   * as "doc文档.txt" via createAndDownloadFile.
   */
  function saveDocData() {
    alert("Function saveTextData was called.");

    const layers = document.getElementsByClassName("reader-word-layer");
    const rawText = Array.from(layers, (layer) => layer.textContent).join("");

    createAndDownloadFile("doc文档.txt", formatText(rawText));
  }
  101.  
  /**
   * Export the image URLs of a slide-based ("ppt") document.
   *
   * For every element with class "ppt-image-wrap", the `src` of its first
   * child is collected; the values are saved one per line as "ppt-urls.csv"
   * via createAndDownloadFile.
   */
  function savePPTData() {
    alert("Function savePPTData was called.");

    const wrappers = document.getElementsByClassName("ppt-image-wrap");
    // The first child of each wrapper is read as the picture element.
    const sources = Array.from(wrappers, (wrapper) => wrapper.children[0].src);

    createAndDownloadFile("ppt-urls.csv", sources.join("\n"));
  }
  117.  
  /**
   * Guess the document type from the kinds of reader elements on the page.
   *
   * Counts the elements with class "reader-pic-item" (pdf),
   * "reader-word-layer" (doc) and "ppt-image-wrap" (ppt).
   *
   * @returns {string|Object} "pdf", "doc" or "ppt" when exactly one kind is
   *   present; otherwise an object `{pdf, doc, ppt}` holding the three counts.
   */
  function detectType() {
    const countOf = (className) => document.getElementsByClassName(className).length;

    const counts = {
      "pdf": countOf("reader-pic-item"),
      "doc": countOf("reader-word-layer"),
      "ppt": countOf("ppt-image-wrap"),
    };

    // A type is only decided when one kind is present and the other two are absent.
    const present = Object.keys(counts).filter((kind) => counts[kind]);
    return present.length === 1 ? present[0] : counts;
  }
  137.  
  /**
   * Click handler for the "save-doc-btn" button: export the document.
   *
   * Dispatches on the result of detectType(): "pdf" -> savePDFData,
   * "doc" -> saveDocData, "ppt" -> savePPTData. Any other result is treated
   * as the per-kind count object returned by detectType, and its entries are
   * shown in an alert so the user can see why the type was not recognised.
   */
  function saveData() {
    var category = detectType();
    if (category === "pdf") {
      savePDFData();
    }
    else if (category === "doc") {
      saveDocData();
    }
    else if (category === "ppt") {
      savePPTData();
    }
    else {
      var info = [];
      for (var key in category) {
        // Bracket access is required here: `category.key` would read a
        // property literally named "key" and always yield undefined.
        info.push(key + " : " + category[key]);
      }
      alert("未知文档类型\n" + info.join("\n"));
    }
  }
  158.  
  159. /*
  160. * 主函数部分
  161. */
  162.  
  /**
   * Create the two control buttons and attach them to the page.
   *
   * - "init-btn" (blue, visible): calls readAll on click.
   * - "save-doc-btn" (green, initially hidden): calls saveData on click.
   *
   * Both are appended to document.body, then "Program Loaded" is logged.
   */
  function main() {
    // Both buttons share size and placement; only class and colour differ.
    const buildButton = (className, color) => {
      const button = document.createElement("button");
      button.setAttribute("class", className);
      Object.assign(button.style, {
        height: "25px",
        width: "50%",
        marginLeft: "25%",
        backgroundColor: color,
      });
      return button;
    };

    const expandButton = buildButton("init-btn", "blue");
    const exportButton = buildButton("save-doc-btn", "green");
    // The export button starts hidden; readAll removes this inline value later.
    exportButton.style.display = "none";

    expandButton.addEventListener("click", readAll);
    exportButton.addEventListener("click", saveData);

    document.body.appendChild(expandButton);
    document.body.appendChild(exportButton);

    console.log("Program Loaded");
  }
  190.  
  191. main(); // Entry point: runs main() as soon as the script is evaluated, which creates and attaches the two buttons.