Wenku Doc Downloader

下载百度文库文档

目前為 2021-11-24 提交的版本,檢視 最新版本

// ==UserScript==
// @name Wenku Doc Downloader
// @namespace http://tampermonkey.net/
// @version 0.3
// @description 下载百度文库文档
// @author allenlv2690@gmail.com
// @match https://wenku.baidu.com/view/*
// @icon https://www.google.com/s2/favicons?domain=limestart.cn
// @grant none
// @license GPL-3.0-only
// @create 2021-11-22
// @note 1. 更新了对表格文档和图文结合型文档的简单支持(具体来说就是可以下载其中的纯文字部分)
// @note 2. 更新了图片下载合并器的功能:使用后自动清理pic文件夹、删除urls.csv
// ==/UserScript==
  15.  
/*
 * Auxiliary helper functions
 */
  19.  
  20. function createAndDownloadFile(fileName, content) {
  21. // 创建并下载文件
  22. var aTag = document.createElement('a');
  23. var blob = new Blob([content]);
  24. aTag.download = fileName;
  25. aTag.href = URL.createObjectURL(blob);
  26. aTag.click();
  27. URL.revokeObjectURL(blob);
  28. }
  29.  
  30. function formatText(text){
  31. // 用于纯文本文档的文本美化
  32. var reg_exp_1 = new RegExp(" [(]?=[\u4e00-\u9fa5] [)]");
  33. var reg_exp_2 = new RegExp("(?<=TEMP[\u4e00-\u9fa5]) ");
  34. var reg_exp_3 = new RegExp("(?<=[\u4e00-\u9fa5]) (?=[\u4e00-\u9fa5])");
  35.  
  36. var text_1 = text.replace(reg_exp_1, "TEMP");
  37. var text_2 = text_1.replace(reg_exp_2, "");
  38. var text_3 = text_2.replace("TEMP", "");
  39. var text_final = text_3.replace(/ /g, " ");
  40. return text_final;
  41. }
  42.  
  43. function formatText2(text) {
  44. // 用于图形文字混合型文档的文本美化
  45. var reg_exp = new RegExp("[  ]{2,}");
  46. var content_1 = text.replace(reg_exp, "\n");
  47.  
  48. var content_2 = content_1.replace(/[  ]\n/g, "\n");
  49.  
  50. var reg_exp_2 = new RegExp("\n[   ]*\n*\n");
  51. var content_3 = content_2.replace(reg_exp_2, "\n");
  52.  
  53. var reg_exp_3 = new RegExp(" *\n * ");
  54. var content_4 = content_3.replace(reg_exp_3, "\n");
  55.  
  56. var content_5 = content_4.replace(/[  ]/g, " ");
  57. var final_content = content_5.replace(/[ \n]精选文档[ \n]/g).replace(/\n{2,}/g, "\n");
  58.  
  59. return final_content;
  60. }
  61.  
/*
 * Main feature functions
 */
  65.  
  66. function readAll() {
  67. var read_all_btn = document.getElementsByClassName("read-all")[0];
  68. // 如果存在“继续阅读”的按钮
  69. if (read_all_btn) {
  70. // 点击“继续阅读”按钮
  71. read_all_btn.click();
  72. }
  73. // 如果点击完之后仍旧存在该按钮,递归调用自身
  74. // read_all_btn = document.getElementsByClassName("read-all")[0];
  75. // if (read_all_btn) {
  76. // readAll();
  77. // }
  78. else{
  79. alert("文档已经完全展开,可以导出");
  80. var init_btn = document.getElementsByClassName("init-btn")[0];
  81. var save_doc_btn = document.getElementsByClassName("save-doc-btn")[0];
  82. init_btn.style.display = "none";
  83. save_doc_btn.style.removeProperty("display");
  84. }
  85. }
  86.  
  87. function savePDFData() {
  88. // 存储pdf型data(假定是内容是pic)
  89. alert("Function savePDFData was called.");
  90. var pic_urls = document.getElementsByClassName("reader-pic-item");
  91. var text_list = [];
  92. // 去掉前缀
  93. var reg_exp_1 = new RegExp(": ?url[(]");
  94. // 去掉后缀
  95. var reg_exp_2 = new RegExp("[)]; ?background-position");
  96.  
  97. for (var i = 0; i < pic_urls.length; i++){
  98. var whole_text = pic_urls[i].getAttribute("style");
  99. var de_pretext = whole_text.split(reg_exp_1)[1];
  100. var url = de_pretext.split(reg_exp_2)[0];
  101. text_list.push(url);
  102. }
  103.  
  104. text_list[0] = text_list[0].replace(/"/g, "");
  105. var content = text_list.join("\n");
  106. createAndDownloadFile("urls.csv", content);
  107. }
  108.  
  109. function saveDocData() {
  110. // 存储doc型data(内容是text)
  111. alert("Function saveDocData was called.");
  112. // 获取文本
  113. var text_elements = document.getElementsByClassName("reader-word-layer");
  114. var texts = [];
  115. for (var elem of text_elements){
  116. texts.push(elem.textContent);
  117. }
  118. // 美化后导出文本
  119. var origin_content = texts.join("");
  120. var content = formatText(origin_content);
  121. createAndDownloadFile("纯文本文档.txt", content);
  122. }
  123.  
  124. function savePPTData() {
  125. // 存储ppt型data(内容是pic)
  126. alert("Function savePPTData was called.");
  127. var pic_elements = document.getElementsByClassName("ppt-image-wrap");
  128. var pic_urls = [];
  129.  
  130. for (var elem of pic_elements) {
  131. var pic_obj = elem.children[0];
  132. var url = pic_obj.src;
  133. pic_urls.push(url);
  134. }
  135.  
  136. var content = pic_urls.join("\n");
  137. createAndDownloadFile("urls.csv", content);
  138. }
  139.  
  140. function saveExcelData() {
  141. // 1. 拿到表格
  142. var table_pic = document.getElementsByClassName("reader-pic-item")[0];
  143. var url = table_pic.style.getPropertyValue("background-image");
  144. // 获取图片地址
  145. var pure_url = url.slice(5, -2);
  146.  
  147. // 2. 拿到表格内文字信息
  148. var text_elems = document.getElementsByClassName("reader-word-layer");
  149. var text_list = [];
  150. for (var elem of text_elems) {
  151. text_list.push(elem.textContent);
  152. }
  153. var _text = text_list.join("\n");
  154. // 替换奇怪的空格
  155. var text = _text.replace(/ /g, " ");
  156.  
  157. // 3. 合并至一个字符串,然后导出
  158. var head = "表格图形链接如下(复制到浏览器中打开):";
  159. var content = head + "\n\n" + pure_url + "\n\n" + text;
  160. createAndDownloadFile("图片地址和表格内容.txt", content);
  161. }
  162.  
  163. function saveDocAndPicData() {
  164. // 对于文字和图形混合型的data只能存储其中的纯文字
  165. alert("Function saveDocAndPicData was called.");
  166. // 获取文本
  167. var text_elements = document.getElementsByClassName("reader-word-layer");
  168. var texts = [];
  169. for (var elem of text_elements){
  170. texts.push(elem.textContent);
  171. }
  172. // 处理文本中的过长空格
  173. var origin_content = texts.join("");
  174. // 美化后导出文本
  175. var content = formatText2(origin_content);
  176. createAndDownloadFile("纯文本文档.txt", content);
  177. }
  178.  
  179. function detectType() {
  180. // 分别尝试获取相应元素列表,若列表长度为0则不存在相应元素,否则存在
  181. var pdf = document.getElementsByClassName("reader-pic-item").length;
  182. var doc = document.getElementsByClassName("reader-word-layer").length;
  183. var ppt = document.getElementsByClassName("ppt-image-wrap").length;
  184. // 判断文档类别
  185. if (pdf && !doc && !ppt) {
  186. return "pdf";
  187. }
  188. else if (doc && !pdf && !ppt) {
  189. return "doc";
  190. }
  191. else if (ppt && !pdf && !doc) {
  192. return "ppt";
  193. }
  194. else if (pdf === 1 && doc > 1 && !ppt) {
  195. return "excel";
  196. }
  197. else if (pdf > 2 && doc > 2 && !ppt) {
  198. return "docANDpic";
  199. }
  200. else {
  201. return {"pdf元素数量": pdf, "doc元素数量": doc, "ppt元素数量": ppt};
  202. }
  203. }
  204.  
  205. function saveData() {
  206. // 存储文档数据到本地
  207. var category = detectType();
  208. if (category === "pdf"){
  209. savePDFData();
  210. }
  211. else if (category === "doc") {
  212. saveDocData();
  213. }
  214. else if (category === "ppt") {
  215. savePPTData();
  216. }
  217. else if (category === "excel") {
  218. saveExcelData();
  219. }
  220. else if (category === "docANDpic") {
  221. saveDocAndPicData();
  222. }
  223. else {
  224. var info = [];
  225. for (var key in category){
  226. info.push(key + " : " + category[key]);
  227. }
  228. alert("未知文档类型\n" + info.join("\n"));
  229. }
  230. }
  231.  
/*
 * Entry point
 */
  235.  
  236. function main() {
  237. // 创建脚本启动按钮1、2
  238. var btn_1 = document.createElement("button");
  239. var btn_2 = document.createElement("button");
  240. // 设定按钮1、2样式
  241. btn_1.setAttribute("class", "init-btn");
  242. btn_1.style.height = "25px";
  243. btn_1.style.width = "50%";
  244. btn_1.style.marginLeft = "25%";
  245. btn_1.style.backgroundColor = "blue";
  246.  
  247. btn_2.setAttribute("class", "save-doc-btn");
  248. btn_2.style.height = "25px";
  249. btn_2.style.width = "50%";
  250. btn_2.style.marginLeft = "25%";
  251. btn_2.style.backgroundColor = "green";
  252. btn_2.style.display = "none";
  253.  
  254. // 绑定主函数
  255. btn_1.addEventListener("click", readAll);
  256. btn_2.addEventListener("click", saveData);
  257. // 添加按钮元素到页面
  258. document.body.appendChild(btn_1);
  259. document.body.appendChild(btn_2);
  260. // 确认主程序加载完毕
  261. console.log("Program Loaded");
  262. }
  263.  
  264. main();