- // ==UserScript==
- // @name Wenku Doc Downloader
- // @namespace http://tampermonkey.net/
- // @version 1.0
- // @description 下载“百度文库”文档,导出txt或pdf。“豆丁网”“爱问共享资料”(新浪文档)文档导出pdf。在文档页面最底部有蓝/绿色长方形按钮,说明脚本生效了,否则就没有生效。
- // @author allenlv2690@gmail.com
- // @match https://wenku.baidu.com/view/*
- // @match https://www.docin.com/p-*
- // @match https://ishare.iask.sina.com.cn/f/*
- // @icon https://www.google.com/s2/favicons?domain=limestart.cn
- // @grant none
- // @license GPL-3.0-only
- // @create 2021-11-22
- // @note 修复了纯图片文档报错的bug(程序里有个字符串写错了【笑哭】)
- // @note 更新了百度文库上所有文档类型直接导出PDF的功能(纯图片文档更推荐使用【图片下载合并器】)
- // ==/UserScript==
-
- /*
- * 附属功能函数部分
- */
-
/**
 * Build a file in memory and trigger a browser download for it.
 *
 * @param {string} fileName - File name suggested to the browser.
 * @param {*} content - Payload passed to the Blob constructor as its only part.
 * @returns {void}
 */
function createAndDownloadFile(fileName, content) {
    const blob = new Blob([content]);
    // Keep the generated URL string: revokeObjectURL() must receive this
    // string, not the Blob it was created from.
    const objectUrl = URL.createObjectURL(blob);

    const aTag = document.createElement('a');
    aTag.download = fileName;
    aTag.href = objectUrl;
    aTag.click();

    // Release the object URL afterwards; deferring gives the download that
    // click() triggered a chance to start before the URL becomes invalid.
    setTimeout(function () {
        URL.revokeObjectURL(objectUrl);
    }, 1000);
}
-
/**
 * Tidy the text extracted from a text-only document.
 *
 * Removes every single space that sits directly between two CJK ideographs
 * (U+4E00 to U+9FA5) and then turns non-ASCII space characters into a normal
 * space. Spaces next to non-CJK characters are left alone.
 *
 * @param {string} text - Raw concatenated text.
 * @returns {string} The cleaned text.
 */
function formatText(text) {
    // Lookbehind/lookahead keep the ideographs themselves out of the match, so
    // consecutive "X Y Z" runs are handled in one global pass and no marker
    // string has to be inserted into (and later removed from) the document.
    const spaceBetweenCjk = /(?<=[\u4e00-\u9fa5]) (?=[\u4e00-\u9fa5])/g;
    // No-break space, U+2000..U+200A, narrow no-break and medium math space,
    // written as escapes so the pattern stays readable in any editor.
    const oddSpaces = /[\u00a0\u2000-\u200a\u202f\u205f]/g;

    return text.replace(spaceBetweenCjk, "").replace(oddSpaces, " ");
}
-
/**
 * Tidy the text extracted from a document that mixes pictures and text.
 *
 * Steps, each applied to every occurrence:
 *   1. a run of two or more spaces becomes a line break;
 *   2. a space directly before a line break is dropped;
 *   3. blank (or space-only) lines are collapsed;
 *   4. spaces around a line break that is followed by spaces are dropped;
 *   5. non-ASCII space characters become a normal space;
 *   6. the "精选文档" marker, when delimited by spaces or line breaks, is
 *      replaced by a line break;
 *   7. consecutive line breaks are collapsed into one.
 *
 * @param {string} text - Raw concatenated text.
 * @returns {string} The cleaned text.
 */
function formatText2(text) {
    return text
        .replace(/ {2,}/g, "\n")
        .replace(/ \n/g, "\n")
        .replace(/\n *\n*\n/g, "\n")
        .replace(/ *\n +/g, "\n")
        .replace(/[\u00a0\u2000-\u200a\u202f\u205f]/g, " ")
        // An explicit replacement is required: omitting it makes replace()
        // insert the text "undefined" where the marker used to be.
        .replace(/[ \n]精选文档[ \n]/g, "\n")
        .replace(/\n{2,}/g, "\n");
}
-
/**
 * Classify the document shown on the current page.
 *
 * The source format is taken from the class name of the first child of the
 * first ".doc-title-wrap" element. It is then combined with the number of
 * ".reader-pic-item", ".reader-word-layer" and ".ppt-image-wrap" elements on
 * the page to pick a category.
 *
 * @returns {string|Object} A category string ("doc-only-pic", "doc-only-word",
 *     "doc-pic-word", "pdf-pic-title", "pdf-only-pic", "pdf-only-word",
 *     "pdf-pic-word", "ppt", "excel-only-word", "excel-only-pic" or "txt");
 *     "safe_quit" when the title element cannot be read (the user is alerted
 *     first); or, when no rule matches, an object holding the detected format
 *     and the three element counts.
 */
function detectType() {
    let file_type;
    try {
        const doc_title_wrap = document.getElementsByClassName("doc-title-wrap")[0];
        file_type = doc_title_wrap.children[0].className;
    } catch (e) {
        alert("请刷新页面以激活该按钮。\n先点击【导出pdf】橙色按钮后该按钮将无法使用。");
        return "safe_quit";
    }

    // The first keyword contained in the class name wins; an unrecognised class
    // name is kept verbatim so it shows up in the diagnostic object below.
    const known_types = ["word", "ppt", "excel", "pdf", "txt"];
    const matched = known_types.find(function (name) {
        return file_type.includes(name);
    });
    const type = matched === undefined ? file_type : matched;

    // A length of 0 means the page has no element of that kind.
    const pic_nums = document.getElementsByClassName("reader-pic-item").length;
    const word_nums = document.getElementsByClassName("reader-word-layer").length;
    const ppt_img_nums = document.getElementsByClassName("ppt-image-wrap").length;

    if (type === "word" && !word_nums && pic_nums) {
        // doc: pictures only
        return "doc-only-pic";
    }
    if (type === "word" && word_nums > 2 && pic_nums <= 1) {
        // doc: text only
        return "doc-only-word";
    }
    if (type === "word" && pic_nums > 2 && word_nums > 2) {
        // doc: pictures and text
        return "doc-pic-word";
    }
    if (type === "pdf" && pic_nums > 2 && word_nums === 1) {
        // pdf: a single text title followed by pictures
        return "pdf-pic-title";
    }
    if (type === "pdf" && !word_nums && pic_nums) {
        // pdf: pictures only
        return "pdf-only-pic";
    }
    if (type === "pdf" && !pic_nums && word_nums > 1) {
        // pdf: text only
        return "pdf-only-word";
    }
    if (type === "pdf" && word_nums > 2 && pic_nums > 1) {
        // pdf: pictures and text
        return "pdf-pic-word";
    }
    if ((type === "ppt" && ppt_img_nums > 2) ||
        (type === "pdf" && !word_nums && !pic_nums && ppt_img_nums)) {
        // ppt with at least 3 pages, or a pdf made only of ppt image pages
        return "ppt";
    }
    if (type === "excel" && pic_nums && word_nums > 2) {
        // excel: contains selectable text
        return "excel-only-word";
    }
    if (type === "excel" && pic_nums && !word_nums) {
        // excel: pictures only
        return "excel-only-pic";
    }
    if (type === "txt") {
        // txt: text only
        return "txt";
    }
    return {
        "源文档类型": type,
        "图形数量": pic_nums,
        "文字块数量": word_nums,
        "ppt纯图形页面数量": ppt_img_nums,
    };
}
-
/**
 * Remove an element from the page without ever throwing.
 *
 * @param {?Object} element - Element to remove; null/undefined (element not
 *     found on this page) is ignored silently.
 * @returns {void}
 */
function tryToRemoveElement(element) {
    // A missing element is the expected case for optional page parts.
    if (element == null) {
        return;
    }
    try {
        element.remove();
    } catch (e) {
        // Best effort: report the real failure instead of an empty log line.
        console.warn("tryToRemoveElement: failed to remove element", e);
    }
}
-
/**
 * Remove every element of every list in `elem_list_box`, never throwing for
 * an individual element.
 *
 * @param {Iterable} elem_list_box - Lists of elements, e.g.
 *     [listA, listB, ...]; falsy entries are skipped.
 * @returns {void}
 */
function tryToRemoveSameElem(elem_list_box) {
    for (const elem_list of elem_list_box) {
        if (!elem_list) {
            continue;
        }
        // Iterate over a snapshot. Callers pass getElementsByClassName()
        // results, which are live: removing an element shrinks the collection
        // while it is being walked, so every other element would be skipped.
        for (const elem of Array.from(elem_list)) {
            try {
                elem.remove();
            } catch (e) {
                console.warn("tryToRemoveSameElem: failed to remove element", e);
            }
        }
    }
}
-
/**
 * Ask the user for a left offset (in percent) and apply it as the left margin
 * of the first element with the given class.
 *
 * @param {string} class_name - Class of the element to shift.
 * @param {string} default_offset - Value pre-filled in the prompt.
 * @returns {boolean} true when a valid offset (integer 0 to 59, no leading
 *     zero) was entered and applied; false when the prompt was cancelled or
 *     the input was invalid (an alert is shown for invalid input only).
 */
function centerDoc(class_name, default_offset) {
    const doc_main = document.getElementsByClassName(class_name)[0];
    const offset = window.prompt("请输入偏移百分位:", default_offset);

    // prompt() yields null when the dialog is cancelled; treat that as
    // "abort" instead of failing on a property access on null.
    if (offset === null) {
        return false;
    }

    // One digit (0-9) or two digits in 10-59.
    if (/^(?:[0-9]|[1-5][0-9])$/.test(offset)) {
        doc_main.style.marginLeft = offset + "%";
        return true;
    }

    alert("请输入一个正整数,范围在0至59之间,用来使文档居中\n(不同文档偏移量不同,所以需要手动调整)");
    return false;
}
-
- /*
- * 主要功能函数部分
- */
-
// Script-level counter initialised to 0. NOTE(review): nothing in the visible
// file reads or updates it — confirm it is unused before removing it.
var docin_counter = 0;
-
/**
 * Strip a docin.com document page down to the document itself and open the
 * browser print dialog.
 *
 * Side effects: clicks "#j_select" when present, removes page chrome and
 * ".adBox" elements, sets a 6% left margin on the first ".main" element and
 * hides the first ".save-doc-btn" element while window.print() runs.
 *
 * @returns {void}
 */
function printPageDocin() {
    const firstOfClass = (name) => document.getElementsByClassName(name)[0];

    // Switch to the pointer cursor; the control may be absent.
    try {
        document.getElementById("j_select").click();
    } catch (e) {
        console.log();
    }

    // Look everything up first, then remove in the same order.
    const clutter = [
        firstOfClass("doc_header_mod"),
        firstOfClass("head_wrapper"),
        firstOfClass("aside"),
        document.getElementById("docinShareSlider"),
        firstOfClass("no_more_mod"),
        document.getElementById("likeToo"),
        firstOfClass("tools_bottom_bar"),
        firstOfClass("page_crubms"),
        document.getElementById("jControlDivRecomm"),
        firstOfClass("backToTop"),
    ];
    clutter.forEach((node) => {
        tryToRemoveElement(node);
    });

    // Every element sharing the ad class goes too.
    tryToRemoveSameElem([document.getElementsByClassName("adBox")]);

    // Centre the document.
    firstOfClass("main").style.marginLeft = "6%";

    // The export button must not appear in the printout: hide it, warn the
    // user, print, then show it again.
    const export_btn = firstOfClass("save-doc-btn");
    export_btn.style.display = "none";
    alert("如果预览时有空白页,请取消打印\n请上下滚动页面,确保每页内容都加载完成\n如果文档中有广告,请取消打印,再点一次按钮\n最多不超过2次,应该没有广告了");
    window.print();
    export_btn.style.removeProperty("display");
}
-
/**
 * Strip an ishare.iask.sina.com.cn document page down to the document itself
 * and open the browser print dialog.
 *
 * Side effects: removes page chrome and ".adv-container" elements, asks the
 * user for a centring offset via centerDoc("doc-main", "18") and, when that
 * succeeds, hides the first ".save-doc-btn" element while window.print() runs.
 *
 * @returns {void}
 */
function printPageiShare() {
    const firstOfClass = (name) => document.getElementsByClassName(name)[0];

    // Look everything up first, then remove in the same order.
    const clutter = [
        firstOfClass("detail-topbanner"),
        firstOfClass("new-detail-header"),
        document.getElementById("fix-right"),
        firstOfClass("loginRedPacket-dialog"),
        firstOfClass("fixed-right-full"),
        firstOfClass("website-footer"),
        firstOfClass("guess-you-like-warpper"),
        firstOfClass("detail-top-box"),
        firstOfClass("reader-fullScreen"),
        firstOfClass("endof-trial-reading"),
    ];

    // The breadcrumb row is the parent of the first ".crumb-arrow"; it stays
    // undefined when the page has no such element.
    let crumb_row;
    try {
        crumb_row = firstOfClass("crumb-arrow").parentElement;
    } catch (e) {
        console.log();
    }
    clutter.push(
        crumb_row,
        firstOfClass("copyright-container"),
        firstOfClass("state-bottom")
    );

    clutter.forEach((node) => {
        tryToRemoveElement(node);
    });

    // Every element sharing the ad class goes too.
    tryToRemoveSameElem([document.getElementsByClassName("adv-container")]);

    // Centre the document; stop here when the user's input is rejected.
    alert("建议使用:\n偏移量:18\n缩放:默认\n如果预览中有广告,就取消打印\n再点一次按钮,预览中应该就没有广告了");
    const centered = centerDoc("doc-main", "18");
    if (!centered) {
        return;
    }

    // Keep the export button out of the printout, then show it again.
    const export_btn = firstOfClass("save-doc-btn");
    export_btn.style.display = "none";
    window.print();
    export_btn.style.removeProperty("display");
}
-
/**
 * Strip a Baidu Wenku document page down to the document itself and open the
 * browser print dialog.
 *
 * Side effects: removes page chrome, lazy-load placeholders and ad elements,
 * asks the user for a centring offset via centerDoc("left-wrapper", "0") and,
 * when that succeeds, hides the first ".btns_section" element while
 * window.print() runs.
 *
 * @returns {void}
 */
function printPageBaidu() {
    const byClass = (name) => document.getElementsByClassName(name);

    // Single elements: look all of them up, then remove them.
    const clutter = [
        byClass("header-wrapper")[0],
        document.getElementById("right-wrapper-id"),
        byClass("reader-topbar")[0],
        byClass("try-end-fold-page")[0],
    ];
    clutter.forEach((node) => {
        tryToRemoveElement(node);
    });

    // Whole groups of same-class elements.
    const lazy_load_list = byClass("lazy-load");
    const no_full_screen_list = byClass("no-full-screen");
    const ads = byClass("hx-warp");
    tryToRemoveSameElem([lazy_load_list, ads, no_full_screen_list]);

    // Centre the document; stop here when the user's input is rejected.
    alert("建议使用:\n偏移量:0\n缩放:118%");
    const centered = centerDoc("left-wrapper", "0");
    if (!centered) {
        return;
    }

    // Keep the script's buttons out of the printout, then show them again.
    const buttons = byClass("btns_section")[0];
    buttons.style.display = "none";
    window.print();
    buttons.style.removeProperty("display");
}
-
/**
 * Create the orange "export pdf" button (class "save-html-btn"), bind it to
 * printPageBaidu and append it to the first ".btns_section" element.
 *
 * @returns {void}
 */
function createSaveHtmlBtn() {
    const pdf_button = document.createElement("button");
    pdf_button.setAttribute("class", "save-html-btn");
    pdf_button.textContent = "导出pdf";

    // Appearance.
    Object.assign(pdf_button.style, {
        height: "25px",
        width: "15%",
        marginLeft: "0.2%",
        backgroundColor: "orange",
        border: "none",
        fontWeight: "bold",
        borderRadius: "10%",
    });

    // Behaviour, then placement next to the other script buttons.
    pdf_button.onclick = printPageBaidu;
    document.getElementsByClassName("btns_section")[0].appendChild(pdf_button);
}
-
/**
 * Expand an ishare document one step, or, once nothing is left to expand,
 * swap the "expand" button for the "print" button.
 *
 * Reads the text of the first ".red-color" element: while it contains the
 * "continue reading" label that element is clicked; otherwise the user is told
 * the document is fully expanded, ".save-doc-btn" is shown and ".init-btn" is
 * hidden.
 *
 * @returns {void}
 */
function readAlliShare() {
    const red_btn = document.getElementsByClassName("red-color")[0];
    const can_expand = red_btn.textContent.includes("点击可继续阅读");

    if (can_expand) {
        red_btn.click();
        return;
    }

    // Nothing left to expand: hand over to the print button.
    alert("文档已经完全展开,可以导出");
    const init_btn = document.getElementsByClassName("init-btn")[0];
    const save_doc_btn = document.getElementsByClassName("save-doc-btn")[0];
    save_doc_btn.style.removeProperty("display");
    init_btn.style.display = "none";
}
-
/**
 * Expand a Baidu Wenku document one step, or, once nothing is left to expand,
 * reveal the export buttons and mark the recommended one.
 *
 * While a ".read-all" element exists it is clicked. Otherwise the user is told
 * the document is fully expanded, the document category is obtained from
 * detectType(), the pdf button is created via createSaveHtmlBtn(), one of the
 * buttons gets a "(recommended)" label depending on the category,
 * ".save-doc-btn" is shown and ".init-btn" is hidden.
 *
 * @returns {undefined}
 */
function readAll() {
    const read_all_btn = document.getElementsByClassName("read-all")[0];
    if (read_all_btn) {
        // There is still a "continue reading" control: use it.
        read_all_btn.click();
        return;
    }

    alert("文档已经完全展开,可以导出");

    let category;
    try {
        category = detectType();
    } catch (e) {
        alert("未知/特殊文档类型,例如学术文献,暂不支持下载\n也可与作者反馈或联系:\nallenlv2690@gmail.com");
        return undefined;
    }

    const init_btn = document.getElementsByClassName("init-btn")[0];
    const save_doc_btn = document.getElementsByClassName("save-doc-btn")[0];

    // Make room for, then add, the pdf export button.
    save_doc_btn.style.width = "34.8%";
    createSaveHtmlBtn();

    const text_only = ["doc-only-word", "pdf-only-word"];
    const picture_only = [
        "doc-only-pic",
        "pdf-pic-title",
        "ppt",
        "pdf-only-pic",
        "excel-only-pic",
    ];

    if (text_only.includes(category)) {
        // Text-only documents: plain-text export is recommended.
        save_doc_btn.textContent += "(推荐)";
    } else if (picture_only.includes(category)) {
        // Picture-only documents: export the image links instead.
        save_doc_btn.textContent = "导出图片链接来合并为PDF(推荐)";
    } else {
        // Everything else is treated as mixed content: recommend pdf.
        const print_page_btn = document.getElementsByClassName("save-html-btn")[0];
        print_page_btn.textContent += "(推荐)";
    }

    // Swap the visible buttons.
    save_doc_btn.style.removeProperty("display");
    init_btn.style.display = "none";
}
-
/**
 * Export the picture URLs of a picture-based document as "urls.csv", one URL
 * per line.
 *
 * Each URL is read from the url(...) value inside the inline "style" attribute
 * of a ".reader-pic-item" element. Surrounding quotes are stripped from every
 * URL, and items without a url(...) value are skipped. When no URL is found at
 * all the user is alerted and nothing is downloaded.
 *
 * @returns {void}
 */
function savePDFData() {
    const pic_items = document.getElementsByClassName("reader-pic-item");
    // Group 1: optional opening quote (must be matched again before the
    // closing parenthesis); group 2: the URL itself.
    const url_pattern = /url\((["']?)(.*?)\1\)/;

    const urls = [];
    for (const item of Array.from(pic_items)) {
        const style_text = item.getAttribute("style") || "";
        const found = url_pattern.exec(style_text);
        if (found && found[2]) {
            urls.push(found[2]);
        }
    }

    if (urls.length === 0) {
        alert("未找到任何图片链接,无法导出。");
        return;
    }

    // Start the download.
    createAndDownloadFile("urls.csv", urls.join("\n"));
}
-
/**
 * Export the text of a text-based document as "纯文本文档.txt".
 *
 * Concatenates the text of every ".reader-word-layer" element, cleans it with
 * formatText() and hands it to createAndDownloadFile().
 *
 * @returns {void}
 */
function saveDocData() {
    const layers = document.getElementsByClassName("reader-word-layer");
    const raw_text = Array.from(layers, (layer) => layer.textContent).join("");
    createAndDownloadFile("纯文本文档.txt", formatText(raw_text));
}
-
/**
 * Export the slide image URLs of a ppt-style document as "urls.csv", one URL
 * per line.
 *
 * The URL of each slide is the `src` of the first child of a ".ppt-image-wrap"
 * element.
 *
 * @returns {void}
 */
function savePPTData() {
    const wrappers = document.getElementsByClassName("ppt-image-wrap");
    const url_lines = Array.from(wrappers, (wrapper) => wrapper.children[0].src);
    // Start the download.
    createAndDownloadFile("urls.csv", url_lines.join("\n"));
}
-
/**
 * Export a spreadsheet-style document as "图片地址和表格内容.txt".
 *
 * The file holds a heading line, the URL of the table picture (taken from the
 * background-image of the first ".reader-pic-item" element; empty when that
 * property holds no url(...) value) and the text of every ".reader-word-layer"
 * element, one per line, with non-ASCII spaces turned into normal spaces.
 *
 * @returns {void}
 */
function saveExcelData() {
    // 1. URL of the table picture. The value is parsed rather than sliced by
    //    fixed offsets, so both url("...") and url(...) forms work.
    const table_pic = document.getElementsByClassName("reader-pic-item")[0];
    const css_value = table_pic.style.getPropertyValue("background-image");
    const found = /url\((["']?)(.*?)\1\)/.exec(css_value);
    const pure_url = found ? found[2] : "";

    // 2. Text inside the table.
    const text_elems = document.getElementsByClassName("reader-word-layer");
    const raw_text = Array.from(text_elems, (elem) => elem.textContent).join("\n");
    // No-break space, U+2000..U+200A, narrow no-break and medium math space.
    const text = raw_text.replace(/[\u00a0\u2000-\u200a\u202f\u205f]/g, " ");

    // 3. Merge into one string and export.
    const head = "表格图形链接如下(复制到浏览器中打开):";
    const content = head + "\n\n" + pure_url + "\n\n" + text;
    createAndDownloadFile("图片地址和表格内容.txt", content);
}
-
/**
 * Export only the text of a document that mixes pictures and text, as
 * "纯文本文档.txt".
 *
 * Concatenates the text of every ".reader-word-layer" element, cleans it with
 * formatText2() and hands it to createAndDownloadFile().
 *
 * @returns {void}
 */
function saveDocAndPicData() {
    const layers = document.getElementsByClassName("reader-word-layer");
    const raw_text = Array.from(layers, (layer) => layer.textContent).join("");
    createAndDownloadFile("纯文本文档.txt", formatText2(raw_text));
}
-
/**
 * Export a txt document as "纯文本文档.txt".
 *
 * The content is the text of every ".p-txt" element, concatenated without any
 * separator or clean-up.
 *
 * @returns {void}
 */
function saveTxtData() {
    const paragraphs = document.getElementsByClassName("p-txt");
    const content = Array.from(paragraphs, (paragraph) => paragraph.textContent).join("");
    createAndDownloadFile("纯文本文档.txt", content);
}
-
/**
 * Export the current document with the exporter that matches its category.
 *
 * The category comes from detectType(). "safe_quit" does nothing; any value
 * that is not a known category is reported to the user together with its
 * key/value pairs.
 *
 * @returns {void}
 */
function saveData() {
    const category = detectType();

    switch (category) {
        // Picture-only documents: export the picture links.
        case "doc-only-pic":
        case "pdf-pic-title":
        case "pdf-only-pic":
        case "excel-only-pic":
            savePDFData();
            break;

        // Text-heavy, non-table documents: export the plain text.
        case "doc-only-word":
        case "doc-pic-word":
        case "pdf-only-word":
        case "pdf-pic-word":
            saveDocData();
            break;

        // Slides are handled like picture-only documents.
        case "ppt":
            savePPTData();
            break;

        // Spreadsheets: picture link plus the text.
        case "excel-only-word":
            saveExcelData();
            break;

        // Txt is saved as is.
        case "txt":
            saveTxtData();
            break;

        // detectType() already told the user what to do.
        case "safe_quit":
            break;

        default: {
            const info = [];
            for (const key in category) {
                info.push(key + " : " + category[key]);
            }
            alert("未知处理类型,请反馈或联系作者:\nallenlv2690@gmail.com\n" + info.join("\n"));
        }
    }
}
-
/**
 * Create the script's two starting buttons inside a new
 * <section class="btns_section"> appended to document.body.
 *
 * Button 1 (class "init-btn") is blue and labelled "展开文档"; button 2 (class
 * "save-doc-btn") is green, has no label yet and starts hidden.
 *
 * @returns {Array} [button 1, button 2]
 */
function create2btns() {
    // Styling both buttons have in common.
    const shared_style = {
        height: "25px",
        width: "50%",
        marginLeft: "25%",
        border: "none",
        color: "white",
        fontWeight: "bold",
    };
    const makeButton = (class_name, own_style) => {
        const button = document.createElement("button");
        button.setAttribute("class", class_name);
        Object.assign(button.style, shared_style, own_style);
        return button;
    };

    const expand_btn = makeButton("init-btn", { backgroundColor: "blue" });
    expand_btn.textContent = "展开文档";
    const save_btn = makeButton("save-doc-btn", {
        backgroundColor: "green",
        display: "none",
    });

    // Put both buttons on the page.
    const section = document.createElement("section");
    section.setAttribute("class", "btns_section");
    section.appendChild(expand_btn);
    section.appendChild(save_btn);
    document.body.appendChild(section);

    // Hand the references back to the caller.
    return [expand_btn, save_btn];
}
-
- /*
- * 主函数部分
- */
-
/**
 * Set the script up on a Baidu Wenku page: create the two starting buttons,
 * bind them to readAll / saveData and add a print style sheet that sets
 * `display:block` on <body> for print media.
 *
 * @returns {void}
 */
function baiduWenku() {
    // Starting buttons.
    const [expand_btn, export_btn] = create2btns();
    export_btn.textContent = "导出纯文本";
    expand_btn.onclick = readAll;
    export_btn.onclick = saveData;

    // Lift the print restriction.
    const print_css = document.createElement("style");
    print_css.innerHTML = [
        "@media print {",
        "body{",
        "display:block;",
        "}",
        "}",
    ].join("\n");
    print_css.type = "text/css";
    document.getElementsByTagName("head")[0].appendChild(print_css);
}
-
/**
 * Set the script up on a docin.com page: only the second starting button is
 * kept, shown immediately, labelled "打印页面到PDF" and bound to
 * printPageDocin.
 *
 * @returns {void}
 */
function docin() {
    const [expand_btn, print_btn] = create2btns();

    // This site needs no separate "expand" step.
    expand_btn.remove();

    print_btn.textContent = "打印页面到PDF";
    print_btn.style.removeProperty("display");
    print_btn.onclick = printPageDocin;
}
-
/**
 * Set the script up on an ishare.iask.sina.com.cn page: create the two
 * starting buttons, bind them to readAlliShare / printPageiShare and remove
 * the bottom download bar (".detail-fixed") when the page has one.
 *
 * @returns {void}
 */
function ishare() {
    // Starting buttons.
    const [btn_1, btn_2] = create2btns();
    btn_2.textContent = "打印页面到PDF";

    // Bind the handlers.
    btn_1.onclick = readAlliShare;
    btn_2.onclick = printPageiShare;

    // Remove the bottom download bar. It is not guaranteed to be present, so
    // check before calling remove() instead of letting set-up end with an
    // uncaught TypeError.
    const detailfixed = document.getElementsByClassName("detail-fixed")[0];
    if (detailfixed) {
        detailfixed.remove();
    }
}
-
/**
 * Run the set-up routine that belongs to the current host, or log a message
 * when the host is none of the three supported sites.
 *
 * @returns {void}
 */
function main() {
    switch (window.location.host) {
        case "wenku.baidu.com":
            baiduWenku();
            break;
        case "www.docin.com":
            docin();
            break;
        case "ishare.iask.sina.com.cn":
            ishare();
            break;
        default:
            console.log("匹配到了无效网页");
    }
}
-
- main();