Webpage to Markdown

Convert webpage to Markdown via Tampermonkey menu. Significantly optimizes content selection logic, reducing content loss and useless elements.

当前为 2025-04-13 提交的版本,查看 最新版本

您需要先安装一个扩展,例如 篡改猴Greasemonkey暴力猴,之后才能安装此脚本。

You will need to install an extension such as Tampermonkey to install this script.

您需要先安装一个扩展,例如 篡改猴暴力猴,之后才能安装此脚本。

您需要先安装一个扩展,例如 篡改猴Userscripts ,之后才能安装此脚本。

您需要先安装一款用户脚本管理器扩展,例如 Tampermonkey,才能安装此脚本。

您需要先安装用户脚本管理器扩展后才能安装此脚本。

(我已经安装了用户脚本管理器,让我安装!)

您需要先安装一款用户样式管理器扩展,比如 Stylus,才能安装此样式。

您需要先安装一款用户样式管理器扩展,比如 Stylus,才能安装此样式。

您需要先安装一款用户样式管理器扩展,比如 Stylus,才能安装此样式。

您需要先安装一款用户样式管理器扩展后才能安装此样式。

您需要先安装一款用户样式管理器扩展后才能安装此样式。

您需要先安装一款用户样式管理器扩展后才能安装此样式。

(我已经安装了用户样式管理器,让我安装!)

// ==UserScript==
// @name         Webpage to Markdown
// @namespace    http://tampermonkey.net/
// @version      1.0
// @description  Convert webpage to Markdown via Tampermonkey menu. Significantly optimizes content selection logic, reducing content loss and useless elements.
// @author       Feiyt
// @homepageURL  https://github.com/Feiyt
// @license      MIT
// @match        *://*/*
// @require      https://unpkg.com/turndown/dist/turndown.js
// @require      https://unpkg.com/[email protected]/dist/turndown-plugin-gfm.js
// @grant        GM_registerMenuCommand
// @grant        GM_download
// @grant        GM_addStyle
// @run-at       document-idle
// ==/UserScript==

// Copyright (c) 2025 Feiyt
// Released under the MIT license
// https://github.com/Feiyt (or specify the exact repo if available)

(function() {
    'use strict';
    console.log("Webpage to Markdown (v1.0) script starting..."); // Version updated here

    // --- Configuration ---
    const turndownOptions = { /* ... options from previous version ... */ }; // Reference to previous settings
    turndownOptions.headingStyle = 'atx';
    turndownOptions.hr = '---';
    turndownOptions.bulletListMarker = '*';
    turndownOptions.codeBlockStyle = 'fenced';
    turndownOptions.emDelimiter = '*';
    turndownOptions.strongDelimiter = '**';
    turndownOptions.linkStyle = 'inlined';


    // --- Helper Functions ---
    function sanitizeFilename(name) { /* ... function from previous version ... */ } // Placeholder comment
    // Sanitizes a string to be used as a filename.
    sanitizeFilename = function(name) {
        // Replace forbidden characters with underscore, collapse whitespace, trim, provide default.
        return name.replace(/[\/\\:*?"<>|#%\n\r]/g, '_').replace(/\s+/g, ' ').trim() || "markdown_export";
    };


    /**
     * Improved content selection and cleaning.
     * Prioritizes semantic tags and common content IDs/classes.
     * @returns {object|null} Object containing { title: string, contentNode: Node } or null on failure.
     */
    function getPageContentNode() {
        console.log("getPageContentNode (v1.0 logic): Starting content retrieval..."); // Adjusted log message slightly
        const pageTitle = document.title || window.location.hostname;
        let bestCandidate = null;
        let maxScore = -1; // Simple scoring mechanism

        // More robust selectors with priorities implied by order
        const selectors = [
            // Highest Priority: Semantic & Specific Roles/IDs/Classes
            'article', '[role="article"]', '.article-body', '.post-content', '.entry-content', '#article-content', '.post-body', '.markdown-body',
            // High Priority: Main content areas
            'main', '[role="main"]', '#main-content', '#main', '.main-content', '.main', '#primary',
            // Medium Priority: Common generic containers (often need cleaning)
            '#content', '.content',
            // Lower Priority: More specific layout patterns
            '#page .content', // Example of nested structure
            '.container .content',
             // Stack Overflow Example
             '#mainbar',
            // Lowest Priority (if nothing else works, but avoid body initially)
            // Maybe add specific blog platform IDs? '.hentry'?
        ];

        console.log("Searching for best content container...");
        selectors.forEach((selector, index) => {
            try {
                const element = document.querySelector(selector);
                if (element) {
                    // Basic score: higher priority selectors get higher base score
                    const score = selectors.length - index; // Higher index = lower priority = lower score
                    console.log(`Found candidate [${selector}] with score ${score}`);

                    // --- Basic Heuristic Check ---
                    const textLength = element.textContent?.trim().length || 0;
                    const childCount = element.childElementCount || 0;
                    // Arbitrary thresholds - adjust as needed
                    if (textLength < 100 && childCount < 3) {
                        console.log(`... Candidate [${selector}] seems too small/empty (Text: ${textLength}, Children: ${childCount}). Lowering confidence.`);
                    }

                    if (score > maxScore) {
                        maxScore = score;
                        bestCandidate = element;
                        console.log(`>>> New best candidate: [${selector}]`);
                    }
                }
            } catch (e) { console.warn(`Error querying selector "${selector}": ${e.message}`); }
        });

        // If no good candidate found via specific selectors, use body as last resort
        if (!bestCandidate) {
            console.warn("No suitable specific container found after checking selectors. Falling back to document.body.");
            bestCandidate = document.body;
        } else {
            const likelySelectorIndex = selectors.length - 1 - Math.floor(maxScore);
            const likelySelector = selectors[likelySelectorIndex] || 'heuristic/fallback';
            console.log(`Selected final container: <${bestCandidate.tagName.toLowerCase()}> (Selector likely: ${likelySelector})`);
        }

        // --- Clone and Clean ---
        try {
            if (!bestCandidate || typeof bestCandidate.cloneNode !== 'function') {
                console.error("Cannot clone the selected content element."); return null;
            }
            console.log("Cloning selected container...");
            const clone = bestCandidate.cloneNode(true);

            // Define selectors for elements to exclude from the conversion.
            const excludeSelectors = [
                'header', 'footer', 'nav', '.header', '.footer', '.navbar', '.menu', '.toc', '#toc', '.breadcrumb', '#breadcrumb',
                '[role="navigation"]', '[role="banner"]', '[role="contentinfo"]',
                'aside', '.sidebar', '#sidebar', '.widget-area', '#secondary', '.left-column', '.right-column',
                '[role="complementary"]',
                '.actions', '.share', '.social', '.buttons', '.post-meta', '.entry-meta', '.feedback', '.related-posts',
                '.like-button-container', '.feedback-container',
                '#comments', '.comments', '.comment-section', '#respond',
                '.ad', '.ads', '.advertisement', '.adsbygoogle', '[id*="ad-"]', '[class*="ad-"]', '[class*="advert"]',
                '.edit-link', '.print-link', '[role="search"]',
                'script', 'style', 'noscript', 'template', 'link[rel="stylesheet"]', 'meta', 'input[type="hidden"]',
                '.visually-hidden', '.sr-only', '[aria-hidden="true"]',
                '.cookie-banner', '#related-articles', '.related_posts',
            ];

            console.log("Removing excluded elements from clone...");
            let removedCount = 0;
             for (const selector of excludeSelectors) {
                 try {
                     const elementsToRemove = clone.querySelectorAll(selector);
                     elementsToRemove.forEach(el => {
                         if (el !== clone && typeof el.remove === 'function') {
                             el.remove();
                             removedCount++;
                         } else if (el === clone) {
                             console.warn(`Exclusion selector "${selector}" matched the container root itself! Skipping removal of root.`);
                         }
                     });
                 } catch (e) { console.warn(`Error removing elements for selector "${selector}": ${e.message}`); }
             }
            console.log(`Removed ${removedCount} elements/subtrees from clone.`);

            // --- Post-cleaning Check ---
            if (clone.childElementCount === 0 && clone.textContent.trim().length < 50) {
                 console.warn("Clone seems empty after cleaning! Original selection or exclusion might be wrong.");
            }

            return { title: pageTitle, contentNode: clone };

        } catch (error) {
            console.error("Critical error during cloning or cleaning:", error.message, error.stack);
            return null;
        }
    }

    // --- Main Conversion and Download Logic ---
    function convertAndDownload() {
        console.log("Convert to Markdown (v1.0): Button clicked..."); // Version updated here
        try {
            // --- Initialize Turndown, Apply GFM, Add Math Rule ---
             console.log("Initializing TurndownService...");
             if (typeof TurndownService === 'undefined') { throw new Error('TurndownService is not defined.'); }
             const turndownService = new TurndownService(turndownOptions);

             console.log("Applying GFM plugin...");
             if (typeof turndownPluginGfm !== 'undefined' && typeof turndownPluginGfm.gfm === 'function') {
                  try {
                      turndownService.use(turndownPluginGfm.gfm);
                      console.log("GFM applied.");
                    }
                  catch (gfmError) { console.error("Error applying GFM plugin:", gfmError); }
             } else { console.warn("GFM plugin not loaded."); }

            // Define and Add Math Rule (for KaTeX/MathJax)
            const mathRule = {}; // Simplified for brevity, keep full logic from previous step
             mathRule.filter = function (node, options) {
                 try {
                     return (
                         (node.nodeName === 'SPAN' && (node.classList.contains('katex') || node.classList.contains('MathJax_Preview'))) ||
                         (node.nodeName === 'DIV' && node.classList.contains('katex-display')) ||
                         (node.nodeName === 'SCRIPT' && node.getAttribute('type')?.startsWith('math/tex')) ||
                         (node.getAttribute('role') === 'math')
                     );
                 } catch (filterError) { console.error("Error inside MathJax filter function:", filterError, "Node:", node); return false; }
             };
             mathRule.replacement = function (content, node, options) {
                 let latex = '', delimiter = '$';
                 try {
                     if (node.nodeName === 'SCRIPT') {
                         latex = node.textContent || '';
                         if (node.getAttribute('type')?.includes('mode=display') || latex.trim().startsWith('\\display')) { delimiter = '$$'; }
                     } else if (node.dataset && node.dataset.originalLatex) {
                         latex = node.dataset.originalLatex;
                         if (node.classList.contains('katex-display') || node.closest('.MathJax_Display')) { delimiter = '$$'; }
                     } else if (node.getAttribute('aria-label')) {
                         latex = node.getAttribute('aria-label');
                          if (node.nodeName === 'DIV' || node.classList.contains('katex-display') || node.closest('.MathJax_Display')) { delimiter = '$$'; }
                     } else if (node.classList.contains('katex')) {
                         const annotation = node.querySelector('annotation[encoding="application/x-tex"]');
                         if (annotation) {
                             latex = annotation.textContent || '';
                             if (node.classList.contains('katex-display')) { delimiter = '$$'; }
                         }
                     } else if (node.nodeName === 'MATH' && node.getAttribute('alttext')) {
                         latex = node.getAttribute('alttext');
                         if (node.getAttribute('display') === 'block') { delimiter = '$$'; }
                     }
                     if (latex) {
                         latex = latex.trim();
                         if ((latex.startsWith('$$') && latex.endsWith('$$')) || (latex.startsWith('$') && latex.endsWith('$') && !latex.startsWith('$$'))) { return latex; }
                         return `${delimiter}${latex}${delimiter}`;
                     }
                     return '';
                 } catch (ruleError) { console.error("Error processing math rule replacement for node:", node, ruleError); return ''; }
             };

            try {
                console.log("Adding Math rule...");
                if (typeof mathRule.filter !== 'function') { throw new Error("Math rule filter is not a function!"); }
                turndownService.addRule('mathjaxKatex', mathRule);
                console.log("Math rule added.");
            } catch (addRuleError) { console.error("Failed to add Math rule:", addRuleError); }

            // --- Perform Conversion ---
            console.log("Getting page content node...");
            const pageData = getPageContentNode();

            if (!pageData || !pageData.contentNode) {
                 console.error("Failed to get valid page content node. Aborting.");
                 alert("Could not get a valid page content node for conversion.");
                 return;
            }
            console.log(`Content node retrieved. Title: ${pageData.title}. Starting conversion...`);

            let markdownContent = '';
            try {
                markdownContent = turndownService.turndown(pageData.contentNode);
                console.log("Markdown conversion complete.");
            } catch (convertError) {
                 console.error("Error during Turndown conversion:", convertError.message, convertError.stack);
                 alert(`Error during Markdown conversion: ${convertError.message}`);
                 return;
            }

            if (!markdownContent || markdownContent.trim() === '') {
                 console.warn("Conversion resulted in empty Markdown content.");
                 alert("Warning: The converted Markdown content is empty.");
            }

            // --- Prepare Filename & Download ---
            const filename = sanitizeFilename(pageData.title) + ".md";
            const dataUri = `data:text/markdown;charset=utf-8,${encodeURIComponent(markdownContent)}`;
            console.log(`Initiating download for ${filename}...`);
            GM_download({
                url: dataUri,
                name: filename,
                saveAs: true,
                onerror: (err) => {
                    console.error('GM_download error:', err);
                    alert(`Error downloading file: ${err.error || 'Unknown error'}. Check Tampermonkey settings (Advanced -> Downloads BETA -> Whitelist .md).`);
                 },
             });
            console.log("Download initiated.");

        } catch (error) {
             console.error("Critical error during convertAndDownload:", error.message, error.stack);
             alert(`A critical error occurred while running the script: ${error.message}`);
        }
    }

    // --- Register Menu Command ---
    if (typeof GM_registerMenuCommand === 'function') {
        try {
             // Updated menu command text to reflect v1.0
             GM_registerMenuCommand("Convert Page to Markdown (v1.0)", convertAndDownload, "m");
             console.log("Menu command registered.");
        } catch (registerError) { console.error("Failed to register menu command:", registerError); alert("Failed to register menu command!"); }
    } else { console.error("GM_registerMenuCommand is not available."); alert("GM_registerMenuCommand is not available!"); }

    console.log("Webpage to Markdown (v1.0) script finished loading."); // Version updated here
})();