// ==UserScript==
// @name Webpage to Markdown
// @namespace http://tampermonkey.net/
// @version 1.0
// @description Convert webpage to Markdown via Tampermonkey menu. Significantly optimizes content selection logic, reducing content loss and useless elements.
// @author Feiyt
// @homepageURL https://github.com/Feiyt
// @license MIT
// @match *://*/*
// @require https://unpkg.com/turndown/dist/turndown.js
// @require https://unpkg.com/turndown-plugin-gfm@1.0.2/dist/turndown-plugin-gfm.js
// @grant GM_registerMenuCommand
// @grant GM_download
// @grant GM_addStyle
// @run-at document-idle
// ==/UserScript==
// Copyright (c) 2025 Feiyt
// Released under the MIT license
// https://github.com/Feiyt
(function() {
'use strict';
console.log("Webpage to Markdown (v1.0) script starting...");
// --- Configuration ---
// Formatting options handed to the TurndownService constructor when a conversion starts.
const turndownOptions = {
    headingStyle: 'atx',
    hr: '---',
    bulletListMarker: '*',
    codeBlockStyle: 'fenced',
    emDelimiter: '*',
    strongDelimiter: '**',
    linkStyle: 'inlined',
};
// --- Helper Functions ---
/**
 * Sanitizes a string so it can be used as a download filename.
 *
 * Characters that are forbidden or troublesome in filenames (/ \ : * ? " < > | # %
 * and line breaks) are replaced with underscores, runs of whitespace collapse to a
 * single space, and the result is trimmed.
 *
 * @param {string} name - Raw name, e.g. the page title.
 * @returns {string} The cleaned name, or "markdown_export" when nothing usable remains.
 */
function sanitizeFilename(name) {
    // Coerce defensively: a null/undefined title must yield the default name, not a TypeError.
    const raw = String(name ?? '');
    const cleaned = raw
        .replace(/[\/\\:*?"<>|#%\n\r]/g, '_')
        .replace(/\s+/g, ' ')
        .trim();
    return cleaned || "markdown_export";
}
/**
 * Finds the element most likely to hold the page's main content and returns a cleaned clone of it.
 *
 * Candidates are looked up through a priority-ordered selector list (earlier = higher priority).
 * A candidate that looks nearly empty (little text AND few child elements) is ranked below every
 * substantial candidate, so a placeholder <article> cannot shadow a populated <main>; it is still
 * used when nothing better matches. When no selector matches at all, document.body is used.
 * The chosen element is deep-cloned, and navigation, sidebars, ads, comments, scripts and similar
 * noise are removed from the clone only.
 *
 * @returns {{title: string, contentNode: Node}|null} The page title and the cleaned clone,
 *     or null when the element cannot be cloned or cleaning throws.
 */
function getPageContentNode() {
    console.log("getPageContentNode (v1.0 logic): Starting content retrieval...");
    const pageTitle = document.title || window.location.hostname;

    // A candidate below BOTH thresholds is treated as a near-empty placeholder.
    const MIN_TEXT_LENGTH = 100;
    const MIN_CHILD_COUNT = 3;

    // Candidate containers; priority is implied by order.
    const selectors = [
        // Highest priority: semantic & specific roles/IDs/classes
        'article', '[role="article"]', '.article-body', '.post-content', '.entry-content', '#article-content', '.post-body', '.markdown-body',
        // High priority: main content areas
        'main', '[role="main"]', '#main-content', '#main', '.main-content', '.main', '#primary',
        // Medium priority: common generic containers (often need cleaning)
        '#content', '.content',
        // Lower priority: more specific layout patterns
        '#page .content',
        '.container .content',
        // Stack Overflow
        '#mainbar',
    ];

    console.log("Searching for best content container...");
    let bestCandidate = null;
    let bestSelector = null;
    let maxScore = -Infinity;
    selectors.forEach((selector, index) => {
        try {
            const element = document.querySelector(selector);
            if (!element) { return; }

            // Base score: earlier selectors score higher (range 1..selectors.length).
            const priority = selectors.length - index;
            const textLength = element.textContent?.trim().length || 0;
            const childCount = element.childElementCount || 0;
            const looksEmpty = textLength < MIN_TEXT_LENGTH && childCount < MIN_CHILD_COUNT;
            // Near-empty candidates are shifted into the range (-length, 0], i.e. below every
            // substantial candidate, while keeping their relative priority order.
            const score = looksEmpty ? priority - selectors.length : priority;

            console.log(`Found candidate [${selector}] with score ${score}`);
            if (looksEmpty) {
                console.log(`... Candidate [${selector}] seems too small/empty (Text: ${textLength}, Children: ${childCount}). Lowering confidence.`);
            }
            if (score > maxScore) {
                maxScore = score;
                bestCandidate = element;
                bestSelector = selector;
                console.log(`>>> New best candidate: [${selector}]`);
            }
        } catch (e) { console.warn(`Error querying selector "${selector}": ${e.message}`); }
    });

    // If no candidate was found via the selectors, use body as last resort.
    if (!bestCandidate) {
        console.warn("No suitable specific container found after checking selectors. Falling back to document.body.");
        bestCandidate = document.body;
    } else {
        console.log(`Selected final container: <${bestCandidate.tagName.toLowerCase()}> (Selector: ${bestSelector})`);
    }

    // --- Clone and Clean ---
    try {
        if (!bestCandidate || typeof bestCandidate.cloneNode !== 'function') {
            console.error("Cannot clone the selected content element.");
            return null;
        }
        console.log("Cloning selected container...");
        const clone = bestCandidate.cloneNode(true);

        // Selectors for elements to exclude from the conversion.
        const excludeSelectors = [
            'header', 'footer', 'nav', '.header', '.footer', '.navbar', '.menu', '.toc', '#toc', '.breadcrumb', '#breadcrumb',
            '[role="navigation"]', '[role="banner"]', '[role="contentinfo"]',
            'aside', '.sidebar', '#sidebar', '.widget-area', '#secondary', '.left-column', '.right-column',
            '[role="complementary"]',
            '.actions', '.share', '.social', '.buttons', '.post-meta', '.entry-meta', '.feedback', '.related-posts',
            '.like-button-container', '.feedback-container',
            '#comments', '.comments', '.comment-section', '#respond',
            '.ad', '.ads', '.advertisement', '.adsbygoogle',
            // Match "ad-" only at the start of a name or after a separator. A bare substring
            // match ("*=ad-") would also remove "head-…", "read-…", "thread-…", "download-…".
            '[id^="ad-"]', '[id*="-ad-"]', '[id*="_ad-"]',
            '[class^="ad-"]', '[class*=" ad-"]', '[class*="-ad-"]', '[class*="_ad-"]',
            '[class*="advert"]',
            '.edit-link', '.print-link', '[role="search"]',
            // Keep <script type="math/tex…"> nodes: they carry the TeX source of formulas.
            'script:not([type^="math/tex"])',
            'style', 'noscript', 'template', 'link[rel="stylesheet"]', 'meta', 'input[type="hidden"]',
            '.visually-hidden', '.sr-only', '[aria-hidden="true"]',
            '.cookie-banner', '#related-articles', '.related_posts',
        ];

        console.log("Removing excluded elements from clone...");
        let removedCount = 0;
        for (const selector of excludeSelectors) {
            try {
                clone.querySelectorAll(selector).forEach((el) => {
                    if (el === clone) {
                        console.warn(`Exclusion selector "${selector}" matched the container root itself! Skipping removal of root.`);
                    } else if (typeof el.remove === 'function') {
                        el.remove();
                        removedCount++;
                    }
                });
            } catch (e) { console.warn(`Error removing elements for selector "${selector}": ${e.message}`); }
        }
        console.log(`Removed ${removedCount} elements/subtrees from clone.`);

        // --- Post-cleaning Check ---
        if (clone.childElementCount === 0 && clone.textContent.trim().length < 50) {
            console.warn("Clone seems empty after cleaning! Original selection or exclusion might be wrong.");
        }
        return { title: pageTitle, contentNode: clone };
    } catch (error) {
        console.error("Critical error during cloning or cleaning:", error.message, error.stack);
        return null;
    }
}
// --- Main Conversion and Download Logic ---
/**
 * Builds the Turndown rule that turns rendered math (KaTeX, MathJax, MathML) back into
 * `$…$` (inline) or `$$…$$` (display) LaTeX.
 *
 * @returns {{filter: function(Node, object): boolean, replacement: function(string, Node, object): string}}
 *     A rule object suitable for TurndownService#addRule.
 */
function createMathRule() {
    const TEX_ANNOTATION = 'annotation[encoding="application/x-tex"]';

    // MathML elements report a lowercase nodeName in HTML documents, so compare case-insensitively.
    const tagOf = (node) => String(node.nodeName).toUpperCase();

    // True when the node is, or sits inside, a display-math wrapper (closest() includes the node itself).
    const isInDisplayWrapper = (node) => Boolean(node.closest('.katex-display, .MathJax_Display'));

    // Returns the TeX source found for a node and whether it should be rendered as display math.
    const extractLatex = (node) => {
        const tag = tagOf(node);
        if (tag === 'SCRIPT') {
            const latex = node.textContent || '';
            const isDisplay = Boolean(node.getAttribute('type')?.includes('mode=display')) || latex.trim().startsWith('\\display');
            return { latex, isDisplay };
        }
        if (node.dataset && node.dataset.originalLatex) {
            return { latex: node.dataset.originalLatex, isDisplay: isInDisplayWrapper(node) };
        }
        const ariaLabel = node.getAttribute('aria-label');
        if (ariaLabel) {
            return { latex: ariaLabel, isDisplay: tag === 'DIV' || isInDisplayWrapper(node) };
        }
        if (node.classList.contains('katex') || node.classList.contains('katex-display')) {
            // Search from the matched node so a matched .katex-display wrapper is handled too.
            // The display class lives on the wrapper, not on the inner .katex span.
            const annotation = node.querySelector(TEX_ANNOTATION);
            return annotation
                ? { latex: annotation.textContent || '', isDisplay: isInDisplayWrapper(node) }
                : { latex: '', isDisplay: false };
        }
        if (tag === 'MATH' && node.getAttribute('alttext')) {
            return { latex: node.getAttribute('alttext'), isDisplay: node.getAttribute('display') === 'block' };
        }
        return { latex: '', isDisplay: false };
    };

    return {
        filter(node, options) {
            try {
                const tag = tagOf(node);
                return (
                    (tag === 'SPAN' && (node.classList.contains('katex') || node.classList.contains('MathJax_Preview'))) ||
                    (tag === 'DIV' && node.classList.contains('katex-display')) ||
                    (tag === 'SCRIPT' && Boolean(node.getAttribute('type')?.startsWith('math/tex'))) ||
                    // Keep the filter in step with the alttext branch of the replacement.
                    (tag === 'MATH' && Boolean(node.getAttribute('alttext'))) ||
                    node.getAttribute('role') === 'math'
                );
            } catch (filterError) { console.error("Error inside MathJax filter function:", filterError, "Node:", node); return false; }
        },
        replacement(content, node, options) {
            try {
                const { latex, isDisplay } = extractLatex(node);
                const trimmed = latex.trim();
                // Nothing usable (including whitespace-only source): emit nothing rather than bare delimiters.
                if (!trimmed) { return ''; }
                const alreadyDelimited =
                    (trimmed.startsWith('$$') && trimmed.endsWith('$$')) ||
                    (trimmed.startsWith('$') && trimmed.endsWith('$') && !trimmed.startsWith('$$'));
                if (alreadyDelimited) { return trimmed; }
                const delimiter = isDisplay ? '$$' : '$';
                return `${delimiter}${trimmed}${delimiter}`;
            } catch (ruleError) { console.error("Error processing math rule replacement for node:", node, ruleError); return ''; }
        },
    };
}

/**
 * Menu-command handler: converts the current page's main content to Markdown and starts a download.
 *
 * Creates a TurndownService with the script's options, applies the GFM plugin when it is loaded,
 * registers the math rule, converts the node returned by getPageContentNode(), and passes the
 * result to GM_download as a data: URI named after the sanitized page title. Failures are logged
 * and reported through alert(); nothing is thrown to the caller.
 *
 * @returns {void}
 */
function convertAndDownload() {
    console.log("Convert to Markdown (v1.0): Button clicked...");
    try {
        // --- Initialize Turndown, apply GFM, add the math rule ---
        console.log("Initializing TurndownService...");
        if (typeof TurndownService === 'undefined') { throw new Error('TurndownService is not defined.'); }
        const turndownService = new TurndownService(turndownOptions);

        console.log("Applying GFM plugin...");
        if (typeof turndownPluginGfm !== 'undefined' && typeof turndownPluginGfm.gfm === 'function') {
            // The plugin is optional: a failure here must not abort the conversion.
            try {
                turndownService.use(turndownPluginGfm.gfm);
                console.log("GFM applied.");
            } catch (gfmError) { console.error("Error applying GFM plugin:", gfmError); }
        } else { console.warn("GFM plugin not loaded."); }

        // The math rule is best-effort as well; conversion continues without it on failure.
        try {
            console.log("Adding Math rule...");
            turndownService.addRule('mathjaxKatex', createMathRule());
            console.log("Math rule added.");
        } catch (addRuleError) { console.error("Failed to add Math rule:", addRuleError); }

        // --- Perform Conversion ---
        console.log("Getting page content node...");
        const pageData = getPageContentNode();
        if (!pageData || !pageData.contentNode) {
            console.error("Failed to get valid page content node. Aborting.");
            alert("Could not get a valid page content node for conversion.");
            return;
        }
        console.log(`Content node retrieved. Title: ${pageData.title}. Starting conversion...`);

        let markdownContent = '';
        try {
            markdownContent = turndownService.turndown(pageData.contentNode);
            console.log("Markdown conversion complete.");
        } catch (convertError) {
            console.error("Error during Turndown conversion:", convertError.message, convertError.stack);
            alert(`Error during Markdown conversion: ${convertError.message}`);
            return;
        }
        // An empty result is only warned about; the (empty) file is still offered for download.
        if (!markdownContent || markdownContent.trim() === '') {
            console.warn("Conversion resulted in empty Markdown content.");
            alert("Warning: The converted Markdown content is empty.");
        }

        // --- Prepare Filename & Download ---
        const filename = sanitizeFilename(pageData.title) + ".md";
        const dataUri = `data:text/markdown;charset=utf-8,${encodeURIComponent(markdownContent)}`;
        console.log(`Initiating download for ${filename}...`);
        GM_download({
            url: dataUri,
            name: filename,
            saveAs: true,
            onerror: (err) => {
                console.error('GM_download error:', err);
                alert(`Error downloading file: ${err.error || 'Unknown error'}. Check Tampermonkey settings (Advanced -> Downloads BETA -> Whitelist .md).`);
            },
        });
        console.log("Download initiated.");
    } catch (error) {
        console.error("Critical error during convertAndDownload:", error.message, error.stack);
        alert(`A critical error occurred while running the script: ${error.message}`);
    }
}
// --- Register Menu Command ---
// Expose the converter through the userscript manager's menu. When the GM API is missing or
// registration throws, the problem is logged and surfaced via alert().
if (typeof GM_registerMenuCommand !== 'function') {
    console.error("GM_registerMenuCommand is not available.");
    alert("GM_registerMenuCommand is not available!");
} else {
    try {
        // Third argument "m" is presumably the menu access key — confirm against the manager's docs.
        GM_registerMenuCommand("Convert Page to Markdown (v1.0)", convertAndDownload, "m");
        console.log("Menu command registered.");
    } catch (registerError) {
        console.error("Failed to register menu command:", registerError);
        alert("Failed to register menu command!");
    }
}
console.log("Webpage to Markdown (v1.0) script finished loading.");
})();