Webpage to Markdown

Converts the current webpage to Markdown from the Tampermonkey menu. The content-selection logic has been significantly improved, so less of the main content is lost and fewer irrelevant page elements end up in the output.

  1. // ==UserScript==
  2. // @name Webpage to Markdown
  3. // @namespace http://tampermonkey.net/
  4. // @version 1.0
  5. // @description Convert webpage to Markdown via Tampermonkey menu. Significantly optimizes content selection logic, reducing content loss and useless elements.
  6. // @author Feiyt
  7. // @homepageURL https://github.com/Feiyt
  8. // @license MIT
  9. // @match *://*/*
  10. // @require https://unpkg.com/turndown/dist/turndown.js
  11. // @require https://unpkg.com/turndown-plugin-gfm@1.0.2/dist/turndown-plugin-gfm.js
  12. // @grant GM_registerMenuCommand
  13. // @grant GM_download
  14. // @grant GM_addStyle
  15. // @run-at document-idle
  16. // ==/UserScript==
  17.  
  18. // Copyright (c) 2025 Feiyt
  19. // Released under the MIT license
  20. // https://github.com/Feiyt (or specify the exact repo if available)
  21.  
  22. (function() {
  23. 'use strict';
  24. console.log("Webpage to Markdown (v1.0) script starting..."); // Version updated here
  25.  
  // --- Configuration ---
  // Options passed to the TurndownService constructor. They pin the Markdown
  // flavour that gets emitted: ATX (#) headings, '---' rules, '*' bullets,
  // fenced code blocks, '*'/'**' emphasis and inline-style links.
  const turndownOptions = {
    headingStyle: 'atx',
    hr: '---',
    bulletListMarker: '*',
    codeBlockStyle: 'fenced',
    emDelimiter: '*',
    strongDelimiter: '**',
    linkStyle: 'inlined',
  };
  35.  
  36.  
  37. // --- Helper Functions ---
  /**
   * Turns an arbitrary string (typically the page title) into a safe filename stem.
   *
   * Characters that are forbidden or troublesome in filenames (/ \ : * ? " < > | # %
   * and line breaks) become underscores, runs of whitespace collapse to a single
   * space, and the result is trimmed and capped in length so it stays within
   * common filesystem name limits once an extension is appended.
   *
   * @param {*} name - Proposed filename; non-strings are coerced, null/undefined count as empty.
   * @returns {string} The sanitized name, or "markdown_export" when nothing usable remains.
   */
  function sanitizeFilename(name) {
    const MAX_LENGTH = 200;
    const cleaned = String(name ?? '')
      .replace(/[\/\\:*?"<>|#%\n\r]/g, '_')
      .replace(/\s+/g, ' ')
      .trim();
    // Truncate by code point so a surrogate pair is never cut in half, then
    // re-trim in case the cut landed right after a space.
    const truncated = [...cleaned].slice(0, MAX_LENGTH).join('').trim();
    return truncated || 'markdown_export';
  }
  44.  
  45.  
  /**
   * Picks the element most likely to hold the page's main content and returns a
   * cleaned deep clone of it.
   *
   * Candidates come from a priority-ordered selector list (earlier = preferred).
   * A candidate with fewer than 100 characters of text and fewer than 3 child
   * elements is penalised so that any substantial candidate outranks it; it is
   * still preferred over falling back to document.body. The winner is cloned and
   * navigation, sidebars, comments, ads, scripts and similar non-content
   * elements are removed from the clone. The live page is never modified.
   *
   * @returns {{title: string, contentNode: Node}|null} The page title and the
   *   cleaned clone, or null when the container cannot be cloned or cleaning throws.
   */
  function getPageContentNode() {
    console.log("getPageContentNode (v1.0 logic): Starting content retrieval...");
    const pageTitle = document.title || window.location.hostname;

    // Ordered from most to least specific; position in the list is the base score.
    const selectors = [
      // Highest priority: semantic & specific roles/IDs/classes
      'article', '[role="article"]', '.article-body', '.post-content', '.entry-content', '#article-content', '.post-body', '.markdown-body',
      // High priority: main content areas
      'main', '[role="main"]', '#main-content', '#main', '.main-content', '.main', '#primary',
      // Medium priority: common generic containers (often need cleaning)
      '#content', '.content',
      // Lower priority: more specific layout patterns
      '#page .content',
      '.container .content',
      // Stack Overflow
      '#mainbar',
    ];

    let bestCandidate = null;
    let bestSelector = null;
    let maxScore = -Infinity;

    console.log("Searching for best content container...");
    for (const [index, selector] of selectors.entries()) {
      try {
        const element = document.querySelector(selector);
        if (!element) { continue; }

        // Base score: earlier (higher-priority) selectors score higher, range 1..length.
        let score = selectors.length - index;

        const textLength = element.textContent?.trim().length || 0;
        const childCount = element.childElementCount || 0;
        if (textLength < 100 && childCount < 3) {
          // Shift near-empty candidates into the range (-length, 0] so every
          // substantial candidate beats them, while their relative order is kept.
          score -= selectors.length;
          console.log(`... Candidate [${selector}] seems too small/empty (Text: ${textLength}, Children: ${childCount}). Lowering confidence.`);
        }
        console.log(`Found candidate [${selector}] with score ${score}`);

        if (score > maxScore) {
          maxScore = score;
          bestCandidate = element;
          bestSelector = selector;
          console.log(`>>> New best candidate: [${selector}]`);
        }
      } catch (e) { console.warn(`Error querying selector "${selector}": ${e.message}`); }
    }

    // If no candidate was found via the selectors, use body as the last resort.
    if (!bestCandidate) {
      console.warn("No suitable specific container found after checking selectors. Falling back to document.body.");
      bestCandidate = document.body;
    } else {
      console.log(`Selected final container: <${bestCandidate.tagName.toLowerCase()}> (Selector: ${bestSelector})`);
    }

    // --- Clone and Clean ---
    try {
      if (!bestCandidate || typeof bestCandidate.cloneNode !== 'function') {
        console.error("Cannot clone the selected content element."); return null;
      }
      console.log("Cloning selected container...");
      const clone = bestCandidate.cloneNode(true);

      // Elements to strip from the clone before conversion.
      const excludeSelectors = [
        'header', 'footer', 'nav', '.header', '.footer', '.navbar', '.menu', '.toc', '#toc', '.breadcrumb', '#breadcrumb',
        '[role="navigation"]', '[role="banner"]', '[role="contentinfo"]',
        'aside', '.sidebar', '#sidebar', '.widget-area', '#secondary', '.left-column', '.right-column',
        '[role="complementary"]',
        '.actions', '.share', '.social', '.buttons', '.post-meta', '.entry-meta', '.feedback', '.related-posts',
        '.like-button-container', '.feedback-container',
        '#comments', '.comments', '.comment-section', '#respond',
        // "ad-" is only matched at the start of the id or of a class token; a bare
        // substring match would also hit names such as "thread-", "head-" or "download-".
        '.ad', '.ads', '.advertisement', '.adsbygoogle', '[id^="ad-"]', '[class^="ad-"]', '[class*=" ad-"]', '[class*="advert"]',
        '.edit-link', '.print-link', '[role="search"]',
        // <script type="math/tex..."> nodes are kept: they carry the LaTeX source
        // that a math-aware conversion rule needs to read.
        'script:not([type^="math/tex"])', 'style', 'noscript', 'template', 'link[rel="stylesheet"]', 'meta', 'input[type="hidden"]',
        '.visually-hidden', '.sr-only', '[aria-hidden="true"]',
        '.cookie-banner', '#related-articles', '.related_posts',
      ];

      console.log("Removing excluded elements from clone...");
      let removedCount = 0;
      for (const selector of excludeSelectors) {
        try {
          // querySelectorAll only yields descendants, so the clone root itself is never removed.
          for (const el of clone.querySelectorAll(selector)) {
            // Skip nodes that already left the clone together with a removed ancestor.
            if (!clone.contains(el)) { continue; }
            el.remove();
            removedCount++;
          }
        } catch (e) { console.warn(`Error removing elements for selector "${selector}": ${e.message}`); }
      }
      console.log(`Removed ${removedCount} elements/subtrees from clone.`);

      // --- Post-cleaning Check ---
      if (clone.childElementCount === 0 && clone.textContent.trim().length < 50) {
        console.warn("Clone seems empty after cleaning! Original selection or exclusion might be wrong.");
      }

      return { title: pageTitle, contentNode: clone };
    } catch (error) {
      console.error("Critical error during cloning or cleaning:", error.message, error.stack);
      return null;
    }
  }
  163.  
  // --- Main Conversion and Download Logic ---

  /**
   * Builds the Turndown rule that converts KaTeX / MathJax / MathML markup back
   * into `$...$` (inline) or `$$...$$` (display) LaTeX.
   *
   * @returns {{filter: function(Node): boolean, replacement: function(string, Node): string}}
   *   A rule object suitable for TurndownService#addRule.
   */
  function createMathRule() {
    const DISPLAY_CONTAINERS = '.katex-display, .MathJax_Display';
    const TEX_ANNOTATION = 'annotation[encoding="application/x-tex"]';
    // MathML elements are not in the HTML namespace, so their nodeName is not
    // upper-cased by the DOM; compare case-insensitively.
    const isMathMl = (node) => node.nodeName.toLowerCase() === 'math';
    // closest() also tests the node itself, so this covers both "is" and "is inside".
    const inDisplayContainer = (node) => node.closest(DISPLAY_CONTAINERS) !== null;

    return {
      filter(node) {
        try {
          const { nodeName, classList } = node;
          return (
            (nodeName === 'SPAN' && (classList.contains('katex') || classList.contains('MathJax_Preview'))) ||
            (nodeName === 'DIV' && classList.contains('katex-display')) ||
            (nodeName === 'SCRIPT' && Boolean(node.getAttribute('type')?.startsWith('math/tex'))) ||
            node.getAttribute('role') === 'math' ||
            (isMathMl(node) && Boolean(node.getAttribute('alttext')))
          );
        } catch (filterError) { console.error("Error inside MathJax filter function:", filterError, "Node:", node); return false; }
      },

      replacement(content, node) {
        try {
          let latex = '';
          let isDisplay = false;

          const annotation = node.nodeName === 'SCRIPT' ? null : node.querySelector(TEX_ANNOTATION);
          if (node.nodeName === 'SCRIPT') {
            latex = node.textContent || '';
            isDisplay = Boolean(node.getAttribute('type')?.includes('mode=display')) || latex.trim().startsWith('\\display');
          } else if (node.dataset?.originalLatex) {
            latex = node.dataset.originalLatex;
            isDisplay = inDisplayContainer(node);
          } else if (node.getAttribute('aria-label')) {
            latex = node.getAttribute('aria-label');
            isDisplay = node.nodeName === 'DIV' || inDisplayContainer(node);
          } else if (annotation) {
            // KaTeX keeps the TeX source in a MathML <annotation>. The lookup is
            // not limited to class "katex" so a matched .katex-display wrapper
            // yields its formula instead of an empty string.
            latex = annotation.textContent || '';
            // The display marker sits on the wrapper around span.katex, so the
            // ancestors have to be checked, not just the node's own classes.
            isDisplay = inDisplayContainer(node);
          } else if (isMathMl(node) && node.getAttribute('alttext')) {
            latex = node.getAttribute('alttext');
            isDisplay = node.getAttribute('display') === 'block';
          }

          latex = latex.trim();
          if (!latex) { return ''; }

          // Source that already carries its own delimiters is passed through untouched.
          const hasDisplayDelimiters = latex.startsWith('$$') && latex.endsWith('$$');
          const hasInlineDelimiters = latex.startsWith('$') && latex.endsWith('$') && !latex.startsWith('$$');
          if (hasDisplayDelimiters || hasInlineDelimiters) { return latex; }

          const delimiter = isDisplay ? '$$' : '$';
          return `${delimiter}${latex}${delimiter}`;
        } catch (ruleError) { console.error("Error processing math rule replacement for node:", node, ruleError); return ''; }
      },
    };
  }

  /**
   * Menu-command handler: converts the page's main content to Markdown and
   * offers it as a download.
   *
   * Creates a TurndownService with `turndownOptions`, applies the GFM plugin when
   * it is loaded, registers the math rule, converts the node returned by
   * getPageContentNode() and passes the result to GM_download as a data: URI
   * named after the sanitized page title. Failures are logged and reported to
   * the user through alert(); nothing is thrown to the caller.
   *
   * @returns {void}
   */
  function convertAndDownload() {
    console.log("Convert to Markdown (v1.0): Button clicked...");
    try {
      // --- Initialize Turndown, Apply GFM, Add Math Rule ---
      console.log("Initializing TurndownService...");
      if (typeof TurndownService === 'undefined') { throw new Error('TurndownService is not defined.'); }
      const turndownService = new TurndownService(turndownOptions);

      console.log("Applying GFM plugin...");
      if (typeof turndownPluginGfm !== 'undefined' && typeof turndownPluginGfm.gfm === 'function') {
        try {
          turndownService.use(turndownPluginGfm.gfm);
          console.log("GFM applied.");
        } catch (gfmError) { console.error("Error applying GFM plugin:", gfmError); }
      } else { console.warn("GFM plugin not loaded."); }

      // A failure to register the math rule is not fatal: conversion continues without it.
      try {
        console.log("Adding Math rule...");
        turndownService.addRule('mathjaxKatex', createMathRule());
        console.log("Math rule added.");
      } catch (addRuleError) { console.error("Failed to add Math rule:", addRuleError); }

      // --- Perform Conversion ---
      console.log("Getting page content node...");
      const pageData = getPageContentNode();

      if (!pageData || !pageData.contentNode) {
        console.error("Failed to get valid page content node. Aborting.");
        alert("Could not get a valid page content node for conversion.");
        return;
      }
      console.log(`Content node retrieved. Title: ${pageData.title}. Starting conversion...`);

      let markdownContent = '';
      try {
        markdownContent = turndownService.turndown(pageData.contentNode);
        console.log("Markdown conversion complete.");
      } catch (convertError) {
        console.error("Error during Turndown conversion:", convertError.message, convertError.stack);
        alert(`Error during Markdown conversion: ${convertError.message}`);
        return;
      }

      // An empty result only warns; the (empty) file is still offered for download.
      if (!markdownContent || markdownContent.trim() === '') {
        console.warn("Conversion resulted in empty Markdown content.");
        alert("Warning: The converted Markdown content is empty.");
      }

      // --- Prepare Filename & Download ---
      const filename = sanitizeFilename(pageData.title) + ".md";
      const dataUri = `data:text/markdown;charset=utf-8,${encodeURIComponent(markdownContent)}`;
      console.log(`Initiating download for ${filename}...`);
      GM_download({
        url: dataUri,
        name: filename,
        saveAs: true,
        onerror: (err) => {
          console.error('GM_download error:', err);
          alert(`Error downloading file: ${err.error || 'Unknown error'}. Check Tampermonkey settings (Advanced -> Downloads BETA -> Whitelist .md).`);
        },
      });
      console.log("Download initiated.");
    } catch (error) {
      console.error("Critical error during convertAndDownload:", error.message, error.stack);
      alert(`A critical error occurred while running the script: ${error.message}`);
    }
  }
  278.  
  // --- Register Menu Command ---
  // Hook convertAndDownload into the userscript manager's menu. The missing-API
  // case is handled first; registration itself is guarded because a failure here
  // would otherwise leave the user with no visible sign that the script is inert.
  if (typeof GM_registerMenuCommand !== 'function') {
    console.error("GM_registerMenuCommand is not available.");
    alert("GM_registerMenuCommand is not available!");
  } else {
    try {
      GM_registerMenuCommand("Convert Page to Markdown (v1.0)", convertAndDownload, "m");
      console.log("Menu command registered.");
    } catch (registerError) {
      console.error("Failed to register menu command:", registerError);
      alert("Failed to register menu command!");
    }
  }
  287.  
  288. console.log("Webpage to Markdown (v1.0) script finished loading."); // Version updated here
  289. })();