您需要先安装一个扩展,例如 篡改猴、Greasemonkey 或 暴力猴,之后才能安装此脚本。
您需要先安装一个扩展,例如 篡改猴 或 暴力猴,之后才能安装此脚本。
您需要先安装一个扩展,例如 篡改猴 或 暴力猴,之后才能安装此脚本。
您需要先安装一个扩展,例如 篡改猴 或 Userscripts ,之后才能安装此脚本。
您需要先安装一款用户脚本管理器扩展,例如 Tampermonkey,才能安装此脚本。
您需要先安装用户脚本管理器扩展后才能安装此脚本。
Merges words that OCR has split apart, such as "t hat" instead of "that". A naive implementation.
// ==UserScript==
// @name         Merge Broken Words
// @namespace    http://tampermonkey.net/
// @version      9
// @description  Merges broken words that might end up due to OCR. Like "t hat" instead of "that." A naive implementation.
// @match        *://ranobes.top/*
// @match        *://ranobes.net/*
// @icon         https://www.google.com/s2/favicons?sz=64&domain=ranobes.top
// @license      MIT
// @grant        GM_xmlhttpRequest
// @connect      websites.umich.edu
// ==/UserScript==

(function () {
    'use strict';

    // A naive greedy approach: walk each paragraph word by word and glue two
    // neighbouring fragments together when the pair only makes sense as one
    // dictionary word.

    // Some wordlists:
    // Doesn't have many words: https://www.mit.edu/~ecprice/wordlist.10000
    // Contains common misspellings, etc. as well: https://raw.githubusercontent.com/dwyl/english-words/refs/heads/master/words_alpha.txt
    const WORDLIST_URL = "https://websites.umich.edu/~jlawler/wordlist";

    // The only one-letter entries that are accepted as words.
    const SINGLE_LETTER_WORDS = new Set(["i", "a"]);

    // Wordlist entries that are dropped from the dictionary. What a mess...
    // This is where deep learning would be nice ig...
    const EXCLUDED_WORDS = new Set(['th', 'ha', 'iwo']);

    // Lower-cased dictionary; assigned once the wordlist has been downloaded.
    let validWords;

    GM_xmlhttpRequest({
        method: "GET",
        url: WORDLIST_URL,
        onload(response) {
            // onload also fires for HTTP error responses; parsing an error page
            // as a wordlist would produce a garbage dictionary.
            if (response.status < 200 || response.status >= 300) {
                console.error(`Merge Broken Words: wordlist request failed with status ${response.status}`);
                return;
            }
            validWords = buildWordSet(response.responseText);
            processParagraphs();
        },
        onerror(error) {
            console.error("Merge Broken Words: could not download the wordlist", error);
        },
    });

    /**
     * Turns the raw newline-separated wordlist into the lookup set.
     *
     * @param {string} rawList - Response body, one word per line.
     * @returns {Set<string>} Trimmed, lower-cased words, minus the excluded
     *   entries and minus one-letter entries other than "i" and "a".
     */
    function buildWordSet(rawList) {
        const cleaned = rawList
            .split('\n')
            .map(word => word.trim().toLowerCase())
            .filter(word => !EXCLUDED_WORDS.has(word) && (word.length > 1 || SINGLE_LETTER_WORDS.has(word)));
        return new Set(cleaned);
    }

    /**
     * Reduces a token to the bare lower-case word used for dictionary lookups.
     *
     * @param {string} word - A whitespace-delimited token from the page text.
     * @returns {string} The token without contraction/possessive suffix and
     *   without any non-word characters; may be an empty string.
     */
    function cleanWord(word) {
        word = word.toLowerCase();
        if (/^\w+['’]/.test(word)) {
            // Only strip when the apostrophe follows letters. Otherwise
            // single-quoted words or those at the edge of sentences will cause issues.
            // "n't" is removed as a unit ("don't" -> "do"); any other apostrophe
            // suffix is cut at the apostrophe ("man's" -> "man"). The "t" is
            // mandatory so that the "n" of a possessive is not eaten.
            word = word.replace(/n['’]t\b.*|['’].*/, "");
        }
        return word.replace(/[^\w]/g, ""); // Remove remaining non-alphanumeric characters
    }

    /**
     * Decides whether two neighbouring tokens are halves of one broken word.
     *
     * @param {string} current - Token at the current position.
     * @param {string} next - Token that follows it.
     * @returns {boolean} True when the tokens should be concatenated.
     */
    function shouldMerge(current, next) {
        // Dealing with hyphenated words: only the fragment after the last
        // hyphen of `current` and before the first hyphen of `next` can be the
        // two halves of a broken word.
        const head = cleanWord(current.replace(/.*-/, ''));
        const tail = cleanWord(next.replace(/-.*/, ''));

        // A token without any word characters (a lone dash, an empty string
        // from leading whitespace, ...) is not a word fragment. Without this
        // guard `head + tail` would equal the neighbouring word and the
        // punctuation would be glued onto it.
        if (head === "" || tail === "") {
            return false;
        }
        if (validWords.has(head) && validWords.has(tail)) {
            return false;
        }

        const merged = head + tail;
        return validWords.has(merged) ||
            // Naive approach to remove plural, maybe could use some stemming algo.
            (merged.length > 2 && validWords.has(merged.replace(/s$/, '')));
    }

    /**
     * Greedily merges broken words in one piece of text.
     *
     * @param {string} text - Text content of a paragraph or text node.
     * @returns {{text: string, merges: string[][]}} The rebuilt text (tokens
     *   joined by single spaces) and the list of [first, second] token pairs
     *   that were merged.
     */
    function mergeBrokenWords(text) {
        const words = text.split(/\s+/);
        const output = [];
        const merges = [];
        for (let i = 0; i < words.length; i++) {
            const current = words[i];
            const next = words[i + 1];
            if (next !== undefined && shouldMerge(current, next)) {
                merges.push([current, next]);
                output.push(current + next);
                i++; // Skip the next word since it's merged
                continue;
            }
            output.push(current);
        }
        return { text: output.join(" "), merges };
    }

    /**
     * Applies the merge pass to every <p> inside the story container and to
     * the text nodes that are direct children of that container, then logs
     * the merged pairs to the console.
     */
    function processParagraphs() {
        // Selector spelling ("arrticle") is intentional and kept as-is;
        // presumably it mirrors the site's own markup.
        const section = document.querySelector(".story #arrticle");
        if (!section) {
            // The @match patterns cover the whole site; pages without a story
            // container have nothing to process.
            return;
        }

        const targets = [
            ...section.querySelectorAll("p"),
            ...[...section.childNodes].filter(child => child.nodeType === Node.TEXT_NODE),
        ];

        const mergedWordsLog = [];
        for (const target of targets) {
            const { text, merges } = mergeBrokenWords(target.textContent);
            if (merges.length === 0) {
                // Assigning textContent replaces all child nodes, so leave
                // paragraphs that need no fix completely untouched.
                continue;
            }
            target.textContent = text;
            mergedWordsLog.push(...merges);
        }

        if (mergedWordsLog.length > 0) {
            console.table(mergedWordsLog);
        }
    }
})();