自动 lang 属性

自动监测页面语言并给 html 元素添加 lang 标签,有助于浏览器正确显示字体和排版

// ==UserScript==
// @name         Auto lang attr
// @name:zh-CN   自动 lang 属性
// @namespace    auto.lang.attr
// @version      1.0
// @description       Automatically detect page language and add lang attribute to html tag, helps browser display proper fonts and typography
// @description:zh-CN 自动监测页面语言并给 html 元素添加 lang 标签,有助于浏览器正确显示字体和排版
// @author       o
// @match        *://*/*
// @grant        GM_getValue
// @grant        GM_setValue
// @grant        GM_registerMenuCommand
// @grant        GM_unregisterMenuCommand
// @license      MIT
// ==/UserScript==

(function() {
    'use strict';

    // Whitelist management
    const WHITELIST_KEY = 'lang_attr_whitelist';
    let whitelist = GM_getValue(WHITELIST_KEY, []);
    let menuCommandId = null;

    // Get current domain
    const currentDomain = window.location.hostname;

    // Check if current domain is in whitelist
    const isWhitelisted = whitelist.includes(currentDomain);

    // Register menu command
    function registerMenuCommand() {
        if (menuCommandId !== null) {
            GM_unregisterMenuCommand(menuCommandId);
        }

        const commandText = isWhitelisted ?
            `Enable for ${currentDomain}` :
            `Disable for ${currentDomain}`;

        menuCommandId = GM_registerMenuCommand(commandText, toggleWhitelist);
    }

    // Toggle domain in whitelist
    function toggleWhitelist() {
        if (isWhitelisted) {
            // Remove from whitelist
            whitelist = whitelist.filter(domain => domain !== currentDomain);
        } else {
            // Add to whitelist
            whitelist.push(currentDomain);
        }

        GM_setValue(WHITELIST_KEY, whitelist);
        window.location.reload(); // Reload to apply changes
    }

    // Register the menu command
    registerMenuCommand();

    // If whitelisted and enabled, skip processing
    if (isWhitelisted) {
        return;
    }

    // Check if html tag already has lang attribute
    const htmlElement = document.documentElement;
    if (htmlElement.hasAttribute('lang')) {
        return; // Exit if lang attribute already exists
    }

    // Function to detect language from text content
    function detectLanguage(text) {
        // Common language patterns (simplified detection)
        const languagePatterns = {
            'en': /\b(the|and|to|of|a|in|that|is|for|it)\b/gi,
            'es': /\b(el|la|los|las|de|que|y|en|a|un)\b/gi,
            'fr': /\b(le|la|les|de|et|à|en|des|un|une)\b/gi,
            'de': /\b(der|die|das|und|in|zu|den|von|mit|sich)\b/gi,
            'pt': /\b(o|a|os|as|de|e|em|um|uma|que)\b/gi,
            'it': /\b(il|lo|la|i|gli|le|di|e|a|in)\b/gi,
            'ru': /\b(и|в|не|на|я|что|с|а|он|к)\b/gi,
            'zh': /[\u4e00-\u9fff]/g, // Chinese characters
            'ja': /[\u3040-\u309f\u30a0-\u30ff\u4e00-\u9fff]/g, // Hiragana, Katakana, Kanji
            'ko': /[\uac00-\ud7af\u1100-\u11ff]/g, // Hangul syllables and jamo
            'ar': /[\u0600-\u06FF]/g, // Arabic script
            'hi': /[\u0900-\u097F]/g // Devanagari script (Hindi)
        };

        let maxCount = 0;
        let detectedLang = 'en'; // Default to English

        // Count occurrences of language patterns
        for (const [lang, pattern] of Object.entries(languagePatterns)) {
            const matches = text.match(pattern);
            const count = matches ? matches.length : 0;

            if (count > maxCount) {
                maxCount = count;
                detectedLang = lang;
            }
        }

        // If no strong pattern matches found, use more sophisticated detection
        if (maxCount < 3) {
            // Try to detect from meta tags
            const metaLang = document.querySelector('meta[http-equiv="content-language"], meta[name="language"]');
            if (metaLang && metaLang.content) {
                return metaLang.content.split('-')[0]; // Get primary language code
            }

            // Try to detect from charset or other hints
            const charset = document.characterSet || document.charset;
            if (charset && charset.toLowerCase().includes('utf')) {
                // For UTF pages, we might need more context
                // Check for specific language indicators in the page
                const bodyText = document.body.textContent.toLowerCase();
                if (bodyText.includes('的') || bodyText.includes('是')) return 'zh';
                if (bodyText.includes('は') || bodyText.includes('です')) return 'ja';
                if (bodyText.includes('입니다') || bodyScript.includes('는')) return 'ko';
            }
        }

        return detectedLang;
    }

    // Get text content from the page (limit to first 5000 chars for performance)
    const pageText = document.body.textContent.substring(0, 5000);

    if (pageText.trim().length > 100) { // Only detect if there's sufficient text
        const detectedLanguage = detectLanguage(pageText.toLowerCase());

        // Set the lang attribute
        htmlElement.setAttribute('lang', detectedLanguage);

        console.log(`Language detected: ${detectedLanguage}, lang attribute added to html tag`);
    }
})();