博客园新闻在线词云

Scrape news from cnblogs and generate word clouds

当前为 2025-02-06 提交的版本,查看 最新版本

您需要先安装一个扩展,例如 篡改猴、Greasemonkey 或 暴力猴,之后才能安装此脚本。

You will need to install an extension such as Tampermonkey to install this script.

您需要先安装一个扩展,例如 篡改猴 或 暴力猴,之后才能安装此脚本。

您需要先安装一个扩展,例如 篡改猴 或 Userscripts,之后才能安装此脚本。

您需要先安装一款用户脚本管理器扩展,例如 Tampermonkey,才能安装此脚本。

您需要先安装用户脚本管理器扩展后才能安装此脚本。

(我已经安装了用户脚本管理器,让我安装!)

您需要先安装一款用户样式管理器扩展,比如 Stylus,才能安装此样式。

您需要先安装一款用户样式管理器扩展,比如 Stylus,才能安装此样式。

您需要先安装一款用户样式管理器扩展,比如 Stylus,才能安装此样式。

您需要先安装一款用户样式管理器扩展后才能安装此样式。

您需要先安装一款用户样式管理器扩展后才能安装此样式。

您需要先安装一款用户样式管理器扩展后才能安装此样式。

(我已经安装了用户样式管理器,让我安装!)

// ==UserScript==
// @name         博客园新闻在线词云
// @name:en      Cnblogs News Scraper and WordCloud Generator
// @description:en Cnblogs News Scraper,WordCloud Generator
// @namespace    http://tampermonkey.net/
// @version      1.2.1
// @description  Scrape news from cnblogs and generate word clouds
// @author       aspen138
// @icon         https://assets.cnblogs.com/favicon.ico
// @match        *://news.cnblogs.com/*
// @grant        GM_xmlhttpRequest
// @require      https://code.jquery.com/jquery-3.6.0.min.js
// @require      https://cdnjs.cloudflare.com/ajax/libs/wordcloud2.js/1.1.2/wordcloud2.min.js
// @require      https://cdn.jsdelivr.net/npm/segmentit@2.0.3/dist/umd/segmentit.min.js
// @connect      news.cnblogs.com
// @license      MIT
// ==/UserScript==

// Acknowledgement: o1-preview

// Note: whether word segmentation uses only the news title or also the news body text is chosen inside processData().



/**
 * Collect the news IDs linked from the `#news_list` container of the current page.
 *
 * Only anchors whose `href` starts with `/n/` and ends with `/` are inspected;
 * the ID is the run of digits captured from `/n/{news_id}/`.
 *
 * @returns {string[]} IDs as digit strings, in the order the links were found
 *     (duplicates are kept). An empty array is returned when `#news_list` is
 *     missing or contains no matching links.
 */
function extract_newsIds() {
    const newsList = document.querySelector('#news_list');
    if (!newsList) {
        // Always hand back an array so callers never receive `undefined`.
        return [];
    }

    // Find all <a> elements with href shaped like /n/{news_id}/
    const newsLinks = newsList.querySelectorAll('a[href^="/n/"][href$="/"]');

    const newsIds = [];
    for (const link of Array.from(newsLinks)) {
        const match = link.getAttribute('href').match(/\/n\/(\d+)\//);
        // Links such as /n/topic/ match the selector but carry no numeric ID.
        if (match) {
            newsIds.push(match[1]);
        }
    }
    return newsIds;
}


(function () {
    'use strict';

    /**
     * Inject an external script into the page's <head> and run `callback` once
     * it has loaded.
     *
     * @param {string} url - URL assigned to the script element's `src`.
     * @param {Function} callback - Installed as the script element's `onload` handler.
     */
    function loadScript(url, callback) {
        const script = document.createElement('script');
        script.src = url;
        script.type = 'text/javascript';
        script.onload = callback;
        // Without an error handler a blocked or failed download is silent and
        // `callback` simply never runs; report it in the console instead.
        script.onerror = function () {
            console.error('Failed to load script: ' + url);
        };
        document.head.appendChild(script);
    }

    // Bootstrap: obtain the Segmentit library, build a default segmenter, then
    // start main(). If a `Segmentit` global is already available it is used
    // directly; injecting a <script> tag is only the fallback path.
    const startMain = function () {
        // Initialize segmentit with its default dictionaries
        const segmentit = Segmentit.useDefault(new Segmentit.Segment());

        // Now start the main script
        main(segmentit);
    };

    if (typeof Segmentit !== 'undefined') {
        startMain();
    } else {
        loadScript('https://cdn.jsdelivr.net/npm/segmentit@2.0.3/dist/umd/segmentit.min.js', startMain);
    }

    function main(segmentit) {
        // Seed the form's ID range from the links found on the current page; fall
        // back to a fixed range when extract_newsIds() yields nothing usable.
        const pageIds = extract_newsIds();
        const hasPageIds = Array.isArray(pageIds) && pageIds.length > 0;

        const fallbackMinId = 781100; // Default start news ID
        const fallbackMaxId = 781159; // Default end news ID

        const minNewsId = hasPageIds ? Math.min(...pageIds) : fallbackMinId;
        const maxNewsId = hasPageIds ? Math.max(...pageIds) : fallbackMaxId;

        // Control panel markup: a fixed-position box near the top-right corner
        // with the two ID inputs, the start button and a status line.
        const formHtml = `
        <div id="news-scraper" style="position:fixed; top:50px; right:10px; background-color:#fff; padding:20px; border:1px solid #ccc; z-index:10000; display: flex; flex-direction: column; gap: 5px;">
            <h3 style="margin: 0; text-align: center;">News Scraper and WordCloud Generator</h3>
            <label style="display: flex; justify-content: space-between; align-items: center;">
                Start News ID:
                <input type="number" id="start-news-id" value="${minNewsId}" style="margin-left: 10px;" />
            </label>
            <label style="display: flex; justify-content: space-between; align-items: center;">
                End News ID:
                <input type="number" id="end-news-id" value="${maxNewsId}" style="margin-left: 10px;" />
            </label>
            <button id="start-scraping" style="align-self: center; padding: 5px 10px;">Start Scraping</button>
            <div id="scraping-status" style="margin-top: 10px; text-align: center;"></div>
        </div>
        `;

        $('body').append(formHtml);

        // Start scraping when the button is clicked, after validating the ID range.
        $('#start-scraping').click(function () {
            // Pass the radix explicitly so the fields are always read as decimal.
            const startId = parseInt($('#start-news-id').val(), 10);
            const endId = parseInt($('#end-news-id').val(), 10);

            // An empty or non-numeric field parses to NaN, which slips past the
            // range comparison below and would start a run over zero articles.
            if (Number.isNaN(startId) || Number.isNaN(endId)) {
                alert('Please enter valid Start and End News IDs');
                return;
            }

            if (endId < startId) {
                alert('End News ID must be greater than or equal to Start News ID');
                return;
            }

            startScraping(startId, endId);
        });

        /**
         * Fetch every news page in the inclusive ID range and hand the collected
         * results to processData().
         *
         * IDs are visited from `startId` towards `endId` (ascending or descending).
         * A pool of workers drains a shared queue; pages for which fetchNews()
         * resolves to a falsy value are skipped. Progress is written to
         * `#scraping-status`.
         *
         * @param {number} startId - First news ID to fetch.
         * @param {number} endId - Last news ID to fetch.
         * @returns {Promise<void>} Settles after processData() has been called.
         */
        async function startScraping(startId, endId) {
            const setStatus = (message) => {
                $('#scraping-status').text(message);
            };

            // Build the work queue, walking in whichever direction reaches endId.
            const step = startId <= endId ? 1 : -1;
            const pending = [];
            for (let id = startId; step > 0 ? id <= endId : id >= endId; id += step) {
                pending.push(id);
            }

            const totalNews = pending.length;
            const collected = [];
            let finished = 0;

            setStatus('Starting scraping...');

            const concurrencyLimit = 1024; // Adjust this number as needed

            // Each worker keeps pulling IDs until the shared queue is empty.
            const runWorker = async () => {
                while (pending.length > 0) {
                    const newsId = pending.shift();
                    const newsInfo = await fetchNews(newsId);
                    if (newsInfo) {
                        collected.push(newsInfo);
                    }
                    finished += 1;
                    setStatus('Scraped ' + finished + ' of ' + totalNews);
                }
            };

            const pool = Array.from({ length: concurrencyLimit }, () => runWorker());
            await Promise.all(pool);

            // All done
            processData(collected);
        }

        /**
         * Fetch a single news page and extract its details.
         *
         * The returned promise never rejects: any failure (non-200 status,
         * network error, timeout, abort, or an exception while parsing or
         * extracting) resolves to `null` so one bad page cannot stall or break
         * the caller's worker loop.
         *
         * @param {number|string} newsId - News ID inserted into the page URL.
         * @returns {Promise<Object|null>} Result of getNewsInfo() for the page,
         *     or `null` on any failure.
         */
        function fetchNews(newsId) {
            const url = 'https://news.cnblogs.com/n/' + newsId + '/';
            // Upper bound per request; without one a hung connection would keep
            // its promise pending forever.
            const REQUEST_TIMEOUT_MS = 30000;

            return new Promise(function (resolve) {
                const giveUp = function (reason) {
                    console.warn('Failed to fetch ' + url + ' (' + reason + ')');
                    resolve(null);
                };

                GM_xmlhttpRequest({
                    method: 'GET',
                    url: url,
                    timeout: REQUEST_TIMEOUT_MS,
                    onload: function (response) {
                        if (response.status !== 200) {
                            // Non-200 responses are treated as "no such page" and skipped quietly.
                            resolve(null);
                            return;
                        }
                        try {
                            const doc = new DOMParser().parseFromString(response.responseText, 'text/html');
                            resolve(getNewsInfo(doc, newsId, url));
                        } catch (err) {
                            // An exception here would otherwise leave the promise unsettled.
                            giveUp(err && err.message ? err.message : 'parse error');
                        }
                    },
                    onerror: function () {
                        giveUp('network error');
                    },
                    ontimeout: function () {
                        giveUp('timeout');
                    },
                    onabort: function () {
                        giveUp('aborted');
                    }
                });
            });
        }

        /**
         * Extract the fields of one news article from its parsed HTML document.
         *
         * Every text field falls back to the string 'Not Found' when the element
         * it is read from is missing.
         *
         * @param {Document} doc - Parsed news page.
         * @param {number|string} newsId - ID of the article, copied into the result.
         * @param {string} url - URL of the article, copied into the result.
         * @returns {{news_id: *, title: string, time: string, views: string,
         *     news_body: string, url: string}} The extracted record.
         */
        function getNewsInfo(doc, newsId, url) {
            const MISSING = 'Not Found';

            // Trimmed textContent of `selector` inside `root`, or the placeholder
            // when either the root or the target element is absent.
            const textOf = (root, selector) => {
                const node = root ? root.querySelector(selector) : null;
                return node ? node.textContent.trim() : MISSING;
            };

            const titleBox = doc.querySelector('#news_title');
            const infoBox = doc.querySelector('#news_info');
            const bodyBox = doc.querySelector('#news_body');

            return {
                news_id: newsId,
                title: textOf(titleBox, 'a'),
                time: textOf(infoBox, 'span.time'),
                views: textOf(infoBox, 'span.view#News_TotalView'),
                news_body: bodyBox ? bodyBox.innerText.trim() : MISSING,
                url: url
            };
        }

        /**
         * Group the scraped articles by publication month and draw one word
         * cloud per group.
         *
         * Each item is annotated in place with `date` ("YYYY-MM-DD" or null) and
         * `year_month` ("YYYY-MM" or 'Unknown'), parsed from a `time` string such
         * as "发布于 2023-09-30 12:34". Groups are rendered in the order their
         * month first appears in `newsData`.
         *
         * @param {Array<Object>} newsData - Records with at least `time` and `title`.
         */
        function processData(newsData) {
            const groups = new Map();

            for (const item of newsData) {
                const dateMatch = item.time.match(/发布于\s+(\d{4}-\d{2}-\d{2})/);
                if (dateMatch) {
                    item.date = dateMatch[1];
                    // Take "YYYY-MM" straight from the matched text. Going through
                    // `new Date('YYYY-MM-DD')` parses the value as UTC midnight, and
                    // reading it back with local-time getters moves the first day
                    // of a month into the previous month west of UTC.
                    item.year_month = item.date.slice(0, 7);
                } else {
                    item.date = null;
                    item.year_month = 'Unknown';
                }

                if (!groups.has(item.year_month)) {
                    groups.set(item.year_month, []);
                }
                groups.get(item.year_month).push(item);
            }

            for (const [yearMonth, items] of groups) {
                // Only titles feed the cloud. To include article text as well, map
                // to `item.title + ' ' + item.news_body` instead.
                const combinedText = items.map(function (item) {
                    return item.title;
                }).join(' ');

                generateWordCloud(combinedText, yearMonth);
            }

            $('#scraping-status').text('All word clouds generated.');
        }

        /**
         * Render one titled word cloud with wordcloud2.js and insert it directly
         * after the `#news-scraper` panel.
         *
         * @param {string} text - Text to segment and count via getWordList().
         * @param {string} title - Heading shown above the canvas.
         */
        function generateWordCloud(text, title) {
            // 500x500 drawing surface for the cloud
            const canvas = $('<canvas></canvas>').attr('width', 500).attr('height', 500);
            const heading = $('<h3></h3>').text(title);

            // Bordered box holding the heading followed by the canvas
            const panel = $('<div></div>').css({
                border: '1px solid #ccc',
                margin: '10px',
                padding: '10px'
            });
            panel.append(heading);
            panel.append(canvas);

            $('#news-scraper').after(panel);

            // The word list is computed only after the panel is in the page.
            const cloudOptions = {
                list: getWordList(text),
                gridSize: 10,
                weightFactor: 5,
                fontFamily: 'Microsoft Yahei, SimHei, Arial, sans-serif',
                color: 'random-dark',
                backgroundColor: '#fff'
            };
            WordCloud(canvas[0], cloudOptions);
        }

        /**
         * Segment `text` and build the word-frequency list for the word cloud.
         *
         * Tokens come from `segmentit.doSegment(text)`, each exposing its word as
         * `.w`. Single-character tokens are ignored.
         *
         * @param {string} text - Text to segment.
         * @returns {Array<[string, number]>} `[word, count]` pairs sorted by
         *     descending count; ties keep first-occurrence order.
         */
        function getWordList(text) {
            // A Map, not a plain object: words such as "constructor" or "toString"
            // would otherwise hit inherited Object.prototype members and end up
            // with NaN counts (and "__proto__" would be lost entirely).
            const counts = new Map();

            for (const segment of segmentit.doSegment(text)) {
                const word = segment.w;
                if (word.length > 1) { // Ignore single characters
                    counts.set(word, (counts.get(word) || 0) + 1);
                }
            }

            // Highest frequency first
            return Array.from(counts.entries()).sort(function (a, b) {
                return b[1] - a[1];
            });
        }
    }
})();