Google Scholar Author Crawler

Crawl author information from Google Scholar and export to TXT file

您需要先安装一款用户脚本管理器扩展,例如 Tampermonkey 篡改猴Greasemonkey 油猴子Violentmonkey 暴力猴,才能安装此脚本。

您需要先安装一款用户脚本管理器扩展,例如 Tampermonkey 篡改猴,才能安装此脚本。

您需要先安装一款用户脚本管理器扩展,例如 Tampermonkey 篡改猴Violentmonkey 暴力猴,才能安装此脚本。

您需要先安装一款用户脚本管理器扩展,例如 Tampermonkey 篡改猴Userscripts ,才能安装此脚本。

您需要先安装一款用户脚本管理器扩展,例如 Tampermonkey 篡改猴,才能安装此脚本。

您需要先安装一款用户脚本管理器扩展后才能安装此脚本。

(我已经安装了用户脚本管理器,让我安装!)

您需要先安装一款用户样式管理器扩展,比如 Stylus,才能安装此样式。

您需要先安装一款用户样式管理器扩展,比如 Stylus,才能安装此样式。

您需要先安装一款用户样式管理器扩展,比如 Stylus,才能安装此样式。

您需要先安装一款用户样式管理器扩展后才能安装此样式。

您需要先安装一款用户样式管理器扩展后才能安装此样式。

您需要先安装一款用户样式管理器扩展后才能安装此样式。

(我已经安装了用户样式管理器,让我安装!)

// ==UserScript==
// @name         Google Scholar Author Crawler
// @namespace    https://example.com/
// @version      1.0.1
// @description  Crawl author information from Google Scholar and export to TXT file
// @match        https://scholar.google.com/*
// @grant        GM_xmlhttpRequest
// @grant        unsafeWindow
// @license      MIT
// ==/UserScript==

(async () => {
    const button = document.createElement("button");
    button.textContent = "Crawl Authors";
    button.style.position = "fixed";
    button.style.top = "20px";
    button.style.right = "100px";
    button.addEventListener("click", crawlAuthors);

    document.body.appendChild(button);

    const delay = (ms) => new Promise(resolve => setTimeout(resolve, ms));

    async function crawlAuthors() {
        const urlParams = new URLSearchParams(window.location.search);
        const label = urlParams.get("mauthors").match(/label:([^&]+)/)[1];
        const query = `label:${label}`;
        const authors = await crawlPage(window.location.href, []);
        const rangeElement = document.querySelector("span.gs_nph.gsc_pgn_ppn");
        const rangeText = rangeElement.textContent.trim();
        const startRange = parseInt(rangeText.split(" ")[0]);
        const endRange = parseInt(rangeText.split(" ")[2]);
        const fileName = `${query}_${startRange}-${endRange}.txt`;
        exportToTxt(authors, fileName);
    }

    async function crawlPage(url, authors) {
        return new Promise(async (resolve) => {
            GM_xmlhttpRequest({
                method: "GET",
                url: url,
                onload: async (response) => {
                    const parser = new DOMParser();
                    const doc = parser.parseFromString(response.responseText, "text/html");

                    for (const author of doc.querySelectorAll("div.gs_ai")) {
                        const name = author.querySelector("h3.gs_ai_name > a").textContent.trim();
                        const affiliation = author.querySelector("div.gs_ai_aff").textContent.trim();
                        const fieldsLinks = author.querySelectorAll("div.gs_ai_int > a.gs_ai_one_int");
                        const separator = ",";
                        const fields = Array.from(fieldsLinks).map(link => link.textContent.trim().toLowerCase().replace(/[^a-zA-Z0-9]/g, ' ')).join(separator);
                        const citation_count_element = author.querySelector("div.gs_ai_cby");
                        const citation_count_match = citation_count_element.textContent.match(/\d+/);
                        const citation_count = citation_count_match ? citation_count_match[0] : "";
                        const profileLink = "https://scholar.google.com" + author.querySelector("h3.gs_ai_name > a").getAttribute("href");

                        const hIndex = await fetchAuthorHIndex(profileLink);

                        authors.push({
                            "Name": name,
                            "Affiliation": affiliation,
                            "Fields": fields,
                            "Citation Count": citation_count,
                            "Profile Link": profileLink,
                            "H-index": hIndex,
                        });
                    }

                    resolve(authors);
                    await delay(1000);
                }
            });
        });
    }

    async function fetchAuthorHIndex(profileLink) {
        return new Promise((resolve) => {
            GM_xmlhttpRequest({
                method: "GET",
                url: profileLink,
                onload: (response) => {
                    const parser = new DOMParser();
                    const doc = parser.parseFromString(response.responseText, "text/html");
                    const hIndex = doc.querySelectorAll("td.gsc_rsb_std")[2].textContent.trim();
                    resolve(hIndex);
                },
            });
        });
    }

    function exportToTxt(authors, fileName) {
        const lines = [
            "Name\tAffiliation\tFields\tCitation Count\tProfile Link\tH-index",
            ...authors.map(
                (a) =>
                `${a.Name}\t${a.Affiliation}\t${a.Fields}\t${a["Citation Count"]}\t${a["Profile Link"]}\t${a["H-index"]}`
            ),
        ];
        const content = lines.join("\n");
        const blob = new Blob([content], { type: "text/plain;charset=utf-8" });
        const url = URL.createObjectURL(blob);

        const a = document.createElement("a");
        a.href = url;
        a.download = fileName;
        a.click();

        URL.revokeObjectURL(url);
    }
})();