MagicScraper

Scrapes and displays data from the web page based on rules.

目前为 2023-07-20 提交的版本。查看 最新版本

此脚本不应直接安装，它是一个供其他脚本使用的外部库。如果您需要使用该库，请在脚本元属性中加入：// @require https://update.cn-greasyfork.org/scripts/471264/1222745/MagicScraper.js

// ==UserScript==
// @name         MagicScraper
// @namespace    http://tampermonkey.net/
// @version      0.1
// @description  Scrapes and displays data from the web page based on rules.
// @author       aolko
// @match        *://*/*
// @grant        GM_addStyle
// ==/UserScript==

/**
 * Finds the rule set that applies to the current page and replaces the content
 * of `document.body` with DOM nodes generated from it. Runs immediately.
 *
 * Rule map shape (as consumed by this function):
 *   {
 *     "<domain pattern>": {
 *       "<key>": "<html string>" | { nested keys ... },
 *       "pages": { "<pathname>": { per-page overrides ... } }   // optional
 *     }
 *   }
 * Domain patterns are compared case-insensitively against
 * `window.location.hostname`; `*` matches any run of characters and every
 * other character is literal.
 *
 * @param {Object|string} rules - Rule map, or a URL string of a JSON document
 *   shaped like `{ "rules": { ...rule map... } }`.
 * @param {Object} [options={}] - Optional settings.
 * @param {boolean} [options.keepChildren] - When truthy, the child elements
 *   already present in a parent are moved into each newly created string
 *   element before that element is appended.
 * @returns {void}
 */
function magicScraper(rules, options = {}) {
  // Tag used for elements created directly under a parent that has no tagName.
  const DEFAULT_TAG = 'div';
  const keepChildren = Boolean(options && options.keepChildren);

  /**
   * Appends one element per key of `data` to `element`. String values become
   * the element's HTML; object values become a nested element that is filled
   * recursively. Values of any other type are ignored.
   *
   * @param {Object} data - Rule values to render.
   * @param {Node} element - Parent node that receives the new elements.
   * @param {boolean} moveChildren - See `options.keepChildren`.
   */
  function createDOMFromScrapedData(data, element, moveChildren) {
    // The root parent is a DocumentFragment, which has no tagName; without a
    // fallback createElement(undefined) would produce <undefined> elements.
    const tagName = element.tagName || DEFAULT_TAG;

    for (const key in data) {
      const value = data[key];

      if (typeof value === 'string') {
        const newElement = document.createElement(tagName);
        // NOTE(review): rule values are inserted as raw HTML. Rules loaded from
        // an external URL are therefore able to inject markup/scripts; only
        // point this library at rule sources you trust.
        newElement.innerHTML = value;

        if (moveChildren && element.children.length > 0) {
          // Snapshot first: appendChild moves nodes out of the live collection.
          Array.from(element.children).forEach(child => {
            newElement.appendChild(child);
          });
        }

        element.appendChild(newElement);
      } else if (typeof value === 'object') {
        const newElement = document.createElement(tagName);
        element.appendChild(newElement);
        createDOMFromScrapedData(value, newElement, moveChildren);
      }
    }
  }

  /**
   * Tests a glob-style pattern against a hostname.
   *
   * @param {string} pattern - Pattern where `*` is a wildcard.
   * @param {string} current - Value to test.
   * @returns {boolean} True when `current` matches `pattern`.
   */
  function matchPageOrDomain(pattern, current) {
    if (pattern === '*' || pattern === current) return true;

    // Escape each literal segment so "." etc. are not regex metacharacters,
    // then join the segments with ".*" so that EVERY "*" acts as a wildcard.
    const source = String(pattern)
      .split('*')
      .map(segment => segment.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'))
      .join('.*');

    return new RegExp(`^${source}$`, 'i').test(current);
  }

  /**
   * Merges the per-page overrides (if any) over the domain-level rules and
   * drops the `pages` map itself so it is not rendered as content.
   *
   * @param {Object} domainData - Rules of the matched domain.
   * @param {string} currentPage - Current pathname.
   * @returns {Object} A new object holding the rules to render.
   */
  function resolvePageRules(domainData, currentPage) {
    const pages = domainData.pages;
    const hasPageRules =
      pages !== null &&
      typeof pages === 'object' &&
      Object.prototype.hasOwnProperty.call(pages, currentPage);

    const resolved = Object.assign({}, domainData, hasPageRules ? pages[currentPage] : undefined);
    delete resolved.pages;
    return resolved;
  }

  /**
   * Returns the rules of the first domain pattern that matches the current
   * hostname, with page-specific overrides applied.
   *
   * @param {Object} rulesObj - Rule map keyed by domain pattern.
   * @param {string} currentDomain - Current hostname.
   * @param {string} currentPage - Current pathname.
   * @returns {Object} The rules to render, or `{}` when nothing matches.
   */
  function scrapeDataByRules(rulesObj, currentDomain, currentPage) {
    const ruleMap = rulesObj !== null && typeof rulesObj === 'object' ? rulesObj : {};

    for (const domainPattern of Object.keys(ruleMap)) {
      if (!matchPageOrDomain(domainPattern, currentDomain)) continue;

      const domainData = ruleMap[domainPattern];
      if (domainData === null || typeof domainData !== 'object') continue;

      // Check if the domain has subdomain-specific rules.
      // NOTE(review): this branch looks unreachable for real hostnames - a
      // hostname that matched `domainPattern` above cannot also end with
      // ".<domainPattern>". Left as-is pending a decision on whether
      // subdomains should inherit their parent domain's rules.
      if (currentDomain !== domainPattern && currentDomain.endsWith(`.${domainPattern}`)) {
        const subdomain = currentDomain.slice(0, currentDomain.indexOf(`.${domainPattern}`));
        if (subdomain in domainData) {
          return domainData[subdomain];
        }
      }

      return resolvePageRules(domainData, currentPage);
    }

    return {};
  }

  /**
   * Fetches a JSON rules document and passes the rules resolved for the
   * current page to `callback`. On a network, HTTP or parse failure the error
   * is logged and `callback` receives `{}`.
   *
   * @param {string} externalRulesURL - URL of the JSON rules document.
   * @param {string} currentDomain - Current hostname.
   * @param {string} currentPage - Current pathname.
   * @param {function(Object): void} callback - Receives the resolved rules.
   */
  function loadExternalRules(externalRulesURL, currentDomain, currentPage, callback) {
    fetch(externalRulesURL)
      .then(response => {
        if (!response.ok) {
          throw new Error(`Unexpected HTTP status ${response.status} for ${externalRulesURL}`);
        }
        return response.json();
      })
      .then(data => scrapeDataByRules((data && data.rules) || {}, currentDomain, currentPage))
      .catch(err => {
        console.error('Error loading external rules:', err);
        return {};
      })
      // The callback runs AFTER the load-error handler so that a failure while
      // rendering is neither reported as a load error nor triggers a second
      // callback invocation.
      .then(callback)
      .catch(err => {
        console.error('Error applying rules:', err);
      });
  }

  /**
   * Replaces the body content with the DOM built from `pageRules`; warns and
   * leaves the page untouched when there are no rules.
   *
   * @param {Object} pageRules - Rules resolved for the current page.
   */
  function handleRules(pageRules) {
    if (Object.keys(pageRules).length === 0) {
      console.warn('No rules found for the current domain and page.');
      return;
    }

    // Build everything off-document first, then swap it in at once.
    const fragment = document.createDocumentFragment();
    createDOMFromScrapedData(pageRules, fragment, keepChildren);
    document.body.innerHTML = '';
    document.body.appendChild(fragment);
  }

  /** Resolves the rules (inline object or external URL) and renders them. */
  function runScraping() {
    const currentDomain = window.location.hostname;
    const currentPage = window.location.pathname;

    if (typeof rules === 'string') {
      // A string is treated as the URL of an external rules document.
      loadExternalRules(rules, currentDomain, currentPage, handleRules);
      return;
    }

    handleRules(scrapeDataByRules(rules, currentDomain, currentPage));
  }

  runScraping();
}