信息抓取

文章、笔记信息抓取

当前为 2024-12-30 提交的版本,查看 最新版本

您需要先安装一个扩展,例如 篡改猴Greasemonkey暴力猴,之后才能安装此脚本。

You will need to install an extension such as Tampermonkey to install this script.

您需要先安装一个扩展,例如 篡改猴暴力猴,之后才能安装此脚本。

您需要先安装一个扩展,例如 篡改猴Userscripts ,之后才能安装此脚本。

您需要先安装一款用户脚本管理器扩展,例如 Tampermonkey,才能安装此脚本。

您需要先安装用户脚本管理器扩展后才能安装此脚本。

(我已经安装了用户脚本管理器,让我安装!)

您需要先安装一款用户样式管理器扩展,比如 Stylus,才能安装此样式。

您需要先安装一款用户样式管理器扩展,比如 Stylus,才能安装此样式。

您需要先安装一款用户样式管理器扩展,比如 Stylus,才能安装此样式。

您需要先安装一款用户样式管理器扩展后才能安装此样式。

您需要先安装一款用户样式管理器扩展后才能安装此样式。

您需要先安装一款用户样式管理器扩展后才能安装此样式。

(我已经安装了用户样式管理器,让我安装!)

// ==UserScript==
// @name         信息抓取
// @namespace    http://tampermonkey.net/
// @version      0.0.1
// @description  文章、笔记信息抓取
// @author       dabennn
// @match        https://*.xiaohongshu.com/explore/*
// @match        https://*.douyin.com/video/*
// @match        https://baijiahao.baidu.com/*
// @match        https://mp.weixin.qq.com/*
// @match        https://*.sohu.com/a/*
// @match        https://*.toutiao.com/article/*
// @match        https://*.toutiao.com/w/*
// @icon         https://www.google.com/s2/favicons?sz=64&domain=github.com
// @license      MIT
// @grant        GM_notification
// @grant        unsafeWindow
// ==/UserScript==

(function() {
  'use strict';
  function formatTimestamp(timestamp) {
    const date = new Date(timestamp);
    const year = date.getFullYear();
    const month = String(date.getMonth() + 1).padStart(2, '0');
    const day = String(date.getDate()).padStart(2, '0');
    const hours = String(date.getHours()).padStart(2, '0');
    const minutes = String(date.getMinutes()).padStart(2, '0');
    const seconds = String(date.getSeconds()).padStart(2, '0');
    return `${year}-${month}-${day} ${hours}:${minutes}:${seconds}`;
  }
  function getElementText(domSelector) {
    const el = document.querySelector(domSelector);
    return el ? el.textContent.trim() : '';
  }
  function getElementTextByXPath(xpath) {
    const el = document.evaluate(xpath, document, null, XPathResult.FIRST_ORDERED_NODE_TYPE, null).singleNodeValue;
    return el ? el.textContent.trim() : '';
  }
  function createInfoModal(infoText, copyText) {
    // 创建一个用于显示信息的div元素作为浮窗
    const infoDiv = document.createElement('div');
    infoDiv.style.position = 'fixed';
    infoDiv.style.top = '0';
    infoDiv.style.right = '0';
    infoDiv.style.backgroundColor = 'rgba(255, 255, 255, 0.8)';
    infoDiv.style.padding = '10px';
    infoDiv.style.border = '1px solid gray';
    infoDiv.style.zIndex = '9999';

    // 创建关闭按钮元素
    const closeButton = document.createElement('span');
    closeButton.id = '__close_btn__';
    closeButton.textContent = '×';
    closeButton.style.cursor = 'pointer';
    closeButton.style.float = 'right';
    closeButton.style.fontSize = '18px';
    closeButton.style.color = 'red';

    // 创建复制按钮元素
    const copyButton = document.createElement('span');
    copyButton.id = '__copy__';
    copyButton.textContent = '复制';
    copyButton.style.cursor = 'pointer';
    copyButton.style.float = 'right';
    copyButton.style.marginRight = '5px';
    copyButton.style.fontSize = '18px';
    copyButton.style.color = 'blue';

    // 先将关闭按钮添加到浮窗中
    infoDiv.appendChild(closeButton);
    // 再将复制按钮添加到浮窗中
    infoDiv.appendChild(copyButton);

    // 将信息拼接成字符串并设置到浮窗的innerHTML中
    infoDiv.innerHTML += infoText;

    // 将浮窗添加到页面的body元素中
    document.body.appendChild(infoDiv);

    document.querySelector('#__close_btn__').addEventListener('click', function() {
      infoDiv.parentNode.removeChild(infoDiv);
    });

    document.querySelector('#__copy__').onclick = function() {
      navigator.clipboard.writeText(copyText).then(function() {
        GM_notification('已复制到剪贴板');
      }).catch(function(err) {
        GM_notification('复制失败');
        console.error('复制失败:', err);
      });
    };
  }
  function createFloatButton() {
    const button = document.createElement('div')
    button.textContent = '信息抓取'
    button.id = '__float_btn__'
    button.style.position = 'fixed'
    button.style.top = '150px'
    button.style.right = '0'
    button.style.zIndex = '9999'
    button.style.backgroundColor = 'rgba(255, 100, 100, 0.9)'
    button.style.color = '#fff'
    button.style.borderRadius = '4px'
    button.style.padding = '10px'
    button.style.cursor = 'pointer'

    // 关闭按钮
    const closeButton = document.createElement('span')
    closeButton.textContent = '×';
    closeButton.style.position = 'absolute'
    closeButton.style.top = '-3px'
    closeButton.style.right = '2px'
    closeButton.style.fontSize = '16px'
    closeButton.style.lineHeight = '1'

    button.appendChild(closeButton)
    closeButton.addEventListener('click', (e) => {
      e.stopPropagation()
      button.parentNode.removeChild(button)
    })
    document.body.appendChild(button)
  }
  const formatTextInfo = (texts) => texts.join('<br>').replace(/undefined/g, '未获取到')
  const formatNumInfo = (texts) => texts.join(' ').replace(/undefined/g, '0')
  const getCopyTime = (time) => time.replace(/[年月]/g, '/').replace(/日/, '')
  const getInfoTexts = () => {
    let textInfo = ''
    let copyInfo = ''
    if (window.location.href.includes('xiaohongshu.com/explore')) {
      const state = unsafeWindow.__INITIAL_STATE__
      const title = getElementText('.note-content .title')
      const author = getElementText('.username')
      let time = ''
      try {
        time = formatTimestamp(state.note.noteDetailMap[state.note.firstNoteId.value].note.lastUpdateTime)
      } catch (e) {
        console.error(e)
      }
      const wordCount = getElementText('.note-content').length
      const likeNum = getElementText('.interact-container .like-wrapper .count')
      const commentNum = getElementText('.interact-container .chat-wrapper .count')
      const collectNum = getElementText('.interact-container .collect-wrapper .count')
      textInfo = formatTextInfo([
        `标题:${title}`,
        `作者:${author}`,
        `发布时间:${time}`,
        `字数:${wordCount}`,
        `点赞数:${likeNum}`,
        `评论数:${commentNum}`,
        `收藏数:${collectNum}`,
      ])
      copyInfo = [
        getCopyTime(time),
        '小红书',
        author,
        title,
        window.location.href,
        formatNumInfo([`点赞${likeNum}`, `评论${commentNum}`, `收藏${collectNum}`]),
        wordCount
      ].join('\t')
    } else if (window.location.href.includes('douyin.com/video')) {
      const title= getElementText('h1 span span + span span span span span span')
      const author= getElementTextByXPath('//*[@id="douyin-right-container"]/div[2]/div/div/div[1]/div[4]/div/div[1]/div[2]/a/div/span/span/span/span/span/span')
      const time= getElementTextByXPath('//*[@id="douyin-right-container"]/div[2]/div/div/div[1]/div[3]/div/div[2]/div[2]/span/text()[2]')
      const duration= getElementText('.time-duration')
      const likeNum= getElementTextByXPath('//*[@id="douyin-right-container"]/div[2]/div/div/div[1]/div[3]/div/div[2]/div[1]/div[1]/span')
      const commentNum= getElementTextByXPath('//*[@id="douyin-right-container"]/div[2]/div/div/div[1]/div[3]/div/div[2]/div[1]/div[2]/span')
      const collectNum= getElementTextByXPath('//*[@id="douyin-right-container"]/div[2]/div/div/div[1]/div[3]/div/div[2]/div[1]/div[3]/span')
      const shareNum= getElementTextByXPath('//*[@id="douyin-right-container"]/div[2]/div/div/div[1]/div[3]/div/div[2]/div[1]/div[4]/span')
      textInfo = formatTextInfo([
        `标题:${title}`,
        `作者:${author}`,
        `发布时间:${time}`,
        `时长:${duration}`,
        `点赞数:${likeNum}`,
        `评论数:${commentNum}`,
        `收藏数:${collectNum}`,
        `分享数:${shareNum}`,
      ])
      copyInfo = [
        getCopyTime(time),
        '抖音视频',
        author,
        title,
        window.location.href,
        formatNumInfo([`点赞${likeNum}`, `评论${commentNum}`, `收藏${collectNum}`, `分享${shareNum}`]),
        duration
      ].join('\t')
    } else if (window.location.href.includes('sohu.com/a')) {
      const title = getElementText('h1')
      const author = getElementText('#user-info h4 a')
      const time = getElementText('#news-time')
      const wordCount = getElementText('mp-editor').length
      const readNum = getElementText('.read-num em')
      const likeNum = getElementText('.like-c .count')
      const commentNum = getElementText('.comment-count')
      const collectNum = getElementText('.collection-c .count')
      const shareNum = getElementText('.share-c .count')
      textInfo = formatTextInfo([
        `标题:${title}`,
        `作者:${author}`,
        `发布时间:${time}`,
        `字数:${wordCount}`,
        `阅读数:${readNum}`,
        `点赞数:${likeNum}`,
        `评论数:${commentNum}`,
        `收藏数:${collectNum}`,
        `分享数:${shareNum}`,
      ])
      copyInfo = [
        getCopyTime(time),
        '搜狐',
        author,
        title,
        window.location.href,
        formatNumInfo([`阅读${readNum}`, `点赞${likeNum}`, `评论${commentNum}`, `收藏${collectNum}`, `分享${shareNum}`]),
        wordCount
      ].join('\t')
    } else if (window.location.href.includes('toutiao.com/article')) {
      const title= getElementText('h1')
      const author= getElementText('.article-meta .name')
      const time= getElementText('.article-meta span')
      const wordCount= getElementText('.tt-article-content').length
      const likeNum= getElementText('.detail-like span')
      const commentNum= getElementText('.detail-interaction-comment span')
      const collectNum= getElementText('.detail-interaction-collect span')
      textInfo = formatTextInfo([
        `标题:${title}`,
        `作者:${author}`,
        `发布时间:${time}`,
        `字数:${wordCount}`,
        `点赞数:${likeNum}`,
        `评论数:${commentNum}`,
        `收藏数:${collectNum}`,
      ])
      copyInfo = [
        getCopyTime(time),
        '今日头条',
        author,
        title,
        window.location.href,
        formatNumInfo([`点赞${likeNum}`, `评论${commentNum}`, `收藏${collectNum}`]),
        wordCount
      ].join('\t')
    } else if (window.location.href.includes('toutiao.com/w')) {
      const title= getElementText('h1')
      const author= getElementText('.desc .name')
      const time= getElementText('.abstract .time')
      const wordCount= getElementText('article').length
      const likeNum= getElementText('.detail-like span')
      const commentNum= getElementText('.detail-interaction-comment span')
      const collectNum= getElementText('.detail-interaction-collect span')
      textInfo = formatTextInfo([
        `标题:${title}`,
        `作者:${author}`,
        `发布时间:${time}`,
        `字数:${wordCount}`,
        `点赞数:${likeNum}`,
        `评论数:${commentNum}`,
        `收藏数:${collectNum}`,
      ])
      copyInfo = [
        getCopyTime(time),
        '今日头条',
        author,
        title,
        window.location.href,
        formatNumInfo([`点赞${likeNum}`, `评论${commentNum}`, `收藏${collectNum}`]),
        wordCount
      ].join('\t')
    } else if (window.location.href.includes('baijiahao.baidu.com')) {
      const title = getElementText('#header div')
      const author = getElementText('#header [data-testid=author-name]')
      const time = getElementText('#header [data-testid=updatetime]')
      const wordCount = getElementText('[data-testid=article]').length
      const likeNum = getElementText('[data-testid=like-btn] .interact-desc')
      const commentNum = getElementText('[data-testid=comment-btn] .interact-desc')
      const collectNum = getElementText('[data-testid=favor-btn] .interact-desc')
      const shareNum = getElementText('[data-testid=share-btn] .interact-desc')
      textInfo = formatTextInfo([
        `标题:${title}`,
        `作者:${author}`,
        `发布时间:${time}`,
        `字数:${wordCount}`,
        `点赞数:${likeNum}`,
        `评论数:${commentNum}`,
        `收藏数:${collectNum}`,
        `分享数:${shareNum}`,
      ])
      copyInfo = [
        getCopyTime(time),
        '百度',
        author,
        title,
        window.location.href,
        formatNumInfo([`点赞${likeNum}`, `评论${commentNum}`, `收藏${collectNum}`, `分享${shareNum}`]),
        wordCount
      ].join('\t')
    } else if (window.location.href.includes('mp.weixin.qq.com')) {
      const title = getElementText('h1')
      const author = getElementText('#js_name')
      const time = getElementText('#publish_time')
      const wordCount = getElementText('#js_content').length
      textInfo = formatTextInfo([
        `标题:${title}`,
        `作者:${author}`,
        `发布时间:${time}`,
        `字数:${wordCount}`,
      ])
      copyInfo = [
        getCopyTime(time),
        '公众号',
        author,
        title,
        window.location.href,
        formatNumInfo([`点赞`, `评论`, `收藏`, `分享`]),
        wordCount
      ].join('\t')
    }
    return {
      textInfo,
      copyInfo
    }
  }

  window.addEventListener('load', function() {
    createFloatButton()
    document.querySelector('#__float_btn__').addEventListener('click', () => {
      const { textInfo, copyInfo } = getInfoTexts()
      createInfoModal(textInfo, copyInfo);
    })
  });
})();