帖子导出工具

导出帖子内容到数据库

目前为 2022-07-19 提交的版本。查看 最新版本

  1. // ==UserScript==
  2. // @name:zh-CN 帖子导出工具
  3. // @name Posts_Dumper
  4. // @namespace https://blog.chrxw.com
  5. // @version 1.2
  6. // @description:zh-CN 导出帖子内容到数据库
  7. // @description 导出帖子内容到数据库
  8. // @author Chr_
  9. // @match https://keylol.com/*
  10. // @match https://dev.keylol.com/*
  11. // @connect 127.0.0.1
  12. // @connect httpbin.org
  13. // @license AGPL-3.0
  14. // @icon https://blog.chrxw.com/favicon.ico
  15. // @grant GM_setValue
  16. // @grant GM_getValue
  17. // @grant GM_deleteValue
  18. // @grant GM_xmlhttpRequest
  19. // @grant GM_addStyle
  20. // ==/UserScript==
  21.  
  22. setTimeout(async () => {
  23. 'use strict';
  24.  
  // Address of the local backend; every API URL in this script is built from it.
  const port = 8000;
  const host = '127.0.0.1';

  // Captures the digits that follow either "t" or "tid=" in a thread URL.
  const matchTid = /(?:t|tid=)(\d+)/;

  // Thread-list table of the current page, or null when the page has none.
  const treadList = document.querySelector("#threadlisttableid");
  31.  
  32. if (treadList !== null) {//获取帖子列表
  33.  
  /**
   * Build a panel button.
   * @param {string} name - Caption shown on the button.
   * @param {Function} foo - Handler registered for the button's click event.
   * @returns {HTMLButtonElement} The new, not yet attached button.
   */
  function genBtn(name, foo) {
    const button = document.createElement('button');
    Object.assign(button, { textContent: name, className: 'pd_btn' });
    button.addEventListener('click', foo);
    return button;
  }
  /**
   * Build a div with the given class.
   * @param {string} [cls] - Class name; null/undefined falls back to 'pd_div'.
   * @returns {HTMLDivElement} The new, not yet attached div.
   */
  function genDiv(cls) {
    const box = document.createElement('div');
    // Only null/undefined trigger the fallback; an empty string is kept as-is.
    box.className = cls == null ? 'pd_div' : cls;
    return box;
  }
  /**
   * Build a span holding plain text.
   * @param {string} text - Text assigned to the span's textContent.
   * @returns {HTMLSpanElement} The new, not yet attached span.
   */
  function genSpan(text) {
    const label = document.createElement('span');
    label.textContent = text;
    return label;
  }
  /**
   * Build a horizontal rule used as a separator inside the panel.
   * @returns {HTMLHRElement} The new, not yet attached <hr>.
   */
  function genHr() {
    return document.createElement('hr');
  }
  /**
   * Build an empty iframe (no src set).
   * @returns {HTMLIFrameElement} The new, not yet attached iframe.
   */
  function genIframe() {
    return document.createElement('iframe');
  }
  59.  
  // Floating control panel shown on thread-list pages.
  const panel = genDiv('pd_panel');

  // Each post is loaded into this iframe; the script instance running inside
  // it scrapes the post and reports back through a GM value keyed by the tid.
  const tempIframe = genIframe();

  const status = await testBackend();
  const statusTips = genSpan(status ? '连接成功' : '连接失败');

  // Selectors for the thread title links: every link, and only those that
  // freshPostList() marked as not yet stored.
  const ALL_POSTS = "th.common>a.xst,th.new>a.xst,th.lock>a.xst";
  const NEW_POSTS = "th.common>a.pd_not_added.xst,th.new>a.pd_not_added.xst,th.lock>a.pd_not_added.xst";

  /**
   * Scrape every thread link matched by `selector`, one after another, then
   * refresh the stored/not-stored markers.
   * @param {string} selector - CSS selector evaluated inside the thread list.
   * @returns {Promise<void>}
   */
  const grubPosts = async (selector) => {
    const postLists = treadList.querySelectorAll(selector);
    const total = postLists.length;
    if (total > 0) {
      statusTips.textContent = `开始抓取,共 ${total} 篇`;

      for (let i = 0; i < total; i++) {
        const postTag = postLists[i];
        const tid = grubTid(postTag.href);
        // Drop any flag left over from an earlier run that timed out, otherwise
        // the wait below would return immediately with the stale result.
        GM_deleteValue(tid);
        tempIframe.src = genUrl(tid) + '?utm=114514';
        const result = await waitUnitlDone(tid);
        // i is 0-based; show the number of posts finished so far.
        statusTips.textContent = `进度 ${i + 1}/${total} TID ${tid} ${result}`;
        GM_deleteValue(tid);
        postTag.classList.remove('pd_not_added', 'pd_added');
        postTag.classList.add('pd_done');
      }
      statusTips.textContent = '抓取结束';
    } else {
      statusTips.textContent = '没有可以抓取的帖子';
    }
    await freshPostList();
  };

  const btnGrubNew = genBtn('抓取尚未记录的', () => grubPosts(NEW_POSTS));
  const btnGrubAll = genBtn('抓取所有', () => grubPosts(ALL_POSTS));

  const btnExportExcel = genBtn('导出Excel', () => {
    window.open(`http://${host}:${port}/api/excel`);
  });

  const btnExportBBCode = genBtn('导出BBCode', () => {
    window.open(`http://${host}:${port}/api/bbcode`);
  });

  const btnResetDB = genBtn('重置数据库(删除所有数据)', async () => {
    if (confirm('真的要删除所有数据吗?')) {
      await deleteAllData();
    }
  });

  const btnControl = genBtn('打开管理面板', () => {
    window.open(`http://${host}:${port}/index.html`);
  });

  panel.appendChild(statusTips);
  panel.appendChild(genHr());

  if (status) {
    const controls = [
      btnGrubNew, btnGrubAll, genHr(),
      btnExportExcel, btnExportBBCode, genHr(),
      btnResetDB, genHr(),
      btnControl, genHr(),
      tempIframe,
    ];
    for (const control of controls) {
      panel.appendChild(control);
    }

    // The "autopbn" element is not guaranteed to exist; without the optional
    // call a missing element would throw here and the panel would never be
    // attached to the page.
    document.getElementById('autopbn')?.addEventListener('click', () => {
      // Re-mark the list half a second after the click.
      setTimeout(() => {
        freshPostList().catch((err) => console.error(err));
      }, 500);
    });

    // Mark which threads are already stored.
    await freshPostList();
  } else {
    panel.appendChild(genSpan('请检查软件是否运行以及端口是否被占用'));

    // Nothing is usable without the backend: hide the panel after 3 seconds.
    setTimeout(() => {
      panel.style.display = 'none';
    }, 3000);
  }

  document.body.appendChild(panel);
  169.  
  170. } else if (ifNeedGrub()) {//抓取帖子内容
  // Scrape the post shown on this page and upload it to the backend. This
  // branch runs when the URL carries the scrape marker (see ifNeedGrub).
  const tid = grubTid(location.href);
  const post_url = genUrl(tid);
  const post_title = document.getElementById('thread_subject')?.textContent ?? '获取失败';
  const eleAuthor = document.querySelector('div.pi>div.authi>a.xw1');
  const author_nick = eleAuthor?.textContent ?? '获取失败';
  const author_uid = eleAuthor?.href.replace('https://keylol.com/suid-', '') ?? '获取失败';
  // The first four characters of the date element's text are dropped.
  const post_date = document.querySelector('div.pti>div.authi>em[id]')?.textContent.substring(4) ?? '获取失败';
  // First table cell whose id starts with "postmessage" (closing bracket restored).
  const eleContent = document.querySelector('td[id^=postmessage]');
  const nodes = eleContent?.childNodes ?? [];
  const contentLines = [];

  // Elements whose content never contributes to the exported text.
  const skippedTags = new Set(['I', 'A', 'IFRAME', 'STYLE', 'IMG']);

  /**
   * Depth-first walk that appends the trimmed text of every text node under
   * `node` to `contentLines`.
   * @param {Node} node - Node to collect text from.
   */
  function node2text(node) {
    if (skippedTags.has(node.nodeName)) {
      return;
    }
    if (node.nodeName === 'DIV' && node.classList.contains('aimg_tip')) {
      return;
    }

    if (node.nodeType === Node.TEXT_NODE) {
      const raw = node.textContent?.trim();
      // Ignore fragments of two characters or fewer and the repost notice.
      if (raw && raw.length > 2 && !raw.includes('未经许可,严禁转载')) {
        contentLines.push(raw);
      }
      return;
    }

    for (const child of node.childNodes ?? []) {
      node2text(child);
    }
  }

  for (const node of nodes) {
    node2text(node);
  }
  const content = contentLines.join('\n');

  // Collect the distinct Steam app ids linked anywhere on the page.
  const steamLinks = document.querySelectorAll("a[href^='https://store.steampowered.com/'],a[href^='https://steamdb.info/app/']");
  const grubAppid = /app\/(\d+)\/?/;
  const steamAppIDs = new Set();
  for (const ele of steamLinks) {
    const href = ele.href;
    if (href) {
      const appID = Number.parseInt(grubAppid.exec(href)?.[1] ?? '0', 10);
      if (appID > 0) {
        steamAppIDs.add(appID);
      }
    }
  }
  const game_list = [...steamAppIDs].join('|');
  const data = { tid, post_url, post_title, author_nick, author_uid, post_date, content, game_list };
  console.log(data);
  try {
    // Write the flag only after the upload has settled: the list page polls
    // this value and loads the next post into the iframe as soon as it is set.
    await savePostData(data);
    GM_setValue(tid, '抓取完成');
  } catch (error) {
    // Store a string so the list page can show it in its status line.
    GM_setValue(tid, String(error));
  }
  238. }
  239.  
  /**
   * Mark every thread link in the list as stored or not stored.
   *
   * Fetches the stored ids via getPostIds(), then for each title link removes
   * the three marker classes and sets exactly one of 'pd_added' /
   * 'pd_not_added' together with a matching tooltip.
   * NOTE(review): the lookup uses the string returned by grubTid(); confirm the
   * backend returns ids of the same type.
   * @returns {Promise<void>} Rejects when getPostIds() rejects.
   */
  async function freshPostList() {
    const storedTids = await getPostIds();
    const links = treadList.querySelectorAll("th.common>a.xst,th.new>a.xst,th.lock>a.xst");
    for (const link of links) {
      const isStored = storedTids.has(grubTid(link.href));
      link.classList.remove('pd_not_added', 'pd_added', 'pd_done');
      link.classList.add(isStored ? 'pd_added' : 'pd_not_added');
      link.title = isStored ? '【已抓取】' : '【未抓取】';
    }
  }
  260.  
  /**
   * Decide whether the current page is a post that should be scraped.
   *
   * True only when the query string ends with the scrape marker 'utm=114514'
   * AND the URL contains a thread id. The previous `test(...) >= 0` comparison
   * was always true because `false >= 0` is true.
   * @returns {boolean}
   */
  function ifNeedGrub() {
    return location.search.endsWith('utm=114514') && matchTid.test(location.href);
  }
  269.  
  /**
   * Extract the thread id from a URL.
   * @param {string} url - URL containing "t<digits>" or "tid=<digits>".
   * @returns {?string} The digits as a string, or null when the URL has none.
   */
  function grubTid(url) {
    const hit = matchTid.exec(url);
    if (hit !== null && hit[1] != null) {
      return hit[1];
    }
    // Same pattern again: yields null for a URL without a thread id.
    return url.match(matchTid);
  }
  274.  
  /**
   * Build the first-page URL of a thread.
   * @param {string|number} tid - Thread id.
   * @returns {string} URL of the form https://keylol.com/t<tid>-1-1.
   */
  function genUrl(tid) {
    const site = 'https://keylol.com';
    return `${site}/t${tid}-1-1`;
  }
  279.  
  280. //-----------------------------------
  /**
   * Check whether the backend answers its test endpoint.
   * @returns {Promise<boolean>} true only when the reply's `code` is 666;
   *   false for any other reply or when the request fails. Never rejects on a
   *   failed request.
   */
  async function testBackend() {
    const probe = $http.get(`http://${host}:${port}/api/test`);
    try {
      const reply = await probe;
      return reply?.code === 666;
    } catch {
      return false;
    }
  }
  /**
   * Wait until a result has been stored under `tid` via GM_setValue, polling
   * GM_getValue at a fixed interval.
   * @param {string} tid - Key to poll.
   * @param {number} [timeoutMs=10000] - Give up after this many milliseconds.
   * @param {number} [pollMs=100] - Delay between two polls.
   * @returns {Promise<*>} The first truthy stored value, or the string
   *   '操作超时' when the timeout elapses first. Never rejects.
   */
  function waitUnitlDone(tid, timeoutMs = 10000, pollMs = 100) {
    return new Promise((resolve) => {
      let poller = null;

      const deadline = setTimeout(() => {
        clearInterval(poller);
        resolve('操作超时');
      }, timeoutMs);

      poller = setInterval(() => {
        const fin = GM_getValue(tid);
        if (fin) {
          clearInterval(poller);
          // The deadline is a timeout, so cancel it with clearTimeout.
          clearTimeout(deadline);
          resolve(fin);
        }
      }, pollMs);
    });
  }
  /**
   * Fetch the ids of all posts the backend already holds.
   * @returns {Promise<Set>} Set built from the reply's `data` array. When the
   *   reply's `code` is not 0 the message is logged with console.error and an
   *   empty Set is returned. Rejects when the request itself fails.
   */
  async function getPostIds() {
    const reply = await $http.get(`http://${host}:${port}/api/posts/ids`);
    if (reply?.code !== 0) {
      console.error(reply?.msg ?? '消息为空');
      return new Set();
    }
    return new Set(reply.data ?? []);
  }
  /**
   * Upload one scraped post to the backend.
   *
   * The success flag now follows the convention getPostIds() uses, where a
   * reply `code` of 0 means success. Previously it was inverted: `true` for a
   * non-zero code and `false` for both success and network failure.
   * @param {Object} data - Post record; sent as a JSON string.
   * @returns {Promise<boolean>} true when the reply's `code` is 0, false for
   *   any other reply or when the request fails. Request failures are logged,
   *   not thrown.
   */
  async function savePostData(data) {
    const payload = JSON.stringify(data);
    try {
      const response = await $http.post(`http://${host}:${port}/api/post`, payload);
      console.log(response);
      return response?.code === 0;
    } catch (reason) {
      console.log(reason);
      return false;
    }
  }
  /**
   * Ask the backend to delete every stored post.
   *
   * The success flag now follows the convention getPostIds() uses, where a
   * reply `code` of 0 means success. Previously it was inverted.
   * @returns {Promise<boolean>} true when the reply's `code` is 0, false for
   *   any other reply or when the request fails. Request failures are logged,
   *   not thrown.
   */
  async function deleteAllData() {
    try {
      const response = await $http.delete(`http://${host}:${port}/api/posts`);
      console.log(response);
      return response?.code === 0;
    } catch (reason) {
      console.log(reason);
      return false;
    }
  }
  362. }, 500);
  363. //-----------------------------------
  /**
   * Minimal promise wrapper around GM_xmlhttpRequest.
   *
   * Every method resolves with the parsed payload on a 2xx reply and rejects
   * on timeout, transport error, or any other status. Caller-supplied option
   * objects are never modified.
   */
  class Request {
    /**
     * @param {number} [timeout=3000] - Per-request timeout in milliseconds.
     */
    constructor(timeout = 3000) {
      this.timeout = timeout;
    }

    /** GET, resolving with the `response` field (responseType 'json'). */
    get(url, opt = {}) {
      return this.#baseRequest(url, 'GET', opt, 'json');
    }

    /** GET, resolving with the `responseXML` field (responseType ''). */
    getHtml(url, opt = {}) {
      return this.#baseRequest(url, 'GET', opt, '');
    }

    /** GET, resolving with the `responseText` field. */
    getText(url, opt = {}) {
      return this.#baseRequest(url, 'GET', opt, 'text');
    }

    /**
     * POST `data` with a JSON content type.
     * @param {string} url
     * @param {string} data - Request body, already serialized.
     * @param {Object} [opt] - Extra GM_xmlhttpRequest options; its headers are
     *   kept, with Content-Type forced to application/json.
     */
    post(url, data, opt = {}) {
      const headers = { ...opt.headers, "Content-Type": "application/json" };
      return this.#baseRequest(url, 'POST', { ...opt, data, headers }, 'json');
    }

    /** DELETE, resolving with the `response` field (responseType 'json'). */
    delete(url, opt = {}) {
      return this.#baseRequest(url, 'DELETE', opt, 'json');
    }

    /**
     * Issue the request and adapt the callbacks to a promise.
     * @param {string} url
     * @param {string} [method='GET']
     * @param {Object} [opt={}] - Extra options; copied, not mutated.
     * @param {string} [responseType='json'] - 'json', 'text', or anything else
     *   to receive responseXML.
     * @returns {Promise<*>}
     */
    #baseRequest(url, method = 'GET', opt = {}, responseType = 'json') {
      return new Promise((resolve, reject) => {
        GM_xmlhttpRequest({
          ...opt,
          url,
          method,
          responseType,
          timeout: this.timeout,
          ontimeout: reject,
          onerror: reject,
          onload: ({ readyState, status, response, responseXML, responseText }) => {
            const ok = readyState === 4 && status >= 200 && status < 300;
            if (!ok) {
              console.error('网络错误');
              console.log(readyState);
              console.log(status);
              console.log(response);
              reject(new Error('解析出错'));
              return;
            }
            if (responseType === 'json') {
              resolve(response);
            } else if (responseType === 'text') {
              resolve(responseText);
            } else {
              resolve(responseXML);
            }
          },
        });
      });
    }
  }
  const $http = new Request();
  416.  
  // Stylesheet for the control panel and for the status markers that are
  // prepended to thread links (stored / not stored / scraped this session).
  const panelCss = `
.pd_div {
vertical-align: middle;
}
.pd_panel {
background: rgba(58, 58, 58, 0.5);
position: fixed;
top: 50%;
right: 0px;
text-align: center;
transform: translate(0px, -50%);
z-index: 100;
padding: 5px;
border-radius: 5px 0 0 5px;
}
.pd_panel > *:not(:last-child) {
margin-right: 5px;
}
.pd_panel > hr {
margin: 5px 0 5px;
}
.pd_panel > span {
color: #fff;
}
.pd_panel > iframe {
width: 200px;
height: 200px;
}
.pd_added::before{
content: "✅";
}
.pd_not_added::before{
content: "❌";
}

.pd_done::before{
content: "🤔";
}
`;
  GM_addStyle(panelCss);