帖子导出工具

导出帖子内容到数据库

当前为 2022-07-18 提交的版本,查看 最新版本

  1. // ==UserScript==
  2. // @name:zh-CN 帖子导出工具
  3. // @name Posts_Dumper
  4. // @namespace https://blog.chrxw.com
  5. // @version 1.0
  6. // @description:zh-CN 导出帖子内容到数据库
  7. // @description 导出帖子内容到数据库
  8. // @author Chr_
  9. // @match https://keylol.com/*
  10. // @match https://dev.keylol.com/*
  11. // @connect 127.0.0.1
  12. // @connect httpbin.org
  13. // @license AGPL-3.0
  14. // @icon https://blog.chrxw.com/favicon.ico
  15. // @grant GM_setValue
  16. // @grant GM_getValue
  17. // @grant GM_deleteValue
  18. // @grant GM_xmlhttpRequest
  19. // @grant GM_addStyle
  20. // ==/UserScript==
  21.  
  22. setTimeout(async () => {
  23. 'use strict';
  24.  
  25. const port = 8000;
  26. const host = '127.0.0.1';
  27.  
  28. const matchTid = new RegExp(/(?:t|tid=)(\d+)/);
  29.  
  30. const treadList = document.querySelector("#threadlisttableid");
  31.  
  32. if (treadList !== null) {//获取帖子列表
  33.  
  /**
   * Create a panel button and attach its click handler.
   * @param {string} name - Caption assigned to the button's text content.
   * @param {Function} foo - Listener registered for the button's `click` event.
   * @returns {HTMLButtonElement} The new, not yet attached button (class `pd_btn`).
   */
  function genBtn(name, foo) {
    const button = document.createElement('button');
    Object.assign(button, { textContent: name, className: 'pd_btn' });
    button.addEventListener('click', foo);
    return button;
  }
  /**
   * Create a `<div>` with the given class.
   * @param {?string} [cls] - Class name; `null`/`undefined` fall back to `pd_div`.
   * @returns {HTMLDivElement} The new, not yet attached element.
   */
  function genDiv(cls) {
    const container = document.createElement('div');
    const className = cls ?? 'pd_div';
    container.className = className;
    return container;
  }
  /**
   * Create a `<span>` holding plain text.
   * @param {string} text - Value assigned to the span's text content.
   * @returns {HTMLSpanElement} The new, not yet attached element.
   */
  function genSpan(text) {
    return Object.assign(document.createElement('span'), { textContent: text });
  }
  /**
   * Create a horizontal rule used to separate groups of controls in the panel.
   * @returns {HTMLHRElement} The new, not yet attached element.
   */
  function genHr() {
    return document.createElement('hr');
  }
  /**
   * Create the iframe in which thread pages are loaded for scraping.
   * @returns {HTMLIFrameElement} The new, not yet attached element.
   */
  function genIframe() {
    return document.createElement('iframe');
  }
  59.  
  // Floating control panel shown on thread-list pages.
  const panel = genDiv('pd_panel');

  // Threads are scraped by loading them in this iframe with the `utm=114514` marker;
  // the copy of this script running inside the iframe extracts the post and reports
  // its outcome through GM storage, which waitUnitlDone() polls.
  const tempIframe = genIframe();

  // Probe the local backend once; the action buttons are only offered when it answers.
  const status = await testBackend();

  const statusTips = genSpan(status ? '连接成功' : '连接失败');

  /**
   * Scrape every thread link matched by `selector`, one at a time, reporting
   * progress in the status line and refreshing the recorded/unrecorded markers
   * when finished.
   * @param {string} selector - CSS selector, relative to the thread list, of the
   *   thread title anchors to process.
   * @returns {Promise<void>}
   */
  const grubPosts = async (selector) => {
    const postLists = treadList.querySelectorAll(selector);
    const total = postLists.length;
    if (total > 0) {
      statusTips.textContent = `开始抓取,共 ${total} 篇`;

      for (let i = 0; i < total; i++) {
        const postTag = postLists[i];
        const tid = grubTid(postTag.href);
        tempIframe.src = genUrl(tid) + '?utm=114514';
        const result = await waitUnitlDone(tid);
        // i is zero-based; show the count of threads processed so far.
        statusTips.textContent = `进度 ${i + 1}/${total} TID ${tid} ${result}`;
        // Clear the hand-off value so a later run does not pick up a stale result.
        GM_deleteValue(tid);
        postTag.classList.remove('pd_not_added', 'pd_added');
        postTag.classList.add('pd_done');
      }
      statusTips.textContent = '抓取结束';
    } else {
      statusTips.textContent = '没有可以抓取的帖子';
    }
    await freshPostList();
  };

  // Only the threads currently marked as not recorded.
  const btnGrubNew = genBtn('抓取尚未记录的', () => grubPosts("tr>th:nth-child(2)>a.pd_not_added.xst"));

  // Every thread in the list, recorded or not.
  const btnGrubAll = genBtn('抓取所有', () => grubPosts("tr>th:nth-child(2)>a.xst"));

  const btnExportExcel = genBtn('导出Excel', () => {
    window.open(`http://${host}:${port}/api/excel`);
  });

  const btnExportBBCode = genBtn('导出BBCode', () => {
    window.open(`http://${host}:${port}/api/bbcode`);
  });

  const btnResetDB = genBtn('重置数据库(删除所有数据)', async () => {
    if (confirm('真的要删除所有数据吗?')) {
      await deleteAllData();
      // The markers were computed from the old data set; recompute them.
      await freshPostList();
    }
  });

  const btnControl = genBtn('打开管理面板', () => {
    window.open(`http://${host}:${port}/index.html`);
  });

  const panelBody = status
    ? [
      btnGrubNew, btnGrubAll, genHr(),
      btnExportExcel, btnExportBBCode, genHr(),
      btnResetDB, genHr(),
      btnControl, genHr(),
      tempIframe,
    ]
    : [genSpan('请检查软件是否运行以及端口是否被占用')];

  for (const child of [statusTips, genHr(), ...panelBody]) {
    panel.appendChild(child);
  }

  document.body.appendChild(panel);

  if (status) {
    // The element may be missing from the page; without the guard a missing element
    // would throw here and skip the initial marker refresh below.
    document.getElementById('autopbn')?.addEventListener('click', () => {
      // Delay the refresh so it runs after the click has been handled by the page.
      setTimeout(async () => {
        await freshPostList();
      }, 500);
    });

    // Mark which threads are already recorded.
    await freshPostList();
  }
  166. } else if (ifNeedGrub()) {//抓取帖子内容
  // Thread page opened with the scrape marker: extract the first post, upload it,
  // then publish the outcome under the thread id for the list page to pick up.
  const tid = grubTid(location.href);
  const post_url = genUrl(tid);
  const post_title = document.getElementById('thread_subject')?.textContent ?? '获取失败';
  const eleAuthor = document.querySelector('div.pi>div.authi>a.xw1');
  const author_nick = eleAuthor?.textContent ?? '获取失败';
  // Strip the profile-URL prefix so only the part after `suid-` remains.
  const author_uid = eleAuthor?.href.replace('https://keylol.com/suid-', '') ?? '获取失败';
  // Drops the first four characters of the date element's text
  // (presumably a fixed label in front of the timestamp - TODO confirm).
  const post_date = document.querySelector('div.pti>div.authi>em[id]')?.textContent.substring(4) ?? '获取失败';
  const eleContent = document.querySelector('td[id^=postmessage]');
  const nodes = eleContent?.childNodes ?? [];
  const contentLines = [];

  // Elements whose subtree never contributes text to the export.
  const skippedTags = new Set(['I', 'A', 'IFRAME', 'STYLE', 'IMG']);

  /**
   * Depth-first walk that appends the trimmed text of every text node to
   * `contentLines`, skipping excluded elements, fragments of two characters
   * or fewer, and the repost notice.
   * @param {Node} node - Node to collect text from.
   */
  const node2text = (node) => {
    if (skippedTags.has(node.nodeName)) {
      return;
    }

    if (node.nodeType === Node.TEXT_NODE) {
      const raw = node.textContent?.trim();
      if (raw && raw.length > 2 && !raw.includes('未经许可,严禁转载')) {
        contentLines.push(raw);
      }
      return;
    }

    for (const child of node.childNodes ?? []) {
      node2text(child);
    }
  };

  for (const node of nodes) {
    node2text(node);
  }
  const content = contentLines.join('\n');

  // Collect the distinct Steam app ids linked anywhere on the page.
  const steamLinks = document.querySelectorAll("a[href^='https://store.steampowered.com/'],a[href^='https://steamdb.info/app/']");
  const grubAppid = /app\/(\d+)\/?/;
  const steamAppIDs = new Set();
  for (const ele of steamLinks) {
    const href = ele.href;
    if (href) {
      const appID = Number.parseInt(grubAppid.exec(href)?.[1] ?? '0', 10);
      if (appID > 0) {
        steamAppIDs.add(appID);
      }
    }
  }
  const game_list = [...steamAppIDs].join('|');
  const data = { tid, post_url, post_title, author_nick, author_uid, post_date, content, game_list };
  console.log(data);
  try {
    // Upload first: the list page treats the stored value as "this thread is
    // finished" and immediately points the iframe at the next thread.
    await savePostData(data);
    GM_setValue(tid, '抓取完成');
  }
  catch (error) {
    // The list page interpolates this value into its status text, so store a string.
    GM_setValue(tid, String(error));
  }
  230. }
  231.  
  /**
   * Re-mark every thread link in the list as recorded or not recorded.
   *
   * Fetches the recorded thread ids from the backend, then for each title link
   * resets the three marker classes and applies `pd_added` / `pd_not_added`
   * together with a matching tooltip.
   * NOTE(review): membership is tested with the string tid from grubTid(); this
   * assumes the backend returns ids as strings - confirm.
   * @returns {Promise<void>} Rejects if the id list cannot be fetched.
   */
  async function freshPostList() {
    const recordedTids = await getPostIds();
    const anchors = treadList.querySelectorAll("tr>th:nth-child(2)>a.xst");
    anchors.forEach((anchor) => {
      const isRecorded = recordedTids.has(grubTid(anchor.href));

      anchor.classList.remove('pd_not_added', 'pd_added', 'pd_done');
      anchor.classList.add(isRecorded ? 'pd_added' : 'pd_not_added');
      anchor.title = isRecorded ? '【已抓取】' : '【未抓取】';
    });
  }
  252.  
  /**
   * Decide whether the current page is a thread that must be scraped.
   *
   * True only when the query string ends with the `utm=114514` marker that the
   * list page appends to iframe URLs AND the page URL contains a thread id.
   * @returns {boolean}
   */
  function ifNeedGrub() {
    if (!location.search.endsWith('utm=114514')) {
      return false;
    }
    // RegExp.prototype.test already yields a boolean; comparing it with `>= 0`
    // would coerce false to 0 and always produce true.
    return matchTid.test(location.href);
  }
  261.  
  /**
   * Extract the thread id from a thread URL.
   * @param {?string} url - URL containing `t<digits>` or `tid=<digits>`.
   * @returns {?string} The digits of the first match, or `null` when the URL
   *   holds no thread id (or is itself null/undefined).
   */
  function grubTid(url) {
    // A second match with the same pattern can never succeed where the first
    // failed, so the only meaningful fallback is null.
    return matchTid.exec(url ?? '')?.[1] ?? null;
  }
  266.  
  /**
   * Build the URL of a thread's first page from its id.
   * @param {string|number} tid - Thread id.
   * @returns {string} URL of the form `https://keylol.com/t<tid>-1-1`.
   */
  function genUrl(tid) {
    const origin = 'https://keylol.com';
    const slug = `t${tid}-1-1`;
    return `${origin}/${slug}`;
  }
  271.  
  //-----------------------------------
  /**
   * Check whether the local backend is reachable.
   *
   * Requests `/api/test` and treats a response whose `code` is 666 as success.
   * @returns {Promise<boolean>} `true` on the expected answer; `false` for any
   *   other answer or when the request fails.
   */
  async function testBackend() {
    const pending = $http.get(`http://${host}:${port}/api/test`);
    try {
      const response = await pending;
      return response?.code === 666;
    } catch {
      return false;
    }
  }
  /**
   * Wait until the scraper running in the iframe reports a result for a thread.
   *
   * Polls GM storage under the key `tid` and resolves with the first truthy
   * value found; if none appears in time, resolves with the timeout message.
   * The promise never rejects.
   * @param {string} tid - Thread id used as the GM storage key.
   * @param {number} [timeoutMs=10000] - Maximum time to wait, in milliseconds.
   * @param {number} [pollMs=100] - Interval between storage checks, in milliseconds.
   * @returns {Promise<*>} The stored result, or `'操作超时'` on timeout.
   */
  function waitUnitlDone(tid, timeoutMs = 10000, pollMs = 100) {
    return new Promise((resolve) => {
      const timeoutId = setTimeout(() => {
        clearInterval(pollId);
        resolve('操作超时');
      }, timeoutMs);

      const pollId = setInterval(() => {
        const fin = GM_getValue(tid);
        if (fin) {
          clearInterval(pollId);
          // The deadline is a setTimeout handle, so it must be cancelled with
          // clearTimeout rather than clearInterval.
          clearTimeout(timeoutId);
          resolve(fin);
        }
      }, pollMs);
    });
  }
  /**
   * Fetch the ids of all threads already stored by the backend.
   *
   * A response whose `code` is not 0 is logged as an error and yields an empty
   * set; otherwise the entries of `data` are collected.
   * @returns {Promise<Set<*>>} Set of recorded thread ids. Rejects with the
   *   request's failure reason when the request itself fails.
   */
  async function getPostIds() {
    const response = await $http.get(`http://${host}:${port}/api/posts/ids`);
    if (response?.code !== 0) {
      console.error(response?.msg ?? '消息为空');
      return new Set();
    }
    return new Set(response?.data ?? []);
  }
  /**
   * Upload one scraped thread to the backend.
   * @param {object} data - Scraped thread record; sent as a JSON request body.
   * @returns {Promise<boolean>} `true` when the backend answers with `code` 0
   *   (the success code used by the other endpoints); `false` for any other
   *   answer or when the request fails. Never rejects.
   */
  async function savePostData(data) {
    try {
      const response = await $http.post(`http://${host}:${port}/api/post`, JSON.stringify(data));
      console.log(response);
      return response?.code === 0;
    } catch (reason) {
      // Best effort: log the failure and report it through the return value.
      console.log(reason);
      return false;
    }
  }
  /**
   * Ask the backend to delete every stored thread.
   * @returns {Promise<boolean>} `true` when the backend answers with `code` 0
   *   (the success code used by the other endpoints); `false` for any other
   *   answer or when the request fails. Never rejects.
   */
  async function deleteAllData() {
    try {
      const response = await $http.delete(`http://${host}:${port}/api/posts`);
      console.log(response);
      return response?.code === 0;
    } catch (reason) {
      // Best effort: log the failure and report it through the return value.
      console.log(reason);
      return false;
    }
  }
  354. }, 500);
  //-----------------------------------
  /**
   * Small promise-based wrapper around `GM_xmlhttpRequest`.
   *
   * Every method resolves with the response body in the requested form and
   * rejects when the request errors, times out, or completes with a
   * non-success status. Caller-supplied option objects are never modified.
   */
  class Request {
    /**
     * @param {number} [timeout=3000] - Value passed as the `timeout` option of
     *   every request.
     */
    constructor(timeout = 3000) {
      this.timeout = timeout;
    }

    /** GET `url` and resolve with the `response` parsed as JSON. */
    get(url, opt = {}) {
      return this.#baseRequest(url, 'GET', opt, 'json');
    }

    /** GET `url` with the default response type and resolve with `responseXML`. */
    getHtml(url, opt = {}) {
      return this.#baseRequest(url, 'GET', opt, '');
    }

    /** GET `url` and resolve with `responseText`. */
    getText(url, opt = {}) {
      return this.#baseRequest(url, 'GET', opt, 'text');
    }

    /**
     * POST `data` as a JSON body and resolve with the JSON `response`.
     * @param {string} url
     * @param {string} data - Already-serialized request body.
     * @param {object} [opt] - Extra request options; its `headers` are kept,
     *   with `Content-Type` forced to `application/json`.
     */
    post(url, data, opt = {}) {
      const headers = { ...opt.headers, 'Content-Type': 'application/json' };
      return this.#baseRequest(url, 'POST', { ...opt, data, headers }, 'json');
    }

    /** Send DELETE to `url` and resolve with the JSON `response`. */
    delete(url, opt = {}) {
      return this.#baseRequest(url, 'DELETE', opt, 'json');
    }

    /**
     * Issue the request and adapt the callback API to a promise.
     * @param {string} url
     * @param {string} [method='GET']
     * @param {object} [opt] - Extra options forwarded to `GM_xmlhttpRequest`;
     *   `url`, `method`, `responseType`, `timeout` and the three callbacks set
     *   here take precedence over same-named entries.
     * @param {string} [responseType='json'] - `'json'` resolves `response`,
     *   `'text'` resolves `responseText`, anything else resolves `responseXML`.
     * @returns {Promise<*>} Rejects with the event object on error/timeout, or
     *   with an `Error` when the request finishes with a non-2xx status.
     */
    #baseRequest(url, method = 'GET', opt = {}, responseType = 'json') {
      return new Promise((resolve, reject) => {
        const onload = ({ readyState, status, response, responseXML, responseText }) => {
          const succeeded = readyState === 4 && status >= 200 && status < 300;
          if (!succeeded) {
            console.error('网络错误');
            console.log(readyState);
            console.log(status);
            console.log(response);
            reject(new Error('解析出错'));
            return;
          }
          if (responseType === 'json') {
            resolve(response);
          } else if (responseType === 'text') {
            resolve(responseText);
          } else {
            resolve(responseXML);
          }
        };

        GM_xmlhttpRequest({
          ...opt,
          url,
          method,
          responseType,
          timeout: this.timeout,
          ontimeout: reject,
          onerror: reject,
          onload,
        });
      });
    }
  }
  const $http = new Request();
  408.  
  // Stylesheet for the floating control panel and for the status badges that are
  // prepended to thread links (recorded / not recorded / just scraped).
  const panelCss = `
.pd_div {
vertical-align: middle;
}
.pd_panel {
background: rgba(58, 58, 58, 0.5);
position: fixed;
top: 50%;
right: 0px;
text-align: center;
transform: translate(0px, -50%);
z-index: 100;
padding: 5px;
border-radius: 5px 0 0 5px;
}
.pd_panel > *:not(:last-child) {
margin-right: 5px;
}
.pd_panel > hr {
margin: 5px 0 5px;
}
.pd_panel > span {
color: #fff;
}
.pd_panel > iframe {
width: 200px;
height: 200px;
}
.pd_added::before{
content: "✅";
}
.pd_not_added::before{
content: "❌";
}

.pd_done::before{
content: "🤔";
}
`;
  GM_addStyle(panelCss);