帖子导出工具

导出帖子内容到数据库

当前为 2022-07-19 提交的版本,查看 最新版本

  1. // ==UserScript==
  2. // @name:zh-CN 帖子导出工具
  3. // @name Posts_Dumper
  4. // @namespace https://blog.chrxw.com
  5. // @version 1.1
  6. // @description:zh-CN 导出帖子内容到数据库
  7. // @description 导出帖子内容到数据库
  8. // @author Chr_
  9. // @match https://keylol.com/*
  10. // @match https://dev.keylol.com/*
  11. // @connect 127.0.0.1
  12. // @connect httpbin.org
  13. // @license AGPL-3.0
  14. // @icon https://blog.chrxw.com/favicon.ico
  15. // @grant GM_setValue
  16. // @grant GM_getValue
  17. // @grant GM_deleteValue
  18. // @grant GM_xmlhttpRequest
  19. // @grant GM_addStyle
  20. // ==/UserScript==
  21.  
  22. setTimeout(async () => {
  23. 'use strict';
  24.  
  // Address of the local backend that receives the exported posts.
  const host = '127.0.0.1';
  const port = 8000;

  // Captures the numeric thread id from URLs containing "t<digits>" or "tid=<digits>".
  const matchTid = /(?:t|tid=)(\d+)/;

  // Thread-list table of the current page; null when the page has no such element.
  const treadList = document.querySelector('#threadlisttableid');
  31.  
  32. if (treadList !== null) {//获取帖子列表
  33.  
  /**
   * Create a panel button.
   * @param {string} name - Caption shown on the button.
   * @param {Function} foo - Handler registered for the button's click event.
   * @returns {HTMLButtonElement} The new button (not attached to the document).
   */
  function genBtn(name, foo) {
    const button = Object.assign(document.createElement('button'), {
      textContent: name,
      className: 'pd_btn',
    });
    button.addEventListener('click', foo);
    return button;
  }
  /**
   * Create a <div>.
   * @param {string} [cls] - Class name; 'pd_div' is used when null or undefined.
   * @returns {HTMLDivElement} The new element (not attached to the document).
   */
  function genDiv(cls) {
    const className = cls ?? 'pd_div';
    return Object.assign(document.createElement('div'), { className });
  }
  /**
   * Create a <span> holding the given text.
   * @param {string} text - Text content of the span.
   * @returns {HTMLSpanElement} The new element (not attached to the document).
   */
  function genSpan(text) {
    return Object.assign(document.createElement('span'), { textContent: text });
  }
  /**
   * Create an <hr> separator.
   * @returns {HTMLHRElement} The new element (not attached to the document).
   */
  function genHr() {
    return document.createElement('hr');
  }
  /**
   * Create an empty <iframe>.
   * @returns {HTMLIFrameElement} The new element (not attached to the document).
   */
  function genIframe() {
    return document.createElement('iframe');
  }
  59.  
  // Container for the controls, and the iframe the threads are loaded into.
  const panel = genDiv('pd_panel');
  const tempIframe = genIframe();

  // Probe the backend once; the result decides which controls are shown.
  const status = await testBackend();
  const statusText = status ? '连接成功' : '连接失败';
  const statusTips = genSpan(statusText);
  67.  
  /**
   * Load every thread link matched by `selector` in the worker iframe, one at
   * a time, waiting for each result to be published through GM storage, and
   * report progress in the status line.
   * @param {string} selector - CSS selector (relative to the thread list) of the title links to process.
   * @returns {Promise<void>}
   */
  async function grubPosts(selector) {
    const postLists = treadList.querySelectorAll(selector);
    const total = postLists.length;
    if (total === 0) {
      statusTips.textContent = '没有可以抓取的帖子';
    } else {
      statusTips.textContent = `开始抓取,共 ${total} 篇`;

      for (let i = 0; i < total; i++) {
        const postTag = postLists[i];
        const tid = grubTid(postTag.href);
        // A result written after an earlier timeout would otherwise end this wait immediately.
        GM_deleteValue(tid);
        tempIframe.src = genUrl(tid) + '?utm=114514';
        const result = await waitUnitlDone(tid);
        // i is 0-based; show 1-based progress so the last thread reads total/total.
        statusTips.textContent = `进度 ${i + 1}/${total} TID ${tid} ${result}`;
        GM_deleteValue(tid);
        postTag.classList.remove('pd_not_added', 'pd_added');
        postTag.classList.add('pd_done');
      }
      statusTips.textContent = '抓取结束';
    }
    await freshPostList();
  }

  // Processes only the threads currently marked as not yet recorded.
  const btnGrubNew = genBtn('抓取尚未记录的', () => grubPosts(
    'th.common>a.pd_not_added.xst,th.new>a.pd_not_added.xst,th.lock>a.pd_not_added.xst',
  ));

  // Processes every thread in the list, recorded or not.
  const btnGrubAll = genBtn('抓取所有', () => grubPosts(
    'th.common>a.xst,th.new>a.xst,th.lock>a.xst',
  ));
  116.  
  // Builds a click handler that opens the given URL in a new window.
  const opener = (url) => () => {
    window.open(url);
  };

  const btnExportExcel = genBtn('导出Excel', opener(`http://${host}:${port}/api/excel`));

  const btnExportBBCode = genBtn('导出BBCode', opener(`http://${host}:${port}/api/bbcode`));

  // Destructive: asks for confirmation before wiping the backend data.
  const btnResetDB = genBtn('重置数据库(删除所有数据)', async () => {
    if (!confirm('真的要删除所有数据吗?')) {
      return;
    }
    await deleteAllData();
  });

  const btnControl = genBtn('打开管理面板', opener(`http://${host}:${port}/index.html`));

  // Assemble the panel: status line first, then either the controls or a hint.
  panel.append(statusTips, genHr());
  if (!status) {
    panel.append(genSpan('请检查软件是否运行以及端口是否被占用'));
  } else {
    panel.append(
      btnGrubNew,
      btnGrubAll,
      genHr(),
      btnExportExcel,
      btnExportBBCode,
      genHr(),
      btnResetDB,
      genHr(),
      btnControl,
      genHr(),
      tempIframe,
    );
  }

  document.body.append(panel);
  155.  
  if (status) {
    // #autopbn may be missing from the page; without the guard a null result
    // would throw here and skip the initial marking below.
    // NOTE(review): presumably this is the forum's "load more threads" button — confirm.
    document.getElementById('autopbn')?.addEventListener('click', () => {
      // Re-mark the list half a second after the click.
      setTimeout(() => {
        freshPostList().catch((reason) => console.error(reason));
      }, 500);
    });

    // Mark which listed threads are already recorded.
    try {
      await freshPostList();
    } catch (reason) {
      // freshPostList rejects when the id list cannot be fetched; log instead of leaving it unhandled.
      console.error(reason);
    }
  }
  166. } else if (ifNeedGrub()) {//抓取帖子内容
  // Collect the thread metadata from the page; '获取失败' marks a field whose element is missing.
  const tid = grubTid(location.href);
  const post_url = genUrl(tid);
  const post_title = document.getElementById('thread_subject')?.textContent ?? '获取失败';
  const eleAuthor = document.querySelector('div.pi>div.authi>a.xw1');
  const author_nick = eleAuthor?.textContent ?? '获取失败';
  // Strip the profile-URL prefix on both hosts this script runs on (keylol.com and dev.keylol.com).
  const author_uid = eleAuthor?.href.replace(/^https:\/\/(?:dev\.)?keylol\.com\/suid-/, '') ?? '获取失败';
  // substring(4) drops the first four characters of the date label.
  // NOTE(review): presumably a fixed "posted at" prefix — confirm against the page markup.
  const post_date = document.querySelector('div.pti>div.authi>em[id]')?.textContent.substring(4) ?? '获取失败';
  // First element whose id starts with "postmessage" (the original selector lacked the closing bracket).
  const eleContent = document.querySelector('td[id^="postmessage"]');
  const nodes = eleContent?.childNodes ?? [];
  const contentLines = [];
  177.  
  /**
   * Walk `node` depth-first and append its text to `contentLines`.
   * Skips I, A, IFRAME, STYLE and IMG elements and DIVs with class 'aimg_tip'.
   * A text node is kept only when its trimmed text is longer than two
   * characters and does not contain the repost notice.
   * @param {Node} node - Node to process.
   * @returns {void}
   */
  function node2text(node) {
    const skippedTags = ['I', 'A', 'IFRAME', 'STYLE', 'IMG'];
    if (skippedTags.includes(node.nodeName)) {
      return;
    }
    if (node.nodeName === 'DIV' && node.classList.contains('aimg_tip')) {
      return;
    }

    if (node.nodeType !== Node.TEXT_NODE) {
      // Element (or other container): descend into the children, if any.
      for (const child of node.childNodes ?? []) {
        node2text(child);
      }
      return;
    }

    const raw = node.textContent?.trim();
    if (raw && raw.length > 2 && !raw.includes('未经许可,严禁转载')) {
      contentLines.push(raw);
    }
  }
  206.  
  // Flatten the post body into plain text lines.
  for (const node of nodes) {
    node2text(node);
  }
  const content = contentLines.join('\n');

  // Collect the distinct Steam app ids linked anywhere on the page.
  const steamLinks = document.querySelectorAll("a[href^='https://store.steampowered.com/'],a[href^='https://steamdb.info/app/']");
  const grubAppid = /app\/(\d+)\/?/;
  const steamAppIDs = new Set();
  for (const ele of steamLinks) {
    const href = ele.href;
    if (href) {
      const appID = Number.parseInt(grubAppid.exec(href)?.[1] ?? '0', 10);
      if (appID > 0) {
        steamAppIDs.add(appID);
      }
    }
  }
  const game_list = [...steamAppIDs].join('|');
  const data = { tid, post_url, post_title, author_nick, author_uid, post_date, content, game_list };
  console.log(data);
  try {
    // Upload first: the stored value is the "done" signal, and publishing it
    // before the upload was awaited reported completion too early.
    await savePostData(data);
    GM_setValue(tid, '抓取完成');
  } catch (error) {
    // Store a readable message rather than an Error object.
    GM_setValue(tid, String(error));
  }
  234. }
  235.  
  /**
   * Mark every thread link in the list as recorded or not recorded, using the
   * id set returned by getPostIds(). Any previous marker class is cleared first.
   * @returns {Promise<void>} Rejects when getPostIds() rejects.
   */
  async function freshPostList() {
    const knownTids = await getPostIds();
    const links = treadList.querySelectorAll("th.common>a.xst,th.new>a.xst,th.lock>a.xst");
    links.forEach((link) => {
      const recorded = knownTids.has(grubTid(link.href));
      link.classList.remove('pd_not_added', 'pd_added', 'pd_done');
      link.classList.add(recorded ? 'pd_added' : 'pd_not_added');
      link.title = recorded ? '【已抓取】' : '【未抓取】';
    });
  }
  256.  
  /**
   * Decide whether the current page should be scraped: the query string must
   * end with the 'utm=114514' marker and the URL must contain a thread id.
   * (The original compared the boolean from test() with `>= 0`, which is
   * always true, so the thread-id check never had any effect.)
   * @returns {boolean} True when the page should be scraped.
   */
  function ifNeedGrub() {
    return location.search.endsWith('utm=114514') && matchTid.test(location.href);
  }
  265.  
  /**
   * Extract the thread id from a URL.
   * @param {string} url - URL containing "t<digits>" or "tid=<digits>".
   * @returns {string|null} The id digits, or null when the URL has none
   *   (including a null/undefined url, which previously threw).
   */
  function grubTid(url) {
    // One regex pass is enough: when exec() finds nothing, a second match() cannot either.
    return matchTid.exec(url ?? '')?.[1] ?? null;
  }
  270.  
  /**
   * Build the URL of a thread.
   * @param {string|number} tid - Thread id.
   * @returns {string} URL of the form https://keylol.com/t<tid>-1-1.
   */
  function genUrl(tid) {
    const path = `t${tid}-1-1`;
    return `https://keylol.com/${path}`;
  }
  275.  
  276. //-----------------------------------
  /**
   * Check whether the backend is reachable.
   * @returns {Promise<boolean>} True when GET /api/test answers with code 666;
   *   false for any other answer or when the request fails.
   */
  async function testBackend() {
    const pending = $http.get(`http://${host}:${port}/api/test`);
    try {
      const reply = await pending;
      return reply?.code === 666;
    } catch (_reason) {
      return false;
    }
  }
  /**
   * Wait until a result for `tid` appears in GM storage.
   * @param {string} tid - Storage key (thread id) to poll with GM_getValue.
   * @param {number} [timeoutMs=10000] - Give up after this many milliseconds.
   * @param {number} [pollMs=100] - Polling interval in milliseconds.
   * @returns {Promise<*>} The stored value, or '操作超时' when the timeout expires first. Never rejects.
   */
  function waitUnitlDone(tid, timeoutMs = 10000, pollMs = 100) {
    return new Promise((resolve) => {
      let timeoutId;

      const pollId = setInterval(() => {
        const fin = GM_getValue(tid);
        if (fin) {
          clearInterval(pollId);
          // The timeout handle comes from setTimeout, so cancel it with clearTimeout.
          clearTimeout(timeoutId);
          resolve(fin);
        }
      }, pollMs);

      timeoutId = setTimeout(() => {
        clearInterval(pollId);
        resolve('操作超时');
      }, timeoutMs);
    });
  }
  /**
   * Fetch the ids of the threads the backend already holds.
   * @returns {Promise<Set>} Set built from the response's `data` list; empty
   *   (with the message logged) when the response code is not 0.
   *   Rejects when the request itself fails.
   */
  async function getPostIds() {
    const reply = await $http.get(`http://${host}:${port}/api/posts/ids`);
    if (reply?.code !== 0) {
      console.error(reply?.msg ?? '消息为空');
      return new Set();
    }
    return new Set(reply?.data ?? []);
  }
  /**
   * Upload one scraped post to the backend as JSON.
   * @param {object} data - Post record to send.
   * @returns {Promise<boolean>} True when the backend answers with code 0
   *   (the success code getPostIds checks for); false for any other answer or
   *   when the request fails. Never rejects.
   */
  async function savePostData(data) {
    try {
      const response = await $http.post(`http://${host}:${port}/api/post`, JSON.stringify(data));
      console.log(response);
      // Previously `code !== 0`, which reported backend errors as success.
      return response?.code === 0;
    } catch (reason) {
      console.log(reason);
      return false;
    }
  }
  /**
   * Ask the backend to delete every stored post.
   * @returns {Promise<boolean>} True when the backend answers with code 0
   *   (the success code getPostIds checks for); false for any other answer or
   *   when the request fails. Never rejects.
   */
  async function deleteAllData() {
    try {
      const response = await $http.delete(`http://${host}:${port}/api/posts`);
      console.log(response);
      // Previously `code !== 0`, which reported backend errors as success.
      return response?.code === 0;
    } catch (reason) {
      console.log(reason);
      return false;
    }
  }
  358. }, 500);
  359. //-----------------------------------
  /**
   * Promise wrapper around GM_xmlhttpRequest.
   *
   * Each method resolves with the response body when the request completes
   * with readyState 4 and HTTP status 200, and rejects otherwise: with
   * '解析出错' for any other status, or with whatever GM_xmlhttpRequest passes
   * to onerror/ontimeout.
   */
  class Request {
    /**
     * @param {number} [timeout=3000] - Timeout passed to every request.
     */
    constructor(timeout = 3000) {
      this.timeout = timeout;
    }

    /** GET; resolves with the `response` field (responseType 'json'). */
    get(url, opt = {}) {
      return this.#baseRequest(url, 'GET', opt, 'json');
    }

    /** GET; resolves with the `responseXML` field (responseType ''). */
    getHtml(url, opt = {}) {
      return this.#baseRequest(url, 'GET', opt, '');
    }

    /** GET; resolves with the `responseText` field. */
    getText(url, opt = {}) {
      return this.#baseRequest(url, 'GET', opt, 'text');
    }

    /**
     * POST `data` with a JSON content type; resolves with the `response` field.
     * Headers supplied in `opt.headers` are kept (Content-Type is overridden)
     * and the caller's `opt` object is left untouched.
     */
    post(url, data, opt = {}) {
      const headers = { ...opt.headers, 'Content-Type': 'application/json' };
      return this.#baseRequest(url, 'POST', { ...opt, data, headers }, 'json');
    }

    /** DELETE; resolves with the `response` field (responseType 'json'). */
    delete(url, opt = {}) {
      return this.#baseRequest(url, 'DELETE', opt, 'json');
    }

    /**
     * Issue the request. `opt` is copied, so the caller's object is never
     * modified; url, method, responseType, timeout and the callbacks always
     * take precedence over same-named keys in `opt`.
     */
    #baseRequest(url, method = 'GET', opt = {}, responseType = 'json') {
      return new Promise((resolve, reject) => {
        GM_xmlhttpRequest({
          ...opt,
          url,
          method,
          responseType,
          timeout: this.timeout,
          ontimeout: reject,
          onerror: reject,
          onload: ({ readyState, status, response, responseXML, responseText }) => {
            if (readyState === 4 && status === 200) {
              if (responseType === 'json') {
                resolve(response);
              } else if (responseType === 'text') {
                resolve(responseText);
              } else {
                resolve(responseXML);
              }
            } else {
              console.error('网络错误');
              console.log(readyState);
              console.log(status);
              console.log(response);
              reject('解析出错');
            }
          },
        });
      });
    }
  }

  // Shared client used by the backend helpers.
  const $http = new Request();
  412.  
  // Style sheet for the control panel and the per-thread status markers.
  const panelCss = `
.pd_div {
vertical-align: middle;
}
.pd_panel {
background: rgba(58, 58, 58, 0.5);
position: fixed;
top: 50%;
right: 0px;
text-align: center;
transform: translate(0px, -50%);
z-index: 100;
padding: 5px;
border-radius: 5px 0 0 5px;
}
.pd_panel > *:not(:last-child) {
margin-right: 5px;
}
.pd_panel > hr {
margin: 5px 0 5px;
}
.pd_panel > span {
color: #fff;
}
.pd_panel > iframe {
width: 200px;
height: 200px;
}
.pd_added::before{
content: "✅";
}
.pd_not_added::before{
content: "❌";
}

.pd_done::before{
content: "🤔";
}
`;
  GM_addStyle(panelCss);