Extract Special Characters

Extract special character data from the ocr-gt-tools wiki

  1. // ==UserScript==
  2. // @name Extract Special Characters
  3. // @namespace http://github.com/kba/
  4. // @include https://github.com/UB-Mannheim/ocr-gt-tools/wiki/Special-Characters
  5. // @include https://github.com/UB-Mannheim/ocr-gt-tools/wiki/Error-Tags
  6. // @description Extract special character data from ocr-gt-tools wiki
  7. // @version 1
  8. // @require https://code.jquery.com/jquery-2.2.3.min.js
  9. // @require https://cdnjs.cloudflare.com/ajax/libs/z-schema/3.17.0/ZSchema-browser.js
  10. // @grant GM_addStyle
  11. // @grant GM_setClipboard
  12. // ==/UserScript==
  13. /*globals GM_addStyle */
  14. /*globals ZSchema */
  15.  
  /**
   * Stylesheet text for the UI this script adds to the wiki page:
   *   - pre.schema-error : red box holding a schema validation report
   *   - div#glyph-bar    : fixed bar at the bottom with the glyph try-out input
   *   - div#schema-bar   : fixed bar at the top with the copy buttons
   *   - div#schema-invalid : hidden by default, holds links to invalid entries
   *
   * NOTE(review): the `div#glyph-bar .right *` rule targets a `.right` element
   * inside #glyph-bar, but the only `.right` container visible in this file is
   * created inside #schema-bar - confirm whether the selector is intended.
   */
  var CSS = `
pre.schema-error
{
background: #a00;
color: white;
white-space: pre-wrap;
}
div#glyph-bar
{
font-size: x-large;
position:fixed;
bottom: 0;
height: 48px;
border: 2px solid black;
background: white;
width: 100%;
}
div#glyph-bar .left * { float: left; }
div#glyph-bar .right * { float: right; }
div#glyph-bar *
{
height: 100%;
font-size: x-large;
}
div#glyph-bar input[type='text']
{
font-family: "Garamond", "Bookman", serif;
}
div#schema-bar
{
position: fixed;
z-index: 3000;
top: 0;
background: #900;
color: white !important;
width: 100%;
font-size: x-large;
height: 48px;
border: 2px solid black;
}
div#schema-invalid
{
display: none;
}
div#schema-invalid a
{
display: inline-block;
color: white !important;
float: none;
margin: 0 2px;
}
`;
  68.  
  /**
   * JSON schemas keyed by wiki page name (the last segment of the page URL).
   *
   * Each schema describes ONE entry scraped from a wiki page. The `id`
   * sub-schema is additionally used on its own to validate the entry heading.
   * `additionalProperties: false` makes any unknown key on an entry a
   * validation error.
   */
  var SCHEMAS = {
    'Special-Characters': {
      type: 'object',
      additionalProperties: false,
      properties: {
        id: { type: 'string', pattern: '^[a-z0-9-]+$' },
        // Each sample must be a link wrapping an image.
        sample: {
          type: 'array',
          items: { type: 'string', pattern: '^<a.*<img.*' },
        },
        recognition: { type: 'string' },
        baseLetter: { type: 'array' },
        name: { type: 'object' },
        notes: { type: 'object' },
        // Keyboard shortcuts must start with <kbd> markup.
        shortcutLinux: { type: 'string', pattern: '^<kbd' },
        shortcutWindows: { type: 'string', pattern: '^<kbd' },
      },
      required: ['sample', 'recognition', 'baseLetter'],
    },
    'Error-Tags': {
      type: 'object',
      additionalProperties: false,
      properties: {
        id: { type: 'string', pattern: '^[a-z0-9-]+$' },
        // Localized name; only the German text is mandatory.
        name: {
          type: 'object',
          properties: {
            de: { type: 'string' },
            en: { type: 'string' },
          },
          required: ['de'],
        },
        frequencyAvg: {
          type: 'number',
          // 'format': 'float',
        },
        total: {
          type: 'number',
          // 'format': 'integer',
        },
        // Localized comment; only the German text is mandatory.
        comment: {
          type: 'object',
          properties: {
            de: { type: 'string' },
            en: { type: 'string' },
          },
          required: ['de'],
        },
      },
      required: ['name'],
    },
  };
  152.  
  153. // var log = {
  154. // 'debug': console.log.bind(console),
  155. // 'info': console.info.bind(console),
  156. // 'error': console.error.bind(console),
  157. // };
  158.  
  /**
   * Per-page hooks, keyed by wiki page name. Each hook receives the object
   * returned by scrapeJsonFromWikiPage (entry heading -> entry description)
   * and builds the page-specific UI.
   */
  var ON_LOAD = {
    /**
     * Adds the glyph try-out bar. On every keyup in the text input, the
     * current selection (or the single character before the caret) is looked
     * up in the `baseLetter` list of each scraped glyph; the samples of every
     * match are offered as clickable proposals. Clicking a proposal replaces
     * the looked-up text with the glyph's `recognition` string.
     *
     * @param {Object} scraped - glyph descriptions keyed by entry heading.
     */
    'Special-Characters': function(scraped) {
      $("body").append(`
<div id="glyph-bar">
<div class="left">
<label for="glyph-input" style="font-family: monospace; font-size: 30px">TRY&gt;</label>
<input id="glyph-input" type="text"/>
<div id="glyph-propose">&nbsp;</div>
</div>
</div>
`);
      $("#glyph-input").on('keyup', function() {
        const $input = $("#glyph-input");
        const to = $input[0].selectionEnd;
        let from = $input[0].selectionStart;
        if (from === to) {
          // Nothing selected: use the character left of the caret. Clamp so
          // a caret at position 0 never yields a negative index.
          from = Math.max(0, from - 1);
        }
        const $propose = $('#glyph-propose');
        $propose.empty();
        const val = $input.val();
        const chosen = val.substring(from, to);
        Object.keys(scraped).forEach(function(glyphId) {
          const glyphDesc = scraped[glyphId];
          // Entries that failed schema validation are still part of
          // `scraped` and may lack `baseLetter`/`sample`; skip them instead
          // of throwing on every keystroke.
          if (!Array.isArray(glyphDesc.baseLetter) ||
              glyphDesc.baseLetter.indexOf(chosen) === -1) {
            return;
          }
          const samples = Array.isArray(glyphDesc.sample) ? glyphDesc.sample : [];
          samples.forEach(function(sample) {
            // `sample` is the HTML of the sample link as found on the wiki page.
            const $sample = $(sample).on('click', function(e) {
              // Keep the sample link from navigating away.
              e.preventDefault();
              $input.val(val.substring(0, from) + glyphDesc.recognition + val.substring(to));
            });
            $propose.append($sample);
          });
        });
      });
    },
    /**
     * Placeholder for the Error-Tags page: only reports that nothing is
     * implemented yet.
     */
    'Error-Tags': function(scraped) {
      window.alert('Not Implemented');
    }
  };
  201.  
  /**
   * Converts the label in front of the colon of a wiki list item into a
   * camelCase property name, e.g. "Base letter" -> "baseLetter" and
   * "Name (de)" -> "nameDe".
   *
   * @param {string} label - raw text/HTML before the first colon.
   * @returns {string} the normalised property name (may be empty).
   */
  function wikiLabelToPropertyName(label) {
    return label
      .toLowerCase()
      .replace(/[^a-z0-9]+/g, '_')
      // Drop separators at BOTH ends before camel-casing; otherwise a leading
      // separator would upper-case the first word (" Sample" -> "Sample").
      .replace(/^_|_$/g, '')
      .replace(/_([a-z])/g, function(orig, ch) {
        return ch.toUpperCase();
      });
  }

  /**
   * Scrapes the entries of the current wiki page into plain objects and
   * validates each of them with z-schema.
   *
   * Every `h2` inside `.markdown-body` starts an entry whose id is the
   * trimmed heading text; the `li` elements of the `ul` directly following
   * the heading are parsed as "Label: value" pairs:
   *   - properties the schema declares as `array` are split on ';'
   *   - properties the schema declares as `number` go through parseFloat
   *   - names ending in a two-letter language suffix ("nameDe") are folded
   *     into an object keyed by language ({name: {de: ...}})
   *   - everything else is kept as the raw HTML string
   *
   * Validation failures are reported through showError; invalid entries are
   * still part of the returned object.
   *
   * @param {Object} schema - JSON schema of one entry; `schema.properties.id`
   *     is used to validate the heading text.
   * @returns {Object} entry descriptions keyed by heading text.
   */
  function scrapeJsonFromWikiPage(schema) {
    const parsed = {};
    const validator = new ZSchema();
    const h2s = $(".markdown-body h2").get();
    for (const h2 of h2s) {
      const $h2 = $(h2);
      const thingId = $h2.text().trim();
      const thingDesc = {};
      parsed[thingId] = thingDesc;
      const lis = $h2.next('ul').find('li').get();
      for (const li of lis) {
        const liHtml = $(li).html();
        // Without a colon the label is empty and the whole item becomes the
        // value of property '' - which the schema then reports as unknown.
        const colonIndex = liHtml.indexOf(':');
        let varName = wikiLabelToPropertyName(liHtml.substring(0, colonIndex));
        const rawValue = liHtml.substring(colonIndex + 1).trim();
        const propSchema = schema.properties[varName];
        const declaredType = propSchema && propSchema.type;
        if (declaredType === 'array') {
          thingDesc[varName] = rawValue.split(/\s*;\s*/);
        } else if (declaredType === 'number') {
          thingDesc[varName] = parseFloat(rawValue);
        } else if (/[A-Z][a-z]$/.test(varName)) {
          // Trailing two-letter suffix is a language code: nameDe -> name.de
          const lang = varName.slice(-2).toLowerCase();
          varName = varName.substring(0, varName.length - 2);
          thingDesc[varName] = thingDesc[varName] || {};
          thingDesc[varName][lang] = rawValue;
        } else {
          thingDesc[varName] = rawValue;
        }
      }
      if (!validator.validate(thingId, schema.properties.id)) {
        showError(thingId, validator.getLastErrors());
      }
      if (!validator.validate(thingDesc, schema)) {
        showError(thingId, validator.getLastErrors());
      }
    }
    return parsed;
  }
  247.  
  /**
   * Escapes the characters & < > " ' and / so the result can be embedded in
   * HTML markup.
   *
   * @param {*} str - value to escape; it is converted with String() first.
   * @returns {string} the escaped text.
   */
  function escapeHTML(str) {
    const replacements = {
      "&": "&amp;",
      "<": "&lt;",
      ">": "&gt;",
      '"': '&quot;',
      "'": '&#39;',
      "/": '&#x2F;'
    };
    const toEntity = (character) => replacements[character];
    const text = String(str);
    return text.replace(/[&<>"'\/]/g, toEntity);
  }
  261.  
  /**
   * Reports a schema validation failure for one wiki entry: appends the
   * error report to the entry's heading and adds a numbered jump link to the
   * "#schema-invalid" area (which is made visible).
   *
   * @param {string} faultyId - trimmed heading text of the offending entry.
   * @param {*} err - error details; rendered with JSON.stringify.
   */
  function showError(faultyId, err) {
    // Select the heading by exact text instead of `:contains('...')`: the
    // latter matches substrings (id "a" would hit every heading containing
    // an "a") and breaks when the id contains a quote. Error boxes appended
    // by earlier calls are ignored when comparing.
    const $heading = $('h2').filter((index, heading) => {
      const ownText = $(heading).clone().find('pre.schema-error').remove().end().text().trim();
      return ownText === faultyId;
    });
    $heading.append(
      `<pre class='schema-error'>${escapeHTML(JSON.stringify(err, null, 2))}</pre>`);
    const errorNumber = $("#schema-invalid a").length + 1;
    // The id comes from editable wiki text and is, by definition, not
    // guaranteed to match the id pattern here - escape it for the attribute.
    $("#schema-invalid").show().append(
      `<a href="#${escapeHTML(faultyId)}">[${errorNumber}]</a>`);
  }
  268.  
  /**
   * Entry point, run on DOM ready: determines the wiki page, injects the
   * stylesheet and the schema bar, scrapes and validates the page, runs the
   * page-specific hook and wires up the two copy buttons.
   */
  $(function() {
    // Take the page name from the path so neither a query string nor a
    // fragment can end up in it; tolerate a trailing slash.
    const wikiPage = window.location.pathname.replace(/\/+$/, '').replace(/.*\//, '');
    const schema = SCHEMAS[wikiPage];
    // Unknown page: leave it untouched instead of failing half-way through.
    if (!schema || typeof ON_LOAD[wikiPage] !== 'function') {
      return;
    }
    GM_addStyle(CSS);
    $("body").prepend(
`
<div id="schema-bar">
<div id="schema-invalid">!! INVALID </div>
<div class="right">
<button id="copy-schema">Copy Schema</button>
<button id="copy-json">Copy Data</button>
</div>
</div>
`);
    const scraped = scrapeJsonFromWikiPage(schema);
    ON_LOAD[wikiPage](scraped);
    $("#copy-schema").on('click', function() {
      // `schema` already is the schema object; indexing SCHEMAS with it
      // would yield undefined and copy nothing.
      GM_setClipboard(JSON.stringify(schema, null, 2));
      window.alert("Copied JSON schema to clipboard");
    });
    $("#copy-json").on('click', function() {
      GM_setClipboard(JSON.stringify(scraped, null, 2));
      window.alert("Copied JSON data to clipboard");
    });
  });