cgcom-interno.vuds-omc.es website scraper

Scrapes the website's search results and downloads them as CSV files (one file per province).

  1. // ==UserScript==
  2. // @name cgcom-interno.vuds-omc.es website scraper
  3. // @namespace http://tampermonkey.net/
  4. // @version 0.13
  5. // @description Scrapes the website in csv format
  6. // @author You
  7. // @match https://cgcom-interno.cgcom.es/RegistroMedicos/PUBBusquedaPublica_busqueda.action
  8. // ==/UserScript==
  9.  
  /**
   * Waits for a result-detail tab (an element whose id starts with "tabBusqueda_")
   * to appear, converts the last cell of every table row into one quoted CSV row,
   * closes the tab and resolves with that row.
   *
   * The DOM is polled every 100 ms; the promise stays pending until the tab shows up.
   *
   * @returns {Promise<string>} A CSV row such as `"a","b",...`. When the table has
   *   exactly six rows, an empty seventh field is appended so the column count
   *   matches tables that carry the optional extra field.
   */
  function parseTab() {
    const CELL_SELECTOR = "[id^=tabBusqueda_] > table > tbody > tr > td:last-child";
    const CLOSE_BUTTON_SELECTOR = "[id^=botonCerrar_tabBusqueda_]";
    const POLL_INTERVAL_MS = 100;

    // Turns the innerHTML of one cell into plain text that is safe inside a quoted field.
    const cleanCell = (html) =>
      html
        // Drop HTML comments first (non-greedy, may span lines) so that text sitting
        // between two comments survives and a comment after the last <br> cannot
        // hide the trailing separator from the clean-up below.
        .replace(/<!--[\s\S]*?-->/g, "")
        .trim()
        // Multiple items in one field (e.g. specialties) are separated by <br>.
        .replace(/(<br>)+/g, ";")
        .replace(/;$/, "")
        // Quotes are removed rather than escaped so every field stays a simple "..." token.
        .replace(/"/g, "")
        .trim();

    return new Promise((resolve) => {
      const timer = setInterval(() => {
        const cells = $(CELL_SELECTOR);
        if (!cells.length) {
          return; // tab not rendered yet, try again on the next tick
        }
        clearInterval(timer);

        const fields = [];
        cells.each((index, cell) => {
          fields.push(cleanCell(cell.innerHTML));
        });

        // Sometimes the table has an extra field; add it as blank when it is missing.
        if (fields.length === 6) {
          fields.push("");
        }

        // Close the tab
        $(CLOSE_BUTTON_SELECTOR).click();

        resolve(fields.map((field) => `"${field}"`).join(","));
      }, POLL_INTERVAL_MS);
    });
  }
  39.  
  /**
   * Resolves once no ".blockUI" element is left in the document (presumably the
   * overlay the page shows while a request is in flight - confirm against the site).
   *
   * The check runs every 100 ms, the first time 100 ms after the call.
   *
   * @returns {Promise<void>}
   */
  function waitLoading() {
    const POLL_INTERVAL_MS = 100;

    return new Promise((done) => {
      const timer = setInterval(function poll() {
        const overlayVisible = $(".blockUI").length > 0;
        if (overlayVisible) {
          return; // still blocked, look again on the next tick
        }

        clearInterval(timer);
        done();
      }, POLL_INTERVAL_MS);
    });
  }
  51.  
  /**
   * Builds a CSV document (header + one line per collected row) and triggers a
   * browser download named `cgcom_raw_<provIndex>.csv`.
   *
   * Each row is expected in the form `"f1","f2",...` without quotes inside the
   * fields. Rows are split on the `","` boundary rather than on bare commas, so a
   * comma inside a field (e.g. an address) no longer shifts the columns. In the
   * seventh field, line breaks become ";"; everywhere else line breaks and tabs
   * become a space, and runs of spaces are collapsed. Rows with fewer than seven
   * fields are written as they are instead of raising a TypeError.
   *
   * @param {string[]} resultSet Rows collected for one province.
   * @param {number|string} provIndex Province index used in the file name.
   * @returns {Promise<void>} Resolves after the download has been triggered.
   */
  async function downloadResultSet(resultSet, provIndex) {
    const HEADER = '"Colegiado","Nombre","Provincia","Especialidad","Estado","DireccionTrabajo","ValidaciónPeriódicaColegiación"';
    const FIELD_BOUNDARY = '","';
    const MULTI_VALUE_FIELD = 6; // zero-based index of the seventh column

    // Normalizes the whitespace of one CSV line (see the function description).
    const normalizeLine = (line) => {
      const fields = line.split(FIELD_BOUNDARY);
      if (fields.length > MULTI_VALUE_FIELD) {
        fields[MULTI_VALUE_FIELD] = fields[MULTI_VALUE_FIELD].replace(/\n/g, ";");
      }
      return fields
        .join(FIELD_BOUNDARY)
        .replace(/[\n\t]/g, " ")
        .replace(/ +/g, " ");
    };

    const csvContent = [HEADER]
      .concat(resultSet)
      .map((line) => `${normalizeLine(line)}\r\n`)
      .join("");

    // Need to create a Blob because of the size of the content
    const blob = new Blob([csvContent], { type: "text/plain;charset=UTF-8" });
    const url = window.URL.createObjectURL(blob);
    try {
      const link = document.createElement("a");
      link.href = url;
      link.download = `cgcom_raw_${provIndex}.csv`;
      link.click();
    } finally {
      // Release the object URL even when triggering the download fails.
      window.URL.revokeObjectURL(url);
    }
  }
  81.  
  /**
   * Replacement for the site's own search handler, copied from the original
   * website with one change: the delayed (setTimeout, 500 ms) reload of the captcha
   * image "#recalcularImg" that followed the search has been left out.
   *
   * Clears the detail filter, resets the page number to 1 and submits the search
   * form through the page-provided enviarFormAjax helper.
   */
  function myBuscar() {
    const fieldResets = [
      ["#numeroColegiadoDetalle", null],
      ["#pagina", 1],
    ];
    for (const [selector, value] of fieldResets) {
      $(selector).val(value);
    }

    const request = {
      formId: "formBuscarColegiados",
      action: "PUBBusquedaPublica_busqueda_ajax.action",
      targetLayerId: "resultadoBusquedaColegiados",
    };
    enviarFormAjax(request.formId, request.action, request.targetLayerId, true);
  }
  94.  
  /**
   * Prepares the search page for scraping:
   *  - swaps the search button's inline onclick handler for myBuscar,
   *  - injects the status spans and the province/page range inputs in front of
   *    "#contenidoPublico" and wraps them in an absolutely positioned container,
   *  - selects the province at index 1 and keeps the selection in sync with the
   *    "start province" text box.
   */
  function initializeForm() {
    // Selects the province option at the given child index of "#comboProvincia".
    // The DOM `selected` property is set directly: setting the `selected`
    // attribute only changes the default state and does not reliably move the
    // live selection. Indexes without a matching option are ignored.
    const selectProvince = (index) => {
      const option = $("#comboProvincia").children().get(index);
      if (option) {
        option.selected = true;
      }
    };

    $("#busquedaPublicaColegiadoBuscar").removeAttr('onclick').bind("click", myBuscar);

    $("#contenidoPublico")
      .before("<span class='wrapAll' id='idProvidences' style='margin-left: 5px;'></span>")
      .before("<span class='wrapAll' id='idPages' style='margin-left: 20px;'></span>")
      .before("<span class='wrapAll' id='idResults' style='margin-left: 20px;'></span><br class='wrapAll'>")
      .before("<span class='wrapAll'>Province Scrape Range.... Start</span><input id='txtStartProvIndex' class='wrapAll' style='margin-right: 10px;' type='text'><span class='wrapAll'>End</span><input id='txtEndProvIndex' class='wrapAll' type='text'><br class='wrapAll'>")
      .before("<span class='wrapAll'>Start at Page:</span><input id='txtStartPage' class='wrapAll' style='margin-right: 10px;' type='text'>")
      .before("<span class='wrapAll'>End at Page:</span><input id='txtEndPage' class='wrapAll' style='margin-right: 10px;' type='text'>");

    // Gather every injected element into one floating panel.
    $(".wrapAll").wrapAll("<div style='position: absolute; right: 100px; top: 100px;' />");

    selectProvince(1);

    $("#txtStartProvIndex").change(() => {
      const index = Number.parseInt($("#txtStartProvIndex").val(), 10);
      if (!Number.isNaN(index)) {
        selectProvince(index);
      }
    });
  }
  114.  
  /**
   * Entry point: sets up the injected controls, pre-fills the search form and
   * registers the scraping routine on the search button.
   *
   * Each click of the search button scrapes one province: every result page is
   * walked, each result's detail tab is opened and parsed, and the collected rows
   * are downloaded as one CSV file. When more provinces remain in the requested
   * range, the next one is selected and the button is clicked programmatically,
   * which starts the routine again for that province.
   */
  (() => {
    'use strict';

    // Reads an integer from a text input; falls back when it is empty or not a number.
    const readInt = (selector, fallback) => {
      const parsed = Number.parseInt($(selector).val(), 10);
      return Number.isNaN(parsed) ? fallback : parsed;
    };

    // Child indexes of "#comboProvincia" are zero-based, so the last selectable
    // province is length - 1. Using length itself would add one extra run in
    // which no option gets selected and the previous province is scraped again.
    const lastProvinceIndex = () => $("#comboProvincia").children().length - 1;

    initializeForm();

    $("#txtStartProvIndex").val(1);
    $("#txtEndProvIndex").val(lastProvinceIndex());
    $("#txtStartPage").val("1");

    // Set default values
    $("#Nombre").val("%");
    $("#Apellido1").val("%");

    $("#codigoCaptcha").focus();

    let firstRun = true;

    $("#busquedaPublicaColegiadoBuscar").click(async () => {
      try {
        await waitLoading();

        let provIndex = readInt("#txtStartProvIndex", 1);
        const provStopIndex = readInt("#txtEndProvIndex", lastProvinceIndex());

        // This code only runs once: jump to the requested start page.
        if (firstRun) {
          // An empty or invalid start page falls back to 1 instead of writing NaN into the form.
          document.getElementById('formBuscarColegiados').pagina.value = readInt("#txtStartPage", 1);
          buscarPagina();

          await waitLoading();

          firstRun = false;
        }

        if (provIndex > provStopIndex) {
          return;
        }

        const results = [];
        let pagesLeft = true;

        do {
          $("#idProvidences").text(`Provinces left: ${provStopIndex - provIndex}`);
          $("#idPages").text(`Pages: ${$(".this-page").text()} / ${$("a.enlacePaginacion:nth-last-child(2)").text()}`);

          const images = $("table.resultados > tbody > tr > td:last-child > img");

          for (let i = 0; i < images.length; i++) {
            const icon = $(images[i]);

            // Skip blank ids which cause errors
            if (icon.parent().siblings(':first').text() === '') {
              continue;
            }

            // Opens the detail tab; parseTab waits for it, reads it and closes it again.
            icon.click();
            results.push(await parseTab());

            $("#idResults").text(`Records Collected: ${results.length}`);
          }

          const nextLink = $("a.enlacePaginacion:last");
          const endPage = String($("#txtEndPage").val()).trim();
          const currentPage = $(".this-page").text().trim();
          const hasNextPage = nextLink.text() === "Siguiente";
          const endPageReached = endPage !== "" && currentPage === endPage;

          pagesLeft = hasNextPage && !endPageReached;
          if (pagesLeft) {
            // Go to the next page (native click so the link's own handler runs)
            nextLink[0].click();
            await waitLoading();
          }
        } while (pagesLeft);

        // Download the results
        await downloadResultSet(results, provIndex);

        // Current result set has finished
        provIndex += 1;

        if (provIndex <= provStopIndex) {
          $("#txtStartProvIndex").val(provIndex);

          // Set the DOM property: the `selected` attribute alone does not
          // reliably change the live selection.
          const nextOption = $("#comboProvincia").children().get(provIndex);
          if (nextOption) {
            nextOption.selected = true;
          }

          // Starts the search for the next province and re-enters this handler.
          $("#busquedaPublicaColegiadoBuscar").click();

          await waitLoading();
        }
      } catch (err) {
        // The handler's promise is not observed by jQuery; report failures explicitly.
        console.error("cgcom scraper failed:", err);
      }
    });

  })();
  214.  
  215.