// Scrapes the website in CSV format
// ==UserScript==
// @name cgcom-interno.vuds-omc.es website scraper
// @namespace http://tampermonkey.net/
// @version 0.13
// @description Scrapes the website in csv format
// @author You
// @match https://cgcom-interno.cgcom.es/RegistroMedicos/PUBBusquedaPublica_busqueda.action
// ==/UserScript==
/**
 * Converts the innerHTML of one detail-table cell into plain CSV field text.
 *
 * Steps, in order:
 *  1. HTML comments are removed first (non-greedy, may span lines) so that text
 *     sitting between two comments is kept and a comment cannot hide a trailing <br>.
 *  2. Runs of <br> become a single ";" so multi-valued cells (i.e. Specialties)
 *     stay inside one field.
 *  3. Double quotes are dropped because the caller wraps the field in quotes.
 *  4. Surrounding whitespace and one trailing ";" are stripped.
 *
 * @param {string} html - Raw innerHTML of the cell.
 * @returns {string} Cleaned field text without enclosing quotes.
 */
function cleanCellHtml(html) {
    return html
        .replace(/<!--[\s\S]*?-->/g, "")
        .replace(/(<br>)+/g, ";")
        .replace(/"/g, "")
        .trim()
        .replace(/;$/, "")
        .trim();
}

/**
 * Waits for the detail tab to render, reads it as one CSV row and closes it.
 *
 * Polls every 100 ms until at least one cell matches
 * "[id^=tabBusqueda_] > table > tbody > tr > td:last-child". Each matching cell
 * becomes one double-quoted field; fields are joined with ",".
 *
 * @returns {Promise<string>} Resolves with the CSV row (no line terminator).
 *     The promise never rejects and has no timeout.
 */
function parseTab() {
    const cellSelector = "[id^=tabBusqueda_] > table > tbody > tr > td:last-child";
    return new Promise((resolve) => {
        const checkExist = setInterval(() => {
            // Query once per poll and reuse the result for reading and counting.
            const cells = $(cellSelector);
            if (!cells.length) {
                return;
            }
            clearInterval(checkExist);

            const fields = [];
            cells.each(function () {
                fields.push('"' + cleanCellHtml(this.innerHTML) + '"');
            });
            // Sometimes there's an extra field in the table, so add it as blank if it doesn't exist
            if (cells.length === 6) {
                fields.push('""');
            }

            // Close the tab
            $("[id^=botonCerrar_tabBusqueda_]").click();
            resolve(fields.join(","));
        }, 100); // check every 100ms
    });
}
/**
 * Resolves once no element with the class "blockUI" is present in the page.
 *
 * The check runs on a 100 ms interval, so the first check happens 100 ms after
 * the call and the promise never settles synchronously. There is no timeout.
 *
 * @returns {Promise<undefined>} Resolves with no value; never rejects.
 */
function waitLoading() {
    const POLL_INTERVAL_MS = 100;
    return new Promise((done) => {
        const timer = setInterval(function pollOverlay() {
            const overlayCount = $(".blockUI").length;
            if (overlayCount) {
                // Overlay still present: try again on the next tick.
                return;
            }
            clearInterval(timer);
            done();
        }, POLL_INTERVAL_MS);
    });
}
/**
 * Normalises one CSV line before it is written to the download.
 *
 * In the 7th field, newlines become ";" so multiple entries stay distinguishable;
 * everywhere else newlines and tabs become a space, and runs of spaces collapse
 * to a single space.
 *
 * Fields are located by splitting on the quoted separator '","' when the line
 * contains it, so a comma inside a value (for example an address) does not shift
 * the field index. Lines without that separator fall back to a plain "," split.
 * Lines with fewer than 7 fields are left without the ";" conversion instead of
 * throwing.
 *
 * @param {string} line - One CSV line without a line terminator.
 * @returns {string} The normalised line.
 */
function formatCsvLine(line) {
    const separator = line.includes('","') ? '","' : ',';
    const fields = line.split(separator);
    if (fields.length > 6) {
        fields[6] = fields[6].replace(/\n/g, ";");
    }
    return fields.join(separator).replace(/[\n\t]/g, " ").replace(/ +/g, " ");
}

/**
 * Builds a CSV document from the collected rows and triggers a browser download
 * named "cgcom_raw_<provIndex>.csv".
 *
 * A fixed header line is prepended, every line is normalised with formatCsvLine()
 * and terminated with "\r\n".
 *
 * @param {string[]} resultSet - CSV rows without line terminators.
 * @param {number} provIndex - Index used in the downloaded file name.
 * @returns {Promise<undefined>} Resolves after the download link has been clicked
 *     and the object URL revoked.
 */
function downloadResultSet(resultSet, provIndex) {
    return new Promise((resolve) => {
        const header = '"Colegiado","Nombre","Provincia","Especialidad","Estado","DireccionTrabajo","ValidaciónPeriódicaColegiación"';
        const lines = [header].concat(resultSet);
        const csvContent = lines.map((line) => formatCsvLine(line) + "\r\n").join("");

        // Need to create a Blob because of size of content
        const blob = new Blob([csvContent], { type: "text/plain;charset=UTF-8" });
        const url = window.URL.createObjectURL(blob);
        const a = document.createElement("a");
        a.href = url;
        a.download = `cgcom_raw_${provIndex}.csv`;
        a.click();
        window.URL.revokeObjectURL(url);
        resolve();
    });
}
// Copy of the original website's search handler with its setTimeout captcha refresh removed.
// The removed step waited 500 ms and then pointed #recalcularImg at
// $("#contextUrl").val() + "/Captcha.png?" + Date.now().
var myBuscar = function myBuscar() {
    // Clear the detail field and restart from the first page.
    $("#numeroColegiadoDetalle").val(null);
    $("#pagina").val(1);
    // enviarFormAjax is supplied by the host page; arguments are the form id, the
    // action, the id of the results container and a flag whose meaning is not
    // visible here (NOTE(review): confirm against the site's script).
    enviarFormAjax(
        "formBuscarColegiados",
        "PUBBusquedaPublica_busqueda_ajax.action",
        "resultadoBusquedaColegiados",
        true
    );
};
/**
 * Prepares the search page for scraping.
 *
 *  - Replaces the search button's inline onclick with myBuscar.
 *  - Inserts the status spans and the range inputs before #contenidoPublico,
 *    then wraps everything carrying the "wrapAll" class in one absolutely
 *    positioned container.
 *  - Marks the option at index 1 of #comboProvincia as selected and keeps the
 *    selection in step with the value typed into #txtStartProvIndex.
 */
function initializeForm() {
    const searchButton = $("#busquedaPublicaColegiadoBuscar");
    searchButton.removeAttr('onclick');
    searchButton.bind("click", myBuscar);

    // Inserted one by one, in this order, directly before #contenidoPublico.
    const controlFragments = [
        "<span class='wrapAll' id='idProvidences' style='margin-left: 5px;'></span>",
        "<span class='wrapAll' id='idPages' style='margin-left: 20px;'></span>",
        "<span class='wrapAll' id='idResults' style='margin-left: 20px;'></span><br class='wrapAll'>",
        "<span class='wrapAll'>Province Scrape Range.... Start</span><input id='txtStartProvIndex' class='wrapAll' style='margin-right: 10px;' type='text'><span class='wrapAll'>End</span><input id='txtEndProvIndex' class='wrapAll' type='text'><br class='wrapAll'>",
        "<span class='wrapAll'>Start at Page:</span><input id='txtStartPage' class='wrapAll' style='margin-right: 10px;' type='text'>",
        "<span class='wrapAll'>End at Page:</span><input id='txtEndPage' class='wrapAll' style='margin-right: 10px;' type='text'>",
    ];
    const publicContent = $("#contenidoPublico");
    for (const fragment of controlFragments) {
        publicContent.before(fragment);
    }
    $(".wrapAll").wrapAll("<div style='position: absolute; right: 100px; top: 100px;' />");

    const provinceOptions = () => $("#comboProvincia").children();
    provinceOptions().eq(1).attr('selected', true);

    const startIndexInput = $("#txtStartProvIndex");
    startIndexInput.change(() => {
        // The typed value is handed to .eq() unchanged, as a string.
        const typedIndex = $("#txtStartProvIndex").val();
        provinceOptions().eq(typedIndex).attr('selected', true);
    });
}
/**
 * Script entry point.
 *
 * Sets up the extra controls, fills in default search values and registers the
 * scraping handler on the search button. One click scrapes one province:
 *
 *  1. Wait for the search triggered by the same click to finish loading.
 *  2. On the very first run only, jump to the page typed into #txtStartPage.
 *  3. For every result page, open each record, collect its CSV row, then follow
 *     "Siguiente" until it is missing or the page in #txtEndPage is reached.
 *  4. Download the rows as one CSV file for the current province index.
 *  5. If provinces remain, select the next option and click the search button
 *     again, which re-enters this handler for the next province.
 */
(() => {
    'use strict';

    // Options are addressed by zero-based position through .eq(), so the last
    // usable index is one less than the number of options. Using the raw count
    // made the final iteration select nothing and scrape the last province twice.
    const lastProvinceIndex = () => $("#comboProvincia").children().length - 1;

    initializeForm();
    $("#txtStartProvIndex").val(1);
    $("#txtEndProvIndex").val(lastProvinceIndex());
    $("#txtStartPage").val("1");
    // Set default values
    $("#Nombre").val("%");
    $("#Apellido1").val("%");
    $("#codigoCaptcha").focus();

    let firstRun = true;
    $("#busquedaPublicaColegiadoBuscar").click(async () => {
        await waitLoading();
        let provIndex = Number.parseInt($("#txtStartProvIndex").val() || 1, 10);
        const provStopIndex = Number.parseInt($("#txtEndProvIndex").val() || lastProvinceIndex(), 10);

        // This code is only run once
        if (firstRun) {
            // Set the page; an empty or non-numeric start page falls back to page 1
            // instead of submitting NaN. buscarPagina is supplied by the host page.
            document.getElementById('formBuscarColegiados').pagina.value =
                Number.parseInt($("#txtStartPage").val(), 10) || 1;
            buscarPagina();
            await waitLoading();
            firstRun = false;
        }

        if (provIndex > provStopIndex) {
            return;
        }

        const results = [];
        let pagesLeft = true;
        do {
            $("#idProvidences").text(`Provinces left: ${provStopIndex - provIndex}`);
            $("#idPages").text(`Pages: ${$(".this-page").text()} / ${$("a.enlacePaginacion:nth-last-child(2)").text()}`);

            const images = $("table.resultados > tbody > tr > td:last-child > img");
            for (let i = 0; i < images.length; i++) {
                const image = $(images[i]);
                if (image.parent().siblings(':first').text() === '') {
                    // Skip blank ids which cause errors
                    continue;
                }
                image.click();
                // Records are opened one at a time; parseTab closes the tab it read.
                const row = await parseTab();
                results.push(row);
                $("#idResults").text(`Records Collected: ${results.length}`);
            }

            const endPage = $("#txtEndPage").val();
            const hasNextLink = $("a.enlacePaginacion:last").text() === "Siguiente";
            const reachedEndPage = endPage !== "" && $(".this-page").text() === endPage;
            if (hasNextLink && !reachedEndPage) {
                // Go to the next page
                pagesLeft = true;
                $("a.enlacePaginacion:last")[0].click();
                await waitLoading();
            } else {
                pagesLeft = false;
            }
        } while (pagesLeft);

        // Download the results
        await downloadResultSet(results, provIndex);

        // Current result set has finished
        provIndex += 1;
        if (provIndex <= provStopIndex) {
            $("#txtStartProvIndex").val(provIndex);
            $("#comboProvincia").children().eq(provIndex).attr('selected', true);
            // Triggers the next search and re-enters this handler for the next province.
            $("#busquedaPublicaColegiadoBuscar").click();
            await waitLoading();
        }
    });
})();