cgcom-interno.vuds-omc.es website scraper

Scrapes the website in csv format

您需要先安裝使用者腳本管理器擴展,如 TampermonkeyGreasemonkeyViolentmonkey 之後才能安裝該腳本。

You will need to install an extension such as Tampermonkey to install this script.

您需要先安裝使用者腳本管理器擴充功能,如 TampermonkeyViolentmonkey 後才能安裝該腳本。

您需要先安裝使用者腳本管理器擴充功能,如 TampermonkeyUserscripts 後才能安裝該腳本。

你需要先安裝一款使用者腳本管理器擴展,比如 Tampermonkey,才能安裝此腳本

您需要先安裝使用者腳本管理器擴充功能後才能安裝該腳本。

(我已經安裝了使用者腳本管理器,讓我安裝!)

你需要先安裝一款使用者樣式管理器擴展,比如 Stylus,才能安裝此樣式

你需要先安裝一款使用者樣式管理器擴展,比如 Stylus,才能安裝此樣式

你需要先安裝一款使用者樣式管理器擴展,比如 Stylus,才能安裝此樣式

你需要先安裝一款使用者樣式管理器擴展後才能安裝此樣式

你需要先安裝一款使用者樣式管理器擴展後才能安裝此樣式

你需要先安裝一款使用者樣式管理器擴展後才能安裝此樣式

(我已經安裝了使用者樣式管理器,讓我安裝!)

// ==UserScript==
// @name         cgcom-interno.vuds-omc.es website scraper
// @namespace    http://tampermonkey.net/
// @version      0.13
// @description  Scrapes the website in csv format
// @author       You
// @match        https://cgcom-interno.cgcom.es/RegistroMedicos/PUBBusquedaPublica_busqueda.action
// ==/UserScript==

function parseTab() {
    return new Promise((resolve) => {
        var checkExist = setInterval(() => {
            if($("[id^=tabBusqueda_] > table > tbody > tr > td:last-child").length) {
                clearInterval(checkExist);

                var row = "";

                $("[id^=tabBusqueda_] > table > tbody > tr > td:last-child").each(function() {
                    // This accounts for multiple items in a field (i.e. Specialties)
                    row += '"' + this.innerHTML.trim().replace(/(<br>)+/g, ";").replace(/;$/, "").replace(/<!--(.)+-->/g, '').replace(/"/g, "").trim() + '",';
                });

                // Sometimes there's an extra field in the table, so add it as blank if it doesn't exist
                if($("[id^=tabBusqueda_] > table > tbody > tr > td:last-child").length === 6) {
                    row += '""';
                }
                else {
                    row = row.substring(0, row.length - 1);
                }

                // Close the tab
                $("[id^=botonCerrar_tabBusqueda_]").click();

                resolve(row);
            }
        }, 100); // check every 100ms
    });
}

function waitLoading() {
    return new Promise((resolve) => {
        var checkExist = setInterval(() => {
            if(!$(".blockUI").length) {
                clearInterval(checkExist);

                resolve();
            }
        }, 100); // check every 100ms
    });
}

function downloadResultSet(resultSet, provIndex) {
    return new Promise((resolve) => {
        var results = ['"Colegiado","Nombre","Provincia","Especialidad","Estado","DireccionTrabajo","ValidaciónPeriódicaColegiación"'];

        results = results.concat(resultSet);

        let csvContent = "";
        results.forEach(function(line){
            var lines = line.split(',');
            lines[6] = lines[6].replace(/[\n]/g, ";");
            line = lines.join(',');
            var cleaned = line.replace(/[\n\t]/g, " ").replace(/[ ]+/g, " ");
            csvContent += cleaned + "\r\n";
        });

        // Need to create a Blob because of size of content
        var blob = new Blob([csvContent], {encoding:"UTF-8", type:"text/plain;charset=UTF-8"});
        var url = window.URL.createObjectURL(blob);
        var a = document.createElement("a");
        a.href = url;
        a.download = `cgcom_raw_${provIndex}.csv`;
        a.click();
        window.URL.revokeObjectURL(url);

        results = [];

        resolve();
    });
}

// Ripped from original website to remove the setTimeout capatcha refresh
var myBuscar = function myBuscar() {
    $("#numeroColegiadoDetalle").val(null);
    $("#pagina").val(1);

    var idForm = "formBuscarColegiados";
    var action = "PUBBusquedaPublica_busqueda_ajax.action";
    var idCapaContenido = "resultadoBusquedaColegiados";
    enviarFormAjax(idForm, action, idCapaContenido, true);
    //setTimeout(function(){$("#recalcularImg").attr("src",$("#contextUrl").val()+"/Captcha.png?"+Date.now());},500);
    //console.log($("#contextUrl").val()+"/Captcha.png?"+Date.now());
};

function initializeForm() {
    $("#busquedaPublicaColegiadoBuscar").removeAttr('onclick').bind( "click", myBuscar);

    $("#contenidoPublico")
        .before("<span class='wrapAll' id='idProvidences' style='margin-left: 5px;'></span>")
        .before("<span class='wrapAll' id='idPages' style='margin-left: 20px;'></span>")
        .before("<span class='wrapAll' id='idResults' style='margin-left: 20px;'></span><br class='wrapAll'>")
        .before("<span class='wrapAll'>Province Scrape Range.... Start</span><input id='txtStartProvIndex' class='wrapAll' style='margin-right: 10px;' type='text'><span class='wrapAll'>End</span><input id='txtEndProvIndex' class='wrapAll' type='text'><br class='wrapAll'>")
        .before("<span class='wrapAll'>Start at Page:</span><input id='txtStartPage' class='wrapAll' style='margin-right: 10px;' type='text'>")
        .before("<span class='wrapAll'>End at Page:</span><input id='txtEndPage' class='wrapAll' style='margin-right: 10px;' type='text'>");

    $( ".wrapAll" ).wrapAll( "<div style='position: absolute; right: 100px; top: 100px;' />");

    $("#comboProvincia").children().eq(1).attr('selected', true);

    $("#txtStartProvIndex").change(() => {
        $("#comboProvincia").children().eq($("#txtStartProvIndex").val()).attr('selected', true);
    });
}

(() => {
    'use strict';

    initializeForm();

    $("#txtStartProvIndex").val(1);
    $("#txtEndProvIndex").val($("#comboProvincia").children().length);
    $("#txtStartPage").val("1");

    // Set default values
    $("#Nombre").val("%");
    $("#Apellido1").val("%");

    $("#codigoCaptcha").focus();

    var firstRun = true;

    $("#busquedaPublicaColegiadoBuscar").click(async () => {
        await waitLoading();

        var provIndex = parseInt($("#txtStartProvIndex").val() || 1);
        var provStopIndex = parseInt($("#txtEndProvIndex").val() || $("#comboProvincia").children().length);

        // This code is only ran once
        if(firstRun) {
            // Set the page
            document.getElementById('formBuscarColegiados').pagina.value = parseInt($("#txtStartPage").val());
            buscarPagina();

            await waitLoading();

            firstRun = false;
        }


        if(provIndex <= provStopIndex) {
            var pagesLeft = true;
            var results = [];

            do {

                var providencesLeft = provStopIndex - provIndex;

                $("#idProvidences").text("Provinces left: " + providencesLeft);
                $("#idPages").text("Pages: " + $(".this-page").text() + " / " + $("a.enlacePaginacion:nth-last-child(2)").text());

                var images = $("table.resultados > tbody  > tr > td:last-child > img");

                for(var i = 0; i < images.length; i++) {
                    if($(images[i]).parent().siblings(':first').text() === '') {
                        // Skip blank ids which cause errors
                        continue;
                    }

                    $(images[i]).click();

                    var row = await parseTab();

                    results.push(row);

                    var resultString = "Records Collected: " + results.length;

                    $("#idResults").text(resultString);

                 //   console.log(row);
                };

                images = null;

                if($("a.enlacePaginacion:last").text() === "Siguiente" && ( $("#txtEndPage").val() === "" || $(".this-page").text() != $("#txtEndPage").val() ) ) {
                    // Go to the next page
                    pagesLeft = true;
                    $("a.enlacePaginacion:last")[0].click();
                    await waitLoading();
                }
                else {
                    pagesLeft = false;
                }
            } while (pagesLeft);

            // Download the results
            await downloadResultSet(results, provIndex);

            // Current result set has finished
            provIndex += 1;

            results = [];

            if(provIndex <= provStopIndex) {
                $("#txtStartProvIndex").val(provIndex);
                $("#comboProvincia").children().eq(provIndex).attr('selected', true);
                $("#busquedaPublicaColegiadoBuscar").click();

                await waitLoading();
            }
       }
    });

})();