Avito Scraping Assistant

Assistente per navigare e scaricare dati da Avito con gestione manuale dei CAPTCHA e supporto corretto per caratteri cirillici

当前为 2025-03-11 提交的版本,查看 最新版本

您需要先安装一个扩展,例如 篡改猴、Greasemonkey 或 暴力猴,之后才能安装此脚本。

You will need to install an extension such as Tampermonkey to install this script.

您需要先安装一个扩展,例如 篡改猴 或 暴力猴,之后才能安装此脚本。

您需要先安装一个扩展,例如 篡改猴 或 Userscripts,之后才能安装此脚本。

您需要先安装一款用户脚本管理器扩展,例如 Tampermonkey,才能安装此脚本。

您需要先安装用户脚本管理器扩展后才能安装此脚本。

(我已经安装了用户脚本管理器,让我安装!)

您需要先安装一款用户样式管理器扩展,比如 Stylus,才能安装此样式。

您需要先安装一款用户样式管理器扩展,比如 Stylus,才能安装此样式。

您需要先安装一款用户样式管理器扩展,比如 Stylus,才能安装此样式。

您需要先安装一款用户样式管理器扩展后才能安装此样式。

您需要先安装一款用户样式管理器扩展后才能安装此样式。

您需要先安装一款用户样式管理器扩展后才能安装此样式。

(我已经安装了用户样式管理器,让我安装!)

// ==UserScript==
// @name         Avito Scraping Assistant
// @namespace    https://danielfragomeli.com/
// @version      1.2
// @description  Assistente per navigare e scaricare dati da Avito con gestione manuale dei CAPTCHA e supporto corretto per caratteri cirillici
// @author       dan098
// @match        *://*.avito.ru/*
// @grant        GM_setValue
// @grant        GM_getValue
// @grant        GM_deleteValue
// @grant        GM_listValues
// @grant        GM_setClipboard
// @grant        GM_xmlhttpRequest
// @grant        unsafeWindow
// @license      MIT
// @require      https://cdnjs.cloudflare.com/ajax/libs/jquery/3.6.0/jquery.min.js
// @require      https://cdnjs.cloudflare.com/ajax/libs/PapaParse/5.3.2/papaparse.min.js
// ==/UserScript==

(function() {
    'use strict';

    // CSS rules for the assistant's control panel: the fixed bottom-right container,
    // its buttons, the progress bar, the CAPTCHA alert and the data preview box.
    // The text is injected verbatim into a <style> element right after this constant.
    const css = `
        #avito-scraper-panel {
            position: fixed;
            bottom: 10px;
            right: 10px;
            background-color: #fff;
            border: 2px solid #0078d7;
            border-radius: 5px;
            padding: 10px;
            width: 400px;
            z-index: 10000;
            font-family: Arial, sans-serif;
            box-shadow: 0 0 10px rgba(0,0,0,0.2);
        }
        #avito-scraper-panel h3 {
            margin-top: 0;
            margin-bottom: 10px;
            color: #0078d7;
            border-bottom: 1px solid #eee;
            padding-bottom: 5px;
        }
        #avito-urls {
            width: 100%;
            height: 100px;
            margin-bottom: 10px;
            resize: vertical;
        }
        #avito-status {
            margin: 10px 0;
            padding: 5px;
            border-radius: 3px;
            background-color: #f0f0f0;
        }
        .avito-button {
            background-color: #0078d7;
            color: white;
            border: none;
            padding: 8px 12px;
            margin: 5px 5px 5px 0;
            border-radius: 3px;
            cursor: pointer;
        }
        .avito-button:hover {
            background-color: #005a9e;
        }
        .avito-button:disabled {
            background-color: #cccccc;
            cursor: not-allowed;
        }
        .progress-bar {
            height: 15px;
            background-color: #e0e0e0;
            border-radius: 5px;
            margin: 10px 0;
        }
        .progress-fill {
            height: 100%;
            background-color: #0078d7;
            border-radius: 5px;
            width: 0%;
            transition: width 0.3s;
        }
        .captcha-alert {
            color: #d61e00;
            font-weight: bold;
            margin-top: 5px;
            display: none;
        }
        .controls-row {
            display: flex;
            justify-content: space-between;
            align-items: center;
        }
        .pause-button {
            background-color: #ff9800;
        }
        .pause-button:hover {
            background-color: #e68a00;
        }
        .loading-indicator {
            color: blue;
            font-style: italic;
        }
        #current-url-container {
            margin: 10px 0;
            border: 1px solid #ddd;
            padding: 5px;
            border-radius: 3px;
            word-break: break-all;
            font-size: 12px;
        }
        #extracted-preview {
            margin: 10px 0;
            max-height: 100px;
            overflow-y: auto;
            border: 1px solid #ddd;
            padding: 5px;
            font-size: 12px;
            background-color: #f9f9f9;
        }
    `;

    // Inject the panel stylesheet into the document head
    const styleEl = Object.assign(document.createElement('style'), { textContent: css });
    document.head.appendChild(styleEl);

    // Build the control panel (URL input, status widgets, action buttons) in one
    // step and attach it to the page body
    const panel = Object.assign(document.createElement('div'), {
        id: 'avito-scraper-panel',
        innerHTML: `
        <h3>Avito Scraping Assistant</h3>
        <div>
            <textarea id="avito-urls" placeholder="Inserisci gli URL da visitare, uno per riga"></textarea>
            <div class="controls-row">
                <button id="load-urls" class="avito-button">Carica URLs</button>
                <button id="load-file" class="avito-button">Carica da File</button>
                <input type="file" id="url-file" style="display: none;" accept=".txt,.csv">
            </div>
        </div>
        <div id="current-url-container" style="display: none;">
            <strong>URL Corrente:</strong> <span id="current-url"></span>
        </div>
        <div class="progress-bar">
            <div class="progress-fill" id="progress-fill"></div>
        </div>
        <div id="avito-status">Stato: Pronto</div>
        <div class="captcha-alert" id="captcha-alert">⚠️ CAPTCHA rilevato! Risolvilo manualmente, poi clicca "Continua"</div>
        <div class="loading-indicator" id="loading-indicator" style="display: none;">Caricamento pagina in corso...</div>
        <div id="extracted-preview" style="display: none;"></div>
        <div>
            <button id="start-scraping" class="avito-button" disabled>Inizia</button>
            <button id="extract-data" class="avito-button" disabled>Estrai Dati</button>
            <button id="next-url" class="avito-button" disabled>Prossimo</button>
            <button id="pause-resume" class="avito-button pause-button" disabled>Pausa</button>
        </div>
        <div>
            <button id="export-csv" class="avito-button" disabled>Esporta CSV</button>
            <button id="clear-data" class="avito-button" disabled>Cancella Dati</button>
            <button id="view-data" class="avito-button" disabled>Visualizza Dati</button>
        </div>
    `,
    });

    document.body.appendChild(panel);

    // Scraping session state: the URL queue, the current position in it and the
    // records collected so far. The state is persisted through GM_setValue /
    // GM_getValue and mirrored onto the panel widgets (looked up by element id).
    class ScrapingState {
        /**
         * Starts from empty defaults, then restores whatever was persisted.
         */
        constructor() {
            this.urls = [];
            this.currentIndex = 0;
            this.extractedData = [];
            this.isPaused = false;
            this.isLoading = false;
            this.loadState();
        }

        /**
         * Escapes a value so it can be interpolated into HTML markup as plain text.
         * @param {*} value - Any value; it is stringified first.
         * @returns {string} The HTML-escaped string.
         */
        static escapeHtml(value) {
            return String(value)
                .replace(/&/g, '&amp;')
                .replace(/</g, '&lt;')
                .replace(/>/g, '&gt;')
                .replace(/"/g, '&quot;')
                .replace(/'/g, '&#39;');
        }

        /**
         * Parses a persisted JSON array, falling back to an empty array when the
         * stored value is missing, malformed or not an array.
         * @param {?string} raw - The stored JSON text (or a falsy value).
         * @param {string} storageKey - Key name, used only for the error log.
         * @returns {Array} The parsed array, or [].
         */
        static parseStoredArray(raw, storageKey) {
            if (!raw) {
                return [];
            }
            try {
                const parsed = JSON.parse(raw);
                return Array.isArray(parsed) ? parsed : [];
            } catch (e) {
                // A corrupt value must not prevent the whole script from starting
                console.error(`Stato salvato non valido per ${storageKey}:`, e);
                return [];
            }
        }

        /**
         * Restores URLs, position and extracted records from storage and refreshes
         * the panel accordingly.
         */
        loadState() {
            this.urls = ScrapingState
                .parseStoredArray(GM_getValue('avitoUrls', null), 'avitoUrls')
                .filter(url => typeof url === 'string');
            this.extractedData = ScrapingState
                .parseStoredArray(GM_getValue('avitoExtractedData', null), 'avitoExtractedData');

            const savedIndex = GM_getValue('avitoCurrentIndex', 0);
            this.currentIndex = Number.isInteger(savedIndex) && savedIndex >= 0 ? savedIndex : 0;

            if (this.urls.length > 0) {
                document.getElementById('avito-urls').value = this.urls.join('\n');
            }

            this.updateButtonStates();
            this.updateProgress();
        }

        /**
         * Persists URLs, position and extracted records.
         */
        saveState() {
            GM_setValue('avitoUrls', JSON.stringify(this.urls));
            GM_setValue('avitoCurrentIndex', this.currentIndex);
            GM_setValue('avitoExtractedData', JSON.stringify(this.extractedData));
        }

        /**
         * Replaces the URL queue, dropping blank entries.
         * The position is kept when the same list is loaded again, and reset to the
         * start when the list differs or the old position no longer fits.
         * @param {string[]} urlArray - URLs to visit, in order.
         */
        setUrls(urlArray) {
            const nextUrls = urlArray.filter(url => url.trim() !== '');
            const listChanged = nextUrls.length !== this.urls.length
                || nextUrls.some((url, i) => url !== this.urls[i]);

            this.urls = nextUrls;
            if (listChanged || this.currentIndex >= this.urls.length) {
                this.currentIndex = 0;
            }

            this.saveState();
            this.updateButtonStates();
            this.updateProgress();
        }

        /**
         * @returns {?string} The URL at the current position, or null when the
         *     position is past the end of the queue.
         */
        getCurrentUrl() {
            if (this.currentIndex < this.urls.length) {
                return this.urls[this.currentIndex];
            }
            return null;
        }

        /**
         * Advances to the next URL if there is one.
         * @returns {boolean} true when the position moved, false at the last URL.
         */
        moveToNext() {
            if (this.currentIndex < this.urls.length - 1) {
                this.currentIndex++;
                this.saveState();
                this.updateProgress();
                return true;
            }
            return false;
        }

        /**
         * Appends one extracted record, persists it and shows a short preview.
         * @param {Object} data - The record produced for the current page.
         */
        addExtractedData(data) {
            this.extractedData.push(data);
            this.saveState();
            this.setDataButtonsDisabled(false);

            // Show a preview of the record that was just stored
            this.showDataPreview(data);
        }

        /**
         * Enables or disables the three buttons that only make sense with data.
         * @param {boolean} disabled - New disabled flag for export/clear/view.
         */
        setDataButtonsDisabled(disabled) {
            document.getElementById('export-csv').disabled = disabled;
            document.getElementById('clear-data').disabled = disabled;
            document.getElementById('view-data').disabled = disabled;
        }

        /**
         * Renders a few key fields of a record in the preview box.
         * @param {Object} data - Record with 'seller name', 'Seller City' and Price.
         */
        showDataPreview(data) {
            const previewEl = document.getElementById('extracted-preview');
            previewEl.style.display = 'block';

            // The values come from the scraped page, so they are escaped before
            // being placed into markup
            const esc = ScrapingState.escapeHtml;
            const preview = `
                <strong>Dati estratti:</strong><br>
                Venditore: ${esc(data['seller name'])}<br>
                Città: ${esc(data['Seller City'])}<br>
                Prezzo: ${esc(data.Price)}<br>
                <small>(Dati salvati correttamente)</small>
            `;

            previewEl.innerHTML = preview;
        }

        /**
         * Refreshes the progress bar, the status line and the current-URL box.
         */
        updateProgress() {
            const progressElement = document.getElementById('progress-fill');
            const percent = this.urls.length > 0 ? (this.currentIndex / this.urls.length) * 100 : 0;
            progressElement.style.width = `${percent}%`;

            const statusElement = document.getElementById('avito-status');
            statusElement.textContent = `Stato: ${this.currentIndex}/${this.urls.length} URL processati`;

            const currentUrlContainer = document.getElementById('current-url-container');
            const currentUrlSpan = document.getElementById('current-url');
            const currentUrl = this.getCurrentUrl();

            if (currentUrl) {
                currentUrlContainer.style.display = 'block';
                currentUrlSpan.textContent = currentUrl;
            } else {
                currentUrlContainer.style.display = 'none';
            }
        }

        /**
         * Recomputes the disabled flag of every action button from the state.
         */
        updateButtonStates() {
            const startButton = document.getElementById('start-scraping');
            const extractButton = document.getElementById('extract-data');
            const nextButton = document.getElementById('next-url');
            const pauseButton = document.getElementById('pause-resume');

            startButton.disabled = this.urls.length === 0;

            // Guard against a null URL: includes(null) would search for "null"
            const currentUrl = this.getCurrentUrl();
            const isCurrentPage = currentUrl !== null && window.location.href.includes(currentUrl);
            extractButton.disabled = !isCurrentPage || this.isLoading;
            nextButton.disabled = this.currentIndex >= this.urls.length - 1 || this.isLoading;
            pauseButton.disabled = this.urls.length === 0;
            this.setDataButtonsDisabled(this.extractedData.length === 0);
        }

        /**
         * Sets the loading flag, toggles the loading indicator and refreshes buttons.
         * @param {boolean} isLoading - Whether a navigation is in progress.
         */
        setLoading(isLoading) {
            this.isLoading = isLoading;
            const loadingIndicator = document.getElementById('loading-indicator');
            loadingIndicator.style.display = isLoading ? 'block' : 'none';
            this.updateButtonStates();
        }

        /**
         * Discards all extracted records (the URL queue and position are kept).
         */
        clearData() {
            this.extractedData = [];
            this.saveState();
            this.setDataButtonsDisabled(true);
            document.getElementById('extracted-preview').style.display = 'none';
        }
    }

    // Single shared scraping-state instance; constructing it restores any saved
    // session and therefore requires the panel elements to already be in the DOM
    const state = new ScrapingState();

    /**
     * Extracts one record from the page currently displayed, stores it in the
     * shared state and updates the status line and the "next" button.
     *
     * Fields that cannot be located keep the placeholder 'N/A'. A missing seller
     * href also stays 'N/A' (it is never prefixed with the site origin).
     *
     * @returns {Object} The record that was stored.
     */
    function extractDataFromPage() {
        // Origin used to resolve relative seller links
        const siteOrigin = 'https://www.avito.ru';

        const data = {
            link: window.location.href,
            'seller name': 'N/A',
            'shop link': 'N/A',
            marketplace: 'Avito',
            product: 'Jewelry',
            image: 'N/A',
            description: 'N/A',
            'Seller City': 'N/A',
            Price: 'N/A',
            'Data scaricamento': new Date().toISOString().split('T')[0],
            // NOTE(review): fixed value, unlike 'Data scaricamento' - confirm it is
            // meant to stay constant rather than follow the current date
            month: '2025-03',
            Brand: 'Roberto Coin',
            Country: 'Russia',
            'Type of platform': 'Marketplace'
        };

        // Absolute XPaths of the page elements each field is read from
        const xpaths = {
            'seller name': "/html/body/div[1]/div/div[3]/div[1]/div/div[2]/div[3]/div/div[2]/div/div/div/div[3]/div[2]/div/div/div/div[1]/div/div[1]/div/div[1]/div/div/div/h3/a",
            'shop link': "/html/body/div[1]/div/div[3]/div[1]/div/div[2]/div[3]/div/div[2]/div/div/div/div[3]/div[2]/div/div/div/div[1]/div/div[1]/div/div[1]/div/div/div/h3/a",
            'image': "/html/body/div[1]/div/div[3]/div[1]/div/div[2]/div[3]/div/div[1]/div[2]/div[1]/div[1]/div/div/div/div/div[3]/img",
            'description': "/html/body/div[1]/div/div[3]/div[1]/div/div[2]/div[3]/div/div[1]/div[2]/div[2]/div/div/div/p[1]",
            'Seller City': "/html/body/div[1]/div/div[3]/div[1]/div/div[2]/div[3]/div/div[1]/div[2]/div[1]/div[2]/div/div/div[1]/div[1]/div/p[1]/span",
            'Price': "/html/body/div[1]/div/div[3]/div[1]/div/div[2]/div[3]/div/div[2]/div/div/div/div[1]/div/div[1]/div/div/div/span/span/span[1]"
        };

        // Evaluate each XPath; a failure on one field is logged and does not
        // prevent the remaining fields from being read
        for (const [field, xpath] of Object.entries(xpaths)) {
            try {
                const element = document.evaluate(
                    xpath,
                    document,
                    null,
                    XPathResult.FIRST_ORDERED_NODE_TYPE,
                    null
                ).singleNodeValue;

                if (!element) {
                    continue;
                }

                if (field === 'shop link') {
                    const href = element.getAttribute('href');
                    if (href) {
                        // Absolute links are kept verbatim; anything else is
                        // resolved against the site origin
                        data[field] = href.startsWith('http')
                            ? href
                            : new URL(href, siteOrigin).href;
                    }
                } else if (field === 'image') {
                    data[field] = element.getAttribute('src') || 'N/A';
                } else {
                    data[field] = element.textContent.trim() || 'N/A';
                }
            } catch (e) {
                console.error(`Errore nell'estrazione di ${field}:`, e);
            }
        }

        state.addExtractedData(data);

        // Report success on the status line
        const statusElement = document.getElementById('avito-status');
        statusElement.textContent = `Stato: Dati estratti da ${state.currentIndex + 1}/${state.urls.length} URL`;

        // Enable "next" only while URLs remain after the current one
        document.getElementById('next-url').disabled = state.currentIndex >= state.urls.length - 1;

        return data;
    }

    /**
     * Heuristically detects a CAPTCHA / block page.
     *
     * The assistant's own panel is excluded from both checks: it contains an
     * element whose class and id include "captcha" and, once shown, alert text
     * containing the word "CAPTCHA", which would otherwise make every page match.
     *
     * @returns {boolean} true when the page (outside the panel) matches one of the
     *     selectors or contains one of the keywords.
     */
    function detectCaptcha() {
        const ownPanel = document.getElementById('avito-scraper-panel');
        const isOutsidePanel = (el) => !ownPanel || !ownPanel.contains(el);

        // Simplified detection: adapt the selectors to Avito's real CAPTCHA markup
        const captchaSelectors = [
            'iframe[src*="captcha"]',
            'div[class*="captcha"]',
            'div[id*="captcha"]',
            'input[name*="captcha"]',
            'img[src*="captcha"]',
            'form[action*="captcha"]',
            'div.firewall-container'  // Avito firewall page
        ];

        for (const selector of captchaSelectors) {
            const matches = Array.from(document.querySelectorAll(selector));
            if (matches.some(isOutsidePanel)) {
                return true;
            }
        }

        // Keyword scan of the visible text. The panel's text is part of the body
        // text, so a keyword only counts when the body contains it more often than
        // the panel alone does.
        // NOTE(review): several keywords are common words; confirm they do not
        // occur on regular listing pages.
        const countOccurrences = (haystack, needle) => haystack.split(needle).length - 1;
        const bodyText = document.body.innerText.toLowerCase();
        const panelText = ownPanel ? (ownPanel.innerText || '').toLowerCase() : '';
        const captchaTexts = ['captcha', 'проверка', 'безопасность', 'подтвердите', 'robot', 'человек'];

        return captchaTexts.some(
            (text) => countOccurrences(bodyText, text) > countOccurrences(panelText, text)
        );
    }

    // Event wiring for the URL-loading and navigation buttons

    // One URL per line; lines are trimmed and blank ones dropped. The resulting
    // list is handed to the state and decides whether "Inizia" is enabled.
    const loadUrlsFromText = (rawText) => {
        const urlList = [];
        for (const line of rawText.split('\n')) {
            const trimmed = line.trim();
            if (trimmed !== '') {
                urlList.push(trimmed);
            }
        }
        state.setUrls(urlList);
        document.getElementById('start-scraping').disabled = urlList.length === 0;
    };

    // Marks the state as loading and navigates; does nothing without a target
    const navigateTo = (targetUrl) => {
        if (!targetUrl) {
            return;
        }
        state.setLoading(true);
        window.location.href = targetUrl;
    };

    // "Carica URLs": read the list from the textarea
    document.getElementById('load-urls').addEventListener('click', () => {
        loadUrlsFromText(document.getElementById('avito-urls').value);
    });

    // "Carica da File": forward the click to the hidden file input
    document.getElementById('load-file').addEventListener('click', () => {
        document.getElementById('url-file').click();
    });

    // File chosen: show its text in the textarea, then load it as the URL list
    document.getElementById('url-file').addEventListener('change', (e) => {
        const chosenFile = e.target.files[0];
        if (!chosenFile) {
            return;
        }
        const reader = new FileReader();
        reader.onload = (event) => {
            const fileText = event.target.result;
            document.getElementById('avito-urls').value = fileText;
            loadUrlsFromText(fileText);
        };
        reader.readAsText(chosenFile);
    });

    // "Inizia": open the URL at the current position
    document.getElementById('start-scraping').addEventListener('click', () => {
        navigateTo(state.getCurrentUrl());
    });

    // "Estrai Dati": extract from the current page and make sure the preview shows
    document.getElementById('extract-data').addEventListener('click', () => {
        extractDataFromPage();
        document.getElementById('extracted-preview').style.display = 'block';
    });

    // "Prossimo": advance and navigate, or report that the queue is finished
    document.getElementById('next-url').addEventListener('click', () => {
        if (!state.moveToNext()) {
            alert('Hai completato tutti gli URL!');
            return;
        }
        navigateTo(state.getCurrentUrl());
    });

    // "Pausa"/"Riprendi": flip the paused flag and relabel the button
    document.getElementById('pause-resume').addEventListener('click', () => {
        state.isPaused = !state.isPaused;
        const label = state.isPaused ? 'Riprendi' : 'Pausa';
        document.getElementById('pause-resume').textContent = label;
    });

    /**
     * Exports the extracted records as a UTF-8 CSV download.
     *
     * A byte order mark is prepended so spreadsheet programs recognise the
     * encoding. Alerts and returns when there is nothing to export. The temporary
     * link is always removed and the blob object URL is released after the click.
     */
    function exportCSV() {
        if (state.extractedData.length === 0) {
            alert('Nessun dato da esportare.');
            return;
        }

        // PapaParse settings: always quote values, CRLF line endings, header row
        const csvConfig = {
            quotes: true,
            quoteChar: '"',
            escapeChar: '"',
            delimiter: ",",
            header: true,
            newline: "\r\n",
            skipEmptyLines: false
        };

        const csv = Papa.unparse(state.extractedData, csvConfig);

        // BOM (Byte Order Mark) so Excel and similar tools detect UTF-8
        const BOM = "\uFEFF";
        const blob = new Blob([BOM + csv], { type: 'text/csv;charset=utf-8;' });
        const url = URL.createObjectURL(blob);

        // Trigger the download through a temporary link
        const link = document.createElement('a');
        link.href = url;
        link.setAttribute('download', `avito_data_${new Date().toISOString().split('T')[0]}.csv`);
        document.body.appendChild(link);
        try {
            link.click();
        } finally {
            document.body.removeChild(link);
            // Release the blob; deferred one tick so the download has been started
            setTimeout(() => URL.revokeObjectURL(url), 0);
        }
    }

    // "Esporta CSV": hand the click straight to the exporter
    document.getElementById('export-csv').addEventListener('click', exportCSV);

    // "Cancella Dati": ask for confirmation, then drop the records and report it
    document.getElementById('clear-data').addEventListener('click', () => {
        const confirmed = confirm('Sei sicuro di voler cancellare tutti i dati estratti?');
        if (!confirmed) {
            return;
        }
        state.clearData();
        document.getElementById('avito-status').textContent = 'Stato: Dati cancellati';
    });

    // "Visualizza Dati": render all extracted records as a table in a popup window.
    // Keys and values are HTML-escaped because they originate from scraped pages.
    document.getElementById('view-data').addEventListener('click', () => {
        if (state.extractedData.length === 0) {
            alert('Nessun dato da visualizzare.');
            return;
        }

        const escapeHtml = (value) => String(value)
            .replace(/&/g, '&amp;')
            .replace(/</g, '&lt;')
            .replace(/>/g, '&gt;')
            .replace(/"/g, '&quot;')
            .replace(/'/g, '&#39;');

        // window.open returns null when the popup is blocked
        const dataWindow = window.open('', 'Dati Estratti', 'width=800,height=600');
        if (!dataWindow) {
            alert('Impossibile aprire la finestra dei dati: consenti i popup per questo sito.');
            return;
        }

        // Column headers come from the first record's keys
        const headerCells = Object.keys(state.extractedData[0])
            .map(key => `<th>${escapeHtml(key)}</th>`)
            .join('');
        const bodyRows = state.extractedData.map(record => `
                                <tr>
                                    ${Object.values(record).map(value => `<td>${escapeHtml(value)}</td>`).join('')}
                                </tr>
                            `).join('');

        dataWindow.document.write(`
            <html>
            <head>
                <title>Dati Estratti da Avito</title>
                <meta charset="UTF-8">
                <style>
                    body { font-family: Arial, sans-serif; margin: 20px; }
                    table { border-collapse: collapse; width: 100%; }
                    th, td { border: 1px solid #ddd; padding: 8px; text-align: left; }
                    th { background-color: #f2f2f2; }
                    tr:nth-child(even) { background-color: #f9f9f9; }
                    .container { max-width: 100%; overflow-x: auto; }
                </style>
            </head>
            <body>
                <h2>Dati Estratti da Avito (${state.extractedData.length} record)</h2>
                <div class="container">
                    <table>
                        <thead>
                            <tr>
                                ${headerCells}
                            </tr>
                        </thead>
                        <tbody>
                            ${bodyRows}
                        </tbody>
                    </table>
                </div>
            </body>
            </html>
        `);
        // Finish the document so a later click replaces the table instead of
        // appending another copy to the reused window
        dataWindow.document.close();
    });

    /**
     * Periodic check: shows or hides the CAPTCHA alert and, once the document has
     * finished loading without a CAPTCHA, clears the loading flag and enables
     * extraction when the page address contains the URL being processed.
     */
    function checkPageStatus() {
        const alertBox = document.getElementById('captcha-alert');
        const captchaPresent = detectCaptcha();
        alertBox.style.display = captchaPresent ? 'block' : 'none';

        if (captchaPresent) {
            state.setLoading(false);
            return;
        }

        // Nothing more to do until the document has fully loaded
        if (document.readyState !== 'complete') {
            return;
        }

        state.setLoading(false);

        const expectedUrl = state.getCurrentUrl();
        if (expectedUrl && window.location.href.includes(expectedUrl)) {
            document.getElementById('extract-data').disabled = false;
        }
    }

    // Re-run the CAPTCHA / readiness check once per second
    setInterval(checkPageStatus, 1000);

    // Extra wait after the window "load" event, giving late-rendered elements
    // time to appear before the panel is refreshed
    const LOAD_SETTLE_DELAY_MS = 1500;

    // Clears the loading flag, refreshes the buttons and enables extraction when
    // the page address contains the URL currently being processed
    function refreshPanelAfterLoad() {
        state.setLoading(false);
        state.updateButtonStates();

        const expectedUrl = state.getCurrentUrl();
        const onExpectedPage = expectedUrl && window.location.href.includes(expectedUrl);
        if (onExpectedPage) {
            document.getElementById('extract-data').disabled = false;
        }
    }

    window.addEventListener('load', () => {
        setTimeout(refreshPanelAfterLoad, LOAD_SETTLE_DELAY_MS);
    });

    console.log('Avito Scraping Assistant avviato con supporto UTF-8');
})();