您需要先安装一个扩展,例如 篡改猴、Greasemonkey 或 暴力猴,之后才能安装此脚本。
您需要先安装一个扩展,例如 篡改猴 或 暴力猴,之后才能安装此脚本。
您需要先安装一个扩展,例如 篡改猴 或 暴力猴,之后才能安装此脚本。
您需要先安装一个扩展,例如 篡改猴 或 Userscripts,之后才能安装此脚本。
您需要先安装一款用户脚本管理器扩展,例如 Tampermonkey,才能安装此脚本。
您需要先安装用户脚本管理器扩展后才能安装此脚本。
Assistente per navigare e scaricare dati da Avito con gestione manuale dei CAPTCHA e supporto corretto per caratteri cirillici
// ==UserScript==
// @name         Avito Scraping Assistant
// @namespace    https://danielfragomeli.com/
// @version      1.2
// @description  Assistente per navigare e scaricare dati da Avito con gestione manuale dei CAPTCHA e supporto corretto per caratteri cirillici
// @author       dan098
// @match        *://*.avito.ru/*
// @grant        GM_setValue
// @grant        GM_getValue
// @grant        GM_deleteValue
// @grant        GM_listValues
// @grant        GM_setClipboard
// @grant        GM_xmlhttpRequest
// @grant        unsafeWindow
// @license      MIT
// @require      https://cdnjs.cloudflare.com/ajax/libs/jquery/3.6.0/jquery.min.js
// @require      https://cdnjs.cloudflare.com/ajax/libs/PapaParse/5.3.2/papaparse.min.js
// ==/UserScript==

/*
 * Injects a fixed control panel into the page. The user loads a list of URLs,
 * navigates through them one at a time, extracts a fixed set of fields from
 * each page via XPath, and exports the collected records as CSV.
 * URL list, current index and extracted records are persisted with
 * GM_setValue/GM_getValue so they survive the page navigations the script
 * itself triggers.
 */
(function() {
    'use strict';

    // Base used to resolve relative shop links.
    const AVITO_ORIGIN = 'https://www.avito.ru';

    // Keys under which the state is persisted through GM_setValue/GM_getValue.
    const STORAGE_KEYS = Object.freeze({
        urls: 'avitoUrls',
        index: 'avitoCurrentIndex',
        data: 'avitoExtractedData',
    });

    // CSS for the control panel.
    const css = `
        #avito-scraper-panel {
            position: fixed;
            bottom: 10px;
            right: 10px;
            background-color: #fff;
            border: 2px solid #0078d7;
            border-radius: 5px;
            padding: 10px;
            width: 400px;
            z-index: 10000;
            font-family: Arial, sans-serif;
            box-shadow: 0 0 10px rgba(0,0,0,0.2);
        }
        #avito-scraper-panel h3 {
            margin-top: 0;
            margin-bottom: 10px;
            color: #0078d7;
            border-bottom: 1px solid #eee;
            padding-bottom: 5px;
        }
        #avito-urls {
            width: 100%;
            height: 100px;
            margin-bottom: 10px;
            resize: vertical;
        }
        #avito-status {
            margin: 10px 0;
            padding: 5px;
            border-radius: 3px;
            background-color: #f0f0f0;
        }
        .avito-button {
            background-color: #0078d7;
            color: white;
            border: none;
            padding: 8px 12px;
            margin: 5px 5px 5px 0;
            border-radius: 3px;
            cursor: pointer;
        }
        .avito-button:hover {
            background-color: #005a9e;
        }
        .avito-button:disabled {
            background-color: #cccccc;
            cursor: not-allowed;
        }
        .progress-bar {
            height: 15px;
            background-color: #e0e0e0;
            border-radius: 5px;
            margin: 10px 0;
        }
        .progress-fill {
            height: 100%;
            background-color: #0078d7;
            border-radius: 5px;
            width: 0%;
            transition: width 0.3s;
        }
        .captcha-alert {
            color: #d61e00;
            font-weight: bold;
            margin-top: 5px;
            display: none;
        }
        .controls-row {
            display: flex;
            justify-content: space-between;
            align-items: center;
        }
        .pause-button {
            background-color: #ff9800;
        }
        .pause-button:hover {
            background-color: #e68a00;
        }
        .loading-indicator {
            color: blue;
            font-style: italic;
        }
        #current-url-container {
            margin: 10px 0;
            border: 1px solid #ddd;
            padding: 5px;
            border-radius: 3px;
            word-break: break-all;
            font-size: 12px;
        }
        #extracted-preview {
            margin: 10px 0;
            max-height: 100px;
            overflow-y: auto;
            border: 1px solid #ddd;
            padding: 5px;
            font-size: 12px;
            background-color: #f9f9f9;
        }
    `;

    // Add the styles to the page.
    const styleEl = document.createElement('style');
    styleEl.textContent = css;
    document.head.appendChild(styleEl);

    // Create the control panel and append it to the page.
    // The markup below is static (no page data), so innerHTML is safe here.
    const panel = document.createElement('div');
    panel.id = 'avito-scraper-panel';
    panel.innerHTML = `
        <h3>Avito Scraping Assistant</h3>
        <div>
            <textarea id="avito-urls" placeholder="Inserisci gli URL da visitare, uno per riga"></textarea>
            <div class="controls-row">
                <button id="load-urls" class="avito-button">Carica URLs</button>
                <button id="load-file" class="avito-button">Carica da File</button>
                <input type="file" id="url-file" style="display: none;" accept=".txt,.csv">
            </div>
        </div>
        <div id="current-url-container" style="display: none;">
            <strong>URL Corrente:</strong> <span id="current-url"></span>
        </div>
        <div class="progress-bar">
            <div class="progress-fill" id="progress-fill"></div>
        </div>
        <div id="avito-status">Stato: Pronto</div>
        <div class="captcha-alert" id="captcha-alert">⚠️ CAPTCHA rilevato! Risolvilo manualmente, poi clicca "Continua"</div>
        <div class="loading-indicator" id="loading-indicator" style="display: none;">Caricamento pagina in corso...</div>
        <div id="extracted-preview" style="display: none;"></div>
        <div>
            <button id="start-scraping" class="avito-button" disabled>Inizia</button>
            <button id="extract-data" class="avito-button" disabled>Estrai Dati</button>
            <button id="next-url" class="avito-button" disabled>Prossimo</button>
            <button id="pause-resume" class="avito-button pause-button" disabled>Pausa</button>
        </div>
        <div>
            <button id="export-csv" class="avito-button" disabled>Esporta CSV</button>
            <button id="clear-data" class="avito-button" disabled>Cancella Dati</button>
            <button id="view-data" class="avito-button" disabled>Visualizza Dati</button>
        </div>
    `;
    document.body.appendChild(panel);

    /**
     * Shorthand for document.getElementById.
     * @param {string} id - Element id.
     * @returns {HTMLElement|null} The element, or null when absent.
     */
    const byId = (id) => document.getElementById(id);

    const HTML_ESCAPES = Object.freeze({
        '&': '&amp;',
        '<': '&lt;',
        '>': '&gt;',
        '"': '&quot;',
        "'": '&#39;',
    });

    /**
     * Escapes a value for safe interpolation into HTML markup.
     * @param {*} value - Any value; it is converted with String().
     * @returns {string} The escaped text.
     */
    function escapeHtml(value) {
        return String(value).replace(/[&<>"']/g, (ch) => HTML_ESCAPES[ch]);
    }

    /**
     * Reads a JSON-encoded array persisted with GM_setValue.
     * A missing, malformed or non-array value yields an empty array instead of
     * throwing, so corrupt storage cannot prevent the panel from starting.
     * @param {string} key - Storage key.
     * @returns {Array} The stored array, or [].
     */
    function readStoredArray(key) {
        const raw = GM_getValue(key, null);
        if (!raw) {
            return [];
        }
        try {
            const parsed = JSON.parse(raw);
            return Array.isArray(parsed) ? parsed : [];
        } catch (err) {
            console.error(`Impossibile leggere il valore salvato "${key}":`, err);
            return [];
        }
    }

    /**
     * Scraping state: URL list, position in the list and extracted records.
     * Also owns the panel widgets that mirror this state (progress bar,
     * status line, button enabled/disabled flags).
     */
    class ScrapingState {
        constructor() {
            this.urls = [];
            this.currentIndex = 0;
            this.extractedData = [];
            // Toggled by the pause button; not read anywhere else in this script.
            this.isPaused = false;
            this.isLoading = false;
            this.loadState();
        }

        /** Restores the persisted state and refreshes the panel accordingly. */
        loadState() {
            this.urls = readStoredArray(STORAGE_KEYS.urls);
            this.extractedData = readStoredArray(STORAGE_KEYS.data);

            if (this.urls.length > 0) {
                byId('avito-urls').value = this.urls.join('\n');
            }

            // Guard against a stored index that is not a valid position.
            const savedIndex = GM_getValue(STORAGE_KEYS.index, 0);
            const lastIndex = Math.max(this.urls.length - 1, 0);
            this.currentIndex = Number.isInteger(savedIndex) && savedIndex >= 0
                ? Math.min(savedIndex, lastIndex)
                : 0;

            this.updateButtonStates();
            this.updateDataButtons();
            this.updateProgress();
        }

        /** Persists URL list, current index and extracted records. */
        saveState() {
            GM_setValue(STORAGE_KEYS.urls, JSON.stringify(this.urls));
            GM_setValue(STORAGE_KEYS.index, this.currentIndex);
            GM_setValue(STORAGE_KEYS.data, JSON.stringify(this.extractedData));
        }

        /**
         * Replaces the URL list, dropping blank entries.
         * When the new list differs from the current one the position restarts
         * from the first URL; re-loading an identical list keeps the position.
         * @param {string[]} urlArray - Candidate URLs.
         */
        setUrls(urlArray) {
            const cleaned = urlArray.filter((url) => url.trim() !== '');
            const isSameList = cleaned.length === this.urls.length
                && cleaned.every((url, i) => url === this.urls[i]);

            this.urls = cleaned;
            if (!isSameList) {
                this.currentIndex = 0;
            }
            this.saveState();
            this.updateButtonStates();
            this.updateProgress();
        }

        /**
         * @returns {string|null} The URL at the current index, or null when the
         *     index is past the end of the list.
         */
        getCurrentUrl() {
            if (this.currentIndex < this.urls.length) {
                return this.urls[this.currentIndex];
            }
            return null;
        }

        /**
         * Advances to the next URL if there is one.
         * @returns {boolean} true when the index moved, false at the last URL.
         */
        moveToNext() {
            if (this.currentIndex < this.urls.length - 1) {
                this.currentIndex++;
                this.saveState();
                this.updateProgress();
                return true;
            }
            return false;
        }

        /**
         * Appends a record, persists it and shows a short preview.
         * @param {Object} data - Extracted record.
         */
        addExtractedData(data) {
            this.extractedData.push(data);
            this.saveState();
            this.updateDataButtons();
            this.showDataPreview(data);
        }

        /**
         * Shows a few key fields of a record in the preview box.
         * Values come from the scraped page, so they are inserted as text
         * nodes rather than through innerHTML.
         * @param {Object} data - Extracted record.
         */
        showDataPreview(data) {
            const previewEl = byId('extracted-preview');
            previewEl.style.display = 'block';
            previewEl.textContent = '';

            const title = document.createElement('strong');
            title.textContent = 'Dati estratti:';
            previewEl.append(title, document.createElement('br'));

            const lines = [
                `Venditore: ${data['seller name']}`,
                `Città: ${data['Seller City']}`,
                `Prezzo: ${data.Price}`,
            ];
            for (const line of lines) {
                previewEl.append(line, document.createElement('br'));
            }

            const note = document.createElement('small');
            note.textContent = '(Dati salvati correttamente)';
            previewEl.append(note);
        }

        /** Refreshes progress bar, status line and the "current URL" box. */
        updateProgress() {
            const percent = this.urls.length > 0
                ? (this.currentIndex / this.urls.length) * 100
                : 0;
            byId('progress-fill').style.width = `${percent}%`;
            byId('avito-status').textContent =
                `Stato: ${this.currentIndex}/${this.urls.length} URL processati`;

            const currentUrl = this.getCurrentUrl();
            const currentUrlContainer = byId('current-url-container');
            if (currentUrl) {
                currentUrlContainer.style.display = 'block';
                byId('current-url').textContent = currentUrl;
            } else {
                currentUrlContainer.style.display = 'none';
            }
        }

        /** Enables/disables the navigation buttons from the current state. */
        updateButtonStates() {
            const hasUrls = this.urls.length > 0;
            const currentUrl = this.getCurrentUrl();
            // Explicit null guard: String.prototype.includes(null) would
            // otherwise search for the literal text "null".
            const isCurrentPage = currentUrl !== null
                && window.location.href.includes(currentUrl);

            byId('start-scraping').disabled = !hasUrls;
            byId('extract-data').disabled = !isCurrentPage || this.isLoading;
            byId('next-url').disabled =
                this.currentIndex >= this.urls.length - 1 || this.isLoading;
            byId('pause-resume').disabled = !hasUrls;
            byId('view-data').disabled = this.extractedData.length === 0;
        }

        /** Enables the export/clear/view buttons only when records exist. */
        updateDataButtons() {
            const isEmpty = this.extractedData.length === 0;
            byId('export-csv').disabled = isEmpty;
            byId('clear-data').disabled = isEmpty;
            byId('view-data').disabled = isEmpty;
        }

        /**
         * Sets the loading flag, toggles the loading indicator and refreshes
         * the buttons that depend on it.
         * @param {boolean} isLoading - Whether a navigation is in progress.
         */
        setLoading(isLoading) {
            this.isLoading = isLoading;
            byId('loading-indicator').style.display = isLoading ? 'block' : 'none';
            this.updateButtonStates();
        }

        /** Discards every extracted record and hides the preview. */
        clearData() {
            this.extractedData = [];
            this.saveState();
            this.updateDataButtons();
            byId('extracted-preview').style.display = 'none';
        }
    }

    // Single shared state instance.
    const state = new ScrapingState();

    // Absolute XPaths of the fields read from a listing page.
    const FIELD_XPATHS = Object.freeze({
        'seller name': "/html/body/div[1]/div/div[3]/div[1]/div/div[2]/div[3]/div/div[2]/div/div/div/div[3]/div[2]/div/div/div/div[1]/div/div[1]/div/div[1]/div/div/div/h3/a",
        'shop link': "/html/body/div[1]/div/div[3]/div[1]/div/div[2]/div[3]/div/div[2]/div/div/div/div[3]/div[2]/div/div/div/div[1]/div/div[1]/div/div[1]/div/div/div/h3/a",
        'image': "/html/body/div[1]/div/div[3]/div[1]/div/div[2]/div[3]/div/div[1]/div[2]/div[1]/div[1]/div/div/div/div/div[3]/img",
        'description': "/html/body/div[1]/div/div[3]/div[1]/div/div[2]/div[3]/div/div[1]/div[2]/div[2]/div/div/div/p[1]",
        'Seller City': "/html/body/div[1]/div/div[3]/div[1]/div/div[2]/div[3]/div/div[1]/div[2]/div[1]/div[2]/div/div/div[1]/div[1]/div/p[1]/span",
        'Price': "/html/body/div[1]/div/div[3]/div[1]/div/div[2]/div[3]/div/div[2]/div/div/div/div[1]/div/div[1]/div/div/div/span/span/span[1]",
    });

    /**
     * Reads the value of one field from the element its XPath resolved to.
     * @param {string} field - Field name (a key of FIELD_XPATHS).
     * @param {Element} element - Element found for that field.
     * @returns {string} The field value, or 'N/A' when empty.
     */
    function readFieldValue(field, element) {
        if (field === 'shop link') {
            const href = element.getAttribute('href');
            // Resolve only a real href: a missing one must stay 'N/A' instead
            // of being glued onto the site origin.
            return href ? new URL(href, AVITO_ORIGIN).href : 'N/A';
        }
        if (field === 'image') {
            return element.getAttribute('src') || 'N/A';
        }
        return element.textContent.trim() || 'N/A';
    }

    /**
     * Extracts one record from the current page, stores it in the state and
     * updates the status line.
     * Fields whose XPath matches nothing (or whose extraction throws) keep the
     * 'N/A' placeholder.
     * @returns {Object} The extracted record.
     */
    function extractDataFromPage() {
        const data = {
            link: window.location.href,
            'seller name': 'N/A',
            'shop link': 'N/A',
            marketplace: 'Avito',
            product: 'Jewelry',
            image: 'N/A',
            description: 'N/A',
            'Seller City': 'N/A',
            Price: 'N/A',
            'Data scaricamento': new Date().toISOString().split('T')[0],
            // NOTE(review): fixed literal, unlike 'Data scaricamento' above which
            // is computed at extraction time — confirm whether it should follow
            // the current date.
            month: '2025-03',
            Brand: 'Roberto Coin',
            Country: 'Russia',
            'Type of platform': 'Marketplace'
        };

        for (const [field, xpath] of Object.entries(FIELD_XPATHS)) {
            try {
                const element = document.evaluate(
                    xpath,
                    document,
                    null,
                    XPathResult.FIRST_ORDERED_NODE_TYPE,
                    null
                ).singleNodeValue;
                if (element) {
                    data[field] = readFieldValue(field, element);
                }
            } catch (e) {
                console.error(`Errore nell'estrazione di ${field}:`, e);
            }
        }

        state.addExtractedData(data);

        byId('avito-status').textContent =
            `Stato: Dati estratti da ${state.currentIndex + 1}/${state.urls.length} URL`;

        // Allow moving on only if there are URLs left to process.
        byId('next-url').disabled = state.currentIndex >= state.urls.length - 1;

        return data;
    }

    // Simplified heuristics: adapt the selectors to Avito's real CAPTCHA.
    const CAPTCHA_SELECTORS = [
        'iframe[src*="captcha"]',
        'div[class*="captcha"]',
        'div[id*="captcha"]',
        'input[name*="captcha"]',
        'img[src*="captcha"]',
        'form[action*="captcha"]',
        'div.firewall-container' // Avito firewall page
    ];
    const CAPTCHA_TEXTS = ['captcha', 'проверка', 'безопасность', 'подтвердите', 'robot', 'человек'];

    // Direct children of <body> whose innerText would return raw source text.
    const NON_RENDERED_TAGS = new Set(['SCRIPT', 'STYLE', 'NOSCRIPT', 'TEMPLATE']);

    /**
     * Collects the rendered text of the page, leaving out this script's panel.
     * The panel is a direct child of <body>, so walking the body's children
     * and skipping it is enough to exclude all of its text.
     * @returns {string} Page text without the panel.
     */
    function getPageTextOutsidePanel() {
        const parts = [];
        for (const node of document.body.childNodes) {
            if (node.nodeType === Node.TEXT_NODE) {
                parts.push(node.textContent);
                continue;
            }
            if (node.nodeType !== Node.ELEMENT_NODE
                || node === panel
                || NON_RENDERED_TAGS.has(node.tagName)) {
                continue;
            }
            // innerText of a non-rendered element falls back to textContent,
            // which would bring hidden text into the scan.
            if (window.getComputedStyle(node).display === 'none') {
                continue;
            }
            parts.push(node.innerText || '');
        }
        return parts.join('\n');
    }

    /**
     * Heuristically detects a CAPTCHA / block page.
     * Elements and text belonging to this script's own panel are ignored:
     * the panel contains a div whose class and id include "captcha" and whose
     * text includes "CAPTCHA", which would otherwise always match.
     * @returns {boolean} true when a selector or keyword matches the page.
     */
    function detectCaptcha() {
        const hasCaptchaElement = CAPTCHA_SELECTORS.some((selector) =>
            [...document.querySelectorAll(selector)].some((el) => !panel.contains(el))
        );
        if (hasCaptchaElement) {
            return true;
        }

        const pageText = getPageTextOutsidePanel().toLowerCase();
        return CAPTCHA_TEXTS.some((text) => pageText.includes(text));
    }

    /**
     * Parses newline-separated URLs and loads them into the state.
     * @param {string} text - Raw text, one URL per line.
     */
    function loadUrlsFromText(text) {
        const urlArray = text
            .split('\n')
            .map((url) => url.trim())
            .filter((url) => url !== '');
        state.setUrls(urlArray);
        byId('start-scraping').disabled = urlArray.length === 0;
    }

    /**
     * Navigates to the state's current URL, if any.
     */
    function goToCurrentUrl() {
        const targetUrl = state.getCurrentUrl();
        if (targetUrl) {
            state.setLoading(true);
            window.location.href = targetUrl;
        }
    }

    // Button event handlers.
    byId('load-urls').addEventListener('click', () => {
        loadUrlsFromText(byId('avito-urls').value);
    });

    byId('load-file').addEventListener('click', () => {
        byId('url-file').click();
    });

    byId('url-file').addEventListener('change', (e) => {
        const file = e.target.files[0];
        if (!file) {
            return;
        }
        const reader = new FileReader();
        reader.onload = (event) => {
            const content = event.target.result;
            byId('avito-urls').value = content;
            loadUrlsFromText(content);
        };
        reader.onerror = () => {
            console.error('Errore nella lettura del file:', reader.error);
        };
        reader.readAsText(file);
    });

    byId('start-scraping').addEventListener('click', goToCurrentUrl);

    byId('extract-data').addEventListener('click', () => {
        extractDataFromPage();
        byId('extracted-preview').style.display = 'block';
    });

    byId('next-url').addEventListener('click', () => {
        if (state.moveToNext()) {
            goToCurrentUrl();
        } else {
            alert('Hai completato tutti gli URL!');
        }
    });

    byId('pause-resume').addEventListener('click', () => {
        state.isPaused = !state.isPaused;
        byId('pause-resume').textContent = state.isPaused ? 'Riprendi' : 'Pausa';
    });

    /**
     * Downloads the extracted records as a UTF-8 CSV file.
     * A BOM is prepended so that Excel and similar programs recognise the
     * encoding.
     */
    function exportCSV() {
        if (state.extractedData.length === 0) {
            alert('Nessun dato da esportare.');
            return;
        }

        const csvConfig = {
            quotes: true, // always quote, to protect values with special characters
            quoteChar: '"',
            escapeChar: '"',
            delimiter: ",",
            header: true,
            newline: "\r\n",
            skipEmptyLines: false
        };
        const csv = Papa.unparse(state.extractedData, csvConfig);

        const BOM = "\uFEFF";
        const blob = new Blob([BOM + csv], { type: 'text/csv;charset=utf-8;' });
        const url = URL.createObjectURL(blob);

        // Trigger the download through a temporary link.
        const link = document.createElement('a');
        link.href = url;
        link.setAttribute('download', `avito_data_${new Date().toISOString().split('T')[0]}.csv`);
        document.body.appendChild(link);
        link.click();
        document.body.removeChild(link);

        // Release the blob once the download has had a chance to start.
        setTimeout(() => URL.revokeObjectURL(url), 1000);
    }

    byId('export-csv').addEventListener('click', exportCSV);

    byId('clear-data').addEventListener('click', () => {
        if (confirm('Sei sicuro di voler cancellare tutti i dati estratti?')) {
            state.clearData();
            byId('avito-status').textContent = 'Stato: Dati cancellati';
        }
    });

    byId('view-data').addEventListener('click', () => {
        if (state.extractedData.length === 0) {
            alert('Nessun dato da visualizzare.');
            return;
        }

        // Open a new window to display the data; this returns null when the
        // browser blocks the popup.
        const dataWindow = window.open('', 'Dati Estratti', 'width=800,height=600');
        if (!dataWindow) {
            alert('Impossibile aprire la finestra dei dati: consenti i popup per questo sito.');
            return;
        }

        // Column order comes from the first record; every cell is escaped
        // because the values originate from the scraped page.
        const columns = Object.keys(state.extractedData[0]);
        const headerCells = columns.map((key) => `<th>${escapeHtml(key)}</th>`).join('');
        const bodyRows = state.extractedData.map((record) => `
                        <tr>
                            ${columns.map((key) => `<td>${escapeHtml(record[key] ?? '')}</td>`).join('')}
                        </tr>
                    `).join('');

        dataWindow.document.write(`
            <html>
            <head>
                <title>Dati Estratti da Avito</title>
                <meta charset="UTF-8">
                <style>
                    body { font-family: Arial, sans-serif; margin: 20px; }
                    table { border-collapse: collapse; width: 100%; }
                    th, td { border: 1px solid #ddd; padding: 8px; text-align: left; }
                    th { background-color: #f2f2f2; }
                    tr:nth-child(even) { background-color: #f9f9f9; }
                    .container { max-width: 100%; overflow-x: auto; }
                </style>
            </head>
            <body>
                <h2>Dati Estratti da Avito (${state.extractedData.length} record)</h2>
                <div class="container">
                    <table>
                        <thead>
                            <tr>
                                ${headerCells}
                            </tr>
                        </thead>
                        <tbody>
                            ${bodyRows}
                        </tbody>
                    </table>
                </div>
            </body>
            </html>
        `);
        // Finish the written document so the new window stops "loading".
        dataWindow.document.close();
    });

    /**
     * Periodic check: shows the CAPTCHA alert when one is detected, otherwise
     * clears the loading flag once the document has finished loading.
     * setLoading(false) re-evaluates the buttons, which enables "Estrai Dati"
     * when the page address contains the current URL.
     */
    function checkPageStatus() {
        const captchaAlert = byId('captcha-alert');

        if (detectCaptcha()) {
            captchaAlert.style.display = 'block';
            state.setLoading(false);
            return;
        }

        captchaAlert.style.display = 'none';
        if (document.readyState === 'complete') {
            state.setLoading(false);
        }
    }

    // Check the page status every second.
    setInterval(checkPageStatus, 1000);

    // When the page has fully loaded, wait a little so late elements are
    // present, then refresh the buttons.
    window.addEventListener('load', () => {
        setTimeout(() => {
            state.setLoading(false);
        }, 1500);
    });

    console.log('Avito Scraping Assistant avviato con supporto UTF-8');
})();