Avito Scraping Assistant

Assistente per navigare e scaricare dati da Avito con gestione manuale dei CAPTCHA e supporto corretto per caratteri cirillici

  1. // ==UserScript==
  2. // @name Avito Scraping Assistant
  3. // @namespace https://danielfragomeli.com/
  4. // @version 1.2
  5. // @description Assistente per navigare e scaricare dati da Avito con gestione manuale dei CAPTCHA e supporto corretto per caratteri cirillici
  6. // @author dan098
  7. // @match *://*.avito.ru/*
  8. // @grant GM_setValue
  9. // @grant GM_getValue
  10. // @grant GM_deleteValue
  11. // @grant GM_listValues
  12. // @grant GM_setClipboard
  13. // @grant GM_xmlhttpRequest
  14. // @grant unsafeWindow
  15. // @license MIT
  16. // @require https://cdnjs.cloudflare.com/ajax/libs/jquery/3.6.0/jquery.min.js
  17. // @require https://cdnjs.cloudflare.com/ajax/libs/PapaParse/5.3.2/papaparse.min.js
  18. // ==/UserScript==
  19.  
  20. (function() {
  21. 'use strict';
  22.  
  // Stylesheet for the assistant panel.
  // Every class selector is scoped under #avito-scraper-panel: names such as
  // .progress-bar, .controls-row or .loading-indicator are generic enough to
  // collide with classes used by the host page, and an unscoped rule would
  // either restyle the page's own elements or be overridden by the page.
  const css = `
    #avito-scraper-panel {
      position: fixed;
      bottom: 10px;
      right: 10px;
      background-color: #fff;
      border: 2px solid #0078d7;
      border-radius: 5px;
      padding: 10px;
      width: 400px;
      z-index: 10000;
      font-family: Arial, sans-serif;
      box-shadow: 0 0 10px rgba(0,0,0,0.2);
    }
    #avito-scraper-panel h3 {
      margin-top: 0;
      margin-bottom: 10px;
      color: #0078d7;
      border-bottom: 1px solid #eee;
      padding-bottom: 5px;
    }
    #avito-scraper-panel #avito-urls {
      width: 100%;
      height: 100px;
      margin-bottom: 10px;
      resize: vertical;
    }
    #avito-scraper-panel #avito-status {
      margin: 10px 0;
      padding: 5px;
      border-radius: 3px;
      background-color: #f0f0f0;
    }
    #avito-scraper-panel .avito-button {
      background-color: #0078d7;
      color: white;
      border: none;
      padding: 8px 12px;
      margin: 5px 5px 5px 0;
      border-radius: 3px;
      cursor: pointer;
    }
    #avito-scraper-panel .avito-button:hover {
      background-color: #005a9e;
    }
    #avito-scraper-panel .avito-button:disabled {
      background-color: #cccccc;
      cursor: not-allowed;
    }
    #avito-scraper-panel .progress-bar {
      height: 15px;
      background-color: #e0e0e0;
      border-radius: 5px;
      margin: 10px 0;
    }
    #avito-scraper-panel .progress-fill {
      height: 100%;
      background-color: #0078d7;
      border-radius: 5px;
      width: 0%;
      transition: width 0.3s;
    }
    #avito-scraper-panel .captcha-alert {
      color: #d61e00;
      font-weight: bold;
      margin-top: 5px;
      display: none;
    }
    #avito-scraper-panel .controls-row {
      display: flex;
      justify-content: space-between;
      align-items: center;
    }
    #avito-scraper-panel .pause-button {
      background-color: #ff9800;
    }
    #avito-scraper-panel .pause-button:hover {
      background-color: #e68a00;
    }
    #avito-scraper-panel .loading-indicator {
      color: blue;
      font-style: italic;
    }
    #avito-scraper-panel #current-url-container {
      margin: 10px 0;
      border: 1px solid #ddd;
      padding: 5px;
      border-radius: 3px;
      word-break: break-all;
      font-size: 12px;
    }
    #avito-scraper-panel #extracted-preview {
      margin: 10px 0;
      max-height: 100px;
      overflow-y: auto;
      border: 1px solid #ddd;
      padding: 5px;
      font-size: 12px;
      background-color: #f9f9f9;
    }
  `;

  // Inject the stylesheet into the page.
  const styleEl = document.createElement('style');
  styleEl.textContent = css;
  document.head.appendChild(styleEl);
  130.  
  // Static markup of the control panel: URL input area, current-URL read-out,
  // progress bar, status line, CAPTCHA/loading notices, the extraction preview
  // and two rows of action buttons (all disabled until state enables them).
  const panelMarkup = `
<h3>Avito Scraping Assistant</h3>
<div>
<textarea id="avito-urls" placeholder="Inserisci gli URL da visitare, uno per riga"></textarea>
<div class="controls-row">
<button id="load-urls" class="avito-button">Carica URLs</button>
<button id="load-file" class="avito-button">Carica da File</button>
<input type="file" id="url-file" style="display: none;" accept=".txt,.csv">
</div>
</div>
<div id="current-url-container" style="display: none;">
<strong>URL Corrente:</strong> <span id="current-url"></span>
</div>
<div class="progress-bar">
<div class="progress-fill" id="progress-fill"></div>
</div>
<div id="avito-status">Stato: Pronto</div>
<div class="captcha-alert" id="captcha-alert">⚠️ CAPTCHA rilevato! Risolvilo manualmente, poi clicca "Continua"</div>
<div class="loading-indicator" id="loading-indicator" style="display: none;">Caricamento pagina in corso...</div>
<div id="extracted-preview" style="display: none;"></div>
<div>
<button id="start-scraping" class="avito-button" disabled>Inizia</button>
<button id="extract-data" class="avito-button" disabled>Estrai Dati</button>
<button id="next-url" class="avito-button" disabled>Prossimo</button>
<button id="pause-resume" class="avito-button pause-button" disabled>Pausa</button>
</div>
<div>
<button id="export-csv" class="avito-button" disabled>Esporta CSV</button>
<button id="clear-data" class="avito-button" disabled>Cancella Dati</button>
<button id="view-data" class="avito-button" disabled>Visualizza Dati</button>
</div>
`;

  // Create the container, fill it with the markup and attach it to the page.
  const panel = document.createElement('div');
  panel.setAttribute('id', 'avito-scraper-panel');
  panel.innerHTML = panelMarkup;

  document.body.appendChild(panel);
  168.  
  /**
   * Scraping session state: the URL queue, the index of the URL being worked
   * on, the records extracted so far and the paused/loading flags.
   *
   * The queue, index and records are persisted with GM_setValue under the keys
   * 'avitoUrls', 'avitoCurrentIndex' and 'avitoExtractedData', and restored by
   * the constructor. Most methods also refresh the panel controls, so the
   * panel elements must already be in the document when this class is used.
   */
  class ScrapingState {
    constructor() {
      this.urls = [];
      this.currentIndex = 0;
      this.extractedData = [];
      this.isPaused = false;
      this.isLoading = false;
      this.loadState();
    }

    /**
     * Parses a JSON array stored as a string.
     * A missing, malformed or non-array value yields an empty array instead of
     * throwing, so one corrupted storage entry cannot abort the whole script.
     * @param {?string} raw - Stored JSON text, or a falsy value when absent.
     * @param {string} label - Name used in the error log.
     * @returns {Array} The parsed array, or [] when it cannot be recovered.
     */
    static parseStoredArray(raw, label) {
      if (!raw) {
        return [];
      }
      try {
        const parsed = JSON.parse(raw);
        return Array.isArray(parsed) ? parsed : [];
      } catch (err) {
        console.error(`Avito Scraping Assistant: ignoring unreadable stored ${label}`, err);
        return [];
      }
    }

    /**
     * Escapes the characters that are significant in HTML text and attributes.
     * @param {*} value - Any value; it is converted with String().
     * @returns {string} Text that is safe to place inside innerHTML.
     */
    static escapeHtml(value) {
      const entities = { '&': '&amp;', '<': '&lt;', '>': '&gt;', '"': '&quot;', "'": '&#39;' };
      return String(value).replace(/[&<>"']/g, (ch) => entities[ch]);
    }

    /** Restores the persisted session and synchronises the panel with it. */
    loadState() {
      const savedUrls = GM_getValue('avitoUrls', null);
      const savedIndex = GM_getValue('avitoCurrentIndex', 0);
      const savedData = GM_getValue('avitoExtractedData', null);

      this.urls = ScrapingState.parseStoredArray(savedUrls, 'URL list');
      if (this.urls.length > 0) {
        document.getElementById('avito-urls').value = this.urls.join('\n');
      }

      // Only accept an index that points inside the restored queue.
      const indexIsUsable = Number.isInteger(savedIndex)
        && savedIndex >= 0
        && savedIndex < this.urls.length;
      this.currentIndex = indexIsUsable ? savedIndex : 0;

      this.extractedData = ScrapingState.parseStoredArray(savedData, 'extracted data');

      this.updateButtonStates();
      this.updateProgress();
      this.updateDataButtons();
    }

    /** Persists the queue, the current index and the extracted records. */
    saveState() {
      GM_setValue('avitoUrls', JSON.stringify(this.urls));
      GM_setValue('avitoCurrentIndex', this.currentIndex);
      GM_setValue('avitoExtractedData', JSON.stringify(this.extractedData));
    }

    /**
     * Replaces the URL queue, dropping blank entries.
     * @param {string[]} urlArray - Candidate URLs.
     */
    setUrls(urlArray) {
      this.urls = urlArray.filter(url => url.trim() !== '');
      // A position kept from a longer, earlier list would point past the end
      // of the new one and leave "start" with nothing to open.
      if (this.currentIndex >= this.urls.length) {
        this.currentIndex = 0;
      }
      this.saveState();
      this.updateButtonStates();
      this.updateProgress();
    }

    /** @returns {?string} The URL at the current index, or null when none. */
    getCurrentUrl() {
      if (this.currentIndex < this.urls.length) {
        return this.urls[this.currentIndex];
      }
      return null;
    }

    /**
     * Advances to the next URL when there is one.
     * @returns {boolean} true if the index moved, false at the end of the queue.
     */
    moveToNext() {
      if (this.currentIndex < this.urls.length - 1) {
        this.currentIndex++;
        this.saveState();
        this.updateProgress();
        return true;
      }
      return false;
    }

    /**
     * Appends a record, persists it and shows it in the preview box.
     * @param {Object} data - Extracted record.
     */
    addExtractedData(data) {
      this.extractedData.push(data);
      this.saveState();
      this.updateDataButtons();
      this.showDataPreview(data);
    }

    /**
     * Shows seller, city and price of a record in the preview box.
     * The values come from the scraped page, so they are escaped before being
     * placed into innerHTML.
     * @param {Object} data - Extracted record.
     */
    showDataPreview(data) {
      const previewEl = document.getElementById('extracted-preview');
      previewEl.style.display = 'block';
      const esc = ScrapingState.escapeHtml;
      previewEl.innerHTML = `
<strong>Dati estratti:</strong><br>
Venditore: ${esc(data['seller name'])}<br>
Città: ${esc(data['Seller City'])}<br>
Prezzo: ${esc(data.Price)}<br>
<small>(Dati salvati correttamente)</small>
`;
    }

    /** Refreshes the progress bar, the status line and the current-URL box. */
    updateProgress() {
      const progressElement = document.getElementById('progress-fill');
      const percent = this.urls.length > 0 ? (this.currentIndex / this.urls.length) * 100 : 0;
      progressElement.style.width = `${percent}%`;

      const statusElement = document.getElementById('avito-status');
      statusElement.textContent = `Stato: ${this.currentIndex}/${this.urls.length} URL processati`;

      const currentUrlContainer = document.getElementById('current-url-container');
      const currentUrlSpan = document.getElementById('current-url');
      const currentUrl = this.getCurrentUrl();
      if (currentUrl) {
        currentUrlContainer.style.display = 'block';
        currentUrlSpan.textContent = currentUrl;
      } else {
        currentUrlContainer.style.display = 'none';
      }
    }

    /** Enables or disables the navigation buttons and the view-data button. */
    updateButtonStates() {
      const startButton = document.getElementById('start-scraping');
      const extractButton = document.getElementById('extract-data');
      const nextButton = document.getElementById('next-url');
      const pauseButton = document.getElementById('pause-resume');
      const viewDataButton = document.getElementById('view-data');

      startButton.disabled = this.urls.length === 0;
      // Without the null guard, includes(null) would search for the text
      // "null" and could report a match on an unrelated page.
      const currentUrl = this.getCurrentUrl();
      const isCurrentPage = currentUrl !== null && window.location.href.includes(currentUrl);
      extractButton.disabled = !isCurrentPage || this.isLoading;
      nextButton.disabled = this.currentIndex >= this.urls.length - 1 || this.isLoading;
      pauseButton.disabled = this.urls.length === 0;
      viewDataButton.disabled = this.extractedData.length === 0;
    }

    /** Enables export/clear/view exactly when at least one record is stored. */
    updateDataButtons() {
      const hasData = this.extractedData.length > 0;
      for (const id of ['export-csv', 'clear-data', 'view-data']) {
        document.getElementById(id).disabled = !hasData;
      }
    }

    /**
     * Sets the loading flag, toggles the loading notice and refreshes buttons.
     * @param {boolean} isLoading - Whether a page load is in progress.
     */
    setLoading(isLoading) {
      this.isLoading = isLoading;
      const loadingIndicator = document.getElementById('loading-indicator');
      loadingIndicator.style.display = isLoading ? 'block' : 'none';
      this.updateButtonStates();
    }

    /** Discards every extracted record and hides the preview box. */
    clearData() {
      this.extractedData = [];
      this.saveState();
      this.updateDataButtons();
      document.getElementById('extracted-preview').style.display = 'none';
    }
  }
  304.  
  // The ScrapingState instance for this page. Its constructor calls
  // loadState(), which reads the saved values through GM_getValue.
  const state = new ScrapingState();
  307.  
  /**
   * Extracts one record from the page currently displayed, stores it through
   * state.addExtractedData() and updates the status line and "next" button.
   *
   * Fields are located with fixed absolute XPaths; a field whose node is not
   * found keeps the placeholder 'N/A'.
   *
   * @returns {Object} The extracted record (keys are the CSV column names).
   */
  function extractDataFromPage() {
    const avitoOrigin = 'https://www.avito.ru';
    // Date part of the UTC ISO timestamp: YYYY-MM-DD.
    const downloadDate = new Date().toISOString().split('T')[0];

    const data = {
      link: window.location.href,
      'seller name': 'N/A',
      'shop link': 'N/A',
      marketplace: 'Avito',
      product: 'Jewelry',
      image: 'N/A',
      description: 'N/A',
      'Seller City': 'N/A',
      Price: 'N/A',
      'Data scaricamento': downloadDate,
      // Derived from the download date so the column does not go stale.
      month: downloadDate.slice(0, 7),
      Brand: 'Roberto Coin',
      Country: 'Russia',
      'Type of platform': 'Marketplace'
    };

    // Absolute XPaths of the fields on an item page.
    const xpaths = {
      'seller name': "/html/body/div[1]/div/div[3]/div[1]/div/div[2]/div[3]/div/div[2]/div/div/div/div[3]/div[2]/div/div/div/div[1]/div/div[1]/div/div[1]/div/div/div/h3/a",
      'shop link': "/html/body/div[1]/div/div[3]/div[1]/div/div[2]/div[3]/div/div[2]/div/div/div/div[3]/div[2]/div/div/div/div[1]/div/div[1]/div/div[1]/div/div/div/h3/a",
      'image': "/html/body/div[1]/div/div[3]/div[1]/div/div[2]/div[3]/div/div[1]/div[2]/div[1]/div[1]/div/div/div/div/div[3]/img",
      'description': "/html/body/div[1]/div/div[3]/div[1]/div/div[2]/div[3]/div/div[1]/div[2]/div[2]/div/div/div/p[1]",
      'Seller City': "/html/body/div[1]/div/div[3]/div[1]/div/div[2]/div[3]/div/div[1]/div[2]/div[1]/div[2]/div/div/div[1]/div[1]/div/p[1]/span",
      'Price': "/html/body/div[1]/div/div[3]/div[1]/div/div[2]/div[3]/div/div[2]/div/div/div/div[1]/div/div[1]/div/div/div/span/span/span[1]"
    };

    for (const [field, xpath] of Object.entries(xpaths)) {
      try {
        const element = document.evaluate(
          xpath,
          document,
          null,
          XPathResult.FIRST_ORDERED_NODE_TYPE,
          null
        ).singleNodeValue;

        if (!element) {
          continue;
        }

        if (field === 'shop link') {
          const href = (element.getAttribute('href') || '').trim();
          // Leave the placeholder alone when there is no href; prefixing the
          // domain to 'N/A' would fabricate a link.
          if (href !== '') {
            // Absolute links are kept verbatim; anything else is resolved
            // against the site origin, which also copes with hrefs that lack
            // a leading slash or are protocol-relative.
            data[field] = /^https?:\/\//i.test(href)
              ? href
              : new URL(href, avitoOrigin).href;
          }
        } else if (field === 'image') {
          data[field] = element.getAttribute('src') || 'N/A';
        } else {
          data[field] = element.textContent.trim() || 'N/A';
        }
      } catch (e) {
        // One failing field must not prevent the others from being read.
        console.error(`Errore nell'estrazione di ${field}:`, e);
      }
    }

    state.addExtractedData(data);

    // Report success and allow moving on only while URLs remain in the queue.
    const statusElement = document.getElementById('avito-status');
    statusElement.textContent = `Stato: Dati estratti da ${state.currentIndex + 1}/${state.urls.length} URL`;
    document.getElementById('next-url').disabled = state.currentIndex >= state.urls.length - 1;
    return data;
  }
  374.  
  /**
   * Heuristically detects a CAPTCHA or firewall page.
   *
   * Two checks are made: known CAPTCHA-like selectors, then a keyword search
   * in the visible page text. The assistant's own panel is excluded from both,
   * because its alert element has "captcha" in its id, class and text and
   * would otherwise make this function report a CAPTCHA on every page.
   *
   * The selectors and keywords are a simplification and may need adapting to
   * the CAPTCHA actually served by Avito.
   *
   * @returns {boolean} true when the page looks like a CAPTCHA/block page.
   */
  function detectCaptcha() {
    const captchaSelectors = [
      'iframe[src*="captcha"]',
      'div[class*="captcha"]',
      'div[id*="captcha"]',
      'input[name*="captcha"]',
      'img[src*="captcha"]',
      'form[action*="captcha"]',
      'div.firewall-container' // Avito firewall page
    ];
    const captchaTexts = ['captcha', 'проверка', 'безопасность', 'подтвердите', 'robot', 'человек'];

    const ownPanel = document.getElementById('avito-scraper-panel');
    const belongsToPanel = (node) => ownPanel !== null && ownPanel.contains(node);

    // Selector check: only elements outside the assistant panel count.
    for (const selector of captchaSelectors) {
      for (const match of document.querySelectorAll(selector)) {
        if (!belongsToPanel(match)) {
          return true;
        }
      }
    }

    // Text check: remove the panel's own visible text before searching, so the
    // alert message cannot keep itself displayed.
    let pageText = document.body.innerText.toLowerCase();
    if (ownPanel !== null) {
      const panelText = (ownPanel.innerText || '').toLowerCase();
      if (panelText !== '') {
        pageText = pageText.split(panelText).join('');
      }
    }
    return captchaTexts.some((needle) => pageText.includes(needle));
  }
  402.  
  // ---- Button handlers: loading the URL list and navigating through it ----

  // Splits raw text into trimmed, non-empty lines, hands them to the state and
  // enables "start" only when at least one URL was found.
  const loadUrlsFromText = (rawText) => {
    const cleaned = [];
    for (const line of rawText.split('\n')) {
      const candidate = line.trim();
      if (candidate !== '') {
        cleaned.push(candidate);
      }
    }
    state.setUrls(cleaned);
    document.getElementById('start-scraping').disabled = cleaned.length === 0;
  };

  // Flags the state as loading and sends the browser to the given URL.
  const navigateTo = (targetUrl) => {
    state.setLoading(true);
    window.location.href = targetUrl;
  };

  // "Carica URLs": take the list from the textarea.
  document.getElementById('load-urls').addEventListener('click', () => {
    loadUrlsFromText(document.getElementById('avito-urls').value);
  });

  // "Carica da File": forward the click to the hidden file input.
  document.getElementById('load-file').addEventListener('click', () => {
    document.getElementById('url-file').click();
  });

  // File chosen: mirror its text into the textarea, then load it as the list.
  document.getElementById('url-file').addEventListener('change', (e) => {
    const [chosenFile] = e.target.files;
    if (!chosenFile) {
      return;
    }
    const fileReader = new FileReader();
    fileReader.onload = (event) => {
      const fileText = event.target.result;
      document.getElementById('avito-urls').value = fileText;
      loadUrlsFromText(fileText);
    };
    fileReader.readAsText(chosenFile);
  });

  // "Inizia": open the URL at the current index, if there is one.
  document.getElementById('start-scraping').addEventListener('click', () => {
    const firstUrl = state.getCurrentUrl();
    if (firstUrl) {
      navigateTo(firstUrl);
    }
  });

  // "Estrai Dati": extract from the current page and make sure the preview
  // box is visible.
  document.getElementById('extract-data').addEventListener('click', () => {
    extractDataFromPage();
    document.getElementById('extracted-preview').style.display = 'block';
  });

  // "Prossimo": advance the queue and open the next URL, or report completion.
  document.getElementById('next-url').addEventListener('click', () => {
    if (!state.moveToNext()) {
      alert('Hai completato tutti gli URL!');
      return;
    }
    const upcomingUrl = state.getCurrentUrl();
    if (upcomingUrl) {
      navigateTo(upcomingUrl);
    }
  });

  // "Pausa"/"Riprendi": flip the paused flag and relabel the button.
  document.getElementById('pause-resume').addEventListener('click', () => {
    state.isPaused = !state.isPaused;
    const label = state.isPaused ? 'Riprendi' : 'Pausa';
    document.getElementById('pause-resume').textContent = label;
  });
  461.  
  /**
   * Downloads the extracted records as a UTF-8 CSV file named
   * avito_data_YYYY-MM-DD.csv. Shows an alert and does nothing when there are
   * no records.
   *
   * A byte-order mark is prepended so that spreadsheet programs recognise the
   * file as UTF-8 and display Cyrillic text correctly.
   */
  function exportCSV() {
    if (state.extractedData.length === 0) {
      alert('Nessun dato da esportare.');
      return;
    }

    const csvConfig = {
      quotes: true, // always quote, to protect values containing special characters
      quoteChar: '"',
      escapeChar: '"',
      delimiter: ",",
      header: true,
      newline: "\r\n",
      skipEmptyLines: false
    };
    const csv = Papa.unparse(state.extractedData, csvConfig);

    const BOM = "\uFEFF";
    const blob = new Blob([BOM + csv], { type: 'text/csv;charset=utf-8;' });
    const url = URL.createObjectURL(blob);

    // Trigger the download through a temporary link.
    const link = document.createElement('a');
    link.href = url;
    link.setAttribute('download', `avito_data_${new Date().toISOString().split('T')[0]}.csv`);
    document.body.appendChild(link);
    try {
      link.click();
    } finally {
      document.body.removeChild(link);
      // Release the object URL, otherwise every export keeps its blob alive
      // until the page is unloaded. Deferred so the download that was just
      // started can still resolve the URL.
      setTimeout(() => URL.revokeObjectURL(url), 0);
    }
  }
  493.  
  // "Esporta CSV": download the extracted records.
  document.getElementById('export-csv').addEventListener('click', exportCSV);

  // "Cancella Dati": discard every extracted record after confirmation.
  document.getElementById('clear-data').addEventListener('click', () => {
    if (confirm('Sei sicuro di voler cancellare tutti i dati estratti?')) {
      state.clearData();
      const statusElement = document.getElementById('avito-status');
      statusElement.textContent = 'Stato: Dati cancellati';
    }
  });

  /**
   * Escapes the characters that are significant in HTML.
   * @param {*} value - Any value; null/undefined become an empty string.
   * @returns {string} Text that is safe to embed in markup.
   */
  function escapeHtml(value) {
    const entities = { '&': '&amp;', '<': '&lt;', '>': '&gt;', '"': '&quot;', "'": '&#39;' };
    const text = value === null || value === undefined ? '' : String(value);
    return text.replace(/[&<>"']/g, (ch) => entities[ch]);
  }

  /**
   * Builds the HTML document that lists the records in a table.
   * Column headers come from the first record, and each row is read through
   * those same keys so cells stay aligned with their header. Keys and values
   * are escaped because they originate from the scraped page.
   * @param {Object[]} records - Non-empty list of extracted records.
   * @returns {string} A complete HTML document.
   */
  function buildDataTableHtml(records) {
    const headers = Object.keys(records[0]);
    const headCells = headers.map((key) => `<th>${escapeHtml(key)}</th>`).join('');
    const bodyRows = records
      .map((record) => {
        const cells = headers.map((key) => `<td>${escapeHtml(record[key])}</td>`).join('');
        return `<tr>${cells}</tr>`;
      })
      .join('\n');

    return `
<html>
<head>
<title>Dati Estratti da Avito</title>
<meta charset="UTF-8">
<style>
body { font-family: Arial, sans-serif; margin: 20px; }
table { border-collapse: collapse; width: 100%; }
th, td { border: 1px solid #ddd; padding: 8px; text-align: left; }
th { background-color: #f2f2f2; }
tr:nth-child(even) { background-color: #f9f9f9; }
.container { max-width: 100%; overflow-x: auto; }
</style>
</head>
<body>
<h2>Dati Estratti da Avito (${records.length} record)</h2>
<div class="container">
<table>
<thead>
<tr>${headCells}</tr>
</thead>
<tbody>
${bodyRows}
</tbody>
</table>
</div>
</body>
</html>
`;
  }

  // "Visualizza Dati": show the extracted records in a separate window.
  document.getElementById('view-data').addEventListener('click', () => {
    if (state.extractedData.length === 0) {
      alert('Nessun dato da visualizzare.');
      return;
    }

    const dataWindow = window.open('', 'Dati Estratti', 'width=800,height=600');
    // window.open returns null when the browser blocks the popup.
    if (!dataWindow) {
      alert('Impossibile aprire la finestra dei dati: consenti i popup per questo sito.');
      return;
    }

    dataWindow.document.write(buildDataTableHtml(state.extractedData));
    // Close the stream: the window stops "loading", and a later click on the
    // same named window replaces the table instead of appending a second one.
    dataWindow.document.close();
  });
  547.  
  /**
   * Periodic page check: shows or hides the CAPTCHA alert and, once the
   * document has finished loading without a CAPTCHA, clears the loading flag
   * and enables extraction when the page address contains the current URL.
   */
  function checkPageStatus() {
    const alertBox = document.getElementById('captcha-alert');
    const captchaPresent = detectCaptcha();
    alertBox.style.display = captchaPresent ? 'block' : 'none';

    if (captchaPresent) {
      // The user has to solve it manually; the page is no longer "loading".
      state.setLoading(false);
      return;
    }

    if (document.readyState !== 'complete') {
      return;
    }

    state.setLoading(false);
    const targetUrl = state.getCurrentUrl();
    const onTargetPage = Boolean(targetUrl) && window.location.href.includes(targetUrl);
    if (onTargetPage) {
      document.getElementById('extract-data').disabled = false;
    }
  }
  567.  
  // Intervals used below, in milliseconds.
  const PAGE_STATUS_POLL_MS = 1000;
  const POST_LOAD_SETTLE_MS = 1500;

  // Re-evaluate the page status once per second.
  setInterval(checkPageStatus, PAGE_STATUS_POLL_MS);

  // Clears the loading flag, refreshes the buttons and enables extraction
  // when the page address contains the URL currently being processed.
  function refreshControlsAfterLoad() {
    state.setLoading(false);
    state.updateButtonStates();
    const targetUrl = state.getCurrentUrl();
    if (targetUrl && window.location.href.includes(targetUrl)) {
      document.getElementById('extract-data').disabled = false;
    }
  }

  // After the load event, wait a little so late elements are in place before
  // the controls are refreshed.
  window.addEventListener('load', () => {
    setTimeout(refreshControlsAfterLoad, POST_LOAD_SETTLE_MS);
  });

  console.log('Avito Scraping Assistant avviato con supporto UTF-8');
  586. })();