// ==UserScript==
// @name Eza's Tumblr Scrape
// @namespace https://inkbunny.net/ezalias
// @description Creates a new page showing just the images from any Tumblr
// @license Public domain / No rights reserved
// @include http://*?ezastumblrscrape*
// @include http://*/ezastumblrscrape*
// @include http://*.tumblr.com/
// @include http://*.tumblr.com/page/*
// @include http://*.tumblr.com/tagged/*
// @version 3.0
// ==/UserScript==
// Because the cross-domain resource policy is just plain stupid (there is no reason I shouldn't be able to HTTP GET pages and files I can trivially load, or even execute without looking), this script creates an imaginary page at the relevant domain. Thankfully this does save a step: the user is not required to type in the domain they want to rip, because we can just check the URL in the address bar.
// This script also works on off-site Tumblrs, by the way - just add /archive?ezastumblrscrape?scrapewholesite after the ".com" or whatever.
// Make it work, make it fast, make it pretty - in that order.
// TODO:
// http://officialbrostrider.tumblr.com/tagged/homestuck/ezastumblrscrape does some seriously wacky shit - even /ezastumblrscrape doesn't wholly work, and it shows some other URL for siteurl sometimes.
// check if http://eleanorappreciates.tumblr.com/post/57980902871/here-is-the-second-sketch-i-got-at-the-chisaii-3#dnr does the same thing, it has snow
// handling dosopod and other redirect-themes might require taking over /archive and directly imitating a theme - e.g. requesting unstyled posts like infinite-scrolling pages and /archive must do
// http://dosopod.tumblr.com/ doesn't redirect anymore, but nor do the images scrape. same problem with http://kavwoshin.tumblr.com/.
// For scrapewholesite, I could test many distant pages asynchronously, wait until they all come back, then search more finely between the last good and first bad page. (pointless, but interesting.)
// scrape for image links, but don't post links that are also images? this would require removing duplicate elements in url_array[n][1] - naively, O(N^2), but for small N. Duplicates hardly matter and happen anyway.
// going one page at a time for /scrapewholesite is dog-slow, especially when there are more than a thousand pages. some balance between synchronicity and speed throttling is desirable.
// maybe grab several pages at once? that needs asynchronous requests, plus some way to know when a whole batch has come back. really, I just need to get some timer function working.
// does setInterval work? the auto-repeat one, I mean. (a rough sketch of the batched approach sits just above the global variables, below.)
// http://ymirsgirlfriend.tumblr.com/ - http://kavwoshin.tumblr.com/ does some ugly nonsense where images go off the left side of the page. wtf.
// Infinite-scrolling tumblrs don't necessarily link to the next page. I need another metric - like if pages only contain the same images as last time. (Empty pages sometimes display foreground images.)
// I'll have to add filtering as some kind of text input... and could potentially do multi-tag filtering, if I can reliably identify posts and/or reliably match tag definitions to images and image sets.
// This is a good feature for doing /scrapewholesite to get text links and then paging through them with fancy dynamic presentation nonsense. Also: duplicate elision.
// I'd love to do some multi-scrape stuff, e.g. scraping both /tagged/homestuck and /tagged/art, but that requires some communication between divs to avoid constant repetition.
// I should start handling "after the cut" situations somehow, e.g. http://banavalope.tumblr.com/post/72117644857/roachpatrol-punispompouspornpalace-happy-new
// Just grab any link to a specific /post. Occasional duplication is fine, we don't care.
// Wait, shit. Every theme should link to every page. And my banavalope example doesn't even link to the same domain, so we couldn't get it with raw AJAX. Meh. It's just a rare problem we'll have to ignore.
// http://askleijon.tumblr.com/ezastumblrscrape is a good example - lots of posts link to outside images (mostly imgur)
// I could detect "read more" links if I can identify the text-content portion of posts. links to /post/ pages are universal theme elements, but become special when they're something the user links to intentionally.
// for example: narcisso's dream on http://cute-blue.tumblr.com/ only shows the cover because the rest is behind a break.
// post-level detection would also be great because it'd let me filter out reblogs. fuck all these people with 1000-page tumblrs, shitty animated gifs in their theme, infinite scrolling, and NO FUCKING TAGS. looking at you, http://neuroticnick.tumblr.com/post/16618331343/oh-gamzee#dnr - you prick.
// Look into Tumblr Saviour to see how they handle and filter out text posts.
// Should non-image links from images be gathered at the top of each 'page' on the image browser? E.g. http://askNSFWcobaltsnow.tumblr.com links to Derpibooru a lot. Should those be listed before the images?
// I worry it'd pick up a lot of crap, like facebook and the main page. More blacklists / whitelists. Save it for when individual posts are detected.
// Using the Back button screws up the favicon. Weird.
// Ah fuck. onError might be linking to the wrong-size images again. That's an oooold bug making a comeback.
// It might just be blimpcat-art, actually. That site had serious problems before switching to /archive?.
// Consider going back to page-matching /thumbnail links for the "scrape" button. Single-tab weirdos may want to go back and forth from the page links on the embedded pages.
// http://playbunny.tumblr.com/archive?/tagged/homestuck/ezastumblrscrape/thumbnails photosets start with a non-image link.
// e.g. http://assets.tumblr.com/assets/styles/mobile_handset/mh_photoset.css?_v=50006b83e288948d62d0251c6d4a77fb#photoset#http://playbunny.tumblr.com/post/96067079633/photoset_iframe/playbunny/tumblr_nb21beiawY1qemks9/500/false
// ScrapeWholeSite: 10 pages at once by doing 10 separate XMLHttpRequest objects, waiting for each to flip some bit in a 10-bool array? Clumsy parallelism. Possibly recursion, if the check for are-we-all-done-yet is in the readyState==4 callback. (see the sketch just above the global variables, below.)
// I should probably implement a box and button for choosing lastpage, just for noob usability's sake. Maybe it'd only appear if pages==2.
// Options and shit should really be generalized to ?options instead of /options, so they can be safely ignored by existing code. e.g. explicit ?pagesatonce=10, ?startingpage=17, etc.
// get_site would remove anything that starts with ?, up to the next ?, or the end of string, or to a slash I guess. would you even do /ezastumblrscrape? hardly matters now, could be ?ezastumblrscrape for the @include lines.
// fuck it, more global vars. scrape ?options before branching to either scrape function.
// I started implementing this while drunk. It's a mess. Possibly related: http://promstuck.tumblr.com/tagged/promstuck/chrono/page/2 leads to http://promstuck.tumblr.com/archive?/tagged/promstuck/chrono/page/2?ezastumblrscrape?scrapewholesite which is incorrect for either system.
// standardize options into some array or map of strings and variables. foreach( option-name ) check each option, then later add them together for the sake of archive-site or add-archive or whatever
// I think "read more" links are standardized. I should be grabbing them like photosets.
// Browsing mode should link back to scrape mode, for when people manually add /ezastumblrscrape. (Also, the URL-handling functions should add /archive to handle that.)
// Immediate todo: add "read more" support, & fix up the Scrape link to work on all themes.
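// --- Rough sketch of the batched-fetch idea from the TODO notes above: one batch of asynchronous requests, a counter instead of a 10-bool array, and recursion from the readyState==4 callback. Nothing calls this yet; fetch_batch, handle_page, and the batch size are placeholder names/values, and the real scraper below still fetches synchronously. ---
function fetch_batch( site, first_page, final_page, batch_size, handle_page ) {
var pages_pending = Math.min( batch_size, final_page - first_page + 1 ); // how many requests are in flight for this batch
if( pages_pending <= 0 ) { return; } // past the last page, so stop
for( var x = first_page; x < first_page + pages_pending; x++ ) {
(function( page_number ) { // closure so each callback remembers its own page number
var xmlhttp = new XMLHttpRequest();
xmlhttp.onreadystatechange = function() {
if( xmlhttp.readyState == 4 ) {
handle_page( page_number, xmlhttp.responseText ); // e.g. soft_scrape_page() the response and print the links
pages_pending--; // one more request back
if( pages_pending == 0 ) { fetch_batch( site, first_page + batch_size, final_page, batch_size, handle_page ); } // whole batch done - recurse into the next one
}
}
xmlhttp.open( "GET", site + "/page/" + page_number, true ); // true = asynchronous, so the whole batch goes out at once
xmlhttp.send();
})( x );
}
}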
// ------------------------------------ Global variables ------------------------------------ //
var last_page = 0; // We need this global variable because GreaseMonkey still can't handle a button activating a function with parameters. It's used in scrape_whole_tumblr.
var options_map = new Object(); // Associative array for ?key=value pairs in URL. Will eventually subsume several of the above variables.
// Here's the URL options currently used. Scalars are at their default values; boolean flags are all set to false.
options_map[ "lastpage" ] = 0; // How many pages to scrape for image links when scouring the whole site. Useful for infinite-scrolling themes that can't be counted automatically.
options_map[ "startpage" ] = 1; // Page to start at when browsing images.
options_map[ "pagesatonce" ] = 10; // How many Tumblr pages to browse images from at once.
options_map[ "thumbnails" ] = false; // For browsing mode, 240px-wide images v.s full-size.
options_map[ "find" ] = ""; // What goes after the Tumblr URL. E.g. /tagged/art or /chrono.
// ------------------------------------ Script start, general setup ------------------------------------ //
// First, determine if we're loading many pages and listing/embedding them, or if we're just adding a convenient button to that functionality.
if( window.location.href.indexOf( 'ezastumblrscrape' ) > -1 ) { // If we're scraping pages:
// Replace Tumblr-standard Archive page with our own custom nonsense
var subdomain = window.location.href.substring( window.location.href.indexOf( "/" ) + 2, window.location.href.indexOf( "." ) ); // everything between http:// and .tumblr.com
var title = document.title;
document.head.innerHTML = ""; // Delete CSS. We'll start with a blank page.
document.title = subdomain + " - " + title;
document.body.outerHTML = "<div id='maindiv'><div id='fetchdiv'></div></div><div id='bottom_controls_div'></div>"; // This is our page. Top stuff, content, bottom stuff.
document.body.style.backgroundColor="#DDDDDD"; // Light grey BG to make image boundaries more obvious
var mydiv = document.getElementById( "maindiv" ); // I apologize for "mydiv." This script used to be a lot simpler.
// Identify options in URL (in the form of ?key=value pairs)
var options_pointer = 0;
while( window.location.href.indexOf( "?", options_pointer ) > -1 ) { // For each ?key in the URL
options_pointer = window.location.href.indexOf( "?", options_pointer ) + 1;
var next_stop = window.location.href.indexOf( "?", options_pointer ); // Options will be packed like ?this=1?that=2?other=3, so find the next ? if it exists
if( next_stop < 0 ) { next_stop = window.location.href.length; } // If this is the last option, go to the end of the URL
var this_option = window.location.href.substring( options_pointer, next_stop ); // This should now contain e.g. "that=2"
// Separate key from value, then put them in options_map
var this_key, this_value;
var equals_sign = this_option.indexOf( "=" );
if( equals_sign < 0 ) { // If there's no value defined (e.g. just ?thumbnails), treat it as a boolean flag set to true; otherwise split key from value
this_key = this_option;
this_value = true;
} else {
this_key = this_option.substring( 0, equals_sign );
this_value = this_option.substring( equals_sign+1 );
}
if( this_value == "false" ) { this_value = false; } // if it's thing=false, make it actually false
else if( !isNaN( parseInt( this_value ) ) ) { this_value = parseInt( this_value ); } // if it looks like a number, make it actually a number
options_map[ this_key ] = this_value;
}
if( options_map.find == "/" ) { options_map.find = ""; } // kludge - prevents example.tumblr.com//page/2 nonsense.
// Go to image browser or link scraper according to URL options.
mydiv.innerHTML = "Not all images are guaranteed to appear.<br>"; // Thanks to Javascript's wacky accomodating nature, mydiv is global despite appearing in an if-else block.
if( window.location.href.indexOf( "?scrapewholesite" ) < 0 ) {
// if( options_map.scrapewholesite ) {
scrape_tumblr_pages(); // Ten pages of embedded images at a time
} else {
scrape_whole_tumblr(); // Images from every page, presented as text links
}
} else { // If it's just a normal Tumblr page, add a link to the appropriate /ezastumblrscrape URL
// Add link(s) to the standard "+Follow / Dashboard" nonsense. Before +Follow, I think - to avoid messing with users' muscle memory.
// This is currently beyond my ability to dick with JS through a script in a plugin. Let's kludge it for immediate usability.
// kludge by Ivan - http://userscripts-mirror.org/scripts/review/65725.html
var site = window.location.href;
var url = site.substring( 0, site.indexOf( ".com" ) + 4 ) + "/archive?ezastumblrscrape?scrapewholesite?find=" + site.substring( site.indexOf( ".com" ) + 4 );
if( url.lastIndexOf( "/page/" ) > 0 ) { url = url.substring( 0, url.lastIndexOf( "/page/" ) ); } // Don't include e.g. /page/2. We'll add that ourselves.
// Don't clean this up. It's not permanent.
var eLink = document.createElement("a");
eLink.setAttribute("id","edit_link");
eLink.setAttribute("style","position:absolute;top:26px;right:2px;padding:2px 0 0;width:50px;height:18px;display:block;overflow:hidden;-moz-border-radius:3px;background:#777;color:#fff;font-size:8pt;text-decoration:none;font-weight:bold;text-align:center;line-height:12pt;");
eLink.setAttribute("href", url);
eLink.appendChild(document.createTextNode("Scrape"));
var elBody = document.getElementsByTagName("body")[0];
elBody.appendChild(eLink);
}
// ------------------------------------ Whole-site scraper for use with DownThemAll ------------------------------------ //
// Monolithic scrape-whole-site function, recreating the original intent (before I added pages and made it a glorified multipage image browser)
// I still can't determine the existence of _1280 images without downloading them entirely, so there will be some different-size duplicates. Better too much than not enough.
// So for archiving, I need some kind of sister Perl script that goes 'foreach filename containing _500, if (regex _1280) exists, delete this _500 file.'
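/* Rough stand-in for that sister script, sketched in Node.js rather than Perl. It's kept commented out because it's meant to run separately over the download folder, not inside this userscript; the folder path, and everything else here, is a guess at the eventual cleanup tool.
var fs = require( "fs" );
var dir = process.argv[2] || "."; // folder full of scraped images
fs.readdirSync( dir ).forEach( function( name ) {
if( name.indexOf( "_500." ) > -1 ) { // smaller automatic resize (the same trick would cover _400, _250, etc.)
var big = name.replace( "_500.", "_1280." );
if( fs.existsSync( dir + "/" + big ) ) { fs.unlinkSync( dir + "/" + name ); } // a full-size copy exists, so drop the _500 one
}
} );
*/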
function scrape_whole_tumblr() {
var highest_known_page = 0;
var site = get_site( window.location.href );
mydiv.innerHTML += "<h1><a href='" + options_url( "scrapewholesite", false ) + "?thumbnails'>Browse images</a></h1><br>"; // link to image-viewing version, preserving current tags
// Find out how many pages we need to scrape.
if( isNaN( options_map.lastpage ) ) { options_map.lastpage = 0; }
last_page = options_map.lastpage; // kludge. I'm lazy.
if( last_page == 0 ) {
// Find upper bound in a small number of fetches. Ideally we'd skip this - some themes list e.g. "Page 1 of 24." I think that requires back-end cooperation.
mydiv.innerHTML += "Finding out how many pages are in <b>" + site.substring( site.indexOf( '/' ) + 2 ) + "</b>:<br><br>"; // Telling users what's going on. "site" has http(s):// removed for readability.
for( var n = 2; n > 0 && n < 10000; n *= 2 ) { // 10,000 is an arbitrary upper bound to prevent infinite loops, but some crazy-old Tumblrs might have more pages. This used to stop at 5000.
var siteurl = site + "/page/" + n;
var xmlhttp = new XMLHttpRequest();
xmlhttp.onreadystatechange=function() {
if( xmlhttp.readyState == 4 ) {
if( xmlhttp.responseText.indexOf( "/page/" + (n+1) ) < 0 ) { // Does this page link to the next page? Pages too far will only link backwards. (Infinite scrolling doesn't link anywhere. Bleh.)
mydiv.innerHTML += siteurl + " is too high.<br>";
last_page = n;
n = -1; // break for(n) loop
} else {
mydiv.innerHTML += siteurl + " exists.<br>";
highest_known_page = n;
}
}
}
xmlhttp.open("GET", siteurl, false); // false=synchronous, for linear execution. There's no point checking if a page is the last one if we've already sent requests for the next dozen.
xmlhttp.send();
}
// Binary-search closer to the actual last page
// 1000+ page examples: http://neuroticnick.tumblr.com/ - http://teufeldiabolos.co.vu/
while( last_page > highest_known_page + 10 ) { // Arbitrary cutoff. We're just trying to minimize the range. A couple extra pages is reasonable; a hundred is excessive.
mydiv.innerHTML +="Narrowing down last page: ";
var middlepage = parseInt( (last_page + highest_known_page) / 2 ); // integer midpoint between highest-known and too-high pages
var siteurl = site + "/page/" + middlepage;
var xmlhttp = new XMLHttpRequest();
xmlhttp.onreadystatechange=function() {
if( xmlhttp.readyState == 4 ) {
if( xmlhttp.responseText.indexOf( "/page/" + (middlepage+1) ) < 0 ) { // Test for the presence of a link to the next page.
mydiv.innerHTML += siteurl + " is high.<br>";
last_page = middlepage;
} else {
mydiv.innerHTML += siteurl + " exists.<br>";
highest_known_page = middlepage;
}
}
}
xmlhttp.open("GET", siteurl, false); // false=synchronous, for linear execution. There's no point checking if a page is the last one if we've already sent requests for the next dozen.
xmlhttp.send();
}
}
options_map.lastpage = last_page;
// If we suspect infinite scrolling, or if someone silly has entered a negative number in the URL, tell them how to choose their own last_page value:
if( options_map.lastpage < 3 ) {
mydiv.innerHTML += "<br>Infinite-scrolling Tumblr themes will sometimes stop at 2 pages. " // Inform user
mydiv.innerHTML += "<a href='" + options_url( "lastpage", 100 ) + "'>Click here to try 100 instead.</a><br>"; // link to N-page version
}
mydiv.innerHTML += "<br>Last page detected is " + options_map.lastpage + " or lower.<br><br>";
// Add button to scrape every page, one after another.
// Buttons within GreaseMonkey are a huge pain in the ass. I stole this from stackoverflow.com/questions/6480082/ - thanks, Brock Adams.
var button = document.createElement ('div');
button.innerHTML = '<button id="myButton" type="button">Find image links from all pages</button>';
button.setAttribute ( 'id', 'scrape_button' ); // I'm really not sure why this id and the above HTML id aren't the same property.
document.body.appendChild ( button ); // Add button (at the end is fine)
document.getElementById ("myButton").addEventListener ( "click", scrape_all_pages, false ); // Activate button - when clicked, it triggers scrape_all_pages()
}
function scrape_all_pages() { // Example code implies that this function /can/ take a parameter via the event listener, but I'm not sure how.
var button = document.getElementById( "scrape_button" ); // First, remove the button. There's no reason it should be clickable twice.
button.parentNode.removeChild( button ); // The DOM can only remove elements from a higher level. "Elements can't commit suicide, but infanticide is permitted."
// We need to find "site" again, because we can't pass it. Putting a button on the page and making it activate a GreaseMonkey function borders on magic. Adding parameters is straight-up dark sorcery.
var site = get_site( window.location.href );
mydiv.innerHTML += "Scraping page: <div id='pagecounter'></div><br>"; // This makes it easier to view progress, since Firefox / Pale Moon only scrolls with the scroll wheel on pages which are still loading.
// Fetch all pages with content on them
for( var x = 1; x <= last_page; x++ ) {
var siteurl = site + "/page/" + x;
mydiv.innerHTML += "<b>Page " + x + " fetched</b><br><div id='" + siteurl + "'></div>";
document.getElementById( 'pagecounter' ).innerHTML = " " + x;
if( x != last_page ) {
asynchronous_fetch( siteurl, false ); // Sorry for the function spaghetti. Scrape_all_pages exists so a thousand pages aren't loaded in the background, and asynchronous_fetch prevents race conditions.
} else {
asynchronous_fetch( siteurl, true ); // Stop = true when we're on the last page. No idea if it accomplishes anything at this point. (Probably not, thanks to /archive?.)
document.getElementById( 'pagecounter' ).innerHTML += "<br>Done. Use DownThemAll (or a similar plugin) to grab all these links.";
}
}
}
function asynchronous_fetch( siteurl, stop ) { // separated into another function to prevent race conditions (i.e. variables changing while an asynchronous request is happening)
var xmlhttp = new XMLHttpRequest(); // AJAX object
xmlhttp.onreadystatechange = function() { // When the request returns, this anonymous function will trigger (repeatedly, for various stages of the reply)
if( xmlhttp.readyState == 4 ) { // Don't do anything until we're done downloading the page.
var thisdiv = document.getElementById( siteurl ); // identify the div we printed for this page
thisdiv.innerHTML += "<a href='" + siteurl + "'>" + siteurl + "</a><br>"; // link to page, in case you want to see something in-situ (e.g. for proper sourcing)
var url_array = soft_scrape_page( xmlhttp.responseText ); // turn HTML dump into list of URLs
// Print URLs so DownThemAll (or similar) can grab them
for( var n = 0; n < url_array.length; n++ ) {
var image_url = url_array[n][1]; // url_array is an array of 2-element arrays. each inner array goes <url, position on page>.
thisdiv.innerHTML += "<a href=" + image_url + ">" + image_url + "</a><br>"; // These URLs don't need to be links, but why not? Anyway, lusers don't know what "URL" means.
// Some images are automatically resized. We'll add the maximum-sized link in case it exists - unfortunately, there's no easy way to check if it exists. We'll just post both.
var fixed_url = "";
if( image_url.lastIndexOf( "_500." ) > -1 ) { fixed_url = image_url.replace( "_500.", "_1280." ); }
if( image_url.lastIndexOf( "_400." ) > -1 ) { fixed_url = image_url.replace( "_400.", "_1280." ); }
if( fixed_url.indexOf( "#photoset" ) > 0 ) { fixed_url = ""; } // Photoset image links are never resized. Tumblr did at least this one thing right.
if( fixed_url !== "" ) { thisdiv.innerHTML += "<a href=" + fixed_url + ">" + fixed_url + "</a><br>"; }
}
if( stop ) { window.stop(); } // clumsy way to finish up, once per page, for sites with uncooperative script bullshit that makes everything vanish after loading completes. (not sure this does anything anymore.)
}
}
xmlhttp.open("GET", siteurl, false); // This should probably be "true" for asynchronous at some point, but naively, it spams hundreds of GETs per second. This spider script shouldn't act like a DDOS.
xmlhttp.send();
}
// ------------------------------------ Multi-page scraper with embedded images ------------------------------------ //
// I should probably change page numbers such that ezastumblrscrape/100 starts at /page/100 and goes to /page/(100+numberofpages). Just ignore /page/0.
function scrape_tumblr_pages() { // Create a page where many images are displayed as densely as seems sensible
// Figure out which site we're scraping
var site = get_site( window.location.href ); // remove /archive? nonsense, remove /ezastumblrscrape nonsense, preserve /tagged/whatever, /chrono, etc.
options_map.lastpage = false; // cosmetic. we don't need ?lastpage in the URL because it doesn't matter here.
var next_link = options_url( "startpage", options_map.startpage + options_map.pagesatonce );
var prev_link = options_url( "startpage", options_map.startpage - options_map.pagesatonce );
options_url( "startpage", 1000 ); // debug - I think I'm getting side-effects from copy_map
if( !isNaN( parseInt( options_map.startpage ) ) && options_map.startpage <= 1 ) {
options_map.startpage = 1; // Reset in case it's screwy. Negative numbers work, but all return page 1 anyway.
var prev_next_controls = "<br><a href='" + next_link + "'>Next >>></a><br><br>";
} else {
var prev_next_controls = "<br><a href='" + prev_link + "'><<< Previous</a> - <a href='" + next_link + "'>Next >>></a><br><br>";
}
mydiv.innerHTML += prev_next_controls;
document.getElementById("bottom_controls_div").innerHTML += prev_next_controls;
// Link to the thumbnail page or full-size-image page as appropriate
if( options_map.thumbnails ) { mydiv.innerHTML += "<a href='"+ options_url( "thumbnails", false ) + "'>Switch to full-size images</a><br>"; }
else { mydiv.innerHTML += "<a href='"+ options_url( "thumbnails", true ) + "'>Switch to thumbnails</a><br>"; }
// Grab several pages and extract/embed images.
var start_page = parseInt( options_map.startpage ); // debug-ish. I'll use these more directly soon enough.
var number_of_pages_at_once = parseInt( options_map.pagesatonce );
for( var x = start_page; x < start_page + number_of_pages_at_once; x++ ) {
var siteurl = site + "/page/" + x;
mydiv.innerHTML += "<hr><b>Page " + x + " fetched</b><br><div id='" + siteurl + "'></div>"; // TODO: Sanitize the URL here and in fetch_page. It's just a unique ID.
fetch_page( siteurl, mydiv, options_map.thumbnails ); // I'd rather do this right here, but unless the whole AJAX mess is inside its own function, matching a responseText to its siteurl is fucking intractable.
}
}
function fetch_page( siteurl, mydiv, thumbnails ) { // Grab a page, scrape its image URLs, and embed them for easy browsing
var xmlhttp = new XMLHttpRequest(); // AJAX object
xmlhttp.onreadystatechange = function() { // When the request returns, this anonymous function will trigger (repeatedly, for various stages of the reply)
if( xmlhttp.readyState == 4 ) { // Don't do anything until we're done downloading the page.
var thisdiv = document.getElementById( siteurl ); // identify the div we printed for this page // TODO: Sanitize, as above. Code execution through this niche script is unlikely, but why keep it possible?
thisdiv.innerHTML += "<a href='" + siteurl + "'>" + siteurl + "</a><br>"; // link to page, in case you want to see something in-situ (e.g. for proper sourcing)
var url_array = soft_scrape_page( xmlhttp.responseText ); // turn HTML dump into list of URLs
// Embed high-res images to be seen, clicked, and saved
for( var n = 0; n < url_array.length; n++ ) {
var image_url = url_array[n][1];
// For images which might have been automatically resized, assume the highest resolution exists, and change the URL accordingly.
var fixed_url = "";
if( image_url.lastIndexOf( "_500." ) > -1 ) { fixed_url = image_url.replace( "_500.", "_1280." ); }
if( image_url.lastIndexOf( "_400." ) > -1 ) { fixed_url = image_url.replace( "_400.", "_1280." ); }
if( image_url.lastIndexOf( "_250." ) > -1 ) { fixed_url = image_url.replace( "_250.", "_1280." ); }
if( image_url.lastIndexOf( "_100." ) > -1 ) { fixed_url = image_url.replace( "_100.", "_1280." ); }
if( fixed_url.indexOf( "#photoset" ) > 0 ) { fixed_url = ""; } // Photosets always link to the highest resolution available.
if( fixed_url !== "" ) { image_url = fixed_url; }
// This clunky <img onError> function looks for a lower-res image if the high-res version doesn't exist.
var on_error = 'if(this.src.indexOf("_1280")>0){this.src=this.src.replace("_1280","_500");}'; // Swap 1280 for 500
on_error += 'else if(this.src.indexOf("_500")>0){this.src=this.src.replace("_500","_400");}'; // Or swap 500 for 400
on_error += 'else if(this.src.indexOf("_400")>0){this.src=this.src.replace("_400","_250");}'; // Or swap 400 for 250
on_error += 'else{this.src=this.src.replace("_250","_100");this.onerror=null;}'; // Or swap 250 for 100, then give up
on_error += 'document.getElementById("' + image_url + '").href=this.src;'; // Link the image to itself, regardless of size
// Embed images (linked to themselves) and link to photosets
if( image_url.indexOf( "#" ) < 0 ) { // if it's just an image, then embed that image, linked to itself
if( options_map.thumbnails ) { thisdiv.innerHTML += "<a id='" + image_url + "' href='" + image_url + "'><img width='240' src='" + image_url + "' onerror='" + on_error + "'></a> "; }
else { thisdiv.innerHTML += "<a id='" + image_url + "' href='" + image_url + "'><img src='" + image_url + "' onerror='" + on_error + "'></a> "; }
} else { // but if it's an image from a photoset, also print the photoset link. (is on_error necessary here? these images are already high-res. I guess it's an unintrusive fallback.)
var photoset_url = image_url.substring( image_url.lastIndexOf( "#" ) + 1 ); // separate everything past the last hash - it's like http://tumblr.com/image#photoset#http://tumblr.com/photoset_iframe
if( photoset_url.substring( 0, 4) == "http" ) { thisdiv.innerHTML += " <a href='" + photoset_url + "'>Set:</a>"; } // if the #photoset tag is followed by an #http URL, link the URL
if ( options_map.thumbnails ) { thisdiv.innerHTML += "<a id='" + image_url + "' href='" + image_url + "'><img alt='(Wait for image)' width='240' src='" + image_url + "' onerror='" + on_error + "'></a> "; }
else { thisdiv.innerHTML += "<a id='" + image_url + "' href='" + image_url + "'><img alt='(Image)' src='" + image_url + "' onerror='" + on_error + "'></a> "; }
}
}
}
}
xmlhttp.open("GET", siteurl, true); // True = asynchronous. Finally got the damn thing to work! It's a right bitch to do in an inline function. JS scopes are screwy as hell.
xmlhttp.send();
}
// ------------------------------------ Universal page-scraping function (and other helped functions) ------------------------------------ //
// This scrapes all embedded images, iframe photosets, and linked image files into an array. Including all content is a work in progress.
function soft_scrape_page( html_copy ) {
var url_array = new Array();
// look for <img> tags, isolate src URLs
var string_counter = 0; // this is what we'll use instead of copying and scraping everything. indexOf( "thing", string_counter ).
while( html_copy.indexOf( '<img', string_counter ) > -1 ) { // For each <img> tag in the page's HTML
// String_counter must ALWAYS be higher at the end of this loop than the beginning, because otherwise, while() fucks us. In fact, let's enforce that:
// Firefox is aggravatingly susceptible to freezing for infinite loops. In a sandbox! I hope it's because GM is a plugin, because otherwise, yeesh.
var string_counter_enforcement = string_counter; // if string_counter isn't higher than this at the end of the while() loop, you done goofed
// Seek to next <img> tag, extract source
string_counter = html_copy.indexOf( '<img', string_counter ) + 4;
var next_image_src = html_copy.indexOf( 'src=', string_counter ) + 5; // note: we advance past the quote, not just past the equals sign
var next_angle_bracket = html_copy.indexOf( '>', string_counter );
if( next_angle_bracket > next_image_src ) { // If this <img> tag contains a src, grab it. (I doubt any <img> tags are malformed, but let's be cautious.)
string_counter = next_image_src;
var quote_type = html_copy.substring( string_counter - 1, string_counter ); // either a singlequote or a doublequote
var image_url = html_copy.substring( string_counter, html_copy.indexOf( quote_type, string_counter ) );
}
// Exclude a bunch of useless nonsense with a blacklist
if( image_url.indexOf( "//assets.tumblr.com" ) > 0 ) { image_url = ""; } // let's ignore avatar icons and Tumblr stuff.
if( image_url.indexOf( "//static.tumblr.com" ) > 0 ) { image_url = ""; }
if( image_url.indexOf( "//www.tumblr.com" ) > 0 ) { image_url = ""; }
if( image_url.indexOf( "/avatar_" ) > 0 ) { image_url = ""; }
// Include potentially interesting nonsense with a whitelist
// General offsite whitelist would include crap like Facebook buttons, Twitter icons, etc.
if( image_url.indexOf( ".tumblr.com" ) < 0 ) { // note that this test is different from the others - we blank image_url if the search term is not found, instead of blanking if it is found
var original_image_url = image_url;
image_url = "";
if( original_image_url.indexOf( "deviantart.net" ) > 0 ) { image_url = original_image_url; } // this is a sloppy whitelist of non-tumblr domains
if( original_image_url.indexOf( "imgur.com" ) > 0 ) { image_url = original_image_url; }
if( original_image_url.indexOf( "imageshack.com" ) > 0 ) { image_url = original_image_url; }
if( original_image_url.indexOf( "imageshack.us" ) > 0 ) { image_url = original_image_url; }
if( original_image_url.indexOf( "tinypic.com" ) > 0 ) { image_url = original_image_url; } // this originally read "tinypic.com1", but I assume I was drunk.
if( original_image_url.indexOf( "gifninja.com" ) > 0 ) { image_url = original_image_url; }
if( original_image_url.indexOf( "photobucket.com" ) > 0 ) { image_url = original_image_url; }
if( original_image_url.indexOf( "dropbox.com" ) > 0 ) { image_url = original_image_url; }
}
if( image_url !== "" ) {
url_array.push( [string_counter, image_url] ); // Push the page location alongside the URL, for a 2D array where the first element (url_array[n][0]) is its display order - for later sorting
}
if( string_counter_enforcement > string_counter ) { string_counter = string_counter_enforcement + 1; } // Make sure our while() eventually ends. Possibly throw an error here, for debugging.
}
// Look for links to offsite images, isolate URLs
string_counter = 0; // reset to scrape for links this time
while( html_copy.indexOf( '<a', string_counter ) > -1 ) {
var string_counter_enforcement = string_counter; // if string_counter isn't higher than this at the end of the while() loop, you done goofed
// I probably don't even need to look for '<a'. 'href=' is enough, since we're filtering things to look like images.
string_counter = html_copy.indexOf( '<a', string_counter ) + 2; // advance to where the next link is defined
string_counter = html_copy.indexOf( 'href=', string_counter ) + 6; // advance to the target URL (note: we advance past the quote, not just past the equals sign)
var quote_type = html_copy.substring( string_counter - 1, string_counter ); // find either a singlequote or a doublequote delimiter past the 'href='
var image_url = html_copy.substring( string_counter, html_copy.indexOf( quote_type, string_counter ) ); // grab the delimited target URL
if( image_url.indexOf( ".tumblr.com" ) > 0 ) { // ignore tumblr links, they're probably just embedded images (which we already scrape)
image_url = "";
} else { // clumsy whitelist - keep anything that looks like an image format
var original_image_url = image_url;
image_url = "";
if( original_image_url.indexOf( ".gif" ) > 0 ) { image_url = original_image_url; }
if( original_image_url.indexOf( ".jpg" ) > 0 ) { image_url = original_image_url; }
if( original_image_url.indexOf( ".jpeg" ) > 0 ) { image_url = original_image_url; }
if( original_image_url.indexOf( ".png" ) > 0 ) { image_url = original_image_url; }
}
if( image_url !== "" ) {
url_array.push( [parseFloat("0." + string_counter), image_url] ); // We lie about their order on the page (zero point string_counter) to avoid doubling-up when embedded images link to themselves
}
if( string_counter_enforcement > string_counter ) { string_counter = string_counter_enforcement + 1; } // making sure our while() eventually ends, even if we fuck up
}
// look for photoset iframes, then fetch them and soft-scrape them
string_counter = 0; // reset to scrape for photosets this time
while( html_copy.indexOf( 'id="photoset', string_counter ) > -1 ) {
string_counter = html_copy.indexOf( 'id="photoset', string_counter ) + 10; // advance to where the next photoset is defined
string_counter = html_copy.indexOf( 'src="', string_counter ) + 5; // advance to the source URL (we can assume doublequotes b/c photosets are never themed)
var photoset_url = html_copy.substring( string_counter, html_copy.indexOf( '"', string_counter ) ); // grab the doublequote-delimited source URL
if( photoset_url.indexOf( "photoset_iframe" ) > 0 ) { // do not attempt to extract photoset links from false-positive id="photoset" hits - it causes this function to fail
var photosetxml = new XMLHttpRequest();
photosetxml.onreadystatechange = function() { // this will trigger whenever a photoset request comes back
if( photosetxml.readyState == 4 ) { // when we're finally done loading the request
var photoset_html = photosetxml.responseText; // I'm not sure you can write to responseText, but this is smarter practice regardless.
var photoset_string_counter = 0; // best not to overload string_counter for a different scope.
var first_image = true;
while( photoset_html.indexOf( 'href="', photoset_string_counter ) > -1 ) { // do photosets need to be singlequote/doublequote-agnostic? I think they're 100% Tumblr-standardized.
photoset_string_counter = photoset_html.indexOf( 'href="', photoset_string_counter ) + 6; // advance to next link href
var image_url = photoset_html.substring( photoset_string_counter, photoset_html.indexOf( '"', photoset_string_counter ) ); // grab contents of link href
// push [string_counter.photoset_string_counter as a float for sorting, image URL + "#photoset" (plus the photoset URL on the first image, for linking to the set)]
if( first_image ) {
url_array.push( [parseFloat(string_counter + "." + photoset_string_counter), image_url + "#photoset#" + photoset_url] );
first_image = false; // We want the photoset URL attached to just the first image found, so it's only linked once. Other images are only generically marked #photoset.
} else {
url_array.push( [parseFloat(string_counter + "." + photoset_string_counter), image_url + "#photoset"] );
}
}
}
}
photosetxml.open("GET", photoset_url, false);
photosetxml.send();
}
}
url_array.sort( function(a,b) { return a[0] - b[0]; } ); // sort ascending by page position: each element is a [string_counter, image_url] pair, and the comparator puts a before b when a[0] - b[0] is negative
return url_array;
}
function get_site( site ) {
site = site.substring( 0, site.indexOf( "/archive?" ) ); // so we should get e.g. "sample.tumblr.com"
// if( options_map.find.substring( 0, 1 ) != "/" ) { options_map.find = "/" + options_map.find; } // Always start with a slash
// if( options_map.find.substring( options_map.find.length ) == "/" ) { options_map.find = options_map.find.substring( 0, options_map.find.length - 1 ); } // Never end with a slash
site += options_map.find; // e.g. add "/tagged/homestuck" back on
return site;
}
// minor kludge: options_url( key, value ) { copy_map = options_map; copy_map[ key ] = value; construct url from copy_map, return url }
// why is copy_map only a reference to options_map? why the fuck doesn't javascript support copying objects by value?!
function options_url( key, value ) {
var copy_map = new Object();
// copy_map.assign = my_assign; // Allow copying to copy_map. Yes - it is literally impossible to copy an object without this code. Fuck you, Javascript.
// copy_map = options_map; // operate on a copy so we can make changes
for( var i in options_map ) { copy_map[ i ] = options_map[ i ]; } // In any sensible language, this would read "copy_map = options_map." Javascript genuinely does not know how to copy objects. Fuck's sake.
if( key ) { // the parameters are optional. just calling options_url() will return e.g. example.tumblr.com/archive?ezastumblrscrape?startpage=1
if( !value ) { value = false; } // if there's no value then use false
copy_map[ key ] = value; // change this key, so we can e.g. link to example.tumblr.com/archive?ezastumblrscrape?startpage=2
}
// Construct URL from options
var site = window.location.href.substring( 0, window.location.href.indexOf( "?" ) ); // should include /archive, but if not, it still works on most pages
for( var k in copy_map ) { // JS associative maps are weird. We're actually setting attributes of a generic object. So options_map[ "thumbnails" ] is the same as options_map.thumbnails.
if( copy_map.hasOwnProperty( k ) ) { // So if this variable "in" the map is an attribute of the object, it's a key. At least so far as we're concerned.
if( copy_map[ k ] ) { // Don't add tags for options which are set to false
site += "?" + k + "=" + copy_map[ k ];
}
}
}
return site;
}