I solved this problem by writing a small Node.js program (called extract.js) to scrape the text. This page was a helpful starting point: http://blog.miguelgrinberg.com/post/easy-web-scraping-with-nodejs
Each HTML page returned by Google Books contains several book pages, so if you simply increment the page parameter in the URL by 1 you end up scraping duplicate book pages unless you are careful (this was the part I was particularly stuck on). I got around it by using a jQuery-style selector to select only the individual book page named in the URL and to ignore the other book pages present in the HTML. That way I could quickly build a text file of the URLs for every single page in order (since the increment is always 1) using a spreadsheet program; a small script can generate the same list, as shown in the sketch below.
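If you would rather not use a spreadsheet, here is a minimal sketch of a URL-list generator. It assumes the URLs follow the pg=PA<n>&output=text pattern that extract.js below relies on; the book id, page range, and output filename are placeholders you will need to adjust for your own volume.

// makeurls.js -- minimal sketch for generating the url list instead of using a spreadsheet.
// Assumption: the urls use the pg=PA<n>&output=text pattern that extract.js expects.
// BOOK_ID, the page range and the output filename are placeholders to adjust.
var fs = require('fs');

var BOOK_ID = 'XXXXXXXXXX'; // placeholder: the id= value of the volume you are scraping
var FIRST_PAGE = 1;         // placeholder: first page number
var LAST_PAGE = 500;        // placeholder: last page number

var lines = [];
for (var p = FIRST_PAGE; p <= LAST_PAGE; p++) {
    lines.push('https://books.google.com/books?id=' + BOOK_ID + '&pg=PA' + p + '&output=text');
}

fs.writeFileSync('urls.txt', lines.join('\n') + '\n');
console.log('Wrote ' + lines.length + ' urls to urls.txt');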
So far I have successfully scraped the first two volumes, five more to go! The code is given below; it may serve as a useful starting point for scraping other Google Books.
// Usage: node extract.js input output
// where input (mandatory) is the text file containing your list of urls
// and output (optional) is the directory where the output files will be saved
var fs = require('fs');
var request = require('request');
var cheerio = require('cheerio');

// Read the command line parameters
var input = process.argv[2];
var output = process.argv[3];

if (!input) {
    console.log("Missing input parameter");
    return;
}

// Read the url input file, each url is on a new line
var urls = fs.readFileSync(input).toString().split('\n');

// Remove any lines that are not urls
// (filtering avoids the skipped-element bug of splicing inside a forward loop)
urls = urls.filter(function(line) {
    return line.slice(0, 4) === 'http';
});

// Iterate through the urls
for (var i = 0; i < urls.length; i++) {
    var url = urls[i];

    // request is asynchronous, hence the self-executing function to capture url.
    // The execution order of the callbacks cannot be guaranteed, so each result
    // is saved to its own file.
    request(url, (function(url) {
        return function(err, resp, body) {
            if (err) throw err;

            // Extract the pg parameter (book page) from the url.
            // We only extract the text of this book page, because the
            // retrieved html page contains multiple book pages.
            var pg = url.slice(url.indexOf('pg=') + 3, url.indexOf('&output=text'));

            // Define the filename, e.g. pg=PA12 becomes PA0012.txt
            var number = pg.slice(2);
            var zeroes = 4 - number.length;

            // Insert leading zeroes
            for (var j = 0; j < zeroes; j++) {
                number = '0' + number;
            }

            var filename = pg.slice(0, 2) + number + '.txt';

            // Add path to filename
            if (output) {
                if (!fs.existsSync(output)) fs.mkdirSync(output);
                filename = output + '/' + filename;
            }

            // Delete the file if it already exists
            if (fs.existsSync(filename)) fs.unlinkSync(filename);

            // Make the DOM available to cheerio's jquery-style selectors
            var $ = cheerio.load(body);

            // Select the book page.
            // Pages are contained within 'div' elements (where class='flow'),
            // each of which contains an 'a' element whose id equals the page.
            // Also match with ^ because page ids sometimes have a trailing
            // hyphen and extra characters.
            var page = $('div.flow:has(a[id=' + pg + ']), div.flow:has(a[id^=' + pg + '-])');

            // Extract and save the text of the book page to the file
            var hasText = false;

            // Text is in 'gtxt_body', 'gtxt_column' and 'gtxt_footnote'
            page.find('div.gtxt_body, div.gtxt_column, div.gtxt_footnote').each(function() {
                $(this).find('p.gtxt_body, p.gtxt_column, p.gtxt_footnote').each(function() {
                    hasText = true;
                    fs.appendFileSync(filename, $(this).text());
                    fs.appendFileSync(filename, '\n\n');
                });
            });

            // Log progress
            if (hasText) {
                console.log("Retrieved and saved page: " + pg);
            } else {
                console.log("Skipping page: " + pg);
            }
        };
    })(url));
}
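For example, with the URL list saved as urls.txt you could run node extract.js urls.txt volume1 (the file and directory names here are just illustrative). Each book page is then written to its own file in the output directory, named after the zero-padded pg value, so pg=PA12 ends up in volume1/PA0012.txt.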