save html output of page after execution of the page's javascript

Javascript, Phantomjs, Headless Browser

Javascript Problem Overview


There is a site I am trying to scrape that first loads an HTML/JS page, modifies the form input fields using JS, and then POSTs. How can I get the final HTML output of the POSTed page?

I tried to do this with PhantomJS, but it seems to only have an option to render image files. Googling around suggests it should be possible, but I can't figure out how. My attempt:

var fs = require('fs');
var page = require('webpage').create();

// Callback handed to page.open: saves a screenshot and the page markup,
// then asks PhantomJS to quit.
function saveAndExit() {
    // Placeholder for code to run inside the page; currently does nothing.
    page.evaluate(function () {});

    page.render('export.png');
    fs.write('1.html', page.content, 'w');
    phantom.exit();
}

page.open('https://www.somesite.com/page.aspx', saveAndExit);

This code will be used for a client; I can't expect him to install too many packages (nodejs, casperjs, etc.).

Thanks

Javascript Solutions


Solution 1 - Javascript

The output code you have is correct, but there is an issue with synchronicity: the output lines that you have are being executed before the page is done loading. You can tie into the onLoadFinished callback to find out when that happens. See the full code below.

    var page = require('webpage').create();
    var fs = require('fs');

    // Runs when the page reports that loading has finished. `status` is
    // "success" or "fail"; output is only written for a successful load so a
    // failed request does not silently produce an empty 1.html.
    page.onLoadFinished = function(status) {
      console.log("page load finished");
      if (status !== "success") {
        console.log("page failed to load");
        phantom.exit(1);
        return;
      }
      page.render('export.png');
      fs.write('1.html', page.content, 'w');
      phantom.exit();
    };

    page.open("http://www.google.com", function() {
      // Placeholder for code to run inside the page; currently does nothing.
      page.evaluate(function() {
      });
    });

When using a site like Google, it can be deceiving because it loads so quickly that you can often execute a screengrab inline like you have it. Timing is a tricky thing in PhantomJS; sometimes I test with setTimeout to see if timing is an issue.

Solution 2 - Javascript

When I copied your code directly, and changed the URL to www.google.com, it worked fine, with two files saved:

  • 1.html
  • export.png

Bear in mind that the files will be written to the location you run the script from, not where your .js file is located.

Solution 3 - Javascript

After 2 long days of struggling and frustration I finally got my similar issue solved. What did the trick was the waitfor.js example in PhantomJS' official website. Be happy!

"use strict";

/**
 * Poll a condition every 250ms until it becomes truthy or a timeout elapses.
 *
 * Once the condition has been seen truthy, `onReady` is run on the next tick.
 * If the timeout elapses first, a message is logged and PhantomJS is asked to
 * exit with code 1.
 *
 * @param {Function|string} testFx - Condition to check; a function whose
 *     return value is used, or a string that is passed to eval().
 * @param {Function|string} onReady - What to run once the condition holds;
 *     a function to call, or a string that is passed to eval().
 * @param {number} [timeOutMillis] - Maximum time to wait in milliseconds.
 *     Any falsy value (including 0) falls back to 3000.
 */
function waitFor(testFx, onReady, timeOutMillis) {
    var maxtimeOutMillis = timeOutMillis ? timeOutMillis : 3000, //< Default Max Timeout is 3s
        start = new Date().getTime(),
        condition = false,
        interval = setInterval(function() {
            if ( (new Date().getTime() - start < maxtimeOutMillis) && !condition ) {
                // If not time-out yet and condition not yet fulfilled
                // NOTE(review): string arguments go through eval(); only pass trusted strings.
                condition = (typeof(testFx) === "string" ? eval(testFx) : testFx()); //< defensive code
            } else {
                // Stop polling before doing anything else, so that a throwing
                // onReady (or a slow exit) cannot cause this branch to run again.
                clearInterval(interval);
                if(!condition) {
                    // If condition still not fulfilled (timeout but condition is 'false')
                    console.log("'waitFor()' timeout");
                    phantom.exit(1);
                } else {
                    // Condition fulfilled (timeout and/or condition is 'true')
                    console.log("'waitFor()' finished in " + (new Date().getTime() - start) + "ms.");
                    typeof(onReady) === "string" ? eval(onReady) : onReady(); //< Do what it's supposed to do once the condition is fulfilled
                }
            }
        }, 250); //< repeat check every 250ms
}


var page = require('webpage').create();

// Open Twitter on 'sencha' profile and, onPageLoad, do...
page.open("http://twitter.com/#!/sencha", function (status) {
    // Check for page load success
    if (status !== "success") {
        console.log("Unable to access network");
        // Nothing else ends the script on this path, so exit explicitly
        // (non-zero to signal the failure) instead of leaving it hanging.
        phantom.exit(1);
    } else {
        // Wait for 'signin-dropdown' to be visible
        waitFor(function() {
            // Check in the page if a specific element is now visible
            return page.evaluate(function() {
                // NOTE(review): assumes the loaded page provides jQuery as `$` - confirm.
                return $("#signin-dropdown").is(":visible");
            });
        }, function() {
           console.log("The sign-in dialog should be visible now.");
           phantom.exit();
        });
    }
});

Solution 4 - Javascript

I tried several approaches to similar task and the best results I got using Selenium.

Before I tried PhantomJS and Cheerio. Phantom was crashing too often while executing JS on the page.

Solution 5 - Javascript

I'm using CasperJS to run tests with PhantomJS. I added this code to my tearDown function:

var require = patchRequire(require);
var fs = require('fs');

// tearDown hook: captures a screenshot and writes the result of
// casper.getHTML(undefined, true) to disk.
function saveSnapshot() {
    casper.capture("export.png");
    fs.write("1.html", casper.getHTML(undefined, true), 'w');
}

casper.test.begin("My Test", {
    tearDown: saveSnapshot,
    test: function(tester){
        // test code

        casper.run(function(){
            tester.done();
        });
    }
});

See docs for capture and getHTML.

Solution 6 - Javascript

One approach that comes to my mind, besides using a headless browser, is obviously to simulate the AJAX calls and to assemble the page post-process, request by request. This, however, is often kind of tricky and should be used as a last resort, unless you really like to dig through JavaScript code.

Solution 7 - Javascript

This can easily be done with some PHP code and JavaScript: use fopen() and fwrite() to save it, and this statement to capture the generated source: var generatedSource = new XMLSerializer().serializeToString(document);

Attributions

All content for this solution is sourced from the original question on Stackoverflow.

The content on this page is licensed under the Attribution-ShareAlike 4.0 International (CC BY-SA 4.0) license.

Content Type | Original Author | Original Content on Stackoverflow
Question | gyaani_guy | View Question on Stackoverflow
Solution 1 - Javascript | uffa | View Answer on Stackoverflow
Solution 2 - Javascript | Owen Martin | View Answer on Stackoverflow
Solution 3 - Javascript | Heitor | View Answer on Stackoverflow
Solution 4 - Javascript | strah | View Answer on Stackoverflow
Solution 5 - Javascript | Ben Hutchison | View Answer on Stackoverflow
Solution 6 - Javascript | Dropout | View Answer on Stackoverflow
Solution 7 - Javascript | Sem Voigtländer | View Answer on Stackoverflow