Skip to content

More robust snapshot script #31

@johanneskiesel

Description

@johanneskiesel

Version of @phoerious

waitForLoadStateWithTimeout should probably be moved into pages.js.

The waiting-and-resizing part could also be moved into pages.js, as it is likely helpful for other scripts as well.

This script also covers the case of crawling several URLs at once. Not sure yet whether to keep that part. Probably yes.

const fs = require("fs-extra");
const path = require("path");

const { AbstractScriptorScript, files, pages, log } = require("@webis-de/scriptor");

// Script name and version; both are handed to the AbstractScriptorScript
// constructor by the exported class below.
const NAME = "Snapshot";
const VERSION = "0.2.0";

/**
 * Best-effort wrapper around `page.waitForLoadState`.
 *
 * Calls `page.waitForLoadState(event, { timeout })` and hands back whatever it
 * resolves to. Any exception raised by that call (synchronously or through a
 * rejected promise) is deliberately swallowed and `null` is returned instead,
 * so callers can keep going when the load state is not reached in time.
 *
 * @param {object} page - Object exposing a `waitForLoadState(event, options)` method.
 * @param {string} event - Load state to wait for (this file uses "load" and "networkidle").
 * @param {number} timeout - Value forwarded as the `timeout` option.
 * @returns {Promise<*>} The value from `page.waitForLoadState`, or `null` on failure.
 */
const waitForLoadStateWithTimeout = async (page, event, timeout) => {
  let outcome = null;
  try {
    outcome = await page.waitForLoadState(event, { timeout });
  } catch (_ignored) {
    // Intentionally ignored: a failed or timed-out wait is not fatal here.
  }
  return outcome;
};

module.exports = class extends AbstractScriptorScript {

  constructor() {
    super(NAME, VERSION);
  }

  async run(browserContexts, scriptDirectory, inputDirectory, outputDirectory) {
    const browserContext = browserContexts[files.BROWSER_CONTEXT_DEFAULT];

    // Script options
    const defaultScriptOptions = {
      viewportAdjust: {},
      snapshot: {
        screenshot: { timeout: 120000 }  // Screenshotting complex pages can take a very long time
      }
    };
    const requiredScriptOptions = [ "url" ];
    const scriptOptions = files.readOptions(files.getExisting(
      files.SCRIPT_OPTIONS_FILE_NAME, [ scriptDirectory, inputDirectory ]),
      defaultScriptOptions, requiredScriptOptions);
    log.info({options: scriptOptions}, "script.options");

    fs.writeJsonSync(path.join(outputDirectory, files.SCRIPT_OPTIONS_FILE_NAME), scriptOptions);

    // Load page(s)
    let url = scriptOptions["url"];
    if (typeof url === "string") {
      url = [url];
    }

    const promises = [];
    for (const [i, u] of url.entries()) {
      const page = await browserContext.newPage();
      promises.push(page.goto(u, { waitUntil: "domcontentloaded" }).then(async (resp) => {
        await waitForLoadStateWithTimeout(page, "load", 10000);

        // Adjust viewport height to scroll height to trigger loading dynamic content
        await pages.adjustViewportToPage(page, scriptOptions["viewportAdjust"]);

        // Wait for three networkidle intervals to ensure dynamic content finished loading
        for (let i = 0; i < 3; ++i) {
          await waitForLoadStateWithTimeout(page, "networkidle", 3500);
        }

        // Update viewport up to three times to accomodate for layout changes and
        // to trigger further dynamic content
        let resizes = 0;
        while (resizes < 3 && await page.viewportSize().height !== await pages.getHeight(page)) {
          await pages.adjustViewportToPage(page, scriptOptions["viewportAdjust"]);
          await waitForLoadStateWithTimeout(page, "networkidle", 2500);
          await page.waitForTimeout(250);
          ++resizes;
        }

        // Take snapshot(s)
        const snapName = url.length > 1 ? `snapshot-${i}` : "snapshot";
        await pages.takeSnapshot(page, Object.assign(
            { path: path.join(outputDirectory, snapName) }, scriptOptions["snapshot"]
        ));
      }));
    }
    await Promise.all(promises);

    return true;
  }
};

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions