Source code for feed2exec.plugins.wayback

import logging

import requests

import html5lib

WAYBACK_URL = 'https://web.archive.org'

ERROR_XPATHS = [
    # XPath query for detecting blocking issues such as
    # URL syntax errors or unresolvable domains.
    #
    # <h2>Sorry</h2>
    # <p>Error text</p>
    # ...
    '//*[text()="Sorry"]/following-sibling::*/text()',

    # XPath query for detecting errors of the saving process,
    # the main one appears to be resaving the page too quickly.
    #
    # <h2 id="spn-title">Saving page https://example.org/</h2>
    # <p>Error text</p>
    # <div id="spn-option-mywebarchive">
    # ...
    '//*[starts-with(text(),"Saving page ")]' \
    '/following-sibling::*[not(@id)]/text()',
]


def _check_response(url, res):
    """Check the wayback machine response for errors

    url is the URL that has been archived
    res is the response from the archive request
    """

    # SPN2 POST requests always return HTML error pages that need parsing
    # SPN1 GET requests return HTML error pages with HTTP error codes
    parse = res.request.method == 'POST' or res.status_code != requests.codes.ok

    if parse and res.text:
        html = html5lib.parse(res.text, treebuilder="lxml")

        for error_xpath in ERROR_XPATHS:
            error = html.xpath(error_xpath)
            if error:
                break

        if error and isinstance(error, list):
            error = error[0]

    else:
        error = []

    if error and res.status_code != requests.codes.ok:
        logging.warning('wayback machine failed to save URL %s, status %d %s: %s',  # noqa
                        url, res.status_code, res.reason, error)
    elif not error and res.status_code != requests.codes.ok:
        logging.warning('wayback machine failed to save URL %s, status %d %s',  # noqa
                        url, res.status_code, res.reason)
    elif error and res.status_code == requests.codes.ok:
        logging.warning('wayback machine failed to save URL %s: %s',
                        url, error)
    elif not error and res.status_code == requests.codes.ok:
        return True

    return False


[docs]def output(*args, feed=None, item=None, session=None, **kwargs):
    """This plugin saves the feed items `link` element to the wayback
    machine. It will retry URLs that fail, so it may be necessary to
    manually catchup feeds if they have broken `link` fields.

    There are two wayback machine APIs that can be used, the default one
    archives the full page while the other one archives only the page URLs.

    The mechanism for archiving the full page uses a browser on the server
    to download all the resources used by the page including img/CSS/JS/etc:

    https://blog.archive.org/2019/10/23/the-wayback-machines-save-page-now-is-new-and-improved/

    Unfortunately the SPN2 page is just a HTML form, the response code is
    always 200 OK, any errors are returned in a HTML page and there are
    no easily machine-readable ways to find the errors on the page so we
    have to parse the HTML and query it using XPath based heuristics,
    but if there are errors in a form that is not yet known then the
    unknown errors will not be detected properly.

    Example::

      [NASA IOTD wayback]
      url = https://www.nasa.gov/rss/dyn/lg_image_of_the_day.rss
      output = feed2exec.plugins.wayback
      args = full

    The above will save the Image of the day updates to the wayback
    machine. Since the page has images and everything is loaded by JS,
    using the SPN2 API is nessecary to capture the useful information.

    Example::

      [ikiwiki RecentChanges wayback]
      url = https://ikiwiki.info/recentchanges/index.rss
      output = feed2exec.plugins.wayback
      args = page

    The above will save the ikiwiki RecentChanges to the wayback machine.
    Since the page is plain text, saving the full page resources is not
    really nessecary, the CSS does not add information to the page.
    """

    full = not args or 'full' in args

    if item:
        url = item.get('link')

    if item and url:
        if feed.get('catchup'):
            return True

    # Use a browser on the server to download the entire page:
    if item and url and full:
        logging.debug('saving URL %s with resources to wayback machine', url)
        wayback_url = '%s/save' % (WAYBACK_URL)
        form = {
            'url': url,
            'capture_all': 'on',  # Enables saving HTTP error pages
        }
        res = session.post(wayback_url, data=form, headers={"Cache-Control": "no-store"})
        if _check_response(url, res):
            logging.info('URL %s with resources probably saved to wayback machine', url)  # noqa
            return True

    # Just save the URL and not any resources used by it
    if item and url and not full:
        logging.debug('saving URL %s without resources to wayback machine', url)  # noqa
        wayback_url = '%s/save/%s' % (WAYBACK_URL, url)
        res = session.get(wayback_url, allow_redirects=True, headers={"Cache-Control": "no-store"})
        if _check_response(url, res):
            logging.info('URL %s saved without resources to wayback machine: %s', url, res.url)  # noqa
            return True

    return False