Source code for feed2exec.plugins.archive

import logging
import os.path
import requests
from feed2exec.utils import slug, make_dirs_helper


#: default archive directory
DEFAULT_ARCHIVE_DIR = '/run/user/1000/'


def output(*args, feed=None, item=None, **kwargs):
    """The archive plugin saves the feed's item.link URLs into a
    directory, specified by DEFAULT_ARCHIVE_DIR or through the output
    `args` value.

    Example::

      [NASA breaking news]
      url = https://www.nasa.gov/rss/dyn/breaking_news.rss
      output = archive
      args = /srv/archive/nasa/

    The above will save the "NASA breaking news" feed items into the
    ``/srv/archive/nasa`` directory. Do *not* use interpolation here,
    as the feed's variables could be used to mount a directory
    traversal attack.
    """
    # make a safe path from the item name
    path = slug(item.get('title', 'no-name'))
    # take the archive dir from the user or use the default
    archive_dir = ' '.join(args) if args else DEFAULT_ARCHIVE_DIR
    make_dirs_helper(archive_dir)
    # put the file in the archive directory
    path = os.path.join(archive_dir, path)
    # only operate on items that actually have a link
    if item.get('link'):
        # tell the user what's going on, if verbose;
        # otherwise, we try to stay silent if all goes well
        logging.info('saving feed item %s to %s from %s%s',
                     item.get('title'), path, item.get('link'),
                     feed.get('catchup', '') and ' (simulated)')
        if feed.get('catchup'):
            return True
        # fetch the URL in memory
        result = feed.session.get(item.get('link'))
        if result.status_code != requests.codes.ok:
            logging.warning('failed to fetch link %s: %s',
                            item.get('link'), result.status_code)
            # make sure we retry next time
            return False
        # open the file and write the response body
        with open(path, 'w') as archive:
            archive.write(result.text)
        return True
    else:
        logging.info('no link for feed item %s, not archiving',
                     item.get('title'))
        # still consider the item processed, as there's nothing to archive
        return True
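
The plugin can also be exercised outside of feed2exec's normal dispatch loop. The following is a minimal sketch, not part of the module above: ``FakeFeed`` and the ``/tmp/archive-demo`` directory are hypothetical stand-ins, and the only assumption made is what the code above already requires, namely that the feed object is dict-like and carries a requests session:

    import requests

    from feed2exec.plugins.archive import output


    class FakeFeed(dict):
        """Hypothetical stand-in for the feed object expected above:
        dict-like settings plus a requests session attribute."""
        session = requests.Session()


    feed = FakeFeed(catchup=False)
    item = {'title': 'NASA breaking news',
            'link': 'https://www.nasa.gov/rss/dyn/breaking_news.rss'}

    # archive into a scratch directory instead of DEFAULT_ARCHIVE_DIR
    if output('/tmp/archive-demo', feed=feed, item=item):
        print('item archived under /tmp/archive-demo')
    else:
        print('fetch failed, item will be retried')

Passing ``catchup=True`` in the feed settings would make the call return early without fetching anything, which is how the plugin simulates a run.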