Source code for exporters.writers.hubstorage_writer

import six
from exporters.writers.base_writer import BaseWriter


class HubstorageWriter(BaseWriter):
    """
    This writer sends items into Scrapinghub Hubstorage collection.

        - apikey (str)
            API key with access to the project where the items are being generated.
            The option declares ``EXPORTERS_HS_APIKEY`` as its ``env_fallback``.

        - project_id (str)
            Id of the project.

        - collection_name (str)
            Name of the collection of items.

        - key_field (str)
            Record field which should be used as Hubstorage item key.
            Defaults to ``_key``.
    """

    # List of options to set up the writer
    supported_options = {
        "project_id": {
            'type': six.string_types,
            'help': 'Id of the project'
        },
        "collection_name": {
            'type': six.string_types,
            'help': 'Name of the collection of items'
        },
        'key_field': {
            'type': six.string_types,
            'default': '_key',
            'help': 'Record field which should be used as Hubstorage item key'
        },
        'apikey': {
            'type': six.string_types,
            'help': 'Hubstorage API key',
            'env_fallback': 'EXPORTERS_HS_APIKEY'
        }
    }

    def __init__(self, *args, **kwargs):
        """
        Read the writer options and open a writer on the target collection.

        All positional and keyword arguments are forwarded unchanged to the
        base class constructor.
        """
        super(HubstorageWriter, self).__init__(*args, **kwargs)
        self.project_id = self.read_option('project_id')
        self.collection_name = self.read_option('collection_name')
        self.key_field = self.read_option('key_field')
        # _get_collection() reads self.project_id and self.collection_name,
        # so it must run after the options above have been loaded.
        self.collection = self._get_collection()
        self.collection_writer = self.collection.create_writer()
        self.logger.info('Will write items into project {}, '
                         'collection {}'.format(self.project_id, self.collection_name))

    def write_batch(self, batch):
        """
        Write every item of ``batch`` to the collection writer.

        Each item is copied into a plain dict whose ``_key`` entry is set to
        the value found under ``self.key_field``; the written-items counter is
        incremented and the items limit is checked after every single item.

        An item that does not contain ``self.key_field`` makes the
        ``item[self.key_field]`` lookup fail (``KeyError`` for dict-like items).
        """
        for item in batch:
            item_key = item[self.key_field]
            # dict(item, _key=...) builds a copy, so the incoming item is not mutated.
            self.collection_writer.write(dict(item, _key=item_key))
            self.increment_written_items()
            self._check_items_limit()

    def flush(self):
        """Flush the underlying collection writer."""
        self.collection_writer.flush()

    def _get_collection(self):
        """
        Return the collection store named ``self.collection_name`` from the
        project ``self.project_id``, using a client built with the ``apikey``
        option.
        """
        # Imported inside the method rather than at module level -- presumably
        # so hubstorage is only required when this writer is actually used.
        import hubstorage
        client = hubstorage.HubstorageClient(self.read_option('apikey'))
        project = client.get_project(self.project_id)
        return project.collections.new_store(self.collection_name)