swift/swift/common/middleware/dlo.py

# Copyright (c) 2013 OpenStack Foundation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
from ConfigParser import ConfigParser, NoSectionError, NoOptionError
from hashlib import md5
from swift.common import constraints
from swift.common.exceptions import ListingIterError, SegmentError
from swift.common.http import is_success
from swift.common.swob import Request, Response, \
    HTTPRequestedRangeNotSatisfiable, HTTPBadRequest, HTTPConflict
from swift.common.utils import get_logger, json, \
    RateLimitedIterator, read_conf_dir, quote, close_if_possible, \
    closing_if_possible
from swift.common.request_helpers import SegmentedIterable
from swift.common.wsgi import WSGIContext, make_subrequest
from urllib import unquote


class GetContext(WSGIContext):
    def __init__(self, dlo, logger):
        """
        :param dlo: the DynamicLargeObject middleware instance; its ``app``
                    is handed to the WSGIContext initializer and its
                    rate-limit / timeout settings are read when building
                    GET responses
        :param logger: logger kept on the context and passed on to the
                       SegmentedIterable built for GET responses
        """
        super(GetContext, self).__init__(dlo.app)
        self.dlo = dlo
        self.logger = logger

    def _get_container_listing(self, req, version, account, container,
                               prefix, marker=''):
        """
        Fetch one page of the JSON listing of ``container``, restricted to
        object names starting with ``prefix``.

        :param req: the client request; its environ and x-auth-token header
                    are reused for the listing subrequest
        :param version: API version path component
        :param account: account path component
        :param container: container holding the segments
        :param prefix: object-name prefix to list
        :param marker: if non-empty, sent as the listing ``marker``
        :returns: ``(error_response, None)`` when the listing request was
                  not successful, otherwise ``(None, parsed_listing)``
        """
        # NOTE(review): prefix and marker are quoted for the query string,
        # but the container goes into the path as-is -- confirm that this
        # matches what make_subrequest expects for ``path``.
        listing_path = '/' + '/'.join((version, account, container))
        listing_req = make_subrequest(
            req.environ, path=listing_path, method='GET',
            headers={'x-auth-token': req.headers.get('x-auth-token')},
            agent=('%(orig)s ' + 'DLO MultipartGET'), swift_source='DLO')

        query_parts = ['format=json', 'prefix=%s' % quote(prefix)]
        if marker:
            query_parts.append('marker=%s' % quote(marker))
        listing_req.query_string = '&'.join(query_parts)

        listing_resp = listing_req.get_response(self.dlo.app)
        if is_success(listing_resp.status_int):
            # make sure the body iterator gets closed once it is consumed
            with closing_if_possible(listing_resp.app_iter):
                return None, json.loads(''.join(listing_resp.app_iter))
        return listing_resp, None

    def _segment_listing_iterator(self, req, version, account, container,
                                  prefix, segments, first_byte=None,
                                  last_byte=None):
        """
        Generate the segments, with per-segment byte ranges, that are
        needed to serve the requested part of a dynamic large object.

        The first listing page is supplied by the caller (it already had to
        fetch that page to work out the etag and content length, so
        re-requesting it here would be wasteful); any further pages are
        fetched on demand.

        :param req: the client request, used for follow-up listings
        :param version: API version path component
        :param account: account path component
        :param container: container holding the segments
        :param prefix: object-name prefix shared by the segments
        :param segments: first page of the container listing
        :param first_byte: offset of the first wanted byte within the whole
                           object; None means start at byte 0
        :param last_byte: offset of the last wanted byte within the whole
                          object; None means no upper bound
        :returns: generator of
                  ``(obj path, None, None, first byte, last byte)`` tuples;
                  first/last byte are None when the segment is wanted from
                  its start / through its end
        :raises ListingIterError: if a follow-up container listing fails
        """
        if first_byte is None:
            first_byte = 0
        if last_byte is None:
            last_byte = float("inf")

        while True:
            for seg in segments:
                seg_size = int(seg['bytes'])

                # A segment lying entirely before the range contributes
                # nothing; it only shifts the offsets below.
                if first_byte < seg_size:
                    if last_byte < 0:
                        # the range ended before this segment, so neither
                        # it nor any later segment is needed
                        break

                    name = seg['name']
                    if isinstance(name, unicode):
                        name = name.encode("utf-8")
                    seg_path = "/" + "/".join(
                        (version, account, container, name))

                    range_start = first_byte if first_byte > 0 else None
                    range_end = (last_byte if last_byte < seg_size - 1
                                 else None)

                    # Etag and size are deliberately left as None:
                    # SegmentedIterable would verify them if given, but a
                    # DLO only requires that object names match the prefix.
                    yield (seg_path, None, None, range_start, range_end)

                # re-express the offsets relative to the next segment
                first_byte = max(first_byte - seg_size, -1)
                last_byte = max(last_byte - seg_size, -1)

            # A short page means the listing is exhausted; a negative
            # last_byte means the requested range has been fully covered.
            if (len(segments) < constraints.CONTAINER_LISTING_LIMIT or
                    last_byte < 0):
                break

            last_name = segments[-1]['name']
            error_response, segments = self._get_container_listing(
                req, version, account, container, prefix, last_name)
            if error_response:
                # The response body is already being sent to the client, so
                # the only option left is to raise and let the WSGI server
                # close the connection early.
                close_if_possible(error_response.app_iter)
                raise ListingIterError(
                    "Got status %d listing container /%s/%s" %
                    (error_response.status_int, account, container))

    def get_or_head_response(self, req, x_object_manifest,
                             response_headers=None):
        """
        Build the response for a GET or HEAD of a dynamic large object.

        Only the first page of the segment listing is fetched here; it is
        used to compute Content-Length and Etag when it is the complete
        listing, and to decide whether a single-range request can be
        honored.

        :param req: the client request; note that ``req.acl`` is reset and
                    ``req.range`` may be cleared by this method
        :param x_object_manifest: X-Object-Manifest value in the form
                                  ``container/prefix``; both parts are
                                  URL-unquoted before use
        :param response_headers: iterable of (name, value) header pairs
                                 from the manifest response; defaults to
                                 ``self._response_headers``
        :returns: a Response for the large object, or the failed listing
                  response, or HTTPRequestedRangeNotSatisfiable, or
                  HTTPConflict when validation of the first segment fails
        :raises ValueError: if ``x_object_manifest`` contains no '/'
        """
        if response_headers is None:
            response_headers = self._response_headers

        container, obj_prefix = x_object_manifest.split('/', 1)
        container = unquote(container)
        obj_prefix = unquote(obj_prefix)

        # manifest might point to a different container
        req.acl = None
        version, account, _junk = req.split_path(2, 3, True)
        error_response, segments = self._get_container_listing(
            req, version, account, container, obj_prefix)
        if error_response:
            return error_response
        # A page shorter than the listing limit cannot have a successor, so
        # in that case ``segments`` is the whole object.
        have_complete_listing = len(segments) < \
            constraints.CONTAINER_LISTING_LIMIT

        first_byte = last_byte = None
        actual_content_length = None
        content_length_for_swob_range = None
        # Only single-range requests get special handling here.
        if req.range and len(req.range.ranges) == 1:
            content_length_for_swob_range = sum(o['bytes'] for o in segments)

            # This is a hack to handle suffix byte ranges (e.g. "bytes=-5"),
            # which we can't honor unless we have a complete listing.
            _junk, range_end = req.range.ranges_for_length(float("inf"))[0]

            # If this is all the segments, we know whether or not this
            # range request is satisfiable.
            #
            # Alternately, we may not have all the segments, but this range
            # falls entirely within the first page's segments, so we know
            # that it is satisfiable.
            if (have_complete_listing
               or range_end < content_length_for_swob_range):
                byteranges = req.range.ranges_for_length(
                    content_length_for_swob_range)
                if not byteranges:
                    return HTTPRequestedRangeNotSatisfiable(request=req)
                first_byte, last_byte = byteranges[0]
                # For some reason, swob.Range.ranges_for_length adds 1 to the
                # last byte's position.
                last_byte -= 1
                actual_content_length = last_byte - first_byte + 1
            else:
                # The range may or may not be satisfiable, but we can't tell
                # based on just one page of listing, and we're not going to go
                # get more pages because that would use up too many resources,
                # so we ignore the Range header and return the whole object.
                actual_content_length = None
                content_length_for_swob_range = None
                req.range = None

        # The manifest object's own length/range headers do not describe the
        # large object, so drop them before adding our own.
        response_headers = [
            (h, v) for h, v in response_headers
            if h.lower() not in ("content-length", "content-range")]

        if content_length_for_swob_range is not None:
            # Here, we have to give swob a big-enough content length so that
            # it can compute the actual content length based on the Range
            # header. This value will not be visible to the client; swob will
            # substitute its own Content-Length.
            #
            # Note: if the manifest points to at least CONTAINER_LISTING_LIMIT
            # segments, this may be less than the sum of all the segments'
            # sizes. However, it'll still be greater than the last byte in the
            # Range header, so it's good enough for swob.
            response_headers.append(('Content-Length',
                                     str(content_length_for_swob_range)))
        elif have_complete_listing:
            actual_content_length = sum(o['bytes'] for o in segments)
            response_headers.append(('Content-Length',
                                     str(actual_content_length)))

        if have_complete_listing:
            # Replace the manifest's Etag with the MD5 of the concatenated
            # segment hashes (surrounding double quotes stripped). With an
            # incomplete listing the original Etag header is left alone.
            response_headers = [(h, v) for h, v in response_headers
                                if h.lower() != "etag"]
            etag = md5()
            for seg_dict in segments:
                etag.update(seg_dict['hash'].strip('"'))
            response_headers.append(('Etag', '"%s"' % etag.hexdigest()))

        # HEAD responses carry no body, so app_iter stays None for them.
        app_iter = None
        if req.method == 'GET':
            listing_iter = RateLimitedIterator(
                self._segment_listing_iterator(
                    req, version, account, container, obj_prefix, segments,
                    first_byte=first_byte, last_byte=last_byte),
                self.dlo.rate_limit_segments_per_sec,
                limit_after=self.dlo.rate_limit_after_segment)

            app_iter = SegmentedIterable(
                req, self.dlo.app, listing_iter, ua_suffix="DLO MultipartGET",
                swift_source="DLO", name=req.path, logger=self.logger,
                max_get_time=self.dlo.max_get_time,
                response_body_length=actual_content_length)

            # Check the first segment before the response is returned, so a
            # failure can still be reported as a 409.
            try:
                app_iter.validate_first_segment()
            except (SegmentError, ListingIterError):
                return HTTPConflict(request=req)

        resp = Response(request=req, headers=response_headers,
                        conditional_response=True,
                        app_iter=app_iter)

        return resp

    def handle_request(self, req, start_response):
        """
        Serve a GET or HEAD request.

        The request is first sent down the pipeline. If the response carries
        an X-Object-Manifest header, the object is a dynamic large object
        manifest and the response is replaced by one built from its
        segments; any other response is relayed unchanged.

        :param req: the client request
        :param start_response: WSGI start_response callable
        :returns: the WSGI response iterable
        """
        resp_iter = self._app_call(req.environ)

        for name, manifest in self._response_headers:
            if name.lower() != 'x-object-manifest':
                continue
            # It is a manifest: the manifest object's own body is not
            # wanted, so release it before building the real response.
            close_if_possible(resp_iter)
            dlo_resp = self.get_or_head_response(req, manifest)
            return dlo_resp(req.environ, start_response)

        # No manifest header found; hand the original response back as-is.
        start_response(self._response_status,
                       self._response_headers,
                       self._response_exc_info)
        return resp_iter


class DynamicLargeObject(object):
    def __init__(self, app, conf):
        """
        :param app: the next WSGI application in the pipeline
        :param conf: middleware configuration dict; may be filled in with
                     settings found in the proxy server's config section
        """
        self.app = app
        self.logger = get_logger(conf, log_route='dlo')

        # DLO used to be implemented inside the proxy server rather than as
        # middleware. To make cluster upgrades painless, fall back to any
        # values still configured in the proxy's own section.
        self._populate_config_from_old_location(conf)

        def int_setting(name, default):
            return int(conf.get(name, default))

        self.max_get_time = int_setting('max_get_time', '86400')
        self.rate_limit_after_segment = int_setting(
            'rate_limit_after_segment', '10')
        self.rate_limit_segments_per_sec = int_setting(
            'rate_limit_segments_per_sec', '1')

    def _populate_config_from_old_location(self, conf):
        if ('rate_limit_after_segment' in conf or
                'rate_limit_segments_per_sec' in conf or
                'max_get_time' in conf or
                '__file__' not in conf):
            return

        cp = ConfigParser()
        if os.path.isdir(conf['__file__']):
            read_conf_dir(cp, conf['__file__'])
        else:
            cp.read(conf['__file__'])

        try:
            pipe = cp.get("pipeline:main", "pipeline")
        except (NoSectionError, NoOptionError):
            return

        proxy_name = pipe.rsplit(None, 1)[-1]
        proxy_section = "app:" + proxy_name
        for setting in ('rate_limit_after_segment',
                        'rate_limit_segments_per_sec',
                        'max_get_time'):
            try:
                conf[setting] = cp.get(proxy_section, setting)
            except (NoSectionError, NoOptionError):
                pass

    def __call__(self, env, start_response):
        """
        WSGI entry point.

        Requests whose path is not a full /version/account/container/object
        path are passed straight through. For object requests the DLO copy
        hook is installed, GET/HEAD requests are served through a
        GetContext (unless ``multipart-manifest=get`` was given), and PUT
        requests have their X-Object-Manifest header validated.
        """
        req = Request(env)
        try:
            _vrs, _account, _container, _obj = req.split_path(4, 4, True)
        except ValueError:
            # not an object request; nothing for DLO to do
            return self.app(env, start_response)

        def passthrough_hook(src_req, src_resp, sink_req):
            return src_resp

        # Chain our COPY callback in front of whatever hook is already
        # installed (or a no-op hook if there is none).
        previous_hook = env.get('swift.copy_hook', passthrough_hook)
        env['swift.copy_hook'] = self.copy_hook(previous_hook)

        if req.method in ('GET', 'HEAD'):
            if req.params.get('multipart-manifest') != 'get':
                context = GetContext(self, self.logger)
                return context.handle_request(req, start_response)
        elif req.method == 'PUT':
            error_response = self.validate_x_object_manifest_header(
                req, start_response)
            if error_response:
                return error_response(env, start_response)

        return self.app(env, start_response)

    def validate_x_object_manifest_header(self, req, start_response):
        """
        Make sure that X-Object-Manifest is valid if present.
        """
        if 'X-Object-Manifest' in req.headers:
            value = req.headers['X-Object-Manifest']
            container = prefix = None
            try:
                container, prefix = value.split('/', 1)
            except ValueError:
                pass
            if not container or not prefix or '?' in value or '&' in value or \
                    prefix[0] == '/':
                return HTTPBadRequest(
                    request=req,
                    body=('X-Object-Manifest must be in the '
                          'format container/prefix'))

    def copy_hook(self, inner_hook):

        def dlo_copy_hook(source_req, source_resp, sink_req):
            x_o_m = source_resp.headers.get('X-Object-Manifest')
            if x_o_m:
                if source_req.params.get('multipart-manifest') == 'get':
                    # To copy the manifest, we let the copy proceed as normal,
                    # but ensure that X-Object-Manifest is set on the new
                    # object.
                    sink_req.headers['X-Object-Manifest'] = x_o_m
                else:
                    ctx = GetContext(self, self.logger)
                    source_resp = ctx.get_or_head_response(
                        source_req, x_o_m, source_resp.headers.items())
            return inner_hook(source_req, source_resp, sink_req)

        return dlo_copy_hook


def filter_factory(global_conf, **local_conf):
    """
    Paste filter factory for the DLO middleware.

    :param global_conf: global configuration dict; it is copied, not
                        modified
    :param local_conf: filter-specific settings, which take precedence
                       over entries of the same name in ``global_conf``
    :returns: a callable that wraps a WSGI app in DynamicLargeObject
    """
    merged_conf = global_conf.copy()
    merged_conf.update(local_conf)

    def dlo_filter(app):
        """Wrap ``app`` with the DLO middleware using the merged config."""
        return DynamicLargeObject(app, merged_conf)

    return dlo_filter