diff --git a/include/boost/compute/distributed/command_queue.hpp b/include/boost/compute/distributed/command_queue.hpp
new file mode 100644
index 000000000..77acb9191
--- /dev/null
+++ b/include/boost/compute/distributed/command_queue.hpp
@@ -0,0 +1,256 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2016 Jakub Szuppe <j.szuppe@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#ifndef BOOST_COMPUTE_DISTRIBUTED_COMMAND_QUEUE_HPP
+#define BOOST_COMPUTE_DISTRIBUTED_COMMAND_QUEUE_HPP
+
+#include <vector>
+
+#include <boost/compute/context.hpp>
+#include <boost/compute/command_queue.hpp>
+#include <boost/compute/distributed/context.hpp>
+
+namespace boost {
+namespace compute {
+namespace distributed {
+
+class command_queue
+{
+public:
+    enum properties {
+        enable_profiling = CL_QUEUE_PROFILING_ENABLE,
+        enable_out_of_order_execution = CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE
+    };
+
+    enum map_flags {
+        map_read = CL_MAP_READ,
+        map_write = CL_MAP_WRITE
+        #ifdef CL_VERSION_1_2
+        ,
+        map_write_invalidate_region = CL_MAP_WRITE_INVALIDATE_REGION
+        #endif
+    };
+
+    /// Creates a null distributed command queue.
+    command_queue()
+        : m_context(),
+          m_queues()
+    {
+
+    }
+
+    /// Creates a distributed command queue for all devices in \p context with
+    /// \p properties.
+    explicit
+    command_queue(const ::boost::compute::distributed::context &context,
+                  cl_command_queue_properties properties = 0)
+        : m_context(context)
+    {
+        size_t n = m_context.size();
+        for(size_t i = 0; i < n; i++) {
+            ::boost::compute::context c = m_context.get(i);
+            std::vector<device> devices = c.get_devices();
+            for(size_t j = 0; j < devices.size(); j++) {
+                m_queues.push_back(
+                    ::boost::compute::command_queue(c, devices[j], properties)
+                );
+            }
+        }
+    }
+
+    /// Creates a distributed command queue containing command queues for each
+    /// corresponding device and context from \p devices and \p contexts.
+    command_queue(const std::vector< ::boost::compute::context> &contexts,
+                  const std::vector< std::vector<device> > &devices,
+                  cl_command_queue_properties properties = 0)
+    {
+        m_context = context(contexts);
+        for(size_t i = 0; i < m_context.size(); i++) {
+            for(size_t j = 0; j < devices[i].size(); j++) {
+                m_queues.push_back(
+                    ::boost::compute::command_queue(
+                        m_context.get(i), devices[i][j], properties
+                    )
+                );
+            }
+        }
+    }
+
+    /// Creates a distributed command queue for all devices in \p context.
+    command_queue(const ::boost::compute::context &context,
+                  cl_command_queue_properties properties = 0)
+    {
+        m_context = ::boost::compute::distributed::context(context);
+        std::vector<device> devices = context.get_devices();
+        for(size_t i = 0; i < devices.size(); i++) {
+            m_queues.push_back(
+                ::boost::compute::command_queue(
+                    context, devices[i], properties
+                )
+            );
+        }
+    }
+
+    /// Creates a distributed command queue containing \p queues.
+    explicit
+    command_queue(const std::vector< ::boost::compute::command_queue> queues)
+        : m_queues(queues)
+    {
+        std::vector< ::boost::compute::context> contexts;
+        for(size_t i = 0; i < m_queues.size(); i++) {
+            contexts.push_back(
+                m_queues[i].get_context()
+            );
+        }
+        m_context = context(contexts);
+    }
+
+    /// Creates a new command queue object as a copy of \p other.
+    command_queue(const command_queue &other)
+        : m_context(other.m_context),
+          m_queues(other.m_queues)
+    {
+
+    }
+
+    /// Copies the command queue object from \p other to \c *this.
+    command_queue& operator=(const command_queue &other)
+    {
+        if(this != &other){
+            m_context = other.m_context;
+            m_queues = other.m_queues;
+        }
+        return *this;
+    }
+
+    #ifndef BOOST_COMPUTE_NO_RVALUE_REFERENCES
+    /// Move-constructs a new command queue object from \p other.
+    command_queue(command_queue&& other) BOOST_NOEXCEPT
+        : m_context(std::move(other.m_context)),
+          m_queues(std::move(other.m_queues))
+    {
+
+    }
+
+    /// Move-assigns the command queue from \p other to \c *this.
+    command_queue& operator=(command_queue&& other) BOOST_NOEXCEPT
+    {
+        m_context = std::move(other.m_context);
+        m_queues = std::move(other.m_queues);
+        return *this;
+    }
+    #endif // BOOST_COMPUTE_NO_RVALUE_REFERENCES
+
+    /// Returns the number of individual command queues in this
+    /// distributed command queue.
+    size_t size() const
+    {
+        return m_queues.size();
+    }
+
+    /// Returns the nth command queue.
+    ::boost::compute::command_queue& get(size_t n)
+    {
+        return m_queues[n];
+    }
+
+    /// Returns the nth command queue.
+    const ::boost::compute::command_queue& get(size_t n) const
+    {
+        return m_queues[n];
+    }
+
+    /// Returns the distributed context used for creating this distributed
+    /// command queue.
+    const context& get_context() const
+    {
+        return m_context;
+    }
+
+    /// Returns the context of the nth command queue from distributed
+    /// command queue.
+    ::boost::compute::context get_context(size_t n) const
+    {
+        return m_queues[n].get_context();
+    }
+
+    /// Returns true if all device command queues are in the same OpenCL
+    /// context.
+    bool one_context() const
+    {
+        return m_context.one_context();
+    }
+
+    /// Returns nth context from command queue's distributed context.
+    ::boost::compute::device get_device(size_t n) const
+    {
+        return m_queues[n].get_device();
+    }
+
+    /// Returns \c true if the command queue is the same as \p other.
+    bool operator==(const command_queue &other) const
+    {
+        return (m_context == other.m_context) && (m_queues == other.m_queues);
+    }
+
+    /// Returns \c true if the command queue is different from \p other.
+    bool operator!=(const command_queue &other) const
+    {
+        return (m_context != other.m_context) || (m_queues != other.m_queues);
+    }
+
+    /// Returns information about nth command queue.
+    template<class T>
+    T get_info(size_t n, cl_command_queue_info info) const
+    {
+        return m_queues[n].get_info<T>(info);
+    }
+
+    /// Flushes the command queue.
+    void flush()
+    {
+        for(size_t i = 0; i < m_queues.size(); i++)
+        {
+            m_queues[i].flush();
+        }
+    }
+
+    /// Blocks until all outstanding commands in the queue have finished.
+    void finish()
+    {
+        for(size_t i = 0; i < m_queues.size(); i++)
+        {
+            m_queues[i].finish();
+        }
+    }
+
+    /// \internal_
+    /// Return true if every device supports at least OpenCL major.minor
+    bool check_devices_version(int major, int minor) const
+    {
+        bool check = true;
+        for(size_t i = 0; i < m_queues.size(); i++)
+        {
+            check = check && m_queues[i].get_device().check_version(major, minor);
+        }
+        return check;
+    }
+
+private:
+    ::boost::compute::distributed::context m_context;
+    std::vector< ::boost::compute::command_queue> m_queues;
+};
+
+
+} // end distributed namespace
+} // end compute namespace
+} // end boost namespace
+
+#endif /* BOOST_COMPUTE_DISTRIBUTED_COMMAND_QUEUE_HPP */
diff --git a/include/boost/compute/distributed/context.hpp b/include/boost/compute/distributed/context.hpp
new file mode 100644
index 000000000..41f5a6e80
--- /dev/null
+++ b/include/boost/compute/distributed/context.hpp
@@ -0,0 +1,226 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2016 Jakub Szuppe <j.szuppe@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#ifndef BOOST_COMPUTE_DISTRIBUTED_CONTEXT_HPP
+#define BOOST_COMPUTE_DISTRIBUTED_CONTEXT_HPP
+
+#include <vector>
+
+#include <boost/compute/context.hpp>
+#include <boost/compute/device.hpp>
+
+namespace boost {
+namespace compute {
+namespace distributed {
+
+class context
+{
+public:
+    /// Create a null context object.
+    context()
+        : m_contexts()
+    {
+    }
+
+    /// Creates a new distributed context for \p devices, containing
+    /// boost::compute::context objects, each constructed using corresponding
+    /// vector of OpenCL devices and \p properties.
+    context(const std::vector< std::vector< ::boost::compute::device> > &devices,
+            const std::vector<cl_context_properties*> &properties)
+    {
+        m_contexts = std::vector< ::boost::compute::context>();
+        for(size_t i = 0; i < devices.size(); i++) {
+            m_contexts.push_back(
+                ::boost::compute::context(devices[i], properties[i])
+            );
+        }
+    }
+
+    /// Creates a new distributed context for \p devices, containing
+    /// boost::compute::context objects, each constructed using corresponding
+    /// vector of OpenCL devices and default properties.
+    explicit context(const std::vector< std::vector< ::boost::compute::device> > &devices)
+    {
+        m_contexts = std::vector< ::boost::compute::context>();
+        for(size_t i = 0; i < devices.size(); i++) {
+            m_contexts.push_back(
+                ::boost::compute::context(devices[i])
+            );
+        }
+    }
+
+    /// Creates a new distributed context for \p devices with \p properties.
+    context(const std::vector< ::boost::compute::device> &devices,
+            const std::vector<cl_context_properties*> &properties)
+    {
+        m_contexts = std::vector< ::boost::compute::context>();
+        for(size_t i = 0; i < devices.size(); i++) {
+            m_contexts.push_back(
+                ::boost::compute::context(devices[i], properties[i])
+            );
+        }
+    }
+
+    /// Creates a new distributed context for \p devices
+    explicit context(const std::vector< ::boost::compute::device> &devices)
+    {
+        m_contexts = std::vector< ::boost::compute::context>();
+        for(size_t i = 0; i < devices.size(); i++) {
+            m_contexts.push_back(
+                ::boost::compute::context(devices[i])
+            );
+        }
+    }
+
+    /// Creates a new distributed context using \p contexts.
+    explicit context(const std::vector< ::boost::compute::context>& contexts)
+        : m_contexts(contexts)
+    {
+
+    }
+
+    /// Creates a new distributed context using contexts from range
+    /// [\p first, \p last).
+    template <class Iterator>
+    explicit context(Iterator first, Iterator last)
+        : m_contexts(first, last)
+    {
+
+    }
+
+    /// Creates a new distributed context from one \p context.
+    explicit context(const ::boost::compute::context& context)
+        : m_contexts(1, context)
+    {
+
+    }
+
+    /// Creates a new context object as a copy of \p other.
+    context(const context &other)
+        : m_contexts(other.m_contexts)
+    {
+
+    }
+
+    /// Copies the context object from \p other to \c *this.
+    context& operator=(const context &other)
+    {
+        if(this != &other){
+            m_contexts =
+                std::vector< ::boost::compute::context>(other.m_contexts);
+        }
+        return *this;
+    }
+
+    #ifndef BOOST_COMPUTE_NO_RVALUE_REFERENCES
+    /// Move-constructs a new context object from \p other.
+    context(context&& other) BOOST_NOEXCEPT
+        : m_contexts(std::move(other.m_contexts))
+    {
+
+    }
+
+    /// Move-assigns the context from \p other to \c *this.
+    context& operator=(context&& other) BOOST_NOEXCEPT
+    {
+        m_contexts = std::move(other.m_contexts);
+        return *this;
+    }
+    #endif // BOOST_COMPUTE_NO_RVALUE_REFERENCES
+
+    /// Returns number of individual contexts in distributed context.
+    size_t size() const
+    {
+        return m_contexts.size();
+    }
+
+    /// Returns nth context.
+    const ::boost::compute::context& get(size_t n) const
+    {
+        return m_contexts[n];
+    }
+
+    /// Returns true if all contexts are in fact the same OpenCL context.
+    bool one_context() const
+    {
+        bool one_context = m_contexts[0].get() != 0;
+        for(size_t i = 1; i < m_contexts.size(); i++) {
+            one_context = (one_context && (m_contexts[0] == m_contexts[i]));
+        }
+        return one_context;
+    }
+
+    /// Returns information about nth context.
+    template<class T>
+    T get_info(size_t n, cl_context_info info) const
+    {
+        return m_contexts[n].get_info<T>(info);
+    }
+
+    /// Returns the device for the nth context in distributed context.
+    std::vector<device> get_devices(size_t n) const
+    {
+        return m_contexts[n].get_info<std::vector<device> >(CL_CONTEXT_DEVICES);
+    }
+
+    /// Returns a vector of devices for the distributed context.
+    std::vector<std::vector<device> > get_devices() const
+    {
+        std::vector<std::vector<device> > devices;
+        for(size_t i = 0; i < m_contexts.size(); i++) {
+            devices.push_back(
+                m_contexts[i].get_info<std::vector<device> >(CL_CONTEXT_DEVICES)
+            );
+        }
+        return devices;
+    }
+
+    /// Returns \c true if the context is the same as \p other.
+    bool operator==(const context &other) const
+    {
+        return m_contexts == other.m_contexts;
+    }
+
+    /// Returns \c true if the context is different from \p other.
+    bool operator!=(const context &other) const
+    {
+        return m_contexts != other.m_contexts;
+    }
+
+private:
+    std::vector< ::boost::compute::context> m_contexts;
+};
+
+
+inline bool operator==(const ::boost::compute::context &lhs, const context& rhs)
+{
+    return (rhs.size() == 1) && (rhs.get(0) == lhs);
+}
+
+inline bool operator==(const context& lhs, const ::boost::compute::context &rhs)
+{
+    return (lhs.size() == 1) && (lhs.get(0) == rhs);
+}
+
+inline bool operator!=(const ::boost::compute::context &lhs, const context& rhs)
+{
+    return !(lhs == rhs);
+}
+
+inline bool operator!=(const context& lhs, const ::boost::compute::context &rhs)
+{
+    return !(lhs == rhs);
+}
+
+} // end distributed namespace
+} // end compute namespace
+} // end boost namespace
+
+#endif /* BOOST_COMPUTE_DISTRIBUTED_CONTEXT_HPP */
diff --git a/include/boost/compute/distributed/copy.hpp b/include/boost/compute/distributed/copy.hpp
new file mode 100644
index 000000000..3b12193cb
--- /dev/null
+++ b/include/boost/compute/distributed/copy.hpp
@@ -0,0 +1,525 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2016 Jakub Szuppe <j.szuppe@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#ifndef BOOST_COMPUTE_DISTRIBUTED_COPY_HPP
+#define BOOST_COMPUTE_DISTRIBUTED_COPY_HPP
+
+#include <algorithm>
+#include <iterator>
+
+#include <boost/utility/enable_if.hpp>
+
+#include <boost/mpl/and.hpp>
+#include <boost/mpl/not.hpp>
+#include <boost/mpl/or.hpp>
+
+#include <boost/compute/algorithm/copy.hpp>
+#include <boost/compute/wait_list.hpp>
+
+#include <boost/compute/distributed/context.hpp>
+#include <boost/compute/distributed/command_queue.hpp>
+#include <boost/compute/distributed/detail/weight_func.hpp>
+#include <boost/compute/distributed/detail/is_distributed_vector.hpp>
+
+namespace boost {
+namespace compute {
+namespace distributed {
+
+// forward declaration for distributed::vector<T, weight_func, Alloc>
+template<
+    class T,
+    weight_func weight,
+    class Alloc
+>
+class vector;
+
+// host -> distributed::vector
+/// Copies the values in the range [\p first, \p last) allocated on host to
+/// distributed::vector \p result. The copy is performed asynchronously.
+template <
+    class InputIterator,
+    class T, weight_func weight, class Alloc
+>
+inline std::vector<event>
+copy_async(InputIterator first,
+           InputIterator last,
+           vector<T, weight, Alloc> &result,
+           command_queue &queue,
+           typename boost::enable_if_c<
+               !is_device_iterator<InputIterator>::value
+           >::type* = 0)
+{
+    typedef typename
+        std::iterator_traits<InputIterator>::difference_type diff_type;
+
+    std::vector<event> events;
+    events.reserve(result.parts());
+
+    InputIterator part_first = first;
+    InputIterator part_end = first;
+    for(size_t i = 0; i < result.parts(); i++)
+    {
+        part_end = (std::min)(
+            part_end + static_cast<diff_type>(result.part_size(i)),
+            last
+        );
+        event e =
+            ::boost::compute::copy_async(
+                part_first,
+                part_end,
+                result.begin(i),
+                queue.get(i)
+            ).get_event();
+        if(e.get()) {
+            events.push_back(e);
+        }
+        part_first = part_end;
+    }
+    return events;
+}
+
+// host -> distributed::vector
+/// Copies the values in the range [\p first, \p last) allocated on host to
+/// distributed::vector \p result.
+template <
+    class InputIterator,
+    class T, weight_func weight, class Alloc
+>
+inline void
+copy(InputIterator first,
+     InputIterator last,
+     vector<T, weight, Alloc> &result,
+     command_queue &queue,
+     typename boost::enable_if_c<
+         !is_device_iterator<InputIterator>::value
+     >::type* = 0)
+{
+    std::vector<event> events =
+        ::boost::compute::distributed::copy_async(first, last, result, queue);
+    for(size_t i = 0; i < events.size(); i++) {
+        events[i].wait();
+    }
+}
+
+// distributed::vector -> host
+/// Copy all values from distributed::vector \p input to the range beginning at
+/// \p result allocated on the host.
+template <
+    class T, weight_func weight, class Alloc,
+    class OutputIterator
+>
+inline std::vector<event>
+copy_async(const vector<T, weight, Alloc> &input,
+           OutputIterator result,
+           command_queue &queue,
+           typename boost::enable_if<
+               mpl::and_<
+                   mpl::not_<
+                       is_device_iterator<OutputIterator>
+                   >,
+                   mpl::not_<
+                       detail::is_distributed_vector<OutputIterator>
+                   >
+               >
+           >::type* = 0)
+{
+    typedef typename
+        std::iterator_traits<OutputIterator>::difference_type diff_type;
+
+    std::vector<event> events;
+    events.reserve(input.parts());
+
+    OutputIterator part_result = result;
+    for(size_t i = 0; i < input.parts(); i++)
+    {
+        event e =
+            ::boost::compute::copy_async(
+                input.begin(i),
+                input.end(i),
+                part_result,
+                queue.get(i)
+            ).get_event();
+        if(e.get()) {
+            events.push_back(e);
+        }
+        part_result += static_cast<diff_type>(input.part_size(i));
+    }
+    return events;
+}
+
+// distributed::vector -> host
+/// Copy all values from distributed::vector \p input to the range beginning at
+/// \p result allocated on the host.
+template <
+    class T, weight_func weight, class Alloc,
+    class OutputIterator
+>
+inline void
+copy(const vector<T, weight, Alloc> &input,
+     OutputIterator result,
+     command_queue &queue,
+     typename boost::enable_if<
+         mpl::and_<
+             mpl::not_<
+                 is_device_iterator<OutputIterator>
+             >,
+             mpl::not_<
+                 detail::is_distributed_vector<OutputIterator>
+             >
+         >
+     >::type* = 0)
+{
+    std::vector<event> events =
+        ::boost::compute::distributed::copy_async(input, result, queue);
+    for(size_t i = 0; i < events.size(); i++) {
+        events[i].wait();
+    }
+}
+
+// device -> distributed::vector
+/// Copies the values in the range [\p first, \p last) allocated on an OpenCL
+/// device to the distributed::vector \p result. The copy is performed
+/// asynchronously.
+template <
+    class InputIterator,
+    class T, weight_func weight, class Alloc
+>
+inline std::vector<event>
+copy_async(InputIterator first,
+           InputIterator last,
+           vector<T, weight, Alloc> &result,
+           ::boost::compute::command_queue &device_queue,
+           command_queue &distributed_queue,
+           typename boost::enable_if_c<
+               is_device_iterator<InputIterator>::value
+           >::type* = 0)
+{
+    typedef typename
+        std::iterator_traits<InputIterator>::difference_type diff_type;
+
+    std::vector<event> events;
+    events.reserve(result.parts());
+
+    InputIterator part_first = first;
+    InputIterator part_end = first;
+    for(size_t i = 0; i < result.parts(); i++)
+    {
+        BOOST_ASSERT_MSG(
+            distributed_queue.get(i).get_context() == device_queue.get_context(),
+            "copy_async() is only supported when context of every queue in"
+            " distributed_queue is the same context as context of device_queue"
+        );
+        part_end = (std::min)(
+            part_end + static_cast<diff_type>(result.part_size(i)),
+            last
+        );
+        event e =
+            ::boost::compute::copy_async(
+                part_first,
+                part_end,
+                result.begin(i),
+                distributed_queue.get(i)
+            ).get_event();
+        if(e.get()) {
+            events.push_back(e);
+        }
+        part_first = part_end;
+    }
+    return events;
+}
+
+// host -> distributed::vector
+/// Copies the values in the range [\p first, \p last) allocated on an OpenCL
+/// device to the distributed::vector \p result.
+template <
+    class InputIterator,
+    class T, weight_func weight, class Alloc
+>
+inline void
+copy(InputIterator first,
+     InputIterator last,
+     vector<T, weight, Alloc> &result,
+     ::boost::compute::command_queue &device_queue,
+     command_queue &distributed_queue,
+     typename boost::enable_if_c<
+         is_device_iterator<InputIterator>::value
+     >::type* = 0)
+{
+    typedef typename
+        std::iterator_traits<InputIterator>::difference_type diff_type;
+
+    std::vector<event> events;
+    events.reserve(result.parts());
+
+    InputIterator part_first = first;
+    InputIterator part_end = first;
+    for(size_t i = 0; i < result.parts(); i++)
+    {
+        part_end = (std::min)(
+            part_end + static_cast<diff_type>(result.part_size(i)),
+            last
+        );
+        if(distributed_queue.get(i).get_context() == device_queue.get_context())
+        {
+            event e =
+                ::boost::compute::copy_async(
+                    part_first,
+                    part_end,
+                    result.begin(i),
+                    distributed_queue.get(i)
+                ).get_event();
+            if(e.get()) {
+                events.push_back(e);
+            }
+        }
+        else {
+            std::vector<T> host(result.part_size(i));
+            ::boost::compute::copy(
+                part_first,
+                part_end,
+                host.begin(),
+                device_queue
+            );
+            ::boost::compute::copy(
+                host.begin(),
+                host.end(),
+                result.begin(i),
+                distributed_queue.get(i)
+            );
+        }
+        part_first = part_end;
+    }
+    for(size_t i = 0; i < events.size(); i++) {
+        events[i].wait();
+    }
+}
+
+// distributed::vector -> device
+/// Copy all values from distributed::vector \p input to the range beginning at
+/// \p result allocated on an OpenCL device.
+template <
+    class T, weight_func weight, class Alloc,
+    class OutputIterator
+>
+inline std::vector<event>
+copy_async(const vector<T, weight, Alloc> &input,
+           OutputIterator result,
+           command_queue &distributed_queue,
+           ::boost::compute::command_queue &device_queue,
+           typename boost::enable_if<
+               mpl::and_<
+                   is_device_iterator<OutputIterator>,
+                   mpl::not_<
+                       detail::is_distributed_vector<OutputIterator>
+                   >
+               >
+           >::type* = 0)
+{
+    typedef typename
+        std::iterator_traits<OutputIterator>::difference_type diff_type;
+
+    std::vector<event> events;
+    events.reserve(input.parts());
+
+    OutputIterator part_result = result;
+    for(size_t i = 0; i < input.parts(); i++)
+    {
+        BOOST_ASSERT_MSG(
+            distributed_queue.get(i).get_context() == device_queue.get_context(),
+            "copy_async() is only supported when context of every queue in"
+            " distributed_queue is the same context as context of device_queue"
+        );
+        event e =
+            ::boost::compute::copy_async(
+                input.begin(i),
+                input.end(i),
+                part_result,
+                distributed_queue.get(i)
+            ).get_event();
+        if(e.get()) {
+            events.push_back(e);
+        }
+        part_result += static_cast<diff_type>(input.part_size(i));
+    }
+    return events;
+}
+
+// distributed::vector -> device
+/// Copy all values from distributed::vector \p input to the range beginning at
+/// \p result allocated on an OpenCL device.
+template <
+    class T, weight_func weight, class Alloc,
+    class OutputIterator
+>
+inline void
+copy(const vector<T, weight, Alloc> &input,
+     OutputIterator result,
+     command_queue &distributed_queue,
+     ::boost::compute::command_queue &device_queue,
+     typename boost::enable_if<
+         mpl::and_<
+             is_device_iterator<OutputIterator>,
+             mpl::not_<
+                 detail::is_distributed_vector<OutputIterator>
+             >
+         >
+     >::type* = 0)
+{
+    typedef typename
+        std::iterator_traits<OutputIterator>::difference_type diff_type;
+
+    std::vector<event> events;
+    events.reserve(input.parts());
+
+    OutputIterator part_result = result;
+    for(size_t i = 0; i < input.parts(); i++)
+    {
+        if(distributed_queue.get(i).get_context() == device_queue.get_context())
+        {
+            event e =
+                ::boost::compute::copy_async(
+                    input.begin(i),
+                    input.end(i),
+                    part_result,
+                    distributed_queue.get(i)
+                ).get_event();
+            if(e.get()) {
+                events.push_back(e);
+            }
+        }
+        else {
+            std::vector<T> host(input.part_size(i));
+            ::boost::compute::copy(
+                input.begin(i),
+                input.end(i),
+                host.begin(),
+                distributed_queue.get(i)
+            );
+            ::boost::compute::copy(
+                host.begin(),
+                host.end(),
+                part_result,
+                device_queue
+            );
+        }
+        part_result += static_cast<diff_type>(input.part_size(i));
+    }
+    for(size_t i = 0; i < events.size(); i++) {
+        events[i].wait();
+    }
+}
+
+// distributed::vector -> distributed::vector
+/// Copy distributed::vector \p input into \p output. The copy is performed
+/// asynchronously.
+///
+/// Both vectors must be able to use \p queue, have the same weight function,
+/// the same size and the same number of parts.
+template<
+    class T1, weight_func weight, class Alloc1,
+    class T2, class Alloc2
+>
+inline std::vector<event>
+copy_async(const vector<T1, weight, Alloc1> &input,
+           vector<T2, weight, Alloc2> &output,
+           command_queue &queue)
+{
+    BOOST_ASSERT(input.parts() == output.parts());
+    BOOST_ASSERT(input.size() == output.size());
+
+    std::vector<event> events;
+    events.reserve(input.parts());
+
+    for(size_t i = 0; i < input.parts(); i++)
+    {
+        event e =
+            ::boost::compute::copy_async(
+                input.begin(i),
+                input.end(i),
+                output.begin(i),
+                queue.get(i)
+            ).get_event();
+        if(e.get()) {
+            events.push_back(e);
+        }
+    }
+    return events;
+}
+
+// distributed::vector -> distributed::vector
+/// Copy distributed::vector \p input into \p output.
+template<
+    class T1, weight_func weight1, class Alloc1,
+    class T2, weight_func weight2, class Alloc2
+>
+inline void
+copy(const vector<T1, weight1, Alloc1> &input,
+     vector<T2, weight2, Alloc2> &output,
+     command_queue &input_queue,
+     command_queue &output_queue)
+{
+    std::vector<event> events;
+    events.reserve(input.parts());
+
+    std::vector<T2> host(input.size());
+    events =
+        ::boost::compute::distributed::copy_async(
+            input, host.begin(), input_queue
+         );
+    // wait for copying from input to host
+    for(size_t i = 0; i < events.size(); i++) {
+        events[i].wait();
+    }
+    // copy from host to output vector
+    events =
+        ::boost::compute::distributed::copy_async(
+            host.begin(), host.end(), output, output_queue
+    );
+    for(size_t i = 0; i < events.size(); i++) {
+        events[i].wait();
+    }
+}
+
+// distributed::vector -> distributed::vector
+/// Copy distributed::vector \p input into \p output.
+///
+/// Both vectors must be able to use \p queue, have the same weight function,
+/// and have the same number of parts.
+template<
+    class T1, weight_func weight, class Alloc1,
+    class T2, class Alloc2
+>
+inline void
+copy(const vector<T1, weight, Alloc1> &input,
+     vector<T2, weight, Alloc2> &output,
+     command_queue &queue)
+{
+    BOOST_ASSERT(input.parts() == output.parts());
+    if(input.size() == output.size()) {
+        std::vector<event> events =
+            ::boost::compute::distributed::copy_async(
+                input, output, queue
+            );
+        for(size_t i = 0; i < events.size(); i++) {
+              events[i].wait();
+        }
+    }
+    else {
+        ::boost::compute::distributed::copy(
+            input, output, queue, queue
+        );
+    }
+}
+
+} // end distributed namespace
+} // end compute namespace
+} // end boost namespace
+
+#endif /* BOOST_COMPUTE_DISTRIBUTED_COPY_HPP */
diff --git a/include/boost/compute/distributed/detail/is_distributed_vector.hpp b/include/boost/compute/distributed/detail/is_distributed_vector.hpp
new file mode 100644
index 000000000..57fe5a66d
--- /dev/null
+++ b/include/boost/compute/distributed/detail/is_distributed_vector.hpp
@@ -0,0 +1,35 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2016 Jakub Szuppe <j.szuppe@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#ifndef BOOST_COMPUTE_DISTRIBUTED_DETAIL_IS_DISTRIBUTED_VECTOR_HPP
+#define BOOST_COMPUTE_DISTRIBUTED_DETAIL_IS_DISTRIBUTED_VECTOR_HPP
+
+
+#include <boost/type_traits/integral_constant.hpp>
+
+namespace boost {
+namespace compute {
+namespace distributed {
+
+namespace detail {
+
+template<class T>
+struct is_distributed_vector : boost::false_type {};
+
+template<class T>
+struct is_distributed_vector<const T> : is_distributed_vector<T> {};
+
+} // end detail namespace
+
+} // end distributed namespace
+} // end compute namespace
+} // end boost namespace
+
+#endif /* BOOST_COMPUTE_DISTRIBUTED_DETAIL_IS_DISTRIBUTED_VECTOR_HPP */
diff --git a/include/boost/compute/distributed/detail/weight_func.hpp b/include/boost/compute/distributed/detail/weight_func.hpp
new file mode 100644
index 000000000..0910137c6
--- /dev/null
+++ b/include/boost/compute/distributed/detail/weight_func.hpp
@@ -0,0 +1,77 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2016 Jakub Szuppe <j.szuppe@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#ifndef BOOST_COMPUTE_DETAIL_WEIGHT_FUNC_HPP
+#define BOOST_COMPUTE_DETAIL_WEIGHT_FUNC_HPP
+
+#include <vector>
+
+#include <boost/compute/distributed/context.hpp>
+#include <boost/compute/distributed/command_queue.hpp>
+
+namespace boost {
+namespace compute {
+namespace distributed {
+
+typedef std::vector<double> (*weight_func)(const command_queue&);
+
+std::vector<double> default_weight_func(const command_queue& queue)
+{
+    return std::vector<double>(queue.size(), 1.0/queue.size());
+}
+
+namespace detail {
+
+/// \internal_
+/// Rounds up \p n to the nearest multiple of \p m.
+/// Note: \p m must be a multiple of 2.
+size_t round_up(size_t n, size_t m)
+{
+    assert(m && ((m & (m -1)) == 0));
+    return (n + m - 1) & ~(m - 1);
+}
+
+std::vector<size_t> partition(const command_queue& queue,
+                              weight_func weight_func,
+                              const size_t size,
+                              const size_t align)
+{
+    std::vector<double> weights = weight_func(queue);
+    std::vector<size_t> partition;
+    partition.reserve(queue.size() + 1);
+    partition.push_back(0);
+
+    if(queue.size() > 1)
+    {
+        double acc = 0;
+        for(size_t i = 0; i < queue.size(); i++)
+        {
+            acc += weights[i];
+            partition.push_back(
+                std::min(
+                    size,
+                    round_up(size * acc, align)
+                )
+            );
+        }
+        return partition;
+    }
+    partition.push_back(size);
+    return partition;
+}
+
+} // end distributed detail
+
+} // end distributed namespace
+} // end compute namespace
+} // end boost namespace
+
+
+#endif /* INCLUDE_BOOST_COMPUTE_DETAIL_WEIGHT_FUNC_HPP_ */
diff --git a/include/boost/compute/distributed/exclusive_scan.hpp b/include/boost/compute/distributed/exclusive_scan.hpp
new file mode 100644
index 000000000..884de3f38
--- /dev/null
+++ b/include/boost/compute/distributed/exclusive_scan.hpp
@@ -0,0 +1,183 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#ifndef BOOST_COMPUTE_DISTRIBUTED_EXCLUSIVE_SCAN_HPP
+#define BOOST_COMPUTE_DISTRIBUTED_EXCLUSIVE_SCAN_HPP
+
+#include <vector>
+
+#include <boost/compute/container/vector.hpp>
+#include <boost/compute/algorithm/copy.hpp>
+#include <boost/compute/algorithm/exclusive_scan.hpp>
+#include <boost/compute/algorithm/inclusive_scan.hpp>
+#include <boost/compute/algorithm/merge.hpp>
+#include <boost/compute/iterator/buffer_iterator.hpp>
+#include <boost/compute/allocator/pinned_allocator.hpp>
+
+#include <boost/compute/distributed/command_queue.hpp>
+#include <boost/compute/distributed/vector.hpp>
+
+namespace boost {
+namespace compute {
+namespace distributed {
+
+template<
+    class InputType, weight_func weight, class Alloc,
+    class OutputType,
+    class BinaryOperator
+>
+inline void
+exclusive_scan(const vector<InputType, weight, Alloc> &input,
+               vector<OutputType, weight, Alloc> &result,
+               OutputType init,
+               BinaryOperator binary_op,
+               command_queue &queue)
+{
+    BOOST_ASSERT(input.parts() == result.parts());
+    BOOST_ASSERT(input.size() == result.size());
+
+    std::vector<OutputType> input_tails;
+    input_tails.reserve(input.parts() - 1);
+    for(size_t i = 0; i < input.parts(); i++)
+    {
+        if(input.begin(i) != input.end(i) && i < (input.parts() - 1))
+        {
+            input_tails.push_back(
+                static_cast<OutputType>(
+                    (input.end(i) - 1).read(queue.get(i))
+                )
+            );
+        }
+
+        if(i == 0)
+        {
+            ::boost::compute::exclusive_scan(
+                input.begin(i),
+                input.end(i),
+                result.begin(i),
+                init,
+                binary_op,
+                queue.get(i)
+            );
+        }
+        else
+        {
+            ::boost::compute::exclusive_scan(
+                input.begin(i),
+                input.end(i),
+                result.begin(i),
+                input_tails[i - 1],
+                binary_op,
+                queue.get(i)
+            );
+        }
+    }
+
+    // find device for calculating partial sum of last elements of input vector
+    ::boost::compute::command_queue& device_queue = queue.get(0);
+    // CPU device is preferred, however if there is none, the first device
+    // queue is used
+    for(size_t i = 0; i < queue.size(); i++)
+    {
+        if(queue.get(i).get_device().type() & ::boost::compute::device::cpu)
+        {
+            device_queue = queue.get(i);
+            break;
+        }
+    }
+
+    std::vector<OutputType> output_tails(input_tails.size());
+    for(size_t i = 0; i < input.parts() - 1; i++)
+    {
+        if(input.begin(i) != input.end(i))
+        {
+            output_tails[i] = (result.end(i) - 1).read(queue.get(i));
+        }
+    }
+    ::boost::compute::vector<OutputType> output_tails_device(
+        output_tails.size(), device_queue.get_context()
+    );
+    ::boost::compute::copy_async(
+        output_tails.begin(),
+        output_tails.end(),
+        output_tails_device.begin(),
+        device_queue
+    );
+    ::boost::compute::inclusive_scan(
+        output_tails_device.begin(),
+        output_tails_device.end(),
+        output_tails_device.begin(),
+        device_queue
+    );
+    ::boost::compute::copy(
+        output_tails_device.begin(),
+        output_tails_device.end(),
+        output_tails.begin(),
+        device_queue
+    );
+    for(size_t i = 1; i < input.parts(); i++)
+    {
+        ::boost::compute::transform(
+            result.begin(i),
+            result.end(i),
+            ::boost::compute::make_constant_iterator(
+                output_tails[i - 1]
+            ),
+            result.begin(i),
+            binary_op,
+            queue.get(i)
+        );
+    }
+}
+
+/// \overload
+template<
+    class InputType, weight_func weight, class Alloc,
+    class OutputType
+>
+inline void
+exclusive_scan(const vector<InputType, weight, Alloc> &input,
+               vector<OutputType, weight, Alloc> &result,
+               OutputType init,
+               command_queue &queue)
+{
+    ::boost::compute::distributed::exclusive_scan(
+        input,
+        result,
+        init,
+        boost::compute::plus<OutputType>(),
+        queue
+    );
+}
+
+/// \overload
+template<
+    class InputType, weight_func weight, class Alloc,
+    class OutputType
+>
+inline void
+exclusive_scan(const vector<InputType, weight, Alloc> &input,
+               vector<OutputType, weight, Alloc> &result,
+               command_queue &queue)
+{
+    ::boost::compute::distributed::exclusive_scan(
+        input,
+        result,
+        OutputType(0),
+        boost::compute::plus<OutputType>(),
+        queue
+    );
+}
+
+} // end distributed namespace
+} // end compute namespace
+} // end boost namespace
+
+#endif /* BOOST_COMPUTE_DISTRIBUTED_SCAN_HPP */
diff --git a/include/boost/compute/distributed/reduce.hpp b/include/boost/compute/distributed/reduce.hpp
new file mode 100644
index 000000000..cac7d0de1
--- /dev/null
+++ b/include/boost/compute/distributed/reduce.hpp
@@ -0,0 +1,395 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2016 Jakub Szuppe <j.szuppe@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#ifndef BOOST_COMPUTE_DISTRIBUTED_REDUCE_HPP
+#define BOOST_COMPUTE_DISTRIBUTED_REDUCE_HPP
+
+#include <vector>
+
+#include <boost/utility/enable_if.hpp>
+
+#include <boost/compute/buffer.hpp>
+#include <boost/compute/algorithm/reduce.hpp>
+#include <boost/compute/algorithm/copy_n.hpp>
+#include <boost/compute/iterator/buffer_iterator.hpp>
+#include <boost/compute/type_traits/is_device_iterator.hpp>
+
+#include <boost/compute/distributed/command_queue.hpp>
+#include <boost/compute/distributed/vector.hpp>
+
+namespace boost {
+namespace compute {
+namespace distributed {
+
+namespace detail {
+
+template<class OutputIterator>
+inline ::boost::compute::command_queue&
+final_reduce_queue(OutputIterator result,
+                   command_queue &queue,
+                   typename boost::enable_if_c<
+                       !is_device_iterator<OutputIterator>::value
+                   >::type* = 0)
+{
+    (void) result;
+
+    ::boost::compute::command_queue& device_queue = queue.get(0);
+    // CPU device is preferred, however if there is none, the first device
+    // queue is used
+    for(size_t i = 0; i < queue.size(); i++)
+    {
+        if(queue.get(i).get_device().type() & ::boost::compute::device::cpu)
+        {
+            device_queue = queue.get(i);
+            break;
+        }
+    }
+    return device_queue;
+}
+
+template<class OutputIterator>
+inline ::boost::compute::command_queue&
+final_reduce_queue(OutputIterator result,
+                   command_queue &queue,
+                   typename boost::enable_if_c<
+                       is_device_iterator<OutputIterator>::value
+                   >::type* = 0)
+{
+    // first, find all queues that can be used with result iterator
+    const ::boost::compute::context& result_context =
+        result.get_buffer().get_context();
+    std::vector<size_t> compatible_queues;
+    for(size_t i = 0; i < queue.size(); i++)
+    {
+        if(queue.get(i).get_context() == result_context)
+        {
+            compatible_queues.push_back(i);
+        }
+    }
+    BOOST_ASSERT_MSG(
+        !compatible_queues.empty(),
+        "There is no device command queue that can be use to copy to result"
+    );
+
+    // then choose device queue from compatible device queues
+
+    // CPU device is preferred, however if there is none, the first
+    // compatible device queue is used
+    ::boost::compute::command_queue& device_queue = queue.get(compatible_queues[0]);
+    for(size_t i = 0; i < compatible_queues.size(); i++)
+    {
+        size_t n = compatible_queues[i];
+        if(queue.get(n).get_device().type() & ::boost::compute::device::cpu)
+        {
+            device_queue = queue.get(n);
+            break;
+        }
+    }
+    return device_queue;
+}
+
+template<
+    class InputType, weight_func weight, class Alloc,
+    class OutputIterator,
+    class BinaryFunction
+>
+inline void
+dispatch_reduce(const vector<InputType, weight, Alloc> &input,
+                OutputIterator result,
+                BinaryFunction function,
+                command_queue &queue)
+{
+    typedef typename
+        boost::compute::result_of<BinaryFunction(InputType, InputType)>::type
+        result_type;
+
+    // find device queue for the final reduction
+    ::boost::compute::command_queue& device_queue =
+        final_reduce_queue(result, queue);
+
+    ::boost::compute::buffer parts_results_device(
+        device_queue.get_context(), input.parts() * sizeof(result_type)
+    );
+
+    // if all devices queues are in the same OpenCL context we can
+    // save part reduction directly into parts_results_device buffer
+    size_t reduced = 0;
+    if(queue.one_context())
+    {
+        // reduce each part of input vector
+        for(size_t i = 0; i < input.parts(); i++)
+        {
+            if(input.begin(i) != input.end(i))
+            {
+                // async, because it stores result on device
+                ::boost::compute::reduce(
+                    input.begin(i),
+                    input.end(i),
+                    ::boost::compute::make_buffer_iterator<result_type>(
+                        parts_results_device, reduced
+                    ),
+                    function,
+                    queue.get(i)
+                );
+                reduced++;
+            }
+        }
+        // add marker on every queue that is not device_queue, because
+        // we need to know when reductions are done
+        wait_list reduce_markers;
+        reduce_markers.reserve(reduced);
+        for(size_t i = 0; i < input.parts(); i++)
+        {
+            if(input.begin(i) != input.end(i) && queue.get(i) != device_queue)
+            {
+                reduce_markers.insert(queue.get(i).enqueue_marker());
+            }
+        }
+        // if it is possible we enqueue a barrier in device_queue which waits
+        // for reduce_markers (we can do this since all events are in the same
+        // context); otherwise, we need to sync. wait for those events
+        #ifdef CL_VERSION_1_2
+        if(device_queue.check_device_version(1, 2)) {
+            device_queue.enqueue_barrier(reduce_markers);
+        }
+        #endif
+        {
+            reduce_markers.wait();
+        }
+    }
+    else
+    {
+        // reduce each part of input vector
+        std::vector<result_type> parts_results_host(input.parts());
+        for(size_t i = 0; i < input.parts(); i++)
+        {
+            if(input.begin(i) != input.end(i))
+            {
+                // sync, because it stores result on host
+                ::boost::compute::reduce(
+                    input.begin(i),
+                    input.end(i),
+                    &parts_results_host[reduced],
+                    function,
+                    queue.get(i)
+                );
+                reduced++;
+            }
+        }
+        // sync, because it copies from host to device
+        ::boost::compute::copy_n(
+            parts_results_host.begin(),
+            reduced,
+            ::boost::compute::make_buffer_iterator<result_type>(
+                parts_results_device
+            ),
+            device_queue
+        );
+    }
+    // final reduction
+    // async if result is device_iterator, sync otherwise
+    ::boost::compute::reduce(
+        ::boost::compute::make_buffer_iterator<result_type>(
+            parts_results_device
+        ),
+        ::boost::compute::make_buffer_iterator<result_type>(
+            parts_results_device, reduced
+        ),
+        result,
+        function,
+        device_queue
+    );
+}
+
+// special case for when OutputIterator is a host iterator
+// and binary operator is plus<T>
+template<
+    class InputType, weight_func weight, class Alloc,
+    class OutputIterator,
+    class T
+>
+inline void
+dispatch_reduce(const vector<InputType, weight, Alloc> &input,
+                OutputIterator result,
+                ::boost::compute::plus<T> function,
+                command_queue &queue,
+                typename boost::enable_if_c<
+                    !is_device_iterator<OutputIterator>::value
+                >::type* = 0)
+{
+    // reduce each part of input vector
+    std::vector<T> parts_results_host(input.parts());
+    for(size_t i = 0; i < input.parts(); i++)
+    {
+        ::boost::compute::reduce(
+            input.begin(i),
+            input.end(i),
+            &parts_results_host[i],
+            function,
+            queue.get(i)
+        );
+    }
+
+    // final reduction
+    *result = parts_results_host[0];
+    for(size_t i = 1; i < input.parts(); i++)
+    {
+        *result += static_cast<T>(parts_results_host[i]);
+    }
+}
+
+// special case for when OutputIterator is a host iterator
+// and binary operator is min<T>
+template<
+    class InputType, weight_func weight, class Alloc,
+    class OutputIterator,
+    class T
+>
+inline void
+dispatch_reduce(vector<InputType, weight, Alloc> &input,
+                OutputIterator result,
+                ::boost::compute::min<T> function,
+                command_queue &queue,
+                typename boost::enable_if_c<
+                    !is_device_iterator<OutputIterator>::value
+                >::type* = 0)
+{
+    // reduce each part of input vector
+    std::vector<T> parts_results_host(input.parts());
+    for(size_t i = 0; i < input.parts(); i++)
+    {
+        ::boost::compute::reduce(
+            input.begin(i),
+            input.end(i),
+            &parts_results_host[i],
+            function,
+            queue.get(i)
+        );
+    }
+
+    // final reduction
+    *result = parts_results_host[0];
+    for(size_t i = 1; i < input.parts(); i++)
+    {
+        *result = (std::min)(static_cast<T>(*result), parts_results_host[i]);
+    }
+}
+
+// special case for when OutputIterator is a host iterator
+// and binary operator is max<T>
+template<
+    class InputType, weight_func weight, class Alloc,
+    class OutputIterator,
+    class T
+>
+inline void
+dispatch_reduce(const vector<InputType, weight, Alloc> &input,
+                OutputIterator result,
+                ::boost::compute::max<T> function,
+                command_queue &queue,
+                typename boost::enable_if_c<
+                    !is_device_iterator<OutputIterator>::value
+                >::type* = 0)
+{
+    // reduce each part of input vector
+    std::vector<T> parts_results_host(input.parts());
+    for(size_t i = 0; i < input.parts(); i++)
+    {
+        ::boost::compute::reduce(
+            input.begin(i),
+            input.end(i),
+            &parts_results_host[i],
+            function,
+            queue.get(i)
+        );
+    }
+
+    // final reduction
+    *result = parts_results_host[0];
+    for(size_t i = 1; i < input.parts(); i++)
+    {
+        *result = (std::max)(static_cast<T>(*result), parts_results_host[i]);
+    }
+}
+
+} // end detail namespace
+
+/// Returns the result of applying \p function to the elements in the
+/// \p input vector.
+///
+/// If no function is specified, \c plus will be used.
+///
+/// \param input input vector
+/// \param result iterator pointing to the output
+/// \param function binary reduction function
+/// \param queue distributed command queue to perform the operation
+///
+/// Distributed command queue \p queue has to span same set of compute devices
+/// (including their contexts) as distributed command queue used to create
+///  \p input vector.
+///
+/// If \p result is a device iterator, its underlying buffer must be allocated
+/// in context of at least one device command queue from \p queue.
+///
+/// The \c reduce() algorithm assumes that the binary reduction function is
+/// associative. When used with non-associative functions the result may
+/// be non-deterministic and vary in precision. Notably this affects the
+/// \c plus<float>() function as floating-point addition is not associative
+/// and may produce slightly different results than a serial algorithm.
+///
+/// This algorithm supports both host and device iterators for the
+/// result argument. This allows for values to be reduced and copied
+/// to the host all with a single function call.
+template<
+    class InputType, weight_func weight, class Alloc,
+    class OutputIterator,
+    class BinaryFunction
+>
+inline void
+reduce(const vector<InputType, weight, Alloc> &input,
+       OutputIterator result,
+       BinaryFunction function,
+       command_queue &queue)
+{
+    if(input.empty()) {
+        return;
+    }
+
+    if(input.parts() == 1) {
+        ::boost::compute::reduce(
+            input.begin(0),
+            input.end(0),
+            result,
+            function,
+            queue.get(0)
+        );
+        return;
+    }
+    detail::dispatch_reduce(input, result, function, queue);
+}
+
+/// \overload
+template<
+    class InputType, weight_func weight, class Alloc,
+    class OutputIterator
+>
+inline void
+reduce(const vector<InputType, weight, Alloc> &input,
+       OutputIterator result,
+       command_queue &queue)
+{
+    return reduce(input, result, ::boost::compute::plus<InputType>(), queue);
+}
+
+} // end distributed namespace
+} // end compute namespace
+} // end boost namespace
+
+#endif /* BOOST_COMPUTE_DISTRIBUTED_REDUCE_HPP */
diff --git a/include/boost/compute/distributed/transform.hpp b/include/boost/compute/distributed/transform.hpp
new file mode 100644
index 000000000..55782ff49
--- /dev/null
+++ b/include/boost/compute/distributed/transform.hpp
@@ -0,0 +1,170 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2016 Jakub Szuppe <j.szuppe@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#ifndef BOOST_COMPUTE_DISTRIBUTED_TRANSFORM_HPP
+#define BOOST_COMPUTE_DISTRIBUTED_TRANSFORM_HPP
+
+#include <boost/compute/iterator/transform_iterator.hpp>
+#include <boost/compute/iterator/zip_iterator.hpp>
+#include <boost/compute/functional/detail/unpack.hpp>
+
+#include <boost/compute/distributed/command_queue.hpp>
+#include <boost/compute/distributed/vector.hpp>
+
+namespace boost {
+namespace compute {
+namespace distributed {
+
+/// Transforms all the elements from vector \p input using operator \p op
+/// and stores the results in \p result. The transform is performed
+/// asynchronously and it returns a vector of events, each assisted with
+/// transformation of successive parts of \p input.
+///
+/// Distributed command queue \p queue has to span same set of compute devices
+/// (including their contexts) as distributed command queue used to create
+/// \p input and \p output vectors.
+///
+/// \see distributed::transform()
+template<
+    class InputType, class OutputType,
+    weight_func weight, class Alloc,
+    class UnaryOperator
+>
+inline std::vector<event>
+transform_async(const vector<InputType, weight, Alloc> &input,
+                vector<OutputType, weight, Alloc> &result,
+                UnaryOperator op,
+                command_queue &queue)
+{
+    BOOST_ASSERT(input.parts() == result.parts());
+    BOOST_ASSERT(input.size() == result.size());
+
+    std::vector<event> events;
+    events.reserve(input.parts());
+
+    for(size_t i = 0; i < input.parts(); i++)
+    {
+        event e =
+            ::boost::compute::copy_async(
+                ::boost::compute::make_transform_iterator(input.begin(i), op),
+                ::boost::compute::make_transform_iterator(input.end(i), op),
+                result.begin(i),
+                queue.get(i)
+            ).get_event();
+        if(e.get()) {
+            events.push_back(e);
+        }
+    }
+    return events;
+}
+
+/// \overload
+template<
+    class InputType1, class InputType2, class OutputType,
+    weight_func weight, class Alloc,
+    class BinaryOperator
+>
+inline std::vector<event>
+transform_async(const vector<InputType1, weight, Alloc> &input1,
+                const vector<InputType2, weight, Alloc> &input2,
+                vector<OutputType, weight, Alloc> &result,
+                BinaryOperator op,
+                command_queue &queue)
+{
+    BOOST_ASSERT(input1.parts() == input2.parts());
+    BOOST_ASSERT(input1.parts() == result.parts());
+    BOOST_ASSERT(input1.size() == input1.size());
+    BOOST_ASSERT(input1.size() == result.size());
+
+    std::vector<event> events;
+    events.reserve(input1.parts());
+
+    ::boost::compute::detail::unpacked<BinaryOperator> unpacked_op =
+        ::boost::compute::detail::unpack(op);
+    for(size_t i = 0; i < input1.parts(); i++)
+    {
+        event e =
+            ::boost::compute::copy_async(
+                ::boost::compute::make_transform_iterator(
+                    ::boost::compute::make_zip_iterator(
+                        boost::make_tuple(input1.begin(i), input2.begin(i))
+                    ),
+                    unpacked_op
+                ),
+                ::boost::compute::make_transform_iterator(
+                    ::boost::compute::make_zip_iterator(
+                        boost::make_tuple(input1.end(i), input2.end(i))
+                    ),
+                    unpacked_op
+                ),
+                result.begin(i),
+                queue.get(i)
+            ).get_event();
+        if(e.get()) {
+            events.push_back(e);
+        }
+    }
+    return events;
+}
+
+/// Transforms all the elements from vector \p input using operator \p op
+/// and stores the results in \p result.
+///
+/// Distributed command queue \p queue has to span same set of compute devices
+/// (including their contexts) as distributed command queue used to create
+/// \p input and \p output vectors.
+///
+/// \see  distributed::transform_async()
+template<
+    class InputType, weight_func weight, class Alloc, class OutputType,
+    class UnaryOperator
+>
+inline void
+transform(const vector<InputType, weight, Alloc> &input,
+          vector<OutputType, weight, Alloc> &output,
+          UnaryOperator op,
+          command_queue &queue)
+{
+    std::vector<event> events =
+        transform_async(
+            input, output, op, queue
+        );
+    for(size_t i = 0; i < events.size(); i++) {
+          events[i].wait();
+    }
+}
+
+/// \overload
+template<
+    class InputType1, class InputType2, class OutputType,
+    weight_func weight, class Alloc,
+    class BinaryOperator
+>
+inline void
+transform(const vector<InputType1, weight, Alloc> &input1,
+          const vector<InputType2, weight, Alloc> &input2,
+          vector<OutputType, weight, Alloc> &output,
+          BinaryOperator op,
+          command_queue &queue)
+{
+    std::vector<event> events =
+        transform_async(
+            input1, input2, output, op, queue
+        );
+    for(size_t i = 0; i < events.size(); i++) {
+          events[i].wait();
+    }
+}
+
+} // end distributed namespace
+} // end compute namespace
+} // end boost namespace
+
+#endif /* BOOST_COMPUTE_DISTRIBUTED_TRANSFORM_HPP */
diff --git a/include/boost/compute/distributed/vector.hpp b/include/boost/compute/distributed/vector.hpp
new file mode 100644
index 000000000..3ad4c19ca
--- /dev/null
+++ b/include/boost/compute/distributed/vector.hpp
@@ -0,0 +1,597 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2016 Jakub Szuppe <j.szuppe@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#ifndef BOOST_COMPUTE_DISTRIBUTED_VECTOR_HPP
+#define BOOST_COMPUTE_DISTRIBUTED_VECTOR_HPP
+
+#include <algorithm>
+#include <vector>
+#include <cstddef>
+#include <iterator>
+#include <exception>
+
+#include <boost/throw_exception.hpp>
+
+#include <boost/compute/config.hpp>
+
+#ifndef BOOST_COMPUTE_NO_HDR_INITIALIZER_LIST
+#include <initializer_list>
+#endif
+
+#include <boost/compute/buffer.hpp>
+#include <boost/compute/device.hpp>
+#include <boost/compute/context.hpp>
+#include <boost/compute/command_queue.hpp>
+#include <boost/compute/algorithm/fill.hpp>
+#include <boost/compute/allocator/buffer_allocator.hpp>
+#include <boost/compute/iterator/buffer_iterator.hpp>
+#include <boost/compute/type_traits/detail/capture_traits.hpp>
+#include <boost/compute/detail/buffer_value.hpp>
+#include <boost/compute/detail/iterator_range_size.hpp>
+
+#include <boost/compute/distributed/context.hpp>
+#include <boost/compute/distributed/command_queue.hpp>
+#include <boost/compute/distributed/copy.hpp>
+#include <boost/compute/distributed/detail/weight_func.hpp>
+#include <boost/compute/distributed/detail/is_distributed_vector.hpp>
+
+namespace boost {
+namespace compute {
+namespace distributed {
+
+/// \class vector
+/// \brief A resizable array of values allocated across multiple devices.
+///
+/// The vector<T> class stores a dynamic array of values. Internally, the data
+/// is stored in OpenCL buffer objects from multiple OpenCL contexts.
+template<
+    class T,
+    weight_func weight = default_weight_func,
+    class Alloc = ::boost::compute::buffer_allocator<T>
+>
+class vector
+{
+public:
+    typedef T value_type;
+    typedef Alloc allocator_type;
+    typedef typename allocator_type::size_type size_type;
+    typedef typename allocator_type::difference_type difference_type;
+    typedef ::boost::compute::detail::buffer_value<T> reference;
+    typedef const ::boost::compute::detail::buffer_value<T> const_reference;
+    typedef typename allocator_type::pointer pointer;
+    typedef typename allocator_type::const_pointer const_pointer;
+    typedef buffer_iterator<T> iterator;
+    typedef buffer_iterator<T> const_iterator;
+    typedef std::reverse_iterator<iterator> reverse_iterator;
+    typedef std::reverse_iterator<const_iterator> const_reverse_iterator;
+
+    /// Creates an empty distributed vector using \p queue.
+    explicit vector(const command_queue &queue)
+        : m_queue(queue),
+          m_size(0)
+    {
+        for(size_t i = 0; i < m_queue.size(); i++)
+        {
+            m_allocators.push_back(Alloc(m_queue.get_context(i)));
+            m_data.push_back(
+                m_allocators.back()
+                    .allocate(_minimum_capacity())
+            );
+            m_data_indices.push_back(0);
+        }
+        m_data_indices.push_back(0);
+    }
+
+    /// Creates a distributed vector with space for \p count elements
+    /// in \p context.
+    explicit vector(size_t count, const command_queue &queue)
+        : m_queue(queue),
+          m_size(count)
+    {
+        allocate_memory(count);
+    }
+
+    /// Creates a distributed  vector with space for \p count elements and
+    /// sets each equal to \p value.
+    ///
+    /// For example:
+    /// \code
+    /// // creates a vector with four values set to nine (e.g. [9, 9, 9, 9]).
+    /// boost::compute::distributed::vector<int> vec(4, 9, queue);
+    /// \endcode
+    vector(size_type count,
+           const T &value,
+           command_queue &queue)
+        : m_queue(queue),
+          m_size(count)
+    {
+        allocate_memory(m_size);
+        std::vector<event> events;
+        events.reserve(m_data.size());
+        for(size_t i = 0; i < m_data.size(); i++)
+        {
+            event e =
+                ::boost::compute::fill_async(
+                    begin(i),
+                    end(i),
+                    value,
+                    queue.get(i)
+                ).get_event();
+            if(e.get()) {
+                events.push_back(e);
+            }
+        }
+        for(size_t i = 0; i < events.size(); i++) {
+            events[i].wait();
+        }
+    }
+
+    /// Creates a vector with space for the values in the range [\p first,
+    /// \p last) allocated on the host and copies them into the vector
+    /// with \p queue.
+    ///
+    /// For example:
+    /// \code
+    /// // values on the host
+    /// int data[] = { 1, 2, 3, 4 };
+    ///
+    /// // create a vector of size four and copy the values from data
+    /// boost::compute::distributed::vector<int> vec(data, data + 4, queue);
+    /// \endcode
+    template<class HostIterator>
+    vector(HostIterator first,
+           HostIterator last,
+           command_queue &queue,
+           typename boost::enable_if_c<
+               !is_device_iterator<HostIterator>::value
+           >::type* = 0)
+        : m_queue(queue),
+          m_size(::boost::compute::detail::iterator_range_size(first, last))
+    {
+        allocate_memory(m_size);
+        ::boost::compute::distributed::copy(first, last, *this, m_queue);
+    }
+
+    /// Creates a vector with space for the values in the range [\p first,
+    /// \p last) allocated on an OpenCL device and copies them into the vector
+    /// with \p queue.
+    template<class DeviceIterator>
+    vector(DeviceIterator first,
+           DeviceIterator last,
+           ::boost::compute::command_queue &device_queue,
+           command_queue &distributed_queue,
+           typename boost::enable_if_c<
+               is_device_iterator<DeviceIterator>::value
+           >::type* = 0)
+        : m_queue(distributed_queue),
+          m_size(::boost::compute::detail::iterator_range_size(first, last))
+    {
+        allocate_memory(m_size);
+        ::boost::compute::distributed::copy(
+            first, last, *this, device_queue, m_queue
+        );
+    }
+
+    /// Creates a new vector and copies the values from \p other.
+    explicit vector(const vector &other)
+        : m_queue(other.m_queue),
+          m_size(other.m_size)
+    {
+        allocate_memory(m_size);
+        ::boost::compute::distributed::copy(
+            other, *this, m_queue
+        );
+    }
+
+    /// Creates a new vector and copies the values from \p other
+    /// with \p queue.
+    vector(const vector &other, command_queue &queue)
+        : m_queue(queue),
+          m_size(other.m_size)
+    {
+        allocate_memory(m_size);
+        if(m_queue == other.m_queue) {
+            ::boost::compute::distributed::copy(
+                other, *this, m_queue
+            );
+        }
+        else {
+            command_queue other_queue = other.get_queue();
+            ::boost::compute::distributed::copy(
+                other, *this, other_queue, m_queue
+            );
+        }
+    }
+
+    /// Creates a new vector and copies the values from \p other
+    /// with \p queue.
+    template<class OtherAlloc>
+    vector(const vector<T, weight, OtherAlloc> &other)
+        : m_queue(other.m_queue),
+          m_size(other.m_size)
+    {
+        allocate_memory(m_size);
+        ::boost::compute::distributed::copy(
+            other, *this, m_queue
+        );
+    }
+
+    /// Creates a new vector and copies the values from \p other.
+    template<class OtherAlloc>
+    vector(const vector<T, weight, OtherAlloc> &other,
+           command_queue &queue)
+        : m_queue(queue),
+          m_size(other.size())
+    {
+        allocate_memory(m_size);
+        if(m_queue == other.get_queue()) {
+            ::boost::compute::distributed::copy(
+                other, *this, m_queue
+            );
+        }
+        else {
+            command_queue other_queue = other.get_queue();
+            ::boost::compute::distributed::copy(
+                other, *this, other_queue, m_queue
+            );
+        }
+    }
+
+    /// Creates a new vector and copies the values from \p vector.
+    template<class OtherAlloc>
+    vector(const std::vector<T, OtherAlloc> &vector,
+           command_queue &queue)
+        : m_queue(queue),
+          m_size(vector.size())
+    {
+        allocate_memory(m_size);
+        ::boost::compute::distributed::copy(
+            vector.begin(), vector.end(), *this, m_queue
+        );
+    }
+
+    /// Copy assignment. This operation is always non-blocking.
+    vector& operator=(const vector &other)
+    {
+        if(this != &other){
+            m_queue = other.m_queue;
+            m_size = other.m_size;
+            allocate_memory(m_size);
+            ::boost::compute::distributed::copy(other, *this, m_queue);
+        }
+        return *this;
+    }
+
+    /// Copy assignment. This operation is always non-blocking.
+    template<class OtherAlloc>
+    vector& operator=(const vector<T, weight, OtherAlloc> &other)
+    {
+        m_queue = other.get_queue();
+        m_size = other.size();
+        allocate_memory(m_size);
+        ::boost::compute::distributed::copy(other, *this, m_queue);
+        return *this;
+    }
+
+    /// Copy assignment. This operation is always non-blocking.
+    template<class OtherAlloc>
+    vector& operator=(const std::vector<T, OtherAlloc> &vector)
+    {
+        m_size = vector.size();
+        allocate_memory(m_size);
+        ::boost::compute::distributed::copy(
+            vector.begin(), vector.end(), *this, m_queue
+        );
+        return *this;
+    }
+
+    #ifndef BOOST_COMPUTE_NO_RVALUE_REFERENCES
+    /// Move-constructs a new vector from \p other.
+    vector(vector&& other)
+        : m_queue(std::move(m_queue)),
+          m_size(other.m_size),
+          m_data(std::move(other.m_data)),
+          m_data_indices(std::move(other.m_data_indices)),
+          m_allocators(std::move(other.m_allocators))
+    {
+        other.m_size = 0;
+    }
+
+    /// Move-assigns the data from \p other to \c *this.
+    vector& operator=(vector&& other)
+    {
+        if(m_size) {
+            for(size_t i = 0; i < m_allocators.size(); i++) {
+                m_allocators[i].deallocate(m_data[i], part_size(i));
+            }
+        }
+
+        m_queue = std::move(other.m_queue);
+        m_size = other.m_size;
+        m_data = std::move(other.m_data);
+        m_data_indices = std::move(other.m_data_indices);
+        m_allocators = std::move(other.m_allocators);
+
+        other.m_size = 0;
+
+        return *this;
+    }
+    #endif // BOOST_COMPUTE_NO_RVALUE_REFERENCES
+
+    /// Destroys the vector object.
+    ~vector()
+    {
+        if(m_size) {
+            for(size_t i = 0; i < m_allocators.size(); i++) {
+                m_allocators[i].deallocate(m_data[i], part_size(i));
+            }
+        }
+    }
+
+    /// Returns the number of elements in the vector.
+    size_type size() const
+    {
+        return m_size;
+    }
+
+    size_t parts() const
+    {
+        return m_data.size();
+    }
+
+    std::vector<size_type> parts_sizes() const
+    {
+        std::vector<size_type> part_sizes(parts());
+        for(size_t i = 0; i < parts(); i++) {
+            part_sizes[i] = part_size(i);
+        }
+        return part_sizes;
+    }
+
+    size_type part_size(size_t n) const
+    {
+        return m_data_indices[n+1] - m_data_indices[n];
+    }
+
+    std::vector<size_t> parts_starts() const
+    {
+        return m_data_indices;
+    }
+
+    size_t part_start(size_t n) const
+    {
+        return m_data_indices[n];
+    }
+
+    /// Returns \c true if the vector is empty.
+    bool empty() const
+    {
+        return m_size == 0;
+    }
+
+    iterator begin(size_t n)
+    {
+        return ::boost::compute::make_buffer_iterator<T>(
+            m_data[n].get_buffer(), 0
+        );
+    }
+
+    const_iterator begin(size_t n) const
+    {
+        return ::boost::compute::make_buffer_iterator<T>(
+            m_data[n].get_buffer(), 0
+        );
+    }
+
+    const_iterator cbegin(size_t n) const
+    {
+        return begin(n);
+    }
+
+    iterator end(size_t n)
+    {
+        return ::boost::compute::make_buffer_iterator<T>(
+            m_data[n].get_buffer(), m_data_indices[n+1] - m_data_indices[n]
+        );
+    }
+
+    const_iterator end(size_t n) const
+    {
+        return ::boost::compute::make_buffer_iterator<T>(
+            m_data[n].get_buffer(), m_data_indices[n+1] - m_data_indices[n]
+        );
+    }
+
+    const_iterator cend(size_t n) const
+    {
+        return end(n);
+    }
+
+    reverse_iterator rbegin(size_t n)
+    {
+        return reverse_iterator(end(n) - 1);
+    }
+
+    const_reverse_iterator rbegin(size_t n) const
+    {
+        return reverse_iterator(end(n) - 1);
+    }
+
+    const_reverse_iterator crbegin(size_t n) const
+    {
+        return rbegin(n);
+    }
+
+    reverse_iterator rend(size_t n)
+    {
+        return reverse_iterator(begin(n) - 1);
+    }
+
+    const_reverse_iterator rend(size_t n) const
+    {
+        return reverse_iterator(begin(n) - 1);
+    }
+
+    const_reverse_iterator crend(size_t n) const
+    {
+        return rend(n);
+    }
+
+    reference operator[](size_type index)
+    {
+        size_t n =
+            std::upper_bound(m_data_indices.begin(), m_data_indices.end(), index)
+                - m_data_indices.begin() - 1;
+        size_t part_index = index - m_data_indices[n];
+        return *(begin(n) + static_cast<difference_type>(part_index));
+    }
+
+    const_reference operator[](size_type index) const
+    {
+        size_t n =
+            std::upper_bound(m_data_indices.begin(), m_data_indices.end(), index)
+                - m_data_indices.begin() - 1;
+        size_t part_index = index - m_data_indices[n];
+        return *(begin(n) + static_cast<difference_type>(part_index));
+    }
+
+    reference at(size_type index)
+    {
+        if(index >= size()){
+            BOOST_THROW_EXCEPTION(std::out_of_range("index out of range"));
+        }
+
+        return operator[](index);
+    }
+
+    const_reference at(size_type index) const
+    {
+        if(index >= size()){
+            BOOST_THROW_EXCEPTION(std::out_of_range("index out of range"));
+        }
+
+        return operator[](index);
+    }
+
+    reference front()
+    {
+        return *begin(parts() - 1);
+    }
+
+    const_reference front() const
+    {
+        return *begin(parts() - 1);
+    }
+
+    reference back()
+    {
+        return *(end(parts() - 1) - static_cast<difference_type>(1));
+    }
+
+    const_reference back() const
+    {
+        return *(end(parts() - 1) - static_cast<difference_type>(1));
+    }
+
+    /// Swaps the contents of \c *this with \p other.
+    void swap(vector &other)
+    {
+        std::swap(m_data, other.m_data);
+        std::swap(m_data_indices, other.m_data_indices);
+        std::swap(m_size, other.m_size);
+        std::swap(m_allocators, other.m_allocators);
+        std::swap(m_queue, other.m_queue);
+    }
+
+    /// Returns the underlying buffer.
+    std::vector<buffer> get_buffers() const
+    {
+        std::vector<buffer> buffers;
+        buffers.reserve(m_data.size());
+        for(size_t i = 0; i < m_data.size(); i++) {
+            buffers.push_back(m_data[i].get_buffer());
+        }
+        return buffers;
+    }
+
+    /// Returns the underlying buffer for part \p n.
+    const buffer& get_buffer(size_t n) const
+    {
+        return m_data[n].get_buffer();
+    }
+
+    command_queue get_queue() const
+    {
+        return m_queue;
+    }
+
+    /// command queue.
+    const context& get_context() const
+    {
+        return m_queue.get_context();
+    }
+
+private:
+    /// \internal_
+    BOOST_CONSTEXPR size_type _align() const { return 16; }
+
+    /// \internal_
+    BOOST_CONSTEXPR size_type _minimum_capacity() const { return _align(); }
+
+    /// \internal_
+    BOOST_CONSTEXPR float _growth_factor() const { return 1.5; }
+
+    void allocate_memory(size_type count)
+    {
+        m_allocators.clear();
+        m_data.clear();
+        m_data_indices.clear();
+
+        m_allocators.reserve(m_queue.size());
+        m_data.reserve(m_queue.size());
+        m_data_indices.reserve(m_queue.size() + 1);
+
+        std::vector<size_t> partition =
+            detail::partition(m_queue, weight, count, _align());
+        for(size_t i = 0; i < m_queue.size(); i++)
+        {
+            size_type data_size = partition[i + 1] - partition[i];
+            m_allocators.push_back(Alloc(m_queue.get_context(i)));
+            m_data.push_back(
+                m_allocators.back()
+                    .allocate((std::max)(data_size, _minimum_capacity()))
+                );
+            m_data_indices.push_back(partition[i]);
+        }
+        m_data_indices.push_back(partition[m_queue.size()]);
+    }
+
+private:
+    command_queue m_queue;
+    size_type m_size;
+
+    std::vector<size_t> m_data_indices;
+    std::vector<pointer> m_data;
+    std::vector<allocator_type> m_allocators;
+};
+
+
+namespace detail {
+
+template<class T, weight_func w, class Alloc>
+struct is_distributed_vector< vector<T, w, Alloc> > : boost::true_type {};
+
+} // end detail namespace
+
+} // end distributed namespace
+} // end compute namespace
+} // end boost namespace
+
+#endif /* BOOST_COMPUTE_DISTRIBUTED_VECTOR_HPP */
diff --git a/include/boost/compute/utility/wait_list.hpp b/include/boost/compute/utility/wait_list.hpp
index 8b81924d6..5b53c5aad 100644
--- a/include/boost/compute/utility/wait_list.hpp
+++ b/include/boost/compute/utility/wait_list.hpp
@@ -155,6 +155,23 @@ class wait_list
         insert(future.get_event());
     }
 
+    /// Inserts \p event into the wait-list.
+    bool safe_insert(const event &event)
+    {
+        if(event.get()) {
+            m_events.push_back(event);
+            return true;
+        }
+        return false;
+    }
+
+    /// Inserts the event from \p future into the wait-list.
+    template<class T>
+    bool safe_insert(const future<T> &future)
+    {
+        return safe_insert(future.get_event());
+    }
+
     /// Blocks until all of the events in the wait-list have completed.
     ///
     /// Does nothing if the wait-list is empty.
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index 42e3969e1..0fba1a464 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -81,6 +81,14 @@ add_compute_test("core.system" test_system.cpp)
 add_compute_test("core.type_traits" test_type_traits.cpp)
 add_compute_test("core.user_event" test_user_event.cpp)
 
+add_compute_test("distributed.context" test_distributed_context.cpp)
+add_compute_test("distributed.command_queue" test_distributed_command_queue.cpp)
+add_compute_test("distributed.vector" test_distributed_vector.cpp)
+add_compute_test("distributed.copy" test_distributed_copy.cpp)
+add_compute_test("distributed.reduce" test_distributed_reduce.cpp)
+add_compute_test("distributed.transform" test_distributed_transform.cpp)
+add_compute_test("distributed.transform" test_distributed_scan.cpp)
+
 add_compute_test("utility.extents" test_extents.cpp)
 add_compute_test("utility.invoke" test_invoke.cpp)
 add_compute_test("utility.program_cache" test_program_cache.cpp)
diff --git a/test/distributed_check_functions.hpp b/test/distributed_check_functions.hpp
new file mode 100644
index 000000000..b1a374f98
--- /dev/null
+++ b/test/distributed_check_functions.hpp
@@ -0,0 +1,131 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2016 Jakub Szuppe <j.szuppe@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#ifndef BOOST_COMPUTE_TEST_CHECK_FUNCTIONS_HPP
+#define BOOST_COMPUTE_TEST_CHECK_FUNCTIONS_HPP
+
+#include <boost/compute/distributed/context.hpp>
+#include <boost/compute/distributed/command_queue.hpp>
+#include <boost/compute/distributed/vector.hpp>
+#include <boost/compute/distributed/copy.hpp>
+
+template<
+    class T1, boost::compute::distributed::weight_func weight, class Alloc1,
+    class T2, class Alloc2
+>
+inline bool
+distributed_equal(const boost::compute::distributed::vector<T1, weight, Alloc1> &input1,
+                  const boost::compute::distributed::vector<T2, weight, Alloc2> &input2,
+                  boost::compute::distributed::command_queue &queue)
+{
+    if(input1.parts() != input2.parts()) {
+        return false;
+    }
+    if(input1.size() != input2.size()) {
+        return false;
+    }
+
+    for(size_t i = 0; i < input1.parts(); i++)
+    {
+        if(
+            !boost::compute::equal(
+                input1.begin(i), input1.end(i), input2.begin(i), queue.get(i)
+            )
+        )
+        {
+            return false;
+        }
+    }
+    return true;
+}
+
+template<
+    class T, boost::compute::distributed::weight_func weight, class Alloc
+>
+inline bool
+distributed_equal(const boost::compute::distributed::vector<T, weight, Alloc> &input,
+                  const T value,
+                  boost::compute::distributed::command_queue &queue)
+{
+    for(size_t i = 0; i < input.parts(); i++)
+    {
+        if(
+            !boost::compute::equal(
+                input.begin(i),
+                input.end(i),
+                boost::compute::make_constant_iterator(value),
+                queue.get(i)
+            )
+        )
+        {
+            return false;
+        }
+    }
+    return true;
+}
+
+template<
+    class T, boost::compute::distributed::weight_func weight, class Alloc
+>
+inline bool
+distributed_equal(const boost::compute::distributed::vector<T, weight, Alloc> &input,
+                  typename std::vector<T>::iterator first,
+                  typename std::vector<T>::iterator end,
+                  boost::compute::distributed::command_queue &queue)
+{
+    if(std::distance(first, end) != input.size()) {
+        return false;
+    }
+
+    typename std::vector<T>::iterator part_first = first;
+    typename std::vector<T>::iterator part_end = first;
+    for(size_t i = 0; i < input.parts(); i++)
+    {
+        part_end += input.part_size(i);
+        boost::compute::vector<T> data(part_first, part_end, queue.get(i));
+        if(
+            !boost::compute::equal(
+                input.begin(i),
+                input.end(i),
+                data.begin(),
+                queue.get(i)
+            )
+        )
+        {
+            return false;
+        }
+        part_first = part_end;
+    }
+    return true;
+}
+
+template<
+    class T1, boost::compute::distributed::weight_func weight1, class Alloc1,
+    class T2, boost::compute::distributed::weight_func weight2, class Alloc2
+>
+inline bool
+distributed_equal(const boost::compute::distributed::vector<T1, weight1, Alloc1> &input1,
+                  const boost::compute::distributed::vector<T2, weight2, Alloc2> &input2,
+                  boost::compute::distributed::command_queue &queue1,
+                  boost::compute::distributed::command_queue &queue2)
+{
+    if(input1.parts() != input2.parts()) {
+        return false;
+    }
+    if(input1.size() != input2.size()) {
+        return false;
+    }
+
+    std::vector<T2> host1(input1.size());
+    boost::compute::distributed::copy(input1, host1.begin(), queue1);
+    return distributed_equal(input2, host1.begin(), host1.end(), queue2);
+}
+
+#endif /* BOOST_COMPUTE_TEST_TEST_CHECK_FUNCTIONS_HPP */
diff --git a/test/distributed_queue_setup.hpp b/test/distributed_queue_setup.hpp
new file mode 100644
index 000000000..062ba1698
--- /dev/null
+++ b/test/distributed_queue_setup.hpp
@@ -0,0 +1,45 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2016 Jakub Szuppe <j.szuppe@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#ifndef BOOST_COMPUTE_TEST_DISTRIBUTED_QUEUE_SETUP_HPP
+#define BOOST_COMPUTE_TEST_DISTRIBUTED_QUEUE_SETUP_HPP
+
+#include <boost/compute/distributed/command_queue.hpp>
+
+inline boost::compute::distributed::command_queue
+get_distributed_queue(boost::compute::command_queue& queue,
+                      size_t n = 1,
+                      bool same_context = false)
+{
+    std::vector<boost::compute::command_queue> queues;
+    queues.push_back(queue);
+    for(size_t i = 0; i < n; i++) {
+        if(same_context) {
+            queues.push_back(
+                boost::compute::command_queue(
+                    queue.get_context(),
+                    queue.get_device()
+                )
+            );
+        }
+        else {
+            queues.push_back(
+                boost::compute::command_queue(
+                    boost::compute::context(queue.get_device()),
+                    queue.get_device()
+                )
+            );
+        }
+    }
+
+    return boost::compute::distributed::command_queue(queues);
+}
+
+#endif /* BOOST_COMPUTE_TEST_DISTRIBUTED_QUEUE_SETUP_HPP */
diff --git a/test/test_distributed_command_queue.cpp b/test/test_distributed_command_queue.cpp
new file mode 100644
index 000000000..d17fc3452
--- /dev/null
+++ b/test/test_distributed_command_queue.cpp
@@ -0,0 +1,193 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2016 Jakub Szuppe <j.szuppe@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#define BOOST_TEST_MODULE TestDistributedCommandQueue
+#include <boost/test/unit_test.hpp>
+
+#include <boost/compute/algorithm.hpp>
+#include <boost/compute/system.hpp>
+#include <boost/compute/context.hpp>
+#include <boost/compute/container/vector.hpp>
+#include <boost/compute/distributed/context.hpp>
+#include <boost/compute/distributed/command_queue.hpp>
+
+#include "check_macros.hpp"
+#include "context_setup.hpp"
+
+namespace bc = boost::compute;
+
+BOOST_AUTO_TEST_CASE(construct_from_distributed_context)
+{
+    std::vector<bc::context> contexts;
+    contexts.push_back(context);
+
+    bc::distributed::context distributed_context(contexts);
+    bc::distributed::command_queue distributed_queue1(distributed_context);
+    BOOST_CHECK_EQUAL(
+        distributed_queue1.size(),
+        distributed_context.get_devices(0).size()
+    );
+
+    bc::distributed::command_queue distributed_queue2(
+        distributed_context, bc::distributed::command_queue::enable_profiling
+    );
+    BOOST_CHECK_EQUAL(
+        distributed_queue2.size(),
+        distributed_context.get_devices(0).size()
+    );
+    BOOST_CHECK_EQUAL(
+        distributed_queue2
+            .get_info<cl_command_queue_properties>(0, CL_QUEUE_PROPERTIES),
+        bc::distributed::command_queue::enable_profiling
+    );
+}
+
+BOOST_AUTO_TEST_CASE(construct_from_contexts)
+{
+    std::vector<bc::context> contexts;
+    std::vector<std::vector<bc::device> > devices;
+
+    contexts.push_back(context);
+    devices.push_back(std::vector<bc::device>());
+    devices[0].push_back(device);
+
+    bc::distributed::command_queue distributed_queue1(contexts, devices);
+    BOOST_CHECK_EQUAL(
+        distributed_queue1.size(),
+        1
+    );
+
+    devices[0].push_back(device);
+
+    bc::distributed::command_queue distributed_queue2(
+        contexts, devices, bc::distributed::command_queue::enable_profiling
+    );
+    BOOST_CHECK_EQUAL(
+        distributed_queue2.size(),
+        2
+    );
+    BOOST_CHECK_EQUAL(
+        distributed_queue2
+            .get_info<cl_command_queue_properties>(0, CL_QUEUE_PROPERTIES),
+        bc::distributed::command_queue::enable_profiling
+    );
+}
+
+BOOST_AUTO_TEST_CASE(construct_from_context)
+{
+    bc::distributed::command_queue distributed_queue(context);
+    BOOST_CHECK_EQUAL(
+        distributed_queue.size(),
+        context.get_devices().size()
+    );
+    for(size_t i = 0; i < distributed_queue.size(); i++) {
+        BOOST_CHECK_EQUAL(
+            distributed_queue.get_context(i),
+            context
+        );
+    }
+}
+
+BOOST_AUTO_TEST_CASE(construct_from_command_queues)
+{
+    std::vector<bc::command_queue> queues;
+    queues.push_back(queue);
+
+    bc::distributed::command_queue distributed_queue1(queues);
+    BOOST_CHECK_EQUAL(distributed_queue1.size(), 1);
+
+    queues.push_back(queue);
+    bc::distributed::command_queue distributed_queue2(queues);
+    BOOST_CHECK_EQUAL(distributed_queue2.size(), 2);
+}
+
+BOOST_AUTO_TEST_CASE(get_context)
+{
+    std::vector<bc::context> contexts;
+    contexts.push_back(context);
+    contexts.push_back(context);
+
+    bc::distributed::context distributed_context(contexts);
+    bc::distributed::command_queue distributed_queue(distributed_context);
+
+    BOOST_CHECK(
+        distributed_queue.get_context() == distributed_context
+    );
+    BOOST_CHECK(
+        distributed_queue.get_context(0) == context
+    );
+    BOOST_CHECK(
+        distributed_queue.get_context(1) == context
+    );
+}
+
+BOOST_AUTO_TEST_CASE(get_devices)
+{
+    std::vector<bc::context> contexts;
+    contexts.push_back(context);
+    contexts.push_back(context);
+
+    bc::distributed::context distributed_context(contexts);
+    bc::distributed::command_queue distributed_queue(distributed_context);
+
+    BOOST_CHECK(
+        distributed_queue.get_device(0) == context.get_device()
+    );
+    BOOST_CHECK(
+        distributed_queue.get_device(1) == context.get_device()
+    );
+}
+
+BOOST_AUTO_TEST_CASE(get_command_queue)
+{
+    std::vector<bc::command_queue> queues;
+    queues.push_back(queue);
+    queues.push_back(queue);
+
+    bc::distributed::command_queue distributed_queue(queues);
+
+    BOOST_CHECK(
+        distributed_queue.get(0) == queue
+    );
+    BOOST_CHECK(
+        distributed_queue.get(1) == queue
+    );
+}
+
+BOOST_AUTO_TEST_CASE(enqueue_kernel)
+{
+    std::vector<bc::command_queue> queues;
+    queues.push_back(queue);
+    queues.push_back(queue);
+
+    bc::distributed::command_queue distributed_queue(queues);
+
+    const char source[] = BOOST_COMPUTE_STRINGIZE_SOURCE(
+        __kernel void foo(__global int *output)
+        {
+            output[get_global_id(0)] = get_global_id(0);
+        }
+    );
+
+    for(size_t i = 0; i < distributed_queue.size(); i++)
+    {
+        bc::command_queue& queue = distributed_queue.get(0);
+        bc::vector<bc::uint_> output(8, queue.get_context());
+
+        bc::kernel kernel =
+            bc::kernel::create_with_source(source, "foo", queue.get_context());
+
+        kernel.set_arg(0, output);
+        queue.enqueue_1d_range_kernel(kernel, 0, output.size(), 0);
+        CHECK_RANGE_EQUAL(bc::uint_, 8, output, (0, 1, 2, 3, 4, 5, 6, 7));
+    }
+}
+
+BOOST_AUTO_TEST_SUITE_END()
diff --git a/test/test_distributed_context.cpp b/test/test_distributed_context.cpp
new file mode 100644
index 000000000..792d5d361
--- /dev/null
+++ b/test/test_distributed_context.cpp
@@ -0,0 +1,155 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2016 Jakub Szuppe <j.szuppe@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#define BOOST_TEST_MODULE TestDistributedContext
+#include <boost/test/unit_test.hpp>
+
+#include <boost/compute/algorithm.hpp>
+#include <boost/compute/system.hpp>
+#include <boost/compute/context.hpp>
+#include <boost/compute/container/vector.hpp>
+#include <boost/compute/distributed/context.hpp>
+
+#include "check_macros.hpp"
+#include "context_setup.hpp"
+
+namespace bc = boost::compute;
+
+BOOST_AUTO_TEST_CASE(construct_from_devices)
+{
+    std::vector<std::vector<bc::device> > all_devices;
+
+    const std::vector<bc::platform> &platforms = bc::system::platforms();
+    for(size_t i = 0; i < platforms.size(); i++){
+        const bc::platform &platform = platforms[i];
+
+        std::vector<bc::device> platform_devices = platform.devices();
+        std::vector<cl_context_properties*> properties(platform_devices.size(), 0);
+
+        // create a distributed context for devices in current platform
+        bc::distributed::context ctx1(platform_devices);
+        bc::distributed::context ctx2(platform_devices, properties);
+
+        // check context count
+        BOOST_CHECK_EQUAL(ctx1.size(), platform_devices.size());
+        BOOST_CHECK_EQUAL(ctx2.size(), platform_devices.size());
+
+        all_devices.push_back(platform_devices);
+    }
+
+    // create a distributed context for devices in current platform
+    std::vector<cl_context_properties*> properties(all_devices.size(), 0);
+    bc::distributed::context ctx(all_devices, properties);
+    BOOST_CHECK_EQUAL(ctx.size(), all_devices.size());
+}
+
+BOOST_AUTO_TEST_CASE(construct_from_contexts)
+{
+    std::vector<bc::context> contexts;
+
+    const std::vector<bc::platform> &platforms = bc::system::platforms();
+    for(size_t i = 0; i < platforms.size(); i++){
+        const bc::platform &platform = platforms[i];
+
+        // create a context for containing all devices in the platform
+        bc::context ctx(platform.devices());
+        contexts.push_back(ctx);
+    }
+
+    bc::distributed::context ctx1(contexts);
+    bc::distributed::context ctx2(contexts.begin(), contexts.end());
+
+    BOOST_CHECK_EQUAL(ctx1.size(), contexts.size());
+    BOOST_CHECK_EQUAL(ctx2.size(), contexts.size());
+    for(size_t i = 0; i < contexts.size(); i++) {
+        BOOST_CHECK_EQUAL(ctx1.get(i), contexts[i]);
+        BOOST_CHECK_EQUAL(ctx2.get(i), contexts[i]);
+    }
+}
+
+BOOST_AUTO_TEST_CASE(construct_from_context)
+{
+    bc::distributed::context ctx(context);
+    BOOST_CHECK_EQUAL(ctx.size(), 1);
+    BOOST_CHECK_EQUAL(ctx.get(0), context);
+}
+
+BOOST_AUTO_TEST_CASE(copy_ctor)
+{
+    std::vector<bc::context> contexts;
+    contexts.push_back(context);
+    bc::distributed::context distributed_context1(contexts);
+    bc::distributed::context distributed_context2(distributed_context1);
+    BOOST_CHECK(
+        distributed_context1 == distributed_context2
+    );
+}
+
+BOOST_AUTO_TEST_CASE(assign_operator)
+{
+    std::vector<bc::context> contexts;
+    contexts.push_back(context);
+    bc::distributed::context distributed_context1(contexts);
+    bc::distributed::context distributed_context2 = distributed_context1;
+    BOOST_CHECK(
+        distributed_context1 == distributed_context2
+    );
+}
+
+BOOST_AUTO_TEST_CASE(equality_operator)
+{
+    std::vector<bc::context> contexts;
+    contexts.push_back(context);
+    bc::distributed::context distributed_context1(contexts);
+    bc::distributed::context distributed_context2(contexts);
+
+    contexts.push_back(
+        bc::context(queue.get_device())
+    );
+    bc::distributed::context distributed_context3(contexts);
+
+    BOOST_CHECK(distributed_context1 == distributed_context2);
+    BOOST_CHECK(distributed_context2 == distributed_context1);
+
+    BOOST_CHECK(distributed_context1 != distributed_context3);
+    BOOST_CHECK(distributed_context3 != distributed_context1);
+
+    BOOST_CHECK(distributed_context2 != distributed_context3);
+    BOOST_CHECK(distributed_context3 != distributed_context2);
+}
+
+BOOST_AUTO_TEST_CASE(get_info)
+{
+    std::vector<bc::context> contexts;
+    contexts.push_back(context);
+    bc::distributed::context distributed_context(contexts);
+
+    BOOST_CHECK(
+        distributed_context.get_info<std::vector<cl_device_id> >(0, CL_CONTEXT_DEVICES)
+            == context.get_info<std::vector<cl_device_id> >(CL_CONTEXT_DEVICES)
+    );
+}
+
+BOOST_AUTO_TEST_CASE(get_context)
+{
+    std::vector<bc::context> contexts;
+    contexts.push_back(context);
+    contexts.push_back(context);
+    contexts.push_back(bc::context(context.get_device()));
+    bc::distributed::context distributed_context(contexts);
+
+    BOOST_CHECK_EQUAL(distributed_context.get(0), context);
+    BOOST_CHECK_EQUAL(distributed_context.get(1), context);
+    for(size_t i = 0; i < distributed_context.size(); i++) {
+        BOOST_CHECK_EQUAL(distributed_context.get(i), contexts[i]);
+    }
+}
+
+BOOST_AUTO_TEST_SUITE_END()
diff --git a/test/test_distributed_copy.cpp b/test/test_distributed_copy.cpp
new file mode 100644
index 000000000..a00aaea47
--- /dev/null
+++ b/test/test_distributed_copy.cpp
@@ -0,0 +1,519 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2016 Jakub Szuppe <j.szuppe@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#define BOOST_TEST_MODULE TestDistributedCopy
+#include <boost/test/unit_test.hpp>
+
+#include <boost/compute/algorithm.hpp>
+#include <boost/compute/system.hpp>
+#include <boost/compute/context.hpp>
+#include <boost/compute/allocator/pinned_allocator.hpp>
+#include <boost/compute/container/vector.hpp>
+
+#include <boost/compute/distributed/context.hpp>
+#include <boost/compute/distributed/command_queue.hpp>
+#include <boost/compute/distributed/vector.hpp>
+#include <boost/compute/distributed/copy.hpp>
+
+#include "check_macros.hpp"
+#include "context_setup.hpp"
+
+#include "distributed_check_functions.hpp"
+#include "distributed_queue_setup.hpp"
+
+namespace bc = boost::compute;
+
+BOOST_AUTO_TEST_CASE(copy_from_host)
+{
+    // construct distributed::command_queue
+    bc::distributed::command_queue distributed_queue =
+        get_distributed_queue(queue);
+
+    size_t size = 64;
+    bc::int_ value = -1;
+    bc::distributed::vector<bc::int_> distributed_vector(
+        size, value, distributed_queue
+    );
+    distributed_queue.finish();
+
+    std::vector<bc::int_> host(size_t(size), bc::int_(1000));
+    host[size - 1] = -10;
+    host[0] = -20;
+    host[size/2] = -30;
+
+    // empty copy
+    bc::distributed::copy(
+        host.begin(), host.begin(), distributed_vector, distributed_queue
+    );
+    BOOST_CHECK(distributed_equal(distributed_vector, value, distributed_queue));
+
+    // full copy
+    bc::distributed::copy(
+        host.begin(), host.end(), distributed_vector, distributed_queue
+    );
+    BOOST_CHECK(
+        distributed_equal(
+            distributed_vector,
+            host.begin(), host.end(),
+            distributed_queue
+        )
+    );
+}
+
+BOOST_AUTO_TEST_CASE(copy_async_from_host)
+{
+    // construct distributed::command_queue
+    bc::distributed::command_queue distributed_queue =
+        get_distributed_queue(queue);
+
+    size_t size = 64;
+    bc::int_ value = -1;
+    bc::distributed::vector<bc::int_> distributed_vector(
+        size, value, distributed_queue
+    );
+    distributed_queue.finish();
+
+    std::vector<bc::int_> host(size_t(size), bc::int_(1000));
+    host[size - 1] = -10;
+    host[0] = -20;
+    host[size/2] = -30;
+
+    // empty copy
+    std::vector<boost::compute::event> events =
+        bc::distributed::copy_async(
+            host.begin(), host.begin(), distributed_vector, distributed_queue
+        );
+    for(size_t i = 0; i < events.size(); i++) {
+        events[i].wait();
+    }
+    BOOST_CHECK(distributed_equal(distributed_vector, value, distributed_queue));
+
+    // full copy
+    events =
+        bc::distributed::copy_async(
+            host.begin(), host.end(), distributed_vector, distributed_queue
+        );
+    for(size_t i = 0; i < events.size(); i++) {
+        events[i].wait();
+    }
+    BOOST_CHECK(
+        distributed_equal(
+            distributed_vector,
+            host.begin(), host.end(),
+            distributed_queue
+        )
+    );
+}
+
+BOOST_AUTO_TEST_CASE(copy_to_host)
+{
+    // construct distributed::command_queue
+    bc::distributed::command_queue distributed_queue =
+        get_distributed_queue(queue);
+
+    size_t size = 64;
+    bc::int_ value = -1;
+    bc::distributed::vector<bc::int_> distributed_vector(
+        size, value, distributed_queue
+    );
+    distributed_queue.finish();
+
+    std::vector<bc::int_> host(size, bc::int_(1000));
+    bc::distributed::copy(
+        distributed_vector, host.begin(), distributed_queue
+    );
+    for(size_t i = 0; i < host.size(); i++) {
+        BOOST_CHECK_EQUAL(host[i], value);
+    }
+}
+
+BOOST_AUTO_TEST_CASE(copy_async_to_host)
+{
+    // construct distributed::command_queue
+    bc::distributed::command_queue distributed_queue =
+        get_distributed_queue(queue);
+
+    size_t size = 64;
+    bc::int_ value = -1;
+    bc::distributed::vector<bc::int_> distributed_vector(
+        size, value, distributed_queue
+    );
+    distributed_queue.finish();
+
+    std::vector<bc::int_> host(size, bc::int_(1000));
+    std::vector<boost::compute::event> events =
+        bc::distributed::copy_async(
+            distributed_vector, host.begin(), distributed_queue
+        );
+    // wait for copy
+    for(size_t i = 0; i < events.size(); i++) {
+        events[i].wait();
+    }
+    // check values
+    for(size_t i = 0; i < host.size(); i++) {
+        BOOST_CHECK_EQUAL(host[i], value);
+    }
+}
+
+BOOST_AUTO_TEST_CASE(copy_from_vector_to_vector)
+{
+    // construct distributed::command_queue
+    bc::distributed::command_queue distributed_queue =
+        get_distributed_queue(queue);
+
+    size_t size = 64;
+    bc::int_ value1 = -1;
+    bc::int_ value2 = 1;
+    bc::distributed::vector<bc::int_> distributed_vector1(
+        size, value1, distributed_queue
+    );
+    bc::distributed::vector<bc::int_> distributed_vector2(
+        size, value2, distributed_queue
+    );
+    distributed_queue.finish();
+
+    BOOST_CHECK(distributed_equal(distributed_vector1, value1, distributed_queue));
+    BOOST_CHECK(distributed_equal(distributed_vector2, value2, distributed_queue));
+
+    bc::distributed::copy(
+        distributed_vector1, distributed_vector2, distributed_queue
+    );
+
+    // check if distributed_vector1 is the same
+    BOOST_CHECK(distributed_equal(distributed_vector1, value1, distributed_queue));
+    // and it was copied into distributed_vector2
+    BOOST_CHECK(
+        distributed_equal(
+            distributed_vector1,
+            distributed_vector2,
+            distributed_queue
+        )
+    );
+
+    // change distributed_vector1
+    distributed_vector1
+        .begin(distributed_vector1.parts() - 1)
+        .write(99, distributed_queue.get(distributed_vector1.parts() - 1));
+    distributed_queue.get(distributed_vector1.parts() - 1).finish();
+
+    // copy once again
+    bc::distributed::copy(
+        distributed_vector1, distributed_vector2, distributed_queue
+    );
+
+    // check
+    BOOST_CHECK(
+        distributed_equal(
+            distributed_vector1,
+            distributed_vector2,
+            distributed_queue
+        )
+    );
+
+    // copy between vectors of different sizes
+    bc::int_ value3 = 12;
+    bc::distributed::vector<bc::int_> distributed_vector3(
+        2 * size, value3, distributed_queue
+    );
+
+    bc::distributed::copy(
+        distributed_vector1, distributed_vector3, distributed_queue
+    );
+
+    // check
+    std::vector<bc::int_> host(distributed_vector3.size(), value3);
+    bc::distributed::copy(
+        distributed_vector1, host.begin(), distributed_queue
+    );
+    BOOST_CHECK(
+        distributed_equal(
+            distributed_vector3,
+            host.begin(), host.end(),
+            distributed_queue
+        )
+    );
+}
+
+BOOST_AUTO_TEST_CASE(copy_async_from_vector_to_vector)
+{
+    // construct distributed::command_queue
+    bc::distributed::command_queue distributed_queue =
+        get_distributed_queue(queue);
+
+    size_t size = 64;
+    bc::int_ value1 = -1;
+    bc::int_ value2 = 1;
+    bc::distributed::vector<bc::int_> distributed_vector1(
+        size, value1, distributed_queue
+    );
+    bc::distributed::vector<bc::int_> distributed_vector2(
+        size, value2, distributed_queue
+    );
+    distributed_queue.finish();
+
+    BOOST_CHECK(distributed_equal(distributed_vector1, value1, distributed_queue));
+    BOOST_CHECK(distributed_equal(distributed_vector2, value2, distributed_queue));
+
+    std::vector<boost::compute::event> events = bc::distributed::copy_async(
+        distributed_vector1, distributed_vector2, distributed_queue
+    );
+    // wait for copy
+    for(size_t i = 0; i < events.size(); i++) {
+        events[i].wait();
+    }
+
+    // check if distributed_vector1 is the same
+    BOOST_CHECK(distributed_equal(distributed_vector1, value1, distributed_queue));
+    // and it was copied into distributed_vector2
+    BOOST_CHECK(
+        distributed_equal(
+            distributed_vector1,
+            distributed_vector2,
+            distributed_queue
+        )
+    );
+
+    // change distributed_vector1
+    distributed_vector1
+        .begin(distributed_vector1.parts() - 1)
+        .write(99, distributed_queue.get(distributed_vector1.parts() - 1));
+    distributed_queue.get(distributed_vector1.parts() - 1).finish();
+
+    // copy once again
+    events = bc::distributed::copy_async(
+        distributed_vector1, distributed_vector2, distributed_queue
+    );
+    // wait for copy
+    for(size_t i = 0; i < events.size(); i++) {
+        events[i].wait();
+    }
+
+    // check
+    BOOST_CHECK(
+        distributed_equal(
+            distributed_vector1,
+            distributed_vector2,
+            distributed_queue
+        )
+    );
+}
+
+std::vector<double> custom_weight_func(const bc::distributed::command_queue& queue)
+{
+    return std::vector<double>(queue.size(), 1.0/queue.size());
+}
+
+BOOST_AUTO_TEST_CASE(copy_from_vector_to_vector_different_vectors)
+{
+    // construct distributed::command_queue
+    bc::distributed::command_queue distributed_queue =
+        get_distributed_queue(queue);
+
+    size_t size = 64;
+    bc::int_ value1 = -1;
+    bc::long_ value2 = 1;
+    bc::distributed::vector<bc::int_> distributed_vector1(
+        size, value1, distributed_queue
+    );
+    bc::distributed::vector<bc::long_, custom_weight_func> distributed_vector2(
+        size, value2, distributed_queue
+    );
+    distributed_queue.finish();
+
+    BOOST_CHECK(distributed_equal(distributed_vector1, value1, distributed_queue));
+    BOOST_CHECK(distributed_equal(distributed_vector2, value2, distributed_queue));
+
+    bc::distributed::copy(
+        distributed_vector1, distributed_vector2, distributed_queue, distributed_queue
+    );
+
+    // check if distributed_vector1 is the same
+    BOOST_CHECK(distributed_equal(distributed_vector1, value1, distributed_queue));
+    // and it was copied into distributed_vector2
+    BOOST_CHECK(
+        distributed_equal(
+            distributed_vector1,
+            distributed_vector2,
+            distributed_queue,
+            distributed_queue
+        )
+    );
+
+    // change distributed_vector1
+    distributed_vector1
+        .begin(distributed_vector1.parts() - 1)
+        .write(99, distributed_queue.get(distributed_vector1.parts() - 1));
+    distributed_queue.get(distributed_vector1.parts() - 1).finish();
+
+    // copy once again
+    bc::distributed::copy(
+        distributed_vector1, distributed_vector2, distributed_queue, distributed_queue
+    );
+
+    // check
+    BOOST_CHECK(
+        distributed_equal(
+            distributed_vector1,
+            distributed_vector2,
+            distributed_queue,
+            distributed_queue
+        )
+    );
+}
+
+BOOST_AUTO_TEST_CASE(copy_device_to_vector)
+{
+    // construct distributed::command_queue
+    bc::distributed::command_queue distributed_queue =
+        get_distributed_queue(queue);
+
+    size_t size = 64;
+    bc::int_ value1 = -1;
+    bc::long_ value2 = 1;
+    bc::vector<bc::int_> device_vector(size, value1, queue);
+    bc::distributed::vector<bc::long_> distributed_vector(
+        size, value2, distributed_queue
+    );
+
+    bc::distributed::copy(
+        device_vector.begin(), device_vector.end(), distributed_vector,
+        queue, distributed_queue
+    );
+
+    BOOST_CHECK(
+        distributed_equal(
+            distributed_vector, bc::long_(value1), distributed_queue
+        )
+    );
+}
+
+BOOST_AUTO_TEST_CASE(copy_async_device_to_vector)
+{
+    // construct distributed::command_queue
+    bc::distributed::command_queue distributed_queue =
+        get_distributed_queue(queue, 2, true);
+
+    size_t size = 64;
+    bc::int_ value1 = -1;
+    bc::long_ value2 = 1;
+    bc::vector<bc::int_> device_vector(size, value1, queue);
+    bc::distributed::vector<bc::long_> distributed_vector(
+        size, value2, distributed_queue
+    );
+
+    std::vector<bc::event> events =
+        bc::distributed::copy_async(
+            device_vector.begin(), device_vector.end(), distributed_vector,
+            queue, distributed_queue
+        );
+    // wait for copy
+    for(size_t i = 0; i < events.size(); i++) {
+        events[i].wait();
+    }
+
+    BOOST_CHECK(
+        distributed_equal(
+            distributed_vector, bc::long_(value1), distributed_queue
+        )
+    );
+}
+
+BOOST_AUTO_TEST_CASE(copy_vector_to_device)
+{
+    // construct distributed::command_queue
+    bc::distributed::command_queue distributed_queue =
+        get_distributed_queue(queue);
+
+    size_t size = 64;
+    bc::int_ value1 = -1;
+    bc::long_ value2 = 1;
+    bc::vector<bc::int_> device_vector(size, value1, queue);
+    bc::distributed::vector<bc::long_> distributed_vector(
+        size, value2, distributed_queue
+    );
+
+    BOOST_CHECK(
+        distributed_equal(
+            distributed_vector, value2, distributed_queue
+        )
+    );
+    BOOST_CHECK(
+        boost::compute::equal(
+            device_vector.begin(),
+            device_vector.end(),
+            boost::compute::make_constant_iterator(value1),
+            queue
+        )
+    );
+
+    bc::distributed::copy(
+        distributed_vector, device_vector.begin(),
+        distributed_queue, queue
+    );
+
+    BOOST_CHECK(
+        boost::compute::equal(
+            device_vector.begin(),
+            device_vector.end(),
+            boost::compute::make_constant_iterator(static_cast<bc::int_>(value2)),
+            queue
+        )
+    );
+}
+
+BOOST_AUTO_TEST_CASE(copy_async_vector_to_device)
+{
+    // construct distributed::command_queue
+    bc::distributed::command_queue distributed_queue =
+        get_distributed_queue(queue, 2, true);
+
+    size_t size = 64;
+    bc::int_ value1 = -1;
+    bc::long_ value2 = 1;
+    bc::vector<bc::int_> device_vector(size, value1, queue);
+    bc::distributed::vector<bc::long_> distributed_vector(
+        size, value2, distributed_queue
+    );
+
+    BOOST_CHECK(
+        distributed_equal(
+            distributed_vector, value2, distributed_queue
+        )
+    );
+    BOOST_CHECK(
+        boost::compute::equal(
+            device_vector.begin(),
+            device_vector.end(),
+            boost::compute::make_constant_iterator(value1),
+            queue
+        )
+    );
+
+    std::vector<bc::event> events =
+        bc::distributed::copy_async(
+            distributed_vector, device_vector.begin(),
+            distributed_queue, queue
+        );
+    // wait for copy
+    for(size_t i = 0; i < events.size(); i++) {
+        events[i].wait();
+    }
+
+    BOOST_CHECK(
+        boost::compute::equal(
+            device_vector.begin(),
+            device_vector.end(),
+            boost::compute::make_constant_iterator(static_cast<bc::int_>(value2)),
+            queue
+        )
+    );
+}
+
+BOOST_AUTO_TEST_SUITE_END()
diff --git a/test/test_distributed_reduce.cpp b/test/test_distributed_reduce.cpp
new file mode 100644
index 000000000..1f361c867
--- /dev/null
+++ b/test/test_distributed_reduce.cpp
@@ -0,0 +1,175 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2016 Jakub Szuppe <j.szuppe@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#define BOOST_TEST_MODULE TestDistributedReduce
+#include <boost/test/unit_test.hpp>
+
+#include <algorithm>
+
+#include <boost/compute/algorithm.hpp>
+#include <boost/compute/functional.hpp>
+#include <boost/compute/algorithm.hpp>
+#include <boost/compute/function.hpp>
+#include <boost/compute/container/vector.hpp>
+
+#include <boost/compute/distributed/context.hpp>
+#include <boost/compute/distributed/command_queue.hpp>
+#include <boost/compute/distributed/vector.hpp>
+#include <boost/compute/distributed/reduce.hpp>
+#include <boost/compute/distributed/copy.hpp>
+
+#include "check_macros.hpp"
+#include "context_setup.hpp"
+
+#include "distributed_check_functions.hpp"
+#include "distributed_queue_setup.hpp"
+
+namespace bc = boost::compute;
+
+BOOST_AUTO_TEST_CASE(reduce_int_to_host)
+{
+    // construct distributed::command_queue
+    bc::distributed::command_queue distributed_queue =
+        get_distributed_queue(queue);
+
+    bc::int_ data[] = { 5, 1, 9, 17, 13 };
+    bc::distributed::vector<bc::int_> distributed_vector(
+        data, data + 5, distributed_queue
+    );
+    distributed_queue.finish();
+
+    bc::int_ sum;
+    bc::distributed::reduce(
+        distributed_vector,
+        &sum,
+        bc::plus<bc::int_>(),
+        distributed_queue
+    );
+    BOOST_CHECK_EQUAL(sum, 45);
+
+    bc::int_ product;
+    bc::distributed::reduce(
+        distributed_vector,
+        &product,
+        bc::multiplies<bc::int_>(),
+        distributed_queue
+    );
+    BOOST_CHECK_EQUAL(product, 9945);
+
+    bc::int_ min_value;
+    bc::distributed::reduce(
+        distributed_vector,
+        &min_value,
+        bc::min<bc::int_>(),
+        distributed_queue
+    );
+    BOOST_CHECK_EQUAL(min_value, 1);
+
+    bc::int_ max_value;
+    bc::distributed::reduce(
+        distributed_vector,
+        &max_value,
+        bc::max<bc::int_>(),
+        distributed_queue
+    );
+    BOOST_CHECK_EQUAL(max_value, 17);
+}
+
+BOOST_AUTO_TEST_CASE(reduce_int_to_device)
+{
+    // construct distributed::command_queue
+    bc::distributed::command_queue distributed_queue =
+        get_distributed_queue(queue);
+
+    bc::int_ data[] = { 1, 5, 9, 13, 17 };
+    bc::distributed::vector<bc::int_> distributed_vector(
+        data, data + 5, distributed_queue
+    );
+    distributed_queue.finish();
+
+    bc::vector<bc::int_> result1(1, distributed_queue.get_context(0));
+    bc::distributed::reduce(
+        distributed_vector,
+        result1.begin(),
+        bc::plus<bc::int_>(),
+        distributed_queue
+    );
+    BOOST_CHECK_EQUAL(result1.begin().read(queue), 45);
+
+    bc::vector<bc::int_> result2(1, distributed_queue.get_context(1));
+    bc::distributed::reduce(
+        distributed_vector,
+        result2.begin(),
+        bc::multiplies<bc::int_>(),
+        distributed_queue
+    );
+    BOOST_CHECK_EQUAL(result2.begin().read(distributed_queue.get(1)), 9945);
+}
+
+BOOST_AUTO_TEST_CASE(reduce_int_to_device_one_context)
+{
+    // construct distributed::command_queue
+    bc::distributed::command_queue distributed_queue =
+        get_distributed_queue(queue, 1, true);
+
+    bc::distributed::vector<bc::int_> distributed_vector(
+        size_t(1024), bc::int_(1), distributed_queue
+    );
+    distributed_vector[0] = 2;
+    distributed_queue.finish();
+
+    bc::vector<bc::int_> result1(1, context);
+    bc::distributed::reduce(
+        distributed_vector,
+        result1.begin(),
+        bc::plus<bc::int_>(),
+        distributed_queue
+    );
+    BOOST_CHECK_EQUAL(result1.begin().read(queue), 1025);
+
+    bc::vector<bc::int_> result2(1, context);
+    bc::distributed::reduce(
+        distributed_vector,
+        result2.begin(),
+        bc::multiplies<bc::int_>(),
+        distributed_queue
+    );
+    distributed_queue.finish();
+    BOOST_CHECK_EQUAL(result2.begin().read(distributed_queue.get(0)), 2);
+}
+
+BOOST_AUTO_TEST_CASE(reduce_int_custom_function)
+{
+    // construct distributed::command_queue
+    bc::distributed::command_queue distributed_queue =
+        get_distributed_queue(queue);
+
+    bc::distributed::vector<bc::int_> distributed_vector(
+        size_t(34), bc::int_(2), distributed_queue
+    );
+    distributed_queue.finish();
+
+    BOOST_COMPUTE_FUNCTION(bc::float_, custom_sum, (bc::float_ x, bc::float_ y),
+    {
+        return x + y;
+    });
+
+
+    bc::float_ sum;
+    bc::distributed::reduce(
+        distributed_vector,
+        &sum,
+        custom_sum,
+        distributed_queue
+    );
+    BOOST_CHECK_CLOSE(sum, bc::float_(68), 0.01);
+}
+
+BOOST_AUTO_TEST_SUITE_END()
diff --git a/test/test_distributed_scan.cpp b/test/test_distributed_scan.cpp
new file mode 100644
index 000000000..4c6e423f8
--- /dev/null
+++ b/test/test_distributed_scan.cpp
@@ -0,0 +1,169 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2016 Jakub Szuppe <j.szuppe@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#define BOOST_TEST_MODULE TestDistributedScan
+#include <boost/test/unit_test.hpp>
+
+#include <algorithm>
+
+#include <boost/compute/algorithm.hpp>
+#include <boost/compute/functional.hpp>
+#include <boost/compute/algorithm.hpp>
+#include <boost/compute/function.hpp>
+#include <boost/compute/container/vector.hpp>
+
+#include <boost/compute/distributed/context.hpp>
+#include <boost/compute/distributed/command_queue.hpp>
+#include <boost/compute/distributed/vector.hpp>
+#include <boost/compute/distributed/exclusive_scan.hpp>
+#include <boost/compute/distributed/copy.hpp>
+
+#include "check_macros.hpp"
+#include "context_setup.hpp"
+
+#include "distributed_check_functions.hpp"
+#include "distributed_queue_setup.hpp"
+
+namespace bc = boost::compute;
+
+BOOST_AUTO_TEST_CASE(exclusive_scan_int)
+{
+    // construct distributed::command_queue
+    bc::distributed::command_queue distributed_queue =
+        get_distributed_queue(queue, 4);
+
+    std::vector<bc::int_> data(size_t(128));
+    for(size_t i = 0; i < data.size(); i++) {
+        data[i] = i;
+    }
+
+    bc::distributed::vector<bc::int_> distributed_input(
+        data.begin(), data.end(), distributed_queue
+    );
+    bc::distributed::vector<bc::int_> distributed_result(
+        data.size(), distributed_queue
+    );
+    distributed_queue.finish();
+
+    BOOST_CHECK(
+        distributed_equal(
+            distributed_input,
+            data.begin(), data.end(),
+            distributed_queue
+        )
+    );
+
+    bc::distributed::exclusive_scan(
+        distributed_input,
+        distributed_result,
+        bc::int_(10),
+        distributed_queue
+    );
+    distributed_queue.finish();
+
+    bc::vector<bc::int_> device_input(data.begin(), data.end(), queue);
+    bc::vector<bc::int_> device_expected(data.size(), context);
+    std::vector<bc::int_> host_expected(device_expected.size());
+    bc::exclusive_scan(
+            device_input.begin(),
+            device_input.end(),
+            device_expected.begin(),
+            bc::int_(10),
+            queue
+    );
+    bc::copy(
+        device_expected.begin(),
+        device_expected.end(),
+        host_expected.begin(),
+        queue
+    );
+    queue.finish();
+
+    BOOST_CHECK(
+        distributed_equal(
+            distributed_input,
+            data.begin(), data.end(),
+            distributed_queue
+        )
+    );
+    BOOST_CHECK(
+        distributed_equal(
+            distributed_result,
+            host_expected.begin(), host_expected.end(),
+            distributed_queue
+        )
+    );
+}
+
+BOOST_AUTO_TEST_CASE(exclusive_scan_custom_function_int)
+{
+    // construct distributed::command_queue
+    bc::distributed::command_queue distributed_queue =
+        get_distributed_queue(queue, 3);
+
+    BOOST_COMPUTE_FUNCTION(bc::int_, custom_sum, (bc::int_ x, bc::int_ y),
+    {
+        return x + y;
+    });
+
+    std::vector<bc::int_> data(size_t(128));
+    for(size_t i = 0; i < data.size(); i++) {
+        data[i] = i;
+    }
+
+    bc::distributed::vector<bc::int_> distributed_input(
+        data.begin(), data.end(), distributed_queue
+    );
+    distributed_queue.finish();
+
+    BOOST_CHECK(
+        distributed_equal(
+            distributed_input,
+            data.begin(), data.end(),
+            distributed_queue
+        )
+    );
+
+    bc::distributed::exclusive_scan(
+        distributed_input,
+        distributed_input,
+        bc::int_(10),
+        custom_sum,
+        distributed_queue
+    );
+    distributed_queue.finish();
+
+    bc::vector<bc::int_> device_input(data.begin(), data.end(), queue);
+    bc::vector<bc::int_> device_expected(data.size(), context);
+    std::vector<bc::int_> host_expected(device_expected.size());
+    bc::exclusive_scan(
+            device_input.begin(),
+            device_input.end(),
+            device_expected.begin(),
+            bc::int_(10),
+            queue
+    );
+    bc::copy(
+        device_expected.begin(),
+        device_expected.end(),
+        host_expected.begin(),
+        queue
+    );
+    queue.finish();
+    BOOST_CHECK(
+        distributed_equal(
+            distributed_input,
+            host_expected.begin(), host_expected.end(),
+            distributed_queue
+        )
+    );
+}
+
+BOOST_AUTO_TEST_SUITE_END()
diff --git a/test/test_distributed_transform.cpp b/test/test_distributed_transform.cpp
new file mode 100644
index 000000000..a275d4e14
--- /dev/null
+++ b/test/test_distributed_transform.cpp
@@ -0,0 +1,233 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2016 Jakub Szuppe <j.szuppe@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#define BOOST_TEST_MODULE TestDistributedTransform
+#include <boost/test/unit_test.hpp>
+
+#include <algorithm>
+
+#include <boost/compute/algorithm.hpp>
+#include <boost/compute/functional.hpp>
+#include <boost/compute/function.hpp>
+
+#include <boost/compute/distributed/context.hpp>
+#include <boost/compute/distributed/command_queue.hpp>
+#include <boost/compute/distributed/vector.hpp>
+#include <boost/compute/distributed/transform.hpp>
+#include <boost/compute/distributed/copy.hpp>
+
+#include "check_macros.hpp"
+#include "context_setup.hpp"
+
+#include "distributed_check_functions.hpp"
+#include "distributed_queue_setup.hpp"
+
+namespace bc = boost::compute;
+
+BOOST_AUTO_TEST_CASE(transform_async_int_abs)
+{
+    // construct distributed::command_queue
+    bc::distributed::command_queue distributed_queue =
+        get_distributed_queue(queue);
+
+    bc::int_ data[] = { 1, -2, -3, -4, 5 };
+    bc::distributed::vector<bc::int_> distributed_vector(
+        data, data + 5, distributed_queue
+    );
+    distributed_queue.finish();
+
+    std::vector<bc::int_> host(data, data + 5);
+    BOOST_CHECK(
+        distributed_equal(
+            distributed_vector,
+            host.begin(), host.end(),
+            distributed_queue
+        )
+    );
+
+    // transform
+    std::vector<bc::event> events =
+        bc::distributed::transform_async(
+            distributed_vector,
+            distributed_vector,
+            bc::abs<bc::int_>(),
+            distributed_queue
+        );
+    for(size_t i = 0; i < events.size(); i++) {
+        events[i].wait();
+    }
+
+    bc::int_ expected_data[] = { 1, 2, 3, 4, 5 };
+    std::vector<bc::int_> host_expected(expected_data, expected_data + 5);
+    BOOST_CHECK(
+        distributed_equal(
+            distributed_vector,
+            host_expected.begin(), host_expected.end(),
+            distributed_queue
+        )
+    );
+}
+
+BOOST_AUTO_TEST_CASE(transform_float_custom_funtion)
+{
+    // construct distributed::command_queue
+    bc::distributed::command_queue distributed_queue =
+        get_distributed_queue(queue);
+
+    bc::float_ data[] = { 1.0f, 2.0f, 3.0f, 4.0f, 5.0f };
+    bc::distributed::vector<bc::float_> distributed_vector(
+        data, data + 5, distributed_queue
+    );
+    distributed_queue.finish();
+
+    BOOST_COMPUTE_FUNCTION(float, pow3add4, (float x),
+    {
+        return pow(x, 3.0f) + 4.0f;
+    });
+
+    // transform
+    bc::distributed::transform(
+        distributed_vector,
+        distributed_vector,
+        pow3add4,
+        distributed_queue
+    );
+    distributed_queue.finish();
+
+    BOOST_CHECK_CLOSE(bc::float_(distributed_vector[0]), 5.0f, 1e-4f);
+    BOOST_CHECK_CLOSE(bc::float_(distributed_vector[1]), 12.0f, 1e-4f);
+    BOOST_CHECK_CLOSE(bc::float_(distributed_vector[2]), 31.0f, 1e-4f);
+    BOOST_CHECK_CLOSE(bc::float_(distributed_vector[3]), 68.0f, 1e-4f);
+    BOOST_CHECK_CLOSE(bc::float_(distributed_vector[4]), 129.0f, 1e-4f);
+}
+
+BOOST_AUTO_TEST_CASE(transform_async_int_add)
+{
+    // construct distributed::command_queue
+    bc::distributed::command_queue distributed_queue =
+        get_distributed_queue(queue);
+
+    bc::int_ data1[] = { 1, -2, -3, -4, 5 };
+    bc::int_ data2[] = { -1, 2, 3, 4, 0 };
+    std::vector<bc::int_> host1(data1, data1 + 5);
+    std::vector<bc::int_> host2(data2, data2 + 5);
+
+    bc::distributed::vector<bc::int_> distributed_vector1(
+        data1, data1 + 5, distributed_queue
+    );
+    bc::distributed::vector<bc::int_> distributed_vector2(
+        data2, data2 + 5, distributed_queue
+    );
+    bc::distributed::vector<bc::int_> distributed_vector3(
+        size_t(5), distributed_queue
+    );
+    distributed_queue.finish();
+
+    // add
+    std::vector<bc::event> events =
+        bc::distributed::transform_async(
+            distributed_vector1,
+            distributed_vector2,
+            distributed_vector3,
+            bc::plus<bc::int_>(),
+            distributed_queue
+        );
+    for(size_t i = 0; i < events.size(); i++) {
+        events[i].wait();
+    }
+
+    BOOST_CHECK(
+        distributed_equal(
+            distributed_vector1,
+            host1.begin(), host1.end(),
+            distributed_queue
+        )
+    );
+    BOOST_CHECK(
+        distributed_equal(
+            distributed_vector2,
+            host2.begin(), host2.end(),
+            distributed_queue
+        )
+    );
+
+    bc::int_ expected_data_add[] = { 0, 0, 0, 0, 5 };
+    std::vector<bc::int_> expected_add(
+        expected_data_add, expected_data_add + 5
+    );
+    BOOST_CHECK(
+        distributed_equal(
+            distributed_vector3,
+            expected_add.begin(), expected_add.end(),
+            distributed_queue
+        )
+    );
+}
+
+BOOST_AUTO_TEST_CASE(transform_int_multiply)
+{
+    // construct distributed::command_queue
+    bc::distributed::command_queue distributed_queue =
+        get_distributed_queue(queue);
+
+    bc::int_ data1[] = { 1, -2, -3, -4, 5 };
+    bc::int_ data2[] = { -1, 2, 3, 4, 0 };
+    std::vector<bc::int_> host1(data1, data1 + 5);
+    std::vector<bc::int_> host2(data2, data2 + 5);
+
+    bc::distributed::vector<bc::int_> distributed_vector1(
+        data1, data1 + 5, distributed_queue
+    );
+    bc::distributed::vector<bc::int_> distributed_vector2(
+        data2, data2 + 5, distributed_queue
+    );
+    bc::distributed::vector<bc::int_> distributed_vector3(
+        size_t(5), distributed_queue
+    );
+    distributed_queue.finish();
+
+    // multiply
+    bc::distributed::transform(
+        distributed_vector1,
+        distributed_vector2,
+        distributed_vector3,
+        bc::multiplies<bc::int_>(),
+        distributed_queue
+    );
+
+    BOOST_CHECK(
+        distributed_equal(
+            distributed_vector1,
+            host1.begin(), host1.end(),
+            distributed_queue
+        )
+    );
+    BOOST_CHECK(
+        distributed_equal(
+            distributed_vector2,
+            host2.begin(), host2.end(),
+            distributed_queue
+        )
+    );
+
+    bc::int_ expected_data_multiply[] = { -1, -4, -9, -16, 0 };
+    std::vector<bc::int_> expected_multiply(
+        expected_data_multiply, expected_data_multiply + 5
+    );
+    BOOST_CHECK(
+        distributed_equal(
+            distributed_vector3,
+            expected_multiply.begin(), expected_multiply.end(),
+            distributed_queue
+        )
+    );
+}
+
+BOOST_AUTO_TEST_SUITE_END()
diff --git a/test/test_distributed_vector.cpp b/test/test_distributed_vector.cpp
new file mode 100644
index 000000000..6647d1eee
--- /dev/null
+++ b/test/test_distributed_vector.cpp
@@ -0,0 +1,342 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2016 Jakub Szuppe <j.szuppe@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#define BOOST_TEST_MODULE TestDistributedVector
+#include <boost/test/unit_test.hpp>
+
+#include <boost/compute/algorithm.hpp>
+#include <boost/compute/system.hpp>
+#include <boost/compute/context.hpp>
+#include <boost/compute/allocator/pinned_allocator.hpp>
+#include <boost/compute/container/vector.hpp>
+#include <boost/compute/distributed/context.hpp>
+#include <boost/compute/distributed/command_queue.hpp>
+#include <boost/compute/distributed/vector.hpp>
+
+#include "check_macros.hpp"
+#include "context_setup.hpp"
+
+#include "distributed_check_functions.hpp"
+#include "distributed_queue_setup.hpp"
+
+namespace bc = boost::compute;
+
+BOOST_AUTO_TEST_CASE(empty_vector)
+{
+    // construct distributed::command_queue
+    bc::distributed::command_queue distributed_queue =
+        get_distributed_queue(queue);
+
+    bc::distributed::vector<bc::uint_> distributed_vector(distributed_queue);
+
+    BOOST_CHECK(distributed_vector.empty());
+    BOOST_CHECK(distributed_vector.size() == 0);
+    BOOST_CHECK(distributed_vector.parts() == 2);
+
+    for(size_t i = 0; i < distributed_vector.parts(); i++)
+    {
+        BOOST_CHECK(distributed_vector.begin(i) == distributed_vector.end(i));
+        BOOST_CHECK(distributed_vector.part_start(i) == 0);
+        BOOST_CHECK(distributed_vector.part_size(i) == 0);
+    }
+}
+
+BOOST_AUTO_TEST_CASE(count_ctor)
+{
+    // construct distributed::command_queue
+    bc::distributed::command_queue distributed_queue =
+        get_distributed_queue(queue);
+
+    bc::distributed::vector<bc::uint_> distributed_vector(
+        64, distributed_queue
+    );
+
+    BOOST_CHECK(!distributed_vector.empty());
+    BOOST_CHECK(distributed_vector.size() == 64);
+    BOOST_CHECK(distributed_vector.parts() == distributed_vector.get_queue().size());
+
+    size_t size_sum = 0;
+    for(size_t i = 0; i < distributed_vector.parts(); i++)
+    {
+        size_sum += distributed_vector.part_size(i);
+    }
+    BOOST_CHECK_EQUAL(distributed_vector.size(), size_sum);
+
+    std::vector<bc::buffer> buffers = distributed_vector.get_buffers();
+    for(size_t i = 0; i < distributed_vector.parts(); i++)
+    {
+        BOOST_CHECK(buffers[i].get() != 0);
+        BOOST_CHECK(buffers[i] == distributed_vector.get_buffer(i));
+    }
+}
+
+BOOST_AUTO_TEST_CASE(command_queue_ctor)
+{
+    // construct distributed::command_queue
+    bc::distributed::command_queue distributed_queue =
+        get_distributed_queue(queue);
+
+    bc::uint_ value = 991;
+    bc::distributed::vector<bc::uint_> distributed_vector(
+        size_t(35), value, distributed_queue
+    );
+
+    BOOST_CHECK(!distributed_vector.empty());
+    BOOST_CHECK(distributed_vector.size() == 35);
+    BOOST_CHECK(distributed_vector.parts() == distributed_vector.get_queue().size());
+
+    size_t size_sum = 0;
+    for(size_t i = 0; i < distributed_vector.parts(); i++)
+    {
+        size_sum += distributed_vector.part_size(i);
+    }
+    BOOST_CHECK_EQUAL(distributed_vector.size(), size_sum);
+
+    // need to finish since back() and front()
+    // use different (self-made) queues
+    distributed_queue.finish();
+    BOOST_CHECK_EQUAL(distributed_vector.back(), value);
+    BOOST_CHECK_EQUAL(distributed_vector.front(), value);
+
+    BOOST_CHECK(distributed_equal(distributed_vector, value, distributed_queue));
+}
+
+BOOST_AUTO_TEST_CASE(command_queue_ctor_one_queue)
+{
+    // construct distributed::command_queue
+    // with only 1 device command queue
+    bc::distributed::command_queue distributed_queue =
+        get_distributed_queue(queue, 0);
+
+    bc::uint_ value = 1;
+    bc::distributed::vector<bc::uint_> distributed_vector(
+        size_t(5), value, distributed_queue
+    );
+
+    BOOST_CHECK(!distributed_vector.empty());
+    BOOST_CHECK(distributed_vector.size() == 5);
+    BOOST_CHECK(distributed_vector.parts() == 1);
+    BOOST_CHECK_EQUAL(distributed_vector.size(), distributed_vector.part_size(0));
+
+    // need to finish since back() and front()
+    // use different (self-made) queues
+    distributed_queue.finish();
+    BOOST_CHECK_EQUAL(distributed_vector.back(), value);
+    BOOST_CHECK_EQUAL(distributed_vector.front(), value);
+
+    BOOST_CHECK(distributed_equal(distributed_vector, value, distributed_queue));
+}
+
+BOOST_AUTO_TEST_CASE(host_iterator_ctor)
+{
+    // construct distributed::command_queue
+    bc::distributed::command_queue distributed_queue =
+        get_distributed_queue(queue);
+
+    bc::int_ value = -1;
+    std::vector<bc::int_> host_vector(50, value);
+
+    bc::distributed::vector<bc::int_> distributed_vector(
+        host_vector.begin(), host_vector.end(), distributed_queue
+    );
+
+    BOOST_CHECK(!distributed_vector.empty());
+    BOOST_CHECK(distributed_vector.size() == host_vector.size());
+    BOOST_CHECK(distributed_vector.parts() == distributed_vector.get_queue().size());
+
+    size_t size_sum = 0;
+    for(size_t i = 0; i < distributed_vector.parts(); i++)
+    {
+        size_sum += distributed_vector.part_size(i);
+    }
+    BOOST_CHECK_EQUAL(distributed_vector.size(), size_sum);
+
+    BOOST_CHECK(distributed_equal(distributed_vector, value, distributed_queue));
+
+    // need to finish since back() and front()
+    // use different (self-made) queues
+    distributed_queue.finish();
+
+    distributed_vector.front() = 1;
+    distributed_vector.back() = 1;
+    BOOST_CHECK_EQUAL(distributed_vector.back(), 1);
+    BOOST_CHECK_EQUAL(distributed_vector.front(), 1);
+}
+
+BOOST_AUTO_TEST_CASE(copy_ctor)
+{
+    // construct distributed::command_queue
+    bc::distributed::command_queue distributed_queue1 =
+        get_distributed_queue(queue);
+    // construct distributed::command_queue
+    bc::distributed::command_queue distributed_queue2 =
+        get_distributed_queue(queue, 2);
+
+    bc::int_ value = -1;
+    size_t size = 64;
+
+    bc::distributed::vector<bc::int_> distributed_vector(
+        size, value, distributed_queue1
+    );
+    bc::distributed::vector<bc::int_> distributed_vector_copy1(
+        distributed_vector
+    );
+    bc::distributed::vector<bc::int_> distributed_vector_copy2(
+        distributed_vector, distributed_queue2
+    );
+    bc::distributed::vector<
+        bc::int_,
+        bc::distributed::default_weight_func, bc::pinned_allocator<bc::int_>
+    > distributed_vector_copy3(
+        distributed_vector, distributed_queue2
+    );
+    bc::distributed::vector<bc::int_> distributed_vector_copy4(
+        distributed_vector, distributed_queue1
+    );
+    bc::distributed::vector<
+        bc::int_,
+        bc::distributed::default_weight_func, bc::pinned_allocator<bc::int_>
+    > distributed_vector_copy5(
+        distributed_vector, distributed_queue1
+    );
+
+    BOOST_CHECK(
+        distributed_equal(distributed_vector, value, distributed_queue1)
+    );
+    BOOST_CHECK(
+        distributed_equal(distributed_vector_copy1, value, distributed_queue1)
+    );
+    BOOST_CHECK(
+        distributed_equal(distributed_vector_copy2, value, distributed_queue2)
+    );
+    BOOST_CHECK(
+        distributed_equal(distributed_vector_copy3, value, distributed_queue2)
+    );
+    BOOST_CHECK(
+        distributed_equal(distributed_vector_copy4, value, distributed_queue1)
+    );
+    BOOST_CHECK(
+        distributed_equal(distributed_vector_copy5, value, distributed_queue1)
+    );
+}
+
+BOOST_AUTO_TEST_CASE(at)
+{
+    // construct distributed::command_queue
+    bc::distributed::command_queue distributed_queue =
+        get_distributed_queue(queue);
+
+    bc::distributed::vector<bc::uint_> distributed_vector(
+        size_t(64), bc::uint_(64), distributed_queue
+    );
+
+    distributed_vector.begin(1).write(33, distributed_queue.get(1));
+    distributed_queue.get(1).finish();
+
+    BOOST_CHECK_EQUAL(
+        distributed_vector.at(distributed_vector.part_start(1)),
+        bc::uint_(33)
+    );
+    BOOST_CHECK_EQUAL(
+        distributed_vector.at(distributed_vector.part_start(1) + 1),
+        bc::uint_(64)
+    );
+    BOOST_CHECK_EQUAL(
+        distributed_vector.at(distributed_vector.part_start(1) - 1),
+        bc::uint_(64)
+    );
+
+    distributed_vector.at(distributed_vector.part_start(1)) = 55;
+    BOOST_CHECK_EQUAL(
+        *distributed_vector.begin(1),
+        bc::uint_(55)
+    );
+
+    BOOST_CHECK_THROW(distributed_vector.at(64), std::out_of_range);
+}
+
+BOOST_AUTO_TEST_CASE(subscript_operator)
+{
+    // construct distributed::command_queue
+    bc::distributed::command_queue distributed_queue =
+        get_distributed_queue(queue);
+
+    bc::distributed::vector<bc::uint_> distributed_vector(
+        size_t(64), bc::uint_(64), distributed_queue
+    );
+
+    distributed_vector.begin(1).write(bc::uint_(33), distributed_queue.get(1));
+    distributed_queue.get(1).finish();
+
+    BOOST_CHECK_EQUAL(
+        distributed_vector[distributed_vector.part_start(1)],
+        bc::uint_(33)
+    );
+    BOOST_CHECK_EQUAL(
+        distributed_vector[distributed_vector.part_start(1) + 1],
+        bc::uint_(64)
+    );
+    BOOST_CHECK_EQUAL(
+        distributed_vector[distributed_vector.part_start(1) - 1],
+        bc::uint_(64)
+    );
+
+    distributed_vector[distributed_vector.part_start(1)] = bc::uint_(55);
+    BOOST_CHECK_EQUAL(
+        *distributed_vector.begin(1),
+        bc::uint_(55)
+    );
+}
+
+BOOST_AUTO_TEST_CASE(swap)
+{
+    // construct distributed::command_queue
+    bc::distributed::command_queue distributed_queue1 =
+        get_distributed_queue(queue);
+    // construct 2nd distributed::command_queue
+    bc::distributed::command_queue distributed_queue2 =
+        get_distributed_queue(queue, 2);
+
+    bc::int_ value1 = -88;
+    bc::int_ value2 = 99;
+    size_t size1 = 64;
+    size_t size2 = 48;
+
+    bc::distributed::vector<bc::int_> distributed_vector1(
+        size1, value1, distributed_queue1
+    );
+    bc::distributed::vector<bc::int_> distributed_vector2(
+        size2, value2, distributed_queue2
+    );
+
+    BOOST_CHECK_EQUAL(distributed_vector1.size(), size1);
+    BOOST_CHECK(
+        distributed_equal(distributed_vector1, value1, distributed_queue1)
+    );
+    BOOST_CHECK_EQUAL(distributed_vector2.size(), size2);
+    BOOST_CHECK(
+        distributed_equal(distributed_vector2, value2, distributed_queue2)
+    );
+    distributed_queue1.finish();
+    distributed_queue2.finish();
+
+    distributed_vector1.swap(distributed_vector2);
+
+    BOOST_CHECK_EQUAL(distributed_vector1.size(), size2);
+    BOOST_CHECK(
+        distributed_equal(distributed_vector1, value2, distributed_queue2)
+    );
+    BOOST_CHECK_EQUAL(distributed_vector2.size(), size1);
+    BOOST_CHECK(
+        distributed_equal(distributed_vector2, value1, distributed_queue1)
+    );
+}
+
+BOOST_AUTO_TEST_SUITE_END()