// Include guard. Renamed from __TENSOR2_HPP__: identifiers containing a
// double underscore are reserved to the implementation ([lex.name]/3 in the
// C++ standard), so user code must not define them. The matching #endif does
// not reference the macro name, so this rename is self-contained.
38 #ifndef CUV_TENSOR2_HPP
39 # define CUV_TENSOR2_HPP
45 #include <boost/shared_ptr.hpp>
46 #include <boost/multi_array/extent_gen.hpp>
47 #include <boost/multi_array/index_gen.hpp>
48 #include <cuv/tools/cuv_general.hpp>
49 #include <cuv/tools/meta_programming.hpp>
50 #include "reference.hpp"
52 namespace boost {
namespace serialization {
// Pull Boost.MultiArray's extent/index generators into this namespace so cuv
// tensors can be constructed with the familiar extents[a][b] / indices[range]
// syntax (mirrors boost::extents / boost::indices).
114 using boost::detail::multi_array::extent_gen;
115 using boost::detail::multi_array::index_gen;
// Convenience alias for the index range of a single dimension.
125 typedef boost::detail::multi_array::index_range<boost::detail::multi_array::index,boost::detail::multi_array::size_type>
index_range;
// Global generator objects, analogous to boost::extents / boost::indices.
// NOTE(review): non-inline objects at namespace scope in a header risk ODR
// violations when included from several translation units -- verify how the
// full source guards this (the CUV_DONT_CREATE_EXTENTS_OBJ switch suggests
// the authors were aware of the issue).
130 #ifndef CUV_DONT_CREATE_EXTENTS_OBJ
142 extent_gen<0> extents;
159 index_gen<0,0> indices;
// Forward declaration: the central n-dimensional array type
// (value type V, memory space M, memory layout L).
164 template<
class V,
class M,
class L>
class tensor;
// Forward declaration: contiguous memory container for memory space M.
165 template<
class V,
class M>
class linear_memory;
// Forward declaration of fill(): sets every element of v to p.
// NOTE(review): template parameter S does not appear in the visible
// signature and therefore cannot be deduced at the call site -- confirm
// against the definition in the full source.
168 template<
class V,
class M,
class L,
class S>
169 void fill(tensor<V, M, L>& v,
const V& p);
178 template<
class V,
class M>
192 friend class boost::serialization::access;
230 m_allocator.dealloc(&this->
m_ptr);
238 template<
class V,
class M>
252 friend class boost::serialization::access;
284 assert(this->
m_ptr == NULL);
294 m_allocator.dealloc(&this->
m_ptr);
// (fragment) row-major stride computation: walk dimensions from the last
// (fastest varying) to the first.
393 for (
int i = shape.
size()-1; i >= 0; --i)
// Extent-1 dimensions get stride 0 (broadcasting convention); otherwise the
// stride is the element count accumulated so far. `size` is maintained on
// lines elided from this excerpt -- verify in full source.
395 strides[i] = (shape[i] == 1) ? 0 : size;
// (fragment) column-major variant: the first dimension varies fastest, so
// iteration runs front-to-back.
404 for (
unsigned int i = 0; i < shape.
size(); ++i)
406 strides[i] = (shape[i] == 1) ? 0 : size;
// (fragment) reversing device-side linear memory is unsupported.
417 throw std::runtime_error(
"reverse of dev linear memory not implemented");
// (fragment) host-side reverse: classic bidirectional swap loop, stopping
// when the two iterators meet.
// NOTE(review): __first/__last contain double underscores, which are names
// reserved to the implementation -- presumably copied from libstdc++ style;
// consider renaming when touching the full source.
420 if (__first == __last || __first == --__last)
424 std::iter_swap(__first, __last);
// Returns true iff shape/stride describe a dense row-major (C-contiguous)
// layout: walking from the last dimension to the first, each stride must
// equal the product of the extents of all faster-varying dimensions.
// NOTE(review): extent-1 dimensions may be special-cased on lines elided
// from this excerpt -- verify in full source.
437 inline bool is_c_contiguous(row_major,
const linear_memory<unsigned int,host_memory_space>& shape,
const linear_memory<int,host_memory_space>& stride){
438 bool c_contiguous =
true;
// NOTE(review): `size` is declared on a line elided from this excerpt;
// presumably initialised to 1 before the loop -- verify in full source.
440 for (
int i = shape.size()-1; (i >= 0) && c_contiguous; --i)
444 if (stride[i] != size)
445 c_contiguous =
false;
446 size = size * shape[i];
// Column-major counterpart of is_c_contiguous: iterate the dimensions
// front-to-back (the first dimension varies fastest in column-major order)
// and require each stride to equal the running extent product.
454 inline bool is_c_contiguous(column_major,
const linear_memory<unsigned int,host_memory_space>& shape,
const linear_memory<int,host_memory_space>& stride){
455 bool c_contiguous =
true;
// NOTE(review): `size` is declared/initialised on a line elided from this
// excerpt -- verify in full source.
457 for (
unsigned int i = 0; i<shape.size() && c_contiguous; ++i)
461 if (stride[i] != size)
462 c_contiguous =
false;
463 size = size * shape[i];
// Returns true iff the memory can be transferred with a single 2D (pitched)
// copy: every dimension except one designated "pitched" dimension must be
// densely packed (row-major flavour).
469 inline bool is_2dcopyable(row_major,
const linear_memory<unsigned int,host_memory_space>& shape,
const linear_memory<int,host_memory_space>& stride){
// At least two dimensions are required for a 2D copy.
470 bool c_contiguous = shape.size()>1;
// The pitched dimension starts at the last axis; the while-loop body
// (elided from this excerpt) presumably steps it past extent-1 axes --
// verify in full source.
471 int pitched_dim = shape.size()-1;
472 while(shape[pitched_dim]==1)
// Walk dimensions last-to-first; the branch bodies for the extent-1 and
// pitched-dimension cases are elided here.
475 for (
int i = shape.size()-1; (i >= 0) && c_contiguous; --i)
479 }
else if(i == pitched_dim){
481 }
else if(stride[i] != size) {
482 c_contiguous =
false;
// Column-major counterpart of is_2dcopyable: here the pitched dimension is
// searched from the front, and the dense-packing check walks dimensions
// front-to-back.
491 inline bool is_2dcopyable(column_major,
const linear_memory<unsigned int,host_memory_space>& shape,
const linear_memory<int,host_memory_space>& stride){
// At least two dimensions are required for a 2D copy.
492 bool c_contiguous = shape.size()>1;
// The while-loop body (elided from this excerpt) presumably advances
// pitched_dim past extent-1 axes -- verify in full source.
493 unsigned int pitched_dim = 0;
494 while(shape[pitched_dim]==1)
497 for (
unsigned int i = 0; (i < shape.size()) && c_contiguous; ++i)
501 }
else if(i == pitched_dim){
503 }
else if(stride[i] != size) {
504 c_contiguous =
false;
// Derives the (rows, cols, pitch) triple used by 2D copy routines from an
// n-dimensional shape/stride pair. This overload flattens all leading
// dimensions into rows, takes the last dimension as cols, and the stride of
// the second-to-last dimension as the pitch.
// NOTE(review): the final (layout tag) parameter of the signature sits on a
// line elided from this excerpt -- presumably row_major; verify.
518 template<
class index_type,
class size_type>
519 void get_pitched_params(size_type& rows, size_type& cols, size_type& pitch,
520 const linear_memory<size_type,host_memory_space>& shape,
521 const linear_memory<index_type,host_memory_space>& stride,
// rows = product of every extent except the last.
524 rows = std::accumulate(shape[0].ptr,
525 shape[0].ptr+shape.size()-1,
526 1, std::multiplies<index_type>());
527 cols = shape[shape.size()-1];
528 pitch = stride[shape.size()-2];
// Column-major flavour of get_pitched_params: the first dimension is the
// pitched one, so rows is the product of all extents except the first.
// (The layout tag parameter and the cols/pitch assignments are on lines
// elided from this excerpt -- verify in full source.)
533 template<
class index_type,
class size_type>
534 void get_pitched_params(size_type& rows, size_type& cols, size_type& pitch,
535 const linear_memory<size_type,host_memory_space>& shape,
536 const linear_memory<index_type,host_memory_space>& stride,
539 rows = std::accumulate(shape[0].ptr+1,
540 shape[0].ptr+shape.size(),
541 1, std::multiplies<index_type>());
555 template<
class V,
class M>
569 friend class boost::serialization::access;
602 :m_rows(i),m_cols(j),m_pitch(0){
alloc();}
608 assert(this->
m_ptr == NULL);
609 m_allocator.alloc2d(&this->
m_ptr,m_pitch,m_rows,m_cols);
620 m_allocator.dealloc(&this->
m_ptr);
658 if(
this==&o)
return *
this;
660 if( m_pitch < o.m_cols
688 if( m_pitch < o.m_cols
765 assert(shape.
size()>=2);
766 const int pitched_dim = shape.
size()-1;
767 for (
int i = shape.
size()-1; i >= 0; --i)
771 }
else if(i == pitched_dim){
789 assert(shape.
size()>=2);
791 for (
unsigned int i = 0; i < shape.
size(); ++i)
795 }
else if(i == pitched_dim){
809 template<
class M,
class L>
860 template<
class V,
class M,
class L>
862 template<
class V,
class M,
class L>
870 template<
class V,
class M,
class L=row_major>
898 template <
class _V,
class _M,
class _L>
916 for(
int i=0; i<D; i++){
963 template<std::
size_t D>
967 for(
int i=0; i<D; i++){
1035 return std::vector<size_type>();
1045 std::vector<size_type>
shape;
1046 shape.reserve(
ndim());
1049 std::remove_copy_if(
1052 std::back_inserter(shape),
1053 std::bind2nd(std::equal_to<size_type>(),1));
// (fragment) linear-index decomposition: build "virtual" strides for a dense
// layout of the current shape, then peel one coordinate per dimension off
// idx. Row-major path: strides are accumulated last-to-first.
1090 for(
int i=ndim-1;i>=0;--i){
1091 virtualstride[i] = virt_size;
1097 idx -= (idx/virtualstride[i])*virtualstride[i];
// Column-major path: strides are accumulated front-to-back ...
1102 for(
unsigned int i=0;i<
ndim;++i){
1103 virtualstride[i] = virt_size;
// ... while the coordinate extraction still walks last-to-first.
1107 for(
int i=ndim-1; i>=0; --i){
1109 idx -= (idx/virtualstride[i])*virtualstride[i];
// NOTE(review): delete[] implies virtualstride was allocated with new[] on an
// elided line; a stack buffer or std::vector would avoid the manual delete --
// consider when touching the full source.
1112 delete[] virtualstride;
// const overload delegates to the non-const operator[] via const_cast.
1118 return const_cast<tensor&
>(*this)[idx];
1326 template<std::
size_t D>
1331 for(std::size_t i=0;i<D;i++)
1341 explicit tensor(
const std::vector<size_type>& eg)
1345 for(std::size_t i=0;i<eg.size();i++)
1359 for(std::size_t i=0;i<eg.size();i++)
1367 template<std::
size_t D>
1372 for(std::size_t i=0;i<D;i++)
1386 template<std::
size_t D>
1393 for(
int i=D-1;i>=0;i--){
1396 size *= eg.ranges_[i].finish();
1399 for(std::size_t i=0;i<D;i++){
1402 size *= eg.ranges_[i].finish();
1408 unsigned int D = shape.size();
1412 for(
int i=D-1;i>=0;i--){
1418 for(std::size_t i=0;i<D;i++){
1430 template<
int D,
int E>
1437 for(
int i=D-1;i>=0;i--){
1440 size *= idx.ranges_[i].finish();
1443 for(std::size_t i=0;i<D;i++){
1446 size *= idx.ranges_[i].finish();
1463 template<
class _M,
class _L>
1475 if(
this==&o)
return *
this;
1490 typename boost::enable_if_c<boost::is_convertible<_V,value_type>::value,
tensor&>::type
1492 fill(*
this, scalar);
1579 template<
int D,
int E>
1588 std::vector<int> shapes;
1589 std::vector<int> strides;
1590 shapes.reserve(o.
ndim());
1591 strides.reserve(o.
ndim());
1592 for(std::size_t i=0;i<D;i++){
1593 int start = idx.ranges_[i].get_start(0);
1594 int finish = idx.ranges_[i].get_finish(o.
shape(i));
1595 int stride = idx.ranges_[i].stride();
1596 if (start <0) start += o.
shape(i);
1597 if (finish<0) finish += o.
shape(i);
1602 if(idx.ranges_[i].is_degenerate()){
1605 shapes.push_back((finish-start)/stride);
1612 for(
int i = D; i < o.
ndim();i++){
1613 shapes.push_back(o.
shape(i));
1614 strides.push_back(o.
stride(i));
1631 template<std::
size_t D>
1633 std::vector<size_type>
shape(D);
1634 for(std::size_t i=0;i<D;i++)
1635 shape[i] = eg.ranges_[i].finish();
// Reshapes the tensor in place to `shape`. Only legal when the tensor is
// C-contiguous and the total element count is unchanged; strides are then
// recomputed for the new shape (stride-assignment bodies elided here).
1645 void reshape(
const std::vector<size_type>& shape){
1646 size_type new_size = std::accumulate(shape.begin(),shape.end(),1,std::multiplies<size_type>());
// Thrown when the tensor is not C-contiguous (the guard condition sits on a
// line elided from this excerpt) -- reshaping strided views is unsupported.
1648 throw std::runtime_error(
"cannot reshape: tensor is not c_contiguous");
1649 if(
size() != new_size)
1650 throw std::runtime_error(
"cannot reshape: products do not match");
// Row-major: rebuild strides from the last dimension backwards ...
1654 for(
int i=shape.size()-1;i>=0;i--){
// ... column-major: from the first dimension forwards.
1660 for(std::size_t i=0;i<shape.size();i++){
// (fragment) resize: computes the requested total element count; the
// reallocation logic is on lines elided from this excerpt -- presumably
// memory is only reallocated when new_size differs; verify in full source.
1680 void resize(
const std::vector<size_type>& shape){
1682 size_type new_size = std::accumulate(shape.begin(),shape.end(),1,std::multiplies<size_type>());
1698 template<std::
size_t D>
1700 std::vector<size_type>
shape(D);
1701 for(std::size_t i=0;i<D;i++)
1702 shape[i] = eg.ranges_[i].finish();
1720 template<
class V,
class M,
class L=row_major>
1738 throw std::runtime_error(
"copying tensor to tensor_view did not succeed. Maybe a shape mismatch?");
1746 throw std::runtime_error(
"copying tensor to tensor_view did not succeed. Maybe a shape mismatch?");
1755 typename boost::enable_if_c<boost::is_convertible<_V,V>::value,
tensor_view&>::type
1768 throw std::runtime_error(
"copying tensor to tensor_view did not succeed. Maybe a shape mismatch?");
1780 throw std::runtime_error(
"copying tensor to tensor_view did not succeed. Maybe a shape mismatch?");
1816 template<
int D,
int E>
1821 std::vector<int> shapes;
1822 std::vector<int> strides;
1823 shapes.reserve(o.
ndim());
1824 strides.reserve(o.
ndim());
1825 for(std::size_t i=0;i<D;i++){
1826 int start = idx.ranges_[i].get_start(0);
1827 int finish = idx.ranges_[i].get_finish(o.
shape(i));
1828 int stride = idx.ranges_[i].stride();
1829 if (start <0) start += o.
shape(i);
1830 if (finish<0) finish += o.
shape(i);
1835 if(idx.ranges_[i].is_degenerate()){
1838 shapes.push_back((finish-start)/stride);
1844 for(
int i = D; i < o.
ndim();i++){
1845 shapes.push_back(o.
shape(i));
1846 strides.push_back(o.
stride(i));
1861 template<
int D,
int E>
1866 std::vector<int> shapes;
1867 std::vector<int> strides;
1868 shapes.reserve(o.
ndim());
1869 strides.reserve(o.
ndim());
1870 for(std::size_t i=0;i<D;i++){
1871 int start = idx.ranges_[i].get_start(0);
1872 int finish = idx.ranges_[i].get_finish(o.
shape(i));
1873 int stride = idx.ranges_[i].stride();
1874 if (start <0) start += o.
shape(i);
1875 if (finish<0) finish += o.
shape(i);
1880 if(idx.ranges_[i].is_degenerate()){
1883 shapes.push_back((finish-start)/stride);
1889 for(
int i = D; i < o.
ndim();i++){
1890 shapes.push_back(o.
shape(i));
1891 strides.push_back(o.
stride(i));
1903 template<
class V,
class M0,
class M1,
class L0,
class L1>
1912 size_type row,col,pitch;
1913 detail::get_pitched_params(row,col,pitch,src.
info().host_shape, src.
info().host_stride,L1());
1914 a.
copy2d(dst.
ptr(), src.
ptr(), col*
sizeof(V),pitch*
sizeof(V),row,col,M1());
1916 size_type row,col,pitch;
1917 detail::get_pitched_params(row,col,pitch,dst.
info().host_shape, dst.
info().host_stride,L0());
1918 a.
copy2d(dst.
ptr(), src.
ptr(), pitch*
sizeof(V),col*
sizeof(V),row,col,M1());
1920 size_type srow,scol,spitch;
1921 size_type drow,dcol,dpitch;
1922 detail::get_pitched_params(drow,dcol,dpitch,dst.
info().host_shape, dst.
info().host_stride,L0());
1923 detail::get_pitched_params(srow,scol,spitch,src.
info().host_shape, src.
info().host_stride,L1());
1926 a.
copy2d(dst.
ptr(), src.
ptr(), dpitch*
sizeof(V),spitch*
sizeof(V),srow,scol,M1());
1928 throw std::runtime_error(
"copying of generic strides not implemented yet");
1931 dst.
info().host_stride.reverse();
1932 dst.
info().host_shape.reverse();
1940 template<
class V,
class M0,
class M1,
class L0,
class L1>
1946 dst.
info().host_shape = src.
info().host_shape;
1956 size_type row,col,pitch;
1957 detail::get_pitched_params(row,col,pitch,src.
info().host_shape, src.
info().host_stride,L1());
1958 a.
copy2d(d.ptr(), src.
ptr(), col*
sizeof(V),pitch*
sizeof(V),row,col,M1());
1960 throw std::runtime_error(
"copying arbitrarily strided memory not implemented");
1964 dst.
info().host_stride.reverse();
1965 dst.
info().host_shape.reverse();
1970 template<
class V,
class M0,
class M1,
class L0,
class L1>
1973 assert(src.
ndim()>=2);
1977 dst.
info().host_shape = src.
info().host_shape;
1978 size_type row,col,pitch;
1979 detail::get_pitched_params(row,col,pitch,src.
info().host_shape, src.
info().host_stride,L1());
1982 d->set_strides(dst.
info().host_stride,dst.
info().host_shape, L0());
1986 detail::get_pitched_params(row,col,pitch,src.
info().host_shape, src.
info().host_stride,L1());
1987 a.
copy2d(d.ptr(), src.
m_ptr, d.pitch()*
sizeof(V),pitch*
sizeof(V),row,col,M1());
1989 throw std::runtime_error(
"copying arbitrarily strided memory not implemented");
1994 dst.
info().host_stride.reverse();
1995 dst.
info().host_shape.reverse();
2009 template<
class V,
class V2,
class M,
class M2,
class L>
2018 template<
class Mat,
class NewVT>
2023 template<
class Mat,
class NewML>
2028 template<
class Mat,
class NewMS>
// Streams a host-side linear_memory element-by-element (loop body elided
// from this excerpt).
2050 ostream& operator<<(ostream& o, const cuv::linear_memory<V, cuv::host_memory_space>& t){
2052 for(
unsigned int i=0;i<t.size();i++)
// Streams a device-side linear_memory. The loop reads `t`, not the parameter
// `t_`: presumably a host-side copy of t_ is made on a line elided from this
// excerpt -- verify in full source.
2063 ostream& operator<<(ostream& o, const cuv::linear_memory<V, cuv::dev_memory_space>& t_){
2066 for(
unsigned int i=0;i<t.
size();i++)
// Streams host-side pitched_memory as a 2D table.
2078 ostream& operator<<(ostream& o, const cuv::pitched_memory<V, cuv::host_memory_space>& t){
2080 for(
unsigned int i=0;i<t.rows();i++){
// NOTE(review): the inner loop bound is t.rows(); for a rows-by-cols dump one
// would expect t.cols() here -- looks like a copy/paste bug; verify against
// the full source before changing.
2081 for(
unsigned int j=0;j<t.rows();j++){
// Streams device-side pitched_memory. The loops read `t`, not the parameter
// `t_`: presumably a host-side copy is made on a line elided from this
// excerpt -- verify in full source.
2096 ostream& operator<<(ostream& o, const cuv::pitched_memory<V, cuv::dev_memory_space>& t_){
2099 for(
unsigned int i=0;i<t.
rows();i++){
// NOTE(review): same as the host overload -- the inner loop bound is rows()
// where cols() is expected; looks like a copy/paste bug, verify against the
// full source before changing.
2100 for(
unsigned int j=0;j<t.
rows();j++){
// Streams a device tensor by first converting it to a host tensor and
// delegating to the host operator<<.
2116 template<
class V,
class L>
2117 ostream& operator<<(ostream& o, const cuv::tensor<V, cuv::dev_memory_space, L>& t){
2118 return o << cuv::tensor<V,cuv::host_memory_space,L>(t);
// Streams a host tensor; the output format depends on the rank (the rank
// dispatch itself sits on lines elided from this excerpt): 1D prints one
// space-separated row, 2D one line per row, 3D one 2D block per leading
// index. Ranks above 3 are rejected.
2126 template<
class V,
class L>
2127 ostream& operator<<(ostream& o, const cuv::tensor<V, cuv::host_memory_space, L>& t){
// 1D case: flat element list.
2133 for(
unsigned int i=0;i<t.shape(0);i++) o<< t[i]<<
" ";
// 2D case: row by row via the (i,j) accessor.
2138 for(
unsigned int i=0;i<t.shape(0);++i){
2142 for(
unsigned int j=0;j<t.shape(1);j++) o<< t(i,j)<<
" ";
// No trailing separator after the last row.
2144 if(i != t.shape(0)-1)
// 3D case: one 2D slice per leading index l; note the explicit flat-index
// arithmetic l*shape(1)*shape(2) + i*shape(2) + j into operator[].
2151 for(
unsigned int l=0;l<t.shape(0);l++){
2153 for(
unsigned int i=0;i<t.shape(1);++i){
2158 for(
unsigned int j=0;j<t.shape(2);j++) o<< t[l*t.shape(1)*t.shape(2) + i*t.shape(2) + j]<<
" ";
2160 if(i != t.shape(1)-1)
2169 throw std::runtime_error(
"printing of tensors with >3 dimensions not implemented");