Extended CUDA Library (ecuda)  2.0
 All Classes Namespaces Files Functions Variables Typedefs Friends Macros
copy.hpp
Go to the documentation of this file.
1 /*
2 Copyright (c) 2014-2016, Scott Zuyderduyn
3 All rights reserved.
4 
5 Redistribution and use in source and binary forms, with or without
6 modification, are permitted provided that the following conditions are met:
7 
8 1. Redistributions of source code must retain the above copyright notice, this
9  list of conditions and the following disclaimer.
10 2. Redistributions in binary form must reproduce the above copyright notice,
11  this list of conditions and the following disclaimer in the documentation
12  and/or other materials provided with the distribution.
13 
14 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
15 ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
16 WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
17 DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
18 ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
19 (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
20 LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
21 ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
22 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
23 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
24 
25 The views and conclusions contained in the software and documentation are those
26 of the authors and should not be interpreted as representing official policies,
27 either expressed or implied, of the FreeBSD Project.
28 */
29 
30 //----------------------------------------------------------------------------
31 // algo/copy.hpp
32 //
33 // Extension of std::copy that recognizes device memory and can be called from
34 // host or device code.
35 //
36 // Author: Scott D. Zuyderduyn, Ph.D. (scott.zuyderduyn@utoronto.ca)
37 //----------------------------------------------------------------------------
38 #pragma once
39 #ifndef ECUDA_ALGO_COPY_HPP
40 #define ECUDA_ALGO_COPY_HPP
41 
42 #include <iterator>
43 #include <vector>
44 
45 #include "../global.hpp"
46 #include "../algorithm.hpp"
47 #include "../allocators.hpp"
48 #include "../apiwrappers.hpp"
49 #include "../iterator.hpp"
50 #include "../utility.hpp"
51 
52 #include <typeinfo> // delme
53 
54 namespace ecuda {
55 
// Tag types used to select the host- vs device-memory implementation of copy()
// at compile time; they mirror ecuda::iterator_traits<>::is_device_iterator
// (false_type = host memory, true_type = device memory).
57 namespace detail {
58 
59 typedef ecuda::false_type host_type; // alias for ecuda::iterator_traits<>::is_device_iterator
60 typedef ecuda::true_type device_type; // alias for ecuda::iterator_traits<>::is_device_iterator
61 
62 } // namespace detail
64 
// Forward declaration of the public ecuda::copy entry point so the impl::
// overloads below can recursively delegate to it; defined at the bottom of this file.
89 template<class InputIterator,class OutputIterator> __HOST__ __DEVICE__ inline OutputIterator copy( InputIterator first, InputIterator last, OutputIterator result );
90 
92 namespace impl {
93 
// Obtain the raw/underlying pointer of an iterator by invoking its
// operator->() (the overload below handles plain pointers directly).
94 template<class Iterator>
95 inline
// NOTE(review): doc extraction dropped original line 96 here — the return type of this
// overload (some pointer type derived from Iterator); restore from the upstream copy.hpp.
97 get_iterator_pointer( Iterator& iter )
98 {
99  return iter.operator->();
100 }
101 
// Pass-through overload: a raw pointer is already its own "iterator pointer".
102 template<typename T>
103 inline
104 typename ecuda::add_pointer<T>::type
105 get_iterator_pointer( T* ptr )
106 {
107  return ptr;
108 }
109 
110 } // namespace impl
112 
114 namespace impl {
115 
116 //
117 // Start of DEVICE to DEVICE implementations
118 //
119 
120 //
121 // Source: any device memory
122 // Destination: any device memory
123 // Value types: any
124 // On Device : element-by-element copy
125 // On Host : determine contiguity and data types of device memories, generate
126 // a compile-time error if called from the host and either of the
127 // memories are non-contiguous, otherwise delegate to a same-type
128 // copy or different-type copy as appropriate
129 //
// Dispatcher declaration for device -> device copies: on the device it is a plain
// element-by-element loop; on the host it delegates to the contiguity-specific
// overloads in ecuda::impl::device_to_device (definition further below).
130 template<class InputIterator,class OutputIterator>
131 __HOST__ __DEVICE__ inline OutputIterator copy(
132  InputIterator first,
133  InputIterator last,
134  OutputIterator result,
// NOTE(review): doc extraction dropped original line 135 here — the fourth parameter,
// presumably ecuda::pair<detail::device_type,detail::device_type>; confirm against upstream.
136 );
137 // implementation makes calls to functions in the ecuda::impl::device_to_device namespace
138 
// Overloads that perform the actual device -> device transfers once the host-side
// dispatcher has established contiguity. Each overload is an element loop when
// compiled for the device, and a cudaMemcpy/cudaMemcpy2D call on the host.
139 namespace device_to_device {
140 
141 //
142 // Source: contiguous device memory
143 // Destination: contiguous device memory
144 // Value types: same
145 // On Device : element-by-element copy
146 // On Host : call ecuda::cudaMemcpy to copy sequence
147 //
148 template<class InputIterator,class OutputIterator>
149 __HOST__ __DEVICE__ inline OutputIterator copy(
150  InputIterator first, InputIterator last,
151  OutputIterator result,
// NOTE(review): doc extraction dropped original line 152 here — the contiguity-tag
// parameter (both sequences contiguous); restore from the upstream copy.hpp.
153 )
154 {
155  #ifdef __CUDA_ARCH__
156  while( first != last ) { *result = *first; ++first; ++result; }
157  return result;
158  #else
159  typedef typename std::iterator_traits<OutputIterator>::value_type value_type;
160  typename std::iterator_traits<InputIterator>::difference_type n = ecuda::distance( first, last );
// n is an element count — presumably ecuda's templated cudaMemcpy<T> wrapper
// (apiwrappers.hpp) converts it to bytes; confirm there.
161  CUDA_CALL( cudaMemcpy<value_type>( result.operator->(), first.operator->(), static_cast<std::size_t>(n), cudaMemcpyDeviceToDevice ) );
162  ecuda::advance( result, static_cast<std::size_t>(n) );
163  return result;
164  #endif
165 }
166 
167 //
168 // Source: blocks of contiguous device memory
169 // Destination: contiguous device memory
170 // Value types: same
171 // On Device : element-by-element copy
172 // On Host : call copy on each individual contiguous block
173 //
174 template<class OutputIterator,typename T,typename P>
175 __HOST__ __DEVICE__ inline OutputIterator copy(
// NOTE(review): doc extraction dropped original line 176 here — the first/last parameters
// (device_contiguous_block_iterator<T,P>); restore from the upstream copy.hpp.
177  OutputIterator result,
// NOTE(review): doc extraction dropped original line 178 here — the contiguity-tag parameter.
179 )
180 {
181  #ifdef __CUDA_ARCH__
182  while( first != last ) { *result = *first; ++first; ++result; }
183  return result;
184  #else
185  typedef typename ecuda::iterator_traits<OutputIterator>::value_type value_type;
186  typedef device_contiguous_block_iterator<T,P> input_iterator_type;
187 
188  {
189  // if there is any leading data before the memory becomes regularly aligned
190  // then copy it first
191  const std::size_t leading = ( first.get_width() - first.get_offset() );
192  if( leading < first.get_width() ) { // leading with partial row
193  typename input_iterator_type::contiguous_iterator first2 = first.contiguous_begin();
194  result = ::ecuda::copy( first2, first2 + leading, result );
195  ::ecuda::advance( first, leading );
196  }
197  }
198 
199  {
200  // memory is now guaranteed to be regularly aligned so we can use cudaMemcpy2D
201  typedef typename ecuda::add_pointer<value_type>::type pointer;
202  pointer dest = naked_cast<pointer>( result.operator->() );
203  typedef typename ecuda::add_pointer<const value_type>::type const_pointer;
204  const_pointer src = naked_cast<const_pointer>( first.operator->() );
205 
206  const size_t pitch = first.operator->().get_pitch();
207  const std::size_t width = first.get_width();
208  const std::size_t rows = ::ecuda::distance( first, last ) / width;
// dpitch for the packed destination is width*sizeof(value_type) bytes; width/rows are
// passed as counts — presumably the templated cudaMemcpy2D<T> wrapper scales width
// to bytes as cudaMemcpy2D requires; confirm in apiwrappers.hpp.
209  CUDA_CALL( cudaMemcpy2D<value_type>( dest, width*sizeof(value_type), src, pitch, width, rows, cudaMemcpyDeviceToDevice ) );
210  ::ecuda::advance( first, width*rows );
211  ::ecuda::advance( result, width*rows );
212  }
213 
214  {
215  // if there is any trailing data where the memory ceases to be regularly aligned
216  // then copy the rest of it
217  const std::size_t trailing = ::ecuda::distance( first, last ) % first.get_width();
218  if( trailing > 0 ) {
219  typename input_iterator_type::contiguous_iterator first2 = first.contiguous_begin();
220  result = ::ecuda::copy( first2, first2 + trailing, result );
221  }
222  }
223 
224  return result;
225  #endif
226 }
227 
228 //
229 // Source : contiguous device memory
230 // Destination: blocks of contiguous device memory
231 // Value types: same
232 // On Device : element-by-element copy
233 // On Host : call copy on each individual contiguous block
234 //
235 template<class InputIterator,typename T,typename P>
// NOTE(review): doc extraction dropped original line 236 here — the function opening
// ("__HOST__ __DEVICE__ inline ... copy("); restore from the upstream copy.hpp.
237  InputIterator first, InputIterator last,
// NOTE(review): doc extraction dropped original lines 238-239 here — the result parameter
// (device_contiguous_block_iterator<T,P>) and the contiguity-tag parameter.
240 )
241 {
242  #ifdef __CUDA_ARCH__
243  while( first != last ) { *result = *first; ++first; ++result; }
244  return result;
245  #else
246  typedef device_contiguous_block_iterator<T,P> output_iterator_type;
// NOTE(review): doc extraction dropped original line 247 here — presumably the
// value_type typedef used below; restore from the upstream copy.hpp.
248 
249  {
250  // if there is any leading data before the memory becomes regularly aligned
251  // then copy it first
// NOTE(review): this overload uses get_remaining_width() while the sibling overloads
// compute get_width() - get_offset(); verify these are equivalent in iterator.hpp.
252  const std::size_t leading = result.operator->().get_remaining_width();
253  if( leading < result.operator->().get_width() ) { // leading with partial row
254  ::ecuda::copy( first, first + leading, result.contiguous_begin() );
255  ::ecuda::advance( first, leading );
256  ::ecuda::advance( result, leading );
257  }
258  }
259 
260  {
261  // memory is now guaranteed to be regularly aligned so we can use cudaMemcpy2D
262  typedef typename ecuda::add_pointer<value_type>::type pointer;
263  pointer dest = naked_cast<pointer>( result.operator->() );
264  typedef typename ecuda::add_pointer<const value_type>::type const_pointer;
265  const_pointer src = naked_cast<const_pointer>( first.operator->() );
266 
267  const size_t pitch = result.operator->().get_pitch();
268  const std::size_t width = result.operator->().get_width();
269  const std::size_t rows = ::ecuda::distance( first, last ) / width;
270 
271  CUDA_CALL( cudaMemcpy2D<value_type>( dest, pitch, src, width*sizeof(value_type), width, rows, cudaMemcpyDeviceToDevice ) );
272  ::ecuda::advance( first, width*rows );
273  ::ecuda::advance( result, width*rows );
274  }
275 
276  {
277  // if there is any trailing data where the memory ceases to be regularly aligned
278  // then copy the rest of it
279  const std::size_t trailing = ::ecuda::distance( first, last ) % result.operator->().get_width();
280  if( trailing > 0 ) {
281  ::ecuda::copy( first, first + trailing, result.contiguous_begin() );
282  ::ecuda::advance( result, trailing );
283  }
284  }
285 
286  return result;
287  #endif
288 }
289 
290 //
291 // Source : blocks of contiguous device memory
292 // Destination: blocks of contiguous device memory
293 // Value types: same
294 // On Device : element-by-element copy
295 // On Host : call copy on each individual contiguous block
296 //
297 template<typename T,typename P,typename U,typename Q>
// NOTE(review): doc extraction dropped original lines 298-301 here — the full signature
// (return type, first/last as device_contiguous_block_iterator<T,P>, result as
// device_contiguous_block_iterator<U,Q>, and the contiguity tag); restore from upstream.
302 )
303 {
304  #ifdef __CUDA_ARCH__
305  while( first != last ) { *result = *first; ++first; ++result; }
306  return result;
307  #else
308  typedef device_contiguous_block_iterator<T,P> input_iterator_type;
309  typedef device_contiguous_block_iterator<U,Q> output_iterator_type;
// NOTE(review): doc extraction dropped original line 310 here — presumably the
// value_type typedef used below.
311 
// Fast path: a single cudaMemcpy2D only works when both sequences share the same
// row width and start at the same offset within a row.
312  if( ( first.get_width() == result.get_width() ) &&
313  ( first.get_offset() == result.get_offset() ) ) { // only when this is true can we do a cudaMemcpy2D call
314 
315  {
316  // if there is any leading data before the memory becomes regularly aligned
317  // then copy it first
318  const std::size_t leading = result.get_width() - result.get_offset();
319  if( leading < result.get_width() ) { // leading with partial row
320  ::ecuda::copy( first, first + leading, result.contiguous_begin() );
321  ::ecuda::advance( first, leading );
322  ::ecuda::advance( result, leading );
323  }
324  }
325 
326  {
327  // memory is now guaranteed to be regularly aligned so we can use cudaMemcpy2D
328  typedef typename ecuda::add_pointer<value_type>::type pointer;
329  pointer dest = naked_cast<pointer>( result.operator->() );
330  typedef typename ecuda::add_pointer<const value_type>::type const_pointer;
331  const_pointer src = naked_cast<const_pointer>( first.operator->() );
332  const size_t src_pitch = first.operator->().get_pitch();
333  const size_t dest_pitch = result.operator->().get_pitch();
334  const std::size_t width = result.get_width();
335  const std::size_t rows = ::ecuda::distance( first, last ) / width;
336 
337  CUDA_CALL( cudaMemcpy2D<value_type>( dest, dest_pitch, src, src_pitch, width, rows, cudaMemcpyDeviceToDevice ) );
338  ::ecuda::advance( first, width*rows );
339  ::ecuda::advance( result, width*rows );
340  }
341 
342  {
343  // if there is any trailing data where the memory ceases to be regularly aligned
344  // then copy the rest of it
345  const std::size_t trailing = ::ecuda::distance( first, last ) % result.get_width();
346  if( trailing > 0 ) {
347  ::ecuda::copy( first, first + trailing, result.contiguous_begin() );
348  ::ecuda::advance( result, trailing );
349  }
350  }
351 
352  return result;
353 
354  } // end case where both alignments are compatible
355 
356  // this will copy pieces of differently aligned memory that
357  // will work, but requires (possibly many) calls to
358  // cudaMemcpy, so there will be a performance hit
359 
360  typename input_iterator_type::difference_type n = ecuda::distance( first, last );
361  while( n > 0 ) {
// copy the largest run that is contiguous in BOTH the source and the destination
362  const std::size_t width = ecuda::min( first.get_width()-first.get_offset(), result.get_width()-result.get_offset() );
363  const std::size_t copy_width = width > n ? n : width;
364  typename input_iterator_type::contiguous_iterator first2 = first.contiguous_begin();
365  typename output_iterator_type::contiguous_iterator result2 = result.contiguous_begin();
366  ::ecuda::copy( first2, first2+copy_width, result2 );
367  first += copy_width; // original input iterator has to catch up
368  result += copy_width; // original output iterator has to catch up
369  n -= copy_width;
370  }
371  return result;
372  #endif
373 }
374 
375 } // namespace device_to_device
376 
377 //
378 // Implementation only, function declaration and documentation above.
379 //
// Host-side dispatcher for device -> device copies: statically asserts that both
// sequences are (block-)contiguous, stages through host memory when the value
// types differ, and otherwise delegates to device_to_device::copy.
// NOTE(review): doc extraction dropped original line 380 above this template —
// likely an ECUDA_SUPPRESS_HD_WARNINGS line; confirm against upstream.
381 template<class InputIterator,class OutputIterator>
382 __HOST__ __DEVICE__ inline OutputIterator copy(
383  InputIterator first,
384  InputIterator last,
385  OutputIterator result,
// NOTE(review): doc extraction dropped original line 386 here — the fourth parameter,
// presumably ecuda::pair<detail::device_type,detail::device_type>.
387 )
388 {
389  #ifdef __CUDA_ARCH__
390  while( first != last ) { *result = *first; ++first; ++result; }
391  return result;
392  #else
393 
394  typedef typename ecuda::iterator_traits<InputIterator>::is_contiguous input_contiguity;
395  typedef typename ecuda::iterator_traits<InputIterator>::iterator_category input_iterator_category;
396  {
397  // compile-time check that input iterator traverses contiguous memory
398  const bool isSomeKindOfContiguous =
399  ecuda::is_same<input_contiguity,ecuda::true_type>::value ||
400  ecuda::is_same<input_iterator_category,device_contiguous_block_iterator_tag>::value;
401  ECUDA_STATIC_ASSERT(isSomeKindOfContiguous,CANNOT_USE_NONCONTIGUOUS_DEVICE_ITERATOR_AS_SOURCE_FOR_COPY);
402  }
403 
404  typedef typename ecuda::iterator_traits<OutputIterator>::is_contiguous output_contiguity;
405  typedef typename ecuda::iterator_traits<OutputIterator>::iterator_category output_iterator_category;
406  {
407  // compile-time check that output iterator traverses contiguous memory
408  const bool isSomeKindOfContiguous =
409  ecuda::is_same<output_contiguity,ecuda::true_type>::value ||
410  ecuda::is_same<output_iterator_category,device_contiguous_block_iterator_tag>::value;
411  ECUDA_STATIC_ASSERT(isSomeKindOfContiguous,CANNOT_USE_NONCONTIGUOUS_DEVICE_ITERATOR_AS_DESTINATION_FOR_COPY);
412  }
// NOTE(review): doc extraction dropped original lines 413-414 here — presumably the
// typedefs of T and U (input/output value types) used below; restore from upstream.
415  {
416  // compile-time check that types are the same
417  // if not, copy to host staging memory, do type conversion, then copy
418  // final result to destination device memory
419  const bool isSameType = ecuda::is_same<typename ecuda::remove_const<T>::type,typename ecuda::remove_const<U>::type>::value;
420  if( !isSameType ) {
// stage device -> host (v1), convert host -> host (v2), then push host -> device
421  std::vector< typename ecuda::remove_const<T>::type, host_allocator<typename ecuda::remove_const<T>::type> > v1( std::distance( first, last ) );
422  ::ecuda::copy( first, last, v1.begin() );
423  std::vector< U, host_allocator<U> > v2( v1.size() );
424  ::ecuda::copy( v1.begin(), v1.end(), v2.begin() );
425  return ::ecuda::copy( v2.begin(), v2.end(), result );
426  } else {
// NOTE(review): doc extraction dropped original line 427 here — the same-type delegation,
// presumably a return of device_to_device::copy(first,last,result,<contiguity tag pair>).
428  }
429  }
430  #endif
431 }
432 
433 //
434 // Start of HOST to DEVICE implementations
435 //
436 
437 //
438 // Source: any host memory
439 // Destination: any device memory
440 // Value types: any
441 // On Device : compile-time assertion
442 // On Host : determine contiguity and data types of device memory, generate
443 // a compile-time error if called from the host and device memory
444 // is non-contiguous, perform a compile-time check for type equality
445 // and insert a conversion routine if necessary, then delegate to
446 // a device_contiguous or device_block_contiguous copy as
447 // appropriate
448 //
// Dispatcher declaration for host -> device copies; the definition appears after
// the host_to_device namespace below.
449 template<class InputIterator,class OutputIterator>
450 __HOST__ __DEVICE__ inline OutputIterator copy(
451  InputIterator first, InputIterator last,
452  OutputIterator result,
// NOTE(review): doc extraction dropped original line 453 here — the fourth parameter,
// presumably ecuda::pair<detail::host_type,detail::device_type>.
454 );
455 // implementation makes calls to functions in the ecuda::impl::host_to_device namespace
456 
// Overloads that perform the actual host -> device transfers; selected by the
// destination's iterator category (fully contiguous vs. contiguous blocks).
457 namespace host_to_device {
458 
459 //
460 // Source: contiguous host memory
461 // Destination: contiguous device memory
462 // Value types: same
463 // On Device : compile-time assertion
464 // On Host : copy the host memory sequence to a contiguous block, and
465 // call copy again
466 //
467 template<class InputIterator,class OutputIterator>
468 __HOST__ __DEVICE__ inline OutputIterator copy(
469  InputIterator first,
470  InputIterator last,
471  OutputIterator result,
// NOTE(review): doc extraction dropped original line 472 here — presumably the
// device_contiguous_iterator_tag parameter; restore from upstream.
473 )
474 {
475  #ifdef __CUDA_ARCH__
476  return result; // never actually gets compiled, just here to satisfy nvcc
477  #else
478  typedef typename ecuda::iterator_traits<OutputIterator>::value_type value_type;
479  const typename ecuda::iterator_traits<InputIterator>::difference_type n = ecuda::distance( first, last ); // get length of host sequence
480  typedef typename ecuda::add_pointer<value_type>::type pointer;
481  pointer dest = naked_cast<pointer>( impl::get_iterator_pointer(result) );
482  typedef typename ecuda::add_pointer<const value_type>::type const_pointer;
483  const_pointer src = naked_cast<const_pointer>( impl::get_iterator_pointer(first) );
// n is an element count — presumably ecuda's templated cudaMemcpy<T> wrapper
// (apiwrappers.hpp) converts it to bytes; confirm there.
484  CUDA_CALL( cudaMemcpy<value_type>( dest, src, static_cast<std::size_t>(n), cudaMemcpyHostToDevice ) );
485  ecuda::advance( result, static_cast<std::size_t>(n) );
486  return result;
487  #endif
488 }
489 
490 //
491 // Source: contiguous host memory
492 // Destination: disparate blocks of contiguous device memory
493 // Value types: same
494 // On Device : compile-time assertion
495 // On Host : call copy on each contiguous block of device memory
496 //
497 template<class InputIterator,typename T,typename P>
// NOTE(review): doc extraction dropped original line 498 here — the function opening
// ("__HOST__ __DEVICE__ inline ... copy("); restore from upstream.
499  InputIterator first,
500  InputIterator last,
// NOTE(review): doc extraction dropped original lines 501-502 here — the result parameter
// (device_contiguous_block_iterator<T,P>) and the device_contiguous_block_iterator_tag.
503 )
504 {
505  #ifdef __CUDA_ARCH__
506  return result; // never actually gets compiled, just here to satisfy nvcc
507  #else
508  typedef device_contiguous_block_iterator<T,P> output_iterator_type;
// NOTE(review): doc extraction dropped original line 509 here — presumably the
// value_type typedef used below.
510 
511  {
512  // if there is any leading data before the memory becomes regularly aligned
513  // then copy it first
514  const std::size_t leading = result.get_width() - result.get_offset();
515  if( leading < result.get_width() ) {
516  ::ecuda::copy( first, first + leading, result.contiguous_begin() );
517  ::ecuda::advance( first, leading );
518  ::ecuda::advance( result, leading );
519  }
520  }
521 
522  {
523  // memory is now guaranteed to be regularly aligned so we can use cudaMemcpy2D
524  typedef typename ecuda::add_pointer<value_type>::type pointer;
525  pointer dest = naked_cast<pointer>( result.operator->() );
526  typedef typename ecuda::add_pointer<const value_type>::type const_pointer;
// NOTE(review): bare get_iterator_pointer here vs. impl::get_iterator_pointer elsewhere —
// resolves to the same overload inside namespace impl, but worth normalizing upstream.
527  const_pointer src = naked_cast<const_pointer>( get_iterator_pointer(first) );
528 
529  const size_t pitch = result.operator->().get_pitch();
530  const std::size_t width = result.get_width();
531  const std::size_t rows = ::ecuda::distance( first, last ) / width;
532  CUDA_CALL( cudaMemcpy2D<value_type>( dest, pitch, src, width*sizeof(value_type), width, rows, cudaMemcpyHostToDevice ) );
533  ::ecuda::advance( first, width*rows );
534  ::ecuda::advance( result, width*rows );
535  }
536 
537  {
538  // if there is any trailing data where the memory ceases to be regularly aligned
539  // then copy the rest of it
540  const std::size_t trailing = ::ecuda::distance( first, last ) % result.get_width();
541  if( trailing > 0 ) {
542  ::ecuda::copy( first, first + trailing, result.contiguous_begin() );
543  ::ecuda::advance( result, trailing );
544  }
545  }
546 
547  return result;
548  #endif
549 }
550 
551 } // namespace host_to_device
552 
553 //
554 // Implementation only, function declaration and documentation above.
555 //
// Host-side dispatcher for host -> device copies: statically asserts the destination
// is (block-)contiguous, stages through a contiguous host vector when the source is
// non-contiguous or the value types differ, then delegates to host_to_device::copy.
// NOTE(review): doc extraction dropped original line 556 above this template —
// likely an ECUDA_SUPPRESS_HD_WARNINGS line; confirm against upstream.
557 template<class InputIterator,class OutputIterator>
558 __HOST__ __DEVICE__ inline OutputIterator copy(
559  InputIterator first, InputIterator last,
560  OutputIterator result,
561  ecuda::pair<detail::host_type,detail::device_type> memory_types // host -> device
562 )
563 {
564  #ifdef __CUDA_ARCH__
565  return result; // can never be called from device code, dummy return to satisfy nvcc
566  #else
567  // is the device iterator contiguous?
568  {
569  // compile time check that device iterator traverses contiguous memory
570  // or is at least comprised of a set of contiguous blocks
571  const bool isSomeKindOfContiguous =
572  ecuda::is_same<typename ecuda::iterator_traits<OutputIterator>::is_contiguous,ecuda::true_type>::value ||
573  ecuda::is_same<typename ecuda::iterator_traits<OutputIterator>::iterator_category,ecuda::device_contiguous_block_iterator_tag>::value;
574  ECUDA_STATIC_ASSERT(isSomeKindOfContiguous,CANNOT_USE_NONCONTIGUOUS_DEVICE_ITERATOR_AS_DESTINATION_FOR_COPY);
575  }
// NOTE(review): doc extraction dropped original lines 576-577 here — presumably the
// typedefs of T and U (input/output value types) used below; restore from upstream.
578  {
579  // run-time check that the host iterator traverses contiguous memory
580  // if not, make it so and call copy again
581  const typename std::iterator_traits<InputIterator>::pointer pStart = impl::get_iterator_pointer(first);
582  const typename std::iterator_traits<InputIterator>::pointer pEnd = impl::get_iterator_pointer(last);
// pointer span != logical distance implies the host sequence is not contiguous
583  if( (pEnd-pStart) != std::distance(first,last) ) {
584  std::vector< U, host_allocator<U> > v( first, last ); // get type conversion here for free
585  return host_to_device::copy( v.begin(), v.end(), result, typename ecuda::iterator_traits<OutputIterator>::iterator_category() );
586  }
587  }
588  // compile-time check that the input and output types are the same
589  // if not, do the conversion and call copy again
590  const bool isSameType = ecuda::is_same<T,U>::value;
591  if( !isSameType ) {
592  std::vector< U, host_allocator<U> > v( first, last ); // type conversion
593  return host_to_device::copy( v.begin(), v.end(), result, typename ecuda::iterator_traits<OutputIterator>::iterator_category() );
594  } else {
// NOTE(review): doc extraction dropped original line 595 here — the same-type delegation,
// presumably a return of host_to_device::copy(first,last,result,<output iterator category>).
596  }
597  #endif
598 }
599 
600 //
601 // Start of DEVICE to HOST implementations
602 //
603 
// Overloads that perform the actual device -> host transfers; selected by the
// source's iterator category (fully contiguous vs. contiguous blocks).
604 namespace device_to_host {
605 
606 //
607 // Source: contiguous device memory
608 // Destination: contiguous host memory
609 // Value types: same
610 // On Device : compile-time assertion
611 // On Host : call ecuda::cudaMemcpy to copy sequence
612 //
613 template<class InputIterator,class OutputIterator>
614 __HOST__ __DEVICE__ inline OutputIterator copy(
615  InputIterator first,
616  InputIterator last,
617  OutputIterator result,
618  device_contiguous_iterator_tag // contiguous
619 )
620 {
621  #ifdef __CUDA_ARCH__
622  return result; // can never be called from device code, dummy return to satisfy nvcc
623  #else
624  typedef typename ecuda::iterator_traits<OutputIterator>::value_type value_type;
625  typedef typename ecuda::add_pointer<const value_type>::type src_pointer_type;
626  typedef typename ecuda::add_pointer<value_type>::type dest_pointer_type;
627  src_pointer_type src = naked_cast<src_pointer_type>( impl::get_iterator_pointer(first) );
628  dest_pointer_type dest = naked_cast<dest_pointer_type>( impl::get_iterator_pointer(result) );
// NOTE(review): doc extraction dropped original line 629 here — the declaration of n
// (presumably ecuda::distance(first,last)), which the next two lines use; restore from upstream.
630  CUDA_CALL( cudaMemcpy<value_type>( dest, src, static_cast<std::size_t>(n), cudaMemcpyDeviceToHost ) );
631  ecuda::advance( result, static_cast<std::size_t>(n) );
632  return result;
633  #endif
634 }
635 
636 //
637 // Source: blocks of contiguous device memory
638 // Destination: contiguous host memory
639 // Value types: same
640 // On Device : compile-time assertion
641 // On Host : call copy on each contiguous block of device memory
642 //
643 template<typename T,typename P,class OutputIterator>
644 __HOST__ __DEVICE__ inline OutputIterator copy(
// NOTE(review): doc extraction dropped original lines 645-646 here — the first/last
// parameters (device_contiguous_block_iterator<T,P>); restore from upstream.
647  OutputIterator result,
648  device_contiguous_block_iterator_tag // contiguous blocks
649 )
650 {
651  #ifdef __CUDA_ARCH__
652  return result; // can never be called from device code, dummy return to satisfy nvcc
653  #else
654  typedef typename ecuda::iterator_traits<OutputIterator>::value_type value_type;
655  typedef device_contiguous_block_iterator<T,P> input_iterator_type;
656  {
657  // if there is any leading data before the memory becomes regularly aligned
658  // then copy it first
659  const std::size_t leading = first.get_width() - first.get_offset();
660  if( leading < first.get_width() ) { // leading with partial row
661  typename input_iterator_type::contiguous_iterator first2 = first.contiguous_begin();
662  result = ::ecuda::copy( first2, first2 + leading, result );
663  ::ecuda::advance( first, leading );
664  }
665  }
666 
667  {
668  // memory is now guaranteed to be regularly aligned so we can use cudaMemcpy2D
669  typedef typename ecuda::add_pointer<value_type>::type pointer;
// NOTE(review): sibling overloads wrap this in naked_cast<...>; verify the direct
// assignment here is intentional (host-side OutputIterator presumably yields a raw pointer).
670  pointer dest = get_iterator_pointer( result );
671  typedef typename ecuda::add_pointer<const value_type>::type const_pointer;
672  const_pointer src = naked_cast<const_pointer>( first.operator->() );
673 
674  const size_t pitch = first.operator->().get_pitch();
675  const std::size_t width = first.get_width();
676  const std::size_t rows = ::ecuda::distance( first, last ) / width;
677 
678  CUDA_CALL( cudaMemcpy2D<value_type>( dest, width*sizeof(value_type), src, pitch, width, rows, cudaMemcpyDeviceToHost ) );
679  ::ecuda::advance( first, width*rows );
680  ::ecuda::advance( result, width*rows );
681  }
682 
683  {
684  // if there is any trailing data where the memory ceases to be regularly aligned
685  // then copy the rest of it
686  const std::size_t trailing = ::ecuda::distance( first, last ) % first.get_width();
687  if( trailing > 0 ) {
688  typename input_iterator_type::contiguous_iterator first2 = first.contiguous_begin();
689  result = ::ecuda::copy( first2, first2 + trailing, result );
690  }
691  }
692 
693  return result;
694  #endif
695 }
696 
697 } // namespace device_to_host
698 
699 //
700 // Source: any device memory
701 // Destination: any host memory
702 // Value types: any
703 // On Device : compile-time assertion
704 // On Host : determine contiguity and data types of device memory, generate
705 // a compile-time error if called from the host and device memory
706 // is non-contiguous, perform a compile-time check for type equality
707 // and insert a conversion routine if necessary, then delegate to
708 // a device_contiguous or device_block_contiguous copy as
709 // appropriate
710 //
// Host-side dispatcher for device -> host copies: statically asserts the source is
// (block-)contiguous, stages through a contiguous host vector when the destination
// is non-contiguous or the value types differ, then delegates to device_to_host::copy.
// NOTE(review): doc extraction dropped original line 711 above this template —
// likely an ECUDA_SUPPRESS_HD_WARNINGS line; confirm against upstream.
712 template<class InputIterator,class OutputIterator>
713 __HOST__ __DEVICE__ inline OutputIterator copy(
714  InputIterator first,
715  InputIterator last,
716  OutputIterator result,
// NOTE(review): doc extraction dropped original line 717 here — the fourth parameter,
// presumably ecuda::pair<detail::device_type,detail::host_type>.
718 )
719 {
720  #ifdef __CUDA_ARCH__
721  return result; // can never be called from device code, dummy return to satisfy nvcc
722  #else
723  {
724  // compile time check that device iterator traverses contiguous memory
725  // or is at least comprised of a set of contiguous blocks
726  const bool isSomeKindOfContiguous =
727  ecuda::is_same<typename ecuda::iterator_traits<InputIterator>::is_contiguous,ecuda::true_type>::value ||
728  ecuda::is_same<typename ecuda::iterator_traits<InputIterator>::iterator_category,ecuda::device_contiguous_block_iterator_tag>::value;
729  ECUDA_STATIC_ASSERT(isSomeKindOfContiguous,CANNOT_USE_NONCONTIGUOUS_DEVICE_ITERATOR_AS_SOURCE_FOR_COPY);
730  }
// NOTE(review): doc extraction dropped original lines 731-732 here — presumably the
// typedefs of T and U (input/output value types) referenced below; restore from upstream.
733  {
734  // run time check that host iterator traverses contiguous memory
735  // if not, create a temporary container that is and re-call copy
// NOTE(review): doc extraction dropped original line 736 here — the declaration of n
// (presumably ecuda::distance(first,last)), used throughout this scope.
737  typedef const char* raw_pointer_type;
738  raw_pointer_type pStart = naked_cast<raw_pointer_type>( impl::get_iterator_pointer(result) );
739  OutputIterator result2 = result;
740  ecuda::advance( result2, n );
741  raw_pointer_type pEnd = naked_cast<raw_pointer_type>( impl::get_iterator_pointer(result2) );
// byte span != n*sizeof(value_type) implies the host destination is not contiguous
742  if( (pEnd-pStart) != ( n*sizeof(typename ecuda::iterator_traits<OutputIterator>::value_type) ) ) {
743  typedef typename ecuda::remove_const<U>::type U2; // need to strip source const otherwise this can't act as staging
744  std::vector< U2, host_allocator<U2> > v( n );
745  ::ecuda::copy( first, last, v.begin() );
746  return ::ecuda::copy( v.begin(), v.end(), result ); // get type conversion if needed, should resolve directly to std::copy
747  }
748  }
749  // compile-time check that the input and output types are the same
750  // if not, provide a temp destination of the correct type, copy
751  // there temporarily, and then do a host-to-host copy that does
752  // the type conversion
753  const bool isSameType = ecuda::is_same<T,U>::value;
754  if( !isSameType ) {
755  typedef typename ecuda::remove_const<T>::type T2; // need to strip source const otherwise this can't act as staging
756  std::vector< T2, host_allocator<T2> > v( ecuda::distance( first, last ) );
// NOTE(review): doc extraction dropped original line 757 here — presumably the staging
// copy ::ecuda::copy(first,last,v.begin()) that fills v before the conversion below.
758  return ::ecuda::copy( v.begin(), v.end(), result ); // type conversion occurs here, should resolve directly to std::copy
759  } else {
760  return device_to_host::copy( first, last, result, typename ecuda::iterator_traits<InputIterator>::iterator_category() );
761  }
762  #endif
763 }
764 
765 //
766 // Start of HOST to HOST implementations
767 //
768 
769 //
770 // Source: any host memory
771 // Destination: any host memory
772 // Value types: any
773 // On Device : compile-time assertion
774 // On Host : just delegate to std::copy
775 //
// Host -> host case: neither sequence is in device memory, so this simply
// forwards to std::copy on the host.
777 template<class InputIterator,class OutputIterator>
778 __HOST__ __DEVICE__ inline OutputIterator copy(
779  InputIterator first,
780  InputIterator last,
781  OutputIterator result,
// NOTE(review): doc extraction dropped original line 782 here — the fourth parameter,
// presumably ecuda::pair<detail::host_type,detail::host_type>.
783 )
784 {
785  #ifdef __CUDA_ARCH__
786  return result; // can never be called from device code, dummy return to satisfy nvcc
787  #else
788  return std::copy( first, last, result );
789  #endif
790 }
791 
792 } // namespace impl
794 
795 //
796 // Entry point of the ecuda::copy function.
797 //
798 
800 template<class InputIterator,class OutputIterator>
801 __HOST__ __DEVICE__ inline OutputIterator copy( InputIterator first, InputIterator last, OutputIterator result )
802 {
803  typedef typename ecuda::iterator_traits<InputIterator>::is_device_iterator input_memory_type;
804  typedef typename ecuda::iterator_traits<OutputIterator>::is_device_iterator output_memory_type;
805  return impl::copy( first, last, result, ecuda::pair<input_memory_type,output_memory_type>() );
806 }
807 
808 
809 } // namespace ecuda
810 
811 #endif
__HOST__ __DEVICE__ OutputIterator copy(InputIterator first, InputIterator last, OutputIterator result)
Replacement for std::copy.
Definition: copy.hpp:801
ECUDA_SUPPRESS_HD_WARNINGS __HOST__ __DEVICE__ void advance(InputIterator &iterator, Distance n)
Increments given iterator by n elements.
Definition: iterator.hpp:574
Iterator category denoting device memory that is made of contiguous blocks (but the blocks themselves are not necessarily contiguous with each other).
Definition: iterator.hpp:71
base_type::iterator_category iterator_category
Definition: iterator.hpp:437
#define CUDA_CALL(x)
Definition: global.hpp:83
Iterator category denoting contiguous device memory.
Definition: iterator.hpp:62
__HOST__ __DEVICE__ std::size_t get_width() const __NOEXCEPT__
Definition: iterator.hpp:327
ECUDA_SUPPRESS_HD_WARNINGS __HOST__ __DEVICE__ OutputIterator copy(InputIterator first, InputIterator last, OutputIterator result)
Replacement for std::copy.
Definition: copy.hpp:801
#define __HOST__
Definition: global.hpp:150
__HOST__ __DEVICE__ contiguous_iterator contiguous_begin() const __NOEXCEPT__
Definition: iterator.hpp:325
ecuda::false_type is_device_iterator
Definition: iterator.hpp:441
Allocator for page-locked host memory.
Definition: allocators.hpp:82
__HOST__ __DEVICE__ const T & min(const T &a, const T &b)
Definition: algorithm.hpp:48
Couples together a pair of values.
Definition: utility.hpp:53
#define ECUDA_STATIC_ASSERT(x, msg)
Definition: global.hpp:191
base_type::pointer pointer
Definition: iterator.hpp:438
base_type::difference_type difference_type
Definition: iterator.hpp:436
#define __DEVICE__
Definition: global.hpp:151
#define ECUDA_SUPPRESS_HD_WARNINGS
Definition: global.hpp:58
__HOST__ __DEVICE__ std::size_t get_offset() const __NOEXCEPT__
Definition: iterator.hpp:328
ECUDA_SUPPRESS_HD_WARNINGS __HOST__ __DEVICE__ std::iterator_traits< Iterator >::difference_type distance(const Iterator &first, const Iterator &last)
Definition: iterator.hpp:627
base_type::value_type value_type
Definition: iterator.hpp:440