Extended CUDA Library (ecuda)
2.0
|
Classes | |
class | host_allocator |
Allocator for page-locked host memory. More... | |
class | device_allocator |
Allocator for device memory. More... | |
class | device_pitch_allocator |
Allocator for hardware aligned device memory. More... | |
class | array |
A fixed-size array stored in device memory. More... | |
class | cube |
A resizable cube stored in device memory. More... | |
class | cuda_error |
Exception for CUDA API cudaError_t errors. More... | |
class | device |
Encapsulates CUDA API device information functions. More... | |
class | event |
Encapsulates CUDA API event objects and functions. More... | |
class | padded_ptr |
A specialized pointer to padded memory. More... | |
struct | device_iterator_tag |
Iterator category denoting device memory. More... | |
struct | device_contiguous_iterator_tag |
Iterator category denoting contiguous device memory. More... | |
struct | device_contiguous_block_iterator_tag |
Iterator category denoting device memory that is made of contiguous blocks (but the blocks themselves are non-contiguous). More... | |
class | device_iterator |
class | device_contiguous_iterator |
class | device_contiguous_block_iterator |
class | reverse_device_iterator |
class | iterator_traits |
class | iterator_traits< device_iterator< T, PointerType, Category > > |
class | iterator_traits< device_contiguous_iterator< T > > |
class | iterator_traits< device_contiguous_block_iterator< T, P > > |
class | iterator_traits< reverse_device_iterator< Iterator > > |
class | iterator_traits< T * > |
class | matrix |
A resizable matrix stored in device memory. More... | |
struct | owner_less |
struct | owner_less< shared_ptr< T > > |
struct | default_device_delete |
The default destruction policy used by smart pointers to device memory. More... | |
struct | default_host_delete |
The default destruction policy used by smart pointers to page-locked host memory. More... | |
class | shared_ptr |
A smart pointer that retains shared ownership of an object in device memory. More... | |
class | striding_padded_ptr |
A specialized pointer to striding memory. More... | |
class | striding_ptr |
A specialized pointer to striding memory. More... | |
class | unique_ptr |
A smart pointer that retains sole ownership of an object. More... | |
struct | pair |
Couples together a pair of values. More... | |
class | vector |
A resizable vector stored in device memory. More... | |
Functions | |
template<class InputIterator , class OutputIterator > | |
__HOST__ __DEVICE__ OutputIterator | copy (InputIterator first, InputIterator last, OutputIterator result) |
Replacement for std::copy. More... | |
template<class InputIterator , class OutputIterator > | |
ECUDA_SUPPRESS_HD_WARNINGS __HOST__ __DEVICE__ OutputIterator | copy (InputIterator first, InputIterator last, OutputIterator result) |
Replacement for std::copy. More... | |
template<class InputIterator , typename T > | |
ECUDA_SUPPRESS_HD_WARNINGS __HOST__ __DEVICE__ ecuda::iterator_traits < InputIterator > ::difference_type | count (InputIterator first, InputIterator last, const T &value) |
template<class InputIterator , class UnaryPredicate > | |
ECUDA_SUPPRESS_HD_WARNINGS __HOST__ __DEVICE__ ecuda::iterator_traits < InputIterator > ::difference_type | count_if (InputIterator first, InputIterator last, UnaryPredicate p) |
template<class InputIterator1 , class InputIterator2 > | |
__HOST__ __DEVICE__ bool | equal (InputIterator1 first1, InputIterator1 last1, InputIterator2 first2) |
Replacement for std::equal. More... | |
template<class InputIterator1 , class InputIterator2 > | |
ECUDA_SUPPRESS_HD_WARNINGS __HOST__ __DEVICE__ bool | equal (InputIterator1 first1, InputIterator1 last1, InputIterator2 first2) |
Replacement for std::equal. More... | |
template<class ForwardIterator , typename T > | |
__HOST__ __DEVICE__ void | fill (ForwardIterator first, ForwardIterator last, const T &val) |
template<class ForwardIterator , typename T > | |
ECUDA_SUPPRESS_HD_WARNINGS __HOST__ __DEVICE__ void | fill (ForwardIterator first, ForwardIterator last, const T &val) |
template<class InputIterator , typename T > | |
ECUDA_SUPPRESS_HD_WARNINGS __HOST__ __DEVICE__ InputIterator | find (InputIterator first, InputIterator last, const T &value) |
template<class InputIterator , class UnaryPredicate > | |
ECUDA_SUPPRESS_HD_WARNINGS __HOST__ __DEVICE__ InputIterator | find_if (InputIterator first, InputIterator last, UnaryPredicate p) |
template<class InputIterator , class UnaryFunction > | |
ECUDA_SUPPRESS_HD_WARNINGS __HOST__ __DEVICE__ UnaryFunction | for_each (InputIterator first, InputIterator last, UnaryFunction f) |
template<class InputIterator1 , class InputIterator2 > | |
__HOST__ __DEVICE__ bool | lexicographical_compare (InputIterator1 first1, InputIterator1 last1, InputIterator2 first2, InputIterator2 last2) |
template<class InputIterator1 , class InputIterator2 > | |
ECUDA_SUPPRESS_HD_WARNINGS __HOST__ __DEVICE__ bool | lexicographical_compare (InputIterator1 first1, InputIterator1 last1, InputIterator2 first2, InputIterator2 last2) |
template<class ForwardIterator > | |
__HOST__ __DEVICE__ ForwardIterator | max_element (ForwardIterator first, ForwardIterator last) |
template<class InputIterator1 , class InputIterator2 > | |
ECUDA_SUPPRESS_HD_WARNINGS __HOST__ __DEVICE__ ecuda::pair< InputIterator1, InputIterator2 > | mismatch (InputIterator1 first1, InputIterator1 last1, InputIterator2 first2) |
template<class BidirectionalIterator > | |
__HOST__ __DEVICE__ void | reverse (BidirectionalIterator first, BidirectionalIterator last) |
template<class BidirectionalIterator > | |
ECUDA_SUPPRESS_HD_WARNINGS __HOST__ __DEVICE__ void | reverse (BidirectionalIterator first, BidirectionalIterator last) |
template<typename T > | |
__HOST__ __DEVICE__ const T & | min (const T &a, const T &b) |
template<typename T , class Compare > | |
__HOST__ __DEVICE__ const T & | min (const T &a, const T &b, Compare cmp) |
template<typename T > | |
__HOST__ __DEVICE__ const T & | max (const T &a, const T &b) |
template<typename T , class Compare > | |
__HOST__ __DEVICE__ const T & | max (const T &a, const T &b, Compare cmp) |
template<typename T > | |
__HOST__ __DEVICE__ void | swap (T &a, T &b) __NOEXCEPT__ |
template<class InputIterator , class UnaryPredicate > | |
ECUDA_SUPPRESS_HD_WARNINGS __HOST__ __DEVICE__ bool | any_of (InputIterator first, InputIterator last, UnaryPredicate p) |
template<class InputIterator , class UnaryPredicate > | |
ECUDA_SUPPRESS_HD_WARNINGS __HOST__ __DEVICE__ bool | none_of (InputIterator first, InputIterator last, UnaryPredicate p) |
template<typename T > | |
cudaError_t | cudaMemcpy (T *dest, const T *src, const size_t count, cudaMemcpyKind kind) |
Wrapper around CUDA API function cudaMemcpy. More... | |
template<typename T > | |
cudaError_t | cudaMemcpy2D (T *dest, const size_t dpitch, const T *src, const size_t spitch, const size_t width, const size_t height, cudaMemcpyKind kind) |
Wrapper around CUDA API function cudaMemcpy2D. More... | |
cudaError_t | cudaMemset (char *devPtr, const char &value, const size_t count) |
Re-implementation of CUDA API function cudaMemset that enforces a single-byte value. More... | |
template<typename T > | |
cudaError_t | cudaMemset (T *devPtr, const T &value, const size_t count) |
Re-implementation of CUDA API function cudaMemset that allows for any data type. More... | |
cudaError_t | cudaMemset2D (char *devPtr, const size_t pitch, const char &value, const size_t width, const size_t height) |
Re-implementation of CUDA API function cudaMemset2D that enforces a single-byte value. More... | |
template<typename T > | |
cudaError_t | cudaMemset2D (T *devPtr, const size_t pitch, const T &value, const size_t width, const size_t height) |
Re-implementation of CUDA API function cudaMemset2D that allows for any data type. More... | |
template<typename T > | |
cudaError_t | cudaMemcpyToSymbol (T *dest, const T *src, size_t count=1, size_t offset=0, enum cudaMemcpyKind kind=cudaMemcpyHostToDevice) |
template<typename T > | |
cudaError_t | cudaMemcpyToSymbol (T &dest, const T &src, enum cudaMemcpyKind kind=cudaMemcpyHostToDevice) |
__DEVICE__ void | threadfence () |
template<class InputIterator , typename Distance > | |
ECUDA_SUPPRESS_HD_WARNINGS __HOST__ __DEVICE__ void | advance (InputIterator &iterator, Distance n) |
Increments given iterator by n elements. More... | |
template<class Iterator > | |
ECUDA_SUPPRESS_HD_WARNINGS __HOST__ __DEVICE__ std::iterator_traits< Iterator > ::difference_type | distance (const Iterator &first, const Iterator &last) |
template<typename T , typename P > | |
__HOST__ __DEVICE__ std::iterator_traits < device_contiguous_block_iterator < T, P > >::difference_type | distance (const device_contiguous_block_iterator< T, P > &first, const device_contiguous_block_iterator< T, P > &last) |
template<typename T , class Alloc1 , class Alloc2 > | |
__HOST__ void | matrix_copy (matrix< T, Alloc1 > &dest, const matrix< T, Alloc2 > &src, typename matrix< T, Alloc2 >::size_type offsetRow=0, typename matrix< T, Alloc2 >::size_type offsetColumn=0) |
Copies some or all of a source matrix to a destination matrix. More... | |
template<typename T , class Alloc1 , class Alloc2 > | |
__HOST__ void | matrix_swap (matrix< T, Alloc1 > &mat1, matrix< T, Alloc2 > &mat2, typename matrix< T, Alloc1 >::size_type numberRows=0, typename matrix< T, Alloc1 >::size_type numberColumns=0, typename matrix< T, Alloc1 >::size_type offsetRow1=0, typename matrix< T, Alloc1 >::size_type offsetColumn1=0, typename matrix< T, Alloc2 >::size_type offsetRow2=0, typename matrix< T, Alloc2 >::size_type offsetColumn2=0) |
Swaps some or all of a source matrix with a destination matrix. More... | |
template<typename T , class Alloc > | |
__HOST__ void | matrix_transpose (matrix< T, Alloc > &src) |
template<class InputIterator , typename T > | |
__HOST__ __DEVICE__ T | accumulate (InputIterator first, InputIterator last, T init) |
Computes the sum of a sequence of elements. More... | |
template<class InputIterator , typename T , class BinaryOperation > | |
__HOST__ __DEVICE__ T | accumulate (InputIterator first, InputIterator last, T init, BinaryOperation op) |
Computes the sum of a sequence of elements. More... | |
|
inline |
Computes the sum of a sequence of elements.
Computes the sum of the given value init and the elements in the range [first,last).
first,last | the range of elements to sum |
init | initial value of the sum |
Definition at line 122 of file numeric.hpp.
|
inline |
Computes the sum of a sequence of elements.
Computes the sum of the given value init and the elements in the range [first,last). The sum is calculated using the binary operation function op that should have a signature equivalent to:
first,last | the range of elements to sum |
init | initial value of the sum |
op | binary operation function object that will be applied |
Definition at line 143 of file numeric.hpp.
|
inline |
Increments given iterator by n elements.
If n is negative, the iterator is decremented. This function will work on both iterators of host and device memory. However, if the iterator refers to non-contiguous device memory and this function is called from host code an assertion will fail at compile-time.
iterator | iterator to be advanced |
n | number of elements iterator should be advanced |
Definition at line 574 of file iterator.hpp.
|
inline |
Definition at line 80 of file algorithm.hpp.
|
inline |
Replacement for std::copy.
ecuda::copy is identical to std::copy, but can be a) called from device code, and b) supports device memory when called from host code.
Compile-time checks are performed to determine which action should be taken. If called from device code, then it must be true that both the input and output refer to device memory (otherwise nvcc will fail before evaluating the ecuda::copy call) and the copying is done on-device. If called from host code and both the input and output refer to host memory, the evaluation is delegated to std::copy. If called from host code, and one or both of the input and output refer to device memory, there is a compile-time assertion that fails if the device memory is non-contiguous. Otherwise, a call to cudaMemcpy is performed with parameters depending on the input and output memory types (e.g. if input is host and if output is device, then cudaMemcpy is called with cudaMemcpyHostToDevice used as the cudaMemcpyKind parameter). In addition, when one or both of the input and output iterators refer to device memory, a call to ecuda::copy from host code results in a compile-time check to determine if the value_types of the input and output iterators are the same. If not, and the call is on host code, host staging memory is allocated to perform the type conversion.
first,last | the range of elements to copy |
result | the beginning of the destination range |
|
inline |
Replacement for std::copy.
ecuda::copy is identical to std::copy, but can be a) called from device code, and b) supports device memory when called from host code.
Compile-time checks are performed to determine which action should be taken. If called from device code, then it must be true that both the input and output refer to device memory (otherwise nvcc will fail before evaluating the ecuda::copy call) and the copying is done on-device. If called from host code and both the input and output refer to host memory, the evaluation is delegated to std::copy. If called from host code, and one or both of the input and output refer to device memory, there is a compile-time assertion that fails if the device memory is non-contiguous. Otherwise, a call to cudaMemcpy is performed with parameters depending on the input and output memory types (e.g. if input is host and if output is device, then cudaMemcpy is called with cudaMemcpyHostToDevice used as the cudaMemcpyKind parameter). In addition, when one or both of the input and output iterators refer to device memory, a call to ecuda::copy from host code results in a compile-time check to determine if the value_types of the input and output iterators are the same. If not, and the call is on host code, host staging memory is allocated to perform the type conversion.
first,last | the range of elements to copy |
result | the beginning of the destination range |
|
inline |
|
inline |
Definition at line 91 of file count_if.hpp.
|
inline |
Wrapper around CUDA API function cudaMemcpy.
Copies a contiguous block of memory holding count elements of type T to another contiguous block of memory.
dest | Pointer to destination memory. |
src | Pointer to source memory. |
count | Number of elements to copy. |
kind | Type of transfer (cudaMemcpyDeviceToHost, cudaMemcpyDeviceToDevice, cudaMemcpyHostToDevice) |
Definition at line 62 of file apiwrappers.hpp.
|
inline |
Wrapper around CUDA API function cudaMemcpy2D.
Copies a matrix of width*height elements of type T from a contiguous memory block with a given pitch (in bytes) to another contiguous memory block with a given pitch (in bytes).
dest | Pointer to destination memory. |
dpitch | Pitch (in bytes) of destination memory. |
src | Pointer to source memory. |
spitch | Pitch (in bytes) of source memory. |
width | Width of matrix. |
height | Height of matrix. |
kind | Type of transfer (cudaMemcpyDeviceToHost, cudaMemcpyDeviceToDevice, cudaMemcpyHostToDevice) |
Definition at line 84 of file apiwrappers.hpp.
|
inline |
Definition at line 210 of file apiwrappers.hpp.
|
inline |
Definition at line 216 of file apiwrappers.hpp.
|
inline |
Re-implementation of CUDA API function cudaMemset that enforces a single-byte value.
This implementation simply calls the CUDA API cudaMemset function since the value argument is explicitly stated as single byte.
devPtr | Pointer to device memory. |
value | Value to set for each element. |
count | The number of elements to set. |
Definition at line 124 of file apiwrappers.hpp.
|
inline |
Re-implementation of CUDA API function cudaMemset that allows for any data type.
The CUDA API cudaMemset function allows only a single-byte value to be specified. This implementation allows any arbitrary data type and value to be specified. The function checks if value is represented by a single byte or, if multibyte, that each byte in the value is the same. If this is true, the CUDA API cudaMemset function can be used. If not, then a staging block of host memory is first filled with the value and then copied to the device memory. Thus, this function is more general but keep in mind that there will be a performance hit if the provided value is not represented by a concatenation of the same single byte.
devPtr | Pointer to device memory. |
value | Value to set for each element. |
count | The number of elements to set. |
Definition at line 147 of file apiwrappers.hpp.
|
inline |
Re-implementation of CUDA API function cudaMemset2D that enforces a single-byte value.
This implementation simply calls the CUDA API cudaMemset2D function since the value argument is explicitly stated as single byte.
devPtr | Pointer to 2D device memory. |
pitch | Pitch in bytes of 2D device memory. |
value | Value to set for each element. |
width | Width of matrix. |
height | Height of matrix. |
Definition at line 170 of file apiwrappers.hpp.
cudaError_t ecuda::cudaMemset2D | ( | T * | devPtr, |
const size_t | pitch, | ||
const T & | value, | ||
const size_t | width, | ||
const size_t | height | ||
) |
Re-implementation of CUDA API function cudaMemset2D that allows for any data type.
The CUDA API cudaMemset2D function allows only a single-byte value to be specified. This implementation allows any arbitrary data type and value to be specified. The function checks if value is represented by a single byte or, if multibyte, that each byte in the value is the same. If this is true, the CUDA API cudaMemset2D function can be used. If not, then a staging block of host memory is first filled with the value and then copied to the device memory. Thus, this function is more general but keep in mind that there will be a performance hit if the provided value is not represented by a concatenation of the same single byte.
devPtr | Pointer to 2D device memory. |
pitch | Pitch in bytes of 2D device memory. |
value | Value to set for each element. |
width | Width of matrix. |
height | Height of matrix. |
Definition at line 195 of file apiwrappers.hpp.
|
inline |
Definition at line 627 of file iterator.hpp.
|
inline |
Definition at line 636 of file iterator.hpp.
|
inline |
Replacement for std::equal.
ecuda::equal is identical to std::equal, but can be a) called from device code, and b) supports device memory when called from host code.
Compile-time checks are performed to determine which action should be taken. If called from device code, then it must be true that both ranges refer to device memory (otherwise nvcc will fail before evaluating the ecuda::equal call) and the comparison between ranges is done on-device. If called from host code and both ranges refer to host memory, the evaluation is delegated to std::equal. If called from host code, and one or both ranges refer to device memory, the range(s) are copied to temporary host memory before delegating to std::equal.
|
inline |
Replacement for std::equal.
ecuda::equal is identical to std::equal, but can be a) called from device code, and b) supports device memory when called from host code.
Compile-time checks are performed to determine which action should be taken. If called from device code, then it must be true that both ranges refer to device memory (otherwise nvcc will fail before evaluating the ecuda::equal call) and the comparison between ranges is done on-device. If called from host code and both ranges refer to host memory, the evaluation is delegated to std::equal. If called from host code, and one or both ranges refer to device memory, the range(s) are copied to temporary host memory before delegating to std::equal.
|
inline |
|
inline |
|
inline |
|
inline |
Definition at line 94 of file find_if.hpp.
|
inline |
Definition at line 89 of file for_each.hpp.
|
inline |
Definition at line 143 of file lexicographical_compare.hpp.
|
inline |
Definition at line 143 of file lexicographical_compare.hpp.
__HOST__ void ecuda::matrix_copy | ( | matrix< T, Alloc1 > & | dest, |
const matrix< T, Alloc2 > & | src, | ||
typename matrix< T, Alloc2 >::size_type | offsetRow = 0 , |
||
typename matrix< T, Alloc2 >::size_type | offsetColumn = 0 |
||
) |
Copies some or all of a source matrix to a destination matrix.
The subset of the source matrix can be specified by the offsetRow and offsetColumn parameters as well as the size of the destination matrix. If the destination matrix is larger than needed in either dimension the extra elements remain unaltered.
For example, to copy a subset of a matrix:
dest | the destination matrix |
src | the source matrix |
offsetRow | offset in the starting row of the source matrix (default: 0) |
offsetColumn | offset in the starting column of the source matrix (default: 0) |
Definition at line 783 of file matrix.hpp.
__HOST__ void ecuda::matrix_swap | ( | matrix< T, Alloc1 > & | mat1, |
matrix< T, Alloc2 > & | mat2, | ||
typename matrix< T, Alloc1 >::size_type | numberRows = 0 , |
||
typename matrix< T, Alloc1 >::size_type | numberColumns = 0 , |
||
typename matrix< T, Alloc1 >::size_type | offsetRow1 = 0 , |
||
typename matrix< T, Alloc1 >::size_type | offsetColumn1 = 0 , |
||
typename matrix< T, Alloc2 >::size_type | offsetRow2 = 0 , |
||
typename matrix< T, Alloc2 >::size_type | offsetColumn2 = 0 |
||
) |
Swaps some or all of a source matrix with a destination matrix.
The subset of the two matrices can be specified with the offsetRow1, offsetColumn1, offsetRow2, offsetColumn2 parameters along with the numberRows and numberColumns parameters which are applied to both of the matrices.
If either of the subsets exceeds the bounds of its matrix in either dimension, a std::out_of_range exception is thrown.
mat1 | a matrix whose contents are to be swapped |
mat2 | the other matrix whose contents are to be swapped |
numberRows | the number of rows to swap |
numberColumns | the number of columns to swap |
offsetRow1 | the starting row in mat1 that will be swapped (default:0) |
offsetColumn1 | the starting column in mat1 that will be swapped (default:0) |
offsetRow2 | the starting row in mat2 that will be swapped (default:0) |
offsetColumn2 | the starting column in mat2 that will be swapped (default:0) |
std::out_of_range | thrown if the specified bounds of either matrix exceed its actual dimensions |
Definition at line 815 of file matrix.hpp.
__HOST__ void ecuda::matrix_transpose | ( | matrix< T, Alloc > & | src | ) |
Definition at line 839 of file matrix.hpp.
|
inline |
Definition at line 51 of file algorithm.hpp.
|
inline |
Definition at line 52 of file algorithm.hpp.
|
inline |
Definition at line 94 of file max_element.hpp.
|
inline |
Definition at line 48 of file algorithm.hpp.
|
inline |
Definition at line 49 of file algorithm.hpp.
|
inline |
Definition at line 132 of file mismatch.hpp.
|
inline |
Definition at line 88 of file algorithm.hpp.
|
inline |
Definition at line 104 of file reverse.hpp.
|
inline |
Definition at line 104 of file reverse.hpp.
|
inline |
Definition at line 54 of file algorithm.hpp.
|
inline |
Definition at line 231 of file apiwrappers.hpp.