24 #include <curand_kernel.h>
26 #include <type_traits>
67 template <
class T_data>
88 template <
class T_data>
91 template <
class T_data>
123 const T_data *data,
bool fromHost,
int nb_streams);
138 const T_data *data,
int nb_streams);
195 std::ostringstream stream;
205 cudaMemcpyDeviceToHost));
214 cudaMemcpy(&tmp_float,
o_data,
sizeof(T_data), cudaMemcpyDeviceToHost));
227 template <
typename T_dest>
229 template <
typename T_dest>
245 int copy_into(ipc::Cacao<T_data> *cacaoInterface);
246 int copy_from(ipc::Cacao<T_data> *cacaoInterface);
250 return cudaMemset(this->d_data, 0, this->nb_elem *
sizeof(T_data));
253 inline int reset(cudaStream_t stream) {
254 return cudaMemsetAsync(this->d_data, 0, this->nb_elem *
sizeof(T_data), stream);
259 this->d_data, value, this->nb_elem,
260 this->current_context->
get_device(this->device));
276 void clip(T_data min, T_data max, cudaStream_t stream);
277 void clip(T_data min, T_data max) {
clip(min, max, 0);};
334 int lda, T_data beta,
int ldc);
346 int prng(T_data *output,
char gtype,
float alpha,
float beta);
347 int prng(T_data *output,
char gtype,
float alpha);
348 int prng(
char gtype,
float alpha,
float beta);
349 int prng(
char gtype,
float alpha);
357 int prng_host(
char gtype, T_data stddev, T_data alpha);
375 template <
class T_data>
377 os <<
"-----------------------" << std::endl;
378 os <<
"CarmaObj<" <<
typeid(T_data).name() <<
"> object on GPU"
381 os <<
"ndims = " << ndims << std::endl;
382 for (
long dim = 0; dim < ndims; dim++) {
383 os <<
"dim[" << dim <<
"] = " << obj.
get_dims(dim + 1) << std::endl;
386 os <<
"sizeof(" <<
typeid(T_data).name() <<
") = " <<
sizeof(T_data)
388 os <<
"-----------------------" << std::endl;
393 template <
class T_data>
394 void clip_array(T_data *d_data, T_data min, T_data max,
int N,
398 template <
class T_data>
399 void reduce(
int size,
int threads,
int blocks, T_data *d_idata,
401 template <
class T_data>
404 template <
class T_data>
406 T_data *&o_data,
int N);
407 template <
class T_data>
408 void reduceCubCU(T_data *cub_data,
size_t cub_data_size, T_data *data,
409 T_data *o_data,
int N, cudaStream_t stream=0);
412 template <
class T_data>
413 int transposeCU(T_data *d_idata, T_data *d_odata,
long N1,
long N2);
416 template <
class T_data>
419 template <
class T_data>
427 curandState *state,
char gtype,
int n,
float alpha,
434 template <
class T_in,
class T_out>
436 template <
class T_in,
class T_out>
437 void carma_initfft(
const long *dims_data, cufftHandle *plan, cufftType type_plan);
438 template <
class T_in,
class T_out>
439 int CarmaFFT(T_in *input, T_out *output,
int dir, cufftHandle plan);
442 template <
class T_data>
443 int fillindex(T_data *d_odata, T_data *d_idata,
int *indx,
int N,
445 template <
class T_data>
448 int getarray2d(T *d_odata, T *d_idata,
int x0,
int Ncol,
int NC,
int N,
451 int fillarray2d(T *d_odata, T *d_idata,
int x0,
int Ncol,
int NC,
int N,
454 int fillarray2d2(T *d_odata, T *d_idata,
int x0,
int Ncol,
int NC,
int N,
475 int custom_half_axpy(half alpha, half *source,
int incx,
int incy,
int N,
491 int extract(T *d_smallimg,
const T *d_fullimg,
int fullimg_size,
int center_pos,
492 int extract_size,
bool roll);
int fill_sym_matrix(char src_uplo, T *d_data, int Ncol, int N, CarmaDevice *device)
cufftType carma_select_plan()
CarmaObj< uint16_t > CarmaObjUSI
int launch_generic1d(T_data *d_idata, T_data *d_odata, int N, CarmaDevice *device)
CarmaObj< double2 > CarmaObjD2
void carma_initfft(const long *dims_data, cufftHandle *plan, cufftType type_plan)
std::ostream & operator<<(std::ostream &os, CarmaObj< T_data > &obj)
int transposeCU(T_data *d_idata, T_data *d_odata, long N1, long N2)
CarmaObj< float2 > CarmaObjS2
CarmaObj< unsigned int > CarmaObjUI
CarmaObj< int > CarmaObjI
void reduceCubCU(T_data *cub_data, size_t cub_data_size, T_data *data, T_data *o_data, int N, cudaStream_t stream=0)
int getarray2d(T *d_odata, T *d_idata, int x0, int Ncol, int NC, int N, CarmaDevice *device)
CarmaObj< cuFloatComplex > CarmaObjC
int fillindex(T_data *d_odata, T_data *d_idata, int *indx, int N, CarmaDevice *device)
void init_reduceCubCU(T_data *&cub_data, size_t &cub_data_size, T_data *data, T_data *&o_data, int N)
int carma_plus(T *d_odata, T elpha, int N, CarmaDevice *device)
int fillarray2d2(T *d_odata, T *d_idata, int x0, int Ncol, int NC, int N, CarmaDevice *device)
void reduce(int size, int threads, int blocks, T_data *d_idata, T_data *d_odata)
int extract(T *d_smallimg, const T *d_fullimg, int fullimg_size, int center_pos, int extract_size, bool roll)
Kernel to extract a part of the image centred on center_pos.
int carma_prng_cu(T *results, const int nb_threads, const int nb_blocks, curandState *state, char gtype, int n, float alpha, float beta)
int fillarray2d(T *d_odata, T *d_idata, int x0, int Ncol, int NC, int N, CarmaDevice *device)
CarmaObj< double > CarmaObjD
int carma_plusai(T *d_odata, T *i_data, int i, int sgn, int N, CarmaDevice *device)
void clip_array(T_data *d_data, T_data min, T_data max, int N, CarmaDevice *device, cudaStream_t stream)
CarmaObj< cuDoubleComplex > CarmaObjZ
int fillvalues(T_data *d_odata, T_data *val, int N, CarmaDevice *device)
int CarmaFFT(T_in *input, T_out *output, int dir, cufftHandle plan)
int carma_curand_montagn(curandState *state, T *d_odata, int N, CarmaDevice *device)
CarmaObj< float > CarmaObjS
int launch_generic2d(T_data *d_odata, T_data *d_idata, int N1, int N2)
int carma_prng_init(int *seed, const int nb_threads, const int nb_blocks, curandState *state)
this file provides tools to CarmaObj
int fill_array_with_value(T_data *d_data, T_data value, int N, CarmaDevice *device)
#define carma_safe_call(err)
this class provides the context in which CarmaObj are created
CarmaDevice * get_device(int dev)
MemType get_malloc_type()
int * strides
Strides for each dimension.
T_data * d_data
Pointer to data.
const long * get_dims_data()
int ndims
Number of dimensions.
long get_dims_data(int i)
MemType malloc_type
type of alloc
long * dims_data
Dimensions.
int nb_elem
Number of elements.
this class provides wrappers to the generic carma host object
this class provides wrappers to the generic carma object
cudaStream_t get_cuda_stream(int stream)
CarmaObj(CarmaContext *current_context, const long *dims_data, int nb_streams)
int nb_elem
number of elements in the array
T_data * cub_data
optional data (used for scan / reduction)
T_data * get_data_at(int index)
int device2host_opt(T_data *data)
int prng_host(char gtype, T_data stddev, T_data alpha)
CarmaContext * get_context()
void copy(CarmaObj< T_data > *source, int incx, int incy)
CarmaObj(const CarmaObj< T_data > *obj)
cufftType get_type_plan()
FFT plan type.
CarmaObj(CarmaContext *current_context, const long *dims_data)
void syrk(char uplo, char transa, T_data alpha, CarmaObj< T_data > *matA, int lda, T_data beta, int ldc)
int wait_stream(int stream)
CarmaObj(const CarmaObj &)=delete
const T_data operator[](int index) const
CarmaObj(CarmaContext *current_context, const std::vector< long > &dims)
CarmaContext * current_context
void axpy(T_data alpha, CarmaObj< T_data > *source, int incx, int incy, int offset=0)
int host2device_mat(const T_data *data, int lda, int ldb)
void swap_ptr(T_data *ptr)
CarmaObj(CarmaContext *current_context, const long *dims_data, const T_data *data, int nb_streams)
void gemm(char transa, char transb, T_data alpha, CarmaObj< T_data > *matA, int lda, CarmaObj< T_data > *matB, int ldb, T_data beta, int ldc)
void dgmm(char side, CarmaObj< T_data > *matA, int lda, CarmaObj< T_data > *vectx, int incx, int ldc)
int device2host(T_dest *data)
int init_prng_host(int seed)
int device2host_mat(T_data *data, int lda, int ldb)
void symm(char side, char uplo, T_data alpha, CarmaObj< T_data > *matA, int lda, CarmaObj< T_data > *matB, int ldb, T_data beta, int ldc)
unsigned int * values
optional data (used for sort)
size_t * d_num_valid
used for compact
int copy_from(const T_data *data, int nb_elem)
int prng_host(char gtype)
int copy_from_async(const T_data *data, int nb_elem, cudaStream_t stream)
void clip(T_data min, T_data max, cudaStream_t stream)
CarmaObj(CarmaContext *current_context, const long *dims_data, const T_data *data)
T_data dot(CarmaObj< T_data > *source, int incx, int incy)
void geam(char transa, char transb, T_data alpha, CarmaObj< T_data > *matA, int lda, T_data beta, CarmaObj< T_data > *matB, int ldb, int ldc)
CarmaObj(CarmaContext *current_context, const CarmaObj< T_data > *obj, int nb_streams)
void symv(char uplo, T_data alpha, CarmaObj< T_data > *matA, int lda, CarmaObj< T_data > *vectx, int incx, T_data beta, int incy)
cufftHandle * get_plan()
FFT plan.
void swap(CarmaObj< T_data > *source, int incx, int incy)
int prng(char gtype, float alpha)
void gemv(char trans, T_data alpha, CarmaObj< T_data > *matA, int lda, CarmaObj< T_data > *vectx, int incx, T_data beta, int incy)
void syrkx(char uplo, char transa, T_data alpha, CarmaObj< T_data > *matA, int lda, CarmaObj< T_data > *matB, int ldb, T_data beta, int ldc)
int device2host_async(T_data *data, cudaStream_t stream)
int copy_into(T_data *data, int nb_elem)
int host2device(const T_dest *data)
int prng(char gtype, float alpha, float beta)
T_data * d_data
Input data => change to vector.
int prng_montagn(float init_montagn)
int get_nb_streams() const
int transpose(CarmaObj< T_data > *source)
int device2host_vect(T_data *data, int incx, int incy)
cufftType type_plan
FFT plan type.
void scale(T_data alpha, int incx)
int host2device_async(const T_data *data, cudaStream_t stream)
void ger(T_data alpha, CarmaObj< T_data > *vectx, int incx, CarmaObj< T_data > *vecty, int incy, int lda)
int reset(cudaStream_t stream)
int device
device where the CarmaObj is allocate
long * dims_data
dimensions of the array
CarmaObj(CarmaContext *current_context, const CarmaObj< T_data > *obj)
int prng(T_data *output, char gtype, float alpha, float beta)
std::vector< T_data > h_data
void init(CarmaContext *current_context, const long *dims_data, const T_data *data, bool fromHost, int nb_streams)
void rot(CarmaObj< T_data > *source, int incx, int incy, T_data sc, T_data ss)
T_data * o_data
optional data (used for scan / reduction)
unsigned int * get_values()
optional data (used for sort)
cufftHandle plan
FFT plan.
const T_data get_o_data_value() const
int host2device_vect(const T_data *data, int incx, int incy)
this class provides the stream features to CarmaObj
int wait_stream(int stream)
cudaStream_t get_stream(int stream)
int roll(T *idata, int N, int M, int nim, CarmaDevice *device)