/*
 * Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
 *
 * Please refer to the NVIDIA end user license agreement (EULA) associated
 * with this source code for terms and conditions that govern your use of
 * this software. Any use, reproduction, disclosure, or distribution of
 * this software and related documentation outside the terms of the EULA
 * is strictly prohibited.
 */

#ifndef _CARMA_CONVOLUTIONFFT2D_CUH_
#define _CARMA_CONVOLUTIONFFT2D_CUH_

#include "convolutionFFT2D_common.h"

// Set to 1 to fetch input elements through texture references instead of
// plain global-memory loads.
#define USE_TEXTURE 0
#define POWER_OF_TWO 1

#if (USE_TEXTURE)
texture<float, 1, cudaReadModeElementType> texFloat;
#define LOAD_FLOAT(i) tex1Dfetch(texFloat, i)
#define SET_FLOAT_BASE carma_safe_call(cudaBindTexture(0, texFloat, d_Src))
#else
#define LOAD_FLOAT(i) d_Src[i]
#define SET_FLOAT_BASE
#endif

////////////////////////////////////////////////////////////////////////////////
// Position convolution kernel center at (0, 0) in the image
////////////////////////////////////////////////////////////////////////////////
__global__ void padKernel_kernel(float *d_Dst, float *d_Src, int fftH, int fftW,
                                 int kernelH, int kernelW, int kernelY,
                                 int kernelX) {
  const int y = blockDim.y * blockIdx.y + threadIdx.y;
  const int x = blockDim.x * blockIdx.x + threadIdx.x;

  if (y < kernelH && x < kernelW) {
    int ky = y - kernelY;
    if (ky < 0) ky += fftH;
    int kx = x - kernelX;
    if (kx < 0) kx += fftW;
    d_Dst[ky * fftW + kx] = LOAD_FLOAT(y * kernelW + x);
  }
}
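
// Illustrative host-side launch sketch for padKernel_kernel. This is not part
// of the CARMA API: the block shape, the ceiling-divide expressions and the
// buffer names d_PaddedKernel / d_Kernel are placeholders.
//
//   dim3 block(32, 8);
//   dim3 grid((kernelW + block.x - 1) / block.x,
//             (kernelH + block.y - 1) / block.y);
//   // d_PaddedKernel must hold fftH * fftW floats and be zeroed beforehand.
//   // When USE_TEXTURE is enabled, the source pointer must be named d_Src in
//   // the calling scope so that SET_FLOAT_BASE binds texFloat to it.
//   padKernel_kernel<<<grid, block>>>(d_PaddedKernel, d_Kernel, fftH, fftW,
//                                     kernelH, kernelW, kernelY, kernelX);
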
__global__ void pad_kernel_3d_kernel(float *d_Dst, float *d_Src, int fftH,
                                     int fftW, int kernelH, int kernelW,
                                     int kernelY, int kernelX, int nim) {
  const int y = blockDim.y * blockIdx.y + threadIdx.y;
  const int x = blockDim.x * blockIdx.x + threadIdx.x;
  const int z = blockDim.z * blockIdx.z + threadIdx.z;

  if (y < kernelH && x < kernelW && z < nim) {
    int ky = y - kernelY;
    if (ky < 0) ky += fftH;
    int kx = x - kernelX;
    if (kx < 0) kx += fftW;
    // Per-slice offsets into the padded (destination) and original (source) stacks
    int kz_dst = z * (fftH * fftW);
    int kz_src = z * (kernelH * kernelW);

    d_Dst[ky * fftW + kx + kz_dst] = LOAD_FLOAT(y * kernelW + x + kz_src);
  }
}

////////////////////////////////////////////////////////////////////////////////
// Prepare data for "pad to border" addressing mode
////////////////////////////////////////////////////////////////////////////////
__global__ void pad_data_clamp_to_border_kernel(float *d_Dst, float *d_Src,
                                                int fftH, int fftW, int dataH,
                                                int dataW, int kernelH, int kernelW,
                                                int kernelY, int kernelX) {
  const int y = blockDim.y * blockIdx.y + threadIdx.y;
  const int x = blockDim.x * blockIdx.x + threadIdx.x;
  const int borderH = dataH + kernelY;
  const int borderW = dataW + kernelX;

  if (y < fftH && x < fftW) {
    int dy, dx;

    if (y < dataH) dy = y;
    if (x < dataW) dx = x;
    if (y >= dataH && y < borderH) dy = dataH - 1;
    if (x >= dataW && x < borderW) dx = dataW - 1;
    if (y >= borderH) dy = 0;
    if (x >= borderW) dx = 0;

    d_Dst[y * fftW + x] = LOAD_FLOAT(dy * dataW + dx);
  }
}
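
// Worked example of the clamp-to-border index mapping above (values are
// illustrative only): with dataW = 8, kernelX = 3 and fftW = 16, borderW = 11,
// so a destination column x maps to the source column dx as
//   x in [0 .. 7]   -> dx = x    (interior: plain copy)
//   x in [8 .. 10]  -> dx = 7    (right border: clamp to the last column)
//   x in [11 .. 15] -> dx = 0    (wrap-around region: clamp to the first column)
// The same scheme applies to rows through dataH, kernelY and borderH.
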
__global__ void pad_data_clamp_to_border_3d_kernel(float *d_Dst, float *d_Src,
                                                    int fftH, int fftW, int dataH,
                                                    int dataW, int kernelH,
                                                    int kernelW, int kernelY,
                                                    int kernelX, int nim) {
  const int y = blockDim.y * blockIdx.y + threadIdx.y;
  const int x = blockDim.x * blockIdx.x + threadIdx.x;
  const int z = blockDim.z * blockIdx.z + threadIdx.z;

  const int borderH = dataH + kernelY;
  const int borderW = dataW + kernelX;

  if (y < fftH && x < fftW && z < nim) {
    int dy, dx;
    int kz_dst = z * (fftH * fftW);
    int kz_src = z * (dataH * dataW);

    if (y < dataH) dy = y;
    if (x < dataW) dx = x;
    if (y >= dataH && y < borderH) dy = dataH - 1;
    if (x >= dataW && x < borderW) dx = dataW - 1;
    if (y >= borderH) dy = 0;
    if (x >= borderW) dx = 0;

    d_Dst[y * fftW + x + kz_dst] = LOAD_FLOAT(dy * dataW + dx + kz_src);
  }
}

////////////////////////////////////////////////////////////////////////////////
// Modulate Fourier image of padded data by Fourier image of padded kernel
// and normalize by FFT size
////////////////////////////////////////////////////////////////////////////////
inline __device__ void mulAndScale(fComplex &a, const fComplex &b,
                                   const float &c) {
  fComplex t = {c * (a.x * b.x - a.y * b.y), c * (a.y * b.x + a.x * b.y)};
  a = t;
}

__global__ void modulate_and_normalize_kernel(fComplex *d_Dst, fComplex *d_Src,
                                              int dataSize, float c) {
  const int i = blockDim.x * blockIdx.x + threadIdx.x;
  if (i >= dataSize) return;

  fComplex a = d_Src[i];
  fComplex b = d_Dst[i];

  mulAndScale(a, b, c);

  d_Dst[i] = a;
}
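
// Illustrative launch sketch for modulate_and_normalize_kernel (an assumption,
// not taken from this header): after forward-transforming both padded buffers
// with cuFFT R2C plans, the spectra can be multiplied in place, folding the
// 1 / (fftW * fftH) FFT normalization into c. The spectrum names are
// placeholders; the result lands in the first (d_Dst) argument.
//
//   const int dataSize = fftH * (fftW / 2 + 1);  // cuFFT R2C output size
//   modulate_and_normalize_kernel<<<(dataSize + 255) / 256, 256>>>(
//       d_DataSpectrum, d_KernelSpectrum, dataSize,
//       1.0f / (float)(fftH * fftW));
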
////////////////////////////////////////////////////////////////////////////////
// 2D R2C / C2R post/preprocessing kernels
////////////////////////////////////////////////////////////////////////////////
#if (USE_TEXTURE)
texture<fComplex, 1, cudaReadModeElementType> texComplexA;
texture<fComplex, 1, cudaReadModeElementType> texComplexB;
#define LOAD_FCOMPLEX(i) tex1Dfetch(texComplexA, i)
#define LOAD_FCOMPLEX_A(i) tex1Dfetch(texComplexA, i)
#define LOAD_FCOMPLEX_B(i) tex1Dfetch(texComplexB, i)

#define SET_FCOMPLEX_BASE carma_safe_call(cudaBindTexture(0, texComplexA, d_Src))
#define SET_FCOMPLEX_BASE_A \
  carma_safe_call(cudaBindTexture(0, texComplexA, d_SrcA))
#define SET_FCOMPLEX_BASE_B \
  carma_safe_call(cudaBindTexture(0, texComplexB, d_SrcB))
#else
#define LOAD_FCOMPLEX(i) d_Src[i]
#define LOAD_FCOMPLEX_A(i) d_SrcA[i]
#define LOAD_FCOMPLEX_B(i) d_SrcB[i]

#define SET_FCOMPLEX_BASE
#define SET_FCOMPLEX_BASE_A
#define SET_FCOMPLEX_BASE_B
#endif

inline __device__ void spPostprocessC2C(fComplex &D1, fComplex &D2,
                                        const fComplex &twiddle) {
  float A1 = 0.5f * (D1.x + D2.x);
  float B1 = 0.5f * (D1.y - D2.y);
  float A2 = 0.5f * (D1.y + D2.y);
  float B2 = 0.5f * (D1.x - D2.x);

  D1.x = A1 + (A2 * twiddle.x + B2 * twiddle.y);
  D1.y = (A2 * twiddle.y - B2 * twiddle.x) + B1;
  D2.x = A1 - (A2 * twiddle.x + B2 * twiddle.y);
  D2.y = (A2 * twiddle.y - B2 * twiddle.x) - B1;
}
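
// In complex form, the arithmetic above computes (writing W for the twiddle
// and conj() for complex conjugation):
//   Fe  = 0.5f * (D1 + conj(D2))        // A1 + j*B1: "even" half-spectrum
//   Fo  = -j * 0.5f * (D1 - conj(D2))   // A2 - j*B2: "odd"  half-spectrum
//   D1' = Fe + W * Fo
//   D2' = conj(Fe - W * Fo)
// which is the standard recombination step that recovers a real-to-complex
// spectrum from a half-size complex-to-complex FFT (a restatement of the code,
// for reference).
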
// Premultiply by 2 to account for 1.0 / (DZ * DY * DX) normalization
inline __device__ void spPreprocessC2C(fComplex &D1, fComplex &D2,
                                       const fComplex &twiddle) {
  float A1 = /* 0.5f * */ (D1.x + D2.x);
  float B1 = /* 0.5f * */ (D1.y - D2.y);
  float A2 = /* 0.5f * */ (D1.y + D2.y);
  float B2 = /* 0.5f * */ (D1.x - D2.x);

  D1.x = A1 - (A2 * twiddle.x - B2 * twiddle.y);
  D1.y = (B2 * twiddle.x + A2 * twiddle.y) + B1;
  D2.x = A1 + (A2 * twiddle.x - B2 * twiddle.y);
  D2.y = (B2 * twiddle.x + A2 * twiddle.y) - B1;
}

inline __device__ void getTwiddle(fComplex &twiddle, float phase) {
  __sincosf(phase, &twiddle.y, &twiddle.x);
}

inline __device__ uint mod(uint a, uint DA) {
  // (DA - a) % DA, assuming a <= DA
  return a ? (DA - a) : a;
}

static inline uint factorRadix2(uint &log2N, uint n) {
  if (!n) {
    log2N = 0;
    return 0;
  } else {
    for (log2N = 0; n % 2 == 0; n /= 2, log2N++)
      ;
    return n;
  }
}

inline __device__ void udivmod(uint &dividend, uint divisor, uint &rem) {
#if (!POWER_OF_TWO)
  rem = dividend % divisor;
  dividend /= divisor;
#else
  rem = dividend & (divisor - 1);
  dividend >>= (__ffs(divisor) - 1);
#endif
}
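
// Small worked examples for the helpers above (illustrative values only):
//   mod(3, 8) == 5, mod(0, 8) == 0     // index of the Hermitian-symmetric element
//   udivmod(i, 8, r) with i == 27      // POWER_OF_TWO path: r = 27 & 7 = 3,
//                                      // i >>= 3, so i becomes 3 (== 27 / 8)
// Note that the POWER_OF_TWO fast path requires every divisor passed to
// udivmod (DX, DX / 2, DY, DY / 2 below) to be a power of two.
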
__global__ void sp_postprocess_2d_kernel(fComplex *d_Dst, fComplex *d_Src,
                                         uint DY, uint DX, uint threadCount,
                                         uint padding, float phaseBase) {
  const uint threadId = blockIdx.x * blockDim.x + threadIdx.x;
  if (threadId >= threadCount) return;

  uint x, y, i = threadId;
  udivmod(i, DX / 2, x);
  udivmod(i, DY, y);

  const uint srcOffset = i * DY * DX;
  const uint dstOffset = i * DY * (DX + padding);

  // Process x = [0 .. DX / 2 - 1] U [DX / 2 + 1 .. DX]
  {
    const uint loadPos1 = srcOffset + y * DX + x;
    const uint loadPos2 = srcOffset + mod(y, DY) * DX + mod(x, DX);
    const uint storePos1 = dstOffset + y * (DX + padding) + x;
    const uint storePos2 = dstOffset + mod(y, DY) * (DX + padding) + (DX - x);

    fComplex D1 = LOAD_FCOMPLEX(loadPos1);
    fComplex D2 = LOAD_FCOMPLEX(loadPos2);

    fComplex twiddle;
    getTwiddle(twiddle, phaseBase * (float)x);
    spPostprocessC2C(D1, D2, twiddle);

    d_Dst[storePos1] = D1;
    d_Dst[storePos2] = D2;
  }

  // Process x = DX / 2
  if (x == 0) {
    const uint loadPos1 = srcOffset + y * DX + DX / 2;
    const uint loadPos2 = srcOffset + mod(y, DY) * DX + DX / 2;
    const uint storePos1 = dstOffset + y * (DX + padding) + DX / 2;
    const uint storePos2 = dstOffset + mod(y, DY) * (DX + padding) + DX / 2;

    fComplex D1 = LOAD_FCOMPLEX(loadPos1);
    fComplex D2 = LOAD_FCOMPLEX(loadPos2);

    // twiddle = getTwiddle(phaseBase * (DX / 2)) = exp(dir * j * PI / 2)
    fComplex twiddle = {0, (phaseBase > 0) ? 1.0f : -1.0f};
    spPostprocessC2C(D1, D2, twiddle);

    d_Dst[storePos1] = D1;
    d_Dst[storePos2] = D2;
  }
}

__global__ void sp_preprocess_2d_kernel(fComplex *d_Dst, fComplex *d_Src, uint DY,
                                        uint DX, uint threadCount, uint padding,
                                        float phaseBase) {
  const uint threadId = blockIdx.x * blockDim.x + threadIdx.x;
  if (threadId >= threadCount) return;

  uint x, y, i = threadId;
  udivmod(i, DX / 2, x);
  udivmod(i, DY, y);

  // Avoid overwrites in columns 0 and DX / 2 by different threads (lower and
  // upper halves). Otherwise correctness for in-place transformations is affected.
  if ((x == 0) && (y > DY / 2)) return;

  const uint srcOffset = i * DY * (DX + padding);
  const uint dstOffset = i * DY * DX;

  // Process x = [0 .. DX / 2 - 1] U [DX / 2 + 1 .. DX]
  {
    const uint loadPos1 = srcOffset + y * (DX + padding) + x;
    const uint loadPos2 = srcOffset + mod(y, DY) * (DX + padding) + (DX - x);
    const uint storePos1 = dstOffset + y * DX + x;
    const uint storePos2 = dstOffset + mod(y, DY) * DX + mod(x, DX);

    fComplex D1 = LOAD_FCOMPLEX(loadPos1);
    fComplex D2 = LOAD_FCOMPLEX(loadPos2);

    fComplex twiddle;
    getTwiddle(twiddle, phaseBase * (float)x);
    spPreprocessC2C(D1, D2, twiddle);

    d_Dst[storePos1] = D1;
    d_Dst[storePos2] = D2;
  }

  // Process x = DX / 2
  if (x == 0) {
    const uint loadPos1 = srcOffset + y * (DX + padding) + DX / 2;
    const uint loadPos2 = srcOffset + mod(y, DY) * (DX + padding) + DX / 2;
    const uint storePos1 = dstOffset + y * DX + DX / 2;
    const uint storePos2 = dstOffset + mod(y, DY) * DX + DX / 2;

    fComplex D1 = LOAD_FCOMPLEX(loadPos1);
    fComplex D2 = LOAD_FCOMPLEX(loadPos2);

    // twiddle = getTwiddle(phaseBase * (DX / 2)) = exp(-dir * j * PI / 2)
    fComplex twiddle = {0, (phaseBase > 0) ? 1.0f : -1.0f};
    spPreprocessC2C(D1, D2, twiddle);

    d_Dst[storePos1] = D1;
    d_Dst[storePos2] = D2;
  }
}

////////////////////////////////////////////////////////////////////////////////
// Combined sp_postprocess_2d + modulate_and_normalize + sp_preprocess_2d
////////////////////////////////////////////////////////////////////////////////
__global__ void sp_process_2d_kernel(fComplex *d_Dst, fComplex *d_SrcA,
                                     fComplex *d_SrcB, uint DY, uint DX,
                                     uint threadCount, float phaseBase, float c) {
  const uint threadId = blockIdx.x * blockDim.x + threadIdx.x;
  if (threadId >= threadCount) return;

  uint x, y, i = threadId;
  udivmod(i, DX, x);
  udivmod(i, DY / 2, y);

  const uint offset = i * DY * DX;

  // Avoid overwrites in rows 0 and DY / 2 by different threads (left and right
  // halves). Otherwise correctness for in-place transformations is affected.
  if ((y == 0) && (x > DX / 2)) return;

  fComplex twiddle;

  // Process y = [0 .. DY / 2 - 1] U [DY - (DY / 2) + 1 .. DY - 1]
  {
    const uint pos1 = offset + y * DX + x;
    const uint pos2 = offset + mod(y, DY) * DX + mod(x, DX);

    fComplex D1 = LOAD_FCOMPLEX_A(pos1);
    fComplex D2 = LOAD_FCOMPLEX_A(pos2);
    fComplex K1 = LOAD_FCOMPLEX_B(pos1);
    fComplex K2 = LOAD_FCOMPLEX_B(pos2);
    getTwiddle(twiddle, phaseBase * (float)x);

    spPostprocessC2C(D1, D2, twiddle);
    spPostprocessC2C(K1, K2, twiddle);
    mulAndScale(D1, K1, c);
    mulAndScale(D2, K2, c);
    spPreprocessC2C(D1, D2, twiddle);

    d_Dst[pos1] = D1;
    d_Dst[pos2] = D2;
  }

  // Process y = DY / 2 (the twiddle depends only on x, so it is reused here)
  if (y == 0) {
    const uint pos1 = offset + (DY / 2) * DX + x;
    const uint pos2 = offset + (DY / 2) * DX + mod(x, DX);

    fComplex D1 = LOAD_FCOMPLEX_A(pos1);
    fComplex D2 = LOAD_FCOMPLEX_A(pos2);
    fComplex K1 = LOAD_FCOMPLEX_B(pos1);
    fComplex K2 = LOAD_FCOMPLEX_B(pos2);

    spPostprocessC2C(D1, D2, twiddle);
    spPostprocessC2C(K1, K2, twiddle);
    mulAndScale(D1, K1, c);
    mulAndScale(D2, K2, c);
    spPreprocessC2C(D1, D2, twiddle);

    d_Dst[pos1] = D1;
    d_Dst[pos2] = D2;
  }
}
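
// Illustrative launch sketch for sp_process_2d_kernel (an assumption based on
// the index decomposition above, not taken from this header): one thread per
// (row, column) pair in the lower half of the spectrum.
//
//   const uint threadCount = (DY / 2) * DX;
//   const float phaseBase = dir * (float)M_PI / (float)DY;  // dir = +1 or -1
//   SET_FCOMPLEX_BASE_A;  // binds texComplexA to d_SrcA when USE_TEXTURE is on
//   SET_FCOMPLEX_BASE_B;  // binds texComplexB to d_SrcB when USE_TEXTURE is on
//   sp_process_2d_kernel<<<(threadCount + 255) / 256, 256>>>(
//       d_Dst, d_SrcA, d_SrcB, DY, DX, threadCount, phaseBase,
//       0.5f / (float)(DY * DX));  // 0.5f: spPreprocessC2C premultiplies by 2
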
#endif  // _CARMA_CONVOLUTIONFFT2D_CUH_