COMPASS 5.0.0
End-to-end AO simulation tool using GPU acceleration
convolutionFFT2D.cuh
/*
 * Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
 *
 * Please refer to the NVIDIA end user license agreement (EULA) associated
 * with this source code for terms and conditions that govern your use of
 * this software. Any use, reproduction, disclosure, or distribution of
 * this software and related documentation outside the terms of the EULA
 * is strictly prohibited.
 *
 */

#ifndef _CARMA_CONVOLUTIONFFT2D_CUH_
#define _CARMA_CONVOLUTIONFFT2D_CUH_

#include "convolutionFFT2D_common.h"

#define USE_TEXTURE 1
#define POWER_OF_TWO 1

#if (USE_TEXTURE)
texture<float, 1, cudaReadModeElementType> texFloat;
#define LOAD_FLOAT(i) tex1Dfetch(texFloat, i)
#define SET_FLOAT_BASE carma_safe_call(cudaBindTexture(0, texFloat, d_Src))
#else
#define LOAD_FLOAT(i) d_Src[i]
#define SET_FLOAT_BASE
#endif

////////////////////////////////////////////////////////////////////////////////
// Position convolution kernel center at (0, 0) in the image
////////////////////////////////////////////////////////////////////////////////
__global__ void padKernel_kernel(float *d_Dst, float *d_Src, int fftH, int fftW,
                                 int kernelH, int kernelW, int kernelY,
                                 int kernelX) {
  const int y = blockDim.y * blockIdx.y + threadIdx.y;
  const int x = blockDim.x * blockIdx.x + threadIdx.x;

  if (y < kernelH && x < kernelW) {
    int ky = y - kernelY;
    if (ky < 0) ky += fftH;
    int kx = x - kernelX;
    if (kx < 0) kx += fftW;
    d_Dst[ky * fftW + kx] = LOAD_FLOAT(y * kernelW + x);
  }
}
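
// ----------------------------------------------------------------------------
// Usage sketch (illustration only, not part of the original NVIDIA/COMPASS
// sources): one plausible host-side wrapper for padKernel_kernel. The wrapper
// name and the 32x8 block shape are assumptions; only the ceil-div grid math
// and the zero-fill of the padded destination are implied by the kernel above.
// ----------------------------------------------------------------------------
static inline void example_pad_kernel(float *d_Dst, float *d_Src, int fftH,
                                      int fftW, int kernelH, int kernelW,
                                      int kernelY, int kernelX) {
  // The kernel only writes kernelH * kernelW pixels of the fftH * fftW plane,
  // so the destination must be cleared first.
  carma_safe_call(cudaMemset(d_Dst, 0, fftH * fftW * sizeof(float)));

  dim3 threads(32, 8);
  dim3 grid((kernelW + threads.x - 1) / threads.x,
            (kernelH + threads.y - 1) / threads.y);
  SET_FLOAT_BASE;  // binds texFloat to d_Src when USE_TEXTURE is enabled
  padKernel_kernel<<<grid, threads>>>(d_Dst, d_Src, fftH, fftW, kernelH,
                                      kernelW, kernelY, kernelX);
}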

__global__ void pad_kernel_3d_kernel(float *d_Dst, float *d_Src, int fftH,
                                     int fftW, int kernelH, int kernelW,
                                     int kernelY, int kernelX, int nim) {
  const int y = blockDim.y * blockIdx.y + threadIdx.y;
  const int x = blockDim.x * blockIdx.x + threadIdx.x;
  const int z = blockDim.z * blockIdx.z + threadIdx.z;

  if (y < kernelH && x < kernelW && z < nim) {
    int ky = y - kernelY;
    if (ky < 0) ky += fftH;
    int kx = x - kernelX;
    if (kx < 0) kx += fftW;
    int kz_dst = z * (fftH * fftW);
    int kz_src = z * (kernelH * kernelW);

    d_Dst[ky * fftW + kx + kz_dst] = LOAD_FLOAT(y * kernelW + x + kz_src);
  }
}
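
// ----------------------------------------------------------------------------
// Usage sketch (illustration only): the batched variant adds a z dimension
// over the nim images, so the launch needs a 3D grid. The 16x8x4 block shape
// is an assumption, not taken from the COMPASS sources.
// ----------------------------------------------------------------------------
static inline void example_pad_kernel_3d(float *d_Dst, float *d_Src, int fftH,
                                         int fftW, int kernelH, int kernelW,
                                         int kernelY, int kernelX, int nim) {
  carma_safe_call(
      cudaMemset(d_Dst, 0, (size_t)nim * fftH * fftW * sizeof(float)));
  dim3 threads(16, 8, 4);
  dim3 grid((kernelW + threads.x - 1) / threads.x,
            (kernelH + threads.y - 1) / threads.y,
            (nim + threads.z - 1) / threads.z);
  SET_FLOAT_BASE;
  pad_kernel_3d_kernel<<<grid, threads>>>(d_Dst, d_Src, fftH, fftW, kernelH,
                                          kernelW, kernelY, kernelX, nim);
}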

////////////////////////////////////////////////////////////////////////////////
// Prepare data for "pad to border" addressing mode
////////////////////////////////////////////////////////////////////////////////
__global__ void pad_data_clamp_to_border_kernel(float *d_Dst, float *d_Src,
                                                int fftH, int fftW, int dataH,
                                                int dataW, int kernelH,
                                                int kernelW, int kernelY,
                                                int kernelX) {
  const int y = blockDim.y * blockIdx.y + threadIdx.y;
  const int x = blockDim.x * blockIdx.x + threadIdx.x;
  const int borderH = dataH + kernelY;
  const int borderW = dataW + kernelX;

  if (y < fftH && x < fftW) {
    int dy, dx;

    if (y < dataH) dy = y;
    if (x < dataW) dx = x;
    if (y >= dataH && y < borderH) dy = dataH - 1;
    if (x >= dataW && x < borderW) dx = dataW - 1;
    if (y >= borderH) dy = 0;
    if (x >= borderW) dx = 0;

    d_Dst[y * fftW + x] = LOAD_FLOAT(dy * dataW + dx);
  }
}
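
// ----------------------------------------------------------------------------
// Reference sketch (illustration only): the same clamp-to-border addressing on
// the host, e.g. for validating the kernel on small sizes. Rows and columns
// past the data are clamped to the last valid sample up to the border, then
// wrap to index 0 for the cyclic tail of the FFT plane.
// ----------------------------------------------------------------------------
static inline void example_pad_data_clamp_to_border_cpu(
    float *h_Dst, const float *h_Src, int fftH, int fftW, int dataH, int dataW,
    int kernelY, int kernelX) {
  const int borderH = dataH + kernelY;
  const int borderW = dataW + kernelX;
  for (int y = 0; y < fftH; y++)
    for (int x = 0; x < fftW; x++) {
      int dy = (y < dataH) ? y : (y < borderH) ? dataH - 1 : 0;
      int dx = (x < dataW) ? x : (x < borderW) ? dataW - 1 : 0;
      h_Dst[y * fftW + x] = h_Src[dy * dataW + dx];
    }
}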

__global__ void pad_data_clamp_to_border_3d_kernel(float *d_Dst, float *d_Src,
                                                   int fftH, int fftW,
                                                   int dataH, int dataW,
                                                   int kernelH, int kernelW,
                                                   int kernelY, int kernelX,
                                                   int nim) {
  const int y = blockDim.y * blockIdx.y + threadIdx.y;
  const int x = blockDim.x * blockIdx.x + threadIdx.x;
  const int z = blockDim.z * blockIdx.z + threadIdx.z;

  const int borderH = dataH + kernelY;
  const int borderW = dataW + kernelX;

  if (y < fftH && x < fftW && z < nim) {
    int dy, dx;
    int kz_dst = z * (fftH * fftW);
    int kz_src = z * (dataH * dataW);

    if (y < dataH) dy = y;
    if (x < dataW) dx = x;
    if (y >= dataH && y < borderH) dy = dataH - 1;
    if (x >= dataW && x < borderW) dx = dataW - 1;
    if (y >= borderH) dy = 0;
    if (x >= borderW) dx = 0;

    d_Dst[y * fftW + x + kz_dst] = LOAD_FLOAT(dy * dataW + dx + kz_src);
  }
}

////////////////////////////////////////////////////////////////////////////////
// Modulate Fourier image of padded data by Fourier image of padded kernel
// and normalize by FFT size
////////////////////////////////////////////////////////////////////////////////
inline __device__ void mulAndScale(fComplex &a, const fComplex &b,
                                   const float &c) {
  fComplex t = {c * (a.x * b.x - a.y * b.y), c * (a.y * b.x + a.x * b.y)};
  a = t;
}
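
// Worked example (added for clarity): mulAndScale computes a <- c * (a * b)
// with complex a and b, i.e. c * ((ax*bx - ay*by) + j*(ay*bx + ax*by)).
// For a = 1 + 2j, b = 3 + 4j, c = 0.5: a <- 0.5 * (-5 + 10j) = -2.5 + 5j.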

__global__ void modulate_and_normalize_kernel(fComplex *d_Dst, fComplex *d_Src,
                                              int dataSize, float c) {
  const int i = blockDim.x * blockIdx.x + threadIdx.x;
  if (i >= dataSize) return;

  fComplex a = d_Src[i];
  fComplex b = d_Dst[i];

  mulAndScale(a, b, c);

  d_Dst[i] = a;
}
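
// ----------------------------------------------------------------------------
// Usage sketch (illustration only): a plausible launch for the frequency-
// domain product of two cuFFT R2C transforms of an fftH x fftW plane. The
// hermitian-packed size fftH * (fftW / 2 + 1) and the 1 / (fftH * fftW)
// normalization follow standard cuFFT conventions; the 256-thread block is an
// assumption.
// ----------------------------------------------------------------------------
static inline void example_modulate_and_normalize(fComplex *d_Dst,
                                                  fComplex *d_Src, int fftH,
                                                  int fftW) {
  const int dataSize = fftH * (fftW / 2 + 1);
  const float c = 1.0f / (float)(fftH * fftW);
  modulate_and_normalize_kernel<<<(dataSize + 255) / 256, 256>>>(d_Dst, d_Src,
                                                                 dataSize, c);
}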

////////////////////////////////////////////////////////////////////////////////
// 2D R2C / C2R post/preprocessing kernels
////////////////////////////////////////////////////////////////////////////////
#if (USE_TEXTURE)
texture<fComplex, 1, cudaReadModeElementType> texComplexA;
texture<fComplex, 1, cudaReadModeElementType> texComplexB;
#define LOAD_FCOMPLEX(i) tex1Dfetch(texComplexA, i)
#define LOAD_FCOMPLEX_A(i) tex1Dfetch(texComplexA, i)
#define LOAD_FCOMPLEX_B(i) tex1Dfetch(texComplexB, i)

#define SET_FCOMPLEX_BASE carma_safe_call(cudaBindTexture(0, texComplexA, d_Src))
#define SET_FCOMPLEX_BASE_A \
  carma_safe_call(cudaBindTexture(0, texComplexA, d_SrcA))
#define SET_FCOMPLEX_BASE_B \
  carma_safe_call(cudaBindTexture(0, texComplexB, d_SrcB))
#else
#define LOAD_FCOMPLEX(i) d_Src[i]
#define LOAD_FCOMPLEX_A(i) d_SrcA[i]
#define LOAD_FCOMPLEX_B(i) d_SrcB[i]

#define SET_FCOMPLEX_BASE
#define SET_FCOMPLEX_BASE_A
#define SET_FCOMPLEX_BASE_B
#endif

inline __device__ void spPostprocessC2C(fComplex &D1, fComplex &D2,
                                        const fComplex &twiddle) {
  float A1 = 0.5f * (D1.x + D2.x);
  float B1 = 0.5f * (D1.y - D2.y);
  float A2 = 0.5f * (D1.y + D2.y);
  float B2 = 0.5f * (D1.x - D2.x);

  D1.x = A1 + (A2 * twiddle.x + B2 * twiddle.y);
  D1.y = (A2 * twiddle.y - B2 * twiddle.x) + B1;
  D2.x = A1 - (A2 * twiddle.x + B2 * twiddle.y);
  D2.y = (A2 * twiddle.y - B2 * twiddle.x) - B1;
}

// Premultiply by 2 to account for 1.0 / (DZ * DY * DX) normalization
inline __device__ void spPreprocessC2C(fComplex &D1, fComplex &D2,
                                       const fComplex &twiddle) {
  float A1 = /* 0.5f * */ (D1.x + D2.x);
  float B1 = /* 0.5f * */ (D1.y - D2.y);
  float A2 = /* 0.5f * */ (D1.y + D2.y);
  float B2 = /* 0.5f * */ (D1.x - D2.x);

  D1.x = A1 - (A2 * twiddle.x - B2 * twiddle.y);
  D1.y = (B2 * twiddle.x + A2 * twiddle.y) + B1;
  D2.x = A1 + (A2 * twiddle.x - B2 * twiddle.y);
  D2.y = (B2 * twiddle.x + A2 * twiddle.y) - B1;
}

inline __device__ void getTwiddle(fComplex &twiddle, float phase) {
  __sincosf(phase, &twiddle.y, &twiddle.x);
}

inline __device__ uint mod(uint a, uint DA) {
  // (DA - a) % DA, assuming a <= DA
  return a ? (DA - a) : a;
}
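
// Example (added for clarity): mod() maps an index to its mirror position in
// a cyclic dimension of size DA, e.g. mod(3, 8) == 5 and mod(0, 8) == 0.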

static inline uint factorRadix2(uint &log2N, uint n) {
  if (!n) {
    log2N = 0;
    return 0;
  } else {
    for (log2N = 0; n % 2 == 0; n /= 2, log2N++)
      ;
    return n;
  }
}
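
// Example (added for clarity): factorRadix2 strips all factors of two and
// returns the odd remainder, so factorRadix2(log2N, 48) returns 3 with
// log2N == 4; a return value of 1 means n was a power of two.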

inline __device__ void udivmod(uint &dividend, uint divisor, uint &rem) {
#if (!POWER_OF_TWO)
  rem = dividend % divisor;
  dividend /= divisor;
#else
  rem = dividend & (divisor - 1);
  dividend >>= (__ffs(divisor) - 1);
#endif
}
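
// Note (added for clarity): the POWER_OF_TWO branch replaces the division by
// bit operations and is only valid when divisor is a power of two, e.g.
// udivmod(d = 23, 8, r) leaves d == 2 and r == 7 on both paths.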

__global__ void sp_postprocess_2d_kernel(fComplex *d_Dst, fComplex *d_Src,
                                         uint DY, uint DX, uint threadCount,
                                         uint padding, float phaseBase) {
  const uint threadId = blockIdx.x * blockDim.x + threadIdx.x;
  if (threadId >= threadCount) return;

  uint x, y, i = threadId;
  udivmod(i, DX / 2, x);
  udivmod(i, DY, y);

  const uint srcOffset = i * DY * DX;
  const uint dstOffset = i * DY * (DX + padding);

  // Process x = [0 .. DX / 2 - 1] U [DX / 2 + 1 .. DX]
  {
    const uint loadPos1 = srcOffset + y * DX + x;
    const uint loadPos2 = srcOffset + mod(y, DY) * DX + mod(x, DX);
    const uint storePos1 = dstOffset + y * (DX + padding) + x;
    const uint storePos2 = dstOffset + mod(y, DY) * (DX + padding) + (DX - x);

    fComplex D1 = LOAD_FCOMPLEX(loadPos1);
    fComplex D2 = LOAD_FCOMPLEX(loadPos2);

    fComplex twiddle;
    getTwiddle(twiddle, phaseBase * (float)x);
    spPostprocessC2C(D1, D2, twiddle);

    d_Dst[storePos1] = D1;
    d_Dst[storePos2] = D2;
  }

  // Process x = DX / 2
  if (x == 0) {
    const uint loadPos1 = srcOffset + y * DX + DX / 2;
    const uint loadPos2 = srcOffset + mod(y, DY) * DX + DX / 2;
    const uint storePos1 = dstOffset + y * (DX + padding) + DX / 2;
    const uint storePos2 = dstOffset + mod(y, DY) * (DX + padding) + DX / 2;

    fComplex D1 = LOAD_FCOMPLEX(loadPos1);
    fComplex D2 = LOAD_FCOMPLEX(loadPos2);

    // twiddle = getTwiddle(phaseBase * (DX / 2)) = exp(dir * j * PI / 2)
    fComplex twiddle = {0, (phaseBase > 0) ? 1.0f : -1.0f};
    spPostprocessC2C(D1, D2, twiddle);

    d_Dst[storePos1] = D1;
    d_Dst[storePos2] = D2;
  }
}
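
// ----------------------------------------------------------------------------
// Usage sketch (illustration only): one plausible launch for the kernel above.
// The decomposition udivmod(i, DX / 2, x); udivmod(i, DY, y) implies one
// thread per (x, y) pair with x in [0, DX / 2), i.e. batch * DY * (DX / 2)
// threads, and the in-kernel comment exp(dir * j * PI / 2) implies
// phaseBase = dir * PI / DX. The 256-thread block is an assumption.
// ----------------------------------------------------------------------------
static inline void example_sp_postprocess_2d(fComplex *d_Dst, fComplex *d_Src,
                                             uint batch, uint DY, uint DX,
                                             uint padding, int dir) {
  const uint threadCount = batch * DY * (DX / 2);
  const float phaseBase = dir * 3.141592654f / (float)DX;  // dir * pi / DX
  SET_FCOMPLEX_BASE;  // binds texComplexA to d_Src when USE_TEXTURE is on
  sp_postprocess_2d_kernel<<<(threadCount + 255) / 256, 256>>>(
      d_Dst, d_Src, DY, DX, threadCount, padding, phaseBase);
}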

__global__ void sp_preprocess_2d_kernel(fComplex *d_Dst, fComplex *d_Src,
                                        uint DY, uint DX, uint threadCount,
                                        uint padding, float phaseBase) {
  const uint threadId = blockIdx.x * blockDim.x + threadIdx.x;
  if (threadId >= threadCount) return;

  uint x, y, i = threadId;
  udivmod(i, DX / 2, x);
  udivmod(i, DY, y);

  // Avoid overwrites in columns 0 and DX / 2 by different threads (lower and
  // upper halves)
  if ((x == 0) && (y > DY / 2)) return;

  const uint srcOffset = i * DY * (DX + padding);
  const uint dstOffset = i * DY * DX;

  // Process x = [0 .. DX / 2 - 1] U [DX / 2 + 1 .. DX]
  {
    const uint loadPos1 = srcOffset + y * (DX + padding) + x;
    const uint loadPos2 = srcOffset + mod(y, DY) * (DX + padding) + (DX - x);
    const uint storePos1 = dstOffset + y * DX + x;
    const uint storePos2 = dstOffset + mod(y, DY) * DX + mod(x, DX);

    fComplex D1 = LOAD_FCOMPLEX(loadPos1);
    fComplex D2 = LOAD_FCOMPLEX(loadPos2);

    fComplex twiddle;
    getTwiddle(twiddle, phaseBase * (float)x);
    spPreprocessC2C(D1, D2, twiddle);

    d_Dst[storePos1] = D1;
    d_Dst[storePos2] = D2;
  }

  // Process x = DX / 2
  if (x == 0) {
    const uint loadPos1 = srcOffset + y * (DX + padding) + DX / 2;
    const uint loadPos2 = srcOffset + mod(y, DY) * (DX + padding) + DX / 2;
    const uint storePos1 = dstOffset + y * DX + DX / 2;
    const uint storePos2 = dstOffset + mod(y, DY) * DX + DX / 2;

    fComplex D1 = LOAD_FCOMPLEX(loadPos1);
    fComplex D2 = LOAD_FCOMPLEX(loadPos2);

    // twiddle = getTwiddle(phaseBase * (DX / 2)) = exp(-dir * j * PI / 2)
    fComplex twiddle = {0, (phaseBase > 0) ? 1.0f : -1.0f};
    spPreprocessC2C(D1, D2, twiddle);

    d_Dst[storePos1] = D1;
    d_Dst[storePos2] = D2;
  }
}

////////////////////////////////////////////////////////////////////////////////
// Combined sp_postprocess_2d + modulate_and_normalize + sp_preprocess_2d
////////////////////////////////////////////////////////////////////////////////
__global__ void sp_process_2d_kernel(fComplex *d_Dst, fComplex *d_SrcA,
                                     fComplex *d_SrcB, uint DY, uint DX,
                                     uint threadCount, float phaseBase,
                                     float c) {
  const uint threadId = blockIdx.x * blockDim.x + threadIdx.x;
  if (threadId >= threadCount) return;

  uint x, y, i = threadId;
  udivmod(i, DX, x);
  udivmod(i, DY / 2, y);

  const uint offset = i * DY * DX;

  // Avoid overwrites in rows 0 and DY / 2 by different threads (left and
  // right halves); otherwise correctness of in-place transformations is
  // affected
  if ((y == 0) && (x > DX / 2)) return;

  fComplex twiddle;

  // Process y = [0 .. DY / 2 - 1] U [DY - (DY / 2) + 1 .. DY - 1]
  {
    const uint pos1 = offset + y * DX + x;
    const uint pos2 = offset + mod(y, DY) * DX + mod(x, DX);

    fComplex D1 = LOAD_FCOMPLEX_A(pos1);
    fComplex D2 = LOAD_FCOMPLEX_A(pos2);
    fComplex K1 = LOAD_FCOMPLEX_B(pos1);
    fComplex K2 = LOAD_FCOMPLEX_B(pos2);
    getTwiddle(twiddle, phaseBase * (float)x);

    spPostprocessC2C(D1, D2, twiddle);
    spPostprocessC2C(K1, K2, twiddle);
    mulAndScale(D1, K1, c);
    mulAndScale(D2, K2, c);
    spPreprocessC2C(D1, D2, twiddle);

    d_Dst[pos1] = D1;
    d_Dst[pos2] = D2;
  }

  // Process y = DY / 2, reusing the twiddle computed above for the same x
  if (y == 0) {
    const uint pos1 = offset + (DY / 2) * DX + x;
    const uint pos2 = offset + (DY / 2) * DX + mod(x, DX);

    fComplex D1 = LOAD_FCOMPLEX_A(pos1);
    fComplex D2 = LOAD_FCOMPLEX_A(pos2);
    fComplex K1 = LOAD_FCOMPLEX_B(pos1);
    fComplex K2 = LOAD_FCOMPLEX_B(pos2);

    spPostprocessC2C(D1, D2, twiddle);
    spPostprocessC2C(K1, K2, twiddle);
    mulAndScale(D1, K1, c);
    mulAndScale(D2, K2, c);
    spPreprocessC2C(D1, D2, twiddle);

    d_Dst[pos1] = D1;
    d_Dst[pos2] = D2;
  }
}
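
// ----------------------------------------------------------------------------
// Usage sketch (illustration only): a plausible launch for the fused kernel.
// The decomposition udivmod(i, DX, x); udivmod(i, DY / 2, y) implies
// batch * (DY / 2) * DX threads, and the implicit factor of 2 noted above
// spPreprocessC2C suggests c = 0.5f / (DY * DX) for a normalized convolution.
// Both values and the 256-thread block are assumptions.
// ----------------------------------------------------------------------------
static inline void example_sp_process_2d(fComplex *d_Dst, fComplex *d_SrcA,
                                         fComplex *d_SrcB, uint batch, uint DY,
                                         uint DX, int dir) {
  const uint threadCount = batch * (DY / 2) * DX;
  const float phaseBase = dir * 3.141592654f / (float)DX;  // dir * pi / DX
  SET_FCOMPLEX_BASE_A;  // bind texComplexA / texComplexB when USE_TEXTURE is on
  SET_FCOMPLEX_BASE_B;
  sp_process_2d_kernel<<<(threadCount + 255) / 256, 256>>>(
      d_Dst, d_SrcA, d_SrcB, DY, DX, threadCount, phaseBase,
      0.5f / (float)(DY * DX));
}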
#endif  // _CARMA_CONVOLUTIONFFT2D_CUH_