I am launching a kernel with linear blocks of 512 threads. Associated with each thread are six double precision values (two 3-element vectors) that I would like to store in shared memory, for a total of 512*6*8 = 24576 bytes. I would like to create pointers to the intermediate elements of the shared array to line all the vectors up as follows:
__global__ void my_kernel(double *global_data) {
    extern __shared__ double shr[];
    int id = threadIdx.x;
    double *X = &shr[id*3];
    double *Y = &shr[(id+1)*3];
    // Some arithmetic to set X[0:3] and Y[0:3]
    // Now I have a small for loop to compute something for each thread
    for (int i = 0; i < 3; i++) {
        for (int j = 0; j < 3; j++) {
            // Some computations involving the X and Y vectors
        }
    }
}
My problem is with accessing the values in X and Y using the looping indices. I am unable to explain the following behavior during the first loop iteration:
(cuda-gdb) cuda thread
thread (0,0,0)
(cuda-gdb) p shr[0]
$1 = 0.62293193093894383
(cuda-gdb) p &shr[0]
$2 = (@shared double *) 0x0
(cuda-gdb) p X[0]
$3 = 0.62293193093894383
(cuda-gdb) p &X[0]
$4 = (@generic double *) 0x1000000
(cuda-gdb) p X
$5 = (@generic double * @register) 0x1000000
I think this is normal. But then:
(cuda-gdb) p i == 0
$7 = true
(cuda-gdb) p X[i]
Error: Failed to read global memory at address 0x0 on device 0 sm 0 warp 0 lane 0 (error=7).
Why is it that when i == 0 I can access X[0] but not X[i]?
EDIT: Here is a complete working example demonstrating my issue:
import pycuda.gpuarray as gpuarray
import pycuda.driver as cuda
import pycuda.autoinit
import numpy as np
from pycuda.compiler import SourceModule
from math import pi
mydat = np.arange(12).astype(np.float64)
mydat_gpu = gpuarray.to_gpu(mydat)
mod = SourceModule("""
__global__ void my_kernel(double *mydat) {
    extern __shared__ double shr[];

    int id = threadIdx.x;

    double *X = &shr[(id * 6)];
    double *Y = &shr[(id * 6) + 3];

    X[0] = mydat[0];
    X[1] = mydat[1];
    X[2] = mydat[2];
    Y[0] = mydat[3];
    Y[1] = mydat[4];
    Y[2] = mydat[5];


    __syncthreads();

    double result;

    for (int i = 0; i < 3; i++) {
        result += X[i] + Y[i];
    }
}
""")
my_kernel = mod.get_function("my_kernel")
blk = (1,1,1)
grd = (1,1,1)
my_kernel(mydat_gpu, grid=grd, block=blk, shared=(8*6))
At this point I start up a debugging session:
cuda-gdb --args python -m pycuda.debug minimal_working_example.py
(cuda-gdb) b my_kernel
Function "my_kernel" not defined.
Make breakpoint pending on future shared library load? (y or [n]) y
Breakpoint 1 (my_kernel) pending.
(cuda-gdb) run
[Switching focus to CUDA kernel 0, grid 1, block (0,0,0), thread (0,0,0), device 0, sm 0, warp 0, lane 0]
Breakpoint 1, my_kernel(double * @generic)<<<(1,1,1),(1,1,1)>>> (mydat=0x13034a0000)
at kernel.cu:5
5 int id = threadIdx.x;
(cuda-gdb) n
7 double *X = &shr[(id * 6)];
(cuda-gdb) p id
$1 = 0
(cuda-gdb) p id * 6
$2 = 0
(cuda-gdb) n
8 double *Y = &shr[(id * 6) + 3];
(cuda-gdb) p (id * 6) + 3
$3 = 3
(cuda-gdb) n
10 X[0] = mydat[0];
(cuda-gdb) n
11 X[1] = mydat[1];
(cuda-gdb) n
12 X[2] = mydat[2];
(cuda-gdb) n
13 Y[0] = mydat[3];
(cuda-gdb) n
14 Y[1] = mydat[4];
(cuda-gdb) n
15 Y[2] = mydat[5];
(cuda-gdb) p X
$4 = (@generic double * @register) 0x1000000
(cuda-gdb) p X[0]
$5 = 0
(cuda-gdb) p X[1]
$6 = 1
(cuda-gdb) p Y[0]
$7 = 3
(cuda-gdb) p Y[1]
$8 = 4
(cuda-gdb) n
18 __syncthreads();
(cuda-gdb) n
22 for (int i = 0; i < 3; i++) {
(cuda-gdb) n
23 result += X[i] + Y[i];
(cuda-gdb) p i
$9 = 0
(cuda-gdb) p X[0]
$10 = 0
(cuda-gdb) p X[i]
Error: Failed to read global memory at address 0x0 on device 0 sm 0 warp 0 lane 0 (error=7).
All that is happening here is that you are stepping through source lines which have not actually been compiled into the running kernel. The variables you are trying to inspect have already gone out of scope, and the debugger can no longer show them to you.
This is due to aggressive optimisation in the device code compiler. In your example, the summation loop doesn't produce an output which effects a write to global or shared memory, so the compiler simply eliminates it. When stepping through the optimised code, the source debugger tries its best to maintain a 1:1 relationship between source and execution, but that isn't always possible, and this is the somewhat confusing result you are seeing.
You can confirm this for yourself by compiling your kernel code to PTX with nvcc and inspecting the result.
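For example, paste the kernel into a standalone kernel.cu (file name assumed) and run something like

nvcc -ptx kernel.cu -o kernel.ptx

which produces PTX along these lines: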
// .globl _Z9my_kernelPd
.visible .entry _Z9my_kernelPd(
.param .u64 _Z9my_kernelPd_param_0
)
{
.reg .b32 %r<3>;
.reg .f64 %fd<7>;
.reg .b64 %rd<6>;
ld.param.u64 %rd1, [_Z9my_kernelPd_param_0];
cvta.to.global.u64 %rd2, %rd1;
mov.u32 %r1, %tid.x;
mul.lo.s32 %r2, %r1, 6;
mul.wide.s32 %rd3, %r2, 8;
mov.u64 %rd4, shr;
add.s64 %rd5, %rd4, %rd3;
ld.global.nc.f64 %fd1, [%rd2];
ld.global.nc.f64 %fd2, [%rd2+8];
ld.global.nc.f64 %fd3, [%rd2+16];
ld.global.nc.f64 %fd4, [%rd2+24];
ld.global.nc.f64 %fd5, [%rd2+32];
ld.global.nc.f64 %fd6, [%rd2+40];
st.shared.f64 [%rd5], %fd1;
st.shared.f64 [%rd5+8], %fd2;
st.shared.f64 [%rd5+16], %fd3;
st.shared.f64 [%rd5+24], %fd4;
st.shared.f64 [%rd5+32], %fd5;
st.shared.f64 [%rd5+40], %fd6;
bar.sync 0;
ret;
}
You can see that the last PTX instruction before the ret is bar.sync, which is the instruction that the __syncthreads() call emits. The summation loop is not present.
If I modify your source like this:
__global__ void my_kernel2(double *mydat, double *out) {
extern __shared__ double shr[];
int id = threadIdx.x;
double *X = &shr[(id * 6)];
double *Y = &shr[(id * 6) + 3];
X[0] = mydat[0];
X[1] = mydat[1];
X[2] = mydat[2];
Y[0] = mydat[3];
Y[1] = mydat[4];
Y[2] = mydat[5];
__syncthreads();
double result;
for (int i = 0; i < 3; i++) {
result += X[i] + Y[i];
}
*out = result;
}
so that result is now stored to global memory and compile it to PTX:
.visible .entry _Z10my_kernel2PdS_(
.param .u64 _Z10my_kernel2PdS__param_0,
.param .u64 _Z10my_kernel2PdS__param_1
)
{
.reg .b32 %r<3>;
.reg .f64 %fd<20>;
.reg .b64 %rd<8>;
ld.param.u64 %rd3, [_Z10my_kernel2PdS__param_0];
ld.param.u64 %rd2, [_Z10my_kernel2PdS__param_1];
cvta.to.global.u64 %rd4, %rd3;
mov.u32 %r1, %tid.x;
mul.lo.s32 %r2, %r1, 6;
mul.wide.s32 %rd5, %r2, 8;
mov.u64 %rd6, shr;
add.s64 %rd1, %rd6, %rd5;
ld.global.f64 %fd1, [%rd4];
ld.global.f64 %fd2, [%rd4+8];
ld.global.f64 %fd3, [%rd4+16];
ld.global.f64 %fd4, [%rd4+24];
ld.global.f64 %fd5, [%rd4+32];
ld.global.f64 %fd6, [%rd4+40];
st.shared.f64 [%rd1], %fd1;
st.shared.f64 [%rd1+8], %fd2;
st.shared.f64 [%rd1+16], %fd3;
st.shared.f64 [%rd1+24], %fd4;
st.shared.f64 [%rd1+32], %fd5;
st.shared.f64 [%rd1+40], %fd6;
bar.sync 0;
ld.shared.f64 %fd7, [%rd1];
ld.shared.f64 %fd8, [%rd1+24];
add.f64 %fd9, %fd7, %fd8;
add.f64 %fd10, %fd9, %fd11;
ld.shared.f64 %fd12, [%rd1+8];
ld.shared.f64 %fd13, [%rd1+32];
add.f64 %fd14, %fd12, %fd13;
add.f64 %fd15, %fd10, %fd14;
ld.shared.f64 %fd16, [%rd1+16];
ld.shared.f64 %fd17, [%rd1+40];
add.f64 %fd18, %fd16, %fd17;
add.f64 %fd19, %fd15, %fd18;
cvta.to.global.u64 %rd7, %rd2;
st.global.f64 [%rd7], %fd19;
ret;
}
You can see that the (now unrolled) loop is present in the PTX, and the debugger behaviour should be much closer to what you expect if you try it.
As suggested in comments, you shouldn't ever spend time trying to analyse any code which doesn't change block or global state, because of the complications caused by compiler optimisation.
Related
I was looking to move away from using a counter buffer for some compute shader routines, and hit some unexpected behaviour on NVIDIA cards.
I made a really simplified example (the code itself is pointless, but it is the smallest that reproduces the issue I encounter).
I want to perform conditional writes to several locations in a buffer (also for simplification, I only run a single thread, since the behaviour can be reproduced that way too).
I write 4 uints, then 2 uint3s (using InterlockedAdd to "simulate" the conditional writes).
So I use a single buffer (with raw access on the UAV), with the following simple layout:
0 -> first counter
4 -> second counter
8 to 24 -> first 4 uints to write
24 to 48 -> pair of uint3s to write
I also clear the buffer every frame (0 for each counter, and an arbitrary value, 12345 in this case, for the rest).
I copy the buffer to a staging resource in order to check the values, so yes, my pipeline binding is correct; I can post that code if asked.
Now when I call the compute shader, performing only 4 increments, as here:
RWByteAddressBuffer RWByteBuffer : BACKBUFFER;

#define COUNTER0_LOCATION 0
#define COUNTER1_LOCATION 4
#define PASS1_LOCATION 8
#define PASS2_LOCATION 24

[numthreads(1,1,1)]
void CS(uint3 tid : SV_DispatchThreadID)
{
    uint i0,i1,i2,i3;
    RWByteBuffer.InterlockedAdd(COUNTER0_LOCATION, 1, i0);
    RWByteBuffer.Store(PASS1_LOCATION + i0 * 4, 10);
    RWByteBuffer.InterlockedAdd(COUNTER0_LOCATION, 1, i1);
    RWByteBuffer.Store(PASS1_LOCATION + i1 * 4, 20);
    RWByteBuffer.InterlockedAdd(COUNTER0_LOCATION, 1, i2);
    RWByteBuffer.Store(PASS1_LOCATION + i2 * 4, 30);
    RWByteBuffer.InterlockedAdd(COUNTER0_LOCATION, 1, i3);
    RWByteBuffer.Store(PASS1_LOCATION + i3 * 4, 40);
}
I then obtain the following results (formatted a little):
4,0,
10,20,30,40,
12345,12345,12345,12345,12345,12345,12345,12345,12345
This is correct: the first counter is 4 since I called InterlockedAdd four times (the second counter was never touched), I get 10 through 40 in the right locations, and the rest keeps the default values.
Now if I want to reuse those indices in order to write them to another location:
[numthreads(1,1,1)]
void CS(uint3 tid : SV_DispatchThreadID)
{
    uint i0,i1,i2,i3;
    RWByteBuffer.InterlockedAdd(COUNTER0_LOCATION, 1, i0);
    RWByteBuffer.Store(PASS1_LOCATION + i0 * 4, 10);
    RWByteBuffer.InterlockedAdd(COUNTER0_LOCATION, 1, i1);
    RWByteBuffer.Store(PASS1_LOCATION + i1 * 4, 20);
    RWByteBuffer.InterlockedAdd(COUNTER0_LOCATION, 1, i2);
    RWByteBuffer.Store(PASS1_LOCATION + i2 * 4, 30);
    RWByteBuffer.InterlockedAdd(COUNTER0_LOCATION, 1, i3);
    RWByteBuffer.Store(PASS1_LOCATION + i3 * 4, 40);

    uint3 inds = uint3(i0, i1, i2);
    uint3 inds2 = uint3(i1, i2, i3);

    uint writeIndex;
    RWByteBuffer.InterlockedAdd(COUNTER1_LOCATION, 1, writeIndex);
    RWByteBuffer.Store3(PASS2_LOCATION + writeIndex * 12, inds);
    RWByteBuffer.InterlockedAdd(COUNTER1_LOCATION, 1, writeIndex);
    RWByteBuffer.Store3(PASS2_LOCATION + writeIndex * 12, inds2);
}
Now if I run that code on an Intel card (tried HD4000 and HD4600) or an ATI card (290), I get the expected results, e.g.:
4,2,
10,20,30,40,
0,1,2,1,2,3
But running it on NVIDIA (tried a 970M, GTX 1080 and GTX 570), I get the following:
4,2,
40,12345,12345,12345,
0,0,0,0,0,0
So it seems InterlockedAdd suddenly returns 0 in its output value (it still increments properly, since the counter reaches 4, but every store then targets the same location, so we end up with only the last value, 40).
We can also see that only 0 got written for i1, i2 and i3.
If instead I "reserve" the memory, i.e. call InterlockedAdd only once per counter (incrementing by 4 and 2, respectively):
[numthreads(1,1,1)]
void CSB(uint3 tid : SV_DispatchThreadID)
{
    uint i0;
    RWByteBuffer.InterlockedAdd(COUNTER0_LOCATION, 4, i0);
    uint i1 = i0 + 1;
    uint i2 = i0 + 2;
    uint i3 = i0 + 3;

    RWByteBuffer.Store(PASS1_LOCATION + i0 * 4, 10);
    RWByteBuffer.Store(PASS1_LOCATION + i1 * 4, 20);
    RWByteBuffer.Store(PASS1_LOCATION + i2 * 4, 30);
    RWByteBuffer.Store(PASS1_LOCATION + i3 * 4, 40);

    uint3 inds = uint3(i0, i1, i2);
    uint3 inds2 = uint3(i1, i2, i3);

    uint writeIndex;
    RWByteBuffer.InterlockedAdd(COUNTER1_LOCATION, 2, writeIndex);
    uint writeIndex2 = writeIndex + 1;

    RWByteBuffer.Store3(PASS2_LOCATION + writeIndex * 12, inds);
    RWByteBuffer.Store3(PASS2_LOCATION + writeIndex2 * 12, inds2);
}
Then this works on all cards, but I have some cases where I have to rely on the earlier behaviour.
As a side note, if I use structured buffers with the counter flag on the UAV, instead of a location in a byte address buffer, and do:
RWStructuredBuffer<uint> rwCounterBuffer1;
RWStructuredBuffer<uint> rwCounterBuffer2;
RWByteAddressBuffer RWByteBuffer : BACKBUFFER;
#define PASS1_LOCATION 8
#define PASS2_LOCATION 24
[numthreads(1,1,1)]
void CS(uint3 tid : SV_DispatchThreadID)
{
    uint i0 = rwCounterBuffer1.IncrementCounter();
    uint i1 = rwCounterBuffer1.IncrementCounter();
    uint i2 = rwCounterBuffer1.IncrementCounter();
    uint i3 = rwCounterBuffer1.IncrementCounter();
    RWByteBuffer.Store(PASS1_LOCATION + i0 * 4, 10);
    RWByteBuffer.Store(PASS1_LOCATION + i1 * 4, 20);
    RWByteBuffer.Store(PASS1_LOCATION + i2 * 4, 30);
    RWByteBuffer.Store(PASS1_LOCATION + i3 * 4, 40);

    uint3 inds = uint3(i0, i1, i2);
    uint3 inds2 = uint3(i1, i2, i3);

    uint writeIndex1 = rwCounterBuffer2.IncrementCounter();
    uint writeIndex2 = rwCounterBuffer2.IncrementCounter();
    RWByteBuffer.Store3(PASS2_LOCATION + writeIndex1 * 12, inds);
    RWByteBuffer.Store3(PASS2_LOCATION + writeIndex2 * 12, inds2);
}
This works correctly across all cards, but has all sorts of other issues (which are out of scope for this question).
This is running on DirectX 11 (I did not try DX12; that's not relevant to my use case, beyond plain curiosity).
So is it a bug on NVidia?
Or is there something wrong with the first approach?
I have a C function that I want to convert to Lua, but I'm getting strange results out of Lua:
unsigned short crc16(const char* pstrCurrent, int iCount)
{
    unsigned short wCRC = 0;
    int iIndex = 0;

    while(--iCount >= 0)
    {
        wCRC = wCRC ^ ((int)(*pstrCurrent++) << 8);
        printf ("WCRC = %u\n", wCRC);
    }
    return (wCRC & 0xFFFF);
}
and here is how I started the Lua version:
local function crc16(keyCurrent, byteCount)
    wCRC = 0
    byteIndex = 1
    local crcInput = {}
    while byteCount > 0 do
        print ("BYTE COUNT= " .. byteCount)
        wCRC = bit32.bxor(wCRC, bit32.lshift(keyCurrent[byteIndex], 8))
        print ( "WCRC = " .. wCRC )
        byteCount = byteCount - 1
        byteIndex = byteIndex + 1
    end
end
Yes, I know the C function is incomplete; I just want to compare the part that's causing issues.
The WCRC prints in the C and Lua versions show completely different numbers for the same input.
Is my Lua conversion incorrect? It is my second or third time using Lua, so I'm not quite sure what I'm doing wrong.
***************** UPDATE ********************
So here is the full C and Lua, and a quick little test:
unsigned short crc16(const char* pstrCurrent, int iCount)
{
    unsigned short wCRC = 0;
    int iIndex = 0;

    // Perform the following for each character in the buffer
    while(--iCount >= 0)
    {
        // Get the byte information for the calculation and
        // advance the pointer
        wCRC = wCRC ^ ((int)(*pstrCurrent++) << 8);

        for(iIndex = 0; iIndex < 8; ++iIndex)
        {
            if(wCRC & 0x8000)
            {
                wCRC = (wCRC << 1) ^ 0x1021;
            }
            else
            {
                wCRC = wCRC << 1;
            }
        }
    }
    return (wCRC & 0xFFFF);
}
and the Lua conversion:
function crc16 (keyCurrent, iCount)
    wCRC = 0
    byteIndex = 1
    iIndex = 0
    local crcInput = {}
    while iCount >= 1 do
        wCRC = bit32.bxor (wCRC, bit32.lshift(keyCurrent[byteIndex], 8))
        for iIndex=0,8 do
            if (bit32.band (wCRC, 0x8000) ~= nil ) then
                wCRC = bit32.bxor (bit32.lshift (wCRC, 1), 0x1021)
            else
                wCRC = bit32.lshift (wCRC, 1)
            end
        end
        iCount = iCount - 1
        byteIndex = byteIndex + 1
    end
    return (bit32.band (wCRC, 0xFFFF))
end
local dKey = {}
dKey = {8, 210, 59, 0, 18, 166, 254, 117}
print ( "CRC = " .. crc16 (dKey ,8) )
In C, for the same array I get: CRC16 = 567
In Lua, I get: CRC = 61471
Can someone tell me what I'm doing wrong?
Thanks
It seems they yield the same results:
pure-C
WCRC = 18432
WCRC = 11520
WCRC = 16640
WCRC = 11520
pure-Lua
BYTE COUNT= 4
WCRC = 18432
BYTE COUNT= 3
WCRC = 11520
BYTE COUNT= 2
WCRC = 16640
BYTE COUNT= 1
WCRC = 11520
There are mistakes in the altered Lua sample:
1. bit32.band() returns a number, and the number 0 is not equal to nil; those are totally different types. Comparing the result with nil means the condition is always true, regardless of what band returns; compare the result with 0 instead.
2. for iIndex=0,8 do iterates 9 times, including the final index 8; the C loop runs only 8 times.
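With those two fixes (compare the band result against 0, and loop for iIndex=0,7), the Lua version should match the C. As an extra cross-check, here is a sketch of a direct Python port of the C routine (my own code, assuming the key bytes are in the 0-255 range):

def crc16(data):
    # port of the C routine: polynomial 0x1021, zero initial value
    wcrc = 0
    for byte in data:
        wcrc = (wcrc ^ (byte << 8)) & 0xFFFF
        for _ in range(8):                    # 8 rounds per byte, not 9
            if wcrc & 0x8000:                 # test the top bit against 0, not nil
                wcrc = ((wcrc << 1) ^ 0x1021) & 0xFFFF
            else:
                wcrc = (wcrc << 1) & 0xFFFF
    return wcrc

print(crc16([8, 210, 59, 0, 18, 166, 254, 117]))   # 567, matching the C output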
I'm writing a back propagation algorithm in MATLAB, but I cannot get a working solution. I read the Haykin book and some topics on the Internet about how other people do it. I understand the algorithm in theory from end to end, but I get a lot of errors in practice: I get NaN values in my code.
You can see here.
I'm trying to classify some points on a plane. These are three ellipses, placed one inside the other.
I wrote this function. The second layer learns, but the first layer doesn't learn.
function [E, W_1, W_2, B_1, B_2, X_3] = update(W_1, W_2, B_1, B_2, X_1, T, alpha)
    V_1 = W_1 * X_1 + B_1;
    X_2 = tansig(V_1);
    V_2 = W_2 * X_2 + B_2;
    X_3 = tansig(V_2);
    E = 1 / 2 * sum((T - X_3) .^ 2);
    dE = (T - X_3);

    for j = 1 : size(X_2, 1)
        delta_2_sum = 0;
        for i = 1 : size(X_3, 1)
            delta_2 = dE(i, 1) * dtansig(1, V_2(i, 1) );
            W_2_tmp(i, j) = W_2(i, j) - alpha * delta_2 * X_2(j, 1);
            B_2_tmp(i, 1) = B_2(i, 1) - alpha * delta_2;
        end;
    end;

    for k = 1 : size(X_1, 1)
        for j = 1 : size(X_2, 1)
            delta_2_sum = 0;
            for i = 1 : size(X_3, 1)
                delta_2 = dE(i, 1) * dtansig(1, V_2(i, 1) );
                delta_2_sum = delta_2_sum + W_2(i, j) * delta_2;
            end;
            delta_1 = delta_2_sum * dtansig(1, V_1(j, 1) );
            W_1_tmp(j, k) = W_1(j, k) - alpha * delta_1 * X_1(k, 1);
            B_1_tmp(j, 1) = B_1(j, 1) - alpha * delta_1;
        end;
    end;

    if (min(W_1) < -10000 )
        X = 1;
    end;

    B_1 = B_1_tmp;
    B_2 = B_2_tmp;
    W_1 = W_1_tmp
    W_2 = W_2_tmp;
end
I wrote another variant of the code, and this one doesn't work either. I ran it with a 1-dimensional vector as input and as output, and I don't get a correct result.
What can I do?
I use the MATLAB nntool interface, but my backprop was written by hand.
How can I test my code?
function [net] = backProp(net, epoch, alpha)
    for u = 1 : epoch % number of epochs
        for p = 1 : size(net.userdata{1, 1}, 2)
            % train on every element of the sample
            [~, ~, ~, De, Df, f] = frontProp(net, p, 1);
            for l = size(net.LW, 1) : -1 : 1 % walk over the layers backwards
                if (size(net.LW, 1) == l )
                    delta{l} = De .* Df{l};
                else
                    % size(delta{l + 1})
                    % size(net.LW{l + 1})
                    delta{l} = Df{l} .* (delta{l + 1}' * net.LW{l + 1} )';
                end;
                if (l == 1)
                    net.IW{l} + alpha * delta{l} * f{l}'
                    net.IW{l} = net.IW{l} + alpha * delta{l} * f{l}';
                else
                    net.LW{l} + alpha * delta{l} * f{l}'
                    net.LW{l} = net.LW{l} + alpha * delta{l} * f{l}';
                end;
            end;
        end;
    end;
end
Now that CodeSprint 3 is over, I've been wondering how to solve this problem. We need to calculate nCr mod 142857 for large values of n and r (0 <= n <= 10^9; 0 <= r <= n). I used a recursive method which goes through min(r, n-r) iterations to calculate the combination. It turns out this wasn't efficient enough. I've tried a few different methods, but none of them seem to be efficient enough. Any suggestions?
For a non-prime modulus, factor it (142857 = 3^3 * 11 * 13 * 37), compute C(n,k) mod p^q for each prime-power factor of the modulus using the generalised Lucas theorem, and combine the results using the Chinese remainder theorem.
For example, C(234, 44) mod 142857 = 6084, and:
C(234, 44) mod 3^3 = 9
C(234, 44) mod 11 = 1
C(234, 44) mod 13 = 0
C(234, 44) mod 37 = 16
The Chinese Remainder theorem involves finding x such that
x = 9 mod 3^3
x = 1 mod 11
x = 0 mod 13
x = 16 mod 37
The result is x = 6084.
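The combining step is mechanical; here is a minimal Python sketch (the helper name crt is mine; it assumes pairwise-coprime moduli and Python 3.8+ for the modular inverse via pow):

def crt(residues, moduli):
    # solve x = r (mod m) for every (r, m) pair; result is unique mod prod(moduli)
    M = 1
    for m in moduli:
        M *= m
    x = 0
    for r, m in zip(residues, moduli):
        Mi = M // m                    # product of all the other moduli
        x += r * Mi * pow(Mi, -1, m)   # pow(Mi, -1, m) is the inverse of Mi mod m
    return x % M

print(crt([9, 1, 0, 16], [3**3, 11, 13, 37]))   # 6084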
Example
C(234, 44) mod 3^3
First convert n, k, and n-k to base p
n = 234_10 = 22200_3
k = 44_10 = 1122_3
r = n-k = 190_10 = 21001_3
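A quick sketch of that conversion (to_base is my own helper):

def to_base(n, p):
    # digits of n in base p, most significant first
    digits = ""
    while n > 0:
        digits = str(n % p) + digits
        n //= p
    return digits or "0"

print(to_base(234, 3), to_base(44, 3), to_base(190, 3))   # 22200 1122 21001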
Next, find the number of carries when adding k and r = n-k in base p:
e[i] = number of carries from position i to the end

e      4 3 2 1 0
carry      1 1
r      2 1 0 0 1
k        1 1 2 2
n      2 2 2 0 0

Two carries occur, landing in positions 1 and 2, so e[0] = 2.
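The carry positions can also be computed directly; this sketch (carry_positions is my own helper) adds k and n-k digit by digit in base p:

def carry_positions(k, r, p):
    # positions that receive a carry when adding k and r in base p
    landed, c, pos = [], 0, 0
    while k > 0 or r > 0 or c > 0:
        c = 1 if (k % p) + (r % p) + c >= p else 0
        if c:
            landed.append(pos + 1)   # the carry lands in the next position
        k //= p
        r //= p
        pos += 1
    return landed

print(carry_positions(44, 190, 3))   # [1, 2]: two carries, so e[0] = 2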
Now create the factorial function needed for general Lucas
def f(n, p):
    r = 1
    for i in range(1, n+1):
        if i % p != 0:
            r *= i
    return r
Since q = 3, you will consider only three digits of the base p representation at a time
So
f(222_3, 3)/[f(210_3, 3) * f(011_3, 3)] *
f(220_3, 3)/[f(100_3, 3) * f(112_3, 3)] *
f(200_3, 3)/[f(001_3, 3) * f(122_3, 3)] = 6719344775 / 7
Now
s = 1 if p = 2 and q >= 3 else -1
Then
p^e[0] * s * 6719344775 / 7 mod 3^3
e[0] = 2
p^e[0] = 3^2 = 9
s = -1
p^e[0] * s * 6719344775 = -60474102975
Now you have
-60474102975 / 7 mod 3^3
This is a linear congruence and can be solved with
ModularInverse(7, 3^3) = 4
4 * -60474102975 mod 27 = 9
Hence C(234, 44) mod 3^3 = 9
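The last step is easy to check in Python (again assuming 3.8+ for the built-in modular inverse):

num = 9 * -1 * 6719344775    # p^e[0] * s * (f-product) = -60474102975
inv7 = pow(7, -1, 27)        # ModularInverse(7, 3^3) = 4
print((num * inv7) % 27)     # 9, i.e. C(234, 44) mod 3^3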
I can't manage to access the data in my constant memory and I don't know why. Here is a snippet of my code:
#define N 10

__constant__ int constBuf_d[N];

__global__ void foo( int *results, int *constBuf )
{
    int tdx = threadIdx.x;
    int idx = blockIdx.x * blockDim.x + tdx;

    if( idx < N )
    {
        results[idx] = constBuf[idx];
    }
}
// main routine that executes on the host
int main(int argc, char* argv[])
{
    int *results_h = new int[N];
    int *results_d = NULL;
    cudaMalloc((void **)&results_d, N*sizeof(int));

    int arr[10] = { 16, 2, 77, 40, 12, 3, 5, 3, 6, 6 };

    int *cpnt;
    cudaError_t err = cudaGetSymbolAddress((void **)&cpnt, "constBuf_d");
    if( err )
        cout << "error!";

    cudaMemcpyToSymbol((void**)&cpnt, arr, N*sizeof(int), 0, cudaMemcpyHostToDevice);

    foo <<< 1, 256 >>> ( results_d, cpnt );

    cudaMemcpy(results_h, results_d, N*sizeof(int), cudaMemcpyDeviceToHost);

    for( int i=0; i < N; ++i )
        printf("%i ", results_h[i] );
}
For some reason, I only get "0" in results_h. I'm running CUDA 4.0 with a card with capability 1.1.
Any ideas? Thanks!
If you add proper error checking to your code, you will find that the cudaMemcpyToSymbol call is failing with an invalid device symbol error. You either need to pass the symbol by name, or use cudaMemcpy instead. So this:
cudaGetSymbolAddress((void **)&cpnt, "constBuf_d");
cudaMemcpy(cpnt, arr, N*sizeof(int), cudaMemcpyHostToDevice);
or
cudaMemcpyToSymbol("constBuf_d", arr, N*sizeof(int), 0, cudaMemcpyHostToDevice);
or
cudaMemcpyToSymbol(constBuf_d, arr, N*sizeof(int), 0, cudaMemcpyHostToDevice);
will work. Having said that, passing a constant memory address as an argument to a kernel is the wrong way to use constant memory - it stops the compiler from generating instructions which access memory via the constant memory cache. Compare the compute capability 1.2 PTX generated for your kernel:
.entry _Z3fooPiS_ (
.param .u32 __cudaparm__Z3fooPiS__results,
.param .u32 __cudaparm__Z3fooPiS__constBuf)
{
.reg .u16 %rh<4>;
.reg .u32 %r<12>;
.reg .pred %p<3>;
.loc 16 7 0
$LDWbegin__Z3fooPiS_:
mov.u16 %rh1, %ctaid.x;
mov.u16 %rh2, %ntid.x;
mul.wide.u16 %r1, %rh1, %rh2;
cvt.s32.u16 %r2, %tid.x;
add.u32 %r3, %r2, %r1;
mov.u32 %r4, 9;
setp.gt.s32 %p1, %r3, %r4;
@%p1 bra $Lt_0_1026;
.loc 16 14 0
mul.lo.u32 %r5, %r3, 4;
ld.param.u32 %r6, [__cudaparm__Z3fooPiS__constBuf];
add.u32 %r7, %r6, %r5;
ld.global.s32 %r8, [%r7+0];
ld.param.u32 %r9, [__cudaparm__Z3fooPiS__results];
add.u32 %r10, %r9, %r5;
st.global.s32 [%r10+0], %r8;
$Lt_0_1026:
.loc 16 16 0
exit;
$LDWend__Z3fooPiS_:
} // _Z3fooPiS_
with this kernel:
__global__ void foo2( int *results )
{
    int tdx = threadIdx.x;
    int idx = blockIdx.x * blockDim.x + tdx;

    if( idx < N )
    {
        results[idx] = constBuf_d[idx];
    }
}
which produces
.entry _Z4foo2Pi (
.param .u32 __cudaparm__Z4foo2Pi_results)
{
.reg .u16 %rh<4>;
.reg .u32 %r<12>;
.reg .pred %p<3>;
.loc 16 18 0
$LDWbegin__Z4foo2Pi:
mov.u16 %rh1, %ctaid.x;
mov.u16 %rh2, %ntid.x;
mul.wide.u16 %r1, %rh1, %rh2;
cvt.s32.u16 %r2, %tid.x;
add.u32 %r3, %r2, %r1;
mov.u32 %r4, 9;
setp.gt.s32 %p1, %r3, %r4;
@%p1 bra $Lt_1_1026;
.loc 16 25 0
mul.lo.u32 %r5, %r3, 4;
mov.u32 %r6, constBuf_d;
add.u32 %r7, %r5, %r6;
ld.const.s32 %r8, [%r7+0];
ld.param.u32 %r9, [__cudaparm__Z4foo2Pi_results];
add.u32 %r10, %r9, %r5;
st.global.s32 [%r10+0], %r8;
$Lt_1_1026:
.loc 16 27 0
exit;
$LDWend__Z4foo2Pi:
} // _Z4foo2Pi
Note that in the second case, constBuf_d is accessed via ld.const.s32, rather than ld.global.s32, so that the constant memory cache is used.
Excellent answer @talonmies. But I would like to mention that there have been changes in CUDA 5: in the function cudaMemcpyToSymbol(), the char * argument is no longer supported.
The CUDA 5 release notes read:
** The use of a character string to indicate a device symbol, which was possible with certain API functions, is no longer supported. Instead, the symbol should be used directly.
Instead, the copy has to be made to constant memory as follows:
cudaMemcpyToSymbol( dev_x, x, N * sizeof(float) );
In this case "dev_x" is pointer to constant memory and "x" is pointer to host memory which needs to be copied into dev_x.