I can't manage to access the data in my constant memory and I don't know why. Here is a snippet of my code:
#define N 10
__constant__ int constBuf_d[N];
__global__ void foo( int *results, int *constBuf )
{
int tdx = threadIdx.x;
int idx = blockIdx.x * blockDim.x + tdx;
if( idx < N )
{
results[idx] = constBuf[idx];
}
}
// main routine that executes on the host
int main(int argc, char* argv[])
{
int *results_h = new int[N];
int *results_d = NULL;
cudaMalloc((void **)&results_d, N*sizeof(int));
int arr[10] = { 16, 2, 77, 40, 12, 3, 5, 3, 6, 6 };
int *cpnt;
cudaError_t err = cudaGetSymbolAddress((void **)&cpnt, "constBuf_d");
if( err )
cout << "error!";
cudaMemcpyToSymbol((void**)&cpnt, arr, N*sizeof(int), 0, cudaMemcpyHostToDevice);
foo <<< 1, 256 >>> ( results_d, cpnt );
cudaMemcpy(results_h, results_d, N*sizeof(int), cudaMemcpyDeviceToHost);
for( int i=0; i < N; ++i )
printf("%i ", results_h[i] );
}
For some reason, I only get "0" in results_h. I'm running CUDA 4.0 on a card with compute capability 1.1.
Any ideas? Thanks!
If you add proper error checking to your code, you will find that the cudaMemcpyToSymbol call is failing with an invalid device symbol error. You either need to pass the symbol by name, or use cudaMemcpy instead. So this:
cudaGetSymbolAddress((void **)&cpnt, "constBuf_d");
cudaMemcpy(cpnt, arr, N*sizeof(int), cudaMemcpyHostToDevice);
or
cudaMemcpyToSymbol("constBuf_d", arr, N*sizeof(int), 0, cudaMemcpyHostToDevice);
or
cudaMemcpyToSymbol(constBuf_d, arr, N*sizeof(int), 0, cudaMemcpyHostToDevice);
will work. Having said that, passing a constant memory address as an argument to a kernel is the wrong way to use constant memory - it prevents the compiler from generating instructions that access memory via the constant memory cache. Compare the compute capability 1.2 PTX generated for your kernel:
.entry _Z3fooPiS_ (
.param .u32 __cudaparm__Z3fooPiS__results,
.param .u32 __cudaparm__Z3fooPiS__constBuf)
{
.reg .u16 %rh<4>;
.reg .u32 %r<12>;
.reg .pred %p<3>;
.loc 16 7 0
$LDWbegin__Z3fooPiS_:
mov.u16 %rh1, %ctaid.x;
mov.u16 %rh2, %ntid.x;
mul.wide.u16 %r1, %rh1, %rh2;
cvt.s32.u16 %r2, %tid.x;
add.u32 %r3, %r2, %r1;
mov.u32 %r4, 9;
setp.gt.s32 %p1, %r3, %r4;
#%p1 bra $Lt_0_1026;
.loc 16 14 0
mul.lo.u32 %r5, %r3, 4;
ld.param.u32 %r6, [__cudaparm__Z3fooPiS__constBuf];
add.u32 %r7, %r6, %r5;
ld.global.s32 %r8, [%r7+0];
ld.param.u32 %r9, [__cudaparm__Z3fooPiS__results];
add.u32 %r10, %r9, %r5;
st.global.s32 [%r10+0], %r8;
$Lt_0_1026:
.loc 16 16 0
exit;
$LDWend__Z3fooPiS_:
} // _Z3fooPiS_
with this kernel:
__global__ void foo2( int *results )
{
int tdx = threadIdx.x;
int idx = blockIdx.x * blockDim.x + tdx;
if( idx < N )
{
results[idx] = constBuf_d[idx];
}
}
which produces
.entry _Z4foo2Pi (
.param .u32 __cudaparm__Z4foo2Pi_results)
{
.reg .u16 %rh<4>;
.reg .u32 %r<12>;
.reg .pred %p<3>;
.loc 16 18 0
$LDWbegin__Z4foo2Pi:
mov.u16 %rh1, %ctaid.x;
mov.u16 %rh2, %ntid.x;
mul.wide.u16 %r1, %rh1, %rh2;
cvt.s32.u16 %r2, %tid.x;
add.u32 %r3, %r2, %r1;
mov.u32 %r4, 9;
setp.gt.s32 %p1, %r3, %r4;
#%p1 bra $Lt_1_1026;
.loc 16 25 0
mul.lo.u32 %r5, %r3, 4;
mov.u32 %r6, constBuf_d;
add.u32 %r7, %r5, %r6;
ld.const.s32 %r8, [%r7+0];
ld.param.u32 %r9, [__cudaparm__Z4foo2Pi_results];
add.u32 %r10, %r9, %r5;
st.global.s32 [%r10+0], %r8;
$Lt_1_1026:
.loc 16 27 0
exit;
$LDWend__Z4foo2Pi:
} // _Z4foo2Pi
Note that in the second case, constBuf_d is accessed via ld.const.s32 rather than ld.global.s32, so that the constant memory cache is used.
Excellent answer @talonmies. But I would like to mention that there have been changes in CUDA 5: in the function cudaMemcpyToSymbol(), a char * (string) argument is no longer supported.
The CUDA 5 release notes read:
** The use of a character string to indicate a device symbol, which was possible with certain API functions, is no longer supported. Instead, the symbol should be used directly.
Instead, the copy to constant memory has to be made as follows:
cudaMemcpyToSymbol( dev_x, x, N * sizeof(float) );
In this case "dev_x" is pointer to constant memory and "x" is pointer to host memory which needs to be copied into dev_x.
I'm trying to port a 32-bit checksum macro written in MASM32 to the Dart language.
Here is what I understood: the checksum function takes a String as input and returns the checksum as a 4-byte integer.
But I don't get the same result.
Does anyone see my error, please?
; ecx : length of String variable
; esi : pointer to String variable
; eax : 'return' value of calculated checksum
CHECKSUM32_MACRO MACRO
LOCAL Checksum32Loop, Checksum32Done
xor eax,eax
cmp ecx,4
jb Checksum32Done
align 16
Checksum32Loop:
mov ebx,dword ptr [esi]
add eax,ebx
shl ebx,1
adc ebx,1
xor eax,ebx
add esi,4
sub ecx,4
jz Checksum32Done
cmp ecx,4
jae Checksum32Loop
mov edx,4
sub edx,ecx
sub esi,edx
mov ecx,4
jmp Checksum32Loop
Checksum32Done:
ENDM
int checksum(String src){
int i = src.length-1;
int res = 0;
do{
int c = src.codeUnitAt(i);
res += c;
String cBits = c.toRadixString(2);
int bitFort = int.parse(cBits[0]);
int transform = c << 1;
transform = transform + 1 + bitFort;
res = res ^ transform;
i--;
}while(i>=0);
return res;
}
I modified the code according to the advice, assuming for now an ASCII string whose length is always a multiple of 4, just until I understand the problem.
It still doesn't work though.
String deComp = File(CHEMIN_FICHIER_DECOMP).readAsStringSync();
List<int> encoded = [];
for (int i =0; i<deComp.length; i++){
List<int> cUtf8 = utf8.encode(deComp[i]);
encoded.addAll(cUtf8);
}
print(checksum_stack(encoded));
_______
int checksum_stack(List<int> src){
int i = 0;
int res = 0;
do{
int c = fusion(src.sublist(i, i+4));
res += c;
String cBits = c.toRadixString(2).padLeft(8, '0');
int bitFort = int.parse(cBits[0]);
int transform = c << 1;
transform = transform + 1 +bitFort;
res = res ^ transform;
i+=4;
}while(i < src.length-4);
return res;
}
int fusion(List<int> str){
if (str.length != 4) {
throw "need multiple of 4!";
}
String hexStr = "";
str.forEach((c) {
hexStr += c.toRadixString(16).padLeft(2, '0');
});
return int.parse(hexStr,radix: 16);
}
The transcription of the checksum algorithm is wrong.
Here's how I'd do it:
import 'dart:convert';
import 'dart:math';
import 'dart:typed_data';
int checksum(String string, {Encoding encoder = utf8, Endian endian = Endian.little})
{
final ByteData bytes = ByteData.sublistView(Uint8List.fromList(encoder.encode(string)));
int checksum = 0;
if (bytes.lengthInBytes >= 4)
{
for (int i = 0; i < bytes.lengthInBytes; i += 4)
{
int chunk = bytes.getUint32(min(i + 4, bytes.lengthInBytes) - 4, endian);
checksum = (checksum + chunk) ^ ((chunk << 1) + 1 + (chunk >> 31));
}
}
return checksum & 0xffffffff;
}
You totally missed that:
The code is working with DWORDs (32-bit integers).
Strings with less than 4 bytes have zero checksum.
The code handles strings with length non-multiple of four by reading the last four bytes (that, necessarily, overlap with the previous DWORD).
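For example, for a 10-byte string the loop consumes bytes 0-3 and 4-7; two bytes then remain, so the tail handling computes edx = 4 - 2 = 2, steps esi back by 2, and processes bytes 6-9 as the final DWORD, overlapping the previous chunk.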
Here's the commented assembly:
CHECKSUM32_MACRO MACRO
LOCAL Checksum32Loop, Checksum32Done
xor eax,eax ;Checksum = 0
cmp ecx,4
jb Checksum32Done ;If len < 4 Then Return
align 16
Checksum32Loop:
mov ebx,dword ptr [esi] ;c = DWORD from string (**FOUR** bytes)
add eax,ebx ;Checksum += c
shl ebx,1 ;CF = c[31], c = c << 1
adc ebx,1 ;c += (1 + CF)
xor eax,ebx ;Checksum ^= c
add esi,4 ;Point to next DWORD
sub ecx,4 ;Len -= 4
jz Checksum32Done ;If Len == 0 Then Return
cmp ecx,4
jae Checksum32Loop ;If Len >= 4 Then Cycle back
mov edx,4
sub edx,ecx ;edx = 4 - Len (left, so it's 4 - Len % 4 in absolute terms)
sub esi,edx ;Point to last DWORD (Len-4 in absolute terms, go back 4-Len in relative terms)
mov ecx,4 ;Set Len=4 to cycle one more time
jmp Checksum32Loop
Checksum32Done:
ENDM
Also, note that converting numbers to strings to extract digits or bits is generally bad practice. Use the >> shift operator instead, possibly combined with an AND mask: for example, the high bit of a 32-bit value is (c >> 31) & 1.
Here is a simple bpftrace script:
#!/usr/bin/env bpftrace
tracepoint:syscalls:sys_enter_kill
{
$tpid = args->pid;
printf("%d %d %d\n", $tpid, $tpid < 0, $tpid >= 0);
}
It traces kill syscalls, prints the target PID and two additional values: whether it is negative, and whether it is non-negative.
Here is the output that I get:
# ./test.bt
Attaching 1 probe...
-1746 0 1
-2202 0 1
4160 0 1
4197 0 1
4197 0 1
-2202 0 1
-1746 0 1
Weirdly, both positive and negative pids appear to be non-negative to the comparison operators.
Just as a sanity check, if I replace the assignment line with:
$tpid = -10;
what I get is exactly what I expect:
# ./test.bt
Attaching 1 probe...
-10 1 0
-10 1 0
-10 1 0
What am I doing wrong?
As you've discovered, bpftrace assigns a u64 type to your $tpid variable. Yet, according to the tracepoint format doc., args->pid should be of type pid_t, or int.
# cat /sys/kernel/debug/tracing/events/syscalls/sys_enter_kill/format
name: sys_enter_kill
ID: 185
format:
field:unsigned short common_type; offset:0; size:2; signed:0;
field:unsigned char common_flags; offset:2; size:1; signed:0;
field:unsigned char common_preempt_count; offset:3; size:1; signed:0;
field:int common_pid; offset:4; size:4; signed:1;
field:int __syscall_nr; offset:8; size:4; signed:1;
field:pid_t pid; offset:16; size:8; signed:0;
field:int sig; offset:24; size:8; signed:0;
print fmt: "pid: 0x%08lx, sig: 0x%08lx", ((unsigned long)(REC->pid)), ((unsigned long)(REC->sig))
The bpftrace function that assigns this type is TracepointFormatParser::adjust_integer_types(). This change was introduced by commit 42ce08f to address issue #124.
For the above tracepoint description, bpftrace generates the following structure:
struct _tracepoint_syscalls_sys_enter_kill
{
unsigned short common_type;
unsigned char common_flags;
unsigned char common_preempt_count;
int common_pid;
int __syscall_nr;
u64 pid;
s64 sig;
};
When it should likely generate:
struct _tracepoint_syscalls_sys_enter_kill
{
unsigned short common_type;
unsigned char common_flags;
unsigned char common_preempt_count;
int common_pid;
int __syscall_nr;
u32 pad1;
pid_t pid;
u32 pad2;
int sig;
};
bpftrace seems to be confused by the size parameter that doesn't match the type in the above description. All syscall arguments get size 8 (on 64-bit at least), but that doesn't mean all 8 bytes are used. I think it would be worth opening an issue on bpftrace.
There is something strange going on with integer types in bpftrace (see #554, #772, #834 for details).
It seems that in my case args->pid gets treated as a 64-bit value by default, while it is actually not. So the solution is to explicitly cast it:
$tpid = (int32)args->pid;
And now it works as expected:
# bpftrace test.bt
Attaching 1 probe...
-2202 1 0
-1746 1 0
-2202 1 0
4160 0 1
4197 0 1
I was looking to move away from using counter buffers for some compute shader routines, and ran into some unexpected behaviour on Nvidia cards.
I made a really simplified example (it does not make sense on its own, but it's the smallest case that reproduces the issue I encounter).
I want to perform conditional writes to several locations in a buffer (also for simplification, I only run a single thread, since the behaviour can be reproduced that way too).
I will write 4 uints, then 2 uint3 values (using InterlockedAdd to "simulate conditional writes").
So I use a single buffer (with raw UAV access), with the following simple layout:
0 -> First counter
4 -> Second counter
8 till 24 -> First 4 ints to write
24 till 48 -> Pair of uint3 to write
I also clear the buffer every frame (0 for each counter, and an arbitrary value, 12345 in this case, for the rest).
I copy the buffer to a staging resource in order to check the values, so yes, my pipeline binding is correct, but I can post that code if asked.
Now when I call the compute shader, performing only 4 increments, as here:
RWByteAddressBuffer RWByteBuffer : BACKBUFFER;
#define COUNTER0_LOCATION 0
#define COUNTER1_LOCATION 4
#define PASS1_LOCATION 8
#define PASS2_LOCATION 24
[numthreads(1,1,1)]
void CS(uint3 tid : SV_DispatchThreadID)
{
uint i0,i1,i2,i3;
RWByteBuffer.InterlockedAdd(COUNTER0_LOCATION, 1, i0);
RWByteBuffer.Store(PASS1_LOCATION + i0 * 4, 10);
RWByteBuffer.InterlockedAdd(COUNTER0_LOCATION, 1, i1);
RWByteBuffer.Store(PASS1_LOCATION + i1 * 4, 20);
RWByteBuffer.InterlockedAdd(COUNTER0_LOCATION, 1, i2);
RWByteBuffer.Store(PASS1_LOCATION + i2 * 4, 30);
RWByteBuffer.InterlockedAdd(COUNTER0_LOCATION, 1, i3);
RWByteBuffer.Store(PASS1_LOCATION + i3 * 4, 40);
}
I then obtain the following results (formatted a little):
4,0,
10,20,30,40,
12345,12345,12345,12345,12345,12345,12345,12345,12345
This is correct (the first counter is 4 since I called InterlockedAdd four times, the second one was never touched), I get 10 to 40 in the right locations, and the rest keeps its default value.
Now if I want to reuse those indices in order to write them to another location:
[numthreads(1,1,1)]
void CS(uint3 tid : SV_DispatchThreadID)
{
uint i0,i1,i2,i3;
RWByteBuffer.InterlockedAdd(COUNTER0_LOCATION, 1, i0);
RWByteBuffer.Store(PASS1_LOCATION + i0 * 4, 10);
RWByteBuffer.InterlockedAdd(COUNTER0_LOCATION, 1, i1);
RWByteBuffer.Store(PASS1_LOCATION + i1 * 4, 20);
RWByteBuffer.InterlockedAdd(COUNTER0_LOCATION, 1, i2);
RWByteBuffer.Store(PASS1_LOCATION + i2 * 4, 30);
RWByteBuffer.InterlockedAdd(COUNTER0_LOCATION, 1, i3);
RWByteBuffer.Store(PASS1_LOCATION + i3 * 4, 40);
uint3 inds = uint3(i0, i1, i2);
uint3 inds2 = uint3(i1,i2,i3);
uint writeIndex;
RWByteBuffer.InterlockedAdd(COUNTER1_LOCATION, 1, writeIndex);
RWByteBuffer.Store3(PASS2_LOCATION + writeIndex * 12, inds);
RWByteBuffer.InterlockedAdd(COUNTER1_LOCATION, 1, writeIndex);
RWByteBuffer.Store3(PASS2_LOCATION + writeIndex * 12, inds2);
}
Now if I run that code on an Intel card (tried an HD4000 and HD4600) or an ATI card (a 290), I get the expected results, e.g.:
4,2,
10,20,30,40,
0,1,2,1,2,3
But running that on NVidia (tried a 970M, GTX 1080 and GTX 570), I get the following:
4,2,
40,12345,12345,12345,
0,0,0,0,0,0
So it seems InterlockedAdd suddenly returns 0 as its output value (it still increments properly, since the counter ends up at 4), so every store goes to the first slot and we end up with only 40 there.
Also, we can see that only 0 got written for i1, i2, i3.
In case I "reserve memory", e.g. call InterlockedAdd only once per counter (incrementing by 4 and 2, respectively):
[numthreads(1,1,1)]
void CSB(uint3 tid : SV_DispatchThreadID)
{
uint i0;
RWByteBuffer.InterlockedAdd(COUNTER0_LOCATION, 4, i0);
uint i1 = i0 + 1;
uint i2 = i0 + 2;
uint i3 = i0 + 3;
RWByteBuffer.Store(PASS1_LOCATION + i0 * 4, 10);
RWByteBuffer.Store(PASS1_LOCATION + i1 * 4, 20);
RWByteBuffer.Store(PASS1_LOCATION + i2 * 4, 30);
RWByteBuffer.Store(PASS1_LOCATION + i3 * 4, 40);
uint3 inds = uint3(i0, i1, i2);
uint3 inds2 = uint3(i1,i2,i3);
uint writeIndex;
RWByteBuffer.InterlockedAdd(COUNTER1_LOCATION, 2, writeIndex);
uint writeIndex2 = writeIndex + 1;
RWByteBuffer.Store3(PASS2_LOCATION + writeIndex * 12, inds);
RWByteBuffer.Store3(PASS2_LOCATION + writeIndex2 * 12, inds2);
}
Then this works on all cards, but I have some cases where I have to rely on the earlier behaviour.
As a side note, if I use structured buffers with a counter flag on the UAV instead of a location in a byte address buffer, and do:
RWStructuredBuffer<uint> rwCounterBuffer1;
RWStructuredBuffer<uint> rwCounterBuffer2;
RWByteAddressBuffer RWByteBuffer : BACKBUFFER;
#define PASS1_LOCATION 8
#define PASS2_LOCATION 24
[numthreads(1,1,1)]
void CS(uint3 tid : SV_DispatchThreadID)
{
uint i0 = rwCounterBuffer1.IncrementCounter();
uint i1 = rwCounterBuffer1.IncrementCounter();
uint i2 = rwCounterBuffer1.IncrementCounter();
uint i3 = rwCounterBuffer1.IncrementCounter();
RWByteBuffer.Store(PASS1_LOCATION + i0 * 4, 10);
RWByteBuffer.Store(PASS1_LOCATION + i1 * 4, 20);
RWByteBuffer.Store(PASS1_LOCATION + i2 * 4, 30);
RWByteBuffer.Store(PASS1_LOCATION + i3 * 4, 40);
uint3 inds = uint3(i0, i1, i2);
uint3 inds2 = uint3(i1,i2,i3);
uint writeIndex1= rwCounterBuffer2.IncrementCounter();
uint writeIndex2= rwCounterBuffer2.IncrementCounter();
RWByteBuffer.Store3(PASS2_LOCATION + writeIndex1* 12, inds);
RWByteBuffer.Store3(PASS2_LOCATION + writeIndex2* 12, inds2);
}
This works correctly across all cards, but has all sorts of issues (that are out of topic for this question).
This is running on DirectX 11 (I did not try it on DX12; that's not relevant to my use case, except out of plain curiosity).
So is it a bug on NVidia?
Or is there something wrong with the first approach?
The printf function calls write (see forktest.c):
void printf ( int fd, char *s, ... )
{
write( fd, s, strlen(s) );
}
Passing 1 as the fd writes to the console (as 1 maps to stdout). But where is write defined? I only see its declaration in user.h.
int write ( int, void*, int );
I'm assuming it somehow gets redirected to filewrite in file.c.
int filewrite (struct file *f, char *addr, int n )
{
int r;
if ( f->writable == 0 )
return -1;
if ( f->type == FD_PIPE )
return pipewrite( f->pipe, addr, n );
if ( f->type == FD_INODE )
{
// write a few blocks at a time to avoid exceeding
// the maximum log transaction size, including
// i-node, indirect block, allocation blocks,
// and 2 blocks of slop for non-aligned writes.
// this really belongs lower down, since writei()
// might be writing a device like the console.
int max = ( ( MAXOPBLOCKS - 1 - 1 - 2 ) / 2 ) * 512;
int i = 0;
while ( i < n )
{
int n1 = n - i;
if ( n1 > max )
n1 = max;
begin_op();
ilock( f->ip );
if ( ( r = writei( f->ip, addr + i, f->off, n1 ) ) > 0 )
f->off += r;
iunlock( f->ip );
end_op();
if ( r < 0 )
break;
if ( r != n1 )
panic( "short filewrite" );
i += r;
}
return i == n ? n : -1;
}
panic( "filewrite" );
}
And filewrite calls writei which is defined in fs.c.
int writei ( struct inode *ip, char *src, uint off, uint n )
{
uint tot, m;
struct buf *bp;
if ( ip->type == T_DEV )
{
if ( ip->major < 0 || ip->major >= NDEV || !devsw[ ip->major ].write )
return -1;
return devsw[ ip->major ].write( ip, src, n );
}
if ( off > ip->size || off + n < off )
return -1;
if ( off + n > MAXFILE*BSIZE )
return -1;
for ( tot = 0; tot < n; tot += m, off += m, src += m )
{
bp = bread( ip->dev, bmap( ip, off/BSIZE ) );
m = min( n - tot, BSIZE - off%BSIZE );
memmove( bp->data + off%BSIZE, src, m );
log_write( bp );
brelse( bp );
}
if ( n > 0 && off > ip->size )
{
ip->size = off;
iupdate( ip );
}
return n;
}
How does all this result in the terminal displaying the characters? How does the terminal know to read fd 1 for display, and where to find fd 1? What is the format of fd 1? Is it a standard?
Below is the full path from printf to the terminal. The gist is that, eventually, xv6 writes the character to the machine's serial port (UART).
QEMU is started with the flag -nographic or -serial mon:stdio, which tells it to use your terminal to send data to, and receive data from, the emulated serial port.
Step 1) printf in forktest.c
void printf ( int fd, const char *s, ... )
{
write( fd, s, strlen( s ) );
}
void forktest ( void )
{
...
printf( 1, "fork test\n" );
...
}
Step 2) write in usys.S
.globl write
write:
movl $SYS_write, %eax
int $T_SYSCALL
ret
Step 3) sys_write in sysfile.c
int sys_write ( void )
{
...
argfd( 0, 0, &f )
...
return filewrite( f, p, n );
}
static int argfd ( int n, int *pfd, struct file **pf )
{
...
f = myproc()->ofile[ fd ]
...
}
Previously, during system initialization, main in init.c was called; this is where the stdin (0), stdout (1), and stderr (2) file descriptors are created, and this is what argfd finds when looking up the file descriptor argument to sys_write.
int main ( void )
{
...
if ( open( "console", O_RDWR ) < 0 )
{
mknod( "console", 1, 1 ); // stdin
open( "console", O_RDWR );
}
dup( 0 ); // stdout
dup( 0 ); // stderr
...
}
stdin/stdout/stderr refer to inodes of type T_DEV because they are created using mknod in sysfile.c:
int sys_mknod ( void )
{
...
ip = create( path, T_DEV, major, minor )
...
}
The major device number of 1 that is used to create them is mapped to the console. See file.h
// Table mapping major device number to device functions
struct devsw
{
int ( *read )( struct inode*, char*, int );
int ( *write )( struct inode*, char*, int );
};
extern struct devsw devsw [];
#define CONSOLE 1
Step 4) filewrite in file.c
int filewrite ( struct file *f, char *addr, int n )
{
...
if ( f->type == FD_INODE )
{
...
writei( f->ip, addr + i, f->off, n1 )
...
}
...
}
Step 5) writei in fs.c
int writei ( struct inode *ip, char *src, uint off, uint n )
{
...
if ( ip->type == T_DEV )
{
...
return devsw[ ip->major ].write( ip, src, n );
}
...
}
The call to devsw[ ip->major ].write( ip, src, n )
becomes devsw[ CONSOLE ].write( ip, src, n ).
Previously during system initialization, consoleinit mapped this to the function consolewrite (see console.c)
void consoleinit ( void )
{
...
devsw[ CONSOLE ].write = consolewrite;
devsw[ CONSOLE ].read = consoleread;
...
}
Step 6) consolewrite in console.c
int consolewrite ( struct inode *ip, char *buf, int n )
{
...
for ( i = 0; i < n; i += 1 )
{
consputc( buf[ i ] & 0xff );
}
...
}
Step 7) consputc in console.c
void consputc ( int c )
{
...
uartputc( c );
...
}
Step 8) uartputc in uart.c.
The out assembly instruction, issued through the outb wrapper, is used to write the byte to the serial port.
#define COM1 0x3f8 // serial port
...
void uartputc ( int c )
{
...
outb( COM1 + 0, c );
}
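For reference, outb itself is just a thin inline-assembly wrapper around the x86 out instruction (defined in x86.h in stock xv6), roughly:
// ushort and uchar are xv6 typedefs for unsigned short / unsigned char (types.h)
static inline void outb ( ushort port, uchar data )
{
    // write one byte to an x86 I/O port
    asm volatile( "out %0,%1" : : "a" (data), "d" (port) );
}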
Step 9) QEMU is configured to use the serial port for communication in the Makefile through the -nographic or -serial mon:stdio flags. QEMU uses the terminal to send data to the serial port, and to display data from the serial port.
qemu: fs.img xv6.img
$(QEMU) -serial mon:stdio $(QEMUOPTS)
qemu-nox: fs.img xv6.img
$(QEMU) -nographic $(QEMUOPTS)
fd==1 refers to stdout, or Standard Out. It's a common feature of Unix-like Operating Systems. The kernel knows that it's not a real file; writes to stdout are mapped to terminal output.
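So, for example, a minimal xv6 user program like the following (a hypothetical snippet, not from the question) ends up printing to your terminal through exactly the chain described above:
#include "types.h"
#include "stat.h"
#include "user.h"

int main ( void )
{
    char msg[] = "hello\n";
    // fd 1 -> console inode -> filewrite -> writei -> consolewrite -> uartputc
    write( 1, msg, sizeof( msg ) - 1 );
    exit();
}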
I am launching a kernel with linear blocks of 512 threads. Associated with each thread are six double precision values (two 3-element vectors) that I would like to store in shared memory, for a total of 512*6*8 = 24576 bytes. I would like to create pointers to the intermediate elements of shared memory to line all the vectors up as follows:
__global__ void my_kernel(double *global_data) {
extern __shared__ double shr[];
int id = threadIdx.x;
double *X = &shr[id*3];
double *Y = &shr[(id+1)*3];
// Some arithmetic to set X[0:3] and Y[0:3]
// Now I have a small for loop to compute something for each thread
for (int i = 0; i < 3; i++) {
for (int j=0; j < 3; j++) {
// Some computations involving the X and Y vectors
}
}
}
My problem is with accessing the values in X and Y using the looping indices. I am unable to explain the following behavior during the first loop iteration:
(cuda-gdb) cuda thread
thread (0,0,0)
(cuda-gdb) p shr[0]
$1 = 0.62293193093894383
(cuda-gdb) p &shr[0]
$2 = (#shared double *) 0x0
(cuda-gdb) p X[0]
$3 = 0.62293193093894383
(cuda-gdb) p &X[0]
$4 = (#generic double *) 0x1000000
(cuda-gdb) p X
$5 = (#generic double * #register) 0x1000000
I think this is normal. But then:
(cuda-gdb) p i == 0
$7 = true
(cuda-gdb) p X[i]
Error: Failed to read global memory at address 0x0 on device 0 sm 0 warp 0 lane 0 (error=7).
Why is it that when i == 0 I can access X[0] but not X[i]?
EDIT: Here is a complete working example demonstrating my issue:
import pycuda.gpuarray as gpuarray
import pycuda.driver as cuda
import pycuda.autoinit
import numpy as np
from pycuda.compiler import SourceModule
from math import pi
mydat = np.arange(12).astype(np.float64)
mydat_gpu = gpuarray.to_gpu(mydat)
mod = SourceModule("""
__global__ void my_kernel(double *mydat) {
extern __shared__ double shr[];
int id = threadIdx.x;
double *X = &shr[(id * 6)];
double *Y = &shr[(id * 6) + 3];
X[0] = mydat[0];
X[1] = mydat[1];
X[2] = mydat[2];
Y[0] = mydat[3];
Y[1] = mydat[4];
Y[2] = mydat[5];
__syncthreads();
double result;
for (int i = 0; i < 3; i++) {
result += X[i] + Y[i];
}
}
""")
my_kernel = mod.get_function("my_kernel")
blk = (1,1,1)
grd = (1,1,1)
my_kernel(mydat_gpu, grid=grd, block=blk, shared=(8*6))
At this point I start up a debugging session:
cuda-gdb --args python -m pycuda.debug minimal_working_example.py
(cuda-gdb) b my_kernel
Function "my_kernel" not defined.
Make breakpoint pending on future shared library load? (y or [n]) y
Breakpoint 1 (my_kernel) pending.
(cuda-gdb) run
[Switching focus to CUDA kernel 0, grid 1, block (0,0,0), thread (0,0,0), device 0, sm 0, warp 0, lane 0]
Breakpoint 1, my_kernel(double * #generic)<<<(1,1,1),(1,1,1)>>> (mydat=0x13034a0000)
at kernel.cu:5
5 int id = threadIdx.x;
(cuda-gdb) n
7 double *X = &shr[(id * 6)];
(cuda-gdb) p id
$1 = 0
(cuda-gdb) p id * 6
$2 = 0
(cuda-gdb) n
8 double *Y = &shr[(id * 6) + 3];
(cuda-gdb) p (id * 6) + 3
$3 = 3
(cuda-gdb) n
10 X[0] = mydat[0];
(cuda-gdb) n
11 X[1] = mydat[1];
(cuda-gdb) n
12 X[2] = mydat[2];
(cuda-gdb) n
13 Y[0] = mydat[3];
(cuda-gdb) n
14 Y[1] = mydat[4];
(cuda-gdb) n
15 Y[2] = mydat[5];
(cuda-gdb) p X
$4 = (#generic double * #register) 0x1000000
(cuda-gdb) p X[0]
$5 = 0
(cuda-gdb) p X[1]
$6 = 1
(cuda-gdb) p Y[0]
$7 = 3
(cuda-gdb) p Y[1]
$8 = 4
(cuda-gdb) n
18 __syncthreads();
(cuda-gdb) n
22 for (int i = 0; i < 3; i++) {
(cuda-gdb) n
23 result += X[i] + Y[i];
(cuda-gdb) p i
$9 = 0
(cuda-gdb) p X[0]
$10 = 0
(cuda-gdb) p X[i]
Error: Failed to read global memory at address 0x0 on device 0 sm 0 warp 0 lane 0 (error=7).
All that is happening here is that you are stepping through source instructions which have not actually been compiled into the running kernel. The variables you are trying to inspect have already gone out of scope and the debugger can no longer show them to you.
This is due to aggressive optimisation in the device code compiler. In your example, the summation loop doesn't produce an output which results in a write to global or shared memory, so the compiler simply eliminates it. When stepping through the optimised code, the source debugger tries its best to show a 1:1 relationship between source and execution, but that isn't always possible, and this is the somewhat confusing result you are seeing.
You can confirm this for yourself by compiling your kernel code to PTX with nvcc (e.g. nvcc -ptx kernel.cu) and inspecting the output:
// .globl _Z9my_kernelPd
.visible .entry _Z9my_kernelPd(
.param .u64 _Z9my_kernelPd_param_0
)
{
.reg .b32 %r<3>;
.reg .f64 %fd<7>;
.reg .b64 %rd<6>;
ld.param.u64 %rd1, [_Z9my_kernelPd_param_0];
cvta.to.global.u64 %rd2, %rd1;
mov.u32 %r1, %tid.x;
mul.lo.s32 %r2, %r1, 6;
mul.wide.s32 %rd3, %r2, 8;
mov.u64 %rd4, shr;
add.s64 %rd5, %rd4, %rd3;
ld.global.nc.f64 %fd1, [%rd2];
ld.global.nc.f64 %fd2, [%rd2+8];
ld.global.nc.f64 %fd3, [%rd2+16];
ld.global.nc.f64 %fd4, [%rd2+24];
ld.global.nc.f64 %fd5, [%rd2+32];
ld.global.nc.f64 %fd6, [%rd2+40];
st.shared.f64 [%rd5], %fd1;
st.shared.f64 [%rd5+8], %fd2;
st.shared.f64 [%rd5+16], %fd3;
st.shared.f64 [%rd5+24], %fd4;
st.shared.f64 [%rd5+32], %fd5;
st.shared.f64 [%rd5+40], %fd6;
bar.sync 0;
ret;
}
You can see that the last PTX instruction before the ret is bar.sync, which is the instruction that the __syncthreads() device function emits. The summation loop is not present at all.
If I modify your source like this:
__global__ void my_kernel2(double *mydat, double *out) {
extern __shared__ double shr[];
int id = threadIdx.x;
double *X = &shr[(id * 6)];
double *Y = &shr[(id * 6) + 3];
X[0] = mydat[0];
X[1] = mydat[1];
X[2] = mydat[2];
Y[0] = mydat[3];
Y[1] = mydat[4];
Y[2] = mydat[5];
__syncthreads();
double result;
for (int i = 0; i < 3; i++) {
result += X[i] + Y[i];
}
*out = result;
}
so that result is now stored to global memory and compile it to PTX:
.visible .entry _Z10my_kernel2PdS_(
.param .u64 _Z10my_kernel2PdS__param_0,
.param .u64 _Z10my_kernel2PdS__param_1
)
{
.reg .b32 %r<3>;
.reg .f64 %fd<20>;
.reg .b64 %rd<8>;
ld.param.u64 %rd3, [_Z10my_kernel2PdS__param_0];
ld.param.u64 %rd2, [_Z10my_kernel2PdS__param_1];
cvta.to.global.u64 %rd4, %rd3;
mov.u32 %r1, %tid.x;
mul.lo.s32 %r2, %r1, 6;
mul.wide.s32 %rd5, %r2, 8;
mov.u64 %rd6, shr;
add.s64 %rd1, %rd6, %rd5;
ld.global.f64 %fd1, [%rd4];
ld.global.f64 %fd2, [%rd4+8];
ld.global.f64 %fd3, [%rd4+16];
ld.global.f64 %fd4, [%rd4+24];
ld.global.f64 %fd5, [%rd4+32];
ld.global.f64 %fd6, [%rd4+40];
st.shared.f64 [%rd1], %fd1;
st.shared.f64 [%rd1+8], %fd2;
st.shared.f64 [%rd1+16], %fd3;
st.shared.f64 [%rd1+24], %fd4;
st.shared.f64 [%rd1+32], %fd5;
st.shared.f64 [%rd1+40], %fd6;
bar.sync 0;
ld.shared.f64 %fd7, [%rd1];
ld.shared.f64 %fd8, [%rd1+24];
add.f64 %fd9, %fd7, %fd8;
add.f64 %fd10, %fd9, %fd11;
ld.shared.f64 %fd12, [%rd1+8];
ld.shared.f64 %fd13, [%rd1+32];
add.f64 %fd14, %fd12, %fd13;
add.f64 %fd15, %fd10, %fd14;
ld.shared.f64 %fd16, [%rd1+16];
ld.shared.f64 %fd17, [%rd1+40];
add.f64 %fd18, %fd16, %fd17;
add.f64 %fd19, %fd15, %fd18;
cvta.to.global.u64 %rd7, %rd2;
st.global.f64 [%rd7], %fd19;
ret;
}
You can see that the (unrolled) loop is now present in the PTX, and the debugger behaviour should be closer to what you expect if you were to try it.
As suggested in comments, you shouldn't ever spend time trying to analyse any code which doesn't change block or global state, because of the complications caused by compiler optimisation.