How does CUDA constant memory allocation work? - memory

I'd like to get some insight into how constant memory is allocated (using CUDA 4.2). I know that the total available constant memory is 64KB. But when is this memory actually allocated on the device? Does this limit apply to each kernel, to each CUDA context, or to the whole application?
Let's say there are several kernels in a .cu file, each using less than 64K constant memory. But the total constant memory usage is more than 64K. Is it possible to call these kernels sequentially? What happens if they are called concurrently using different streams?
What happens if there is a large CUDA dynamic library with lots of kernels each using different amounts of constant memory?
What happens if there are two applications each requiring more than half of the available constant memory? The first application runs fine, but when will the second app fail? At app start, at cudaMemcpyToSymbol() calls or at kernel execution?

Parallel Thread Execution ISA Version 3.1 section 5.1.3 discusses constant banks.
Constant memory is restricted in size, currently limited to 64KB which
can be used to hold statically-sized constant variables. There is an
additional 640KB of constant memory, organized as ten independent 64KB
regions. The driver may allocate and initialize constant buffers in
these regions and pass pointers to the buffers as kernel function
parameters. Since the ten regions are not contiguous, the driver
must ensure that constant buffers are allocated so that each buffer
fits entirely within a 64KB region and does not span a region
boundary.
A simple program can be used to illustrate the use of constant memory.
// File-scope __constant__ variables. kd_p1..kd_p4 are read by the `constants`
// kernel; kd_floats is declared but not read by either kernel in this example.
// None of them has an initializer here.
__constant__ int kd_p1;
__constant__ short kd_p2;
__constant__ char kd_p3;
__constant__ double kd_p4;
__constant__ float kd_floats[8];
// Writes each by-value kernel argument (p1..p4) to the location given by the
// matching pointer argument (pp1..pp4).
// No thread or block index is consulted, so every launched thread performs the
// same four stores to the same four addresses.
// NOTE(review): pp1..pp4 are presumably device-accessible pointers; the launch
// code is not shown here, so confirm against the caller.
__global__ void parameters(int p1, short p2, char p3, double p4, int* pp1, short* pp2, char* pp3, double* pp4)
{
*pp1 = p1;
*pp2 = p2;
*pp3 = p3;
*pp4 = p4;
// Redundant at the end of a void kernel; kept as written.
return;
}
// Writes the file-scope __constant__ variables kd_p1..kd_p4 to the locations
// given by pp1..pp4. kd_floats is not read.
// No thread or block index is consulted, so every launched thread performs the
// same four stores to the same four addresses.
// NOTE(review): pp1..pp4 are presumably device-accessible pointers; the launch
// code is not shown here, so confirm against the caller.
__global__ void constants(int* pp1, short* pp2, char* pp3, double* pp4)
{
*pp1 = kd_p1;
*pp2 = kd_p2;
*pp3 = kd_p3;
*pp4 = kd_p4;
// Redundant at the end of a void kernel; kept as written.
return;
}
Compile this for compute_30, sm_30 and run cuobjdump -sass <executable or obj> to disassemble it. You should see:
Fatbin elf code:
================
arch = sm_30
code version = [1,6]
producer = cuda
host = windows
compile_size = 32bit
identifier = c:/dev/constant_banks/kernel.cu
code for sm_30
Function : _Z10parametersiscdPiPsPcPd
/*0008*/ /*0x10005de428004001*/ MOV R1, c [0x0] [0x44]; // stack pointer
/*0010*/ /*0x40001de428004005*/ MOV R0, c [0x0] [0x150]; // pp1
/*0018*/ /*0x50009de428004005*/ MOV R2, c [0x0] [0x154]; // pp2
/*0020*/ /*0x0001dde428004005*/ MOV R7, c [0x0] [0x140]; // p1
/*0028*/ /*0x13f0dc4614000005*/ LDC.U16 R3, c [0x0] [0x144]; // p2
/*0030*/ /*0x60011de428004005*/ MOV R4, c [0x0] [0x158]; // pp3
/*0038*/ /*0x70019de428004005*/ MOV R6, c [0x0] [0x15c]; // pp4
/*0048*/ /*0x20021de428004005*/ MOV R8, c [0x0] [0x148]; // p4
/*0050*/ /*0x30025de428004005*/ MOV R9, c [0x0] [0x14c]; // p4
/*0058*/ /*0x1bf15c0614000005*/ LDC.U8 R5, c [0x0] [0x146]; // p3
/*0060*/ /*0x0001dc8590000000*/ ST [R0], R7; // *pp1 = p1
/*0068*/ /*0x0020dc4590000000*/ ST.U16 [R2], R3; // *pp2 = p2
/*0070*/ /*0x00415c0590000000*/ ST.U8 [R4], R5; // *pp3 = p3
/*0078*/ /*0x00621ca590000000*/ ST.64 [R6], R8; // *pp4 = p4
/*0088*/ /*0x00001de780000000*/ EXIT;
/*0090*/ /*0xe0001de74003ffff*/ BRA 0x90;
/*0098*/ /*0x00001de440000000*/ NOP CC.T;
/*00a0*/ /*0x00001de440000000*/ NOP CC.T;
/*00a8*/ /*0x00001de440000000*/ NOP CC.T;
/*00b0*/ /*0x00001de440000000*/ NOP CC.T;
/*00b8*/ /*0x00001de440000000*/ NOP CC.T;
...........................................
Function : _Z9constantsPiPsPcPd
/*0008*/ /*0x10005de428004001*/ MOV R1, c [0x0] [0x44]; // stack pointer
/*0010*/ /*0x00001de428004005*/ MOV R0, c [0x0] [0x140]; // pp1
/*0018*/ /*0x10009de428004005*/ MOV R2, c [0x0] [0x144]; // pp2
/*0020*/ /*0x0001dde428004c00*/ MOV R7, c [0x3] [0x0]; // kd_p1
/*0028*/ /*0x13f0dc4614000c00*/ LDC.U16 R3, c [0x3] [0x4]; // kd_p2
/*0030*/ /*0x20011de428004005*/ MOV R4, c [0x0] [0x148]; // pp3
/*0038*/ /*0x30019de428004005*/ MOV R6, c [0x0] [0x14c]; // pp4
/*0048*/ /*0x20021de428004c00*/ MOV R8, c [0x3] [0x8]; // kd_p4
/*0050*/ /*0x30025de428004c00*/ MOV R9, c [0x3] [0xc]; // kd_p4
/*0058*/ /*0x1bf15c0614000c00*/ LDC.U8 R5, c [0x3] [0x6]; // kd_p3
/*0060*/ /*0x0001dc8590000000*/ ST [R0], R7;
/*0068*/ /*0x0020dc4590000000*/ ST.U16 [R2], R3;
/*0070*/ /*0x00415c0590000000*/ ST.U8 [R4], R5;
/*0078*/ /*0x00621ca590000000*/ ST.64 [R6], R8;
/*0088*/ /*0x00001de780000000*/ EXIT;
/*0090*/ /*0xe0001de74003ffff*/ BRA 0x90;
/*0098*/ /*0x00001de440000000*/ NOP CC.T;
/*00a0*/ /*0x00001de440000000*/ NOP CC.T;
/*00a8*/ /*0x00001de440000000*/ NOP CC.T;
/*00b0*/ /*0x00001de440000000*/ NOP CC.T;
/*00b8*/ /*0x00001de440000000*/ NOP CC.T;
.....................................
I annotated to the right of the SASS.
On sm30 you can see that parameters are passed in constant bank 0 starting at offset 0x140.
User defined __constant__ variables are defined in constant bank 3.
If you execute cuobjdump --dump-elf <executable or obj> you can find other interesting constant information.
32bit elf: abi=6, sm=30, flags = 0x1e011e
Sections:
Index Offset Size ES Align Type Flags Link Info Name
1 34 142 0 1 STRTAB 0 0 0 .shstrtab
2 176 19b 0 1 STRTAB 0 0 0 .strtab
3 314 d0 10 4 SYMTAB 0 2 a .symtab
4 3e4 50 0 4 CUDA_INFO 0 3 b .nv.info._Z9constantsPiPsPcPd
5 434 30 0 4 CUDA_INFO 0 3 0 .nv.info
6 464 90 0 4 CUDA_INFO 0 3 a .nv.info._Z10parametersiscdPiPsPcPd
7 4f4 160 0 4 PROGBITS 2 0 a .nv.constant0._Z10parametersiscdPiPsPcPd
8 654 150 0 4 PROGBITS 2 0 b .nv.constant0._Z9constantsPiPsPcPd
9 7a8 30 0 8 PROGBITS 2 0 0 .nv.constant3
a 7d8 c0 0 4 PROGBITS 6 3 a00000b .text._Z10parametersiscdPiPsPcPd
b 898 c0 0 4 PROGBITS 6 3 a00000c .text._Z9constantsPiPsPcPd
.section .strtab
.section .shstrtab
.section .symtab
index value size info other shndx name
0 0 0 0 0 0 (null)
1 0 0 3 0 a .text._Z10parametersiscdPiPsPcPd
2 0 0 3 0 7 .nv.constant0._Z10parametersiscdPiPsPcPd
3 0 0 3 0 b .text._Z9constantsPiPsPcPd
4 0 0 3 0 8 .nv.constant0._Z9constantsPiPsPcPd
5 0 0 3 0 9 .nv.constant3
6 0 4 1 0 9 kd_p1
7 4 2 1 0 9 kd_p2
8 6 1 1 0 9 kd_p3
9 8 8 1 0 9 kd_p4
10 16 32 1 0 9 kd_floats
11 0 192 12 10 a _Z10parametersiscdPiPsPcPd
12 0 192 12 10 b _Z9constantsPiPsPcPd
The kernel parameter constant bank is versioned per launch so that concurrent kernels can be executed. The compiler and user constants are per CUmodule. It is the responsibility of the developer to manage coherency of this data. For example, the developer has to ensure that an update made with cudaMemcpyToSymbol is performed in a safe manner.

Related

LLVM is optimizing the intrinsic code as well

We have some code that is written manually with intrinsics, but LLVM tries to optimize it further because of the --fast-math flag. The manually written intrinsic code is better than the LLVM-optimized version.
Example source code:
/// Evaluates the polynomial sum_{i=0..APPROX_DEFAULT} coeff[i] * x^i,
/// independently in each packed single-precision lane of a __m256.
///
/// This is an explicit specialization of simd_evaluate_polynomial for
/// <__m256, APPROX_DEFAULT>; C++ requires the `template <>` introducer on an
/// explicit specialization, which the original snippet omitted.
///
/// @param x      Evaluation points, one per lane.
/// @param coeff  Coefficients in ascending power order; coeff[i] multiplies x^i.
/// @return       The per-lane polynomial value.
///
/// The terms are accumulated one at a time with a separate multiply and add,
/// exactly as in the original, so results are unchanged under strict
/// floating-point semantics.
template <>
inline __m256 simd_evaluate_polynomial<__m256, APPROX_DEFAULT>(__m256 x, const std::array<__m256, APPROX_DEFAULT + 1>& coeff)
{
    __m256 power = _mm256_set1_ps(1.0f);  // running x^i, starts at x^0
    __m256 res = _mm256_set1_ps(0.0f);    // running sum
    for (unsigned int i = 0; i <= APPROX_DEFAULT; i++) {
        __m256 term = _mm256_mul_ps(coeff[i], power);
        power = _mm256_mul_ps(power, x);
        res = _mm256_add_ps(res, term);
    }
    return res;
}
For above function LLVM ASSEMBLY
Address Source Line Assembly CPU Time: Total CPU Time: Self
0x1402bbf7d 0 Block 1:
0x1402bbf7d 19 vmovaps ymm5, ymmword ptr [rip+0x50e4b5b] 0.1% 15.584ms
0x1402bbf85 19 vfmadd213ps ymm5, ymm3, ymmword ptr [rip+0x50e4b32] 0.1% 15.595ms
0x1402bbf8e 19 vfmadd213ps ymm5, ymm3, ymmword ptr [rip+0x50e4b09] 0.6% 93.654ms
0x1402bbf97 19 vfmadd213ps ymm5, ymm3, ymmword ptr [rip+0x50e4ae0] 0.2% 31.178ms
0x1402bbfa0 21 vfmadd213ps ymm5, ymm3, ymmword ptr [rip+0x50e4ab7] 0.3% 46.992ms
Can anyone please explain why this is happening?

32-bit ADD on Aarch64 assembly

This is my first post here and I'm also kind of new to arm64 assembly, so I'm trying to do some arithmetic, but for example when I try to do an addition it seems to do it in 32-bit.
Here's my code:
// Reads two integers with scanf, adds them, and prints the sum with printf.
.data
// NUL-terminated strings passed to printf/scanf.
msg: .asciz "Value 1: "
msg2: .asciz "Value 2: "
// NOTE: "%d" formats an int (32-bit on AArch64 Linux), but the sum passed in x1
// below is 64-bit, so only its low 32 bits are printed. A 64-bit value needs "%ld".
result: .asciz "Result: %d\n"
// NOTE: with "%d", scanf stores an int into the 8-byte value1/value2 slots.
// The remaining bytes keep the 0 they were initialised with. "%ld" reads a full
// 64-bit value.
fmt: .asciz "%d"
// 8-byte, zero-initialised storage for the two inputs.
.balign 8
value1: .quad 0
.balign 8
value2: .quad 0
// Slot used to keep x30 across the calls made from main.
.balign 16
lr_value: .quad 0
.text
.global main
main:
// Save x30 (return address) in lr_value; each bl below overwrites x30, and it
// is reloaded just before ret.
adr x0, lr_value
str x30, [x0]
//Display message
// printf(msg): x0 = format string
adr x0,msg
bl printf
//Input first value
// scanf(fmt, &value1): x0 = format string, x1 = destination address
adr x0,fmt
adr x1,value1
bl scanf
//Display second message
adr x0,msg2
bl printf
//Input second value
// scanf(fmt, &value2)
adr x0,fmt
adr x1,value2
bl scanf
//Load first and second value
// Each ldr reads the full 8 bytes of the slot into an x register.
adr x1,value1
ldr x1,[x1]
adr x2,value2
ldr x2,[x2]
//Add both values on x1
// 64-bit add (x registers); 2147483647 + 1 does not overflow here.
add x1,x1,x2
//Show result
// printf(result, x1): the 64-bit sum is in x1, but the "%d" in `result` prints
// only the low 32 bits as a signed int (hence -2147483648 for 0x80000000).
adr x0,result
bl printf
// Restore the saved return address and return 0 from main.
adr x0,lr_value
ldr x30,[x0]
mov w0,#0
ret
And here's the output:
Value 1: 2147483647
Value 2: 1
Result: -2147483648
What am I doing wrong? I've also tried multiplication and subtraction.
Edit: Solved it, turns out I had to use %ld instead of %d, thank you Nate Eldredge!

iOS Neon assembler sample questions

I'm trying the example from http://api.madewithmarmalade.com/ExampleArmASM.html on iOS. The program runs if I comment out the loop, and res is printed as 28. But if I do not comment it out, it terminates abnormally without printing res.
Any hint why and how to fix it.
Thanks in advance.
My code is as follows:
#include <stdio.h>
#include <stdlib.h>
#define ARRAY_SIZE 512
#if defined __arm__ && defined __ARM_NEON__
static int computeSumNeon(const int a[])
{
    // Sums 32-bit ints from `a` with NEON and returns the total.
    //
    // As written (loop label and branch commented out) only the first 8
    // elements are accumulated, so for a[i] = i the result is 28.
    //
    // Fix over the original: the clobber list now includes "cc" (subs updates
    // the condition flags) and "memory" (the asm reads the array through
    // %[input], which the compiler cannot see from the operand list alone).
    int res = 0;
    asm(".align 4 \n\t"                               // alignment directive kept from the original (added there to avoid a warning)
        "vmov.i32 q8, #0 \n\t"                        // clear the accumulator register
        "mov r3, #512 \n\t"                           // loop counter n = ARRAY_SIZE
        // "1: \n\t"                                  // loop disabled: adding elements 0-7 gives 28.
        //                                            // A numeric local label avoids a duplicate symbol
        //                                            // if the compiler emits this asm more than once.
        "vld1.32 {d0, d1, d2, d3}, [%[input]]! \n\t"  // load 8 elements into d0..d3 (= q0, q1), post-increment pointer
        "pld [%[input]] \n\t"                         // preload the next set of elements
        "vadd.i32 q8, q0, q8 \n\t"                    // q8 += q0
        "vadd.i32 q8, q1, q8 \n\t"                    // q8 += q1
        "subs r3, r3, #8 \n\t"                        // n -= 8 (sets flags)
        // "bne 1b \n\t"                              // loop disabled: repeat until n == 0
        "vpadd.i32 d0, d16, d17 \n\t"                 // d0[0] = d16[0] + d16[1], d0[1] = d17[0] + d17[1]
        "vpaddl.u32 d0, d0 \n\t"                      // d0 = d0[0] + d0[1]
        "vmov.32 %[result], d0[0] \n\t"               // low 32 bits of the total
        : [result] "=r" (res), [input] "+r" (a)
        :
        : "q0", "q1", "q8", "r3", "cc", "memory");
    return res;
}
#else
static int computeSumNeon(const int a[])
{
    // Portable fallback used when NEON is unavailable: returns the sum of the
    // first ARRAY_SIZE elements of `a`.
    int i, res = 0;
    for (i = 0; i < ARRAY_SIZE; i++)
        res += a[i];
    // The original fell off the end of this non-void function, so the caller
    // read an indeterminate value.
    return res;
}
#endif
...
#implementation AppDelegate
- (BOOL)application:(UIApplication *)application didFinishLaunchingWithOptions:(NSDictionary *)launchOptions {
    // Launch hook: runs a quick computeSumNeon check, then wires up the split view.
    //
    // The input array lives on the stack. An earlier variant allocated it with
    // posix_memalign (64-byte alignment, the cache line size on a Cortex-A8);
    // calling free() on the stack array reported "pointer being freed was not
    // allocated", so no free() is done here.
    int inp[ARRAY_SIZE];

    // Fill with consecutive integers 0, 1, 2, ...
    for (int idx = 0; idx < ARRAY_SIZE; ++idx) {
        inp[idx] = idx;
    }

    // Echo the input followed by the expected sum of the first eight values.
    for (int idx = 0; idx < ARRAY_SIZE; ++idx) {
        printf("%i,", inp[idx]);
    }
    printf("\n\n sum 0-7:%i\n", 0+1+2+3+4+5+6+7);

    int res = computeSumNeon(inp);
    printf("res NEO :%i\n", res);

    // Standard split-view setup: show the display-mode button on the detail
    // navigation controller and make this object the split view's delegate.
    UISplitViewController *splitVC = (UISplitViewController *)self.window.rootViewController;
    UINavigationController *detailNav = splitVC.viewControllers.lastObject;
    detailNav.topViewController.navigationItem.leftBarButtonItem = splitVC.displayModeButtonItem;
    splitVC.delegate = self;

    return YES;
}
- (void)applicationWillResignActive:(UIApplication *)application {
...
==== assembly code generated
.align 1
.code 16 # #computeSumNeon
.thumb_func _computeSumNeon
_computeSumNeon:
Lfunc_begin3:
.loc 18 133 0 is_stmt 1 # ...
.cfi_startproc
# BB#0:
sub sp, #8
movs r1, #0
str r0, [sp, #4]
.loc 18 135 9 prologue_end # ...
Ltmp18:
str r1, [sp]
.loc 18 136 5 # ...
ldr r0, [sp, #4]
# InlineAsm Start
.align 4
vmov.i32 q8, #0x0
movw r3, #504
.loop1:
vld1.32 {d0, d1, d2, d3}, [r0]!
vadd.i32 q8, q0, q8
vadd.i32 q8, q1, q8
subs r3, #8
bne .loop1
vpadd.i32 d0, d16, d17
vpaddl.u32 d0, d0
vmov.32 r1, d0[0]
# InlineAsm End
str r1, [sp]
str r0, [sp, #4]
.loc 18 155 12 # ...
ldr r0, [sp]
.loc 18 155 5 is_stmt 0 # ...
add sp, #8
bx lr
Ltmp19:
Lfunc_end3:
.cfi_endproc

Getting Beaglebone PRUs to work using PASM

I have been trying to get the PRU to work in a way that makes sense to me and at this point I am completely clueless. I can get the examples to work, but anytime I make a change or try to write things from scratch I just beat my head against the wall. I just want to as a start access any of the USRLEDS and turn them off or on at some speed, or as first pass turn on a LED and leave it on. Here is a PASM code I got off the internet (Will post link when I find it):
// PRU program: configures GPIO1 through its memory-mapped registers, writes to
// GPIO_CLEARDATAOUT, notifies the host, and halts.
//
// Fixes over the original listing:
//  * the trailing remarks on the GPIO_SETDATAOUT #define and on the last SBBO
//    had no "//" and were therefore part of the statement;
//  * the host event is now raised before HALT (PRU0_ARM_INTERRUPT was defined
//    but never used).
.origin 0
.entrypoint START
#define PRU0_ARM_INTERRUPT 19
#define AM33XX
#define GPIO1 0x4804c000 // base address of the GPIO1 register block
#define GPIO_CLEARDATAOUT 0x190 // register offset: writing 1 to a bit clears that bit in GPIO_DATAOUT
#define GPIO_SETDATAOUT 0x194 // register offset: writing 1 to a bit sets that GPIO1 output bit
#define GPIO_OE 0x134 // register offset: output-enable register for the pins
START:
// Read-modify-write of the 32-bit word at (constant-table entry C4) + 4: clear bit 4.
// NOTE(review): on AM335x this is presumably SYSCFG.STANDBY_INIT in the PRU-ICSS
// CFG block, cleared so the PRU can reach peripherals such as GPIO1 over OCP.
// Confirm against the TRM.
lbco r0, c4, 4, 4 // r0 = 4 bytes loaded from [C4 + 4]
CLR r0, r0, 4 // clear bit 4 of the value just loaded
SBCO r0, C4, 4, 4 // store the modified value back to [C4 + 4]
//MOV r1, 10
MOV r2, 0x00000000 // value to write to GPIO_OE (all bits 0)
MOV r3, GPIO1 // r3 = GPIO1 base address
MOV r4, GPIO_OE // r4 = offset of GPIO_OE
MOV r5, GPIO_SETDATAOUT // r5 = offset of GPIO_SETDATAOUT (not used below; store through r3 + r5 to set pins)
MOV r6, GPIO_CLEARDATAOUT // r6 = offset of GPIO_CLEARDATAOUT
// Store 4 bytes from r2 to address r3 + r4, i.e. GPIO1 + GPIO_OE.
// NOTE(review): a 0 bit in GPIO_OE presumably configures that pin as an output. Confirm against the TRM.
SBBO r2, r3, r4, 4
MOV r1, 10 // r1 is not read again in this program
MOV r2, 0xFFFFFFFF // all 32 bits set
// Store 4 bytes from r2 to GPIO1 + GPIO_CLEARDATAOUT: every set bit clears the
// matching GPIO_DATAOUT bit, so this drives the GPIO1 outputs low. Use r5
// (GPIO_SETDATAOUT) instead of r6 to drive them high.
SBBO r2, r3, r6, 4
// Notify the host that the program is done; the host loader waits on
// PRU_EVTOUT_0 and would otherwise block.
// NOTE(review): writing PRU0_ARM_INTERRUPT+16 to r31.b0 is the event-generation
// idiom used by the AM33XX PASM examples. Confirm against the PRU reference guide.
MOV r31.b0, PRU0_ARM_INTERRUPT+16
HALT
I am also attaching the C code that I am using:
#include <stdio.h>
#include <pruss/prussdrv.h>
#include <pruss/pruss_intc_mapping.h>
#define PRU_NUM 0 //defining which PRU to use
/*
 * Host-side loader: initialises the PRU driver, opens PRU_EVTOUT_0, sets up the
 * interrupt controller mapping, loads ./ashwini_test.bin onto PRU PRU_NUM,
 * waits for the PRU to raise PRU_EVTOUT_0, then shuts the PRU down.
 *
 * Returns 0 on success and -1 if any driver call reports failure.
 *
 * Fix over the original: the result of prussdrv_exec_program() is now checked.
 * Previously a failed load (for example a missing .bin file) went unnoticed and
 * the program then blocked in prussdrv_pru_wait_event() waiting for an event
 * that could never arrive.
 */
int main() {
    int ret;
    tpruss_intc_initdata intc = PRUSS_INTC_INITDATA;

    /* Initialise the PRU driver. */
    ret = prussdrv_init();
    if (ret != 0) {
        printf("Error returned: %d\n",ret);
        printf("PRU unable to be initialized");
        return -1;
    }

    /* Open the host event this program will wait on. */
    ret = prussdrv_open(PRU_EVTOUT_0);
    if (ret != 0) {
        printf("Error returned for prussdrv_open(): %d\n",ret);
        printf("PRU can't open PRU_EVTOUT_0");
        return -1;
    }

    /* Map the PRU interrupt controller. */
    ret = prussdrv_pruintc_init(&intc);
    if (ret != 0) {
        printf("Error returned for prussdrv_pruintc_int\n");
        printf("PRU doesn't work");
        return -1;
    }

    /* Load and start the binary on the PRU; bail out if loading fails. */
    ret = prussdrv_exec_program(PRU_NUM, "./ashwini_test.bin");
    if (ret != 0) {
        printf("Error returned for prussdrv_exec_program(): %d\n", ret);
        prussdrv_pru_disable(PRU_NUM);
        prussdrv_exit();
        return -1;
    }

    /* Block until the PRU signals PRU_EVTOUT_0, then acknowledge the event. */
    prussdrv_pru_wait_event(PRU_EVTOUT_0);
    prussdrv_pru_clear_event(PRU_EVTOUT_0,PRU0_ARM_INTERRUPT);

    /* Disable the PRU and close memory mappings. */
    prussdrv_pru_disable(PRU_NUM);
    prussdrv_exit();
    return 0;
}
I have gone through the TRM and https://groups.google.com/forum/#!topic/beaglebone/98eF1wQE_QA, as well as elinux and derekmolloy, and I just feel like I am missing something very basic about how the addressing scheme works or how to think about these things. Thanks again for your help!
When you say that's your PASM code... do you mean it's some code you got from somewhere else that you're trying to use? Because the comments on most lines asking what they do makes it seem unlikely that it's actually your code...
Anyways, can't really answer unless you have a specific question, but there's plenty of info out there about how to use the GPIO subsystem on the BeagleBone's AM335x processor. I talked about it some in a post a while back here: https://graycat.io/tutorials/beaglebone-io-using-python-mmap/
I've also got a few documented PRU assembly examples here: https://github.com/alexanderhiam/PRU-stuffs

Moving through allocated bytes

I have following declaration data:
temp db 50 DUP(0)
How do I access each byte?
Let's say I do mov temp, 48 and then want to move 49 into the next byte of the allocated ones. I tried to
inc temp
mov temp, 49
but it just increased temp value to 49
E.g.
mov [temp + 1], 49
or, if you want to dynamically select the slot in temp to store a value in
mov [temp + ebx], 49
where ebx holds the index value (could be any register)

Resources