Runtime exception, store address not aligned on word boundary - alignment

.text
.globl main
main:
addi $v0, $0, 4
la $a0, msg_dimensions_matrix
syscall
li $v0, 5 # Scans N (size of matrix).
syscall
move $s7, $v0 # Stores given matrix size in $s7.
addi $v0, $0, 4
la $a0, msg_fill
syscall
addi $t0, $0, 0 # Initializes counter (stored in $t0).
move $a0, $t0 # Moves counter to $a0.
mul $a3, $s7, $s7
mul $a3, $a3, 4
jal input_loop_0 # Calls function input_loop to fill the matrix.
addi $v0, $0, 4
la $a0, msg_dimensions_subm
syscall
li $v0, 5 # Scans M (size of submatrix).
syscall
move $s6, $v0 # Stores submatrix size in $s6.
addi $v0, $0, 4
la $a0, msg_leftdist
syscall
li $v0, 5 # Scans left distance.
syscall
move $s4, $v0 # Stores left distance in $s4.
li $v0, 5 # Scans upper distance.
syscall
move $s5, $v0 # Stores upper distance in $s5.
move $a0, $s5 # Stores upper distance in $a0.
move $a1, $s4 # Stores left distance in $a1.
move $a2, $s6 # Stores submatrix size in $a2.
move $a3, $s7 # Stores matrix size in $a3.
jal submatrix_transpos
j matrix_copy_0
input_loop_0:
addi $v0, $0, 4
la $a0, msg_input
syscall
li $v0, 5
syscall
move $t1, $v0 # Stores user input in $t1.
la $t7, init_matrix
add $t7, $0, $a0
sw $t1, 0($t7) # Stores user input properly in matrix.
addi $a0, $a0, 4 # Increases row counter by 4 bytes.
bne $t7, $a3, input_loop_0 # If row counter is not equal to matrix size, jumps to loop.
jr $ra
submatrix_transpos:
la $t0, init_matrix
mul $t2, $a2, $a0 # M * i
add $t2, $t2, $a1 # (M * i) + j
add $t0, $t0, $t2 # A + ((M * i) + j)
la $t1, submatrix
#addi $t1, $0, $t0 # A + ((M * i) + j)
add $s3, $0, $t0
addi $t4, $0, 0
addi $t7, $0, 4 # $t7 = 4.
mul $s1, $a3, $t7 # Calculates how many bytes each row of the initial matrix has.
mul $t9, $a2, $t7
mul $t5, $s1, $a3
add $s6, $t0, $t5
add $s6, $s6, $t9 # Stores position of bottom right element of submatrix, to be used to break from loop when bounds are reached.
addi $s7, $0, 0 # Number of times loop has been performed set to 0.
submatrix_loop:
lw $t8, init_matrix
sw $t8, submatrix
addi $t1, $t1, 4 # submatrix[j + 1]
add $t0, $t0, $s1 # matrix [i + 1]
addi $t4, $t4, 1 # Increases counter of times i or j is increased, by 1.
beq $t0, $s6, return # If bottom right corner of submatrix is reached, return.
beq $t4, $a2, reset0 # If i or j reaches submatrix bounds, reset accordingly.
j submatrix_loop # Else, then continue.
reset0:
la $t0, init_matrix
add $t0, $0, $s3
la $t1, submatrix
add $t0, $0, $s3
return:
jr $ra
matrix_copy_0:
la $t0, init_matrix # Load address of initial matrix.
la $t1, final_matrix # Load address of final matrix.
addi $t2, $0, 0 # Initialize counter to 0.
addi $t3, $0, 0
add $t4, $s7, $s7 # Size NxN of matrix.
for0:
lw $t3, init_matrix
sw $t3, final_matrix
addi $t2, $t2, 4 # Increase counter by 4 bytes.
addi $t3, $t3, 1
beq $t4, $t3, matrix_copy_1
j for0
matrix_copy_1:
la $t0, final_matrix
mul $t2, $s6, $s5 # M * i
add $t2, $t2, $s4 # (M * i) + j
add $t0, $t0, $t2 # A + ((M * i) + j)
la $t1, submatrix
addi $t8, $0, 0
addi $t3, $0, 0
mul $t7, $s7, 4
for1:
lw $t4, submatrix
sw $t4, final_matrix
addi $t0, $t0, 4
addi $t1, $t1, 4
addi $t8, $0, 1
beq $s6, $t8, reset1
j for1
reset1:
la $t0, final_matrix
mul $t2, $s6, $s5 # M * i
add $t2, $t2, $s4 # (M * i) + j
add $t0, $t0, $t2 # A + ((M * i) + j)
add $t0, $t0, $t7
addi $t3, $t3, 1 # Times in this label increased by one.
beq $t3, $s6, print
j for1
print:
addi $v0, $0, 4
la $a0, msg_final
syscall
la $t0, final_matrix
addi $t1, $0, 0
addi $t4, $0, 0 # Output counter, change line every
addi $t5, $0, 5 # 5 printed integers.
for2:
lw $t3, final_matrix
li $v0, 1 # Print content.
add $a0, $t3, $0
syscall
addi $t0, $t0, 4
addi $t4, $t4, 1
beq $t1, $s7, exit
beq $t4, $t5, reset2
j for2
reset2:
addi $v0, $0, 4
la $a0, newline
syscall
addi $t4, $t4, 0
j for2
exit:
li $v0, 10
syscall
.data
.align 2
init_matrix: .space 400 # Allocates memory for a matrix of size 10*10 maximum, containing integers.
submatrix: .space 400
final_matrix: .space 400
msg_dimensions_matrix: .asciiz "Please define the dimensions of the matrix: \n"
msg_dimensions_subm: .asciiz "Please define the dimensions of the submatrix: \n"
msg_fill: .asciiz "Now, you have to fill the matrix. \n"
msg_input: .asciiz "Please enter an integer: \n"
msg_leftdist: .asciiz "Please define the left distance: \n"
msg_upperdist: .asciiz "Please define the upper distance: \n"
msg_final: .asciiz "The new matrix is: \n"
newline: .asciiz "\n"
I'm getting: line 50: Runtime exception at 0x004000c8: store address not aligned on word boundary 0x10010533
Can someone tell me why this is happening? Wasn't ".align 2" supposed to prevent this?

The problem is that $a0 is not getting set to 0 in the following code:
input_loop_0:
addi $v0, $0, 4
la $a0, msg_input
syscall
li $v0, 5
syscall
move $t1, $v0 # Stores user input in $t1.
la $t7, init_matrix
add $t7, $0, $a0 <============== $a0 contains the (unaligned) address of msg_input
sw $t1, 0($t7) # Stores user input properly in matrix.
addi $a0, $a0, 4 # Increases row counter by 4 bytes.
bne $t7, $a3, input_loop_0

Related

remove a node from a double linked list in mips

I have been asked for a job for university in which I have to work with lists of objects in categorized form using circular double linked lists. In one of the exercises it asks me to delete a selected category. If the list of objects of the selected category is empty then it should delete only the category and automatically select the next one as the category if it exists, otherwise it should nullify the necessary pointers. If, on the other hand, it is not empty, you must first proceed to delete all the objects, returning the corresponding memory and then proceed to delete the category with the same caveats that were indicated.
The problem is that when I want to add a new category it gives me a memory alignment error and I don't know how to fix it. I would appreciate if someone can help me
.data
slist: .word 0
cclist: .word 0
wclist: .word 0
schedv: .space 32
menu: .ascii "\nColecciones de objetos categorizados\n"
.ascii "====================================\n"
.ascii "1-Nueva categoria\n"
.ascii "2-Siguiente categoria\n"
.ascii "3-Categoria anterior\n"
.ascii "4-Listar categorias\n"
.ascii "5-Borrar categoria actual\n"
.ascii "6-Anexar objeto a la categoria actual\n"
.ascii "7-Listar objetos de la categoria\n"
.ascii "8-Borrar objeto de la categoria\n"
.ascii "0-Salir\n"
.asciiz "Ingrese la opcion deseada: "
error: .asciiz "\nError: "
return: .asciiz "\n"
flecha: .asciiz "> "
catName: .asciiz "\nIngrese el nombre de una categoria: "
selCat: .asciiz "\nSe ha seleccionado la categoria: "
idObj: .asciiz "\nIngrese el ID del objeto a eliminar: "
objName: .asciiz "\nIngrese el nombre de un objeto: "
success: .asciiz "\nLa operaciĆ³n se realizo con exito\n"
.text
main: la $t0, schedv # initialization scheduler vector
la $t1, newcategory
sw $t1, 0($t0)
la $t1, nextcategory
sw $t1, 4($t0)
la $t1, prevcategory
sw $t1, 8($t0)
la $t1, listcategories
sw $t1, 12($t0)
la $t1, delcategory
sw $t1, 16($t0)
la $t1, newobject
sw $t1, 20($t0)
la $t1, listobjects
sw $t1, 24($t0)
# la $t1, delobject
# sw $t1, 28($t0)
mainloop: li $v0, 4
la $a0, menu
syscall
li $v0, 5
syscall # Pedir un entero
move $t2, $v0
condiciones: blt $t2, 0, error101
bgt $t2, 8, error101
beq $t2, 0, end
beq $t2, 1, opc1
beq $t2, 2, opc2
beq $t2, 3, opc3
beq $t2, 4, opc4
beq $t2, 5, opc5
beq $t2, 6, opc6
beq $t2, 7, opc7
beq $t2, 8, opc8
j mainloop
opc1: jal newcategory
j mainloop
opc2: jal nextcategory
j mainloop
opc3: jal prevcategory
j mainloop
opc4: jal listcategories
j mainloop
opc5: jal delcategory
j mainloop
opc6: jal newobject
j mainloop
opc7: jal listobjects
j mainloop
opc8: #jal delobjet
#j mainloop
#-----------------------------------------------------------------------#
# ERRORES #
#-----------------------------------------------------------------------#
printerror:
move $t0, $a0
li $v0, 4
la $a0, error
syscall
li $v0, 1
la $a0, ($t0)
syscall
li $v0, 4
la $a0, return
syscall
j mainloop
error101: li $a0, 101
jal printerror
error201: li $a0, 201
jal printerror
error202: li $a0, 202
jal printerror
error301: li $a0, 301
jal printerror
error401: li $a0, 401
jal printerror
error501: li $a0, 501
jal printerror
error601: li $a0, 601
jal printerror
error602: li $a0, 602
jal printerror
error701: li $a0, 701
jal printerror
#-----------------------------------------------------------------------#
# CATEGORIAS #
#-----------------------------------------------------------------------#
newcategory:
addiu $sp, $sp, -4
sw $ra, 4($sp)
la $a0, catName # input category name
jal getblock
move $a2, $v0 # $a2 = *char to category name
la $a0, cclist # $a0 = list
li $a1, 0 # $a1 = NULL
jal addnode
lw $t0, wclist
bnez $t0, newcategory_end
sw $v0, wclist # update working list if was NULL
newcategory_end:
li $v0, 0 # return success
lw $ra, 4($sp)
addiu $sp, $sp, 4
jr $ra
nextcategory: lw $t0, wclist
beqz $t0, error201 # wclist == 0
lw $t1, 12($t0)
beq $t0, $t1, error202 # prevcategory == 0
sw $t1, wclist # update wclist
li $v0, 4
la $a0, selCat
syscall
lw $t1, 8($t0)
li $v0, 4
la $a0, ($t0)
syscall
li $v0, 4
la $a0, success
syscall
li $v0, 0
jr $ra
prevcategory: lw $t0, wclist
beqz $t0, error201
lw $t0, ($t0) # $t0 = prevcategory
beqz $t0, error202 # prevcategory != 0
sw $t0, wclist # update wclist
li $v0, 4
la $a0, selCat
syscall
lw $t0, 8($t0)
li $v0, 4
la $a0, ($t0)
syscall
li $v0, 4
la $a0, success
syscall
li $v0, 0
jr $ra
listcategories: lw $t0, cclist
beqz $t0, error301
lw $t1, wclist # $t1 = selected category in progress
li $t2, 0 # i = 0
lw $t4, cclist #(aux)
catLoop: bne $t0, $t4, elseCat # cclist != cclist(aux)?
addi $t2, $t2, 1 # if cclist is repeated: i++
elseCat: beq $t2, 2, listcategories_end
bne $t1, $t0 noeq # skip if wclist != cclist
li $v0, 4 # (for current category)
la $a0, flecha
syscall # ">"
noeq: lw $t3, 8($t0) # category name
li $v0, 4
la $a0, ($t3)
syscall # print the category
lw $t0, 12($t0) # next category
j catLoop
listcategories_end:
li $v0, 4
la $a0, success
syscall
li $v0, 0
jr $ra
delcategory:
lw $t0, cclist
beqz $t0, error401
addiu $sp, $sp, -4
sw $ra, 4($sp)
lw $a0, wclist
la $a1, slist
jal delnode
delcategory_end:
li $v0, 4
la $a0, success
syscall
li $v0, 0
lw $ra, 4($sp)
addiu $sp, $sp, 4
jr $ra
#-----------------------------------------------------------------------#
# OBJETOS #
#-----------------------------------------------------------------------#
newobject:
lw $t0, cclist
beqz $t0, error501
addiu $sp, $sp, -4
sw $ra, 4($sp)
la $a0, objName
jal getblock
move $a2, $v0
lw $t0, wclist
la $a0, 4($t0)
li $a1, 0
jal addnode
lw $t0, 0($v0)
lw $t1, 4($t0)
addi $t1, $t1, 1
sw $t1, 4($v0)
newobject_end:
li $v0, 0 # return success
lw $ra, 4($sp)
addiu $sp, $sp, 4
jr $ra
listobjects:
lw $t0, wclist # $t0 = selected category in progress
beqz $t0, error601 # If there are no lists, error code 601
lw $t0, 4($t0) # $t0 = list of objects of the current category
li $t2, 0 # i = 0
lw $t4, ($t0) # $t4 = first element of object list
objLoop: bne $t0, $t4, objElse
addi $t2, $t2, 1 # if cclist is repeated: i++
objElse: beq $t2, 2, listobjects_end
lw $t3, 8($t0) # object name
li $v0, 4
la $a0, ($t3)
syscall # print the object
lw $t4, ($t0) # $t4 = first element of object list
lw $t0, 12($t0) # next object
j objLoop
listobjects_end:
li $v0, 4
la $a0, success
syscall
li $v0, 0
jr $ra
#-----------------------------------------------------------------------#
# NODOS #
#-----------------------------------------------------------------------#
# a0: list address
# a1: NULL if category, node address if object
# v0: node address added
addnode:
addi $sp, $sp, -8
sw $ra, 8($sp)
sw $a0, 4($sp)
jal smalloc
sw $a1, 4($v0) # set node content
sw $a2, 8($v0)
lw $a0, 4($sp)
lw $t0, ($a0) # first node address
beqz $t0, addnode_empty_list
addnode_to_end:
lw $t1, ($t0) # last node address
# update prev and next pointers of new node
sw $t1, 0($v0)
sw $t0, 12($v0)
# update prev and first node to new node
sw $v0, 12($t1)
sw $v0, 0($t0)
j addnode_exit
addnode_empty_list:
sw $v0, ($a0)
sw $v0, 0($v0)
sw $v0, 12($v0)
addnode_exit:
lw $ra, 8($sp)
addi $sp, $sp, 8
jr $ra
# a0: node address to delete
# a1: list address where node is deleted
delnode:
addi $sp, $sp, -8
sw $ra, 8($sp)
sw $a0, 4($sp)
lw $a0, 8($a0) # get block address
jal sfree # free block
lw $a0, 4($sp) # restore argument a0
lw $t0, 12($a0) # get address to next node of a0 node
beq $a0, $t0, delnode_point_self
lw $t1, 0($a0) # get address to prev node
sw $t1, 0($t0)
sw $t0, 12($t1)
lw $t1, 0($a1) # get address to first node again
bne $a0, $t1, delnode_exit
sw $t0, ($a1) # list point to next node
j delnode_exit
delnode_point_self:
sw $zero, ($a1) # only one node
delnode_exit:
jal sfree
lw $ra, 8($sp)
addi $sp, $sp, 8
jr $ra
# a0: msg to ask
# v0: block address allocated with string
getblock: addi $sp, $sp, -4
sw $ra, 4($sp)
li $v0, 4
syscall
jal smalloc
move $a0, $v0
li $a1, 16
li $v0, 8
syscall
move $v0, $a0
lw $ra, 4($sp)
addi $sp, $sp, 4
jr $ra
#-----------------------------------------------------------------------#
# GESTION DE MEMORIA #
#-----------------------------------------------------------------------#
smalloc:
lw $t0, slist
beqz $t0, sbrk
move $v0, $t0
lw $t0, 12($t0)
sw $t0, slist
jr $ra # return
sbrk:
li $a0, 16 # node size fixed 4 words
li $v0, 9
syscall
jr $ra # return
sfree:
lw $t0, slist
sw $t0, 12($a0)
sw $a0, slist # $a0 node address in unused list
jr $ra # return
end:

MIPS program to convert data(integer) from decimal base to base 4

I need to implement a program that traverses the linked lists and converts the integer from decimal base to base 4 and prints it to the screen.
here is what I wrote so far:
.data
num1: .word -8 , num3
num2: .word 1988 , 0
num3: .word -9034 , num5
num4: .word -100 , num2
num5: .word 1972 , num4
.globl main
.text
main:
la $t1,num1
li $s0,0 # sum of the linked list
li $s1,0 # sum of divided by 4 and postivie in the list
li $t3,0 # temp remainder for the div by 4
li $t4,4
li $t5,0 # counter how many words we enterted the stack
sumloop:
beqz $t1,exit
lw $t0,0($t1)
add $s0,$s0,$t0 # adding to the sum the value in the current node
lw $t1,4($t1)
### need your help here :)
blez $t0,sumloop # if negative or zero dont add to s1
div $t0,$t4
mfhi $t3
bnez $t3,sumloop # if the value isnt divided by 4 without remainder jump to sumloop
add $s1,$s1,$t0
j sumloop
exit:
move $a0,$s0
li $v0,1
syscall
li $a0,'\n'
li $v0,11
syscall
move $a0,$s1
li $v0,1
syscall
need your help please
Here is the code:
.data
result: .space 4
num1: .word -8,num3
num2: .word 1988,0
num3: .word -9034,num5
num4: .word -100, num2
num5: .word 1972, num4
.text
main:
li $a0, 128 #numbers of bytes to allocated
li $v0, 9 #sys function allocate heap memory
syscall
la $t1, result
sw $v0, ($t1) #store the address of the allocated heap memory in result
la $t0, num1
while:
lw $a0, ($t0)
print_value:
lw $a0, ($t0)
li $v0, 1
syscall
lw $a0, ($t0)
jal base10to4
li $a0, '\n'
li $v0, 11
syscall
addi $t0, $t0, 4
lw $t0, ($t0)
beq $t0, 0, end_while
j while
end_while:
end_program:
li $v0, 10
syscall
base10to4:
move $s0,$a0
la $s7,result
li $a0, '|'
li $v0, 11
syscall
li $s1, 4
li $s2, 0 #index for the result array
li $s6, 0 #will keep track if number is negative of positive
divide_loop:
beq $s0, $zero, end_loop
div $s0, $s0, $s1
mfhi $s5
move $s3, $s2
mul $s3, $s3, $s1
lw $s4, ($s7)
add $s3, $s3, $s4
blt $s5, $zero, negative
j next
negative:
li $s6, 1
mul $s5, $s5, -1 # if negative then make positive
next:
sw $s5, ($s3)
addi $s2, $s2, 1 #increment index with 1
j divide_loop
end_loop:
print_result:
# $s2 will contain the range of result array
addi $s2, $s2, -1
beq $s6, 1, print_minus
j print_loop
print_minus:
li $a0, '-'
li $v0, 11
syscall
print_loop:
blt $s2, $zero end_print_loop
move $s3, $s2
mul $s3, $s3, 4
lw $s4, ($s7)
add $s3, $s3, $s4
lw $s5, ($s3)
addi $s2, $s2, -1 #increment with 4 for the next address memory location
move $a0, $s5
li $v0, 1
syscall
j print_loop
end_print_loop:
return_base10to4:
jr $ra

32-bit ADD on Aarch64 assembly

This is my first post here and I'm also kind of new to arm64 assembly, so I'm trying to do some arithmetic, but for example when I try to do an addition it seems to do it in 32-bit.
Here's my code:
.data
msg: .asciz "Value 1: "
msg2: .asciz "Value 2: "
result: .asciz "Result: %d\n"
fmt: .asciz "%d"
.balign 8
value1: .quad 0
.balign 8
value2: .quad 0
.balign 16
lr_value: .quad 0
.text
.global main
main:
adr x0, lr_value
str x30, [x0]
//Display message
adr x0,msg
bl printf
//Input first value
adr x0,fmt
adr x1,value1
bl scanf
//Display second message
adr x0,msg2
bl printf
//Input second value
adr x0,fmt
adr x1,value2
bl scanf
//Load first and second value
adr x1,value1
ldr x1,[x1]
adr x2,value2
ldr x2,[x2]
//Add both values on x1
add x1,x1,x2
//Show result
adr x0,result
bl printf
adr x0,lr_value
ldr x30,[x0]
mov w0,#0
ret
And here's the output:
Value 1: 2147483647
Value 2: 1
Result: -2147483648
What am I doing wrong? I've also tried multiplication and substraction
Edit: Solved it, turns out I had to use %ld instead of %d, thank you Nate Eldredge!

Mips using the stack

I am writing a MIPS program to evaluate 3ab - 2bc - 5a + 20ac - 16. My program blows up at
lw $t1, ($sp). # pop 20ac. I have put an "<--" where it blows up
Here is my whole program. Any help would be appreciated
li.s $f7, 16.0
li.s $f0, 0.0
li.s $f1, 0.0
li.s $f2, 0.0
la $a0, prompt # print prompt
li $v0, 4
syscall
li $v0, 6
syscall # get a
s.s $f0 , x # store in x
nop
li $v0, 4
syscall # print prompt again
li $v0, 6
syscall
s.s $f0, y # store in y
nop
li $v0, 4
syscall # print prompt again
li $v0, 6
syscall
s.s $f0, z # store in z
nop
# get x, y, and z
l.s $f1, x
l.s $f2, y
l.s $f3, z
li.s $f4, 3.0
mul.s $f5, $f1, $f2 # a * b
mul.s $f5, $f5, $f4 # 3ab
addi $sp, $sp, -4
mfc1 $t1, $f5 # $t1 = f1
sw $t1, ($sp) # push 3ab onto stack
nop
li.s $f4, -2.0
mul.s $f5, $f2, $f3 # b * c
mul.s $f5, $f5, $f3 # -2bc
addi $sp, $sp, -4
mfc1 $t1, $f5 # $t1 = f1
sw $t1, ($sp) # push -2bc onto stack
nop
li.s $f4, -5.0
mul.s $f5, $f4, $f1 # 5 * a
addi $sp, $sp, -4
mfc1 $t1, $f5 # $t1 = f1
sw $t1, ($sp) # push -5a onto stack
nop
li.s $f4, 20.0
mul.s $f5, $f1, $f3 # a * c
mul.s $f5, $f5, $f4 # 20ac
addi $sp, $sp, -4
mfc1 $t1, $f5 # $t1 = f1
sw $t1, ($sp) # push 20ac onto stack ( top of stack )
nop
li.s $f4, -16.0
lw $t1, (sp) # pop 20ac<---- THIS WHERE IT BLOWS UP
nop
mtc1 $t1, $f5 # $f5 = $t1
addu $sp, $sp, 4
addu $f4, $f5, $f4 # 20ac - 16
lw $t1, (sp) # pop -5a
nop
mtc1 $t1, $f5 # $f5 = $t1
addu $sp, $sp, 4
addu $f4, $f5, $f4 # - 5a + 20ac - 16
lw $t1, (sp) # pop -2bc
nop
mtc1 $t1, $f5 # $f5 = $t1
addu $sp, $sp, 4
addu $f4, $f5, $f4 # - 2bc - 5a + 20ac - 16
lw $t1, (sp) # pop 3ab
nop
mtc1 $t1, $f5 # $f5 = $t1
addu $sp, $sp, 4
addu $f4, $f5, $f4 # 3ab - 2bc - 5a + 20ac - 16
li $v0, 10
syscall # exit
You're missing a dollar sign on the sp.

How CUDA constant memory allocation works?

I'd like to get some insight about how constant memory is allocated (using CUDA 4.2). I know that the total available constant memory is 64KB. But when is this memory actually allocated on the device? Is this limit apply to each kernel, cuda context or for the whole application?
Let's say there are several kernels in a .cu file, each using less than 64K constant memory. But the total constant memory usage is more than 64K. Is it possible to call these kernels sequentially? What happens if they are called concurrently using different streams?
What happens if there is a large CUDA dynamic library with lots of kernels each using different amounts of constant memory?
What happens if there are two applications each requiring more than half of the available constant memory? The first application runs fine, but when will the second app fail? At app start, at cudaMemcpyToSymbol() calls or at kernel execution?
Parallel Thread Execution ISA Version 3.1 section 5.1.3 discusses constant banks.
Constant memory is restricted in size, currently limited to 64KB which
can be used to hold statically-sized constant variables. There is an
additional 640KB of constant memory, organized as ten independent 64KB
regions. The driver may allocate and initialize constant buffers in
these regions and pass pointers to the buffers as kernel function
parameters. Since the ten regions are not contiguous, the driver
must ensure that constant buffers are allocated so that each buffer
fits entirely within a 64KB region and does not span a region
boundary.
A simple program can be used to illustrate the use of constant memory.
__constant__ int kd_p1;
__constant__ short kd_p2;
__constant__ char kd_p3;
__constant__ double kd_p4;
__constant__ float kd_floats[8];
__global__ void parameters(int p1, short p2, char p3, double p4, int* pp1, short* pp2, char* pp3, double* pp4)
{
*pp1 = p1;
*pp2 = p2;
*pp3 = p3;
*pp4 = p4;
return;
}
__global__ void constants(int* pp1, short* pp2, char* pp3, double* pp4)
{
*pp1 = kd_p1;
*pp2 = kd_p2;
*pp3 = kd_p3;
*pp4 = kd_p4;
return;
}
Compile this for compute_30, sm_30 and execute cuobjdump -sass <executable or obj> to disassemble you should see
Fatbin elf code:
================
arch = sm_30
code version = [1,6]
producer = cuda
host = windows
compile_size = 32bit
identifier = c:/dev/constant_banks/kernel.cu
code for sm_30
Function : _Z10parametersiscdPiPsPcPd
/*0008*/ /*0x10005de428004001*/ MOV R1, c [0x0] [0x44]; // stack pointer
/*0010*/ /*0x40001de428004005*/ MOV R0, c [0x0] [0x150]; // pp1
/*0018*/ /*0x50009de428004005*/ MOV R2, c [0x0] [0x154]; // pp2
/*0020*/ /*0x0001dde428004005*/ MOV R7, c [0x0] [0x140]; // p1
/*0028*/ /*0x13f0dc4614000005*/ LDC.U16 R3, c [0x0] [0x144]; // p2
/*0030*/ /*0x60011de428004005*/ MOV R4, c [0x0] [0x158]; // pp3
/*0038*/ /*0x70019de428004005*/ MOV R6, c [0x0] [0x15c]; // pp4
/*0048*/ /*0x20021de428004005*/ MOV R8, c [0x0] [0x148]; // p4
/*0050*/ /*0x30025de428004005*/ MOV R9, c [0x0] [0x14c]; // p4
/*0058*/ /*0x1bf15c0614000005*/ LDC.U8 R5, c [0x0] [0x146]; // p3
/*0060*/ /*0x0001dc8590000000*/ ST [R0], R7; // *pp1 = p1
/*0068*/ /*0x0020dc4590000000*/ ST.U16 [R2], R3; // *pp2 = p2
/*0070*/ /*0x00415c0590000000*/ ST.U8 [R4], R5; // *pp3 = p3
/*0078*/ /*0x00621ca590000000*/ ST.64 [R6], R8; // *pp4 = p4
/*0088*/ /*0x00001de780000000*/ EXIT;
/*0090*/ /*0xe0001de74003ffff*/ BRA 0x90;
/*0098*/ /*0x00001de440000000*/ NOP CC.T;
/*00a0*/ /*0x00001de440000000*/ NOP CC.T;
/*00a8*/ /*0x00001de440000000*/ NOP CC.T;
/*00b0*/ /*0x00001de440000000*/ NOP CC.T;
/*00b8*/ /*0x00001de440000000*/ NOP CC.T;
...........................................
Function : _Z9constantsPiPsPcPd
/*0008*/ /*0x10005de428004001*/ MOV R1, c [0x0] [0x44]; // stack pointer
/*0010*/ /*0x00001de428004005*/ MOV R0, c [0x0] [0x140]; // p1
/*0018*/ /*0x10009de428004005*/ MOV R2, c [0x0] [0x144]; // p2
/*0020*/ /*0x0001dde428004c00*/ MOV R7, c [0x3] [0x0]; // kd_p1
/*0028*/ /*0x13f0dc4614000c00*/ LDC.U16 R3, c [0x3] [0x4]; // kd_p2
/*0030*/ /*0x20011de428004005*/ MOV R4, c [0x0] [0x148]; // p3
/*0038*/ /*0x30019de428004005*/ MOV R6, c [0x0] [0x14c]; // p4
/*0048*/ /*0x20021de428004c00*/ MOV R8, c [0x3] [0x8]; // kd_p4
/*0050*/ /*0x30025de428004c00*/ MOV R9, c [0x3] [0xc]; // kd_p4
/*0058*/ /*0x1bf15c0614000c00*/ LDC.U8 R5, c [0x3] [0x6]; // kd_p3
/*0060*/ /*0x0001dc8590000000*/ ST [R0], R7;
/*0068*/ /*0x0020dc4590000000*/ ST.U16 [R2], R3;
/*0070*/ /*0x00415c0590000000*/ ST.U8 [R4], R5;
/*0078*/ /*0x00621ca590000000*/ ST.64 [R6], R8;
/*0088*/ /*0x00001de780000000*/ EXIT;
/*0090*/ /*0xe0001de74003ffff*/ BRA 0x90;
/*0098*/ /*0x00001de440000000*/ NOP CC.T;
/*00a0*/ /*0x00001de440000000*/ NOP CC.T;
/*00a8*/ /*0x00001de440000000*/ NOP CC.T;
/*00b0*/ /*0x00001de440000000*/ NOP CC.T;
/*00b8*/ /*0x00001de440000000*/ NOP CC.T;
.....................................
I annotated to the right of the SASS.
On sm30 you can see that parameters are passed in constant bank 0 starting at offset 0x140.
User defined __constant__ variables are defined in constant bank 3.
If you execute cuobjdump --dump-elf <executable or obj> you can find other interesting constant information.
32bit elf: abi=6, sm=30, flags = 0x1e011e
Sections:
Index Offset Size ES Align Type Flags Link Info Name
1 34 142 0 1 STRTAB 0 0 0 .shstrtab
2 176 19b 0 1 STRTAB 0 0 0 .strtab
3 314 d0 10 4 SYMTAB 0 2 a .symtab
4 3e4 50 0 4 CUDA_INFO 0 3 b .nv.info._Z9constantsPiPsPcPd
5 434 30 0 4 CUDA_INFO 0 3 0 .nv.info
6 464 90 0 4 CUDA_INFO 0 3 a .nv.info._Z10parametersiscdPiPsPcPd
7 4f4 160 0 4 PROGBITS 2 0 a .nv.constant0._Z10parametersiscdPiPsPcPd
8 654 150 0 4 PROGBITS 2 0 b .nv.constant0._Z9constantsPiPsPcPd
9 7a8 30 0 8 PROGBITS 2 0 0 .nv.constant3
a 7d8 c0 0 4 PROGBITS 6 3 a00000b .text._Z10parametersiscdPiPsPcPd
b 898 c0 0 4 PROGBITS 6 3 a00000c .text._Z9constantsPiPsPcPd
.section .strtab
.section .shstrtab
.section .symtab
index value size info other shndx name
0 0 0 0 0 0 (null)
1 0 0 3 0 a .text._Z10parametersiscdPiPsPcPd
2 0 0 3 0 7 .nv.constant0._Z10parametersiscdPiPsPcPd
3 0 0 3 0 b .text._Z9constantsPiPsPcPd
4 0 0 3 0 8 .nv.constant0._Z9constantsPiPsPcPd
5 0 0 3 0 9 .nv.constant3
6 0 4 1 0 9 kd_p1
7 4 2 1 0 9 kd_p2
8 6 1 1 0 9 kd_p3
9 8 8 1 0 9 kd_p4
10 16 32 1 0 9 kd_floats
11 0 192 12 10 a _Z10parametersiscdPiPsPcPd
12 0 192 12 10 b _Z9constantsPiPsPcPd
The kernel parameter constant bank is versioned per launch so that concurrent kernels can be executed. The compiler and user constants are per CUmodule. It is the responsibility of the developer to manage coherency of this data. For example, the developer has to ensure that a cudaMemcpyToSymbol is update in a safe manner.

Resources