cudamemcpy error:"the launch timed out and was terminated" - memory

My code is a parallel implmentation that calculates the nth digit of pi. When I finish the kernel and try to copy the memory back to the host I get a "the launch timed out and was terminated" error.
I used this code for error checking for each cudamalloc, cudamemcpy, and kernal launch.
std::string error = cudaGetErrorString(cudaGetLastError());
printf("%s\n", error);
These calls were saying everything was fine until the first cudamemcpy call after returning from the kernel. the error happens in the line "cudaMemcpy(avhost, avdev, size, cudaMemcpyDeviceToHost);" in main. Any help is appreciated.
#include <stdlib.h>
#include <stdio.h>
#include <math.h>
#define mul_mod(a,b,m) fmod( (double) a * (double) b, m)
///////////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////////
/* return the inverse of x mod y */
__device__ int inv_mod(int x,int y) {
int q,u,v,a,c,t;
u=x;
v=y;
c=1;
a=0;
do {
q=v/u;
t=c;
c=a-q*c;
a=t;
t=u;
u=v-q*u;
v=t;
} while (u!=0);
a=a%y;
if (a<0) a=y+a;
return a;
}
///////////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////////
/* return the inverse of u mod v, if v is odd */
__device__ int inv_mod2(int u,int v) {
int u1,u3,v1,v3,t1,t3;
u1=1;
u3=u;
v1=v;
v3=v;
if ((u&1)!=0) {
t1=0;
t3=-v;
goto Y4;
} else {
t1=1;
t3=u;
}
do {
do {
if ((t1&1)==0) {
t1=t1>>1;
t3=t3>>1;
} else {
t1=(t1+v)>>1;
t3=t3>>1;
}
Y4:;
} while ((t3&1)==0);
if (t3>=0) {
u1=t1;
u3=t3;
} else {
v1=v-t1;
v3=-t3;
}
t1=u1-v1;
t3=u3-v3;
if (t1<0) {
t1=t1+v;
}
} while (t3 != 0);
return u1;
}
/* return (a^b) mod m */
__device__ int pow_mod(int a,int b,int m)
{
int r,aa;
r=1;
aa=a;
while (1) {
if (b&1) r=mul_mod(r,aa,m);
b=b>>1;
if (b == 0) break;
aa=mul_mod(aa,aa,m);
}
return r;
}
///////////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////////
/* return true if n is prime */
int is_prime(int n)
{
int r,i;
if ((n % 2) == 0) return 0;
r=(int)(sqrtf(n));
for(i=3;i<=r;i+=2) if ((n % i) == 0) return 0;
return 1;
}
///////////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////////
/* return the prime number immediatly after n */
int next_prime(int n)
{
do {
n++;
} while (!is_prime(n));
return n;
}
///////////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////////
#define DIVN(t,a,v,vinc,kq,kqinc) \
{ \
kq+=kqinc; \
if (kq >= a) { \
do { kq-=a; } while (kq>=a); \
if (kq == 0) { \
do { \
t=t/a; \
v+=vinc; \
} while ((t % a) == 0); \
} \
} \
}
///////////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////////
__global__ void digi_calc(int *s, int *av, int *primes, int N, int n, int nthreads){
int a,vmax,num,den,k,kq1,kq2,kq3,kq4,t,v,i,t1, h;
unsigned int tid = blockIdx.x*blockDim.x + threadIdx.x;
// GIANT LOOP
for (h = 0; h<1; h++){
if(tid > nthreads) continue;
a = primes[tid];
vmax=(int)(logf(3*N)/logf(a));
if (a==2) {
vmax=vmax+(N-n);
if (vmax<=0) continue;
}
av[tid]=1;
for(i=0;i<vmax;i++) av[tid]*= a;
s[tid]=0;
den=1;
kq1=0;
kq2=-1;
kq3=-3;
kq4=-2;
if (a==2) {
num=1;
v=-n;
} else {
num=pow_mod(2,n,av[tid]);
v=0;
}
for(k=1;k<=N;k++) {
t=2*k;
DIVN(t,a,v,-1,kq1,2);
num=mul_mod(num,t,av[tid]);
t=2*k-1;
DIVN(t,a,v,-1,kq2,2);
num=mul_mod(num,t,av[tid]);
t=3*(3*k-1);
DIVN(t,a,v,1,kq3,9);
den=mul_mod(den,t,av[tid]);
t=(3*k-2);
DIVN(t,a,v,1,kq4,3);
if (a!=2) t=t*2; else v++;
den=mul_mod(den,t,av[tid]);
if (v > 0) {
if (a!=2) t=inv_mod2(den,av[tid]);
else t=inv_mod(den,av[tid]);
t=mul_mod(t,num,av[tid]);
for(i=v;i<vmax;i++) t=mul_mod(t,a,av[tid]);
t1=(25*k-3);
t=mul_mod(t,t1,av[tid]);
s[tid]+=t;
if (s[tid]>=av[tid]) s-=av[tid];
}
}
t=pow_mod(5,n-1,av[tid]);
s[tid]=mul_mod(s[tid],t,av[tid]);
}
__syncthreads();
}
///////////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////////
int main(int argc,char *argv[])
{
int N,n,i,totalp, h;
double sum;
const char *error;
int *sdev, *avdev, *shost, *avhost, *adev, *ahost;
argc = 2;
argv[1] = "2";
if (argc<2 || (n=atoi(argv[1])) <= 0) {
printf("This program computes the n'th decimal digit of pi\n"
"usage: pi n , where n is the digit you want\n"
);
exit(1);
}
sum = 0;
N=(int)((n+20)*logf(10)/logf(13.5));
totalp=(N/logf(N))+10;
ahost = (int *)calloc(totalp, sizeof(int));
i = 0;
ahost[0]=2;
for(i=1; ahost[i-1]<=(3*N); ahost[i+1]=next_prime(ahost[i])){
i++;
}
// allocate host memory
size_t size = i*sizeof(int);
shost = (int *)malloc(size);
avhost = (int *)malloc(size);
//allocate memory on device
cudaMalloc((void **) &sdev, size);
cudaMalloc((void **) &avdev, size);
cudaMalloc((void **) &adev, size);
cudaMemcpy(adev, ahost, size, cudaMemcpyHostToDevice);
if (i >= 512){
h = 512;
}
else h = i;
dim3 dimGrid(((i+512)/512),1,1);
dim3 dimBlock(h,1,1);
// launch kernel
digi_calc <<<dimGrid, dimBlock >>> (sdev, avdev, adev, N, n, i);
//copy memory back to host
cudaMemcpy(avhost, avdev, size, cudaMemcpyDeviceToHost);
cudaMemcpy(shost, sdev, size, cudaMemcpyDeviceToHost);
// end malloc's, memcpy's, kernel calls
for(h = 0; h <=i; h++){
sum=fmod(sum+(double) shost[h]/ (double) avhost[h],1.0);
}
printf("Decimal digits of pi at position %d: %09d\n",n,(int)(sum*1e9));
//free memory
cudaFree(sdev);
cudaFree(avdev);
cudaFree(adev);
free(shost);
free(avhost);
free(ahost);
return 0;
}

This is exactly the same problem you asked about in this question. The kernel is getting terminated early by the driver because it is taking too long to finish. If you read the documentation for any of these runtime API functions you will see the following note:
Note:
Note that this function may also return error codes from previous,
asynchronous launches.
All that is happening is that the first API call after the kernel launch is returning the error incurred while the kernel was running - in this case the cudaMemcpy call. The way you can confirm this for yourself is to do something like this directly after the kernel launch:
// launch kernel
digi_calc <<<dimGrid, dimBlock >>> (sdev, avdev, adev, N, n, i);
std::string error = cudaGetErrorString(cudaPeekAtLastError());
printf("%s\n", error);
error = cudaGetErrorString(cudaThreadSynchronize());
printf("%s\n", error);
The cudaPeekAtLastError() call will show you if there are any errors in the kernel launch, and the error code returned by the cudaThreadSynchronize() call will show whether any errors were generated while the kernel was executing.
The solution is exactly as outlined in the previous question: probably the simplest way is redesign the code so it is "re-entrant" so you can split the work over several kernel launches, with each kernel launch safely under the display driver watchdog timer limit.

Cuda somehow buffers all the read/write operations on global memory. So you can batch the operations in some loop with some kernel, and it will take actually NO TIME. Then, when you call memcpy, all the buffered operations are done, and it can timeout. Method to go with, is to call cudaThreadSynchronize procedure between iterations.
So remember: if a kernel run takes only nanoseconds to calculate - it doesn't mean that it is so fast - some of the writes to the global memory, are done when memcpy or threadsynchronize is called.

Related

stack smashing in C code about making a histogram

I need to make a c program that will make a histogram of all the letters present in a phrase the user gives. When I run it, I does it but gives a "* stack smashing detected *: terminated". Where would this error be coming from? (for ease right now I set max to 3). In the future i'll have it find the max
Thank you
Andrew
#include <stdio.h>
#include <ctype.h>
#include <string.h>
static void ReadText(int histo[26],int max) {
char phrase[100];
int i;
char Letter;
char toArray;
// read in phrase
printf("Enter Phrase: "); // reads in phrase with spaces between words
scanf("%[^\n]",phrase);
// count the number of certain letters that occur
for(i = 0; i <= strlen(phrase);++i) {
Letter = phrase[i];
if(isalpha(Letter) != 0){
Letter = tolower(Letter);
toArray = Letter - 97;
histo[(int)toArray] = histo[(int)toArray] + 1;
}
}
}
static void DrawHist(int histo[26], int max){
int i;
int j;
int histo2[50];
for(i = 0; i <= 26; i++) {
histo2[i+i] = histo[i];
if(i < 25) {
histo2[i+i+1] = 0;
}
}
// (i = 1; i <= 50; i++) {
// printf("%d",histo2[i]);
//}
//printf("\n");
for(i=max;i>0;--i) {
for(j=0;j<=51;++j) {
if((j < 51) && (histo2[j] >= i)) {
printf("|");
}
else if((j < 51) && (histo2[j] < i)){
printf(" ");
}
else if(j == 51){
printf("\n");
}
}
}
printf("+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-\n");
printf("A B C D E F G H I J K L M N O P Q R S T U V W X Y Z\n");
}
int main() {
int histo[26] = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
int max = 3;
//int i;
ReadText(histo,max);
//for(i = 0; i<26;++i) {
// printf("%d",histo[i]);
//}
DrawHist(histo,max);
return 0;
}

Clang memory allocation

Could anyone please help me understand why Clang reallocates the same memory address for different variables while their lifetimes intersect?
I am using a sample program (below) to show the problem.
When I compile the program with clang -O0, variable j in function ok has the same memory address as variable solutions in function nqueens.
Function ok is called inside function nqueens, which means that the lifetime of the variables intersect; the same stack space cannot be used/reused for both functions.
Compiling the program with gcc or clang at -O1, however, they are assigned different memory addresses.
Any help is appreciated!
#include <stdlib.h>
#include <stdio.h>
#include <memory.h>
#include <alloca.h>
/* Checking information */
static int solutions[] = {
1,
0,
0,
2,
10, /* 5 */
4,
40,
92,
352,
724, /* 10 */
2680,
14200,
73712,
365596,
};
#define MAX_SOLUTIONS sizeof(solutions)/sizeof(int)
int total_count;
int sharedVar = 0;
int ok(int n, char *a)
{
int i, j;
char p, q;
printf("jjjjjjjjj: %d, %p\n", n,&j);
for (i = 0; i < n; i++) {
p = a[i];
for (j = i + 1; j < n; j++) {
q = a[j];
if (q == p || q == p - (j - i) || q == p + (j - i))
return 0;
}
}
return 1;
}
void nqueens (int n, int j, char *a, int *solutions)
{
int i,res;
sharedVar = sharedVar * j - n;
if (n == j) {
/* good solution, count it */
*solutions = 1;
return;
}
printf("solutions: %d, %p\n", j, &solutions);
*solutions = 0;
/* try each possible position for queen <j> */
for (i = 0; i < n; i++) {
a[j] = (char) i;
if (ok(j + 1, a)) {
nqueens(n, j + 1, a,&res);
*solutions += res;
}
}
}
int main()
{
int size = 3;
char *a;
// printf("total_count: %p\n", &total_count);
total_count=0;
a = (char *)alloca(size * sizeof(char));
printf("Computing N-Queens algorithm (n=%d) ", size);
sharedVar = -5;
nqueens(size, 0, a, &total_count);
printf("completed!\n");
printf("sharedVar: %d\n", sharedVar);
}

BinaryOperator doesn't work when comes to a=function(b,c)?

I want to identify the Expression like int a = function(b,c), so I wrote the code as followers:
void foo(int* a, int *b) {
int x;
int m;
int z;
int *p;
if (a[0] > 1) {
b[0] = 2;
z=10;
x = function( sizeof(char));
}
m = function( sizeof(char));
bar(x,m);
}
void bar(float x, float y);
int function(int size){
return size;
}
And than I used clang -Xclang -ast-dump -fsyntax-only cfunc_with_if.c to get the AST of the code:
From the result I found the AST Node type of int a = function(b,c) is BinaryOperator. In order to verify this, I use VisitStmt(Stmt *s) to print out all stmts' type.
bool VisitStmt(Stmt *s) {
if(isa<Stmt>(s)) {
Stmt *Statement = dyn_cast<Stmt>(s);
//Statement->dump();
std::string st(Statement->getStmtClassName());
st = st + "\n";
TheRewriter.InsertText(Statement->getLocStart(), st, true, true);
}
return true;
}
But the result is so weird. There is nothing printed out about the type of int a = function(b,c). and I'm so confused about the result. Is there some error in my code or something else?
There's no output at bar(x,m); either. Are there any errors when the tool compiles the code being analyzed? As written above, the code would fail to compile at x = function( sizeof(char)); since function has not been declared. Even when compilation has failed due to errors, the libtool tools can still run at least partially, with strange results.
Edit to add: what happens if you run the tool on this code?
void bar(float x, float y);
int function(int size);
void foo(int* a, int *b) {
int x;
int m;
int z;
int *p;
if (a[0] > 1) {
b[0] = 2;
z=10;
x = function( sizeof(char));
}
m = function( sizeof(char));
bar(x,m);
}
void bar(float x, float y);
int function(int size){
return size;
}

how to align in printf function

I want to make the printf function print from right to left because this program convert the value of number to binary and I want it to be printed in proper form for example if I convert 16 it is written like that 00001 but it must look like that 10000 so does anyone know how to do that thanks in advance
#include <stdio.h>
#include <stdlib.h>
int main()
{
int x,rem;
printf("please enter number: ");
scanf("%d",&x);
while (x !=0)
{
rem=x%2;
if (rem==0)
{
printf("0");
}
else
{
printf("1");
}
x = x/2;
rem = 0;
}
return 0;
}
Here it is:
void print_binary(int x)
{
int skip = 1;
unsigned int mask = 1 << 31;
while(mask > 0){
if(x & mask){
skip = 0;
printf("1");
}else{
if(!skip) printf("0");
}
mask >>= 1;
}
printf("\n");
}
This will print the binary number without trailing zeroes.
If you rather want the result to be stored in a string, you can use:
#include <string.h>
void int_to_binary(int x, char * buff) // buff size must be >= 32 !
{
buff[0] = '\0'; // ensure string ends with \0
unsigned int mask = 1 << 31;
for (; mask > 0; mask >>= 1)
{
strcat(buff, (x & mask) ? "1" : "0");
}
}
To check both codes, use:
int main(int argc, char* argv[])
{
int x;
printf("please enter number: ");
scanf("%d",&x);
char bin[32];
int_to_binary(x, bin);
printf("%s\n", bin);
print_binary(x);
}
What we do is using a mask, which in binary is one "1" beginning on the far left and moving one step right at each loop. The "&" is a bite-wise operator (I let you google it to know how it works). If you need more explanation, feel free to ask.
#include<stdio.h>
#include<stdlib.h>
int main()
{
int binary[20];
int q,i=0;
printf("Enter the decimal no\n");
scanf("%d",&q);
while(q > 0)
{
binary[i]=q%2;
i++;
q=q/2;
}
for(int j=i-1;j>=0;j--)
{
printf("%d",binary[j]);
}
return 0;
}

Problem forking processes and creating threads

My program is supposed to fork three processes. Each of these processes will create three threads and fork two additional processes. These two additional processes will create three threads.
Here is my code. I've tried to keep things simple with nested loops. I think at some point I might be forking more processes or creating more threads.
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
void *printme(void* Array){
int *Arr = (int *) Array;
int len = sizeof(Arr) / sizeof(int);
if (len == 1){
printf("I'm thread %d.%d",Arr[0],Arr[1]);
}
else if (len == 2){
printf("I'm thread %d.%d.%d",Arr[0],Arr[1],Arr[2]);
}
printf("\n");
pthread_exit(NULL);
}
int main(void){
int i, j, k, l;
int threadLevel1[2];
int threadLevel2[3];
printf("\n");
for (i = 1 ; i < 4 ; i++){ // Loop to fork the three main processes.
if (fork() != 0){
sleep(4);
}
else{
//The newly forked process will create three threads and fork two additional processes.
for (j = 1 ; j < 4 ; j++){
pthread_t t;
threadLevel1[0] = i;
threadLevel1[1] = j;
if (pthread_create(&t, NULL, printme, (void*) threadLevel1) != 0){
perror("pthread_create");
exit(1);
}
}
for (k = 1; k < 3 ; k++){
pid_t a = fork();
if (a != 0){
sleep(2);
}
else if (a == -1){
perror("fork"); /* display error message */
exit(0);
}
else{
for (l = 1 ; l < 4 ; l++){
pthread_t t;
threadLevel2[0] = i;
threadLevel2[1] = k;
threadLevel2[2] = l;
if (pthread_create(&t, NULL, printme, (void*) threadLevel2)!=0) {
perror("pthread_create");
exit(1);
}
}
}
}
}
}
return 0;
}
You have a problem in your code here:
void *printme(void* Array){
int *Arr = (int *) Array;
int len = sizeof(Arr) / sizeof(int);
The value len will always be the same no matter what is passed in to printme. That's because C passes arrays as pointers, not as objects with embedded lengths.

Resources