Memory Leak in Binary Tree Initialize Function - memory
I am attempting to make and evaluate a binary expression tree based on a postfix user input string in C. My binary tree initialization function is causing memory leaks, however. To summarize my algorithm, the user enters a postfix string of input which is parsed through by a function and assembled into the tree. Here's my full code:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#define TRUE 1
#define FALSE 0
// Define binary expression tree data structure
typedef struct binExpTree {
char *val;
struct binExpTree *left;
struct binExpTree *right;
} expTree;
// Define expression tree stack data structure
typedef struct expTreeStack {
int height;
int used;
expTree **expTreeDarr;
} treeStack;
// Function prototypes
void initStack(treeStack *stack);
expTree * getTopStack(treeStack *stack);
int isEmptyStack(treeStack *stack);
void pushStack(treeStack *stack, expTree *treeNode);
expTree * popStack(treeStack *stack);
void clearStack(treeStack *stack);
expTree * initTree(char *val);
void printCommands();
expTree * parseExpression(char *expString);
void clearTree(expTree *rootNode);
void printInfix(expTree *rootNode);
void printPrefix(expTree *rootNode);
int evalExpression(expTree *rootNode);
/* File contains all functions necessary for stack operations */
// Initialize empty binary tree stack of size 4
void initStack(treeStack *stack) {
stack->height = 4;
stack->used = 0;
stack->expTreeDarr = (expTree **)malloc(sizeof(expTree *) * stack->height);
}
// Return the tree node from the stack's top
expTree * getTopStack(treeStack *stack) {
if (stack->used > 0) {
return stack->expTreeDarr[stack->used - 1];
}
else {
return NULL;
}
}
// Discern whether tree stack is empty
int isEmptyStack(treeStack *stack) {
if (stack->used == 0) {
return TRUE;
}
else {
return FALSE;
}
}
// Push tree node pointer onto stack
void pushStack(treeStack *stack, expTree *treeNode) {
if (stack->used == stack->height) {
expTree **expTreeTmp = stack->expTreeDarr;
stack->height += 4;
stack->expTreeDarr = (expTree **)malloc(sizeof(expTree *) * stack->height);
for (int i = 0; i < stack->used; i++) {
stack->expTreeDarr[i] = expTreeTmp[i];
//free(expTreeTmp[i]);
}
free(expTreeTmp);
}
stack->expTreeDarr[stack->used] = treeNode;
stack->used = stack->used + 1;
}
// Pop tree node pointer from the stack
expTree * popStack(treeStack *stack) {
expTree *stackTmp = getTopStack(stack);
expTree *newNode = (expTree *)malloc(sizeof(expTree));
*newNode = *stackTmp;
stack->used -= 1;
return newNode;
}
// Empty stack of all data (make sure this works)
void clearStack(treeStack *stack) {
for (int i = 0; i < stack->used; i++) {
clearTree(stack->expTreeDarr[i]);
}
free(stack->expTreeDarr);
stack->used = 0;
stack->height = 0;
}
/* File contains all functions necessary for binary tree operations */
// Initialize binary expression tree with specified operator/operand
expTree * initTree(char *val) {
expTree *newTree = (expTree *)malloc(sizeof(expTree));
newTree->val = (char *)malloc(strlen(val) + 1);
strcpy(newTree->val, val);
newTree->left = NULL;
newTree->right = NULL;
return newTree;
}
// Print commands available to the user
void printCommands() {
printf("The commands for this program are:\n\n");
printf("q - to quit the program\n");
printf("? - to list the accepted commands\n");
printf("or any postfix mathematical expression using the operators of *, /, +, -\n");
}
// Return size of binary expression tree
int sizeTree(expTree *treeNode) {
if (treeNode == NULL) {
return 0;
}
else {
return 1 + sizeTree(treeNode->left) + sizeTree(treeNode->right);
}
}
// Construct a postfix binary expression tree from expression string
expTree * parseExpression(char *expString) {
char *expStringCopy = (char *)malloc(strlen(expString) + 1);
expTree *treeNode;
treeStack expStack;
initStack(&expStack);
strcpy(expStringCopy, expString);
char *expStringTok = strtok(expStringCopy, " ");
while (expStringTok != NULL) {
if (*expStringTok == '+' || *expStringTok == '-' ||
*expStringTok == '*' || *expStringTok == '/') {
if (expStack.used < 2) {
return NULL;
}
treeNode = initTree(expStringTok);
treeNode->right = popStack(&expStack);
treeNode->left = popStack(&expStack);
pushStack(&expStack, treeNode);
}
else {
treeNode = initTree(expStringTok);
pushStack(&expStack, treeNode);
}
expStringTok = strtok(NULL, " ");
}
if (expStack.used > 1 || (*(treeNode->val) != '+' && *(treeNode->val) != '-' &&
*(treeNode->val) != '*' && *(treeNode->val) != '/')) {
return NULL;
}
free(expStringCopy);
treeNode = popStack(&expStack);
clearStack(&expStack);
return treeNode;
}
// Clear binary expression tree
void clearTree(expTree *rootNode) {
if (rootNode == NULL) {
return;
}
else {
clearTree(rootNode->left);
clearTree(rootNode->right);
free(rootNode->val);
free(rootNode);
}
}
// Print infix notation of expression
void printInfix(expTree *rootNode) {
if (rootNode == NULL) {
return;
}
else {
if (*(rootNode->val) == '+' || *(rootNode->val) == '-' ||
*(rootNode->val) == '*' || *(rootNode->val) == '/') {
printf("( ");
}
printInfix(rootNode->left);
printf(" %s ", rootNode->val);
printInfix(rootNode->right);
if (*(rootNode->val) == '+' || *(rootNode->val) == '-' ||
*(rootNode->val) == '*' || *(rootNode->val) == '/') {
printf(" )");
}
}
}
// Print prefix notation of expression
void printPrefix(expTree *rootNode) {
if (rootNode == NULL) {
return;
}
else {
printf(" %s ", rootNode->val);
printPrefix(rootNode->left);
printPrefix(rootNode->right);
}
}
// Evaluate the expression tree
int evalExpression(expTree *rootNode) {
char op;
if (*(rootNode->val) == '+') {
return evalExpression(rootNode->left) + evalExpression(rootNode->right);
}
else if (*(rootNode->val) == '-') {
return evalExpression(rootNode->left) - evalExpression(rootNode->right);
}
else if (*(rootNode->val) == '*') {
return evalExpression(rootNode->left) * evalExpression(rootNode->right);
}
else if (*(rootNode->val) == '/') {
return evalExpression(rootNode->left) / evalExpression(rootNode->right);
}
else {
return atoi(rootNode->val);
}
}
int main(int argc, char const *argv[])
{
char input[300];
expTree *expPostfix;
/* set up an infinite loop */
while (1)
{
fgets(input,300,stdin);
/* remove the newline character from the input */
int i = 0;
while (input[i] != '\n' && input[i] != '\0') {
i++;
}
input[i] = '\0';
/* check if user enter q or Q to quit program */
if ( (strcmp (input, "q") == 0) || (strcmp (input, "Q") == 0) )
break;
/* check if user enter ? to see command list */
else if ( strcmp (input, "?") == 0)
printCommands();
/* user enters an expression */
else {
// Parse the expression into a binary expression tree
printf("%s\n", input);
expPostfix = parseExpression(input);
// Discern whether expression is valid
if (expPostfix == NULL) {
printf("Invalid expression. Enter a valid postfix expression \n");
continue;
}
// Print the expression in infix notation
printf("Infix notation: ");
printInfix(expPostfix);
printf("\n");
// Print the expression in prefix notation
printf("Prefix notation: ");
printPrefix(expPostfix);
printf("\n");
// Print the expression in postfix notation
printf("Postfix notation: ");
printf("%s\n", input);
// Evaluate expression and print result
printf("Expression result: %d \n\n", evalExpression(expPostfix));
clearTree(expPostfix);
}
}
printf("\nGoodbye\n");
return 0;
}
Upon running with Valgrind and an input of "1 1 -", this is the output:
==35604==
==35604== HEAP SUMMARY:
==35604== in use at exit: 72 bytes in 3 blocks
==35604== total heap usage: 13 allocs, 10 frees, 2,236 bytes allocated
==35604==
==35604== 24 bytes in 1 blocks are definitely lost in loss record 1 of 2
==35604== at 0x483B7F3: malloc (in /usr/lib/x86_64-linux-gnu/valgrind/vgpreload_memcheck-amd64-linux.so)
==35604== by 0x10952C: initTree (proj4base_38.c:143)
==35604== by 0x1096CC: parseExpression (proj4base_38.c:194)
==35604== by 0x109B8A: main (proj4base_38.c:323)
==35604==
==35604== 48 bytes in 2 blocks are definitely lost in loss record 2 of 2
==35604== at 0x483B7F3: malloc (in /usr/lib/x86_64-linux-gnu/valgrind/vgpreload_memcheck-amd64-linux.so)
==35604== by 0x10952C: initTree (proj4base_38.c:143)
==35604== by 0x109719: parseExpression (proj4base_38.c:201)
==35604== by 0x109B8A: main (proj4base_38.c:323)
==35604==
==35604== LEAK SUMMARY:
==35604== definitely lost: 72 bytes in 3 blocks
==35604== indirectly lost: 0 bytes in 0 blocks
==35604== possibly lost: 0 bytes in 0 blocks
==35604== still reachable: 0 bytes in 0 blocks
==35604== suppressed: 0 bytes in 0 blocks
==35604==
==35604== For lists of detected and suppressed errors, rerun with: -s
==35604== ERROR SUMMARY: 2 errors from 2 contexts (suppressed: 0 from 0)
So it seems the culprit is my initTree() function. However, I just cannot wrap my head around why this memory is being lost. I hope this isn't too much code. This is an edit from previously, where someone informed me there was not enough information to go on.
The leak is caused by popStack, because the target of stackTmp gets leaked when the function exits:
expTree * popStack(treeStack *stack) {
expTree *stackTmp = getTopStack(stack);
expTree *newNode = (expTree *)malloc(sizeof(expTree));
*newNode = *stackTmp;
stack->used -= 1;
return newNode;
}
Given that the stack seemed to be the exclusive owner of the tree, and it no longer has a pointer to it, popStack can avoid the leak by simply not making a copy and returning the original:
expTree * popStack(treeStack *stack) {
expTree *topNode = getTopStack(stack);
stack->used -= 1;
return topNode;
}
Related
Run-Time Check Failure #2 - Stack around the variable 'maxlong' was corrupted
I wrote a code to print the length of the longest line from my input and print out as much as possible of the longest line(There is a maximum length of what I can output the longest line). But I got this error. I tried everything I could yet still no clue. Is there anyone who might help? #include<stdio.h> #define MAXLINE 10 int getline(char line[], int len) { int i,c,max; i = 0; max = 0; while ((c = getchar()) != EOF&&c!='\n') { if (i < len) { line[i] = c; } i += 1; } if (i < len) { if (c == '\n') { line[i] = c; i += 1; } line[i] = '\0'; } else if (i >= len) { line[len] = '\0'; } return i; } void copy(char from[], char to[]) { int i = 0; while (from[i] != '\0') { to[i] = from[i]; i += 1; } to[i] = '\0'; } main() { char longline[MAXLINE]; char longestline[MAXLINE]; char maxlong[MAXLINE]; int length; int max = 0; int maxline = 0; while ((length = getline(longline, MAXLINE)) > 0) { if (length > max && length < MAXLINE) { copy(longline, longestline); max = length; } else if (length > MAXLINE) { if (length > maxline) { maxline = length; copy(longline, maxlong); } } } if (maxline == 0) { printf("%s", longestline); } else { printf("%s\n%d\n", maxlong, maxline); } }
Round Robin Algorithm Using Circular Linked List
Use a circular singly linked list to implement Round Robin process scheduling algorithm in which each process is provided a fixed time (quantum) to execute and is pre-empted after that time period to allow the other process to execute. Assume a set of ‘n’ processes are ready for execution. Read the time quantum and for each of the processes, read the total execution time. Name the processes as ‘A’, ‘B’ and so on in sequence. Each node should contain the name of the process, its total execution time and the remaining execution time. If a process completes its execution, remove it from the list after displaying its name and the completion time. Input format: First line contains the value of ‘n’, the number of processes Second line contains the time quantum The remaining lines contain the total execution time of the processes in order. 5 2 6 3 7 5 1 Output: E 9 B 12 A 18 D 21 C 22
#include <iostream> using namespace std; class node { public: char name; int tm; int rt; node *next; }; class rr { public: node * Head = NULL; int j = 65; void insert (int n) { node *nn = new node; nn->name = j++; nn->tm = n; nn->rt = nn->tm; if (Head == NULL) { Head = nn; Head->next = Head; } else { node *temp = Head; while (temp->next != Head) temp = temp->next; nn->next = temp->next; temp->next = nn; } } void quantum (int t) { node *temp = Head; int c = 0, i = 0; while (Head != NULL) { { temp->rt = temp->rt - t; c = c + t; if (temp->rt <= 0) { c = c + temp->rt; cout << temp->name; cout << c << endl; del (temp->name); if (temp->next == temp) { break; } } temp = temp->next; } } } void del (char x) { node *p = NULL; node *temp = Head; if (Head->name == x) { while (temp->next != Head) temp = temp->next; p = Head; temp->next = Head->next; Head = Head->next; delete p; } else { while (temp->name != x) { p = temp; temp = temp->next; } p->next = temp->next; delete temp; } } }; int main () { rr robin; int i, n, x, y, t; cin >> y; cin >> t; for (i = 0; i < y; i++) { cin >> n; robin.insert (n); } robin.quantum (t); return 0; }
how to align in printf function
I want to make the printf function print from right to left because this program convert the value of number to binary and I want it to be printed in proper form for example if I convert 16 it is written like that 00001 but it must look like that 10000 so does anyone know how to do that thanks in advance #include <stdio.h> #include <stdlib.h> int main() { int x,rem; printf("please enter number: "); scanf("%d",&x); while (x !=0) { rem=x%2; if (rem==0) { printf("0"); } else { printf("1"); } x = x/2; rem = 0; } return 0; }
Here it is: void print_binary(int x) { int skip = 1; unsigned int mask = 1 << 31; while(mask > 0){ if(x & mask){ skip = 0; printf("1"); }else{ if(!skip) printf("0"); } mask >>= 1; } printf("\n"); } This will print the binary number without trailing zeroes. If you rather want the result to be stored in a string, you can use: #include <string.h> void int_to_binary(int x, char * buff) // buff size must be >= 32 ! { buff[0] = '\0'; // ensure string ends with \0 unsigned int mask = 1 << 31; for (; mask > 0; mask >>= 1) { strcat(buff, (x & mask) ? "1" : "0"); } } To check both codes, use: int main(int argc, char* argv[]) { int x; printf("please enter number: "); scanf("%d",&x); char bin[32]; int_to_binary(x, bin); printf("%s\n", bin); print_binary(x); } What we do is using a mask, which in binary is one "1" beginning on the far left and moving one step right at each loop. The "&" is a bite-wise operator (I let you google it to know how it works). If you need more explanation, feel free to ask.
#include<stdio.h> #include<stdlib.h> int main() { int binary[20]; int q,i=0; printf("Enter the decimal no\n"); scanf("%d",&q); while(q > 0) { binary[i]=q%2; i++; q=q/2; } for(int j=i-1;j>=0;j--) { printf("%d",binary[j]); } return 0; }
Rot13 implementation: error in translate_string function
I wrote a rot13.c program but I can tell something in my loop inside rot13_translate_string is causing the program to just print out blank lines. Any thoughts? Thank you! #include <stdio.h> #include <stdlib.h> #include <string.h> char rot13_translate_character(char c) { if( 'A' <= c && c <= 'M' ) { return c + 13; } else if( 'N' <= c && c <= 'Z' ) { return c - 13; } else if( 'a' <= c && c <= 'm' ) { return c + 13; } else if( 'n' <= c && c <= 'z' ) { return c - 13; } else { return c; } } char *rot13_translate_string(const char *str) { int len = strlen(str); char *translation = calloc(len, sizeof(char)); int i; do //****HERE IN THIS SECTION { /* Translate each character, starting from the end of the string. */ translation[len] = rot13_translate_character(str[len]); len--; } while( len < 0 ); //< return translation; } And here is the main (part of the same file) - is the condition for my for i = 1 ok? int main(int argc, char **argv) { if( argc < 2) { fprintf(stderr, "Usage: %s word [word ...]\n", argv[0]); return 1; } /* Translate each of the arguments */ int i; for( i = 1; i < argc; i++) //*****IS this right? { char *translation = rot13_translate_string( argv[i] ); fprintf(stdout, "%s\n", translation); } return 0; }
As just it was pointed out by Janis is the control on the loop do ... while. It should be while( len >= 0 ); A "while" loop runs while the control expression is true (and terminates once the expression becomes false). You define the variable len just before the loop and it cannot be <0. So you never really enter in the loop. You obtain a line for each input word because of fprintf(stdout, "%s\n", translation); line, where you print for each (empty) word a line (\n). In other languages, for example in Pascal, there is "repeat until" loop construction, which continues to run until the control expression is true, and only after that it changes it terminates. In that case you could use a condition with <0. In C to follow the same logic you can use while loop and negate the condition. In your case } while (! (len < 0) );
cudamemcpy error:"the launch timed out and was terminated"
My code is a parallel implmentation that calculates the nth digit of pi. When I finish the kernel and try to copy the memory back to the host I get a "the launch timed out and was terminated" error. I used this code for error checking for each cudamalloc, cudamemcpy, and kernal launch. std::string error = cudaGetErrorString(cudaGetLastError()); printf("%s\n", error); These calls were saying everything was fine until the first cudamemcpy call after returning from the kernel. the error happens in the line "cudaMemcpy(avhost, avdev, size, cudaMemcpyDeviceToHost);" in main. Any help is appreciated. #include <stdlib.h> #include <stdio.h> #include <math.h> #define mul_mod(a,b,m) fmod( (double) a * (double) b, m) /////////////////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////////////// /* return the inverse of x mod y */ __device__ int inv_mod(int x,int y) { int q,u,v,a,c,t; u=x; v=y; c=1; a=0; do { q=v/u; t=c; c=a-q*c; a=t; t=u; u=v-q*u; v=t; } while (u!=0); a=a%y; if (a<0) a=y+a; return a; } /////////////////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////////////// /* return the inverse of u mod v, if v is odd */ __device__ int inv_mod2(int u,int v) { int u1,u3,v1,v3,t1,t3; u1=1; u3=u; v1=v; v3=v; if ((u&1)!=0) { t1=0; t3=-v; goto Y4; } else { t1=1; t3=u; } do { do { if ((t1&1)==0) { t1=t1>>1; t3=t3>>1; } else { t1=(t1+v)>>1; t3=t3>>1; } Y4:; } while ((t3&1)==0); if (t3>=0) { u1=t1; u3=t3; } else { v1=v-t1; v3=-t3; } t1=u1-v1; t3=u3-v3; if (t1<0) { t1=t1+v; } } while (t3 != 0); return u1; } /* return (a^b) mod m */ __device__ int pow_mod(int a,int b,int m) { int r,aa; r=1; aa=a; while (1) { if (b&1) r=mul_mod(r,aa,m); b=b>>1; if (b == 0) break; aa=mul_mod(aa,aa,m); } return r; } /////////////////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////////////// /* return true if n is prime */ int is_prime(int n) { int r,i; if ((n % 2) == 0) return 0; r=(int)(sqrtf(n)); for(i=3;i<=r;i+=2) if ((n % i) == 0) return 0; return 1; } /////////////////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////////////// /* return the prime number immediatly after n */ int next_prime(int n) { do { n++; } while (!is_prime(n)); return n; } /////////////////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////////////// #define DIVN(t,a,v,vinc,kq,kqinc) \ { \ kq+=kqinc; \ if (kq >= a) { \ do { kq-=a; } while (kq>=a); \ if (kq == 0) { \ do { \ t=t/a; \ v+=vinc; \ } while ((t % a) == 0); \ } \ } \ } /////////////////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////////////// __global__ void digi_calc(int *s, int *av, int *primes, int N, int n, int nthreads){ int a,vmax,num,den,k,kq1,kq2,kq3,kq4,t,v,i,t1, h; unsigned int tid = blockIdx.x*blockDim.x + threadIdx.x; // GIANT LOOP for (h = 0; h<1; h++){ if(tid > nthreads) continue; a = primes[tid]; vmax=(int)(logf(3*N)/logf(a)); if (a==2) { vmax=vmax+(N-n); if (vmax<=0) continue; } av[tid]=1; for(i=0;i<vmax;i++) av[tid]*= a; s[tid]=0; den=1; kq1=0; kq2=-1; kq3=-3; kq4=-2; if (a==2) { num=1; v=-n; } else { num=pow_mod(2,n,av[tid]); v=0; } for(k=1;k<=N;k++) { t=2*k; DIVN(t,a,v,-1,kq1,2); num=mul_mod(num,t,av[tid]); t=2*k-1; DIVN(t,a,v,-1,kq2,2); num=mul_mod(num,t,av[tid]); t=3*(3*k-1); DIVN(t,a,v,1,kq3,9); den=mul_mod(den,t,av[tid]); t=(3*k-2); DIVN(t,a,v,1,kq4,3); if (a!=2) t=t*2; else v++; den=mul_mod(den,t,av[tid]); if (v > 0) { if (a!=2) t=inv_mod2(den,av[tid]); else t=inv_mod(den,av[tid]); t=mul_mod(t,num,av[tid]); for(i=v;i<vmax;i++) t=mul_mod(t,a,av[tid]); t1=(25*k-3); t=mul_mod(t,t1,av[tid]); s[tid]+=t; if (s[tid]>=av[tid]) s-=av[tid]; } } t=pow_mod(5,n-1,av[tid]); s[tid]=mul_mod(s[tid],t,av[tid]); } __syncthreads(); } /////////////////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////////////// int main(int argc,char *argv[]) { int N,n,i,totalp, h; double sum; const char *error; int *sdev, *avdev, *shost, *avhost, *adev, *ahost; argc = 2; argv[1] = "2"; if (argc<2 || (n=atoi(argv[1])) <= 0) { printf("This program computes the n'th decimal digit of pi\n" "usage: pi n , where n is the digit you want\n" ); exit(1); } sum = 0; N=(int)((n+20)*logf(10)/logf(13.5)); totalp=(N/logf(N))+10; ahost = (int *)calloc(totalp, sizeof(int)); i = 0; ahost[0]=2; for(i=1; ahost[i-1]<=(3*N); ahost[i+1]=next_prime(ahost[i])){ i++; } // allocate host memory size_t size = i*sizeof(int); shost = (int *)malloc(size); avhost = (int *)malloc(size); //allocate memory on device cudaMalloc((void **) &sdev, size); cudaMalloc((void **) &avdev, size); cudaMalloc((void **) &adev, size); cudaMemcpy(adev, ahost, size, cudaMemcpyHostToDevice); if (i >= 512){ h = 512; } else h = i; dim3 dimGrid(((i+512)/512),1,1); dim3 dimBlock(h,1,1); // launch kernel digi_calc <<<dimGrid, dimBlock >>> (sdev, avdev, adev, N, n, i); //copy memory back to host cudaMemcpy(avhost, avdev, size, cudaMemcpyDeviceToHost); cudaMemcpy(shost, sdev, size, cudaMemcpyDeviceToHost); // end malloc's, memcpy's, kernel calls for(h = 0; h <=i; h++){ sum=fmod(sum+(double) shost[h]/ (double) avhost[h],1.0); } printf("Decimal digits of pi at position %d: %09d\n",n,(int)(sum*1e9)); //free memory cudaFree(sdev); cudaFree(avdev); cudaFree(adev); free(shost); free(avhost); free(ahost); return 0; }
This is exactly the same problem you asked about in this question. The kernel is getting terminated early by the driver because it is taking too long to finish. If you read the documentation for any of these runtime API functions you will see the following note: Note: Note that this function may also return error codes from previous, asynchronous launches. All that is happening is that the first API call after the kernel launch is returning the error incurred while the kernel was running - in this case the cudaMemcpy call. The way you can confirm this for yourself is to do something like this directly after the kernel launch: // launch kernel digi_calc <<<dimGrid, dimBlock >>> (sdev, avdev, adev, N, n, i); std::string error = cudaGetErrorString(cudaPeekAtLastError()); printf("%s\n", error); error = cudaGetErrorString(cudaThreadSynchronize()); printf("%s\n", error); The cudaPeekAtLastError() call will show you if there are any errors in the kernel launch, and the error code returned by the cudaThreadSynchronize() call will show whether any errors were generated while the kernel was executing. The solution is exactly as outlined in the previous question: probably the simplest way is redesign the code so it is "re-entrant" so you can split the work over several kernel launches, with each kernel launch safely under the display driver watchdog timer limit.
Cuda somehow buffers all the read/write operations on global memory. So you can batch the operations in some loop with some kernel, and it will take actually NO TIME. Then, when you call memcpy, all the buffered operations are done, and it can timeout. Method to go with, is to call cudaThreadSynchronize procedure between iterations. So remember: if a kernel run takes only nanoseconds to calculate - it doesn't mean that it is so fast - some of the writes to the global memory, are done when memcpy or threadsynchronize is called.