Segmentation fault (core dumped) when i use pthreads on ubuntu - pthreads

I use pthreads on ubuntu to implement multithreaded matrix-vector multiplication, but the runtime reports an error Segmentation fault
#pragma comment(lib, "pthreadVC2.lib")
#define _CRT_SECURE_NO_WARNINGS 1
#include <stdio.h>
#include <stdlib.h>
#include <pthread.h>
/* Global variables */
int thread_count = 8;
int m, n;
double* A = NULL;
double* x = NULL;
double* y = NULL;
/* Serial functions */
void Usage(char* prog_name);
void Read_matrix(char* prompt, double A[], int m, int n);
void Read_vector(char* prompt, double x[], int n);
void Print_matrix(char* title, double A[], int m, int n);
void Print_vector(char* title, double y[], double m);
/* Parallel function */
void* Pth_mat_vect(void* rank);
/*------------------------------------------------------------------*/
int main(int argc, char* argv[]) {
long thread;
pthread_t* thread_handles;
thread_count = atoi(argv[1]);
thread_handles = malloc(thread_count * sizeof(pthread_t));
printf("Enter m and n\n");
scanf("%d%d", &m, &n);
A = malloc(m * n * sizeof(double));
x = malloc(n * sizeof(double));
y = malloc(m * sizeof(double));
Read_matrix("Enter the matrix", A, m, n);
Print_matrix("We read", A, m, n);
Read_vector("Enter the vector", x, n);
Print_vector("We read", x, n);
for (thread = 0; thread < thread_count; thread++)
pthread_create(&thread_handles[thread], NULL,
Pth_mat_vect, (void*)thread);
for (thread = 0; thread < thread_count; thread++)
pthread_join(thread_handles[thread], NULL);
Print_vector("The product is", y, m);
free(A);
free(x);
free(y);
return 0;
} /* main */
/*------------------------------------------------------------------
* Function: Read_matrix
* Purpose: Read in the matrix
* In args: prompt, m, n
* Out arg: A
*/
void Read_matrix(char* prompt, double A[], int m, int n) {
int i, j;
printf("%s\n", prompt);
for (i = 0; i < m; i++)
for (j = 0; j < n; j++)
scanf("%lf", &A[i * n + j]);
} /* Read_matrix */
/*------------------------------------------------------------------
* Function: Read_vector
* Purpose: Read in the vector x
* In arg: prompt, n
* Out arg: x
*/
void Read_vector(char* prompt, double x[], int n) {
int i;
printf("%s\n", prompt);
for (i = 0; i < n; i++)
scanf("%lf", &x[i]);
} /* Read_vector */
/*------------------------------------------------------------------
* Function: Pth_mat_vect
* Purpose: Multiply an mxn matrix by an nx1 column vector
* In arg: rank
* Global in vars: A, x, m, n, thread_count
* Global out var: y
*/
void* Pth_mat_vect(void* rank) {
long my_rank = (long)rank;
int i, j;
int local_m = m / thread_count;
int my_first_row = my_rank * local_m;
int my_last_row = (my_rank + 1) * local_m - 1;
for (i = my_first_row; i <= my_last_row; i++) {
y[i] = 0.0;
for (j = 0; j < n; j++)
y[i] += A[i * n + j] * x[j];
}
return NULL;
} /* Pth_mat_vect */
/*------------------------------------------------------------------
* Function: Print_matrix
* Purpose: Print the matrix
* In args: title, A, m, n
*/
void Print_matrix(char* title, double A[], int m, int n) {
int i, j;
printf("%s\n", title);
for (i = 0; i < m; i++) {
for (j = 0; j < n; j++)
printf("%4.1f ", A[i * n + j]);
printf("\n");
}
} /* Print_matrix */
/*------------------------------------------------------------------
* Function: Print_vector
* Purpose: Print a vector
* In args: title, y, m
*/
void Print_vector(char* title, double y[], double m) {
int i;
printf("%s\n", title);
for (i = 0; i < m; i++)
printf("%4.1f ", y[i]);
printf("\n");
} /* Print_vector */
This code is from An Introduction to Parallel Programming
I know this error seems to be related to memory, in fact the code runs without entering main().I tried some other people's methods, but none of them worked.

Related

Exception thrown at 0x00007FFD9ABF024E (ucrtbased.dll) in myapp.exe: 0xC0000005: Access violation reading location

I'm trying to create a char matrix using dynamic allocation (char**). It represents a board where the margins are '#' character and in the middle is the ASCII 32 (blank space). When I run the code this massage appear: "Exception thrown at 0x00007FFD9ABF024E (ucrtbased.dll) in myapp.exe: 0xC0000005: Access violation reading location " in some cpp file.
Here's my code:
#include <iostream>
using namespace std;
char** allocateBoard(int n)
{
char** Board = 0;
Board = new char* [n+2];
int i;
for (i = 0; i < n + 2; i++)
{
Board[i] = new char[n * 2 + 2];
}
return Board;
}
void initBoard(char**& Board, int n)
{
int i, j;
for (i = 0; i < n; i++)
{
for (j = 0; j < n * 2; j++)
{
if (i == 0 || i == n - 1) Board[i][j] = '#';
else if (j == 0 || j == n * 2 - 1) Board[i][j] = '#';
else Board[i][j] = 32;
}
}
}
void showBoard(char** Board, int n)
{
int i, j;
for (i = 0; i < n; i++)
{
for (j = 0; j < n * 2; j++)
{
cout << Board[i][j];
}
cout << endl;
}
}
int main()
{
int n = 4;
char** Board = 0;
Board = allocateBoard(n);
initBoard(Board, n);
showBoard(Board, n);
cout << endl;
showBoard(Board, n);
for (int i = 0; i < n * 2 + 4; i++)
{
delete[] Board[i];
}
delete[] Board;
return 0;
}
Does anyone know where is the problem? As a very beginner I can't see where is the mistake. I've allocated more space in the matrix than I'm actually using so I can't figure why this message appears. Is the deallocation the problem?
Thanks!

Clang memory allocation

Could anyone please help me understand why Clang reallocates the same memory address for different variables while their lifetimes intersect?
I am using a sample program (below) to show the problem.
When I compile the program with clang -O0, variable j in function ok has the same memory address as variable solutions in function nqueens.
Function ok is called inside function nqueens, which means that the lifetime of the variables intersect; the same stack space cannot be used/reused for both functions.
Compiling the program with gcc or clang at -O1, however, they are assigned different memory addresses.
Any help is appreciated!
#include <stdlib.h>
#include <stdio.h>
#include <memory.h>
#include <alloca.h>
/* Checking information */
static int solutions[] = {
1,
0,
0,
2,
10, /* 5 */
4,
40,
92,
352,
724, /* 10 */
2680,
14200,
73712,
365596,
};
#define MAX_SOLUTIONS sizeof(solutions)/sizeof(int)
int total_count;
int sharedVar = 0;
int ok(int n, char *a)
{
int i, j;
char p, q;
printf("jjjjjjjjj: %d, %p\n", n,&j);
for (i = 0; i < n; i++) {
p = a[i];
for (j = i + 1; j < n; j++) {
q = a[j];
if (q == p || q == p - (j - i) || q == p + (j - i))
return 0;
}
}
return 1;
}
void nqueens (int n, int j, char *a, int *solutions)
{
int i,res;
sharedVar = sharedVar * j - n;
if (n == j) {
/* good solution, count it */
*solutions = 1;
return;
}
printf("solutions: %d, %p\n", j, &solutions);
*solutions = 0;
/* try each possible position for queen <j> */
for (i = 0; i < n; i++) {
a[j] = (char) i;
if (ok(j + 1, a)) {
nqueens(n, j + 1, a,&res);
*solutions += res;
}
}
}
int main()
{
int size = 3;
char *a;
// printf("total_count: %p\n", &total_count);
total_count=0;
a = (char *)alloca(size * sizeof(char));
printf("Computing N-Queens algorithm (n=%d) ", size);
sharedVar = -5;
nqueens(size, 0, a, &total_count);
printf("completed!\n");
printf("sharedVar: %d\n", sharedVar);
}

nth Catalan number using Combinations

I have written a c++ program to find n-th catalan number by using combinations but I am always getting output 0. Please point out mistakes in this code:
#include <iostream>
using namespace std;
int fact(unsigned int x)
{
unsigned long long f = 1;
for (int i = 1; i <= x; i++)
{
f = f*i;
}
return f;
}
int comb(int y, int z)
{
unsigned long long int c;
c = fact(y) / (fact(z)*fact(y - z));
return c;
}
int catalan(int b)
{
unsigned long long int a;
a = (1 / (b + 1))*(comb((2 * b), b));
return a;
}
int main()
{
int n;
cout << "enter value of n for nth catalan number=";
cin >> n;
cout << n << " Catalan number=" << catalan(n) << endl;
return 0;
}
(1 / (b + 1)) is always going to be zero. Instead use
a = comb(2 * b, b) / (b + 1);
Also, you do your calculations using unsigned long long. Why not use that as return type instead of int.

Finding bug in segment tree implementation

I was trying to solve this problem - link. Segment Tree with Lazy Propagation. But I dont know where I made mistake. Please help me find the bug.
I am new to Segment Tree with Lazy Propagation. But my code seems ok.
#include <bits/stdc++.h>
using namespace std;
const int MAXN = 100000;
struct info{ long long sum, prop; }; info tree[300010];
void update(int node, int l, int r, int i, int j, int val) {
if( i > r || j < l) return;
if(i <= l && j >= r) {
tree[node].sum += (r-l+1)*val;
tree[node].prop += val;
return;
} int left = node*2, right = left|1, mid = (l+r)/2;
update(left, l, mid, i,j,val);
update(right, mid+1, r, i,j,val);
tree[node].sum = tree[left].sum + tree[right].sum + (r-l+1)*tree[node].prop;
}
long long query(int node, int l, int r, int i, int j, long long carry = 0) {
if(i > r || j < l ) return 0;
if(i <= l && j >= r) return tree[node].sum + (r-l+1)*carry;
int mid = (l+r)/2, left = node*2, right = left|1;
long long ret = query(left, l, mid, i,j, carry + tree[node].prop);
ret += query(right,mid+1,r,i,j, carry + tree[node].prop);
return ret;
}
int main(int argc, char const *argv[]) {
#ifndef ONLINE_JUDGE
freopen("in", "r", stdin);
#endif
int t,co=0; scanf("%d", &t); while(t--) {
int n, q; scanf("%d %d", &n, &q);
for(int i=0; i<=3*n; i++) tree[i].sum = tree[i].prop = 0;
printf("Case %d:\n", ++co);
while(q--) {
int type,a,b,c; scanf("%d", &type); if(!type) {
scanf("%d %d %d", &a, &b, &c);
update(1, 0, n-1, a,b,c);
} else {
scanf("%d %d", &a, &b);
printf("%d\n", query(1, 0, n-1, a,b));
}
}
}
}
First of all this site is not for finding bug in code. Try https://codereview.stackexchange.com/ for this purpose.
Anyway In your code return type of function query() is long long but in main you are printing integer type. change printf("%d\n", query(1, 0, n-1, a,b)); to printf("%lld\n", query(1, 0, n-1, a,b)); and I hope you will get AC.

Why do operations with an array corrupt the values?

I'm trying to implement the Particle Swarm Optimization on CUDA. I'm partially initializing data arrays on host, then I allocate memory on CUDA and copy it there, and then try to proceed with the initialization.
The problem is, when I'm trying to modify array element like so
__global__ void kernelInit(
float* X,
size_t pitch,
int width,
float X_high,
float X_low
) {
// Silly, but pretty reliable way to address array elements
unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x;
int r = tid / width;
int c = tid % width;
float* pElement = (float*)((char*)X + r * pitch) + c;
*pElement = *pElement * (X_high - X_low) - X_low;
//*pElement = (X_high - X_low) - X_low;
}
It corrupts the values and gives me 1.#INF00 as array element. When I uncomment the last line *pElement = (X_high - X_low) - X_low; and comment the previous, it works as expected: I get values like 15.36 and so on.
I believe the problem is either with my memory allocation and copying, and/or with adressing the specific array element. I read the CUDA manual about these both topics, but I can't spot the error: I still get corrupt array if I do anything with the element of the array. For example, *pElement = *pElement * 2 gives unreasonable big results like 779616...00000000.00000 when the initial pElement is expected to be just a float in [0;1].
Here is the full source. Initialization of arrays begins in main (bottom of the source), then f1 function does the work for CUDA and launches the initialization kernel kernelInit:
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <math.h>
#include <cuda.h>
#include <cuda_runtime.h>
const unsigned f_n = 3;
const unsigned n = 2;
const unsigned p = 64;
typedef struct {
unsigned k_max;
float c1;
float c2;
unsigned p;
float inertia_factor;
float Ef;
float X_low[f_n];
float X_high[f_n];
float X_min[n][f_n];
} params_t;
typedef void (*kernelWrapperType) (
float *X,
float *X_highVec,
float *V,
float *X_best,
float *Y,
float *Y_best,
float *X_swarmBest,
bool &termination,
const float &inertia,
const params_t *params,
const unsigned &f
);
typedef float (*twoArgsFuncType) (
float x1,
float x2
);
__global__ void kernelInit(
float* X,
size_t pitch,
int width,
float X_high,
float X_low
) {
// Silly, but pretty reliable way to address array elements
unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x;
int r = tid / width;
int c = tid % width;
float* pElement = (float*)((char*)X + r * pitch) + c;
*pElement = *pElement * (X_high - X_low) - X_low;
//*pElement = (X_high - X_low) - X_low;
}
__device__ float kernelF1(
float x1,
float x2
) {
float y = pow(x1, 2.f) + pow(x2, 2.f);
return y;
}
void f1(
float *X,
float *X_highVec,
float *V,
float *X_best,
float *Y,
float *Y_best,
float *X_swarmBest,
bool &termination,
const float &inertia,
const params_t *params,
const unsigned &f
) {
float *X_d = NULL;
float *Y_d = NULL;
unsigned length = n * p;
const cudaChannelFormatDesc desc = cudaCreateChannelDesc<float4>();
size_t pitch;
size_t dpitch;
cudaError_t err;
unsigned width = n;
unsigned height = p;
err = cudaMallocPitch (&X_d, &dpitch, width * sizeof(float), height);
pitch = n * sizeof(float);
err = cudaMemcpy2D(X_d, dpitch, X, pitch, width * sizeof(float), height, cudaMemcpyHostToDevice);
err = cudaMalloc (&Y_d, sizeof(float) * p);
err = cudaMemcpy (Y_d, Y, sizeof(float) * p, cudaMemcpyHostToDevice);
dim3 threads; threads.x = 32;
dim3 blocks; blocks.x = (length/threads.x) + 1;
kernelInit<<<threads,blocks>>>(X_d, dpitch, width, params->X_high[f], params->X_low[f]);
err = cudaMemcpy2D(X, pitch, X_d, dpitch, n*sizeof(float), p, cudaMemcpyDeviceToHost);
err = cudaFree(X_d);
err = cudaMemcpy(Y, Y_d, sizeof(float) * p, cudaMemcpyDeviceToHost);
err = cudaFree(Y_d);
}
float F1(
float x1,
float x2
) {
float y = pow(x1, 2.f) + pow(x2, 2.f);
return y;
}
/*
* Generates random float in [0.0; 1.0]
*/
float frand(){
return (float)rand()/(float)RAND_MAX;
}
/*
* This is the main routine which declares and initializes the integer vector, moves it to the device, launches kernel
* brings the result vector back to host and dumps it on the console.
*/
int main() {
const params_t params = {
100,
0.5,
0.5,
p,
0.98,
0.01,
{-5.12, -2.048, -5.12},
{5.12, 2.048, 5.12},
{{0, 1, 0}, {0, 1, 0}}
};
float X[p][n];
float X_highVec[n];
float V[p][n];
float X_best[p][n];
float Y[p] = {0};
float Y_best[p] = {0};
float X_swarmBest[n];
kernelWrapperType F_wrapper[f_n] = {&f1, &f1, &f1};
twoArgsFuncType F[f_n] = {&F1, &F1, &F1};
for (unsigned f = 0; f < f_n; f++) {
printf("Optimizing function #%u\n", f);
srand ( time(NULL) );
for (unsigned i = 0; i < p; i++)
for (unsigned j = 0; j < n; j++)
X[i][j] = X_best[i][j] = frand();
for (int i = 0; i < n; i++)
X_highVec[i] = params.X_high[f];
for (unsigned i = 0; i < p; i++)
for (unsigned j = 0; j < n; j++)
V[i][j] = frand();
for (unsigned i = 0; i < p; i++)
Y_best[i] = F[f](X[i][0], X[i][1]);
for (unsigned i = 0; i < n; i++)
X_swarmBest[i] = params.X_high[f];
float y_swarmBest = F[f](X_highVec[0], X_highVec[1]);
bool termination = false;
float inertia = 1.;
for (unsigned k = 0; k < params.k_max; k++) {
F_wrapper[f]((float *)X, X_highVec, (float *)V, (float *)X_best, Y, Y_best, X_swarmBest, termination, inertia, &params, f);
}
for (unsigned i = 0; i < p; i++)
{
for (unsigned j = 0; j < n; j++)
{
printf("%f\t", X[i][j]);
}
printf("F = %f\n", Y[i]);
}
getchar();
}
}
Update: I tried adding error handling like so
err = cudaMallocPitch (&X_d, &dpitch, width * sizeof(float), height);
if (err != cudaSuccess) {
fprintf(stderr, cudaGetErrorString(err));
exit(1);
}
after each API call, but it gave me nothing and didn't return (I still get all the results and program works to the end).
This is an unnecessarily complex piece of code for what should be a simple repro case, but this immediately jumps out:
const unsigned n = 2;
const unsigned p = 64;
unsigned length = n * p
dim3 threads; threads.x = 32;
dim3 blocks; blocks.x = (length/threads.x) + 1;
kernelInit<<<threads,blocks>>>(X_d, dpitch, width, params->X_high[f], params->X_low[f]);
So you are firstly computing the incorrect number of blocks, and then reversing the order of the blocks per grid and threads per block arguments in the kernel launch. That may well lead to out of bounds memory access, either hosing something in GPU memory or causing an unspecified launch failure, which your lack of error handling might not be catching. There is a tool called cuda-memcheck which has been shipped with the toolkit since about CUDA 3.0. If you run it, it will give you valgrind style memory access violation reports. You should get into the habit of using it, if you are not already doing so.
As for infinite values, that is to be expected isn't it? Your code starts with values in (0,1), and then does
X[i] = X[i] * (5.12--5.12) - -5.12
100 times, which is the rough equivalent of multiplying by 10^100, which is then followed by
X[i] = X[i] * (2.048--2.048) - -2.048
100 times, which is the rough equivalent of multiplying by 4^100, finally followed by
X[i] = X[i] * (5.12--5.12) - -5.12
again. So your results should be of the order of 1E250, which is much larger than the maximum 3.4E38 which is the rough upper limit of representable numbers in IEEE 754 single precision.

Resources