auto-vectorization using GCC does not work - vectorization

I'm compiling the following code, named 'example.C' with and without auto-vectorization, the running time are identical. Could this be that my CPU does not support it or I didn't compile it correctly?
int main(){
const unsigned int ArraySize = 10000000;
float* a = new float[ArraySize];
float* b = new float[ArraySize];
float* c = new float[ArraySize];
for (unsigned int j = 0; j< 200 ; j++) // some repetitions
for ( unsigned int i = 0; i < ArraySize; ++ i)
c[i] = a[i] * b[i];
}
Here is how I compile it without auto-vectorization.
g++ example.C -o example
with the auto-vectorization:
g++ example.C -o example -ftree-vectorize
Thanks for any inputs!

Related

Segmentation fault (core dumped) when i use pthreads on ubuntu

I use pthreads on ubuntu to implement multithreaded matrix-vector multiplication, but the runtime reports an error Segmentation fault
#pragma comment(lib, "pthreadVC2.lib")
#define _CRT_SECURE_NO_WARNINGS 1
#include <stdio.h>
#include <stdlib.h>
#include <pthread.h>
/* Global variables */
int thread_count = 8;
int m, n;
double* A = NULL;
double* x = NULL;
double* y = NULL;
/* Serial functions */
void Usage(char* prog_name);
void Read_matrix(char* prompt, double A[], int m, int n);
void Read_vector(char* prompt, double x[], int n);
void Print_matrix(char* title, double A[], int m, int n);
void Print_vector(char* title, double y[], double m);
/* Parallel function */
void* Pth_mat_vect(void* rank);
/*------------------------------------------------------------------*/
int main(int argc, char* argv[]) {
long thread;
pthread_t* thread_handles;
thread_count = atoi(argv[1]);
thread_handles = malloc(thread_count * sizeof(pthread_t));
printf("Enter m and n\n");
scanf("%d%d", &m, &n);
A = malloc(m * n * sizeof(double));
x = malloc(n * sizeof(double));
y = malloc(m * sizeof(double));
Read_matrix("Enter the matrix", A, m, n);
Print_matrix("We read", A, m, n);
Read_vector("Enter the vector", x, n);
Print_vector("We read", x, n);
for (thread = 0; thread < thread_count; thread++)
pthread_create(&thread_handles[thread], NULL,
Pth_mat_vect, (void*)thread);
for (thread = 0; thread < thread_count; thread++)
pthread_join(thread_handles[thread], NULL);
Print_vector("The product is", y, m);
free(A);
free(x);
free(y);
return 0;
} /* main */
/*------------------------------------------------------------------
* Function: Read_matrix
* Purpose: Read in the matrix
* In args: prompt, m, n
* Out arg: A
*/
void Read_matrix(char* prompt, double A[], int m, int n) {
int i, j;
printf("%s\n", prompt);
for (i = 0; i < m; i++)
for (j = 0; j < n; j++)
scanf("%lf", &A[i * n + j]);
} /* Read_matrix */
/*------------------------------------------------------------------
* Function: Read_vector
* Purpose: Read in the vector x
* In arg: prompt, n
* Out arg: x
*/
void Read_vector(char* prompt, double x[], int n) {
int i;
printf("%s\n", prompt);
for (i = 0; i < n; i++)
scanf("%lf", &x[i]);
} /* Read_vector */
/*------------------------------------------------------------------
* Function: Pth_mat_vect
* Purpose: Multiply an mxn matrix by an nx1 column vector
* In arg: rank
* Global in vars: A, x, m, n, thread_count
* Global out var: y
*/
void* Pth_mat_vect(void* rank) {
long my_rank = (long)rank;
int i, j;
int local_m = m / thread_count;
int my_first_row = my_rank * local_m;
int my_last_row = (my_rank + 1) * local_m - 1;
for (i = my_first_row; i <= my_last_row; i++) {
y[i] = 0.0;
for (j = 0; j < n; j++)
y[i] += A[i * n + j] * x[j];
}
return NULL;
} /* Pth_mat_vect */
/*------------------------------------------------------------------
* Function: Print_matrix
* Purpose: Print the matrix
* In args: title, A, m, n
*/
void Print_matrix(char* title, double A[], int m, int n) {
int i, j;
printf("%s\n", title);
for (i = 0; i < m; i++) {
for (j = 0; j < n; j++)
printf("%4.1f ", A[i * n + j]);
printf("\n");
}
} /* Print_matrix */
/*------------------------------------------------------------------
* Function: Print_vector
* Purpose: Print a vector
* In args: title, y, m
*/
void Print_vector(char* title, double y[], double m) {
int i;
printf("%s\n", title);
for (i = 0; i < m; i++)
printf("%4.1f ", y[i]);
printf("\n");
} /* Print_vector */
This code is from An Introduction to Parallel Programming
I know this error seems to be related to memory, in fact the code runs without entering main().I tried some other people's methods, but none of them worked.

Clang memory allocation

Could anyone please help me understand why Clang reallocates the same memory address for different variables while their lifetimes intersect?
I am using a sample program (below) to show the problem.
When I compile the program with clang -O0, variable j in function ok has the same memory address as variable solutions in function nqueens.
Function ok is called inside function nqueens, which means that the lifetime of the variables intersect; the same stack space cannot be used/reused for both functions.
Compiling the program with gcc or clang at -O1, however, they are assigned different memory addresses.
Any help is appreciated!
#include <stdlib.h>
#include <stdio.h>
#include <memory.h>
#include <alloca.h>
/* Checking information */
static int solutions[] = {
1,
0,
0,
2,
10, /* 5 */
4,
40,
92,
352,
724, /* 10 */
2680,
14200,
73712,
365596,
};
#define MAX_SOLUTIONS sizeof(solutions)/sizeof(int)
int total_count;
int sharedVar = 0;
int ok(int n, char *a)
{
int i, j;
char p, q;
printf("jjjjjjjjj: %d, %p\n", n,&j);
for (i = 0; i < n; i++) {
p = a[i];
for (j = i + 1; j < n; j++) {
q = a[j];
if (q == p || q == p - (j - i) || q == p + (j - i))
return 0;
}
}
return 1;
}
void nqueens (int n, int j, char *a, int *solutions)
{
int i,res;
sharedVar = sharedVar * j - n;
if (n == j) {
/* good solution, count it */
*solutions = 1;
return;
}
printf("solutions: %d, %p\n", j, &solutions);
*solutions = 0;
/* try each possible position for queen <j> */
for (i = 0; i < n; i++) {
a[j] = (char) i;
if (ok(j + 1, a)) {
nqueens(n, j + 1, a,&res);
*solutions += res;
}
}
}
int main()
{
int size = 3;
char *a;
// printf("total_count: %p\n", &total_count);
total_count=0;
a = (char *)alloca(size * sizeof(char));
printf("Computing N-Queens algorithm (n=%d) ", size);
sharedVar = -5;
nqueens(size, 0, a, &total_count);
printf("completed!\n");
printf("sharedVar: %d\n", sharedVar);
}

Compiler commands for accull while using opencv

I'm trying to accelerate an opencv program I wrote using OpenACC, I'm using the accull compiler to do this. However, I'm having a very hard time finding any documentation or examples that would help me on this issue.
http://scelementary.com/2015/04/30/openacc-on-jetson-tk1.html
I don't have any experience with ACCULL, but I can provide you with an example that uses OpenCV and OpenACC and maybe that'll help you get moving. This has been tested on X86 with PGI on Ubunut 14.04. This will read an image, invert the pixels, and write an image back out.
invert.cpp:
void invert(unsigned char *imgData, int w, int h, int ch, int step)
{
int i,j,c;
#pragma acc parallel loop collapse(3) copy(imgData[:h*w*ch])
for ( i = 0; i < h; i++)
for ( j = 0; j < w; j++ )
for ( c = 0; c < ch; c++ )
imgData[i*step + j*ch + c] = 255 - imgData[i*step + j*ch + c];
}
main.cpp:
#include <stdio.h>
#include <opencv/cv.h>
#include <opencv/cvaux.h>
#include <opencv/highgui.h>
void invert(unsigned char*,int,int,int,int);
int main(int argc, char* argv[])
{
if (argc < 3)
{
fprintf(stderr,"Usage: %s inFilename outFilename\n",argv[0]);
return -1;
}
IplImage* img = cvLoadImage(argv[1]);
printf("%s: %d x %d, %d %d\n", argv[1],img->width, img->height, img->widthStep, img->nChannels);
invert((unsigned char*)img->imageData,img->width,img->height, img->nChannels, img->widthStep);
if(!cvSaveImage(argv[2],img))
fprintf(stderr,"Failed to write to %s.\n",argv[2]);
cvReleaseImage(&img);
return 0;
}
Makefile:
a.out: main.cpp invert.cpp
pgc++ -fast -ta=tesla -c invert.cpp
pgc++ -fast -ta=tesla -c main.cpp
pgc++ -ta=tesla invert.o main.o -lopencv_legacy -lopencv_highgui -lopencv_core

set CPU affinity of a particular pthread failure

My speedup-example.cpp source code is shown below
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <errno.h>
#include <assert.h>
#include "tern/user.h"
#define N 8
#define M 10000
int nwait = 0;
int nexit = 0;
volatile long long sum;
long loops = 6e3;
pthread_mutex_t mutex;
pthread_cond_t cond;
pthread_barrier_t bar;
void set_affinity(int core_id) {
cpu_set_t cpuset;
CPU_ZERO(&cpuset);
CPU_SET(core_id, &cpuset);
assert(pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &cpuset) ==0);
}
void* thread_func(void *arg) {
set_affinity((int)(long)arg);
for (int j = 0; j < M; j++) {
pthread_mutex_lock(&mutex);
nwait++;
for (long i = 0; i < loops; i++) // This is the key of speedup for parrot: the mutex needs to be a little bit congested.
sum += i;
pthread_cond_wait(&cond, &mutex);
pthread_mutex_unlock(&mutex);
soba_wait(0);
pthread_barrier_wait(&bar);
for (long i = 0; i < loops; i++)
sum += i*i*i*i*i*i;
//fprintf(stderr, "compute thread %u %d\n", (unsigned)thread, sched_getcpu());
}
}
int main(int argc, char *argv[]) {
set_affinity(23);
soba_init(0, N, 20);
pthread_t th[N];
int ret;
pthread_cond_init(&cond, NULL);
pthread_barrier_init(&bar, NULL, N);
for(unsigned i=0; i<N; ++i) {
ret = pthread_create(&th[i], NULL, thread_func, (void*)i);
assert(!ret && "pthread_create() failed!");
}
for (int j = 0; j < M; j++) {
while (nwait < N) {
sched_yield();
}
pthread_mutex_lock(&mutex);
nwait = 0;
//fprintf(stderr, "broadcast %u %d\n", (unsigned)pthread_self(), sched_getcpu());
pthread_cond_broadcast(&cond);
pthread_mutex_unlock(&mutex);
}
for(unsigned i=0; i<N; ++i)
pthread_join(th[i], NULL);
exit(0);
}
I already succeeded wrote the mk of speedup-example.cpp
gcc speedup-example.cpp -o speedup-example -O2 -g \-I$XTERN_ROOT/include -L$XTERN_ROOT/dync_hook -Wl,--rpath,$XTERN_ROOT/dync_hook -lxtern-annot \-lpthread
But when I want to run it, problems occur.
For example
$ time ./speedup-example
It informs me that
speedup-example.cpp:23: void set_affinity(int): Assertion `pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &cpuset) ==0' failed.
Can someone help me solve this problem? Many thanks.

Obtain array from IplImage in JavaCV

I need to convert the code below from C++ to Java. In C++ I use openCV and I need to convert it in Java using JavaCV.
IplImage* img = cvLoadImage(argv[0]);
int rows = img->height;
int cols = img->width;
Mat matimg(img);
vector<vector<double> > img_vec(rows, vector<double>(cols));
for (int i=0; i < rows; i++) {
for (int j =0; j < cols; j++){
unsigned char temp;
temp = ((uchar*) matimg.data + i * matimg.step)[j * matimg.elemSize() + 1 ];
img_vec[i][j] = (double) temp;
}
}
I've tried the following conversion to java, but it doesn't work properly. I printed the values of temp and it is 0 all the times and for the same imgage the values of matimg.step and matimg.elemSize() are different in the C++ code and the Java code.
In c++ I get matimg.step = 2400 and matimg.elemSize() = 3 while in Java i get 3000 and 1.
Here is the code in java:
IplImage img = cvLoadImage(argv[0]);
int rows = img.height();
int cols = img.width();
CvMat matimg = img.asCvMat();
double img_vec[][] = new double[rows][cols];
for (int i=0; i < rows; i++) {
for (int j =0; j < cols; j++){
short temp;
temp = matimg.data_s().get(i * matimg.step() + j * matimg.elemSize() + 1);
img_vec[i][j] = (double) temp;
}
}
I don't understand where am I doing wrong?
Any help is appreciated,
Thanks.
I've solved my problem using this:
ByteBuffer buffer = img.getByteBuffer();
double img_vec[][] = new double[rows][cols];
for (int i=0; i < rows; i++) {
for (int j =0; j < cols; j++){
int ind = i * img.widthStep() + j * img.nChannels() + 1;
img_vec[i][j] = (buffer.get(ind) & 0xFF);
}
}

Resources