set CPU affinity of a particular pthread failure - pthreads

My speedup-example.cpp source code is shown below
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <errno.h>
#include <assert.h>
#include "tern/user.h"
#define N 8
#define M 10000
int nwait = 0;
int nexit = 0;
volatile long long sum;
long loops = 6e3;
pthread_mutex_t mutex;
pthread_cond_t cond;
pthread_barrier_t bar;
void set_affinity(int core_id) {
cpu_set_t cpuset;
CPU_ZERO(&cpuset);
CPU_SET(core_id, &cpuset);
assert(pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &cpuset) ==0);
}
void* thread_func(void *arg) {
set_affinity((int)(long)arg);
for (int j = 0; j < M; j++) {
pthread_mutex_lock(&mutex);
nwait++;
for (long i = 0; i < loops; i++) // This is the key of speedup for parrot: the mutex needs to be a little bit congested.
sum += i;
pthread_cond_wait(&cond, &mutex);
pthread_mutex_unlock(&mutex);
soba_wait(0);
pthread_barrier_wait(&bar);
for (long i = 0; i < loops; i++)
sum += i*i*i*i*i*i;
//fprintf(stderr, "compute thread %u %d\n", (unsigned)thread, sched_getcpu());
}
}
int main(int argc, char *argv[]) {
set_affinity(23);
soba_init(0, N, 20);
pthread_t th[N];
int ret;
pthread_cond_init(&cond, NULL);
pthread_barrier_init(&bar, NULL, N);
for(unsigned i=0; i<N; ++i) {
ret = pthread_create(&th[i], NULL, thread_func, (void*)i);
assert(!ret && "pthread_create() failed!");
}
for (int j = 0; j < M; j++) {
while (nwait < N) {
sched_yield();
}
pthread_mutex_lock(&mutex);
nwait = 0;
//fprintf(stderr, "broadcast %u %d\n", (unsigned)pthread_self(), sched_getcpu());
pthread_cond_broadcast(&cond);
pthread_mutex_unlock(&mutex);
}
for(unsigned i=0; i<N; ++i)
pthread_join(th[i], NULL);
exit(0);
}
I already succeeded wrote the mk of speedup-example.cpp
gcc speedup-example.cpp -o speedup-example -O2 -g \-I$XTERN_ROOT/include -L$XTERN_ROOT/dync_hook -Wl,--rpath,$XTERN_ROOT/dync_hook -lxtern-annot \-lpthread
But when I want to run it, problems occur.
For example
$ time ./speedup-example
It informs me that
speedup-example.cpp:23: void set_affinity(int): Assertion `pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &cpuset) ==0' failed.
Can someone help me solve this problem? Many thanks.

Related

i dont know why my threads in C dont work in parallel

I'm trying to write a program to simulate an aircraft carrier running on threads in Cand I don't know why my threads seem to run after each other
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <windows.h>
#define N 15 // sapaces for planes on carrier
#define K 9 // a number less than n, and if there are currently fewer than K aircraft on the carrier, landing aircraft have priority
#define S 1
pthread_mutex_t mutex;
pthread_cond_t landing_cond;
pthread_cond_t takeoff_cond;
int planes_on_deck = 0;
void* landing() {
while(1) {
pthread_mutex_lock(&mutex);
while (planes_on_deck>=N) {
pthread_cond_wait(&landing_cond, &mutex);
}
planes_on_deck++;
printf("landing.\n");
Sleep(1000);
printf("number of planes on carrier: %d\n\n", planes_on_deck);
pthread_mutex_unlock(&mutex);
pthread_cond_broadcast(&takeoff_cond);
}
return 0;
}
void* takeoff() {
while(1) {
pthread_mutex_lock(&mutex);
while (planes_on_deck < K) {
pthread_cond_wait(&takeoff_cond, &mutex);
}
planes_on_deck--;
printf("takeoff.\n");
Sleep(1000);
printf("number of planes on carrier: %d\n\n", planes_on_deck);
pthread_mutex_unlock(&mutex);
pthread_cond_broadcast(&landing_cond);
}
return 0;
}
int main() {
printf("number of planes on carrier: %d\n", planes_on_deck);
pthread_t landing_threads[S];
pthread_t takeoff_threads[S];
pthread_mutex_init(&mutex, NULL);
pthread_cond_init(&landing_cond, NULL);
pthread_cond_init(&takeoff_cond, NULL);
for (int i = 0; i < S; i++){
if (pthread_create(&landing_threads[i], NULL, &landing, NULL) != 0){
perror("fail");
}
if (pthread_create(&takeoff_threads[i], NULL, &takeoff, NULL) != 0){
perror("fail");
}
}
for (int i = 0; i < S; i++){
if (pthread_join(takeoff_threads[i], NULL) != 0){
return 2;
}
if (pthread_join(landing_threads[i], NULL) != 0){
return 2;
}
}
pthread_mutex_destroy(&mutex);
pthread_cond_destroy(&landing_cond);
pthread_cond_destroy(&takeoff_cond);
return 0;
}
at the beginning it starts to list landing planes as it should and when it reaches K it should give the opportunity for planes to take off
but he does so only when he reaches the space limit on the ship
Maybe you can tell me, what's wrong with my program, why it behaves the way it does
why it behaves the way it does
You call Sleep() with the mutex held. That prevents the other thread from doing anything. You should unlock the mutex before going to sleep.
In general, sleeping (or doing other long operations, like reading from a remote socket or opening a file) with a mutex held is almost always wrong.

Clang memory allocation

Could anyone please help me understand why Clang reallocates the same memory address for different variables while their lifetimes intersect?
I am using a sample program (below) to show the problem.
When I compile the program with clang -O0, variable j in function ok has the same memory address as variable solutions in function nqueens.
Function ok is called inside function nqueens, which means that the lifetime of the variables intersect; the same stack space cannot be used/reused for both functions.
Compiling the program with gcc or clang at -O1, however, they are assigned different memory addresses.
Any help is appreciated!
#include <stdlib.h>
#include <stdio.h>
#include <memory.h>
#include <alloca.h>
/* Checking information */
static int solutions[] = {
1,
0,
0,
2,
10, /* 5 */
4,
40,
92,
352,
724, /* 10 */
2680,
14200,
73712,
365596,
};
#define MAX_SOLUTIONS sizeof(solutions)/sizeof(int)
int total_count;
int sharedVar = 0;
int ok(int n, char *a)
{
int i, j;
char p, q;
printf("jjjjjjjjj: %d, %p\n", n,&j);
for (i = 0; i < n; i++) {
p = a[i];
for (j = i + 1; j < n; j++) {
q = a[j];
if (q == p || q == p - (j - i) || q == p + (j - i))
return 0;
}
}
return 1;
}
void nqueens (int n, int j, char *a, int *solutions)
{
int i,res;
sharedVar = sharedVar * j - n;
if (n == j) {
/* good solution, count it */
*solutions = 1;
return;
}
printf("solutions: %d, %p\n", j, &solutions);
*solutions = 0;
/* try each possible position for queen <j> */
for (i = 0; i < n; i++) {
a[j] = (char) i;
if (ok(j + 1, a)) {
nqueens(n, j + 1, a,&res);
*solutions += res;
}
}
}
int main()
{
int size = 3;
char *a;
// printf("total_count: %p\n", &total_count);
total_count=0;
a = (char *)alloca(size * sizeof(char));
printf("Computing N-Queens algorithm (n=%d) ", size);
sharedVar = -5;
nqueens(size, 0, a, &total_count);
printf("completed!\n");
printf("sharedVar: %d\n", sharedVar);
}

hugepages allocated by mmap is slower than posix_memalign

#include <stdlib.h>
#include <stdio.h>
#include <strings.h>
#include <sys/mman.h>
#include <time.h>
#define HUGEPAGE 2048*1024
void *normal_malloc(int len)
{
void *ptr = malloc(len);
bzero(ptr, len);
return ptr;
}
void *trans_malloc(int len)
{
void *ptr = NULL;
int ret = posix_memalign(&ptr, HUGEPAGE, len);
if(ret) perror("posix_memalign");
ret = madvise(ptr, len, MADV_HUGEPAGE);
bzero(ptr, len);
return ptr;
}
void *mmap_malloc(int len)
{
void *ptr = mmap(NULL, len, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS|MAP_HUGETLB, -1,0);
return ptr;
}
int main(int argc, char **argv)
{
char *ptr = NULL;
int len = HUGEPAGE*256;
srand(time(NULL));
switch(argc){
case 1: ptr = normal_malloc(len);break;
case 2: ptr = trans_malloc(len);break;
case 3: ptr = mmap_malloc(len); break;
}
long j = 0;
for(int i=0;i<len;i++){
j += ptr[rand()%len];
}
return 0;
}
I use normal malloc and posix_memalign and mmap to test performance.
My test result is :
malloc cost about 29.7s, posix_memalign cost about 23.5s, and mmap is very near with malloc.
Both posix_memalign and mmap uses hugepages. Why one has obvious improvement, the other not? Do I use mmap in the wrong way?
I don't do bzero for mmap since the man pages says "its contents are initialized to zero".

Compiler commands for accull while using opencv

I'm trying to accelerate an opencv program I wrote using OpenACC, I'm using the accull compiler to do this. However, I'm having a very hard time finding any documentation or examples that would help me on this issue.
http://scelementary.com/2015/04/30/openacc-on-jetson-tk1.html
I don't have any experience with ACCULL, but I can provide you with an example that uses OpenCV and OpenACC and maybe that'll help you get moving. This has been tested on X86 with PGI on Ubunut 14.04. This will read an image, invert the pixels, and write an image back out.
invert.cpp:
void invert(unsigned char *imgData, int w, int h, int ch, int step)
{
int i,j,c;
#pragma acc parallel loop collapse(3) copy(imgData[:h*w*ch])
for ( i = 0; i < h; i++)
for ( j = 0; j < w; j++ )
for ( c = 0; c < ch; c++ )
imgData[i*step + j*ch + c] = 255 - imgData[i*step + j*ch + c];
}
main.cpp:
#include <stdio.h>
#include <opencv/cv.h>
#include <opencv/cvaux.h>
#include <opencv/highgui.h>
void invert(unsigned char*,int,int,int,int);
int main(int argc, char* argv[])
{
if (argc < 3)
{
fprintf(stderr,"Usage: %s inFilename outFilename\n",argv[0]);
return -1;
}
IplImage* img = cvLoadImage(argv[1]);
printf("%s: %d x %d, %d %d\n", argv[1],img->width, img->height, img->widthStep, img->nChannels);
invert((unsigned char*)img->imageData,img->width,img->height, img->nChannels, img->widthStep);
if(!cvSaveImage(argv[2],img))
fprintf(stderr,"Failed to write to %s.\n",argv[2]);
cvReleaseImage(&img);
return 0;
}
Makefile:
a.out: main.cpp invert.cpp
pgc++ -fast -ta=tesla -c invert.cpp
pgc++ -fast -ta=tesla -c main.cpp
pgc++ -ta=tesla invert.o main.o -lopencv_legacy -lopencv_highgui -lopencv_core

Problem forking processes and creating threads

My program is supposed to fork three processes. Each of these processes will create three threads and fork two additional processes. These two additional processes will create three threads.
Here is my code. I've tried to keep things simple with nested loops. I think at some point I might be forking more processes or creating more threads.
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
void *printme(void* Array){
int *Arr = (int *) Array;
int len = sizeof(Arr) / sizeof(int);
if (len == 1){
printf("I'm thread %d.%d",Arr[0],Arr[1]);
}
else if (len == 2){
printf("I'm thread %d.%d.%d",Arr[0],Arr[1],Arr[2]);
}
printf("\n");
pthread_exit(NULL);
}
int main(void){
int i, j, k, l;
int threadLevel1[2];
int threadLevel2[3];
printf("\n");
for (i = 1 ; i < 4 ; i++){ // Loop to fork the three main processes.
if (fork() != 0){
sleep(4);
}
else{
//The newly forked process will create three threads and fork two additional processes.
for (j = 1 ; j < 4 ; j++){
pthread_t t;
threadLevel1[0] = i;
threadLevel1[1] = j;
if (pthread_create(&t, NULL, printme, (void*) threadLevel1) != 0){
perror("pthread_create");
exit(1);
}
}
for (k = 1; k < 3 ; k++){
pid_t a = fork();
if (a != 0){
sleep(2);
}
else if (a == -1){
perror("fork"); /* display error message */
exit(0);
}
else{
for (l = 1 ; l < 4 ; l++){
pthread_t t;
threadLevel2[0] = i;
threadLevel2[1] = k;
threadLevel2[2] = l;
if (pthread_create(&t, NULL, printme, (void*) threadLevel2)!=0) {
perror("pthread_create");
exit(1);
}
}
}
}
}
}
return 0;
}
You have a problem in your code here:
void *printme(void* Array){
int *Arr = (int *) Array;
int len = sizeof(Arr) / sizeof(int);
The value len will always be the same no matter what is passed in to printme. That's because C passes arrays as pointers, not as objects with embedded lengths.

Resources