How to Create threads recursively without Loops - pthreads

I would like to create a code that will:
create recursive threads without using loops, the threads has to execute certain routine. I am using Pthread_create on ubuntu
#include <pthread.h>
#include <stdio.h>
#define NUM_THREADS 8
void *PrintHello(void *threadid)
{
printf("\n%d: Hello World!\n", threadid);
pthread_exit(NULL);
}
int main (int argc, char *argv[])
{
pthread_t threads[NUM_THREADS];
int rc, t;
for(t=0; t<NUM_THREADS; t++)
{
printf("Creating thread %d\n", t);
rc = pthread_create(&threads[t], NULL, PrintHello, (void *)t);
if (rc)
{
printf("ERROR; return code from pthread_create() is %d\n", rc);
exit(-1);
}
}
pthread_exit(NULL);
}

Yes you can create thread without using for loop , I have modified your code and used function recursion to create pthread..
#include <pthread.h>
#include <stdio.h>
#define NUM_THREADS 8
pthread_t threads[NUM_THREADS];
void *PrintHello(void *threadid)
{
printf("\n%d: Hello World!\n", threadid);
pthread_exit(NULL);
}
void create_thread(int n){
if (n > 0 ){
//Create thread
printf("Creating thread %d\n", ((NUM_THREADS - n) + 1) );
//NUM_THREADS - n to start index from 0
int rc = pthread_create(&threads[NUM_THREADS - n], NULL, PrintHello, (void *)(NUM_THREADS - n));
if (rc)
{
printf("ERROR; return code from pthread_create() is %d\n", rc);
exit(-1);
}
n--;
create_thread(n);
}
return;
}
int main(int argc, char *argv[])
{
int rc, t;
create_thread( NUM_THREADS );
//Wait to finish all thread
for (t = 0; t < NUM_THREADS; t++)
{
pthread_join(threads[t],NULL);
}
pthread_exit(NULL);
return 0;
}
Hope this will help you.

Related

hugepages allocated by mmap is slower than posix_memalign

#include <stdlib.h>
#include <stdio.h>
#include <strings.h>
#include <sys/mman.h>
#include <time.h>
#define HUGEPAGE 2048*1024
void *normal_malloc(int len)
{
void *ptr = malloc(len);
bzero(ptr, len);
return ptr;
}
void *trans_malloc(int len)
{
void *ptr = NULL;
int ret = posix_memalign(&ptr, HUGEPAGE, len);
if(ret) perror("posix_memalign");
ret = madvise(ptr, len, MADV_HUGEPAGE);
bzero(ptr, len);
return ptr;
}
void *mmap_malloc(int len)
{
void *ptr = mmap(NULL, len, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS|MAP_HUGETLB, -1,0);
return ptr;
}
int main(int argc, char **argv)
{
char *ptr = NULL;
int len = HUGEPAGE*256;
srand(time(NULL));
switch(argc){
case 1: ptr = normal_malloc(len);break;
case 2: ptr = trans_malloc(len);break;
case 3: ptr = mmap_malloc(len); break;
}
long j = 0;
for(int i=0;i<len;i++){
j += ptr[rand()%len];
}
return 0;
}
I use normal malloc and posix_memalign and mmap to test performance.
My test result is :
malloc cost about 29.7s, posix_memalign cost about 23.5s, and mmap is very near with malloc.
Both posix_memalign and mmap uses hugepages. Why one has obvious improvement, the other not? Do I use mmap in the wrong way?
I don't do bzero for mmap since the man pages says "its contents are initialized to zero".

Using MPI_Send inside a pthread in C

i am trying to create a ring of mpi processes where each MPI process will launch a pthread and the threads will perform the ring, i used pthread so i can use the MPI processes to do another task. It seems that i can't use MPI_send or MPI_Recv inside a pthread, i have no compilation error but i do have a run time error.
i compile using this command
mpicc -lpthread threaded_ring.c
this is the runtime error
a.out:28372 terminated with signal 11 at PC=2aaaaaae312d SP=2aaab0771860. Backtrace:
/usr/lib64/libpsm_infinipath.so.1(psmi_mpool_get+0xd)[0x2aaaaaae312d]
a.out:28366 terminated with signal 11 at PC=333c00c110 SP=2aaab02d9698. Backtrace:
/lib64/libpthread.so.0(pthread_spin_lock+0x0)[0x333c00c110]
/usr/lib64/libpsm_infinipath.so.1(psmi_amsh_short_request+0x180)[0x2aaaaaad31b0]
/usr/lib64/libpsm_infinipath.so.1(+0xd9f6)[0x2aaaaaad49f6]
/usr/lib64/libpsm_infinipath.so.1(psm_mq_send+0x41)[0x2aaaaaaf5d51]
/usr/local/mpi/mvapich2/intel12/1.8.1/lib/libmpich.so.3(psm_send_pkt+0xb1)[0x2aaaaae0af21]
/usr/local/mpi/mvapich2/intel12/1.8.1/lib/libmpich.so.3(psm_istartmsgv+0x130)[0x2aaaaae0a010]
/usr/local/mpi/mvapich2/intel12/1.8.1/lib/libmpich.so.3(MPIDI_CH3_iStartMsgv+0x6)[0x2aaaaaddf1e6]
/usr/local/mpi/mvapich2/intel12/1.8.1/lib/libmpich.so.3(MPIDI_CH3_EagerContigSend+0x89)[0x2aaaaada6e39]
/usr/local/mpi/mvapich2/intel12/1.8.1/lib/libmpich.so.3(MPID_Send+0x116)[0x2aaaaade3136]
/usr/local/mpi/mvapich2/intel12/1.8.1/lib/libmpich.so.3(MPI_Send+0xf8)[0x2aaaaae2a408]
./a.out[0x4022ba]
/lib64/libpthread.so.0[0x333c0077f1]
/lib64/libc.so.6(clone+0x6d)[0x333bce570d]
a.out:28373 terminated with signal 11 at PC=333bf9d428 SP=2aaab0771838. Backtrace:
a.out:28370 terminated with signal 11 at PC=2aaaaaae312d SP=2aaab0771860. Backtrace:
here is my code
#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>
#include <pthread.h>
#include <time.h>
void *ring_func(void *p)
{
int token=1;
int world_rank;
MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);
int world_size;
MPI_Comm_size(MPI_COMM_WORLD, &world_size);
if (world_rank==0){
MPI_Send(&token, 1, MPI_INT, (world_rank + 1) % world_size, 0,
MPI_COMM_WORLD);
}
if (world_rank != 0) {
MPI_Recv(&token, 1, MPI_INT, world_rank - 1, 0, MPI_COMM_WORLD,
MPI_STATUS_IGNORE);
printf("Process %d received token %d from process %d\n", world_rank, token,
world_rank - 1);
}
pthread_exit(NULL);
}
int main(int argc, char** argv) {
// Initialize the MPI threaded environment
int provided;
MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE , &provided);
if (provided < MPI_THREAD_MULTIPLE)
{
printf("Error: the MPI library doesn't provide the required thread level\n");
MPI_Abort(MPI_COMM_WORLD, 0);
}
pthread_t ring ;
pthread_create (&ring, NULL, ring_func, NULL) ;
MPI_Barrier(MPI_COMM_WORLD);
MPI_Finalize();
}
Thanks to Hristo lliev i was able to solve the problem, the problem was that the main thread was finsishing before my pthread but when i added pthread_join the main thread waited for the pthread to join before calling MPI_Finalize(). Here is the new code
#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>
#include <pthread.h>
#include <time.h>
void *ring_func(void *p)
{
int token;
// Receive from the lower process and send to the higher process. Take care
// of the special case when you are the first process to prevent deadlock.
int world_rank;
MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);
int world_size;
MPI_Comm_size(MPI_COMM_WORLD, &world_size);
if (world_rank != 0) {
MPI_Recv(&token, 1, MPI_INT, world_rank - 1, 0, MPI_COMM_WORLD,
MPI_STATUS_IGNORE);
printf("Process %d received token %d from process %d\n", world_rank, token,
world_rank - 1);
} else {
// Set the token's value if you are process
token = -1;
}
MPI_Send(&token, 1, MPI_INT, (world_rank + 1) % world_size, 0,
MPI_COMM_WORLD);
if (world_rank == 0) {
// sleep(20);
MPI_Recv(&token, 1, MPI_INT, world_size - 1, 0, MPI_COMM_WORLD,
MPI_STATUS_IGNORE);
printf("Process %d received token %d from process %d\n", world_rank, token,
world_size - 1);
}
pthread_exit(NULL);
}
int main(int argc, char** argv) {
// Initialize the MPI threaded environment
int provided;
MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE , &provided);
if (provided != MPI_THREAD_MULTIPLE)
{
printf("Error: the MPI library doesn't provide the required thread level\n");
MPI_Abort(MPI_COMM_WORLD, 0);
}
pthread_t ring ;
pthread_create (&ring, NULL, ring_func, NULL) ;
pthread_join(ring,NULL);
MPI_Barrier(MPI_COMM_WORLD);
MPI_Finalize();
}

set CPU affinity of a particular pthread failure

My speedup-example.cpp source code is shown below
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <errno.h>
#include <assert.h>
#include "tern/user.h"
#define N 8
#define M 10000
int nwait = 0;
int nexit = 0;
volatile long long sum;
long loops = 6e3;
pthread_mutex_t mutex;
pthread_cond_t cond;
pthread_barrier_t bar;
void set_affinity(int core_id) {
cpu_set_t cpuset;
CPU_ZERO(&cpuset);
CPU_SET(core_id, &cpuset);
assert(pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &cpuset) ==0);
}
void* thread_func(void *arg) {
set_affinity((int)(long)arg);
for (int j = 0; j < M; j++) {
pthread_mutex_lock(&mutex);
nwait++;
for (long i = 0; i < loops; i++) // This is the key of speedup for parrot: the mutex needs to be a little bit congested.
sum += i;
pthread_cond_wait(&cond, &mutex);
pthread_mutex_unlock(&mutex);
soba_wait(0);
pthread_barrier_wait(&bar);
for (long i = 0; i < loops; i++)
sum += i*i*i*i*i*i;
//fprintf(stderr, "compute thread %u %d\n", (unsigned)thread, sched_getcpu());
}
}
int main(int argc, char *argv[]) {
set_affinity(23);
soba_init(0, N, 20);
pthread_t th[N];
int ret;
pthread_cond_init(&cond, NULL);
pthread_barrier_init(&bar, NULL, N);
for(unsigned i=0; i<N; ++i) {
ret = pthread_create(&th[i], NULL, thread_func, (void*)i);
assert(!ret && "pthread_create() failed!");
}
for (int j = 0; j < M; j++) {
while (nwait < N) {
sched_yield();
}
pthread_mutex_lock(&mutex);
nwait = 0;
//fprintf(stderr, "broadcast %u %d\n", (unsigned)pthread_self(), sched_getcpu());
pthread_cond_broadcast(&cond);
pthread_mutex_unlock(&mutex);
}
for(unsigned i=0; i<N; ++i)
pthread_join(th[i], NULL);
exit(0);
}
I already succeeded wrote the mk of speedup-example.cpp
gcc speedup-example.cpp -o speedup-example -O2 -g \-I$XTERN_ROOT/include -L$XTERN_ROOT/dync_hook -Wl,--rpath,$XTERN_ROOT/dync_hook -lxtern-annot \-lpthread
But when I want to run it, problems occur.
For example
$ time ./speedup-example
It informs me that
speedup-example.cpp:23: void set_affinity(int): Assertion `pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &cpuset) ==0' failed.
Can someone help me solve this problem? Many thanks.

shared memory father and child in c

David Shwartz helped me alot and now it kinda works...
do you have any idea for more elgant way to parse the input, if the input consists more than 2 numbers to add which need to be processed by the child ? I want the child to get only two integers so that's why I created the shared memory so the father will send the child the result(shared memory) + another integer.
Thank you all.
#include <sys/types.h>
#include <sys/ipc.h>
#include <sys/shm.h>
#include <string.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
volatile int *shared=0;
int shmid;
int main()
{
char line[256];
int readByte;
int fd[2]; //pipe to son, who processes addition
int pid;
shmid=shmget ( IPC_PRIVATE, sizeof(int) , 0600 );
shared=shmat ( shmid, 0 , 0);
if ( pipe(fd) )
{
perror("pipe");
exit(-1);
}
pid=fork();
if (pid!=0) // father
{
close (fd[0]);
readByte=read(0, line, 256);
line[readByte-1]='\0';
printf("%d",readByte);
int arr[2];
int i=0;
int j=0;
int flag=0;
char num[10];
while (i<readByte)
{
if (line[i]=='+' )
{
i++;
j=0;
flag=1;
}
while (line[i]!='+' && line[i]!='\0')
{
num[j]=line[i];
i++;
j++;
}
num[j]='\0';
if (flag==0)
arr[0]=atoi(num);
else
{
arr[1]=atoi(num);
i++;
}
}
printf("first %d\n",arr[0]);
printf("sec %d\n",arr[1]);
write(fd[1], &arr, sizeof(arr));
wait(NULL);
printf ( "%d\n" , *shared );
}
else
// son
{
int arr[2];
int sum;
readByte = read(fd[0], &arr, sizeof(arr));
printf("son printing: %d\n",arr[0]);
printf("son printing: %d\n",arr[1]);
sum =arr[0]+arr[1];
*shared=sum;
close (fd[0]);
shmdt ( (const void *) shared );
}
shmdt ( (const void *) shared );
shmctl ( shmid , IPC_RMID , 0 );
close(fd[1]);
return 0;
}
You throw away the return value of shmat. And you expect shared to be shared, but it's just a regular variable. Also, you need to prevent the compiler from optimizing away accesses to the shared memory. Here it is with all the fatal bugs fixed:
#include <sys/types.h>
#include <sys/ipc.h>
#include <sys/shm.h>
#include <string.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
volatile int *shared;
int shmid;
int main()
{
int s,i;
shmid=shmget ( IPC_PRIVATE, sizeof(int), 0600 );
shared=shmat ( shmid, 0 , 0);
*shared=100;
printf ( "%d\n" , *shared);
if ( fork()==0 ) // son
{
*shared=1000;
shmdt ( (const void *) shared );
}
else // father
{
wait ( &s );
printf ( "%d\n" , *shared);
shmdt ( (const void *) shared );
shmctl ( shmid , IPC_RMID , 0 );
}
return 0;
}

HIDAPI in two threads

According to https://github.com/signal11/hidapi/issues/72 HIDAPI ought to be thread safe on Linux machines. However, I can't get it working at all. This is what I do:
#ifdef WIN32
#include <windows.h>
#endif
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <pthread.h>
#include <stdlib.h>
#include <assert.h>
#include "hidapi.h"
hid_device *handle;
static void *TaskCode(void *argument)
{
int res;
//hid_device *handle;
unsigned char buf[64];
// res = hid_init();
// if( res == -1 )
// {
// return (void*)1;
// }
//
// handle = hid_open(0x0911, 0x251c, NULL);
// if( handle == NULL )
// {
// return (void*)2;
// }
printf( "while 2\n");
while( 1 )
{
memset( buf, 64, 0 );
res = hid_read(handle, buf, 0);
if( res == -1 )
{
return (void*)3;
}
printf( "received %d bytes\n", res);
for (int i = 0; i < res; i++)
printf("Byte %d: %02x ", i+1, buf[i]);
//printf( "%02x ", buf[0]);
fflush(stdout);
}
return (void*)0;
}
int main(int argc, char* argv[])
{
int res;
//hid_device *handle;
unsigned char buf[65];
res = hid_init();
if( res == -1 )
{
return 1;
}
handle = hid_open(0x0911, 0x251c, NULL);
if( handle == NULL )
{
return 2;
}
hid_set_nonblocking( handle, 0 );
pthread_t thread;
int rc = pthread_create(&thread, NULL, TaskCode, NULL);
printf( "while 1\n");
while(1)
{
int a = getchar();
if( a == 'a')
{
// Get Device Type (cmd 0x82). The first byte is the report number (0x0).
buf[0] = 0x0;
buf[1] = 0x82;
res = hid_write(handle, buf, 65);
if( res != -1 )
printf( "write ok, transferred %d bytes\n", res );
else
{
printf( "write error\n" );
char* str = hid_error(handle);
printf( "error: %s\n", str );
return 1;
}
}
else if( a== 'b')
break;
}
void* trc;
rc = pthread_join(thread, &trc);
printf( "rc code: %d\n", (int)trc );
// Finalize the hidapi library
res = hid_exit();
return 0;
}
If I don't use the global handle, I get 'write error' every time. If I do, as in the example, formally everything works but hid_read always returns 0 bytes... Of course, if I do simple hid_write() followed by hid_read(), I'll get the correct reply to the command 0x82 as intended. I'm really lost here, am I overlooking something?
EDIT: to clarify, zero bytes return also for everything, incl. buttons on mouse etc. So it seems to work but the data buffer is always zero bytes.
Shame on me, a dumb mistake. The code should be:
memset( buf, 0, 64 );
res = hid_read(handle, buf, 64);
and then it works. Should sleep more and write less!

Resources