i dont know why my threads in C dont work in parallel

i dont know why my threads in C dont work in parallel - pthreads

I'm trying to write a program to simulate an aircraft carrier running on threads in Cand I don't know why my threads seem to run after each other
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <windows.h>
#define N 15 // sapaces for planes on carrier
#define K 9 // a number less than n, and if there are currently fewer than K aircraft on the carrier, landing aircraft have priority
#define S 1
pthread_mutex_t mutex;
pthread_cond_t landing_cond;
pthread_cond_t takeoff_cond;
int planes_on_deck = 0;
void* landing() {
while(1) {
pthread_mutex_lock(&mutex);
while (planes_on_deck>=N) {
pthread_cond_wait(&landing_cond, &mutex);
}
planes_on_deck++;
printf("landing.\n");
Sleep(1000);
printf("number of planes on carrier: %d\n\n", planes_on_deck);
pthread_mutex_unlock(&mutex);
pthread_cond_broadcast(&takeoff_cond);
}
return 0;
}
void* takeoff() {
while(1) {
pthread_mutex_lock(&mutex);
while (planes_on_deck < K) {
pthread_cond_wait(&takeoff_cond, &mutex);
}
planes_on_deck--;
printf("takeoff.\n");
Sleep(1000);
printf("number of planes on carrier: %d\n\n", planes_on_deck);
pthread_mutex_unlock(&mutex);
pthread_cond_broadcast(&landing_cond);
}
return 0;
}
int main() {
printf("number of planes on carrier: %d\n", planes_on_deck);
pthread_t landing_threads[S];
pthread_t takeoff_threads[S];
pthread_mutex_init(&mutex, NULL);
pthread_cond_init(&landing_cond, NULL);
pthread_cond_init(&takeoff_cond, NULL);
for (int i = 0; i < S; i++){
if (pthread_create(&landing_threads[i], NULL, &landing, NULL) != 0){
perror("fail");
}
if (pthread_create(&takeoff_threads[i], NULL, &takeoff, NULL) != 0){
perror("fail");
}
}
for (int i = 0; i < S; i++){
if (pthread_join(takeoff_threads[i], NULL) != 0){
return 2;
}
if (pthread_join(landing_threads[i], NULL) != 0){
return 2;
}
}
pthread_mutex_destroy(&mutex);
pthread_cond_destroy(&landing_cond);
pthread_cond_destroy(&takeoff_cond);
return 0;
}
at the beginning it starts to list landing planes as it should and when it reaches K it should give the opportunity for planes to take off
but he does so only when he reaches the space limit on the ship
Maybe you can tell me, what's wrong with my program, why it behaves the way it does

why it behaves the way it does
You call Sleep() with the mutex held. That prevents the other thread from doing anything. You should unlock the mutex before going to sleep.
In general, sleeping (or doing other long operations, like reading from a remote socket or opening a file) with a mutex held is almost always wrong.

Related

create Pthreads in loop

I create some threads in a for loop and after this loop, join them in other loop. they do their function till all of them finish it,do they? my last result is logically wrong . my result is correct, just when join each thread after create it!!

Yes i think you are doing right.Letsee for example
extern "C"
{
#include <pthread.h>
#include <unistd.h>
}
#include <iostream>
using namespace std;
const int NUMBER_OF_THREADS = 5;
void * thread_talk(void * thread_nr)
{
//do some operation here
pthread_exit(NULL); //exit from current thread
}
int main()
{
pthread_t thread[NUMBER_OF_THREADS];
cout << "Starting all threads..." << endl;
int temp_arg[NUMBER_OF_THREADS] ;
/*creating all threads*/
for(int current_t = 0; current_t < NUMBER_OF_THREADS; current_t++)
{
temp_arg[current_t] = current_t;
int result = pthread_create(&thread[current_t], NULL, thread_talk, static_cast<void*>(&temp_arg[current_t])) ;
if (result !=0)
{
cout << "Error creating thread " << current_t << ". Return code:" << result << endl;
}
}
/*creating all threads*/
/*Joining all threads*/
for(int current_t = 0; current_t < NUMBER_OF_THREADS; current_t++)
{
pthread_join(thread[current_t], NULL);
}
/*Joining all threads*/
cout << "All threads completed." ;
return 0;
}
Its your decision when you want to exit that thread by calling pthread_exit function .Absolutely there is no certainty that which thread will be executed first.Your OS will decide when resources are available for your threads and execute them on whatever CPU is least occupied

set CPU affinity of a particular pthread failure

My speedup-example.cpp source code is shown below
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <errno.h>
#include <assert.h>
#include "tern/user.h"
#define N 8
#define M 10000
int nwait = 0;
int nexit = 0;
volatile long long sum;
long loops = 6e3;
pthread_mutex_t mutex;
pthread_cond_t cond;
pthread_barrier_t bar;
void set_affinity(int core_id) {
cpu_set_t cpuset;
CPU_ZERO(&cpuset);
CPU_SET(core_id, &cpuset);
assert(pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &cpuset) ==0);
}
void* thread_func(void *arg) {
set_affinity((int)(long)arg);
for (int j = 0; j < M; j++) {
pthread_mutex_lock(&mutex);
nwait++;
for (long i = 0; i < loops; i++) // This is the key of speedup for parrot: the mutex needs to be a little bit congested.
sum += i;
pthread_cond_wait(&cond, &mutex);
pthread_mutex_unlock(&mutex);
soba_wait(0);
pthread_barrier_wait(&bar);
for (long i = 0; i < loops; i++)
sum += i*i*i*i*i*i;
//fprintf(stderr, "compute thread %u %d\n", (unsigned)thread, sched_getcpu());
}
}
int main(int argc, char *argv[]) {
set_affinity(23);
soba_init(0, N, 20);
pthread_t th[N];
int ret;
pthread_cond_init(&cond, NULL);
pthread_barrier_init(&bar, NULL, N);
for(unsigned i=0; i<N; ++i) {
ret = pthread_create(&th[i], NULL, thread_func, (void*)i);
assert(!ret && "pthread_create() failed!");
}
for (int j = 0; j < M; j++) {
while (nwait < N) {
sched_yield();
}
pthread_mutex_lock(&mutex);
nwait = 0;
//fprintf(stderr, "broadcast %u %d\n", (unsigned)pthread_self(), sched_getcpu());
pthread_cond_broadcast(&cond);
pthread_mutex_unlock(&mutex);
}
for(unsigned i=0; i<N; ++i)
pthread_join(th[i], NULL);
exit(0);
}
I already succeeded wrote the mk of speedup-example.cpp
gcc speedup-example.cpp -o speedup-example -O2 -g \-I$XTERN_ROOT/include -L$XTERN_ROOT/dync_hook -Wl,--rpath,$XTERN_ROOT/dync_hook -lxtern-annot \-lpthread
But when I want to run it, problems occur.
For example
$ time ./speedup-example
It informs me that
speedup-example.cpp:23: void set_affinity(int): Assertion `pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &cpuset) ==0' failed.
Can someone help me solve this problem? Many thanks.

HIDAPI in two threads

According to https://github.com/signal11/hidapi/issues/72 HIDAPI ought to be thread safe on Linux machines. However, I can't get it working at all. This is what I do:
#ifdef WIN32
#include <windows.h>
#endif
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <pthread.h>
#include <stdlib.h>
#include <assert.h>
#include "hidapi.h"
hid_device *handle;
static void *TaskCode(void *argument)
{
int res;
//hid_device *handle;
unsigned char buf[64];
// res = hid_init();
// if( res == -1 )
// {
// return (void*)1;
// }
//
// handle = hid_open(0x0911, 0x251c, NULL);
// if( handle == NULL )
// {
// return (void*)2;
// }
printf( "while 2\n");
while( 1 )
{
memset( buf, 64, 0 );
res = hid_read(handle, buf, 0);
if( res == -1 )
{
return (void*)3;
}
printf( "received %d bytes\n", res);
for (int i = 0; i < res; i++)
printf("Byte %d: %02x ", i+1, buf[i]);
//printf( "%02x ", buf[0]);
fflush(stdout);
}
return (void*)0;
}
int main(int argc, char* argv[])
{
int res;
//hid_device *handle;
unsigned char buf[65];
res = hid_init();
if( res == -1 )
{
return 1;
}
handle = hid_open(0x0911, 0x251c, NULL);
if( handle == NULL )
{
return 2;
}
hid_set_nonblocking( handle, 0 );
pthread_t thread;
int rc = pthread_create(&thread, NULL, TaskCode, NULL);
printf( "while 1\n");
while(1)
{
int a = getchar();
if( a == 'a')
{
// Get Device Type (cmd 0x82). The first byte is the report number (0x0).
buf[0] = 0x0;
buf[1] = 0x82;
res = hid_write(handle, buf, 65);
if( res != -1 )
printf( "write ok, transferred %d bytes\n", res );
else
{
printf( "write error\n" );
char* str = hid_error(handle);
printf( "error: %s\n", str );
return 1;
}
}
else if( a== 'b')
break;
}
void* trc;
rc = pthread_join(thread, &trc);
printf( "rc code: %d\n", (int)trc );
// Finalize the hidapi library
res = hid_exit();
return 0;
}
If I don't use the global handle, I get 'write error' every time. If I do, as in the example, formally everything works but hid_read always returns 0 bytes... Of course, if I do simple hid_write() followed by hid_read(), I'll get the correct reply to the command 0x82 as intended. I'm really lost here, am I overlooking something?
EDIT: to clarify, zero bytes return also for everything, incl. buttons on mouse etc. So it seems to work but the data buffer is always zero bytes.

Shame on me, a dumb mistake. The code should be:
memset( buf, 0, 64 );
res = hid_read(handle, buf, 64);
and then it works. Should sleep more and write less!

cudamemcpy error:"the launch timed out and was terminated"

My code is a parallel implmentation that calculates the nth digit of pi. When I finish the kernel and try to copy the memory back to the host I get a "the launch timed out and was terminated" error.
I used this code for error checking for each cudamalloc, cudamemcpy, and kernal launch.
std::string error = cudaGetErrorString(cudaGetLastError());
printf("%s\n", error);
These calls were saying everything was fine until the first cudamemcpy call after returning from the kernel. the error happens in the line "cudaMemcpy(avhost, avdev, size, cudaMemcpyDeviceToHost);" in main. Any help is appreciated.
#include <stdlib.h>
#include <stdio.h>
#include <math.h>
#define mul_mod(a,b,m) fmod( (double) a * (double) b, m)
///////////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////////
/* return the inverse of x mod y */
__device__ int inv_mod(int x,int y) {
int q,u,v,a,c,t;
u=x;
v=y;
c=1;
a=0;
do {
q=v/u;
t=c;
c=a-q*c;
a=t;
t=u;
u=v-q*u;
v=t;
} while (u!=0);
a=a%y;
if (a<0) a=y+a;
return a;
}
///////////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////////
/* return the inverse of u mod v, if v is odd */
__device__ int inv_mod2(int u,int v) {
int u1,u3,v1,v3,t1,t3;
u1=1;
u3=u;
v1=v;
v3=v;
if ((u&1)!=0) {
t1=0;
t3=-v;
goto Y4;
} else {
t1=1;
t3=u;
}
do {
do {
if ((t1&1)==0) {
t1=t1>>1;
t3=t3>>1;
} else {
t1=(t1+v)>>1;
t3=t3>>1;
}
Y4:;
} while ((t3&1)==0);
if (t3>=0) {
u1=t1;
u3=t3;
} else {
v1=v-t1;
v3=-t3;
}
t1=u1-v1;
t3=u3-v3;
if (t1<0) {
t1=t1+v;
}
} while (t3 != 0);
return u1;
}
/* return (a^b) mod m */
__device__ int pow_mod(int a,int b,int m)
{
int r,aa;
r=1;
aa=a;
while (1) {
if (b&1) r=mul_mod(r,aa,m);
b=b>>1;
if (b == 0) break;
aa=mul_mod(aa,aa,m);
}
return r;
}
///////////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////////
/* return true if n is prime */
int is_prime(int n)
{
int r,i;
if ((n % 2) == 0) return 0;
r=(int)(sqrtf(n));
for(i=3;i<=r;i+=2) if ((n % i) == 0) return 0;
return 1;
}
///////////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////////
/* return the prime number immediatly after n */
int next_prime(int n)
{
do {
n++;
} while (!is_prime(n));
return n;
}
///////////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////////
#define DIVN(t,a,v,vinc,kq,kqinc) \
{ \
kq+=kqinc; \
if (kq >= a) { \
do { kq-=a; } while (kq>=a); \
if (kq == 0) { \
do { \
t=t/a; \
v+=vinc; \
} while ((t % a) == 0); \
} \
} \
}
///////////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////////
__global__ void digi_calc(int *s, int *av, int *primes, int N, int n, int nthreads){
int a,vmax,num,den,k,kq1,kq2,kq3,kq4,t,v,i,t1, h;
unsigned int tid = blockIdx.x*blockDim.x + threadIdx.x;
// GIANT LOOP
for (h = 0; h<1; h++){
if(tid > nthreads) continue;
a = primes[tid];
vmax=(int)(logf(3*N)/logf(a));
if (a==2) {
vmax=vmax+(N-n);
if (vmax<=0) continue;
}
av[tid]=1;
for(i=0;i<vmax;i++) av[tid]*= a;
s[tid]=0;
den=1;
kq1=0;
kq2=-1;
kq3=-3;
kq4=-2;
if (a==2) {
num=1;
v=-n;
} else {
num=pow_mod(2,n,av[tid]);
v=0;
}
for(k=1;k<=N;k++) {
t=2*k;
DIVN(t,a,v,-1,kq1,2);
num=mul_mod(num,t,av[tid]);
t=2*k-1;
DIVN(t,a,v,-1,kq2,2);
num=mul_mod(num,t,av[tid]);
t=3*(3*k-1);
DIVN(t,a,v,1,kq3,9);
den=mul_mod(den,t,av[tid]);
t=(3*k-2);
DIVN(t,a,v,1,kq4,3);
if (a!=2) t=t*2; else v++;
den=mul_mod(den,t,av[tid]);
if (v > 0) {
if (a!=2) t=inv_mod2(den,av[tid]);
else t=inv_mod(den,av[tid]);
t=mul_mod(t,num,av[tid]);
for(i=v;i<vmax;i++) t=mul_mod(t,a,av[tid]);
t1=(25*k-3);
t=mul_mod(t,t1,av[tid]);
s[tid]+=t;
if (s[tid]>=av[tid]) s-=av[tid];
}
}
t=pow_mod(5,n-1,av[tid]);
s[tid]=mul_mod(s[tid],t,av[tid]);
}
__syncthreads();
}
///////////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////////
int main(int argc,char *argv[])
{
int N,n,i,totalp, h;
double sum;
const char *error;
int *sdev, *avdev, *shost, *avhost, *adev, *ahost;
argc = 2;
argv[1] = "2";
if (argc<2 || (n=atoi(argv[1])) <= 0) {
printf("This program computes the n'th decimal digit of pi\n"
"usage: pi n , where n is the digit you want\n"
);
exit(1);
}
sum = 0;
N=(int)((n+20)*logf(10)/logf(13.5));
totalp=(N/logf(N))+10;
ahost = (int *)calloc(totalp, sizeof(int));
i = 0;
ahost[0]=2;
for(i=1; ahost[i-1]<=(3*N); ahost[i+1]=next_prime(ahost[i])){
i++;
}
// allocate host memory
size_t size = i*sizeof(int);
shost = (int *)malloc(size);
avhost = (int *)malloc(size);
//allocate memory on device
cudaMalloc((void **) &sdev, size);
cudaMalloc((void **) &avdev, size);
cudaMalloc((void **) &adev, size);
cudaMemcpy(adev, ahost, size, cudaMemcpyHostToDevice);
if (i >= 512){
h = 512;
}
else h = i;
dim3 dimGrid(((i+512)/512),1,1);
dim3 dimBlock(h,1,1);
// launch kernel
digi_calc <<<dimGrid, dimBlock >>> (sdev, avdev, adev, N, n, i);
//copy memory back to host
cudaMemcpy(avhost, avdev, size, cudaMemcpyDeviceToHost);
cudaMemcpy(shost, sdev, size, cudaMemcpyDeviceToHost);
// end malloc's, memcpy's, kernel calls
for(h = 0; h <=i; h++){
sum=fmod(sum+(double) shost[h]/ (double) avhost[h],1.0);
}
printf("Decimal digits of pi at position %d: %09d\n",n,(int)(sum*1e9));
//free memory
cudaFree(sdev);
cudaFree(avdev);
cudaFree(adev);
free(shost);
free(avhost);
free(ahost);
return 0;
}

This is exactly the same problem you asked about in this question. The kernel is getting terminated early by the driver because it is taking too long to finish. If you read the documentation for any of these runtime API functions you will see the following note:
Note:
Note that this function may also return error codes from previous,
asynchronous launches.
All that is happening is that the first API call after the kernel launch is returning the error incurred while the kernel was running - in this case the cudaMemcpy call. The way you can confirm this for yourself is to do something like this directly after the kernel launch:
// launch kernel
digi_calc <<<dimGrid, dimBlock >>> (sdev, avdev, adev, N, n, i);
std::string error = cudaGetErrorString(cudaPeekAtLastError());
printf("%s\n", error);
error = cudaGetErrorString(cudaThreadSynchronize());
printf("%s\n", error);
The cudaPeekAtLastError() call will show you if there are any errors in the kernel launch, and the error code returned by the cudaThreadSynchronize() call will show whether any errors were generated while the kernel was executing.
The solution is exactly as outlined in the previous question: probably the simplest way is redesign the code so it is "re-entrant" so you can split the work over several kernel launches, with each kernel launch safely under the display driver watchdog timer limit.

Cuda somehow buffers all the read/write operations on global memory. So you can batch the operations in some loop with some kernel, and it will take actually NO TIME. Then, when you call memcpy, all the buffered operations are done, and it can timeout. Method to go with, is to call cudaThreadSynchronize procedure between iterations.
So remember: if a kernel run takes only nanoseconds to calculate - it doesn't mean that it is so fast - some of the writes to the global memory, are done when memcpy or threadsynchronize is called.

Problem forking processes and creating threads

My program is supposed to fork three processes. Each of these processes will create three threads and fork two additional processes. These two additional processes will create three threads.
Here is my code. I've tried to keep things simple with nested loops. I think at some point I might be forking more processes or creating more threads.
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
void *printme(void* Array){
int *Arr = (int *) Array;
int len = sizeof(Arr) / sizeof(int);
if (len == 1){
printf("I'm thread %d.%d",Arr[0],Arr[1]);
}
else if (len == 2){
printf("I'm thread %d.%d.%d",Arr[0],Arr[1],Arr[2]);
}
printf("\n");
pthread_exit(NULL);
}
int main(void){
int i, j, k, l;
int threadLevel1[2];
int threadLevel2[3];
printf("\n");
for (i = 1 ; i < 4 ; i++){ // Loop to fork the three main processes.
if (fork() != 0){
sleep(4);
}
else{
//The newly forked process will create three threads and fork two additional processes.
for (j = 1 ; j < 4 ; j++){
pthread_t t;
threadLevel1[0] = i;
threadLevel1[1] = j;
if (pthread_create(&t, NULL, printme, (void*) threadLevel1) != 0){
perror("pthread_create");
exit(1);
}
}
for (k = 1; k < 3 ; k++){
pid_t a = fork();
if (a != 0){
sleep(2);
}
else if (a == -1){
perror("fork"); /* display error message */
exit(0);
}
else{
for (l = 1 ; l < 4 ; l++){
pthread_t t;
threadLevel2[0] = i;
threadLevel2[1] = k;
threadLevel2[2] = l;
if (pthread_create(&t, NULL, printme, (void*) threadLevel2)!=0) {
perror("pthread_create");
exit(1);
}
}
}
}
}
}
return 0;
}

You have a problem in your code here:
void *printme(void* Array){
int *Arr = (int *) Array;
int len = sizeof(Arr) / sizeof(int);
The value len will always be the same no matter what is passed in to printme. That's because C passes arrays as pointers, not as objects with embedded lengths.

Develop Reference

ios ruby-on-rails asp.net-mvc docker delphi jenkins grails google-sheets machine-learning dart

i dont know why my threads in C dont work in parallel - pthreads

Related

create Pthreads in loop

set CPU affinity of a particular pthread failure

HIDAPI in two threads

cudamemcpy error:"the launch timed out and was terminated"

Problem forking processes and creating threads

Categories

Resources