I've produced this kernel
/* Sparse matrix stored in CSR (compressed sparse row) layout. */
struct csrFormat {
    int M, N;    /* M is the row count used by csrSIMDReduction; N is presumably the column count */
    int *IRP;    /* row pointers: the entries of row i live at indices [IRP[i], IRP[i+1]) */
    int *JA;     /* column index of each stored entry */
    double *AS;  /* value of each stored entry */
};

/* Dense vector: `dim` elements stored in `val`. */
struct vector {
    int dim;
    double *val;
};

/*
 * Sparse matrix-vector product res = csr * vec, with timing.
 *
 * csr  matrix in CSR form; IRP must hold M+1 entries.
 * vec  input vector; vec->val is indexed by the column indices in JA.
 * res  output vector; res->val must have room for M doubles. res->dim is
 *      set to M.
 * nz   total number of stored non-zeros; only used to pick the OpenMP
 *      chunk size.
 *
 * Returns the elapsed wall-clock time of the multiplication in microseconds.
 *
 * Rows are distributed over threads by the outer `parallel for`. The inner
 * `omp simd` is independent of threading: it asks the compiler to use vector
 * instructions so that ONE thread processes several consecutive j iterations
 * per instruction, combining the partial sums at the end (the reduction).
 */
double csrSIMDReduction(struct csrFormat *csr, struct vector *vec, struct vector *res, int nz){
    int M = csr->M;
    /* Guard both divisions: M == 0 or nz < M would otherwise divide by zero. */
    int nonzerosPerRow = (M > 0) ? nz / M : 0;
    int chunk_size = (nonzerosPerRow > 0) ? 10000 / nonzerosPerRow : 10000;
    /* The schedule clause requires a positive chunk size. */
    if (chunk_size < 1)
        chunk_size = 1;
    double *val = res->val;
    struct timeval start, end;

    gettimeofday(&start, NULL);

    /* Loop indices are declared in the loops, so they are private per thread. */
    #pragma omp parallel for schedule(dynamic, chunk_size)
    for (int i = 0; i < M; i++) {
        double result = 0.0;
        #pragma omp simd reduction(+ : result)
        for (int j = csr->IRP[i]; j < csr->IRP[i + 1]; j++) {
            result += csr->AS[j] * vec->val[csr->JA[j]];
        }
        val[i] = result;
    }

    gettimeofday(&end, NULL);
    res->dim = M;

    long t = (end.tv_sec - start.tv_sec) * 1000000.0 + end.tv_usec - start.tv_usec;
    return (double) t;
}
but I have a doubt: how does the vectorization work?
I mean, from what I've understood, the inner loop runs multiple iterations together, but how can a single thread do this?
EDIT: code updated.
I use pthreads on Ubuntu to implement multithreaded matrix-vector multiplication, but at runtime the program fails with a segmentation fault:
#pragma comment(lib, "pthreadVC2.lib")
#include <stdio.h>
#include <stdlib.h>
#include <pthread.h>
/* Global variables (shared by main and every worker thread) */
/* Number of worker threads; main overwrites this default from argv[1]. */
int thread_count = 8;
/* Matrix dimensions: A is m x n, x has n elements, y has m elements. */
int m, n;
/* Matrix stored as one flat array; element (i, j) is A[i * n + j]. */
double* A = NULL;
/* Input vector. */
double* x = NULL;
/* Output vector; each worker thread writes its own range of rows. */
double* y = NULL;
/* Serial functions */
/* NOTE(review): no definition of Usage is visible in this excerpt. */
void Usage(char* prog_name);
void Read_matrix(char* prompt, double A[], int m, int n);
void Read_vector(char* prompt, double x[], int n);
void Print_matrix(char* title, double A[], int m, int n);
/* NOTE(review): the element count is declared as double here; callers pass an int. */
void Print_vector(char* title, double y[], double m);
/* Parallel function */
/* Thread entry point; the argument carries the thread's rank cast to void*. */
void* Pth_mat_vect(void* rank);
/*
 * Reads the thread count from argv[1], then m, n, the m x n matrix A and the
 * vector x from stdin, computes y = A * x with thread_count pthreads and
 * prints the result.
 *
 * Returns 0 on success and EXIT_FAILURE on bad arguments, bad input,
 * allocation failure or thread-creation failure.
 */
int main(int argc, char* argv[]) {
    long thread;
    long started = 0;
    pthread_t* thread_handles = NULL;
    int status = EXIT_FAILURE;

    /* Without this check, running the program with no argument passes
     * argv[1] == NULL to the number parser and crashes with a segfault. */
    if (argc < 2) {
        fprintf(stderr, "usage: %s <thread_count>\n", argc > 0 ? argv[0] : "prog");
        return EXIT_FAILURE;
    }

    char* end = NULL;
    long requested = strtol(argv[1], &end, 10);
    if (end == argv[1] || *end != '\0' || requested <= 0 ||
        requested != (long)(int)requested) {
        fprintf(stderr, "thread_count must be a positive integer\n");
        return EXIT_FAILURE;
    }
    thread_count = (int)requested;

    printf("Enter m and n\n");
    if (scanf("%d%d", &m, &n) != 2 || m <= 0 || n <= 0) {
        fprintf(stderr, "m and n must be positive integers\n");
        return EXIT_FAILURE;
    }

    thread_handles = malloc(thread_count * sizeof *thread_handles);
    A = malloc((size_t)m * n * sizeof *A);
    x = malloc(n * sizeof *x);
    y = malloc(m * sizeof *y);
    if (thread_handles == NULL || A == NULL || x == NULL || y == NULL) {
        fprintf(stderr, "out of memory\n");
        goto cleanup;
    }

    Read_matrix("Enter the matrix", A, m, n);
    Print_matrix("We read", A, m, n);

    Read_vector("Enter the vector", x, n);
    Print_vector("We read", x, n);

    for (thread = 0; thread < thread_count; thread++) {
        int err = pthread_create(&thread_handles[thread], NULL,
                                 Pth_mat_vect, (void*)thread);
        if (err != 0) {
            fprintf(stderr, "pthread_create failed (error %d)\n", err);
            break;
        }
        started++;
    }

    /* Join only the threads that were actually created. */
    for (thread = 0; thread < started; thread++)
        pthread_join(thread_handles[thread], NULL);

    if (started == thread_count) {
        Print_vector("The product is", y, m);
        status = 0;
    }

cleanup:
    free(thread_handles);
    free(A);
    free(x);
    free(y);
    A = NULL;
    x = NULL;
    y = NULL;
    return status;
} /* main */
/*
 * Function: Read_matrix
 * Purpose: Read in the matrix
 * In args: prompt, m, n
 * Out arg: A
 */
/*
 * Print `prompt` on its own line, then read m*n doubles from stdin into A
 * in row-major order (element (row, col) goes to A[row * n + col]).
 * The result of each scanf call is not checked.
 */
void Read_matrix(char* prompt, double A[], int m, int n) {
    printf("%s\n", prompt);

    int row = 0;
    while (row < m) {
        int col = 0;
        while (col < n) {
            scanf("%lf", A + (row * n + col));
            ++col;
        }
        ++row;
    }
} /* Read_matrix */
/*
 * Function: Read_vector
 * Purpose: Read in the vector x
 * In arg: prompt, n
 * Out arg: x
 */
/*
 * Print `prompt` on its own line, then read n doubles from stdin into
 * x[0..n-1]. The result of each scanf call is not checked.
 */
void Read_vector(char* prompt, double x[], int n) {
    printf("%s\n", prompt);

    int k = 0;
    while (k < n) {
        scanf("%lf", x + k);
        ++k;
    }
} /* Read_vector */
/*
 * Function: Pth_mat_vect
 * Purpose: Multiply an mxn matrix by an nx1 column vector
 * In arg: rank
 * Global in vars: A, x, m, n, thread_count
 * Global out var: y
 */
/*
 * Thread entry point: computes this thread's share of y = A * x.
 *
 * rank  the thread's index in [0, thread_count), passed as a long cast to
 *       void*.
 *
 * Reads the globals A, x, m, n and thread_count; writes y[i] for the rows
 * owned by this thread. Rows are split into contiguous blocks; when m is
 * not a multiple of thread_count the first (m % thread_count) threads take
 * one extra row each, so every row is computed exactly once.
 *
 * Always returns NULL.
 */
void* Pth_mat_vect(void* rank) {
    long my_rank = (long)rank;

    if (thread_count <= 0 || my_rank < 0 || my_rank >= thread_count)
        return NULL;

    int base_rows = m / thread_count;
    int extra_rows = m % thread_count;
    long rows_before_me = my_rank < extra_rows ? my_rank : extra_rows;
    int my_first_row = (int)(my_rank * base_rows + rows_before_me);
    int my_row_count = base_rows + (my_rank < extra_rows ? 1 : 0);
    int my_last_row = my_first_row + my_row_count - 1;

    for (int i = my_first_row; i <= my_last_row; i++) {
        /* Accumulate locally and store once instead of updating y[i] in
         * shared memory on every inner iteration. */
        double dot = 0.0;
        for (int j = 0; j < n; j++)
            dot += A[i * n + j] * x[j];
        y[i] = dot;
    }

    return NULL;
} /* Pth_mat_vect */
/*
 * Function: Print_matrix
 * Purpose: Print the matrix
 * In args: title, A, m, n
 */
/*
 * Print `title` on its own line, then the m x n matrix stored row-major in
 * A, one matrix row per output line, each element formatted as "%4.1f ".
 */
void Print_matrix(char* title, double A[], int m, int n) {
    printf("%s\n", title);
    for (int i = 0; i < m; i++) {
        for (int j = 0; j < n; j++)
            printf("%4.1f ", A[i * n + j]);
        /* End the row so the matrix is not printed as one long line. */
        printf("\n");
    }
} /* Print_matrix */
/*
 * Function: Print_vector
 * Purpose: Print a vector
 * In args: title, y, m
 */
/*
 * Print `title` on its own line, then the first m elements of y on one
 * line, each formatted as "%4.1f ", followed by a newline.
 *
 * m is declared as double to match the existing prototype; callers pass an
 * integer element count.
 */
void Print_vector(char* title, double y[], double m) {
    printf("%s\n", title);
    for (int i = 0; i < m; i++)
        printf("%4.1f ", y[i]);
    /* Terminate the line so later output does not continue on it. */
    printf("\n");
} /* Print_vector */
This code is from An Introduction to Parallel Programming.
I know this error seems to be related to memory; in fact, the program seems to crash without ever entering main(). I tried some other people's suggestions, but none of them worked.
/* One element of the linked-list stack. */
struct node {
    char data;
    struct node *next;
};

/* Top of the stack; NULL when the stack is empty. */
struct node *top = NULL;

/*
 * Push x onto the stack. If no memory is available, prints
 * "Stacks Overflow" and leaves the stack unchanged.
 */
void push(char x){
    struct node *temp = malloc(sizeof *temp);
    if (temp == NULL) {
        printf("Stacks Overflow\n");
        return;
    }
    temp->data = x;
    temp->next = top;
    top = temp;
}

/*
 * Remove and return the top character. On an empty stack, prints
 * "Stacks Underflow" and returns -1.
 */
char pop(void){
    if (top == NULL) {
        printf("Stacks Underflow\n");
        return -1;
    }
    struct node *p = top;
    char x = p->data;
    top = p->next;
    /* Release the unlinked node; otherwise every pop leaks one node. */
    free(p);
    return x;
}

/*
 * Return the character at 1-based position pos counted from the top
 * (pos == 1 is the top) without removing it, or -1 if the stack has fewer
 * than pos elements.
 */
char peek(int pos){
    struct node *p = top;
    for (int i = 0; p != NULL && i < pos - 1; i++)
        p = p->next;
    return (p != NULL) ? p->data : -1;
}
/* Driver for the character stack: asks how many characters to read, reads
 * them, and prints popped values.
 * NOTE(review): the push call and the closing braces are not visible in
 * this excerpt. */
int main(){
// NOTE(review): n is a char but is filled below through "%d", which expects
// an int*; scanf then writes past the 1-byte object. Declaring n as int
// avoids this.
char n, x;
printf("Enter the Number of Character to be pushed\n");
scanf("%d", &n);
for(char i=0;i<n;i++){
printf("Enter the Character\n");
// NOTE(review): getchar() also returns the newline left in the input buffer
// by the previous entry, which is why one input appears to be skipped;
// scanf(" %c", &x) skips leading whitespace.
x = getchar();
printf("%d,", pop());
Why does this code not work when n and x are declared as char, but work fine when they are declared as int?
I tried to implement a char stack using a linked list.
It works fine with integers but does not work with characters.
When I insert a char, one input gets skipped, but this does not happen with int. Why?
How can I free, from main, the memory that this function allocates with malloc?
/*
 * Allocate a new node holding `data` and link it in front of the list, so
 * it becomes the new `head`.
 *
 * Ownership: the node is heap-allocated here and stays owned by the list.
 * To release it, the caller must walk the list from `head` and free every
 * node; freeing only `head` releases a single node.
 *
 * If allocation fails, an error is printed and the list is left unchanged.
 */
void insert(int data){
    struct node* temp = malloc(sizeof *temp);
    if (temp == NULL) {
        fprintf(stderr, "insert: out of memory\n");
        return;
    }
    temp->data = data;
    temp->next = head;
    head = temp;
}
Here is the main function. I tried to free the head, but the memory usage still increases every time I add more inputs.
/* Driver for the linked list: resets the list, asks how many numbers to
 * add, and generates a random value in [0, 99] per iteration.
 * NOTE(review): the insert call, the `again` label, any free logic and the
 * closing braces are not visible in this excerpt. */
int main(){
head = NULL;
int n, data;
printf("how many numbers to be entered in linked list? ");
scanf("%d", &n);
for(int i=0; i<n; i++){
data = rand() % 100;
// NOTE(review): jumps to a label that is not shown here.
goto again;
I am trying to solve 445. Add Two Numbers II from LeetCode where it is asked:
Given two non-empty linked lists representing two non-negative integers, add the two numbers and return it as a linked list. The most significant digit comes first and each of their nodes contain a single digit.
For some test cases, I am getting negative number from the sum method that I have implemented. I think it is impossible to get any negative digit in my code. Can you help me finding the bug?
Below is the code that I tried:
/**
 * Definition for singly-linked list.
 * public class ListNode {
 *     int val;
 *     ListNode next;
 *     ListNode() {}
 *     ListNode(int val) { this.val = val; }
 *     ListNode(int val, ListNode next) { this.val = val; this.next = next; }
 * }
 */
// Adds two numbers stored as linked lists of digits (most significant digit
// first) by converting them to a single int and back to a list.
// NOTE(review): several lines (length counting, else branches, closing
// braces) are not visible in this excerpt.
class Solution {
// Returns the head of a new list holding the decimal digits of the sum.
public ListNode addTwoNumbers(ListNode l1, ListNode l2) {
//ListNode previous = null;
int len1 = 0;
int len2 = 0;
ListNode head1 = l1;
ListNode head2 = l2;
int sum=0;
// NOTE(review): in the visible lines len1 and len2 are still 0 here; the
// code that counts the list lengths is presumably among the missing lines.
// The longer list is passed first to sum().
if (len1 >= len2){
sum= sum(head1,head2,len1,len2);
sum= sum(head2,head1,len2,len1);
// NOTE(review): if the int sum has overflowed to a negative value, this
// string starts with '-', and parsing that single character below throws
// NumberFormatException: For input string: "-".
String sumString = "" + sum;
// Dummy head; the result list starts at copy.next.
ListNode extraHead = new ListNode(1);
ListNode copy = extraHead;
for(int i = 0;i<sumString.length();i++){
ListNode bit = new ListNode(Integer.parseInt(String.valueOf(sumString.charAt(i))));//Integer.parseInt(String.valueOf(sumString.charAt(i)))
extraHead.next = bit;
extraHead = extraHead.next;
return copy.next;
// Adds the two lists position by position (l1 is the longer one, of length
// len1) and folds the per-position values into one int.
public int sum(ListNode l1, ListNode l2, int len1, int len2){
int diff = 0;
int resLen = 0;
diff = len1 - len2;
resLen = len1;
int[] res = new int[resLen];
ListNode fast = l1;
ListNode slow = l2;
// Leading positions exist only in the longer list.
for(int count = 0; count<diff; count++){
res[count] = fast.val;
fast = fast.next;
// NOTE(review): no advance of fast/slow is visible inside this loop.
for(int count = diff;count < res.length;count++){
res[count] = fast.val + slow.val;
int sum = 0;
// NOTE(review): int holds at most about 2.1e9, so lists longer than about
// 10 digits overflow here (10^(i-1) itself no longer fits once i-1 >= 10)
// and the result can wrap to a negative number. Digit-by-digit addition
// with a carry avoids the conversion to int entirely.
for(int i = len1;i>0;i-- ){
sum = sum + res[len1-i] * (int)Math.pow(10,i-1);
return sum;
Here is the error message that I get:
Error Details
java.lang.NumberFormatException: For input string: "-"
at line 68, java.base/java.lang.NumberFormatException.forInputString
at line 648, java.base/java.lang.Integer.parseInt
at line 776, java.base/java.lang.Integer.parseInt
at line 41, Solution.addTwoNumbers
at line 54, __DriverSolution__.__helper__
at line 87, __Driver__.main
Here is the input causing the error:
I am trying to understand the LASSO algorithm for linear regression. I have implemented the algorithm using the naive coordinate descent method for optimization. However, the coefficients that I obtained from my code did not match those obtained from the 'glmnet' package for LASSO in R. I want to understand how I can make the algorithm more accurate, so that the coefficients match those obtained from R. I think they use coordinate descent as well.
Note: I have generated some toy data with 11 observations, and 6
features(x,x^2 ,x^3,...,x^6). The last column contains the y values
generated from a dummy function (e^(-x^2)). I wanted to use LASSO to
estimate this function. Also, I have randomly picked the initial
weight vector, multiple times to crosscheck my results.
Here is my code:
/* Number of feature columns; column index num_dim of the data matrix holds y. */
int num_dim = 6;
/* Number of observations (rows of the data matrix). */
int num_obs = 11;
/*Computes the normalization factor*/
/*
 * Sum of squares of column j over the first n rows of arr, i.e. the
 * squared Euclidean norm of feature j.
 *
 * j    column index (0..6).
 * arr  data matrix with 7 columns per row.
 * n    number of rows to include; n <= 0 yields 0.
 */
float norm_feature(int j,double arr[][7],int n){
    float sum = 0.0;
    /* Iterate over every observation; without this loop the row index
     * would be read uninitialized. */
    for (int i = 0; i < n; i++)
        sum = sum + arr[i][j] * arr[i][j];
    return sum;
}
/*Computes the partial sum*/
/* Accumulates weights[j]*arr[i][j] for columns j other than d_ignore and
 * returns the total.
 * NOTE(review): the end of the parameter list (presumably the row index i),
 * the loop over j and the closing braces are not visible in this excerpt. */
float approx(int dim,int d_ignore,float weights[],double arr[][7],int
// flag is cleared when no column is ignored (d_ignore == -1).
// NOTE(review): no use of flag is visible in this excerpt.
int flag = 1;
if(d_ignore == -1)
flag = 0;
int j;
float sum = 0.0;
if(j != d_ignore)
sum = sum + weights[j]*arr[i][j];
return sum;
/* Computes rho-j */
/* Accumulates arr[i][j] * (arr[i][num_dim] - partial_sum), where
 * partial_sum is approx() evaluated with column j ignored, and returns the
 * total.
 * NOTE(review): the loop over the rows i (presumably 0..n-1) and the
 * closing braces are not visible in this excerpt. */
float rho_j(double arr[][7],int n,int j,float weights[7]){
float sum = 0.0;
int i;
float partial_sum ;
partial_sum = approx(num_dim,j,weights,arr,i);
sum = sum + arr[i][j]*(arr[i][num_dim]-partial_sum);
return sum;
/* Sums, over all num_obs rows, the difference between column num_dim of arr
 * and approx() evaluated with weights arr1 and no ignored column.
 * pow(..., 1) leaves each difference unchanged. The dim parameter is not
 * referenced in the visible lines.
 * NOTE(review): this returns the SUM of the differences; a least-squares
 * intercept is normally their mean (sum / num_obs) - confirm, as this may
 * explain part of the mismatch with glmnet.
 * NOTE(review): the closing braces are not visible in this excerpt. */
float intercept(float arr1[7],double arr[][7],int dim) {
int i;
float sum =0.0;
for (i = 0; i < num_obs; i++) {
sum = sum + pow((arr[i][num_dim]) - approx(num_dim, -1, arr1, arr,
i), 1);
return sum;
/* LASSO driver: builds a toy data matrix (powers of a random x in columns
 * 0..5, exp(-x^2) in column 6), then repeatedly initializes random weights
 * and runs coordinate descent with soft-thresholding at lambda / 2.
 * NOTE(review): several loop headers, the else branch before
 * "weights[r] = 0", the seed/t increments and the closing braces are not
 * visible in this excerpt. */
int main(){
double data[num_obs][7];
int i=0,j=0;
float a = 1.0;
float lambda = 0.1; //Setting lambda
float weights[7]; //weights[6] contains the intercept
srand((unsigned int) time(NULL));
/*Generating the data matrix */
data[i][0] = ((float)rand()/(float)(RAND_MAX)) * a;
data[i][j] = pow(data[i][0],j+1);
data[i][6] = exp(-pow(data[i][0],2)); // the last column in the
datamatrix contains the y values generated by the dummy function
/*Printing the data matrix */
printf("Data Matrix:\n");
printf("%lf ",data[i][j]);}
int seed =0;
while(seed<20) {
//Initializing the weight vector
for (i = 0; i < 7; i++)
weights[i] = ((float) rand() / (float) (RAND_MAX)) * a;
int iter = 500;
int t = 0;
int r, l;
double rho[num_dim];
for (i = 0; i < 6; i++) {
// NOTE(review): r has not been assigned yet at this point, while the loop
// index is i; this presumably should pass i.
rho[i] = rho_j(data, num_obs, r, weights);
// Intercept initialization
weights[num_dim] = intercept(weights,data,num_dim);
printf("Weights initialization: ");
for (i = 0; i < (num_dim+1); i++)
printf("%f ", weights[i]);
while (t < iter) {
for (r = 0; r < num_dim; r++) {
rho[r] = rho_j(data, num_obs, r, weights);
//printf("rho %d:%f ",r,rho[r]);
// Soft-thresholding: shrink rho[r] towards zero by lambda / 2 and scale by
// the squared norm of feature r.
if (rho[r] < -lambda / 2)
weights[r] = (rho[r] + lambda / 2) / norm_feature(r,
data, num_obs);
else if (rho[r] > lambda / 2)
weights[r] = (rho[r] - lambda / 2) / norm_feature(r,
data, num_obs);
weights[r] = 0;
weights[num_dim] = intercept(weights, data, num_dim);
/* printf("Iter(%d): ", t);
for (l = 0; l < 7; l++)
printf("%f ", weights[l]);
printf("Final Weights: ");
for (i = 0; i < 7; i++)
printf("%f ", weights[i]);
return 0;