openmp spmv CSR vectorization - vectorization

I've produced this kernel
struct csrFormat {
int M, N;
int *IRP;
int *JA;
double *AS;
};
struct vector {
int dim;
double *val;
};
double csrSIMDReduction(struct csrFormat *csr, struct vector *vec, struct vector *res, int nz){
int M = csr->M;
int nonzerosPerRow = nz/M;
int chunk_size = 10000/nonzerosPerRow;
double *val = res->val;
struct timeval start,end;
gettimeofday(&start, NULL);
//IRP is the row_pointer array
//JA is the column indices array
//AS is the values array
int i,j,tmp;
#pragma omp parallel
{
#pragma omp for private(i, j, tmp) schedule(dynamic, chunk_size)
for (i=0; i<M; i++)
{
double result = 0.0;
#pragma omp simd reduction(+ : result)
for (j = csr->IRP[i]; j < csr->IRP[i+1]; j++)
{
tmp = csr->JA[j];
result += csr->AS[j] * vec->val[tmp];
}
val[i] = result;
}
}
gettimeofday(&end, NULL);
res->dim = M;
//printf("%ld.%06ld\n", start.tv_sec, start.tv_usec);
//printf("%ld.%06ld\n", end.tv_sec, end.tv_usec);
long t = (end.tv_sec - start.tv_sec)*1000000.0 + end.tv_usec - start.tv_usec;
return (double) t;
}
but i'm having a doubt: how does the vectorization works?
I mean, from what i've understood, the inner cicle is runned running multiple iterations together, but how can a single thread do this?
EDIT: code updated.

Related

Segmentation fault (core dumped) when i use pthreads on ubuntu

I use pthreads on ubuntu to implement multithreaded matrix-vector multiplication, but the runtime reports an error Segmentation fault
#pragma comment(lib, "pthreadVC2.lib")
#define _CRT_SECURE_NO_WARNINGS 1
#include <stdio.h>
#include <stdlib.h>
#include <pthread.h>
/* Global variables */
int thread_count = 8;
int m, n;
double* A = NULL;
double* x = NULL;
double* y = NULL;
/* Serial functions */
void Usage(char* prog_name);
void Read_matrix(char* prompt, double A[], int m, int n);
void Read_vector(char* prompt, double x[], int n);
void Print_matrix(char* title, double A[], int m, int n);
void Print_vector(char* title, double y[], double m);
/* Parallel function */
void* Pth_mat_vect(void* rank);
/*------------------------------------------------------------------*/
int main(int argc, char* argv[]) {
long thread;
pthread_t* thread_handles;
thread_count = atoi(argv[1]);
thread_handles = malloc(thread_count * sizeof(pthread_t));
printf("Enter m and n\n");
scanf("%d%d", &m, &n);
A = malloc(m * n * sizeof(double));
x = malloc(n * sizeof(double));
y = malloc(m * sizeof(double));
Read_matrix("Enter the matrix", A, m, n);
Print_matrix("We read", A, m, n);
Read_vector("Enter the vector", x, n);
Print_vector("We read", x, n);
for (thread = 0; thread < thread_count; thread++)
pthread_create(&thread_handles[thread], NULL,
Pth_mat_vect, (void*)thread);
for (thread = 0; thread < thread_count; thread++)
pthread_join(thread_handles[thread], NULL);
Print_vector("The product is", y, m);
free(A);
free(x);
free(y);
return 0;
} /* main */
/*------------------------------------------------------------------
* Function: Read_matrix
* Purpose: Read in the matrix
* In args: prompt, m, n
* Out arg: A
*/
void Read_matrix(char* prompt, double A[], int m, int n) {
int i, j;
printf("%s\n", prompt);
for (i = 0; i < m; i++)
for (j = 0; j < n; j++)
scanf("%lf", &A[i * n + j]);
} /* Read_matrix */
/*------------------------------------------------------------------
* Function: Read_vector
* Purpose: Read in the vector x
* In arg: prompt, n
* Out arg: x
*/
void Read_vector(char* prompt, double x[], int n) {
int i;
printf("%s\n", prompt);
for (i = 0; i < n; i++)
scanf("%lf", &x[i]);
} /* Read_vector */
/*------------------------------------------------------------------
* Function: Pth_mat_vect
* Purpose: Multiply an mxn matrix by an nx1 column vector
* In arg: rank
* Global in vars: A, x, m, n, thread_count
* Global out var: y
*/
void* Pth_mat_vect(void* rank) {
long my_rank = (long)rank;
int i, j;
int local_m = m / thread_count;
int my_first_row = my_rank * local_m;
int my_last_row = (my_rank + 1) * local_m - 1;
for (i = my_first_row; i <= my_last_row; i++) {
y[i] = 0.0;
for (j = 0; j < n; j++)
y[i] += A[i * n + j] * x[j];
}
return NULL;
} /* Pth_mat_vect */
/*------------------------------------------------------------------
* Function: Print_matrix
* Purpose: Print the matrix
* In args: title, A, m, n
*/
void Print_matrix(char* title, double A[], int m, int n) {
int i, j;
printf("%s\n", title);
for (i = 0; i < m; i++) {
for (j = 0; j < n; j++)
printf("%4.1f ", A[i * n + j]);
printf("\n");
}
} /* Print_matrix */
/*------------------------------------------------------------------
* Function: Print_vector
* Purpose: Print a vector
* In args: title, y, m
*/
void Print_vector(char* title, double y[], double m) {
int i;
printf("%s\n", title);
for (i = 0; i < m; i++)
printf("%4.1f ", y[i]);
printf("\n");
} /* Print_vector */
This code is from An Introduction to Parallel Programming
I know this error seems to be related to memory, in fact the code runs without entering main().I tried some other people's methods, but none of them worked.

Stacks Using Linked List

#include<stdio.h>
#include<stdlib.h>
struct node{
char data;
struct node*next;
};
struct node* top =NULL;
void push(char x){
struct node* temp =(struct node*)malloc(sizeof(struct node));
if(temp==NULL){
printf("Stacks Overflow\n");
return;
}
else{
temp->data = x;
temp->next = top;
top = temp;
}
}
char pop(){
if(top==NULL){
printf("Stacks Underflow\n");
return -1;
}
else{
char x;
struct node* p = top;
top = top->next;
x = p->data;
free(p);
return x;
}
}
char peek(int pos){
struct node*p = top;
int i;
for(i=0;p!=NULL && i<pos-1;i++)
p = p->next;
return (p!=NULL)?p->data:-1;
}
int main(){
char n, x;
printf("Enter the Number of Character to be pushed\n");
scanf("%d", &n);
for(char i=0;i<n;i++){
printf("Enter the Character\n");
x = getchar();
push(x);
}
while(top){
printf("%d,", pop());
}
}
why this code not work with char(n, x) but work fine with int(x, n).
I tried to implement char stack using linked list.
It works fine with integer but doesnot work with character.
When I use to insert char, one input get skipped but this doesnot happen in int .. Why?

How can to free allocated memory

how can I free this malloc from function in main?
void insert(int data){
struct node* temp = (struct node*)malloc(sizeof(struct node));
temp->data = data;
temp->next = head;
head = temp;
}
here is the main function, I did try to free the head but still the memory still increasing every time add some inputs
int main(){
head = NULL;
int n, data;
srand(time(NULL));
again:
printf("how many numbers to be entered in linked list? ");
scanf("%d", &n);
for(int i=0; i<n; i++){
data = rand() % 100;
insert(data);
print();
}
goto again;
free(head);
}

leetcode practice: can find the bug of returning a negative number from sum method

I am trying to solve 445. Add Two Numbers II from LeetCode where it is asked:
Given two non-empty linked lists representing two non-negative integers, add the two numbers and return it as a linked list. The most significant digit comes first and each of their nodes contain a single digit.
For some test cases, I am getting negative number from the sum method that I have implemented. I think it is impossible to get any negative digit in my code. Can you help me finding the bug?
Below is the code that I tried:
/**
* Definition for singly-linked list.
* public class ListNode {
* int val;
* ListNode next;
* ListNode() {}
* ListNode(int val) { this.val = val; }
* ListNode(int val, ListNode next) { this.val = val; this.next = next; }
* }
*/
class Solution {
public ListNode addTwoNumbers(ListNode l1, ListNode l2) {
//ListNode previous = null;
int len1 = 0;
int len2 = 0;
ListNode head1 = l1;
ListNode head2 = l2;
while(l1!=null){
len1++;
l1=l1.next;
}
while(l2!=null){
len2++;
l2=l2.next;
}
int sum=0;
if (len1 >= len2){
sum= sum(head1,head2,len1,len2);
}else{
sum= sum(head2,head1,len2,len1);
}
String sumString = "" + sum;
ListNode extraHead = new ListNode(1);
ListNode copy = extraHead;
for(int i = 0;i<sumString.length();i++){
ListNode bit = new ListNode(Integer.parseInt(String.valueOf(sumString.charAt(i))));//Integer.parseInt(String.valueOf(sumString.charAt(i)))
extraHead.next = bit;
extraHead = extraHead.next;
}
return copy.next;
}
public int sum(ListNode l1, ListNode l2, int len1, int len2){
int diff = 0;
int resLen = 0;
diff = len1 - len2;
resLen = len1;
int[] res = new int[resLen];
ListNode fast = l1;
ListNode slow = l2;
for(int count = 0; count<diff; count++){
res[count] = fast.val;
fast = fast.next;
}
for(int count = diff;count < res.length;count++){
res[count] = fast.val + slow.val;
fast=fast.next;
slow=slow.next;
}
int sum = 0;
for(int i = len1;i>0;i-- ){
sum = sum + res[len1-i] * (int)Math.pow(10,i-1);
}
return sum;
}
}
Here is the error message that I get:
Error Details
java.lang.NumberFormatException: For input string: "-"
at line 68, java.base/java.lang.NumberFormatException.forInputString
at line 648, java.base/java.lang.Integer.parseInt
at line 776, java.base/java.lang.Integer.parseInt
at line 41, Solution.addTwoNumbers
at line 54, __DriverSolution__.__helper__
at line 87, __Driver__.main
Here is the input causing the error:
[3,9,9,9,9,9,9,9,9,9]
[7]

Implementation of LASSO in C

I am trying to understand the LASSO algorithm for linear regression. I have implemented the algorithm using naive coordinate descent method for optimization. However the coefficients that I obtained from my code, wasn't matching with those obtained from the 'glmnet'package for LASSO in R. I wanted to understand how I could make the algorithm more accurate, so that the coefficients match with those obtained from R. I think they use coordinate descent as well.
Note: I have generated some toy data with 11 observations, and 6
features(x,x^2 ,x^3,...,x^6). The last column contains the y values
generated from a dummy function (e^(-x^2)). I wanted to use LASSO to
estimate this function. Also, I have randomly picked the initial
weight vector, multiple times to crosscheck my results.
Here is my code:
#include<stdio.h>
#include<stdlib.h>
#include<string.h>
#include<math.h>
#include<time.h>
int num_dim = 6;
int num_obs = 11;
/*Computes the normalization factor*/
float norm_feature(int j,double arr[][7],int n){
float sum = 0.0;
int i;
for(i=0;i<n;i++){
sum = sum + pow(arr[i][j],2);
}
return sum;
}
/*Computes the partial sum*/
float approx(int dim,int d_ignore,float weights[],double arr[][7],int
i){
int flag = 1;
if(d_ignore == -1)
flag = 0;
int j;
float sum = 0.0;
for(j=0;j<dim;j++){
if(j != d_ignore)
sum = sum + weights[j]*arr[i][j];
else
continue;
}
return sum;
}
/* Computes rho-j */
float rho_j(double arr[][7],int n,int j,float weights[7]){
float sum = 0.0;
int i;
float partial_sum ;
for(i=0;i<n;i++){
partial_sum = approx(num_dim,j,weights,arr,i);
sum = sum + arr[i][j]*(arr[i][num_dim]-partial_sum);
}
return sum;
}
float intercept(float arr1[7],double arr[][7],int dim) {
int i;
float sum =0.0;
for (i = 0; i < num_obs; i++) {
sum = sum + pow((arr[i][num_dim]) - approx(num_dim, -1, arr1, arr,
i), 1);
}
return sum;
}
int main(){
double data[num_obs][7];
int i=0,j=0;
float a = 1.0;
float lambda = 0.1; //Setting lambda
float weights[7]; //weights[6] contains the intercept
srand((unsigned int) time(NULL));
/*Generating the data matrix */
for(i=0;i<11;i++)
data[i][0] = ((float)rand()/(float)(RAND_MAX)) * a;
for(i=0;i<11;i++)
for(j=1;j<6;j++)
data[i][j] = pow(data[i][0],j+1);
for(i=0;i<11;i++)
data[i][6] = exp(-pow(data[i][0],2)); // the last column in the
datamatrix contains the y values generated by the dummy function
/*Printing the data matrix */
printf("Data Matrix:\n");
for(i=0;i<11;i++){
for(j=0;j<7;j++){
printf("%lf ",data[i][j]);}
printf("\n");}
printf("\n");
int seed =0;
while(seed<20) {
//Initializing the weight vector
for (i = 0; i < 7; i++)
weights[i] = ((float) rand() / (float) (RAND_MAX)) * a;
int iter = 500;
int t = 0;
int r, l;
double rho[num_dim];
for (i = 0; i < 6; i++) {
rho[i] = rho_j(data, num_obs, r, weights);
}
// Intercept initialization
weights[num_dim] = intercept(weights,data,num_dim);
printf("Weights initialization: ");
for (i = 0; i < (num_dim+1); i++)
printf("%f ", weights[i]);
printf("\n");
while (t < iter) {
for (r = 0; r < num_dim; r++) {
rho[r] = rho_j(data, num_obs, r, weights);
//printf("rho %d:%f ",r,rho[r]);
if (rho[r] < -lambda / 2)
weights[r] = (rho[r] + lambda / 2) / norm_feature(r,
data, num_obs);
else if (rho[r] > lambda / 2)
weights[r] = (rho[r] - lambda / 2) / norm_feature(r,
data, num_obs);
else
weights[r] = 0;
weights[num_dim] = intercept(weights, data, num_dim);
}
/* printf("Iter(%d): ", t);
for (l = 0; l < 7; l++)
printf("%f ", weights[l]);
printf("\n");*/
t++;
}
//printf("\n");
printf("Final Weights: ");
for (i = 0; i < 7; i++)
printf("%f ", weights[i]);
printf("\n");
printf("\n");
seed++;
}
return 0;
}
PseudoCode:

Resources